read_data.py
9.11 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
#! /bin/env python
# coding: utf8
import ceflib
from datetime import datetime
from datetime import timedelta
import numpy as np
import os
import matplotlib.pyplot as plt
import pickle
class donnee(object) :
def __init__(self, fichier) :
"""
Donnee object : open data from cef file
This object is useful to manage timeseries with implemented calculation on timedeltas
all data are stored in self.donnee
REMAINING TO DO : Harmonize all methods conventional name (use either _ of capital)
Rename self.donnee into self.data
Rename the class into a more appropriate international word
Add Nan & Fillvall treatment to all methods
Add a write to cef file method
Add a get_as_tensor method (similar to get_as_field)
Vectorize all methods
"""
ceflib.verbosity(0)
self._fichier = fichier
ceflib.read_metadata(self._fichier)
self._metanames = np.copy(ceflib.metanames())
self._metadata = {}
self._varnames = np.copy(ceflib.varnames())
for name in self._metanames : self._metadata[name] = np.copy(ceflib.meta(name))
ceflib.close()
self.donnee = {}
self.varattributes = {}
self._get_time()
def _get_time(self) :
ceflib.read(self._fichier)
self.donnee['epoch'] = np.copy(np.array([datetime.utcfromtimestamp(ceflib.milli_to_timestamp(i)) for i in ceflib.var('epoch')]))
self.varattributes['epoch'] = {}
for attribute in ceflib.vattributes('epoch') : self.varattributes['epoch'][attribute] = ceflib.vattr('epoch', attribute)
ceflib.close()
def get_time(self) :
if 'epoch' not in self.donnee : self._get_time()
return self.donnee['epoch']
def get_varnames(self) :
return self._varnames
#def _get_data(self, var = 'all') :
##self.donnee = {}
#ceflib.read(self._fichier)
#if var == 'all' :
#for nom in self._varnames:
#if nom not in ['epoch', 'K001', 'K002'] : self.donnee[nom] = np.copy(ceflib.var(nom))
#else : self.donnee[var] = np.copy(ceflib.var(var))
#ceflib.close()
def _modify_fillval_to_nan(self, var):
self.donnee[var][self.donnee[var] == float(self.varattributes[var]['FILLVAL'])] = np.nan
return self.donnee[var]
def _load_var(self, var) :
if var not in self.donnee :
if var in self._varnames :
ceflib.read(self._fichier)
self.donnee[var] = np.copy(ceflib.var(var))
self.varattributes[var] = {}
for attribute in ceflib.vattributes(var) : self.varattributes[var][attribute] = ceflib.vattr(var, attribute)
ceflib.close()
if 'FILLVAL' in self.varattributes[var] : self._modify_fillval_to_nan(var)
else : print "!!!Input Variable Not Found!!!"
def get_data(self, var) :
self._load_var(var)
return self.donnee[var]
def get_data_as_field(self, varz, t1 = None, t2 = None, checkorder = False, withtime = False) :
"""
input as many field as wanted
return a numpy array of len(varz) times len(data) dimension, columns are sorted with varz names
t1 and t2 (optionnal) : if only a particular interval is required, input as time
WARNING : = used in this function instead of np.copy, this may lead to some issues
"""
for var in varz : self._load_var(var)
out = np.zeros([len(self.donnee['epoch']), len(varz)])
for i, var in enumerate(sorted(varz)) : out[:,i] = self.donnee[var]
if withtime : out = np.column_stack((self.donnee['epoch'], out))
if checkorder : print 'columns ordered as ', sorted(varz)
if t1 : t1 = abs(self.donnee['epoch'] - t1).argmin()
if t2 : t2 = abs(self.donnee['epoch'] - t2).argmin()
return out[t1:t2, :]
def get_std_as_field(self, varz, t1 = None, t2 = None) :
"""
use self.get_data_as_field() and mean it
return a numpy array of len(varz) dimension
"""
out2 = np.zeros(len(varz))
out = self.get_data_as_field(varz, t1, t2)
for i in range(len(varz)) : out2[i] = out[:,i].std()
return out2
def get_std(self, t1, t2, var) :
self._load_var(var)
i1 = abs(self.donnee['epoch'] - t1).argmin()
i2 = abs(self.donnee['epoch'] - t2).argmin()
return self.donnee[var][i1:i2].std()
def get_mean_as_field(self, varz, t1 = None, t2 = None) :
"""
use self.get_data_as_field() and mean it
return a numpy array of len(varz) dimension
"""
out2 = np.zeros(len(varz))
out = self.get_data_as_field(varz, t1, t2)
for i in range(len(varz)) : out2[i] = out[:,i].mean()
return out2
def get_mean(self, t1, t2, var) :
"""
# NOTE: a.mean() when a posseses NaN return NaN -> treatment required
get mean value of an interval. NaN treatment implemented
"""
self._load_var(var)
i1 = abs(self.donnee['epoch'] - t1).argmin()
i2 = abs(self.donnee['epoch'] - t2).argmin()
return self.donnee[var][i1:i2][~np.isnan(self.donnee[var][i1:i2])].mean()
#return self.donnee[var][i1:i2].mean()
def getIndex(self, t = None):
if t : return abs(self.donnee['epoch'] - t).argmin()
else : return None
def get_projected(self, varz, matrice, new_coord = ['l', 'm', 'n']) :
"""
!!! Only works with 3D array at the moment
user must provide a transport matrix, with L,M,N in each colums of the matrix, so that:
mat[:,0] = l, mat[:,1] = m, mat[:,2] = n, with mat being a numpy.mat object
and new_vect = old_vect([x,y,z]) * matrix, or new_vect = inv(matrix) * old_vect[[x], [y], [z]]
Example : l = numpy.array([ 0.16631492, -0.08297703, 0.98257527])
m = numpy.array([-0.4464368 , -0.89481517, 0. ])
n = numpy.array([ 0.87922325, -0.43865776, -0.18586511])
matrice = numpy.mat([l,m,n]).transpose()
"""
field = self.get_data_as_field(varz)
f = np.array([vect * matrice for vect in field])[:,0]
self.donnee[new_coord[0]] = f[:,0]
self.donnee[new_coord[1]] = f[:,1]
self.donnee[new_coord[2]] = f[:,2]
return {new_coord[i] : self.donnee[new_coord[i]] for i in range(3)}
def addField(self, field, abscisse = None, name = 'New_interpolated_field') :
"""
Add new field in the donnee object, use old abscisse to interpolate on current absisse
If no abscisse is given... do something?
TO DO : modify self._varnames ????
"""
def interp(xnew, xold, yold) :
def tofloat(d) :
ref = datetime(2001,1,1) #Timedelta Calculé à partir de 2001!!
return (d-ref).total_seconds()
tofloat = np.vectorize(tofloat)
ynew = np.interp(tofloat(xnew),tofloat(xold) ,yold)
return ynew
self.donnee[name] = interp(self.donnee['epoch'], abscisse, field)
return self.donnee[name]
def get_interval(self, var, t1 = None, t2 = None, include = False) :
"""
get part of the data set, between start time t1 and stop time t2
include : include last element, python drop it by default (default is False)
"""
self._load_var(var)
if t1 : t1 = abs(self.donnee['epoch'] - t1).argmin()
if t2 :
t2 = abs(self.donnee['epoch'] - t2).argmin()
return self.donnee[var][t1:t2 + int(include)]
else : return self.donnee[var][t1:t2]
def get_metadata(self, name = None) :
if name in self._metadata : return self._metadata[name]
else : return ''
def get_metanames(self) :
return [name for name in self._metadata]
def plot(self, var, show = True, newfig = False) :
if not isinstance(var, list) : var = [var,]
if newfig : f = plt.figure()
for v in var :
print 'plotting', v
self._load_var(v)
plt.plot_date(self.donnee['epoch'], self.donnee[v], '-', label = v)
plt.legend()
if show : plt.show()
def time_interpolation(x1, x2, y2) :
"""
Time interpolation for 2D data :
x1 = reference abscisse (in datetime objects)
x2, y2 = abscisse and ordinate of data to interpolate
return y3 = y2 on x1
"""
def tofloat(d) :
ref = datetime(2010,1,1) #Timedelta Calculé à partir de 2010!!
return (d-ref).total_seconds()
tofloat = np.vectorize(tofloat)
if y2.dtype != 'float' : y2 = y2.astype('float')
y3 = np.interp(tofloat(x1),tofloat(x2) ,y2)
return y3
def pickleSave(filename = 'pickleSave', data = ''):
"""
Input filename WITHOUT any extension
a .save extension will be added to the name
This routine DO NOT allow overwriting!!
"""
if os.path.exists(filename) :
i = 0
while os.path.exists(filename) :
filename = filename + str(i)
i+=1
filename = filename + '.save'
print 'Saving data into ', filename
print 'FYI current working directory is ', os.get_cwd()
with open(filename, 'w') as sv : pickle.dump(data, sv)
return data
def pickleLoad(filename = 'pickleSave', filename_has_extension = False):
if not filename_has_extension :
print 'Assume file extension is not provided, adding .save to the filename'
filename = filename + '.save'
if os.path.exists(filename):
print 'returning data'
with open(savename, 'rb') as sv: return pickle.load(sv)
else :
print 'file not found, please check filename and extension ', filename
#class field(np.ndarray) :
#def __init__(self, array):
#self = array
#self.header = {}
#for i in self.shape[1] : self.header[i] = ''
#def set_header(self, col, header) :
#self.header[col] = header
#class list_data(object) :
#def __init__(self) :
#self.chemin = '/data'
#self.tt_dirs = []
#for tt_rep in sorted(os.walk(self.chemin).next()[1]) : self.tt_dirs.append(tt_rep)
#for tt_rep in glob.glob(chemin , )
#for fichier in glob.glob(os.path.join(self.repertoire['default'].get(), self.listedir.selection_get(), '*.cef')):