Commit 7d16ef7a996a230b0e9769533808b852a8e44497
1 parent 3c1a161e
Exists in master and in 1 other branch

Improve checkDDBase script
Showing 1 changed file with 105 additions and 16 deletions
tests/checkDDBase.py
@@ -6,21 +6,20 @@ import netCDF4 as nc
 import numpy as np
 import gzip
 import pickle
+import glob
 from datetime import datetime
 
 vi_to_exclude = [
-    'juno_ephem_orb1',
-    'juno_fgm_orbfull',
-    'juno_fgm_orb1',
-    'juno_fgm_orb60',
+    #'juno_fgm_orbfull',
     'juno_jedi_i090',
     'juno_jedi_i180',
     'juno_jedi_i270',
     'juno_jedi_e270',
     'juno_jedi_e180',
     'juno_jedi_e090',
-    'juno_fgm_cruise60',
     'ros_magob_1s',
+    'juno_fgm_orb1',
+    'gtl_epic_ed'
 ]
 
 def datetime_to_ddtime(date_time):
@@ -30,8 +29,25 @@ def datetime_to_ddtime(date_time):
 def is_sorted(l):
     return all(a <= b for a, b in zip(l, l[1:]))
 
+def diff_dicts(a, b, drop_similar=True):
+    res = a.copy()
+
+    for k in res:
+        if k not in b:
+            res[k] = (res[k], None)
+
+    for k in b:
+        if k in res:
+            res[k] = (res[k], b[k])
+        else:
+            res[k] = (None, b[k])
 
-def check_vi(e, cachedir):
+    if drop_similar:
+        res = {k:v for k,v in res.items() if v[0] != v[1]}
+
+    return res
+
+def check_vi(e, cachedir, clean):
     name = e.find("NAME").text
     logging.info('========== {} =========='.format(name))
     base = e.find("NAME").attrib['base']
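The new diff_dicts helper is used further down to report exactly how two file structures disagree. A minimal sketch of its behaviour (the function body is copied from the hunk above; the sample keys b_x, b_tot and b_gse are invented for illustration):

def diff_dicts(a, b, drop_similar=True):
    # Pair each key with its value in a and in b; None marks a missing side.
    res = a.copy()
    for k in res:
        if k not in b:
            res[k] = (res[k], None)
    for k in b:
        if k in res:
            res[k] = (res[k], b[k])
        else:
            res[k] = (None, b[k])
    if drop_similar:
        # Keep only the keys whose two sides actually differ.
        res = {k: v for k, v in res.items() if v[0] != v[1]}
    return res

# Hypothetical structures of two granules from the same dataset:
a = {'b_x': {'shape': (), 'dtype': 'float64'}, 'b_tot': {'shape': (), 'dtype': 'float64'}}
b = {'b_x': {'shape': (), 'dtype': 'float64'}, 'b_gse': {'shape': (3,), 'dtype': 'float64'}}
print(diff_dicts(a, b))
# {'b_tot': ({'shape': (), 'dtype': 'float64'}, None),
#  'b_gse': (None, {'shape': (3,), 'dtype': 'float64'})}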
@@ -129,6 +145,11 @@ def check_vi(e, cachedir):
     if os.path.isfile(os.path.join(location,'LOCK')):
         logging.warning("LOCK file detected")
 
+    files_in_dir = []
+    for f in glob.glob(location+'*.nc.gz'):
+        files_in_dir.append(f)
+
+    first_file_structure = None
     for i in range(len(files_names)):
         f = files_names[i]
         start = start_times[i]
@@ -136,16 +157,19 @@
 
         gzipped_f = os.path.join(location, f) + ".gz"
 
-        if f in cache_check:
-            if cache_check[f]['status'] and os.path.getmtime(gzipped_f) == cache_check[f]['mtime']:
-                continue
-
-        logging.info(f)
+        if gzipped_f in files_in_dir:
+            files_in_dir.remove(gzipped_f)
 
         if not os.path.isfile(gzipped_f):
             logging.error("Missing data file {}".format(gzipped_f))
             continue
 
+        if (f in cache_check) and (first_file_structure is not None): #always check first dataset file
+            if cache_check[f]['status'] and os.path.getmtime(gzipped_f) == cache_check[f]['mtime']:
+                continue
+
+        logging.info(f)
+
         try:
             cache_check[f] = {
                 'status': True,
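Together with the warning loop added near the end of check_vi, the files_in_dir bookkeeping flags granules that sit on disk but are absent from the *_times.nc index. A condensed sketch of the pattern (the directory path and file name are hypothetical):

import glob
import os

location = "/data/DDBASE/DATA/juno_fgm_orb1/"   # hypothetical VI directory
files_names = ["fgm_orb1_00001.nc"]             # stand-in for the *_times.nc entries

# Collect every compressed granule present on disk ...
files_in_dir = glob.glob(location + '*.nc.gz')

# ... tick off each granule the index knows about ...
for f in files_names:
    gzipped_f = os.path.join(location, f) + ".gz"
    if gzipped_f in files_in_dir:
        files_in_dir.remove(gzipped_f)

# ... and whatever remains is unknown to the index.
for f in files_in_dir:
    print("File {} in {} but not in *_times.nc file".format(f, location))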
@@ -167,6 +191,24 @@ def check_vi(e, cachedir):
             cache_check[f]['status'] = False
             continue
 
+        crt_file_structure = {}
+        for v in dds.variables:
+            if v in ['Time', 'StartTime', 'StopTime']:
+                continue
+            if not dds.dimensions[dds.variables[v].dimensions[0]].isunlimited():
+                continue
+            crt_file_structure[v] = {"shape": dds.variables[v].shape[1:], "dtype": dds.variables[v].dtype}
+
+        if first_file_structure is None:
+            first_file_structure = crt_file_structure
+        elif first_file_structure != crt_file_structure:
+            logging.error("Incoherence in file structure {}".format(gzipped_f))
+            print(first_file_structure)
+            print(crt_file_structure)
+            print(diff_dicts(first_file_structure, crt_file_structure))
+            cache_check[f]['status'] = False
+            continue
+
         times = []
         for t in np.array(dds.variables["Time"]):
             if dds['Time'].dtype == np.float64:
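The structure check above fingerprints each granule by the per-record shape and dtype of its record variables, i.e. those whose first dimension is unlimited; Time, StartTime and StopTime are deliberately skipped. A self-contained demonstration on a throwaway in-memory dataset (the dimension and variable names here are invented):

import netCDF4 as nc

# Build a tiny diskless dataset with one record variable besides Time.
dds = nc.Dataset("demo.nc", mode="w", diskless=True)
dds.createDimension("record", None)   # unlimited: marks record variables
dds.createDimension("comp", 3)
dds.createVariable("Time", "f8", ("record",))
dds.createVariable("b_gse", "f8", ("record", "comp"))

crt_file_structure = {}
for v in dds.variables:
    if v in ['Time', 'StartTime', 'StopTime']:
        continue
    if not dds.dimensions[dds.variables[v].dimensions[0]].isunlimited():
        continue
    # Drop the record axis itself: only the per-record layout matters.
    crt_file_structure[v] = {"shape": dds.variables[v].shape[1:],
                             "dtype": dds.variables[v].dtype}

print(crt_file_structure)   # {'b_gse': {'shape': (3,), 'dtype': dtype('float64')}}
dds.close()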
@@ -175,36 +217,80 @@
             else:
                 # DDTime
                 t_str = "".join([k.decode("UTF-8") for k in t])
-                times.append(int(t_str))
+                try:
+                    times.append(int(t_str))
+                except:
+                    logging.error("Bad time format in data file {}".format(gzipped_f))
+                    cache_check[f]['status'] = False
+                    continue
 
+        remove_duplicate = False
         if len(times) != len(set(times)):
             logging.warning("Duplicate times in {}".format(gzipped_f))
+            #print(set([x for x in times if times.count(x) > 1]))
             cache_check[f]['status'] = False
+            if clean:
+                remove_duplicate = True
 
+        sort_times = False
         if not is_sorted(times):
+            prev = 0.
+            for t in times:
+                if prev > t:
+                    print(t)
+                prev = t
             logging.warning("Time not sorted in {}".format(gzipped_f))
             cache_check[f]['status'] = False
+            if clean:
+                sort_times = True
 
+        remove_outside_times = False
         for t in times:
             if t < start or t > stop:
-                logging.warning("Time outside [StartTime, StopTime] detected in {}".format(gzipped_f))
+                logging.warning("Time {} outside [{}, {}] detected in {}".format(t, start, stop, gzipped_f))
                 cache_check[f]['status'] = False
+                if clean:
+                    remove_outside_times = True
                 break
 
+        dds.close()
+        if clean and (remove_duplicate or sort_times or remove_outside_times):
+            clean_f = os.path.join("./", "clean_"+f)
+            logging.warning(clean_f)
+            with open(clean_f, "wb") as clean_nc_file:
+                clean_nc_file.write(ncdata)
+            dds_clean = nc.Dataset(clean_f, mode='a')
+            clean_data_file(dds_clean, remove_duplicate, sort_times, remove_outside_times)
+            dds_clean.close()
+
     with open(cachefile, 'wb') as handle:
         pickle.dump(cache_check, handle, protocol=pickle.HIGHEST_PROTOCOL)
 
+    if files_in_dir:
+        for f in files_in_dir:
+            logging.warning("File {} in {} but not in *_times.nc file".format(f, location))
+
     return True
 
 
-def check_ddbase(ddsys, vi=None, cachedir=None):
+def clean_data_file(dds_clean, remove_duplicate, sort_times, remove_outside_times):
+    records_to_remove = []
+    if remove_outside_times:
+        pass
+    if remove_duplicate:
+        pass
+    if sort_times:
+        pass
+
+
+def check_ddbase(ddsys, vi=None, cachedir=None, clean=False):
     tree=etree.parse(ddsys)
     for e in tree.iter(tag="VI"):
         if vi and e.find("NAME").text != vi:
             continue
         if not vi and e.find("NAME").text in vi_to_exclude:
             continue
-        check_vi(e, cachedir)
+        check_vi(e, cachedir, clean)
 
 
 if __name__ == '__main__':
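Note that clean_data_file is only stubbed out in this commit: all three branches are pass, so the clean_* copy written above is still an unmodified copy of ncdata. One plausible way to fill it in is to first reduce the three flags to a single record index and then rewrite each record variable with that index; the helper below is a hypothetical sketch of that first step, not code from the repository:

import numpy as np

def record_index(times, start, stop, remove_duplicate, sort_times, remove_outside_times):
    """Return the indices of the records to keep, in the order to write them."""
    times = np.asarray(times)
    idx = np.arange(len(times))
    if remove_outside_times:
        # Keep only records inside [StartTime, StopTime].
        idx = idx[(times[idx] >= start) & (times[idx] <= stop)]
    if remove_duplicate:
        # Keep the first occurrence of each time, preserving file order.
        _, first = np.unique(times[idx], return_index=True)
        idx = idx[np.sort(first)]
    if sort_times:
        # Stable sort so equal times keep their relative order.
        idx = idx[np.argsort(times[idx], kind="stable")]
    return idx

Records absent from the returned index would then be dropped when copying the variables into the clean_* file.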
@@ -215,6 +301,9 @@
     if not os.path.exists(cachedir):
         os.makedirs(cachedir)
     vi = None
+    clean = False
     if len(sys.argv) > 1:
         vi = sys.argv[1]
-    check_ddbase(ddsys, vi=vi, cachedir=cachedir)
+    if (len(sys.argv) > 2) and (sys.argv[2] == "clean"):
+        clean = True
+    check_ddbase(ddsys, vi=vi, cachedir=cachedir, clean=clean)
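With the new argv handling, the optional second argument switches the cleaning pass on. Invocation would look like this (the VI name is just an example):

python tests/checkDDBase.py                      # check every VI except those in vi_to_exclude
python tests/checkDDBase.py juno_fgm_orb1        # check a single VI
python tests/checkDDBase.py juno_fgm_orb1 clean  # same, and write clean_* copies of bad files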