Commit 7d16ef7a996a230b0e9769533808b852a8e44497

Authored by Myriam Bouchemit
1 parent 3c1a161e
Exists in master and in 1 other branch ubuntu

Improve checkDDBase script

Showing 1 changed file with 105 additions and 16 deletions   Show diff stats
tests/checkDDBase.py
... ... @@ -6,21 +6,20 @@ import netCDF4 as nc
6 6 import numpy as np
7 7 import gzip
8 8 import pickle
  9 +import glob
9 10 from datetime import datetime
10 11  
11 12 vi_to_exclude = [
12   - 'juno_ephem_orb1',
13   - 'juno_fgm_orbfull',
14   - 'juno_fgm_orb1',
15   - 'juno_fgm_orb60',
  13 + #'juno_fgm_orbfull',
16 14 'juno_jedi_i090',
17 15 'juno_jedi_i180',
18 16 'juno_jedi_i270',
19 17 'juno_jedi_e270',
20 18 'juno_jedi_e180',
21 19 'juno_jedi_e090',
22   - 'juno_fgm_cruise60',
23 20 'ros_magob_1s',
  21 + 'juno_fgm_orb1',
  22 + 'gtl_epic_ed'
24 23 ]
25 24  
26 25 def datetime_to_ddtime(date_time):
... ... @@ -30,8 +29,25 @@ def datetime_to_ddtime(date_time):
30 29 def is_sorted(l):
31 30 return all(a <= b for a, b in zip(l, l[1:]))
32 31  
  32 +def diff_dicts(a, b, drop_similar=True):
  33 + res = a.copy()
  34 +
  35 + for k in res:
  36 + if k not in b:
  37 + res[k] = (res[k], None)
  38 +
  39 + for k in b:
  40 + if k in res:
  41 + res[k] = (res[k], b[k])
  42 + else:
  43 + res[k] = (None, b[k])
33 44  
34   -def check_vi(e, cachedir):
  45 + if drop_similar:
  46 + res = {k:v for k,v in res.items() if v[0] != v[1]}
  47 +
  48 + return res
  49 +
  50 +def check_vi(e, cachedir, clean):
35 51 name = e.find("NAME").text
36 52 logging.info('========== {} =========='.format(name))
37 53 base = e.find("NAME").attrib['base']
... ... @@ -129,6 +145,11 @@ def check_vi(e, cachedir):
129 145 if os.path.isfile(os.path.join(location,'LOCK')):
130 146 logging.warning("LOCK file detected")
131 147  
  148 + files_in_dir = []
  149 + for f in glob.glob(location+'*.nc.gz'):
  150 + files_in_dir.append(f)
  151 +
  152 + first_file_structure = None
132 153 for i in range(len(files_names)):
133 154 f = files_names[i]
134 155 start = start_times[i]
... ... @@ -136,16 +157,19 @@ def check_vi(e, cachedir):
136 157  
137 158 gzipped_f = os.path.join(location, f) + ".gz"
138 159  
139   - if f in cache_check:
140   - if cache_check[f]['status'] and os.path.getmtime(gzipped_f) == cache_check[f]['mtime']:
141   - continue
142   -
143   - logging.info(f)
  160 + if gzipped_f in files_in_dir:
  161 + files_in_dir.remove(gzipped_f)
144 162  
145 163 if not os.path.isfile(gzipped_f):
146 164 logging.error("Missing data file {}".format(gzipped_f))
147 165 continue
148 166  
  167 + if (f in cache_check) and (first_file_structure is not None): #always check first dataset file
  168 + if cache_check[f]['status'] and os.path.getmtime(gzipped_f) == cache_check[f]['mtime']:
  169 + continue
  170 +
  171 + logging.info(f)
  172 +
149 173 try:
150 174 cache_check[f] = {
151 175 'status': True,
... ... @@ -167,6 +191,24 @@ def check_vi(e, cachedir):
167 191 cache_check[f]['status'] = False
168 192 continue
169 193  
  194 + crt_file_structure = {}
  195 + for v in dds.variables:
  196 + if v in ['Time', 'StartTime', 'StopTime']:
  197 + continue
  198 + if not dds.dimensions[dds.variables[v].dimensions[0]].isunlimited():
  199 + continue
  200 + crt_file_structure[v] = {"shape": dds.variables[v].shape[1:], "dtype": dds.variables[v].dtype}
  201 +
  202 + if first_file_structure is None:
  203 + first_file_structure = crt_file_structure
  204 + elif first_file_structure != crt_file_structure:
  205 + logging.error("Incoherence in file structure {}".format(gzipped_f))
  206 + print(first_file_structure)
  207 + print(crt_file_structure)
  208 + print(diff_dicts(first_file_structure, crt_file_structure))
  209 + cache_check[f]['status'] = False
  210 + continue
  211 +
170 212 times = []
171 213 for t in np.array(dds.variables["Time"]):
172 214 if dds['Time'].dtype == np.float64:
... ... @@ -175,36 +217,80 @@ def check_vi(e, cachedir):
175 217 else:
176 218 # DDTime
177 219 t_str = "".join([k.decode("UTF-8") for k in t])
178   - times.append(int(t_str))
  220 + try:
  221 + times.append(int(t_str))
  222 + except:
  223 + logging.error("Bad time format in data file {}".format(gzipped_f))
  224 + cache_check[f]['status'] = False
  225 + continue
179 226  
  227 + remove_duplicate = False
180 228 if len(times) != len(set(times)):
181 229 logging.warning("Duplicate times in {}".format(gzipped_f))
  230 + #print(set([x for x in times if times.count(x) > 1]))
182 231 cache_check[f]['status'] = False
  232 + if clean:
  233 + remove_duplicate = True
183 234  
  235 + sort_times = False
184 236 if not is_sorted(times):
  237 + prev = 0.
  238 + for t in times:
  239 + if prev > t:
  240 + print(t)
  241 + prev = t
185 242 logging.warning("Time not sorted in {}".format(gzipped_f))
186 243 cache_check[f]['status'] = False
  244 + if clean:
  245 + sort_times = True
187 246  
  247 + remove_outside_times = False
188 248 for t in times:
189 249 if t < start or t > stop:
190   - logging.warning("Time outside [StartTime, StopTime] detected in {}".format(gzipped_f))
  250 + logging.warning("Time {} outside [{}, {}] detected in {}".format(t, start, stop, gzipped_f))
191 251 cache_check[f]['status'] = False
  252 + if clean:
  253 + remove_outside_times = True
192 254 break
193 255  
  256 + dds.close()
  257 + if clean and (remove_duplicate or sort_times or remove_outside_times):
  258 + clean_f = os.path.join("./", "clean_"+f)
  259 + logging.warning(clean_f)
  260 + with open(clean_f, "wb") as clean_nc_file:
  261 + clean_nc_file.write(ncdata)
  262 + dds_clean = nc.Dataset(clean_f, mode='a')
  263 + clean_data_file(dds_clean, remove_duplicate, sort_times, remove_outside_times)
  264 + dds_clean.close()
  265 +
194 266 with open(cachefile, 'wb') as handle:
195 267 pickle.dump(cache_check, handle, protocol=pickle.HIGHEST_PROTOCOL)
196 268  
  269 + if files_in_dir:
  270 + for f in files_in_dir:
  271 + logging.warning("File {} in {} but not in *_times.nc file".format(f, location))
  272 +
197 273 return True
198 274  
199 275  
200   -def check_ddbase(ddsys, vi=None, cachedir=None):
  276 +def clean_data_file(dds_clean, remove_duplicate, sort_times, remove_outside_times):
  277 + records_to_remove = []
  278 + if remove_outside_times:
  279 + pass
  280 + if remove_duplicate:
  281 + pass
  282 + if sort_times:
  283 + pass
  284 +
  285 +
  286 +def check_ddbase(ddsys, vi=None, cachedir=None, clean=False):
201 287 tree=etree.parse(ddsys)
202 288 for e in tree.iter(tag="VI"):
203 289 if vi and e.find("NAME").text != vi:
204 290 continue
205 291 if not vi and e.find("NAME").text in vi_to_exclude:
206 292 continue
207   - check_vi(e, cachedir)
  293 + check_vi(e, cachedir, clean)
208 294  
209 295  
210 296 if __name__ == '__main__':
... ... @@ -215,6 +301,9 @@ if __name__ == '__main__':
215 301 if not os.path.exists(cachedir):
216 302 os.makedirs(cachedir)
217 303 vi = None
  304 + clean = False
218 305 if len(sys.argv) > 1:
219 306 vi = sys.argv[1]
220   - check_ddbase(ddsys, vi=vi, cachedir=cachedir)
  307 + if (len(sys.argv) > 2) and (sys.argv[2] == "clean"):
  308 + clean = True
  309 + check_ddbase(ddsys, vi=vi, cachedir=cachedir, clean=clean)
... ...