From 7d16ef7a996a230b0e9769533808b852a8e44497 Mon Sep 17 00:00:00 2001
From: Myriam Bouchemit
Date: Mon, 18 Mar 2024 11:54:01 +0100
Subject: [PATCH] Improve checkDDBase script

---
 tests/checkDDBase.py | 121 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 105 insertions(+), 16 deletions(-)

diff --git a/tests/checkDDBase.py b/tests/checkDDBase.py
index e824cca..ffa5df2 100644
--- a/tests/checkDDBase.py
+++ b/tests/checkDDBase.py
@@ -6,21 +6,20 @@ import netCDF4 as nc
 import numpy as np
 import gzip
 import pickle
+import glob
 from datetime import datetime
 
 vi_to_exclude = [
-    'juno_ephem_orb1',
-    'juno_fgm_orbfull',
-    'juno_fgm_orb1',
-    'juno_fgm_orb60',
+    #'juno_fgm_orbfull',
     'juno_jedi_i090',
     'juno_jedi_i180',
     'juno_jedi_i270',
     'juno_jedi_e270',
     'juno_jedi_e180',
     'juno_jedi_e090',
-    'juno_fgm_cruise60',
     'ros_magob_1s',
+    'juno_fgm_orb1',
+    'gtl_epic_ed'
 ]
 
 def datetime_to_ddtime(date_time):
@@ -30,8 +29,25 @@ def datetime_to_ddtime(date_time):
 def is_sorted(l):
     return all(a <= b for a, b in zip(l, l[1:]))
 
+def diff_dicts(a, b, drop_similar=True):
+    res = a.copy()
+
+    for k in res:
+        if k not in b:
+            res[k] = (res[k], None)
+
+    for k in b:
+        if k in res:
+            res[k] = (res[k], b[k])
+        else:
+            res[k] = (None, b[k])
 
-def check_vi(e, cachedir):
+    if drop_similar:
+        res = {k:v for k,v in res.items() if v[0] != v[1]}
+
+    return res
+
+def check_vi(e, cachedir, clean):
     name = e.find("NAME").text
     logging.info('========== {} =========='.format(name))
     base = e.find("NAME").attrib['base']
@@ -129,6 +145,11 @@ def check_vi(e, cachedir):
     if os.path.isfile(os.path.join(location,'LOCK')):
         logging.warning("LOCK file detected")
 
+    files_in_dir = []
+    for f in glob.glob(os.path.join(location, '*.nc.gz')):
+        files_in_dir.append(f)
+
+    first_file_structure = None
     for i in range(len(files_names)):
         f = files_names[i]
         start = start_times[i]
@@ -136,16 +157,19 @@ def check_vi(e, cachedir):
 
         gzipped_f = os.path.join(location, f) + ".gz"
 
-        if f in cache_check:
-            if cache_check[f]['status'] and os.path.getmtime(gzipped_f) == cache_check[f]['mtime']:
-                continue
-
-        logging.info(f)
+        if gzipped_f in files_in_dir:
+            files_in_dir.remove(gzipped_f)
 
         if not os.path.isfile(gzipped_f):
             logging.error("Missing data file {}".format(gzipped_f))
             continue
 
+        if (f in cache_check) and (first_file_structure is not None):  # always check the first dataset file
+            if cache_check[f]['status'] and os.path.getmtime(gzipped_f) == cache_check[f]['mtime']:
+                continue
+
+        logging.info(f)
+
         try:
             cache_check[f] = {
                 'status': True,
@@ -167,6 +191,24 @@ def check_vi(e, cachedir):
             cache_check[f]['status'] = False
             continue
 
+        crt_file_structure = {}
+        for v in dds.variables:
+            if v in ['Time', 'StartTime', 'StopTime']:
+                continue
+            if not dds.dimensions[dds.variables[v].dimensions[0]].isunlimited():
+                continue
+            crt_file_structure[v] = {"shape": dds.variables[v].shape[1:], "dtype": dds.variables[v].dtype}
+
+        if first_file_structure is None:
+            first_file_structure = crt_file_structure
+        elif first_file_structure != crt_file_structure:
+            logging.error("Inconsistent file structure in {}".format(gzipped_f))
+            print(first_file_structure)
+            print(crt_file_structure)
+            print(diff_dicts(first_file_structure, crt_file_structure))
+            cache_check[f]['status'] = False
+            continue
+
         times = []
         for t in np.array(dds.variables["Time"]):
             if dds['Time'].dtype == np.float64:
@@ -175,36 +217,80 @@ def check_vi(e, cachedir):
             else:
                 # DDTime
                 t_str = "".join([k.decode("UTF-8") for k in t])
-                times.append(int(t_str))
+                try:
+                    times.append(int(t_str))
+                except ValueError:
+                    logging.error("Bad time format in data file {}".format(gzipped_f))
+                    cache_check[f]['status'] = False
+                    continue
 
+        remove_duplicate = False
         if len(times) != len(set(times)):
             logging.warning("Duplicate times in {}".format(gzipped_f))
+            #print(set([x for x in times if times.count(x) > 1]))
             cache_check[f]['status'] = False
+            if clean:
+                remove_duplicate = True
 
+        sort_times = False
         if not is_sorted(times):
+            prev = 0.
+            for t in times:
+                if prev > t:
+                    print(t)
+                prev = t
             logging.warning("Time not sorted in {}".format(gzipped_f))
             cache_check[f]['status'] = False
+            if clean:
+                sort_times = True
 
+        remove_outside_times = False
         for t in times:
             if t < start or t > stop:
-                logging.warning("Time outside [StartTime, StopTime] detected in {}".format(gzipped_f))
+                logging.warning("Time {} outside [{}, {}] detected in {}".format(t, start, stop, gzipped_f))
                 cache_check[f]['status'] = False
+                if clean:
+                    remove_outside_times = True
                 break
 
+        dds.close()
+
+        if clean and (remove_duplicate or sort_times or remove_outside_times):
+            clean_f = os.path.join("./", "clean_"+f)
+            logging.warning("Writing cleaned file {}".format(clean_f))
+            with open(clean_f, "wb") as clean_nc_file:
+                clean_nc_file.write(ncdata)
+            dds_clean = nc.Dataset(clean_f, mode='a')
+            clean_data_file(dds_clean, remove_duplicate, sort_times, remove_outside_times)
+            dds_clean.close()
+
     with open(cachefile, 'wb') as handle:
         pickle.dump(cache_check, handle, protocol=pickle.HIGHEST_PROTOCOL)
 
+    if files_in_dir:
+        for f in files_in_dir:
+            logging.warning("File {} in {} but not in *_times.nc file".format(f, location))
+
     return True
 
 
-def check_ddbase(ddsys, vi=None, cachedir=None):
+def clean_data_file(dds_clean, remove_duplicate, sort_times, remove_outside_times):
+    records_to_remove = []
+    if remove_outside_times:
+        pass
+    if remove_duplicate:
+        pass
+    if sort_times:
+        pass
+
+
+def check_ddbase(ddsys, vi=None, cachedir=None, clean=False):
     tree=etree.parse(ddsys)
     for e in tree.iter(tag="VI"):
         if vi and e.find("NAME").text != vi:
             continue
         if not vi and e.find("NAME").text in vi_to_exclude:
             continue
-        check_vi(e, cachedir)
+        check_vi(e, cachedir, clean)
 
 
 if __name__ == '__main__':
@@ -215,6 +301,9 @@ if __name__ == '__main__':
     if not os.path.exists(cachedir):
         os.makedirs(cachedir)
     vi = None
+    clean = False
     if len(sys.argv) > 1:
         vi = sys.argv[1]
-    check_ddbase(ddsys, vi=vi, cachedir=cachedir)
+    if (len(sys.argv) > 2) and (sys.argv[2] == "clean"):
+        clean = True
+    check_ddbase(ddsys, vi=vi, cachedir=cachedir, clean=clean)
-- 
libgit2 0.21.2
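
Note on usage: the cleaning pass added by this patch is opt-in and is read from
the second positional argument, so it can only be requested together with an
explicit virtual-instrument name. Assuming the script is launched directly with
a Python interpreter:

    python tests/checkDDBase.py <vi_name>          # check a single VI
    python tests/checkDDBase.py <vi_name> clean    # check it and write cleaned copies

With no arguments, every VI except those listed in vi_to_exclude is checked and
cleaning stays off.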
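
Note on clean_data_file: the patch leaves it as a stub (all three branches are
"pass"). The sketch below shows one way the three cleanup operations could be
implemented; it is illustrative only, not part of the patch. clean_records,
src_path and dst_path are hypothetical names, and the sketch assumes the DDTime
(character-array) time encoding and that every variable has at least one
dimension. Because netCDF4 cannot delete records from an unlimited dimension in
place, it rewrites the kept records into a fresh file rather than editing the
copied file in append mode as the stub's signature suggests.

import numpy as np
import netCDF4 as nc

def clean_records(src_path, dst_path, start, stop):
    # Keep one record per timestamp, drop records outside [start, stop],
    # and write the survivors to dst_path in chronological order.
    src = nc.Dataset(src_path, mode='r')

    # Decode DDTime records the same way check_vi does above.
    times = np.array([int("".join(k.decode("UTF-8") for k in t))
                      for t in np.array(src.variables["Time"])])

    # np.unique returns the sorted unique values together with the index
    # of each first occurrence: de-duplication and sorting in one step.
    _, keep = np.unique(times, return_index=True)
    keep = [int(i) for i in keep if start <= times[i] <= stop]

    dst = nc.Dataset(dst_path, mode='w')
    for name, dim in src.dimensions.items():
        dst.createDimension(name, None if dim.isunlimited() else len(dim))
    for name, var in src.variables.items():
        out = dst.createVariable(name, var.dtype, var.dimensions)
        out.setncatts({a: var.getncattr(a) for a in var.ncattrs()})
        data = var[:]
        if src.dimensions[var.dimensions[0]].isunlimited():
            # Subset along the record dimension only; assigning past the
            # current length of an unlimited dimension grows it.
            out[0:len(keep)] = data[keep]
        else:
            out[:] = data
    src.close()
    dst.close()

The keep index is computed once from Time and applied to every record variable,
so all variables stay aligned after cleaning.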