From 7d16ef7a996a230b0e9769533808b852a8e44497 Mon Sep 17 00:00:00 2001
From: Myriam Bouchemit
Date: Mon, 18 Mar 2024 11:54:01 +0100
Subject: [PATCH] Improve checkDDBase script

---
 tests/checkDDBase.py | 121 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 105 insertions(+), 16 deletions(-)

diff --git a/tests/checkDDBase.py b/tests/checkDDBase.py
index e824cca..ffa5df2 100644
--- a/tests/checkDDBase.py
+++ b/tests/checkDDBase.py
@@ -6,21 +6,20 @@ import netCDF4 as nc
 import numpy as np
 import gzip
 import pickle
+import glob
 from datetime import datetime
 
 vi_to_exclude = [
-    'juno_ephem_orb1',
-    'juno_fgm_orbfull',
-    'juno_fgm_orb1',
-    'juno_fgm_orb60',
+    #'juno_fgm_orbfull',
     'juno_jedi_i090',
     'juno_jedi_i180',
     'juno_jedi_i270',
     'juno_jedi_e270',
     'juno_jedi_e180',
     'juno_jedi_e090',
-    'juno_fgm_cruise60',
     'ros_magob_1s',
+    'juno_fgm_orb1',
+    'gtl_epic_ed'
 ]
 
 def datetime_to_ddtime(date_time):
@@ -30,8 +29,25 @@ def datetime_to_ddtime(date_time):
 def is_sorted(l):
     return all(a <= b for a, b in zip(l, l[1:]))
 
+def diff_dicts(a, b, drop_similar=True):
+    res = a.copy()
+
+    for k in res:
+        if k not in b:
+            res[k] = (res[k], None)
+
+    for k in b:
+        if k in res:
+            res[k] = (res[k], b[k])
+        else:
+            res[k] = (None, b[k])
 
-def check_vi(e, cachedir):
+    if drop_similar:
+        res = {k:v for k,v in res.items() if v[0] != v[1]}
+
+    return res
+
+def check_vi(e, cachedir, clean):
     name = e.find("NAME").text
     logging.info('========== {} =========='.format(name))
     base = e.find("NAME").attrib['base']
@@ -129,6 +145,11 @@ def check_vi(e, cachedir):
     if os.path.isfile(os.path.join(location,'LOCK')):
         logging.warning("LOCK file detected")
 
+    files_in_dir = []
+    for f in glob.glob(os.path.join(location, '*.nc.gz')):
+        files_in_dir.append(f)
+
+    first_file_structure = None
     for i in range(len(files_names)):
         f = files_names[i]
         start = start_times[i]
@@ -136,16 +157,19 @@ def check_vi(e, cachedir):
 
         gzipped_f = os.path.join(location, f) + ".gz"
 
-        if f in cache_check:
-            if cache_check[f]['status'] and os.path.getmtime(gzipped_f) == cache_check[f]['mtime']:
-                continue
-
-        logging.info(f)
+        if gzipped_f in files_in_dir:
+            files_in_dir.remove(gzipped_f)
 
         if not os.path.isfile(gzipped_f):
             logging.error("Missing data file {}".format(gzipped_f))
             continue
 
+        if (f in cache_check) and (first_file_structure is not None):  # always check the first dataset file
+            if cache_check[f]['status'] and os.path.getmtime(gzipped_f) == cache_check[f]['mtime']:
+                continue
+
+        logging.info(f)
+
         try:
             cache_check[f] = {
                 'status': True,
@@ -167,6 +191,24 @@ def check_vi(e, cachedir):
             cache_check[f]['status'] = False
             continue
 
+        crt_file_structure = {}
+        for v in dds.variables:
+            if v in ['Time', 'StartTime', 'StopTime']:
+                continue
+            if not dds.dimensions[dds.variables[v].dimensions[0]].isunlimited():
+                continue
+            crt_file_structure[v] = {"shape": dds.variables[v].shape[1:], "dtype": dds.variables[v].dtype}
+
+        if first_file_structure is None:
+            first_file_structure = crt_file_structure
+        elif first_file_structure != crt_file_structure:
+            logging.error("Inconsistent file structure in {}".format(gzipped_f))
+            print(first_file_structure)
+            print(crt_file_structure)
+            print(diff_dicts(first_file_structure, crt_file_structure))
+            cache_check[f]['status'] = False
+            continue
+
         times = []
         for t in np.array(dds.variables["Time"]):
             if dds['Time'].dtype == np.float64:
@@ -175,36 +217,80 @@ def check_vi(e, cachedir):
             else:
                 # DDTime
                 t_str = "".join([k.decode("UTF-8") for k in t])
-                times.append(int(t_str))
+                try:
+                    times.append(int(t_str))
+                except ValueError:
+                    logging.error("Bad time format in data file {}".format(gzipped_f))
+                    cache_check[f]['status'] = False
+                    continue
 
+        remove_duplicate = False
         if len(times) != len(set(times)):
             logging.warning("Duplicate times in {}".format(gzipped_f))
+            #print(set([x for x in times if times.count(x) > 1]))
             cache_check[f]['status'] = False
+            if clean:
+                remove_duplicate = True
 
+        sort_times = False
         if not is_sorted(times):
+            prev = 0.
+            for t in times:
+                if prev > t:
+                    print(t)
+                prev = t
             logging.warning("Time not sorted in {}".format(gzipped_f))
             cache_check[f]['status'] = False
+            if clean:
+                sort_times = True
 
+        remove_outside_times = False
         for t in times:
             if t < start or t > stop:
-                logging.warning("Time outside [StartTime, StopTime] detected in {}".format(gzipped_f))
+                logging.warning("Time {} outside [{}, {}] detected in {}".format(t, start, stop, gzipped_f))
                 cache_check[f]['status'] = False
+                if clean:
+                    remove_outside_times = True
                 break
 
+        dds.close()
+
+        if clean and (remove_duplicate or sort_times or remove_outside_times):
+            clean_f = os.path.join("./", "clean_"+f)
+            logging.warning("Writing cleaned file {}".format(clean_f))
+            with open(clean_f, "wb") as clean_nc_file:
+                clean_nc_file.write(ncdata)
+            dds_clean = nc.Dataset(clean_f, mode='a')
+            clean_data_file(dds_clean, remove_duplicate, sort_times, remove_outside_times)
+            dds_clean.close()
+
     with open(cachefile, 'wb') as handle:
         pickle.dump(cache_check, handle, protocol=pickle.HIGHEST_PROTOCOL)
 
+    if files_in_dir:
+        for f in files_in_dir:
+            logging.warning("File {} in {} but not in *_times.nc file".format(f, location))
+
     return True
 
 
-def check_ddbase(ddsys, vi=None, cachedir=None):
+def clean_data_file(dds_clean, remove_duplicate, sort_times, remove_outside_times):
+    records_to_remove = []
+    if remove_outside_times:
+        pass
+    if remove_duplicate:
+        pass
+    if sort_times:
+        pass
+
+
+def check_ddbase(ddsys, vi=None, cachedir=None, clean=False):
     tree=etree.parse(ddsys)
     for e in tree.iter(tag="VI"):
         if vi and e.find("NAME").text != vi:
             continue
         if not vi and e.find("NAME").text in vi_to_exclude:
             continue
-        check_vi(e, cachedir)
+        check_vi(e, cachedir, clean)
 
 
 if __name__ == '__main__':
@@ -215,6 +301,9 @@ if __name__ == '__main__':
     if not os.path.exists(cachedir):
         os.makedirs(cachedir)
     vi = None
+    clean = False
     if len(sys.argv) > 1:
         vi = sys.argv[1]
-    check_ddbase(ddsys, vi=vi, cachedir=cachedir)
+    if (len(sys.argv) > 2) and (sys.argv[2] == "clean"):
+        clean = True
+    check_ddbase(ddsys, vi=vi, cachedir=cachedir, clean=clean)
-- 
libgit2 0.21.2
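
Note on usage: the cleaning pass added by this patch is opt-in and is read from
the second positional argument, so it can only be requested together with an
explicit virtual-instrument name. Assuming the script is launched directly with
a Python interpreter:

    python tests/checkDDBase.py <vi_name>          # check a single VI
    python tests/checkDDBase.py <vi_name> clean    # check it and write cleaned copies

With no arguments, every VI except those listed in vi_to_exclude is checked and
cleaning stays off.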
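
Note on clean_data_file: the patch leaves it as a stub (all three branches are
"pass"). The sketch below shows one way the three cleanup operations could be
implemented; it is illustrative only, not part of the patch. clean_records,
src_path and dst_path are hypothetical names, and the sketch assumes the DDTime
(character-array) time encoding and that every variable has at least one
dimension. Because netCDF4 cannot delete records from an unlimited dimension in
place, it rewrites the kept records into a fresh file rather than editing the
copied file in append mode as the stub's signature suggests.

import numpy as np
import netCDF4 as nc

def clean_records(src_path, dst_path, start, stop):
    # Keep one record per timestamp, drop records outside [start, stop],
    # and write the survivors to dst_path in chronological order.
    src = nc.Dataset(src_path, mode='r')

    # Decode DDTime records the same way check_vi does above.
    times = np.array([int("".join(k.decode("UTF-8") for k in t))
                      for t in np.array(src.variables["Time"])])

    # np.unique returns the sorted unique values together with the index
    # of each first occurrence: de-duplication and sorting in one step.
    _, keep = np.unique(times, return_index=True)
    keep = [int(i) for i in keep if start <= times[i] <= stop]

    dst = nc.Dataset(dst_path, mode='w')
    for name, dim in src.dimensions.items():
        dst.createDimension(name, None if dim.isunlimited() else len(dim))
    for name, var in src.variables.items():
        out = dst.createVariable(name, var.dtype, var.dimensions)
        out.setncatts({a: var.getncattr(a) for a in var.ncattrs()})
        data = var[:]
        if src.dimensions[var.dimensions[0]].isunlimited():
            # Subset along the record dimension only; assigning past the
            # current length of an unlimited dimension grows it.
            out[0:len(keep)] = data[keep]
        else:
            out[:] = data
    src.close()
    dst.close()

The keep index is computed once from Time and applied to every record variable,
so all variables stay aligned after cleaning.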