Commit 9b88323ba893a7e0394ce521851bc6d2ac8c38ed

Authored by Benjamin Renard
1 parent 33de3750

Add script to check DDBASE integrity

Showing 1 changed file with 220 additions and 0 deletions
tests/checkDDBase.py 0 → 100644
@@ -0,0 +1,220 @@
import os
import sys
import logging
from lxml import etree
import netCDF4 as nc
import numpy as np
import gzip
import pickle
from datetime import datetime

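# Virtual instruments to skip when checking the whole base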
vi_to_exclude = [
    'juno_ephem_orb1',
    'juno_fgm_orbfull',
    'juno_fgm_orb1',
    'juno_fgm_orb60',
    'juno_jedi_i090',
    'juno_jedi_i180',
    'juno_jedi_i270',
    'juno_jedi_e270',
    'juno_jedi_e180',
    'juno_jedi_e090',
    'juno_fgm_cruise60',
    'ros_magob_1s',
]


def datetime_to_ddtime(date_time):
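    """Convert a datetime to a DDTime string: YYYYDDDHHMMSSmmm (day of year is zero-based)."""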
  27 + return "%04d%03d%02d%02d%02d%03d" % (date_time.year, int(date_time.timetuple().tm_yday)-1, date_time.hour, date_time.minute, date_time.second, date_time.microsecond/1000.)
  28 +
  29 +
def is_sorted(values):
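    """Return True if the sequence is sorted in non-decreasing order."""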
    return all(a <= b for a, b in zip(values, values[1:]))


def check_vi(e, cachedir):
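    """Check the integrity of a single VI (virtual instrument) entry of DDsys.xml.

    Returns False when a fatal inconsistency is found; per-file issues are only logged.
    """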
    name = e.find("NAME").text
    logging.info('========== {} =========='.format(name))
    base = e.find("NAME").attrib['base']
    if base != 'LOCAL':
        return True
    location = e.find("LOCATION").text
    times = e.find("TIMES").text
    info = e.find("INFO").text
    cache = e.find("CACHE").text

    if name == 'iball_acc_all':
        # specific VI used to manage users
        return True

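    # Load the per-VI cache of data files already validated by a previous run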
    cachefile = os.path.join(cachedir, name)
    if os.path.isfile(cachefile):
        with open(cachefile, 'rb') as handle:
            cache_check = pickle.load(handle)
    else:
        cache_check = {}

    if not os.path.isdir(location):
        logging.error('{} does not exist'.format(location))
        return False

    times_path = os.path.join(location, times)
    info_path = os.path.join(location, info)
    cache_path = os.path.join(location, cache)

    for f in [times_path, info_path, cache_path]:
        if not os.path.isfile(f):
            logging.error('{} does not exist'.format(f))
            return False

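    # The times file maps each data file to its [StartTime, StopTime] coverage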
    ds = nc.Dataset(times_path)

    for v in ['StartTime', 'StopTime', 'FileName']:
        if v not in ds.variables:
            logging.error('Missing {} variable in times file'.format(v))
            return False

    start_times = []
    for st in np.array(ds.variables["StartTime"]):
        try:
            st_str = "".join([k.decode("UTF-8") for k in st])
            start_times.append(int(st_str))
        except Exception:
            logging.error('Cannot parse StartTime in times file')
            return False

    stop_times = []
    for et in np.array(ds.variables["StopTime"]):
        try:
            et_str = "".join([k.decode("UTF-8") for k in et])
            stop_times.append(int(et_str))
        except Exception:
            logging.error('Cannot parse StopTime in times file')
            return False

    files_names = []
    for fn in np.array(ds.variables["FileName"]):
        try:
            fn_str = "".join([k.decode("UTF-8") for k in fn])
            files_names.append(fn_str)
        except Exception:
            logging.error('Cannot parse FileName in times file')
            return False

    if len(start_times) != len(stop_times) or len(start_times) != len(files_names):
        logging.error('Inconsistent variable sizes in times file')
        return False

    if len(start_times) == 0:
        logging.warning('Dataset is empty')
        return True

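    # Start and stop times are expected to be sorted in non-decreasing order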
    prev = None
    for d in start_times:
        if prev is not None and d < prev:
            logging.warning("Start time lower than the previous one: {}".format(d))
        prev = d

    prev = None
    for d in stop_times:
        if prev is not None and d < prev:
            logging.warning("Stop time lower than the previous one: {}".format(d))
        prev = d

    for i in range(len(start_times)):
        if start_times[i] > stop_times[i]:
            logging.warning("Start time is greater than stop time: {} - {}".format(start_times[i], stop_times[i]))

    if os.path.isfile(os.path.join(location, 'LOCK')):
        logging.warning("LOCK file detected")

    for f, start, stop in zip(files_names, start_times, stop_times):

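        # Data files are stored gzipped alongside the times file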
        gzipped_f = os.path.join(location, f) + ".gz"

        if not os.path.isfile(gzipped_f):
            logging.error("Missing data file {}".format(gzipped_f))
            continue

        if f in cache_check:
            if cache_check[f]['status'] and os.path.getmtime(gzipped_f) == cache_check[f]['mtime']:
                continue

        logging.info(f)

        try:
            cache_check[f] = {
                'status': True,
                'mtime': os.path.getmtime(gzipped_f)
            }

            with gzip.open(gzipped_f, 'rb') as gf:
                ncdata = gf.read()

            dds = nc.Dataset("in-mem-file", mode='r', memory=ncdata)
        except Exception:
            logging.error("Cannot load or extract data file {}".format(gzipped_f))
            cache_check[f]['status'] = False
            continue

        missing_var = False
        for v in ['Time', 'StartTime', 'StopTime']:
            if v not in dds.variables:
                logging.error("Missing {} variable in data file {}".format(v, gzipped_f))
                cache_check[f]['status'] = False
                missing_var = True
        if missing_var:
            continue

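        # Time values are stored either as float64 timestamps or as DDTime character arrays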
        data_times = []
        is_timestamp = dds['Time'].dtype == np.float64
        for t in np.array(dds.variables["Time"]):
            if is_timestamp:
                # TimeStamp
                t_str = datetime_to_ddtime(datetime.utcfromtimestamp(t))
            else:
                # DDTime
                t_str = "".join([k.decode("UTF-8") for k in t])
            data_times.append(int(t_str))

        if len(data_times) != len(set(data_times)):
            logging.warning("Duplicate times in {}".format(gzipped_f))
            cache_check[f]['status'] = False

        if not is_sorted(data_times):
            logging.warning("Times not sorted in {}".format(gzipped_f))
            cache_check[f]['status'] = False

        for t in data_times:
            if t < start or t > stop:
                logging.warning("Time outside [StartTime, StopTime] detected in {}".format(gzipped_f))
                cache_check[f]['status'] = False
                break

        # Release the in-memory dataset
        dds.close()

    with open(cachefile, 'wb') as handle:
        pickle.dump(cache_check, handle, protocol=pickle.HIGHEST_PROTOCOL)

    return True


def check_ddbase(ddsys, vi=None, cachedir=None):
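    """Check every VI declared in DDsys.xml, or only `vi` when given; excluded VIs are skipped in a full run."""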
    tree = etree.parse(ddsys)
    for e in tree.iter(tag="VI"):
        if vi and e.find("NAME").text != vi:
            continue
        if not vi and e.find("NAME").text in vi_to_exclude:
            continue
        check_vi(e, cachedir)


if __name__ == '__main__':
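    # Usage: python checkDDBase.py [vi_name]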
    logging.basicConfig(format='[%(levelname)s] %(message)s', level=logging.INFO)
    ddbase = '/data1/DDBASE'
    ddsys = os.path.join(ddbase, 'DATA', 'DDsys.xml')
    cachedir = os.path.join(os.path.dirname(__file__), '.cache')
    if not os.path.exists(cachedir):
        os.makedirs(cachedir)
    vi = None
    if len(sys.argv) > 1:
        vi = sys.argv[1]
    check_ddbase(ddsys, vi=vi, cachedir=cachedir)