Commit 9b88323ba893a7e0394ce521851bc6d2ac8c38ed
1 parent 33de3750
Exists in master and in 4 other branches

Add script to check DDBASE integrity

Showing 1 changed file with 220 additions and 0 deletions
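The script walks the VI (virtual instrument) entries of DDsys.xml and, for each entry whose NAME has base="LOCAL", cross-checks its times file against the gzipped NetCDF data files on disk. For orientation, a sketch of the element shape the code assumes (element names come from the find() calls below; the values are illustrative only):

    <VI>
      <NAME base="LOCAL">some_vi</NAME>
      <LOCATION>/path/to/some_vi/</LOCATION>
      <TIMES>some_vi_times.nc</TIMES>
      <INFO>some_vi_info.nc</INFO>
      <CACHE>some_vi_cache.nc</CACHE>
    </VI>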
import os
import sys
import logging
from lxml import etree
import netCDF4 as nc
import numpy as np
import gzip
import pickle
from datetime import datetime

# Virtual instruments skipped during a full-base check; they can still be
# checked individually by passing their name on the command line.
vi_to_exclude = [
    'juno_ephem_orb1',
    'juno_fgm_orbfull',
    'juno_fgm_orb1',
    'juno_fgm_orb60',
    'juno_jedi_i090',
    'juno_jedi_i180',
    'juno_jedi_i270',
    'juno_jedi_e270',
    'juno_jedi_e180',
    'juno_jedi_e090',
    'juno_fgm_cruise60',
    'ros_magob_1s',
]

def datetime_to_ddtime(date_time):
    # DDTime format: YYYYDDDHHMMSSmmm (zero-based day of year, milliseconds)
    return "%04d%03d%02d%02d%02d%03d" % (date_time.year, date_time.timetuple().tm_yday - 1,
                                         date_time.hour, date_time.minute, date_time.second,
                                         date_time.microsecond // 1000)

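# Worked example of the encoding above (for reference):
#   datetime(2010, 1, 2, 3, 4, 5, 678000)  ->  "2010001030405678"
# i.e. year "2010", zero-based day-of-year "001", hour "03", minute "04",
# second "05", milliseconds "678".
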
def is_sorted(values):
    # Non-strict ordering: equal consecutive values are allowed
    return all(a <= b for a, b in zip(values, values[1:]))

def check_vi(e, cachedir):
    name = e.find("NAME").text
    logging.info('========== {} =========='.format(name))
    base = e.find("NAME").attrib['base']
    if base != 'LOCAL':
        return True
    location = e.find("LOCATION").text
    times = e.find("TIMES").text
    info = e.find("INFO").text
    cache = e.find("CACHE").text

    if name == 'iball_acc_all':
        # specific VI used to manage users
        return True

    # Per-VI cache of previously validated data files: {filename: {'status', 'mtime'}}
    cachefile = os.path.join(cachedir, name)
    if os.path.isfile(cachefile):
        with open(cachefile, 'rb') as handle:
            cache_check = pickle.load(handle)
    else:
        cache_check = {}

    if not os.path.isdir(location):
        logging.error('{} does not exist'.format(location))
        return False

    times_path = os.path.join(location, times)
    info_path = os.path.join(location, info)
    cache_path = os.path.join(location, cache)

    for f in [times_path, info_path, cache_path]:
        if not os.path.isfile(f):
            logging.error('{} does not exist'.format(f))
            return False

    ds = nc.Dataset(times_path)

    for v in ['StartTime', 'StopTime', 'FileName']:
        if v not in ds.variables:
            logging.error('Missing {} variable in times file'.format(v))
            return False

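    # The times file stores StartTime/StopTime/FileName as fixed-width character
    # arrays, one row per data file; each row is decoded and joined into a string
    # (start/stop times are then parsed as DDTime integers).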
    start_times = []
    for st in np.array(ds.variables["StartTime"]):
        try:
            st_str = "".join([k.decode("UTF-8") for k in st])
            start_times.append(int(st_str))
        except Exception:
            logging.error('Cannot parse StartTime in times file')
            return False

    stop_times = []
    for et in np.array(ds.variables["StopTime"]):
        try:
            et_str = "".join([k.decode("UTF-8") for k in et])
            stop_times.append(int(et_str))
        except Exception:
            logging.error('Cannot parse StopTime in times file')
            return False

    files_names = []
    for fn in np.array(ds.variables["FileName"]):
        try:
            fn_str = "".join([k.decode("UTF-8") for k in fn])
            files_names.append(fn_str)
        except Exception:
            logging.error('Cannot parse FileName in times file')
            return False

    if len(start_times) != len(stop_times) or len(start_times) != len(files_names):
        logging.error('Inconsistent variable lengths in times file')
        return False

    if len(start_times) == 0:
        logging.warning('Dataset is empty')
        return True

    prev = None
    for d in start_times:
        if prev is not None and d < prev:
            logging.warning("Start times not sorted at {}".format(d))
        prev = d

    prev = None
    for d in stop_times:
        if prev is not None and d < prev:
            logging.warning("Stop times not sorted at {}".format(d))
        prev = d

    for i in range(len(start_times)):
        if start_times[i] > stop_times[i]:
            logging.warning("Start time is greater than stop time: {} - {}".format(start_times[i], stop_times[i]))

    if os.path.isfile(os.path.join(location, 'LOCK')):
        logging.warning("LOCK file detected")

    for i in range(len(files_names)):
        f = files_names[i]
        start = start_times[i]
        stop = stop_times[i]

        gzipped_f = os.path.join(location, f) + ".gz"

        if not os.path.isfile(gzipped_f):
            logging.error("Missing data file {}".format(gzipped_f))
            continue

        # Skip files already validated in a previous run, unless modified since
        if f in cache_check:
            if cache_check[f]['status'] and os.path.getmtime(gzipped_f) == cache_check[f]['mtime']:
                continue

        logging.info(f)

        try:
            cache_check[f] = {
                'status': True,
                'mtime': os.path.getmtime(gzipped_f)
            }

            # Data files are gzipped NetCDF; decompress in memory
            with gzip.open(gzipped_f, 'rb') as gf:
                ncdata = gf.read()

            dds = nc.Dataset("in-mem-file", mode='r', memory=ncdata)
        except Exception:
            logging.error("Cannot load or extract data file {}".format(gzipped_f))
            cache_check[f]['status'] = False
            continue

        # Skip the file entirely if any required variable is missing
        missing = False
        for v in ['Time', 'StartTime', 'StopTime']:
            if v not in dds.variables:
                logging.error("Missing {} variable in data file {}".format(v, gzipped_f))
                cache_check[f]['status'] = False
                missing = True
        if missing:
            continue

        # Time is stored either as float64 Unix timestamps or as DDTime strings
        is_timestamp = dds['Time'].dtype == np.float64
        times = []
        for t in np.array(dds.variables["Time"]):
            if is_timestamp:
                t_str = datetime_to_ddtime(datetime.utcfromtimestamp(t))
            else:
                t_str = "".join([k.decode("UTF-8") for k in t])
            times.append(int(t_str))

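        # DDTime strings are fixed-width and zero-padded, so comparing them as
        # integers preserves chronological order; the duplicate, ordering and
        # range checks below rely on this.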
        if len(times) != len(set(times)):
            logging.warning("Duplicate times in {}".format(gzipped_f))
            cache_check[f]['status'] = False

        if not is_sorted(times):
            logging.warning("Time not sorted in {}".format(gzipped_f))
            cache_check[f]['status'] = False

        for t in times:
            if t < start or t > stop:
                logging.warning("Time outside [StartTime, StopTime] detected in {}".format(gzipped_f))
                cache_check[f]['status'] = False
                break

    with open(cachefile, 'wb') as handle:
        pickle.dump(cache_check, handle, protocol=pickle.HIGHEST_PROTOCOL)

    return True

def check_ddbase(ddsys, vi=None, cachedir=None):
    tree = etree.parse(ddsys)
    for e in tree.iter(tag="VI"):
        # If a VI name was given, check only that one; otherwise check
        # everything except the excluded instruments
        if vi and e.find("NAME").text != vi:
            continue
        if not vi and e.find("NAME").text in vi_to_exclude:
            continue
        check_vi(e, cachedir)

if __name__ == '__main__':
    logging.basicConfig(format='[%(levelname)s] %(message)s', level=logging.INFO)
    ddbase = '/data1/DDBASE'
    ddsys = os.path.join(ddbase, 'DATA', 'DDsys.xml')
    cachedir = os.path.join(os.path.dirname(__file__), '.cache')
    if not os.path.exists(cachedir):
        os.makedirs(cachedir)
    vi = None
    if len(sys.argv) > 1:
        vi = sys.argv[1]
    check_ddbase(ddsys, vi=vi, cachedir=cachedir)
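Usage sketch (the script's file name is not shown in this commit; check_ddbase.py is assumed here for illustration). With no argument the whole base is checked, minus the excluded instruments; with an argument only that virtual instrument is checked:

    python check_ddbase.py                # full check
    python check_ddbase.py ros_magob_1s   # single VI, bypasses the exclude list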