Commit 8b818b98d7bfd7912b6728e7055c3414f30753c4
1 parent
016e9465
Exists in
master
Add all files required to fill DaCHS database.
Showing
4 changed files
with
737 additions
and
0 deletions
Show diff stats
.gitignore
... | ... | @@ -0,0 +1,85 @@ |
<?xml version="1.0" encoding="UTF-8"?>
<resource schema="amdadb">
  <!-- DaCHS resource descriptor: declares the epn_core table of the AMDA
       EPN-TAP service and registers it with the TAP service. -->
  <!-- Metadata describing the dataset -->
  <meta name="title">Planetary and heliophysics plasma data at CDPP/AMDA</meta>
  <meta name="creationDate">2016-08-05T16:00:00</meta>
  <meta name="description" format="plain">Planetary and heliophysics plasma data at CDPP/AMDA</meta>
  <meta name="creator.name">Vincent Genot</meta>
  <meta name="contact.name">Vincent Genot</meta>
  <meta name="contact.email">vincent.genot@irap.omp.eu</meta>
  <meta name="contact.address">IRAP, 9 av. Colonel Roche, 31400 Toulouse, FRANCE</meta>
  <meta name="subject">Virtual observatory</meta>
  <meta name="subject">Plasma physics</meta>
  <meta name="utype">ivo://cdpp.irap/std/EpnCore#schema-2.0</meta> <!-- not tested -->
  <table id="epn_core" onDisk="True" adql="True">
    <meta name="info" infoName="SERVICE_PROTOCOL" infoValue="2.0"> EPN-TAP </meta>
    <meta name="description">Planetary and heliophysics plasma data at CDPP/AMDA</meta>
    <meta name="referenceURL">http://amda.cdpp.eu</meta>
    <meta name="utype">EPN-TAP 2.0</meta>
    <!-- header parameters -->
    <column name="granule_uid" type="text" required="True" ucd="meta.id" description="Granule unique identifier, provides direct access"/>
    <column name="dataproduct_type" type="text" ucd="meta.code.class" description="Organisation of the data product (from enumerated list)"/>
    <column name="target_name" type="text" ucd="meta.id;src" description="Name of target (IAU standard)"/>
    <column name="time_min" type="double precision" ucd="time.start" unit="d" description="Acquisition start time (in JD) (not necessary)"/>
    <column name="time_max" type="double precision" ucd="time.end" unit="d" description="Acquisition stop time (in JD) (not necessary)"/>
    <!-- important parameters -->
    <column name="access_url" type="text" ucd="meta.ref.url;meta.file"/>
    <column name="target_class" type="text" ucd="meta.code.class;src" description="Type of target, from enumerated list"/>
    <column name="target_region" type="text" ucd="meta.id;src"/>
    <column name="spase_region" type="text" ucd="phys.angArea;obs" description="(not necessary)"/>
    <column name="instrument_host_name" type="text" ucd="meta.id;instr.obsty" description="(not necessary)"/>
    <column name="instrument_name" type="text" ucd="meta.id;instr" description="(not necessary)"/>
    <column name="measurement_type" type="text" ucd="meta.ucd" description="(not necessary)"/>
    <column name="spase_measurement_type" type="text" ucd="meta.ucd" description="(not necessary)"/>
    <column name="spatial_frame_type" type="text" ucd="meta.code.class;pos.frame" description="(can be necessary)"/>
    <column name="processing_level" type="integer" ucd="meta.code;obs.calib" required="True"/>
    <column name="release_date" type="date" ucd="time.release"/>
    <column name="access_estsize" type="integer" ucd="phys.size;meta.file" required="True"/>
    <column name="access_format" type="text" ucd="meta.code.mime"/>
    <column name="time_sampling_step_min" type="double precision" ucd="time.interval;stat.min" unit="s" description="Min time sampling step (not necessary)"/>
    <column name="time_sampling_step_max" type="double precision" ucd="time.interval;stat.max" unit="s" description="Max time sampling step (not necessary)"/>
    <column name="time_exp_min" type="double precision" ucd="time.duration;stat.min" unit="s" description="Min integration time (not necessary)"/>
    <!-- redundant or static parameters -->
    <column name="time_exp_max" type="double precision" ucd="time.duration;stat.max" unit="s" description="Max integration time (not necessary)"/>
    <column name="granule_gid" type="text" required="True" ucd="meta.id" description="Group identifier, identical for similar data products"/>
    <column name="obs_id" type="text" required="True" ucd="meta.id" description="Identical for data products related to the same original data"/>
    <column name="creation_date" type="date" ucd="time.creation"/>
    <column name="modification_date" type="date" ucd="time.update"/>
    <column name="service_title" type="text" ucd="meta.title"/>
    <column name="publisher" type="text" ucd="meta.name"/>
    <column name="time_scale" type="text" ucd="time.scale"/>
    <!-- null parameters -->
    <column name="spectral_range_min" type="double precision" ucd="em.freq;stat.min" unit="Hz" description="Min spectral range (not necessary)"/>
    <column name="spectral_range_max" type="double precision" ucd="em.freq;stat.max" unit="Hz" description="Max spectral range (not necessary)"/>
    <column name="spectral_sampling_step_min" type="double precision" ucd="em.freq.step;stat.min" unit="Hz" description="Min spectral sampling step (not necessary)"/>
    <column name="spectral_sampling_step_max" type="double precision" ucd="em.freq.step;stat.max" unit="Hz" description="Max spectral sampling step (not necessary)"/>
    <column name="spectral_resolution_min" type="double precision" ucd="spect.resolution;stat.min" unit="Hz" description="Min spectral resolution (not necessary)"/>
    <column name="spectral_resolution_max" type="double precision" ucd="spect.resolution;stat.max" unit="Hz" description="Max spectral resolution (not necessary)"/>
    <column name="c1min" type="double precision" ucd="pos;stat.min" unit="deg" description="(not necessary)"/>
    <column name="c1max" type="double precision" ucd="pos;stat.max" unit="deg" description="(not necessary)"/>
    <column name="c2min" type="double precision" ucd="pos;stat.min" unit="deg" description="(not necessary)"/>
    <column name="c2max" type="double precision" ucd="pos;stat.max" unit="deg" description="(not necessary)"/>
    <column name="c3min" type="double precision" ucd="pos;stat.min" unit="" description="(not necessary)"/>
    <column name="c3max" type="double precision" ucd="pos;stat.max" unit="" description="(not necessary)"/>
    <column name="c1_resol_min" type="double precision" ucd="pos.resolution;stat.min" unit="deg" description="(not necessary)"/>
    <column name="c1_resol_max" type="double precision" ucd="pos.resolution;stat.max" unit="deg" description="(not necessary)"/>
    <column name="c2_resol_min" type="double precision" ucd="pos.resolution;stat.min" unit="deg" description="Min resolution in latitude"/>
    <column name="c2_resol_max" type="double precision" ucd="pos.resolution;stat.max" unit="deg" description="(not necessary)"/>
    <column name="c3_resol_min" type="double precision" ucd="pos.resolution;stat.min" unit="" description="(not necessary)"/>
    <column name="c3_resol_max" type="double precision" ucd="pos.resolution;stat.max" unit="" description="(not necessary)"/>
    <column name="s_region" type="text" ucd="phys.angArea;obs" description="(not necessary)"/>
    <column name="incidence_min" type="double precision" ucd="pos.posAng;stat.min" unit="deg" description="(not necessary)"/>
    <column name="incidence_max" type="double precision" ucd="pos.posAng;stat.max" unit="deg" description="(not necessary)"/>
    <column name="emergence_min" type="double precision" ucd="pos.posAng;stat.min" unit="deg" description="(not necessary)"/>
    <column name="emergence_max" type="double precision" ucd="pos.posAng;stat.max" unit="deg" description="(not necessary)"/>
    <column name="phase_min" type="double precision" ucd="pos.phaseAng;stat.min" unit="deg" description="(not necessary)"/>
    <column name="phase_max" type="double precision" ucd="pos.phaseAng;stat.max" unit="deg" description="(not necessary)"/>
  </table>
  <!-- "import" fills the on-disk table; "collection" registers it with the TAP service. -->
  <data id="import">
    <make table="epn_core"/>
  </data>
  <data id="collection" auto="false">
    <register services="__system__/tap#run"/>
    <make table="epn_core"/>
  </data>
</resource>
... | ... | @@ -0,0 +1,73 @@ |
-- SQL procedure to define amdadb data table.
-- Name: amdadb; Type: SCHEMA; Schema: amdadb; Owner: postgres
-- View exposing amdadb.data_table as the EPN-TAP 2.0 epn_core interface:
-- columns not stored in data_table are synthesised (constants, copies of
-- stored columns, or NULLs of the proper type).

SET client_encoding = 'UTF8';

DROP VIEW IF EXISTS amdadb.epn_core CASCADE;
CREATE VIEW amdadb.epn_core AS SELECT
    -- header parameters
    CAST(obs_id || '_cdf' AS TEXT) AS granule_uid,
    dataproduct_type,
    target_name,
    time_min,
    time_max,
    -- important parameters
    access_url,
    target_class,
    target_region,
    spase_region,
    instrument_host_name,
    instrument_name,
    measurement_type,
    spase_measurement_type,
    spatial_frame_type,
    processing_level,
    release_date,
    access_estsize,
    access_format,
    time_sampling_step_min,
    time_sampling_step_max,
    time_exp_min,
    -- redundant or static parameters
    -- NOTE(review): time_exp_max deliberately mirrors time_exp_min (only one
    -- exposure value is stored); confirm this matches the service contract.
    CAST(time_exp_min AS DOUBLE PRECISION) AS time_exp_max,
    CAST('cdf' AS TEXT) AS granule_gid,
    obs_id,
    -- CAST('application/x-netcdf' AS TEXT) AS access_format,
    CAST(release_date AS DATE) AS creation_date,
    CAST(release_date AS DATE) AS modification_date,
    CAST('AMDADB' AS TEXT) AS service_title,
    CAST('CDPP' AS TEXT) AS publisher,
    CAST('UTC' AS TEXT) AS time_scale,
    -- null parameters
    CAST(NULL AS DOUBLE PRECISION) AS spectral_range_min,
    CAST(NULL AS DOUBLE PRECISION) AS spectral_range_max,
    CAST(NULL AS DOUBLE PRECISION) AS spectral_sampling_step_min,
    CAST(NULL AS DOUBLE PRECISION) AS spectral_sampling_step_max,
    CAST(NULL AS DOUBLE PRECISION) AS spectral_resolution_min,
    CAST(NULL AS DOUBLE PRECISION) AS spectral_resolution_max,
    CAST(NULL AS DOUBLE PRECISION) AS c1min,
    CAST(NULL AS DOUBLE PRECISION) AS c1max,
    CAST(NULL AS DOUBLE PRECISION) AS c2min,
    CAST(NULL AS DOUBLE PRECISION) AS c2max,
    CAST(NULL AS DOUBLE PRECISION) AS c3min,
    CAST(NULL AS DOUBLE PRECISION) AS c3max,
    CAST(NULL AS DOUBLE PRECISION) AS c1_resol_min,
    CAST(NULL AS DOUBLE PRECISION) AS c1_resol_max,
    CAST(NULL AS DOUBLE PRECISION) AS c2_resol_min,
    CAST(NULL AS DOUBLE PRECISION) AS c2_resol_max,
    CAST(NULL AS DOUBLE PRECISION) AS c3_resol_min,
    CAST(NULL AS DOUBLE PRECISION) AS c3_resol_max,
    CAST(NULL AS TEXT) AS s_region,
    CAST(NULL AS DOUBLE PRECISION) AS incidence_min,
    CAST(NULL AS DOUBLE PRECISION) AS incidence_max,
    CAST(NULL AS DOUBLE PRECISION) AS emergence_min,
    CAST(NULL AS DOUBLE PRECISION) AS emergence_max,
    CAST(NULL AS DOUBLE PRECISION) AS phase_min,
    CAST(NULL AS DOUBLE PRECISION) AS phase_max,
    -- parameters added to prevent warnings in the q.rd validator
    CAST(NULL AS TEXT) AS thumbnail_url,
    CAST(NULL AS TEXT) AS file_name,
    CAST(NULL AS TEXT) AS species,
    CAST(NULL AS TEXT) AS feature_name,
    CAST(NULL AS TEXT) AS bib_reference
FROM amdadb.data_table;
... | ... | @@ -0,0 +1,576 @@ |
1 | +#!/usr/bin/env python | |
2 | +# -*- coding: utf-8 -*- | |
3 | + | |
4 | +"""This script inspects a SPASE dataset folder (containing Granules, NumericalData, Instrument and | |
5 | +Observatory folders), then generates a SQL script which inserts all the granules into a database, | |
6 | +formatted as EPN-TAP parameters. | |
7 | + | |
8 | +See | |
9 | +http://spase-group.org/data/reference/spase-2_2_6/ for more information about spase specification, | |
10 | +and https://voparis-confluence.obspm.fr/display/VES/EPN-TAP+V2.0+parameters for more information | |
11 | +about epn-tap-v2 specification.""" | |
12 | + | |
13 | +import math | |
14 | +import re | |
15 | +import xml.etree.ElementTree as ElTr | |
16 | +import os.path as op | |
17 | +from os import walk | |
18 | +from datetime import datetime, timedelta | |
19 | +from typing import Tuple, List, Dict, Optional | |
20 | +import sys | |
21 | + | |
22 | +# Type aliases | |
23 | +SQLDic = Dict[str, object] | |
24 | +SpaseDic = Dict[str, List[ElTr.Element]] | |
25 | + | |
26 | +# Paths | |
27 | +WORKING_DIR = op.dirname(op.dirname(op.abspath(__file__))) | |
28 | +OUTPUT_PATH = op.join(WORKING_DIR, 'SERVER') | |
29 | +SQL_FILE_PATH = op.join(OUTPUT_PATH, 'amdadb_db.sql') | |
30 | +SPASE_DIR = op.join(WORKING_DIR, 'DATA') | |
31 | +LOG_FILE_PATH = op.join(WORKING_DIR, 'build_granules.log') # Set to None if you want to log in stdout instead of a file | |
32 | + | |
33 | +# XML and SQL formats | |
34 | +XMLNS = 'http://www.spase-group.org/data/schema' | |
35 | +XML_DATE_FORMAT = '%Y-%m-%dT%H:%M:%SZ' | |
36 | +SQL_DATE_FORMAT = '%Y-%m-%d' | |
37 | +SEP = '#' | |
38 | + | |
# Dictionaries of values

# SPASE DisplayType -> EPN-TAP dataproduct_type code.
DATAPRODUCT_TYPE_DIC = {'Image': 'im', 'Plasmagram': 'ds', 'Spectrogram': 'ds',
                        'StackPlot': 'ts', 'TimeSeries': 'ts', 'time_series': 'ts',
                        'WaveForm': 'ts'}

# SPASE ProcessingLevel -> EPN-TAP processing_level (CODMAC-like integer).
PROCESSING_LEVEL_DIC = {'Calibrated': 3, 'Raw': 1, 'Uncalibrated': 5}

# Based on http://spase-group.org/
# SPASE target name -> EPN-TAP target_class. Built in the same key order as
# the hand-written literal it replaces.
_PLANETS = ('Earth', 'Saturn', 'Mercury', 'Uranus', 'Mars', 'Neptune', 'Jupiter', 'Venus')
_SATELLITES = ('Moon', 'Callisto', 'Europa', 'Ganymede', 'Dione', 'Enceladus',
               'Mimas', 'Miranda', 'Phobos', 'Iapetus', 'Titania', 'Oberon',
               'Puck', 'Deimos', 'Ariel', 'Umbriel', 'Rhea', 'Tethys', 'Titan', 'Io')

TARGET_CLASS_DIC = {'Heliosphere': 'interplanetary_medium', 'Interstellar': 'galaxy'}
TARGET_CLASS_DIC.update({planet: 'planet' for planet in _PLANETS})
TARGET_CLASS_DIC.update({moon: 'satellite' for moon in _SATELLITES})
TARGET_CLASS_DIC.update({'Pluto': 'dwarf_planet', 'Comet': 'comet'})
59 | + | |
60 | +MIME_TYPE_LIST = {'AVI': 'video/x-msvideo', | |
61 | + 'Binary': 'application/octet-stream', | |
62 | + 'CDF': 'application/x-cdf-istp', | |
63 | + 'CEF': 'application/x-cef1', | |
64 | + 'CEF1': 'application/x-cef1', | |
65 | + 'CEF2': 'application/x-cef2', | |
66 | + 'Excel': 'application/vnd.ms-excel', | |
67 | + 'FITS': 'application/x-fits-bintable', | |
68 | + 'GIF': 'image/gif', | |
69 | + 'HDF': 'application/x-hdf', | |
70 | + 'HDF4': 'application/x-hdf', | |
71 | + 'HDF5': 'application/x-hdf', | |
72 | + 'HTML': 'text/html', | |
73 | + 'Hardcopy': None, | |
74 | + 'Hardcopy.Film': None, | |
75 | + 'Hardcopy.Microfiche': None, | |
76 | + 'Hardcopy.Microfilm': None, | |
77 | + 'Hardcopy.Photograph': None, | |
78 | + 'Hardcopy.PhotographicPlate': None, | |
79 | + 'Hardcopy.Print': None, | |
80 | + 'IDFS': None, | |
81 | + 'IDL': 'application/octet-stream', | |
82 | + 'JPEG': 'image/jpeg ', | |
83 | + 'MATLAB_4': 'application/octet-stream', | |
84 | + 'MATLAB_6': 'application/octet-stream', | |
85 | + 'MATLAB_7': 'application/octet-stream', | |
86 | + 'MPEG': 'video/mpeg', | |
87 | + 'NCAR': None, | |
88 | + 'NetCDF': 'application/x-netcdf', | |
89 | + 'PDF': 'application/pdf', | |
90 | + 'PNG': 'image/png', | |
91 | + 'Postscript': 'application/postscript', | |
92 | + 'QuickTime': 'video/quicktime', | |
93 | + 'TIFF': 'image/tiff', | |
94 | + 'Text': 'text/plain', | |
95 | + 'Text.ASCII': 'text/plain', | |
96 | + 'Text.Unicode': 'text/plain', | |
97 | + 'UDF': None, | |
98 | + 'VOTable': 'application/x-votable+xml', | |
99 | + 'XML': 'text/xml'} | |
100 | + | |
# All default SQL values for missing parameters in dataset.
# Built incrementally (insertion order matters and matches the original literal).
DEFAULT_DATASET_VALUES = dict.fromkeys(
    ['dataproduct_type', 'target_name', 'target_class'], 'Unknown')
DEFAULT_DATASET_VALUES.update(dict.fromkeys(
    ['target_region', 'spase_region', 'instrument_host_name', 'instrument_name',
     'measurement_type', 'spatial_frame_type']))
DEFAULT_DATASET_VALUES['processing_level'] = 0
DEFAULT_DATASET_VALUES.update(dict.fromkeys(
    ['time_sampling_step_min', 'time_sampling_step_max', 'time_exp_min']))
DEFAULT_DATASET_VALUES['access_format'] = 'application/x-cdf-istp'

# All default SQL values for missing parameters in granule.
# obs_id has no default: the script exits directly when it is missing.
DEFAULT_GRANULE_VALUES = {
    'time_min': 0.0,
    'time_max': 0.0,
    'access_url': None,
    'access_estsize': 0,
    # NOTE(review): '01-01-0001' does not follow SQL_DATE_FORMAT ('%Y-%m-%d');
    # presumably PostgreSQL still parses it as a date — confirm, or prefer '0001-01-01'.
    'release_date': '01-01-0001'
    }
128 | + | |
# SQL code templates.
# The header drops and recreates the amdadb schema and the raw data_table;
# the epn_core view on top of it is defined in a separate SQL file.
_GENERATION_STAMP = datetime.now().strftime('%c')

SQL_HEADER = '''-- Generated by build_BDD.py on %s.
-- SQL procedure to define amdadb data table. Other parameters comes in the epn_core view.
-- Name: amdadb; Type: SCHEMA; Schema: amdadb; Owner: postgres

DROP SCHEMA IF EXISTS amdadb cascade;
CREATE SCHEMA amdadb;
SET search_path = public, pg_catalog;
SET default_tablespace = '';
SET default_with_oids = false;
SET client_encoding = 'UTF8';

-- Name: data_table; Type: TABLE; Schema: amdadb; Owner: postgres; Tablespace:
CREATE TABLE amdadb.data_table (
    -- header parameters
    id SERIAL PRIMARY KEY,
    obs_id TEXT,
    dataproduct_type TEXT,
    target_name TEXT,
    time_min DOUBLE PRECISION, -- date as JD
    time_max DOUBLE PRECISION, -- date as JD
    -- important parameters
    access_url TEXT,
    target_class TEXT,
    target_region TEXT,
    spase_region TEXT,
    instrument_host_name TEXT,
    instrument_name TEXT,
    measurement_type TEXT,
    spase_measurement_type TEXT,
    spatial_frame_type TEXT,
    processing_level INTEGER,
    release_date DATE,
    access_estsize INTEGER,
    access_format TEXT,
    time_sampling_step_min DOUBLE PRECISION, -- duration in seconds
    time_sampling_step_max DOUBLE PRECISION, -- duration in seconds
    time_exp_min DOUBLE PRECISION -- duration in seconds
);

''' % _GENERATION_STAMP

# One INSERT per granule: filled with (column list, value list).
SQL_ROW = 'INSERT INTO amdadb.data_table(%s) VALUES (%s);\n'

# Grants for the DaCHS (gavo) users.
SQL_FOOTER = '''REVOKE ALL ON SCHEMA "amdadb" FROM PUBLIC;
REVOKE ALL ON SCHEMA "amdadb" FROM postgres;
GRANT ALL ON SCHEMA "amdadb" TO postgres;
GRANT ALL PRIVILEGES ON SCHEMA amdadb TO gavo WITH GRANT OPTION;
GRANT ALL PRIVILEGES ON SCHEMA amdadb TO gavoadmin WITH GRANT OPTION;
GRANT ALL PRIVILEGES ON amdadb.data_table TO gavo WITH GRANT OPTION;
GRANT ALL PRIVILEGES ON amdadb.data_table TO gavoadmin WITH GRANT OPTION;'''
180 | + | |
181 | + | |
def log(message: str) -> None:
    """Write ``message`` to the module-level log file when one is open,
    otherwise print it to stdout.

- ``message``: The message to display or to print in the log file.
"""

    if not log_file:
        print(message)
    else:
        log_file.write(message + '\n')
192 | + | |
193 | + | |
def get_nb_files() -> int:
    """Return the total number of files under ``SPASE_DIR``
(used to display a progress bar while parsing)."""

    return sum(len(file_names) for _, _, file_names in walk(SPASE_DIR))
199 | + | |
200 | + | |
def get_spase() -> Optional[SpaseDic]:
    """Parse every SPASE XML file found under ``SPASE_DIR``.

- ``return``: a dictionary, where:

    - **key** = top-level SPASE element name ('NumericalData', 'Granule', etc.);
    - **value** = a list of the corresponding ElementTree nodes;

  or ``None`` when a file can not be parsed or no SPASE content was found.
"""

    spase_dic = {}
    n_file = 0
    for dir_path, _, files in walk(SPASE_DIR):
        for file_path in [op.join(dir_path, file_name) for file_name in files]:
            try:
                root = ElTr.parse(file_path).getroot()
            except ElTr.ParseError:
                # BUG FIX: the previous code caught FileNotFoundError and then
                # re-opened the very file that was "not found" (which would
                # raise again). A path yielded by os.walk exists; the failure
                # that can actually happen here is an XML parse error, so show
                # the offending file content instead.
                print('\nThe spase file can not be parsed: %s.\n' % file_path)
                with open(file_path) as spase_file:
                    print(spase_file.read())
                return None
            for child in root:
                key = str(child.tag).split('}')[-1]  # strip the XML namespace
                if key != 'Version':
                    spase_dic.setdefault(key, []).append(child)

            # Progress bar; nb_files is a module-level global.
            print('Parsed {:<23.23} {:<19.19} [{:<50.50}]'.format(
                '%d/%d (%.2f%%)' % (n_file + 1, nb_files, 100 * float(n_file + 1) / nb_files),
                op.splitext(op.basename(file_path))[0],
                '.' * int((n_file + 1) / nb_files * 50)), end='\r')
            n_file += 1
    print()

    if not spase_dic:
        print('The SPASE dictionary is empty, please check the SPASE folder: %s.' % SPASE_DIR)
        return None

    return spase_dic
241 | + | |
242 | + | |
def get_observatory(spase_dic: SpaseDic, observatory_id: str) -> ElTr.Element:
    """Return the *observatory ElementTree node* whose ResourceID matches
``observatory_id`` (looked up in the parsed Observatory spase files).
Raises ValueError when no observatory has this id.
"""

    observatories = spase_dic['Observatory']
    resource_ids = [node.find('{%s}ResourceID' % XMLNS).text for node in observatories]
    return observatories[resource_ids.index(observatory_id)]
250 | + | |
251 | + | |
def get_instrument(spase_dic: SpaseDic, instrument_id: str) -> ElTr.Element:
    """Return the *instrument ElementTree node* whose ResourceID matches
``instrument_id`` (looked up in the parsed Instrument spase files).
Raises ValueError when no instrument has this id.
"""

    instruments = spase_dic['Instrument']
    resource_ids = [node.find('{%s}ResourceID' % XMLNS).text for node in instruments]
    return instruments[resource_ids.index(instrument_id)]
259 | + | |
260 | + | |
def get_access_format(numerical_data_node: ElTr.Element) -> SQLDic:
    """Given the ``NumericalData`` node, return a dictionary containing the
access format (MIME type), or ``None`` when the SPASE format is unknown.
"""

    access_formats = set()
    for access_info in numerical_data_node.findall('{%s}AccessInformation' % XMLNS):
        spase_format_node = access_info.find('{%s}Format' % XMLNS)
        # BUG FIX: `if spase_format_node` tested Element truthiness, which is
        # False for a childless element even when it carries text — so formats
        # were never collected. Elements must be compared to None explicitly.
        if spase_format_node is not None and spase_format_node.text:
            access_formats.add(spase_format_node.text)

    # Several distinct formats are SEP-joined, which (like an unknown single
    # format) is not a MIME_TYPE_LIST key and therefore maps to None.
    access_format = SEP.join(access_formats)
    return {'access_format': MIME_TYPE_LIST.get(access_format)}
275 | + | |
276 | + | |
def get_region_info(numerical_data_node: ElTr.Element) -> SQLDic:
    """Given the ``NumericalData`` node, return a dictionary containing:

- **target_class**: the ``target_class`` EPN-TAP parameter;
- **target_name**: the ``target_name`` EPN-TAP parameter;
- **target_region**: the ``target_region`` EPN-TAP parameter;
- **spase_region**: the ``spase_region`` parameter, added to the EPN-TAP
  parameters for the purposes of AMDA.
"""

    names, classes, regions, spase_regions = set(), set(), set(), set()
    for region_node in numerical_data_node.findall('{%s}ObservedRegion' % XMLNS):
        if region_node.text is None:
            continue
        parts = region_node.text.split('.')
        # When the second level is a satellite (e.g. 'Jupiter.Io...'), the
        # satellite — not the planet — is the EPN-TAP target.
        start = 1 if len(parts) >= 2 and TARGET_CLASS_DIC.get(parts[1]) == 'satellite' else 0
        classes.add(TARGET_CLASS_DIC[parts[start]])
        names.add('Sun' if parts[start] == 'Heliosphere' else parts[start])
        regions.add('.'.join(parts[start + 1:]))
        spase_regions.add('.'.join(parts))
    return {'target_class': SEP.join(classes) if classes else None,
            'target_name': SEP.join(names) if names else None,
            'target_region': SEP.join(regions) if regions else None,
            'spase_region': SEP.join(spase_regions) if spase_regions else None}
302 | + | |
303 | + | |
def get_instru_name_and_host_name(spase_dic: SpaseDic, numerical_data_node: ElTr.Element) -> SQLDic:
    """Given the ``NumericalData`` node, return a dictionary containing:

- **instrument_name**: the ``instrument_name`` EPN-TAP parameter;
- **instrument_host_name**: the ``instrument_host_name`` EPN-TAP parameter.
"""

    names = set()
    host_names = set()
    for id_node in numerical_data_node.findall('{%s}InstrumentID' % XMLNS):
        instrument = get_instrument(spase_dic, id_node.text)
        header = instrument.find('{%s}ResourceHeader' % XMLNS)
        names.add(header.find('{%s}ResourceName' % XMLNS).text)
        # The hosting observatory is reached through the instrument's ObservatoryID.
        host = get_observatory(spase_dic, instrument.find('{%s}ObservatoryID' % XMLNS).text)
        host_header = host.find('{%s}ResourceHeader' % XMLNS)
        host_names.add(host_header.find('{%s}ResourceName' % XMLNS).text)
    return {'instrument_name': SEP.join(names) if names else None,
            'instrument_host_name': SEP.join(host_names) if host_names else None}
320 | + | |
321 | + | |
def get_types(numerical_data_node: ElTr.Element) -> SQLDic:
    """Given the ``NumericalData`` node, return a dictionary containing:

- **dataproduct_type**: the ``dataproduct_type`` EPN-TAP parameter;
- **spatial_frame_type**: the ``spatial_frame_type`` EPN-TAP parameter;
- **measurement_type**: the ``measurement_type`` EPN-TAP parameter;
- **spase_measurement_type**: the ``spase_measurement_type`` parameter,
  added to the EPN-TAP parameters for the purposes of AMDA.
"""
    # BUG FIX: a debugging leftover wrapped this whole body in
    # `with open('log', 'w')` and dumped each Ucd into a file literally named
    # 'log' in the current directory, truncating it on every call. Removed;
    # use the module's log() helper for real logging.
    dataproduct_types = set()
    sp_frame_types = set()
    measurement_types = set()
    spase_measurement_type = getattr(numerical_data_node.find('{%s}MeasurementType' % XMLNS), 'text', None)
    for param in numerical_data_node.findall('{%s}Parameter' % XMLNS):
        hints = param.findall('{%s}RenderingHints' % XMLNS)
        dt_nodes = [hint.find('{%s}DisplayType' % XMLNS) for hint in hints]
        for display in [node.text for node in dt_nodes if node is not None and node.text is not None]:
            # Unknown display types raise KeyError on purpose: they must be
            # added to DATAPRODUCT_TYPE_DIC.
            dataproduct_types.add(DATAPRODUCT_TYPE_DIC[display])
        coord_sys = param.find('{%s}CoordinateSystem' % XMLNS)
        if coord_sys is not None:
            sp_frame_types.add(coord_sys.find('{%s}CoordinateRepresentation' % XMLNS).text.lower())
        measurement_type = param.find('{%s}Ucd' % XMLNS)
        if measurement_type is not None and measurement_type.text is not None:
            measurement_types.add(measurement_type.text)
    return {'dataproduct_type': SEP.join(dataproduct_types) if dataproduct_types else None,
            'spatial_frame_type': SEP.join(sp_frame_types) if sp_frame_types else None,
            'measurement_type': SEP.join(measurement_types) if measurement_types else None,
            'spase_measurement_type': spase_measurement_type}
352 | + | |
353 | + | |
def get_times_min_max(numerical_data_node: ElTr.Element) -> SQLDic:
    """Given the ``NumericalData`` node, return a dictionary containing:

- **time_sampling_step_min**: the ``time_sampling_step_min`` EPN-TAP parameter;
- **time_sampling_step_max**: the ``time_sampling_step_max`` EPN-TAP parameter;
- **time_exp_min**: the ``time_exp_min`` EPN-TAP parameter.

Durations are returned as strings of seconds, or ``None`` when absent.
"""

    temporal_description_node = numerical_data_node.find('{%s}TemporalDescription' % XMLNS)

    if temporal_description_node is None:
        return {'time_sampling_step_min': None, 'time_sampling_step_max': None, 'time_exp_min': None}

    def _duration_in_seconds(tag_name):
        # BUG FIX: str() was applied unconditionally, so a missing duration
        # became the literal string 'None' instead of SQL NULL.
        # Assumes xml_duration_to_seconds returns None for a missing/unparsable
        # duration — TODO confirm (its definition is below in this file).
        node = temporal_description_node.find('{%s}%s' % (XMLNS, tag_name))
        seconds = xml_duration_to_seconds(getattr(node, 'text', None))
        return str(seconds) if seconds is not None else None

    return {'time_sampling_step_min': _duration_in_seconds('Cadence_Min'),
            'time_sampling_step_max': _duration_in_seconds('Cadence_Max'),
            'time_exp_min': _duration_in_seconds('Exposure')}
374 | + | |
375 | + | |
def get_processing_lvl(numerical_data_node: ElTr.Element) -> SQLDic:
    """Given the ``NumericalData`` node, return a dictionary containing:

- **processing_level**: the ``processing_level`` EPN-TAP parameter
  (``None`` when the SPASE level is absent or unknown).
"""

    level_node = numerical_data_node.find('{%s}ProcessingLevel' % XMLNS)
    spase_level = level_node.text if level_node is not None else None
    return {'processing_level': PROCESSING_LEVEL_DIC.get(spase_level)}
384 | + | |
385 | + | |
def get_granule_and_parent(gr_node: ElTr.Element) -> Tuple[str, SQLDic]:
    """Given a Granule node, return its ParentID and a dictionary containing
all the parameters inside it:

- **obs_id**: the ``obs_id`` EPN-TAP parameter;
- **release_date**: the ``release_date`` EPN-TAP parameter;
- **time_min**: the ``time_min`` EPN-TAP parameter;
- **time_max**: the ``time_max`` EPN-TAP parameter;
- **access_url**: the ``access_url`` EPN-TAP parameter;
- **access_estsize**: the ``access_estsize`` EPN-TAP parameter.
"""

    parent_id = getattr(gr_node.find('{%s}ParentID' % XMLNS), 'text', None)
    # BUG FIX: the old getattr(..., 'text', '') still yielded None (then an
    # AttributeError on .split) when the ResourceID element existed but was
    # empty, since .text is an existing attribute.
    obs_id = (getattr(gr_node.find('{%s}ResourceID' % XMLNS), 'text', None) or '').split('/')[-1]
    if not obs_id:
        print('Can not get the ResourceID content of a granule. Exiting here.')
        sys.exit()

    release_date = getattr(gr_node.find('{%s}ReleaseDate' % XMLNS), 'text', None)
    time_min = xml_date_to_jd(getattr(gr_node.find('{%s}StartDate' % XMLNS), 'text', None))
    time_max = xml_date_to_jd(getattr(gr_node.find('{%s}StopDate' % XMLNS), 'text', None))
    # BUG FIX: `if src_n` tested Element truthiness (False for a childless
    # element) — compare to None instead; and guard data_extent_node, which
    # was dereferenced unconditionally and crashed when DataExtent was absent.
    src_n = gr_node.find('{%s}Source' % XMLNS)
    access_url = getattr(src_n.find('{%s}URL' % XMLNS), 'text', None) if src_n is not None else None
    data_extent_node = src_n.find('{%s}DataExtent' % XMLNS) if src_n is not None else None
    access_estsize = getattr(data_extent_node.find('{%s}Quantity' % XMLNS), 'text', None) \
        if data_extent_node is not None else None

    return parent_id, {'obs_id': obs_id,
                       'release_date': release_date,
                       'time_min': time_min,
                       'time_max': time_max,
                       'access_url': access_url,
                       'access_estsize': int(access_estsize) if access_estsize else None}
418 | + | |
419 | + | |
def xml_date_to_jd(xml_date: Optional[str]) -> Optional[float]:
    """Convert a *XML date* to *Julian day*.

Return ``None`` when the date is missing (``None``) or does not match ``XML_DATE_FORMAT``.
"""

    try:
        output_date = datetime.strptime(xml_date, XML_DATE_FORMAT)
    # BUG FIX: callers pass ``getattr(..., 'text', None)``, and
    # ``strptime(None, ...)`` raises TypeError, not ValueError.
    except (TypeError, ValueError):  # Date is missing or not well formatted
        return None

    # Meeus' algorithm: January and February count as months 13 and 14 of the
    # previous year so that the leap day falls at the end of the "year".
    if output_date.month == 1 or output_date.month == 2:
        year_p = output_date.year - 1
        month_p = output_date.month + 12
    else:
        year_p = output_date.year
        month_p = output_date.month

    # this checks where we are in relation to October 15, 1582, the beginning
    # of the Gregorian calendar.
    if ((output_date.year < 1582) or
            (output_date.year == 1582 and output_date.month < 10) or
            (output_date.year == 1582 and output_date.month == 10 and output_date.day < 15)):
        j_day = 0  # Julian calendar: no century correction
    else:
        # Gregorian correction for the dropped century leap days.
        j_day = 2 - math.trunc(year_p / 100.) + math.trunc(math.trunc(year_p / 100.) / 4.)

    j_day += math.trunc((365.25 * year_p) - 0.75) if year_p < 0 else math.trunc(365.25 * year_p)
    j_day += math.trunc(30.6001 * (month_p + 1)) + output_date.day + 1720994.5
    # Fractional part of the day from the time of day.
    j_day += output_date.hour / 24 + output_date.minute / 1440 + output_date.second / 86400

    return j_day
449 | + | |
450 | + | |
def xml_date_to_sql_date(xml_date: str) -> str:
    """Convert a *XML date* to a *SQL date*."""

    parsed_date = datetime.strptime(xml_date, XML_DATE_FORMAT)
    return parsed_date.strftime(SQL_DATE_FORMAT)
455 | + | |
456 | + | |
def xml_duration_to_seconds(xml_duration: str) -> int:
    """Convert a *XML duration* (ISO-8601, e.g. ``P1DT2H30M``) to seconds.

Months are approximated as 30 days and years as 365 days.
Return ``0`` for an empty or unparsable duration.
"""

    if not xml_duration:
        return 0

    regex = re.compile(r'(?P<sign>-?)P(?:(?P<years>\d+)Y)?(?:(?P<months>\d+)M)?(?:(?P<days>\d+)D)?'
                       r'(?:T(?:(?P<hours>\d+)H)?(?:(?P<minutes>\d+)M)?(?:(?P<seconds>\d+)S)?)?')

    match = regex.match(xml_duration.upper())
    if match is None:  # BUG FIX: a malformed duration previously raised AttributeError
        return 0

    time = match.groupdict(0)  # unmatched groups default to 0
    delta = timedelta(
        days=int(time['days']) + (int(time['months']) * 30) + (int(time['years']) * 365),
        hours=int(time['hours']),
        minutes=int(time['minutes']),
        seconds=int(time['seconds']))

    # BUG FIX: return an int as annotated; total_seconds() yields a float.
    return int((delta * -1 if time['sign'] == "-" else delta).total_seconds())
474 | + | |
475 | + | |
def get_parameters(spase_dic: SpaseDic) -> List[SQLDic]:
    """Get all the parameters of the entire dataset.
Return a list containing the granules, where each granule is a dictionary, with:

- **keys**: the EPN-TAP parameter name;
- **values**: the EPN-TAP value corresponding to the parameter name.
"""

    datasets = {}            # dataset key -> dataset-level EPN-TAP parameters
    missing_parameters = {}  # dataset key -> names of parameters replaced by defaults
    nb_elements = len(spase_dic['NumericalData']) + len(spase_dic['NumericalOutput']) + len(spase_dic['Granule'])
    n_dataset = 0

    # First pass: build one parameter dictionary per dataset node.
    for numerical_data_node in spase_dic['NumericalData'] + spase_dic['NumericalOutput']:
        print('Dataset %d/%d' % (n_dataset, nb_elements), end=' ' * 99 + '\r')
        n_dataset += 1
        try:
            dataset_key = getattr(numerical_data_node.find('{%s}ResourceID' % XMLNS), 'text', None).split('/')[-1]
        except AttributeError:  # find() returned no node, so .text is None
            print('Can not get the ResourceID content of a dataset. Exiting here.')
            sys.exit()
        dataset = get_region_info(numerical_data_node)
        dataset.update(get_instru_name_and_host_name(spase_dic, numerical_data_node))
        dataset.update(get_types(numerical_data_node))
        dataset.update(get_access_format(numerical_data_node))
        dataset.update(get_times_min_max(numerical_data_node))
        dataset.update(get_processing_lvl(numerical_data_node))

        # Looking for None parameters in each dataset: fall back to the default
        # value and remember the miss for the final log.
        for parameter, default_value in DEFAULT_DATASET_VALUES.items():
            if not dataset[parameter]:
                dataset[parameter] = default_value
                missing_parameters.setdefault(dataset_key, set()).add(parameter)
        datasets[dataset_key] = dataset

    # Second pass: build one dictionary per granule, merged with its parent dataset.
    granules_list = []
    for granule_node in spase_dic['Granule']:
        parent_id, granule = get_granule_and_parent(granule_node)
        dataset_key = parent_id.split('/')[-1]

        # BUG FIX: the progress bar divided by the global ``nb_files`` (defined
        # only when run as a script, and a different total than the percentage
        # uses); the denominator is the number of XML elements, like above.
        print('Granule {:<23.23} {:<18.18} [{:<50.50}]'.format(
            '%d/%d (%.2f%%)' % (n_dataset + 1, nb_elements, 100 * float(n_dataset + 1) / nb_elements),
            dataset_key,
            '.' * int((n_dataset + 1) / nb_elements * 50)), end='\r')

        # Looking for None parameters in each granule
        for parameter, default_value in DEFAULT_GRANULE_VALUES.items():
            if not granule[parameter]:
                granule[parameter] = default_value
                missing_parameters.setdefault(dataset_key, set()).add(parameter)

        try:
            granule.update(datasets[dataset_key])
        except KeyError:
            print('The parent id "%s" of the granule "%s" is not found in the dataset dictionary.'
                  % (parent_id, granule['access_url']))
        granules_list.append(granule)
        n_dataset += 1
    print()
    for bad_dataset, missings in missing_parameters.items():
        log('%s\tmissing %s' % (bad_dataset, ', '.join(missings)))
    return granules_list
542 | + | |
543 | + | |
def write_sql(granules_list):
    """Write a SQL script which inserts all the granules in the database.

``granules_list`` is the list of granule dictionaries built by ``get_parameters()``;
each key becomes a column name and each value a SQL literal.
"""

    def _sql_literal(value):
        # Render a Python value as a SQL literal. BUG FIX: single quotes inside
        # string values are doubled, so a value like "O'Neill" no longer breaks
        # the generated script.
        if value is None:
            return 'NULL'
        if isinstance(value, str):
            return "'%s'" % value.replace("'", "''")
        return str(value)

    with open(SQL_FILE_PATH, 'w') as sql_file:
        sql_file.write(SQL_HEADER)
        for gr in granules_list:
            keys = ', '.join(gr.keys())
            values = ', '.join(_sql_literal(param) for param in gr.values())
            sql_file.write(SQL_ROW % (keys, values))
        sql_file.write(SQL_FOOTER)
555 | + | |
556 | + | |
if __name__ == '__main__':
    # Optional log file used by log(); None disables file logging.
    log_file = open(LOG_FILE_PATH, 'w+') if LOG_FILE_PATH else None

    try:
        print('Getting number of files in %s...' % SPASE_DIR)
        nb_files = get_nb_files()

        print('Parsing %d files...' % nb_files)
        spase = get_spase()

        # Iterating a dict yields its keys directly.
        print('Done. Found these types of data: %s.' % ', '.join(spase))

        print('Loading numerical data...')
        granules = get_parameters(spase)

        print('Creating SQL script...')
        write_sql(granules)
    finally:
        # BUG FIX: the log file was opened but never closed.
        if log_file:
            log_file.close()

    import subprocess

    # Fire-and-forget desktop notification; do not block on it.
    subprocess.Popen(['notify-send', 'The SQL script %s has been generated.' % SQL_FILE_PATH])