Commit 8b818b98d7bfd7912b6728e7055c3414f30753c4
1 parent
016e9465
Exists in
master
Add all files required to fill DaCHS database.
Showing
4 changed files
with
737 additions
and
0 deletions
Show diff stats
.gitignore
@@ -0,0 +1,85 @@ | @@ -0,0 +1,85 @@ | ||
1 | +<?xml version="1.0" encoding="UTF-8"?> | ||
2 | +<resource schema="amdadb"> | ||
3 | + <!-- Metadata describing the dataset --> | ||
4 | + <meta name="title">Planetary and heliophysics plasma data at CDPP/AMDA</meta> | ||
5 | + <meta name="creationDate">2016-08-05T16:00:00</meta> | ||
6 | + <meta name="description" format="plain">Planetary and heliophysics plasma data at CDPP/AMDA</meta> | ||
7 | + <meta name="creator.name">Vincent Genot</meta> | ||
8 | + <meta name="contact.name">Vincent Genot</meta> | ||
9 | + <meta name="contact.email">vincent.genot@irap.omp.eu</meta> | ||
10 | + <meta name="contact.address">IRAP, 9 av. Colonel Roche, 31400 Toulouse, FRANCE</meta> | ||
11 | + <meta name="subject">Virtual observatory</meta> | ||
12 | + <meta name="subject">Plasma physics</meta> | ||
13 | + <meta name="utype">ivo://cdpp.irap/std/EpnCore#schema-2.0</meta> <!-- not tested --> | ||
14 | + <table id="epn_core" onDisk="True" adql="True"> | ||
15 | + <meta name="info" infoName="SERVICE_PROTOCOL" infoValue="2.0"> EPN-TAP </meta> | ||
16 | + <meta name="description">Planetary and heliophysics plasma data at CDPP/AMDA</meta> | ||
17 | + <meta name="referenceURL">http://amda.cdpp.eu</meta> | ||
18 | + <meta name="utype">EPN-TAP 2.0</meta> | ||
19 | + <!-- header parameters --> | ||
20 | + <column name="granule_uid" type="text" required="True" ucd="meta.id" description="Granule unique identifier, provides direct access"/> | ||
21 | + <column name="dataproduct_type" type="text" ucd="meta.code.class" description="Organisation of the data product (from enumerated list)"/> | ||
22 | + <column name="target_name" type="text" ucd="meta.id;src" description="Name of target (IAU standard)"/> | ||
23 | + <column name="time_min" type="double precision" ucd="time.start" unit="d" description="Acquisition start time (in JD) (not necessary)"/> | ||
24 | + <column name="time_max" type="double precision" ucd="time.end" unit="d" description="Acquisition stop time (in JD) (not necessary)"/> | ||
25 | + <!-- important parameters --> | ||
26 | + <column name="access_url" type="text" ucd="meta.ref.url;meta.file"/> | ||
27 | + <column name="target_class" type="text" ucd="meta.code.class;src" description="Type of target, from enumerated list"/> | ||
28 | + <column name="target_region" type="text" ucd="meta.id;src"/> | ||
29 | + <column name="spase_region" type="text" ucd="phys.angArea;obs" description="(not necessary)"/> | ||
30 | + <column name="instrument_host_name" type="text" ucd="meta.id;instr.obsty" description="(not necessary)"/> | ||
31 | + <column name="instrument_name" type="text" ucd="meta.id;instr" description="(not necessary)"/> | ||
32 | + <column name="measurement_type" type="text" ucd="meta.ucd" description="(not necessary)"/> | ||
33 | + <column name="spase_measurement_type" type="text" ucd="meta.ucd" description="(not necessary)"/> | ||
34 | + <column name="spatial_frame_type" type="text" ucd="meta.code.class;pos.frame" description="(can be necessary)"/> | ||
35 | + <column name="processing_level" type="integer" ucd="meta.code;obs.calib" required="True"/> | ||
36 | + <column name="release_date" type="date" ucd="time.release"/> | ||
37 | + <column name="access_estsize" type="integer" ucd="phys.size;meta.file" required="True"/> | ||
38 | + <column name="access_format" type="text" ucd="meta.code.mime"/> | ||
39 | + <column name="time_sampling_step_min" type="double precision" ucd="time.interval;stat.min" unit="s" description="Min time sampling step (not necessary)"/> | ||
40 | + <column name="time_sampling_step_max" type="double precision" ucd="time.interval;stat.max" unit="s" description="Max time sampling step (not necessary)"/> | ||
41 | + <column name="time_exp_min" type="double precision" ucd="time.duration;stat.min" unit="s" description="Min integration time (not necessary)"/> | ||
42 | + <!-- redundant or static parameters --> | ||
43 | + <column name="time_exp_max" type="double precision" ucd="time.duration;stat.max" unit="s" description="Max integration time (not necessary)"/> | ||
44 | + <column name="granule_gid" type="text" required="True" ucd="meta.id" description="Group identifier, identical for similar data products"/> | ||
45 | + <column name="obs_id" type="text" required="True" ucd="meta.id" description="Identical for data products related to the same original data"/> | ||
46 | + <column name="creation_date" type="date" ucd="time.creation"/> | ||
47 | + <column name="modification_date" type="date" ucd="time.update"/> | ||
48 | + <column name="service_title" type="text" ucd="meta.title"/> | ||
49 | + <column name="publisher" type="text" ucd="meta.name"/> | ||
50 | + <column name="time_scale" type="text" ucd="time.scale"/> | ||
51 | + <!-- null parameters --> | ||
52 | + <column name="spectral_range_min" type="double precision" ucd="em.freq;stat.min" unit="Hz" description="Min spectral range (not necessary)"/> | ||
53 | + <column name="spectral_range_max" type="double precision" ucd="em.freq;stat.max" unit="Hz" description="Max spectral range (not necessary)"/> | ||
54 | + <column name="spectral_sampling_step_min" type="double precision" ucd="em.freq.step;stat.min" unit="Hz" description="Min spectral sampling step (not necessary)"/> | ||
55 | + <column name="spectral_sampling_step_max" type="double precision" ucd="em.freq.step;stat.max" unit="Hz" description="Max spectral sampling step (not necessary)"/> | ||
56 | + <column name="spectral_resolution_min" type="double precision" ucd="spect.resolution;stat.min" unit="Hz" description="Min spectral resolution (not necessary)"/> | ||
57 | + <column name="spectral_resolution_max" type="double precision" ucd="spect.resolution;stat.max" unit="Hz" description="Max spectral resolution (not necessary)"/> | ||
58 | + <column name="c1min" type="double precision" ucd="pos;stat.min" unit="deg" description="(not necessary)"/> | ||
59 | + <column name="c1max" type="double precision" ucd="pos;stat.max" unit="deg" description="(not necessary)"/> | ||
60 | + <column name="c2min" type="double precision" ucd="pos;stat.min" unit="deg" description="(not necessary)"/> | ||
61 | + <column name="c2max" type="double precision" ucd="pos;stat.max" unit="deg" description="(not necessary)"/> | ||
62 | + <column name="c3min" type="double precision" ucd="pos;stat.min" unit="" description="(not necessary)"/> | ||
63 | + <column name="c3max" type="double precision" ucd="pos;stat.max" unit="" description="(not necessary)"/> | ||
64 | + <column name="c1_resol_min" type="double precision" ucd="pos.resolution;stat.min" unit="deg" description="(not necessary)"/> | ||
65 | + <column name="c1_resol_max" type="double precision" ucd="pos.resolution;stat.max" unit="deg" description="(not necessary)"/> | ||
66 | + <column name="c2_resol_min" type="double precision" ucd="pos.resolution;stat.min" unit="deg" description="Min resolution in latitude"/> | ||
67 | + <column name="c2_resol_max" type="double precision" ucd="pos.resolution;stat.max" unit="deg" description="(not necessary)"/> | ||
68 | + <column name="c3_resol_min" type="double precision" ucd="pos.resolution;stat.min" unit="" description="(not necessary)"/> | ||
69 | + <column name="c3_resol_max" type="double precision" ucd="pos.resolution;stat.max" unit="" description="(not necessary)"/> | ||
70 | + <column name="s_region" type="text" ucd="phys.angArea;obs" description="(not necessary)"/> | ||
71 | + <column name="incidence_min" type="double precision" ucd="pos.posAng;stat.min" unit="deg" description="(not necessary)"/> | ||
72 | + <column name="incidence_max" type="double precision" ucd="pos.posAng;stat.max" unit="deg" description="(not necessary)"/> | ||
73 | + <column name="emergence_min" type="double precision" ucd="pos.posAng;stat.min" unit="deg" description="(not necessary)"/> | ||
74 | + <column name="emergence_max" type="double precision" ucd="pos.posAng;stat.max" unit="deg" description="(not necessary)"/> | ||
75 | + <column name="phase_min" type="double precision" ucd="pos.phaseAng;stat.min" unit="deg" description="(not necessary)"/> | ||
76 | + <column name="phase_max" type="double precision" ucd="pos.phaseAng;stat.max" unit="deg" description="(not necessary)"/> | ||
77 | + </table> | ||
78 | + <data id="import"> | ||
79 | + <make table="epn_core"/> | ||
80 | + </data> | ||
81 | + <data id="collection" auto="false"> | ||
82 | + <register services="__system__/tap#run"/> | ||
83 | + <make table="epn_core"/> | ||
84 | + </data> | ||
85 | +</resource> |
@@ -0,0 +1,73 @@ | @@ -0,0 +1,73 @@ | ||
-- SQL procedure to define the amdadb.epn_core view.
-- It maps amdadb.data_table columns to the EPN-TAP column set; columns that the
-- SPASE source can not provide are exposed as typed NULLs or constants.
-- Name: amdadb; Type: SCHEMA; Schema: amdadb; Owner: postgres

SET client_encoding = 'UTF8';

DROP VIEW IF EXISTS amdadb.epn_core CASCADE;
CREATE VIEW amdadb.epn_core AS SELECT
    -- header parameters
    CAST(obs_id || '_cdf' AS TEXT) AS granule_uid,  -- one granule per obs_id, suffixed by its format
    dataproduct_type,
    target_name,
    time_min,
    time_max,
    -- important parameters
    access_url,
    target_class,
    target_region,
    spase_region,
    instrument_host_name,
    instrument_name,
    measurement_type,
    spase_measurement_type,
    spatial_frame_type,
    processing_level,
    release_date,
    access_estsize,
    access_format,
    time_sampling_step_min,
    time_sampling_step_max,
    time_exp_min,
    -- redundant or static parameters
    CAST(time_exp_min AS DOUBLE PRECISION) AS time_exp_max,  -- only one exposure value is known
    CAST('cdf' AS TEXT) AS granule_gid,
    obs_id,
    -- CAST('application/x-netcdf' AS TEXT) AS access_format,
    CAST(release_date AS DATE) AS creation_date,      -- only the release date is known
    CAST(release_date AS DATE) AS modification_date,  -- idem
    CAST('AMDADB' AS TEXT) AS service_title,
    CAST('CDPP' AS TEXT) AS publisher,
    CAST('UTC' AS TEXT) AS time_scale,
    -- null parameters
    CAST(NULL AS DOUBLE PRECISION) AS spectral_range_min,
    CAST(NULL AS DOUBLE PRECISION) AS spectral_range_max,
    CAST(NULL AS DOUBLE PRECISION) AS spectral_sampling_step_min,
    CAST(NULL AS DOUBLE PRECISION) AS spectral_sampling_step_max,
    CAST(NULL AS DOUBLE PRECISION) AS spectral_resolution_min,
    CAST(NULL AS DOUBLE PRECISION) AS spectral_resolution_max,
    CAST(NULL AS DOUBLE PRECISION) AS c1min,
    CAST(NULL AS DOUBLE PRECISION) AS c1max,
    CAST(NULL AS DOUBLE PRECISION) AS c2min,
    CAST(NULL AS DOUBLE PRECISION) AS c2max,
    CAST(NULL AS DOUBLE PRECISION) AS c3min,
    CAST(NULL AS DOUBLE PRECISION) AS c3max,
    CAST(NULL AS DOUBLE PRECISION) AS c1_resol_min,
    CAST(NULL AS DOUBLE PRECISION) AS c1_resol_max,
    CAST(NULL AS DOUBLE PRECISION) AS c2_resol_min,
    CAST(NULL AS DOUBLE PRECISION) AS c2_resol_max,
    CAST(NULL AS DOUBLE PRECISION) AS c3_resol_min,
    CAST(NULL AS DOUBLE PRECISION) AS c3_resol_max,
    CAST(NULL AS TEXT) AS s_region,
    CAST(NULL AS DOUBLE PRECISION) AS incidence_min,
    CAST(NULL AS DOUBLE PRECISION) AS incidence_max,
    CAST(NULL AS DOUBLE PRECISION) AS emergence_min,
    CAST(NULL AS DOUBLE PRECISION) AS emergence_max,
    CAST(NULL AS DOUBLE PRECISION) AS phase_min,
    CAST(NULL AS DOUBLE PRECISION) AS phase_max,
    -- parameters added to prevent warnings in the q.rd validator
    CAST(NULL AS TEXT) AS thumbnail_url,
    CAST(NULL AS TEXT) AS file_name,
    CAST(NULL AS TEXT) AS species,
    CAST(NULL AS TEXT) AS feature_name,
    CAST(NULL AS TEXT) AS bib_reference
FROM amdadb.data_table;
@@ -0,0 +1,576 @@ | @@ -0,0 +1,576 @@ | ||
1 | +#!/usr/bin/env python | ||
2 | +# -*- coding: utf-8 -*- | ||
3 | + | ||
4 | +"""This script inspect a SPASE dataset folder (containing Granules, NumericalData, Instrument and | ||
5 | +Observatory folders), then generate a SQL script which insert all the granules in a database, | ||
6 | +formatted as epn-tap parameters. | ||
7 | + | ||
8 | +See | ||
9 | +http://spase-group.org/data/reference/spase-2_2_6/ for more information about spase specification, | ||
10 | +and https://voparis-confluence.obspm.fr/display/VES/EPN-TAP+V2.0+parameters for more information | ||
11 | +about epn-tap-v2 specification.""" | ||
12 | + | ||
13 | +import math | ||
14 | +import re | ||
15 | +import xml.etree.ElementTree as ElTr | ||
16 | +import os.path as op | ||
17 | +from os import walk | ||
18 | +from datetime import datetime, timedelta | ||
19 | +from typing import Tuple, List, Dict, Optional | ||
20 | +import sys | ||
21 | + | ||
# Type aliases
SQLDic = Dict[str, object]                # one database row: column name -> SQL value
SpaseDic = Dict[str, List[ElTr.Element]]  # SPASE resource type -> list of XML nodes

# Paths (all relative to the parent directory of this script)
WORKING_DIR = op.dirname(op.dirname(op.abspath(__file__)))
OUTPUT_PATH = op.join(WORKING_DIR, 'SERVER')
SQL_FILE_PATH = op.join(OUTPUT_PATH, 'amdadb_db.sql')
SPASE_DIR = op.join(WORKING_DIR, 'DATA')
LOG_FILE_PATH = op.join(WORKING_DIR, 'build_granules.log')  # Set to None if you want to log in stdout instead of a file

# XML and SQL formats
XMLNS = 'http://www.spase-group.org/data/schema'
XML_DATE_FORMAT = '%Y-%m-%dT%H:%M:%SZ'
SQL_DATE_FORMAT = '%Y-%m-%d'
SEP = '#'  # separator used when a multi-valued SPASE field is flattened to a single string

# Dictionaries of values
# SPASE RenderingHints/DisplayType -> EPN-TAP dataproduct_type code
DATAPRODUCT_TYPE_DIC = {'Image': 'im', 'Plasmagram': 'ds', 'Spectrogram': 'ds', 'StackPlot': 'ts',
                        'TimeSeries': 'ts', 'time_series': 'ts', 'WaveForm': 'ts'}

# SPASE ProcessingLevel -> EPN-TAP processing_level integer
PROCESSING_LEVEL_DIC = {'Calibrated': 3, 'Raw': 1, 'Uncalibrated': 5}

# Based on http://spase-group.org/
TARGET_CLASS_DIC = {'Heliosphere': 'interplanetary_medium', 'Interstellar': 'galaxy',
                    'Earth': 'planet', 'Saturn': 'planet', 'Mercury': 'planet', 'Uranus': 'planet',
                    'Mars': 'planet', 'Neptune': 'planet', 'Jupiter': 'planet', 'Venus': 'planet',
                    'Moon': 'satellite', 'Callisto': 'satellite', 'Europa': 'satellite',
                    'Ganymede': 'satellite', 'Dione': 'satellite', 'Enceladus': 'satellite',
                    'Mimas': 'satellite', 'Miranda': 'satellite', 'Phobos': 'satellite',
                    'Iapetus': 'satellite', 'Titania': 'satellite', 'Oberon': 'satellite',
                    'Puck': 'satellite', 'Deimos': 'satellite', 'Ariel': 'satellite',
                    'Umbriel': 'satellite', 'Rhea': 'satellite', 'Tethys': 'satellite',
                    'Titan': 'satellite', 'Io': 'satellite',
                    'Pluto': 'dwarf_planet',
                    'Comet': 'comet'
                    }

# SPASE Format -> MIME type; None when no sensible MIME type exists for the format.
MIME_TYPE_LIST = {'AVI': 'video/x-msvideo',
                  'Binary': 'application/octet-stream',
                  'CDF': 'application/x-cdf-istp',
                  'CEF': 'application/x-cef1',
                  'CEF1': 'application/x-cef1',
                  'CEF2': 'application/x-cef2',
                  'Excel': 'application/vnd.ms-excel',
                  'FITS': 'application/x-fits-bintable',
                  'GIF': 'image/gif',
                  'HDF': 'application/x-hdf',
                  'HDF4': 'application/x-hdf',
                  'HDF5': 'application/x-hdf',
                  'HTML': 'text/html',
                  'Hardcopy': None,
                  'Hardcopy.Film': None,
                  'Hardcopy.Microfiche': None,
                  'Hardcopy.Microfilm': None,
                  'Hardcopy.Photograph': None,
                  'Hardcopy.PhotographicPlate': None,
                  'Hardcopy.Print': None,
                  'IDFS': None,
                  'IDL': 'application/octet-stream',
                  'JPEG': 'image/jpeg ',  # NOTE(review): trailing space looks accidental -- confirm
                  'MATLAB_4': 'application/octet-stream',
                  'MATLAB_6': 'application/octet-stream',
                  'MATLAB_7': 'application/octet-stream',
                  'MPEG': 'video/mpeg',
                  'NCAR': None,
                  'NetCDF': 'application/x-netcdf',
                  'PDF': 'application/pdf',
                  'PNG': 'image/png',
                  'Postscript': 'application/postscript',
                  'QuickTime': 'video/quicktime',
                  'TIFF': 'image/tiff',
                  'Text': 'text/plain',
                  'Text.ASCII': 'text/plain',
                  'Text.Unicode': 'text/plain',
                  'UDF': None,
                  'VOTable': 'application/x-votable+xml',
                  'XML': 'text/xml'}

# All default SQL values for missing parameters in dataset
DEFAULT_DATASET_VALUES = {
    'dataproduct_type': 'Unknown',
    'target_name': 'Unknown',
    'target_class': 'Unknown',
    'target_region': None,
    'spase_region': None,
    'instrument_host_name': None,
    'instrument_name': None,
    'measurement_type': None,
    'spatial_frame_type': None,
    'processing_level': 0,
    'time_sampling_step_min': None,
    'time_sampling_step_max': None,
    'time_exp_min': None,
    'access_format': 'application/x-cdf-istp'
    }

# All default SQL values for missing parameters in granule
DEFAULT_GRANULE_VALUES = {
    # obs_id: if missing, the script exits directly.
    'time_min': 0.0,
    'time_max': 0.0,
    'access_url': None,
    'access_estsize': 0,
    'release_date': '01-01-0001'  # NOTE(review): not in SQL_DATE_FORMAT ('%Y-%m-%d') order -- confirm
    }

# SQL code
# Schema + table creation; the datetime interpolation stamps the generation time.
SQL_HEADER = '''-- Generated by build_BDD.py on %s.
-- SQL procedure to define amdadb data table. Other parameters comes in the epn_core view.
-- Name: amdadb; Type: SCHEMA; Schema: amdadb; Owner: postgres

DROP SCHEMA IF EXISTS amdadb cascade;
CREATE SCHEMA amdadb;
SET search_path = public, pg_catalog;
SET default_tablespace = '';
SET default_with_oids = false;
SET client_encoding = 'UTF8';

-- Name: data_table; Type: TABLE; Schema: amdadb; Owner: postgres; Tablespace:
CREATE TABLE amdadb.data_table (
    -- header parameters
    id SERIAL PRIMARY KEY,
    obs_id TEXT,
    dataproduct_type TEXT,
    target_name TEXT,
    time_min DOUBLE PRECISION, -- date as JD
    time_max DOUBLE PRECISION, -- date as JD
    -- important parameters
    access_url TEXT,
    target_class TEXT,
    target_region TEXT,
    spase_region TEXT,
    instrument_host_name TEXT,
    instrument_name TEXT,
    measurement_type TEXT,
    spase_measurement_type TEXT,
    spatial_frame_type TEXT,
    processing_level INTEGER,
    release_date DATE,
    access_estsize INTEGER,
    access_format TEXT,
    time_sampling_step_min DOUBLE PRECISION, -- duration in seconds
    time_sampling_step_max DOUBLE PRECISION, -- duration in seconds
    time_exp_min DOUBLE PRECISION -- duration in seconds
);

''' % datetime.now().strftime('%c')

# Template for one INSERT: filled with (column list, value list).
SQL_ROW = 'INSERT INTO amdadb.data_table(%s) VALUES (%s);\n'

# Access rights for the DaCHS (gavo) users.
SQL_FOOTER = '''REVOKE ALL ON SCHEMA "amdadb" FROM PUBLIC;
REVOKE ALL ON SCHEMA "amdadb" FROM postgres;
GRANT ALL ON SCHEMA "amdadb" TO postgres;
GRANT ALL PRIVILEGES ON SCHEMA amdadb TO gavo WITH GRANT OPTION;
GRANT ALL PRIVILEGES ON SCHEMA amdadb TO gavoadmin WITH GRANT OPTION;
GRANT ALL PRIVILEGES ON amdadb.data_table TO gavo WITH GRANT OPTION;
GRANT ALL PRIVILEGES ON amdadb.data_table TO gavoadmin WITH GRANT OPTION;'''
180 | + | ||
181 | + | ||
def log(message: str) -> None:
    """Log a warning in the log file when one is open, otherwise on stdout.

- ``message``: The message to display or to print in the log file.
"""

    if log_file is None:
        print(message)
    else:
        log_file.write('%s\n' % message)
192 | + | ||
193 | + | ||
def get_nb_files() -> int:
    """Return the number of files in the ``SPASE_DIR`` directory tree,
in order to be able to show a progress bar."""

    # Generator expression: no need to materialize an intermediate list for sum().
    return sum(len(file_names) for _, _, file_names in walk(SPASE_DIR))
199 | + | ||
200 | + | ||
def get_spase() -> Optional[SpaseDic]:
    """Parse all the SPASE files found under ``SPASE_DIR``.

- ``return``: a dictionary, where:

  - **key** = dataset type ('NumericalData', 'Granule', etc);
  - **value** = A list of spase ElementTree nodes.

  ``None`` is returned when a file is missing or unparsable, or when no SPASE
  node at all was found.
"""

    spase_dic = {}
    n_file = 0
    for dir_path, _, files in walk(SPASE_DIR):
        for file_path in [op.join(dir_path, file_name) for file_name in files]:
            try:
                root = ElTr.parse(file_path).getroot()
            except FileNotFoundError:
                # The file vanished between the directory walk and the parse.
                # (The previous code re-opened the missing file here, which
                # raised the same FileNotFoundError uncaught.)
                print('\nThe spase file is not found on %s.\n' % file_path)
                return None
            except ElTr.ParseError as error:
                # Malformed XML: dump the offending content to ease debugging.
                print('\nThe spase file %s can not be parsed: %s\n' % (file_path, error))
                with open(file_path) as spase_file:
                    print(spase_file.read())
                return None
            for child in root:
                # Tags look like '{namespace}Name'; keep the local name only.
                key = str(child.tag).split('}')[-1]
                if key != 'Version':
                    if key not in spase_dic:
                        spase_dic[key] = []

                    spase_dic[key].append(child)

            # Single-line progress bar (overwritten with '\r' on each file).
            print('Parsed {:<23.23} {:<19.19} [{:<50.50}]'.format(
                '%d/%d (%.2f%%)' % (n_file + 1, nb_files, 100 * float(n_file + 1) / nb_files),
                op.splitext(op.basename(file_path))[0],
                '.' * int((n_file + 1) / nb_files * 50)), end='\r')
            n_file += 1
    print()

    if not spase_dic:
        print('The SPASE dictionary is empty, please check the SPASE folder: %s.' % SPASE_DIR)
        return None

    return spase_dic
241 | + | ||
242 | + | ||
def get_observatory(spase_dic: SpaseDic, observatory_id: str) -> ElTr.Element:
    """Given the ``observatory_id``, return the *observatory ElementTree node*
(by looking in the Observatory spase file).

Raises ``ValueError`` when no observatory carries this ResourceID.
"""

    id_tag = '{%s}ResourceID' % XMLNS
    observatories = spase_dic['Observatory']
    position = [node.find(id_tag).text for node in observatories].index(observatory_id)
    return observatories[position]
250 | + | ||
251 | + | ||
def get_instrument(spase_dic: SpaseDic, instrument_id: str) -> ElTr.Element:
    """Given the ``instrument_id``, return the *instrument ElementTree node*,
by looking in the Instrument spase file.

Raises ``ValueError`` when no instrument carries this ResourceID.
"""

    id_tag = '{%s}ResourceID' % XMLNS
    instruments = spase_dic['Instrument']
    position = [node.find(id_tag).text for node in instruments].index(instrument_id)
    return instruments[position]
259 | + | ||
260 | + | ||
def get_access_format(numerical_data_node: ElTr.Element) -> SQLDic:
    """Given the ``NumericalData`` node, return a dictionary containing the access format (mime-type).

Returns ``{'access_format': None}`` when the format is unknown to ``MIME_TYPE_LIST``
(including the multi-format case, where the SEP-joined key matches no entry).
"""

    access_formats = set()
    for access_info in numerical_data_node.findall('{%s}AccessInformation' % XMLNS):
        spase_format_node = access_info.find('{%s}Format' % XMLNS)
        # BUG FIX: an ElementTree Element with no children is falsy, so the old
        # truthiness test ('if spase_format_node and ...') always skipped leaf
        # <Format> nodes. Compare to None explicitly.
        if spase_format_node is not None and spase_format_node.text:
            access_formats.add(spase_format_node.text)

    access_format = SEP.join(access_formats)
    # dict.get replaces the try/except KeyError: unknown keys map to None.
    return {'access_format': MIME_TYPE_LIST.get(access_format)}
275 | + | ||
276 | + | ||
def get_region_info(numerical_data_node: ElTr.Element) -> SQLDic:
    """Given the ``NumericalData`` node, return a dictionary containing:

- **target_class**: the ``target_class`` EPN-TAP parameter;
- **target_name**: the ``target_name`` EPN-TAP parameter;
- **target_region**: the ``target_region`` EPN-TAP parameter.
- **spase_region**: the ``spase_region`` parameter, added to the EPN-TAP parameters for the purposes of AMDA.
"""

    names = set()
    classes = set()
    regions = set()
    spase_regions = set()
    for obs_region in numerical_data_node.findall('{%s}ObservedRegion' % XMLNS):
        if obs_region.text is None:
            continue
        parts = obs_region.text.split('.')
        # When the second dotted level is a satellite (e.g. 'Jupiter.Io...'),
        # the satellite itself -- not its planet -- is the target.
        is_satellite = len(parts) >= 2 and TARGET_CLASS_DIC.get(parts[1]) == 'satellite'
        offset = 1 if is_satellite else 0
        classes.add(TARGET_CLASS_DIC[parts[offset]])
        names.add('Sun' if parts[offset] == 'Heliosphere' else parts[offset])
        regions.add('.'.join(parts[offset + 1:]))
        spase_regions.add('.'.join(parts))
    return {'target_class': SEP.join(classes) if classes else None,
            'target_name': SEP.join(names) if names else None,
            'target_region': SEP.join(regions) if regions else None,
            'spase_region': SEP.join(spase_regions) if spase_regions else None}
302 | + | ||
303 | + | ||
def get_instru_name_and_host_name(spase_dic: SpaseDic, numerical_data_node: ElTr.Element) -> SQLDic:
    """Given the ``NumericalData`` node, return a dictionary containing:

- **instrument_name**: the ``instrument_name`` EPN-TAP parameter;
- **instrument_host_name**: the ``instrument_host_name`` EPN-TAP parameter.
"""

    header_tag = '{%s}ResourceHeader' % XMLNS
    name_tag = '{%s}ResourceName' % XMLNS
    instru_names = set()
    instru_host_names = set()
    for id_node in numerical_data_node.findall('{%s}InstrumentID' % XMLNS):
        instrument = get_instrument(spase_dic, id_node.text)
        instru_names.add(instrument.find(header_tag).find(name_tag).text)
        observatory_id = instrument.find('{%s}ObservatoryID' % XMLNS).text
        observatory = get_observatory(spase_dic, observatory_id)
        instru_host_names.add(observatory.find(header_tag).find(name_tag).text)
    return {'instrument_name': SEP.join(instru_names) if instru_names else None,
            'instrument_host_name': SEP.join(instru_host_names) if instru_host_names else None}
320 | + | ||
321 | + | ||
def get_types(numerical_data_node: ElTr.Element) -> SQLDic:
    """Given the ``NumericalData`` node, return a dictionary containing:

- **dataproduct_type**: the ``dataproduct_type`` EPN-TAP parameter;
- **spatial_frame_type**: the ``spatial_frame_type`` EPN-TAP parameter;
- **measurement_type**: the ``measurement_type`` EPN-TAP parameter.
- **spase_measurement_type**: the ``spase_measurement_type`` parameter,
  added to the EPN-TAP parameters for the purposes of AMDA.
"""

    # BUG FIX: a debug leftover wrapped this whole function in
    # "with open('log', 'w')" and wrote every UCD into a file named 'log' in
    # the current directory on each call; that side effect has been removed.
    dataproduct_types = set()
    sp_frame_types = set()
    measurement_types = set()
    spase_measurement_type = getattr(numerical_data_node.find('{%s}MeasurementType' % XMLNS), 'text', None)
    for param in numerical_data_node.findall('{%s}Parameter' % XMLNS):
        hints = param.findall('{%s}RenderingHints' % XMLNS)
        dt_nodes = [hint.find('{%s}DisplayType' % XMLNS) for hint in hints]
        for display in [node.text for node in dt_nodes if node is not None and node.text is not None]:
            dataproduct_types.add(DATAPRODUCT_TYPE_DIC[display])
        coord_sys = param.find('{%s}CoordinateSystem' % XMLNS)
        if coord_sys is not None:
            sp_frame_types.add(coord_sys.find('{%s}CoordinateRepresentation' % XMLNS).text.lower())
        measurement_type = param.find('{%s}Ucd' % XMLNS)
        if measurement_type is not None and measurement_type.text is not None:
            measurement_types.add(measurement_type.text)
    return {'dataproduct_type': SEP.join(dataproduct_types) if dataproduct_types else None,
            'spatial_frame_type': SEP.join(sp_frame_types) if sp_frame_types else None,
            'measurement_type': SEP.join(measurement_types) if measurement_types else None,
            'spase_measurement_type': spase_measurement_type}
352 | + | ||
353 | + | ||
def get_times_min_max(numerical_data_node: ElTr.Element) -> SQLDic:
    """Given the ``NumericalData`` node, return a dictionary containing:

- **time_sampling_step_min**: the ``time_sampling_step_min`` EPN-TAP parameter;
- **time_sampling_step_max**: the ``time_sampling_step_max`` EPN-TAP parameter;
- **time_exp_min**: the ``time_exp_min`` EPN-TAP parameter.

Missing values are returned as ``None`` (SQL NULL), present ones as ``str``.
"""

    temporal_description_node = numerical_data_node.find('{%s}TemporalDescription' % XMLNS)

    if temporal_description_node is None:
        return {'time_sampling_step_min': None, 'time_sampling_step_max': None, 'time_exp_min': None}

    def duration_as_str(tag: str) -> Optional[str]:
        # Convert one child duration to a seconds string.
        # BUG FIX: the old code applied str() unconditionally, turning a
        # missing duration (None) into the literal string 'None' instead of
        # keeping it as SQL NULL.
        node = temporal_description_node.find('{%s}%s' % (XMLNS, tag))
        seconds = xml_duration_to_seconds(getattr(node, 'text', None))
        return None if seconds is None else str(seconds)

    return {'time_sampling_step_min': duration_as_str('Cadence_Min'),
            'time_sampling_step_max': duration_as_str('Cadence_Max'),
            'time_exp_min': duration_as_str('Exposure')
            }
374 | + | ||
375 | + | ||
def get_processing_lvl(numerical_data_node: ElTr.Element) -> SQLDic:
    """Given the ``NumericalData`` node, return a dictionary containing:

- **processing_level**: the ``processing_level`` EPN-TAP parameter;
"""

    proc_lvl_node = numerical_data_node.find('{%s}ProcessingLevel' % XMLNS)
    proc_lvl = proc_lvl_node.text if proc_lvl_node is not None else None
    # Unknown or missing levels map to None (SQL NULL).
    return {'processing_level': PROCESSING_LEVEL_DIC.get(proc_lvl)}
384 | + | ||
385 | + | ||
def get_granule_and_parent(gr_node: ElTr.Element) -> Tuple[str, SQLDic]:
    """Given a Granule node, return its ParentID and a dictionary containing all the parameters inside it:

- **obs_id**: the ``obs_id`` EPN-TAP parameter;
- **release_date**: the ``release_date`` EPN-TAP parameter;
- **time_min**: the ``time_min`` EPN-TAP parameter;
- **time_max**: the ``time_max`` EPN-TAP parameter;
- **access_url**: the ``access_url`` EPN-TAP parameter;
- **access_estsize**: the ``access_estsize`` EPN-TAP parameter.

Exits the script when the granule has no usable ResourceID.
"""

    parent_id = getattr(gr_node.find('{%s}ParentID' % XMLNS), 'text', None)
    # 'or ""' also covers a <ResourceID> present but empty (text is None),
    # which previously crashed on None.split().
    resource_id = getattr(gr_node.find('{%s}ResourceID' % XMLNS), 'text', None) or ''
    obs_id = resource_id.split('/')[-1]
    if not obs_id:
        print('Can not get the ResourceID content of a granule. Exiting here.')
        sys.exit()

    release_date = getattr(gr_node.find('{%s}ReleaseDate' % XMLNS), 'text', None)
    tim_min = xml_date_to_jd(getattr(gr_node.find('{%s}StartDate' % XMLNS), 'text', None))
    time_max = xml_date_to_jd(getattr(gr_node.find('{%s}StopDate' % XMLNS), 'text', None))
    # BUG FIX: an ElementTree Element without children is falsy, so 'if src_n'
    # wrongly discarded childless <Source> nodes; compare to None. Also guard
    # the <DataExtent> lookup, which crashed with AttributeError when absent.
    src_n = gr_node.find('{%s}Source' % XMLNS)
    access_url = getattr(src_n.find('{%s}URL' % XMLNS), 'text', None) if src_n is not None else None
    data_extent_node = src_n.find('{%s}DataExtent' % XMLNS) if src_n is not None else None
    access_estsize = getattr(data_extent_node.find('{%s}Quantity' % XMLNS), 'text', None) \
        if data_extent_node is not None else None

    return parent_id, {'obs_id': obs_id,
                       'release_date': release_date,
                       'time_min': tim_min,
                       'time_max': time_max,
                       'access_url': access_url,
                       'access_estsize': int(access_estsize) if access_estsize else None}
418 | + | ||
419 | + | ||
def xml_date_to_jd(xml_date: str) -> Optional[float]:
    """Convert a *XML date* to *Julian day*.

Return ``None`` when ``xml_date`` is ``None``, empty, or not formatted as
``XML_DATE_FORMAT``. The conversion follows the classic Gregorian/Julian
calendar algorithm (Julian calendar assumed before 1582-10-15).
"""

    # BUG FIX: callers pass `getattr(node, 'text', None)`, and strptime(None, ...)
    # raises TypeError, which the `except ValueError` below did not catch.
    if not xml_date:
        return None

    try:
        output_date = datetime.strptime(xml_date, XML_DATE_FORMAT)
    except ValueError:  # Date is not well formatted
        return None

    # January and February count as months 13/14 of the previous year.
    if output_date.month == 1 or output_date.month == 2:
        year_p = output_date.year - 1
        month_p = output_date.month + 12
    else:
        year_p = output_date.year
        month_p = output_date.month

    # this checks where we are in relation to October 15, 1582, the beginning
    # of the Gregorian calendar.
    if ((output_date.year < 1582) or
            (output_date.year == 1582 and output_date.month < 10) or
            (output_date.year == 1582 and output_date.month == 10 and output_date.day < 15)):
        j_day = 0
    else:
        # Gregorian correction term for dropped leap days.
        j_day = 2 - math.trunc(year_p / 100.) + math.trunc(math.trunc(year_p / 100.) / 4.)

    j_day += math.trunc((365.25 * year_p) - 0.75) if year_p < 0 else math.trunc(365.25 * year_p)
    j_day += math.trunc(30.6001 * (month_p + 1)) + output_date.day + 1720994.5
    # Add the fractional day coming from the time-of-day components.
    j_day += output_date.hour/24 + output_date.minute/1440 + output_date.second/86400

    return j_day
449 | + | ||
450 | + | ||
def xml_date_to_sql_date(xml_date: str) -> str:
    """Convert a *XML date* to a *SQL date*."""

    # Parse with the XML format, then re-render with the SQL one.
    parsed_date = datetime.strptime(xml_date, XML_DATE_FORMAT)
    return parsed_date.strftime(SQL_DATE_FORMAT)
455 | + | ||
456 | + | ||
def xml_duration_to_seconds(xml_duration: str) -> float:
    """Convert a *XML (ISO-8601) duration* (e.g. ``P1DT2H``) to seconds.

Months are approximated as 30 days and years as 365 days.
Return ``0`` for an empty/``None`` duration or one that can not be parsed.

BUG FIXES: the return annotation said ``int`` although
``timedelta.total_seconds()`` returns a float; a malformed duration used to
crash with AttributeError because ``regex.match`` can return ``None``.
"""

    if not xml_duration:
        return 0

    regex = re.compile(r'(?P<sign>-?)P(?:(?P<years>\d+)Y)?(?:(?P<months>\d+)M)?(?:(?P<days>\d+)D)?' +
                       r'(?:T(?:(?P<hours>\d+)H)?(?:(?P<minutes>\d+)M)?(?:(?P<seconds>\d+)S)?)?')

    match = regex.match(xml_duration.upper())
    if match is None:  # Not an ISO-8601 duration at all.
        return 0

    # groupdict(0) turns every non-matching component into 0.
    time = match.groupdict(0)
    delta = timedelta(
        days=int(time['days']) + (int(time['months']) * 30) + (int(time['years']) * 365),
        hours=int(time['hours']),
        minutes=int(time['minutes']),
        seconds=int(time['seconds']))

    return (delta * -1 if time['sign'] == "-" else delta).total_seconds()
474 | + | ||
475 | + | ||
def get_parameters(spase_dic: SpaseDic) -> List[SQLDic]:
    """Get all the parameters of the entire dataset.
Return a list containing the granules, where each granule is a dictionary, with:

- **keys**: the EPN-TAP parameter name;
- **values**: the EPN-TAP value corresponding to the parameter name.

Datasets or granules with empty parameters get the corresponding default
value and are reported through ``log()`` at the end.
"""

    datasets = {}
    # dataset key -> set of parameter names that had to be defaulted
    missing_parameters = {}
    nb_elements = len(spase_dic['NumericalData']) + len(spase_dic['NumericalOutput']) + len(spase_dic['Granule'])
    n_dataset = 0

    # First pass: build one EPN-TAP parameter dictionary per dataset node.
    for numerical_data_node in spase_dic['NumericalData'] + spase_dic['NumericalOutput']:
        print('Dataset %d/%d' % (n_dataset, nb_elements), end=' ' * 99 + '\r')
        n_dataset += 1
        try:
            dataset_key = getattr(numerical_data_node.find('{%s}ResourceID' % XMLNS), 'text', None).split('/')[-1]
        except AttributeError:  # <ResourceID> missing: getattr returned None
            print('Can not get the ResourceID content of a dataset. Exiting here.')
            sys.exit()
        dataset = get_region_info(numerical_data_node)
        dataset.update(get_instru_name_and_host_name(spase_dic, numerical_data_node))
        dataset.update(get_types(numerical_data_node))
        dataset.update(get_access_format(numerical_data_node))
        dataset.update(get_times_min_max(numerical_data_node))
        dataset.update(get_processing_lvl(numerical_data_node))

        # Looking for None parameters in each dataset
        for parameter, default_value in DEFAULT_DATASET_VALUES.items():
            if not dataset[parameter]:
                dataset[parameter] = default_value
                if dataset_key not in missing_parameters:
                    missing_parameters[dataset_key] = set()
                missing_parameters[dataset_key].add(parameter)
        datasets[dataset_key] = dataset

    # Second pass: build granules, merging in their parent dataset parameters.
    granules_list = []
    for granule_node in spase_dic['Granule']:
        parent_id, granule = get_granule_and_parent(granule_node)
        dataset_key = parent_id.split('/')[-1]

        # BUG FIX: the progress bar used the global ``nb_files``, which only
        # exists when the module runs as a script (NameError otherwise) and is
        # the wrong denominator anyway; use ``nb_elements`` like the counters.
        print('Granule {:<23.23} {:<18.18} [{:<50.50}]'.format(
            '%d/%d (%.2f%%)' % (n_dataset + 1, nb_elements, 100 * float(n_dataset + 1) / nb_elements),
            dataset_key,
            '.' * int((n_dataset + 1) / nb_elements * 50)), end='\r')

        # Looking for None parameters in each granule
        for parameter, default_value in DEFAULT_GRANULE_VALUES.items():
            if not granule[parameter]:
                granule[parameter] = default_value
                if dataset_key not in missing_parameters:
                    missing_parameters[dataset_key] = set()
                missing_parameters[dataset_key].add(parameter)

        try:
            granule.update(datasets[dataset_key])
        except KeyError:  # orphan granule: keep it, but report the dangling parent
            print('The parent id "%s" of the granule "%s" is not found in the dataset dictionary.'
                  % (parent_id, granule['access_url']))
        granules_list.append(granule)
        n_dataset += 1
    print()
    for bad_dataset, missings in missing_parameters.items():
        log('%s\tmissing %s' % (bad_dataset, ', '.join(missings)))
    return granules_list
542 | + | ||
543 | + | ||
544 | +def write_sql(granules_list): | ||
545 | + """Write a SQL script which insert all the granules in the database.""" | ||
546 | + | ||
547 | + with open(SQL_FILE_PATH, 'w') as sql_file: | ||
548 | + sql_file.write(SQL_HEADER) | ||
549 | + for gr in granules_list: | ||
550 | + keys = ', '.join(gr.keys()) | ||
551 | + values = ', '.join(['NULL' if param is None else "'%s'" % param if isinstance(param, str) else | ||
552 | + str(param) for param in gr.values()]) | ||
553 | + sql_file.write(SQL_ROW % (keys, values)) | ||
554 | + sql_file.write(SQL_FOOTER) | ||
555 | + | ||
556 | + | ||
if __name__ == '__main__':
    # Global log file used by log(); None disables file logging.
    log_file = open(LOG_FILE_PATH, 'w+') if LOG_FILE_PATH else None

    print('Getting number of files in %s...' % SPASE_DIR)
    nb_files = get_nb_files()

    print('Parsing %d files...' % nb_files)
    spase = get_spase()

    # Iterating a dict yields its keys directly; no need to unpack items().
    print('Done. Found these types of data: %s.' % ', '.join(spase))

    print('Loading numerical data...')
    granules = get_parameters(spase)

    print('Creating SQL script...')
    write_sql(granules)

    # BUG FIX: the log file was never closed.
    if log_file:
        log_file.close()

    import subprocess

    # Desktop notification is best-effort: notify-send may not be installed.
    try:
        subprocess.Popen(['notify-send', 'The SQL script %s has been generated.' % SQL_FILE_PATH])
    except FileNotFoundError:
        pass