Commit 8b818b98d7bfd7912b6728e7055c3414f30753c4

Authored by Nathanael Jourdane
1 parent 016e9465
Exists in master

Add all files required to fill DaCHS database.

.gitignore
1 1 __pycache__/
2 2 temp
3 3 .idea/
  4 +
  5 +# Definitely too big:
  6 +DaCHS/amdadb_db.sql
... ...
DaCHS/amdadb_q.rd 0 → 100755
... ... @@ -0,0 +1,85 @@
  1 +<?xml version="1.0" encoding="UTF-8"?>
  2 +<resource schema="amdadb">
  3 + <!-- Metadata describing the dataset -->
  4 + <meta name="title">Planetary and heliophysics plasma data at CDPP/AMDA</meta>
  5 + <meta name="creationDate">2016-08-05T16:00:00</meta>
  6 + <meta name="description" format="plain">Planetary and heliophysics plasma data at CDPP/AMDA</meta>
  7 + <meta name="creator.name">Vincent Genot</meta>
  8 + <meta name="contact.name">Vincent Genot</meta>
  9 + <meta name="contact.email">vincent.genot@irap.omp.eu</meta>
  10 + <meta name="contact.address">IRAP, 9 av. Colonel Roche, 31400 Toulouse, FRANCE</meta>
  11 + <meta name="subject">Virtual observatory</meta>
  12 + <meta name="subject">Plasma physics</meta>
  13 + <meta name="utype">ivo://cdpp.irap/std/EpnCore#schema-2.0</meta> <!-- not tested -->
  14 + <table id="epn_core" onDisk="True" adql="True">
  15 + <meta name="info" infoName="SERVICE_PROTOCOL" infoValue="2.0"> EPN-TAP </meta>
  16 + <meta name="description">Planetary and heliophysics plasma data at CDPP/AMDA</meta>
  17 + <meta name="referenceURL">http://amda.cdpp.eu</meta>
  18 + <meta name="utype">EPN-TAP 2.0</meta>
  19 + <!-- header parameters -->
  20 + <column name="granule_uid" type="text" required="True" ucd="meta.id" description="Granule unique identifier, provides direct access"/>
  21 + <column name="dataproduct_type" type="text" ucd="meta.code.class" description="Organisation of the data product (from enumerated list)"/>
  22 + <column name="target_name" type="text" ucd="meta.id;src" description="Name of target (IAU standard)"/>
  23 + <column name="time_min" type="double precision" ucd="time.start" unit="d" description="Acquisition start time (in JD) (not necessary)"/>
  24 + <column name="time_max" type="double precision" ucd="time.end" unit="d" description="Acquisition stop time (in JD) (not necessary)"/>
  25 + <!-- important parameters -->
  26 + <column name="access_url" type="text" ucd="meta.ref.url;meta.file"/>
  27 + <column name="target_class" type="text" ucd="meta.code.class;src" description="Type of target, from enumerated list"/>
  28 + <column name="target_region" type="text" ucd="meta.id;src"/>
  29 + <column name="spase_region" type="text" ucd="phys.angArea;obs" description="(not necessary)"/>
  30 + <column name="instrument_host_name" type="text" ucd="meta.id;instr.obsty" description="(not necessary)"/>
  31 + <column name="instrument_name" type="text" ucd="meta.id;instr" description="(not necessary)"/>
  32 + <column name="measurement_type" type="text" ucd="meta.ucd" description="(not necessary)"/>
  33 + <column name="spase_measurement_type" type="text" ucd="meta.ucd" description="(not necessary)"/>
  34 + <column name="spatial_frame_type" type="text" ucd="meta.code.class;pos.frame" description="(can be necessary)"/>
  35 + <column name="processing_level" type="integer" ucd="meta.code;obs.calib" required="True"/>
  36 + <column name="release_date" type="date" ucd="time.release"/>
  37 + <column name="access_estsize" type="integer" ucd="phys.size;meta.file" required="True"/>
  38 + <column name="access_format" type="text" ucd="meta.code.mime"/>
  39 + <column name="time_sampling_step_min" type="double precision" ucd="time.interval;stat.min" unit="s" description="Min time sampling step (not necessary)"/>
  40 + <column name="time_sampling_step_max" type="double precision" ucd="time.interval;stat.max" unit="s" description="Max time sampling step (not necessary)"/>
  41 + <column name="time_exp_min" type="double precision" ucd="time.duration;stat.min" unit="s" description="Min integration time (not necessary)"/>
  42 + <!-- redundant or static parameters -->
  43 + <column name="time_exp_max" type="double precision" ucd="time.duration;stat.max" unit="s" description="Max integration time (not necessary)"/>
  44 + <column name="granule_gid" type="text" required="True" ucd="meta.id" description="Group identifier, identical for similar data products"/>
  45 + <column name="obs_id" type="text" required="True" ucd="meta.id" description="Identical for data products related to the same original data"/>
  46 + <column name="creation_date" type="date" ucd="time.creation"/>
  47 + <column name="modification_date" type="date" ucd="time.update"/>
  48 + <column name="service_title" type="text" ucd="meta.title"/>
  49 + <column name="publisher" type="text" ucd="meta.name"/>
  50 + <column name="time_scale" type="text" ucd="time.scale"/>
  51 + <!-- null parameters -->
  52 + <column name="spectral_range_min" type="double precision" ucd="em.freq;stat.min" unit="Hz" description="Min spectral range (not necessary)"/>
  53 + <column name="spectral_range_max" type="double precision" ucd="em.freq;stat.max" unit="Hz" description="Max spectral range (not necessary)"/>
  54 + <column name="spectral_sampling_step_min" type="double precision" ucd="em.freq.step;stat.min" unit="Hz" description="Min spectral sampling step (not necessary)"/>
  55 + <column name="spectral_sampling_step_max" type="double precision" ucd="em.freq.step;stat.max" unit="Hz" description="Max spectral sampling step (not necessary)"/>
  56 + <column name="spectral_resolution_min" type="double precision" ucd="spect.resolution;stat.min" unit="Hz" description="Min spectral resolution (not necessary)"/>
  57 + <column name="spectral_resolution_max" type="double precision" ucd="spect.resolution;stat.max" unit="Hz" description="Max spectral resolution (not necessary)"/>
  58 + <column name="c1min" type="double precision" ucd="pos;stat.min" unit="deg" description="(not necessary)"/>
  59 + <column name="c1max" type="double precision" ucd="pos;stat.max" unit="deg" description="(not necessary)"/>
  60 + <column name="c2min" type="double precision" ucd="pos;stat.min" unit="deg" description="(not necessary)"/>
  61 + <column name="c2max" type="double precision" ucd="pos;stat.max" unit="deg" description="(not necessary)"/>
  62 + <column name="c3min" type="double precision" ucd="pos;stat.min" unit="" description="(not necessary)"/>
  63 + <column name="c3max" type="double precision" ucd="pos;stat.max" unit="" description="(not necessary)"/>
  64 + <column name="c1_resol_min" type="double precision" ucd="pos.resolution;stat.min" unit="deg" description="(not necessary)"/>
  65 + <column name="c1_resol_max" type="double precision" ucd="pos.resolution;stat.max" unit="deg" description="(not necessary)"/>
  66 + <column name="c2_resol_min" type="double precision" ucd="pos.resolution;stat.min" unit="deg" description="Min resolution in latitude"/>
  67 + <column name="c2_resol_max" type="double precision" ucd="pos.resolution;stat.max" unit="deg" description="(not necessary)"/>
  68 + <column name="c3_resol_min" type="double precision" ucd="pos.resolution;stat.min" unit="" description="(not necessary)"/>
  69 + <column name="c3_resol_max" type="double precision" ucd="pos.resolution;stat.max" unit="" description="(not necessary)"/>
  70 + <column name="s_region" type="text" ucd="phys.angArea;obs" description="(not necessary)"/>
  71 + <column name="incidence_min" type="double precision" ucd="pos.posAng;stat.min" unit="deg" description="(not necessary)"/>
  72 + <column name="incidence_max" type="double precision" ucd="pos.posAng;stat.max" unit="deg" description="(not necessary)"/>
  73 + <column name="emergence_min" type="double precision" ucd="pos.posAng;stat.min" unit="deg" description="(not necessary)"/>
  74 + <column name="emergence_max" type="double precision" ucd="pos.posAng;stat.max" unit="deg" description="(not necessary)"/>
  75 + <column name="phase_min" type="double precision" ucd="pos.phaseAng;stat.min" unit="deg" description="(not necessary)"/>
  76 + <column name="phase_max" type="double precision" ucd="pos.phaseAng;stat.max" unit="deg" description="(not necessary)"/>
  77 + </table>
  78 + <data id="import">
  79 + <make table="epn_core"/>
  80 + </data>
  81 + <data id="collection" auto="false">
  82 + <register services="__system__/tap#run"/>
  83 + <make table="epn_core"/>
  84 + </data>
  85 +</resource>
... ...
DaCHS/amdadb_view.sql 0 → 100644
... ... @@ -0,0 +1,73 @@
-- SQL procedure to define the amdadb epn_core view (built on top of amdadb.data_table).
  2 +-- Name: amdadb; Type: SCHEMA; Schema: amdadb; Owner: postgres
  3 +
  4 +SET client_encoding = 'UTF8';
  5 +
  6 +DROP VIEW IF EXISTS amdadb.epn_core CASCADE;
  7 +CREATE VIEW amdadb.epn_core AS SELECT
  8 + -- header parameters
  9 + CAST(obs_id || '_cdf' AS TEXT) AS granule_uid,
  10 + dataproduct_type,
  11 + target_name,
  12 + time_min,
  13 + time_max,
  14 + -- important parameters
  15 + access_url,
  16 + target_class,
  17 + target_region,
  18 + spase_region,
  19 + instrument_host_name,
  20 + instrument_name,
  21 + measurement_type,
  22 + spase_measurement_type,
  23 + spatial_frame_type,
  24 + processing_level,
  25 + release_date,
  26 + access_estsize,
  27 + access_format,
  28 + time_sampling_step_min,
  29 + time_sampling_step_max,
  30 + time_exp_min,
  31 + -- redundant or static parameters
  32 + CAST(time_exp_min AS DOUBLE PRECISION) AS time_exp_max,
  33 + CAST('cdf' AS TEXT) AS granule_gid,
  34 + obs_id,
  35 + -- CAST('application/x-netcdf' AS TEXT) AS access_format,
  36 + CAST(release_date AS DATE) AS creation_date,
  37 + CAST(release_date AS DATE) AS modification_date,
  38 + CAST('AMDADB' AS TEXT) AS service_title,
  39 + CAST('CDPP' AS TEXT) AS publisher,
  40 + CAST('UTC' AS TEXT) AS time_scale,
  41 + -- null parameters
  42 + CAST(NULL AS DOUBLE PRECISION) AS spectral_range_min,
  43 + CAST(NULL AS DOUBLE PRECISION) AS spectral_range_max,
  44 + CAST(NULL AS DOUBLE PRECISION) AS spectral_sampling_step_min,
  45 + CAST(NULL AS DOUBLE PRECISION) AS spectral_sampling_step_max,
  46 + CAST(NULL AS DOUBLE PRECISION) AS spectral_resolution_min,
  47 + CAST(NULL AS DOUBLE PRECISION) AS spectral_resolution_max,
  48 + CAST(NULL AS DOUBLE PRECISION) AS c1min,
  49 + CAST(NULL AS DOUBLE PRECISION) AS c1max,
  50 + CAST(NULL AS DOUBLE PRECISION) AS c2min,
  51 + CAST(NULL AS DOUBLE PRECISION) AS c2max,
  52 + CAST(NULL AS DOUBLE PRECISION) AS c3min,
  53 + CAST(NULL AS DOUBLE PRECISION) AS c3max,
  54 + CAST(NULL AS DOUBLE PRECISION) AS c1_resol_min,
  55 + CAST(NULL AS DOUBLE PRECISION) AS c1_resol_max,
  56 + CAST(NULL AS DOUBLE PRECISION) AS c2_resol_min,
  57 + CAST(NULL AS DOUBLE PRECISION) AS c2_resol_max,
  58 + CAST(NULL AS DOUBLE PRECISION) AS c3_resol_min,
  59 + CAST(NULL AS DOUBLE PRECISION) AS c3_resol_max,
  60 + CAST(NULL AS TEXT) AS s_region,
  61 + CAST(NULL AS DOUBLE PRECISION) AS incidence_min,
  62 + CAST(NULL AS DOUBLE PRECISION) AS incidence_max,
  63 + CAST(NULL AS DOUBLE PRECISION) AS emergence_min,
  64 + CAST(NULL AS DOUBLE PRECISION) AS emergence_max,
  65 + CAST(NULL AS DOUBLE PRECISION) AS phase_min,
  66 + CAST(NULL AS DOUBLE PRECISION) AS phase_max,
  67 + -- parameters added to prevent warnings in the q.rd validator
  68 + CAST(NULL AS TEXT) AS thumbnail_url,
  69 + CAST(NULL AS TEXT) AS file_name,
  70 + CAST(NULL AS TEXT) AS species,
  71 + CAST(NULL AS TEXT) AS feature_name,
  72 + CAST(NULL AS TEXT) AS bib_reference
  73 +FROM amdadb.data_table;
... ...
DaCHS/build_BDD.py 0 → 100755
... ... @@ -0,0 +1,576 @@
  1 +#!/usr/bin/env python
  2 +# -*- coding: utf-8 -*-
  3 +
"""This script inspects a SPASE dataset folder (containing Granules, NumericalData, Instrument and
Observatory folders), then generates a SQL script which inserts all the granules into a database,
formatted as epn-tap parameters.
  7 +
  8 +See
  9 +http://spase-group.org/data/reference/spase-2_2_6/ for more information about spase specification,
  10 +and https://voparis-confluence.obspm.fr/display/VES/EPN-TAP+V2.0+parameters for more information
  11 +about epn-tap-v2 specification."""
  12 +
  13 +import math
  14 +import re
  15 +import xml.etree.ElementTree as ElTr
  16 +import os.path as op
  17 +from os import walk
  18 +from datetime import datetime, timedelta
  19 +from typing import Tuple, List, Dict, Optional
  20 +import sys
  21 +
  22 +# Type aliases
  23 +SQLDic = Dict[str, object]
  24 +SpaseDic = Dict[str, List[ElTr.Element]]
  25 +
  26 +# Paths
  27 +WORKING_DIR = op.dirname(op.dirname(op.abspath(__file__)))
  28 +OUTPUT_PATH = op.join(WORKING_DIR, 'SERVER')
  29 +SQL_FILE_PATH = op.join(OUTPUT_PATH, 'amdadb_db.sql')
  30 +SPASE_DIR = op.join(WORKING_DIR, 'DATA')
  31 +LOG_FILE_PATH = op.join(WORKING_DIR, 'build_granules.log') # Set to None if you want to log in stdout instead of a file
  32 +
  33 +# XML and SQL formats
  34 +XMLNS = 'http://www.spase-group.org/data/schema'
  35 +XML_DATE_FORMAT = '%Y-%m-%dT%H:%M:%SZ'
  36 +SQL_DATE_FORMAT = '%Y-%m-%d'
  37 +SEP = '#'
  38 +
  39 +# Dictionaries of values
  40 +DATAPRODUCT_TYPE_DIC = {'Image': 'im', 'Plasmagram': 'ds', 'Spectrogram': 'ds', 'StackPlot': 'ts',
  41 + 'TimeSeries': 'ts', 'time_series': 'ts', 'WaveForm': 'ts'}
  42 +
  43 +PROCESSING_LEVEL_DIC = {'Calibrated': 3, 'Raw': 1, 'Uncalibrated': 5}
  44 +
  45 +# Based on http://spase-group.org/
  46 +TARGET_CLASS_DIC = {'Heliosphere': 'interplanetary_medium', 'Interstellar': 'galaxy',
  47 + 'Earth': 'planet', 'Saturn': 'planet', 'Mercury': 'planet', 'Uranus': 'planet',
  48 + 'Mars': 'planet', 'Neptune': 'planet', 'Jupiter': 'planet', 'Venus': 'planet',
  49 + 'Moon': 'satellite', 'Callisto': 'satellite', 'Europa': 'satellite',
  50 + 'Ganymede': 'satellite', 'Dione': 'satellite', 'Enceladus': 'satellite',
  51 + 'Mimas': 'satellite', 'Miranda': 'satellite', 'Phobos': 'satellite',
  52 + 'Iapetus': 'satellite', 'Titania': 'satellite', 'Oberon': 'satellite',
  53 + 'Puck': 'satellite', 'Deimos': 'satellite', 'Ariel': 'satellite',
  54 + 'Umbriel': 'satellite', 'Rhea': 'satellite', 'Tethys': 'satellite',
  55 + 'Titan': 'satellite', 'Io': 'satellite',
  56 + 'Pluto': 'dwarf_planet',
  57 + 'Comet': 'comet'
  58 + }
  59 +
# Mapping from the spase <Format> value to the corresponding mime-type.
# None means no sensible mime-type exists (e.g. physical hardcopies).
MIME_TYPE_LIST = {'AVI': 'video/x-msvideo',
                  'Binary': 'application/octet-stream',
                  'CDF': 'application/x-cdf-istp',
                  'CEF': 'application/x-cef1',
                  'CEF1': 'application/x-cef1',
                  'CEF2': 'application/x-cef2',
                  'Excel': 'application/vnd.ms-excel',
                  'FITS': 'application/x-fits-bintable',
                  'GIF': 'image/gif',
                  'HDF': 'application/x-hdf',
                  'HDF4': 'application/x-hdf',
                  'HDF5': 'application/x-hdf',
                  'HTML': 'text/html',
                  'Hardcopy': None,
                  'Hardcopy.Film': None,
                  'Hardcopy.Microfiche': None,
                  'Hardcopy.Microfilm': None,
                  'Hardcopy.Photograph': None,
                  'Hardcopy.PhotographicPlate': None,
                  'Hardcopy.Print': None,
                  'IDFS': None,
                  'IDL': 'application/octet-stream',
                  'JPEG': 'image/jpeg',  # fixed: the value used to carry a trailing space
                  'MATLAB_4': 'application/octet-stream',
                  'MATLAB_6': 'application/octet-stream',
                  'MATLAB_7': 'application/octet-stream',
                  'MPEG': 'video/mpeg',
                  'NCAR': None,
                  'NetCDF': 'application/x-netcdf',
                  'PDF': 'application/pdf',
                  'PNG': 'image/png',
                  'Postscript': 'application/postscript',
                  'QuickTime': 'video/quicktime',
                  'TIFF': 'image/tiff',
                  'Text': 'text/plain',
                  'Text.ASCII': 'text/plain',
                  'Text.Unicode': 'text/plain',
                  'UDF': None,
                  'VOTable': 'application/x-votable+xml',
                  'XML': 'text/xml'}
  100 +
  101 +# All default SQL values for missing parameters in dataset
  102 +DEFAULT_DATASET_VALUES = {
  103 + 'dataproduct_type': 'Unknown',
  104 + 'target_name': 'Unknown',
  105 + 'target_class': 'Unknown',
  106 + 'target_region': None,
  107 + 'spase_region': None,
  108 + 'instrument_host_name': None,
  109 + 'instrument_name': None,
  110 + 'measurement_type': None,
  111 + 'spatial_frame_type': None,
  112 + 'processing_level': 0,
  113 + 'time_sampling_step_min': None,
  114 + 'time_sampling_step_max': None,
  115 + 'time_exp_min': None,
  116 + 'access_format': 'application/x-cdf-istp'
  117 + }
  118 +
# All default SQL values for missing parameters in granule
DEFAULT_GRANULE_VALUES = {
    # obs_id: if missing, the script exits directly.
    'time_min': 0.0,
    'time_max': 0.0,
    'access_url': None,
    'access_estsize': 0,
    # NOTE(review): 'DD-MM-YYYY' here does not match SQL_DATE_FORMAT ('%Y-%m-%d');
    # PostgreSQL may still parse it depending on DateStyle -- confirm the intended form.
    'release_date': '01-01-0001'
    }
  128 +
  129 +# SQL code
  130 +SQL_HEADER = '''-- Generated by build_BDD.py on %s.
  131 +-- SQL procedure to define amdadb data table. Other parameters comes in the epn_core view.
  132 +-- Name: amdadb; Type: SCHEMA; Schema: amdadb; Owner: postgres
  133 +
  134 +DROP SCHEMA IF EXISTS amdadb cascade;
  135 +CREATE SCHEMA amdadb;
  136 +SET search_path = public, pg_catalog;
  137 +SET default_tablespace = '';
  138 +SET default_with_oids = false;
  139 +SET client_encoding = 'UTF8';
  140 +
  141 +-- Name: data_table; Type: TABLE; Schema: amdadb; Owner: postgres; Tablespace:
  142 +CREATE TABLE amdadb.data_table (
  143 + -- header parameters
  144 + id SERIAL PRIMARY KEY,
  145 + obs_id TEXT,
  146 + dataproduct_type TEXT,
  147 + target_name TEXT,
  148 + time_min DOUBLE PRECISION, -- date as JD
  149 + time_max DOUBLE PRECISION, -- date as JD
  150 + -- important parameters
  151 + access_url TEXT,
  152 + target_class TEXT,
  153 + target_region TEXT,
  154 + spase_region TEXT,
  155 + instrument_host_name TEXT,
  156 + instrument_name TEXT,
  157 + measurement_type TEXT,
  158 + spase_measurement_type TEXT,
  159 + spatial_frame_type TEXT,
  160 + processing_level INTEGER,
  161 + release_date DATE,
  162 + access_estsize INTEGER,
  163 + access_format TEXT,
  164 + time_sampling_step_min DOUBLE PRECISION, -- duration in seconds
  165 + time_sampling_step_max DOUBLE PRECISION, -- duration in seconds
  166 + time_exp_min DOUBLE PRECISION -- duration in seconds
  167 +);
  168 +
  169 +''' % datetime.now().strftime('%c')
  170 +
  171 +SQL_ROW = 'INSERT INTO amdadb.data_table(%s) VALUES (%s);\n'
  172 +
  173 +SQL_FOOTER = '''REVOKE ALL ON SCHEMA "amdadb" FROM PUBLIC;
  174 +REVOKE ALL ON SCHEMA "amdadb" FROM postgres;
  175 +GRANT ALL ON SCHEMA "amdadb" TO postgres;
  176 +GRANT ALL PRIVILEGES ON SCHEMA amdadb TO gavo WITH GRANT OPTION;
  177 +GRANT ALL PRIVILEGES ON SCHEMA amdadb TO gavoadmin WITH GRANT OPTION;
  178 +GRANT ALL PRIVILEGES ON amdadb.data_table TO gavo WITH GRANT OPTION;
  179 +GRANT ALL PRIVILEGES ON amdadb.data_table TO gavoadmin WITH GRANT OPTION;'''
  180 +
  181 +
def log(message: str) -> None:
    """Record a warning, either in the module-level log file or on stdout.

- ``message``: The text to record.
"""

    if not log_file:
        print(message)
    else:
        log_file.write('%s\n' % message)
  192 +
  193 +
def get_nb_files() -> int:
    """Count the files under the ``SPASE_DIR`` tree.

Used to size the progress bar before parsing starts.

- ``return``: the total number of files found.
"""

    # walk() yields (dir_path, dir_names, file_names); only the file count matters,
    # and a generator avoids materializing an intermediate list inside sum().
    return sum(len(files) for _, _, files in walk(SPASE_DIR))
  199 +
  200 +
def get_spase() -> Optional[SpaseDic]:
    """Get all the spase files

- ``return``: a dictionary, where:

    - **key** = dataset type ('numerical_data', 'granules', etc) ;
    - **value** = A list of spase ElementTree nodes.

Returns ``None`` (after printing a message) when a file can not be read or when no
spase node was collected at all. Relies on the module-level ``nb_files`` value to
render the progress bar.
"""

    spase_dic = {}
    n_file = 0
    for dir_path, _, files in walk(SPASE_DIR):
        for file_path in [op.join(dir_path, file_name) for file_name in files]:
            try:
                root = ElTr.parse(file_path).getroot()
            except FileNotFoundError:
                # NOTE(review): paths come from walk() so they normally exist;
                # ElTr.ParseError looks like the intended exception here (the file
                # content is printed for inspection) -- confirm.
                print('\nThe spase file is not found on %s.\n' % file_path)
                with open(file_path) as spase_file:
                    print(spase_file.read())
                return
            for child in root:
                # Tags look like '{namespace}Key': keep only the local name.
                key = str(child.tag).split('}')[-1]
                if key != 'Version':
                    if key not in spase_dic:
                        spase_dic[key] = []

                    spase_dic[key].append(child)

            # Single-line progress bar, rewritten in place via the '\r' line ending.
            print('Parsed {:<23.23} {:<19.19} [{:<50.50}]'.format(
                '%d/%d (%.2f%%)' % (n_file + 1, nb_files, 100 * float(n_file + 1) / nb_files),
                op.splitext(op.basename(file_path))[0],
                '.' * int((n_file + 1) / nb_files * 50)), end='\r')
            n_file += 1
    print()

    if not spase_dic:
        print('The SPASE dictionary is empty, please check the SPASE folder: %s.' % SPASE_DIR)
        return

    return spase_dic
  241 +
  242 +
def get_observatory(spase_dic: SpaseDic, observatory_id: str) -> ElTr.Element:
    """Look up an observatory by its spase resource identifier.

- ``spase_dic``: the dictionary of spase nodes built by ``get_spase()``;
- ``observatory_id``: the ``ResourceID`` text of the wanted observatory;
- ``return``: the matching *Observatory* ElementTree node (first match wins).
"""

    observatories = spase_dic['Observatory']
    identifiers = [node.find('{%s}ResourceID' % XMLNS).text for node in observatories]
    return observatories[identifiers.index(observatory_id)]
  250 +
  251 +
def get_instrument(spase_dic: SpaseDic, instrument_id: str) -> ElTr.Element:
    """Look up an instrument by its spase resource identifier.

- ``spase_dic``: the dictionary of spase nodes built by ``get_spase()``;
- ``instrument_id``: the ``ResourceID`` text of the wanted instrument;
- ``return``: the matching *Instrument* ElementTree node (first match wins).
"""

    instruments = spase_dic['Instrument']
    identifiers = [node.find('{%s}ResourceID' % XMLNS).text for node in instruments]
    return instruments[identifiers.index(instrument_id)]
  259 +
  260 +
def get_access_format(numerical_data_node: ElTr.Element) -> SQLDic:
    """Given the ``NumericalData`` node, return a dictionary containing the access format (mime-type).

- ``access_format``: the mime-type matching the spase ``Format`` value(s), or None
    when the (possibly joined) value has no entry in ``MIME_TYPE_LIST``.
"""

    access_formats = set()
    for access_info in numerical_data_node.findall('{%s}AccessInformation' % XMLNS):
        spase_format_node = access_info.find('{%s}Format' % XMLNS)
        # `is not None` is required: an ElementTree Element with no children is falsy,
        # so a plain truth test wrongly skipped <Format> leaf nodes that carry text.
        if spase_format_node is not None and spase_format_node.text:
            access_formats.add(spase_format_node.text)

    # Several distinct formats get joined with SEP; such a joined key is unlikely to
    # exist in MIME_TYPE_LIST and then resolves to None.
    access_format = SEP.join(access_formats)
    try:
        return {'access_format': MIME_TYPE_LIST[access_format]}
    except KeyError:
        return {'access_format': None}
  275 +
  276 +
def get_region_info(numerical_data_node: ElTr.Element) -> SQLDic:
    """Given the ``NumericalData`` node, return a dictionary containing:

- **target_class**: the ``target_class`` EPN-TAP parameter;
- **target_name**: the ``target_name`` EPN-TAP parameter;
- **target_region**: the ``target_region`` EPN-TAP parameter;
- **spase_region**: the ``spase_region`` parameter, added to the EPN-TAP parameters for the purposes of AMDA.
"""

    names, classes, regions, spase_regions = set(), set(), set(), set()
    for region_node in numerical_data_node.findall('{%s}ObservedRegion' % XMLNS):
        if region_node.text is None:
            continue
        parts = region_node.text.split('.')
        # A path like 'Jupiter.Io.<region>' is rooted on the satellite (index 1);
        # every other path is rooted on its first component.
        start = 1 if len(parts) >= 2 and TARGET_CLASS_DIC.get(parts[1]) == 'satellite' else 0
        classes.add(TARGET_CLASS_DIC[parts[start]])
        names.add('Sun' if parts[start] == 'Heliosphere' else parts[start])
        regions.add('.'.join(parts[start + 1:]))
        spase_regions.add('.'.join(parts))
    return {'target_class': SEP.join(classes) if classes else None,
            'target_name': SEP.join(names) if names else None,
            'target_region': SEP.join(regions) if regions else None,
            'spase_region': SEP.join(spase_regions) if spase_regions else None}
  302 +
  303 +
def get_instru_name_and_host_name(spase_dic: SpaseDic, numerical_data_node: ElTr.Element) -> SQLDic:
    """Given the ``NumericalData`` node, return a dictionary containing:

- **instrument_name**: the ``instrument_name`` EPN-TAP parameter;
- **instrument_host_name**: the ``instrument_host_name`` EPN-TAP parameter.
"""

    def resource_name(node: ElTr.Element) -> str:
        # The human-readable name lives under ResourceHeader/ResourceName.
        return node.find('{%s}ResourceHeader' % XMLNS).find('{%s}ResourceName' % XMLNS).text

    names = set()
    host_names = set()
    for id_node in numerical_data_node.findall('{%s}InstrumentID' % XMLNS):
        instrument = get_instrument(spase_dic, id_node.text)
        names.add(resource_name(instrument))
        observatory = get_observatory(spase_dic, instrument.find('{%s}ObservatoryID' % XMLNS).text)
        host_names.add(resource_name(observatory))
    return {'instrument_name': SEP.join(names) if names else None,
            'instrument_host_name': SEP.join(host_names) if host_names else None}
  320 +
  321 +
def get_types(numerical_data_node: ElTr.Element) -> SQLDic:
    """Given the ``NumericalData`` node, return a dictionary containing:

- **dataproduct_type**: the ``dataproduct_type`` EPN-TAP parameter;
- **spatial_frame_type**: the ``spatial_frame_type`` EPN-TAP parameter;
- **measurement_type**: the ``measurement_type`` EPN-TAP parameter;
- **spase_measurement_type**: the ``spase_measurement_type`` parameter,
    added to the EPN-TAP parameters for the purposes of AMDA.
"""

    # A leftover debug dump (`with open('log', 'w')` + f_out.write of each Ucd) was
    # removed here: it silently truncated a file named 'log' in the CWD on every call.
    dataproduct_types = set()
    sp_frame_types = set()
    measurement_types = set()
    spase_measurement_type = getattr(numerical_data_node.find('{%s}MeasurementType' % XMLNS), 'text', None)
    for param in numerical_data_node.findall('{%s}Parameter' % XMLNS):
        hints = param.findall('{%s}RenderingHints' % XMLNS)
        dt_nodes = [hint.find('{%s}DisplayType' % XMLNS) for hint in hints]
        for display in [node.text for node in dt_nodes if node is not None and node.text is not None]:
            dataproduct_types.add(DATAPRODUCT_TYPE_DIC[display])
        coord_sys = param.find('{%s}CoordinateSystem' % XMLNS)
        if coord_sys is not None:
            sp_frame_types.add(coord_sys.find('{%s}CoordinateRepresentation' % XMLNS).text.lower())
        measurement_type = param.find('{%s}Ucd' % XMLNS)
        if measurement_type is not None and measurement_type.text is not None:
            measurement_types.add(measurement_type.text)
    return {'dataproduct_type': SEP.join(dataproduct_types) if dataproduct_types else None,
            'spatial_frame_type': SEP.join(sp_frame_types) if sp_frame_types else None,
            'measurement_type': SEP.join(measurement_types) if measurement_types else None,
            'spase_measurement_type': spase_measurement_type}
  352 +
  353 +
def get_times_min_max(numerical_data_node: ElTr.Element) -> SQLDic:
    """Given the ``NumericalData`` node, return a dictionary containing:

- **time_sampling_step_min**: the ``time_sampling_step_min`` EPN-TAP parameter;
- **time_sampling_step_max**: the ``time_sampling_step_max`` EPN-TAP parameter;
- **time_exp_min**: the ``time_exp_min`` EPN-TAP parameter.

NOTE(review): when ``TemporalDescription`` is present the three values are returned
as *strings* (``str()`` of a number of seconds, '0' when the sub-node is missing),
while the fallback branch returns ``None`` -- the DB columns are DOUBLE PRECISION,
so confirm the downstream SQL generation copes with both forms.
"""

    temporal_description_node = numerical_data_node.find('{%s}TemporalDescription' % XMLNS)

    # No temporal description at all: every duration parameter is unknown.
    if temporal_description_node is None:
        return {'time_sampling_step_min': None, 'time_sampling_step_max': None, 'time_exp_min': None}

    # Missing sub-nodes yield None text, which xml_duration_to_seconds maps to 0.
    return {'time_sampling_step_min': str(xml_duration_to_seconds(getattr(temporal_description_node.find(
        '{%s}%s' % (XMLNS, 'Cadence_Min')), 'text', None))),
            'time_sampling_step_max': str(xml_duration_to_seconds(getattr(temporal_description_node.find(
                '{%s}%s' % (XMLNS, 'Cadence_Max')), 'text', None))),
            'time_exp_min': str(xml_duration_to_seconds(getattr(temporal_description_node.find(
                '{%s}%s' % (XMLNS, 'Exposure')), 'text', None)))
            }
  374 +
  375 +
def get_processing_lvl(numerical_data_node: ElTr.Element) -> SQLDic:
    """Given the ``NumericalData`` node, return a dictionary containing:

- **processing_level**: the ``processing_level`` EPN-TAP parameter, or None when the
    spase ``ProcessingLevel`` is absent or not a known level name.
"""

    level_node = numerical_data_node.find('{%s}ProcessingLevel' % XMLNS)
    level_name = level_node.text if level_node is not None else None
    return {'processing_level': PROCESSING_LEVEL_DIC.get(level_name, None)}
  384 +
  385 +
def get_granule_and_parent(gr_node: ElTr.Element) -> Tuple[Optional[str], SQLDic]:
    """Given a Granule node, return its ``ParentID`` and a dictionary containing all the
parameters inside it:

- **obs_id**: the ``obs_id`` EPN-TAP parameter;
- **release_date**: the ``release_date`` EPN-TAP parameter;
- **time_min**: the ``time_min`` EPN-TAP parameter (Julian day);
- **time_max**: the ``time_max`` EPN-TAP parameter (Julian day);
- **access_url**: the ``access_url`` EPN-TAP parameter;
- **access_estsize**: the ``access_estsize`` EPN-TAP parameter.

Exits the whole script when the granule carries no usable ``ResourceID``.
"""

    parent_id = getattr(gr_node.find('{%s}ParentID' % XMLNS), 'text', None)
    obs_id = getattr(gr_node.find('{%s}ResourceID' % XMLNS), 'text', '').split('/')[-1]
    if not obs_id:
        print('Can not get the ResourceID content of a granule. Exiting here.')
        sys.exit()

    release_date = getattr(gr_node.find('{%s}ReleaseDate' % XMLNS), 'text', None)
    time_min = xml_date_to_jd(getattr(gr_node.find('{%s}StartDate' % XMLNS), 'text', None))
    time_max = xml_date_to_jd(getattr(gr_node.find('{%s}StopDate' % XMLNS), 'text', None))

    # Explicit `is not None` tests: ElementTree elements are falsy when they have no
    # children, and the old code raised AttributeError (None.find) whenever <Source>
    # or <DataExtent> was missing.
    src_n = gr_node.find('{%s}Source' % XMLNS)
    access_url = getattr(src_n.find('{%s}URL' % XMLNS), 'text', None) if src_n is not None else None
    data_extent_node = src_n.find('{%s}DataExtent' % XMLNS) if src_n is not None else None
    access_estsize = getattr(data_extent_node.find('{%s}Quantity' % XMLNS), 'text', None) \
        if data_extent_node is not None else None

    return parent_id, {'obs_id': obs_id,
                       'release_date': release_date,
                       'time_min': time_min,
                       'time_max': time_max,
                       'access_url': access_url,
                       'access_estsize': int(access_estsize) if access_estsize else None}
  418 +
  419 +
def xml_date_to_jd(xml_date: Optional[str]) -> Optional[float]:
    """Convert a *XML date* to *Julian day*.

- ``xml_date``: a date formatted as ``XML_DATE_FORMAT``, or ``None``;
- ``return``: the Julian day as a float, or ``None`` when the date is missing or
    not well formatted.
"""

    # Callers pass getattr(node, 'text', None): a missing date used to crash
    # strptime with an uncaught TypeError, so bail out explicitly on None.
    if xml_date is None:
        return None

    try:
        output_date = datetime.strptime(xml_date, XML_DATE_FORMAT)
    except ValueError:  # Date is not well formatted
        return None

    # January and February count as months 13 and 14 of the previous year.
    if output_date.month == 1 or output_date.month == 2:
        year_p = output_date.year - 1
        month_p = output_date.month + 12
    else:
        year_p = output_date.year
        month_p = output_date.month

    # this checks where we are in relation to October 15, 1582, the beginning
    # of the Gregorian calendar.
    if ((output_date.year < 1582) or
            (output_date.year == 1582 and output_date.month < 10) or
            (output_date.year == 1582 and output_date.month == 10 and output_date.day < 15)):
        j_day = 0
    else:
        # Gregorian century leap-year correction.
        j_day = 2 - math.trunc(year_p / 100.) + math.trunc(math.trunc(year_p / 100.) / 4.)

    j_day += math.trunc((365.25 * year_p) - 0.75) if year_p < 0 else math.trunc(365.25 * year_p)
    j_day += math.trunc(30.6001 * (month_p + 1)) + output_date.day + 1720994.5
    # Fraction of day from the time-of-day components.
    j_day += output_date.hour/24 + output_date.minute/1440 + output_date.second/86400

    return j_day
  449 +
  450 +
def xml_date_to_sql_date(xml_date: str) -> str:
    """Reformat a *XML date* string as a *SQL date* string."""

    parsed = datetime.strptime(xml_date, XML_DATE_FORMAT)
    return parsed.strftime(SQL_DATE_FORMAT)
  455 +
  456 +
def xml_duration_to_seconds(xml_duration: Optional[str]) -> float:
    """Convert a *XML (ISO-8601) duration* (e.g. ``PT1M``, ``-P1DT2H``) to seconds.

Months are approximated as 30 days and years as 365 days.

- ``xml_duration``: the duration string, or ``None``;
- ``return``: the number of seconds, 0 when the input is empty, missing or not a
    parsable duration.
"""

    if not xml_duration:
        return 0

    regex = re.compile(r'(?P<sign>-?)P(?:(?P<years>\d+)Y)?(?:(?P<months>\d+)M)?(?:(?P<days>\d+)D)?' +
                       r'(?:T(?:(?P<hours>\d+)H)?(?:(?P<minutes>\d+)M)?(?:(?P<seconds>\d+)S)?)?')

    match = regex.match(xml_duration.upper())
    if match is None:
        # Not an ISO-8601 duration (old code crashed on None.groupdict here):
        # treat it like an unknown duration.
        return 0

    time = match.groupdict(0)  # Missing components default to 0.
    delta = timedelta(
        days=int(time['days']) + (int(time['months']) * 30) + (int(time['years']) * 365),
        hours=int(time['hours']),
        minutes=int(time['minutes']),
        seconds=int(time['seconds']))

    # total_seconds() returns a float, hence the float return annotation.
    return (delta * -1 if time['sign'] == "-" else delta).total_seconds()
  474 +
  475 +
def get_parameters(spase_dic: SpaseDic) -> List[SQLDic]:
    """Get all the parameters of the entire dataset.

    Return a list containing the granules, where each granule is a dictionary, with:

    - **keys**: the EPN-TAP parameter name;
    - **values**: the EPN-TAP value corresponding to the parameter name.

    Side effects: prints a progress line per element, exits the process if a
    dataset has no readable ResourceID, and calls log() once per dataset that
    needed default values substituted for missing parameters.
    """

    datasets = {}            # dataset_key -> merged dataset-level parameters
    missing_parameters = {}  # dataset_key -> set of parameter names that were defaulted
    # Total count of datasets + granules, used for the progress display.
    nb_elements = len(spase_dic['NumericalData']) + len(spase_dic['NumericalOutput']) + len(spase_dic['Granule'])
    n_dataset = 0  # running counter over both loops (datasets, then granules)

    # First pass: build the per-dataset parameter dictionaries.
    for numerical_data_node in spase_dic['NumericalData'] + spase_dic['NumericalOutput']:
        # Trailing spaces + '\r' overwrite the previous progress line in place.
        print('Dataset %d/%d' % (n_dataset, nb_elements), end=' ' * 99 + '\r')
        n_dataset += 1
        try:
            # The last path segment of the ResourceID identifies the dataset.
            # getattr(..., 'text', None) yields None when the element is
            # missing; .split then raises AttributeError, caught below.
            dataset_key = getattr(numerical_data_node.find('{%s}ResourceID' % XMLNS), 'text', None).split('/')[-1]
        except AttributeError:
            print('Can not get the ResourceID content of a dataset. Exiting here.')
            sys.exit()
        # Merge every group of dataset-level EPN-TAP parameters.
        dataset = get_region_info(numerical_data_node)
        dataset.update(get_instru_name_and_host_name(spase_dic, numerical_data_node))
        dataset.update(get_types(numerical_data_node))
        dataset.update(get_access_format(numerical_data_node))
        dataset.update(get_times_min_max(numerical_data_node))
        dataset.update(get_processing_lvl(numerical_data_node))

        # Looking for None parameters in each dataset
        for parameter, default_value in DEFAULT_DATASET_VALUES.items():
            if not dataset[parameter]:
                dataset[parameter] = default_value
                if dataset_key not in missing_parameters:
                    missing_parameters[dataset_key] = set()
                missing_parameters[dataset_key].add(parameter)
        datasets[dataset_key] = dataset

    # Second pass: build the granules and merge in their parent dataset data.
    granules_list = []
    for granule_node in spase_dic['Granule']:
        parent_id, granule = get_granule_and_parent(granule_node)
        dataset_key = parent_id.split('/')[-1]

        # NOTE(review): 'nb_files' is a module-level global assigned only in
        # the __main__ section — confirm it is set before calling this
        # function from anywhere else, and that a file-based ratio is the
        # intended progress metric here (nb_elements is used just above).
        print('Granule {:<23.23} {:<18.18} [{:<50.50}]'.format(
            '%d/%d (%.2f%%)' % (n_dataset + 1, nb_elements, 100 * float(n_dataset + 1) / nb_elements),
            dataset_key,
            '.' * int((n_dataset + 1) / nb_files * 50)), end='\r')

        # Looking for None parameters in each granule
        for parameter, default_value in DEFAULT_GRANULE_VALUES.items():
            if not granule[parameter]:
                granule[parameter] = default_value
                if dataset_key not in missing_parameters:
                    missing_parameters[dataset_key] = set()
                missing_parameters[dataset_key].add(parameter)

        try:
            # Dataset-level values override/complete the granule's own.
            granule.update(datasets[dataset_key])
        except KeyError:
            # Orphan granule: keep it, but report the unresolved parent.
            print('The parent id "%s" of the granule "%s" is not found in the dataset dictionary.'
                  % (parent_id, granule['access_url']))
        granules_list.append(granule)
        n_dataset += 1
    print()
    # Report every dataset for which default values had to be substituted.
    for bad_dataset, missings in missing_parameters.items():
        log('%s\tmissing %s' % (bad_dataset, ', '.join(missings)))
    return granules_list
  542 +
  543 +
def write_sql(granules_list):
    """Write a SQL script which insert all the granules in the database.

    Each granule dictionary becomes one INSERT row: ``None`` values become
    SQL NULL, strings are single-quoted, and any other value is written
    verbatim via ``str()``.
    """

    def sql_literal(param):
        # Render one Python value as a SQL literal.
        if param is None:
            return 'NULL'
        if isinstance(param, str):
            # Double embedded single quotes (standard SQL escaping) so a
            # value like "d'Alembert" cannot break the generated statement.
            return "'%s'" % param.replace("'", "''")
        return str(param)

    with open(SQL_FILE_PATH, 'w') as sql_file:
        sql_file.write(SQL_HEADER)
        for gr in granules_list:
            keys = ', '.join(gr.keys())
            values = ', '.join(sql_literal(param) for param in gr.values())
            sql_file.write(SQL_ROW % (keys, values))
        sql_file.write(SQL_FOOTER)
  555 +
  556 +
if __name__ == '__main__':
    # Optional log file receiving the missing-parameter report (see log()).
    log_file = open(LOG_FILE_PATH, 'w+') if LOG_FILE_PATH else None

    print('Getting number of files in %s...' % SPASE_DIR)
    nb_files = get_nb_files()

    print('Parsing %d files...' % nb_files)
    spase = get_spase()

    # Iterating a dict yields its keys directly; no need to unpack items().
    print('Done. Found these types of data: %s.' % ', '.join(spase))

    print('Loading numerical data...')
    granules = get_parameters(spase)

    print('Creating SQL script...')
    write_sql(granules)

    # Flush and release the log file now that all logging is done.
    if log_file:
        log_file.close()

    import subprocess

    # Desktop notification (Linux/notify-send); fire-and-forget.
    subprocess.Popen(['notify-send', 'The SQL script %s has been generated.' % SQL_FILE_PATH])
... ...