From 016e94652d58de4be6d29c91a8d35d7c25d4e747 Mon Sep 17 00:00:00 2001
From: Nathanael Jourdane
Date: Mon, 27 Feb 2017 17:47:13 +0100
Subject: [PATCH] Add granules builder

---
 create_granules.py | 450 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 nc_parser.py       | 194 ++++++++++++++++++++++++++++
 2 files changed, 644 insertions(+), 0 deletions(-)
 create mode 100755 create_granules.py
 create mode 100755 nc_parser.py

diff --git a/create_granules.py b/create_granules.py
new file mode 100755
index 0000000..7d7f8a9
--- /dev/null
+++ b/create_granules.py
@@ -0,0 +1,450 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+# Interpreter: Python 3.6 with Anaconda. Please set up and prepare the conda environment first:
+# set PATH $HOME/.anaconda2/bin/ $PATH; and source $HOME/.anaconda2/etc/fish/conf.d/conda.fish
+# set PATH $HOME/.anaconda3/bin/ $PATH; and source $HOME/.anaconda3/etc/fish/conf.d/conda.fish
+# Add these lines to your init.fish (adapt them for Bash terminals), so you can choose which conda version to use:
+# conda3                        # Using conda3
+# conda create --name granules  # 1st time only
+# activate granules             # or `conda activate granules` in Bash terminals
+# conda install netCDF4         # 1st time only
+
+"""This script downloads all files from a ``SPASE`` registry, logs and corrects any errors,
+and adds several files and pieces of information, such as the estimated granule sizes."""
+
+import os.path as op
+from os import makedirs
+import xml.etree.ElementTree as ElTr
+import re
+import shutil
+import json
+import sys
+from tempfile import gettempdir
+from datetime import datetime
+from urllib.request import urlretrieve
+from urllib.error import HTTPError
+from time import time, strftime, gmtime
+from typing import Tuple, List, Dict
+from nc_parser import GranuleIndexReader, GranuleIndex
+
+# URLs
+GET_INDEXES_WEBSERVICE = 'http://amda-dev.irap.omp.eu/BASE/DDService/getGranulesIndex.php'
+GET_ESTSIZE_WEBSERVICE = 'http://amda-dev.irap.omp.eu/BASE/DDService/getGranulesSize.php'
+RESOLVER_URL = 'http://apus.irap.omp.eu:8080/amda-registry/resolver'
+XMLNS = 'http://www.spase-group.org/data/schema'
+TARGET_URL_PREFIX = 'http://amda-dev.irap.omp.eu/BASE/DDService/get_cdf.php?id='
+# Used if you want to apply a filter to the downloaded files.
+SPASE_PREFIX = 'spase://CDPP/'
+# SPASE_PREFIX = 'spase://CDPP/NumericalData/AMDA/THEMIS/A/'
+
+NUMDATA_KEYWORDS = ['/NumericalData/', '/NumericalOutput/']
+GRANULE_KEYWORD = '/Granules/'
+
+# local paths
+BASE_DIR = op.dirname(op.dirname(op.abspath(__file__)))
+SPASE_DIR = op.join(BASE_DIR, 'DATA')  # /!\ Double-check this: this directory will be recursively deleted.
+LOG_FILE_PATH = op.join(BASE_DIR, 'create_granules.log')
+BLACKLIST_PATH = op.join(BASE_DIR, 'blacklist')
+
+LOG_FILE = open(LOG_FILE_PATH, 'w+')  # Please set to None if you want to log to stdout instead of a file.
+
+# dates format
+SPASE_DATE_FORMAT = '%Y%j%H%M%S'  # ex: 2016238000000*
+XML_DATE_FORMAT = '%Y-%m-%dT%H:%M:%SZ'  # ex: 2016-08-26T00:00:00Z
+
+GRANULE_TEMPLATE = '''
+<Spase xmlns="http://www.spase-group.org/data/schema">
+    <Version>2.2.6</Version>
+    <Granule>
+        <ResourceID>%s</ResourceID>
+        <ReleaseDate>%s</ReleaseDate>
+        <ParentID>%s</ParentID>
+        <StartDate>%s</StartDate>
+        <StopDate>%s</StopDate>
+        <Source>
+            <SourceType>Data</SourceType>
+            <URL>%s</URL>
+            <DataExtent>
+                <Quantity>%s</Quantity>
+            </DataExtent>
+        </Source>
+    </Granule>
+</Spase>
+'''
+
+
+def log(error: str, location: str, problem: str, what_is_done: str) -> None:
+    """Log a warning in a log file or on stdout.
+
+- ``error``: The error code, ex: ``BAD_BYTES``.
+- ``location``: The granule name, dataset name, or any location information related to the error.
+- ``problem``: A phrase describing the problem.
+- ``what_is_done``: A phrase describing how the error has been corrected.
+"""
+
+    message = '%s\ton %s.\t%s\t%s\n' % (error, location, problem, what_is_done)
+    if LOG_FILE is not None:
+        LOG_FILE.write(message)
+    else:
+        print(message)
+
+
+def get_datasets_ids(datasets_ids: List[str] = None, spase_id: str = None) -> List[str]:
+    """Recursively get all dataset ids (``NumericalData``, ``Instrument``, ``Person``, etc.),
+using the AMDA registry resolver.
+
+- no arguments are required (``datasets_ids`` and ``spase_id`` are only used for the recursion);
+- ``return``: A list containing all the dataset spase ids.
+"""
+
+    datasets_ids = [] if datasets_ids is None else datasets_ids
+    id_param = '' if spase_id is None else 'id=%s&' % spase_id
+    with open(urlretrieve('%s?%st=yes' % (RESOLVER_URL, id_param))[0]) as http_content:
+        for node in ElTr.fromstring(http_content.read()):
+            node_id = node.attrib.get('id')
+            if node.tag == 'node':
+                print('Found dataset {:<50.50}'.format(node_id), end='\r')
+                get_datasets_ids(datasets_ids, node_id)
+            elif node.tag == 'leaf':
+                print('Found leaf {:<50.50}'.format(node_id), end='\r')
+                datasets_ids.append(node_id)
+    if spase_id is None:
+        return datasets_ids
+
+
+def download_dataset_files(datasets_spase_raw_ids: List[str], black_list: Tuple[str]) -> Dict[str, str]:
+    """Download all the spase dataset files, according to the spase id list, and store them
+recursively in appropriate folders.
+
+- ``datasets_spase_raw_ids``: The list of all datasets, as returned by get_datasets_ids();
+- ``black_list``: A tuple of spase id prefixes to ignore;
+- ``return``: a dictionary with:
+
+  - **key** = dataset spase id;
+  - **value** = dataset local path.
+"""
+
+    nb_datasets = len(datasets_spase_raw_ids)
+    if nb_datasets == 0:
+        print('There is no dataset to parse... :/')
+        sys.exit()
+
+    datasets_path = {}
+    for n_dataset, dataset_raw_id in enumerate(datasets_spase_raw_ids):
+        if dataset_raw_id.startswith(black_list):
+            continue
+
+        dataset_path = op.abspath(op.join(*([SPASE_DIR] + dataset_raw_id[8:].split('/'))) + '.xml')
+        if not op.isdir(op.dirname(dataset_path)):
+            makedirs(op.dirname(dataset_path))
+        dataset_raw_id = dataset_raw_id.strip().replace(' ', '+')
+
+        try:
+            urlretrieve('%s?id=%s' % (RESOLVER_URL, dataset_raw_id), filename=dataset_path)
+        except HTTPError as err:
+            log('INDEX_RESOLVER_INACCESSIBLE',
+                'dataset %s' % dataset_path,
+                'Can not connect to URL %s, because %s' % ('%s?id=%s' % (RESOLVER_URL, dataset_raw_id), err),
+                'Ignoring this dataset.')
+            continue
+
+        try:
+            resource_node = ElTr.parse(dataset_path).getroot().find(".//{%s}ResourceID" % XMLNS)
+            new_dataset_id = getattr(resource_node, 'text', dataset_raw_id)
+        except ElTr.ParseError:
+            log('RESOURCE_ID_NOT_FOUND',
+                'dataset %s' % dataset_path,
+                'Can not find ResourceID in the dataset.',
+                'Ignoring this dataset.')
+            continue
+        datasets_path[new_dataset_id.split('/')[-1]] = dataset_path
+
+        print('{:<50.50} [{:<50.50}] {:<11.11}'.format('Downloaded ' + new_dataset_id.split('/')[-1],
+                                                      '.' * int((n_dataset + 1) / nb_datasets * 50),
+                                                      '%d/%d' % (n_dataset + 1, nb_datasets)), end='\r')
+    print()
+    return datasets_path
+
+
+def get_granules_indexes_url() -> Tuple[str, Dict[str, str]]:
+    """Get the granules indexes URL.
+
+- ``return``: A tuple containing:
+
+  - the URL prefix (i.e. *http://manunja.irap.omp.eu/BASE/DATA/*);
+  - a dictionary with:
+
+    - **key**: the dataset id (i.e. *ros-magib-rsmp*);
+    - **value**: the granule URL suffix (i.e. *ROS/MAG.PSA/IB.RESAMPLED/mag_times.nc*)."""
+
+    try:
+        with open(urlretrieve(GET_INDEXES_WEBSERVICE)[0]) as http_content:
+            ws_response = http_content.read().strip()
+    except HTTPError:
+        log('GET_INDEXES_WEBSERVICE_INACCESSIBLE',
+            'all datasets',
+            'Can not access the get_indexes webservice (%s).' % GET_INDEXES_WEBSERVICE,
+            'Filled all datasets with 1 granule containing default values, all granules URLs will be wrong!')
+        return '', {}
+
+    try:
+        gr_indexes = json.loads(ws_response)
+    except ValueError:
+        ws_res_path = op.join(gettempdir(), 'indexes_response')
+        with open(ws_res_path, 'w') as f_indexes:
+            f_indexes.write(ws_response)
+        log('INDEXES_NOT_JSON',
+            'all datasets',
+            'The get_indexes webservice (%s) did not return a JSON file. See %s.' % (GET_INDEXES_WEBSERVICE, ws_res_path),
+            'Filled all datasets with 1 granule containing default values, all granules URLs will be wrong!')
+        return '', {}
+
+    url_prefix = list(gr_indexes.keys())[0] if len(gr_indexes) > 0 else None
+    granules = gr_indexes.get(url_prefix, None)
+    if not url_prefix or not url_prefix.startswith('http://') or type(granules) is not dict or len(granules) <= 1:
+        indexes_path = op.join(gettempdir(), 'get_indexes.json')
+        with open(indexes_path, 'w') as f_indexes:
+            f_indexes.write(ws_response)
+        log('INCONSISTENT_INDEXES',
+            'all datasets',
+            'The get_indexes Json file is supposed to contain one root element, '
+            'containing a pair (dataset_url, granules dictionary). See %s.' % indexes_path,
+            'Filled all datasets with 1 granule containing default values, all granules URLs will be wrong!')
+        return '', {}
+
+    return url_prefix.replace('manunja', 'amda-dev'), {k: v for (k, v) in granules.items()}
+
+
+def get_grs_size_dic(dataset_spase_id: str) -> Dict[str, int]:
+    """Download the dictionary containing the granules sizes."""
+
+    url = '%s?id=%s' % (GET_ESTSIZE_WEBSERVICE, dataset_spase_id)
+    try:
+        with open(urlretrieve(url)[0]) as http_content:
+            ws_response = http_content.read().strip()
+            try:
+                gr_dic = json.loads(ws_response)
+                for dataset_prefix, granules_sizes in gr_dic.items():
+                    return granules_sizes  # There is only one item in the dictionary.
+            except ValueError:
+                log('GRANULES_SIZE_BAD_JSON',
+                    'dataset %s' % dataset_spase_id,
+                    'When querying the granules size, can not decode the json string (`%s`...).'
+                    % ws_response[:30],
+                    'Set the granules size to 0.')
+                return {}
+    except HTTPError:
+        log('GRANULES_SIZE_SERVICE_INACCESSIBLE',
+            'dataset %s' % dataset_spase_id,
+            'Can not access the webservice at %s when querying the granules size.' % url,
+            'Set the granules size to 0.')
+        return {}
+
+
+def get_gr_size(granules_size: Dict[str, int], granule_name: str) -> int:
+    """Get the granule size, by looking for the granule id in the dictionary."""
+
+    if not granules_size:
+        log('NO_GRANULES_SIZE',
+            'granule %s' % granule_name,
+            'There is no granules size dictionary.',
+            'Set granule estimation size to 0.')
+        return 0
+    try:
+        return int(granules_size[granule_name])
+    except KeyError:
+        log('GRANULES_KEY_ERROR',
+            'granule %s' % granule_name,
+            'Can not access the item %s in the dictionary.' % granule_name,
+            'Set granule estimation size to 0.')
+        return 0
+    except ValueError:
+        log('GRANULE_SIZE_NOT_INTEGER',
+            'granule %s' % granule_name,
+            'When retrieving the granule estsize, can not convert `%s` to an integer.' % granules_size[granule_name],
+            'Set granule estimation size to 0.')
+        return 0
+    except TypeError:
+        log('GRANULES_SIZE_NOT_DIC',
+            'granule %s' % granule_name,
+            'The returned json is not a dictionary: `%s...`.' % str(granules_size)[:30],
+            'Set granule estimation size to 0.')
+        return 0
+
+
+def write_granules(dataset_spase_id: str, granules_dir: str, release_date: str, gr_dir_url_prefix: str,
+                   gr_idx_list: List[GranuleIndex], dataset_info: str) -> int:
+    """Write the granule files.
+
+- ``dataset_spase_id``: the spase id of the dataset for which we want to write the granules;
+- ``granules_dir``: the local directory where the granules must be written;
+- ``release_date``: The release date of the granules (i.e. now);
+- ``gr_dir_url_prefix``: the URL prefix of the directory containing the granule files;
+- ``gr_idx_list``: a list of all GranuleIndex of this dataset;
+- ``dataset_info``: Some information about the dataset, printed on the standard output;
+- ``return``: The number of created files."""
+
+    gr_sizes = get_grs_size_dic(dataset_spase_id)
+    if not gr_sizes:
+        return 0
+
+    log_size = LOG_FILE.tell()
+    gr_nb = 1
+    start_time = time()
+    info = ''
+    for n, granule in enumerate(gr_idx_list):
+        granule_name = op.splitext(granule.filename)[0]
+        granule_id = dataset_spase_id + '-%05d' % n
+        info = '{:<50.50} [{:<50.50}] {:<12.12}'.format(dataset_info, '.' * int(gr_nb / len(gr_idx_list) * 50),
+                                                        '%d/%d' % (gr_nb, len(gr_idx_list)))
+        print(info, end='\r')
+
+        access_url = TARGET_URL_PREFIX + gr_dir_url_prefix + '/' + granule_name  # CDF file
+        # access_url = gr_dir_url_prefix + '/' + granule.filename + '.gz'  # NetCDF file
+
+        granule_xml = GRANULE_TEMPLATE % (granule_id, release_date, dataset_spase_id, granule.start_date,
+                                          granule.stop_date, access_url, get_gr_size(gr_sizes, granule_name))
+        gr_nb += 1
+
+        with open(op.join(granules_dir, granule_id + '.xml'), 'w+') as granule_file:
+            granule_file.write(granule_xml)
+
+    str_time = strftime('elapsed: %Hh%Mm%S', gmtime(time() - start_time))
+    warning = ' see log file' if log_size != LOG_FILE.tell() else ''
+    print(info + str_time + warning)
+    return gr_nb
+
+
+def check_num_data(paths: Dict[str, str]) -> None:
+    """Check the *NumericalData* files, particularly the dataproduct type and the XML duration format."""
+
+    regex_xml_duration = re.compile(r'(?P<sign>-?)P(?:(?P<years>\d+)Y)?(?:(?P<months>\d+)M)?(?:(?P<days>\d+)D)?'
+                                    r'(?:T(?:(?P<hours>\d+)H)?(?:(?P<minutes>\d+)M)?(?:(?P<seconds>\d+)S)?)?')
+
+    for _, dataset_local_path in paths.items():
+        tree = ElTr.parse(dataset_local_path)
+
+        if tree.getroot().tag == 'Message':
+            log('NUM-DATA_XML_MESSAGE',
+                'On NumericalData file %s' % dataset_local_path,
+                'The XML file contains this message: ' + tree.getroot().text,
+                'Ignoring this file.')
+            continue
+
+        numdata_node = tree.getroot().find('{%s}NumericalData' % XMLNS)
+        numdata_node = tree.getroot().find('{%s}NumericalOutput' % XMLNS) if numdata_node is None else numdata_node
+
+        temporal_description_node = numdata_node.find('{%s}TemporalDescription' % XMLNS)
+
+        dataproduct_types = set()
+        for param in numdata_node.findall('{%s}Parameter' % XMLNS):
+            hints = param.findall('{%s}RenderingHints' % XMLNS)
+            dt_nodes = [hint.find('{%s}DisplayType' % XMLNS) for hint in hints]
+            for display in [display.text for display in dt_nodes if display is not None and display.text is not None]:
+                dataproduct_types.add(display)
+        if not dataproduct_types:
+            log('NO_DATAPRODUCT_TYPE',
+                'On NumericalData file %s' % dataset_local_path,
+                'There is no dataproduct type.',
+                'Set the dataproduct type to "TimeSeries".')
+            # ts is added in build_BDD.py
+
+        if temporal_description_node is not None:
+            for duration_key in ('Cadence_Min', 'Cadence_Max', 'Exposure'):
+                duration_node = temporal_description_node.find('{%s}%s' % (XMLNS, duration_key))
+                xml_duration = getattr(duration_node, 'text', 'P0D')
+                try:
+                    regex_xml_duration.match(xml_duration.upper()).groupdict(0)
+                except AttributeError:
+                    log('NUM-DATA_BAD_DATE',
+                        'On NumericalData file %s' % dataset_local_path,
+                        'Can not decode duration: %s.' % xml_duration,
+                        'Set the duration to 0.')
+                    duration_node.text = 'P0D'
+                    tree.write(dataset_local_path)
+
+
+def write_all_granules() -> None:
+    """Create the granules."""
+
+    black_list = tuple()
+    try:
+        with open(BLACKLIST_PATH) as f:
+            black_list += tuple(line.strip() for line in f.readlines() if line.strip() and not line.startswith('#'))
+    except IOError:
+        pass
+    print('ignored datasets: %s' % ', '.join(black_list))
+
+    print('Getting datasets spase ids...')
+    all_spase_id = get_datasets_ids()
+
+    print('Downloading dataset files into %s...' % SPASE_DIR)
+    datasets_spase_id = [num_data for num_data in all_spase_id if num_data.startswith(SPASE_PREFIX)]
+
+    spase_files_path = download_dataset_files(datasets_spase_id, black_list)
+    # We don't want to write granules from files which are not NumData
+    paths = {d_id: path for (d_id, path) in spase_files_path.items()
+             if any(keyword in path for keyword in NUMDATA_KEYWORDS)}
+
+    print('Checking numerical data files...')
+    check_num_data(paths)
+
+    print('Getting granules index file paths...')
+    url_prefix, grs_idx_url = get_granules_indexes_url()
+    reader = GranuleIndexReader(log)
+
+    n_datasets = 0
+    n_gr = 0
+
+    for idx_dataset_id in grs_idx_url:
+        if idx_dataset_id not in paths:
+            log('DATASET_INDEX_NOT_LINKED',
+                'dataset %s' % idx_dataset_id,
+                'This dataset is found in the granules indexes json file (returned by %s), '
+                'but not in the resolver (%s).' % (GET_INDEXES_WEBSERVICE, RESOLVER_URL),
+                'Ignored this dataset.')
+
+    print('Creating granules...')
+    start_time = time()
+
+    for dataset_spase_id, dataset_local_path in paths.items():
+        nc_file_path = grs_idx_url.get(dataset_spase_id, '')
+        if not nc_file_path:
+            log('DATASET_NOT_IN_IDX_DIC',
+                'dataset %s' % dataset_spase_id,
+                'This dataset is not found in the granules indexes json file returned by %s.'
+                % GET_INDEXES_WEBSERVICE,
+                'Set default time values for all granules of this dataset.')
+        grs_idx_list = reader.get_granules_index(dataset_spase_id, url_prefix + nc_file_path)
+
+        for keyword in NUMDATA_KEYWORDS:
+            dataset_local_path = dataset_local_path.replace(keyword, GRANULE_KEYWORD)
+        grs_local_dir = op.dirname(dataset_local_path)
+        if not op.exists(grs_local_dir):
+            makedirs(grs_local_dir)
+
+        release_date = datetime.now().strftime(XML_DATE_FORMAT)
+        dataset_info = '%s dataset %d/%d (%.2f%%) %s' % \
+                       (strftime('%H:%M'), n_datasets + 1, len(paths),
+                        (n_datasets / len(paths) * 100), dataset_spase_id)
+        gr_dir_url_suffix = '' if not nc_file_path else '/'.join(nc_file_path.split('/')[:-1])
+        try:
+            n_gr += write_granules(dataset_spase_id, grs_local_dir, release_date, gr_dir_url_suffix, grs_idx_list,
+                                   dataset_info)
+        except Exception as error:
+            print('A problem occurred when creating a granule from dataset %s:' % dataset_spase_id)
+            LOG_FILE.close()
+            raise error
+        n_datasets += 1
+
+    elapsed = strftime('%Hh%Mm%S', gmtime(time() - start_time))
+    print('100%%, %d files created in %s.' % (n_gr, elapsed))
+
+
+if __name__ == '__main__':
+    if not op.exists(BASE_DIR):
+        makedirs(BASE_DIR)
+
+    if op.isdir(SPASE_DIR):
+        print('Clearing SPASE directory (%s)...' % SPASE_DIR)
+        shutil.rmtree(SPASE_DIR)
+
+    write_all_granules()
+
+    LOG_FILE.close()
diff --git a/nc_parser.py b/nc_parser.py
new file mode 100755
index 0000000..df86a47
--- /dev/null
+++ b/nc_parser.py
@@ -0,0 +1,194 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""This module parses netCDF granule index files."""
+
+import re
+import os
+import os.path as op
+from datetime import datetime
+from mimetypes import MimeTypes
+from netCDF4 import Dataset
+import pathlib
+from collections import namedtuple
+from typing import List, Optional
+from tempfile import gettempdir
+from urllib.request import urlretrieve
+from urllib.error import HTTPError
+
+# dates format
+SPASE_DATE_FORMAT = '%Y%j%H%M%S'  # ex: 2016238000000*
+XML_DATE_FORMAT = '%Y-%m-%dT%H:%M:%SZ'  # ex: 2016-08-26T00:00:00Z
+SPASE_INDEX_TEMP_PATH = op.join(gettempdir(), 'index.nc')
+
+GranuleIndex = namedtuple('GranuleIndex', 'start_date stop_date filename')
+
+
+class GranuleIndexReader:
+
+    def __init__(self, log_fct):
+        self.log_fct = log_fct
+
+    def load_dataset(self, target_name: str, granule_index_url: str) -> Optional[Dataset]:
+        """Download the granules index file from ``granule_index_url`` and load it as a netCDF4 ``Dataset``."""
+        if op.isfile(SPASE_INDEX_TEMP_PATH):
+            os.remove(SPASE_INDEX_TEMP_PATH)
+
+        try:
+            urlretrieve(granule_index_url, SPASE_INDEX_TEMP_PATH)
+        except HTTPError:
+            self.log_fct('INDEX_INACCESSIBLE',
+                         'dataset %s' % target_name,
+                         'Can not access %s.' % granule_index_url,
+                         'Filled this dataset with 1 granule containing default values, granules URLs will be wrong!')
+            return
+
+        if not op.isfile(SPASE_INDEX_TEMP_PATH):
+            self.log_fct('INDEX_FILE_NOT_FOUND',
+                         'dataset %s' % target_name,
+                         'The granules index file has not been correctly downloaded.',
+                         'Filled this dataset with 1 granule containing default values, granules URLs will be wrong!')
+            return
+
+        mime_type = MimeTypes().guess_type(pathlib.Path(op.abspath(SPASE_INDEX_TEMP_PATH)).as_uri())[0]
+        if mime_type != 'application/x-netcdf':
+            self.log_fct('INDEX_FILE_NOT_NET-CDF',
+                         'dataset %s' % target_name,
+                         'The mime-type of the granules index file is not application/x-netcdf but "%s". See %s.' %
+                         (mime_type, SPASE_INDEX_TEMP_PATH),
+                         'Filled this dataset with 1 granule containing default values, granules URLs will be wrong!')
+            return
+
+        try:
+            return Dataset(SPASE_INDEX_TEMP_PATH)
+        except Exception as e:
+            self.log_fct('CANT_LOAD_INDEX_FILE',
+                         'dataset %s' % target_name,
+                         'Can not load the granules index file with NetCDF4 (%s). '
+                         'See %s.' % (e.__cause__, SPASE_INDEX_TEMP_PATH),
+                         'Filled this dataset with 1 granule containing default values, granules URLs will be wrong!')
+
+    def get_granules_index(self, target_name: str, nc_file_path: str) -> List[GranuleIndex]:
+        if not nc_file_path:
+            return [GranuleIndex('0001-01-01T00:00:00Z', '0001-01-01T00:00:00Z', target_name + '_unknown.nc')]
+
+        dataset = self.load_dataset(target_name, nc_file_path)
+        if not dataset:
+            return [GranuleIndex('0001-01-01T00:00:00Z', '0001-01-01T00:00:00Z', target_name + '_unknown.nc')]
+
+        str_start_time = self.nc_ba_to_strings(target_name, 'StartTime', dataset.variables['StartTime'][:])
+        str_stop_time = self.nc_ba_to_strings(target_name, 'StopTime', dataset.variables['StopTime'][:])
+        file_names = self.nc_ba_to_strings(target_name, 'FileName', dataset.variables['FileName'][:])
+        xml_start_times = self.get_nc_times(target_name, str_start_time)
+        xml_stop_times = self.get_nc_times(target_name, str_stop_time)
+
+        rec_len = dataset.dimensions['record'].size
+        granules_index = [GranuleIndex(xml_start_times[i], xml_stop_times[i], file_names[i]) for i in range(rec_len)]
+        dataset.close()
+
+        return granules_index
+
+    def nc_ba_to_strings(self, target_name: str, col_name: str, byte_arrays: List):
+        """Convert net-cdf byte arrays to strings.
+        If ``UnicodeDecodeError`` is raised, converts only the bytes before the first bad byte.
+
+        - ``byte_arrays``: A list of net-cdf bytes arrays;
+        - ``return``: The string representations of the bytes arrays."""
+
+        strings = []
+        for i, bytes_array in enumerate(byte_arrays):
+            txt = []
+            string_ended = False
+            for j, byte in enumerate(bytes_array):
+                if byte:
+                    if string_ended:
+                        hex_array = ', '.join([str(byte) for byte in bytes_array])
+                        self.log_fct('INVISIBLE_BYTES',
+                                     'granules index "%s" on column %s and row %d' % (target_name, col_name, i),
+                                     'The bytes array contains the byte b\'\' (at index %d), ' % j +
+                                     'followed by other characters: [%s]. ' % hex_array,
+                                     'Removed all characters after the first occurrence of b\'\' in the array.')
+                        break
+                    try:
+                        txt.append(byte.decode('utf-8'))
+                    except UnicodeDecodeError:
+                        hex_array = ', '.join([str(byte) for byte in bytes_array])
+                        self.log_fct('BAD_BYTES',
+                                     'granules index "%s" on column %s and row %d' % (target_name, col_name, i),
+                                     'Can not decode byte %s at index %d in the bytes array: [%s].'
+                                     % (str(byte), j, hex_array),
+                                     'Kept only the characters before the bad byte.')
+                        break
+                else:
+                    string_ended = True
+            strings.append(''.join(txt))
+        return strings
+
+    def get_nc_times(self, target_name: str, nc_times: List[str]):
+        """Converts an array of *SPASE dates* to an array of *XML dates*.
+
+        - ``nc_times``: An array of strings, containing the dates in their net-cdf format.
+        - ``target_name``: The name of the granules index, only used to print it in log_fct.
+        - ``return``: An array of strings, containing the dates in their XML format."""
+
+        contains_no_digit_chars = re.compile(r'.*\D.*')
+        dates = []
+        for nc_time in nc_times:
+            if contains_no_digit_chars.match(nc_time):
+                self.log_fct('DATE_NO_NUM',
+                             'granules index "%s"' % target_name,
+                             'The date "%s" contains non numerical characters.' % nc_time,
+                             'Removed the non numerical characters.')
+                nc_time = re.sub(r'\D', '', nc_time)
+            if len(nc_time) > 16:
+                self.log_fct('DATE_TOO_LONG',
+                             'granules index "%s"' % target_name,
+                             'The length of the date "%s" is more than 16 chars.' % nc_time,
+                             'Truncated it to 16 chars.')
+                nc_time = nc_time[:16]
+            if len(nc_time) < 16:
+                self.log_fct('DATE_TOO_SHORT',
+                             'granules index "%s"' % target_name,
+                             'The length of the date "%s" is less than 16 chars.' % nc_time,
+                             'Padded it with 0 up to 16 chars.')
+                nc_time = nc_time.ljust(16, '0')
+
+            year, days = int(nc_time[:4]), int(nc_time[4:7]) + 1
+            hour, minute, sec = int(nc_time[7:9]), int(nc_time[9:11]), int(nc_time[11:13])
+
+            if year == 0:
+                self.log_fct('WRONG_YEAR',
+                             'granules index "%s"' % target_name,
+                             'The year of the date "%s" is 0.' % nc_time,
+                             'Replaced by 1.')
+                year = 1
+            # check leap years:
+            max_days = 366 if (year % 4 == 0 and not (year % 100 == 0 and year % 400 != 0)) else 365
+            if days > max_days:
+                self.log_fct('WRONG_DAY',
+                             'granules index "%s"' % target_name,
+                             'The day of the year in the date "%s" is > %d.' % (nc_time, max_days),
+                             'Replaced by %d.' % max_days)
+                days = max_days
+            if hour > 23:
+                self.log_fct('WRONG_HOUR',
+                             'granules index "%s"' % target_name,
+                             'The hour of the time "%s" is > 23.' % nc_time,
+                             'Replaced by 23.')
+                hour = 23
+            if minute > 59:
+                self.log_fct('WRONG_MIN',
+                             'granules index "%s"' % target_name,
+                             'The minute of the time "%s" is > 59.' % nc_time,
+                             'Replaced by 59.')
+                minute = 59
+            if sec > 59:
+                self.log_fct('WRONG_SEC',
+                             'granules index "%s"' % target_name,
+                             'The second of the time "%s" is > 59.' % nc_time,
+                             'Replaced by 59.')
+                sec = 59
+
+            str_date = '%04d%03d%02d%02d%02d' % (year, days, hour, minute, sec)
+            dates.append(datetime.strptime(str_date, SPASE_DATE_FORMAT).strftime(XML_DATE_FORMAT))
+        return dates
--
libgit2 0.21.2
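
Note for reviewers: the snippet below is a minimal, hypothetical sketch of how GranuleIndexReader from nc_parser.py can be exercised on its own with a stdout logger (same log signature as create_granules.log()). The dataset id and granules index URL are illustrative placeholders taken from the get_granules_indexes_url() docstring, not guaranteed live endpoints; it assumes the netCDF4 package is installed.

    from nc_parser import GranuleIndexReader

    def log_to_stdout(error, location, problem, what_is_done):
        # Same signature as create_granules.log(), but prints instead of writing to a log file.
        print('%s\ton %s.\t%s\t%s' % (error, location, problem, what_is_done))

    reader = GranuleIndexReader(log_to_stdout)
    # Placeholder dataset id and index URL, for illustration only.
    granules = reader.get_granules_index(
        'ros-magib-rsmp',
        'http://amda-dev.irap.omp.eu/BASE/DATA/ROS/MAG.PSA/IB.RESAMPLED/mag_times.nc')
    for granule in granules:
        print(granule.start_date, granule.stop_date, granule.filename)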