From 016e94652d58de4be6d29c91a8d35d7c25d4e747 Mon Sep 17 00:00:00 2001
From: Nathanael Jourdane <nathanael.jourdane@irap.omp.eu>
Date: Mon, 27 Feb 2017 17:47:13 +0100
Subject: [PATCH] Add granules builder

---
 create_granules.py | 450 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 nc_parser.py       | 194 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 644 insertions(+), 0 deletions(-)
 create mode 100755 create_granules.py
 create mode 100755 nc_parser.py

diff --git a/create_granules.py b/create_granules.py
new file mode 100755
index 0000000..7d7f8a9
--- /dev/null
+++ b/create_granules.py
@@ -0,0 +1,450 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+# interpreter: Python 3.6 with anaconda. Please set and prepare the conda environment.
+# set PATH $HOME/.anaconda2/bin/ $PATH; and source $HOME/.anaconda2/etc/fish/conf.d/conda.fish
+# set PATH $HOME/.anaconda3/bin/ $PATH; and source $HOME/.anaconda3/etc/fish/conf.d/conda.fish
+# Add these lines to your init.fish (adapt them for Bash shells), so you can choose which conda version to use:
+# conda3 # Using conda3
+# conda create --name granules # 1st time only
+# activate granules # or `conda activate granules` in Bash shells
+# conda install netCDF4 # 1st time only
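+# Once the environment is active, the script can typically be run with `python create_granules.py`
+# (or `./create_granules.py`, since the file is executable).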
+
+"""This script download all files from a ``SPASE`` registry, then log and correct eventual errors
+and add several files and information, such as granules estimation size."""
+
+import os.path as op
+from os import makedirs
+import xml.etree.ElementTree as ElTr
+import re
+import shutil
+import json
+import sys
+from tempfile import gettempdir
+from datetime import datetime
+from urllib.request import urlretrieve
+from urllib.error import HTTPError
+from time import time, strftime, gmtime
+from typing import Tuple, List, Dict
+from nc_parser import GranuleIndexReader, GranuleIndex
+
+# URLs
+GET_INDEXES_WEBSERVICE = 'http://amda-dev.irap.omp.eu/BASE/DDService/getGranulesIndex.php'
+GET_ESTSIZE_WEBSERVICE = 'http://amda-dev.irap.omp.eu/BASE/DDService/getGranulesSize.php'
+RESOLVER_URL = 'http://apus.irap.omp.eu:8080/amda-registry/resolver'
+XMLNS = 'http://www.spase-group.org/data/schema'
+TARGET_URL_PREFIX = 'http://amda-dev.irap.omp.eu/BASE/DDService/get_cdf.php?id='
+# Used if you want to apply a filter to the downloaded files.
+SPASE_PREFIX = 'spase://CDPP/'
+# SPASE_PREFIX = 'spase://CDPP/NumericalData/AMDA/THEMIS/A/'
+
+NUMDATA_KEYWORDS = ['/NumericalData/', '/NumericalOutput/']
+GRANULE_KEYWORD = '/Granules/'
+
+# local paths
+BASE_DIR = op.dirname(op.dirname(op.abspath(__file__)))
+SPASE_DIR = op.join(BASE_DIR, 'DATA')  # /!\ Double-check this: this directory will be recursively deleted.
+LOG_FILE_PATH = op.join(BASE_DIR, 'create_granules.log')
+BLACKLIST_PATH = op.join(BASE_DIR, 'blacklist')
+
+LOG_FILE = open(LOG_FILE_PATH, 'w+')  # Set it to None to log to stdout instead of a file.
+
+# dates format
+SPASE_DATE_FORMAT = '%Y%j%H%M%S'  # ex: 2016238000000*
+XML_DATE_FORMAT = '%Y-%m-%dT%H:%M:%SZ'  # ex: <StartDate>2016-08-26T00:00:00Z</StartDate>
+
+GRANULE_TEMPLATE = '''<?xml version="1.0" encoding="UTF-8"?>
+<Spase xmlns="http://www.spase-group.org/data/schema"
+       xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+       xsi:schemaLocation="http://www.spase-group.org/data/schema
+       http://cdpp1.cesr.fr/AMDA-NG/public/schemas/spase-amda-1_2_0.xsd">
+    <Version>2.2.6</Version>
+    <Granule>
+        <ResourceID>%s</ResourceID>
+        <ReleaseDate>%s</ReleaseDate>
+        <ParentID>%s</ParentID>
+        <StartDate>%s</StartDate>
+        <StopDate>%s</StopDate>
+        <Source>
+            <SourceType>Data</SourceType>
+            <URL>%s</URL>
+            <DataExtent>
+                <Quantity>%s</Quantity>
+            </DataExtent>
+        </Source>
+    </Granule>
+</Spase>'''
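+# The template placeholders above are filled, in this order, by write_granules() below:
+# ResourceID, ReleaseDate, ParentID, StartDate, StopDate, URL and Quantity (the estimated granule size).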
+
+
+def log(error: str, location: str, problem: str, what_is_done: str) -> None:
+    """Log a warning in a log file or the stdout.
+
+- ``error``: The error code, ex: ``BAD_BYTES``.
+- ``location``: The granule name, or dataset name, or any location information related to the error.
+- ``problem``: A phrase describing the problem.
+- ``what_is_done``: A phrase describing how the error has been corrected.
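+
+Example call (hypothetical values)::
+
+    log('BAD_BYTES', 'granule ros-magib-rsmp-00042', 'Can not decode a byte.', 'Removed the bad byte.')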
+"""
+
+    message = '%s\ton %s.\t%s\t%s\n' % (error, location, problem, what_is_done)
+    if LOG_FILE is not None:
+        LOG_FILE.write(message)
+    else:
+        print(message)
+
+
+def get_datasets_ids(datasets_ids: List[str] = None, spase_id: str = None) -> List[str]:
+    """Recursively get all dataset ids (``NumericalData``, ``Instrument``, ``Person``, etc.),
+using the AMDA registry resolver.
+
+- no arguments required (``datasets_ids`` and ``spase_id`` are used for the recursion);
+- ``return``: A list containing all the dataset spase ids.
+"""
+
+    datasets_ids = [] if datasets_ids is None else datasets_ids
+    id_param = '' if spase_id is None else 'id=%s&' % spase_id
+    with open(urlretrieve('%s?%st=yes' % (RESOLVER_URL, id_param))[0]) as http_content:
+        for node in ElTr.fromstring(http_content.read()):
+            node_id = node.attrib.get('id')
+            if node.tag == 'node':
+                print('Found dataset {:<50.50}'.format(node_id), end='\r')
+                get_datasets_ids(datasets_ids, node_id)
+            elif node.tag == 'leaf':
+                print('Found leaf {:<50.50}'.format(node_id), end='\r')
+                datasets_ids.append(node_id)
+    if spase_id is None:
+        return datasets_ids
+
+
+def download_dataset_files(datasets_spase_raw_ids: List[str], black_list: Tuple[str]) -> Dict[str, str]:
+    """Download all the spase dataset files, according to the spase id list, and store them
+recursively to appropriated folders.
+
+- ``datasets_spase_raw_ids``: The list of all datasets, returned by get_datasets_ids();
+- ``return``: a dictionary with:
+
+    - **key** = dataset spase id ;
+    - **value** = dataset local path*.
+"""
+
+    nb_datasets = len(datasets_spase_raw_ids)
+    if nb_datasets == 0:
+        print('There is no dataset to parse... :/')
+        sys.exit()
+
+    datasets_path = {}
+    for n_dataset, dataset_raw_id in enumerate(datasets_spase_raw_ids):
+        if dataset_raw_id.startswith(black_list):
+            continue
+
+        dataset_path = op.abspath(op.join(*([SPASE_DIR] + dataset_raw_id[8:].split('/'))) + '.xml')
+        if not op.isdir(op.dirname(dataset_path)):
+            makedirs(op.dirname(dataset_path))
+        dataset_raw_id = dataset_raw_id.strip().replace(' ', '+')
+
+        try:
+            urlretrieve('%s?id=%s' % (RESOLVER_URL, dataset_raw_id), filename=dataset_path)
+        except HTTPError as err:
+            log('INDEX_RESOLVER_INACCESSIBLE',
+                'dataset %s' % dataset_path,
+                'Can not connect to URL %s, because %s' % ('%s?id=%s' % (RESOLVER_URL, dataset_raw_id), err),
+                'Ignoring this dataset.')
+            continue
+
+        try:
+            resource_node = ElTr.parse(dataset_path).getroot().find(".//{%s}ResourceID" % XMLNS)
+            new_dataset_id = getattr(resource_node, 'text', dataset_raw_id)
+        except ElTr.ParseError:
+            log('RESOURCE_ID_NOT_FOUND',
+                'dataset %s' % dataset_path,
+                'Can not find ResourceID in the dataset.',
+                'Ignoring this dataset.')
+            continue
+        datasets_path[new_dataset_id.split('/')[-1]] = dataset_path
+
+        print('{:<50.50} [{:<50.50}] {:<11.11}'.format('Downloaded ' + new_dataset_id.split('/')[-1],
+                                                       '.' * int((n_dataset + 1) / nb_datasets * 50),
+                                                       '%d/%d' % (n_dataset + 1, nb_datasets)), end='\r')
+    print()
+    return datasets_path
+
+
+def get_granules_indexes_url() -> Tuple[str, Dict[str, str]]:
+    """Get the granules indexes URL.
+
+- ``return``: A tuple containing:
+    - **the URL prefix** (e.g. *http://manunja.irap.omp.eu/BASE/DATA/*);
+    - a dictionary as:
+        - **key**: the dataset id (e.g. *ros-magib-rsmp*);
+        - **value**: the granule URL suffix (e.g. *ROS/MAG.PSA/IB.RESAMPLED/mag_times.nc*)."""
+
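+    # The webservice response is expected to be a single-key JSON object, e.g. (values from the docstring above):
+    # {"http://manunja.irap.omp.eu/BASE/DATA/": {"ros-magib-rsmp": "ROS/MAG.PSA/IB.RESAMPLED/mag_times.nc", ...}}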
+    try:
+        with open(urlretrieve(GET_INDEXES_WEBSERVICE)[0]) as http_content:
+            ws_response = http_content.read().strip()
+    except HTTPError:
+        log('GET_INDEXES_WEBSERVICE_INACCESSIBLE',
+            'all datasets',
+            'Can not access the get_indexes webservice (%s).' % GET_INDEXES_WEBSERVICE,
+            'Filled all datasets with 1 granule containing default values, all granules URLs will be wrong!')
+        return '', {}
+
+    try:
+        gr_indexes = json.loads(ws_response)
+    except ValueError:
+        ws_res_path = op.join(gettempdir(), 'indexes_response')
+        with open(ws_res_path, 'w') as f_indexes:
+            f_indexes.write(ws_response)
+        log('INDEXES_NOT_JSON',
+            'all datasets',
+            'get_indexes webservice (%s) did not return a JSON file. See %s.' % (GET_INDEXES_WEBSERVICE, ws_res_path),
+            'Filled all datasets with 1 granule containing default values, all granules URLs will be wrong!')
+        return '', {}
+
+    url_prefix = list(gr_indexes.keys())[0] if len(gr_indexes) > 0 else None
+    granules = gr_indexes.get(url_prefix, None)
+    if not url_prefix or not url_prefix.startswith('http://') or not isinstance(granules, dict) or len(granules) <= 1:
+        indexes_path = op.join(gettempdir(), 'get_indexes.json')
+        with open(indexes_path, 'w') as f_indexes:
+            f_indexes.write(ws_response)
+        log('INCONSISTENT_INDEXES',
+            'all datasets',
+            'The get_indexes Json file is supposed to contain one root element, '
+            'containing a pair (dataset_url, granules dictionary). See %s.' % indexes_path,
+            'Filled all datasets with 1 granule containing default values, all granules URLs will be wrong!')
+        return '', {}
+
+    return url_prefix.replace('manunja', 'amda-dev'), dict(granules)
+
+
+def get_grs_size_dic(dataset_spase_id: str) -> Dict[str, int]:
+    """Download the dictionary containing the granules sizes."""
+
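+    # Assumed response shape (illustrative, not verified here): {"<dataset_prefix>": {"<granule_name>": <size>, ...}};
+    # only the inner granule_name -> size dictionary is returned.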
+    url = '%s?id=%s' % (GET_ESTSIZE_WEBSERVICE, dataset_spase_id)
+    try:
+        with open(urlretrieve(url)[0]) as http_content:
+            ws_response = http_content.read().strip()
+    except HTTPError:
+        log('GRANULES_SIZE_SERVICE_INACCESSIBLE',
+            'dataset %s' % dataset_spase_id,
+            'Can not access the webservice at %s when querying the granules size.' % url,
+            'Set the granules size to 0.')
+        return {}
+
+    try:
+        gr_dic = json.loads(ws_response)
+    except ValueError:
+        log('GRANULES_SIZE_BAD_JSON',
+            'dataset %s' % dataset_spase_id,
+            'When querying the granules size, can not decode the json string (`%s`...).' % ws_response[:30],
+            'Set the granules size to 0.')
+        return {}
+
+    for dataset_prefix, granules_sizes in gr_dic.items():
+        return granules_sizes  # There is only one item in the dictionary.
+    return {}
+
+
+def get_gr_size(granules_size: Dict[str, int], granule_name: str) -> int:
+    """Get the granule size, by looking for the granule id in the dictionary."""
+
+    if not granules_size:
+        log('NO_GRANULES_SIZE',
+            'granule %s' % granule_name,
+            'There is no granules size dictionary.',
+            'Set granule estimation size to 0.')
+        return 0
+    try:
+        return int(granules_size[granule_name])
+    except KeyError:
+        log('GRANULES_KEY_ERROR',
+            'granule %s' % granule_name,
+            'Can not access the item %s in the dictionary.' % granule_name,
+            'Set granule estimation size to 0.')
+        return 0
+    except ValueError:
+        log('GRANULE_SIZE_NOT_INTEGER',
+            'granule %s' % granule_name,
+            'When retrieving the granule estsize, can not convert `%s` to an integer.' % granules_size[granule_name],
+            'Set granule estimation size to 0.')
+        return 0
+    except TypeError:
+        log('GRANULES_SIZE_NOT_DIC',
+            'granule %s' % granule_name,
+            'The returned json is not a dictionary: `%s...`.' % str(granules_size)[:30],
+            'Set granule estimation size to 0.')
+        return 0
+
+
+def write_granules(dataset_spase_id: str, granules_dir: str, release_date: str, gr_dir_url_prefix: str,
+                   gr_idx_list: List[GranuleIndex], dataset_info: str) -> int:
+    """Write the granule files.
+
+- ``dataset_spase_id``: the spase id of the dataset whose granules we want to write;
+- ``granules_dir``: the local directory where the granules must be written;
+- ``release_date``: The release date of the granules (i.e. now);
+- ``gr_dir_url_prefix``: the URL part added after TARGET_URL_PREFIX to build each granule access URL;
+- ``gr_idx_list``: a list of all GranuleIndex of this dataset;
+- ``dataset_info``: Some information about the dataset which will be printed in the standard output;
+- ``return``: The number of created files."""
+
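+    # Each granule is written to <granules_dir>/<dataset_spase_id>-NNNNN.xml, where NNNNN is a 5-digit,
+    # zero-based index (see granule_id below).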
+    gr_sizes = get_grs_size_dic(dataset_spase_id)
+    if not gr_sizes:
+        return 0
+
+    log_size = LOG_FILE.tell()
+    gr_nb = 1
+    start_time = time()
+    info = ''
+    for n, granule in enumerate(gr_idx_list):
+        granule_name = op.splitext(granule.filename)[0]
+        granule_id = dataset_spase_id + '-%05d' % n
+        info = '{:<50.50} [{:<50.50}] {:<12.12}'.format(dataset_info, '.' * int(gr_nb / len(gr_idx_list) * 50),
+                                                        '%d/%d' % (gr_nb, len(gr_idx_list)))
+        print(info, end='\r')
+
+        access_url = TARGET_URL_PREFIX + gr_dir_url_prefix + '/' + granule_name  # CDF file
+        # access_url = gr_dir_url_prefix + '/' + granule.filename + '.gz'  # NetCDF file
+
+        granule_xml = GRANULE_TEMPLATE % (granule_id, release_date, dataset_spase_id, granule.start_date,
+                                          granule.stop_date, access_url, get_gr_size(gr_sizes, granule_name))
+        gr_nb += 1
+
+        with open(op.join(granules_dir, granule_id + '.xml'), 'w+') as granule_file:
+            granule_file.write(granule_xml)
+
+    str_time = strftime('elapsed: %Hh%Mm%S', gmtime(time() - start_time))
+    warning = ' see log file' if log_size != LOG_FILE.tell() else ''
+    print(info + str_time + warning)
+    return gr_nb - 1
+
+
+def check_num_data(paths: Dict[str, str]) -> None:
+    """Check the *NumericalData* files, particularly the dataproduct type and XML duration format."""
+
+    regex_xml_duration = re.compile(r'(?P<sign>-?)P(?:(?P<years>\d+)Y)?(?:(?P<months>\d+)M)?(?:(?P<days>\d+)D)?' +
+                                    r'(?:T(?:(?P<hours>\d+)H)?(?:(?P<minutes>\d+)M)?(?:(?P<seconds>\d+)S)?)?')
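+    # This pattern matches ISO 8601 durations, e.g. 'PT4S' or 'P1DT2H30M' (illustrative examples).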
+
+    for _, dataset_local_path in paths.items():
+        tree = ElTr.parse(dataset_local_path)
+
+        if tree.getroot().tag == 'Message':
+            log('NUM-DATA_XML_MESSAGE',
+                'On NumericalData file %s' % dataset_local_path,
+                'The XML file contains this message: ' + tree.getroot().text,
+                'Skipped this dataset.')
+            continue
+
+        numdata_node = tree.getroot().find('{%s}NumericalData' % XMLNS)
+        numdata_node = tree.getroot().find('{%s}NumericalOutput' % XMLNS) if numdata_node is None else numdata_node
+
+        temporal_description_node = numdata_node.find('{%s}TemporalDescription' % XMLNS)
+
+        dataproduct_types = set()
+        for param in numdata_node.findall('{%s}Parameter' % XMLNS):
+            hints = param.findall('{%s}RenderingHints' % XMLNS)
+            dt_nodes = [hint.find('{%s}DisplayType' % XMLNS) for hint in hints]
+            for display in [display.text for display in dt_nodes if display is not None and display.text is not None]:
+                dataproduct_types.add(display)
+        if not dataproduct_types:
+            log('NO_DATAPRODUCT_TYPE',
+                'On NumericalData file %s' % dataset_local_path,
+                'There is no dataproduct type.',
+                'Set the dataproduct type to "TimeSeries".')
+            # ts is added in build_BDD.py
+
+        if temporal_description_node is not None:
+            for duration_key in ('Cadence_Min', 'Cadence_Max', 'Exposure'):
+                duration_node = temporal_description_node.find('{%s}%s' % (XMLNS, duration_key))
+                xml_duration = getattr(duration_node, 'text', 'P0D')
+                try:
+                    regex_xml_duration.match(xml_duration.upper()).groupdict(0)
+                except AttributeError:
+                    log('NUM-DATA_BAD_DATE',
+                        'On NumericalData file %s' % dataset_local_path,
+                        'Can not decode duration: %s.' % xml_duration,
+                        'Set the duration to 0.')
+                    duration_node.text = 'P0D'
+                    tree.write(dataset_local_path)
+
+
+def write_all_granules() -> None:
+    """Create the granules."""
+
+    black_list = tuple()
+    try:
+        with open(BLACKLIST_PATH) as f:
+            black_list += tuple(l.strip() for l in f.readlines() if l.strip() and not l.startswith('#'))
+    except IOError:
+        pass
+    print('ignored datasets: %s' % ', '.join(black_list))
+
+    print('Getting datasets spase ids...')
+    all_spase_id = get_datasets_ids()
+
+    print('Downloading dataset files into %s...' % SPASE_DIR)
+    datasets_spase_id = [num_data for num_data in all_spase_id if num_data.startswith(SPASE_PREFIX)]
+
+    spase_files_path = download_dataset_files(datasets_spase_id, black_list)
+    # We don't want to write granules from files which are not NumData
+    paths = {d_id: path for (d_id, path) in spase_files_path.items()
+             if any(keyword in path for keyword in NUMDATA_KEYWORDS)}
+
+    print('Checking numerical data files...')
+    check_num_data(paths)
+
+    print('Getting granules index file paths...')
+    url_prefix, grs_idx_url = get_granules_indexes_url()
+    reader = GranuleIndexReader(log)
+
+    n_datasets = 0
+    n_gr = 0
+
+    for dataset_id in grs_idx_url:
+        if dataset_id not in paths:
+            log('DATASET_INDEX_NOT_LINKED',
+                'dataset %s' % dataset_id,
+                'This dataset is found in the granules indexes json file (returned by %s), '
+                'but not in the resolver (%s).' % (GET_INDEXES_WEBSERVICE, RESOLVER_URL),
+                'Ignored this dataset.')
+
+    print('Creating granules...')
+    start_time = time()
+
+    for dataset_spase_id, dataset_local_path in paths.items():
+        nc_file_path = grs_idx_url.get(dataset_spase_id, '')
+        if not nc_file_path:
+            log('DATASET_NOT_IN_IDX_DIC',
+                'dataset %s' % dataset_spase_id,
+                'This dataset is not found in the granules indexes json file returned by %s.' % GET_INDEXES_WEBSERVICE,
+                'Set default times values for all granules of this dataset.')
+        grs_idx_list = reader.get_granules_index(dataset_spase_id, url_prefix + nc_file_path if nc_file_path else '')
+
+        for keyword in NUMDATA_KEYWORDS:
+            dataset_local_path = dataset_local_path.replace(keyword, GRANULE_KEYWORD)
+        grs_local_dir = op.dirname(dataset_local_path)
+        if not op.exists(grs_local_dir):
+            makedirs(grs_local_dir)
+
+        release_date = datetime.now().strftime(XML_DATE_FORMAT)
+        dataset_info = '%s dataset %d/%d (%.2f%%) %s' % \
+                       (strftime('%H:%M'), n_datasets + 1, len(paths),
+                        (n_datasets / len(paths) * 100), dataset_spase_id)
+        gr_dir_url_suffix = '' if not nc_file_path else '/'.join(nc_file_path.split('/')[:-1])
+        try:
+            n_gr += write_granules(dataset_spase_id, grs_local_dir, release_date, gr_dir_url_suffix, grs_idx_list,
+                                   dataset_info)
+        except Exception as error:
+            print('A problem occurred when creating a granule from dataset %s:' % dataset_spase_id)
+            LOG_FILE.close()
+            raise error
+        n_datasets += 1
+
+    elapsed = strftime('%Hh%Mm%S', gmtime(time() - start_time))
+    print('100%%, %d files created in %s.' % (n_gr, elapsed))
+
+
+if __name__ == '__main__':
+    if not op.exists(BASE_DIR):
+        makedirs(BASE_DIR)
+
+    if op.isdir(SPASE_DIR):
+        print('Clearing SPASE directory (%s)...' % SPASE_DIR)
+        shutil.rmtree(SPASE_DIR)
+
+    write_all_granules()
+
+    LOG_FILE.close()
diff --git a/nc_parser.py b/nc_parser.py
new file mode 100755
index 0000000..df86a47
--- /dev/null
+++ b/nc_parser.py
@@ -0,0 +1,194 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""This script parses netCdf files."""
+
+import re
+import os
+import os.path as op
+from datetime import datetime
+from mimetypes import MimeTypes
+from netCDF4 import Dataset
+import pathlib
+from collections import namedtuple
+from typing import List, Optional
+from tempfile import gettempdir
+from urllib.request import urlretrieve
+from urllib.error import HTTPError
+
+# dates format
+SPASE_DATE_FORMAT = '%Y%j%H%M%S'  # ex: 2016238000000*
+XML_DATE_FORMAT = '%Y-%m-%dT%H:%M:%SZ'  # ex: <StartDate>2016-08-26T00:00:00Z</StartDate>
+SPASE_INDEX_TEMP_PATH = op.join(gettempdir(), 'index.nc')
+
+GranuleIndex = namedtuple('GranuleIndex', 'start_date stop_date filename')
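+# Example with hypothetical values: GranuleIndex('2016-08-26T00:00:00Z', '2016-08-27T00:00:00Z', 'mag_20160826.nc')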
+
+
+class GranuleIndexReader:
+
+    def __init__(self, log_fct):
+        self.log_fct = log_fct
+
+    def load_dataset(self, target_name: str, granule_index_url: str) -> Optional[Dataset]:
+        """Load the Dataset stored in `self.nc_file_path`."""
+        if op.isfile(SPASE_INDEX_TEMP_PATH):
+            os.remove(SPASE_INDEX_TEMP_PATH)
+
+        try:
+            urlretrieve(granule_index_url, SPASE_INDEX_TEMP_PATH)
+        except HTTPError:
+            self.log_fct('INDEX_INACCESSIBLE',
+                         'dataset %s' % target_name,
+                         'Can not access %s.' % granule_index_url,
+                         'Filled this dataset with 1 granule containing default values, granules URLs will be wrong!')
+            return
+
+        if not op.isfile(SPASE_INDEX_TEMP_PATH):
+            self.log_fct('INDEX_FILE_NOT_FOUND',
+                         'dataset %s' % target_name,
+                         'The granules index file has not been correctly downloaded.',
+                         'Filled this dataset with 1 granule containing default values, granules URLs will be wrong!')
+            return
+
+        mime_type = MimeTypes().guess_type(pathlib.Path(op.abspath(SPASE_INDEX_TEMP_PATH)).as_uri())[0]
+        if mime_type != 'application/x-netcdf':
+            self.log_fct('INDEX_FILE_NOT_NET-CDF',
+                         'dataset %s' % target_name,
+                         'The mime-type of the granules index file is not application/x-netcdf but "%s". See %s.' %
+                         (mime_type, SPASE_INDEX_TEMP_PATH),
+                         'Filled this dataset with 1 granule containing default values, granules URLs will be wrong!')
+            return
+
+        try:
+            return Dataset(SPASE_INDEX_TEMP_PATH)
+        except Exception as e:
+            self.log_fct('CANT_LOAD_INDEX_FILE',
+                         'dataset %s' % target_name,
+                         'Can not load the granules index file with NetCDF4 (%s). '
+                         'See %s.' % (e, SPASE_INDEX_TEMP_PATH),
+                         'Filled this dataset with 1 granule containing default values, granules URLs will be wrong!')
+
+    def get_granules_index(self, target_name: str, nc_file_path: str) -> List[GranuleIndex]:
+        if not nc_file_path:
+            return [GranuleIndex('0001-01-01T00:00:00Z', '0001-01-01T00:00:00Z', target_name + '_unknown.nc')]
+
+        dataset = self.load_dataset(target_name, nc_file_path)
+        if not dataset:
+            return [GranuleIndex('0001-01-01T00:00:00Z', '0001-01-01T00:00:00Z', target_name + '_unknown.nc')]
+
+        str_start_time = self.nc_ba_to_strings(target_name, 'StartTime', dataset.variables['StartTime'][:])
+        str_stop_time = self.nc_ba_to_strings(target_name, 'StopTime', dataset.variables['StopTime'][:])
+        file_names = self.nc_ba_to_strings(target_name, 'FileName', dataset.variables['FileName'][:])
+        xml_start_times = self.get_nc_times(target_name, str_start_time)
+        xml_stop_times = self.get_nc_times(target_name, str_stop_time)
+
+        rec_len = dataset.dimensions['record'].size
+        granules_index = [GranuleIndex(xml_start_times[i], xml_stop_times[i], file_names[i]) for i in range(rec_len)]
+        dataset.close()
+
+        return granules_index
+
+    def nc_ba_to_strings(self, target_name: str, col_name: str, byte_arrays: List):
+        """Convert a net-cdf byte array to a string.
+    If ``UnicodeDecodeError`` is raised, converts only the bytes before the first ``b''``.
+
+    - ``byte_arrays``: A net-cdf bytes array;
+    - ``return``: The string representation of the bytes array."""
+
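+        # e.g. (illustrative): [[b'm', b'a', b'g', b'.', b'n', b'c', b'', b'']] -> ['mag.nc']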
+        strings = []
+        for i, bytes_array in enumerate(byte_arrays):
+            txt = []
+            string_ended = False
+            for j, byte in enumerate(bytes_array):
+                if byte:
+                    if string_ended:
+                        hex_array = ', '.join([str(byte) for byte in bytes_array])
+                        self.log_fct('INVISIBLE_BYTES',
+                                     'granules index "%s" on column %s and row %d' % (target_name, col_name, i),
+                                     'The bytes array contains the byte b\'\' (at index %d), ' % j +
+                                     'followed by other characters: [%s]. ' % hex_array,
+                                     'Removed all characters after the first occurrence of b\'\' in the array.')
+                        break
+                    try:
+                        txt.append(byte.decode('utf-8'))
+                    except UnicodeDecodeError:
+                        hex_array = ', '.join([str(byte) for byte in bytes_array])
+                        self.log_fct('BAD_BYTES',
+                                     'granules index "%s" on column %s and row %d' % (target_name, col_name, i),
+                                     'Can not decode byte %s at index %d in the bytes array: [%s].'
+                                     % (str(byte), j, hex_array),
+                                     'Removed this byte and all the following ones.')
+                        break
+                else:
+                    string_ended = True
+            strings.append(''.join(txt))
+        return strings
+
+    def get_nc_times(self, target_name: str, nc_times: List[str]):
+        """Converts an array of *SPASE dates* to an array of **XML dates*.
+
+    - ``nc_times``: An array of string, containing the dates in their net-cdf format.
+    - ``self.target_name``: The url of the net-cdf file of the granule, only used to print it in log_fct.
+    - ``return``: An array of string, containing the dates in their XML format."""
+
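+        # e.g. (using the formats defined at the top of this file): '2016238000000000' -> '2016-08-26T00:00:00Z'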
+        contains_non_digit_chars = re.compile(r'.*\D.*')
+        dates = []
+        for nc_time in nc_times:
+            if contains_non_digit_chars.match(nc_time):
+                self.log_fct('DATE_NO_NUM',
+                             'granules index "%s"' % target_name,
+                             'The date "%s" contains non-numerical characters.' % nc_time,
+                             'Removed the non-numerical characters.')
+                nc_time = re.sub(r'\D', '', nc_time)
+            if len(nc_time) > 16:
+                self.log_fct('DATE_TOO_LONG',
+                             'granules index "%s"' % target_name,
+                             'The length of the date "%s" is more than 16 chars.' % nc_time,
+                             'Truncated it to 16 chars.')
+                nc_time = nc_time[:16]
+            if len(nc_time) < 16:
+                self.log_fct('DATE_TOO_SHORT',
+                             'granules index "%s"' % target_name,
+                             'The length of the date "%s" is less than 16 chars.' % nc_time,
+                             'Padded it with 0 up to 16 chars.')
+                nc_time = nc_time.ljust(16, '0')
+
+            year, days = int(nc_time[:4]), int(nc_time[4:7]) + 1
+            hour, minute, sec = int(nc_time[7:9]), int(nc_time[9:11]), int(nc_time[11:13])
+
+            if year == 0:
+                self.log_fct('WRONG_YEAR',
+                             'granules index "%s", date ' % target_name,
+                             'The year of the date "%s" is 0.' % nc_time,
+                             'Replaced by 1.')
+                year = 1
+            # check leap years:
+            max_days = 366 if (year % 4 == 0 and not (year % 100 == 0 and year % 400 != 0)) else 365
+            if days > max_days:
+                self.log_fct('WRONG_DAY',
+                             'granules index "%s"' % target_name,
+                             'The day of the year in the date "%s" is > %d.' % (nc_time, max_days),
+                             'Replaced by %d.' % max_days)
+                days = max_days
+            if hour > 23:
+                self.log_fct('WRONG_HOUR',
+                             'granules index "%s"' % target_name,
+                             'The hour of the time "%s" is > 23.' % nc_time,
+                             'Replaced by 23.')
+                hour = 23
+            if minute > 59:
+                self.log_fct('WRONG_MIN',
+                             'granules index "%s"' % target_name,
+                             'The minute of the time "%s" is > 59.' % nc_time,
+                             'Replaced by 59.')
+                minute = 59
+            if sec > 59:
+                self.log_fct('WRONG_SEC',
+                             'granules index "%s"' % target_name,
+                             'The second of the time "%s" is > 59.' % nc_time,
+                             'Replaced by 59.')
+                sec = 59
+
+            str_date = '%04d%03d%02d%02d%02d' % (year, days, hour, minute, sec)
+            dates.append(datetime.strptime(str_date, SPASE_DATE_FORMAT).strftime(XML_DATE_FORMAT))
+        return dates
--
libgit2 0.21.2