reader.py 12.3 KB
#! /usr/bin/python
# -*- coding: utf-8 -*-


"""
    Python reader to stream csv HAPI formatted data
    python reader.py -tmin 1991-01-01T00:00:00 -tmax 1992-01-01T00:00:00 -id tao_mars_dsc -path /Users/aloh/Documents/Work/HAPI/hapi_amda/data/tao/TAO/MARS/SW/
    
    gcc GetFileNames.c DD_time.c -lnetcdf -o GetFileNames
"""


import sys
import os
import shutil
import gzip
import tempfile
import time
import calendar
import datetime
import dateutil.parser
import xml.etree.ElementTree as ET
import numpy
from numpy import __version__ as numpy_version
from collections import OrderedDict

#import ctypes
import subprocess



#_DDSYS_PATH = os.environ['DDBASE'] + '/DDsys.xml'
#_GETFILENAMES = os.environ['DDBASEBIN'] + '/GetFileNames'
_DDSYS_PATH = '/data/DDBASE/DATA/DDsys.xml'
_GETFILENAMES = '/home/budnik/AMDANEW/DDLIB/bin/GetFileNames'
os.environ['LD_LIBRARY_PATH'] = '/home/budnik/AMDANEW/DDLIB/lib/' 

from netCDF4 import Dataset as ncDataset
from netCDF4 import __version__ as netCDF_version

#print('# numpy: {}, netCDF4: {}'.format(numpy_version, netCDF_version))
# recommended: numpy='1.11.3', netCDF4='1.4.2'


def parse_arguments(argv):
    """ Re-organize the arguments
        ['-x', '34', '-y', '-z', '2']
        would become
        [['-x', '34'], ['-y', True], ['-z', '2']]
    """
    newargv = []
    i = 0
    while i < len(argv):
        argtuple = [0, 0]
        if argv[i].startswith('-'):
            argtuple[0] = argv[i]
            if i == len(argv)-1:
                argtuple[1] = True
                i += 1
            elif argv[i+1].startswith('-'):
                argtuple[1] = True
                i += 1
            else:
                argtuple[1] = argv[i+1]
                i += 2
        else:
            i += 1
        newargv.append( tuple(argtuple) )
    return newargv


def read_time_arg(argv, name):
    """ Read the requested time argument.

        Parameters
        ----------
        argv : list
            List of arguments
        name : str
            tmin or tmax

        Returns
        -------
        time : str
            Time correctly formatted
    """
    assert name in ['tmin', 'tmax'], 'name should be tmin or tmax'

    for arg in argv:

        if name in arg[0].lower():

            # Check the time format
            time = arg[1]
            assert isinstance(time, str), 'time is not a string'

            try:
                istime = (time[4] == '-') & (time[8] == 'T') & (time[11] == ':') & (time[14] == ':')
            except:
                istime = False
            #assert istime, 'time doesnt seem like yyyy-dddThh:mm:ss'

            return dateutil.parser.parse(time, ignoretz=True)

    return None


def read_id_arg(argv):
    """ Read the requested ID argument

        Parameters
        ----------
        argv : list
            List of arguments
    """
    for arg in argv:

        if 'id' in arg[0].lower():

            iden = arg[1]

            return iden

    return None


def read_param_arg(argv):
    """ Read the requested parameters argument

        Parameters
        ----------
        argv : list
            List of arguments
    """
    for arg in argv:

        if ('param' in arg[0].lower()) or ('parameters' in arg[0].lower()):

            params = arg[1].split(',')

            return params

    return None


def read_path_arg(argv): 
    """ Read the requested parameters argument

        Parameters
        ----------
        argv : list
            List of arguments
    """
    for arg in argv:

        if 'path' in arg[0].lower():

            path = arg[1]

            return path

    return None

def open_netcdf(fname):
    if fname.endswith(".gz"):
        infile = gzip.open(fname, 'rb')
        tmp = tempfile.NamedTemporaryFile(delete=False)
        shutil.copyfileobj(infile, tmp)
        infile.close()
        tmp.close()
        data = ncDataset(tmp.name)
        os.unlink(tmp.name)
    else:
        data = ncDataset(fname)
    return data


def amda_to_datetime(amda_time):
    """
    Convert amda DD time to datetime.
    :param amda_time: (string) encoded time.
    :return: (datetime.datetime)
    """
    # str_time_year = "".join([c.item().decode('ascii') for c in amda_time[0:4]])
    # str_time_jday = "{:03d}".format(int("".join([c.item().decode('ascii') for c in amda_time[4:7]]))+1)
    # str_time_hmsm = "".join([c.item().decode('ascii') for c in amda_time[7:]])

    str_time_year = "".join(amda_time[0:4])
    str_time_jday = "{:03d}".format(int("".join(amda_time[4:7]))+1)
    str_time_hmsm = "".join([c for c in amda_time[7:] if isinstance(c, str)])
    return datetime.datetime.strptime(str_time_year+str_time_jday+str_time_hmsm+'000',"%Y%j%H%M%S%f")


def get_metadata_from_ddsys(name, localpath):
    """
    Get dataset information from ddsys.xml file, using dataset name
    :param name: (string) DD dataset name
    :return: (dict)
    """

    metadata = {}
    dd_sys = ET.parse(_DDSYS_PATH)
    for record in dd_sys.getroot().iter('VI'):

        # selecting only the <VI>...</VI> element with the correct ID
        if record[0].text == name:

            for child in record:
                metadata[child.tag] = child.text

    if localpath is None:
        localpath = metadata['LOCATION']

    dd_info = ET.parse('{}{}'.format(localpath, metadata['INFO'].replace('.nc', '.xml')))
    for child in dd_info.getroot():
        if child.tag.startswith('Global') or child.tag.startswith('Local'):
            metadata[child.tag] = dateutil.parser.parse(child.text, ignoretz=True)
        else:
            metadata[child.tag] = child.text

    return metadata


def get_start_stop_from_times_nc_file_v0(metadata, localpath):
    """
    Get StartTime, StopTime and FileName from the times.nc file of th current dataset
    :param metadata: (dict) DDsys metadata for the current dataset
    :return: (numpy.array, numpy.array, numpy.array)
    """
    if localpath is None:
        localpath = metadata['LOCATION']

    times_nc_file = "{}{}".format(localpath, metadata['TIMES'])
    
    times_nc_data = open_netcdf(times_nc_file)

    def parse_filename(name_array, localpath=localpath):
        return "{}{}".format(localpath, ''.join(name_array.data))

    start = map(amda_to_datetime, times_nc_data.variables['StartTime'])
    stop = map(amda_to_datetime, times_nc_data.variables['StopTime'])
    filenames = map(parse_filename, times_nc_data.variables['FileName'])

    # return numpy.array([amda_to_datetime(item) for item in times_nc_data.variables['StartTime']]), \
    #        numpy.array([amda_to_datetime(item) for item in times_nc_data.variables['StopTime']]), \
    #        numpy.array(["{}{}".format(localpath, ''.join(item.data))
    #                     for item in times_nc_data.variables['FileName']])
    return numpy.array(start), numpy.array(stop), numpy.array(filenames)


def get_start_stop_from_times_nc_file(metadata, localpath, tmin, tmax):
    """
    """

    if localpath is None:
        localpath = metadata['LOCATION']

    times_nc_file = "{}{}".format(localpath, metadata['TIMES'])
    start =  calendar.timegm(tmin.timetuple())
    stop =  calendar.timegm(tmax.timetuple())

    command = _GETFILENAMES+' '+str(times_nc_file)+' '+str(start)+' '+str(stop)
    result = subprocess.check_output(command, shell=True)
    result = result.strip(';').split(';')
    result = ['{}{}'.format(localpath, rr) for rr in result]
    return result


def check_time_range(tmin, tmax, metadata):
    """ Check that start and stop are within the metadata Global time range

        Parameters
        ----------
        tmin : datetime
        tmax : datetime
        metadata : dict
    """
    if tmin < metadata['GlobalStart']:
        raise ValueError('tmin lower than GlobalStart {}'.format(metadata['GlobalStart']))
    if tmax > metadata['GlobalStop']:
        raise ValueError('tmax greater than GlobalStop {}'.format(metadata['GlobalStop']))
    return


#--------------------------------------------------#
def main(argv):
    argv = parse_arguments(argv)

    tmin = read_time_arg(argv, 'tmin')
    tmax = read_time_arg(argv, 'tmax')
    iden = read_id_arg(argv)
    para = read_param_arg(argv)
    path = read_path_arg(argv)

    meta = get_metadata_from_ddsys(iden, localpath=path)
    check_time_range(tmin, tmax, meta)

    # start, stop, files = get_start_stop_from_times_nc_file(meta, localpath=path, tmin=tmin, tmax=tmax)

    # timemask = (start <= tmax) & (stop >= tmin)

    # for ncfile in files[timemask]:

    files = get_start_stop_from_times_nc_file(meta, localpath=path, tmin=tmin, tmax=tmax)
    for ncfile in files:
        nc = open_netcdf(ncfile + '.gz')
        cur_start_time = amda_to_datetime(nc.variables['StartTime'])  # extract amda-formatted start_time
        cur_end_time = amda_to_datetime(nc.variables['StopTime'])  # extract amda-formatted end_time

        variables = nc.variables
        # if para is not None:
            # para_dict = []
            # para_vari = []
            # for par in para:
            #     if ('[' in par) & (']' in par):
            #         para_index = int(par[par.find('[')+1:par.find(']')])
            #         para_name = par.split('[')[0]
            #     else:
            #         para_index = None
            #         para_name  = par
            #     para_dict.append( (para_name, para_index) )
            #     para_vari.append( (para_name, variables[para_name]))

            # var_index = OrderedDict(para_dict)
            # variables = OrderedDict(para_vari)

            # print(var_index)
            # print(variables)

        if para is not None:
            variables = OrderedDict([(key.split('[')[0], variables[key.split('[')[0]]) for key in para])
            var_index = OrderedDict([(key.split('[')[0], []) for key in para])

            for par in para:
                if ('[' in par) & (']' in par):
                    para_index = int(par[par.find('[')+1:par.find(']')])
                    para_name = par.split('[')[0]
                else:
                    para_index = None
                    para_name  = par
                var_index[para_name].append(para_index)
         
            #variables = OrderedDict([(key, variables[key]) for key in para])
        if tmin <= cur_end_time and tmax >= cur_start_time:  # checking if file contains data within interval
                # time = [amda_to_datetime(cur_time) for cur_time in nc.variables['Time']]
                time = map(amda_to_datetime, nc.variables['Time'])

                for cur_index, cur_dt in enumerate(time):
                    if cur_dt > tmax:
                        break

                    if cur_dt >= tmin and cur_dt <= tmax:
                        stream = '{}'.format(cur_dt.isoformat())                      
                        for var in variables:
                            if not 'Time' in var:
                                cur_data = nc.variables[var][cur_index]

                                # check if this is a vector (for the velocity)
                                # assert isinstance(cur_data, numpy.ndarray), '{}, index {} is not a numpy.ndarray'.format(var, cur_index)
                                if isinstance(cur_data, numpy.ndarray):
                                    if cur_data.size > 1:
                                        # for item in cur_data:
                                        #    stream += ', {}'.format(item)
                                        
                                        if para is not None:
                                            if var_index[var] == [None]:
                                                for item in cur_data:
                                                    stream += ', {}'.format(item)
                                            else:
                                                for i in var_index[var]:
                                                    stream += ', {}'.format(cur_data[i])
                                        else:
                                            for item in cur_data:
                                                stream += ', {}'.format(item)
                                    else:  
                                        stream += ', {}'.format(cur_data)
                                else:  
                                        stream += ', {}'.format(cur_data)
                        print(stream)
        
#--------------------------------------------------#


if __name__ == "__main__":
    main(sys.argv[1:])