reader.py 12.3 KB
Edit Raw Blame History



1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

255

256

257

258

259

260

261

262

263

264

265

266

267

268

269

270

271

272

273

274

275

276

277

278

279

280

281

282

283

284

285

286

287

288

289

290

291

292

293

294

295

296

297

298

299

300

301

302

303

304

305

306

307

308

309

310

311

312

313

314

315

316

317

318

319

320

321

322

323

324

325

326

327

328

329

330

331

332

333

334

335

336

337

338

339

340

341

342

343

344

345

346

347

348

349

350

351

352

353

354

355

356

357

358

359

360

361

362

363

364

365

366

367

368

369

370

371

372

373

374

375

376

377

378

379

380

381

382

383

384

385


#! /usr/bin/python
# -*- coding: utf-8 -*-


"""
    Python reader to stream csv HAPI formatted data
    python reader.py -tmin 1991-01-01T00:00:00 -tmax 1992-01-01T00:00:00 -id tao_mars_dsc -path /Users/aloh/Documents/Work/HAPI/hapi_amda/data/tao/TAO/MARS/SW/
    
    gcc GetFileNames.c DD_time.c -lnetcdf -o GetFileNames
"""


import sys
import os
import shutil
import gzip
import tempfile
import time
import calendar
import datetime
import dateutil.parser
import xml.etree.ElementTree as ET
import numpy
from numpy import __version__ as numpy_version
from collections import OrderedDict

#import ctypes
import subprocess


#_DDSYS_PATH = os.environ['DDBASE'] + '/DDsys.xml'
#_GETFILENAMES = os.environ['DDBASEBIN'] + '/GetFileNames'
_DDSYS_PATH = '/data/DDBASE/DATA/DDsys.xml'
_GETFILENAMES = '/home/budnik/AMDANEW/DDLIB/bin/GetFileNames'
os.environ['LD_LIBRARY_PATH'] = '/home/budnik/AMDANEW/DDLIB/lib/' 

from netCDF4 import Dataset as ncDataset
from netCDF4 import __version__ as netCDF_version

#print('# numpy: {}, netCDF4: {}'.format(numpy_version, netCDF_version))
# recommended: numpy='1.11.3', netCDF4='1.4.2'


def parse_arguments(argv):
    """ Re-organize the arguments
        ['-x', '34', '-y', '-z', '2']
        would become
        [['-x', '34'], ['-y', True], ['-z', '2']]
    """
    newargv = []
    i = 0
    while i < len(argv):
        argtuple = [0, 0]
        if argv[i].startswith('-'):
            argtuple[0] = argv[i]
            if i == len(argv)-1:
                argtuple[1] = True
                i += 1
            elif argv[i+1].startswith('-'):
                argtuple[1] = True
                i += 1
            else:
                argtuple[1] = argv[i+1]
                i += 2
        else:
            i += 1
        newargv.append( tuple(argtuple) )
    return newargv


def read_time_arg(argv, name):
    """ Read the requested time argument.

        Parameters
        ----------
        argv : list
            List of arguments
        name : str
            tmin or tmax

        Returns
        -------
        time : str
            Time correctly formatted
    """
    assert name in ['tmin', 'tmax'], 'name should be tmin or tmax'

    for arg in argv:

        if name in arg[0].lower():

            # Check the time format
            time = arg[1]
            assert isinstance(time, str), 'time is not a string'

            try:
                istime = (time[4] == '-') & (time[8] == 'T') & (time[11] == ':') & (time[14] == ':')
            except:
                istime = False
            #assert istime, 'time doesnt seem like yyyy-dddThh:mm:ss'

            return dateutil.parser.parse(time, ignoretz=True)

    return None


def read_id_arg(argv):
    """ Read the requested ID argument

        Parameters
        ----------
        argv : list
            List of arguments
    """
    for arg in argv:

        if 'id' in arg[0].lower():

            iden = arg[1]

            return iden

    return None


def read_param_arg(argv):
    """ Read the requested parameters argument

        Parameters
        ----------
        argv : list
            List of arguments
    """
    for arg in argv:

        if ('param' in arg[0].lower()) or ('parameters' in arg[0].lower()):

            params = arg[1].split(',')

            return params

    return None


def read_path_arg(argv): 
    """ Read the requested parameters argument

        Parameters
        ----------
        argv : list
            List of arguments
    """
    for arg in argv:

        if 'path' in arg[0].lower():

            path = arg[1]

            return path

    return None

def open_netcdf(fname):
    if fname.endswith(".gz"):
        infile = gzip.open(fname, 'rb')
        tmp = tempfile.NamedTemporaryFile(delete=False)
        shutil.copyfileobj(infile, tmp)
        infile.close()
        tmp.close()
        data = ncDataset(tmp.name)
        os.unlink(tmp.name)
    else:
        data = ncDataset(fname)
    return data


def amda_to_datetime(amda_time):
    """
    Convert amda DD time to datetime.
    :param amda_time: (string) encoded time.
    :return: (datetime.datetime)
    """
    # str_time_year = "".join([c.item().decode('ascii') for c in amda_time[0:4]])
    # str_time_jday = "{:03d}".format(int("".join([c.item().decode('ascii') for c in amda_time[4:7]]))+1)
    # str_time_hmsm = "".join([c.item().decode('ascii') for c in amda_time[7:]])

    str_time_year = "".join(amda_time[0:4])
    str_time_jday = "{:03d}".format(int("".join(amda_time[4:7]))+1)
    str_time_hmsm = "".join([c for c in amda_time[7:] if isinstance(c, str)])
    return datetime.datetime.strptime(str_time_year+str_time_jday+str_time_hmsm+'000',"%Y%j%H%M%S%f")


def get_metadata_from_ddsys(name, localpath):
    """
    Get dataset information from ddsys.xml file, using dataset name
    :param name: (string) DD dataset name
    :return: (dict)
    """

    metadata = {}
    dd_sys = ET.parse(_DDSYS_PATH)
    for record in dd_sys.getroot().iter('VI'):

        # selecting only the <VI>...</VI> element with the correct ID
        if record[0].text == name:

            for child in record:
                metadata[child.tag] = child.text

    if localpath is None:
        localpath = metadata['LOCATION']

    dd_info = ET.parse('{}{}'.format(localpath, metadata['INFO'].replace('.nc', '.xml')))
    for child in dd_info.getroot():
        if child.tag.startswith('Global') or child.tag.startswith('Local'):
            metadata[child.tag] = dateutil.parser.parse(child.text, ignoretz=True)
        else:
            metadata[child.tag] = child.text

    return metadata


def get_start_stop_from_times_nc_file_v0(metadata, localpath):
    """
    Get StartTime, StopTime and FileName from the times.nc file of th current dataset
    :param metadata: (dict) DDsys metadata for the current dataset
    :return: (numpy.array, numpy.array, numpy.array)
    """
    if localpath is None:
        localpath = metadata['LOCATION']

    times_nc_file = "{}{}".format(localpath, metadata['TIMES'])
    
    times_nc_data = open_netcdf(times_nc_file)

    def parse_filename(name_array, localpath=localpath):
        return "{}{}".format(localpath, ''.join(name_array.data))

    start = map(amda_to_datetime, times_nc_data.variables['StartTime'])
    stop = map(amda_to_datetime, times_nc_data.variables['StopTime'])
    filenames = map(parse_filename, times_nc_data.variables['FileName'])

    # return numpy.array([amda_to_datetime(item) for item in times_nc_data.variables['StartTime']]), \
    #        numpy.array([amda_to_datetime(item) for item in times_nc_data.variables['StopTime']]), \
    #        numpy.array(["{}{}".format(localpath, ''.join(item.data))
    #                     for item in times_nc_data.variables['FileName']])
    return numpy.array(start), numpy.array(stop), numpy.array(filenames)


def get_start_stop_from_times_nc_file(metadata, localpath, tmin, tmax):
    """
    """

    if localpath is None:
        localpath = metadata['LOCATION']

    times_nc_file = "{}{}".format(localpath, metadata['TIMES'])
    start =  calendar.timegm(tmin.timetuple())
    stop =  calendar.timegm(tmax.timetuple())

    command = _GETFILENAMES+' '+str(times_nc_file)+' '+str(start)+' '+str(stop)
    result = subprocess.check_output(command, shell=True)
    result = result.strip(';').split(';')
    result = ['{}{}'.format(localpath, rr) for rr in result]
    return result


def check_time_range(tmin, tmax, metadata):
    """ Check that start and stop are within the metadata Global time range

        Parameters
        ----------
        tmin : datetime
        tmax : datetime
        metadata : dict
    """
    if tmin < metadata['GlobalStart']:
        raise ValueError('tmin lower than GlobalStart {}'.format(metadata['GlobalStart']))
    if tmax > metadata['GlobalStop']:
        raise ValueError('tmax greater than GlobalStop {}'.format(metadata['GlobalStop']))
    return


#--------------------------------------------------#
def main(argv):
    argv = parse_arguments(argv)

    tmin = read_time_arg(argv, 'tmin')
    tmax = read_time_arg(argv, 'tmax')
    iden = read_id_arg(argv)
    para = read_param_arg(argv)
    path = read_path_arg(argv)

    meta = get_metadata_from_ddsys(iden, localpath=path)
    check_time_range(tmin, tmax, meta)

    # start, stop, files = get_start_stop_from_times_nc_file(meta, localpath=path, tmin=tmin, tmax=tmax)

    # timemask = (start <= tmax) & (stop >= tmin)

    # for ncfile in files[timemask]:

    files = get_start_stop_from_times_nc_file(meta, localpath=path, tmin=tmin, tmax=tmax)
    for ncfile in files:
        nc = open_netcdf(ncfile + '.gz')
        cur_start_time = amda_to_datetime(nc.variables['StartTime'])  # extract amda-formatted start_time
        cur_end_time = amda_to_datetime(nc.variables['StopTime'])  # extract amda-formatted end_time

        variables = nc.variables
        # if para is not None:
            # para_dict = []
            # para_vari = []
            # for par in para:
            #     if ('[' in par) & (']' in par):
            #         para_index = int(par[par.find('[')+1:par.find(']')])
            #         para_name = par.split('[')[0]
            #     else:
            #         para_index = None
            #         para_name  = par
            #     para_dict.append( (para_name, para_index) )
            #     para_vari.append( (para_name, variables[para_name]))

            # var_index = OrderedDict(para_dict)
            # variables = OrderedDict(para_vari)

            # print(var_index)
            # print(variables)

        if para is not None:
            variables = OrderedDict([(key.split('[')[0], variables[key.split('[')[0]]) for key in para])
            var_index = OrderedDict([(key.split('[')[0], []) for key in para])

            for par in para:
                if ('[' in par) & (']' in par):
                    para_index = int(par[par.find('[')+1:par.find(']')])
                    para_name = par.split('[')[0]
                else:
                    para_index = None
                    para_name  = par
                var_index[para_name].append(para_index)
         
            #variables = OrderedDict([(key, variables[key]) for key in para])
        if tmin <= cur_end_time and tmax >= cur_start_time:  # checking if file contains data within interval
                # time = [amda_to_datetime(cur_time) for cur_time in nc.variables['Time']]
                time = map(amda_to_datetime, nc.variables['Time'])

                for cur_index, cur_dt in enumerate(time):
                    if cur_dt > tmax:
                        break

                    if cur_dt >= tmin and cur_dt <= tmax:
                        stream = '{}'.format(cur_dt.isoformat())                      
                        for var in variables:
                            if not 'Time' in var:
                                cur_data = nc.variables[var][cur_index]

                                # check if this is a vector (for the velocity)
                                # assert isinstance(cur_data, numpy.ndarray), '{}, index {} is not a numpy.ndarray'.format(var, cur_index)
                                if isinstance(cur_data, numpy.ndarray):
                                    if cur_data.size > 1:
                                        # for item in cur_data:
                                        #    stream += ', {}'.format(item)
                                        
                                        if para is not None:
                                            if var_index[var] == [None]:
                                                for item in cur_data:
                                                    stream += ', {}'.format(item)
                                            else:
                                                for i in var_index[var]:
                                                    stream += ', {}'.format(cur_data[i])
                                        else:
                                            for item in cur_data:
                                                stream += ', {}'.format(item)
                                    else:  
                                        stream += ', {}'.format(cur_data)
                                else:  
                                        stream += ', {}'.format(cur_data)
                        print(stream)
        
#--------------------------------------------------#


if __name__ == "__main__":
    main(sys.argv[1:])