From 9f59dba799ae9abdfb89e83e52a40d878c0cf610 Mon Sep 17 00:00:00 2001
From: Benjamin Renard <benjamin.renard@akka.eu>
Date: Mon, 26 Jul 2021 15:07:15 +0200
Subject: [PATCH] Fix CDAWeb tree generation (#8899)

---
 src/REMOTEDATA/CDAWEB.php            | 253 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------------------------------------------------------------------------------------
 src/REMOTEDATA/CDAWebConfigClass.php |   6 +-----
 2 files changed, 163 insertions(+), 96 deletions(-)

diff --git a/src/REMOTEDATA/CDAWEB.php b/src/REMOTEDATA/CDAWEB.php
index 6824cdc..7871120 100644
--- a/src/REMOTEDATA/CDAWEB.php
+++ b/src/REMOTEDATA/CDAWEB.php
@@ -6,7 +6,9 @@
  */
 class CDAWEB extends RemoteDataCenterClass
 {
-	private $ch, $res, $dataViewUR; 
+	private $dataViewURL = NULL;
+
+	private $ch; 
 	private $obsGroupsIds;
 	private $spase_res, $insXML, $xp = null;
 	
@@ -66,7 +68,7 @@ class CDAWEB extends RemoteDataCenterClass
 	{	
 		$this->ch = curl_init();			 
 		curl_setopt($this->ch, CURLOPT_RETURNTRANSFER, 1);
-		curl_setopt($this->ch, CURLOPT_TIMEOUT, 60);
+		curl_setopt($this->ch, CURLOPT_TIMEOUT, 600);
 		// Add proxy definition
 		$PROXY_HOST=getenv('PROXY_HOST');
 		$PROXY_USERPWD=getenv('PROXY_USERPWD');
@@ -85,35 +87,21 @@ class CDAWEB extends RemoteDataCenterClass
 
 	private function initStreamContext()
 	{
+		$context = array(
+			'http' => array(
+				'timeout' => 600,
+			),
+		);
 		$PROXY_HOST=getenv('PROXY_HOST');
 		$PROXY_USERPWD=getenv('PROXY_USERPWD');
 		if (!empty($PROXY_HOST)) {
-			$context = array(
-				'http' => array(
-					'proxy' => "tcp://$PROXY_HOST",
-					'request_fulluri' => true,
-				),
-			);
+			$context['http']['proxy'] = "tcp://$PROXY_HOST";
+			$context['http']['request_fulluri'] = TRUE;
 			if (!empty($PROXY_USERPWD)) {
 				$context['http']['header'] = "Proxy-Authorization: Basic ".base64_encode($PROXY_USERPWD);
                         }
-			stream_context_set_default($context);
 		}
-	}
-	
-	protected function setDataViewURL()
-	{
-		curl_setopt($this->ch, CURLOPT_URL, CDAWebConfigClass::$restUrl."/dataviews");
-		
-		$this->res = new DomDocument();	
-		$this->res->loadXML(curl_exec($this->ch));
-		
-		if ($this->res->getElementsByTagName('EndpointAddress')->length == 0) {
-				error_log("Problem connect to ".CDAWebConfigClass::$restUrl."/dataviews", 3, err);
-				exit("Problem connect to ".CDAWebConfigClass::$restUrl."/dataviews".PHP_EOL);
-		}
-		
-		$this->dataViewURL = $this->res->getElementsByTagName('EndpointAddress')->item(0)->nodeValue;
+		stream_context_set_default($context);
 	}
 	
 	/**
@@ -123,16 +111,30 @@ class CDAWEB extends RemoteDataCenterClass
 	protected function getRemoteTree()	
 	{
         $this->openConnection();
-		
-		$this->setDataViewURL();     
         $this->obsGroupsIds = array();   
-        
-        curl_setopt($this->ch, CURLOPT_URL, $this->dataViewURL."/datasets");			   
 
-        $this->res->loadXML(curl_exec($this->ch));
-     //   $this->res->save("datasets.xml"); 
-     //   $this->res->load("datasets.xml");
-        $datasets = $this->res->getElementsByTagName("DatasetDescription");
+	$dom = NULL;
+	$reuse_cache_file = FALSE;
+	if (file_exists($this->location."/datasets.xml")) {
+		if (time() - filemtime($this->location."/datasets.xml") < 86400) {
+			$dom = new DOMDocument();
+			if ($dom->load($this->location."/datasets.xml")) {
+				echo "Re-use cache file ".$this->location."/datasets.xml".PHP_EOL;
+				$reuse_cache_file = TRUE;
+			}
+		}
+	}
+
+	if (!$reuse_cache_file) {
+		$dom = $this->loadFromCDAWebWS("datasets", array());
+		if (!$dom) {
+			error_log('Cannot retrieve CDAWeb datasets list');
+			return;
+		}
+		$dom->save($this->location."/datasets.xml");
+	}
+
+        $datasets = $dom->getElementsByTagName("DatasetDescription");
     
         echo "All CDAWeb datasets : ".$datasets->length.PHP_EOL;
 
@@ -195,14 +197,16 @@ class CDAWEB extends RemoteDataCenterClass
 	{		
 		if ( !array_key_exists($dsId, $this->CDAWEB ))
 			return null; // no description in SpaseRegistry => we do not add this dataset
-		
+
 		$dsNode = $this->domAmda->createElement('dataset');
+
 		
-		curl_setopt($this->ch, CURLOPT_HTTPHEADER, array("Accept: application/json"));
-		curl_setopt($this->ch, CURLOPT_URL, $this->dataViewURL."/datasets?idPattern=".$dsId);			   
+		$obj = $this->loadFromCDAWebWS("datasets", array("idPattern" => $dsId), TRUE);
+		if (!$obj) {
+			return NULL;
+		}
 			
-		$obj = json_decode(curl_exec($this->ch));
-        $dataSet = $obj->DatasetDescription;
+		$dataSet = $obj->DatasetDescription;
       
 		$dsNode->setAttribute("xml:id",$this->baseID.":".$dsId);
 		$dsNode->setAttribute("name", $dsId);
@@ -240,13 +244,13 @@ class CDAWEB extends RemoteDataCenterClass
 		}
         else 
 			$this->updateDDServerXml("MinSampling",$sampling);	
-		
+
 		$dsNode->setAttribute('spaseUrl',$this->CDAWEB[$dsId]);
 	//   $dsNode->setAttribute('masterCdf',$this->existsMasterCdf($dsId));
 		$dsNode->setAttribute("desc", "$label; $startTime - $endTime");
 		$dsNode->setAttribute("dataStart", $startTime); 
 		$dsNode->setAttribute("dataStop", $endTime); 
-	   
+
 		$parameterNodes = $this->createParameterNodes($dsId);
 		foreach ($parameterNodes as $parameterNode)
 		{
@@ -258,10 +262,9 @@ class CDAWEB extends RemoteDataCenterClass
 	
 	protected function createParameterNodes($dsId)
 	{	
-		curl_setopt($this->ch, CURLOPT_HTTPHEADER, array("Accept: application/json"));
-		curl_setopt($this->ch, CURLOPT_URL, $this->dataViewURL."/datasets/".$dsId."/variables");			   
-			 
-	 	$obj = json_decode(curl_exec($this->ch));
+	 	$obj = $this->loadFromCDAWebWS("datasets/".$dsId."/variables", array(), TRUE);
+		if (!$obj)
+			return array();
 	 	$parameters = $obj->VariableDescription;
 
 		$paramNodes = array();
@@ -280,15 +283,11 @@ class CDAWEB extends RemoteDataCenterClass
 	
 	protected function getDatasetSpaseDescription($dsID)
 	{		 
-		curl_setopt($this->ch, CURLOPT_HTTPHEADER, array("Accept: application/xml"));
- 
 		if ( !array_key_exists($dsID, $this->CDAWEB ))
 			return -1; // no description in SpaseRegistry
-			
-		curl_setopt($this->ch, CURLOPT_URL, CDAWebConfigClass::$spaseResolver."id=".$this->CDAWEB[$dsID]);
-
-		if (!$this->spase_res->loadXML(curl_exec($this->ch))) return -100;
- 
+		
+		if (!$this->loadSpaseResource($this->CDAWEB[$dsID], $this->spase_res)) return -100;
+	
 		$messages = $this->spase_res->getElementsByTagName('Message');
 		
 		if ($messages->length > 0)
@@ -327,27 +326,35 @@ class CDAWEB extends RemoteDataCenterClass
 	*/
 	protected function getAllSpaseDatasets()
 	{
+		$this->initStreamContext();
+
 		require_once "simple_html_dom.php";
 
-		if (file_exists($this->location."/NumericalData.html")) 
-				rename($this->location."/NumericalData.html",$this->location."/NumericalData.html.bak");
+		if (file_exists($this->location."/NumericalData.html"))
+			rename($this->location."/NumericalData.html",$this->location."/NumericalData.html.bak");
+
 		
 		// if cannot reach CDAWEB Spase Registry use an old file
-		if (!copy(CDAWebConfigClass::$spaseRegistry, $this->location."/NumericalData.html"))
+		if (!copy(CDAWebConfigClass::$spaseRegistry, $this->location."/NumericalData.html")) {
+				error_log('ERROR');
 				copy($this->location."/NumericalData.html.bak", $this->location."/NumericalData.html");
+		}
+
+		$doc = new DOMDocument('1.0', 'utf-8');
+		if (!$doc->loadHTMLFile($this->location.'/NumericalData.html')) {
+			error_log("Cannot load ".$this->location.'/NumericalData.html');
+		}
+
+		$xpath = new DOMXPath($doc);
+		$ids = $xpath->query("//td[@class='Spase.URL.ProductID']");
 
-		$html = file_get_html($this->location.'/NumericalData.html');
-		$ids = $html->find('td[class="Spase.URL.ProductID"]');
-		
 		foreach ($ids as $id)
-		{	
-            if ($id->next_sibling()) {
-                $ref = $id->next_sibling()->find('a');
-                $key = $id->find('a');
-                if (strlen($ref[0]->innertext) > 25)
-                    $this->CDAWEB[$key[0]->innertext] = $ref[0]->innertext;     // if SPASE description exists  
-                
-            }   
+		{
+			$key = $id->nodeValue;
+			$spase = $id->nextSibling->nodeValue;
+			if (!empty($key) && (strlen($spase) > 25)) {
+				$this->CDAWEB[$key] = $spase;
+			}
 		}
 
 		error_log("SPASE-defined CDAWEB datasets from ".CDAWebConfigClass::$spaseRegistry." : ".count($this->CDAWEB).PHP_EOL,3,log);
@@ -415,23 +422,21 @@ class CDAWEB extends RemoteDataCenterClass
 	{
 		$this->initStreamContext();
 		$this->openConnection();
-		$this->setDataViewURL(); 
 
-	   curl_setopt($this->ch, CURLOPT_URL,$this->dataViewURL."/datasets/$ds/orig_data/$start,$stop/");
-	
-	   $res = new DomDocument("1.0");
-	    
-	   $res->loadXML(curl_exec($this->ch));
-	   
-	   if ($res->getElementsByTagName("html")->length > 0)
-	   {
+		$res = $this->loadFromCDAWebWS("datasets/$ds/orig_data/$start,$stop/", array());
+		if (!$res) {
+			return array();
+		}
+
+		if ($res->getElementsByTagName("html")->length > 0)
+		{
 			error_log("ERROR no response for : ".$ds." : ".$start." - ".$stop.PHP_EOL,3,err);
 			error_log($res->saveXML(),3,err);
 			$this->closeConnection();
 			return array();
-	   }
+		}
 	   
-	   $fileNames = $res->getElementsByTagName("Name");
+		$fileNames = $res->getElementsByTagName("Name");
 	   	   
 		$nc_prefix = strlen($ds) > RemoteDataCenterClass::$MAX_VI_NAME_LENGTH ? 
 				substr(strtolower($ds),0,RemoteDataCenterClass::$MAX_VI_NAME_LENGTH - 1): strtolower($ds);
@@ -543,7 +548,7 @@ class CDAWEB extends RemoteDataCenterClass
                         if (!$this->domAmda->getElementById($dsId)){
                             $this->initDDServerXml($ds,$ins,$obs);
                             $dsNode = $this->createDatasetNode($ds);
-                              
+
                             if ($dsNode){
                                 $this->saveDDServerXml();
                                 $dsNodes[] = $dsNode;
@@ -555,16 +560,16 @@ class CDAWEB extends RemoteDataCenterClass
                     {
                             // last $spase_res : instrument should be the same
                             $insSpaseId = $this->getInstrumentSpase();   
-                                
+
                             if (!$insSpaseId || is_array($insSpaseId)) 
                                 $insSpaseId = $ins;
-                            
+
                             $obsSpaseId = strtolower($this->getObservatorySpase());
                             if (!$obsSpaseId ) 
                                 $obsSpaseId = strtolower($obs);
-                            
+
                             $insId = $this->baseID.":".$this->param2dd($groupId).":".$this->param2dd($obsSpaseId).":".$this->param2dd($insSpaseId);
-                                
+
                             if (!($insNode = $this->domAmda->getElementById($insId))){
                                 $insNode = $this->createInstrumentNode($insSpaseId, $obsSpaseId, $groupId);
                             }
@@ -576,7 +581,7 @@ class CDAWEB extends RemoteDataCenterClass
                             $insNodes[] = $insNode;
                         }
                     } // foreach ($inss as $ins => $dss)
-                    
+
                     if (!empty($insNodes))
                     {					
                         $obsId = $this->baseID.":".$this->param2dd($groupId).":".$this->param2dd($obsSpaseId);
@@ -591,6 +596,7 @@ class CDAWEB extends RemoteDataCenterClass
                         $obsNodes[] = $obsNode;
                     }
                 } // foreach ($obss as $obs => $inss)
+
                 
                 if (!empty($obsNodes))
                 {
@@ -617,7 +623,6 @@ class CDAWEB extends RemoteDataCenterClass
                 }                                
             } 
 
-		
 		$this->closeConnection();
 	}
 	
@@ -634,18 +639,17 @@ class CDAWEB extends RemoteDataCenterClass
 	protected function getInstrumentSpase()
 	{	
 		$this->insXML = new DomDocument("1.0");
-		
+
 		if ($this->spase_res){
 			$instrument = $this->spase_res->getElementsByTagName('InstrumentID');
-			
+
 			if ($instrument->length > 0) {
                 if ($instrument->length == 1) {
-                    curl_setopt($this->ch, CURLOPT_URL, CDAWebConfigClass::$spaseResolver."id=".$instrument->item(0)->nodeValue);
-                    $this->insXML->loadXML(curl_exec($this->ch));
-                    
+		    $this->loadSpaseResource($instrument->item(0)->nodeValue, $this->insXML);
+
                     return $this->getIdFromSpase($instrument->item(0)->nodeValue);
-				}
-				else {
+		}
+		else {
                     // $insIds = "";
                     $insIds = array();
                     for  ($i = 0; $i < $instrument->length; $i++ ) {                                            
@@ -655,8 +659,7 @@ class CDAWEB extends RemoteDataCenterClass
                         $insIds[] = $this->getIdFromSpase($instrument->item($i)->nodeValue);
                     }
                     
-                    curl_setopt($this->ch, CURLOPT_URL, CDAWebConfigClass::$spaseResolver."id=".$instrument->item($i)->nodeValue); 
-                    $this->insXML->loadXML(curl_exec($this->ch));
+		    $this->loadSpaseResource($instrument->item($instrument->length - 1)->nodeValue, $this->insXML);
                     return $insIds;
 				}
 				//InstrumentType
@@ -688,6 +691,74 @@ class CDAWEB extends RemoteDataCenterClass
 	
 	protected function makeArgumentsList(){}
 
+
+	/** Fetch the SPASE XML of $resourceID (spase://X is read from https://hpde.io/X.xml) into $dom; TRUE on success, FALSE otherwise. */
+	private function loadSpaseResource($resourceID, &$dom) {
+		if (empty($resourceID)) return FALSE;
+		$url = str_replace('spase://', 'https://hpde.io/', $resourceID) . '.xml';
+		curl_setopt($this->ch, CURLOPT_HTTPHEADER, array("Accept: application/xml"));
+		curl_setopt($this->ch, CURLOPT_URL, $url);
+		$xml = curl_exec($this->ch); // FALSE when the transfer itself failed
+		if (!$xml || !$dom->loadXML($xml)) {
+			error_log('Cannot load Spase Resource: '.$resourceID);
+			return FALSE;
+		}
+		return TRUE;
+	}
+
+	/**
+	 * Query the CDAWeb REST web service and return the decoded response.
+	 *
+	 * The data view endpoint is read from <restUrl>/dataviews on the first call and
+	 * kept in $this->dataViewURL for the following ones.
+	 *
+	 * @param string $api    path appended to the data view URL, e.g. "datasets"
+	 * @param array  $params query string parameters (name => value); may be empty
+	 * @param bool   $asJSON TRUE: ask for JSON and return the json_decode() result;
+	 *                       FALSE: ask for XML and return a DOMDocument
+	 * @return mixed decoded response, or FALSE when a request or the XML parsing fails
+	 */
+	private function loadFromCDAWebWS($api, $params, $asJSON = FALSE) {
+		if (!$this->dataViewURL) {
+			curl_setopt($this->ch, CURLOPT_URL, CDAWebConfigClass::$restUrl."/dataviews");
+			curl_setopt($this->ch, CURLOPT_HTTPHEADER, array("Accept: application/xml"));
+
+			$views = new DomDocument();
+			$body = curl_exec($this->ch);
+			// curl_exec() yields FALSE on transport failure: do not feed that to loadXML()
+			if (!$body || !$views->loadXML($body))
+				return FALSE;
+
+			$endpoints = $views->getElementsByTagName('EndpointAddress');
+			if ($endpoints->length == 0) {
+				error_log("Problem connect to ".CDAWebConfigClass::$restUrl."/dataviews", 3, err);
+				return FALSE;
+			}
+			$this->dataViewURL = $endpoints->item(0)->nodeValue;
+		}
+
+		$url = $this->dataViewURL . '/' . $api;
+		if (!empty($params))
+			$url .= '?'.http_build_query($params, '', '&');
+
+		$accept = $asJSON ? "Accept: application/json" : "Accept: application/xml";
+		curl_setopt($this->ch, CURLOPT_HTTPHEADER, array($accept));
+		curl_setopt($this->ch, CURLOPT_URL, $url);
+
+		$res = curl_exec($this->ch);
+		if (!$res)
+			return FALSE;
+
+		if ($asJSON)
+			return json_decode($res);
+
+		// Parse the body fetched above; calling curl_exec() again would repeat the HTTP request
+		$dom = new DOMDocument();
+		if (!$dom->loadXML($res))
+			return FALSE;
+
+		return $dom;
+	}
 } 
 ?>
 
diff --git a/src/REMOTEDATA/CDAWebConfigClass.php b/src/REMOTEDATA/CDAWebConfigClass.php
index 60bb7ff..2f0bb32 100644
--- a/src/REMOTEDATA/CDAWebConfigClass.php
+++ b/src/REMOTEDATA/CDAWebConfigClass.php
@@ -13,11 +13,7 @@ class CDAWebConfigClass
 	// Master CDFs URL
 	public static $masterUrl = "https://cdaweb.gsfc.nasa.gov/pub/software/cdawlib/0MASTERS/";
 	// SPASE  CDAWeb dataset descriptionlist
-	public static $spaseRegistry = "https://heliophysicsdata.sci.gsfc.nasa.gov/queries/CDAWeb_SPASE.xql"; 	
-	// https://cdaweb.gsfc.nasa.gov/registry/hdp/NumericalData.xql : limited list
-	// http://spase-group.org/registry/explorer/ SMWG
-	// Service to get SPASE description by SPASE ID (from $spaseRegistry)
-	public static $spaseResolver = "http://www.spase-group.org/registry/resolver?";		
+	public static $spaseRegistry = "https://heliophysicsdata.gsfc.nasa.gov/queries/CDAWeb_SPASE.html"; 	
 	public static $format = "CDF";
 }
 ?>
--
libgit2 0.21.2