bes  Updated for version 3.20.10
NgapApi.cc
1 // -*- mode: c++; c-basic-offset:4 -*-
2 
3 // This file is part of ngap_module, A C++ module that can be loaded in to
4 // the OPeNDAP Back-End Server (BES) and is able to handle remote requests.
5 
6 // Copyright (c) 2020 OPeNDAP, Inc.
7 // Author: Nathan Potter <ndp@opendap.org>
8 //
9 // This library is free software; you can redistribute it and/or
10 // modify it under the terms of the GNU Lesser General Public
11 // License as published by the Free Software Foundation; either
12 // version 2.1 of the License, or (at your option) any later version.
13 //
14 // This library is distributed in the hope that it will be useful,
15 // but WITHOUT ANY WARRANTY; without even the implied warranty of
16 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 // Lesser General Public License for more details.
18 //
19 // You should have received a copy of the GNU Lesser General Public
20 // License along with this library; if not, write to the Free Software
21 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 //
23 // You can contact OPeNDAP, Inc. at PO Box 112, Saunderstown, RI. 02874-0112.
24 
25 #include "config.h"
26 
27 #include <cstdio>
28 #include <cstring>
29 #include <iostream>
30 #include <sstream>
31 #include <memory>
32 #include <time.h>
33 #include <curl/curl.h>
34 
35 #include <libdap/util.h>
36 #include <libdap/debug.h>
37 
38 #include "rapidjson/document.h"
39 #include "rapidjson/writer.h"
40 #include "rapidjson/prettywriter.h"
41 #include "rapidjson/stringbuffer.h"
42 #include "rapidjson/filereadstream.h"
43 
44 #include "BESError.h"
45 #include "BESNotFoundError.h"
46 #include "BESSyntaxUserError.h"
47 #include "BESDebug.h"
48 #include "BESUtil.h"
49 #include "BESStopWatch.h"
50 #include "BESLog.h"
51 #include "TheBESKeys.h"
52 #include "CurlUtils.h"
53 #include "url_impl.h"
54 #include "RemoteResource.h"
55 
56 #include "NgapApi.h"
57 #include "NgapNames.h"
58 #include "NgapError.h"
59 
60 using namespace std;
61 
62 #define prolog string("NgapApi::").append(__func__).append("() - ")
63 
64 namespace ngap {
65 
// Minimum remaining lifetime, in seconds, of a signed URL. NgapApi::signed_url_is_expired()
// reports a URL as expired once fewer than this many seconds remain before its expiry time.
66 const unsigned int REFRESH_THRESHOLD = 3600; // An hour
67 
68 
69 NgapApi::NgapApi() : d_cmr_hostname(DEFAULT_CMR_ENDPOINT_URL), d_cmr_search_endpoint_path(DEFAULT_CMR_SEARCH_ENDPOINT_PATH) {
70  bool found;
71  string cmr_hostname;
72  TheBESKeys::TheKeys()->get_value(NGAP_CMR_HOSTNAME_KEY, cmr_hostname, found);
73  if (found) {
74  d_cmr_hostname = cmr_hostname;
75  }
76 
77  string cmr_search_endpoint_path;
78  TheBESKeys::TheKeys()->get_value(NGAP_CMR_SEARCH_ENDPOINT_PATH_KEY, cmr_search_endpoint_path, found);
79  if (found) {
80  d_cmr_search_endpoint_path = cmr_search_endpoint_path;
81  }
82 
83 
84 }
85 
86 std::string NgapApi::get_cmr_search_endpoint_url(){
87  return BESUtil::assemblePath(d_cmr_hostname , d_cmr_search_endpoint_path);
88 }
89 
90 
91 
99 std::string NgapApi::build_cmr_query_url_old_rpath_format(const std::string &restified_path) {
100 
101  // Make sure it starts with a '/' (see key strings above)
102  string r_path = ( restified_path[0] != '/' ? "/" : "") + restified_path;
103 
104  size_t provider_index = r_path.find(NGAP_PROVIDERS_KEY);
105  if(provider_index == string::npos){
106  stringstream msg;
107  msg << prolog << "The specified path '" << r_path << "'";
108  msg << " does not contain the required path element '" << NGAP_PROVIDERS_KEY << "'";
109  throw BESSyntaxUserError(msg.str(), __FILE__, __LINE__);
110  }
111  if(provider_index != 0){
112  stringstream msg;
113  msg << prolog << "The specified path '" << r_path << "'";
114  msg << " has the path element '" << NGAP_PROVIDERS_KEY << "' located in the incorrect position (";
115  msg << provider_index << ") expected 0.";
116  throw BESSyntaxUserError(msg.str(), __FILE__, __LINE__);
117  }
118  provider_index += string(NGAP_PROVIDERS_KEY).length();
119 
120  bool use_collection_concept_id = false;
121  size_t collection_index = r_path.find(NGAP_COLLECTIONS_KEY);
122  if(collection_index == string::npos) {
123  size_t concepts_index = r_path.find(NGAP_CONCEPTS_KEY);
124  if (concepts_index == string::npos) {
125  stringstream msg;
126  msg << prolog << "The specified path '" << r_path << "'";
127  msg << " contains neither the '" << NGAP_COLLECTIONS_KEY << "'";
128  msg << " nor the '" << NGAP_CONCEPTS_KEY << "'";
129  msg << " key, one must be provided.";
130  throw BESSyntaxUserError(msg.str(), __FILE__, __LINE__);
131  }
132  collection_index = concepts_index;
133  use_collection_concept_id = true;
134  }
135  if(collection_index <= provider_index+1){ // The value of provider has to be at least 1 character
136  stringstream msg;
137  msg << prolog << "The specified path '" << r_path << "'";
138  msg << " has the path element '" << (use_collection_concept_id?NGAP_CONCEPTS_KEY:NGAP_COLLECTIONS_KEY) << "' located in the incorrect position (";
139  msg << collection_index << ") expected at least " << provider_index+1;
140  throw BESSyntaxUserError(msg.str(), __FILE__, __LINE__);
141  }
142  string provider = r_path.substr(provider_index,collection_index - provider_index);
143  collection_index += use_collection_concept_id?string(NGAP_CONCEPTS_KEY).length():string(NGAP_COLLECTIONS_KEY).length();
144 
145 
146  size_t granule_index = r_path.find(NGAP_GRANULES_KEY);
147  if(granule_index == string::npos){
148  stringstream msg;
149  msg << prolog << "The specified path '" << r_path << "'";
150  msg << " does not contain the required path element '" << NGAP_GRANULES_KEY << "'";
151  throw BESSyntaxUserError(msg.str(), __FILE__, __LINE__);
152  }
153  if(granule_index <= collection_index+1){ // The value of collection must have at least one character.
154  stringstream msg;
155  msg << prolog << "The specified path '" << r_path << "'";
156  msg << " has the path element '" << NGAP_GRANULES_KEY << "' located in the incorrect position (";
157  msg << granule_index << ") expected at least " << collection_index+1;
158  throw BESSyntaxUserError(msg.str(), __FILE__, __LINE__);
159  }
160  string collection = r_path.substr(collection_index,granule_index - collection_index);
161  granule_index += string(NGAP_GRANULES_KEY).length();
162 
163  // The granule value is the path terminus so it's every thing after the key
164  string granule = r_path.substr(granule_index);
165 
166  // Build the CMR query URL for the dataset
167  string cmr_url = get_cmr_search_endpoint_url() + "?";
168  {
169  // This easy handle is only created so we can use the curl_easy_escape() on the token values
170  CURL *ceh = curl_easy_init();
171  char *esc_url_content;
172 
173  // Add provider
174  esc_url_content = curl_easy_escape(ceh, provider.c_str(), provider.size());
175  cmr_url += string(CMR_PROVIDER).append("=").append(esc_url_content).append("&");
176  curl_free(esc_url_content);
177 
178  esc_url_content = curl_easy_escape(ceh, collection.c_str(), collection.size());
179  if(use_collection_concept_id){
180  // Add collection_concept_id
181  cmr_url += string(CMR_COLLECTION_CONCEPT_ID).append("=").append(esc_url_content).append("&");
182  }
183  else {
184  // Add entry_title
185  cmr_url += string(CMR_ENTRY_TITLE).append("=").append(esc_url_content).append("&");
186 
187  }
188  curl_free(esc_url_content);
189 
190  esc_url_content = curl_easy_escape(ceh, granule.c_str(), granule.size());
191  cmr_url += string(CMR_GRANULE_UR).append("=").append(esc_url_content);
192  curl_free(esc_url_content);
193 
194  curl_easy_cleanup(ceh);
195  }
196  return cmr_url;
197 }
198 
215 std::string NgapApi::build_cmr_query_url(const std::string &restified_path) {
216 
217  // Make sure it starts with a '/' (see key strings above)
218  string r_path = ( restified_path[0] != '/' ? "/" : "") + restified_path;
219 
220  size_t provider_index = r_path.find(NGAP_PROVIDERS_KEY);
221  if(provider_index != string::npos){
222  return build_cmr_query_url_old_rpath_format(restified_path);
223  }
224 
225  size_t collections_key_index = r_path.find(NGAP_COLLECTIONS_KEY);
226  if(collections_key_index == string::npos) {
227  stringstream msg;
228  msg << prolog << "The specified path '" << r_path << "'";
229  msg << " contains neither the '" << NGAP_COLLECTIONS_KEY << "'";
230  msg << " nor the '" << NGAP_CONCEPTS_KEY << "'";
231  msg << " one must be provided.";
232  throw BESSyntaxUserError(msg.str(), __FILE__, __LINE__);
233  }
234  if(collections_key_index != 0){ // The COLLECTIONS_KEY comes first
235  stringstream msg;
236  msg << prolog << "The specified path '" << r_path << "'";
237  msg << " has the path element '" << NGAP_COLLECTIONS_KEY << "' located in the incorrect position (";
238  msg << collections_key_index << ") expected at least " << provider_index + 1;
239  throw BESSyntaxUserError(msg.str(), __FILE__, __LINE__);
240  }
241  // This is now the beginning of the collection_concept_id value.
242  size_t collections_index = collections_key_index + string(NGAP_COLLECTIONS_KEY).length();
243 
244  size_t granules_key_index = r_path.find(NGAP_GRANULES_KEY);
245  if(granules_key_index == string::npos){
246  stringstream msg;
247  msg << prolog << "The specified path '" << r_path << "'";
248  msg << " does not contain the required path element '" << NGAP_GRANULES_KEY << "'";
249  throw BESSyntaxUserError(msg.str(), __FILE__, __LINE__);
250  }
251 
252  // The collection key must precede the granules key in the path,
253  // and the collection name must have at least one character.
254  if(granules_key_index <= collections_index + 1){
255  stringstream msg;
256  msg << prolog << "The specified path '" << r_path << "'";
257  msg << " has the path element '" << NGAP_GRANULES_KEY << "' located in the incorrect position (";
258  msg << granules_key_index << ") expected at least " << collections_index + 1;
259  throw BESSyntaxUserError(msg.str(), __FILE__, __LINE__);
260  }
261  size_t granules_index = granules_key_index + string(NGAP_GRANULES_KEY).length();
262  // The granule_name value is the path terminus so it's every thing after the key
263  string granule_name = r_path.substr(granules_index);
264 
265  // Now we need to work on the collections value to eliminate the optional parts.
266  // This is the entire collections string including any optional components.
267  string collection_name = r_path.substr(collections_index, granules_key_index - collections_index);
268 
269  // Since there may be optional parameters we need to strip them off to get the collection_concept_id
270  // And, since we know that collection_concept_id will never contain a '/', and we know that the optional
271  // part is separated from the collection_concept_id by a '/' we look for that and of we find it we truncate
272  // the value at that spot.
273  string optional_part;
274  size_t slash_pos = collection_name.find('/');
275  if(slash_pos != string::npos){
276  optional_part = collection_name.substr(slash_pos);
277  BESDEBUG(MODULE, prolog << "Found optional collections name component: " << optional_part << endl);
278  collection_name = collection_name.substr(0,slash_pos);
279  }
280  BESDEBUG(MODULE, prolog << "Found collection_name (aka collection_concept_id): " << collection_name << endl);
281 
282  // Build the CMR query URL for the dataset
283  string cmr_url = get_cmr_search_endpoint_url() + "?";
284  {
285  // This easy handle is only created so we can use the curl_easy_escape() on the token values
286  CURL *ceh = curl_easy_init();
287  char *esc_url_content;
288 
289  esc_url_content = curl_easy_escape(ceh, collection_name.c_str(), collection_name.size());
290  cmr_url += string(CMR_COLLECTION_CONCEPT_ID).append("=").append(esc_url_content).append("&");
291  curl_free(esc_url_content);
292 
293  esc_url_content = curl_easy_escape(ceh, granule_name.c_str(), granule_name.size());
294  cmr_url += string(CMR_GRANULE_UR).append("=").append(esc_url_content);
295  curl_free(esc_url_content);
296 
297  curl_easy_cleanup(ceh);
298  }
299  return cmr_url;
300 }
301 
312 std::string NgapApi::find_get_data_url_in_granules_umm_json_v1_4(const std::string &restified_path, rapidjson::Document &cmr_granule_response)
313 {
314 
315  string data_access_url;
316 
317  rapidjson::Value &val = cmr_granule_response["hits"];
318  int hits = val.GetInt();
319  if (hits < 1) {
320  throw BESNotFoundError(string("The specified path '").append(restified_path).append(
321  "' does not identify a granule in CMR."), __FILE__, __LINE__);
322  }
323 
324  rapidjson::Value &items = cmr_granule_response["items"];
325  if (items.IsArray()) {
326  stringstream ss;
327  if(BESDebug::IsSet(MODULE)){
328  const string RJ_TYPE_NAMES[] = {string("kNullType"),string("kFalseType"),string("kTrueType"),
329  string("kObjectType"),string("kArrayType"),string("kStringType"),string("kNumberType")};
330  for (rapidjson::SizeType i = 0; i < items.Size(); i++) // Uses SizeType instead of size_t
331  ss << "items[" << i << "]: " << RJ_TYPE_NAMES[items[i].GetType()] << endl;
332  BESDEBUG(MODULE, prolog << "items size: " << items.Size() << endl << ss.str() << endl);
333  }
334 
335  rapidjson::Value &items_obj = items[0];
336  // rapidjson::GenericMemberIterator<false, rapidjson::UTF8<char>, rapidjson::MemoryPoolAllocator<rapidjson::CrtAllocator>> mitr = items_obj.FindMember("umm");
337  auto mitr = items_obj.FindMember("umm");
338 
339  rapidjson::Value &umm = mitr->value;
340  mitr = umm.FindMember("RelatedUrls");
341  if (mitr == umm.MemberEnd()) {
342  throw BESInternalError("Error! The umm/RelatedUrls object was not located!", __FILE__, __LINE__);
343  }
344  rapidjson::Value &related_urls = mitr->value;
345 
346  if (!related_urls.IsArray()) {
347  throw BESNotFoundError("Error! The RelatedUrls object in the CMR response is not an array!", __FILE__,
348  __LINE__);
349  }
350 
351  BESDEBUG(MODULE, prolog << " Found RelatedUrls array in CMR response." << endl);
352 
353  bool noSubtype;
354  for (rapidjson::SizeType i = 0; i < related_urls.Size() && data_access_url.empty(); i++) {
355  rapidjson::Value &obj = related_urls[i];
356  mitr = obj.FindMember("URL");
357  if (mitr == obj.MemberEnd()) {
358  stringstream err;
359  err << "Error! The umm/RelatedUrls[" << i << "] does not contain the URL object";
360  throw BESInternalError(err.str(), __FILE__, __LINE__);
361  }
362  rapidjson::Value &r_url = mitr->value;
363 
364  mitr = obj.FindMember("Type");
365  if (mitr == obj.MemberEnd()) {
366  stringstream err;
367  err << "Error! The umm/RelatedUrls[" << i << "] does not contain the Type object";
368  throw BESInternalError(err.str(), __FILE__, __LINE__);
369  }
370  rapidjson::Value &r_type = mitr->value;
371 
372  noSubtype = obj.FindMember("Subtype") == obj.MemberEnd();
373 
374  BESDEBUG(MODULE, prolog << "RelatedUrl Object:" <<
375  " URL: '" << r_url.GetString() << "'" <<
376  " Type: '" << r_type.GetString() << "'" <<
377  " SubType: '" << (noSubtype ? "Absent" : "Present") << "'" << endl);
378 
379  if ((r_type.GetString() == string(CMR_URL_TYPE_GET_DATA)) && noSubtype) {
380 
381  // Because a member of RelatedUrls may contain a URL of Type GET DATA with the s3:// protocol
382  // as well as a Type GET DATA URL which uses https:// or http://
383  string candidate_url = r_url.GetString();
384  if(candidate_url.substr(0,8) == "https://" || candidate_url.substr(0,7) == "http://"){
385  data_access_url = candidate_url;
386  }
387  }
388  }
389  }
390 
391  if (data_access_url.empty()) {
392  throw BESInternalError(string("ERROR! Failed to locate a data access URL for the path: ") + restified_path,
393  __FILE__, __LINE__);
394  }
395 
396  return data_access_url;
397 }
398 
399 
400 
423  string NgapApi::convert_ngap_resty_path_to_data_access_url(
424  const std::string &restified_path,
425  const std::string &uid
426  ) {
427  BESDEBUG(MODULE, prolog << "BEGIN" << endl);
428  string data_access_url;
429 
430  string cmr_query_url = build_cmr_query_url(restified_path);
431 
432  BESDEBUG(MODULE, prolog << "CMR Request URL: " << cmr_query_url << endl);
433 
434  BESDEBUG(MODULE, prolog << "Building new RemoteResource." << endl);
435  std::shared_ptr<http::url> cmr_query_url_ptr(new http::url(cmr_query_url));
436  http::RemoteResource cmr_query(cmr_query_url_ptr, uid);
437  {
438  BESStopWatch besTimer;
439  if (BESISDEBUG(MODULE) || BESDebug::IsSet(TIMING_LOG_KEY) || BESLog::TheLog()->is_verbose()){
440  besTimer.start("CMR Query: " + cmr_query_url);
441  }
442  cmr_query.retrieveResource();
443  }
444  rapidjson::Document cmr_response = cmr_query.get_as_json();
445 
446  data_access_url = find_get_data_url_in_granules_umm_json_v1_4(restified_path, cmr_response);
447 
448  BESDEBUG(MODULE, prolog << "END (data_access_url: "<< data_access_url << ")" << endl);
449 
450  return data_access_url;
451  }
452 
453 
454 
455 
456  bool NgapApi::signed_url_is_expired(const http::url &signed_url)
457  {
458  bool is_expired;
459  time_t now;
460  time(&now); /* get current time; same as: timer = time(NULL) */
461  BESDEBUG(MODULE, prolog << "now: " << now << endl);
462 
463  time_t expires = now;
464  string cf_expires = signed_url.query_parameter_value(CLOUDFRONT_EXPIRES_HEADER_KEY);
465  string aws_expires = signed_url.query_parameter_value(AMS_EXPIRES_HEADER_KEY);
466  time_t ingest_time = signed_url.ingest_time();
467 
468  if(!cf_expires.empty()){ // CloudFront expires header?
469  expires = stoll(cf_expires);
470  BESDEBUG(MODULE, prolog << "Using "<< CLOUDFRONT_EXPIRES_HEADER_KEY << ": " << expires << endl);
471  }
472  else if(!aws_expires.empty()){
473  // AWS Expires header?
474  //
475  // By default we'll use the time we made the URL object, ingest_time
476  time_t start_time = ingest_time;
477  // But if there's an AWS Date we'll parse that and compute the time
478  // @TODO move to NgapApi::decompose_url() and add the result to the map
479  string aws_date = signed_url.query_parameter_value(AWS_DATE_HEADER_KEY);
480  if(!aws_date.empty()){
481  string date = aws_date; // 20200624T175046Z
482  string year = date.substr(0,4);
483  string month = date.substr(4,2);
484  string day = date.substr(6,2);
485  string hour = date.substr(9,2);
486  string minute = date.substr(11,2);
487  string second = date.substr(13,2);
488 
489  BESDEBUG(MODULE, prolog << "date: "<< date <<
490  " year: " << year << " month: " << month << " day: " << day <<
491  " hour: " << hour << " minute: " << minute << " second: " << second << endl);
492 
493  struct tm *ti = gmtime(&now);
494  ti->tm_year = stoll(year) - 1900;
495  ti->tm_mon = stoll(month) - 1;
496  ti->tm_mday = stoll(day);
497  ti->tm_hour = stoll(hour);
498  ti->tm_min = stoll(minute);
499  ti->tm_sec = stoll(second);
500 
501  BESDEBUG(MODULE, prolog << "ti->tm_year: "<< ti->tm_year <<
502  " ti->tm_mon: " << ti->tm_mon <<
503  " ti->tm_mday: " << ti->tm_mday <<
504  " ti->tm_hour: " << ti->tm_hour <<
505  " ti->tm_min: " << ti->tm_min <<
506  " ti->tm_sec: " << ti->tm_sec << endl);
507 
508 
509  start_time = mktime(ti);
510  BESDEBUG(MODULE, prolog << "AWS (computed) start_time: "<< start_time << endl);
511  }
512  expires = start_time + stoll(aws_expires);
513  BESDEBUG(MODULE, prolog << "Using "<< AMS_EXPIRES_HEADER_KEY << ": " << aws_expires <<
514  " (expires: " << expires << ")" << endl);
515  }
516  time_t remaining = expires - now;
517  BESDEBUG(MODULE, prolog << "expires_time: " << expires <<
518  " remaining_time: " << remaining <<
519  " refresh_threshold: " << REFRESH_THRESHOLD << endl);
520 
521  is_expired = remaining < REFRESH_THRESHOLD;
522  BESDEBUG(MODULE, prolog << "is_expired: " << (is_expired?"true":"false") << endl);
523 
524  return is_expired;
525  }
526 
527 } // namespace ngap
528 
static bool IsSet(const std::string &flagName)
see if the debug context flagName is set to true
Definition: BESDebug.h:168
exception thrown if internal error encountered
error thrown if the resource requested cannot be found
virtual bool start(std::string name)
Definition: BESStopWatch.cc:67
error thrown if there is a user syntax error in the request or any other user error
static std::string assemblePath(const std::string &firstPart, const std::string &secondPart, bool leadingSlash=false, bool trailingSlash=false)
Assemble path fragments making sure that they are separated by a single '/' character.
Definition: BESUtil.cc:840
void get_value(const std::string &s, std::string &val, bool &found)
Retrieve the value of a given key, if set.
Definition: TheBESKeys.cc:340
static TheBESKeys * TheKeys()
Definition: TheBESKeys.cc:71
rapidjson::Document get_as_json()
get_as_json() This function returns the cached resource parsed into a JSON document.
virtual std::string query_parameter_value(const std::string &key) const
Definition: url_impl.cc:252
GenericValue< UTF8<> > Value
GenericValue with UTF8 encoding.
Definition: document.h:2189
GenericDocument< UTF8<> > Document
GenericDocument with UTF8 encoding.
Definition: document.h:2585
RAPIDJSON_NAMESPACE_BEGIN typedef unsigned SizeType
Size type (for string lengths, array sizes, etc.)
Definition: rapidjson.h:384