bes  Updated for version 3.20.10
HttpdDirScraper.cc
1 // -*- mode: c++; c-basic-offset:4 -*-
2 //
3 // This file is part of httpd_catalog_module, A C++ module that can be loaded in to
4 // the OPeNDAP Back-End Server (BES) and is able to handle remote requests.
5 //
6 // Copyright (c) 2018 OPeNDAP, Inc.
7 // Author: Nathan Potter <ndp@opendap.org>
8 //
9 // This library is free software; you can redistribute it and/or
10 // modify it under the terms of the GNU Lesser General Public
11 // License as published by the Free Software Foundation; either
12 // version 2.1 of the License, or (at your option) any later version.
13 //
14 // This library is distributed in the hope that it will be useful,
15 // but WITHOUT ANY WARRANTY; without even the implied warranty of
16 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 // Lesser General Public License for more details.
18 //
19 // You should have received a copy of the GNU Lesser General Public
20 // License along with this library; if not, write to the Free Software
21 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 //
23 // You can contact OPeNDAP, Inc. at PO Box 112, Saunderstown, RI. 02874-0112.
24 
#include <fstream>
#include <iostream>
#include <memory>
#include <sstream>

#include <ctype.h> /* isalpha and isdigit */
#include <stdlib.h> /* atol */
#include <time.h> /* mktime */
31 
32 #include <BESDebug.h>
33 #include <BESUtil.h>
34 #include <BESRegex.h>
35 #include <BESCatalogList.h>
36 #include <BESCatalogUtils.h>
37 #include <CatalogItem.h>
38 
39 #include "RemoteResource.h"
40 #include "HttpdCatalogNames.h"
41 
42 #include "HttpdDirScraper.h"
43 
44 using namespace std;
45 using bes::CatalogItem;
46 
// Message prefix of the form "HttpdDirScraper::<function>() - ". It expands to a
// std::string expression built from __func__, so it must be used inside a function body.
#define prolog std::string("HttpdDirScraper::").append(__func__).append("() - ")
48 
49 namespace httpd_catalog {
50 
51 HttpdDirScraper::HttpdDirScraper()
52 {
53  // There was probably a better way to make this association but this worked.
54  d_months.insert(pair<string, int>(string("jan"), 0));
55  d_months.insert(pair<string, int>(string("feb"), 1));
56  d_months.insert(pair<string, int>(string("mar"), 2));
57  d_months.insert(pair<string, int>(string("apr"), 3));
58  d_months.insert(pair<string, int>(string("may"), 4));
59  d_months.insert(pair<string, int>(string("jun"), 5));
60  d_months.insert(pair<string, int>(string("jul"), 6));
61  d_months.insert(pair<string, int>(string("aug"), 7));
62  d_months.insert(pair<string, int>(string("sep"), 8));
63  d_months.insert(pair<string, int>(string("oct"), 9));
64  d_months.insert(pair<string, int>(string("nov"), 10));
65  d_months.insert(pair<string, int>(string("dec"), 11));
66 }
67 
68 /*
69  * @brief Converts an Apache httpd directory page "size" string (23K, 45M, 32G, etc)
70  * to an actual value, approximate though it may be.
71  */
72 long HttpdDirScraper::get_size_val(const string size_str) const
73 {
74  char scale_c = *size_str.rbegin();
75  long scale = 1;
76 
77  switch (scale_c) {
78  case 'K':
79  scale = 1e3;
80  break;
81  case 'M':
82  scale = 1e6;
83  break;
84  case 'G':
85  scale = 1e9;
86  break;
87  case 'T':
88  scale = 1e12;
89  break;
90  case 'P':
91  scale = 1e15;
92  break;
93  default:
94  scale = 1;
95  break;
96  }
97  BESDEBUG(MODULE, prolog << "scale: " << scale << endl);
98 
99  string result = size_str;
100  if (isalpha(scale_c)) result = size_str.substr(0, size_str.length() - 1);
101 
102  long size = atol(result.c_str());
103  BESDEBUG(MODULE, prolog << "raw size: " << size << endl);
104 
105  size *= scale;
106  BESDEBUG(MODULE, prolog << "scaled size: " << size << endl);
107  return size;
108 }
109 
/**
 * @brief Renders a tm struct as text for debugging, one "field: value" line per member.
 *
 * @param tms The broken-down time to print.
 * @return Nine newline-terminated lines, in the order tm_sec, tm_min, tm_hour, tm_mday,
 * tm_mon, tm_year, tm_wday, tm_yday, tm_isdst.
 */
std::string show_tm_struct(const tm tms)
{
    std::ostringstream report;
    report << "tm_sec: " << tms.tm_sec << '\n'
           << "tm_min: " << tms.tm_min << '\n'
           << "tm_hour: " << tms.tm_hour << '\n'
           << "tm_mday: " << tms.tm_mday << '\n'
           << "tm_mon: " << tms.tm_mon << '\n'
           << "tm_year: " << tms.tm_year << '\n'
           << "tm_wday: " << tms.tm_wday << '\n'
           << "tm_yday: " << tms.tm_yday << '\n'
           << "tm_isdst: " << tms.tm_isdst << '\n';
    return report.str();
}
127 
/**
 * @brief Resets the nine standard fields of a tm struct to a fixed baseline.
 *
 * All fields become 0 except tm_mday, which becomes 1. Any additional,
 * platform specific members of the struct are left untouched.
 *
 * @param tms The struct to reset, modified in place.
 */
void zero_tm_struct(tm &tms)
{
    // Clock fields.
    tms.tm_sec = tms.tm_min = tms.tm_hour = 0;
    // Calendar fields that count from zero.
    tms.tm_mon = tms.tm_year = 0;
    tms.tm_wday = tms.tm_yday = 0;
    // No daylight saving time in effect.
    tms.tm_isdst = 0;
    // The day of the month is the one field that starts at 1.
    tms.tm_mday = 1;
}
143 
144 
145 string HttpdDirScraper::httpd_time_to_iso_8601(const string httpd_time) const
146 {
147  vector<string> tokens;
148  string delimiters = "- :";
149  BESUtil::tokenize(httpd_time, tokens, delimiters);
150 
151  BESDEBUG(MODULE, prolog << "Found " << tokens.size() << " tokens." << endl);
152  vector<string>::iterator it = tokens.begin();
153  int i = 0;
154  if (BESDebug::IsSet(MODULE)) {
155  while (it != tokens.end()) {
156  BESDEBUG(MODULE, prolog << " token["<< i++ << "]: "<< *it << endl);
157  it++;
158  }
159  }
160 
161  BESDEBUG(MODULE, prolog << "Second Field: "<< tokens[1] << endl);
162 
163  const char *second_field = tokens[1].c_str();
164  bool is_alpha = true;
165  for(unsigned long i=0; is_alpha && i< tokens[1].length(); i++){
166  is_alpha = isalpha(second_field[i]);
167  }
168  time_t theTime;
169  if(is_alpha){
170  BESDEBUG(MODULE, prolog << "Detected Time Format A (\"DD-MM-YYY hh:mm\")" << endl);
171  theTime = parse_time_format_A(tokens);
172  }
173  else {
174  BESDEBUG(MODULE, prolog << "Detected Time Format B (\"YYYY-MM-DD hh:mm\")" << endl);
175  theTime = parse_time_format_B(tokens);
176  }
177  return BESUtil::get_time(theTime, false);
178 
179 }
180 
186 time_t HttpdDirScraper::parse_time_format_A(const vector<string> tokens) const
187 {
188  // void BESUtil::tokenize(const string& str, vector<string>& tokens, const string& delimiters)
189  struct tm tm;
190  zero_tm_struct(tm);
191 
192  if (tokens.size() > 2) {
193  std::istringstream(tokens[0]) >> tm.tm_mday;
194  BESDEBUG(MODULE, prolog << " tm.tm_mday: "<< tm.tm_mday << endl);
195 
196  pair<string, int> mnth = *d_months.find(BESUtil::lowercase(tokens[1]));
197  BESDEBUG(MODULE, prolog << " mnth.first: "<< mnth.first << endl);
198  BESDEBUG(MODULE, prolog << " mnth.second: "<< mnth.second << endl);
199  tm.tm_mon = mnth.second;
200  BESDEBUG(MODULE, prolog << " tm.tm_mon: "<< tm.tm_mon << endl);
201 
202  std::istringstream(tokens[2]) >> tm.tm_year;
203  tm.tm_year -= 1900;
204  BESDEBUG(MODULE, prolog << " tm.tm_year: "<< tm.tm_year << endl);
205 
206  if (tokens.size() > 4) {
207  std::istringstream(tokens[3]) >> tm.tm_hour;
208  BESDEBUG(MODULE, prolog << " tm.tm_hour: "<< tm.tm_hour << endl);
209  std::istringstream(tokens[4]) >> tm.tm_min;
210  BESDEBUG(MODULE, prolog << " tm.tm_min: "<< tm.tm_min << endl);
211  }
212  }
213 
214  BESDEBUG(MODULE, prolog << "tm struct: " << endl << show_tm_struct(tm));
215 
216  time_t theTime = mktime(&tm);
217  BESDEBUG(MODULE, prolog << "theTime: " << theTime << endl);
218  return theTime;
219 }
220 
226 time_t HttpdDirScraper::parse_time_format_B(const vector<string> tokens) const
227 {
228  // void BESUtil::tokenize(const string& str, vector<string>& tokens, const string& delimiters)
229  struct tm tm;
230  zero_tm_struct(tm);
231 
232  if (tokens.size() > 2) {
233  std::istringstream(tokens[0]) >> tm.tm_year;
234  tm.tm_year -= 1900;
235  BESDEBUG(MODULE, prolog << " tm.tm_year: "<< tm.tm_year << endl);
236 
237  std::istringstream(tokens[1]) >> tm.tm_mon;
238  BESDEBUG(MODULE, prolog << " tm.tm_mon: "<< tm.tm_mon << endl);
239 
240  std::istringstream(tokens[2]) >> tm.tm_mday;
241  BESDEBUG(MODULE, prolog << " tm.tm_mday: "<< tm.tm_mday << endl);
242 
243  if (tokens.size() > 4) {
244  std::istringstream(tokens[3]) >> tm.tm_hour;
245  BESDEBUG(MODULE, prolog << " tm.tm_hour: "<< tm.tm_hour << endl);
246  std::istringstream(tokens[4]) >> tm.tm_min;
247  BESDEBUG(MODULE, prolog << " tm.tm_min: "<< tm.tm_min << endl);
248  }
249  }
250 
251  BESDEBUG(MODULE, prolog << "tm struct: " << endl << show_tm_struct(tm));
252 
253  time_t theTime = mktime(&tm);
254  BESDEBUG(MODULE, prolog << "ISO-8601 Time: " << theTime << endl);
255  return theTime;
256 }
257 
274 void HttpdDirScraper::createHttpdDirectoryPageMap(std::string url, std::map<std::string, bes::CatalogItem *> &items) const
275 {
276  const BESCatalogUtils *cat_utils = BESCatalogList::TheCatalogList()->find_catalog(BES_DEFAULT_CATALOG)->get_catalog_utils();
277 
278  // Go get the text from the remote resource
279  std::shared_ptr<http::url> url_ptr(new http::url(url));
280  http::RemoteResource rhr(url_ptr);
281  rhr.retrieveResource();
282  stringstream buffer;
283 
284  ifstream cache_file_is(rhr.getCacheFileName().c_str());
285  if(!cache_file_is.is_open()){
286  string msg = prolog + "ERROR - Failed to open cache file: " + rhr.getCacheFileName();
287  BESDEBUG(MODULE, msg << endl);
288  throw BESInternalError(msg ,__FILE__, __LINE__ );
289  }
290 
291  buffer << cache_file_is.rdbuf();
292  string pageStr = buffer.str();
293  BESDEBUG(MODULE, prolog << "Page Content: " << endl << pageStr << endl);
294 
295  // Does it look like an Apache httpd Index listing?
296  if(pageStr.find("<title>Index of ") == string::npos){
297  // Nope. Time to leave.
298  BESDEBUG(MODULE, prolog << "The url: " << url << " does not appear to reference an Apache httpd Index page." << endl);
299  return;
300  }
301 
302  string aOpenStr = "<a ";
303  string aCloseStr = "</a>";
304  string hrefStr = "href=\"";
305  string tdOpenStr = "<td ";
306  string tdCloseStr = "</td>";
307 
308  BESRegex hrefExcludeRegex("(^#.*$)|(^\\?C.*$)|(redirect\\/)|(^\\/$)|(^<img.*$)");
309  BESRegex nameExcludeRegex("^Parent Directory$");
310 
311  bool done = false;
312  int next_start = 0;
313  while (!done) {
314  int aOpenIndex = pageStr.find(aOpenStr, next_start);
315  if (aOpenIndex < 0) {
316  done = true;
317  }
318  else {
319  int aCloseIndex = pageStr.find(aCloseStr, aOpenIndex + aOpenStr.length());
320  if (aCloseIndex < 0) {
321  done = true;
322  }
323  else {
324  int length;
325 
326  // Locate the entire <a /> element
327  BESDEBUG(MODULE, prolog << "aOpenIndex: " << aOpenIndex << endl);
328  BESDEBUG(MODULE, prolog << "aCloseIndex: " << aCloseIndex << endl);
329  length = aCloseIndex + aCloseStr.length() - aOpenIndex;
330  string aElemStr = pageStr.substr(aOpenIndex, length);
331  BESDEBUG(MODULE, prolog << "Processing link: " << aElemStr << endl);
332 
333  // Find the link text
334  int start = aElemStr.find(">") + 1;
335  int end = aElemStr.find("<", start);
336  length = end - start;
337  string linkText = aElemStr.substr(start, length);
338  BESDEBUG(MODULE, prolog << "Link Text: " << linkText << endl);
339 
340  // Locate the href attribute
341  start = aElemStr.find(hrefStr) + hrefStr.length();
342  end = aElemStr.find("\"", start);
343  length = end - start;
344  string href = aElemStr.substr(start, length);
345  BESDEBUG(MODULE, prolog << "href: " << href << endl);
346 
347  // attempt to get time string
348  string time_str;
349  int start_pos = getNextElementText(pageStr, "td", aCloseIndex + aCloseStr.length(), time_str);
350  BESDEBUG(MODULE, prolog << "time_str: '" << time_str << "'" << endl);
351 
352  // attempt to get size string
353  string size_str;
354  start_pos = getNextElementText(pageStr, "td", start_pos, size_str);
355  BESDEBUG(MODULE, prolog << "size_str: '" << size_str << "'" << endl);
356 
357  if ((linkText.find("<img") != string::npos) || !(linkText.length()) || (linkText.find("<<<") != string::npos)
358  || (linkText.find(">>>") != string::npos)) {
359  BESDEBUG(MODULE, prolog << "SKIPPING(image|copy|<<<|>>>): " << aElemStr << endl);
360  }
361  else {
362  if (href.length() == 0 || (((href.find("http://") == 0) || (href.find("https://") == 0)) && !(href.find(url) == 0))) {
363  // SKIPPING
364  BESDEBUG(MODULE, prolog << "SKIPPING(null or remote): " << href << endl);
365  }
366  else if (hrefExcludeRegex.match(href.c_str(), href.length(), 0) > 0) {
367  // SKIPPING
368  BESDEBUG(MODULE, prolog << "SKIPPING(hrefExcludeRegex) - href: '" << href << "'"<< endl);
369  }
370  else if (nameExcludeRegex.match(linkText.c_str(), linkText.length(), 0) > 0) {
371  // SKIPPING
372  BESDEBUG(MODULE, prolog << "SKIPPING(nameExcludeRegex) - name: '" << linkText << "'" << endl);
373  }
374  else if (BESUtil::endsWith(href, "/")) {
375  string node_name = href.substr(0, href.length() - 1);
376  // it's a directory aka a node
377  BESDEBUG(MODULE, prolog << "NODE: " << node_name << endl);
378  bes::CatalogItem *childNode = new bes::CatalogItem();
379  childNode->set_type(CatalogItem::node);
380  childNode->set_name(node_name);
381  childNode->set_is_data(false);
382  string iso_8601_time = httpd_time_to_iso_8601(time_str);
383  childNode->set_lmt(iso_8601_time);
384  // FIXME: For nodes the size should be the number of children, but how without crawling?
385  long size = get_size_val(size_str);
386  childNode->set_size(size);
387 
388  items.insert(pair<std::string, bes::CatalogItem *>(node_name, childNode));
389  }
390  else {
391  // It's a file aka a leaf
392  BESDEBUG(MODULE, prolog << "LEAF: " << href << endl);
393  CatalogItem *leafItem = new CatalogItem();
394  leafItem->set_type(CatalogItem::leaf);
395  leafItem->set_name(href);
396  leafItem->set_is_data(cat_utils->is_data(href));
397  string iso_8601_time = httpd_time_to_iso_8601(time_str);
398  leafItem->set_lmt(iso_8601_time);
399  long size = get_size_val(size_str);
400  leafItem->set_size(size);
401 
402  items.insert(pair<std::string, bes::CatalogItem *>(href, leafItem));
403  }
404  }
405  }
406  next_start = aCloseIndex + aCloseStr.length();
407  }
408  }
409 }
410 
423 int HttpdDirScraper::getNextElementText(const string &page_str, const string element_name, int startIndex, string &resultText, bool trim) const
424 {
425  string e_open_str = "<" + element_name + " ";
426  string e_close_str = "</" + element_name + ">";
427 
428  // Locate the next "element_name" element
429  int start = page_str.find(e_open_str, startIndex);
430  int end = page_str.find(e_close_str, start + e_open_str.length());
431  if(start<0 || end<0 || end<start){
432  resultText="";
433  return startIndex;
434  }
435 
436  int length = end + e_close_str.length() - start;
437  string element_str = page_str.substr(start, length);
438 
439  // Find the text
440  start = element_str.find(">") + 1;
441  end = element_str.find("<", start);
442  length = end - start;
443  resultText = element_str.substr(start, length);
444 
445  if (trim) BESUtil::removeLeadingAndTrailingBlanks(resultText);
446 
447  BESDEBUG(MODULE, prolog << "resultText: '" << resultText << "'" << endl);
448  return startIndex + element_str.length();
449 }
450 
451 /*
452  * @brief Returns the catalog node represented by the httpd directory page returned
453  * by dereferencing the passed url.
454  * @param url The url of the Apache httpd directory to process.
455  * @param path The path prefix that associates the location of this generated CatalogNode with it's
456  * correct position in the local service path.
457  */
458 bes::CatalogNode *HttpdDirScraper::get_node(const string &url, const string &path) const
459 {
460  BESDEBUG(MODULE, prolog << "Processing url: '" << url << "'"<< endl);
461  bes::CatalogNode *node = new bes::CatalogNode(path);
462 
463  if (BESUtil::endsWith(url, "/")) {
464  // This always means the URL points to a node when coming from httpd
465  map<string, bes::CatalogItem *> items;
466  createHttpdDirectoryPageMap(url, items);
467 
468  BESDEBUG(MODULE, prolog << "Found " << items.size() << " items." << endl);
469  map<string, bes::CatalogItem *>::iterator it;
470  it = items.begin();
471  while (it != items.end()) {
472  bes::CatalogItem *item = it->second;
473  BESDEBUG(MODULE, prolog << "Adding item: '" << item->get_name() << "'"<< endl);
474  if (item->get_type() == CatalogItem::node)
475  node->add_node(item);
476  else
477  node->add_leaf(item);
478  it++;
479  }
480  }
481  else {
482  // It's a leaf aka "item" response.
483  const BESCatalogUtils *cat_utils = BESCatalogList::TheCatalogList()->find_catalog(BES_DEFAULT_CATALOG)->get_catalog_utils();
484  std::vector<std::string> url_parts = BESUtil::split(url, '/', true);
485  string leaf_name = url_parts.back();
486 
487  CatalogItem *item = new CatalogItem();
488  item->set_type(CatalogItem::leaf);
489  item->set_name(leaf_name);
490  item->set_is_data(cat_utils->is_data(leaf_name));
491 
492  // FIXME: Find the Last Modified date? Head??
493  item->set_lmt(BESUtil::get_time(true));
494 
495  // FIXME: Determine size of this thing? Do we "HEAD" all the leaves?
496  item->set_size(1);
497 
498  node->set_leaf(item);
499  }
500  return node;
501 }
502 
// Disabled (never compiled) earlier version of HttpdDirScraper::get_node(). It calls a
// three argument createHttpdDirectoryPageMap(url, pageNodes, pageLeaves) that fills two
// sets of names; no such overload is defined in the visible part of this file. Unlike
// the active version it stamps every item with the current time and a placeholder size.
#if 0

bes::CatalogNode *HttpdDirScraper::get_node(const string &url, const string &path) const
{
    BESDEBUG(MODULE, prolog << "Processing url: '" << url << "'"<< endl);
    bes::CatalogNode *node = new bes::CatalogNode(path);

    if (BESUtil::endsWith(url, "/")) {

        set<string> pageNodes;
        set<string> pageLeaves;
        createHttpdDirectoryPageMap(url, pageNodes, pageLeaves);

        BESDEBUG(MODULE, prolog << "Found " << pageNodes.size() << " nodes." << endl);
        BESDEBUG(MODULE, prolog << "Found " << pageLeaves.size() << " leaves." << endl);

        set<string>::iterator it;

        // Child directories: strip the trailing '/' and add each one as a node item.
        it = pageNodes.begin();
        while (it != pageNodes.end()) {
            string pageNode = *it;
            if (BESUtil::endsWith(pageNode, "/")) pageNode = pageNode.substr(0, pageNode.length() - 1);

            bes::CatalogItem *childNode = new bes::CatalogItem();
            childNode->set_type(CatalogItem::node);

            childNode->set_name(pageNode);
            childNode->set_is_data(false);

            // FIXME: Figure out the LMT if we can... HEAD?
            childNode->set_lmt(BESUtil::get_time(true));

            // FIXME: For nodes the size should be the number of children, but how without crawling?
            childNode->set_size(0);

            node->add_node(childNode);
            it++;
        }

        // Files: add each one as a leaf item.
        it = pageLeaves.begin();
        while (it != pageLeaves.end()) {
            string leaf = *it;
            CatalogItem *leafItem = new CatalogItem();
            leafItem->set_type(CatalogItem::leaf);
            leafItem->set_name(leaf);

            // FIXME: wrangle up the Typematch and see if we think this thing is data or not.
            leafItem->set_is_data(false);

            // FIXME: Find the Last Modified date?
            leafItem->set_lmt(BESUtil::get_time(true));

            // FIXME: Determine size of this thing? Do we "HEAD" all the leaves?
            leafItem->set_size(1);

            node->add_leaf(leafItem);
            it++;
        }
    }
    else {
        // The url names a single file: the node carries one leaf named after the last url segment.
        std::vector<std::string> url_parts = BESUtil::split(url,'/',true);
        string leaf_name = url_parts.back();

        CatalogItem *item = new CatalogItem();
        item->set_type(CatalogItem::leaf);
        item->set_name(leaf_name);
        // FIXME: Find the Last Modified date?
        item->set_lmt(BESUtil::get_time(true));

        // FIXME: Determine size of this thing? Do we "HEAD" all the leaves?
        item->set_size(1);

        node->set_leaf(item);

    }
    return node;

}
#endif
582 
583 }
584  // namespace httpd_catalog
585 
static BESCatalogList * TheCatalogList()
Get the singleton BESCatalogList instance.
bool is_data(const std::string &item) const
Is there a handler that can process this item?
virtual BESCatalogUtils * get_catalog_utils() const
Get a pointer to the utilities, customized for this catalog.
Definition: BESCatalog.h:113
static bool IsSet(const std::string &flagName)
see if the debug context flagName is set to true
Definition: BESDebug.h:168
exception thrown if internal error encountered
Regular expression matching.
Definition: BESRegex.h:53
static std::vector< std::string > split(const std::string &s, char delim='/', bool skip_empty=true)
Splits the string s into the return vector of tokens using the delimiter delim and skipping empty values as instructed by skip_empty.
Definition: BESUtil.cc:1159
static bool endsWith(std::string const &fullString, std::string const &ending)
Definition: BESUtil.cc:961
static void tokenize(const std::string &str, std::vector< std::string > &tokens, const std::string &delimiters="/")
Definition: BESUtil.cc:1086
static std::string lowercase(const std::string &s)
Definition: BESUtil.cc:206
static void removeLeadingAndTrailingBlanks(std::string &key)
Definition: BESUtil.cc:485
static std::string get_time(bool use_local_time=false)
Definition: BESUtil.cc:1108
void set_name(std::string n)
Set the name of the item.
Definition: CatalogItem.h:135
std::string get_name() const
The name of this item in the node.
Definition: CatalogItem.h:133
void set_size(size_t s)
Set the size of the item.
Definition: CatalogItem.h:140
void set_is_data(bool id)
Is this item data that the BES should interpret?
Definition: CatalogItem.h:150
void set_lmt(std::string lmt)
Set the LMT for this item.
Definition: CatalogItem.h:145
item_type get_type() const
Get the type of this item (unknown, node or leaf)
Definition: CatalogItem.h:153
void set_type(item_type t)
Set the type for this item.
Definition: CatalogItem.h:155