bes  Updated for version 3.20.10
EffectiveUrlCache.cc
1 // -*- mode: c++; c-basic-offset:4 -*-
2 
3 // This file is part of the BES http package, part of the Hyrax data server.
4 
5 // Copyright (c) 2020 OPeNDAP, Inc.
6 // Author: Nathan Potter <ndp@opendap.org>
7 //
8 // This library is free software; you can redistribute it and/or
9 // modify it under the terms of the GNU Lesser General Public
10 // License as published by the Free Software Foundation; either
11 // version 2.1 of the License, or (at your option) any later version.
12 //
13 // This library is distributed in the hope that it will be useful,
14 // but WITHOUT ANY WARRANTY; without even the implied warranty of
15 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 // Lesser General Public License for more details.
17 //
18 // You should have received a copy of the GNU Lesser General Public
19 // License along with this library; if not, write to the Free Software
20 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 //
22 // You can contact OPeNDAP, Inc. at PO Box 112, Saunderstown, RI. 02874-0112.
23 
24 // Authors:
25 // ndp Nathan Potter <ndp@opendap.org>
26 
27 #include "config.h"
28 
29 #ifdef HAVE_STDLIB_H
30 #include <cstdlib>
31 #endif
32 
33 #include <mutex>
34 
35 #include <sstream>
36 #include <string>
37 
38 #include "EffectiveUrlCache.h"
39 
40 #include "BESSyntaxUserError.h"
41 #include "TheBESKeys.h"
42 #include "BESDebug.h"
43 #include "BESStopWatch.h"
44 #include "BESUtil.h"
45 #include "BESLog.h"
46 #include "CurlUtils.h"
47 #include "HttpNames.h"
48 #include "EffectiveUrl.h"
49 
50 using namespace std;
51 
52 #define MODULE "euc"
53 #define MODULE_DUMPER "euc:dump"
54 #define prolog std::string("EffectiveUrlCache::").append(__func__).append("() - ")
55 
56 namespace http {
57 
58 EffectiveUrlCache *EffectiveUrlCache::d_instance = nullptr;
59 static std::once_flag d_euc_init_once;
60 
69 EffectiveUrlCache *
70 EffectiveUrlCache::TheCache()
71 {
72  std::call_once(d_euc_init_once,EffectiveUrlCache::initialize_instance);
73 
74  return d_instance;
75 }
76 
81 void EffectiveUrlCache::initialize_instance()
82 {
83 
84  d_instance = new EffectiveUrlCache;
85 #ifdef HAVE_ATEXIT
86  atexit(delete_instance);
87 #endif
88 
89 }
90 
94 void EffectiveUrlCache::delete_instance()
95 {
96  delete d_instance;
97  d_instance = 0;
98 }
99 
100 
105 EffectiveUrlCache::~EffectiveUrlCache()
106 {
107  d_effective_urls.clear();
108 
109  if(d_skip_regex){
110  delete d_skip_regex;
111  d_skip_regex = 0;
112  }
113 }
114 
115 
123 void EffectiveUrlCache::dump(ostream &strm) const
124 {
125  strm << BESIndent::LMarg << prolog << "(this: " << (void *) this << ")" << endl;
126  BESIndent::Indent();
127  strm << BESIndent::LMarg << "d_skip_regex: " << (d_skip_regex?d_skip_regex->pattern():"WAS NOT SET") << endl;
128  if (!d_effective_urls.empty()) {
129  strm << BESIndent::LMarg << "effective url list:" << endl;
130  BESIndent::Indent();
131  auto it = d_effective_urls.begin();
132  while( it!= d_effective_urls.end()){
133  strm << BESIndent::LMarg << (*it).first << " --> " << (*it).second->str();
134  it++;
135  }
136  BESIndent::UnIndent();
137  }
138  else {
139  strm << BESIndent::LMarg << "effective url list: EMPTY" << endl;
140  }
141  BESIndent::UnIndent();
142 }
143 
151 string EffectiveUrlCache::dump() const
152 {
153  stringstream sstrm;
154  dump(sstrm);
155  return sstrm.str();
156 }
157 
158 
163 shared_ptr<http::EffectiveUrl> EffectiveUrlCache::get_cached_eurl(string const &url_key){
164  shared_ptr<http::EffectiveUrl> effective_url(nullptr);
165  auto it = d_effective_urls.find(url_key);
166  if(it!=d_effective_urls.end()){
167  effective_url = (*it).second;
168  }
169  return effective_url;
170 }
171 
172 
173 //########################################################################################
174 //########################################################################################
175 //########################################################################################
176 
177 
185 shared_ptr<http::EffectiveUrl> EffectiveUrlCache::get_effective_url(shared_ptr<http::url> source_url) {
186 
187  // This lock is a RAII implementation. It will block until the mutex is
188  // available and the lock will be released when the instance is destroyed.
189  std::lock_guard<std::mutex> lock_me(d_cache_lock_mutex);
190 
191  BESDEBUG(MODULE, prolog << "BEGIN url: " << source_url->str() << endl);
192  BESDEBUG(MODULE_DUMPER, prolog << "dump: " << endl << dump() << endl);
193 
194  if (!is_enabled()) {
195  BESDEBUG(MODULE, prolog << "CACHE IS DISABLED." << endl);
196  return shared_ptr<http::EffectiveUrl>(new http::EffectiveUrl(source_url));
197 
198  }
199 
200  // if it's not an HTTP url there is nothing to cache.
201  if (source_url->str().find(HTTP_PROTOCOL) != 0 && source_url->str().find(HTTPS_PROTOCOL) != 0) {
202  BESDEBUG(MODULE, prolog << "END Not an HTTP request, SKIPPING." << endl);
203  return shared_ptr<http::EffectiveUrl>(new http::EffectiveUrl(source_url));
204  }
205 
206  BESRegex *skip_regex = get_skip_regex();
207  if( skip_regex ) {
208  size_t match_length = 0;
209  match_length = skip_regex->match(source_url->str().c_str(), source_url->str().length());
210  if (match_length == source_url->str().length()) {
211  BESDEBUG(MODULE, prolog << "END Candidate url matches the "
212  "no_redirects_regex_pattern [" << skip_regex->pattern() <<
213  "][match_length=" << match_length << "] SKIPPING." << endl);
214  return shared_ptr<http::EffectiveUrl>(new http::EffectiveUrl(source_url));
215  }
216  BESDEBUG(MODULE, prolog << "Candidate url: '" << source_url->str() << "' does NOT match the "
217  "skip_regex pattern [" << skip_regex->pattern() << "]" << endl);
218  }
219  else {
220  BESDEBUG(MODULE, prolog << "The cache_effective_urls_skip_regex() was NOT SET "<< endl);
221  }
222 
223  shared_ptr<http::EffectiveUrl> effective_url = get_cached_eurl(source_url->str());
224 
225  // If the source_url does not have an associated EffectiveUrl instance in the cache
226  // then we know we have to get one.
227  bool retrieve_and_cache = !effective_url;
228 
229  // But, if there is a value in the cache, we must check to see
230  // if it is expired, in which case we will retrieve and cache it.
231  if(effective_url){
232  // It was in the cache. w00t. But, is it expired?.
233  BESDEBUG(MODULE, prolog << "Cache hit for: " << source_url->str() << endl);
234  retrieve_and_cache = effective_url->is_expired();
235  BESDEBUG(MODULE, prolog << "Cached target URL is " << (retrieve_and_cache?"":"not ") << "expired." << endl);
236  }
237 
238  // It not found or expired, reload.
239  if(retrieve_and_cache){
240  BESDEBUG(MODULE, prolog << "Acquiring effective URL for " << source_url->str() << endl);
241  {
242  BESStopWatch sw;
243  if(BESDebug::IsSet(MODULE) || BESDebug::IsSet(TIMING_LOG_KEY))
244  sw.start(prolog + "Retrieve and cache effective url for source url: " + source_url->str());
245  effective_url = curl::retrieve_effective_url(source_url);
246  }
247  BESDEBUG(MODULE, prolog << " source_url: " << source_url->str() << " (" << (source_url->is_trusted()?"":"NOT ") << "trusted)" << endl);
248  BESDEBUG(MODULE, prolog << "effective_url: " << effective_url->dump() << " (" << (source_url->is_trusted()?"":"NOT ") << "trusted)" << endl);
249 
250  d_effective_urls[source_url->str()] = effective_url;
251 
252  BESDEBUG(MODULE, prolog << "Updated record for "<< source_url->str() << " cache size: " << d_effective_urls.size() << endl);
253 
254  // Since we don't want there to be a concurrency issue when we release the lock, we don't
255  // return the instance of shared_ptr<EffectiveUrl> that we placed in the cache. Rather
256  // we make a clone and return that. It will have it's own lifecycle independent of
257  // the instance we placed in the cache - it can be modified and the one in the cache
258  // is unchanged. Trusted state was established from source_url when effective_url was
259  // created in curl::retrieve_effective_url()
260  effective_url = shared_ptr<EffectiveUrl>(new EffectiveUrl(effective_url));
261  }
262  else {
263  // Here we have a !expired instance of a shared_ptr<EffectiveUrl> retrieved from the cache.
264  // Now we need to make a copy to return, inheriting trust from the
265  // requesting URL.
266  effective_url = shared_ptr<EffectiveUrl>(new EffectiveUrl(effective_url,source_url->is_trusted()));
267  }
268 
269  BESDEBUG(MODULE_DUMPER, prolog << "dump: " << endl << dump() << endl);
270 
271  BESDEBUG(MODULE, prolog << "END" << endl);
272 
273  return effective_url;
274 }// The lock is released when the point of execution reaches this brace and lock_me goes out of scope.
275 
276 
281 bool EffectiveUrlCache::is_enabled()
282 {
283  // The first time here, the value of d_enabled is -1. Once we check for it in TheBESKeys
284  // The value will be 0 (false) or 1 (true) and TheBESKeys will not be checked again.
285  if(d_enabled < 0){
286  bool found;
287  string value;
288  TheBESKeys::TheKeys()->get_value(HTTP_CACHE_EFFECTIVE_URLS_KEY,value,found);
289  BESDEBUG(MODULE, prolog << HTTP_CACHE_EFFECTIVE_URLS_KEY <<": '" << value << "'" << endl);
290  d_enabled = found && BESUtil::lowercase(value)=="true";
291  }
292  BESDEBUG(MODULE, prolog << "d_enabled: " << (d_enabled?"true":"false") << endl);
293  return d_enabled;
294 }
295 
300 BESRegex *EffectiveUrlCache::get_skip_regex()
301 {
302  if(!d_skip_regex){
303  bool found;
304  string value;
305  TheBESKeys::TheKeys()->get_value(HTTP_CACHE_EFFECTIVE_URLS_SKIP_REGEX_KEY, value, found);
306  if(found && value.length()){
307  BESDEBUG(MODULE, prolog << HTTP_CACHE_EFFECTIVE_URLS_SKIP_REGEX_KEY <<": " << value << endl);
308  d_skip_regex = new BESRegex(value.c_str());
309  }
310  }
311  BESDEBUG(MODULE, prolog << "d_skip_regex: " << (d_skip_regex?d_skip_regex->pattern():"Value has not been set.") << endl);
312  return d_skip_regex;
313 }
314 
315 
316 
317 
318 
319 } // namespace http
static bool IsSet(const std::string &flagName)
see if the debug context flagName is set to true
Definition: BESDebug.h:168
Regular expression matching.
Definition: BESRegex.h:53
int match(const char *s, int len, int pos=0) const
Does the pattern match.
Definition: BESRegex.cc:127
virtual bool start(std::string name)
Definition: BESStopWatch.cc:67
static std::string lowercase(const std::string &s)
Definition: BESUtil.cc:206
void get_value(const std::string &s, std::string &val, bool &found)
Retrieve the value of a given key, if set.
Definition: TheBESKeys.cc:340
static TheBESKeys * TheKeys()
Definition: TheBESKeys.cc:71
utility class for the HTTP catalog module
Definition: AllowedHosts.cc:55