00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00034 #include "blocxx/PerlRegEx.hpp"
00035
00036 #ifdef BLOCXX_HAVE_PCRE
00037 #ifdef BLOCXX_HAVE_PCRE_H
00038
#include "blocxx/ExceptionIds.hpp"
#include "blocxx/Assertion.hpp"
#include "blocxx/Format.hpp"
#include <climits>
#include <vector>
00043
00044
00045 namespace BLOCXX_NAMESPACE
00046 {
00047
00048
00049
00050 static String
00051 substitute_caps(const PerlRegEx::MatchArray &sub,
00052 const String &str, const String &rep)
00053 {
00054 static const char *cap_refs[] = {
00055 NULL, "\\1", "\\2", "\\3", "\\4",
00056 "\\5", "\\6", "\\7", "\\8", "\\9", NULL
00057 };
00058
00059 String res( rep);
00060 size_t pos;
00061
00062 for(size_t i=1; cap_refs[i] != NULL; i++)
00063 {
00064 String cap;
00065 if( i < sub.size() && sub[i].rm_so >= 0 && sub[i].rm_eo >= 0)
00066 {
00067 cap = str.substring(sub[i].rm_so, sub[i].rm_eo
00068 - sub[i].rm_so);
00069 }
00070
00071 pos = res.indexOf(cap_refs[i]);
00072 while( pos != String::npos)
00073 {
00074 size_t quotes = 0;
00075 size_t at = pos;
00076
00077 while( at > 0 && res.charAt(--at) == '\\')
00078 quotes++;
00079
00080 if( quotes % 2)
00081 {
00082 quotes = (quotes + 1) / 2;
00083
00084 res = res.erase(pos - quotes, quotes);
00085
00086 pos = res.indexOf(cap_refs[i],
00087 pos + 2 - quotes);
00088 }
00089 else
00090 {
00091 quotes = quotes / 2;
00092
00093 res = res.substring(0, pos - quotes) +
00094 cap +
00095 res.substring(pos + 2);
00096
00097 pos = res.indexOf(cap_refs[i],
00098 pos + cap.length() - quotes);
00099 }
00100 }
00101 }
00102 return res;
00103 }
00104
00105
00106
00107 static inline String
00108 getError(const int errcode)
00109 {
00110 const char *ptr;
00111 switch(errcode)
00112 {
00113 case 0:
00114 ptr = "match vector to small";
00115 break;
00116
00117 case PCRE_ERROR_NOMATCH:
00118 ptr = "match failed";
00119 break;
00120
00121 case PCRE_ERROR_NULL:
00122 ptr = "invalid argument";
00123 break;
00124
00125 case PCRE_ERROR_BADOPTION:
00126 ptr = "unrecognized option";
00127 break;
00128
00129 case PCRE_ERROR_BADMAGIC:
00130 ptr = "invalid magic number";
00131 break;
00132
00133 case PCRE_ERROR_UNKNOWN_NODE:
00134 ptr = "unknown item in the compiled pattern";
00135 break;
00136
00137 case PCRE_ERROR_NOMEMORY:
00138 ptr = "failed to allocate memory";
00139 break;
00140
00141 case PCRE_ERROR_NOSUBSTRING:
00142
00143 ptr = "failed to retrieve substring";
00144 break;
00145
00146 case PCRE_ERROR_MATCHLIMIT:
00147
00148 ptr = "recursion or backtracking limit reached";
00149 break;
00150
00151 case PCRE_ERROR_CALLOUT:
00152
00153 ptr = "callout failure";
00154 break;
00155
00156 case PCRE_ERROR_BADUTF8:
00157 ptr = "invalid UTF-8 byte sequence found";
00158 break;
00159
00160 case PCRE_ERROR_BADUTF8_OFFSET:
00161 ptr = "not a UTF-8 character at specified index";
00162 break;
00163
00164 case PCRE_ERROR_PARTIAL:
00165 ptr = "partial match";
00166 break;
00167
00168 case PCRE_ERROR_BADPARTIAL:
00169 ptr = "pattern item not supported for partial matching";
00170 break;
00171
00172 case PCRE_ERROR_INTERNAL:
00173 ptr = "unexpected internal error occurred";
00174 break;
00175
00176 case PCRE_ERROR_BADCOUNT:
00177 ptr = "invalid (negative) match vector count";
00178 break;
00179
00180 default:
00181 ptr = "unknown error code";
00182 break;
00183 }
00184 return String(ptr);
00185 }
00186
00187
00188 PerlRegEx::PerlRegEx()
00189 : m_pcre(NULL)
00190 , m_flags(0)
00191 , m_ecode(0)
00192 {
00193 }
00194
00195
00196
00197 PerlRegEx::PerlRegEx(const String ®ex, int cflags)
00198 : m_pcre(NULL)
00199 , m_flags(0)
00200 , m_ecode(0)
00201 {
00202 if( !compile(regex, cflags))
00203 {
00204 BLOCXX_THROW_ERR(RegExCompileException,
00205 errorString().c_str(), m_ecode);
00206 }
00207 }
00208
00209
00210
00211 PerlRegEx::PerlRegEx(const PerlRegEx &ref)
00212 : m_pcre(NULL)
00213 , m_flags(ref.m_flags)
00214 , m_ecode(0)
00215 , m_rxstr(ref.m_rxstr)
00216 {
00217 if( ref.m_pcre != NULL && !compile(ref.m_rxstr, ref.m_flags))
00218 {
00219 BLOCXX_THROW_ERR(RegExCompileException,
00220 errorString().c_str(), m_ecode);
00221 }
00222 }
00223
00224
00225 PerlRegEx::~PerlRegEx()
00226 {
00227 if( m_pcre)
00228 {
00229 free(m_pcre);
00230 m_pcre = NULL;
00231 }
00232 }
00233
00234
00235
00236 PerlRegEx &
00237 PerlRegEx::operator = (const PerlRegEx &ref)
00238 {
00239 if( ref.m_pcre == NULL)
00240 {
00241 m_ecode = 0;
00242 m_error.erase();
00243 m_flags = ref.m_flags;
00244 m_rxstr = ref.m_rxstr;
00245 if( m_pcre != NULL)
00246 {
00247 free(m_pcre);
00248 m_pcre = NULL;
00249 }
00250 }
00251 else if( !compile(ref.m_rxstr, ref.m_flags))
00252 {
00253 BLOCXX_THROW_ERR(RegExCompileException,
00254 errorString().c_str(), m_ecode);
00255 }
00256 return *this;
00257 }
00258
00259
00260
00261 bool
00262 PerlRegEx::compile(const String ®ex, int cflags)
00263 {
00264 if( m_pcre)
00265 {
00266 free(m_pcre);
00267 m_pcre = NULL;
00268 }
00269
00270 const char *errptr = NULL;
00271
00272 m_ecode = 0;
00273 m_pcre = ::pcre_compile(regex.c_str(), cflags,
00274 &errptr, &m_ecode, NULL);
00275 if( m_pcre == NULL)
00276 {
00277 m_error = String(errptr ? errptr : "");
00278 m_rxstr.erase();
00279 m_flags = 0;
00280 return false;
00281 }
00282 else
00283 {
00284 m_error.erase();
00285 m_rxstr = regex;
00286 m_flags = cflags;
00287 return true;
00288 }
00289 }
00290
00291
00292
00293 int
00294 PerlRegEx::errorCode()
00295 {
00296 return m_ecode;
00297 }
00298
00299
00300
00301 String
00302 PerlRegEx::errorString() const
00303 {
00304 return m_error;
00305 }
00306
00307
00308
00309 String
00310 PerlRegEx::patternString() const
00311 {
00312 return m_rxstr;
00313 }
00314
00315
00316
00317 int
00318 PerlRegEx::compileFlags() const
00319 {
00320 return m_flags;
00321 }
00322
00323
00324
00325 bool
00326 PerlRegEx::isCompiled() const
00327 {
00328 return (m_pcre != NULL);
00329 }
00330
00331
00332
00333 bool
00334 PerlRegEx::execute(MatchArray &sub, const String &str,
00335 size_t index, size_t count, int eflags)
00336 {
00337 if( m_pcre == NULL)
00338 {
00339 BLOCXX_THROW(RegExCompileException,
00340 "Regular expression is not compiled");
00341 }
00342 if( count >= size_t(INT_MAX / 3))
00343 {
00344 BLOCXX_THROW(AssertionException,
00345 "Match count limit exceeded");
00346 }
00347
00348 if( index > str.length())
00349 {
00350 BLOCXX_THROW(OutOfBoundsException,
00351 Format("String index out of bounds ("
00352 "length = %1, index = %2).",
00353 str.length(), index
00354 ).c_str());
00355 }
00356
00357 if( count == 0)
00358 {
00359 int cnt = 0;
00360 int ret = ::pcre_fullinfo(m_pcre, NULL,
00361 PCRE_INFO_CAPTURECOUNT, &cnt);
00362 if( ret)
00363 {
00364 m_error = getError(m_ecode);
00365 return false;
00366 }
00367 count = cnt > 0 ? cnt + 1 : 1;
00368 }
00369 int vsub[count * 3];
00370
00371 sub.clear();
00372 m_ecode = ::pcre_exec(m_pcre, NULL, str.c_str(), str.length(),
00373 index, eflags, vsub, count * 3);
00374
00375
00376
00377
00378 if( m_ecode > 0)
00379 {
00380 sub.resize(count);
00381 for(size_t i = 0, n = 0; i < count; i++, n += 2)
00382 {
00383 match_t m = { vsub[n], vsub[n+1] };
00384
00385
00386 if( i >= (size_t)m_ecode)
00387 m.rm_so = m.rm_eo = -1;
00388
00389 sub[i] = m;
00390 }
00391 m_error.erase();
00392 return true;
00393 }
00394 else
00395 {
00396 m_error = getError(m_ecode);
00397 return false;
00398 }
00399 }
00400
00401
00402
00403 bool
00404 PerlRegEx::execute(MatchVector &sub, const String &str,
00405 size_t index, size_t count, int eflags)
00406 {
00407 if( m_pcre == NULL)
00408 {
00409 BLOCXX_THROW(RegExCompileException,
00410 "Regular expression is not compiled");
00411 }
00412 if( count >= size_t(INT_MAX / 3))
00413 {
00414 BLOCXX_THROW(AssertionException,
00415 "Match count limit exceeded");
00416 }
00417
00418 if( index > str.length())
00419 {
00420 BLOCXX_THROW(OutOfBoundsException,
00421 Format("String index out of bounds ("
00422 "length = %1, index = %2)",
00423 str.length(), index
00424 ).c_str());
00425 }
00426
00427 if( count == 0)
00428 {
00429 int cnt = 0;
00430 int ret = ::pcre_fullinfo(m_pcre, NULL,
00431 PCRE_INFO_CAPTURECOUNT, &cnt);
00432 if( ret)
00433 {
00434 m_error = getError(m_ecode);
00435 return false;
00436 }
00437 count = cnt > 0 ? cnt + 1 : 1;
00438 }
00439 int vsub[count * 3];
00440
00441 sub.clear();
00442 m_ecode = ::pcre_exec(m_pcre, NULL, str.c_str(), str.length(),
00443 index, eflags, vsub, count * 3);
00444
00445
00446
00447
00448 if( m_ecode > 0)
00449 {
00450 count *= 2;
00451 m_ecode *= 2;
00452 sub.resize(count);
00453 for(size_t i = 0; i < count; i++)
00454 {
00455
00456 if( i >= (size_t)m_ecode)
00457 vsub[i] = -1;
00458
00459 sub[i] = vsub[i];
00460 }
00461 return true;
00462 }
00463 else
00464 {
00465 m_error = getError(m_ecode);
00466 return false;
00467 }
00468 }
00469
00470
00471
00472 StringArray
00473 PerlRegEx::capture(const String &str, size_t index, size_t count, int eflags)
00474 {
00475 if( m_pcre == NULL)
00476 {
00477 BLOCXX_THROW(RegExCompileException,
00478 "Regular expression is not compiled");
00479 }
00480
00481 MatchArray rsub;
00482 StringArray ssub;
00483
00484 bool match = execute(rsub, str, index, count, eflags);
00485 if( match)
00486 {
00487 if( rsub.empty())
00488 {
00489 BLOCXX_THROW(RegExCompileException,
00490 "Non-capturing regular expression");
00491 }
00492
00493 MatchArray::const_iterator i=rsub.begin();
00494 for( ; i != rsub.end(); ++i)
00495 {
00496 if( i->rm_so >= 0 && i->rm_eo >= 0)
00497 {
00498 ssub.push_back(str.substring(i->rm_so,
00499 i->rm_eo - i->rm_so));
00500 }
00501 else
00502 {
00503 ssub.push_back(String(""));
00504 }
00505 }
00506 }
00507 else if(m_ecode != PCRE_ERROR_NOMATCH)
00508 {
00509 BLOCXX_THROW_ERR(RegExExecuteException,
00510 errorString().c_str(), m_ecode);
00511 }
00512 return ssub;
00513 }
00514
00515
00516
00517 blocxx::String
00518 PerlRegEx::replace(const String &str, const String &rep,
00519 bool global, int eflags)
00520 {
00521 if( m_pcre == NULL)
00522 {
00523 BLOCXX_THROW(RegExCompileException,
00524 "Regular expression is not compiled");
00525 }
00526
00527 MatchArray rsub;
00528 bool match;
00529 size_t off = 0;
00530 String out = str;
00531
00532 do
00533 {
00534 match = execute(rsub, out, off, 0, eflags);
00535 if( match)
00536 {
00537 if( rsub.empty() ||
00538 rsub[0].rm_so < 0 ||
00539 rsub[0].rm_eo < 0)
00540 {
00541
00542 BLOCXX_THROW(RegExCompileException,
00543 "Non-capturing regular expression");
00544 }
00545
00546 String res = substitute_caps(rsub, out, rep);
00547
00548 out = out.substring(0, rsub[0].rm_so) +
00549 res + out.substring(rsub[0].rm_eo);
00550
00551 off = rsub[0].rm_so + res.length();
00552 }
00553 else if(m_ecode == PCRE_ERROR_NOMATCH)
00554 {
00555 m_ecode = 0;
00556 m_error.erase();
00557 }
00558 else
00559 {
00560 BLOCXX_THROW_ERR(RegExExecuteException,
00561 errorString().c_str(), m_ecode);
00562 }
00563 } while(global && match && out.length() > off);
00564
00565 return out;
00566 }
00567
00568
00569
00570 StringArray
00571 PerlRegEx::split(const String &str, bool empty, int eflags)
00572 {
00573 if( m_pcre == NULL)
00574 {
00575 BLOCXX_THROW(RegExCompileException,
00576 "Regular expression is not compiled");
00577 }
00578
00579 MatchArray rsub;
00580 StringArray ssub;
00581 bool match;
00582 size_t off = 0;
00583 size_t len = str.length();
00584
00585 do
00586 {
00587 match = execute(rsub, str, off, 0, eflags);
00588 if( match)
00589 {
00590 if( rsub.empty() ||
00591 rsub[0].rm_so < 0 ||
00592 rsub[0].rm_eo < 0)
00593 {
00594 BLOCXX_THROW(RegExCompileException,
00595 "Non-capturing regular expression");
00596 }
00597
00598 if( empty || ((size_t)rsub[0].rm_so > off))
00599 {
00600 ssub.push_back(str.substring(off,
00601 rsub[0].rm_so - off));
00602 }
00603 off = rsub[0].rm_eo;
00604 }
00605 else if(m_ecode == PCRE_ERROR_NOMATCH)
00606 {
00607 String tmp = str.substring(off);
00608 if( empty || !tmp.empty())
00609 {
00610 ssub.push_back(tmp);
00611 }
00612 m_ecode = 0;
00613 m_error.erase();
00614 }
00615 else
00616 {
00617 BLOCXX_THROW_ERR(RegExExecuteException,
00618 errorString().c_str(), m_ecode);
00619 }
00620 } while(match && len > off);
00621
00622 return ssub;
00623 }
00624
00625
00626
00627 StringArray
00628 PerlRegEx::grep(const StringArray &src, int eflags)
00629 {
00630 if( m_pcre == NULL)
00631 {
00632 BLOCXX_THROW(RegExCompileException,
00633 "Regular expression is not compiled");
00634 }
00635
00636 m_ecode = 0;
00637 m_error.erase();
00638
00639 StringArray out;
00640 if( !src.empty())
00641 {
00642 StringArray::const_iterator i=src.begin();
00643 for( ; i != src.end(); ++i)
00644 {
00645 int ret = ::pcre_exec(m_pcre, NULL, i->c_str(),
00646 i->length(), 0, eflags, NULL, 0);
00647 if( ret >= 0)
00648 {
00649 out.push_back(*i);
00650 }
00651 else if( ret != PCRE_ERROR_NOMATCH)
00652 {
00653 m_ecode = ret;
00654 m_error = getError(m_ecode);
00655 BLOCXX_THROW_ERR(RegExExecuteException,
00656 errorString().c_str(), m_ecode);
00657 }
00658 }
00659 }
00660 return out;
00661 }
00662
00663
00664
00665 bool
00666 PerlRegEx::match(const String &str, size_t index, int eflags) const
00667 {
00668 if( m_pcre == NULL)
00669 {
00670 BLOCXX_THROW(RegExCompileException,
00671 "Regular expression is not compiled");
00672 }
00673
00674 if( index > str.length())
00675 {
00676 BLOCXX_THROW(OutOfBoundsException,
00677 Format("String index out of bounds."
00678 "length = %1, index = %2",
00679 str.length(), index
00680 ).c_str());
00681 }
00682
00683 m_ecode = ::pcre_exec(m_pcre, NULL, str.c_str(),
00684 str.length(), 0, eflags, NULL, 0);
00685 if( m_ecode >= 0)
00686 {
00687 m_error.erase();
00688 return true;
00689 }
00690 else if( m_ecode == PCRE_ERROR_NOMATCH)
00691 {
00692 m_error = getError(m_ecode);
00693 return false;
00694 }
00695 else
00696 {
00697 m_error = getError(m_ecode);
00698 BLOCXX_THROW_ERR(RegExExecuteException,
00699 errorString().c_str(), m_ecode);
00700 }
00701 }
00702
00703
00704
00705 }
00706
00707 #endif // BLOCXX_HAVE_PCRE_H
00708 #endif // BLOCXX_HAVE_PCRE
00709
00710
00711