Main Page   Class Hierarchy   Alphabetical List   Compound List   File List   Compound Members   File Members   Search  

regex.h

Go to the documentation of this file.
00001 /*
00002 **********************************************************************
00003 *   Copyright (C) 2002-2003, International Business Machines
00004 *   Corporation and others.  All Rights Reserved.
00005 **********************************************************************
00006 *   file name:  regex.h
00007 *   encoding:   US-ASCII
00008 *   indentation:4
00009 *
00010 *   created on: 2002oct22
00011 *   created by: Andy Heninger
00012 *
00013 *   ICU Regular Expressions, API for C++
00014 */
00015 
00016 #ifndef REGEX_H
00017 #define REGEX_H
00018 
00019 
00039 #include "unicode/utypes.h"
00040 
00041 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
00042 
00043 #include "unicode/uobject.h"
00044 #include "unicode/unistr.h"
00045 #include "unicode/parseerr.h"
00046 
00047 U_NAMESPACE_BEGIN
00048 
00049 
00050 // Forward declarations.  RegexMatcher is defined later in this header; the other types are used below only through pointers.
00051 
00052 class RegexMatcher;              // Returned by pointer from RegexPattern::matcher(); also a friend of RegexPattern.
00053 class UVector;                   // Pointed to by RegexPattern::fSets.
00054 class UVector32;                 // Pointed to by RegexPattern::fCompiledPat / fGroupMap and RegexMatcher::fStack.
00055 class UnicodeSet;                // Pointed to by RegexPattern::fStaticSets / fInitialChars.
00056 struct REStackFrame;             // Pointed to by RegexMatcher::fFrame; returned by resetStack() / StateSave().
00057 struct Regex8BitSet;             // Pointed to by RegexPattern::fSets8 / fStaticSets8 / fInitialChars8.
00058 class  RuleBasedBreakIterator;   // Pointed to by RegexMatcher::fWordBreakItr.
00059 
00060 
00061 
00066 enum {  // Regex option constants.  Every value is a distinct single bit; presumably they are OR-ed together and passed as the uint32_t `flags` arguments below -- confirm.
00068     UREGEX_CANON_EQ         = 128,   // Bit 7.  By name: canonical-equivalence matching -- confirm.
00069 
00071     UREGEX_CASE_INSENSITIVE = 2,     // Bit 1.  By name: case-insensitive matching.
00072 
00074     UREGEX_COMMENTS         = 4,     // Bit 2.  By name: permits comments inside the pattern text -- confirm.
00075 
00078     UREGEX_DOTALL           = 32,    // Bit 5.  By name: "dot matches all" mode -- confirm.
00079 
00084     UREGEX_MULTILINE        = 8,     // Bit 3.  By name: multi-line mode -- confirm.
00085 
00093     UREGEX_UWORD            = 256    // Bit 8.  Presumably selects the RBBI based \b test (see RegexMatcher::isUWordBoundary) -- confirm.
00094 };  // Note: the values 1, 16 and 64 are not assigned by this enum.
00095 
00096 
00097 
00109 class U_I18N_API RegexPattern: public UObject {  // A regular expression in compiled form: the pattern text, its flags, and the compiled data listed under "Implementation Data".
00110 public:
00111 
00119     RegexPattern();  // Default constructor.
00120 
00126     RegexPattern(const RegexPattern &source);  // Copy constructor.
00127 
00133     virtual ~RegexPattern();  // Virtual destructor.
00134 
00143     UBool           operator==(const RegexPattern& that) const;  // Equality test against another RegexPattern.
00144 
00153     inline UBool    operator!=(const RegexPattern& that) const {return ! operator ==(that);};  // Exact negation of operator==.  NOTE(review): the ';' after the inline body is redundant.
00154 
00160     RegexPattern  &operator =(const RegexPattern &source);  // Assignment from another RegexPattern.
00161 
00169     virtual RegexPattern  *clone() const;  // Returns a RegexPattern pointer; presumably a new copy owned by the caller -- confirm.
00170 
00171 
00192     static RegexPattern *compile( const UnicodeString &regex,  // Static factory taking the pattern text; this overload has no flags argument.  Presumably the caller owns the result -- confirm.
00193         UParseError          &pe,      // Output parameter; presumably receives the position of a pattern syntax error -- confirm.
00194         UErrorCode           &status); // Error code, passed by reference so the call can report failure.
00195 
00216     static RegexPattern *compile( const UnicodeString &regex,  // Same factory, with option flags and parse-error output.
00217         uint32_t             flags,    // Option bits; presumably a combination of the UREGEX_* constants -- confirm.
00218         UParseError          &pe,      // Output parameter; presumably receives the position of a pattern syntax error -- confirm.
00219         UErrorCode           &status); // Error code, passed by reference.
00220 
00221 
00240     static RegexPattern *compile( const UnicodeString &regex,  // Same factory, with option flags but without a UParseError output.
00241         uint32_t             flags,    // Option bits; presumably a combination of the UREGEX_* constants -- confirm.
00242         UErrorCode           &status); // Error code, passed by reference.
00243 
00244 
00250     virtual uint32_t flags() const;  // Returns option flags; presumably those used when compiling this pattern (fFlags) -- confirm.
00251 
00264     virtual RegexMatcher *matcher(const UnicodeString &input,  // Returns a pointer to a RegexMatcher for this pattern and `input`; presumably owned by the caller -- confirm.
00265         UErrorCode          &status) const;  // Error code, passed by reference.
00266 
00267 
00279     virtual RegexMatcher *matcher(UErrorCode  &status) const;  // Overload with no input string; presumably input is supplied later via RegexMatcher::reset(input) -- confirm.
00280 
00281 
00296     static UBool matches(const UnicodeString   &regex,  // One-call form taking pattern text and input; presumably compiles `regex` and tests `input` against it -- confirm exact match rule.
00297         const UnicodeString   &input,  // Text to test.
00298         UParseError     &pe,           // Output parameter; presumably receives the position of a pattern syntax error -- confirm.
00299         UErrorCode      &status);      // Error code, passed by reference.
00300 
00301 
00306     virtual UnicodeString pattern() const;  // Returns a UnicodeString by value; presumably the original pattern text (fPattern) -- confirm.
00307 
00308 
00334     virtual int32_t  split(const UnicodeString &input,  // Presumably splits `input` around matches of this pattern and returns the number of pieces -- confirm.
00335         UnicodeString    dest[],        // Caller-supplied array that receives the results.
00336         int32_t          destCapacity,  // By name: number of elements available in dest[].
00337         UErrorCode       &status) const;  // Error code, passed by reference.
00338 
00339 
00340 
00345     void dump() const;  // Non-virtual.  Presumably a debugging aid that prints the compiled pattern (see dumpOp) -- confirm.
00346 
00352     virtual UClassID getDynamicClassID() const;  // Virtual class-ID query; by name, the ID of this object's actual class.
00353 
00359     static UClassID getStaticClassID();  // Static class-ID query; by name, the ID for RegexPattern itself.
00360 
00361 private:
00362     //
00363     //  Implementation Data
00364     //
00365     UnicodeString   fPattern;      // The original pattern string.
00366     uint32_t        fFlags;        // The flags used when compiling the pattern.
00367                                    //
00368     UVector32       *fCompiledPat; // The compiled pattern p-code.
00369     UnicodeString   fLiteralText;  // Any literal string data from the pattern,
00370                                    //   after un-escaping, for use during the match.
00371 
00372     UVector         *fSets;        // Any UnicodeSets referenced from the pattern.
00373     Regex8BitSet    *fSets8;       //      (and fast sets for latin-1 range.)
00374 
00375 
00376     UErrorCode      fDeferredStatus; // status if some prior error has left this
00377                                    //  RegexPattern in an unusable state.
00378 
00379     int32_t         fMinMatchLen;  // Minimum Match Length.  All matches will have length
00380                                    //   >= this value.  For some patterns, this calculated
00381                                    //   value may be less than the true shortest
00382                                    //   possible match.
00383 
00384     int32_t         fFrameSize;    // Size of a state stack frame in the
00385                                    //   execution engine.
00386 
00387     int32_t         fDataSize;     // The size of the data needed by the pattern that
00388                                    //   does not go on the state stack, but has just
00389                                    //   a single copy per matcher.
00390 
00391     UVector32       *fGroupMap;    // Map from capture group number to position of
00392                                    //   the group's variables in the matcher stack frame.
00393 
00394     int32_t         fMaxCaptureDigits;  // NOTE(review): undocumented; by name, max digits in a capture group number -- confirm.
00395 
00396     UnicodeSet     **fStaticSets;  // Ptr to static (shared) sets for predefined
00397                                    //   regex character classes, e.g. Word.
00398 
00399     Regex8BitSet   *fStaticSets8;  // Ptr to the static (shared) latin-1 only
00400                                    //  sets for predefined regex classes.
00401 
00402     int32_t         fStartType;    // Info on how a match must start.
00403     int32_t         fInitialStringIdx;     //  Presumably index of a required initial string (in fLiteralText?) -- confirm.
00404     int32_t         fInitialStringLen;     //  Presumably the length of that initial string -- confirm.
00405     UnicodeSet     *fInitialChars;         //  Presumably the set of characters a match may start with -- confirm.
00406     UChar32         fInitialChar;          //  Presumably a single required first character -- confirm.
00407     Regex8BitSet   *fInitialChars8;        //  Presumably the latin-1 counterpart of fInitialChars -- confirm.
00408 
00409     friend class RegexCompile;     // Friend: has direct access to the private members of this class.
00410     friend class RegexMatcher;     // Friend: has direct access to the private members of this class.
00411 
00412     //
00413     //  Implementation Methods
00414     //
00415     void        init();            // Common initialization, for use by constructors.
00416     void        zap();             // Common cleanup
00417     void        dumpOp(int32_t index) const;  // Presumably dumps the single compiled-pattern op at `index`; helper for dump() -- confirm.
00418 
00419 
00420 };
00421 
00422 
00423 
00424 
00425 
00426 
00427 
00428 
00429 
00439 class U_I18N_API RegexMatcher: public UObject {  // Runs a RegexPattern against an input UnicodeString and records the state of the most recent match (see private members).
00440 public:
00441 
00456     RegexMatcher(const UnicodeString &regexp, uint32_t flags, UErrorCode &status);  // Construct from pattern text and option flags; no input string is supplied.  Presumably the matcher owns the resulting pattern (see fPatternOwned) -- confirm.
00457 
00473     RegexMatcher(const UnicodeString &regexp, const UnicodeString &input,  // Construct from pattern text plus the input string to match against.
00474         uint32_t flags, UErrorCode &status);  // Option bits and error code (passed by reference).
00475 
00476 
00482     virtual ~RegexMatcher();  // Virtual destructor.
00483 
00484 
00491     virtual UBool matches(UErrorCode &status);  // Presumably TRUE only when the whole input matches the pattern -- confirm.
00492 
00501     virtual UBool matches(int32_t startIndex, UErrorCode &status);  // Overload taking a starting index into the input.
00502 
00503 
00504 
00505 
00518     virtual UBool lookingAt(UErrorCode &status);  // Presumably a match attempt anchored at the start of input that need not consume all of it -- confirm.
00519 
00520 
00534     virtual UBool lookingAt(int32_t startIndex, UErrorCode &status);  // Overload taking a starting index into the input.
00535 
00548     virtual UBool find();  // Presumably finds the next match.  Unlike the other matching calls it takes no UErrorCode; errors presumably go to fDeferredStatus -- confirm.
00549 
00550 
00560     virtual UBool find(int32_t start, UErrorCode &status);  // Overload taking a starting index and an error code.
00561 
00562 
00572     virtual UnicodeString group(UErrorCode &status) const;  // Returns a UnicodeString by value; presumably the text of the most recent match -- confirm.
00573 
00574 
00587     virtual UnicodeString group(int32_t groupNum, UErrorCode &status) const;  // As above for one capture group, selected by number.
00588 
00589 
00595     virtual int32_t groupCount() const;  // By name: the number of capture groups -- confirm.
00596 
00597 
00605     virtual int32_t start(UErrorCode &status) const;  // Presumably the start position of the most recent match (fMatchStart) -- confirm.
00606 
00607 
00621     virtual int32_t start(int group, UErrorCode &status) const;  // Per-group form.  Note: `group` is a plain int here, whereas group() takes int32_t groupNum.
00622 
00623 
00633     virtual int32_t end(UErrorCode &status) const;  // Presumably the first position after the most recent match (fMatchEnd) -- confirm.
00634 
00635 
00649     virtual int32_t end(int group, UErrorCode &status) const;  // Per-group form.  Note: `group` is a plain int here, whereas group() takes int32_t groupNum.
00650 
00651 
00661     virtual UBool touchedEnd();  // Presumably reports fTouchedEnd: whether the match engine reached the end of input -- confirm.
00662 
00663 
00672     virtual RegexMatcher &reset();  // Returns a RegexMatcher reference (presumably *this); presumably clears the match state -- confirm.
00673 
00674 
00684     virtual RegexMatcher &reset(int32_t index, UErrorCode &status);  // Overload taking an input index; presumably the position to resume from -- confirm.
00685 
00686 
00694     virtual RegexMatcher &reset(const UnicodeString &input);  // Overload taking a new input string.  fInput is a pointer, so `input` presumably must outlive its use here -- confirm.
00695 
00696 
00703     virtual const UnicodeString &input() const;  // Returns a const reference to a UnicodeString; presumably the current input (*fInput) -- confirm.
00704 
00705 
00711     virtual const RegexPattern &pattern() const;  // Returns a const reference to a RegexPattern; presumably the pattern in use (*fPattern) -- confirm.
00712 
00713 
00730     virtual UnicodeString replaceAll(const UnicodeString &replacement, UErrorCode &status);  // Returns a new UnicodeString by value; by name, every match replaced -- confirm.
00731 
00732 
00753     virtual UnicodeString replaceFirst(const UnicodeString &replacement, UErrorCode &status);  // Returns a new UnicodeString by value; by name, only the first match replaced -- confirm.
00754 
00782     virtual RegexMatcher &appendReplacement(UnicodeString &dest,  // Appends to the caller's `dest` string; returns a RegexMatcher reference (presumably *this).
00783         const UnicodeString &replacement, UErrorCode &status);  // Replacement text and error code (passed by reference).
00784 
00785 
00796     virtual UnicodeString &appendTail(UnicodeString &dest);  // Presumably appends the input remaining after the last match to `dest` and returns `dest` -- confirm.
00797 
00798 
00799 
00824     virtual int32_t  split(const UnicodeString &input,  // Same parameter list as RegexPattern::split, but non-const here.
00825         UnicodeString    dest[],        // Caller-supplied array that receives the results.
00826         int32_t          destCapacity,  // By name: number of elements available in dest[].
00827         UErrorCode       &status);      // Error code, passed by reference.
00828 
00829 
00830 
00836     void setTrace(UBool state);  // Non-virtual.  Presumably sets fTraceDebug (debug tracing of the match engine) -- confirm.
00837 
00838 
00844     static UClassID getStaticClassID();  // Static class-ID query; by name, the ID for RegexMatcher itself.
00845 
00851     virtual UClassID getDynamicClassID() const;  // Virtual class-ID query; by name, the ID of this object's actual class.
00852 
00853 private:
00854     // Constructors and other object boilerplate are private.
00855     // Instances of RegexMatcher cannot be assigned, copied, cloned, etc.
00856     RegexMatcher(); // default constructor not implemented
00857     RegexMatcher(const RegexPattern *pat);  // Private; reachable from RegexPattern, which is declared a friend below.
00858     RegexMatcher(const RegexMatcher &other);  // Private copy constructor: copying is not available to clients.
00859     RegexMatcher &operator =(const RegexMatcher &rhs);  // Private assignment operator: assignment is not available to clients.
00860     friend class RegexPattern;  // Friend: has direct access to the private members of this class.
00861 
00862 
00863     //
00864     //  MatchAt   This is the internal interface to the match engine itself.
00865     //            Match status comes back in matcher member variables.
00866     //
00867     void                 MatchAt(int32_t startIdx, UErrorCode &status);
00868     inline void          backTrack(int32_t &inputIdx, int32_t &patIdx);  // Takes both indices by reference; presumably restores them from saved state -- confirm.
00869     UBool                isWordBoundary(int32_t pos);         // perform Perl-like  \b test
00870     UBool                isUWordBoundary(int32_t pos);        // perform RBBI based \b test
00871     REStackFrame        *resetStack();  // Returns a stack frame pointer; presumably the initial frame of a cleared fStack -- confirm.
00872     inline REStackFrame *StateSave(REStackFrame *fp, int32_t savePatIdx,  // Presumably pushes a saved state for later backtracking and returns the new frame -- confirm.
00873                                    int32_t frameSize, UErrorCode &status);
00874 
00875 
00876     const RegexPattern  *fPattern;         // The pattern in use (read-only through this pointer).
00877     RegexPattern        *fPatternOwned;    // Non-NULL if this matcher owns the pattern, and
00878                                            //   should delete it when through.
00879     const UnicodeString *fInput;           // The input text; held by pointer rather than as a copy.
00880 
00881     UBool                fMatch;           // True if the last match was successful.
00882     int32_t              fMatchStart;      // Position of the start of the most recent match
00883     int32_t              fMatchEnd;        // First position after the end of the most recent match
00884     int32_t              fLastMatchEnd;    // First position after the end of the previous match.
00885 
00886     UVector32           *fStack;           // Presumably the match engine's state stack, holding REStackFrames -- confirm.
00887     REStackFrame        *fFrame;           // After finding a match, the last active stack
00888                                            //   frame, which will contain the capture group results.
00889                                            //   NOT valid while match engine is running.
00890 
00891     int32_t             *fData;            // Data area for use by the compiled pattern.
00892     int32_t             fSmallData[8];     //   Use this for data if it's enough.
00893 
00894     UBool               fTraceDebug;       // Set true for debug tracing of match engine.
00895 
00896     UErrorCode          fDeferredStatus;   // Save error state if that cannot be immediately
00897                                            //   reported, or that permanently disables this matcher.
00898 
00899     UBool               fTouchedEnd;       // Set true if match engine reaches eof on input
00900                                            //   while attempting a match.
00901 
00902     RuleBasedBreakIterator  *fWordBreakItr;  // Presumably the break iterator behind isUWordBoundary() -- confirm.
00903 
00904 };
00905 
00906 U_NAMESPACE_END
00907 #endif  // UCONFIG_NO_REGULAR_EXPRESSIONS
00908 #endif

Generated on Mon Nov 24 14:35:35 2003 for ICU 2.8 by doxygen 1.2.11.1, written by Dimitri van Heesch, © 1997-2001