1 /* 2 * Copyright (C) 2001-2004, International Business Machines Corporation 3 * and others. All Rights Reserved. 4 ********************************************************************** 5 * Date Name Description 6 * 07/23/01 aliu Creation. 7 ********************************************************************** 8 */ 9 #ifndef STRMATCH_H 10 #define STRMATCH_H 11 12 #include "unicode/utypes.h" 13 14 #if !UCONFIG_NO_TRANSLITERATION 15 16 #include "unicode/unistr.h" 17 #include "unicode/unifunct.h" 18 #include "unicode/unimatch.h" 19 #include "unicode/unirepl.h" 20 21 U_NAMESPACE_BEGIN 22 23 class TransliterationRuleData; 24 25 /** 26 * An object that matches a fixed input string, implementing the 27 * UnicodeMatcher API. This object also implements the 28 * UnicodeReplacer API, allowing it to emit the matched text as 29 * output. Since the match text may contain flexible match elements, 30 * such as UnicodeSets, the emitted text is not the match pattern, but 31 * instead a substring of the actual matched text. Following 32 * convention, the output text is the leftmost match seen up to this 33 * point. 34 * 35 * A StringMatcher may represent a segment, in which case it has a 36 * positive segment number. This affects how the matcher converts 37 * itself to a pattern but does not otherwise affect its function. 38 * 39 * A StringMatcher that is not a segment should not be used as a 40 * UnicodeReplacer. 41 */ 42 class StringMatcher : public UnicodeFunctor, public UnicodeMatcher, public UnicodeReplacer { 43 44 public: 45 46 /** 47 * Construct a matcher that matches the given pattern string. 48 * @param string the pattern to be matched, possibly containing 49 * stand-ins that represent nested UnicodeMatcher objects. 50 * @param start inclusive start index of text to be replaced 51 * @param limit exclusive end index of text to be replaced; 52 * must be greater than or equal to start 53 * @param segmentNum the segment number from 1..n, or 0 if this is 54 * not a segment. 55 * @param data context object mapping stand-ins to 56 * UnicodeMatcher objects. 57 */ 58 StringMatcher(const UnicodeString& string, 59 int32_t start, 60 int32_t limit, 61 int32_t segmentNum, 62 const TransliterationRuleData& data); 63 64 /** 65 * Copy constructor 66 * @param o the object to be copied. 67 */ 68 StringMatcher(const StringMatcher& o); 69 70 /** 71 * Destructor 72 */ 73 virtual ~StringMatcher(); 74 75 /** 76 * Implement UnicodeFunctor 77 * @return a copy of the object. 78 */ 79 virtual UnicodeFunctor* clone() const; 80 81 /** 82 * UnicodeFunctor API. Cast 'this' to a UnicodeMatcher* pointer 83 * and return the pointer. 84 * @return the UnicodeMatcher point. 85 */ 86 virtual UnicodeMatcher* toMatcher() const; 87 88 /** 89 * UnicodeFunctor API. Cast 'this' to a UnicodeReplacer* pointer 90 * and return the pointer. 91 * @return the UnicodeReplacer pointer. 92 */ 93 virtual UnicodeReplacer* toReplacer() const; 94 95 /** 96 * Implement UnicodeMatcher 97 * @param text the text to be matched 98 * @param offset on input, the index into text at which to begin 99 * matching. On output, the limit of the matched text. The 100 * number of matched characters is the output value of offset 101 * minus the input value. Offset should always point to the 102 * HIGH SURROGATE (leading code unit) of a pair of surrogates, 103 * both on entry and upon return. 104 * @param limit the limit index of text to be matched. Greater 105 * than offset for a forward direction match, less than offset for 106 * a backward direction match. The last character to be 107 * considered for matching will be text.charAt(limit-1) in the 108 * forward direction or text.charAt(limit+1) in the backward 109 * direction. 110 * @param incremental if TRUE, then assume further characters may 111 * be inserted at limit and check for partial matching. Otherwise 112 * assume the text as given is complete. 113 * @return a match degree value indicating a full match, a partial 114 * match, or a mismatch. If incremental is FALSE then 115 * U_PARTIAL_MATCH should never be returned. 116 */ 117 virtual UMatchDegree matches(const Replaceable& text, 118 int32_t& offset, 119 int32_t limit, 120 UBool incremental); 121 122 /** 123 * Implement UnicodeMatcher 124 * @param result Output param to receive the pattern. 125 * @param escapeUnprintable if True then escape the unprintable characters. 126 * @return A reference to 'result'. 127 */ 128 virtual UnicodeString& toPattern(UnicodeString& result, 129 UBool escapeUnprintable = FALSE) const; 130 131 /** 132 * Implement UnicodeMatcher 133 * Returns TRUE if this matcher will match a character c, where c 134 * & 0xFF == v, at offset, in the forward direction (with limit > 135 * offset). This is used by <tt>RuleBasedTransliterator</tt> for 136 * indexing. 137 * @param v the given value 138 * @return TRUE if this matcher will match a character c, 139 * where c & 0xFF == v 140 */ 141 virtual UBool matchesIndexValue(uint8_t v) const; 142 143 /** 144 * Implement UnicodeMatcher 145 */ 146 virtual void addMatchSetTo(UnicodeSet& toUnionTo) const; 147 148 /** 149 * Implement UnicodeFunctor 150 */ 151 virtual void setData(const TransliterationRuleData*); 152 153 /** 154 * Replace characters in 'text' from 'start' to 'limit' with the 155 * output text of this object. Update the 'cursor' parameter to 156 * give the cursor position and return the length of the 157 * replacement text. 158 * 159 * @param text the text to be matched 160 * @param start inclusive start index of text to be replaced 161 * @param limit exclusive end index of text to be replaced; 162 * must be greater than or equal to start 163 * @param cursor output parameter for the cursor position. 164 * Not all replacer objects will update this, but in a complete 165 * tree of replacer objects, representing the entire output side 166 * of a transliteration rule, at least one must update it. 167 * @return the number of 16-bit code units in the text replacing 168 * the characters at offsets start..(limit-1) in text 169 */ 170 virtual int32_t replace(Replaceable& text, 171 int32_t start, 172 int32_t limit, 173 int32_t& cursor); 174 175 /** 176 * Returns a string representation of this replacer. If the 177 * result of calling this function is passed to the appropriate 178 * parser, typically TransliteratorParser, it will produce another 179 * replacer that is equal to this one. 180 * @param result the string to receive the pattern. Previous 181 * contents will be deleted. 182 * @param escapeUnprintable if TRUE then convert unprintable 183 * character to their hex escape representations, \\uxxxx or 184 * \\Uxxxxxxxx. Unprintable characters are defined by 185 * Utility.isUnprintable(). 186 * @return a reference to 'result'. 187 */ 188 virtual UnicodeString& toReplacerPattern(UnicodeString& result, 189 UBool escapeUnprintable) const; 190 191 /** 192 * Remove any match data. This must be called before performing a 193 * set of matches with this segment. 194 */ 195 void resetMatch(); 196 197 /** 198 * ICU "poor man's RTTI", returns a UClassID for the actual class. 199 * 200 * @draft ICU 2.2 201 */ 202 virtual UClassID getDynamicClassID() const; 203 204 /** 205 * ICU "poor man's RTTI", returns a UClassID for this class. 206 * 207 * @draft ICU 2.2 208 */ 209 static UClassID U_EXPORT2 getStaticClassID(); 210 211 /** 212 * Union the set of all characters that may output by this object 213 * into the given set. 214 * @param toUnionTo the set into which to union the output characters 215 */ 216 virtual void addReplacementSetTo(UnicodeSet& toUnionTo) const; 217 218 private: 219 220 /** 221 * The text to be matched. 222 */ 223 UnicodeString pattern; 224 225 /** 226 * Context object that maps stand-ins to matcher and replacer 227 * objects. 228 */ 229 const TransliterationRuleData* data; 230 231 /** 232 * The segment number, 1-based, or 0 if not a segment. 233 */ 234 int32_t segmentNumber; 235 236 /** 237 * Start offset, in the match text, of the <em>rightmost</em> 238 * match. 239 */ 240 int32_t matchStart; 241 242 /** 243 * Limit offset, in the match text, of the <em>rightmost</em> 244 * match. 245 */ 246 int32_t matchLimit; 247 248 }; 249 250 U_NAMESPACE_END 251 252 #endif /* #if !UCONFIG_NO_TRANSLITERATION */ 253 254 #endif 255