// Copyright (C) 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
 * Copyright (C) 2001-2011, International Business Machines Corporation
 * and others. All Rights Reserved.
 **********************************************************************
 *   Date        Name        Description
 *   07/23/01    aliu        Creation.
 **********************************************************************
 */
#ifndef STRMATCH_H
#define STRMATCH_H

#include "unicode/utypes.h"

#if !UCONFIG_NO_TRANSLITERATION

#include "unicode/unistr.h"
#include "unicode/unifunct.h"
#include "unicode/unimatch.h"
#include "unicode/unirepl.h"

U_NAMESPACE_BEGIN

class TransliterationRuleData;

/**
 * An object that matches a fixed input string, implementing the
 * UnicodeMatcher API.  This object also implements the
 * UnicodeReplacer API, allowing it to emit the matched text as
 * output.  Since the match text may contain flexible match elements,
 * such as UnicodeSets, the emitted text is not the match pattern, but
 * instead a substring of the actual matched text.  Following
 * convention, the output text is the leftmost match seen up to this
 * point.
 *
 * A StringMatcher may represent a segment, in which case it has a
 * positive segment number.  This affects how the matcher converts
 * itself to a pattern but does not otherwise affect its function.
 *
 * A StringMatcher that is not a segment should not be used as a
 * UnicodeReplacer.
 */
class StringMatcher : public UnicodeFunctor, public UnicodeMatcher, public UnicodeReplacer {

 public:

    /**
     * Construct a matcher that matches the given pattern string.
     * @param string the pattern to be matched, possibly containing
     * stand-ins that represent nested UnicodeMatcher objects.
     * @param start inclusive start index of text to be replaced
     * @param limit exclusive end index of text to be replaced;
     * must be greater than or equal to start
     * @param segmentNum the segment number from 1..n, or 0 if this is
     * not a segment.
     * @param data context object mapping stand-ins to
     * UnicodeMatcher objects.
     *
     * NOTE(review): the wording for 'start' and 'limit' is the same as
     * in replace(); for this constructor they presumably delimit the
     * part of 'string' that is used as the pattern -- confirm against
     * the implementation.
     */
    StringMatcher(const UnicodeString& string,
                  int32_t start,
                  int32_t limit,
                  int32_t segmentNum,
                  const TransliterationRuleData& data);

    /**
     * Copy constructor
     * @param o  the object to be copied.
     */
    StringMatcher(const StringMatcher& o);

    /**
     * Destructor
     */
    virtual ~StringMatcher();

    /**
     * Implement UnicodeFunctor
     * @return a copy of the object.
     */
    virtual UnicodeFunctor* clone() const;

    /**
     * UnicodeFunctor API.  Cast 'this' to a UnicodeMatcher* pointer
     * and return the pointer.
     * @return the UnicodeMatcher pointer.
     */
    virtual UnicodeMatcher* toMatcher() const;

    /**
     * UnicodeFunctor API.  Cast 'this' to a UnicodeReplacer* pointer
     * and return the pointer.
     * @return the UnicodeReplacer pointer.
     */
    virtual UnicodeReplacer* toReplacer() const;

    /**
     * Implement UnicodeMatcher
     * @param text the text to be matched
     * @param offset on input, the index into text at which to begin
     * matching.  On output, the limit of the matched text.  The
     * number of matched characters is the output value of offset
     * minus the input value.  Offset should always point to the
     * HIGH SURROGATE (leading code unit) of a pair of surrogates,
     * both on entry and upon return.
     * @param limit the limit index of text to be matched.  Greater
     * than offset for a forward direction match, less than offset for
     * a backward direction match.  The last character to be
     * considered for matching will be text.charAt(limit-1) in the
     * forward direction or text.charAt(limit+1) in the backward
     * direction.
     * @param incremental if TRUE, then assume further characters may
     * be inserted at limit and check for partial matching.  Otherwise
     * assume the text as given is complete.
     * @return a match degree value indicating a full match, a partial
     * match, or a mismatch.  If incremental is FALSE then
     * U_PARTIAL_MATCH should never be returned.
     */
    virtual UMatchDegree matches(const Replaceable& text,
                                 int32_t& offset,
                                 int32_t limit,
                                 UBool incremental);

    /**
     * Implement UnicodeMatcher
     * @param result            Output param to receive the pattern.
     * @param escapeUnprintable if TRUE then escape the unprintable
     *                          characters.  Defaults to FALSE.
     * @return                  A reference to 'result'.
     */
    virtual UnicodeString& toPattern(UnicodeString& result,
                                     UBool escapeUnprintable = FALSE) const;

    /**
     * Implement UnicodeMatcher
     * Returns TRUE if this matcher will match a character c, where c
     * & 0xFF == v, at offset, in the forward direction (with limit >
     * offset).  This is used by <tt>RuleBasedTransliterator</tt> for
     * indexing.
     * @param v    the given value
     * @return     TRUE if this matcher will match a character c,
     *             where c & 0xFF == v
     */
    virtual UBool matchesIndexValue(uint8_t v) const;

    /**
     * Implement UnicodeMatcher
     * @param toUnionTo the set that receives the result.
     *
     * NOTE(review): presumably the matching-side counterpart of
     * addReplacementSetTo(), i.e. it unions the characters this
     * object may match into 'toUnionTo' -- confirm against the
     * UnicodeMatcher documentation.
     */
    virtual void addMatchSetTo(UnicodeSet& toUnionTo) const;

    /**
     * Implement UnicodeFunctor
     *
     * NOTE(review): presumably replaces the context object held in
     * the 'data' member; the implementation is not visible from this
     * header -- confirm.
     */
    virtual void setData(const TransliterationRuleData*);

    /**
     * Replace characters in 'text' from 'start' to 'limit' with the
     * output text of this object.  Update the 'cursor' parameter to
     * give the cursor position and return the length of the
     * replacement text.
     *
     * @param text the text in which the replacement is made
     * @param start inclusive start index of text to be replaced
     * @param limit exclusive end index of text to be replaced;
     * must be greater than or equal to start
     * @param cursor output parameter for the cursor position.
     * Not all replacer objects will update this, but in a complete
     * tree of replacer objects, representing the entire output side
     * of a transliteration rule, at least one must update it.
     * @return the number of 16-bit code units in the text replacing
     * the characters at offsets start..(limit-1) in text
     */
    virtual int32_t replace(Replaceable& text,
                            int32_t start,
                            int32_t limit,
                            int32_t& cursor);

    /**
     * Returns a string representation of this replacer.  If the
     * result of calling this function is passed to the appropriate
     * parser, typically TransliteratorParser, it will produce another
     * replacer that is equal to this one.
     * @param result the string to receive the pattern.  Previous
     * contents will be deleted.
     * @param escapeUnprintable if TRUE then convert unprintable
     * character to their hex escape representations, \\uxxxx or
     * \\Uxxxxxxxx.  Unprintable characters are defined by
     * Utility.isUnprintable().
     * @return a reference to 'result'.
     */
    virtual UnicodeString& toReplacerPattern(UnicodeString& result,
                                             UBool escapeUnprintable) const;

    /**
     * Remove any match data.  This must be called before performing a
     * set of matches with this segment.
     */
    void resetMatch();

    /**
     * ICU "poor man's RTTI", returns a UClassID for the actual class.
     */
    virtual UClassID getDynamicClassID() const;

    /**
     * ICU "poor man's RTTI", returns a UClassID for this class.
     */
    static UClassID U_EXPORT2 getStaticClassID();

    /**
     * Union the set of all characters that may be output by this
     * object into the given set.
     * @param toUnionTo the set into which to union the output characters
     */
    virtual void addReplacementSetTo(UnicodeSet& toUnionTo) const;

 private:

    /**
     * The text to be matched.
     */
    UnicodeString pattern;

    /**
     * Context object that maps stand-ins to matcher and replacer
     * objects.
     *
     * NOTE(review): held as a raw pointer; who owns the pointee is
     * not visible from this header -- confirm before changing copy
     * or destruction behavior.
     */
    const TransliterationRuleData* data;

    /**
     * The segment number, 1-based, or 0 if not a segment.
     */
    int32_t segmentNumber;

    /**
     * Start offset, in the match text, of the <em>rightmost</em>
     * match.
     *
     * NOTE(review): the class-level comment describes the output text
     * as the leftmost match, while this field and matchLimit are
     * documented as the rightmost match -- confirm which one holds
     * against the implementation of matches().
     */
    int32_t matchStart;

    /**
     * Limit offset, in the match text, of the <em>rightmost</em>
     * match.
     */
    int32_t matchLimit;

};

U_NAMESPACE_END

#endif /* #if !UCONFIG_NO_TRANSLITERATION */

#endif /* STRMATCH_H */