• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 **********************************************************************
3 *   Copyright (c) 2001-2004, International Business Machines Corporation
4 *   and others.  All Rights Reserved.
5 **********************************************************************
6 *   Date        Name        Description
7 *   07/23/01    aliu        Creation.
8 **********************************************************************
9 */
10 
11 #include "unicode/utypes.h"
12 
13 #if !UCONFIG_NO_TRANSLITERATION
14 
15 #include "strmatch.h"
16 #include "rbt_data.h"
17 #include "util.h"
18 #include "unicode/uniset.h"
19 
20 U_NAMESPACE_BEGIN
21 
22 static const UChar EMPTY[] = { 0 }; // empty string: ""
23 
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringMatcher)24 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringMatcher)
25 
26 StringMatcher::StringMatcher(const UnicodeString& theString,
27                              int32_t start,
28                              int32_t limit,
29                              int32_t segmentNum,
30                              const TransliterationRuleData& theData) :
31     data(&theData),
32     segmentNumber(segmentNum),
33     matchStart(-1),
34     matchLimit(-1)
35 {
36     theString.extractBetween(start, limit, pattern);
37 }
38 
/**
 * Copy constructor.  Copies the pattern, the segment number and the recorded
 * match range.  The rule-data pointer is copied as a pointer, so the new
 * object refers to the same TransliterationRuleData as 'o'.
 */
StringMatcher::StringMatcher(const StringMatcher& o)
    : UnicodeFunctor(o),
      UnicodeMatcher(o),
      UnicodeReplacer(o),
      pattern(o.pattern),
      data(o.data),
      segmentNumber(o.segmentNumber),
      matchStart(o.matchStart),
      matchLimit(o.matchLimit)
{
}
50 
51 /**
52  * Destructor
53  */
~StringMatcher()54 StringMatcher::~StringMatcher() {
55 }
56 
57 /**
58  * Implement UnicodeFunctor
59  */
clone() const60 UnicodeFunctor* StringMatcher::clone() const {
61     return new StringMatcher(*this);
62 }
63 
64 /**
65  * UnicodeFunctor API.  Cast 'this' to a UnicodeMatcher* pointer
66  * and return the pointer.
67  */
toMatcher() const68 UnicodeMatcher* StringMatcher::toMatcher() const {
69     return (UnicodeMatcher*) this;
70 }
71 
72 /**
73  * UnicodeFunctor API.  Cast 'this' to a UnicodeReplacer* pointer
74  * and return the pointer.
75  */
toReplacer() const76 UnicodeReplacer* StringMatcher::toReplacer() const {
77     return (UnicodeReplacer*) this;
78 }
79 
80 /**
81  * Implement UnicodeMatcher
82  */
matches(const Replaceable & text,int32_t & offset,int32_t limit,UBool incremental)83 UMatchDegree StringMatcher::matches(const Replaceable& text,
84                                     int32_t& offset,
85                                     int32_t limit,
86                                     UBool incremental) {
87     int32_t i;
88     int32_t cursor = offset;
89     if (limit < cursor) {
90         // Match in the reverse direction
91         for (i=pattern.length()-1; i>=0; --i) {
92             UChar keyChar = pattern.charAt(i);
93             UnicodeMatcher* subm = data->lookupMatcher(keyChar);
94             if (subm == 0) {
95                 if (cursor > limit &&
96                     keyChar == text.charAt(cursor)) {
97                     --cursor;
98                 } else {
99                     return U_MISMATCH;
100                 }
101             } else {
102                 UMatchDegree m =
103                     subm->matches(text, cursor, limit, incremental);
104                 if (m != U_MATCH) {
105                     return m;
106                 }
107             }
108         }
109         // Record the match position, but adjust for a normal
110         // forward start, limit, and only if a prior match does not
111         // exist -- we want the rightmost match.
112         if (matchStart < 0) {
113             matchStart = cursor+1;
114             matchLimit = offset+1;
115         }
116     } else {
117         for (i=0; i<pattern.length(); ++i) {
118             if (incremental && cursor == limit) {
119                 // We've reached the context limit without a mismatch and
120                 // without completing our match.
121                 return U_PARTIAL_MATCH;
122             }
123             UChar keyChar = pattern.charAt(i);
124             UnicodeMatcher* subm = data->lookupMatcher(keyChar);
125             if (subm == 0) {
126                 // Don't need the cursor < limit check if
127                 // incremental is TRUE (because it's done above); do need
128                 // it otherwise.
129                 if (cursor < limit &&
130                     keyChar == text.charAt(cursor)) {
131                     ++cursor;
132                 } else {
133                     return U_MISMATCH;
134                 }
135             } else {
136                 UMatchDegree m =
137                     subm->matches(text, cursor, limit, incremental);
138                 if (m != U_MATCH) {
139                     return m;
140                 }
141             }
142         }
143         // Record the match position
144         matchStart = offset;
145         matchLimit = cursor;
146     }
147 
148     offset = cursor;
149     return U_MATCH;
150 }
151 
152 /**
153  * Implement UnicodeMatcher
154  */
toPattern(UnicodeString & result,UBool escapeUnprintable) const155 UnicodeString& StringMatcher::toPattern(UnicodeString& result,
156                                         UBool escapeUnprintable) const
157 {
158     result.truncate(0);
159     UnicodeString str, quoteBuf;
160     if (segmentNumber > 0) {
161         result.append((UChar)40); /*(*/
162     }
163     for (int32_t i=0; i<pattern.length(); ++i) {
164         UChar keyChar = pattern.charAt(i);
165         const UnicodeMatcher* m = data->lookupMatcher(keyChar);
166         if (m == 0) {
167             ICU_Utility::appendToRule(result, keyChar, FALSE, escapeUnprintable, quoteBuf);
168         } else {
169             ICU_Utility::appendToRule(result, m->toPattern(str, escapeUnprintable),
170                          TRUE, escapeUnprintable, quoteBuf);
171         }
172     }
173     if (segmentNumber > 0) {
174         result.append((UChar)41); /*)*/
175     }
176     // Flush quoteBuf out to result
177     ICU_Utility::appendToRule(result, -1,
178                               TRUE, escapeUnprintable, quoteBuf);
179     return result;
180 }
181 
182 /**
183  * Implement UnicodeMatcher
184  */
matchesIndexValue(uint8_t v) const185 UBool StringMatcher::matchesIndexValue(uint8_t v) const {
186     if (pattern.length() == 0) {
187         return TRUE;
188     }
189     UChar32 c = pattern.char32At(0);
190     const UnicodeMatcher *m = data->lookupMatcher(c);
191     return (m == 0) ? ((c & 0xFF) == v) : m->matchesIndexValue(v);
192 }
193 
194 /**
195  * Implement UnicodeMatcher
196  */
addMatchSetTo(UnicodeSet & toUnionTo) const197 void StringMatcher::addMatchSetTo(UnicodeSet& toUnionTo) const {
198     UChar32 ch;
199     for (int32_t i=0; i<pattern.length(); i+=UTF_CHAR_LENGTH(ch)) {
200         ch = pattern.char32At(i);
201         const UnicodeMatcher* matcher = data->lookupMatcher(ch);
202         if (matcher == NULL) {
203             toUnionTo.add(ch);
204         } else {
205             matcher->addMatchSetTo(toUnionTo);
206         }
207     }
208 }
209 
210 /**
211  * UnicodeReplacer API
212  */
replace(Replaceable & text,int32_t start,int32_t limit,int32_t &)213 int32_t StringMatcher::replace(Replaceable& text,
214                                int32_t start,
215                                int32_t limit,
216                                int32_t& /*cursor*/) {
217 
218     int32_t outLen = 0;
219 
220     // Copy segment with out-of-band data
221     int32_t dest = limit;
222     // If there was no match, that means that a quantifier
223     // matched zero-length.  E.g., x (a)* y matched "xy".
224     if (matchStart >= 0) {
225         if (matchStart != matchLimit) {
226             text.copy(matchStart, matchLimit, dest);
227             outLen = matchLimit - matchStart;
228         }
229     }
230 
231     text.handleReplaceBetween(start, limit, EMPTY); // delete original text
232 
233     return outLen;
234 }
235 
236 /**
237  * UnicodeReplacer API
238  */
toReplacerPattern(UnicodeString & rule,UBool) const239 UnicodeString& StringMatcher::toReplacerPattern(UnicodeString& rule,
240                                                 UBool /*escapeUnprintable*/) const {
241     // assert(segmentNumber > 0);
242     rule.truncate(0);
243     rule.append((UChar)0x0024 /*$*/);
244     ICU_Utility::appendNumber(rule, segmentNumber, 10, 1);
245     return rule;
246 }
247 
248 /**
249  * Remove any match info.  This must be called before performing a
250  * set of matches with this segment.
251  */
resetMatch()252  void StringMatcher::resetMatch() {
253     matchStart = matchLimit = -1;
254 }
255 
256 /**
257  * Union the set of all characters that may output by this object
258  * into the given set.
259  * @param toUnionTo the set into which to union the output characters
260  */
addReplacementSetTo(UnicodeSet &) const261 void StringMatcher::addReplacementSetTo(UnicodeSet& /*toUnionTo*/) const {
262     // The output of this replacer varies; it is the source text between
263     // matchStart and matchLimit.  Since this varies depending on the
264     // input text, we can't compute it here.  We can either do nothing
265     // or we can add ALL characters to the set.  It's probably more useful
266     // to do nothing.
267 }
268 
269 /**
270  * Implement UnicodeFunctor
271  */
setData(const TransliterationRuleData * d)272 void StringMatcher::setData(const TransliterationRuleData* d) {
273     data = d;
274     int32_t i = 0;
275     while (i<pattern.length()) {
276         UChar32 c = pattern.char32At(i);
277         UnicodeFunctor* f = data->lookup(c);
278         if (f != NULL) {
279             f->setData(data);
280         }
281         i += UTF_CHAR_LENGTH(c);
282     }
283 }
284 
285 U_NAMESPACE_END
286 
287 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
288 
289 //eof
290