1 /*
2 **********************************************************************
3 * Copyright (c) 2001-2004, International Business Machines Corporation
4 * and others. All Rights Reserved.
5 **********************************************************************
6 * Date Name Description
7 * 07/23/01 aliu Creation.
8 **********************************************************************
9 */
10
11 #include "unicode/utypes.h"
12
13 #if !UCONFIG_NO_TRANSLITERATION
14
15 #include "strmatch.h"
16 #include "rbt_data.h"
17 #include "util.h"
18 #include "unicode/uniset.h"
19
20 U_NAMESPACE_BEGIN
21
22 static const UChar EMPTY[] = { 0 }; // empty string: ""
23
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringMatcher)24 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringMatcher)
25
26 StringMatcher::StringMatcher(const UnicodeString& theString,
27 int32_t start,
28 int32_t limit,
29 int32_t segmentNum,
30 const TransliterationRuleData& theData) :
31 data(&theData),
32 segmentNumber(segmentNum),
33 matchStart(-1),
34 matchLimit(-1)
35 {
36 theString.extractBetween(start, limit, pattern);
37 }
38
/**
 * Copy constructor.  Copies the pattern, the segment number and the
 * currently recorded match position.  The rule data pointer is copied
 * as a pointer, so both objects refer to the same rule data.
 *
 * @param o the matcher to copy
 */
StringMatcher::StringMatcher(const StringMatcher& o)
    : UnicodeFunctor(o),
      UnicodeMatcher(o),
      UnicodeReplacer(o),
      pattern(o.pattern),
      data(o.data),
      segmentNumber(o.segmentNumber),
      matchStart(o.matchStart),
      matchLimit(o.matchLimit)
{
    // Nothing further to do; all state is set by the initializer list.
}
50
51 /**
52 * Destructor
53 */
~StringMatcher()54 StringMatcher::~StringMatcher() {
55 }
56
57 /**
58 * Implement UnicodeFunctor
59 */
clone() const60 UnicodeFunctor* StringMatcher::clone() const {
61 return new StringMatcher(*this);
62 }
63
64 /**
65 * UnicodeFunctor API. Cast 'this' to a UnicodeMatcher* pointer
66 * and return the pointer.
67 */
toMatcher() const68 UnicodeMatcher* StringMatcher::toMatcher() const {
69 return (UnicodeMatcher*) this;
70 }
71
72 /**
73 * UnicodeFunctor API. Cast 'this' to a UnicodeReplacer* pointer
74 * and return the pointer.
75 */
toReplacer() const76 UnicodeReplacer* StringMatcher::toReplacer() const {
77 return (UnicodeReplacer*) this;
78 }
79
80 /**
81 * Implement UnicodeMatcher
82 */
matches(const Replaceable & text,int32_t & offset,int32_t limit,UBool incremental)83 UMatchDegree StringMatcher::matches(const Replaceable& text,
84 int32_t& offset,
85 int32_t limit,
86 UBool incremental) {
87 int32_t i;
88 int32_t cursor = offset;
89 if (limit < cursor) {
90 // Match in the reverse direction
91 for (i=pattern.length()-1; i>=0; --i) {
92 UChar keyChar = pattern.charAt(i);
93 UnicodeMatcher* subm = data->lookupMatcher(keyChar);
94 if (subm == 0) {
95 if (cursor > limit &&
96 keyChar == text.charAt(cursor)) {
97 --cursor;
98 } else {
99 return U_MISMATCH;
100 }
101 } else {
102 UMatchDegree m =
103 subm->matches(text, cursor, limit, incremental);
104 if (m != U_MATCH) {
105 return m;
106 }
107 }
108 }
109 // Record the match position, but adjust for a normal
110 // forward start, limit, and only if a prior match does not
111 // exist -- we want the rightmost match.
112 if (matchStart < 0) {
113 matchStart = cursor+1;
114 matchLimit = offset+1;
115 }
116 } else {
117 for (i=0; i<pattern.length(); ++i) {
118 if (incremental && cursor == limit) {
119 // We've reached the context limit without a mismatch and
120 // without completing our match.
121 return U_PARTIAL_MATCH;
122 }
123 UChar keyChar = pattern.charAt(i);
124 UnicodeMatcher* subm = data->lookupMatcher(keyChar);
125 if (subm == 0) {
126 // Don't need the cursor < limit check if
127 // incremental is TRUE (because it's done above); do need
128 // it otherwise.
129 if (cursor < limit &&
130 keyChar == text.charAt(cursor)) {
131 ++cursor;
132 } else {
133 return U_MISMATCH;
134 }
135 } else {
136 UMatchDegree m =
137 subm->matches(text, cursor, limit, incremental);
138 if (m != U_MATCH) {
139 return m;
140 }
141 }
142 }
143 // Record the match position
144 matchStart = offset;
145 matchLimit = cursor;
146 }
147
148 offset = cursor;
149 return U_MATCH;
150 }
151
152 /**
153 * Implement UnicodeMatcher
154 */
toPattern(UnicodeString & result,UBool escapeUnprintable) const155 UnicodeString& StringMatcher::toPattern(UnicodeString& result,
156 UBool escapeUnprintable) const
157 {
158 result.truncate(0);
159 UnicodeString str, quoteBuf;
160 if (segmentNumber > 0) {
161 result.append((UChar)40); /*(*/
162 }
163 for (int32_t i=0; i<pattern.length(); ++i) {
164 UChar keyChar = pattern.charAt(i);
165 const UnicodeMatcher* m = data->lookupMatcher(keyChar);
166 if (m == 0) {
167 ICU_Utility::appendToRule(result, keyChar, FALSE, escapeUnprintable, quoteBuf);
168 } else {
169 ICU_Utility::appendToRule(result, m->toPattern(str, escapeUnprintable),
170 TRUE, escapeUnprintable, quoteBuf);
171 }
172 }
173 if (segmentNumber > 0) {
174 result.append((UChar)41); /*)*/
175 }
176 // Flush quoteBuf out to result
177 ICU_Utility::appendToRule(result, -1,
178 TRUE, escapeUnprintable, quoteBuf);
179 return result;
180 }
181
182 /**
183 * Implement UnicodeMatcher
184 */
matchesIndexValue(uint8_t v) const185 UBool StringMatcher::matchesIndexValue(uint8_t v) const {
186 if (pattern.length() == 0) {
187 return TRUE;
188 }
189 UChar32 c = pattern.char32At(0);
190 const UnicodeMatcher *m = data->lookupMatcher(c);
191 return (m == 0) ? ((c & 0xFF) == v) : m->matchesIndexValue(v);
192 }
193
194 /**
195 * Implement UnicodeMatcher
196 */
addMatchSetTo(UnicodeSet & toUnionTo) const197 void StringMatcher::addMatchSetTo(UnicodeSet& toUnionTo) const {
198 UChar32 ch;
199 for (int32_t i=0; i<pattern.length(); i+=UTF_CHAR_LENGTH(ch)) {
200 ch = pattern.char32At(i);
201 const UnicodeMatcher* matcher = data->lookupMatcher(ch);
202 if (matcher == NULL) {
203 toUnionTo.add(ch);
204 } else {
205 matcher->addMatchSetTo(toUnionTo);
206 }
207 }
208 }
209
210 /**
211 * UnicodeReplacer API
212 */
replace(Replaceable & text,int32_t start,int32_t limit,int32_t &)213 int32_t StringMatcher::replace(Replaceable& text,
214 int32_t start,
215 int32_t limit,
216 int32_t& /*cursor*/) {
217
218 int32_t outLen = 0;
219
220 // Copy segment with out-of-band data
221 int32_t dest = limit;
222 // If there was no match, that means that a quantifier
223 // matched zero-length. E.g., x (a)* y matched "xy".
224 if (matchStart >= 0) {
225 if (matchStart != matchLimit) {
226 text.copy(matchStart, matchLimit, dest);
227 outLen = matchLimit - matchStart;
228 }
229 }
230
231 text.handleReplaceBetween(start, limit, EMPTY); // delete original text
232
233 return outLen;
234 }
235
236 /**
237 * UnicodeReplacer API
238 */
toReplacerPattern(UnicodeString & rule,UBool) const239 UnicodeString& StringMatcher::toReplacerPattern(UnicodeString& rule,
240 UBool /*escapeUnprintable*/) const {
241 // assert(segmentNumber > 0);
242 rule.truncate(0);
243 rule.append((UChar)0x0024 /*$*/);
244 ICU_Utility::appendNumber(rule, segmentNumber, 10, 1);
245 return rule;
246 }
247
248 /**
249 * Remove any match info. This must be called before performing a
250 * set of matches with this segment.
251 */
resetMatch()252 void StringMatcher::resetMatch() {
253 matchStart = matchLimit = -1;
254 }
255
256 /**
257 * Union the set of all characters that may output by this object
258 * into the given set.
259 * @param toUnionTo the set into which to union the output characters
260 */
addReplacementSetTo(UnicodeSet &) const261 void StringMatcher::addReplacementSetTo(UnicodeSet& /*toUnionTo*/) const {
262 // The output of this replacer varies; it is the source text between
263 // matchStart and matchLimit. Since this varies depending on the
264 // input text, we can't compute it here. We can either do nothing
265 // or we can add ALL characters to the set. It's probably more useful
266 // to do nothing.
267 }
268
269 /**
270 * Implement UnicodeFunctor
271 */
setData(const TransliterationRuleData * d)272 void StringMatcher::setData(const TransliterationRuleData* d) {
273 data = d;
274 int32_t i = 0;
275 while (i<pattern.length()) {
276 UChar32 c = pattern.char32At(i);
277 UnicodeFunctor* f = data->lookup(c);
278 if (f != NULL) {
279 f->setData(data);
280 }
281 i += UTF_CHAR_LENGTH(c);
282 }
283 }
284
285 U_NAMESPACE_END
286
287 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
288
289 //eof
290