1 /*
2 **********************************************************************
3 * Copyright (c) 2001-2012, International Business Machines Corporation
4 * and others. All Rights Reserved.
5 **********************************************************************
6 * Date Name Description
7 * 07/23/01 aliu Creation.
8 **********************************************************************
9 */
10
11 #include "unicode/utypes.h"
12
13 #if !UCONFIG_NO_TRANSLITERATION
14
15 #include "strmatch.h"
16 #include "rbt_data.h"
17 #include "util.h"
18 #include "unicode/uniset.h"
19 #include "unicode/utf16.h"
20
21 U_NAMESPACE_BEGIN
22
// Expands ICU's run-time type identification boilerplate for StringMatcher.
// NOTE(review): presumably this defines the class-ID accessors declared in
// strmatch.h -- confirm against the macro definition in unicode/uobject.h.
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringMatcher)23 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringMatcher)
24
25 StringMatcher::StringMatcher(const UnicodeString& theString,
26 int32_t start,
27 int32_t limit,
28 int32_t segmentNum,
29 const TransliterationRuleData& theData) :
30 data(&theData),
31 segmentNumber(segmentNum),
32 matchStart(-1),
33 matchLimit(-1)
34 {
35 theString.extractBetween(start, limit, pattern);
36 }
37
/**
 * Copy constructor. Copies the pattern, the segment number and the recorded
 * match range from o. The rule data pointer is copied as-is, so both objects
 * refer to the same TransliterationRuleData.
 */
StringMatcher::StringMatcher(const StringMatcher& o)
    : UnicodeFunctor(o),
      UnicodeMatcher(o),
      UnicodeReplacer(o),
      pattern(o.pattern),
      data(o.data),
      segmentNumber(o.segmentNumber),
      matchStart(o.matchStart),
      matchLimit(o.matchLimit)
{}
49
50 /**
51 * Destructor
52 */
~StringMatcher()53 StringMatcher::~StringMatcher() {
54 }
55
56 /**
57 * Implement UnicodeFunctor
58 */
clone() const59 UnicodeFunctor* StringMatcher::clone() const {
60 return new StringMatcher(*this);
61 }
62
63 /**
64 * UnicodeFunctor API. Cast 'this' to a UnicodeMatcher* pointer
65 * and return the pointer.
66 */
toMatcher() const67 UnicodeMatcher* StringMatcher::toMatcher() const {
68 StringMatcher *nonconst_this = const_cast<StringMatcher *>(this);
69 UnicodeMatcher *nonconst_base = static_cast<UnicodeMatcher *>(nonconst_this);
70
71 return nonconst_base;
72 }
73
74 /**
75 * UnicodeFunctor API. Cast 'this' to a UnicodeReplacer* pointer
76 * and return the pointer.
77 */
toReplacer() const78 UnicodeReplacer* StringMatcher::toReplacer() const {
79 StringMatcher *nonconst_this = const_cast<StringMatcher *>(this);
80 UnicodeReplacer *nonconst_base = static_cast<UnicodeReplacer *>(nonconst_this);
81
82 return nonconst_base;
83 }
84
85 /**
86 * Implement UnicodeMatcher
87 */
matches(const Replaceable & text,int32_t & offset,int32_t limit,UBool incremental)88 UMatchDegree StringMatcher::matches(const Replaceable& text,
89 int32_t& offset,
90 int32_t limit,
91 UBool incremental) {
92 int32_t i;
93 int32_t cursor = offset;
94 if (limit < cursor) {
95 // Match in the reverse direction
96 for (i=pattern.length()-1; i>=0; --i) {
97 UChar keyChar = pattern.charAt(i);
98 UnicodeMatcher* subm = data->lookupMatcher(keyChar);
99 if (subm == 0) {
100 if (cursor > limit &&
101 keyChar == text.charAt(cursor)) {
102 --cursor;
103 } else {
104 return U_MISMATCH;
105 }
106 } else {
107 UMatchDegree m =
108 subm->matches(text, cursor, limit, incremental);
109 if (m != U_MATCH) {
110 return m;
111 }
112 }
113 }
114 // Record the match position, but adjust for a normal
115 // forward start, limit, and only if a prior match does not
116 // exist -- we want the rightmost match.
117 if (matchStart < 0) {
118 matchStart = cursor+1;
119 matchLimit = offset+1;
120 }
121 } else {
122 for (i=0; i<pattern.length(); ++i) {
123 if (incremental && cursor == limit) {
124 // We've reached the context limit without a mismatch and
125 // without completing our match.
126 return U_PARTIAL_MATCH;
127 }
128 UChar keyChar = pattern.charAt(i);
129 UnicodeMatcher* subm = data->lookupMatcher(keyChar);
130 if (subm == 0) {
131 // Don't need the cursor < limit check if
132 // incremental is TRUE (because it's done above); do need
133 // it otherwise.
134 if (cursor < limit &&
135 keyChar == text.charAt(cursor)) {
136 ++cursor;
137 } else {
138 return U_MISMATCH;
139 }
140 } else {
141 UMatchDegree m =
142 subm->matches(text, cursor, limit, incremental);
143 if (m != U_MATCH) {
144 return m;
145 }
146 }
147 }
148 // Record the match position
149 matchStart = offset;
150 matchLimit = cursor;
151 }
152
153 offset = cursor;
154 return U_MATCH;
155 }
156
157 /**
158 * Implement UnicodeMatcher
159 */
toPattern(UnicodeString & result,UBool escapeUnprintable) const160 UnicodeString& StringMatcher::toPattern(UnicodeString& result,
161 UBool escapeUnprintable) const
162 {
163 result.truncate(0);
164 UnicodeString str, quoteBuf;
165 if (segmentNumber > 0) {
166 result.append((UChar)40); /*(*/
167 }
168 for (int32_t i=0; i<pattern.length(); ++i) {
169 UChar keyChar = pattern.charAt(i);
170 const UnicodeMatcher* m = data->lookupMatcher(keyChar);
171 if (m == 0) {
172 ICU_Utility::appendToRule(result, keyChar, FALSE, escapeUnprintable, quoteBuf);
173 } else {
174 ICU_Utility::appendToRule(result, m->toPattern(str, escapeUnprintable),
175 TRUE, escapeUnprintable, quoteBuf);
176 }
177 }
178 if (segmentNumber > 0) {
179 result.append((UChar)41); /*)*/
180 }
181 // Flush quoteBuf out to result
182 ICU_Utility::appendToRule(result, -1,
183 TRUE, escapeUnprintable, quoteBuf);
184 return result;
185 }
186
187 /**
188 * Implement UnicodeMatcher
189 */
matchesIndexValue(uint8_t v) const190 UBool StringMatcher::matchesIndexValue(uint8_t v) const {
191 if (pattern.length() == 0) {
192 return TRUE;
193 }
194 UChar32 c = pattern.char32At(0);
195 const UnicodeMatcher *m = data->lookupMatcher(c);
196 return (m == 0) ? ((c & 0xFF) == v) : m->matchesIndexValue(v);
197 }
198
199 /**
200 * Implement UnicodeMatcher
201 */
addMatchSetTo(UnicodeSet & toUnionTo) const202 void StringMatcher::addMatchSetTo(UnicodeSet& toUnionTo) const {
203 UChar32 ch;
204 for (int32_t i=0; i<pattern.length(); i+=U16_LENGTH(ch)) {
205 ch = pattern.char32At(i);
206 const UnicodeMatcher* matcher = data->lookupMatcher(ch);
207 if (matcher == NULL) {
208 toUnionTo.add(ch);
209 } else {
210 matcher->addMatchSetTo(toUnionTo);
211 }
212 }
213 }
214
215 /**
216 * UnicodeReplacer API
217 */
replace(Replaceable & text,int32_t start,int32_t limit,int32_t &)218 int32_t StringMatcher::replace(Replaceable& text,
219 int32_t start,
220 int32_t limit,
221 int32_t& /*cursor*/) {
222
223 int32_t outLen = 0;
224
225 // Copy segment with out-of-band data
226 int32_t dest = limit;
227 // If there was no match, that means that a quantifier
228 // matched zero-length. E.g., x (a)* y matched "xy".
229 if (matchStart >= 0) {
230 if (matchStart != matchLimit) {
231 text.copy(matchStart, matchLimit, dest);
232 outLen = matchLimit - matchStart;
233 }
234 }
235
236 text.handleReplaceBetween(start, limit, UnicodeString()); // delete original text
237
238 return outLen;
239 }
240
241 /**
242 * UnicodeReplacer API
243 */
toReplacerPattern(UnicodeString & rule,UBool) const244 UnicodeString& StringMatcher::toReplacerPattern(UnicodeString& rule,
245 UBool /*escapeUnprintable*/) const {
246 // assert(segmentNumber > 0);
247 rule.truncate(0);
248 rule.append((UChar)0x0024 /*$*/);
249 ICU_Utility::appendNumber(rule, segmentNumber, 10, 1);
250 return rule;
251 }
252
253 /**
254 * Remove any match info. This must be called before performing a
255 * set of matches with this segment.
256 */
resetMatch()257 void StringMatcher::resetMatch() {
258 matchStart = matchLimit = -1;
259 }
260
261 /**
262 * Union the set of all characters that may output by this object
263 * into the given set.
264 * @param toUnionTo the set into which to union the output characters
265 */
addReplacementSetTo(UnicodeSet &) const266 void StringMatcher::addReplacementSetTo(UnicodeSet& /*toUnionTo*/) const {
267 // The output of this replacer varies; it is the source text between
268 // matchStart and matchLimit. Since this varies depending on the
269 // input text, we can't compute it here. We can either do nothing
270 // or we can add ALL characters to the set. It's probably more useful
271 // to do nothing.
272 }
273
274 /**
275 * Implement UnicodeFunctor
276 */
setData(const TransliterationRuleData * d)277 void StringMatcher::setData(const TransliterationRuleData* d) {
278 data = d;
279 int32_t i = 0;
280 while (i<pattern.length()) {
281 UChar32 c = pattern.char32At(i);
282 UnicodeFunctor* f = data->lookup(c);
283 if (f != NULL) {
284 f->setData(data);
285 }
286 i += U16_LENGTH(c);
287 }
288 }
289
290 U_NAMESPACE_END
291
292 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
293
294 //eof
295