1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 * Copyright (c) 2001-2012, International Business Machines Corporation
6 * and others. All Rights Reserved.
7 **********************************************************************
8 * Date Name Description
9 * 07/23/01 aliu Creation.
10 **********************************************************************
11 */
12
13 #include "unicode/utypes.h"
14
15 #if !UCONFIG_NO_TRANSLITERATION
16
17 #include "strmatch.h"
18 #include "rbt_data.h"
19 #include "util.h"
20 #include "unicode/uniset.h"
21 #include "unicode/utf16.h"
22
23 U_NAMESPACE_BEGIN
24
// Expands the UOBJECT_DEFINE_RTTI_IMPLEMENTATION macro for StringMatcher.
// NOTE(review): presumably this supplies the class-ID (RTTI) member
// definitions for the class -- confirm against the macro's definition,
// which is not visible in this file.
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringMatcher)25 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringMatcher)
26
27 StringMatcher::StringMatcher(const UnicodeString& theString,
28 int32_t start,
29 int32_t limit,
30 int32_t segmentNum,
31 const TransliterationRuleData& theData) :
32 data(&theData),
33 segmentNumber(segmentNum),
34 matchStart(-1),
35 matchLimit(-1)
36 {
37 theString.extractBetween(start, limit, pattern);
38 }
39
/**
 * Copy constructor.
 *
 * Copies the pattern, the segment number and the currently recorded
 * match range. The rule-data pointer is copied as a pointer, so the
 * new object refers to the same TransliterationRuleData as the source.
 *
 * @param o  the matcher to copy
 */
StringMatcher::StringMatcher(const StringMatcher& o) :
    UnicodeFunctor(o),
    UnicodeMatcher(o),
    UnicodeReplacer(o),
    pattern(o.pattern),
    data(o.data),
    segmentNumber(o.segmentNumber),
    matchStart(o.matchStart),
    matchLimit(o.matchLimit)
{
    // All state is transferred by the initializer list above.
}
51
52 /**
53 * Destructor
54 */
~StringMatcher()55 StringMatcher::~StringMatcher() {
56 }
57
58 /**
59 * Implement UnicodeFunctor
60 */
clone() const61 UnicodeFunctor* StringMatcher::clone() const {
62 return new StringMatcher(*this);
63 }
64
65 /**
66 * UnicodeFunctor API. Cast 'this' to a UnicodeMatcher* pointer
67 * and return the pointer.
68 */
toMatcher() const69 UnicodeMatcher* StringMatcher::toMatcher() const {
70 StringMatcher *nonconst_this = const_cast<StringMatcher *>(this);
71 UnicodeMatcher *nonconst_base = static_cast<UnicodeMatcher *>(nonconst_this);
72
73 return nonconst_base;
74 }
75
76 /**
77 * UnicodeFunctor API. Cast 'this' to a UnicodeReplacer* pointer
78 * and return the pointer.
79 */
toReplacer() const80 UnicodeReplacer* StringMatcher::toReplacer() const {
81 StringMatcher *nonconst_this = const_cast<StringMatcher *>(this);
82 UnicodeReplacer *nonconst_base = static_cast<UnicodeReplacer *>(nonconst_this);
83
84 return nonconst_base;
85 }
86
87 /**
88 * Implement UnicodeMatcher
89 */
matches(const Replaceable & text,int32_t & offset,int32_t limit,UBool incremental)90 UMatchDegree StringMatcher::matches(const Replaceable& text,
91 int32_t& offset,
92 int32_t limit,
93 UBool incremental) {
94 int32_t i;
95 int32_t cursor = offset;
96 if (limit < cursor) {
97 // Match in the reverse direction
98 for (i=pattern.length()-1; i>=0; --i) {
99 UChar keyChar = pattern.charAt(i);
100 UnicodeMatcher* subm = data->lookupMatcher(keyChar);
101 if (subm == 0) {
102 if (cursor > limit &&
103 keyChar == text.charAt(cursor)) {
104 --cursor;
105 } else {
106 return U_MISMATCH;
107 }
108 } else {
109 UMatchDegree m =
110 subm->matches(text, cursor, limit, incremental);
111 if (m != U_MATCH) {
112 return m;
113 }
114 }
115 }
116 // Record the match position, but adjust for a normal
117 // forward start, limit, and only if a prior match does not
118 // exist -- we want the rightmost match.
119 if (matchStart < 0) {
120 matchStart = cursor+1;
121 matchLimit = offset+1;
122 }
123 } else {
124 for (i=0; i<pattern.length(); ++i) {
125 if (incremental && cursor == limit) {
126 // We've reached the context limit without a mismatch and
127 // without completing our match.
128 return U_PARTIAL_MATCH;
129 }
130 UChar keyChar = pattern.charAt(i);
131 UnicodeMatcher* subm = data->lookupMatcher(keyChar);
132 if (subm == 0) {
133 // Don't need the cursor < limit check if
134 // incremental is TRUE (because it's done above); do need
135 // it otherwise.
136 if (cursor < limit &&
137 keyChar == text.charAt(cursor)) {
138 ++cursor;
139 } else {
140 return U_MISMATCH;
141 }
142 } else {
143 UMatchDegree m =
144 subm->matches(text, cursor, limit, incremental);
145 if (m != U_MATCH) {
146 return m;
147 }
148 }
149 }
150 // Record the match position
151 matchStart = offset;
152 matchLimit = cursor;
153 }
154
155 offset = cursor;
156 return U_MATCH;
157 }
158
159 /**
160 * Implement UnicodeMatcher
161 */
toPattern(UnicodeString & result,UBool escapeUnprintable) const162 UnicodeString& StringMatcher::toPattern(UnicodeString& result,
163 UBool escapeUnprintable) const
164 {
165 result.truncate(0);
166 UnicodeString str, quoteBuf;
167 if (segmentNumber > 0) {
168 result.append((UChar)40); /*(*/
169 }
170 for (int32_t i=0; i<pattern.length(); ++i) {
171 UChar keyChar = pattern.charAt(i);
172 const UnicodeMatcher* m = data->lookupMatcher(keyChar);
173 if (m == 0) {
174 ICU_Utility::appendToRule(result, keyChar, FALSE, escapeUnprintable, quoteBuf);
175 } else {
176 ICU_Utility::appendToRule(result, m->toPattern(str, escapeUnprintable),
177 TRUE, escapeUnprintable, quoteBuf);
178 }
179 }
180 if (segmentNumber > 0) {
181 result.append((UChar)41); /*)*/
182 }
183 // Flush quoteBuf out to result
184 ICU_Utility::appendToRule(result, -1,
185 TRUE, escapeUnprintable, quoteBuf);
186 return result;
187 }
188
189 /**
190 * Implement UnicodeMatcher
191 */
matchesIndexValue(uint8_t v) const192 UBool StringMatcher::matchesIndexValue(uint8_t v) const {
193 if (pattern.length() == 0) {
194 return TRUE;
195 }
196 UChar32 c = pattern.char32At(0);
197 const UnicodeMatcher *m = data->lookupMatcher(c);
198 return (m == 0) ? ((c & 0xFF) == v) : m->matchesIndexValue(v);
199 }
200
201 /**
202 * Implement UnicodeMatcher
203 */
addMatchSetTo(UnicodeSet & toUnionTo) const204 void StringMatcher::addMatchSetTo(UnicodeSet& toUnionTo) const {
205 UChar32 ch;
206 for (int32_t i=0; i<pattern.length(); i+=U16_LENGTH(ch)) {
207 ch = pattern.char32At(i);
208 const UnicodeMatcher* matcher = data->lookupMatcher(ch);
209 if (matcher == NULL) {
210 toUnionTo.add(ch);
211 } else {
212 matcher->addMatchSetTo(toUnionTo);
213 }
214 }
215 }
216
217 /**
218 * UnicodeReplacer API
219 */
replace(Replaceable & text,int32_t start,int32_t limit,int32_t &)220 int32_t StringMatcher::replace(Replaceable& text,
221 int32_t start,
222 int32_t limit,
223 int32_t& /*cursor*/) {
224
225 int32_t outLen = 0;
226
227 // Copy segment with out-of-band data
228 int32_t dest = limit;
229 // If there was no match, that means that a quantifier
230 // matched zero-length. E.g., x (a)* y matched "xy".
231 if (matchStart >= 0) {
232 if (matchStart != matchLimit) {
233 text.copy(matchStart, matchLimit, dest);
234 outLen = matchLimit - matchStart;
235 }
236 }
237
238 text.handleReplaceBetween(start, limit, UnicodeString()); // delete original text
239
240 return outLen;
241 }
242
243 /**
244 * UnicodeReplacer API
245 */
toReplacerPattern(UnicodeString & rule,UBool) const246 UnicodeString& StringMatcher::toReplacerPattern(UnicodeString& rule,
247 UBool /*escapeUnprintable*/) const {
248 // assert(segmentNumber > 0);
249 rule.truncate(0);
250 rule.append((UChar)0x0024 /*$*/);
251 ICU_Utility::appendNumber(rule, segmentNumber, 10, 1);
252 return rule;
253 }
254
255 /**
256 * Remove any match info. This must be called before performing a
257 * set of matches with this segment.
258 */
resetMatch()259 void StringMatcher::resetMatch() {
260 matchStart = matchLimit = -1;
261 }
262
263 /**
264 * Union the set of all characters that may output by this object
265 * into the given set.
266 * @param toUnionTo the set into which to union the output characters
267 */
addReplacementSetTo(UnicodeSet &) const268 void StringMatcher::addReplacementSetTo(UnicodeSet& /*toUnionTo*/) const {
269 // The output of this replacer varies; it is the source text between
270 // matchStart and matchLimit. Since this varies depending on the
271 // input text, we can't compute it here. We can either do nothing
272 // or we can add ALL characters to the set. It's probably more useful
273 // to do nothing.
274 }
275
276 /**
277 * Implement UnicodeFunctor
278 */
setData(const TransliterationRuleData * d)279 void StringMatcher::setData(const TransliterationRuleData* d) {
280 data = d;
281 int32_t i = 0;
282 while (i<pattern.length()) {
283 UChar32 c = pattern.char32At(i);
284 UnicodeFunctor* f = data->lookup(c);
285 if (f != NULL) {
286 f->setData(data);
287 }
288 i += U16_LENGTH(c);
289 }
290 }
291
292 U_NAMESPACE_END
293
294 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
295
296 //eof
297