1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 * Copyright (c) 2002-2012, International Business Machines Corporation
6 * and others. All Rights Reserved.
7 **********************************************************************
8 * Date Name Description
9 * 01/21/2002 aliu Creation.
10 **********************************************************************
11 */
12
13 #include "unicode/utypes.h"
14
15 #if !UCONFIG_NO_TRANSLITERATION
16
17 #include "unicode/uniset.h"
18 #include "unicode/utf16.h"
19 #include "strrepl.h"
20 #include "rbt_data.h"
21 #include "util.h"
22
23 U_NAMESPACE_BEGIN
24
~UnicodeReplacer()25 UnicodeReplacer::~UnicodeReplacer() {}
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringReplacer)26 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringReplacer)
27
28 /**
29 * Construct a StringReplacer that sets the emits the given output
30 * text and sets the cursor to the given position.
31 * @param theOutput text that will replace input text when the
32 * replace() method is called. May contain stand-in characters
33 * that represent nested replacers.
34 * @param theCursorPos cursor position that will be returned by
35 * the replace() method
36 * @param theData transliterator context object that translates
37 * stand-in characters to UnicodeReplacer objects
38 */
39 StringReplacer::StringReplacer(const UnicodeString& theOutput,
40 int32_t theCursorPos,
41 const TransliterationRuleData* theData) {
42 output = theOutput;
43 cursorPos = theCursorPos;
44 hasCursor = true;
45 data = theData;
46 isComplex = true;
47 }
48
49 /**
50 * Construct a StringReplacer that sets the emits the given output
51 * text and does not modify the cursor.
52 * @param theOutput text that will replace input text when the
53 * replace() method is called. May contain stand-in characters
54 * that represent nested replacers.
55 * @param theData transliterator context object that translates
56 * stand-in characters to UnicodeReplacer objects
57 */
StringReplacer(const UnicodeString & theOutput,const TransliterationRuleData * theData)58 StringReplacer::StringReplacer(const UnicodeString& theOutput,
59 const TransliterationRuleData* theData) {
60 output = theOutput;
61 cursorPos = 0;
62 hasCursor = false;
63 data = theData;
64 isComplex = true;
65 }
66
67 /**
68 * Copy constructor.
69 */
StringReplacer(const StringReplacer & other)70 StringReplacer::StringReplacer(const StringReplacer& other) :
71 UnicodeFunctor(other),
72 UnicodeReplacer(other)
73 {
74 output = other.output;
75 cursorPos = other.cursorPos;
76 hasCursor = other.hasCursor;
77 data = other.data;
78 isComplex = other.isComplex;
79 }
80
81 /**
82 * Destructor
83 */
~StringReplacer()84 StringReplacer::~StringReplacer() {
85 }
86
87 /**
88 * Implement UnicodeFunctor
89 */
clone() const90 StringReplacer* StringReplacer::clone() const {
91 return new StringReplacer(*this);
92 }
93
94 /**
95 * Implement UnicodeFunctor
96 */
toReplacer() const97 UnicodeReplacer* StringReplacer::toReplacer() const {
98 return const_cast<StringReplacer *>(this);
99 }
100
101 /**
102 * UnicodeReplacer API
103 */
replace(Replaceable & text,int32_t start,int32_t limit,int32_t & cursor)104 int32_t StringReplacer::replace(Replaceable& text,
105 int32_t start,
106 int32_t limit,
107 int32_t& cursor) {
108 int32_t outLen;
109 int32_t newStart = 0;
110
111 // NOTE: It should be possible to _always_ run the complex
112 // processing code; just slower. If not, then there is a bug
113 // in the complex processing code.
114
115 // Simple (no nested replacers) Processing Code :
116 if (!isComplex) {
117 text.handleReplaceBetween(start, limit, output);
118 outLen = output.length();
119
120 // Setup default cursor position (for cursorPos within output)
121 newStart = cursorPos;
122 }
123
124 // Complex (nested replacers) Processing Code :
125 else {
126 /* When there are segments to be copied, use the Replaceable.copy()
127 * API in order to retain out-of-band data. Copy everything to the
128 * end of the string, then copy them back over the key. This preserves
129 * the integrity of indices into the key and surrounding context while
130 * generating the output text.
131 */
132 UnicodeString buf;
133 int32_t oOutput; // offset into 'output'
134 isComplex = false;
135
136 // The temporary buffer starts at tempStart, and extends
137 // to destLimit. The start of the buffer has a single
138 // character from before the key. This provides style
139 // data when addition characters are filled into the
140 // temporary buffer. If there is nothing to the left, use
141 // the non-character U+FFFF, which Replaceable subclasses
142 // should treat specially as a "no-style character."
143 // destStart points to the point after the style context
144 // character, so it is tempStart+1 or tempStart+2.
145 int32_t tempStart = text.length(); // start of temp buffer
146 int32_t destStart = tempStart; // copy new text to here
147 if (start > 0) {
148 int32_t len = U16_LENGTH(text.char32At(start-1));
149 text.copy(start-len, start, tempStart);
150 destStart += len;
151 } else {
152 UnicodeString str((char16_t) 0xFFFF);
153 text.handleReplaceBetween(tempStart, tempStart, str);
154 destStart++;
155 }
156 int32_t destLimit = destStart;
157
158 for (oOutput=0; oOutput<output.length(); ) {
159 if (oOutput == cursorPos) {
160 // Record the position of the cursor
161 newStart = destLimit - destStart; // relative to start
162 }
163 UChar32 c = output.char32At(oOutput);
164 UnicodeReplacer* r = data->lookupReplacer(c);
165 if (r == nullptr) {
166 // Accumulate straight (non-segment) text.
167 buf.append(c);
168 } else {
169 isComplex = true;
170
171 // Insert any accumulated straight text.
172 if (buf.length() > 0) {
173 text.handleReplaceBetween(destLimit, destLimit, buf);
174 destLimit += buf.length();
175 buf.truncate(0);
176 }
177
178 // Delegate output generation to replacer object
179 int32_t len = r->replace(text, destLimit, destLimit, cursor);
180 destLimit += len;
181 }
182 oOutput += U16_LENGTH(c);
183 }
184 // Insert any accumulated straight text.
185 if (buf.length() > 0) {
186 text.handleReplaceBetween(destLimit, destLimit, buf);
187 destLimit += buf.length();
188 }
189 if (oOutput == cursorPos) {
190 // Record the position of the cursor
191 newStart = destLimit - destStart; // relative to start
192 }
193
194 outLen = destLimit - destStart;
195
196 // Copy new text to start, and delete it
197 text.copy(destStart, destLimit, start);
198 text.handleReplaceBetween(tempStart + outLen, destLimit + outLen, UnicodeString());
199
200 // Delete the old text (the key)
201 text.handleReplaceBetween(start + outLen, limit + outLen, UnicodeString());
202 }
203
204 if (hasCursor) {
205 // Adjust the cursor for positions outside the key. These
206 // refer to code points rather than code units. If cursorPos
207 // is within the output string, then use newStart, which has
208 // already been set above.
209 if (cursorPos < 0) {
210 newStart = start;
211 int32_t n = cursorPos;
212 // Outside the output string, cursorPos counts code points
213 while (n < 0 && newStart > 0) {
214 newStart -= U16_LENGTH(text.char32At(newStart-1));
215 ++n;
216 }
217 newStart += n;
218 } else if (cursorPos > output.length()) {
219 newStart = start + outLen;
220 int32_t n = cursorPos - output.length();
221 // Outside the output string, cursorPos counts code points
222 while (n > 0 && newStart < text.length()) {
223 newStart += U16_LENGTH(text.char32At(newStart));
224 --n;
225 }
226 newStart += n;
227 } else {
228 // Cursor is within output string. It has been set up above
229 // to be relative to start.
230 newStart += start;
231 }
232
233 cursor = newStart;
234 }
235
236 return outLen;
237 }
238
239 /**
240 * UnicodeReplacer API
241 */
toReplacerPattern(UnicodeString & rule,UBool escapeUnprintable) const242 UnicodeString& StringReplacer::toReplacerPattern(UnicodeString& rule,
243 UBool escapeUnprintable) const {
244 rule.truncate(0);
245 UnicodeString quoteBuf;
246
247 int32_t cursor = cursorPos;
248
249 // Handle a cursor preceding the output
250 if (hasCursor && cursor < 0) {
251 while (cursor++ < 0) {
252 ICU_Utility::appendToRule(rule, (char16_t)0x0040 /*@*/, true, escapeUnprintable, quoteBuf);
253 }
254 // Fall through and append '|' below
255 }
256
257 for (int32_t i=0; i<output.length(); ++i) {
258 if (hasCursor && i == cursor) {
259 ICU_Utility::appendToRule(rule, (char16_t)0x007C /*|*/, true, escapeUnprintable, quoteBuf);
260 }
261 char16_t c = output.charAt(i); // Ok to use 16-bits here
262
263 UnicodeReplacer* r = data->lookupReplacer(c);
264 if (r == nullptr) {
265 ICU_Utility::appendToRule(rule, c, false, escapeUnprintable, quoteBuf);
266 } else {
267 UnicodeString buf;
268 r->toReplacerPattern(buf, escapeUnprintable);
269 buf.insert(0, (char16_t)0x20);
270 buf.append((char16_t)0x20);
271 ICU_Utility::appendToRule(rule, buf,
272 true, escapeUnprintable, quoteBuf);
273 }
274 }
275
276 // Handle a cursor after the output. Use > rather than >= because
277 // if cursor == output.length() it is at the end of the output,
278 // which is the default position, so we need not emit it.
279 if (hasCursor && cursor > output.length()) {
280 cursor -= output.length();
281 while (cursor-- > 0) {
282 ICU_Utility::appendToRule(rule, (char16_t)0x0040 /*@*/, true, escapeUnprintable, quoteBuf);
283 }
284 ICU_Utility::appendToRule(rule, (char16_t)0x007C /*|*/, true, escapeUnprintable, quoteBuf);
285 }
286 // Flush quoteBuf out to result
287 ICU_Utility::appendToRule(rule, -1,
288 true, escapeUnprintable, quoteBuf);
289
290 return rule;
291 }
292
293 /**
294 * Implement UnicodeReplacer
295 */
addReplacementSetTo(UnicodeSet & toUnionTo) const296 void StringReplacer::addReplacementSetTo(UnicodeSet& toUnionTo) const {
297 UChar32 ch;
298 for (int32_t i=0; i<output.length(); i+=U16_LENGTH(ch)) {
299 ch = output.char32At(i);
300 UnicodeReplacer* r = data->lookupReplacer(ch);
301 if (r == nullptr) {
302 toUnionTo.add(ch);
303 } else {
304 r->addReplacementSetTo(toUnionTo);
305 }
306 }
307 }
308
309 /**
310 * UnicodeFunctor API
311 */
setData(const TransliterationRuleData * d)312 void StringReplacer::setData(const TransliterationRuleData* d) {
313 data = d;
314 int32_t i = 0;
315 while (i<output.length()) {
316 UChar32 c = output.char32At(i);
317 UnicodeFunctor* f = data->lookup(c);
318 if (f != nullptr) {
319 f->setData(data);
320 }
321 i += U16_LENGTH(c);
322 }
323 }
324
325 U_NAMESPACE_END
326
327 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
328
329 //eof
330