• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 **********************************************************************
3 *   Copyright (c) 2002-2004, International Business Machines Corporation
4 *   and others.  All Rights Reserved.
5 **********************************************************************
6 *   Date        Name        Description
7 *   01/21/2002  aliu        Creation.
8 **********************************************************************
9 */
10 
11 #include "unicode/utypes.h"
12 
13 #if !UCONFIG_NO_TRANSLITERATION
14 
15 #include "strrepl.h"
16 #include "rbt_data.h"
17 #include "util.h"
18 #include "unicode/uniset.h"
19 
20 U_NAMESPACE_BEGIN
21 
22 static const UChar EMPTY[] = { 0 }; // empty string: ""
23 
~UnicodeReplacer()24 UnicodeReplacer::~UnicodeReplacer() {}
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringReplacer)25 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringReplacer)
26 
27 /**
28  * Construct a StringReplacer that sets the emits the given output
29  * text and sets the cursor to the given position.
30  * @param theOutput text that will replace input text when the
31  * replace() method is called.  May contain stand-in characters
32  * that represent nested replacers.
33  * @param theCursorPos cursor position that will be returned by
34  * the replace() method
35  * @param theData transliterator context object that translates
36  * stand-in characters to UnicodeReplacer objects
37  */
38 StringReplacer::StringReplacer(const UnicodeString& theOutput,
39                                int32_t theCursorPos,
40                                const TransliterationRuleData* theData) {
41     output = theOutput;
42     cursorPos = theCursorPos;
43     hasCursor = TRUE;
44     data = theData;
45     isComplex = TRUE;
46 }
47 
48 /**
49  * Construct a StringReplacer that sets the emits the given output
50  * text and does not modify the cursor.
51  * @param theOutput text that will replace input text when the
52  * replace() method is called.  May contain stand-in characters
53  * that represent nested replacers.
54  * @param theData transliterator context object that translates
55  * stand-in characters to UnicodeReplacer objects
56  */
StringReplacer(const UnicodeString & theOutput,const TransliterationRuleData * theData)57 StringReplacer::StringReplacer(const UnicodeString& theOutput,
58                                const TransliterationRuleData* theData) {
59     output = theOutput;
60     cursorPos = 0;
61     hasCursor = FALSE;
62     data = theData;
63     isComplex = TRUE;
64 }
65 
66 /**
67  * Copy constructor.
68  */
StringReplacer(const StringReplacer & other)69 StringReplacer::StringReplacer(const StringReplacer& other) :
70     UnicodeFunctor(other),
71     UnicodeReplacer(other)
72 {
73     output = other.output;
74     cursorPos = other.cursorPos;
75     hasCursor = other.hasCursor;
76     data = other.data;
77     isComplex = other.isComplex;
78 }
79 
80 /**
81  * Destructor
82  */
~StringReplacer()83 StringReplacer::~StringReplacer() {
84 }
85 
86 /**
87  * Implement UnicodeFunctor
88  */
clone() const89 UnicodeFunctor* StringReplacer::clone() const {
90     return new StringReplacer(*this);
91 }
92 
93 /**
94  * Implement UnicodeFunctor
95  */
toReplacer() const96 UnicodeReplacer* StringReplacer::toReplacer() const {
97     return (UnicodeReplacer*) this;
98 }
99 
100 /**
101  * UnicodeReplacer API
102  */
replace(Replaceable & text,int32_t start,int32_t limit,int32_t & cursor)103 int32_t StringReplacer::replace(Replaceable& text,
104                                 int32_t start,
105                                 int32_t limit,
106                                 int32_t& cursor) {
107     int32_t outLen;
108     int32_t newStart = 0;
109 
110     // NOTE: It should be possible to _always_ run the complex
111     // processing code; just slower.  If not, then there is a bug
112     // in the complex processing code.
113 
114     // Simple (no nested replacers) Processing Code :
115     if (!isComplex) {
116         text.handleReplaceBetween(start, limit, output);
117         outLen = output.length();
118 
119         // Setup default cursor position (for cursorPos within output)
120         newStart = cursorPos;
121     }
122 
123     // Complex (nested replacers) Processing Code :
124     else {
125         /* When there are segments to be copied, use the Replaceable.copy()
126          * API in order to retain out-of-band data.  Copy everything to the
127          * end of the string, then copy them back over the key.  This preserves
128          * the integrity of indices into the key and surrounding context while
129          * generating the output text.
130          */
131         UnicodeString buf;
132         int32_t oOutput; // offset into 'output'
133         isComplex = FALSE;
134 
135         // The temporary buffer starts at tempStart, and extends
136         // to destLimit.  The start of the buffer has a single
137         // character from before the key.  This provides style
138         // data when addition characters are filled into the
139         // temporary buffer.  If there is nothing to the left, use
140         // the non-character U+FFFF, which Replaceable subclasses
141         // should treat specially as a "no-style character."
142         // destStart points to the point after the style context
143         // character, so it is tempStart+1 or tempStart+2.
144         int32_t tempStart = text.length(); // start of temp buffer
145         int32_t destStart = tempStart; // copy new text to here
146         if (start > 0) {
147             int32_t len = UTF_CHAR_LENGTH(text.char32At(start-1));
148             text.copy(start-len, start, tempStart);
149             destStart += len;
150         } else {
151             UnicodeString str((UChar) 0xFFFF);
152             text.handleReplaceBetween(tempStart, tempStart, str);
153             destStart++;
154         }
155         int32_t destLimit = destStart;
156 
157         for (oOutput=0; oOutput<output.length(); ) {
158             if (oOutput == cursorPos) {
159                 // Record the position of the cursor
160                 newStart = destLimit - destStart; // relative to start
161             }
162             UChar32 c = output.char32At(oOutput);
163             UnicodeReplacer* r = data->lookupReplacer(c);
164             if (r == NULL) {
165                 // Accumulate straight (non-segment) text.
166                 buf.append(c);
167             } else {
168                 isComplex = TRUE;
169 
170                 // Insert any accumulated straight text.
171                 if (buf.length() > 0) {
172                     text.handleReplaceBetween(destLimit, destLimit, buf);
173                     destLimit += buf.length();
174                     buf.truncate(0);
175                 }
176 
177                 // Delegate output generation to replacer object
178                 int32_t len = r->replace(text, destLimit, destLimit, cursor);
179                 destLimit += len;
180             }
181             oOutput += UTF_CHAR_LENGTH(c);
182         }
183         // Insert any accumulated straight text.
184         if (buf.length() > 0) {
185             text.handleReplaceBetween(destLimit, destLimit, buf);
186             destLimit += buf.length();
187         }
188         if (oOutput == cursorPos) {
189             // Record the position of the cursor
190             newStart = destLimit - destStart; // relative to start
191         }
192 
193         outLen = destLimit - destStart;
194 
195         // Copy new text to start, and delete it
196         text.copy(destStart, destLimit, start);
197         text.handleReplaceBetween(tempStart + outLen, destLimit + outLen, EMPTY);
198 
199         // Delete the old text (the key)
200         text.handleReplaceBetween(start + outLen, limit + outLen, EMPTY);
201     }
202 
203     if (hasCursor) {
204         // Adjust the cursor for positions outside the key.  These
205         // refer to code points rather than code units.  If cursorPos
206         // is within the output string, then use newStart, which has
207         // already been set above.
208         if (cursorPos < 0) {
209             newStart = start;
210             int32_t n = cursorPos;
211             // Outside the output string, cursorPos counts code points
212             while (n < 0 && newStart > 0) {
213                 newStart -= UTF_CHAR_LENGTH(text.char32At(newStart-1));
214                 ++n;
215             }
216             newStart += n;
217         } else if (cursorPos > output.length()) {
218             newStart = start + outLen;
219             int32_t n = cursorPos - output.length();
220             // Outside the output string, cursorPos counts code points
221             while (n > 0 && newStart < text.length()) {
222                 newStart += UTF_CHAR_LENGTH(text.char32At(newStart));
223                 --n;
224             }
225             newStart += n;
226         } else {
227             // Cursor is within output string.  It has been set up above
228             // to be relative to start.
229             newStart += start;
230         }
231 
232         cursor = newStart;
233     }
234 
235     return outLen;
236 }
237 
238 /**
239  * UnicodeReplacer API
240  */
toReplacerPattern(UnicodeString & rule,UBool escapeUnprintable) const241 UnicodeString& StringReplacer::toReplacerPattern(UnicodeString& rule,
242                                                  UBool escapeUnprintable) const {
243     rule.truncate(0);
244     UnicodeString quoteBuf;
245 
246     int32_t cursor = cursorPos;
247 
248     // Handle a cursor preceding the output
249     if (hasCursor && cursor < 0) {
250         while (cursor++ < 0) {
251             ICU_Utility::appendToRule(rule, (UChar)0x0040 /*@*/, TRUE, escapeUnprintable, quoteBuf);
252         }
253         // Fall through and append '|' below
254     }
255 
256     for (int32_t i=0; i<output.length(); ++i) {
257         if (hasCursor && i == cursor) {
258             ICU_Utility::appendToRule(rule, (UChar)0x007C /*|*/, TRUE, escapeUnprintable, quoteBuf);
259         }
260         UChar c = output.charAt(i); // Ok to use 16-bits here
261 
262         UnicodeReplacer* r = data->lookupReplacer(c);
263         if (r == NULL) {
264             ICU_Utility::appendToRule(rule, c, FALSE, escapeUnprintable, quoteBuf);
265         } else {
266             UnicodeString buf;
267             r->toReplacerPattern(buf, escapeUnprintable);
268             buf.insert(0, (UChar)0x20);
269             buf.append((UChar)0x20);
270             ICU_Utility::appendToRule(rule, buf,
271                                       TRUE, escapeUnprintable, quoteBuf);
272         }
273     }
274 
275     // Handle a cursor after the output.  Use > rather than >= because
276     // if cursor == output.length() it is at the end of the output,
277     // which is the default position, so we need not emit it.
278     if (hasCursor && cursor > output.length()) {
279         cursor -= output.length();
280         while (cursor-- > 0) {
281             ICU_Utility::appendToRule(rule, (UChar)0x0040 /*@*/, TRUE, escapeUnprintable, quoteBuf);
282         }
283         ICU_Utility::appendToRule(rule, (UChar)0x007C /*|*/, TRUE, escapeUnprintable, quoteBuf);
284     }
285     // Flush quoteBuf out to result
286     ICU_Utility::appendToRule(rule, -1,
287                               TRUE, escapeUnprintable, quoteBuf);
288 
289     return rule;
290 }
291 
292 /**
293  * Implement UnicodeReplacer
294  */
addReplacementSetTo(UnicodeSet & toUnionTo) const295 void StringReplacer::addReplacementSetTo(UnicodeSet& toUnionTo) const {
296     UChar32 ch;
297     for (int32_t i=0; i<output.length(); i+=UTF_CHAR_LENGTH(ch)) {
298     ch = output.char32At(i);
299     UnicodeReplacer* r = data->lookupReplacer(ch);
300     if (r == NULL) {
301         toUnionTo.add(ch);
302     } else {
303         r->addReplacementSetTo(toUnionTo);
304     }
305     }
306 }
307 
308 /**
309  * UnicodeFunctor API
310  */
setData(const TransliterationRuleData * d)311 void StringReplacer::setData(const TransliterationRuleData* d) {
312     data = d;
313     int32_t i = 0;
314     while (i<output.length()) {
315         UChar32 c = output.char32At(i);
316         UnicodeFunctor* f = data->lookup(c);
317         if (f != NULL) {
318             f->setData(data);
319         }
320         i += UTF_CHAR_LENGTH(c);
321     }
322 }
323 
324 U_NAMESPACE_END
325 
326 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
327 
328 //eof
329