• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 **********************************************************************
3 *   Copyright (c) 2002-2012, International Business Machines Corporation
4 *   and others.  All Rights Reserved.
5 **********************************************************************
6 *   Date        Name        Description
7 *   01/21/2002  aliu        Creation.
8 **********************************************************************
9 */
10 
11 #include "unicode/utypes.h"
12 
13 #if !UCONFIG_NO_TRANSLITERATION
14 
15 #include "unicode/uniset.h"
16 #include "unicode/utf16.h"
17 #include "strrepl.h"
18 #include "rbt_data.h"
19 #include "util.h"
20 
21 U_NAMESPACE_BEGIN
22 
~UnicodeReplacer()23 UnicodeReplacer::~UnicodeReplacer() {}
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringReplacer)24 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringReplacer)
25 
26 /**
27  * Construct a StringReplacer that sets the emits the given output
28  * text and sets the cursor to the given position.
29  * @param theOutput text that will replace input text when the
30  * replace() method is called.  May contain stand-in characters
31  * that represent nested replacers.
32  * @param theCursorPos cursor position that will be returned by
33  * the replace() method
34  * @param theData transliterator context object that translates
35  * stand-in characters to UnicodeReplacer objects
36  */
37 StringReplacer::StringReplacer(const UnicodeString& theOutput,
38                                int32_t theCursorPos,
39                                const TransliterationRuleData* theData) {
40     output = theOutput;
41     cursorPos = theCursorPos;
42     hasCursor = TRUE;
43     data = theData;
44     isComplex = TRUE;
45 }
46 
47 /**
48  * Construct a StringReplacer that sets the emits the given output
49  * text and does not modify the cursor.
50  * @param theOutput text that will replace input text when the
51  * replace() method is called.  May contain stand-in characters
52  * that represent nested replacers.
53  * @param theData transliterator context object that translates
54  * stand-in characters to UnicodeReplacer objects
55  */
StringReplacer(const UnicodeString & theOutput,const TransliterationRuleData * theData)56 StringReplacer::StringReplacer(const UnicodeString& theOutput,
57                                const TransliterationRuleData* theData) {
58     output = theOutput;
59     cursorPos = 0;
60     hasCursor = FALSE;
61     data = theData;
62     isComplex = TRUE;
63 }
64 
65 /**
66  * Copy constructor.
67  */
StringReplacer(const StringReplacer & other)68 StringReplacer::StringReplacer(const StringReplacer& other) :
69     UnicodeFunctor(other),
70     UnicodeReplacer(other)
71 {
72     output = other.output;
73     cursorPos = other.cursorPos;
74     hasCursor = other.hasCursor;
75     data = other.data;
76     isComplex = other.isComplex;
77 }
78 
79 /**
80  * Destructor
81  */
~StringReplacer()82 StringReplacer::~StringReplacer() {
83 }
84 
85 /**
86  * Implement UnicodeFunctor
87  */
clone() const88 UnicodeFunctor* StringReplacer::clone() const {
89     return new StringReplacer(*this);
90 }
91 
92 /**
93  * Implement UnicodeFunctor
94  */
toReplacer() const95 UnicodeReplacer* StringReplacer::toReplacer() const {
96   return const_cast<StringReplacer *>(this);
97 }
98 
99 /**
100  * UnicodeReplacer API
101  */
replace(Replaceable & text,int32_t start,int32_t limit,int32_t & cursor)102 int32_t StringReplacer::replace(Replaceable& text,
103                                 int32_t start,
104                                 int32_t limit,
105                                 int32_t& cursor) {
106     int32_t outLen;
107     int32_t newStart = 0;
108 
109     // NOTE: It should be possible to _always_ run the complex
110     // processing code; just slower.  If not, then there is a bug
111     // in the complex processing code.
112 
113     // Simple (no nested replacers) Processing Code :
114     if (!isComplex) {
115         text.handleReplaceBetween(start, limit, output);
116         outLen = output.length();
117 
118         // Setup default cursor position (for cursorPos within output)
119         newStart = cursorPos;
120     }
121 
122     // Complex (nested replacers) Processing Code :
123     else {
124         /* When there are segments to be copied, use the Replaceable.copy()
125          * API in order to retain out-of-band data.  Copy everything to the
126          * end of the string, then copy them back over the key.  This preserves
127          * the integrity of indices into the key and surrounding context while
128          * generating the output text.
129          */
130         UnicodeString buf;
131         int32_t oOutput; // offset into 'output'
132         isComplex = FALSE;
133 
134         // The temporary buffer starts at tempStart, and extends
135         // to destLimit.  The start of the buffer has a single
136         // character from before the key.  This provides style
137         // data when addition characters are filled into the
138         // temporary buffer.  If there is nothing to the left, use
139         // the non-character U+FFFF, which Replaceable subclasses
140         // should treat specially as a "no-style character."
141         // destStart points to the point after the style context
142         // character, so it is tempStart+1 or tempStart+2.
143         int32_t tempStart = text.length(); // start of temp buffer
144         int32_t destStart = tempStart; // copy new text to here
145         if (start > 0) {
146             int32_t len = U16_LENGTH(text.char32At(start-1));
147             text.copy(start-len, start, tempStart);
148             destStart += len;
149         } else {
150             UnicodeString str((UChar) 0xFFFF);
151             text.handleReplaceBetween(tempStart, tempStart, str);
152             destStart++;
153         }
154         int32_t destLimit = destStart;
155 
156         for (oOutput=0; oOutput<output.length(); ) {
157             if (oOutput == cursorPos) {
158                 // Record the position of the cursor
159                 newStart = destLimit - destStart; // relative to start
160             }
161             UChar32 c = output.char32At(oOutput);
162             UnicodeReplacer* r = data->lookupReplacer(c);
163             if (r == NULL) {
164                 // Accumulate straight (non-segment) text.
165                 buf.append(c);
166             } else {
167                 isComplex = TRUE;
168 
169                 // Insert any accumulated straight text.
170                 if (buf.length() > 0) {
171                     text.handleReplaceBetween(destLimit, destLimit, buf);
172                     destLimit += buf.length();
173                     buf.truncate(0);
174                 }
175 
176                 // Delegate output generation to replacer object
177                 int32_t len = r->replace(text, destLimit, destLimit, cursor);
178                 destLimit += len;
179             }
180             oOutput += U16_LENGTH(c);
181         }
182         // Insert any accumulated straight text.
183         if (buf.length() > 0) {
184             text.handleReplaceBetween(destLimit, destLimit, buf);
185             destLimit += buf.length();
186         }
187         if (oOutput == cursorPos) {
188             // Record the position of the cursor
189             newStart = destLimit - destStart; // relative to start
190         }
191 
192         outLen = destLimit - destStart;
193 
194         // Copy new text to start, and delete it
195         text.copy(destStart, destLimit, start);
196         text.handleReplaceBetween(tempStart + outLen, destLimit + outLen, UnicodeString());
197 
198         // Delete the old text (the key)
199         text.handleReplaceBetween(start + outLen, limit + outLen, UnicodeString());
200     }
201 
202     if (hasCursor) {
203         // Adjust the cursor for positions outside the key.  These
204         // refer to code points rather than code units.  If cursorPos
205         // is within the output string, then use newStart, which has
206         // already been set above.
207         if (cursorPos < 0) {
208             newStart = start;
209             int32_t n = cursorPos;
210             // Outside the output string, cursorPos counts code points
211             while (n < 0 && newStart > 0) {
212                 newStart -= U16_LENGTH(text.char32At(newStart-1));
213                 ++n;
214             }
215             newStart += n;
216         } else if (cursorPos > output.length()) {
217             newStart = start + outLen;
218             int32_t n = cursorPos - output.length();
219             // Outside the output string, cursorPos counts code points
220             while (n > 0 && newStart < text.length()) {
221                 newStart += U16_LENGTH(text.char32At(newStart));
222                 --n;
223             }
224             newStart += n;
225         } else {
226             // Cursor is within output string.  It has been set up above
227             // to be relative to start.
228             newStart += start;
229         }
230 
231         cursor = newStart;
232     }
233 
234     return outLen;
235 }
236 
237 /**
238  * UnicodeReplacer API
239  */
toReplacerPattern(UnicodeString & rule,UBool escapeUnprintable) const240 UnicodeString& StringReplacer::toReplacerPattern(UnicodeString& rule,
241                                                  UBool escapeUnprintable) const {
242     rule.truncate(0);
243     UnicodeString quoteBuf;
244 
245     int32_t cursor = cursorPos;
246 
247     // Handle a cursor preceding the output
248     if (hasCursor && cursor < 0) {
249         while (cursor++ < 0) {
250             ICU_Utility::appendToRule(rule, (UChar)0x0040 /*@*/, TRUE, escapeUnprintable, quoteBuf);
251         }
252         // Fall through and append '|' below
253     }
254 
255     for (int32_t i=0; i<output.length(); ++i) {
256         if (hasCursor && i == cursor) {
257             ICU_Utility::appendToRule(rule, (UChar)0x007C /*|*/, TRUE, escapeUnprintable, quoteBuf);
258         }
259         UChar c = output.charAt(i); // Ok to use 16-bits here
260 
261         UnicodeReplacer* r = data->lookupReplacer(c);
262         if (r == NULL) {
263             ICU_Utility::appendToRule(rule, c, FALSE, escapeUnprintable, quoteBuf);
264         } else {
265             UnicodeString buf;
266             r->toReplacerPattern(buf, escapeUnprintable);
267             buf.insert(0, (UChar)0x20);
268             buf.append((UChar)0x20);
269             ICU_Utility::appendToRule(rule, buf,
270                                       TRUE, escapeUnprintable, quoteBuf);
271         }
272     }
273 
274     // Handle a cursor after the output.  Use > rather than >= because
275     // if cursor == output.length() it is at the end of the output,
276     // which is the default position, so we need not emit it.
277     if (hasCursor && cursor > output.length()) {
278         cursor -= output.length();
279         while (cursor-- > 0) {
280             ICU_Utility::appendToRule(rule, (UChar)0x0040 /*@*/, TRUE, escapeUnprintable, quoteBuf);
281         }
282         ICU_Utility::appendToRule(rule, (UChar)0x007C /*|*/, TRUE, escapeUnprintable, quoteBuf);
283     }
284     // Flush quoteBuf out to result
285     ICU_Utility::appendToRule(rule, -1,
286                               TRUE, escapeUnprintable, quoteBuf);
287 
288     return rule;
289 }
290 
291 /**
292  * Implement UnicodeReplacer
293  */
addReplacementSetTo(UnicodeSet & toUnionTo) const294 void StringReplacer::addReplacementSetTo(UnicodeSet& toUnionTo) const {
295     UChar32 ch;
296     for (int32_t i=0; i<output.length(); i+=U16_LENGTH(ch)) {
297     ch = output.char32At(i);
298     UnicodeReplacer* r = data->lookupReplacer(ch);
299     if (r == NULL) {
300         toUnionTo.add(ch);
301     } else {
302         r->addReplacementSetTo(toUnionTo);
303     }
304     }
305 }
306 
307 /**
308  * UnicodeFunctor API
309  */
setData(const TransliterationRuleData * d)310 void StringReplacer::setData(const TransliterationRuleData* d) {
311     data = d;
312     int32_t i = 0;
313     while (i<output.length()) {
314         UChar32 c = output.char32At(i);
315         UnicodeFunctor* f = data->lookup(c);
316         if (f != NULL) {
317             f->setData(data);
318         }
319         i += U16_LENGTH(c);
320     }
321 }
322 
323 U_NAMESPACE_END
324 
325 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
326 
327 //eof
328