• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  **********************************************************************
3  *   Copyright (c) 2001-2008, International Business Machines
4  *   Corporation and others.  All Rights Reserved.
5  **********************************************************************
6  *   Date        Name        Description
7  *   11/19/2001  aliu        Creation.
8  **********************************************************************
9  */
10 
11 #include "unicode/utypes.h"
12 
13 #if !UCONFIG_NO_TRANSLITERATION
14 
15 #include "unicode/uchar.h"
16 #include "unesctrn.h"
17 #include "util.h"
18 
19 #include "cmemory.h"
20 
21 U_NAMESPACE_BEGIN
22 
23 /**
24  * Special character marking the end of the spec[] array.
25  */
26 static const UChar END = 0xFFFF;
27 
28 // Unicode: "U+10FFFF" hex, min=4, max=6
29 static const UChar SPEC_Unicode[] = {
30     2, 0, 16, 4, 6, 85/*U*/, 43/*+*/,
31     END
32 };
33 
34 // Java: "\\uFFFF" hex, min=4, max=4
35 static const UChar SPEC_Java[] = {
36     2, 0, 16, 4, 4, 92/*\*/, 117/*u*/,
37     END
38 };
39 
40 // C: "\\uFFFF" hex, min=4, max=4; \\U0010FFFF hex, min=8, max=8
41 static const UChar SPEC_C[] = {
42     2, 0, 16, 4, 4, 92/*\*/, 117/*u*/,
43     2, 0, 16, 8, 8, 92/*\*/, 85/*U*/,
44     END
45 };
46 
47 // XML: "" hex, min=1, max=6
48 static const UChar SPEC_XML[] = {
49     3, 1, 16, 1, 6, 38/*&*/, 35/*#*/, 120/*x*/, 59/*;*/,
50     END
51 };
52 
53 // XML10: "" dec, min=1, max=7 (not really "Hex-Any")
54 static const UChar SPEC_XML10[] = {
55     2, 1, 10, 1, 7, 38/*&*/, 35/*#*/, 59/*;*/,
56     END
57 };
58 
59 // Perl: "\\x{263A}" hex, min=1, max=6
60 static const UChar SPEC_Perl[] = {
61     3, 1, 16, 1, 6, 92/*\*/, 120/*x*/, 123/*{*/, 125/*}*/,
62     END
63 };
64 
65 // All: Java, C, Perl, XML, XML10, Unicode
66 static const UChar SPEC_Any[] = {
67     2, 0, 16, 4, 6, 85/*U*/, 43/*+*/,                      // Unicode
68     2, 0, 16, 4, 4, 92/*\*/, 117/*u*/,                     // Java
69     2, 0, 16, 8, 8, 92/*\*/, 85/*U*/,                      // C (surrogates)
70     3, 1, 16, 1, 6, 38/*&*/, 35/*#*/, 120/*x*/, 59/*;*/,   // XML
71     2, 1, 10, 1, 7, 38/*&*/, 35/*#*/, 59/*;*/,             // XML10
72     3, 1, 16, 1, 6, 92/*\*/, 120/*x*/, 123/*{*/, 125/*}*/, // Perl
73     END
74 };
75 
// ICU macro invocation; presumably emits the class-ID (RTTI) method
// definitions for UnescapeTransliterator -- confirm against unicode/uobject.h.
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnescapeTransliterator)
77 
78 static UChar* copySpec(const UChar* spec) {
79     int32_t len = 0;
80     while (spec[len] != END) {
81         ++len;
82     }
83     ++len;
84     UChar *result = (UChar *)uprv_malloc(len*sizeof(UChar));
85     // Check for memory allocation error.
86     if (result != NULL) {
87     	uprv_memcpy(result, spec, len*sizeof(result[0]));
88     }
89     return result;
90 }
91 
92 /**
93  * Factory methods.  Ignore the context.
94  */
_createUnicode(const UnicodeString & ID,Transliterator::Token)95 static Transliterator* _createUnicode(const UnicodeString& ID, Transliterator::Token /*context*/) {
96     return new UnescapeTransliterator(ID, SPEC_Unicode);
97 }
_createJava(const UnicodeString & ID,Transliterator::Token)98 static Transliterator* _createJava(const UnicodeString& ID, Transliterator::Token /*context*/) {
99     return new UnescapeTransliterator(ID, SPEC_Java);
100 }
_createC(const UnicodeString & ID,Transliterator::Token)101 static Transliterator* _createC(const UnicodeString& ID, Transliterator::Token /*context*/) {
102     return new UnescapeTransliterator(ID, SPEC_C);
103 }
_createXML(const UnicodeString & ID,Transliterator::Token)104 static Transliterator* _createXML(const UnicodeString& ID, Transliterator::Token /*context*/) {
105     return new UnescapeTransliterator(ID, SPEC_XML);
106 }
_createXML10(const UnicodeString & ID,Transliterator::Token)107 static Transliterator* _createXML10(const UnicodeString& ID, Transliterator::Token /*context*/) {
108     return new UnescapeTransliterator(ID, SPEC_XML10);
109 }
_createPerl(const UnicodeString & ID,Transliterator::Token)110 static Transliterator* _createPerl(const UnicodeString& ID, Transliterator::Token /*context*/) {
111     return new UnescapeTransliterator(ID, SPEC_Perl);
112 }
_createAny(const UnicodeString & ID,Transliterator::Token)113 static Transliterator* _createAny(const UnicodeString& ID, Transliterator::Token /*context*/) {
114     return new UnescapeTransliterator(ID, SPEC_Any);
115 }
116 
117 /**
118  * Registers standard variants with the system.  Called by
119  * Transliterator during initialization.
120  */
registerIDs()121 void UnescapeTransliterator::registerIDs() {
122     Token t = integerToken(0);
123 
124     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Unicode"), _createUnicode, t);
125 
126     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Java"), _createJava, t);
127 
128     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/C"), _createC, t);
129 
130     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/XML"), _createXML, t);
131 
132     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/XML10"), _createXML10, t);
133 
134     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Perl"), _createPerl, t);
135 
136     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any"), _createAny, t);
137 }
138 
139 /**
140  * Constructor.  Takes the encoded spec array.
141  */
UnescapeTransliterator(const UnicodeString & newID,const UChar * newSpec)142 UnescapeTransliterator::UnescapeTransliterator(const UnicodeString& newID,
143                                                const UChar *newSpec) :
144     Transliterator(newID, NULL)
145 {
146     this->spec = copySpec(newSpec);
147 }
148 
149 /**
150  * Copy constructor.
151  */
UnescapeTransliterator(const UnescapeTransliterator & o)152 UnescapeTransliterator::UnescapeTransliterator(const UnescapeTransliterator& o) :
153     Transliterator(o) {
154     this->spec = copySpec(o.spec);
155 }
156 
~UnescapeTransliterator()157 UnescapeTransliterator::~UnescapeTransliterator() {
158     uprv_free(spec);
159 }
160 
161 /**
162  * Transliterator API.
163  */
clone() const164 Transliterator* UnescapeTransliterator::clone() const {
165     return new UnescapeTransliterator(*this);
166 }
167 
168 /**
169  * Implements {@link Transliterator#handleTransliterate}.
170  */
handleTransliterate(Replaceable & text,UTransPosition & pos,UBool isIncremental) const171 void UnescapeTransliterator::handleTransliterate(Replaceable& text, UTransPosition& pos,
172                                                  UBool isIncremental) const {
173     int32_t start = pos.start;
174     int32_t limit = pos.limit;
175     int32_t i, j, ipat;
176 
177     while (start < limit) {
178         // Loop over the forms in spec[].  Exit this loop when we
179         // match one of the specs.  Exit the outer loop if a
180         // partial match is detected and isIncremental is true.
181         for (j=0, ipat=0; spec[ipat] != END; ++j) {
182 
183             // Read the header
184             int32_t prefixLen = spec[ipat++];
185             int32_t suffixLen = spec[ipat++];
186             int8_t  radix     = (int8_t) spec[ipat++];
187             int32_t minDigits = spec[ipat++];
188             int32_t maxDigits = spec[ipat++];
189 
190             // s is a copy of start that is advanced over the
191             // characters as we parse them.
192             int32_t s = start;
193             UBool match = TRUE;
194 
195             for (i=0; i<prefixLen; ++i) {
196                 if (s >= limit) {
197                     if (i > 0) {
198                         // We've already matched a character.  This is
199                         // a partial match, so we return if in
200                         // incremental mode.  In non-incremental mode,
201                         // go to the next spec.
202                         if (isIncremental) {
203                             goto exit;
204                         }
205                         match = FALSE;
206                         break;
207                     }
208                 }
209                 UChar c = text.charAt(s++);
210                 if (c != spec[ipat + i]) {
211                     match = FALSE;
212                     break;
213                 }
214             }
215 
216             if (match) {
217                 UChar32 u = 0;
218                 int32_t digitCount = 0;
219                 for (;;) {
220                     if (s >= limit) {
221                         // Check for partial match in incremental mode.
222                         if (s > start && isIncremental) {
223                             goto exit;
224                         }
225                         break;
226                     }
227                     UChar32 ch = text.char32At(s);
228                     int32_t digit = u_digit(ch, radix);
229                     if (digit < 0) {
230                         break;
231                     }
232                     s += UTF_CHAR_LENGTH(ch);
233                     u = (u * radix) + digit;
234                     if (++digitCount == maxDigits) {
235                         break;
236                     }
237                 }
238 
239                 match = (digitCount >= minDigits);
240 
241                 if (match) {
242                     for (i=0; i<suffixLen; ++i) {
243                         if (s >= limit) {
244                             // Check for partial match in incremental mode.
245                             if (s > start && isIncremental) {
246                                 goto exit;
247                             }
248                             match = FALSE;
249                             break;
250                         }
251                         UChar c = text.charAt(s++);
252                         if (c != spec[ipat + prefixLen + i]) {
253                             match = FALSE;
254                             break;
255                         }
256                     }
257 
258                     if (match) {
259                         // At this point, we have a match
260                         UnicodeString str(u);
261                         text.handleReplaceBetween(start, s, str);
262                         limit -= s - start - str.length();
263                         // The following break statement leaves the
264                         // loop that is traversing the forms in
265                         // spec[].  We then parse the next input
266                         // character.
267                         break;
268                     }
269                 }
270             }
271 
272             ipat += prefixLen + suffixLen;
273         }
274 
275         if (start < limit) {
276             start += UTF_CHAR_LENGTH(text.char32At(start));
277         }
278     }
279 
280   exit:
281     pos.contextLimit += limit - pos.limit;
282     pos.limit = limit;
283     pos.start = start;
284 }
285 
286 U_NAMESPACE_END
287 
288 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
289 
290 //eof
291