1 /*
2 **********************************************************************
3 * Copyright (c) 2001-2008, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 * Date Name Description
7 * 11/19/2001 aliu Creation.
8 **********************************************************************
9 */
10
11 #include "unicode/utypes.h"
12
13 #if !UCONFIG_NO_TRANSLITERATION
14
15 #include "unicode/uchar.h"
16 #include "unesctrn.h"
17 #include "util.h"
18
19 #include "cmemory.h"
20
21 U_NAMESPACE_BEGIN
22
/**
 * Special character marking the end of the spec[] array.
 *
 * copySpec() scans a table code unit by code unit until it reads this
 * value, and handleTransliterate() stops when it finds it where the next
 * record header would begin, so no header field or prefix/suffix code
 * unit in a table may be equal to it.
 */
static const UChar END = 0xFFFF;
27
28 // Unicode: "U+10FFFF" hex, min=4, max=6
29 static const UChar SPEC_Unicode[] = {
30 2, 0, 16, 4, 6, 85/*U*/, 43/*+*/,
31 END
32 };
33
34 // Java: "\\uFFFF" hex, min=4, max=4
35 static const UChar SPEC_Java[] = {
36 2, 0, 16, 4, 4, 92/*\*/, 117/*u*/,
37 END
38 };
39
40 // C: "\\uFFFF" hex, min=4, max=4; \\U0010FFFF hex, min=8, max=8
41 static const UChar SPEC_C[] = {
42 2, 0, 16, 4, 4, 92/*\*/, 117/*u*/,
43 2, 0, 16, 8, 8, 92/*\*/, 85/*U*/,
44 END
45 };
46
47 // XML: "" hex, min=1, max=6
48 static const UChar SPEC_XML[] = {
49 3, 1, 16, 1, 6, 38/*&*/, 35/*#*/, 120/*x*/, 59/*;*/,
50 END
51 };
52
53 // XML10: "" dec, min=1, max=7 (not really "Hex-Any")
54 static const UChar SPEC_XML10[] = {
55 2, 1, 10, 1, 7, 38/*&*/, 35/*#*/, 59/*;*/,
56 END
57 };
58
59 // Perl: "\\x{263A}" hex, min=1, max=6
60 static const UChar SPEC_Perl[] = {
61 3, 1, 16, 1, 6, 92/*\*/, 120/*x*/, 123/*{*/, 125/*}*/,
62 END
63 };
64
65 // All: Java, C, Perl, XML, XML10, Unicode
66 static const UChar SPEC_Any[] = {
67 2, 0, 16, 4, 6, 85/*U*/, 43/*+*/, // Unicode
68 2, 0, 16, 4, 4, 92/*\*/, 117/*u*/, // Java
69 2, 0, 16, 8, 8, 92/*\*/, 85/*U*/, // C (surrogates)
70 3, 1, 16, 1, 6, 38/*&*/, 35/*#*/, 120/*x*/, 59/*;*/, // XML
71 2, 1, 10, 1, 7, 38/*&*/, 35/*#*/, 59/*;*/, // XML10
72 3, 1, 16, 1, 6, 92/*\*/, 120/*x*/, 123/*{*/, 125/*}*/, // Perl
73 END
74 };
75
// Expands the RTTI implementation boilerplate for UnescapeTransliterator.
// NOTE(review): the macro is defined outside this file (presumably in
// ICU's uobject.h, where it would supply the class-ID accessors) --
// confirm against that header.
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnescapeTransliterator)
77
78 static UChar* copySpec(const UChar* spec) {
79 int32_t len = 0;
80 while (spec[len] != END) {
81 ++len;
82 }
83 ++len;
84 UChar *result = (UChar *)uprv_malloc(len*sizeof(UChar));
85 // Check for memory allocation error.
86 if (result != NULL) {
87 uprv_memcpy(result, spec, len*sizeof(result[0]));
88 }
89 return result;
90 }
91
92 /**
93 * Factory methods. Ignore the context.
94 */
_createUnicode(const UnicodeString & ID,Transliterator::Token)95 static Transliterator* _createUnicode(const UnicodeString& ID, Transliterator::Token /*context*/) {
96 return new UnescapeTransliterator(ID, SPEC_Unicode);
97 }
_createJava(const UnicodeString & ID,Transliterator::Token)98 static Transliterator* _createJava(const UnicodeString& ID, Transliterator::Token /*context*/) {
99 return new UnescapeTransliterator(ID, SPEC_Java);
100 }
_createC(const UnicodeString & ID,Transliterator::Token)101 static Transliterator* _createC(const UnicodeString& ID, Transliterator::Token /*context*/) {
102 return new UnescapeTransliterator(ID, SPEC_C);
103 }
_createXML(const UnicodeString & ID,Transliterator::Token)104 static Transliterator* _createXML(const UnicodeString& ID, Transliterator::Token /*context*/) {
105 return new UnescapeTransliterator(ID, SPEC_XML);
106 }
_createXML10(const UnicodeString & ID,Transliterator::Token)107 static Transliterator* _createXML10(const UnicodeString& ID, Transliterator::Token /*context*/) {
108 return new UnescapeTransliterator(ID, SPEC_XML10);
109 }
_createPerl(const UnicodeString & ID,Transliterator::Token)110 static Transliterator* _createPerl(const UnicodeString& ID, Transliterator::Token /*context*/) {
111 return new UnescapeTransliterator(ID, SPEC_Perl);
112 }
_createAny(const UnicodeString & ID,Transliterator::Token)113 static Transliterator* _createAny(const UnicodeString& ID, Transliterator::Token /*context*/) {
114 return new UnescapeTransliterator(ID, SPEC_Any);
115 }
116
117 /**
118 * Registers standard variants with the system. Called by
119 * Transliterator during initialization.
120 */
registerIDs()121 void UnescapeTransliterator::registerIDs() {
122 Token t = integerToken(0);
123
124 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Unicode"), _createUnicode, t);
125
126 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Java"), _createJava, t);
127
128 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/C"), _createC, t);
129
130 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/XML"), _createXML, t);
131
132 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/XML10"), _createXML10, t);
133
134 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Perl"), _createPerl, t);
135
136 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any"), _createAny, t);
137 }
138
139 /**
140 * Constructor. Takes the encoded spec array.
141 */
UnescapeTransliterator(const UnicodeString & newID,const UChar * newSpec)142 UnescapeTransliterator::UnescapeTransliterator(const UnicodeString& newID,
143 const UChar *newSpec) :
144 Transliterator(newID, NULL)
145 {
146 this->spec = copySpec(newSpec);
147 }
148
149 /**
150 * Copy constructor.
151 */
UnescapeTransliterator(const UnescapeTransliterator & o)152 UnescapeTransliterator::UnescapeTransliterator(const UnescapeTransliterator& o) :
153 Transliterator(o) {
154 this->spec = copySpec(o.spec);
155 }
156
~UnescapeTransliterator()157 UnescapeTransliterator::~UnescapeTransliterator() {
158 uprv_free(spec);
159 }
160
161 /**
162 * Transliterator API.
163 */
clone() const164 Transliterator* UnescapeTransliterator::clone() const {
165 return new UnescapeTransliterator(*this);
166 }
167
168 /**
169 * Implements {@link Transliterator#handleTransliterate}.
170 */
handleTransliterate(Replaceable & text,UTransPosition & pos,UBool isIncremental) const171 void UnescapeTransliterator::handleTransliterate(Replaceable& text, UTransPosition& pos,
172 UBool isIncremental) const {
173 int32_t start = pos.start;
174 int32_t limit = pos.limit;
175 int32_t i, j, ipat;
176
177 while (start < limit) {
178 // Loop over the forms in spec[]. Exit this loop when we
179 // match one of the specs. Exit the outer loop if a
180 // partial match is detected and isIncremental is true.
181 for (j=0, ipat=0; spec[ipat] != END; ++j) {
182
183 // Read the header
184 int32_t prefixLen = spec[ipat++];
185 int32_t suffixLen = spec[ipat++];
186 int8_t radix = (int8_t) spec[ipat++];
187 int32_t minDigits = spec[ipat++];
188 int32_t maxDigits = spec[ipat++];
189
190 // s is a copy of start that is advanced over the
191 // characters as we parse them.
192 int32_t s = start;
193 UBool match = TRUE;
194
195 for (i=0; i<prefixLen; ++i) {
196 if (s >= limit) {
197 if (i > 0) {
198 // We've already matched a character. This is
199 // a partial match, so we return if in
200 // incremental mode. In non-incremental mode,
201 // go to the next spec.
202 if (isIncremental) {
203 goto exit;
204 }
205 match = FALSE;
206 break;
207 }
208 }
209 UChar c = text.charAt(s++);
210 if (c != spec[ipat + i]) {
211 match = FALSE;
212 break;
213 }
214 }
215
216 if (match) {
217 UChar32 u = 0;
218 int32_t digitCount = 0;
219 for (;;) {
220 if (s >= limit) {
221 // Check for partial match in incremental mode.
222 if (s > start && isIncremental) {
223 goto exit;
224 }
225 break;
226 }
227 UChar32 ch = text.char32At(s);
228 int32_t digit = u_digit(ch, radix);
229 if (digit < 0) {
230 break;
231 }
232 s += UTF_CHAR_LENGTH(ch);
233 u = (u * radix) + digit;
234 if (++digitCount == maxDigits) {
235 break;
236 }
237 }
238
239 match = (digitCount >= minDigits);
240
241 if (match) {
242 for (i=0; i<suffixLen; ++i) {
243 if (s >= limit) {
244 // Check for partial match in incremental mode.
245 if (s > start && isIncremental) {
246 goto exit;
247 }
248 match = FALSE;
249 break;
250 }
251 UChar c = text.charAt(s++);
252 if (c != spec[ipat + prefixLen + i]) {
253 match = FALSE;
254 break;
255 }
256 }
257
258 if (match) {
259 // At this point, we have a match
260 UnicodeString str(u);
261 text.handleReplaceBetween(start, s, str);
262 limit -= s - start - str.length();
263 // The following break statement leaves the
264 // loop that is traversing the forms in
265 // spec[]. We then parse the next input
266 // character.
267 break;
268 }
269 }
270 }
271
272 ipat += prefixLen + suffixLen;
273 }
274
275 if (start < limit) {
276 start += UTF_CHAR_LENGTH(text.char32At(start));
277 }
278 }
279
280 exit:
281 pos.contextLimit += limit - pos.limit;
282 pos.limit = limit;
283 pos.start = start;
284 }
285
286 U_NAMESPACE_END
287
288 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
289
290 //eof
291