1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 * Copyright (c) 2001-2011, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 **********************************************************************
8 * Date Name Description
9 * 11/19/2001 aliu Creation.
10 **********************************************************************
11 */
12
13 #include "unicode/utypes.h"
14
15 #if !UCONFIG_NO_TRANSLITERATION
16
17 #include "unicode/uchar.h"
18 #include "unicode/utf16.h"
19 #include "unesctrn.h"
20 #include "util.h"
21
22 #include "cmemory.h"
23
24 U_NAMESPACE_BEGIN
25
/**
 * Special character marking the end of the spec[] array.
 */
static const UChar END = 0xFFFF;

/*
 * Layout of the SPEC_* tables below.
 *
 * Each table is a sequence of records followed by END.  A record is
 *
 *     prefixLen, suffixLen, radix, minDigits, maxDigits,
 *     <prefixLen prefix code units>, <suffixLen suffix code units>
 *
 * which is the order in which handleTransliterate() reads it.  The prefix
 * and suffix entries are compared directly against the UTF-16 code units
 * of the text; the trailing comments show the character each number
 * stands for.
 */

// Unicode: "U+10FFFF" hex, min=4, max=6
static const UChar SPEC_Unicode[] = {
    2, 0, 16, 4, 6, 85/*U*/, 43/*+*/,
    END
};

// Java: "\\uFFFF" hex, min=4, max=4
static const UChar SPEC_Java[] = {
    2, 0, 16, 4, 4, 92/*\*/, 117/*u*/,
    END
};

// C: "\\uFFFF" hex, min=4, max=4; \\U0010FFFF hex, min=8, max=8
static const UChar SPEC_C[] = {
    2, 0, 16, 4, 4, 92/*\*/, 117/*u*/,
    2, 0, 16, 8, 8, 92/*\*/, 85/*U*/,
    END
};

// XML: "&#x" + hex digits + ";" (e.g. digits 10FFFF), hex, min=1, max=6
static const UChar SPEC_XML[] = {
    3, 1, 16, 1, 6, 38/*&*/, 35/*#*/, 120/*x*/, 59/*;*/,
    END
};

// XML10: "&#" + decimal digits + ";" (e.g. digits 1114111), dec, min=1, max=7
// (not really "Hex-Any")
static const UChar SPEC_XML10[] = {
    2, 1, 10, 1, 7, 38/*&*/, 35/*#*/, 59/*;*/,
    END
};

// Perl: "\\x{263A}" hex, min=1, max=6
static const UChar SPEC_Perl[] = {
    3, 1, 16, 1, 6, 92/*\*/, 120/*x*/, 123/*{*/, 125/*}*/,
    END
};

// All: Java, C, Perl, XML, XML10, Unicode.
// handleTransliterate() tries the records in the order they appear here and
// stops at the first one that matches completely.
static const UChar SPEC_Any[] = {
    2, 0, 16, 4, 6, 85/*U*/, 43/*+*/,                      // Unicode
    2, 0, 16, 4, 4, 92/*\*/, 117/*u*/,                     // Java
    2, 0, 16, 8, 8, 92/*\*/, 85/*U*/,                      // C (surrogates)
    3, 1, 16, 1, 6, 38/*&*/, 35/*#*/, 120/*x*/, 59/*;*/,   // XML
    2, 1, 10, 1, 7, 38/*&*/, 35/*#*/, 59/*;*/,             // XML10
    3, 1, 16, 1, 6, 92/*\*/, 120/*x*/, 123/*{*/, 125/*}*/, // Perl
    END
};
78
// Expands ICU's RTTI boilerplate for this class.
// NOTE(review): the macro body is not visible in this file; presumably it
// defines the class-ID accessors declared in the class header -- confirm
// against unicode/uobject.h.
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnescapeTransliterator)
80
81 static UChar* copySpec(const UChar* spec) {
82 int32_t len = 0;
83 while (spec[len] != END) {
84 ++len;
85 }
86 ++len;
87 UChar *result = (UChar *)uprv_malloc(len*sizeof(UChar));
88 // Check for memory allocation error.
89 if (result != NULL) {
90 uprv_memcpy(result, spec, (size_t)len*sizeof(result[0]));
91 }
92 return result;
93 }
94
95 /**
96 * Factory methods. Ignore the context.
97 */
_createUnicode(const UnicodeString & ID,Transliterator::Token)98 static Transliterator* _createUnicode(const UnicodeString& ID, Transliterator::Token /*context*/) {
99 return new UnescapeTransliterator(ID, SPEC_Unicode);
100 }
_createJava(const UnicodeString & ID,Transliterator::Token)101 static Transliterator* _createJava(const UnicodeString& ID, Transliterator::Token /*context*/) {
102 return new UnescapeTransliterator(ID, SPEC_Java);
103 }
_createC(const UnicodeString & ID,Transliterator::Token)104 static Transliterator* _createC(const UnicodeString& ID, Transliterator::Token /*context*/) {
105 return new UnescapeTransliterator(ID, SPEC_C);
106 }
_createXML(const UnicodeString & ID,Transliterator::Token)107 static Transliterator* _createXML(const UnicodeString& ID, Transliterator::Token /*context*/) {
108 return new UnescapeTransliterator(ID, SPEC_XML);
109 }
_createXML10(const UnicodeString & ID,Transliterator::Token)110 static Transliterator* _createXML10(const UnicodeString& ID, Transliterator::Token /*context*/) {
111 return new UnescapeTransliterator(ID, SPEC_XML10);
112 }
_createPerl(const UnicodeString & ID,Transliterator::Token)113 static Transliterator* _createPerl(const UnicodeString& ID, Transliterator::Token /*context*/) {
114 return new UnescapeTransliterator(ID, SPEC_Perl);
115 }
_createAny(const UnicodeString & ID,Transliterator::Token)116 static Transliterator* _createAny(const UnicodeString& ID, Transliterator::Token /*context*/) {
117 return new UnescapeTransliterator(ID, SPEC_Any);
118 }
119
120 /**
121 * Registers standard variants with the system. Called by
122 * Transliterator during initialization.
123 */
registerIDs()124 void UnescapeTransliterator::registerIDs() {
125 Token t = integerToken(0);
126
127 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Unicode"), _createUnicode, t);
128
129 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Java"), _createJava, t);
130
131 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/C"), _createC, t);
132
133 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/XML"), _createXML, t);
134
135 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/XML10"), _createXML10, t);
136
137 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Perl"), _createPerl, t);
138
139 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any"), _createAny, t);
140 }
141
142 /**
143 * Constructor. Takes the encoded spec array.
144 */
UnescapeTransliterator(const UnicodeString & newID,const UChar * newSpec)145 UnescapeTransliterator::UnescapeTransliterator(const UnicodeString& newID,
146 const UChar *newSpec) :
147 Transliterator(newID, NULL)
148 {
149 this->spec = copySpec(newSpec);
150 }
151
152 /**
153 * Copy constructor.
154 */
UnescapeTransliterator(const UnescapeTransliterator & o)155 UnescapeTransliterator::UnescapeTransliterator(const UnescapeTransliterator& o) :
156 Transliterator(o) {
157 this->spec = copySpec(o.spec);
158 }
159
~UnescapeTransliterator()160 UnescapeTransliterator::~UnescapeTransliterator() {
161 uprv_free(spec);
162 }
163
164 /**
165 * Transliterator API.
166 */
clone() const167 Transliterator* UnescapeTransliterator::clone() const {
168 return new UnescapeTransliterator(*this);
169 }
170
171 /**
172 * Implements {@link Transliterator#handleTransliterate}.
173 */
handleTransliterate(Replaceable & text,UTransPosition & pos,UBool isIncremental) const174 void UnescapeTransliterator::handleTransliterate(Replaceable& text, UTransPosition& pos,
175 UBool isIncremental) const {
176 int32_t start = pos.start;
177 int32_t limit = pos.limit;
178 int32_t i, j, ipat;
179
180 while (start < limit) {
181 // Loop over the forms in spec[]. Exit this loop when we
182 // match one of the specs. Exit the outer loop if a
183 // partial match is detected and isIncremental is true.
184 for (j=0, ipat=0; spec[ipat] != END; ++j) {
185
186 // Read the header
187 int32_t prefixLen = spec[ipat++];
188 int32_t suffixLen = spec[ipat++];
189 int8_t radix = (int8_t) spec[ipat++];
190 int32_t minDigits = spec[ipat++];
191 int32_t maxDigits = spec[ipat++];
192
193 // s is a copy of start that is advanced over the
194 // characters as we parse them.
195 int32_t s = start;
196 UBool match = TRUE;
197
198 for (i=0; i<prefixLen; ++i) {
199 if (s >= limit) {
200 if (i > 0) {
201 // We've already matched a character. This is
202 // a partial match, so we return if in
203 // incremental mode. In non-incremental mode,
204 // go to the next spec.
205 if (isIncremental) {
206 goto exit;
207 }
208 match = FALSE;
209 break;
210 }
211 }
212 UChar c = text.charAt(s++);
213 if (c != spec[ipat + i]) {
214 match = FALSE;
215 break;
216 }
217 }
218
219 if (match) {
220 UChar32 u = 0;
221 int32_t digitCount = 0;
222 for (;;) {
223 if (s >= limit) {
224 // Check for partial match in incremental mode.
225 if (s > start && isIncremental) {
226 goto exit;
227 }
228 break;
229 }
230 UChar32 ch = text.char32At(s);
231 int32_t digit = u_digit(ch, radix);
232 if (digit < 0) {
233 break;
234 }
235 s += U16_LENGTH(ch);
236 u = (u * radix) + digit;
237 if (++digitCount == maxDigits) {
238 break;
239 }
240 }
241
242 match = (digitCount >= minDigits);
243
244 if (match) {
245 for (i=0; i<suffixLen; ++i) {
246 if (s >= limit) {
247 // Check for partial match in incremental mode.
248 if (s > start && isIncremental) {
249 goto exit;
250 }
251 match = FALSE;
252 break;
253 }
254 UChar c = text.charAt(s++);
255 if (c != spec[ipat + prefixLen + i]) {
256 match = FALSE;
257 break;
258 }
259 }
260
261 if (match) {
262 // At this point, we have a match
263 UnicodeString str(u);
264 text.handleReplaceBetween(start, s, str);
265 limit -= s - start - str.length();
266 // The following break statement leaves the
267 // loop that is traversing the forms in
268 // spec[]. We then parse the next input
269 // character.
270 break;
271 }
272 }
273 }
274
275 ipat += prefixLen + suffixLen;
276 }
277
278 if (start < limit) {
279 start += U16_LENGTH(text.char32At(start));
280 }
281 }
282
283 exit:
284 pos.contextLimit += limit - pos.limit;
285 pos.limit = limit;
286 pos.start = start;
287 }
288
289 U_NAMESPACE_END
290
291 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
292
293 //eof
294