1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *****************************************************************************
5 *
6 * Copyright (C) 1998-2016, International Business Machines
7 * Corporation and others. All Rights Reserved.
8 *
9 *****************************************************************************
10 *
11 * ucnv_err.c
12 * Implements error behaviour functions called by T_UConverter_{from,to}Unicode
13 *
14 *
15 * Change history:
16 *
17 * 06/29/2000 helena Major rewrite of the callback APIs.
18 */
19
20 #include "unicode/utypes.h"
21
22 #if !UCONFIG_NO_CONVERSION
23
24 #include "unicode/ucnv_err.h"
25 #include "unicode/ucnv_cb.h"
26 #include "ucnv_cnv.h"
27 #include "cmemory.h"
28 #include "unicode/ucnv.h"
29 #include "ustrfmt.h"
30
31 #define VALUE_STRING_LENGTH 48
/* Magic # 48 = 6 (max number of UChars in the escape of one byte, e.g. "&#255;" or "&#xFF;") * 8 (max number of bytes per char for any converter) */
33 #define UNICODE_PERCENT_SIGN_CODEPOINT 0x0025
34 #define UNICODE_U_CODEPOINT 0x0055
35 #define UNICODE_X_CODEPOINT 0x0058
36 #define UNICODE_RS_CODEPOINT 0x005C
37 #define UNICODE_U_LOW_CODEPOINT 0x0075
38 #define UNICODE_X_LOW_CODEPOINT 0x0078
39 #define UNICODE_AMP_CODEPOINT 0x0026
40 #define UNICODE_HASH_CODEPOINT 0x0023
41 #define UNICODE_SEMICOLON_CODEPOINT 0x003B
42 #define UNICODE_PLUS_CODEPOINT 0x002B
43 #define UNICODE_LEFT_CURLY_CODEPOINT 0x007B
44 #define UNICODE_RIGHT_CURLY_CODEPOINT 0x007D
45 #define UNICODE_SPACE_CODEPOINT 0x0020
46 #define UCNV_PRV_ESCAPE_ICU 0
47 #define UCNV_PRV_ESCAPE_C 'C'
48 #define UCNV_PRV_ESCAPE_XML_DEC 'D'
49 #define UCNV_PRV_ESCAPE_XML_HEX 'X'
50 #define UCNV_PRV_ESCAPE_JAVA 'J'
51 #define UCNV_PRV_ESCAPE_UNICODE 'U'
52 #define UCNV_PRV_ESCAPE_CSS2 'S'
53 #define UCNV_PRV_STOP_ON_ILLEGAL 'i'
54
/*
 * IS_DEFAULT_IGNORABLE_CODE_POINT(c)
 * Evaluates to non-zero when the code point c is in the hard-coded list below
 * of code points that have the Unicode default ignorable property.
 * When such a code point is found and is unmappable, the default callbacks
 * ignore it.
 *
 * To avoid a dependency on other code the list is hard coded here, so it
 * needs to be updated if the ignorable code point list ever changes.
 * For a list of the default ignorable code points, use this link:
 * http://unicode.org/cldr/utility/list-unicodeset.jsp?a=[%3ADI%3A]&g=
 *
 * This list should be kept in sync with the one in CharsetCallback.java.
 *
 * Every use of the parameter is parenthesized so that an expression argument
 * (for example "x | flag") expands with the intended precedence. The argument
 * is evaluated more than once, so it must not have side effects.
 */
#define IS_DEFAULT_IGNORABLE_CODE_POINT(c) (\
    ((c) == 0x00AD) || \
    ((c) == 0x034F) || \
    ((c) == 0x061C) || \
    ((c) == 0x115F) || \
    ((c) == 0x1160) || \
    (0x17B4 <= (c) && (c) <= 0x17B5) || \
    (0x180B <= (c) && (c) <= 0x180E) || \
    (0x200B <= (c) && (c) <= 0x200F) || \
    (0x202A <= (c) && (c) <= 0x202E) || \
    ((c) == 0x2060) || \
    (0x2066 <= (c) && (c) <= 0x2069) || \
    (0x2061 <= (c) && (c) <= 0x2064) || \
    (0x206A <= (c) && (c) <= 0x206F) || \
    ((c) == 0x3164) || \
    (0x0FE00 <= (c) && (c) <= 0x0FE0F) || \
    ((c) == 0x0FEFF) || \
    ((c) == 0x0FFA0) || \
    (0x01BCA0 <= (c) && (c) <= 0x01BCA3) || \
    (0x01D173 <= (c) && (c) <= 0x01D17A) || \
    ((c) == 0x0E0001) || \
    (0x0E0020 <= (c) && (c) <= 0x0E007F) || \
    (0x0E0100 <= (c) && (c) <= 0x0E01EF) || \
    ((c) == 0x2065) || \
    (0x0FFF0 <= (c) && (c) <= 0x0FFF8) || \
    ((c) == 0x0E0000) || \
    (0x0E0002 <= (c) && (c) <= 0x0E001F) || \
    (0x0E0080 <= (c) && (c) <= 0x0E00FF) || \
    (0x0E01F0 <= (c) && (c) <= 0x0E0FFF) \
    )
97
98
99 /*Function Pointer STOPS at the ILLEGAL_SEQUENCE */
100 U_CAPI void U_EXPORT2
UCNV_FROM_U_CALLBACK_STOP(const void * context,UConverterFromUnicodeArgs * fromUArgs,const UChar * codeUnits,int32_t length,UChar32 codePoint,UConverterCallbackReason reason,UErrorCode * err)101 UCNV_FROM_U_CALLBACK_STOP (
102 const void *context,
103 UConverterFromUnicodeArgs *fromUArgs,
104 const UChar* codeUnits,
105 int32_t length,
106 UChar32 codePoint,
107 UConverterCallbackReason reason,
108 UErrorCode * err)
109 {
110 (void)context;
111 (void)fromUArgs;
112 (void)codeUnits;
113 (void)length;
114 if (reason == UCNV_UNASSIGNED && IS_DEFAULT_IGNORABLE_CODE_POINT(codePoint))
115 {
116 /*
117 * Skip if the codepoint has unicode property of default ignorable.
118 */
119 *err = U_ZERO_ERROR;
120 }
121 /* the caller must have set the error code accordingly */
122 return;
123 }
124
125
126 /*Function Pointer STOPS at the ILLEGAL_SEQUENCE */
127 U_CAPI void U_EXPORT2
UCNV_TO_U_CALLBACK_STOP(const void * context,UConverterToUnicodeArgs * toUArgs,const char * codePoints,int32_t length,UConverterCallbackReason reason,UErrorCode * err)128 UCNV_TO_U_CALLBACK_STOP (
129 const void *context,
130 UConverterToUnicodeArgs *toUArgs,
131 const char* codePoints,
132 int32_t length,
133 UConverterCallbackReason reason,
134 UErrorCode * err)
135 {
136 /* the caller must have set the error code accordingly */
137 (void)context; (void)toUArgs; (void)codePoints; (void)length; (void)reason; (void)err;
138 return;
139 }
140
141 U_CAPI void U_EXPORT2
UCNV_FROM_U_CALLBACK_SKIP(const void * context,UConverterFromUnicodeArgs * fromUArgs,const UChar * codeUnits,int32_t length,UChar32 codePoint,UConverterCallbackReason reason,UErrorCode * err)142 UCNV_FROM_U_CALLBACK_SKIP (
143 const void *context,
144 UConverterFromUnicodeArgs *fromUArgs,
145 const UChar* codeUnits,
146 int32_t length,
147 UChar32 codePoint,
148 UConverterCallbackReason reason,
149 UErrorCode * err)
150 {
151 (void)fromUArgs;
152 (void)codeUnits;
153 (void)length;
154 if (reason <= UCNV_IRREGULAR)
155 {
156 if (reason == UCNV_UNASSIGNED && IS_DEFAULT_IGNORABLE_CODE_POINT(codePoint))
157 {
158 /*
159 * Skip if the codepoint has unicode property of default ignorable.
160 */
161 *err = U_ZERO_ERROR;
162 }
163 else if (context == NULL || (*((char*)context) == UCNV_PRV_STOP_ON_ILLEGAL && reason == UCNV_UNASSIGNED))
164 {
165 *err = U_ZERO_ERROR;
166 }
167 /* else the caller must have set the error code accordingly. */
168 }
169 /* else ignore the reset, close and clone calls. */
170 }
171
172 U_CAPI void U_EXPORT2
UCNV_FROM_U_CALLBACK_SUBSTITUTE(const void * context,UConverterFromUnicodeArgs * fromArgs,const UChar * codeUnits,int32_t length,UChar32 codePoint,UConverterCallbackReason reason,UErrorCode * err)173 UCNV_FROM_U_CALLBACK_SUBSTITUTE (
174 const void *context,
175 UConverterFromUnicodeArgs *fromArgs,
176 const UChar* codeUnits,
177 int32_t length,
178 UChar32 codePoint,
179 UConverterCallbackReason reason,
180 UErrorCode * err)
181 {
182 (void)codeUnits;
183 (void)length;
184 if (reason <= UCNV_IRREGULAR)
185 {
186 if (reason == UCNV_UNASSIGNED && IS_DEFAULT_IGNORABLE_CODE_POINT(codePoint))
187 {
188 /*
189 * Skip if the codepoint has unicode property of default ignorable.
190 */
191 *err = U_ZERO_ERROR;
192 }
193 else if (context == NULL || (*((char*)context) == UCNV_PRV_STOP_ON_ILLEGAL && reason == UCNV_UNASSIGNED))
194 {
195 *err = U_ZERO_ERROR;
196 ucnv_cbFromUWriteSub(fromArgs, 0, err);
197 }
198 /* else the caller must have set the error code accordingly. */
199 }
200 /* else ignore the reset, close and clone calls. */
201 }
202
203 /*uses uprv_itou to get a unicode escape sequence of the offensive sequence,
204 *uses a clean copy (resetted) of the converter, to convert that unicode
205 *escape sequence to the target codepage (if conversion failure happens then
206 *we revert to substituting with subchar)
207 */
208 U_CAPI void U_EXPORT2
UCNV_FROM_U_CALLBACK_ESCAPE(const void * context,UConverterFromUnicodeArgs * fromArgs,const UChar * codeUnits,int32_t length,UChar32 codePoint,UConverterCallbackReason reason,UErrorCode * err)209 UCNV_FROM_U_CALLBACK_ESCAPE (
210 const void *context,
211 UConverterFromUnicodeArgs *fromArgs,
212 const UChar *codeUnits,
213 int32_t length,
214 UChar32 codePoint,
215 UConverterCallbackReason reason,
216 UErrorCode * err)
217 {
218
219 UChar valueString[VALUE_STRING_LENGTH];
220 int32_t valueStringLength = 0;
221 int32_t i = 0;
222
223 const UChar *myValueSource = NULL;
224 UErrorCode err2 = U_ZERO_ERROR;
225 UConverterFromUCallback original = NULL;
226 const void *originalContext;
227
228 UConverterFromUCallback ignoredCallback = NULL;
229 const void *ignoredContext;
230
231 if (reason > UCNV_IRREGULAR)
232 {
233 return;
234 }
235 else if (reason == UCNV_UNASSIGNED && IS_DEFAULT_IGNORABLE_CODE_POINT(codePoint))
236 {
237 /*
238 * Skip if the codepoint has unicode property of default ignorable.
239 */
240 *err = U_ZERO_ERROR;
241 return;
242 }
243
244 ucnv_setFromUCallBack (fromArgs->converter,
245 (UConverterFromUCallback) UCNV_FROM_U_CALLBACK_SUBSTITUTE,
246 NULL,
247 &original,
248 &originalContext,
249 &err2);
250
251 if (U_FAILURE (err2))
252 {
253 *err = err2;
254 return;
255 }
256 if(context==NULL)
257 {
258 while (i < length)
259 {
260 valueString[valueStringLength++] = (UChar) UNICODE_PERCENT_SIGN_CODEPOINT; /* adding % */
261 valueString[valueStringLength++] = (UChar) UNICODE_U_CODEPOINT; /* adding U */
262 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[i++], 16, 4);
263 }
264 }
265 else
266 {
267 switch(*((char*)context))
268 {
269 case UCNV_PRV_ESCAPE_JAVA:
270 while (i < length)
271 {
272 valueString[valueStringLength++] = (UChar) UNICODE_RS_CODEPOINT; /* adding \ */
273 valueString[valueStringLength++] = (UChar) UNICODE_U_LOW_CODEPOINT; /* adding u */
274 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[i++], 16, 4);
275 }
276 break;
277
278 case UCNV_PRV_ESCAPE_C:
279 valueString[valueStringLength++] = (UChar) UNICODE_RS_CODEPOINT; /* adding \ */
280
281 if(length==2){
282 valueString[valueStringLength++] = (UChar) UNICODE_U_CODEPOINT; /* adding U */
283 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 16, 8);
284
285 }
286 else{
287 valueString[valueStringLength++] = (UChar) UNICODE_U_LOW_CODEPOINT; /* adding u */
288 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[0], 16, 4);
289 }
290 break;
291
292 case UCNV_PRV_ESCAPE_XML_DEC:
293
294 valueString[valueStringLength++] = (UChar) UNICODE_AMP_CODEPOINT; /* adding & */
295 valueString[valueStringLength++] = (UChar) UNICODE_HASH_CODEPOINT; /* adding # */
296 if(length==2){
297 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 10, 0);
298 }
299 else{
300 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[0], 10, 0);
301 }
302 valueString[valueStringLength++] = (UChar) UNICODE_SEMICOLON_CODEPOINT; /* adding ; */
303 break;
304
305 case UCNV_PRV_ESCAPE_XML_HEX:
306
307 valueString[valueStringLength++] = (UChar) UNICODE_AMP_CODEPOINT; /* adding & */
308 valueString[valueStringLength++] = (UChar) UNICODE_HASH_CODEPOINT; /* adding # */
309 valueString[valueStringLength++] = (UChar) UNICODE_X_LOW_CODEPOINT; /* adding x */
310 if(length==2){
311 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 16, 0);
312 }
313 else{
314 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[0], 16, 0);
315 }
316 valueString[valueStringLength++] = (UChar) UNICODE_SEMICOLON_CODEPOINT; /* adding ; */
317 break;
318
319 case UCNV_PRV_ESCAPE_UNICODE:
320 valueString[valueStringLength++] = (UChar) UNICODE_LEFT_CURLY_CODEPOINT; /* adding { */
321 valueString[valueStringLength++] = (UChar) UNICODE_U_CODEPOINT; /* adding U */
322 valueString[valueStringLength++] = (UChar) UNICODE_PLUS_CODEPOINT; /* adding + */
323 if (length == 2) {
324 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 16, 4);
325 } else {
326 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[0], 16, 4);
327 }
328 valueString[valueStringLength++] = (UChar) UNICODE_RIGHT_CURLY_CODEPOINT; /* adding } */
329 break;
330
331 case UCNV_PRV_ESCAPE_CSS2:
332 valueString[valueStringLength++] = (UChar) UNICODE_RS_CODEPOINT; /* adding \ */
333 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 16, 0);
334 /* Always add space character, becase the next character might be whitespace,
335 which would erroneously be considered the termination of the escape sequence. */
336 valueString[valueStringLength++] = (UChar) UNICODE_SPACE_CODEPOINT;
337 break;
338
339 default:
340 while (i < length)
341 {
342 valueString[valueStringLength++] = (UChar) UNICODE_PERCENT_SIGN_CODEPOINT; /* adding % */
343 valueString[valueStringLength++] = (UChar) UNICODE_U_CODEPOINT; /* adding U */
344 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[i++], 16, 4);
345 }
346 }
347 }
348 myValueSource = valueString;
349
350 /* reset the error */
351 *err = U_ZERO_ERROR;
352
353 ucnv_cbFromUWriteUChars(fromArgs, &myValueSource, myValueSource+valueStringLength, 0, err);
354
355 ucnv_setFromUCallBack (fromArgs->converter,
356 original,
357 originalContext,
358 &ignoredCallback,
359 &ignoredContext,
360 &err2);
361 if (U_FAILURE (err2))
362 {
363 *err = err2;
364 return;
365 }
366
367 return;
368 }
369
370
371
372 U_CAPI void U_EXPORT2
UCNV_TO_U_CALLBACK_SKIP(const void * context,UConverterToUnicodeArgs * toArgs,const char * codeUnits,int32_t length,UConverterCallbackReason reason,UErrorCode * err)373 UCNV_TO_U_CALLBACK_SKIP (
374 const void *context,
375 UConverterToUnicodeArgs *toArgs,
376 const char* codeUnits,
377 int32_t length,
378 UConverterCallbackReason reason,
379 UErrorCode * err)
380 {
381 (void)toArgs;
382 (void)codeUnits;
383 (void)length;
384 if (reason <= UCNV_IRREGULAR)
385 {
386 if (context == NULL || (*((char*)context) == UCNV_PRV_STOP_ON_ILLEGAL && reason == UCNV_UNASSIGNED))
387 {
388 *err = U_ZERO_ERROR;
389 }
390 /* else the caller must have set the error code accordingly. */
391 }
392 /* else ignore the reset, close and clone calls. */
393 }
394
395 U_CAPI void U_EXPORT2
UCNV_TO_U_CALLBACK_SUBSTITUTE(const void * context,UConverterToUnicodeArgs * toArgs,const char * codeUnits,int32_t length,UConverterCallbackReason reason,UErrorCode * err)396 UCNV_TO_U_CALLBACK_SUBSTITUTE (
397 const void *context,
398 UConverterToUnicodeArgs *toArgs,
399 const char* codeUnits,
400 int32_t length,
401 UConverterCallbackReason reason,
402 UErrorCode * err)
403 {
404 (void)codeUnits;
405 (void)length;
406 if (reason <= UCNV_IRREGULAR)
407 {
408 if (context == NULL || (*((char*)context) == UCNV_PRV_STOP_ON_ILLEGAL && reason == UCNV_UNASSIGNED))
409 {
410 *err = U_ZERO_ERROR;
411 ucnv_cbToUWriteSub(toArgs,0,err);
412 }
413 /* else the caller must have set the error code accordingly. */
414 }
415 /* else ignore the reset, close and clone calls. */
416 }
417
418 /*uses uprv_itou to get a unicode escape sequence of the offensive sequence,
419 *and uses that as the substitution sequence
420 */
421 U_CAPI void U_EXPORT2
UCNV_TO_U_CALLBACK_ESCAPE(const void * context,UConverterToUnicodeArgs * toArgs,const char * codeUnits,int32_t length,UConverterCallbackReason reason,UErrorCode * err)422 UCNV_TO_U_CALLBACK_ESCAPE (
423 const void *context,
424 UConverterToUnicodeArgs *toArgs,
425 const char* codeUnits,
426 int32_t length,
427 UConverterCallbackReason reason,
428 UErrorCode * err)
429 {
430 UChar uniValueString[VALUE_STRING_LENGTH];
431 int32_t valueStringLength = 0;
432 int32_t i = 0;
433
434 if (reason > UCNV_IRREGULAR)
435 {
436 return;
437 }
438
439 if(context==NULL)
440 {
441 while (i < length)
442 {
443 uniValueString[valueStringLength++] = (UChar) UNICODE_PERCENT_SIGN_CODEPOINT; /* adding % */
444 uniValueString[valueStringLength++] = (UChar) UNICODE_X_CODEPOINT; /* adding X */
445 valueStringLength += uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t) codeUnits[i++], 16, 2);
446 }
447 }
448 else
449 {
450 switch(*((char*)context))
451 {
452 case UCNV_PRV_ESCAPE_XML_DEC:
453 while (i < length)
454 {
455 uniValueString[valueStringLength++] = (UChar) UNICODE_AMP_CODEPOINT; /* adding & */
456 uniValueString[valueStringLength++] = (UChar) UNICODE_HASH_CODEPOINT; /* adding # */
457 valueStringLength += uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t)codeUnits[i++], 10, 0);
458 uniValueString[valueStringLength++] = (UChar) UNICODE_SEMICOLON_CODEPOINT; /* adding ; */
459 }
460 break;
461
462 case UCNV_PRV_ESCAPE_XML_HEX:
463 while (i < length)
464 {
465 uniValueString[valueStringLength++] = (UChar) UNICODE_AMP_CODEPOINT; /* adding & */
466 uniValueString[valueStringLength++] = (UChar) UNICODE_HASH_CODEPOINT; /* adding # */
467 uniValueString[valueStringLength++] = (UChar) UNICODE_X_LOW_CODEPOINT; /* adding x */
468 valueStringLength += uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t)codeUnits[i++], 16, 0);
469 uniValueString[valueStringLength++] = (UChar) UNICODE_SEMICOLON_CODEPOINT; /* adding ; */
470 }
471 break;
472 case UCNV_PRV_ESCAPE_C:
473 while (i < length)
474 {
475 uniValueString[valueStringLength++] = (UChar) UNICODE_RS_CODEPOINT; /* adding \ */
476 uniValueString[valueStringLength++] = (UChar) UNICODE_X_LOW_CODEPOINT; /* adding x */
477 valueStringLength += uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t)codeUnits[i++], 16, 2);
478 }
479 break;
480 default:
481 while (i < length)
482 {
483 uniValueString[valueStringLength++] = (UChar) UNICODE_PERCENT_SIGN_CODEPOINT; /* adding % */
484 uniValueString[valueStringLength++] = (UChar) UNICODE_X_CODEPOINT; /* adding X */
485 uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t) codeUnits[i++], 16, 2);
486 valueStringLength += 2;
487 }
488 }
489 }
490 /* reset the error */
491 *err = U_ZERO_ERROR;
492
493 ucnv_cbToUWriteUChars(toArgs, uniValueString, valueStringLength, 0, err);
494 }
495
496 #endif
497