1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *****************************************************************************
5 *
6 * Copyright (C) 1998-2016, International Business Machines
7 * Corporation and others. All Rights Reserved.
8 *
9 *****************************************************************************
10 *
11 * ucnv_err.c
12 * Implements error behaviour functions called by T_UConverter_{from,to}Unicode
13 *
14 *
15 * Change history:
16 *
17 * 06/29/2000 helena Major rewrite of the callback APIs.
18 */
19
20 #include "unicode/utypes.h"
21
22 #if !UCONFIG_NO_CONVERSION
23
24 #include "unicode/ucnv_err.h"
25 #include "unicode/ucnv_cb.h"
26 #include "ucnv_cnv.h"
27 #include "cmemory.h"
28 #include "unicode/ucnv.h"
29 #include "ustrfmt.h"
30
/*
 * Capacity, in UChars, of the scratch buffer that holds one escape sequence.
 * The longest form built in this file is the XML escape of a byte sequence:
 * up to 6 UChars per byte ("&#255;" or "&#xFF;") and, per the original sizing
 * note, at most 8 bytes per character for any converter: 6 * 8 = 48.
 */
#define VALUE_STRING_LENGTH     48

/* Code points of the ASCII characters used to assemble escape sequences. */
#define UNICODE_SPACE_CODEPOINT         0x0020  /* ' '  */
#define UNICODE_HASH_CODEPOINT          0x0023  /* '#'  */
#define UNICODE_PERCENT_SIGN_CODEPOINT  0x0025  /* '%'  */
#define UNICODE_AMP_CODEPOINT           0x0026  /* '&'  */
#define UNICODE_PLUS_CODEPOINT          0x002B  /* '+'  */
#define UNICODE_SEMICOLON_CODEPOINT     0x003B  /* ';'  */
#define UNICODE_U_CODEPOINT             0x0055  /* 'U'  */
#define UNICODE_X_CODEPOINT             0x0058  /* 'X'  */
#define UNICODE_RS_CODEPOINT            0x005C  /* '\\' */
#define UNICODE_U_LOW_CODEPOINT         0x0075  /* 'u'  */
#define UNICODE_X_LOW_CODEPOINT         0x0078  /* 'x'  */
#define UNICODE_LEFT_CURLY_CODEPOINT    0x007B  /* '{'  */
#define UNICODE_RIGHT_CURLY_CODEPOINT   0x007D  /* '}'  */

/*
 * Option letters: the callbacks below inspect the first char of a non-NULL
 * context to select their behaviour.
 */
#define UCNV_PRV_ESCAPE_ICU      0      /* default ICU-style escape          */
#define UCNV_PRV_ESCAPE_C        'C'    /* C-style escape                    */
#define UCNV_PRV_ESCAPE_XML_DEC  'D'    /* XML decimal character reference   */
#define UCNV_PRV_ESCAPE_XML_HEX  'X'    /* XML hexadecimal character reference */
#define UCNV_PRV_ESCAPE_JAVA     'J'    /* Java-style \uXXXX escape          */
#define UCNV_PRV_ESCAPE_UNICODE  'U'    /* {U+XXXX} escape                   */
#define UCNV_PRV_ESCAPE_CSS2     'S'    /* CSS2-style escape                 */
#define UCNV_PRV_STOP_ON_ILLEGAL 'i'    /* skip/substitute handle only UCNV_UNASSIGNED */
54
/*
 * IS_DEFAULT_IGNORABLE_CODE_POINT
 * Evaluates to non-zero if the code point c has the Default_Ignorable_Code_Point
 * Unicode property, and to zero otherwise.
 * The list is hard coded here to avoid a dependency on other code, so it needs
 * to be updated whenever the set of default ignorable code points changes.
 * When an ignorable code point is found and is unmappable, the default callbacks
 * ignore it.
 * For a list of the default ignorable code points, use this link:
 * http://unicode.org/cldr/utility/list-unicodeset.jsp?a=[%3ADI%3A]&g=
 *
 * This list should be kept in sync with the one in CharsetCallback.java.
 *
 * Note: the argument is fully parenthesized, so any integer expression may be
 * passed, but it is evaluated more than once; do not pass an expression with
 * side effects.
 */
#define IS_DEFAULT_IGNORABLE_CODE_POINT(c) (\
    ((c) == 0x00AD) || \
    ((c) == 0x034F) || \
    ((c) == 0x061C) || \
    ((c) == 0x115F) || \
    ((c) == 0x1160) || \
    (0x17B4 <= (c) && (c) <= 0x17B5) || \
    (0x180B <= (c) && (c) <= 0x180E) || \
    (0x200B <= (c) && (c) <= 0x200F) || \
    (0x202A <= (c) && (c) <= 0x202E) || \
    ((c) == 0x2060) || \
    (0x2066 <= (c) && (c) <= 0x2069) || \
    (0x2061 <= (c) && (c) <= 0x2064) || \
    (0x206A <= (c) && (c) <= 0x206F) || \
    ((c) == 0x3164) || \
    (0x0FE00 <= (c) && (c) <= 0x0FE0F) || \
    ((c) == 0x0FEFF) || \
    ((c) == 0x0FFA0) || \
    (0x01BCA0 <= (c) && (c) <= 0x01BCA3) || \
    (0x01D173 <= (c) && (c) <= 0x01D17A) || \
    ((c) == 0x0E0001) || \
    (0x0E0020 <= (c) && (c) <= 0x0E007F) || \
    (0x0E0100 <= (c) && (c) <= 0x0E01EF) || \
    ((c) == 0x2065) || \
    (0x0FFF0 <= (c) && (c) <= 0x0FFF8) || \
    ((c) == 0x0E0000) || \
    (0x0E0002 <= (c) && (c) <= 0x0E001F) || \
    (0x0E0080 <= (c) && (c) <= 0x0E00FF) || \
    (0x0E01F0 <= (c) && (c) <= 0x0E0FFF) \
    )
97
98
99 /*Function Pointer STOPS at the ILLEGAL_SEQUENCE */
100 U_CAPI void U_EXPORT2
UCNV_FROM_U_CALLBACK_STOP(const void * context,UConverterFromUnicodeArgs * fromUArgs,const UChar * codeUnits,int32_t length,UChar32 codePoint,UConverterCallbackReason reason,UErrorCode * err)101 UCNV_FROM_U_CALLBACK_STOP (
102 const void *context,
103 UConverterFromUnicodeArgs *fromUArgs,
104 const UChar* codeUnits,
105 int32_t length,
106 UChar32 codePoint,
107 UConverterCallbackReason reason,
108 UErrorCode * err)
109 {
110 if (reason == UCNV_UNASSIGNED && IS_DEFAULT_IGNORABLE_CODE_POINT(codePoint))
111 {
112 /*
113 * Skip if the codepoint has unicode property of default ignorable.
114 */
115 *err = U_ZERO_ERROR;
116 }
117 /* the caller must have set the error code accordingly */
118 return;
119 }
120
121
122 /*Function Pointer STOPS at the ILLEGAL_SEQUENCE */
123 U_CAPI void U_EXPORT2
UCNV_TO_U_CALLBACK_STOP(const void * context,UConverterToUnicodeArgs * toUArgs,const char * codePoints,int32_t length,UConverterCallbackReason reason,UErrorCode * err)124 UCNV_TO_U_CALLBACK_STOP (
125 const void *context,
126 UConverterToUnicodeArgs *toUArgs,
127 const char* codePoints,
128 int32_t length,
129 UConverterCallbackReason reason,
130 UErrorCode * err)
131 {
132 /* the caller must have set the error code accordingly */
133 return;
134 }
135
136 U_CAPI void U_EXPORT2
UCNV_FROM_U_CALLBACK_SKIP(const void * context,UConverterFromUnicodeArgs * fromUArgs,const UChar * codeUnits,int32_t length,UChar32 codePoint,UConverterCallbackReason reason,UErrorCode * err)137 UCNV_FROM_U_CALLBACK_SKIP (
138 const void *context,
139 UConverterFromUnicodeArgs *fromUArgs,
140 const UChar* codeUnits,
141 int32_t length,
142 UChar32 codePoint,
143 UConverterCallbackReason reason,
144 UErrorCode * err)
145 {
146 if (reason <= UCNV_IRREGULAR)
147 {
148 if (reason == UCNV_UNASSIGNED && IS_DEFAULT_IGNORABLE_CODE_POINT(codePoint))
149 {
150 /*
151 * Skip if the codepoint has unicode property of default ignorable.
152 */
153 *err = U_ZERO_ERROR;
154 }
155 else if (context == NULL || (*((char*)context) == UCNV_PRV_STOP_ON_ILLEGAL && reason == UCNV_UNASSIGNED))
156 {
157 *err = U_ZERO_ERROR;
158 }
159 /* else the caller must have set the error code accordingly. */
160 }
161 /* else ignore the reset, close and clone calls. */
162 }
163
164 U_CAPI void U_EXPORT2
UCNV_FROM_U_CALLBACK_SUBSTITUTE(const void * context,UConverterFromUnicodeArgs * fromArgs,const UChar * codeUnits,int32_t length,UChar32 codePoint,UConverterCallbackReason reason,UErrorCode * err)165 UCNV_FROM_U_CALLBACK_SUBSTITUTE (
166 const void *context,
167 UConverterFromUnicodeArgs *fromArgs,
168 const UChar* codeUnits,
169 int32_t length,
170 UChar32 codePoint,
171 UConverterCallbackReason reason,
172 UErrorCode * err)
173 {
174 if (reason <= UCNV_IRREGULAR)
175 {
176 if (reason == UCNV_UNASSIGNED && IS_DEFAULT_IGNORABLE_CODE_POINT(codePoint))
177 {
178 /*
179 * Skip if the codepoint has unicode property of default ignorable.
180 */
181 *err = U_ZERO_ERROR;
182 }
183 else if (context == NULL || (*((char*)context) == UCNV_PRV_STOP_ON_ILLEGAL && reason == UCNV_UNASSIGNED))
184 {
185 *err = U_ZERO_ERROR;
186 ucnv_cbFromUWriteSub(fromArgs, 0, err);
187 }
188 /* else the caller must have set the error code accordingly. */
189 }
190 /* else ignore the reset, close and clone calls. */
191 }
192
193 /*uses uprv_itou to get a unicode escape sequence of the offensive sequence,
194 *uses a clean copy (resetted) of the converter, to convert that unicode
195 *escape sequence to the target codepage (if conversion failure happens then
196 *we revert to substituting with subchar)
197 */
198 U_CAPI void U_EXPORT2
UCNV_FROM_U_CALLBACK_ESCAPE(const void * context,UConverterFromUnicodeArgs * fromArgs,const UChar * codeUnits,int32_t length,UChar32 codePoint,UConverterCallbackReason reason,UErrorCode * err)199 UCNV_FROM_U_CALLBACK_ESCAPE (
200 const void *context,
201 UConverterFromUnicodeArgs *fromArgs,
202 const UChar *codeUnits,
203 int32_t length,
204 UChar32 codePoint,
205 UConverterCallbackReason reason,
206 UErrorCode * err)
207 {
208
209 UChar valueString[VALUE_STRING_LENGTH];
210 int32_t valueStringLength = 0;
211 int32_t i = 0;
212
213 const UChar *myValueSource = NULL;
214 UErrorCode err2 = U_ZERO_ERROR;
215 UConverterFromUCallback original = NULL;
216 const void *originalContext;
217
218 UConverterFromUCallback ignoredCallback = NULL;
219 const void *ignoredContext;
220
221 if (reason > UCNV_IRREGULAR)
222 {
223 return;
224 }
225 else if (reason == UCNV_UNASSIGNED && IS_DEFAULT_IGNORABLE_CODE_POINT(codePoint))
226 {
227 /*
228 * Skip if the codepoint has unicode property of default ignorable.
229 */
230 *err = U_ZERO_ERROR;
231 return;
232 }
233
234 ucnv_setFromUCallBack (fromArgs->converter,
235 (UConverterFromUCallback) UCNV_FROM_U_CALLBACK_SUBSTITUTE,
236 NULL,
237 &original,
238 &originalContext,
239 &err2);
240
241 if (U_FAILURE (err2))
242 {
243 *err = err2;
244 return;
245 }
246 if(context==NULL)
247 {
248 while (i < length)
249 {
250 valueString[valueStringLength++] = (UChar) UNICODE_PERCENT_SIGN_CODEPOINT; /* adding % */
251 valueString[valueStringLength++] = (UChar) UNICODE_U_CODEPOINT; /* adding U */
252 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[i++], 16, 4);
253 }
254 }
255 else
256 {
257 switch(*((char*)context))
258 {
259 case UCNV_PRV_ESCAPE_JAVA:
260 while (i < length)
261 {
262 valueString[valueStringLength++] = (UChar) UNICODE_RS_CODEPOINT; /* adding \ */
263 valueString[valueStringLength++] = (UChar) UNICODE_U_LOW_CODEPOINT; /* adding u */
264 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[i++], 16, 4);
265 }
266 break;
267
268 case UCNV_PRV_ESCAPE_C:
269 valueString[valueStringLength++] = (UChar) UNICODE_RS_CODEPOINT; /* adding \ */
270
271 if(length==2){
272 valueString[valueStringLength++] = (UChar) UNICODE_U_CODEPOINT; /* adding U */
273 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 16, 8);
274
275 }
276 else{
277 valueString[valueStringLength++] = (UChar) UNICODE_U_LOW_CODEPOINT; /* adding u */
278 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[0], 16, 4);
279 }
280 break;
281
282 case UCNV_PRV_ESCAPE_XML_DEC:
283
284 valueString[valueStringLength++] = (UChar) UNICODE_AMP_CODEPOINT; /* adding & */
285 valueString[valueStringLength++] = (UChar) UNICODE_HASH_CODEPOINT; /* adding # */
286 if(length==2){
287 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 10, 0);
288 }
289 else{
290 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[0], 10, 0);
291 }
292 valueString[valueStringLength++] = (UChar) UNICODE_SEMICOLON_CODEPOINT; /* adding ; */
293 break;
294
295 case UCNV_PRV_ESCAPE_XML_HEX:
296
297 valueString[valueStringLength++] = (UChar) UNICODE_AMP_CODEPOINT; /* adding & */
298 valueString[valueStringLength++] = (UChar) UNICODE_HASH_CODEPOINT; /* adding # */
299 valueString[valueStringLength++] = (UChar) UNICODE_X_LOW_CODEPOINT; /* adding x */
300 if(length==2){
301 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 16, 0);
302 }
303 else{
304 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[0], 16, 0);
305 }
306 valueString[valueStringLength++] = (UChar) UNICODE_SEMICOLON_CODEPOINT; /* adding ; */
307 break;
308
309 case UCNV_PRV_ESCAPE_UNICODE:
310 valueString[valueStringLength++] = (UChar) UNICODE_LEFT_CURLY_CODEPOINT; /* adding { */
311 valueString[valueStringLength++] = (UChar) UNICODE_U_CODEPOINT; /* adding U */
312 valueString[valueStringLength++] = (UChar) UNICODE_PLUS_CODEPOINT; /* adding + */
313 if (length == 2) {
314 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 16, 4);
315 } else {
316 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[0], 16, 4);
317 }
318 valueString[valueStringLength++] = (UChar) UNICODE_RIGHT_CURLY_CODEPOINT; /* adding } */
319 break;
320
321 case UCNV_PRV_ESCAPE_CSS2:
322 valueString[valueStringLength++] = (UChar) UNICODE_RS_CODEPOINT; /* adding \ */
323 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 16, 0);
324 /* Always add space character, becase the next character might be whitespace,
325 which would erroneously be considered the termination of the escape sequence. */
326 valueString[valueStringLength++] = (UChar) UNICODE_SPACE_CODEPOINT;
327 break;
328
329 default:
330 while (i < length)
331 {
332 valueString[valueStringLength++] = (UChar) UNICODE_PERCENT_SIGN_CODEPOINT; /* adding % */
333 valueString[valueStringLength++] = (UChar) UNICODE_U_CODEPOINT; /* adding U */
334 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[i++], 16, 4);
335 }
336 }
337 }
338 myValueSource = valueString;
339
340 /* reset the error */
341 *err = U_ZERO_ERROR;
342
343 ucnv_cbFromUWriteUChars(fromArgs, &myValueSource, myValueSource+valueStringLength, 0, err);
344
345 ucnv_setFromUCallBack (fromArgs->converter,
346 original,
347 originalContext,
348 &ignoredCallback,
349 &ignoredContext,
350 &err2);
351 if (U_FAILURE (err2))
352 {
353 *err = err2;
354 return;
355 }
356
357 return;
358 }
359
360
361
362 U_CAPI void U_EXPORT2
UCNV_TO_U_CALLBACK_SKIP(const void * context,UConverterToUnicodeArgs * toArgs,const char * codeUnits,int32_t length,UConverterCallbackReason reason,UErrorCode * err)363 UCNV_TO_U_CALLBACK_SKIP (
364 const void *context,
365 UConverterToUnicodeArgs *toArgs,
366 const char* codeUnits,
367 int32_t length,
368 UConverterCallbackReason reason,
369 UErrorCode * err)
370 {
371 if (reason <= UCNV_IRREGULAR)
372 {
373 if (context == NULL || (*((char*)context) == UCNV_PRV_STOP_ON_ILLEGAL && reason == UCNV_UNASSIGNED))
374 {
375 *err = U_ZERO_ERROR;
376 }
377 /* else the caller must have set the error code accordingly. */
378 }
379 /* else ignore the reset, close and clone calls. */
380 }
381
382 U_CAPI void U_EXPORT2
UCNV_TO_U_CALLBACK_SUBSTITUTE(const void * context,UConverterToUnicodeArgs * toArgs,const char * codeUnits,int32_t length,UConverterCallbackReason reason,UErrorCode * err)383 UCNV_TO_U_CALLBACK_SUBSTITUTE (
384 const void *context,
385 UConverterToUnicodeArgs *toArgs,
386 const char* codeUnits,
387 int32_t length,
388 UConverterCallbackReason reason,
389 UErrorCode * err)
390 {
391 if (reason <= UCNV_IRREGULAR)
392 {
393 if (context == NULL || (*((char*)context) == UCNV_PRV_STOP_ON_ILLEGAL && reason == UCNV_UNASSIGNED))
394 {
395 *err = U_ZERO_ERROR;
396 ucnv_cbToUWriteSub(toArgs,0,err);
397 }
398 /* else the caller must have set the error code accordingly. */
399 }
400 /* else ignore the reset, close and clone calls. */
401 }
402
403 /*uses uprv_itou to get a unicode escape sequence of the offensive sequence,
404 *and uses that as the substitution sequence
405 */
406 U_CAPI void U_EXPORT2
UCNV_TO_U_CALLBACK_ESCAPE(const void * context,UConverterToUnicodeArgs * toArgs,const char * codeUnits,int32_t length,UConverterCallbackReason reason,UErrorCode * err)407 UCNV_TO_U_CALLBACK_ESCAPE (
408 const void *context,
409 UConverterToUnicodeArgs *toArgs,
410 const char* codeUnits,
411 int32_t length,
412 UConverterCallbackReason reason,
413 UErrorCode * err)
414 {
415 UChar uniValueString[VALUE_STRING_LENGTH];
416 int32_t valueStringLength = 0;
417 int32_t i = 0;
418
419 if (reason > UCNV_IRREGULAR)
420 {
421 return;
422 }
423
424 if(context==NULL)
425 {
426 while (i < length)
427 {
428 uniValueString[valueStringLength++] = (UChar) UNICODE_PERCENT_SIGN_CODEPOINT; /* adding % */
429 uniValueString[valueStringLength++] = (UChar) UNICODE_X_CODEPOINT; /* adding X */
430 valueStringLength += uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t) codeUnits[i++], 16, 2);
431 }
432 }
433 else
434 {
435 switch(*((char*)context))
436 {
437 case UCNV_PRV_ESCAPE_XML_DEC:
438 while (i < length)
439 {
440 uniValueString[valueStringLength++] = (UChar) UNICODE_AMP_CODEPOINT; /* adding & */
441 uniValueString[valueStringLength++] = (UChar) UNICODE_HASH_CODEPOINT; /* adding # */
442 valueStringLength += uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t)codeUnits[i++], 10, 0);
443 uniValueString[valueStringLength++] = (UChar) UNICODE_SEMICOLON_CODEPOINT; /* adding ; */
444 }
445 break;
446
447 case UCNV_PRV_ESCAPE_XML_HEX:
448 while (i < length)
449 {
450 uniValueString[valueStringLength++] = (UChar) UNICODE_AMP_CODEPOINT; /* adding & */
451 uniValueString[valueStringLength++] = (UChar) UNICODE_HASH_CODEPOINT; /* adding # */
452 uniValueString[valueStringLength++] = (UChar) UNICODE_X_LOW_CODEPOINT; /* adding x */
453 valueStringLength += uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t)codeUnits[i++], 16, 0);
454 uniValueString[valueStringLength++] = (UChar) UNICODE_SEMICOLON_CODEPOINT; /* adding ; */
455 }
456 break;
457 case UCNV_PRV_ESCAPE_C:
458 while (i < length)
459 {
460 uniValueString[valueStringLength++] = (UChar) UNICODE_RS_CODEPOINT; /* adding \ */
461 uniValueString[valueStringLength++] = (UChar) UNICODE_X_LOW_CODEPOINT; /* adding x */
462 valueStringLength += uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t)codeUnits[i++], 16, 2);
463 }
464 break;
465 default:
466 while (i < length)
467 {
468 uniValueString[valueStringLength++] = (UChar) UNICODE_PERCENT_SIGN_CODEPOINT; /* adding % */
469 uniValueString[valueStringLength++] = (UChar) UNICODE_X_CODEPOINT; /* adding X */
470 uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t) codeUnits[i++], 16, 2);
471 valueStringLength += 2;
472 }
473 }
474 }
475 /* reset the error */
476 *err = U_ZERO_ERROR;
477
478 ucnv_cbToUWriteUChars(toArgs, uniValueString, valueStringLength, 0, err);
479 }
480
481 #endif
482