1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *****************************************************************************
5 *
6 * Copyright (C) 1998-2016, International Business Machines
7 * Corporation and others. All Rights Reserved.
8 *
9 *****************************************************************************
10 *
11 * ucnv_err.c
12 * Implements error behaviour functions called by T_UConverter_{from,to}Unicode
13 *
14 *
15 * Change history:
16 *
17 * 06/29/2000 helena Major rewrite of the callback APIs.
18 */
19
20 #include "unicode/utypes.h"
21
22 #if !UCONFIG_NO_CONVERSION
23
24 #include "unicode/ucnv_err.h"
25 #include "unicode/ucnv_cb.h"
26 #include "ucnv_cnv.h"
27 #include "cmemory.h"
28 #include "unicode/ucnv.h"
29 #include "ustrfmt.h"
30
31 #define VALUE_STRING_LENGTH 48
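/* Capacity, in char16_t units, of the escape buffers used by the ESCAPE callbacks below.
 * NOTE(review): the historical note here read "Magic # 32 = 4(number of char in value string) *
 * 8(max number of bytes per char for any converter)", which no longer matches the value 48.
 * 48 appears to be 6 chars per byte (e.g. "&#255;") * 8 bytes -- confirm. */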
33 #define UNICODE_PERCENT_SIGN_CODEPOINT 0x0025
34 #define UNICODE_U_CODEPOINT 0x0055
35 #define UNICODE_X_CODEPOINT 0x0058
36 #define UNICODE_RS_CODEPOINT 0x005C
37 #define UNICODE_U_LOW_CODEPOINT 0x0075
38 #define UNICODE_X_LOW_CODEPOINT 0x0078
39 #define UNICODE_AMP_CODEPOINT 0x0026
40 #define UNICODE_HASH_CODEPOINT 0x0023
41 #define UNICODE_SEMICOLON_CODEPOINT 0x003B
42 #define UNICODE_PLUS_CODEPOINT 0x002B
43 #define UNICODE_LEFT_CURLY_CODEPOINT 0x007B
44 #define UNICODE_RIGHT_CURLY_CODEPOINT 0x007D
45 #define UNICODE_SPACE_CODEPOINT 0x0020
46 #define UCNV_PRV_ESCAPE_ICU 0
47 #define UCNV_PRV_ESCAPE_C 'C'
48 #define UCNV_PRV_ESCAPE_XML_DEC 'D'
49 #define UCNV_PRV_ESCAPE_XML_HEX 'X'
50 #define UCNV_PRV_ESCAPE_JAVA 'J'
51 #define UCNV_PRV_ESCAPE_UNICODE 'U'
52 #define UCNV_PRV_ESCAPE_CSS2 'S'
53 #define UCNV_PRV_STOP_ON_ILLEGAL 'i'
54
/*
 * IS_DEFAULT_IGNORABLE_CODE_POINT(c)
 *
 * Evaluates to non-zero when the code point c is one of the hard-coded
 * default ignorable code points listed below, and to zero otherwise.
 * When an ignorable code point is found and is unmappable, the default
 * callbacks in this file ignore it.
 *
 * To avoid a dependency on other code, the list is hard coded here, so it
 * must be updated whenever the default ignorable code point list changes.
 * For the current list of default ignorable code points, see:
 * https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5B%3ADI%3A%5D&abb=on&g=&i=
 *
 * This list should be kept in sync with the one in CharsetCallback.java.
 *
 * Every use of the argument is parenthesized so that expression arguments
 * (e.g. "a | b") expand correctly. The argument is still evaluated more than
 * once, so do not pass an expression with side effects.
 */
#define IS_DEFAULT_IGNORABLE_CODE_POINT(c) ( \
    ((c) == 0x00AD) || \
    ((c) == 0x034F) || \
    ((c) == 0x061C) || \
    ((c) == 0x115F) || \
    ((c) == 0x1160) || \
    (0x17B4 <= (c) && (c) <= 0x17B5) || \
    (0x180B <= (c) && (c) <= 0x180F) || \
    (0x200B <= (c) && (c) <= 0x200F) || \
    (0x202A <= (c) && (c) <= 0x202E) || \
    (0x2060 <= (c) && (c) <= 0x206F) || \
    ((c) == 0x3164) || \
    (0xFE00 <= (c) && (c) <= 0xFE0F) || \
    ((c) == 0xFEFF) || \
    ((c) == 0xFFA0) || \
    (0xFFF0 <= (c) && (c) <= 0xFFF8) || \
    (0x1BCA0 <= (c) && (c) <= 0x1BCA3) || \
    (0x1D173 <= (c) && (c) <= 0x1D17A) || \
    (0xE0000 <= (c) && (c) <= 0xE0FFF))
87
88
89 /*Function Pointer STOPS at the ILLEGAL_SEQUENCE */
90 U_CAPI void U_EXPORT2
UCNV_FROM_U_CALLBACK_STOP(const void * context,UConverterFromUnicodeArgs * fromUArgs,const char16_t * codeUnits,int32_t length,UChar32 codePoint,UConverterCallbackReason reason,UErrorCode * err)91 UCNV_FROM_U_CALLBACK_STOP (
92 const void *context,
93 UConverterFromUnicodeArgs *fromUArgs,
94 const char16_t* codeUnits,
95 int32_t length,
96 UChar32 codePoint,
97 UConverterCallbackReason reason,
98 UErrorCode * err)
99 {
100 (void)context;
101 (void)fromUArgs;
102 (void)codeUnits;
103 (void)length;
104 if (reason == UCNV_UNASSIGNED && IS_DEFAULT_IGNORABLE_CODE_POINT(codePoint))
105 {
106 /*
107 * Skip if the codepoint has unicode property of default ignorable.
108 */
109 *err = U_ZERO_ERROR;
110 }
111 /* the caller must have set the error code accordingly */
112 }
113
114
115 /*Function Pointer STOPS at the ILLEGAL_SEQUENCE */
116 U_CAPI void U_EXPORT2
UCNV_TO_U_CALLBACK_STOP(const void * context,UConverterToUnicodeArgs * toUArgs,const char * codePoints,int32_t length,UConverterCallbackReason reason,UErrorCode * err)117 UCNV_TO_U_CALLBACK_STOP (
118 const void *context,
119 UConverterToUnicodeArgs *toUArgs,
120 const char* codePoints,
121 int32_t length,
122 UConverterCallbackReason reason,
123 UErrorCode * err)
124 {
125 /* the caller must have set the error code accordingly */
126 (void)context; (void)toUArgs; (void)codePoints; (void)length; (void)reason; (void)err;
127 }
128
129 U_CAPI void U_EXPORT2
UCNV_FROM_U_CALLBACK_SKIP(const void * context,UConverterFromUnicodeArgs * fromUArgs,const char16_t * codeUnits,int32_t length,UChar32 codePoint,UConverterCallbackReason reason,UErrorCode * err)130 UCNV_FROM_U_CALLBACK_SKIP (
131 const void *context,
132 UConverterFromUnicodeArgs *fromUArgs,
133 const char16_t* codeUnits,
134 int32_t length,
135 UChar32 codePoint,
136 UConverterCallbackReason reason,
137 UErrorCode * err)
138 {
139 (void)fromUArgs;
140 (void)codeUnits;
141 (void)length;
142 if (reason <= UCNV_IRREGULAR)
143 {
144 if (reason == UCNV_UNASSIGNED && IS_DEFAULT_IGNORABLE_CODE_POINT(codePoint))
145 {
146 /*
147 * Skip if the codepoint has unicode property of default ignorable.
148 */
149 *err = U_ZERO_ERROR;
150 }
151 else if (context == nullptr || (*((char*)context) == UCNV_PRV_STOP_ON_ILLEGAL && reason == UCNV_UNASSIGNED))
152 {
153 *err = U_ZERO_ERROR;
154 }
155 /* else the caller must have set the error code accordingly. */
156 }
157 /* else ignore the reset, close and clone calls. */
158 }
159
160 U_CAPI void U_EXPORT2
UCNV_FROM_U_CALLBACK_SUBSTITUTE(const void * context,UConverterFromUnicodeArgs * fromArgs,const char16_t * codeUnits,int32_t length,UChar32 codePoint,UConverterCallbackReason reason,UErrorCode * err)161 UCNV_FROM_U_CALLBACK_SUBSTITUTE (
162 const void *context,
163 UConverterFromUnicodeArgs *fromArgs,
164 const char16_t* codeUnits,
165 int32_t length,
166 UChar32 codePoint,
167 UConverterCallbackReason reason,
168 UErrorCode * err)
169 {
170 (void)codeUnits;
171 (void)length;
172 if (reason <= UCNV_IRREGULAR)
173 {
174 if (reason == UCNV_UNASSIGNED && IS_DEFAULT_IGNORABLE_CODE_POINT(codePoint))
175 {
176 /*
177 * Skip if the codepoint has unicode property of default ignorable.
178 */
179 *err = U_ZERO_ERROR;
180 }
181 else if (context == nullptr || (*((char*)context) == UCNV_PRV_STOP_ON_ILLEGAL && reason == UCNV_UNASSIGNED))
182 {
183 *err = U_ZERO_ERROR;
184 ucnv_cbFromUWriteSub(fromArgs, 0, err);
185 }
186 /* else the caller must have set the error code accordingly. */
187 }
188 /* else ignore the reset, close and clone calls. */
189 }
190
191 /*uses uprv_itou to get a unicode escape sequence of the offensive sequence,
192 *uses a clean copy (resetted) of the converter, to convert that unicode
193 *escape sequence to the target codepage (if conversion failure happens then
194 *we revert to substituting with subchar)
195 */
196 U_CAPI void U_EXPORT2
UCNV_FROM_U_CALLBACK_ESCAPE(const void * context,UConverterFromUnicodeArgs * fromArgs,const char16_t * codeUnits,int32_t length,UChar32 codePoint,UConverterCallbackReason reason,UErrorCode * err)197 UCNV_FROM_U_CALLBACK_ESCAPE (
198 const void *context,
199 UConverterFromUnicodeArgs *fromArgs,
200 const char16_t *codeUnits,
201 int32_t length,
202 UChar32 codePoint,
203 UConverterCallbackReason reason,
204 UErrorCode * err)
205 {
206
207 char16_t valueString[VALUE_STRING_LENGTH];
208 int32_t valueStringLength = 0;
209 int32_t i = 0;
210
211 const char16_t *myValueSource = nullptr;
212 UErrorCode err2 = U_ZERO_ERROR;
213 UConverterFromUCallback original = nullptr;
214 const void *originalContext;
215
216 UConverterFromUCallback ignoredCallback = nullptr;
217 const void *ignoredContext;
218
219 if (reason > UCNV_IRREGULAR)
220 {
221 return;
222 }
223 else if (reason == UCNV_UNASSIGNED && IS_DEFAULT_IGNORABLE_CODE_POINT(codePoint))
224 {
225 /*
226 * Skip if the codepoint has unicode property of default ignorable.
227 */
228 *err = U_ZERO_ERROR;
229 return;
230 }
231
232 ucnv_setFromUCallBack (fromArgs->converter,
233 (UConverterFromUCallback) UCNV_FROM_U_CALLBACK_SUBSTITUTE,
234 nullptr,
235 &original,
236 &originalContext,
237 &err2);
238
239 if (U_FAILURE (err2))
240 {
241 *err = err2;
242 return;
243 }
244 if(context==nullptr)
245 {
246 while (i < length)
247 {
248 valueString[valueStringLength++] = (char16_t) UNICODE_PERCENT_SIGN_CODEPOINT; /* adding % */
249 valueString[valueStringLength++] = (char16_t) UNICODE_U_CODEPOINT; /* adding U */
250 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[i++], 16, 4);
251 }
252 }
253 else
254 {
255 switch(*((char*)context))
256 {
257 case UCNV_PRV_ESCAPE_JAVA:
258 while (i < length)
259 {
260 valueString[valueStringLength++] = (char16_t) UNICODE_RS_CODEPOINT; /* adding \ */
261 valueString[valueStringLength++] = (char16_t) UNICODE_U_LOW_CODEPOINT; /* adding u */
262 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[i++], 16, 4);
263 }
264 break;
265
266 case UCNV_PRV_ESCAPE_C:
267 valueString[valueStringLength++] = (char16_t) UNICODE_RS_CODEPOINT; /* adding \ */
268
269 if(length==2){
270 valueString[valueStringLength++] = (char16_t) UNICODE_U_CODEPOINT; /* adding U */
271 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 16, 8);
272
273 }
274 else{
275 valueString[valueStringLength++] = (char16_t) UNICODE_U_LOW_CODEPOINT; /* adding u */
276 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[0], 16, 4);
277 }
278 break;
279
280 case UCNV_PRV_ESCAPE_XML_DEC:
281
282 valueString[valueStringLength++] = (char16_t) UNICODE_AMP_CODEPOINT; /* adding & */
283 valueString[valueStringLength++] = (char16_t) UNICODE_HASH_CODEPOINT; /* adding # */
284 if(length==2){
285 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 10, 0);
286 }
287 else{
288 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[0], 10, 0);
289 }
290 valueString[valueStringLength++] = (char16_t) UNICODE_SEMICOLON_CODEPOINT; /* adding ; */
291 break;
292
293 case UCNV_PRV_ESCAPE_XML_HEX:
294
295 valueString[valueStringLength++] = (char16_t) UNICODE_AMP_CODEPOINT; /* adding & */
296 valueString[valueStringLength++] = (char16_t) UNICODE_HASH_CODEPOINT; /* adding # */
297 valueString[valueStringLength++] = (char16_t) UNICODE_X_LOW_CODEPOINT; /* adding x */
298 if(length==2){
299 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 16, 0);
300 }
301 else{
302 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[0], 16, 0);
303 }
304 valueString[valueStringLength++] = (char16_t) UNICODE_SEMICOLON_CODEPOINT; /* adding ; */
305 break;
306
307 case UCNV_PRV_ESCAPE_UNICODE:
308 valueString[valueStringLength++] = (char16_t) UNICODE_LEFT_CURLY_CODEPOINT; /* adding { */
309 valueString[valueStringLength++] = (char16_t) UNICODE_U_CODEPOINT; /* adding U */
310 valueString[valueStringLength++] = (char16_t) UNICODE_PLUS_CODEPOINT; /* adding + */
311 if (length == 2) {
312 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 16, 4);
313 } else {
314 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[0], 16, 4);
315 }
316 valueString[valueStringLength++] = (char16_t) UNICODE_RIGHT_CURLY_CODEPOINT; /* adding } */
317 break;
318
319 case UCNV_PRV_ESCAPE_CSS2:
320 valueString[valueStringLength++] = (char16_t) UNICODE_RS_CODEPOINT; /* adding \ */
321 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 16, 0);
322 /* Always add space character, because the next character might be whitespace,
323 which would erroneously be considered the termination of the escape sequence. */
324 valueString[valueStringLength++] = (char16_t) UNICODE_SPACE_CODEPOINT;
325 break;
326
327 default:
328 while (i < length)
329 {
330 valueString[valueStringLength++] = (char16_t) UNICODE_PERCENT_SIGN_CODEPOINT; /* adding % */
331 valueString[valueStringLength++] = (char16_t) UNICODE_U_CODEPOINT; /* adding U */
332 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[i++], 16, 4);
333 }
334 }
335 }
336 myValueSource = valueString;
337
338 /* reset the error */
339 *err = U_ZERO_ERROR;
340
341 ucnv_cbFromUWriteUChars(fromArgs, &myValueSource, myValueSource+valueStringLength, 0, err);
342
343 ucnv_setFromUCallBack (fromArgs->converter,
344 original,
345 originalContext,
346 &ignoredCallback,
347 &ignoredContext,
348 &err2);
349 if (U_FAILURE (err2))
350 {
351 *err = err2;
352 return;
353 }
354 }
355
356
357
358 U_CAPI void U_EXPORT2
UCNV_TO_U_CALLBACK_SKIP(const void * context,UConverterToUnicodeArgs * toArgs,const char * codeUnits,int32_t length,UConverterCallbackReason reason,UErrorCode * err)359 UCNV_TO_U_CALLBACK_SKIP (
360 const void *context,
361 UConverterToUnicodeArgs *toArgs,
362 const char* codeUnits,
363 int32_t length,
364 UConverterCallbackReason reason,
365 UErrorCode * err)
366 {
367 (void)toArgs;
368 (void)codeUnits;
369 (void)length;
370 if (reason <= UCNV_IRREGULAR)
371 {
372 if (context == nullptr || (*((char*)context) == UCNV_PRV_STOP_ON_ILLEGAL && reason == UCNV_UNASSIGNED))
373 {
374 *err = U_ZERO_ERROR;
375 }
376 /* else the caller must have set the error code accordingly. */
377 }
378 /* else ignore the reset, close and clone calls. */
379 }
380
381 U_CAPI void U_EXPORT2
UCNV_TO_U_CALLBACK_SUBSTITUTE(const void * context,UConverterToUnicodeArgs * toArgs,const char * codeUnits,int32_t length,UConverterCallbackReason reason,UErrorCode * err)382 UCNV_TO_U_CALLBACK_SUBSTITUTE (
383 const void *context,
384 UConverterToUnicodeArgs *toArgs,
385 const char* codeUnits,
386 int32_t length,
387 UConverterCallbackReason reason,
388 UErrorCode * err)
389 {
390 (void)codeUnits;
391 (void)length;
392 if (reason <= UCNV_IRREGULAR)
393 {
394 if (context == nullptr || (*((char*)context) == UCNV_PRV_STOP_ON_ILLEGAL && reason == UCNV_UNASSIGNED))
395 {
396 *err = U_ZERO_ERROR;
397 ucnv_cbToUWriteSub(toArgs,0,err);
398 }
399 /* else the caller must have set the error code accordingly. */
400 }
401 /* else ignore the reset, close and clone calls. */
402 }
403
404 /*uses uprv_itou to get a unicode escape sequence of the offensive sequence,
405 *and uses that as the substitution sequence
406 */
407 U_CAPI void U_EXPORT2
UCNV_TO_U_CALLBACK_ESCAPE(const void * context,UConverterToUnicodeArgs * toArgs,const char * codeUnits,int32_t length,UConverterCallbackReason reason,UErrorCode * err)408 UCNV_TO_U_CALLBACK_ESCAPE (
409 const void *context,
410 UConverterToUnicodeArgs *toArgs,
411 const char* codeUnits,
412 int32_t length,
413 UConverterCallbackReason reason,
414 UErrorCode * err)
415 {
416 char16_t uniValueString[VALUE_STRING_LENGTH];
417 int32_t valueStringLength = 0;
418 int32_t i = 0;
419
420 if (reason > UCNV_IRREGULAR)
421 {
422 return;
423 }
424
425 if(context==nullptr)
426 {
427 while (i < length)
428 {
429 uniValueString[valueStringLength++] = (char16_t) UNICODE_PERCENT_SIGN_CODEPOINT; /* adding % */
430 uniValueString[valueStringLength++] = (char16_t) UNICODE_X_CODEPOINT; /* adding X */
431 valueStringLength += uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t) codeUnits[i++], 16, 2);
432 }
433 }
434 else
435 {
436 switch(*((char*)context))
437 {
438 case UCNV_PRV_ESCAPE_XML_DEC:
439 while (i < length)
440 {
441 uniValueString[valueStringLength++] = (char16_t) UNICODE_AMP_CODEPOINT; /* adding & */
442 uniValueString[valueStringLength++] = (char16_t) UNICODE_HASH_CODEPOINT; /* adding # */
443 valueStringLength += uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t)codeUnits[i++], 10, 0);
444 uniValueString[valueStringLength++] = (char16_t) UNICODE_SEMICOLON_CODEPOINT; /* adding ; */
445 }
446 break;
447
448 case UCNV_PRV_ESCAPE_XML_HEX:
449 while (i < length)
450 {
451 uniValueString[valueStringLength++] = (char16_t) UNICODE_AMP_CODEPOINT; /* adding & */
452 uniValueString[valueStringLength++] = (char16_t) UNICODE_HASH_CODEPOINT; /* adding # */
453 uniValueString[valueStringLength++] = (char16_t) UNICODE_X_LOW_CODEPOINT; /* adding x */
454 valueStringLength += uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t)codeUnits[i++], 16, 0);
455 uniValueString[valueStringLength++] = (char16_t) UNICODE_SEMICOLON_CODEPOINT; /* adding ; */
456 }
457 break;
458 case UCNV_PRV_ESCAPE_C:
459 while (i < length)
460 {
461 uniValueString[valueStringLength++] = (char16_t) UNICODE_RS_CODEPOINT; /* adding \ */
462 uniValueString[valueStringLength++] = (char16_t) UNICODE_X_LOW_CODEPOINT; /* adding x */
463 valueStringLength += uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t)codeUnits[i++], 16, 2);
464 }
465 break;
466 default:
467 while (i < length)
468 {
469 uniValueString[valueStringLength++] = (char16_t) UNICODE_PERCENT_SIGN_CODEPOINT; /* adding % */
470 uniValueString[valueStringLength++] = (char16_t) UNICODE_X_CODEPOINT; /* adding X */
471 uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t) codeUnits[i++], 16, 2);
472 valueStringLength += 2;
473 }
474 }
475 }
476 /* reset the error */
477 *err = U_ZERO_ERROR;
478
479 ucnv_cbToUWriteUChars(toArgs, uniValueString, valueStringLength, 0, err);
480 }
481
482 #endif
483