// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
 *****************************************************************************
 *
 *   Copyright (C) 1998-2016, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 *
 *****************************************************************************
 *
 *  ucnv_err.c
 *  Implements error behaviour functions called by T_UConverter_{from,to}Unicode
 *
 *
 *   Change history:
 *
 *   06/29/2000  helena      Major rewrite of the callback APIs.
 */
19 
20 #include "unicode/utypes.h"
21 
22 #if !UCONFIG_NO_CONVERSION
23 
24 #include "unicode/ucnv_err.h"
25 #include "unicode/ucnv_cb.h"
26 #include "ucnv_cnv.h"
27 #include "cmemory.h"
28 #include "unicode/ucnv.h"
29 #include "ustrfmt.h"
30 
/*
 * Capacity, in UChars, of the stack buffers in which the escape callbacks
 * build their replacement text.
 * 48 = 6 (longest escape emitted per input byte in this file, e.g. "&#255;" or "&#xFF;")
 *    * 8 (max number of bytes per char for any converter)
 */
#define VALUE_STRING_LENGTH 48

/* Code points of the characters used to assemble escape sequences. */
#define UNICODE_PERCENT_SIGN_CODEPOINT  0x0025  /* % */
#define UNICODE_U_CODEPOINT             0x0055  /* U */
#define UNICODE_X_CODEPOINT             0x0058  /* X */
#define UNICODE_RS_CODEPOINT            0x005C  /* reverse solidus (backslash) */
#define UNICODE_U_LOW_CODEPOINT         0x0075  /* u */
#define UNICODE_X_LOW_CODEPOINT         0x0078  /* x */
#define UNICODE_AMP_CODEPOINT           0x0026  /* & */
#define UNICODE_HASH_CODEPOINT          0x0023  /* # */
#define UNICODE_SEMICOLON_CODEPOINT     0x003B  /* ; */
#define UNICODE_PLUS_CODEPOINT          0x002B  /* + */
#define UNICODE_LEFT_CURLY_CODEPOINT    0x007B  /* { */
#define UNICODE_RIGHT_CURLY_CODEPOINT   0x007D  /* } */
#define UNICODE_SPACE_CODEPOINT         0x0020  /* space */

/*
 * Option values the callbacks look for in the first char of their
 * "context" argument.
 */
#define UCNV_PRV_ESCAPE_ICU         0
#define UCNV_PRV_ESCAPE_C           'C'
#define UCNV_PRV_ESCAPE_XML_DEC     'D'
#define UCNV_PRV_ESCAPE_XML_HEX     'X'
#define UCNV_PRV_ESCAPE_JAVA        'J'
#define UCNV_PRV_ESCAPE_UNICODE     'U'
#define UCNV_PRV_ESCAPE_CSS2        'S'
#define UCNV_PRV_STOP_ON_ILLEGAL    'i'
54 
/*
 * IS_DEFAULT_IGNORABLE_CODE_POINT
 * Checks whether a code point has the default ignorable Unicode property.
 * The list is hard coded here to avoid a dependency on other code, so it
 * must be updated whenever the set of default ignorable code points changes.
 * When an ignorable code point is found and is unmappable, the default
 * callbacks ignore it.
 * For a list of the default ignorable code points, use this link:
 * https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5B%3ADI%3A%5D&abb=on&g=&i=
 *
 * This list should be kept in sync with the one in CharsetCallback.java
 *
 * The argument is parenthesized at every use so that an expression argument
 * is compared as a whole. It is still evaluated more than once, so it must
 * not have side effects.
 */
#define IS_DEFAULT_IGNORABLE_CODE_POINT(c) ( \
    ((c) == 0x00AD) || \
    ((c) == 0x034F) || \
    ((c) == 0x061C) || \
    ((c) == 0x115F) || \
    ((c) == 0x1160) || \
    (0x17B4 <= (c) && (c) <= 0x17B5) || \
    (0x180B <= (c) && (c) <= 0x180F) || \
    (0x200B <= (c) && (c) <= 0x200F) || \
    (0x202A <= (c) && (c) <= 0x202E) || \
    (0x2060 <= (c) && (c) <= 0x206F) || \
    ((c) == 0x3164) || \
    (0xFE00 <= (c) && (c) <= 0xFE0F) || \
    ((c) == 0xFEFF) || \
    ((c) == 0xFFA0) || \
    (0xFFF0 <= (c) && (c) <= 0xFFF8) || \
    (0x1BCA0 <= (c) && (c) <= 0x1BCA3) || \
    (0x1D173 <= (c) && (c) <= 0x1D17A) || \
    (0xE0000 <= (c) && (c) <= 0xE0FFF))
87 
88 
89 /*Function Pointer STOPS at the ILLEGAL_SEQUENCE */
90 U_CAPI void    U_EXPORT2
UCNV_FROM_U_CALLBACK_STOP(const void * context,UConverterFromUnicodeArgs * fromUArgs,const char16_t * codeUnits,int32_t length,UChar32 codePoint,UConverterCallbackReason reason,UErrorCode * err)91 UCNV_FROM_U_CALLBACK_STOP (
92                   const void *context,
93                   UConverterFromUnicodeArgs *fromUArgs,
94                   const char16_t* codeUnits,
95                   int32_t length,
96                   UChar32 codePoint,
97                   UConverterCallbackReason reason,
98                   UErrorCode * err)
99 {
100     (void)context;
101     (void)fromUArgs;
102     (void)codeUnits;
103     (void)length;
104     if (reason == UCNV_UNASSIGNED && IS_DEFAULT_IGNORABLE_CODE_POINT(codePoint))
105     {
106         /*
107          * Skip if the codepoint has unicode property of default ignorable.
108          */
109         *err = U_ZERO_ERROR;
110     }
111     /* the caller must have set the error code accordingly */
112 }
113 
114 
115 /*Function Pointer STOPS at the ILLEGAL_SEQUENCE */
116 U_CAPI void    U_EXPORT2
UCNV_TO_U_CALLBACK_STOP(const void * context,UConverterToUnicodeArgs * toUArgs,const char * codePoints,int32_t length,UConverterCallbackReason reason,UErrorCode * err)117 UCNV_TO_U_CALLBACK_STOP (
118                    const void *context,
119                    UConverterToUnicodeArgs *toUArgs,
120                    const char* codePoints,
121                    int32_t length,
122                    UConverterCallbackReason reason,
123                    UErrorCode * err)
124 {
125     /* the caller must have set the error code accordingly */
126     (void)context; (void)toUArgs; (void)codePoints; (void)length; (void)reason; (void)err;
127 }
128 
129 U_CAPI void    U_EXPORT2
UCNV_FROM_U_CALLBACK_SKIP(const void * context,UConverterFromUnicodeArgs * fromUArgs,const char16_t * codeUnits,int32_t length,UChar32 codePoint,UConverterCallbackReason reason,UErrorCode * err)130 UCNV_FROM_U_CALLBACK_SKIP (
131                   const void *context,
132                   UConverterFromUnicodeArgs *fromUArgs,
133                   const char16_t* codeUnits,
134                   int32_t length,
135                   UChar32 codePoint,
136                   UConverterCallbackReason reason,
137                   UErrorCode * err)
138 {
139     (void)fromUArgs;
140     (void)codeUnits;
141     (void)length;
142     if (reason <= UCNV_IRREGULAR)
143     {
144         if (reason == UCNV_UNASSIGNED && IS_DEFAULT_IGNORABLE_CODE_POINT(codePoint))
145         {
146             /*
147              * Skip if the codepoint has unicode property of default ignorable.
148              */
149             *err = U_ZERO_ERROR;
150         }
151         else if (context == nullptr || (*((char*)context) == UCNV_PRV_STOP_ON_ILLEGAL && reason == UCNV_UNASSIGNED))
152         {
153             *err = U_ZERO_ERROR;
154         }
155         /* else the caller must have set the error code accordingly. */
156     }
157     /* else ignore the reset, close and clone calls. */
158 }
159 
160 U_CAPI void    U_EXPORT2
UCNV_FROM_U_CALLBACK_SUBSTITUTE(const void * context,UConverterFromUnicodeArgs * fromArgs,const char16_t * codeUnits,int32_t length,UChar32 codePoint,UConverterCallbackReason reason,UErrorCode * err)161 UCNV_FROM_U_CALLBACK_SUBSTITUTE (
162                   const void *context,
163                   UConverterFromUnicodeArgs *fromArgs,
164                   const char16_t* codeUnits,
165                   int32_t length,
166                   UChar32 codePoint,
167                   UConverterCallbackReason reason,
168                   UErrorCode * err)
169 {
170     (void)codeUnits;
171     (void)length;
172     if (reason <= UCNV_IRREGULAR)
173     {
174         if (reason == UCNV_UNASSIGNED && IS_DEFAULT_IGNORABLE_CODE_POINT(codePoint))
175         {
176             /*
177              * Skip if the codepoint has unicode property of default ignorable.
178              */
179             *err = U_ZERO_ERROR;
180         }
181         else if (context == nullptr || (*((char*)context) == UCNV_PRV_STOP_ON_ILLEGAL && reason == UCNV_UNASSIGNED))
182         {
183             *err = U_ZERO_ERROR;
184             ucnv_cbFromUWriteSub(fromArgs, 0, err);
185         }
186         /* else the caller must have set the error code accordingly. */
187     }
188     /* else ignore the reset, close and clone calls. */
189 }
190 
191 /*uses uprv_itou to get a unicode escape sequence of the offensive sequence,
192  *uses a clean copy (resetted) of the converter, to convert that unicode
193  *escape sequence to the target codepage (if conversion failure happens then
194  *we revert to substituting with subchar)
195  */
196 U_CAPI void    U_EXPORT2
UCNV_FROM_U_CALLBACK_ESCAPE(const void * context,UConverterFromUnicodeArgs * fromArgs,const char16_t * codeUnits,int32_t length,UChar32 codePoint,UConverterCallbackReason reason,UErrorCode * err)197 UCNV_FROM_U_CALLBACK_ESCAPE (
198                          const void *context,
199                          UConverterFromUnicodeArgs *fromArgs,
200                          const char16_t *codeUnits,
201                          int32_t length,
202                          UChar32 codePoint,
203                          UConverterCallbackReason reason,
204                          UErrorCode * err)
205 {
206 
207   char16_t valueString[VALUE_STRING_LENGTH];
208   int32_t valueStringLength = 0;
209   int32_t i = 0;
210 
211   const char16_t *myValueSource = nullptr;
212   UErrorCode err2 = U_ZERO_ERROR;
213   UConverterFromUCallback original = nullptr;
214   const void *originalContext;
215 
216   UConverterFromUCallback ignoredCallback = nullptr;
217   const void *ignoredContext;
218 
219   if (reason > UCNV_IRREGULAR)
220   {
221       return;
222   }
223   else if (reason == UCNV_UNASSIGNED && IS_DEFAULT_IGNORABLE_CODE_POINT(codePoint))
224   {
225       /*
226        * Skip if the codepoint has unicode property of default ignorable.
227        */
228       *err = U_ZERO_ERROR;
229       return;
230   }
231 
232   ucnv_setFromUCallBack (fromArgs->converter,
233                      (UConverterFromUCallback) UCNV_FROM_U_CALLBACK_SUBSTITUTE,
234                      nullptr,
235                      &original,
236                      &originalContext,
237                      &err2);
238 
239   if (U_FAILURE (err2))
240   {
241     *err = err2;
242     return;
243   }
244   if(context==nullptr)
245   {
246       while (i < length)
247       {
248         valueString[valueStringLength++] = (char16_t) UNICODE_PERCENT_SIGN_CODEPOINT;  /* adding % */
249         valueString[valueStringLength++] = (char16_t) UNICODE_U_CODEPOINT; /* adding U */
250         valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[i++], 16, 4);
251       }
252   }
253   else
254   {
255       switch(*((char*)context))
256       {
257       case UCNV_PRV_ESCAPE_JAVA:
258           while (i < length)
259           {
260               valueString[valueStringLength++] = (char16_t) UNICODE_RS_CODEPOINT;    /* adding \ */
261               valueString[valueStringLength++] = (char16_t) UNICODE_U_LOW_CODEPOINT; /* adding u */
262               valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[i++], 16, 4);
263           }
264           break;
265 
266       case UCNV_PRV_ESCAPE_C:
267           valueString[valueStringLength++] = (char16_t) UNICODE_RS_CODEPOINT;    /* adding \ */
268 
269           if(length==2){
270               valueString[valueStringLength++] = (char16_t) UNICODE_U_CODEPOINT; /* adding U */
271               valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 16, 8);
272 
273           }
274           else{
275               valueString[valueStringLength++] = (char16_t) UNICODE_U_LOW_CODEPOINT; /* adding u */
276               valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[0], 16, 4);
277           }
278           break;
279 
280       case UCNV_PRV_ESCAPE_XML_DEC:
281 
282           valueString[valueStringLength++] = (char16_t) UNICODE_AMP_CODEPOINT;   /* adding & */
283           valueString[valueStringLength++] = (char16_t) UNICODE_HASH_CODEPOINT;  /* adding # */
284           if(length==2){
285               valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 10, 0);
286           }
287           else{
288               valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[0], 10, 0);
289           }
290           valueString[valueStringLength++] = (char16_t) UNICODE_SEMICOLON_CODEPOINT; /* adding ; */
291           break;
292 
293       case UCNV_PRV_ESCAPE_XML_HEX:
294 
295           valueString[valueStringLength++] = (char16_t) UNICODE_AMP_CODEPOINT;   /* adding & */
296           valueString[valueStringLength++] = (char16_t) UNICODE_HASH_CODEPOINT;  /* adding # */
297           valueString[valueStringLength++] = (char16_t) UNICODE_X_LOW_CODEPOINT; /* adding x */
298           if(length==2){
299               valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 16, 0);
300           }
301           else{
302               valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[0], 16, 0);
303           }
304           valueString[valueStringLength++] = (char16_t) UNICODE_SEMICOLON_CODEPOINT; /* adding ; */
305           break;
306 
307       case UCNV_PRV_ESCAPE_UNICODE:
308           valueString[valueStringLength++] = (char16_t) UNICODE_LEFT_CURLY_CODEPOINT;    /* adding { */
309           valueString[valueStringLength++] = (char16_t) UNICODE_U_CODEPOINT;    /* adding U */
310           valueString[valueStringLength++] = (char16_t) UNICODE_PLUS_CODEPOINT; /* adding + */
311           if (length == 2) {
312               valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 16, 4);
313           } else {
314               valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[0], 16, 4);
315           }
316           valueString[valueStringLength++] = (char16_t) UNICODE_RIGHT_CURLY_CODEPOINT;    /* adding } */
317           break;
318 
319       case UCNV_PRV_ESCAPE_CSS2:
320           valueString[valueStringLength++] = (char16_t) UNICODE_RS_CODEPOINT;    /* adding \ */
321           valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 16, 0);
322           /* Always add space character, because the next character might be whitespace,
323              which would erroneously be considered the termination of the escape sequence. */
324           valueString[valueStringLength++] = (char16_t) UNICODE_SPACE_CODEPOINT;
325           break;
326 
327       default:
328           while (i < length)
329           {
330               valueString[valueStringLength++] = (char16_t) UNICODE_PERCENT_SIGN_CODEPOINT;  /* adding % */
331               valueString[valueStringLength++] = (char16_t) UNICODE_U_CODEPOINT;             /* adding U */
332               valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[i++], 16, 4);
333           }
334       }
335   }
336   myValueSource = valueString;
337 
338   /* reset the error */
339   *err = U_ZERO_ERROR;
340 
341   ucnv_cbFromUWriteUChars(fromArgs, &myValueSource, myValueSource+valueStringLength, 0, err);
342 
343   ucnv_setFromUCallBack (fromArgs->converter,
344                          original,
345                          originalContext,
346                          &ignoredCallback,
347                          &ignoredContext,
348                          &err2);
349   if (U_FAILURE (err2))
350   {
351       *err = err2;
352       return;
353   }
354 }
355 
356 
357 
358 U_CAPI void  U_EXPORT2
UCNV_TO_U_CALLBACK_SKIP(const void * context,UConverterToUnicodeArgs * toArgs,const char * codeUnits,int32_t length,UConverterCallbackReason reason,UErrorCode * err)359 UCNV_TO_U_CALLBACK_SKIP (
360                  const void *context,
361                  UConverterToUnicodeArgs *toArgs,
362                  const char* codeUnits,
363                  int32_t length,
364                  UConverterCallbackReason reason,
365                  UErrorCode * err)
366 {
367     (void)toArgs;
368     (void)codeUnits;
369     (void)length;
370     if (reason <= UCNV_IRREGULAR)
371     {
372         if (context == nullptr || (*((char*)context) == UCNV_PRV_STOP_ON_ILLEGAL && reason == UCNV_UNASSIGNED))
373         {
374             *err = U_ZERO_ERROR;
375         }
376         /* else the caller must have set the error code accordingly. */
377     }
378     /* else ignore the reset, close and clone calls. */
379 }
380 
381 U_CAPI void    U_EXPORT2
UCNV_TO_U_CALLBACK_SUBSTITUTE(const void * context,UConverterToUnicodeArgs * toArgs,const char * codeUnits,int32_t length,UConverterCallbackReason reason,UErrorCode * err)382 UCNV_TO_U_CALLBACK_SUBSTITUTE (
383                  const void *context,
384                  UConverterToUnicodeArgs *toArgs,
385                  const char* codeUnits,
386                  int32_t length,
387                  UConverterCallbackReason reason,
388                  UErrorCode * err)
389 {
390     (void)codeUnits;
391     (void)length;
392     if (reason <= UCNV_IRREGULAR)
393     {
394         if (context == nullptr || (*((char*)context) == UCNV_PRV_STOP_ON_ILLEGAL && reason == UCNV_UNASSIGNED))
395         {
396             *err = U_ZERO_ERROR;
397             ucnv_cbToUWriteSub(toArgs,0,err);
398         }
399         /* else the caller must have set the error code accordingly. */
400     }
401     /* else ignore the reset, close and clone calls. */
402 }
403 
404 /*uses uprv_itou to get a unicode escape sequence of the offensive sequence,
405  *and uses that as the substitution sequence
406  */
407 U_CAPI void   U_EXPORT2
UCNV_TO_U_CALLBACK_ESCAPE(const void * context,UConverterToUnicodeArgs * toArgs,const char * codeUnits,int32_t length,UConverterCallbackReason reason,UErrorCode * err)408 UCNV_TO_U_CALLBACK_ESCAPE (
409                  const void *context,
410                  UConverterToUnicodeArgs *toArgs,
411                  const char* codeUnits,
412                  int32_t length,
413                  UConverterCallbackReason reason,
414                  UErrorCode * err)
415 {
416     char16_t uniValueString[VALUE_STRING_LENGTH];
417     int32_t valueStringLength = 0;
418     int32_t i = 0;
419 
420     if (reason > UCNV_IRREGULAR)
421     {
422         return;
423     }
424 
425     if(context==nullptr)
426     {
427         while (i < length)
428         {
429             uniValueString[valueStringLength++] = (char16_t) UNICODE_PERCENT_SIGN_CODEPOINT; /* adding % */
430             uniValueString[valueStringLength++] = (char16_t) UNICODE_X_CODEPOINT;    /* adding X */
431             valueStringLength += uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t) codeUnits[i++], 16, 2);
432         }
433     }
434     else
435     {
436         switch(*((char*)context))
437         {
438         case UCNV_PRV_ESCAPE_XML_DEC:
439             while (i < length)
440             {
441                 uniValueString[valueStringLength++] = (char16_t) UNICODE_AMP_CODEPOINT;   /* adding & */
442                 uniValueString[valueStringLength++] = (char16_t) UNICODE_HASH_CODEPOINT;  /* adding # */
443                 valueStringLength += uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t)codeUnits[i++], 10, 0);
444                 uniValueString[valueStringLength++] = (char16_t) UNICODE_SEMICOLON_CODEPOINT; /* adding ; */
445             }
446             break;
447 
448         case UCNV_PRV_ESCAPE_XML_HEX:
449             while (i < length)
450             {
451                 uniValueString[valueStringLength++] = (char16_t) UNICODE_AMP_CODEPOINT;   /* adding & */
452                 uniValueString[valueStringLength++] = (char16_t) UNICODE_HASH_CODEPOINT;  /* adding # */
453                 uniValueString[valueStringLength++] = (char16_t) UNICODE_X_LOW_CODEPOINT; /* adding x */
454                 valueStringLength += uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t)codeUnits[i++], 16, 0);
455                 uniValueString[valueStringLength++] = (char16_t) UNICODE_SEMICOLON_CODEPOINT; /* adding ; */
456             }
457             break;
458         case UCNV_PRV_ESCAPE_C:
459             while (i < length)
460             {
461                 uniValueString[valueStringLength++] = (char16_t) UNICODE_RS_CODEPOINT;    /* adding \ */
462                 uniValueString[valueStringLength++] = (char16_t) UNICODE_X_LOW_CODEPOINT; /* adding x */
463                 valueStringLength += uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t)codeUnits[i++], 16, 2);
464             }
465             break;
466         default:
467             while (i < length)
468             {
469                 uniValueString[valueStringLength++] = (char16_t) UNICODE_PERCENT_SIGN_CODEPOINT; /* adding % */
470                 uniValueString[valueStringLength++] = (char16_t) UNICODE_X_CODEPOINT;    /* adding X */
471                 uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t) codeUnits[i++], 16, 2);
472                 valueStringLength += 2;
473             }
474         }
475     }
476     /* reset the error */
477     *err = U_ZERO_ERROR;
478 
479     ucnv_cbToUWriteUChars(toArgs, uniValueString, valueStringLength, 0, err);
480 }
481 
482 #endif
483