• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4  *****************************************************************************
5  *
6  *   Copyright (C) 1998-2016, International Business Machines
7  *   Corporation and others.  All Rights Reserved.
8  *
9  *****************************************************************************
10  *
11  *  ucnv_err.c
12  *  Implements error behaviour functions called by T_UConverter_{from,to}Unicode
13  *
14  *
15 *   Change history:
16 *
17 *   06/29/2000  helena      Major rewrite of the callback APIs.
18 */
19 
20 #include "unicode/utypes.h"
21 
22 #if !UCONFIG_NO_CONVERSION
23 
24 #include "unicode/ucnv_err.h"
25 #include "unicode/ucnv_cb.h"
26 #include "ucnv_cnv.h"
27 #include "cmemory.h"
28 #include "unicode/ucnv.h"
29 #include "ustrfmt.h"
30 
/*
 * Capacity, in UChars, of the scratch buffers in which the ESCAPE callbacks
 * build their replacement text.
 * 48 = 6 * 8: the longest escape UCNV_TO_U_CALLBACK_ESCAPE emits for a single
 * source byte is 6 UChars (e.g. "&#255;" or "&#xFF;"), and 8 is the maximum
 * number of bytes per char for any converter.
 */
#define VALUE_STRING_LENGTH 48

/* Characters used to assemble the escape sequences. */
#define UNICODE_SPACE_CODEPOINT         0x0020
#define UNICODE_HASH_CODEPOINT          0x0023
#define UNICODE_PERCENT_SIGN_CODEPOINT  0x0025
#define UNICODE_AMP_CODEPOINT           0x0026
#define UNICODE_PLUS_CODEPOINT          0x002B
#define UNICODE_SEMICOLON_CODEPOINT     0x003B
#define UNICODE_U_CODEPOINT             0x0055
#define UNICODE_X_CODEPOINT             0x0058
#define UNICODE_RS_CODEPOINT            0x005C
#define UNICODE_U_LOW_CODEPOINT         0x0075
#define UNICODE_X_LOW_CODEPOINT         0x0078
#define UNICODE_LEFT_CURLY_CODEPOINT    0x007B
#define UNICODE_RIGHT_CURLY_CODEPOINT   0x007D

/*
 * Option characters compared against the first char of a callback's context.
 * The ESCAPE callbacks use them to choose the escape format; a NULL context
 * or an unrecognized value yields the default "%U"/"%X" form.
 */
#define UCNV_PRV_ESCAPE_ICU         0
#define UCNV_PRV_ESCAPE_C           'C'
#define UCNV_PRV_ESCAPE_XML_DEC     'D'
#define UCNV_PRV_ESCAPE_XML_HEX     'X'
#define UCNV_PRV_ESCAPE_JAVA        'J'
#define UCNV_PRV_ESCAPE_UNICODE     'U'
#define UCNV_PRV_ESCAPE_CSS2        'S'

/*
 * Option character for the SKIP and SUBSTITUTE callbacks: with this context
 * they act only when the reason is UCNV_UNASSIGNED and leave the caller's
 * error in place for every other reason.
 */
#define UCNV_PRV_STOP_ON_ILLEGAL    'i'
54 
/*
 * IS_DEFAULT_IGNORABLE_CODE_POINT(c)
 * Evaluates to non-zero when the code point c is in the hard-coded list of
 * code points that have the default ignorable Unicode property.
 * The list is hard coded here to avoid a dependency on other code, so it needs
 * to be updated if the set of default ignorable code points ever changes.
 * When an ignorable code point is found and is unmappable, the default
 * callbacks ignore it.
 * For a list of the default ignorable code points, use this link:
 * http://unicode.org/cldr/utility/list-unicodeset.jsp?a=[%3ADI%3A]&g=
 *
 * This list should be kept in sync with the one in CharsetCallback.java.
 *
 * Every use of the argument is parenthesized, so any expression may be passed.
 * The argument is still evaluated more than once: do not pass an expression
 * with side effects.
 */
#define IS_DEFAULT_IGNORABLE_CODE_POINT(c) (\
    ((c) == 0x00AD) || \
    ((c) == 0x034F) || \
    ((c) == 0x061C) || \
    ((c) == 0x115F) || \
    ((c) == 0x1160) || \
    (0x17B4 <= (c) && (c) <= 0x17B5) || \
    (0x180B <= (c) && (c) <= 0x180E) || \
    (0x200B <= (c) && (c) <= 0x200F) || \
    (0x202A <= (c) && (c) <= 0x202E) || \
    ((c) == 0x2060) || \
    (0x2066 <= (c) && (c) <= 0x2069) || \
    (0x2061 <= (c) && (c) <= 0x2064) || \
    (0x206A <= (c) && (c) <= 0x206F) || \
    ((c) == 0x3164) || \
    (0x0FE00 <= (c) && (c) <= 0x0FE0F) || \
    ((c) == 0x0FEFF) || \
    ((c) == 0x0FFA0) || \
    (0x01BCA0 <= (c) && (c) <= 0x01BCA3) || \
    (0x01D173 <= (c) && (c) <= 0x01D17A) || \
    ((c) == 0x0E0001) || \
    (0x0E0020 <= (c) && (c) <= 0x0E007F) || \
    (0x0E0100 <= (c) && (c) <= 0x0E01EF) || \
    ((c) == 0x2065) || \
    (0x0FFF0 <= (c) && (c) <= 0x0FFF8) || \
    ((c) == 0x0E0000) || \
    (0x0E0002 <= (c) && (c) <= 0x0E001F) || \
    (0x0E0080 <= (c) && (c) <= 0x0E00FF) || \
    (0x0E01F0 <= (c) && (c) <= 0x0E0FFF) \
    )
97 
98 
99 /*Function Pointer STOPS at the ILLEGAL_SEQUENCE */
100 U_CAPI void    U_EXPORT2
UCNV_FROM_U_CALLBACK_STOP(const void * context,UConverterFromUnicodeArgs * fromUArgs,const UChar * codeUnits,int32_t length,UChar32 codePoint,UConverterCallbackReason reason,UErrorCode * err)101 UCNV_FROM_U_CALLBACK_STOP (
102                   const void *context,
103                   UConverterFromUnicodeArgs *fromUArgs,
104                   const UChar* codeUnits,
105                   int32_t length,
106                   UChar32 codePoint,
107                   UConverterCallbackReason reason,
108                   UErrorCode * err)
109 {
110     if (reason == UCNV_UNASSIGNED && IS_DEFAULT_IGNORABLE_CODE_POINT(codePoint))
111     {
112         /*
113          * Skip if the codepoint has unicode property of default ignorable.
114          */
115         *err = U_ZERO_ERROR;
116     }
117     /* the caller must have set the error code accordingly */
118     return;
119 }
120 
121 
122 /*Function Pointer STOPS at the ILLEGAL_SEQUENCE */
123 U_CAPI void    U_EXPORT2
UCNV_TO_U_CALLBACK_STOP(const void * context,UConverterToUnicodeArgs * toUArgs,const char * codePoints,int32_t length,UConverterCallbackReason reason,UErrorCode * err)124 UCNV_TO_U_CALLBACK_STOP (
125                    const void *context,
126                    UConverterToUnicodeArgs *toUArgs,
127                    const char* codePoints,
128                    int32_t length,
129                    UConverterCallbackReason reason,
130                    UErrorCode * err)
131 {
132     /* the caller must have set the error code accordingly */
133     return;
134 }
135 
136 U_CAPI void    U_EXPORT2
UCNV_FROM_U_CALLBACK_SKIP(const void * context,UConverterFromUnicodeArgs * fromUArgs,const UChar * codeUnits,int32_t length,UChar32 codePoint,UConverterCallbackReason reason,UErrorCode * err)137 UCNV_FROM_U_CALLBACK_SKIP (
138                   const void *context,
139                   UConverterFromUnicodeArgs *fromUArgs,
140                   const UChar* codeUnits,
141                   int32_t length,
142                   UChar32 codePoint,
143                   UConverterCallbackReason reason,
144                   UErrorCode * err)
145 {
146     if (reason <= UCNV_IRREGULAR)
147     {
148         if (reason == UCNV_UNASSIGNED && IS_DEFAULT_IGNORABLE_CODE_POINT(codePoint))
149         {
150             /*
151              * Skip if the codepoint has unicode property of default ignorable.
152              */
153             *err = U_ZERO_ERROR;
154         }
155         else if (context == NULL || (*((char*)context) == UCNV_PRV_STOP_ON_ILLEGAL && reason == UCNV_UNASSIGNED))
156         {
157             *err = U_ZERO_ERROR;
158         }
159         /* else the caller must have set the error code accordingly. */
160     }
161     /* else ignore the reset, close and clone calls. */
162 }
163 
164 U_CAPI void    U_EXPORT2
UCNV_FROM_U_CALLBACK_SUBSTITUTE(const void * context,UConverterFromUnicodeArgs * fromArgs,const UChar * codeUnits,int32_t length,UChar32 codePoint,UConverterCallbackReason reason,UErrorCode * err)165 UCNV_FROM_U_CALLBACK_SUBSTITUTE (
166                   const void *context,
167                   UConverterFromUnicodeArgs *fromArgs,
168                   const UChar* codeUnits,
169                   int32_t length,
170                   UChar32 codePoint,
171                   UConverterCallbackReason reason,
172                   UErrorCode * err)
173 {
174     if (reason <= UCNV_IRREGULAR)
175     {
176         if (reason == UCNV_UNASSIGNED && IS_DEFAULT_IGNORABLE_CODE_POINT(codePoint))
177         {
178             /*
179              * Skip if the codepoint has unicode property of default ignorable.
180              */
181             *err = U_ZERO_ERROR;
182         }
183         else if (context == NULL || (*((char*)context) == UCNV_PRV_STOP_ON_ILLEGAL && reason == UCNV_UNASSIGNED))
184         {
185             *err = U_ZERO_ERROR;
186             ucnv_cbFromUWriteSub(fromArgs, 0, err);
187         }
188         /* else the caller must have set the error code accordingly. */
189     }
190     /* else ignore the reset, close and clone calls. */
191 }
192 
193 /*uses uprv_itou to get a unicode escape sequence of the offensive sequence,
194  *uses a clean copy (resetted) of the converter, to convert that unicode
195  *escape sequence to the target codepage (if conversion failure happens then
196  *we revert to substituting with subchar)
197  */
198 U_CAPI void    U_EXPORT2
UCNV_FROM_U_CALLBACK_ESCAPE(const void * context,UConverterFromUnicodeArgs * fromArgs,const UChar * codeUnits,int32_t length,UChar32 codePoint,UConverterCallbackReason reason,UErrorCode * err)199 UCNV_FROM_U_CALLBACK_ESCAPE (
200                          const void *context,
201                          UConverterFromUnicodeArgs *fromArgs,
202                          const UChar *codeUnits,
203                          int32_t length,
204                          UChar32 codePoint,
205                          UConverterCallbackReason reason,
206                          UErrorCode * err)
207 {
208 
209   UChar valueString[VALUE_STRING_LENGTH];
210   int32_t valueStringLength = 0;
211   int32_t i = 0;
212 
213   const UChar *myValueSource = NULL;
214   UErrorCode err2 = U_ZERO_ERROR;
215   UConverterFromUCallback original = NULL;
216   const void *originalContext;
217 
218   UConverterFromUCallback ignoredCallback = NULL;
219   const void *ignoredContext;
220 
221   if (reason > UCNV_IRREGULAR)
222   {
223       return;
224   }
225   else if (reason == UCNV_UNASSIGNED && IS_DEFAULT_IGNORABLE_CODE_POINT(codePoint))
226   {
227       /*
228        * Skip if the codepoint has unicode property of default ignorable.
229        */
230       *err = U_ZERO_ERROR;
231       return;
232   }
233 
234   ucnv_setFromUCallBack (fromArgs->converter,
235                      (UConverterFromUCallback) UCNV_FROM_U_CALLBACK_SUBSTITUTE,
236                      NULL,
237                      &original,
238                      &originalContext,
239                      &err2);
240 
241   if (U_FAILURE (err2))
242   {
243     *err = err2;
244     return;
245   }
246   if(context==NULL)
247   {
248       while (i < length)
249       {
250         valueString[valueStringLength++] = (UChar) UNICODE_PERCENT_SIGN_CODEPOINT;  /* adding % */
251         valueString[valueStringLength++] = (UChar) UNICODE_U_CODEPOINT; /* adding U */
252         valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[i++], 16, 4);
253       }
254   }
255   else
256   {
257       switch(*((char*)context))
258       {
259       case UCNV_PRV_ESCAPE_JAVA:
260           while (i < length)
261           {
262               valueString[valueStringLength++] = (UChar) UNICODE_RS_CODEPOINT;    /* adding \ */
263               valueString[valueStringLength++] = (UChar) UNICODE_U_LOW_CODEPOINT; /* adding u */
264               valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[i++], 16, 4);
265           }
266           break;
267 
268       case UCNV_PRV_ESCAPE_C:
269           valueString[valueStringLength++] = (UChar) UNICODE_RS_CODEPOINT;    /* adding \ */
270 
271           if(length==2){
272               valueString[valueStringLength++] = (UChar) UNICODE_U_CODEPOINT; /* adding U */
273               valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 16, 8);
274 
275           }
276           else{
277               valueString[valueStringLength++] = (UChar) UNICODE_U_LOW_CODEPOINT; /* adding u */
278               valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[0], 16, 4);
279           }
280           break;
281 
282       case UCNV_PRV_ESCAPE_XML_DEC:
283 
284           valueString[valueStringLength++] = (UChar) UNICODE_AMP_CODEPOINT;   /* adding & */
285           valueString[valueStringLength++] = (UChar) UNICODE_HASH_CODEPOINT;  /* adding # */
286           if(length==2){
287               valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 10, 0);
288           }
289           else{
290               valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[0], 10, 0);
291           }
292           valueString[valueStringLength++] = (UChar) UNICODE_SEMICOLON_CODEPOINT; /* adding ; */
293           break;
294 
295       case UCNV_PRV_ESCAPE_XML_HEX:
296 
297           valueString[valueStringLength++] = (UChar) UNICODE_AMP_CODEPOINT;   /* adding & */
298           valueString[valueStringLength++] = (UChar) UNICODE_HASH_CODEPOINT;  /* adding # */
299           valueString[valueStringLength++] = (UChar) UNICODE_X_LOW_CODEPOINT; /* adding x */
300           if(length==2){
301               valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 16, 0);
302           }
303           else{
304               valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[0], 16, 0);
305           }
306           valueString[valueStringLength++] = (UChar) UNICODE_SEMICOLON_CODEPOINT; /* adding ; */
307           break;
308 
309       case UCNV_PRV_ESCAPE_UNICODE:
310           valueString[valueStringLength++] = (UChar) UNICODE_LEFT_CURLY_CODEPOINT;    /* adding { */
311           valueString[valueStringLength++] = (UChar) UNICODE_U_CODEPOINT;    /* adding U */
312           valueString[valueStringLength++] = (UChar) UNICODE_PLUS_CODEPOINT; /* adding + */
313           if (length == 2) {
314               valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 16, 4);
315           } else {
316               valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[0], 16, 4);
317           }
318           valueString[valueStringLength++] = (UChar) UNICODE_RIGHT_CURLY_CODEPOINT;    /* adding } */
319           break;
320 
321       case UCNV_PRV_ESCAPE_CSS2:
322           valueString[valueStringLength++] = (UChar) UNICODE_RS_CODEPOINT;    /* adding \ */
323           valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 16, 0);
324           /* Always add space character, becase the next character might be whitespace,
325              which would erroneously be considered the termination of the escape sequence. */
326           valueString[valueStringLength++] = (UChar) UNICODE_SPACE_CODEPOINT;
327           break;
328 
329       default:
330           while (i < length)
331           {
332               valueString[valueStringLength++] = (UChar) UNICODE_PERCENT_SIGN_CODEPOINT;  /* adding % */
333               valueString[valueStringLength++] = (UChar) UNICODE_U_CODEPOINT;             /* adding U */
334               valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[i++], 16, 4);
335           }
336       }
337   }
338   myValueSource = valueString;
339 
340   /* reset the error */
341   *err = U_ZERO_ERROR;
342 
343   ucnv_cbFromUWriteUChars(fromArgs, &myValueSource, myValueSource+valueStringLength, 0, err);
344 
345   ucnv_setFromUCallBack (fromArgs->converter,
346                          original,
347                          originalContext,
348                          &ignoredCallback,
349                          &ignoredContext,
350                          &err2);
351   if (U_FAILURE (err2))
352   {
353       *err = err2;
354       return;
355   }
356 
357   return;
358 }
359 
360 
361 
362 U_CAPI void  U_EXPORT2
UCNV_TO_U_CALLBACK_SKIP(const void * context,UConverterToUnicodeArgs * toArgs,const char * codeUnits,int32_t length,UConverterCallbackReason reason,UErrorCode * err)363 UCNV_TO_U_CALLBACK_SKIP (
364                  const void *context,
365                  UConverterToUnicodeArgs *toArgs,
366                  const char* codeUnits,
367                  int32_t length,
368                  UConverterCallbackReason reason,
369                  UErrorCode * err)
370 {
371     if (reason <= UCNV_IRREGULAR)
372     {
373         if (context == NULL || (*((char*)context) == UCNV_PRV_STOP_ON_ILLEGAL && reason == UCNV_UNASSIGNED))
374         {
375             *err = U_ZERO_ERROR;
376         }
377         /* else the caller must have set the error code accordingly. */
378     }
379     /* else ignore the reset, close and clone calls. */
380 }
381 
382 U_CAPI void    U_EXPORT2
UCNV_TO_U_CALLBACK_SUBSTITUTE(const void * context,UConverterToUnicodeArgs * toArgs,const char * codeUnits,int32_t length,UConverterCallbackReason reason,UErrorCode * err)383 UCNV_TO_U_CALLBACK_SUBSTITUTE (
384                  const void *context,
385                  UConverterToUnicodeArgs *toArgs,
386                  const char* codeUnits,
387                  int32_t length,
388                  UConverterCallbackReason reason,
389                  UErrorCode * err)
390 {
391     if (reason <= UCNV_IRREGULAR)
392     {
393         if (context == NULL || (*((char*)context) == UCNV_PRV_STOP_ON_ILLEGAL && reason == UCNV_UNASSIGNED))
394         {
395             *err = U_ZERO_ERROR;
396             ucnv_cbToUWriteSub(toArgs,0,err);
397         }
398         /* else the caller must have set the error code accordingly. */
399     }
400     /* else ignore the reset, close and clone calls. */
401 }
402 
403 /*uses uprv_itou to get a unicode escape sequence of the offensive sequence,
404  *and uses that as the substitution sequence
405  */
406 U_CAPI void   U_EXPORT2
UCNV_TO_U_CALLBACK_ESCAPE(const void * context,UConverterToUnicodeArgs * toArgs,const char * codeUnits,int32_t length,UConverterCallbackReason reason,UErrorCode * err)407 UCNV_TO_U_CALLBACK_ESCAPE (
408                  const void *context,
409                  UConverterToUnicodeArgs *toArgs,
410                  const char* codeUnits,
411                  int32_t length,
412                  UConverterCallbackReason reason,
413                  UErrorCode * err)
414 {
415     UChar uniValueString[VALUE_STRING_LENGTH];
416     int32_t valueStringLength = 0;
417     int32_t i = 0;
418 
419     if (reason > UCNV_IRREGULAR)
420     {
421         return;
422     }
423 
424     if(context==NULL)
425     {
426         while (i < length)
427         {
428             uniValueString[valueStringLength++] = (UChar) UNICODE_PERCENT_SIGN_CODEPOINT; /* adding % */
429             uniValueString[valueStringLength++] = (UChar) UNICODE_X_CODEPOINT;    /* adding X */
430             valueStringLength += uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t) codeUnits[i++], 16, 2);
431         }
432     }
433     else
434     {
435         switch(*((char*)context))
436         {
437         case UCNV_PRV_ESCAPE_XML_DEC:
438             while (i < length)
439             {
440                 uniValueString[valueStringLength++] = (UChar) UNICODE_AMP_CODEPOINT;   /* adding & */
441                 uniValueString[valueStringLength++] = (UChar) UNICODE_HASH_CODEPOINT;  /* adding # */
442                 valueStringLength += uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t)codeUnits[i++], 10, 0);
443                 uniValueString[valueStringLength++] = (UChar) UNICODE_SEMICOLON_CODEPOINT; /* adding ; */
444             }
445             break;
446 
447         case UCNV_PRV_ESCAPE_XML_HEX:
448             while (i < length)
449             {
450                 uniValueString[valueStringLength++] = (UChar) UNICODE_AMP_CODEPOINT;   /* adding & */
451                 uniValueString[valueStringLength++] = (UChar) UNICODE_HASH_CODEPOINT;  /* adding # */
452                 uniValueString[valueStringLength++] = (UChar) UNICODE_X_LOW_CODEPOINT; /* adding x */
453                 valueStringLength += uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t)codeUnits[i++], 16, 0);
454                 uniValueString[valueStringLength++] = (UChar) UNICODE_SEMICOLON_CODEPOINT; /* adding ; */
455             }
456             break;
457         case UCNV_PRV_ESCAPE_C:
458             while (i < length)
459             {
460                 uniValueString[valueStringLength++] = (UChar) UNICODE_RS_CODEPOINT;    /* adding \ */
461                 uniValueString[valueStringLength++] = (UChar) UNICODE_X_LOW_CODEPOINT; /* adding x */
462                 valueStringLength += uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t)codeUnits[i++], 16, 2);
463             }
464             break;
465         default:
466             while (i < length)
467             {
468                 uniValueString[valueStringLength++] = (UChar) UNICODE_PERCENT_SIGN_CODEPOINT; /* adding % */
469                 uniValueString[valueStringLength++] = (UChar) UNICODE_X_CODEPOINT;    /* adding X */
470                 uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t) codeUnits[i++], 16, 2);
471                 valueStringLength += 2;
472             }
473         }
474     }
475     /* reset the error */
476     *err = U_ZERO_ERROR;
477 
478     ucnv_cbToUWriteUChars(toArgs, uniValueString, valueStringLength, 0, err);
479 }
480 
481 #endif
482