• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html#License
3 /**
4 *******************************************************************************
5 * Copyright (C) 2006-2014, International Business Machines Corporation and
6 * others. All Rights Reserved.
7 *******************************************************************************
8 */
9 
10 package com.ibm.icu.charset;
11 
12 import java.nio.ByteBuffer;
13 import java.nio.CharBuffer;
14 import java.nio.IntBuffer;
15 import java.nio.charset.CoderResult;
16 
17 /**
18  * <h2> Callback API for CharsetICU API </h2>
19  *
20  *  CharsetCallback class defines some error behaviour functions called
21  *  by CharsetDecoderICU and CharsetEncoderICU. The class also provides
22  *  the facility by which clients can write their own callbacks.
23  *
24  *  These functions, although public, should NEVER be called directly.
25  *  They should be used as parameters to the onUnmappableCharacter() and
26  *  onMalformedInput() methods, to set the behaviour of a converter
27  *  when it encounters UNMAPPED/INVALID sequences.
28  *  Currently the only way to set callbacks is by using CodingErrorAction.
29  *  In the future we will provide set methods on CharsetEncoder and CharsetDecoder
30  *  that will accept CharsetCallback fields.
31  *
32  * @stable ICU 3.6
33  */
34 
35 public class CharsetCallback {
36     /*
37      * FROM_U, TO_U context options for sub callback
38      */
39     private static final String SUB_STOP_ON_ILLEGAL = "i";
40 
41 //    /*
42 //     * FROM_U, TO_U context options for skip callback
43 //     */
44 //    private static final String SKIP_STOP_ON_ILLEGAL = "i";
45 
46 //    /*
47 //     * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to ICU (%UXXXX)
48 //     */
49 //    private static final String ESCAPE_ICU  = null;
50 
51     /*
52      * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to JAVA (\\uXXXX)
53      */
54     private static final String ESCAPE_JAVA     =  "J";
55 
56     /*
57      * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to C (\\uXXXX \\UXXXXXXXX)
58      * TO_U_CALLBACK_ESCAPE option to escape the character value accoding to C (\\xXXXX)
59      */
60     private static final String ESCAPE_C        = "C";
61 
62     /*
63      * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to XML Decimal escape \htmlonly(&amp;#DDDD;)\endhtmlonly
64      * TO_U_CALLBACK_ESCAPE context option to escape the character value accoding to XML Decimal escape \htmlonly(&amp;#DDDD;)\endhtmlonly
65      */
66     private static final String ESCAPE_XML_DEC  = "D";
67 
68     /*
69      * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to XML Hex escape \htmlonly(&amp;#xXXXX;)\endhtmlonly
70      * TO_U_CALLBACK_ESCAPE context option to escape the character value according to XML Hex escape \htmlonly(&amp;#xXXXX;)\endhtmlonly
71      */
72     private static final String ESCAPE_XML_HEX  = "X";
73 
74     /*
75      * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to Unicode (U+XXXXX)
76      */
77     private static final String ESCAPE_UNICODE  = "U";
78 
79     /*
80      * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to Unicode (U+XXXXX)
81      */
82     private static final String ESCAPE_CSS2  = "S";
83 
84     /*
85      * IS_DEFAULT_IGNORABLE_CODE_POINT
86      * This is to check if a code point has the default ignorable unicode property.
87      * As such, this list needs to be updated if the ignorable code point list ever
88      * changes.
89      * To avoid dependency on other code, this list is hard coded here.
90      * When an ignorable code point is found and is unmappable, the default callbacks
91      * will ignore them.
92      * For a list of the default ignorable code points, use this link: http://unicode.org/cldr/utility/list-unicodeset.jsp?a=[%3ADI%3A]&g=
93      *
94      * This list should be sync with the one in ucnv_err.c
95      *
96      */
IS_DEFAULT_IGNORABLE_CODE_POINT(int c)97     private static boolean IS_DEFAULT_IGNORABLE_CODE_POINT(int c) {
98         return ((c == 0x00AD) ||
99                 (c == 0x034F) ||
100                 (c == 0x061C) ||
101                 (c == 0x115F) ||
102                 (c == 0x1160) ||
103                 (0x17B4 <= c && c <= 0x17B5) ||
104                 (0x180B <= c && c <= 0x180E) ||
105                 (0x200B <= c && c <= 0x200F) ||
106                 (0x202A <= c && c <= 0x202E) ||
107                 (c == 0x2060) ||
108                 (0x2066 <= c && c <= 0x2069) ||
109                 (0x2061 <= c && c <= 0x2064) ||
110                 (0x206A <= c && c <= 0x206F) ||
111                 (c == 0x3164) ||
112                 (0x0FE00 <= c && c <= 0x0FE0F) ||
113                 (c == 0x0FEFF) ||
114                 (c == 0x0FFA0) ||
115                 (0x01BCA0  <= c && c <= 0x01BCA3) ||
116                 (0x01D173 <= c && c <= 0x01D17A) ||
117                 (c == 0x0E0001) ||
118                 (0x0E0020 <= c && c <= 0x0E007F) ||
119                 (0x0E0100 <= c && c <= 0x0E01EF) ||
120                 (c == 0x2065) ||
121                 (0x0FFF0 <= c && c <= 0x0FFF8) ||
122                 (c == 0x0E0000) ||
123                 (0x0E0002 <= c && c <= 0x0E001F) ||
124                 (0x0E0080 <= c && c <= 0x0E00FF) ||
125                 (0x0E01F0 <= c && c <= 0x0E0FFF)
126                 );
127     }
128     /**
129      * Decoder Callback interface
130      * @stable ICU 3.6
131      */
132     public interface Decoder {
133         /**
134          * This function is called when the bytes in the source cannot be handled,
135          * and this function is meant to handle or fix the error if possible.
136          *
137          * @return Result of decoding action. This returned object is set to an error
138          *  if this function could not handle the conversion.
139          * @stable ICU 3.6
140          */
call(CharsetDecoderICU decoder, Object context, ByteBuffer source, CharBuffer target, IntBuffer offsets, char[] buffer, int length, CoderResult cr)141         public CoderResult call(CharsetDecoderICU decoder, Object context,
142                                 ByteBuffer source, CharBuffer target, IntBuffer offsets,
143                                 char[] buffer, int length, CoderResult cr);
144     }
145     /**
146      * Encoder Callback interface
147      * @stable ICU 3.6
148      */
149     public interface Encoder {
150         /**
151          * This function is called when the Unicode characters in the source cannot be handled,
152          * and this function is meant to handle or fix the error if possible.
153          * @return Result of decoding action. This returned object is set to an error
154          *  if this function could not handle the conversion.
155          * @stable ICU 3.6
156          */
call(CharsetEncoderICU encoder, Object context, CharBuffer source, ByteBuffer target, IntBuffer offsets, char[] buffer, int length, int cp, CoderResult cr)157         public CoderResult call(CharsetEncoderICU encoder, Object context,
158                                 CharBuffer source, ByteBuffer target, IntBuffer offsets,
159                                 char[] buffer, int length, int cp, CoderResult cr);
160     }
161     /**
162      * Skip callback
163      * @stable ICU 3.6
164      */
165     public static final Encoder FROM_U_CALLBACK_SKIP = new Encoder() {
166         @Override
167         public CoderResult call(CharsetEncoderICU encoder, Object context,
168                 CharBuffer source, ByteBuffer target, IntBuffer offsets,
169                 char[] buffer, int length, int cp, CoderResult cr){
170             if(context==null){
171                 return CoderResult.UNDERFLOW;
172             }else if(((String)context).equals(SUB_STOP_ON_ILLEGAL)){
173                 if(!cr.isUnmappable()){
174                     return cr;
175                 }else{
176                     return CoderResult.UNDERFLOW;
177                 }
178             }
179             return cr;
180         }
181     };
182     /**
183      * Skip callback
184      * @stable ICU 3.6
185      */
186     public static final Decoder TO_U_CALLBACK_SKIP = new Decoder() {
187         @Override
188         public CoderResult call(CharsetDecoderICU decoder, Object context,
189                 ByteBuffer source, CharBuffer target, IntBuffer offsets,
190                 char[] buffer, int length, CoderResult cr){
191             if(context==null){
192                 return CoderResult.UNDERFLOW;
193             }else if(((String)context).equals(SUB_STOP_ON_ILLEGAL)){
194                 if(!cr.isUnmappable()){
195                     return cr;
196                 }else{
197                     return CoderResult.UNDERFLOW;
198                 }
199             }
200             return cr;
201         }
202     };
203     /**
204      * Write substitute callback
205      * @stable ICU 3.6
206      */
207     public static final Encoder FROM_U_CALLBACK_SUBSTITUTE = new Encoder(){
208         @Override
209         public CoderResult call(CharsetEncoderICU encoder, Object context,
210                 CharBuffer source, ByteBuffer target, IntBuffer offsets,
211                 char[] buffer, int length, int cp, CoderResult cr){
212             if (cr.isUnmappable() && IS_DEFAULT_IGNORABLE_CODE_POINT(cp)) {
213                 return CoderResult.UNDERFLOW;
214             }else if(context==null){
215                 return encoder.cbFromUWriteSub(encoder, source, target, offsets);
216             }else if(((String)context).equals(SUB_STOP_ON_ILLEGAL)){
217                 if(!cr.isUnmappable()){
218                     return cr;
219                 }else{
220                    return encoder.cbFromUWriteSub(encoder, source, target, offsets);
221                 }
222             }
223             return cr;
224         }
225     };
226     private static final char[] kSubstituteChar1 = new char[]{0x1A};
227     private static final char[] kSubstituteChar = new char[] {0xFFFD};
228     /**
229      * Write substitute callback
230      * @stable ICU 3.6
231      */
232     public static final Decoder TO_U_CALLBACK_SUBSTITUTE  = new Decoder() {
233         @Override
234         public CoderResult call(CharsetDecoderICU decoder, Object context,
235                 ByteBuffer source, CharBuffer target, IntBuffer offsets,
236                 char[] buffer, int length, CoderResult cr){
237 
238             CharsetICU cs = (CharsetICU) decoder.charset();
239             /* Use the specified replacement character if it is different than the default one. */
240             boolean useReplacement = true;
241             char [] replacementChar = decoder.replacement().toCharArray();
242             if (replacementChar.length == 1 && (replacementChar[0] == kSubstituteChar1[0] || replacementChar[0] == kSubstituteChar[0])) {
243                 useReplacement = false;
244             }
245 
246             /* could optimize this case, just one uchar */
247             if(decoder.invalidCharLength == 1 && cs.subChar1 != 0) {
248                 return CharsetDecoderICU.toUWriteUChars(decoder, useReplacement ? replacementChar : kSubstituteChar1, 0, useReplacement ? replacementChar.length : 1, target, offsets, source.position());
249             } else {
250                 return CharsetDecoderICU.toUWriteUChars(decoder, useReplacement ? replacementChar : kSubstituteChar, 0, useReplacement ? replacementChar.length : 1, target, offsets, source.position());
251             }
252         }
253     };
254     /**
255      * Stop callback
256      * @stable ICU 3.6
257      */
258     public static final Encoder FROM_U_CALLBACK_STOP = new Encoder() {
259         @Override
260         public CoderResult call(CharsetEncoderICU encoder, Object context,
261                 CharBuffer source, ByteBuffer target, IntBuffer offsets,
262                 char[] buffer, int length, int cp, CoderResult cr){
263             if (cr.isUnmappable() && IS_DEFAULT_IGNORABLE_CODE_POINT(cp)) {
264                 return CoderResult.UNDERFLOW;
265             }
266             return cr;
267         }
268     };
269     /**
270      * Stop callback
271      * @stable ICU 3.6
272      */
273     public static final Decoder TO_U_CALLBACK_STOP = new Decoder() {
274         @Override
275         public CoderResult call(CharsetDecoderICU decoder, Object context,
276                 ByteBuffer source, CharBuffer target, IntBuffer offsets,
277                 char[] buffer, int length, CoderResult cr){
278             return cr;
279         }
280     };
281     private static final int VALUE_STRING_LENGTH = 32;
282     private static final char UNICODE_PERCENT_SIGN_CODEPOINT    = 0x0025;
283     private static final char UNICODE_U_CODEPOINT               = 0x0055;
284     private static final char UNICODE_X_CODEPOINT               = 0x0058;
285     private static final char UNICODE_RS_CODEPOINT              = 0x005C;
286     private static final char UNICODE_U_LOW_CODEPOINT           = 0x0075;
287     private static final char UNICODE_X_LOW_CODEPOINT           = 0x0078;
288     private static final char UNICODE_AMP_CODEPOINT             = 0x0026;
289     private static final char UNICODE_HASH_CODEPOINT            = 0x0023;
290     private static final char UNICODE_SEMICOLON_CODEPOINT       = 0x003B;
291     private static final char UNICODE_PLUS_CODEPOINT            = 0x002B;
292     private static final char UNICODE_LEFT_CURLY_CODEPOINT      = 0x007B;
293     private static final char UNICODE_RIGHT_CURLY_CODEPOINT     = 0x007D;
294     private static final char UNICODE_SPACE_CODEPOINT           = 0x0020;
295     /**
296      * Write escape callback
297      * @stable ICU 4.0
298      */
299     public static final Encoder FROM_U_CALLBACK_ESCAPE = new Encoder() {
300         @Override
301         public CoderResult call(CharsetEncoderICU encoder, Object context,
302                 CharBuffer source, ByteBuffer target, IntBuffer offsets,
303                 char[] buffer, int length, int cp, CoderResult cr){
304             char[] valueString = new char[VALUE_STRING_LENGTH];
305             int valueStringLength = 0;
306             int i = 0;
307 
308             if (cr.isUnmappable() && IS_DEFAULT_IGNORABLE_CODE_POINT(cp)) {
309                 return CoderResult.UNDERFLOW;
310             }
311 
312             if (context == null || !(context instanceof String)) {
313                 while (i < length) {
314                     valueString[valueStringLength++] = UNICODE_PERCENT_SIGN_CODEPOINT; /* adding % */
315                     valueString[valueStringLength++] = UNICODE_U_CODEPOINT; /* adding U */
316                     valueStringLength += itou(valueString, valueStringLength, buffer[i++], 16, 4);
317                 }
318             } else {
319                 if (((String)context).equals(ESCAPE_JAVA)) {
320                     while (i < length) {
321                         valueString[valueStringLength++] = UNICODE_RS_CODEPOINT;    /* adding \ */
322                         valueString[valueStringLength++] = UNICODE_U_LOW_CODEPOINT; /* adding u */
323                         valueStringLength += itou(valueString, valueStringLength, buffer[i++], 16, 4);
324                     }
325                 } else if (((String)context).equals(ESCAPE_C)) {
326                     valueString[valueStringLength++] = UNICODE_RS_CODEPOINT;    /* adding \ */
327 
328                     if (length == 2) {
329                         valueString[valueStringLength++] = UNICODE_U_CODEPOINT; /* adding U */
330                         valueStringLength = itou(valueString, valueStringLength, cp, 16, 8);
331                     } else {
332                         valueString[valueStringLength++] = UNICODE_U_LOW_CODEPOINT; /* adding u */
333                         valueStringLength += itou(valueString, valueStringLength, buffer[0], 16, 4);
334                     }
335                 } else if (((String)context).equals(ESCAPE_XML_DEC)) {
336                     valueString[valueStringLength++] = UNICODE_AMP_CODEPOINT;   /* adding & */
337                     valueString[valueStringLength++] = UNICODE_HASH_CODEPOINT;  /* adding # */
338                     if (length == 2) {
339                         valueStringLength += itou(valueString, valueStringLength, cp, 10, 0);
340                     } else {
341                         valueStringLength += itou(valueString, valueStringLength, buffer[0], 10, 0);
342                     }
343                     valueString[valueStringLength++] = UNICODE_SEMICOLON_CODEPOINT; /* adding ; */
344                 } else if (((String)context).equals(ESCAPE_XML_HEX)) {
345                     valueString[valueStringLength++] = UNICODE_AMP_CODEPOINT;   /* adding & */
346                     valueString[valueStringLength++] = UNICODE_HASH_CODEPOINT;  /* adding # */
347                     valueString[valueStringLength++] = UNICODE_X_LOW_CODEPOINT; /* adding x */
348                     if (length == 2) {
349                         valueStringLength += itou(valueString, valueStringLength, cp, 16, 0);
350                     } else {
351                         valueStringLength += itou(valueString, valueStringLength, buffer[0], 16, 0);
352                     }
353                     valueString[valueStringLength++] = UNICODE_SEMICOLON_CODEPOINT; /* adding ; */
354                 } else if (((String)context).equals(ESCAPE_UNICODE)) {
355                     valueString[valueStringLength++] = UNICODE_LEFT_CURLY_CODEPOINT;    /* adding { */
356                     valueString[valueStringLength++] = UNICODE_U_CODEPOINT;             /* adding U */
357                     valueString[valueStringLength++] = UNICODE_PLUS_CODEPOINT;          /* adding + */
358                     if (length == 2) {
359                         valueStringLength += itou(valueString, valueStringLength,cp, 16, 4);
360                     } else {
361                         valueStringLength += itou(valueString, valueStringLength, buffer[0], 16, 4);
362                     }
363                     valueString[valueStringLength++] = UNICODE_RIGHT_CURLY_CODEPOINT;   /* adding } */
364                 } else if (((String)context).equals(ESCAPE_CSS2)) {
365                     valueString[valueStringLength++] = UNICODE_RS_CODEPOINT;    /* adding \ */
366                     valueStringLength += itou(valueString, valueStringLength, cp, 16, 0);
367                     /* Always add space character, because the next character might be whitespace,
368                        which would erroneously be considered the termination of the escape sequence. */
369                     valueString[valueStringLength++] = UNICODE_SPACE_CODEPOINT;
370                 } else {
371                     while (i < length) {
372                         valueString[valueStringLength++] = UNICODE_PERCENT_SIGN_CODEPOINT;  /* adding % */
373                         valueString[valueStringLength++] = UNICODE_U_CODEPOINT;             /* adding U */
374                         valueStringLength += itou(valueString, valueStringLength, buffer[i++], 16, 4);
375                     }
376                 }
377             }
378             return encoder.cbFromUWriteUChars(encoder, CharBuffer.wrap(valueString, 0, valueStringLength), target, offsets);
379         }
380     };
381     /**
382      * Write escape callback
383      * @stable ICU 4.0
384      */
385     public static final Decoder TO_U_CALLBACK_ESCAPE = new Decoder() {
386         @Override
387         public CoderResult call(CharsetDecoderICU decoder, Object context,
388                 ByteBuffer source, CharBuffer target, IntBuffer offsets,
389                 char[] buffer, int length, CoderResult cr){
390             char[] uniValueString = new char[VALUE_STRING_LENGTH];
391             int valueStringLength = 0;
392             int i = 0;
393 
394             if (context == null || !(context instanceof String)) {
395                 while (i < length) {
396                     uniValueString[valueStringLength++] = UNICODE_PERCENT_SIGN_CODEPOINT;   /* adding % */
397                     uniValueString[valueStringLength++] = UNICODE_X_CODEPOINT;              /* adding U */
398                     valueStringLength += itou(uniValueString, valueStringLength, buffer[i++] & UConverterConstants.UNSIGNED_BYTE_MASK, 16, 2);
399                 }
400             } else {
401                 if (((String)context).equals(ESCAPE_XML_DEC)) {
402                     while (i < length) {
403                         uniValueString[valueStringLength++] = UNICODE_AMP_CODEPOINT;    /* adding & */
404                         uniValueString[valueStringLength++] = UNICODE_HASH_CODEPOINT;   /* adding # */
405                         valueStringLength += itou(uniValueString, valueStringLength, buffer[i++] & UConverterConstants.UNSIGNED_BYTE_MASK, 10, 0);
406                         uniValueString[valueStringLength++] = UNICODE_SEMICOLON_CODEPOINT;  /* adding ; */
407                     }
408                 } else if (((String)context).equals(ESCAPE_XML_HEX)) {
409                     while (i < length) {
410                         uniValueString[valueStringLength++] = UNICODE_AMP_CODEPOINT;    /* adding & */
411                         uniValueString[valueStringLength++] = UNICODE_HASH_CODEPOINT;   /* adding # */
412                         uniValueString[valueStringLength++] = UNICODE_X_LOW_CODEPOINT;  /* adding x */
413                         valueStringLength += itou(uniValueString, valueStringLength, buffer[i++] & UConverterConstants.UNSIGNED_BYTE_MASK, 16, 0);
414                         uniValueString[valueStringLength++] = UNICODE_SEMICOLON_CODEPOINT;  /* adding ; */
415                     }
416                 } else if (((String)context).equals(ESCAPE_C)) {
417                     while (i < length) {
418                         uniValueString[valueStringLength++] = UNICODE_RS_CODEPOINT;         /* adding \ */
419                         uniValueString[valueStringLength++] = UNICODE_X_LOW_CODEPOINT;      /* adding x */
420                         valueStringLength += itou(uniValueString, valueStringLength, buffer[i++] & UConverterConstants.UNSIGNED_BYTE_MASK, 16, 2);
421                     }
422                 } else {
423                     while (i < length) {
424                         uniValueString[valueStringLength++] = UNICODE_PERCENT_SIGN_CODEPOINT;   /* adding % */
425                         uniValueString[valueStringLength++] = UNICODE_X_CODEPOINT;              /* adding X */
426                         itou(uniValueString, valueStringLength, buffer[i++] & UConverterConstants.UNSIGNED_BYTE_MASK, 16, 2);
427                         valueStringLength += 2;
428                     }
429                 }
430             }
431 
432             cr = CharsetDecoderICU.toUWriteUChars(decoder, uniValueString, 0, valueStringLength, target, offsets, 0);
433 
434             return cr;
435         }
436     };
437     /***
438      * Java port of uprv_itou() in ICU4C used by TO_U_CALLBACK_ESCAPE and FROM_U_CALLBACK_ESCAPE.
439      * Fills in a char string with the radix-based representation of a number padded with zeroes
440      * to minwidth.
441      */
itou(char[] buffer, int sourceIndex, int i, int radix, int minwidth)442     private static final int itou(char[] buffer, int sourceIndex, int i, int radix, int minwidth) {
443         int length = 0;
444         int digit;
445         int j;
446         char temp;
447 
448         do {
449             digit = i % radix;
450             buffer[sourceIndex + length++] = (char)(digit <= 9 ? (0x0030+digit) : (0x0030+digit+7));
451             i = i/radix;
452         } while (i != 0 && (sourceIndex + length) < buffer.length);
453 
454         while (length < minwidth) {
455             buffer[sourceIndex + length++] = (char)0x0030; /* zero padding */
456         }
457         /* reverses the string */
458         for (j = 0; j < (length / 2); j++) {
459             temp = buffer[(sourceIndex + length - 1) - j];
460             buffer[(sourceIndex + length-1) -j] = buffer[sourceIndex + j];
461             buffer[sourceIndex + j] = temp;
462         }
463 
464         return length;
465     }
466 
467     /*
468      * No need to create an instance
469      */
CharsetCallback()470     private CharsetCallback() {
471     }
472 }
473