1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html#License 3 /** 4 ******************************************************************************* 5 * Copyright (C) 2006-2014, International Business Machines Corporation and 6 * others. All Rights Reserved. 7 ******************************************************************************* 8 */ 9 10 package com.ibm.icu.charset; 11 12 import java.nio.ByteBuffer; 13 import java.nio.CharBuffer; 14 import java.nio.IntBuffer; 15 import java.nio.charset.CoderResult; 16 17 /** 18 * <h2> Callback API for CharsetICU API </h2> 19 * 20 * CharsetCallback class defines some error behaviour functions called 21 * by CharsetDecoderICU and CharsetEncoderICU. The class also provides 22 * the facility by which clients can write their own callbacks. 23 * 24 * These functions, although public, should NEVER be called directly. 25 * They should be used as parameters to the onUmappableCharacter() and 26 * onMalformedInput() methods, to set the behaviour of a converter 27 * when it encounters UNMAPPED/INVALID sequences. 28 * Currently the only way to set callbacks is by using CodingErrorAction. 29 * In the future we will provide set methods on CharsetEncoder and CharsetDecoder 30 * that will accept CharsetCallback fields. 31 * 32 * @stable ICU 3.6 33 */ 34 35 public class CharsetCallback { 36 /* 37 * FROM_U, TO_U context options for sub callback 38 */ 39 private static final String SUB_STOP_ON_ILLEGAL = "i"; 40 41 // /* 42 // * FROM_U, TO_U context options for skip callback 43 // */ 44 // private static final String SKIP_STOP_ON_ILLEGAL = "i"; 45 46 // /* 47 // * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to ICU (%UXXXX) 48 // */ 49 // private static final String ESCAPE_ICU = null; 50 51 /* 52 * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to JAVA (\\uXXXX) 53 */ 54 private static final String ESCAPE_JAVA = "J"; 55 56 /* 57 * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to C (\\uXXXX \\UXXXXXXXX) 58 * TO_U_CALLBACK_ESCAPE option to escape the character value accoding to C (\\xXXXX) 59 */ 60 private static final String ESCAPE_C = "C"; 61 62 /* 63 * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to XML Decimal escape \htmlonly(&#DDDD;)\endhtmlonly 64 * TO_U_CALLBACK_ESCAPE context option to escape the character value accoding to XML Decimal escape \htmlonly(&#DDDD;)\endhtmlonly 65 */ 66 private static final String ESCAPE_XML_DEC = "D"; 67 68 /* 69 * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to XML Hex escape \htmlonly(&#xXXXX;)\endhtmlonly 70 * TO_U_CALLBACK_ESCAPE context option to escape the character value according to XML Hex escape \htmlonly(&#xXXXX;)\endhtmlonly 71 */ 72 private static final String ESCAPE_XML_HEX = "X"; 73 74 /* 75 * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to Unicode (U+XXXXX) 76 */ 77 private static final String ESCAPE_UNICODE = "U"; 78 79 /* 80 * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to Unicode (U+XXXXX) 81 */ 82 private static final String ESCAPE_CSS2 = "S"; 83 84 /* 85 * IS_DEFAULT_IGNORABLE_CODE_POINT 86 * This is to check if a code point has the default ignorable unicode property. 87 * As such, this list needs to be updated if the ignorable code point list ever 88 * changes. 89 * To avoid dependency on other code, this list is hard coded here. 90 * When an ignorable code point is found and is unmappable, the default callbacks 91 * will ignore them. 92 * For a list of the default ignorable code points, use this link: 93 * https://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5B%3ADI%3A%5D&abb=on&g=&i= 94 * 95 * This list should be sync with the one in ucnv_err.cpp. 96 */ IS_DEFAULT_IGNORABLE_CODE_POINT(int c)97 private static boolean IS_DEFAULT_IGNORABLE_CODE_POINT(int c) { 98 return 99 (c == 0x00AD) || 100 (c == 0x034F) || 101 (c == 0x061C) || 102 (c == 0x115F) || 103 (c == 0x1160) || 104 (0x17B4 <= c && c <= 0x17B5) || 105 (0x180B <= c && c <= 0x180E) || 106 (0x200B <= c && c <= 0x200F) || 107 (0x202A <= c && c <= 0x202E) || 108 (0x2060 <= c && c <= 0x206F) || 109 (c == 0x3164) || 110 (0xFE00 <= c && c <= 0xFE0F) || 111 (c == 0xFEFF) || 112 (c == 0xFFA0) || 113 (0xFFF0 <= c && c <= 0xFFF8) || 114 (0x1BCA0 <= c && c <= 0x1BCA3) || 115 (0x1D173 <= c && c <= 0x1D17A) || 116 (0xE0000 <= c && c <= 0xE0FFF); 117 } 118 /** 119 * Decoder Callback interface 120 * @stable ICU 3.6 121 */ 122 public interface Decoder { 123 /** 124 * This function is called when the bytes in the source cannot be handled, 125 * and this function is meant to handle or fix the error if possible. 126 * 127 * @return Result of decoding action. This returned object is set to an error 128 * if this function could not handle the conversion. 129 * @stable ICU 3.6 130 */ call(CharsetDecoderICU decoder, Object context, ByteBuffer source, CharBuffer target, IntBuffer offsets, char[] buffer, int length, CoderResult cr)131 public CoderResult call(CharsetDecoderICU decoder, Object context, 132 ByteBuffer source, CharBuffer target, IntBuffer offsets, 133 char[] buffer, int length, CoderResult cr); 134 } 135 /** 136 * Encoder Callback interface 137 * @stable ICU 3.6 138 */ 139 public interface Encoder { 140 /** 141 * This function is called when the Unicode characters in the source cannot be handled, 142 * and this function is meant to handle or fix the error if possible. 143 * @return Result of decoding action. This returned object is set to an error 144 * if this function could not handle the conversion. 145 * @stable ICU 3.6 146 */ call(CharsetEncoderICU encoder, Object context, CharBuffer source, ByteBuffer target, IntBuffer offsets, char[] buffer, int length, int cp, CoderResult cr)147 public CoderResult call(CharsetEncoderICU encoder, Object context, 148 CharBuffer source, ByteBuffer target, IntBuffer offsets, 149 char[] buffer, int length, int cp, CoderResult cr); 150 } 151 /** 152 * Skip callback 153 * @stable ICU 3.6 154 */ 155 public static final Encoder FROM_U_CALLBACK_SKIP = new Encoder() { 156 @Override 157 public CoderResult call(CharsetEncoderICU encoder, Object context, 158 CharBuffer source, ByteBuffer target, IntBuffer offsets, 159 char[] buffer, int length, int cp, CoderResult cr){ 160 if(context==null){ 161 return CoderResult.UNDERFLOW; 162 }else if(((String)context).equals(SUB_STOP_ON_ILLEGAL)){ 163 if(!cr.isUnmappable()){ 164 return cr; 165 }else{ 166 return CoderResult.UNDERFLOW; 167 } 168 } 169 return cr; 170 } 171 }; 172 /** 173 * Skip callback 174 * @stable ICU 3.6 175 */ 176 public static final Decoder TO_U_CALLBACK_SKIP = new Decoder() { 177 @Override 178 public CoderResult call(CharsetDecoderICU decoder, Object context, 179 ByteBuffer source, CharBuffer target, IntBuffer offsets, 180 char[] buffer, int length, CoderResult cr){ 181 if(context==null){ 182 return CoderResult.UNDERFLOW; 183 }else if(((String)context).equals(SUB_STOP_ON_ILLEGAL)){ 184 if(!cr.isUnmappable()){ 185 return cr; 186 }else{ 187 return CoderResult.UNDERFLOW; 188 } 189 } 190 return cr; 191 } 192 }; 193 /** 194 * Write substitute callback 195 * @stable ICU 3.6 196 */ 197 public static final Encoder FROM_U_CALLBACK_SUBSTITUTE = new Encoder(){ 198 @Override 199 public CoderResult call(CharsetEncoderICU encoder, Object context, 200 CharBuffer source, ByteBuffer target, IntBuffer offsets, 201 char[] buffer, int length, int cp, CoderResult cr){ 202 if (cr.isUnmappable() && IS_DEFAULT_IGNORABLE_CODE_POINT(cp)) { 203 return CoderResult.UNDERFLOW; 204 }else if(context==null){ 205 return encoder.cbFromUWriteSub(encoder, source, target, offsets); 206 }else if(((String)context).equals(SUB_STOP_ON_ILLEGAL)){ 207 if(!cr.isUnmappable()){ 208 return cr; 209 }else{ 210 return encoder.cbFromUWriteSub(encoder, source, target, offsets); 211 } 212 } 213 return cr; 214 } 215 }; 216 private static final char[] kSubstituteChar1 = new char[]{0x1A}; 217 private static final char[] kSubstituteChar = new char[] {0xFFFD}; 218 /** 219 * Write substitute callback 220 * @stable ICU 3.6 221 */ 222 public static final Decoder TO_U_CALLBACK_SUBSTITUTE = new Decoder() { 223 @Override 224 public CoderResult call(CharsetDecoderICU decoder, Object context, 225 ByteBuffer source, CharBuffer target, IntBuffer offsets, 226 char[] buffer, int length, CoderResult cr){ 227 228 CharsetICU cs = (CharsetICU) decoder.charset(); 229 /* Use the specified replacement character if it is different than the default one. */ 230 boolean useReplacement = true; 231 char [] replacementChar = decoder.replacement().toCharArray(); 232 if (replacementChar.length == 1 && (replacementChar[0] == kSubstituteChar1[0] || replacementChar[0] == kSubstituteChar[0])) { 233 useReplacement = false; 234 } 235 236 /* could optimize this case, just one uchar */ 237 if(decoder.invalidCharLength == 1 && cs.subChar1 != 0) { 238 return CharsetDecoderICU.toUWriteUChars(decoder, useReplacement ? replacementChar : kSubstituteChar1, 0, useReplacement ? replacementChar.length : 1, target, offsets, source.position()); 239 } else { 240 return CharsetDecoderICU.toUWriteUChars(decoder, useReplacement ? replacementChar : kSubstituteChar, 0, useReplacement ? replacementChar.length : 1, target, offsets, source.position()); 241 } 242 } 243 }; 244 /** 245 * Stop callback 246 * @stable ICU 3.6 247 */ 248 public static final Encoder FROM_U_CALLBACK_STOP = new Encoder() { 249 @Override 250 public CoderResult call(CharsetEncoderICU encoder, Object context, 251 CharBuffer source, ByteBuffer target, IntBuffer offsets, 252 char[] buffer, int length, int cp, CoderResult cr){ 253 if (cr.isUnmappable() && IS_DEFAULT_IGNORABLE_CODE_POINT(cp)) { 254 return CoderResult.UNDERFLOW; 255 } 256 return cr; 257 } 258 }; 259 /** 260 * Stop callback 261 * @stable ICU 3.6 262 */ 263 public static final Decoder TO_U_CALLBACK_STOP = new Decoder() { 264 @Override 265 public CoderResult call(CharsetDecoderICU decoder, Object context, 266 ByteBuffer source, CharBuffer target, IntBuffer offsets, 267 char[] buffer, int length, CoderResult cr){ 268 return cr; 269 } 270 }; 271 private static final int VALUE_STRING_LENGTH = 32; 272 private static final char UNICODE_PERCENT_SIGN_CODEPOINT = 0x0025; 273 private static final char UNICODE_U_CODEPOINT = 0x0055; 274 private static final char UNICODE_X_CODEPOINT = 0x0058; 275 private static final char UNICODE_RS_CODEPOINT = 0x005C; 276 private static final char UNICODE_U_LOW_CODEPOINT = 0x0075; 277 private static final char UNICODE_X_LOW_CODEPOINT = 0x0078; 278 private static final char UNICODE_AMP_CODEPOINT = 0x0026; 279 private static final char UNICODE_HASH_CODEPOINT = 0x0023; 280 private static final char UNICODE_SEMICOLON_CODEPOINT = 0x003B; 281 private static final char UNICODE_PLUS_CODEPOINT = 0x002B; 282 private static final char UNICODE_LEFT_CURLY_CODEPOINT = 0x007B; 283 private static final char UNICODE_RIGHT_CURLY_CODEPOINT = 0x007D; 284 private static final char UNICODE_SPACE_CODEPOINT = 0x0020; 285 /** 286 * Write escape callback 287 * @stable ICU 4.0 288 */ 289 public static final Encoder FROM_U_CALLBACK_ESCAPE = new Encoder() { 290 @Override 291 public CoderResult call(CharsetEncoderICU encoder, Object context, 292 CharBuffer source, ByteBuffer target, IntBuffer offsets, 293 char[] buffer, int length, int cp, CoderResult cr){ 294 char[] valueString = new char[VALUE_STRING_LENGTH]; 295 int valueStringLength = 0; 296 int i = 0; 297 298 if (cr.isUnmappable() && IS_DEFAULT_IGNORABLE_CODE_POINT(cp)) { 299 return CoderResult.UNDERFLOW; 300 } 301 302 if (context == null || !(context instanceof String)) { 303 while (i < length) { 304 valueString[valueStringLength++] = UNICODE_PERCENT_SIGN_CODEPOINT; /* adding % */ 305 valueString[valueStringLength++] = UNICODE_U_CODEPOINT; /* adding U */ 306 valueStringLength += itou(valueString, valueStringLength, buffer[i++], 16, 4); 307 } 308 } else { 309 if (((String)context).equals(ESCAPE_JAVA)) { 310 while (i < length) { 311 valueString[valueStringLength++] = UNICODE_RS_CODEPOINT; /* adding \ */ 312 valueString[valueStringLength++] = UNICODE_U_LOW_CODEPOINT; /* adding u */ 313 valueStringLength += itou(valueString, valueStringLength, buffer[i++], 16, 4); 314 } 315 } else if (((String)context).equals(ESCAPE_C)) { 316 valueString[valueStringLength++] = UNICODE_RS_CODEPOINT; /* adding \ */ 317 318 if (length == 2) { 319 valueString[valueStringLength++] = UNICODE_U_CODEPOINT; /* adding U */ 320 valueStringLength = itou(valueString, valueStringLength, cp, 16, 8); 321 } else { 322 valueString[valueStringLength++] = UNICODE_U_LOW_CODEPOINT; /* adding u */ 323 valueStringLength += itou(valueString, valueStringLength, buffer[0], 16, 4); 324 } 325 } else if (((String)context).equals(ESCAPE_XML_DEC)) { 326 valueString[valueStringLength++] = UNICODE_AMP_CODEPOINT; /* adding & */ 327 valueString[valueStringLength++] = UNICODE_HASH_CODEPOINT; /* adding # */ 328 if (length == 2) { 329 valueStringLength += itou(valueString, valueStringLength, cp, 10, 0); 330 } else { 331 valueStringLength += itou(valueString, valueStringLength, buffer[0], 10, 0); 332 } 333 valueString[valueStringLength++] = UNICODE_SEMICOLON_CODEPOINT; /* adding ; */ 334 } else if (((String)context).equals(ESCAPE_XML_HEX)) { 335 valueString[valueStringLength++] = UNICODE_AMP_CODEPOINT; /* adding & */ 336 valueString[valueStringLength++] = UNICODE_HASH_CODEPOINT; /* adding # */ 337 valueString[valueStringLength++] = UNICODE_X_LOW_CODEPOINT; /* adding x */ 338 if (length == 2) { 339 valueStringLength += itou(valueString, valueStringLength, cp, 16, 0); 340 } else { 341 valueStringLength += itou(valueString, valueStringLength, buffer[0], 16, 0); 342 } 343 valueString[valueStringLength++] = UNICODE_SEMICOLON_CODEPOINT; /* adding ; */ 344 } else if (((String)context).equals(ESCAPE_UNICODE)) { 345 valueString[valueStringLength++] = UNICODE_LEFT_CURLY_CODEPOINT; /* adding { */ 346 valueString[valueStringLength++] = UNICODE_U_CODEPOINT; /* adding U */ 347 valueString[valueStringLength++] = UNICODE_PLUS_CODEPOINT; /* adding + */ 348 if (length == 2) { 349 valueStringLength += itou(valueString, valueStringLength,cp, 16, 4); 350 } else { 351 valueStringLength += itou(valueString, valueStringLength, buffer[0], 16, 4); 352 } 353 valueString[valueStringLength++] = UNICODE_RIGHT_CURLY_CODEPOINT; /* adding } */ 354 } else if (((String)context).equals(ESCAPE_CSS2)) { 355 valueString[valueStringLength++] = UNICODE_RS_CODEPOINT; /* adding \ */ 356 valueStringLength += itou(valueString, valueStringLength, cp, 16, 0); 357 /* Always add space character, because the next character might be whitespace, 358 which would erroneously be considered the termination of the escape sequence. */ 359 valueString[valueStringLength++] = UNICODE_SPACE_CODEPOINT; 360 } else { 361 while (i < length) { 362 valueString[valueStringLength++] = UNICODE_PERCENT_SIGN_CODEPOINT; /* adding % */ 363 valueString[valueStringLength++] = UNICODE_U_CODEPOINT; /* adding U */ 364 valueStringLength += itou(valueString, valueStringLength, buffer[i++], 16, 4); 365 } 366 } 367 } 368 return encoder.cbFromUWriteUChars(encoder, CharBuffer.wrap(valueString, 0, valueStringLength), target, offsets); 369 } 370 }; 371 /** 372 * Write escape callback 373 * @stable ICU 4.0 374 */ 375 public static final Decoder TO_U_CALLBACK_ESCAPE = new Decoder() { 376 @Override 377 public CoderResult call(CharsetDecoderICU decoder, Object context, 378 ByteBuffer source, CharBuffer target, IntBuffer offsets, 379 char[] buffer, int length, CoderResult cr){ 380 char[] uniValueString = new char[VALUE_STRING_LENGTH]; 381 int valueStringLength = 0; 382 int i = 0; 383 384 if (context == null || !(context instanceof String)) { 385 while (i < length) { 386 uniValueString[valueStringLength++] = UNICODE_PERCENT_SIGN_CODEPOINT; /* adding % */ 387 uniValueString[valueStringLength++] = UNICODE_X_CODEPOINT; /* adding U */ 388 valueStringLength += itou(uniValueString, valueStringLength, buffer[i++] & UConverterConstants.UNSIGNED_BYTE_MASK, 16, 2); 389 } 390 } else { 391 if (((String)context).equals(ESCAPE_XML_DEC)) { 392 while (i < length) { 393 uniValueString[valueStringLength++] = UNICODE_AMP_CODEPOINT; /* adding & */ 394 uniValueString[valueStringLength++] = UNICODE_HASH_CODEPOINT; /* adding # */ 395 valueStringLength += itou(uniValueString, valueStringLength, buffer[i++] & UConverterConstants.UNSIGNED_BYTE_MASK, 10, 0); 396 uniValueString[valueStringLength++] = UNICODE_SEMICOLON_CODEPOINT; /* adding ; */ 397 } 398 } else if (((String)context).equals(ESCAPE_XML_HEX)) { 399 while (i < length) { 400 uniValueString[valueStringLength++] = UNICODE_AMP_CODEPOINT; /* adding & */ 401 uniValueString[valueStringLength++] = UNICODE_HASH_CODEPOINT; /* adding # */ 402 uniValueString[valueStringLength++] = UNICODE_X_LOW_CODEPOINT; /* adding x */ 403 valueStringLength += itou(uniValueString, valueStringLength, buffer[i++] & UConverterConstants.UNSIGNED_BYTE_MASK, 16, 0); 404 uniValueString[valueStringLength++] = UNICODE_SEMICOLON_CODEPOINT; /* adding ; */ 405 } 406 } else if (((String)context).equals(ESCAPE_C)) { 407 while (i < length) { 408 uniValueString[valueStringLength++] = UNICODE_RS_CODEPOINT; /* adding \ */ 409 uniValueString[valueStringLength++] = UNICODE_X_LOW_CODEPOINT; /* adding x */ 410 valueStringLength += itou(uniValueString, valueStringLength, buffer[i++] & UConverterConstants.UNSIGNED_BYTE_MASK, 16, 2); 411 } 412 } else { 413 while (i < length) { 414 uniValueString[valueStringLength++] = UNICODE_PERCENT_SIGN_CODEPOINT; /* adding % */ 415 uniValueString[valueStringLength++] = UNICODE_X_CODEPOINT; /* adding X */ 416 itou(uniValueString, valueStringLength, buffer[i++] & UConverterConstants.UNSIGNED_BYTE_MASK, 16, 2); 417 valueStringLength += 2; 418 } 419 } 420 } 421 422 cr = CharsetDecoderICU.toUWriteUChars(decoder, uniValueString, 0, valueStringLength, target, offsets, 0); 423 424 return cr; 425 } 426 }; 427 /*** 428 * Java port of uprv_itou() in ICU4C used by TO_U_CALLBACK_ESCAPE and FROM_U_CALLBACK_ESCAPE. 429 * Fills in a char string with the radix-based representation of a number padded with zeroes 430 * to minwidth. 431 */ itou(char[] buffer, int sourceIndex, int i, int radix, int minwidth)432 private static final int itou(char[] buffer, int sourceIndex, int i, int radix, int minwidth) { 433 int length = 0; 434 int digit; 435 int j; 436 char temp; 437 438 do { 439 digit = i % radix; 440 buffer[sourceIndex + length++] = (char)(digit <= 9 ? (0x0030+digit) : (0x0030+digit+7)); 441 i = i/radix; 442 } while (i != 0 && (sourceIndex + length) < buffer.length); 443 444 while (length < minwidth) { 445 buffer[sourceIndex + length++] = (char)0x0030; /* zero padding */ 446 } 447 /* reverses the string */ 448 for (j = 0; j < (length / 2); j++) { 449 temp = buffer[(sourceIndex + length - 1) - j]; 450 buffer[(sourceIndex + length-1) -j] = buffer[sourceIndex + j]; 451 buffer[sourceIndex + j] = temp; 452 } 453 454 return length; 455 } 456 457 /* 458 * No need to create an instance 459 */ CharsetCallback()460 private CharsetCallback() { 461 } 462 } 463