1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html#License 3 /* 4 ******************************************************************************* 5 * Copyright (C) 2006-2016, International Business Machines Corporation and 6 * others. All Rights Reserved. 7 ******************************************************************************* 8 */ 9 10 package com.ibm.icu.charset; 11 12 import java.io.IOException; 13 import java.io.InputStream; 14 import java.nio.Buffer; 15 import java.nio.BufferOverflowException; 16 import java.nio.ByteBuffer; 17 import java.nio.CharBuffer; 18 import java.nio.IntBuffer; 19 import java.nio.charset.CharsetDecoder; 20 import java.nio.charset.CharsetEncoder; 21 import java.nio.charset.CoderResult; 22 import java.util.Locale; 23 24 import com.ibm.icu.charset.UConverterSharedData.UConverterType; 25 import com.ibm.icu.impl.ICUBinary; 26 import com.ibm.icu.impl.ICUData; 27 import com.ibm.icu.impl.InvalidFormatException; 28 import com.ibm.icu.lang.UCharacter; 29 import com.ibm.icu.text.UTF16; 30 import com.ibm.icu.text.UnicodeSet; 31 32 class CharsetMBCS extends CharsetICU { 33 34 private byte[] fromUSubstitution = null; 35 UConverterSharedData sharedData = null; 36 private static final int MAX_VERSION_LENGTH = 4; 37 38 // these variables are used in getUnicodeSet() and may be changed in future 39 // typedef enum UConverterSetFilter { 40 static final int UCNV_SET_FILTER_NONE = 1; 41 static final int UCNV_SET_FILTER_DBCS_ONLY = 2; 42 static final int UCNV_SET_FILTER_2022_CN = 3; 43 static final int UCNV_SET_FILTER_SJIS= 4 ; 44 static final int UCNV_SET_FILTER_GR94DBCS = 5; 45 static final int UCNV_SET_FILTER_HZ = 6; 46 static final int UCNV_SET_FILTER_COUNT = 7; 47 // } UConverterSetFilter; 48 49 /** 50 * Fallbacks to Unicode are stored outside the normal state table and code point structures in a vector of items of 51 * this type. They are sorted by offset. 
52 */ 53 final static class MBCSToUFallback { 54 int offset; 55 int codePoint; 56 MBCSToUFallback(int off, int cp)57 MBCSToUFallback(int off, int cp) { 58 offset = off; 59 codePoint = cp; 60 } 61 } 62 63 /** 64 * This is the MBCS part of the UConverterTable union (a runtime data structure). It keeps all the per-converter 65 * data and points into the loaded mapping tables. 66 */ 67 static final class UConverterMBCSTable { 68 /* toUnicode */ 69 short countStates; 70 byte dbcsOnlyState; 71 boolean stateTableOwned; 72 int countToUFallbacks; 73 74 int stateTable[/* countStates */][/* 256 */]; 75 int swapLFNLStateTable[/* countStates */][/* 256 */]; /* for swaplfnl */ 76 char unicodeCodeUnits[/* countUnicodeResults */]; 77 MBCSToUFallback toUFallbacks[/* countToUFallbacks */]; 78 79 /* fromUnicode */ 80 char fromUnicodeTable[]; // stage1, and for MBCS_OUTPUT_1 also contains stage2 81 int fromUnicodeTableInts[]; // stage1 and stage2 together as int[] 82 // Exactly one of the fromUnicode(Type) tables is not null, 83 // depending on the outputType. 84 byte fromUnicodeBytes[]; 85 char fromUnicodeChars[]; 86 int fromUnicodeInts[]; 87 char swapLFNLFromUnicodeChars[]; /* for swaplfnl */ 88 int fromUBytesLength; 89 short outputType, unicodeMask; 90 91 /* converter name for swaplfnl */ 92 String swapLFNLName; 93 94 /* extension data */ 95 UConverterSharedData baseSharedData; 96 // int extIndexes[]; 97 ByteBuffer extIndexes; // create int[] view etc. 
as needed 98 99 CharBuffer mbcsIndex; /* for fast conversion from most of BMP to MBCS (utf8Friendly data) */ 100 // char sbcsIndex[/* SBCS_FAST_LIMIT>>6 */]; /* for fast conversion from low BMP to SBCS (utf8Friendly data) */ 101 boolean utf8Friendly; /* for utf8Friendly data */ 102 char maxFastUChar; /* for utf8Friendly data */ 103 104 /* roundtrips */ 105 int asciiRoundtrips; 106 UConverterMBCSTable()107 UConverterMBCSTable() { 108 utf8Friendly = false; 109 mbcsIndex = null; 110 } 111 hasSupplementary()112 boolean hasSupplementary() { 113 return (unicodeMask & UConverterConstants.HAS_SUPPLEMENTARY) != 0; 114 } 115 116 /* 117 * UConverterMBCSTable(UConverterMBCSTable t) { countStates = t.countStates; dbcsOnlyState = t.dbcsOnlyState; 118 * stateTableOwned = t.stateTableOwned; countToUFallbacks = t.countToUFallbacks; stateTable = t.stateTable; 119 * swapLFNLStateTable = t.swapLFNLStateTable; unicodeCodeUnits = t.unicodeCodeUnits; toUFallbacks = 120 * t.toUFallbacks; fromUnicodeTable = t.fromUnicodeTable; fromUnicodeBytes = t.fromUnicodeBytes; 121 * swapLFNLFromUnicodeChars = t.swapLFNLFromUnicodeChars; fromUBytesLength = t.fromUBytesLength; outputType = 122 * t.outputType; unicodeMask = t.unicodeMask; swapLFNLName = t.swapLFNLName; baseSharedData = t.baseSharedData; 123 * extIndexes = t.extIndexes; } 124 */ 125 } 126 127 /* Constants used in MBCS data header */ 128 // enum { 129 static final int MBCS_OPT_LENGTH_MASK=0x3f; 130 static final int MBCS_OPT_NO_FROM_U=0x40; 131 /* 132 * If any of the following options bits are set, 133 * then the file must be rejected. 134 */ 135 static final int MBCS_OPT_INCOMPATIBLE_MASK=0xffc0; 136 /* 137 * Remove bits from this mask as more options are recognized 138 * by all implementations that use this constant. 139 */ 140 static final int MBCS_OPT_UNKNOWN_INCOMPATIBLE_MASK=0xff80; 141 // }; 142 /* Constants for fast and UTF-8-friendly conversion. 
*/ 143 // enum { 144 static final int SBCS_FAST_MAX=0x0fff; /* maximum code point with UTF-8-friendly SBCS runtime code, see makeconv SBCS_UTF8_MAX */ 145 static final int SBCS_FAST_LIMIT=SBCS_FAST_MAX+1; /* =0x1000 */ 146 static final int MBCS_FAST_MAX=0xd7ff; /* maximum code point with UTF-8-friendly MBCS runtime code, see makeconv MBCS_UTF8_MAX */ 147 static final int MBCS_FAST_LIMIT=MBCS_FAST_MAX+1; /* =0xd800 */ 148 // }; 149 /** 150 * MBCS data header. See data format description above. 151 */ 152 final static class MBCSHeader { 153 byte version[/* U_MAX_VERSION_LENGTH */]; 154 int countStates, countToUFallbacks, offsetToUCodeUnits, offsetFromUTable, offsetFromUBytes; 155 int flags; 156 int fromUBytesLength; 157 158 /* new and required in version 5 */ 159 int options; 160 161 /* new and optional in version 5; used if options&MBCS_OPT_NO_FROM_U */ 162 int fullStage2Length; /* number of 32-bit units */ 163 MBCSHeader()164 MBCSHeader() { 165 version = new byte[MAX_VERSION_LENGTH]; 166 } 167 } 168 CharsetMBCS(String icuCanonicalName, String javaCanonicalName, String[] aliases, String classPath, ClassLoader loader)169 public CharsetMBCS(String icuCanonicalName, String javaCanonicalName, String[] aliases, String classPath, 170 ClassLoader loader) throws InvalidFormatException { 171 super(icuCanonicalName, javaCanonicalName, aliases); 172 173 /* See if the icuCanonicalName contains certain option information. 
*/ 174 if (icuCanonicalName.indexOf(UConverterConstants.OPTION_SWAP_LFNL_STRING) > -1) { 175 options = UConverterConstants.OPTION_SWAP_LFNL; 176 icuCanonicalName = icuCanonicalName.substring(0, icuCanonicalName.indexOf(UConverterConstants.OPTION_SWAP_LFNL_STRING)); 177 super.icuCanonicalName = icuCanonicalName; 178 } 179 180 // now try to load the data 181 sharedData = loadConverter(1, icuCanonicalName, classPath, loader); 182 183 maxBytesPerChar = sharedData.staticData.maxBytesPerChar; 184 minBytesPerChar = sharedData.staticData.minBytesPerChar; 185 maxCharsPerByte = 1; 186 fromUSubstitution = sharedData.staticData.subChar; 187 subChar = sharedData.staticData.subChar; 188 subCharLen = sharedData.staticData.subCharLen; 189 subChar1 = sharedData.staticData.subChar1; 190 fromUSubstitution = new byte[sharedData.staticData.subCharLen]; 191 System.arraycopy(sharedData.staticData.subChar, 0, fromUSubstitution, 0, sharedData.staticData.subCharLen); 192 193 initializeConverter(options); 194 } 195 CharsetMBCS(String icuCanonicalName, String javaCanonicalName, String[] aliases)196 public CharsetMBCS(String icuCanonicalName, String javaCanonicalName, String[] aliases) 197 throws InvalidFormatException { 198 this(icuCanonicalName, javaCanonicalName, aliases, ICUData.ICU_BUNDLE, null); 199 } 200 loadConverter(int nestedLoads, String myName, String classPath, ClassLoader loader)201 private UConverterSharedData loadConverter(int nestedLoads, String myName, String classPath, ClassLoader loader) 202 throws InvalidFormatException { 203 boolean noFromU = false; 204 // Read converter data from file 205 UConverterStaticData staticData = new UConverterStaticData(); 206 UConverterDataReader reader = null; 207 try { 208 String itemName = myName + '.' + UConverterSharedData.DATA_TYPE; 209 String resourceName = classPath + '/' + itemName; 210 ByteBuffer b; 211 212 if (loader != null) { 213 @SuppressWarnings("resource") // Closed by getByteBufferFromInputStreamAndCloseStream(). 
214 InputStream i = ICUData.getRequiredStream(loader, resourceName); 215 b = ICUBinary.getByteBufferFromInputStreamAndCloseStream(i); 216 } else if (!classPath.equals(ICUData.ICU_BUNDLE)) { 217 @SuppressWarnings("resource") // Closed by getByteBufferFromInputStreamAndCloseStream(). 218 InputStream i = ICUData.getRequiredStream(resourceName); 219 b = ICUBinary.getByteBufferFromInputStreamAndCloseStream(i); 220 } else { 221 b = ICUBinary.getRequiredData(itemName); 222 } 223 reader = new UConverterDataReader(b); 224 reader.readStaticData(staticData); 225 } catch (IOException e) { 226 throw new InvalidFormatException(e); 227 } catch (Exception e) { 228 throw new InvalidFormatException(e); 229 } 230 231 int type = staticData.conversionType; 232 233 if (type != UConverterSharedData.UConverterType.MBCS 234 || staticData.structSize != UConverterStaticData.SIZE_OF_UCONVERTER_STATIC_DATA) { 235 throw new InvalidFormatException(); 236 } 237 238 UConverterSharedData data = new UConverterSharedData(staticData); 239 240 // Load data 241 UConverterMBCSTable mbcsTable = data.mbcs; 242 MBCSHeader header = new MBCSHeader(); 243 try { 244 reader.readMBCSHeader(header); 245 } catch (IOException e) { 246 throw new InvalidFormatException(); 247 } 248 249 int offset; 250 // int[] extIndexesArray = null; 251 String baseNameString = null; 252 253 if (header.version[0] == 5 && header.version[1] >= 3 && (header.options & MBCS_OPT_UNKNOWN_INCOMPATIBLE_MASK) == 0) { 254 noFromU = ((header.options & MBCS_OPT_NO_FROM_U) != 0); 255 } else if (header.version[0] != 4) { 256 throw new InvalidFormatException(); 257 } 258 259 mbcsTable.outputType = (byte) header.flags; 260 261 /* extension data, header version 4.2 and higher */ 262 offset = header.flags >>> 8; 263 // if(offset!=0 && mbcsTable.outputType == MBCS_OUTPUT_EXT_ONLY) { 264 if (mbcsTable.outputType == MBCS_OUTPUT_EXT_ONLY) { 265 try { 266 baseNameString = reader.readBaseTableName(); 267 if (offset != 0) { 268 // agljport:commment subtract 32 
for sizeof(_MBCSHeader) and length of baseNameString and 1 null 269 // terminator byte all already read; 270 mbcsTable.extIndexes = reader.readExtIndexes(offset - reader.bytesReadAfterStaticData()); 271 } 272 } catch (IOException e) { 273 throw new InvalidFormatException(); 274 } 275 } 276 277 // agljport:add this would be unnecessary if extIndexes were memory mapped 278 /* 279 * if(mbcsTable.extIndexes != null) { 280 * 281 * try { //int nbytes = mbcsTable.extIndexes[UConverterExt.UCNV_EXT_TO_U_LENGTH]*4 + 282 * mbcsTable.extIndexes[UConverterExt.UCNV_EXT_TO_U_UCHARS_LENGTH]*2 + 283 * mbcsTable.extIndexes[UConverterExt.UCNV_EXT_FROM_U_LENGTH]*6 + 284 * mbcsTable.extIndexes[UConverterExt.UCNV_EXT_FROM_U_BYTES_LENGTH] + 285 * mbcsTable.extIndexes[UConverterExt.UCNV_EXT_FROM_U_STAGE_12_LENGTH]*2 + 286 * mbcsTable.extIndexes[UConverterExt.UCNV_EXT_FROM_U_STAGE_3_LENGTH]*2 + 287 * mbcsTable.extIndexes[UConverterExt.UCNV_EXT_FROM_U_STAGE_3B_LENGTH]*4; //int nbytes = 288 * mbcsTable.extIndexes[UConverterExt.UCNV_EXT_SIZE] //byte[] extTables = dataReader.readExtTables(nbytes); 289 * //mbcsTable.extTables = ByteBuffer.wrap(extTables); } catch(IOException e) { System.err.println("Caught 290 * IOException: " + e.getMessage()); pErrorCode[0] = UErrorCode.U_INVALID_FORMAT_ERROR; return; } } 291 */ 292 if (mbcsTable.outputType == MBCS_OUTPUT_EXT_ONLY) { 293 UConverterSharedData baseSharedData = null; 294 ByteBuffer extIndexes; 295 String baseName; 296 297 /* extension-only file, load the base table and set values appropriately */ 298 extIndexes = mbcsTable.extIndexes; 299 if (extIndexes == null) { 300 /* extension-only file without extension */ 301 throw new InvalidFormatException(); 302 } 303 304 if (nestedLoads != 1) { 305 /* an extension table must not be loaded as a base table */ 306 throw new InvalidFormatException(); 307 } 308 309 /* load the base table */ 310 baseName = baseNameString; 311 if (baseName.equals(staticData.name)) { 312 /* forbid loading this same 
extension-only file */ 313 throw new InvalidFormatException(); 314 } 315 316 // agljport:fix args.size=sizeof(UConverterLoadArgs); 317 baseSharedData = loadConverter(2, baseName, classPath, loader); 318 319 if (baseSharedData.staticData.conversionType != UConverterType.MBCS 320 || baseSharedData.mbcs.baseSharedData != null) { 321 // agljport:fix ucnv_unload(baseSharedData); 322 throw new InvalidFormatException(); 323 } 324 325 /* copy the base table data */ 326 // agljport:comment deep copy in C changes mbcs through local reference mbcsTable; in java we probably don't 327 // need the deep copy so can just make sure mbcs and its local reference both refer to the same new object 328 mbcsTable = data.mbcs = baseSharedData.mbcs; 329 330 /* overwrite values with relevant ones for the extension converter */ 331 mbcsTable.baseSharedData = baseSharedData; 332 mbcsTable.extIndexes = extIndexes; 333 334 /* 335 * It would be possible to share the swapLFNL data with a base converter, but the generated name would have 336 * to be different, and the memory would have to be free'd only once. It is easier to just create the data 337 * for the extension converter separately when it is requested. 338 */ 339 mbcsTable.swapLFNLStateTable = null; 340 mbcsTable.swapLFNLFromUnicodeChars = null; 341 mbcsTable.swapLFNLName = null; 342 343 /* 344 * Set a special, runtime-only outputType if the extension converter is a DBCS version of a base converter 345 * that also maps single bytes. 
346 */ 347 if (staticData.conversionType == UConverterType.DBCS 348 || (staticData.conversionType == UConverterType.MBCS && staticData.minBytesPerChar >= 2)) { 349 350 if (baseSharedData.mbcs.outputType == MBCS_OUTPUT_2_SISO) { 351 /* the base converter is SI/SO-stateful */ 352 int entry; 353 354 /* get the dbcs state from the state table entry for SO=0x0e */ 355 entry = mbcsTable.stateTable[0][0xe]; 356 if (MBCS_ENTRY_IS_FINAL(entry) && MBCS_ENTRY_FINAL_ACTION(entry) == MBCS_STATE_CHANGE_ONLY 357 && MBCS_ENTRY_FINAL_STATE(entry) != 0) { 358 mbcsTable.dbcsOnlyState = (byte) MBCS_ENTRY_FINAL_STATE(entry); 359 360 mbcsTable.outputType = MBCS_OUTPUT_DBCS_ONLY; 361 } 362 } else if (baseSharedData.staticData.conversionType == UConverterType.MBCS 363 && baseSharedData.staticData.minBytesPerChar == 1 364 && baseSharedData.staticData.maxBytesPerChar == 2 && mbcsTable.countStates <= 127) { 365 366 /* non-stateful base converter, need to modify the state table */ 367 int newStateTable[][/* 256 */]; 368 int state[]; // this works because java 2-D array is array of references and we can have state = 369 // newStateTable[i]; 370 int i, count; 371 372 /* allocate a new state table and copy the base state table contents */ 373 count = mbcsTable.countStates; 374 newStateTable = new int[(count + 1) * 1024][256]; 375 376 for (i = 0; i < mbcsTable.stateTable.length; ++i) 377 System.arraycopy(mbcsTable.stateTable[i], 0, newStateTable[i], 0, 378 mbcsTable.stateTable[i].length); 379 380 /* change all final single-byte entries to go to a new all-illegal state */ 381 state = newStateTable[0]; 382 for (i = 0; i < 256; ++i) { 383 if (MBCS_ENTRY_IS_FINAL(state[i])) { 384 state[i] = MBCS_ENTRY_TRANSITION(count, 0); 385 } 386 } 387 388 /* build the new all-illegal state */ 389 state = newStateTable[count]; 390 for (i = 0; i < 256; ++i) { 391 state[i] = MBCS_ENTRY_FINAL(0, MBCS_STATE_ILLEGAL, 0); 392 } 393 mbcsTable.stateTable = newStateTable; 394 mbcsTable.countStates = (byte) (count + 1); 395 
mbcsTable.stateTableOwned = true; 396 397 mbcsTable.outputType = MBCS_OUTPUT_DBCS_ONLY; 398 } 399 } 400 401 /* 402 * unlike below for files with base tables, do not get the unicodeMask from the sharedData; instead, use the 403 * base table's unicodeMask, which we copied in the memcpy above; this is necessary because the static data 404 * unicodeMask, especially the UCNV_HAS_SUPPLEMENTARY flag, is part of the base table data 405 */ 406 } else { 407 /* conversion file with a base table; an additional extension table is optional */ 408 /* make sure that the output type is known */ 409 switch (mbcsTable.outputType) { 410 case MBCS_OUTPUT_1: 411 case MBCS_OUTPUT_2: 412 case MBCS_OUTPUT_3: 413 case MBCS_OUTPUT_4: 414 case MBCS_OUTPUT_3_EUC: 415 case MBCS_OUTPUT_4_EUC: 416 case MBCS_OUTPUT_2_SISO: 417 /* OK */ 418 break; 419 default: 420 throw new InvalidFormatException(); 421 } 422 423 /* 424 * converter versions 6.1 and up contain a unicodeMask that is used here to select the most efficient 425 * function implementations 426 */ 427 // agljport:fix info.size=sizeof(UDataInfo); 428 // agljport:fix udata_getInfo((UDataMemory *)sharedData->dataMemory, &info); 429 if (reader.dataFormatHasUnicodeMask()) { 430 /* mask off possible future extensions to be safe */ 431 mbcsTable.unicodeMask = (short) (staticData.unicodeMask & 3); 432 } else { 433 /* for older versions, assume worst case: contains anything possible (prevent over-optimizations) */ 434 mbcsTable.unicodeMask = UConverterConstants.HAS_SUPPLEMENTARY | UConverterConstants.HAS_SURROGATES; 435 } 436 try { 437 reader.readMBCSTable(header, mbcsTable); 438 } catch (IOException e) { 439 throw new InvalidFormatException(); 440 } 441 442 if (offset != 0) { 443 try { 444 // agljport:commment subtract 32 for sizeof(_MBCSHeader) and length of baseNameString and 1 null 445 // terminator byte all already read; 446 // int namelen = baseNameString != null? 
baseNameString.length() + 1: 0; 447 mbcsTable.extIndexes = reader.readExtIndexes(offset - reader.bytesReadAfterStaticData()); 448 } catch (IOException e) { 449 throw new InvalidFormatException(); 450 } 451 } 452 453 if (header.version[1] >= 3 && (mbcsTable.unicodeMask & UConverterConstants.HAS_SURROGATES) == 0 && 454 (mbcsTable.countStates == 1 ? ((char)header.version[2] >= (SBCS_FAST_MAX>>8)) : ((char)header.version[2] >= (MBCS_FAST_MAX>>8)))) { 455 mbcsTable.utf8Friendly = true; 456 457 if (mbcsTable.countStates == 1) { 458 /* 459 * SBCS: Stage 3 is allocated in 64-entry blocks for U+0000..SBCS_FAST_MAX or higher. 460 * Build a table with indexes to each block, to be used instead of 461 * the regular stage 1/2 table. 462 */ 463 // sbcsIndex = new char[SBCS_FAST_LIMIT>>6]; 464 // for (int i = 0; i < (SBCS_FAST_LIMIT>>6); ++i) { 465 // mbcsTable.sbcsIndex[i] = mbcsTable.fromUnicodeTable[mbcsTable.fromUnicodeTable[i>>4]+((i<<2)&0x3c)]; 466 // } 467 /* set SBCS_FAST_MAX to reflect the reach of sbcsIndex[] even if header.version[2]>(SBCS_FAST_MAX>>8) */ 468 mbcsTable.maxFastUChar = SBCS_FAST_MAX; 469 } else { 470 /* 471 * MBCS: Stage 3 is allocated in 64-entry blocks for U+0000..MBCS_FAST_MAX or higher. 472 * The .cnv file is prebuilt with an additional stage table with indexes to each block. 473 */ 474 mbcsTable.maxFastUChar = (char)((header.version[2]<<8) | 0xff); 475 } 476 } 477 /* calculate a bit set of 4 ASCII characters per bit that round-trip to ASCII bytes */ 478 { 479 int asciiRoundtrips = 0xffffffff; 480 for (int i = 0; i < 0x80; ++i) { 481 if (mbcsTable.stateTable[0][i] != MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, i)) { 482 asciiRoundtrips &= ~(1 << (i >> 2)); 483 } 484 } 485 mbcsTable.asciiRoundtrips = asciiRoundtrips; 486 } 487 // TODO: Use asciiRoundtrips to speed up conversion, like in ICU4C. 488 489 if (noFromU) { 490 int stage1Length = (mbcsTable.unicodeMask&UConverterConstants.HAS_SUPPLEMENTARY) != 0 ? 
0x440 : 0x40; 491 int stage2Length = (header.offsetFromUBytes - header.offsetFromUTable)/4 - stage1Length/2; 492 reconstituteData(mbcsTable, stage1Length, stage2Length, header.fullStage2Length); 493 } 494 if (mbcsTable.outputType == MBCS_OUTPUT_DBCS_ONLY || mbcsTable.outputType == MBCS_OUTPUT_2_SISO) { 495 /* 496 * MBCS_OUTPUT_DBCS_ONLY: No SBCS mappings, therefore ASCII does not roundtrip. 497 * MBCS_OUTPUT_2_SISO: Bypass the ASCII fastpath to handle prevLength correctly. 498 */ 499 mbcsTable.asciiRoundtrips = 0; 500 } 501 } 502 // TODO: Use mbcsIndex to speed up UTF-16 conversion, like in ICU4C. 503 mbcsTable.mbcsIndex = null; 504 return data; 505 } 506 writeStage3Roundtrip(UConverterMBCSTable mbcsTable, long value, int codePoints[])507 private static boolean writeStage3Roundtrip(UConverterMBCSTable mbcsTable, long value, int codePoints[]) { 508 char[] table; 509 byte[] bytes; 510 int stage2; 511 int p; 512 int c; 513 int i, st3; 514 515 table = mbcsTable.fromUnicodeTable; 516 int[] tableInts = mbcsTable.fromUnicodeTableInts; 517 bytes = mbcsTable.fromUnicodeBytes; 518 char[] chars = mbcsTable.fromUnicodeChars; 519 int[] ints = mbcsTable.fromUnicodeInts; 520 521 /* for EUC outputTypes, modify the value like genmbcs.c's transformEUC() */ 522 switch(mbcsTable.outputType) { 523 case MBCS_OUTPUT_3_EUC: 524 if(value<=0xffff) { 525 /* short sequences are stored directly */ 526 /* code set 0 or 1 */ 527 } else if(value<=0x8effff) { 528 /* code set 2 */ 529 value&=0x7fff; 530 } else /* first byte is 0x8f */ { 531 /* code set 3 */ 532 value&=0xff7f; 533 } 534 break; 535 case MBCS_OUTPUT_4_EUC: 536 if(value<=0xffffff) { 537 /* short sequences are stored directly */ 538 /* code set 0 or 1 */ 539 } else if(value<=0x8effffffL) { 540 /* code set 2 */ 541 value&=0x7fffff; 542 } else /* first byte is 0x8f */ { 543 /* code set 3 */ 544 value&=0xff7fff; 545 } 546 break; 547 default: 548 break; 549 } 550 551 for(i=0; i<=0x1f; ++value, ++i) { 552 c=codePoints[i]; 553 if(c<0) { 554 
continue; 555 } 556 557 /* locate the stage 2 & 3 data */ 558 stage2 = table[c>>10] + ((c>>4)&0x3f); 559 st3 = tableInts[stage2]; 560 st3 = (char)(st3 * 16 + (c&0xf)); 561 562 /* write the codepage bytes into stage 3 */ 563 switch(mbcsTable.outputType) { 564 case MBCS_OUTPUT_3: 565 case MBCS_OUTPUT_4_EUC: 566 p = st3*3; 567 bytes[p] = (byte)(value>>16); 568 bytes[p+1] = (byte)(value>>8); 569 bytes[p+2] = (byte)value; 570 break; 571 case MBCS_OUTPUT_4: 572 ints[st3] = (int)value; 573 break; 574 default: 575 /* 2 bytes per character */ 576 chars[st3] = (char)value; 577 break; 578 } 579 580 // Set the roundtrip flag. 581 int shift = 16 + (c & 0x0F); 582 tableInts[stage2] |= (1L << shift); 583 } 584 return true; 585 } 586 reconstituteData(UConverterMBCSTable mbcsTable, int stage1Length, int stage2Length, int fullStage2Length)587 private static void reconstituteData(UConverterMBCSTable mbcsTable, 588 int stage1Length, int stage2Length, int fullStage2Length) { 589 char[] stage1 = mbcsTable.fromUnicodeTable; 590 591 // stage2 starts with unused stage1 space. 592 // Indexes into stage 2 count from the bottom of the fromUnicodeTable. 
593 int numStage1Ints = stage1Length / 2; // 2 chars = 1 int 594 int[] stage2 = new int[numStage1Ints + fullStage2Length]; 595 System.arraycopy(mbcsTable.fromUnicodeTableInts, numStage1Ints, 596 stage2, (fullStage2Length - stage2Length) + numStage1Ints, 597 stage2Length); 598 mbcsTable.fromUnicodeTableInts = stage2; 599 600 /* reconstitute the initial part of stage 2 from the mbcsIndex */ 601 { 602 int stageUTF8Length=(mbcsTable.maxFastUChar+1)>>6; 603 int stageUTF8Index=0; 604 int st1, st2, st3, i; 605 606 for (st1 = 0; stageUTF8Index < stageUTF8Length; ++st1) { 607 st2 = stage1[st1]; 608 if (st2 != stage1Length/2) { 609 /* each stage 2 block has 64 entries corresponding to 16 entries in the mbcsIndex */ 610 for (i = 0; i < 16; ++i) { 611 st3 = mbcsTable.mbcsIndex.get(stageUTF8Index++); 612 if (st3 != 0) { 613 /* a stage 2 entry's index is per stage 3 16-block, not per stage 3 entry */ 614 st3>>=4; 615 /* 616 * 4 stage 2 entries point to 4 consecutive stage 3 16-blocks which are 617 * allocated together as a single 64-block for access from the mbcsIndex 618 */ 619 stage2[st2++] = st3++; 620 stage2[st2++] = st3++; 621 stage2[st2++] = st3++; 622 stage2[st2++] = st3; 623 } else { 624 /* no stage 3 block, skip */ 625 st2+=4; 626 } 627 } 628 } else { 629 /* no stage 2 block, skip */ 630 stageUTF8Index+=16; 631 } 632 } 633 } 634 635 switch (mbcsTable.outputType) { 636 case CharsetMBCS.MBCS_OUTPUT_2: 637 case CharsetMBCS.MBCS_OUTPUT_2_SISO: 638 case CharsetMBCS.MBCS_OUTPUT_3_EUC: 639 mbcsTable.fromUnicodeChars = new char[mbcsTable.fromUBytesLength / 2]; 640 break; 641 case CharsetMBCS.MBCS_OUTPUT_3: 642 case CharsetMBCS.MBCS_OUTPUT_4_EUC: 643 mbcsTable.fromUnicodeBytes = new byte[mbcsTable.fromUBytesLength]; 644 break; 645 case CharsetMBCS.MBCS_OUTPUT_4: 646 mbcsTable.fromUnicodeInts = new int[mbcsTable.fromUBytesLength / 4]; 647 break; 648 default: 649 // Cannot occur, caller checked already. 
650 assert false; 651 } 652 653 /* reconstitute fromUnicodeBytes with roundtrips from toUnicode data */ 654 MBCSEnumToUnicode(mbcsTable); 655 } 656 657 /* 658 * Internal function enumerating the toUnicode data of an MBCS converter. 659 * Currently only used for reconstituting data for a MBCS_OPT_NO_FROM_U 660 * table, but could also be used for a future getUnicodeSet() option 661 * that includes reverse fallbacks (after updating this function's implementation). 662 * Currently only handles roundtrip mappings. 663 * Does not currently handle extensions. 664 */ MBCSEnumToUnicode(UConverterMBCSTable mbcsTable)665 private static void MBCSEnumToUnicode(UConverterMBCSTable mbcsTable) { 666 /* 667 * Properties for each state, to speed up the enumeration. 668 * Ignorable actions are unassigned/illegal/state-change-only: 669 * They do not lead to mappings. 670 * 671 * Bits 7..6 672 * 1 direct/initial state (stateful converters have mulitple) 673 * 0 non-initial state with transitions or with nonignorable result actions 674 * -1 final state with only ignorable actions 675 * 676 * Bits 5..3 677 * The lowest byte value with non-ignorable actions is 678 * value<<5 (rounded down). 679 * 680 * Bits 2..0: 681 * The highest byte value with non-ignorable actions is 682 * (value<<5)&0x1f (rounded up). 
683 */ 684 byte stateProps[] = new byte[MBCS_MAX_STATE_COUNT]; 685 int state; 686 687 /* recurse from state 0 and set all stateProps */ 688 getStateProp(mbcsTable.stateTable, stateProps, 0); 689 690 for (state = 0; state < mbcsTable.countStates; ++state) { 691 if (stateProps[state] >= 0x40) { 692 /* start from each direct state */ 693 enumToU(mbcsTable, stateProps, state, 0, 0); 694 } 695 } 696 697 698 } 699 enumToU(UConverterMBCSTable mbcsTable, byte stateProps[], int state, int offset, int value)700 private static boolean enumToU(UConverterMBCSTable mbcsTable, byte stateProps[], int state, int offset, int value) { 701 int[] codePoints = new int[32]; 702 int[] row; 703 char[] unicodeCodeUnits; 704 int anyCodePoints; 705 int b, limit; 706 707 row = mbcsTable.stateTable[state]; 708 unicodeCodeUnits = mbcsTable.unicodeCodeUnits; 709 710 value<<=8; 711 anyCodePoints = -1; /* becomes non-negative if there is a mapping */ 712 713 b = (stateProps[state]&0x38)<<2; 714 if (b == 0 && stateProps[state] >= 0x40) { 715 /* skip byte sequences with leading zeros because they are note stored in the fromUnicode table */ 716 codePoints[0] = UConverterConstants.U_SENTINEL; 717 b = 1; 718 } 719 limit = ((stateProps[state]&7)+1)<<5; 720 while (b < limit) { 721 int entry = row[b]; 722 if (MBCS_ENTRY_IS_TRANSITION(entry)) { 723 int nextState = MBCS_ENTRY_TRANSITION_STATE(entry); 724 if (stateProps[nextState] >= 0) { 725 /* recurse to a state with non-ignorable actions */ 726 if (!enumToU(mbcsTable, stateProps, nextState, offset+MBCS_ENTRY_TRANSITION_OFFSET(entry), value|b)) { 727 return false; 728 } 729 } 730 codePoints[b&0x1f] = UConverterConstants.U_SENTINEL; 731 } else { 732 int c; 733 int action; 734 735 /* 736 * An if-else-if chain provides more reliable performance for 737 * the most common cases compared to a switch. 
738 */ 739 action = MBCS_ENTRY_FINAL_ACTION(entry); 740 if (action == MBCS_STATE_VALID_DIRECT_16) { 741 /* output BMP code point */ 742 c = MBCS_ENTRY_FINAL_VALUE_16(entry); 743 } else if (action == MBCS_STATE_VALID_16) { 744 int finalOffset = offset+MBCS_ENTRY_FINAL_VALUE_16(entry); 745 c = unicodeCodeUnits[finalOffset]; 746 if (c < 0xfffe) { 747 /* output BMP code point */ 748 } else { 749 c = UConverterConstants.U_SENTINEL; 750 } 751 } else if (action == MBCS_STATE_VALID_16_PAIR) { 752 int finalOffset = offset+MBCS_ENTRY_FINAL_VALUE_16(entry); 753 c = unicodeCodeUnits[finalOffset++]; 754 if (c < 0xd800) { 755 /* output BMP code point below 0xd800 */ 756 } else if (c <= 0xdbff) { 757 /* output roundtrip or fallback supplementary code point */ 758 c = ((c&0x3ff)<<10)+unicodeCodeUnits[finalOffset]+(0x10000-0xdc00); 759 } else if (c == 0xe000) { 760 /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */ 761 c = unicodeCodeUnits[finalOffset]; 762 } else { 763 c = UConverterConstants.U_SENTINEL; 764 } 765 } else if (action == MBCS_STATE_VALID_DIRECT_20) { 766 /* output supplementary code point */ 767 c = MBCS_ENTRY_FINAL_VALUE(entry)+0x10000; 768 } else { 769 c = UConverterConstants.U_SENTINEL; 770 } 771 772 codePoints[b&0x1f] = c; 773 anyCodePoints&=c; 774 } 775 if (((++b)&0x1f) == 0) { 776 if(anyCodePoints>=0) { 777 if(!writeStage3Roundtrip(mbcsTable, value|(b-0x20), codePoints)) { 778 return false; 779 } 780 anyCodePoints=-1; 781 } 782 } 783 } 784 785 return true; 786 } 787 788 /* 789 * Only called if stateProps[state]==-1. 790 * A recursive call may do stateProps[state]|=0x40 if this state is the target of an 791 * MBCS_STATE_CHANGE_ONLY. 
792 */ getStateProp(int stateTable[][], byte stateProps[], int state)793 private static byte getStateProp(int stateTable[][], byte stateProps[], int state) { 794 int[] row; 795 int min, max, entry, nextState; 796 797 row = stateTable[state]; 798 stateProps[state] = 0; 799 800 /* find first non-ignorable state */ 801 for (min = 0;;++min) { 802 entry = row[min]; 803 nextState = MBCS_ENTRY_STATE(entry); 804 if (stateProps[nextState] == -1) { 805 getStateProp(stateTable, stateProps, nextState); 806 } 807 if (MBCS_ENTRY_IS_TRANSITION(entry)) { 808 if (stateProps[nextState] >- 0) { 809 break; 810 } 811 } else if (MBCS_ENTRY_FINAL_ACTION(entry) < MBCS_STATE_UNASSIGNED) { 812 break; 813 } 814 if (min == 0xff) { 815 stateProps[state] = -0x40; /* (byte)0xc0 */ 816 return stateProps[state]; 817 } 818 } 819 stateProps[state]|=(byte)((min>>5)<<3); 820 821 /* find last non-ignorable state */ 822 for (max = 0xff; min < max; --max) { 823 entry = row[max]; 824 nextState = MBCS_ENTRY_STATE(entry); 825 if (stateProps[nextState] == -1) { 826 getStateProp(stateTable, stateProps, nextState); 827 } 828 if (MBCS_ENTRY_IS_TRANSITION(entry)) { 829 if (stateProps[nextState] >- 0) { 830 break; 831 } 832 } else if (MBCS_ENTRY_FINAL_ACTION(entry) < MBCS_STATE_UNASSIGNED) { 833 break; 834 } 835 } 836 stateProps[state]|=(byte)(max>>5); 837 838 /* recurse further and collect direct-state information */ 839 while (min <= max) { 840 entry = row[min]; 841 nextState = MBCS_ENTRY_STATE(entry); 842 if (stateProps[nextState] == -1) { 843 getStateProp(stateTable, stateProps, nextState); 844 } 845 if (MBCS_ENTRY_IS_TRANSITION(entry)) { 846 stateProps[nextState]|=0x40; 847 if (MBCS_ENTRY_FINAL_ACTION(entry) <= MBCS_STATE_FALLBACK_DIRECT_20) { 848 stateProps[state]|=0x40; 849 } 850 } 851 ++min; 852 } 853 return stateProps[state]; 854 } 855 initializeConverter(int myOptions)856 protected void initializeConverter(int myOptions) { 857 UConverterMBCSTable mbcsTable; 858 ByteBuffer extIndexes; 859 short 
outputType; 860 byte maxBytesPerUChar; 861 862 mbcsTable = sharedData.mbcs; 863 outputType = mbcsTable.outputType; 864 865 if (outputType == MBCS_OUTPUT_DBCS_ONLY) { 866 /* the swaplfnl option does not apply, remove it */ 867 this.options = myOptions &= ~UConverterConstants.OPTION_SWAP_LFNL; 868 } 869 870 if ((myOptions & UConverterConstants.OPTION_SWAP_LFNL) != 0) { 871 /* do this because double-checked locking is broken */ 872 boolean isCached; 873 874 // agljport:todo umtx_lock(NULL); 875 isCached = mbcsTable.swapLFNLStateTable != null; 876 // agljport:todo umtx_unlock(NULL); 877 878 if (!isCached) { 879 try { 880 if (!EBCDICSwapLFNL()) { 881 /* this option does not apply, remove it */ 882 this.options = myOptions & ~UConverterConstants.OPTION_SWAP_LFNL; 883 } 884 } catch (Exception e) { 885 /* something went wrong. */ 886 return; 887 } 888 } 889 } 890 891 String lowerCaseName = icuCanonicalName.toLowerCase(Locale.ENGLISH); 892 if (lowerCaseName.indexOf("gb18030") >= 0) { 893 /* set a flag for GB 18030 mode, which changes the callback behavior */ 894 this.options |= MBCS_OPTION_GB18030; 895 } else if (lowerCaseName.indexOf("keis") >= 0) { 896 this.options |= MBCS_OPTION_KEIS; 897 } else if (lowerCaseName.indexOf("jef") >= 0) { 898 this.options |= MBCS_OPTION_JEF; 899 } else if (lowerCaseName.indexOf("jips") >= 0) { 900 this.options |= MBCS_OPTION_JIPS; 901 } 902 903 /* fix maxBytesPerUChar depending on outputType and options etc. */ 904 if (outputType == MBCS_OUTPUT_2_SISO) { 905 /* changed from 3 to 4 in ICU4J only. 
#9205 */ 906 maxBytesPerChar = 4; /* SO+DBCS+SI*/ 907 } 908 909 extIndexes = mbcsTable.extIndexes; 910 if (extIndexes != null) { 911 maxBytesPerUChar = (byte) GET_MAX_BYTES_PER_UCHAR(extIndexes); 912 if (outputType == MBCS_OUTPUT_2_SISO) { 913 ++maxBytesPerUChar; /* SO + multiple DBCS */ 914 } 915 916 if (maxBytesPerUChar > maxBytesPerChar) { 917 maxBytesPerChar = maxBytesPerUChar; 918 } 919 } 920 } 921 /* EBCDIC swap LF<->NL--------------------------------------------------------------------------------*/ 922 /* 923 * This code modifies a standard EBCDIC<->Unicode mappling table for 924 * OS/390 (z/OS) Unix System Services (Open Edition). 925 * The difference is in the mapping of Line Feed and New Line control codes: 926 * Standard EBDIC maps 927 * 928 * <U000A> \x25 |0 929 * <U0085> \x15 |0 930 * 931 * but OS/390 USS EBCDIC swaps the control codes for LF and NL, 932 * mapping 933 * 934 * <U000A> \x15 |0 935 * <U0085> \x25 |0 936 * 937 * This code modifies a loaded standard EBCDIC<->Unicode mapping table 938 * by copying it into allocated memory and swapping the LF and NL values. 939 * It allows to support the same EBCDIC charset in both version without 940 * duplicating the entire installed table. 
941 */ 942 /* standard EBCDIC codes */ 943 private static final short EBCDIC_LF = 0x0025; 944 private static final short EBCDIC_NL = 0x0015; 945 946 /* standard EBCDIC codes with roundtrip flag as stored in Unicode-to-single-byte tables */ 947 private static final short EBCDIC_RT_LF = 0x0f25; 948 private static final short EBCDIC_RT_NL = 0x0f15; 949 950 /* Unicode code points */ 951 private static final short U_LF = 0x000A; 952 private static final short U_NL = 0x0085; 953 EBCDICSwapLFNL()954 private boolean EBCDICSwapLFNL() throws Exception { 955 UConverterMBCSTable mbcsTable; 956 957 char[] table; 958 959 int[][] newStateTable; 960 String newName; 961 962 int stage2Entry; 963 964 mbcsTable = sharedData.mbcs; 965 966 table = mbcsTable.fromUnicodeTable; 967 int[] tableInts = sharedData.mbcs.fromUnicodeTableInts; 968 char[] chars = mbcsTable.fromUnicodeChars; 969 char[] results = chars; 970 971 /* 972 * Check that this is an EBCDIC table with SBCS portion - 973 * SBCS or EBCDIC with standard EBCDIC LF and NL mappings. 974 * 975 * If not, ignore the option. Options are always ignored if they do not apply. 
976 */ 977 if (!((mbcsTable.outputType == MBCS_OUTPUT_1 || mbcsTable.outputType == MBCS_OUTPUT_2_SISO) && 978 mbcsTable.stateTable[0][EBCDIC_LF] == MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_LF) && 979 mbcsTable.stateTable[0][EBCDIC_NL] == MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_NL))) { 980 return false; 981 } 982 983 if (mbcsTable.outputType == MBCS_OUTPUT_1) { 984 if (!(EBCDIC_RT_LF == MBCS_SINGLE_RESULT_FROM_U(table, results, U_LF) && 985 EBCDIC_RT_NL == MBCS_SINGLE_RESULT_FROM_U(table, results, U_NL))) { 986 return false; 987 } 988 } else /* MBCS_OUTPUT_2_SISO */ { 989 stage2Entry = MBCS_STAGE_2_FROM_U(table, tableInts, U_LF); 990 if (!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, U_LF) && 991 EBCDIC_LF == MBCS_VALUE_2_FROM_STAGE_2(chars, stage2Entry, U_LF))) { 992 return false; 993 } 994 995 stage2Entry = MBCS_STAGE_2_FROM_U(table, tableInts, U_NL); 996 if (!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, U_NL) && 997 EBCDIC_NL == MBCS_VALUE_2_FROM_STAGE_2(chars, stage2Entry, U_NL))) { 998 return false; 999 } 1000 } 1001 1002 if (mbcsTable.fromUBytesLength > 0) { 1003 /* 1004 * We _know_ the number of bytes in the fromUnicodeBytes array 1005 * starting with header.version 4.1. 1006 */ 1007 // sizeofFromUBytes = mbcsTable.fromUBytesLength; 1008 } else { 1009 /* 1010 * Otherwise: 1011 * There used to be code to enumerate the fromUnicode 1012 * trie and find the highest entry, but it was removed in ICU 3.2 1013 * because it was not tested and caused a low code coverage number. 1014 */ 1015 throw new Exception("U_INVALID_FORMAT_ERROR"); 1016 } 1017 1018 /* 1019 * The table has an appropriate format. 
1020 * Allocate and build 1021 * - a modified to-Unicode state table 1022 * - a modified from-Unicode output array 1023 * - a converter name string with the swap option appended 1024 */ 1025 // size = mbcsTable.countStates * 1024 + sizeofFromUBytes + UConverterConstants.MAX_CONVERTER_NAME_LENGTH + 20; 1026 1027 /* copy and modify the to-Unicode state table */ 1028 newStateTable = new int[mbcsTable.stateTable.length][mbcsTable.stateTable[0].length]; 1029 for (int i = 0; i < newStateTable.length; i++) { 1030 System.arraycopy(mbcsTable.stateTable[i], 0, newStateTable[i], 0, newStateTable[i].length); 1031 } 1032 1033 newStateTable[0][EBCDIC_LF] = MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_NL); 1034 newStateTable[0][EBCDIC_NL] = MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_LF); 1035 1036 /* copy and modify the from-Unicode result table */ 1037 char[] newResults = new char[chars.length]; 1038 System.arraycopy(chars, 0, newResults, 0, chars.length); 1039 /* conveniently, the table access macros work on the left side of expressions */ 1040 if (mbcsTable.outputType == MBCS_OUTPUT_1) { 1041 MBCS_SINGLE_RESULT_FROM_U_SET(table, newResults, U_LF, EBCDIC_RT_NL); 1042 MBCS_SINGLE_RESULT_FROM_U_SET(table, newResults, U_NL, EBCDIC_RT_LF); 1043 } else /* MBCS_OUTPUT_2_SISO */ { 1044 stage2Entry = MBCS_STAGE_2_FROM_U(table, tableInts, U_LF); 1045 MBCS_VALUE_2_FROM_STAGE_2_SET(newResults, stage2Entry, U_LF, EBCDIC_NL); 1046 1047 stage2Entry = MBCS_STAGE_2_FROM_U(table, tableInts, U_NL); 1048 MBCS_VALUE_2_FROM_STAGE_2_SET(newResults, stage2Entry, U_NL, EBCDIC_LF); 1049 } 1050 1051 /* set the canonical converter name */ 1052 newName = icuCanonicalName.concat(UConverterConstants.OPTION_SWAP_LFNL_STRING); 1053 1054 if (mbcsTable.swapLFNLStateTable == null) { 1055 mbcsTable.swapLFNLStateTable = newStateTable; 1056 mbcsTable.swapLFNLFromUnicodeChars = newResults; 1057 mbcsTable.swapLFNLName = newName; 1058 } 1059 return true; 1060 } 1061 1062 /** 1063 * MBCS output types for 
conversions from Unicode. These per-converter types determine the storage method in stage 3 1064 * of the lookup table, mostly how many bytes are stored per entry. 1065 */ 1066 static final int MBCS_OUTPUT_1 = 0; /* 0 */ 1067 static final int MBCS_OUTPUT_2 = MBCS_OUTPUT_1 + 1; /* 1 */ 1068 static final int MBCS_OUTPUT_3 = MBCS_OUTPUT_2 + 1; /* 2 */ 1069 static final int MBCS_OUTPUT_4 = MBCS_OUTPUT_3 + 1; /* 3 */ 1070 static final int MBCS_OUTPUT_3_EUC = 8; /* 8 */ 1071 static final int MBCS_OUTPUT_4_EUC = MBCS_OUTPUT_3_EUC + 1; /* 9 */ 1072 static final int MBCS_OUTPUT_2_SISO = 12; /* c */ 1073 static final int MBCS_OUTPUT_2_HZ = MBCS_OUTPUT_2_SISO + 1; /* d */ 1074 static final int MBCS_OUTPUT_EXT_ONLY = MBCS_OUTPUT_2_HZ + 1; /* e */ 1075 // static final int MBCS_OUTPUT_COUNT = MBCS_OUTPUT_EXT_ONLY + 1; 1076 static final int MBCS_OUTPUT_DBCS_ONLY = 0xdb; /* runtime-only type for DBCS-only handling of SISO tables */ 1077 1078 /* GB 18030 data ------------------------------------------------------------ */ 1079 1080 /* helper macros for linear values for GB 18030 four-byte sequences */ LINEAR_18030(int a, int b, int c, int d)1081 private static int LINEAR_18030(int a, int b, int c, int d) { 1082 return ((((a & 0xff) * 10 + (b & 0xff)) * 126 + (c & 0xff)) * 10 + (d & 0xff)); 1083 } 1084 1085 private static int LINEAR_18030_BASE = LINEAR_18030(0x81, 0x30, 0x81, 0x30); 1086 LINEAR(int x)1087 private static int LINEAR(int x) { 1088 return LINEAR_18030(x >>> 24, (x >>> 16) & 0xff, (x >>> 8) & 0xff, x & 0xff); 1089 } 1090 1091 /* 1092 * Some ranges of GB 18030 where both the Unicode code points and the GB four-byte sequences are contiguous and are 1093 * handled algorithmically by the special callback functions below. The values are start & end of Unicode & GB 1094 * codes. 1095 * 1096 * Note that single surrogates are not mapped by GB 18030 as of the re-released mapping tables from 2000-nov-30. 
1097 */ 1098 private static final int gb18030Ranges[][] = new int[/* 14 */][/* 4 */] { 1099 { 0x10000, 0x10FFFF, LINEAR(0x90308130), LINEAR(0xE3329A35) }, 1100 { 0x9FA6, 0xD7FF, LINEAR(0x82358F33), LINEAR(0x8336C738) }, 1101 { 0x0452, 0x1E3E, LINEAR(0x8130D330), LINEAR(0x8135F436) }, 1102 { 0x1E40, 0x200F, LINEAR(0x8135F438), LINEAR(0x8136A531) }, 1103 { 0xE865, 0xF92B, LINEAR(0x8336D030), LINEAR(0x84308534) }, 1104 { 0x2643, 0x2E80, LINEAR(0x8137A839), LINEAR(0x8138FD38) }, 1105 { 0xFA2A, 0xFE2F, LINEAR(0x84309C38), LINEAR(0x84318537) }, 1106 { 0x3CE1, 0x4055, LINEAR(0x8231D438), LINEAR(0x8232AF32) }, 1107 { 0x361B, 0x3917, LINEAR(0x8230A633), LINEAR(0x8230F237) }, 1108 { 0x49B8, 0x4C76, LINEAR(0x8234A131), LINEAR(0x8234E733) }, 1109 { 0x4160, 0x4336, LINEAR(0x8232C937), LINEAR(0x8232F837) }, 1110 { 0x478E, 0x4946, LINEAR(0x8233E838), LINEAR(0x82349638) }, 1111 { 0x44D7, 0x464B, LINEAR(0x8233A339), LINEAR(0x8233C931) }, 1112 { 0xFFE6, 0xFFFF, LINEAR(0x8431A234), LINEAR(0x8431A439) } }; 1113 1114 /* bit flag for UConverter.options indicating GB 18030 special handling */ 1115 private static final int MBCS_OPTION_GB18030 = 0x8000; 1116 1117 /* bit flag for UConverter.options indicating KEIS,JEF,JIF special handling */ 1118 private static final int MBCS_OPTION_KEIS = 0x01000; 1119 private static final int MBCS_OPTION_JEF = 0x02000; 1120 private static final int MBCS_OPTION_JIPS = 0x04000; 1121 1122 private static enum SISO_Option { 1123 SI, 1124 SO 1125 } 1126 1127 private static final byte[] KEIS_SO_CHAR = { 0x0A, 0x42 }; 1128 private static final byte[] KEIS_SI_CHAR = { 0x0A, 0x41 }; 1129 private static final byte JEF_SO_CHAR = 0x28; 1130 private static final byte JEF_SI_CHAR = 0x29; 1131 private static final byte[] JIPS_SO_CHAR = { 0x1A, 0x70 }; 1132 private static final byte[] JIPS_SI_CHAR = { 0x1A, 0x71 }; 1133 getSISOBytes(SISO_Option option, int cnvOption, byte[] value)1134 private static int getSISOBytes(SISO_Option option, int cnvOption, byte[] value) { 1135 
int SISOLength = 0; 1136 1137 switch (option) { 1138 case SI: 1139 if ((cnvOption&MBCS_OPTION_KEIS)!=0) { 1140 value[0] = KEIS_SI_CHAR[0]; 1141 value[1] = KEIS_SI_CHAR[1]; 1142 SISOLength = 2; 1143 } else if ((cnvOption&MBCS_OPTION_JEF)!=0) { 1144 value[0] = JEF_SI_CHAR; 1145 SISOLength = 1; 1146 } else if ((cnvOption&MBCS_OPTION_JIPS)!=0) { 1147 value[0] = JIPS_SI_CHAR[0]; 1148 value[1] = JIPS_SI_CHAR[1]; 1149 SISOLength = 2; 1150 } else { 1151 value[0] = UConverterConstants.SI; 1152 SISOLength = 1; 1153 } 1154 break; 1155 case SO: 1156 if ((cnvOption&MBCS_OPTION_KEIS)!=0) { 1157 value[0] = KEIS_SO_CHAR[0]; 1158 value[1] = KEIS_SO_CHAR[1]; 1159 SISOLength = 2; 1160 } else if ((cnvOption&MBCS_OPTION_JEF)!=0) { 1161 value[0] = JEF_SO_CHAR; 1162 SISOLength = 1; 1163 } else if ((cnvOption&MBCS_OPTION_JIPS)!=0) { 1164 value[0] = JIPS_SO_CHAR[0]; 1165 value[1] = JIPS_SO_CHAR[1]; 1166 SISOLength = 2; 1167 } else { 1168 value[0] = UConverterConstants.SO; 1169 SISOLength = 1; 1170 } 1171 break; 1172 default: 1173 /* Should never happen. */ 1174 break; 1175 } 1176 1177 return SISOLength; 1178 } 1179 // enum { 1180 static final int MBCS_MAX_STATE_COUNT = 128; 1181 // }; 1182 /** 1183 * MBCS action codes for conversions to Unicode. These values are in bits 23..20 of the state table entries. 
1184 */ 1185 static final int MBCS_STATE_VALID_DIRECT_16 = 0; 1186 static final int MBCS_STATE_VALID_DIRECT_20 = MBCS_STATE_VALID_DIRECT_16 + 1; 1187 static final int MBCS_STATE_FALLBACK_DIRECT_16 = MBCS_STATE_VALID_DIRECT_20 + 1; 1188 static final int MBCS_STATE_FALLBACK_DIRECT_20 = MBCS_STATE_FALLBACK_DIRECT_16 + 1; 1189 static final int MBCS_STATE_VALID_16 = MBCS_STATE_FALLBACK_DIRECT_20 + 1; 1190 static final int MBCS_STATE_VALID_16_PAIR = MBCS_STATE_VALID_16 + 1; 1191 static final int MBCS_STATE_UNASSIGNED = MBCS_STATE_VALID_16_PAIR + 1; 1192 static final int MBCS_STATE_ILLEGAL = MBCS_STATE_UNASSIGNED + 1; 1193 static final int MBCS_STATE_CHANGE_ONLY = MBCS_STATE_ILLEGAL + 1; 1194 MBCS_ENTRY_SET_STATE(int entry, int state)1195 static int MBCS_ENTRY_SET_STATE(int entry, int state) { 1196 return (entry&0x80ffffff)|(state<<24L); 1197 } 1198 MBCS_ENTRY_STATE(int entry)1199 static int MBCS_ENTRY_STATE(int entry) { 1200 return (((entry)>>24)&0x7f); 1201 } 1202 1203 /* Methods for state table entries */ MBCS_ENTRY_TRANSITION(int state, int offset)1204 static int MBCS_ENTRY_TRANSITION(int state, int offset) { 1205 return (state << 24L) | offset; 1206 } 1207 MBCS_ENTRY_FINAL(int state, int action, int value)1208 static int MBCS_ENTRY_FINAL(int state, int action, int value) { 1209 return 0x80000000 | (state << 24L) | (action << 20L) | value; 1210 } 1211 MBCS_ENTRY_IS_TRANSITION(int entry)1212 static boolean MBCS_ENTRY_IS_TRANSITION(int entry) { 1213 return (entry) >= 0; 1214 } 1215 MBCS_ENTRY_IS_FINAL(int entry)1216 static boolean MBCS_ENTRY_IS_FINAL(int entry) { 1217 return (entry) < 0; 1218 } 1219 MBCS_ENTRY_TRANSITION_STATE(int entry)1220 static int MBCS_ENTRY_TRANSITION_STATE(int entry) { 1221 return ((entry) >>> 24); 1222 } 1223 MBCS_ENTRY_TRANSITION_OFFSET(int entry)1224 static int MBCS_ENTRY_TRANSITION_OFFSET(int entry) { 1225 return ((entry) & 0xffffff); 1226 } 1227 MBCS_ENTRY_FINAL_STATE(int entry)1228 static int MBCS_ENTRY_FINAL_STATE(int entry) { 1229 return 
((entry) >>> 24) & 0x7f; 1230 } 1231 MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(int entry)1232 static boolean MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(int entry) { 1233 return ((entry) < 0x80100000); 1234 } 1235 MBCS_ENTRY_FINAL_ACTION(int entry)1236 static int MBCS_ENTRY_FINAL_ACTION(int entry) { 1237 return ((entry) >>> 20) & 0xf; 1238 } 1239 MBCS_ENTRY_FINAL_VALUE(int entry)1240 static int MBCS_ENTRY_FINAL_VALUE(int entry) { 1241 return ((entry) & 0xfffff); 1242 } 1243 MBCS_ENTRY_FINAL_VALUE_16(int entry)1244 static char MBCS_ENTRY_FINAL_VALUE_16(int entry) { 1245 return (char) (entry); 1246 } 1247 MBCS_IS_ASCII_ROUNDTRIP(int b, long asciiRoundtrips)1248 static boolean MBCS_IS_ASCII_ROUNDTRIP(int b, long asciiRoundtrips) { 1249 return (((asciiRoundtrips) & (1<<((b)>>2)))!=0); 1250 } 1251 1252 /** 1253 * This macro version of _MBCSSingleSimpleGetNextUChar() gets a code point from a byte. It works for single-byte, 1254 * single-state codepages that only map to and from BMP code points, and it always returns fallback values. 
1255 */ MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(UConverterMBCSTable mbcs, final int b)1256 static char MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(UConverterMBCSTable mbcs, final int b) { 1257 assert 0 <= b && b <= 0xff; 1258 return MBCS_ENTRY_FINAL_VALUE_16(mbcs.stateTable[0][b]); 1259 } 1260 1261 /* single-byte fromUnicode: get the 16-bit result word */ MBCS_SINGLE_RESULT_FROM_U(char[] table, char[] results, int c)1262 static char MBCS_SINGLE_RESULT_FROM_U(char[] table, char[] results, int c) { 1263 int i1 = table[c >>> 10] + ((c >>> 4) & 0x3f); 1264 int i = table[i1] + (c & 0xf); 1265 return results[i]; 1266 } 1267 1268 /* single-byte fromUnicode: set the 16-bit result word with newValue*/ MBCS_SINGLE_RESULT_FROM_U_SET(char[] table, char[] results, int c, int newValue)1269 static void MBCS_SINGLE_RESULT_FROM_U_SET(char[] table, char[] results, int c, int newValue) { 1270 int i1 = table[c >>> 10] + ((c >>> 4) & 0x3f); 1271 int i = table[i1] + (c & 0xf); 1272 results[i] = (char) newValue; 1273 } 1274 1275 /* multi-byte fromUnicode: get the 32-bit stage 2 entry */ MBCS_STAGE_2_FROM_U(char[] table, int[] tableInts, int c)1276 static int MBCS_STAGE_2_FROM_U(char[] table, int[] tableInts, int c) { 1277 int i = table[(c) >>> 10] + ((c >>> 4) & 0x3f); 1278 return tableInts[i]; 1279 } 1280 MBCS_FROM_U_IS_ROUNDTRIP(int stage2Entry, int c)1281 private static boolean MBCS_FROM_U_IS_ROUNDTRIP(int stage2Entry, int c) { 1282 return (((stage2Entry) & (1 << (16 + ((c) & 0xf)))) != 0); 1283 } 1284 MBCS_VALUE_2_FROM_STAGE_2(char[] chars, int stage2Entry, int c)1285 static char MBCS_VALUE_2_FROM_STAGE_2(char[] chars, int stage2Entry, int c) { 1286 int i = 16 * (stage2Entry & UConverterConstants.UNSIGNED_SHORT_MASK) + (c & 0xf); 1287 return chars[i]; 1288 } 1289 MBCS_VALUE_2_FROM_STAGE_2_SET(char[] chars, int stage2Entry, int c, int newValue)1290 static void MBCS_VALUE_2_FROM_STAGE_2_SET(char[] chars, int stage2Entry, int c, int newValue) { 1291 int i = 16 * (stage2Entry & 
UConverterConstants.UNSIGNED_SHORT_MASK) + (c & 0xf); 1292 chars[i] = (char) newValue; 1293 } 1294 MBCS_VALUE_4_FROM_STAGE_2(int[] ints, int stage2Entry, int c)1295 private static int MBCS_VALUE_4_FROM_STAGE_2(int[] ints, int stage2Entry, int c) { 1296 int i = 16 * (stage2Entry & UConverterConstants.UNSIGNED_SHORT_MASK) + (c & 0xf); 1297 return ints[i]; 1298 } 1299 MBCS_POINTER_3_FROM_STAGE_2(byte[] bytes, int stage2Entry, int c)1300 static int MBCS_POINTER_3_FROM_STAGE_2(byte[] bytes, int stage2Entry, int c) { 1301 return ((16 * (stage2Entry & UConverterConstants.UNSIGNED_SHORT_MASK) + ((c) & 0xf)) * 3); 1302 } 1303 1304 // ------------UConverterExt------------------------------------------------------- 1305 1306 static final int EXT_INDEXES_LENGTH = 0; /* 0 */ 1307 1308 static final int EXT_TO_U_INDEX = EXT_INDEXES_LENGTH + 1; /* 1 */ 1309 static final int EXT_TO_U_LENGTH = EXT_TO_U_INDEX + 1; 1310 static final int EXT_TO_U_UCHARS_INDEX = EXT_TO_U_LENGTH + 1; 1311 static final int EXT_TO_U_UCHARS_LENGTH = EXT_TO_U_UCHARS_INDEX + 1; 1312 1313 static final int EXT_FROM_U_UCHARS_INDEX = EXT_TO_U_UCHARS_LENGTH + 1; /* 5 */ 1314 static final int EXT_FROM_U_VALUES_INDEX = EXT_FROM_U_UCHARS_INDEX + 1; 1315 static final int EXT_FROM_U_LENGTH = EXT_FROM_U_VALUES_INDEX + 1; 1316 static final int EXT_FROM_U_BYTES_INDEX = EXT_FROM_U_LENGTH + 1; 1317 static final int EXT_FROM_U_BYTES_LENGTH = EXT_FROM_U_BYTES_INDEX + 1; 1318 1319 static final int EXT_FROM_U_STAGE_12_INDEX = EXT_FROM_U_BYTES_LENGTH + 1; /* 10 */ 1320 static final int EXT_FROM_U_STAGE_1_LENGTH = EXT_FROM_U_STAGE_12_INDEX + 1; 1321 static final int EXT_FROM_U_STAGE_12_LENGTH = EXT_FROM_U_STAGE_1_LENGTH + 1; 1322 static final int EXT_FROM_U_STAGE_3_INDEX = EXT_FROM_U_STAGE_12_LENGTH + 1; 1323 static final int EXT_FROM_U_STAGE_3_LENGTH = EXT_FROM_U_STAGE_3_INDEX + 1; 1324 static final int EXT_FROM_U_STAGE_3B_INDEX = EXT_FROM_U_STAGE_3_LENGTH + 1; 1325 static final int EXT_FROM_U_STAGE_3B_LENGTH = 
EXT_FROM_U_STAGE_3B_INDEX + 1; 1326 1327 private static final int EXT_COUNT_BYTES = EXT_FROM_U_STAGE_3B_LENGTH + 1; /* 17 */ 1328 // private static final int EXT_COUNT_UCHARS = EXT_COUNT_BYTES + 1; 1329 // private static final int EXT_FLAGS = EXT_COUNT_UCHARS + 1; 1330 // 1331 // private static final int EXT_RESERVED_INDEX = EXT_FLAGS + 1; /* 20, moves with additional indexes */ 1332 // 1333 // private static final int EXT_SIZE=31; 1334 // private static final int EXT_INDEXES_MIN_LENGTH=32; 1335 1336 static final int EXT_FROM_U_MAX_DIRECT_LENGTH = 3; 1337 1338 /* toUnicode helpers -------------------------------------------------------- */ 1339 1340 private static final int TO_U_BYTE_SHIFT = 24; 1341 private static final int TO_U_VALUE_MASK = 0xffffff; 1342 private static final int TO_U_MIN_CODE_POINT = 0x1f0000; 1343 private static final int TO_U_MAX_CODE_POINT = 0x2fffff; 1344 private static final int TO_U_ROUNDTRIP_FLAG = (1 << 23); 1345 private static final int TO_U_INDEX_MASK = 0x3ffff; 1346 private static final int TO_U_LENGTH_SHIFT = 18; 1347 private static final int TO_U_LENGTH_OFFSET = 12; 1348 1349 /* maximum number of indexed UChars */ 1350 static final int MAX_UCHARS = 19; 1351 TO_U_GET_BYTE(int word)1352 static int TO_U_GET_BYTE(int word) { 1353 return word >>> TO_U_BYTE_SHIFT; 1354 } 1355 TO_U_GET_VALUE(int word)1356 static int TO_U_GET_VALUE(int word) { 1357 return word & TO_U_VALUE_MASK; 1358 } 1359 TO_U_IS_ROUNDTRIP(int value)1360 static boolean TO_U_IS_ROUNDTRIP(int value) { 1361 return (value & TO_U_ROUNDTRIP_FLAG) != 0; 1362 } 1363 TO_U_IS_PARTIAL(int value)1364 static boolean TO_U_IS_PARTIAL(int value) { 1365 return 0 <= value && value < TO_U_MIN_CODE_POINT; 1366 } 1367 TO_U_GET_PARTIAL_INDEX(int value)1368 static int TO_U_GET_PARTIAL_INDEX(int value) { 1369 return value; 1370 } 1371 TO_U_MASK_ROUNDTRIP(int value)1372 static int TO_U_MASK_ROUNDTRIP(int value) { 1373 return value & ~TO_U_ROUNDTRIP_FLAG; 1374 } 1375 TO_U_MAKE_WORD(byte b, int 
value)1376 private static int TO_U_MAKE_WORD(byte b, int value) { 1377 // TO_U_BYTE_SHIFT == 24: safe to just shift the signed byte-as-int. 1378 return (b << TO_U_BYTE_SHIFT) | value; 1379 } 1380 1381 /* use after masking off the roundtrip flag */ TO_U_IS_CODE_POINT(int value)1382 static boolean TO_U_IS_CODE_POINT(int value) { 1383 assert value >= 0; 1384 return value <= TO_U_MAX_CODE_POINT; 1385 } 1386 TO_U_GET_CODE_POINT(int value)1387 static int TO_U_GET_CODE_POINT(int value) { 1388 assert value >= 0; 1389 return value - TO_U_MIN_CODE_POINT; 1390 } 1391 TO_U_GET_INDEX(int value)1392 private static int TO_U_GET_INDEX(int value) { 1393 return value & TO_U_INDEX_MASK; 1394 } 1395 TO_U_GET_LENGTH(int value)1396 private static int TO_U_GET_LENGTH(int value) { 1397 return (value >>> TO_U_LENGTH_SHIFT) - TO_U_LENGTH_OFFSET; 1398 } 1399 1400 /* fromUnicode helpers ------------------------------------------------------ */ 1401 1402 /* most trie constants are shared with ucnvmbcs.h */ 1403 private static final int STAGE_2_LEFT_SHIFT = 2; 1404 1405 // private static final int STAGE_3_GRANULARITY = 4; 1406 1407 /* trie access, returns the stage 3 value=index to stage 3b; s1Index=c>>10 */ FROM_U(CharBuffer stage12, CharBuffer stage3, int s1Index, int c)1408 static int FROM_U(CharBuffer stage12, CharBuffer stage3, int s1Index, int c) { 1409 return stage3.get((stage12.get((stage12.get(s1Index) + ((c >>> 4) & 0x3f))) << STAGE_2_LEFT_SHIFT) 1410 + (c & 0xf)); 1411 } 1412 1413 private static final int FROM_U_LENGTH_SHIFT = 24; 1414 private static final int FROM_U_ROUNDTRIP_FLAG = 1 << 31; 1415 static final int FROM_U_RESERVED_MASK = 0x60000000; 1416 private static final int FROM_U_DATA_MASK = 0xffffff; 1417 1418 /* special value for "no mapping" to <subchar1> (impossible roundtrip to 0 bytes, value 01) */ 1419 static final int FROM_U_SUBCHAR1 = 0x80000001; 1420 1421 /* at most 3 bytes in the lower part of the value */ 1422 private static final int FROM_U_MAX_DIRECT_LENGTH = 3; 
1423 1424 /* maximum number of indexed bytes */ 1425 static final int MAX_BYTES = 0x1f; 1426 FROM_U_IS_PARTIAL(int value)1427 static boolean FROM_U_IS_PARTIAL(int value) { 1428 return (value >>> FROM_U_LENGTH_SHIFT) == 0; 1429 } 1430 FROM_U_GET_PARTIAL_INDEX(int value)1431 static int FROM_U_GET_PARTIAL_INDEX(int value) { 1432 return value; 1433 } 1434 FROM_U_IS_ROUNDTRIP(int value)1435 static boolean FROM_U_IS_ROUNDTRIP(int value) { 1436 return (value & FROM_U_ROUNDTRIP_FLAG) != 0; 1437 } 1438 FROM_U_MASK_ROUNDTRIP(int value)1439 private static int FROM_U_MASK_ROUNDTRIP(int value) { 1440 return value & ~FROM_U_ROUNDTRIP_FLAG; 1441 } 1442 1443 /* use after masking off the roundtrip flag */ FROM_U_GET_LENGTH(int value)1444 static int FROM_U_GET_LENGTH(int value) { 1445 return (value >>> FROM_U_LENGTH_SHIFT) & MAX_BYTES; 1446 } 1447 1448 /* get bytes or bytes index */ FROM_U_GET_DATA(int value)1449 static int FROM_U_GET_DATA(int value) { 1450 return value & FROM_U_DATA_MASK; 1451 } 1452 1453 /* get the pointer to an extension array from indexes[index] */ ARRAY(ByteBuffer indexes, int index, Class<?> itemType)1454 static Buffer ARRAY(ByteBuffer indexes, int index, Class<?> itemType) { 1455 int oldpos = indexes.position(); 1456 Buffer b; 1457 1458 // TODO: It is very inefficient to create Buffer objects for each array access. 1459 // We should create an inner class Extensions (or sibling class CharsetMBCSExtensions) 1460 // which has buffers for the arrays, together with the code that works with them. 
1461 indexes.position(indexes.getInt(index << 2)); 1462 if (itemType == int.class) 1463 b = indexes.asIntBuffer(); 1464 else if (itemType == char.class) 1465 b = indexes.asCharBuffer(); 1466 else if (itemType == short.class) 1467 b = indexes.asShortBuffer(); 1468 else 1469 // default or (itemType == byte.class) 1470 b = indexes.slice(); 1471 indexes.position(oldpos); 1472 return b; 1473 } 1474 GET_MAX_BYTES_PER_UCHAR(ByteBuffer indexes)1475 private static int GET_MAX_BYTES_PER_UCHAR(ByteBuffer indexes) { 1476 indexes.position(0); 1477 return indexes.getInt(EXT_COUNT_BYTES) & 0xff; 1478 } 1479 1480 /* 1481 * @return index of the UChar, if found; else <0 1482 */ findFromU(CharBuffer fromUSection, int length, char u)1483 static int findFromU(CharBuffer fromUSection, int length, char u) { 1484 int i, start, limit; 1485 1486 /* binary search */ 1487 start = 0; 1488 limit = length; 1489 for (;;) { 1490 i = limit - start; 1491 if (i <= 1) { 1492 break; /* done */ 1493 } 1494 /* start<limit-1 */ 1495 1496 if (i <= 4) { 1497 /* linear search for the last part */ 1498 if (u <= fromUSection.get(fromUSection.position() + start)) { 1499 break; 1500 } 1501 if (++start < limit && u <= fromUSection.get(fromUSection.position() + start)) { 1502 break; 1503 } 1504 if (++start < limit && u <= fromUSection.get(fromUSection.position() + start)) { 1505 break; 1506 } 1507 /* always break at start==limit-1 */ 1508 ++start; 1509 break; 1510 } 1511 1512 i = (start + limit) / 2; 1513 if (u < fromUSection.get(fromUSection.position() + i)) { 1514 limit = i; 1515 } else { 1516 start = i; 1517 } 1518 } 1519 1520 /* did we really find it? 
*/ 1521 if (start < limit && u == fromUSection.get(fromUSection.position() + start)) { 1522 return start; 1523 } else { 1524 return -1; /* not found */ 1525 } 1526 } 1527 1528 /* 1529 * @return lookup value for the byte, if found; else 0 1530 */ findToU(IntBuffer toUSection, int length, short byt)1531 static int findToU(IntBuffer toUSection, int length, short byt) { 1532 long word0, word; 1533 int i, start, limit; 1534 1535 /* check the input byte against the lowest and highest section bytes */ 1536 // agljport:comment instead of receiving a start position parameter for toUSection we'll rely on its position 1537 // property 1538 start = TO_U_GET_BYTE(toUSection.get(toUSection.position())); 1539 limit = TO_U_GET_BYTE(toUSection.get(toUSection.position() + length - 1)); 1540 if (byt < start || limit < byt) { 1541 return 0; /* the byte is out of range */ 1542 } 1543 1544 if (length == ((limit - start) + 1)) { 1545 /* direct access on a linear array */ 1546 return TO_U_GET_VALUE(toUSection.get(toUSection.position() + byt - start)); /* could be 0 */ 1547 } 1548 1549 /* word0 is suitable for <=toUSection[] comparison, word for <toUSection[] */ 1550 word0 = TO_U_MAKE_WORD((byte) byt, 0) & UConverterConstants.UNSIGNED_INT_MASK; 1551 1552 /* 1553 * Shift byte once instead of each section word and add 0xffffff. We will compare the shifted/added byte 1554 * (bbffffff) against section words which have byte values in the same bit position. If and only if byte bb < 1555 * section byte ss then bbffffff<ssvvvvvv for all v=0..f so we need not mask off the lower 24 bits of each 1556 * section word. 
1557 */ 1558 word = word0 | TO_U_VALUE_MASK; 1559 1560 /* binary search */ 1561 start = 0; 1562 limit = length; 1563 for (;;) { 1564 i = limit - start; 1565 if (i <= 1) { 1566 break; /* done */ 1567 } 1568 /* start<limit-1 */ 1569 1570 if (i <= 4) { 1571 /* linear search for the last part */ 1572 if (word0 <= (toUSection.get(toUSection.position() + start) & UConverterConstants.UNSIGNED_INT_MASK)) { 1573 break; 1574 } 1575 if (++start < limit 1576 && word0 <= (toUSection.get(toUSection.position() + start) & UConverterConstants.UNSIGNED_INT_MASK)) { 1577 break; 1578 } 1579 if (++start < limit 1580 && word0 <= (toUSection.get(toUSection.position() + start) & UConverterConstants.UNSIGNED_INT_MASK)) { 1581 break; 1582 } 1583 /* always break at start==limit-1 */ 1584 ++start; 1585 break; 1586 } 1587 1588 i = (start + limit) / 2; 1589 if (word < (toUSection.get(toUSection.position() + i) & UConverterConstants.UNSIGNED_INT_MASK)) { 1590 limit = i; 1591 } else { 1592 start = i; 1593 } 1594 } 1595 1596 /* did we really find it? */ 1597 if (start < limit) { 1598 word = (toUSection.get(toUSection.position() + start) & UConverterConstants.UNSIGNED_INT_MASK); 1599 if (byt == TO_U_GET_BYTE((int)word)) { 1600 return TO_U_GET_VALUE((int) word); /* never 0 */ 1601 } 1602 } 1603 return 0; /* not found */ 1604 } 1605 1606 /* 1607 * TRUE if not an SI/SO stateful converter, or if the match length fits with the current converter state 1608 */ TO_U_VERIFY_SISO_MATCH(byte sisoState, int match)1609 static boolean TO_U_VERIFY_SISO_MATCH(byte sisoState, int match) { 1610 return sisoState < 0 || (sisoState == 0) == (match == 1); 1611 } 1612 1613 /* 1614 * get the SI/SO toU state (state 0 is for SBCS, 1 for DBCS), or 1 for DBCS-only, or -1 if the converter is not 1615 * SI/SO stateful 1616 * 1617 * Note: For SI/SO stateful converters getting here, cnv->mode==0 is equivalent to firstLength==1. 
1618 */ SISO_STATE(UConverterSharedData sharedData, int mode)1619 private static int SISO_STATE(UConverterSharedData sharedData, int mode) { 1620 return sharedData.mbcs.outputType == MBCS_OUTPUT_2_SISO ? (byte) mode 1621 : sharedData.mbcs.outputType == MBCS_OUTPUT_DBCS_ONLY ? 1 : -1; 1622 } 1623 1624 class CharsetDecoderMBCS extends CharsetDecoderICU { 1625 CharsetDecoderMBCS(CharsetICU cs)1626 CharsetDecoderMBCS(CharsetICU cs) { 1627 super(cs); 1628 } 1629 1630 @Override decodeLoop(ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush)1631 protected CoderResult decodeLoop(ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush) { 1632 /* Just call cnvMBCSToUnicodeWithOffsets() to remove duplicate code. */ 1633 return cnvMBCSToUnicodeWithOffsets(source, target, offsets, flush); 1634 } 1635 1636 /* 1637 * continue partial match with new input never called for simple, single-character conversion 1638 */ continueMatchToU(ByteBuffer source, CharBuffer target, IntBuffer offsets, int srcIndex, boolean flush)1639 private CoderResult continueMatchToU(ByteBuffer source, CharBuffer target, IntBuffer offsets, int srcIndex, 1640 boolean flush) { 1641 CoderResult cr = CoderResult.UNDERFLOW; 1642 1643 int[] value = new int[1]; 1644 int match, length; 1645 1646 match = matchToU((byte) SISO_STATE(sharedData, mode), preToUArray, preToUBegin, preToULength, source, 1647 value, isToUUseFallback(), flush); 1648 1649 if (match > 0) { 1650 if (match >= preToULength) { 1651 /* advance src pointer for the consumed input */ 1652 source.position(source.position() + match - preToULength); 1653 preToULength = 0; 1654 } else { 1655 /* the match did not use all of preToU[] - keep the rest for replay */ 1656 length = preToULength - match; 1657 System.arraycopy(preToUArray, preToUBegin + match, preToUArray, preToUBegin, length); 1658 preToULength = (byte) -length; 1659 } 1660 1661 /* write result */ 1662 cr = writeToU(value[0], target, offsets, srcIndex); 1663 } 
else if (match < 0) { 1664 /* save state for partial match */ 1665 int j, sArrayIndex; 1666 1667 /* just _append_ the newly consumed input to preToU[] */ 1668 sArrayIndex = source.position(); 1669 match = -match; 1670 for (j = preToULength; j < match; ++j) { 1671 preToUArray[j] = source.get(sArrayIndex++); 1672 } 1673 source.position(sArrayIndex); /* same as *src=srcLimit; because we reached the end of input */ 1674 preToULength = (byte) match; 1675 } else /* match==0 */{ 1676 /* 1677 * no match 1678 * 1679 * We need to split the previous input into two parts: 1680 * 1681 * 1. The first codepage character is unmappable - that's how we got into trying the extension data in 1682 * the first place. We need to move it from the preToU buffer to the error buffer, set an error code, 1683 * and prepare the rest of the previous input for 2. 1684 * 1685 * 2. The rest of the previous input must be converted once we come back from the callback for the first 1686 * character. At that time, we have to try again from scratch to convert these input characters. The 1687 * replay will be handled by the ucnv.c conversion code. 
1688 */ 1689 1690 /* move the first codepage character to the error field */ 1691 System.arraycopy(preToUArray, preToUBegin, toUBytesArray, toUBytesBegin, preToUFirstLength); 1692 toULength = preToUFirstLength; 1693 1694 /* move the rest up inside the buffer */ 1695 length = preToULength - preToUFirstLength; 1696 if (length > 0) { 1697 System.arraycopy(preToUArray, preToUBegin + preToUFirstLength, preToUArray, preToUBegin, length); 1698 } 1699 1700 /* mark preToU for replay */ 1701 preToULength = (byte) -length; 1702 1703 /* set the error code for unassigned */ 1704 cr = CoderResult.unmappableForLength(preToUFirstLength); 1705 } 1706 return cr; 1707 } 1708 1709 /* 1710 * this works like matchFromU() except - the first character is in pre - no trie is used - the returned 1711 * matchLength is not offset by 2 1712 */ matchToU(byte sisoState, byte[] preArray, int preArrayBegin, int preLength, ByteBuffer source, int[] pMatchValue, boolean isUseFallback, boolean flush)1713 private int matchToU(byte sisoState, byte[] preArray, int preArrayBegin, int preLength, ByteBuffer source, 1714 int[] pMatchValue, boolean isUseFallback, boolean flush) { 1715 ByteBuffer cx = sharedData.mbcs.extIndexes; 1716 IntBuffer toUTable, toUSection; 1717 1718 int value, matchValue, srcLength = 0; 1719 int i, j, index, length, matchLength; 1720 short b; 1721 1722 if (cx == null || cx.asIntBuffer().get(EXT_TO_U_LENGTH) <= 0) { 1723 return 0; /* no extension data, no match */ 1724 } 1725 1726 /* initialize */ 1727 toUTable = (IntBuffer) ARRAY(cx, EXT_TO_U_INDEX, int.class); 1728 index = 0; 1729 1730 matchValue = 0; 1731 i = j = matchLength = 0; 1732 if (source != null) { 1733 srcLength = source.remaining(); 1734 } 1735 1736 if (sisoState == 0) { 1737 /* SBCS state of an SI/SO stateful converter, look at only exactly 1 byte */ 1738 if (preLength > 1) { 1739 return 0; /* no match of a DBCS sequence in SBCS mode */ 1740 } else if (preLength == 1) { 1741 srcLength = 0; 1742 } else /* preLength==0 */{ 
1743 if (srcLength > 1) { 1744 srcLength = 1; 1745 } 1746 } 1747 flush = true; 1748 } 1749 1750 /* we must not remember fallback matches when not using fallbacks */ 1751 1752 /* match input units until there is a full match or the input is consumed */ 1753 for (;;) { 1754 /* go to the next section */ 1755 int oldpos = toUTable.position(); 1756 toUSection = ((IntBuffer) toUTable.position(index)).slice(); 1757 toUTable.position(oldpos); 1758 1759 /* read first pair of the section */ 1760 value = toUSection.get(); 1761 length = TO_U_GET_BYTE(value); 1762 value = TO_U_GET_VALUE(value); 1763 if (value != 0 && (TO_U_IS_ROUNDTRIP(value) || isToUUseFallback(isUseFallback)) 1764 && TO_U_VERIFY_SISO_MATCH(sisoState, i + j)) { 1765 /* remember longest match so far */ 1766 matchValue = value; 1767 matchLength = i + j; 1768 } 1769 1770 /* match pre[] then src[] */ 1771 if (i < preLength) { 1772 b = (short) (preArray[preArrayBegin + i++] & UConverterConstants.UNSIGNED_BYTE_MASK); 1773 } else if (j < srcLength) { 1774 b = (short) (source.get(source.position() + j++) & UConverterConstants.UNSIGNED_BYTE_MASK); 1775 } else { 1776 /* all input consumed, partial match */ 1777 if (flush || (length = (i + j)) > MAX_BYTES) { 1778 /* 1779 * end of the entire input stream, stop with the longest match so far or: partial match must not 1780 * be longer than UCNV_EXT_MAX_BYTES because it must fit into state buffers 1781 */ 1782 break; 1783 } else { 1784 /* continue with more input next time */ 1785 return -length; 1786 } 1787 } 1788 1789 /* search for the current UChar */ 1790 value = findToU(toUSection, length, b); 1791 if (value == 0) { 1792 /* no match here, stop with the longest match so far */ 1793 break; 1794 } else { 1795 if (TO_U_IS_PARTIAL(value)) { 1796 /* partial match, continue */ 1797 index = TO_U_GET_PARTIAL_INDEX(value); 1798 } else { 1799 if ((TO_U_IS_ROUNDTRIP(value) || isToUUseFallback(isUseFallback)) && TO_U_VERIFY_SISO_MATCH(sisoState, i + j)) { 1800 /* full match, stop 
with result */ 1801 matchValue = value; 1802 matchLength = i + j; 1803 } else { 1804 /* full match on fallback not taken, stop with the longest match so far */ 1805 } 1806 break; 1807 } 1808 } 1809 } 1810 1811 if (matchLength == 0) { 1812 /* no match at all */ 1813 return 0; 1814 } 1815 1816 /* return result */ 1817 pMatchValue[0] = TO_U_MASK_ROUNDTRIP(matchValue); 1818 return matchLength; 1819 } 1820 writeToU(int value, CharBuffer target, IntBuffer offsets, int srcIndex)1821 private CoderResult writeToU(int value, CharBuffer target, IntBuffer offsets, int srcIndex) { 1822 ByteBuffer cx = sharedData.mbcs.extIndexes; 1823 /* output the result */ 1824 if (TO_U_IS_CODE_POINT(value)) { 1825 /* output a single code point */ 1826 return toUWriteCodePoint(TO_U_GET_CODE_POINT(value), target, offsets, srcIndex); 1827 } else { 1828 /* output a string - with correct data we have resultLength>0 */ 1829 1830 char[] a = new char[TO_U_GET_LENGTH(value)]; 1831 CharBuffer cb = ((CharBuffer) ARRAY(cx, EXT_TO_U_UCHARS_INDEX, char.class)); 1832 cb.position(TO_U_GET_INDEX(value)); 1833 cb.get(a, 0, a.length); 1834 return toUWriteUChars(this, a, 0, a.length, target, offsets, srcIndex); 1835 } 1836 } 1837 toUWriteCodePoint(int c, CharBuffer target, IntBuffer offsets, int sourceIndex)1838 private CoderResult toUWriteCodePoint(int c, CharBuffer target, IntBuffer offsets, int sourceIndex) { 1839 CoderResult cr = CoderResult.UNDERFLOW; 1840 int tBeginIndex = target.position(); 1841 1842 if (target.hasRemaining()) { 1843 if (c <= 0xffff) { 1844 target.put((char) c); 1845 c = UConverterConstants.U_SENTINEL; 1846 } else /* c is a supplementary code point */{ 1847 target.put(UTF16.getLeadSurrogate(c)); 1848 c = UTF16.getTrailSurrogate(c); 1849 if (target.hasRemaining()) { 1850 target.put((char) c); 1851 c = UConverterConstants.U_SENTINEL; 1852 } 1853 } 1854 1855 /* write offsets */ 1856 if (offsets != null) { 1857 offsets.put(sourceIndex); 1858 if ((tBeginIndex + 1) < target.position()) { 1859 
offsets.put(sourceIndex); 1860 } 1861 } 1862 } 1863 1864 /* write overflow from c */ 1865 if (c >= 0) { 1866 charErrorBufferLength = UTF16.append(charErrorBufferArray, 0, c); 1867 cr = CoderResult.OVERFLOW; 1868 } 1869 1870 return cr; 1871 } 1872 1873 /* 1874 * Input sequence: cnv->toUBytes[0..length[ @return if(U_FAILURE) return the length (toULength, byteIndex) for 1875 * the input else return 0 after output has been written to the target 1876 */ toU(int length, ByteBuffer source, CharBuffer target, IntBuffer offsets, int sourceIndex, boolean flush, CoderResult[] cr)1877 private int toU(int length, ByteBuffer source, CharBuffer target, IntBuffer offsets, int sourceIndex, 1878 boolean flush, CoderResult[] cr) { 1879 // ByteBuffer cx; 1880 1881 if (sharedData.mbcs.extIndexes != null 1882 && initialMatchToU(length, source, target, offsets, sourceIndex, flush, cr)) { 1883 return 0; /* an extension mapping handled the input */ 1884 } 1885 1886 /* GB 18030 */ 1887 if (length == 4 && (options & MBCS_OPTION_GB18030) != 0) { 1888 int[] range; 1889 int linear; 1890 int i; 1891 1892 linear = LINEAR_18030(toUBytesArray[0], toUBytesArray[1], toUBytesArray[2], toUBytesArray[3]); 1893 for (i = 0; i < gb18030Ranges.length; ++i) { 1894 range = gb18030Ranges[i]; 1895 if (range[2] <= linear && linear <= range[3]) { 1896 /* found the sequence, output the Unicode code point for it */ 1897 cr[0] = CoderResult.UNDERFLOW; 1898 1899 /* add the linear difference between the input and start sequences to the start code point */ 1900 linear = range[0] + (linear - range[2]); 1901 1902 /* output this code point */ 1903 cr[0] = toUWriteCodePoint(linear, target, offsets, sourceIndex); 1904 1905 return 0; 1906 } 1907 } 1908 } 1909 1910 /* no mapping */ 1911 cr[0] = CoderResult.unmappableForLength(length); 1912 return length; 1913 } 1914 1915 /* 1916 * target<targetLimit; set error code for overflow 1917 */ initialMatchToU(int firstLength, ByteBuffer source, CharBuffer target, IntBuffer offsets, 
int srcIndex, boolean flush, CoderResult[] cr)1918 private boolean initialMatchToU(int firstLength, ByteBuffer source, CharBuffer target, IntBuffer offsets, 1919 int srcIndex, boolean flush, CoderResult[] cr) { 1920 int[] value = new int[1]; 1921 int match = 0; 1922 1923 /* try to match */ 1924 match = matchToU((byte) SISO_STATE(sharedData, mode), toUBytesArray, toUBytesBegin, firstLength, source, 1925 value, isToUUseFallback(), flush); 1926 if (match > 0) { 1927 /* advance src pointer for the consumed input */ 1928 source.position(source.position() + match - firstLength); 1929 1930 /* write result to target */ 1931 cr[0] = writeToU(value[0], target, offsets, srcIndex); 1932 return true; 1933 } else if (match < 0) { 1934 /* save state for partial match */ 1935 byte[] sArray; 1936 int sArrayIndex; 1937 int j; 1938 1939 /* copy the first code point */ 1940 sArray = toUBytesArray; 1941 sArrayIndex = toUBytesBegin; 1942 preToUFirstLength = (byte) firstLength; 1943 for (j = 0; j < firstLength; ++j) { 1944 preToUArray[j] = sArray[sArrayIndex++]; 1945 } 1946 1947 /* now copy the newly consumed input */ 1948 sArrayIndex = source.position(); 1949 match = -match; 1950 for (; j < match; ++j) { 1951 preToUArray[j] = source.get(sArrayIndex++); 1952 } 1953 source.position(sArrayIndex); 1954 preToULength = (byte) match; 1955 return true; 1956 } else /* match==0 no match */{ 1957 return false; 1958 } 1959 } 1960 simpleMatchToU(ByteBuffer source, boolean useFallback)1961 private int simpleMatchToU(ByteBuffer source, boolean useFallback) { 1962 int[] value = new int[1]; 1963 int match; 1964 1965 if (source.remaining() <= 0) { 1966 return 0xffff; 1967 } 1968 1969 /* try to match */ 1970 byte[] sourceArray; 1971 int sourcePosition, sourceLimit; 1972 if (source.isReadOnly()) { 1973 // source.array() would throw an exception 1974 sourcePosition = source.position(); // relative to source.array() 1975 sourceArray = new byte[Math.min(source.remaining(), EXT_MAX_BYTES)]; 1976 
source.get(sourceArray).position(sourcePosition); 1977 sourcePosition = 0; // relative to sourceArray 1978 sourceLimit = sourceArray.length; 1979 } else { 1980 sourceArray = source.array(); 1981 sourcePosition = source.position(); 1982 sourceLimit = source.limit(); 1983 } 1984 match = matchToU((byte) -1, sourceArray, sourcePosition, sourceLimit, null, value, useFallback, true); 1985 1986 if (match == source.remaining()) { 1987 /* write result for simple, single-character conversion */ 1988 if (TO_U_IS_CODE_POINT(value[0])) { 1989 return TO_U_GET_CODE_POINT(value[0]); 1990 } 1991 } 1992 1993 /* 1994 * return no match because - match>0 && value points to string: simple conversion cannot handle multiple 1995 * code points - match>0 && match!=length: not all input consumed, forbidden for this function - match==0: 1996 * no match found in the first place - match<0: partial match, not supported for simple conversion (and 1997 * flush==TRUE) 1998 */ 1999 return 0xfffe; 2000 } 2001 cnvMBCSToUnicodeWithOffsets(ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush)2002 CoderResult cnvMBCSToUnicodeWithOffsets(ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush) { 2003 CoderResult[] cr = { CoderResult.UNDERFLOW }; 2004 2005 int sourceArrayIndex, sourceArrayIndexStart; 2006 int stateTable[][/* 256 */]; 2007 char[] unicodeCodeUnits; 2008 2009 int offset; 2010 byte state; 2011 int byteIndex; 2012 byte[] bytes; 2013 2014 int sourceIndex, nextSourceIndex; 2015 2016 int entry = 0; 2017 char c; 2018 byte action; 2019 2020 if (preToULength > 0) { 2021 /* 2022 * pass sourceIndex=-1 because we continue from an earlier buffer in the future, this may change with 2023 * continuous offsets 2024 */ 2025 cr[0] = continueMatchToU(source, target, offsets, -1, flush); 2026 2027 if (cr[0].isError() || preToULength < 0) { 2028 return cr[0]; 2029 } 2030 } 2031 2032 if (sharedData.mbcs.countStates == 1) { 2033 if (!sharedData.mbcs.hasSupplementary()) { 2034 cr[0] 
= cnvMBCSSingleToBMPWithOffsets(source, target, offsets, flush); 2035 } else { 2036 cr[0] = cnvMBCSSingleToUnicodeWithOffsets(source, target, offsets, flush); 2037 } 2038 return cr[0]; 2039 } 2040 2041 /* set up the local pointers */ 2042 sourceArrayIndex = sourceArrayIndexStart = source.position(); 2043 2044 if ((options & UConverterConstants.OPTION_SWAP_LFNL) != 0) { 2045 stateTable = sharedData.mbcs.swapLFNLStateTable; 2046 } else { 2047 stateTable = sharedData.mbcs.stateTable; 2048 } 2049 unicodeCodeUnits = sharedData.mbcs.unicodeCodeUnits; 2050 2051 /* get the converter state from UConverter */ 2052 offset = toUnicodeStatus; 2053 byteIndex = toULength; 2054 bytes = toUBytesArray; 2055 2056 /* 2057 * if we are in the SBCS state for a DBCS-only converter, then load the DBCS state from the MBCS data 2058 * (dbcsOnlyState==0 if it is not a DBCS-only converter) 2059 */ 2060 state = (byte)mode; 2061 if (state == 0) { 2062 state = sharedData.mbcs.dbcsOnlyState; 2063 } 2064 2065 /* sourceIndex=-1 if the current character began in the previous buffer */ 2066 sourceIndex = byteIndex == 0 ? 0 : -1; 2067 nextSourceIndex = 0; 2068 2069 /* conversion loop */ 2070 while (sourceArrayIndex < source.limit()) { 2071 /* 2072 * This following test is to see if available input would overflow the output. It does not catch output 2073 * of more than one code unit that overflows as a result of a surrogate pair or callback output from the 2074 * last source byte. Therefore, those situations also test for overflows and will then break the loop, 2075 * too. 
2076 */ 2077 if (!target.hasRemaining()) { 2078 /* target is full */ 2079 cr[0] = CoderResult.OVERFLOW; 2080 break; 2081 } 2082 2083 if (byteIndex == 0) { 2084 /* optimized loop for 1/2-byte input and BMP output */ 2085 // agljport:todo see ucnvmbcs.c for deleted block 2086 do { 2087 entry = stateTable[state][source.get(sourceArrayIndex)&UConverterConstants.UNSIGNED_BYTE_MASK]; 2088 if (MBCS_ENTRY_IS_TRANSITION(entry)) { 2089 state = (byte)MBCS_ENTRY_TRANSITION_STATE(entry); 2090 offset = MBCS_ENTRY_TRANSITION_OFFSET(entry); 2091 ++sourceArrayIndex; 2092 if (sourceArrayIndex < source.limit() 2093 && MBCS_ENTRY_IS_FINAL(entry = stateTable[state][source.get(sourceArrayIndex)&UConverterConstants.UNSIGNED_BYTE_MASK]) 2094 && MBCS_ENTRY_FINAL_ACTION(entry) == MBCS_STATE_VALID_16 2095 && (c = unicodeCodeUnits[offset + MBCS_ENTRY_FINAL_VALUE_16(entry)]) < 0xfffe) { 2096 ++sourceArrayIndex; 2097 target.put(c); 2098 if (offsets != null) { 2099 offsets.put(sourceIndex); 2100 sourceIndex = (nextSourceIndex += 2); 2101 } 2102 state = (byte)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */ 2103 offset = 0; 2104 } else { 2105 /* set the state and leave the optimized loop */ 2106 ++nextSourceIndex; 2107 bytes[0] = source.get(sourceArrayIndex - 1); 2108 byteIndex = 1; 2109 break; 2110 } 2111 } else { 2112 if (MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) { 2113 /* output BMP code point */ 2114 ++sourceArrayIndex; 2115 target.put(MBCS_ENTRY_FINAL_VALUE_16(entry)); 2116 if (offsets != null) { 2117 offsets.put(sourceIndex); 2118 sourceIndex = ++nextSourceIndex; 2119 } 2120 state = (byte)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */ 2121 } else { 2122 /* leave the optimized loop */ 2123 break; 2124 } 2125 } 2126 } while (sourceArrayIndex < source.limit() && target.hasRemaining()); 2127 /* 2128 * these tests and break statements could be put inside the loop if C had "break outerLoop" like 2129 * Java 2130 */ 2131 if (sourceArrayIndex >= source.limit()) { 2132 break; 2133 } 2134 if 
(!target.hasRemaining()) { 2135 /* target is full */ 2136 cr[0] = CoderResult.OVERFLOW; 2137 break; 2138 } 2139 2140 ++nextSourceIndex; 2141 bytes[byteIndex++] = source.get(sourceArrayIndex++); 2142 } else /* byteIndex>0 */{ 2143 ++nextSourceIndex; 2144 entry = stateTable[state][(bytes[byteIndex++] = source.get(sourceArrayIndex++)) 2145 & UConverterConstants.UNSIGNED_BYTE_MASK]; 2146 } 2147 2148 if (MBCS_ENTRY_IS_TRANSITION(entry)) { 2149 state = (byte)MBCS_ENTRY_TRANSITION_STATE(entry); 2150 offset += MBCS_ENTRY_TRANSITION_OFFSET(entry); 2151 continue; 2152 } 2153 2154 /* save the previous state for proper extension mapping with SI/SO-stateful converters */ 2155 mode = state; 2156 2157 /* set the next state early so that we can reuse the entry variable */ 2158 state = (byte)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */ 2159 2160 /* 2161 * An if-else-if chain provides more reliable performance for the most common cases compared to a 2162 * switch. 2163 */ 2164 action = (byte)MBCS_ENTRY_FINAL_ACTION(entry); 2165 if (action == MBCS_STATE_VALID_16) { 2166 offset += MBCS_ENTRY_FINAL_VALUE_16(entry); 2167 c = unicodeCodeUnits[offset]; 2168 if (c < 0xfffe) { 2169 /* output BMP code point */ 2170 target.put(c); 2171 if (offsets != null) { 2172 offsets.put(sourceIndex); 2173 } 2174 byteIndex = 0; 2175 } else if (c == 0xfffe) { 2176 if (isFallbackUsed() && (entry = getFallback(sharedData.mbcs, offset)) != 0xfffe) { 2177 /* output fallback BMP code point */ 2178 target.put((char)entry); 2179 if (offsets != null) { 2180 offsets.put(sourceIndex); 2181 } 2182 byteIndex = 0; 2183 } 2184 } else { 2185 /* callback(illegal) */ 2186 cr[0] = CoderResult.malformedForLength(byteIndex); 2187 } 2188 } else if (action == MBCS_STATE_VALID_DIRECT_16) { 2189 /* output BMP code point */ 2190 target.put(MBCS_ENTRY_FINAL_VALUE_16(entry)); 2191 if (offsets != null) { 2192 offsets.put(sourceIndex); 2193 } 2194 byteIndex = 0; 2195 } else if (action == MBCS_STATE_VALID_16_PAIR) { 2196 offset += 
MBCS_ENTRY_FINAL_VALUE_16(entry); 2197 c = unicodeCodeUnits[offset++]; 2198 if (c < 0xd800) { 2199 /* output BMP code point below 0xd800 */ 2200 target.put(c); 2201 if (offsets != null) { 2202 offsets.put(sourceIndex); 2203 } 2204 byteIndex = 0; 2205 } else if (isFallbackUsed() ? c <= 0xdfff : c <= 0xdbff) { 2206 /* output roundtrip or fallback surrogate pair */ 2207 target.put((char)(c & 0xdbff)); 2208 if (offsets != null) { 2209 offsets.put(sourceIndex); 2210 } 2211 byteIndex = 0; 2212 if (target.hasRemaining()) { 2213 target.put(unicodeCodeUnits[offset]); 2214 if (offsets != null) { 2215 offsets.put(sourceIndex); 2216 } 2217 } else { 2218 /* target overflow */ 2219 charErrorBufferArray[0] = unicodeCodeUnits[offset]; 2220 charErrorBufferLength = 1; 2221 cr[0] = CoderResult.OVERFLOW; 2222 2223 offset = 0; 2224 break; 2225 } 2226 } else if (isFallbackUsed() ? (c & 0xfffe) == 0xe000 : c == 0xe000) { 2227 /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */ 2228 target.put(unicodeCodeUnits[offset]); 2229 if (offsets != null) { 2230 offsets.put(sourceIndex); 2231 } 2232 byteIndex = 0; 2233 } else if (c == 0xffff) { 2234 /* callback(illegal) */ 2235 cr[0] = CoderResult.malformedForLength(byteIndex); 2236 } 2237 } else if (action == MBCS_STATE_VALID_DIRECT_20 2238 || (action == MBCS_STATE_FALLBACK_DIRECT_20 && isFallbackUsed())) { 2239 entry = MBCS_ENTRY_FINAL_VALUE(entry); 2240 /* output surrogate pair */ 2241 target.put((char)(0xd800 | (char)(entry >> 10))); 2242 if (offsets != null) { 2243 offsets.put(sourceIndex); 2244 } 2245 byteIndex = 0; 2246 c = (char)(0xdc00 | (char)(entry & 0x3ff)); 2247 if (target.hasRemaining()) { 2248 target.put(c); 2249 if (offsets != null) { 2250 offsets.put(sourceIndex); 2251 } 2252 } else { 2253 /* target overflow */ 2254 charErrorBufferArray[0] = c; 2255 charErrorBufferLength = 1; 2256 cr[0] = CoderResult.OVERFLOW; 2257 2258 offset = 0; 2259 break; 2260 } 2261 } else if (action == MBCS_STATE_CHANGE_ONLY) { 2262 
/* 2263 * This serves as a state change without any output. It is useful for reading simple stateful 2264 * encodings, for example using just Shift-In/Shift-Out codes. The 21 unused bits may later be used 2265 * for more sophisticated state transitions. 2266 */ 2267 if (sharedData.mbcs.dbcsOnlyState == 0) { 2268 byteIndex = 0; 2269 } else { 2270 /* SI/SO are illegal for DBCS-only conversion */ 2271 state = (byte)(mode); /* restore the previous state */ 2272 2273 /* callback(illegal) */ 2274 cr[0] = CoderResult.malformedForLength(byteIndex); 2275 } 2276 } else if (action == MBCS_STATE_FALLBACK_DIRECT_16) { 2277 if (isFallbackUsed()) { 2278 /* output BMP code point */ 2279 target.put(MBCS_ENTRY_FINAL_VALUE_16(entry)); 2280 if (offsets != null) { 2281 offsets.put(sourceIndex); 2282 } 2283 byteIndex = 0; 2284 } 2285 } else if (action == MBCS_STATE_UNASSIGNED) { 2286 /* just fall through */ 2287 } else if (action == MBCS_STATE_ILLEGAL) { 2288 /* callback(illegal) */ 2289 cr[0] = CoderResult.malformedForLength(byteIndex); 2290 } else { 2291 /* reserved, must never occur */ 2292 byteIndex = 0; 2293 } 2294 2295 /* end of action codes: prepare for a new character */ 2296 offset = 0; 2297 2298 if (byteIndex == 0) { 2299 sourceIndex = nextSourceIndex; 2300 } else if (cr[0].isError()) { 2301 /* callback(illegal) */ 2302 if (byteIndex > 1) { 2303 /* 2304 * Ticket 5691: consistent illegal sequences: 2305 * - We include at least the first byte in the illegal sequence. 2306 * - If any of the non-initial bytes could be the start of a character, 2307 * we stop the illegal sequence before the first one of those. 
2308 */ 2309 boolean isDBCSOnly = (sharedData.mbcs.dbcsOnlyState != 0); 2310 byte i; 2311 for (i = 1; i < byteIndex && !isSingleOrLead(stateTable, state, isDBCSOnly, (short)(bytes[i] & UConverterConstants.UNSIGNED_BYTE_MASK)); i++) {} 2312 if (i < byteIndex) { 2313 byte backOutDistance = (byte)(byteIndex - i); 2314 int bytesFromThisBuffer = sourceArrayIndex - sourceArrayIndexStart; 2315 byteIndex = i; /* length of reported illegal byte sequence */ 2316 if (backOutDistance <= bytesFromThisBuffer) { 2317 sourceArrayIndex -= backOutDistance; 2318 } else { 2319 /* Back out bytes from the previous buffer: Need to replay them. */ 2320 this.preToULength = (byte)(bytesFromThisBuffer - backOutDistance); 2321 /* preToULength is negative! */ 2322 for (int n = 0; n < -this.preToULength; n++) { 2323 this.preToUArray[n] = bytes[i+n]; 2324 } 2325 sourceArrayIndex = sourceArrayIndexStart; 2326 } 2327 } 2328 } 2329 break; 2330 } else /* unassigned sequences indicated with byteIndex>0 */{ 2331 /* try an extension mapping */ 2332 int sourceBeginIndex = sourceArrayIndex; 2333 source.position(sourceArrayIndex); 2334 byteIndex = toU(byteIndex, source, target, offsets, sourceIndex, flush, cr); 2335 sourceArrayIndex = source.position(); 2336 sourceIndex = nextSourceIndex += (sourceArrayIndex - sourceBeginIndex); 2337 2338 if (cr[0].isError() || cr[0].isOverflow()) { 2339 /* not mappable or buffer overflow */ 2340 break; 2341 } 2342 } 2343 } 2344 2345 /* set the converter state back into UConverter */ 2346 toUnicodeStatus = offset; 2347 mode = state; 2348 toULength = byteIndex; 2349 2350 /* write back the updated pointers */ 2351 source.position(sourceArrayIndex); 2352 2353 return cr[0]; 2354 } 2355 /* 2356 * This version of cnvMBCSSingleToUnicodeWithOffsets() is optimized for single-byte, single-state codepages that 2357 * only map to and from the BMP. In addition to single-byte optimizations, the offset calculations become much 2358 * easier. 
2359 */ cnvMBCSSingleToBMPWithOffsets(ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush)2360 private CoderResult cnvMBCSSingleToBMPWithOffsets(ByteBuffer source, CharBuffer target, IntBuffer offsets, 2361 boolean flush) { 2362 CoderResult[] cr = { CoderResult.UNDERFLOW }; 2363 2364 int sourceArrayIndex, lastSource; 2365 int targetCapacity, length; 2366 int[][] stateTable; 2367 2368 int sourceIndex; 2369 2370 int entry; 2371 byte action; 2372 2373 /* set up the local pointers */ 2374 sourceArrayIndex = source.position(); 2375 targetCapacity = target.remaining(); 2376 2377 if ((options & UConverterConstants.OPTION_SWAP_LFNL) != 0) { 2378 stateTable = sharedData.mbcs.swapLFNLStateTable; 2379 } else { 2380 stateTable = sharedData.mbcs.stateTable; 2381 } 2382 2383 /* sourceIndex=-1 if the current character began in the previous buffer */ 2384 sourceIndex = 0; 2385 lastSource = sourceArrayIndex; 2386 2387 /* 2388 * since the conversion here is 1:1 UChar:uint8_t, we need only one counter for the minimum of the 2389 * sourceLength and targetCapacity 2390 */ 2391 length = source.remaining(); 2392 if (length < targetCapacity) { 2393 targetCapacity = length; 2394 } 2395 2396 /* conversion loop */ 2397 while (targetCapacity > 0 && sourceArrayIndex < source.limit()) { 2398 entry = stateTable[0][source.get(sourceArrayIndex++) & UConverterConstants.UNSIGNED_BYTE_MASK]; 2399 /* MBCS_ENTRY_IS_FINAL(entry) */ 2400 2401 /* test the most common case first */ 2402 if (MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) { 2403 /* output BMP code point */ 2404 target.put(MBCS_ENTRY_FINAL_VALUE_16(entry)); 2405 --targetCapacity; 2406 continue; 2407 } 2408 2409 /* 2410 * An if-else-if chain provides more reliable performance for the most common cases compared to a 2411 * switch. 
2412 */ 2413 action = (byte) (MBCS_ENTRY_FINAL_ACTION(entry)); 2414 if (action == MBCS_STATE_FALLBACK_DIRECT_16) { 2415 if (isFallbackUsed()) { 2416 /* output BMP code point */ 2417 target.put(MBCS_ENTRY_FINAL_VALUE_16(entry)); 2418 --targetCapacity; 2419 continue; 2420 } 2421 } else if (action == MBCS_STATE_UNASSIGNED) { 2422 /* just fall through */ 2423 } else if (action == MBCS_STATE_ILLEGAL) { 2424 /* callback(illegal) */ 2425 cr[0] = CoderResult.malformedForLength(sourceArrayIndex - lastSource); 2426 } else { 2427 /* reserved, must never occur */ 2428 continue; 2429 } 2430 2431 /* set offsets since the start or the last extension */ 2432 if (offsets != null) { 2433 int count = sourceArrayIndex - lastSource; 2434 2435 /* predecrement: do not set the offset for the callback-causing character */ 2436 while (--count > 0) { 2437 offsets.put(sourceIndex++); 2438 } 2439 /* offset and sourceIndex are now set for the current character */ 2440 } 2441 2442 if (cr[0].isError()) { 2443 /* callback(illegal) */ 2444 break; 2445 } else /* unassigned sequences indicated with byteIndex>0 */{ 2446 /* try an extension mapping */ 2447 lastSource = sourceArrayIndex; 2448 toUBytesArray[0] = source.get(sourceArrayIndex - 1); 2449 source.position(sourceArrayIndex); 2450 toULength = toU((byte) 1, source, target, offsets, sourceIndex, flush, cr); 2451 sourceArrayIndex = source.position(); 2452 sourceIndex += 1 + (sourceArrayIndex - lastSource); 2453 2454 if (cr[0].isError()) { 2455 /* not mappable or buffer overflow */ 2456 break; 2457 } 2458 2459 /* recalculate the targetCapacity after an extension mapping */ 2460 targetCapacity = target.remaining(); 2461 length = source.remaining(); 2462 if (length < targetCapacity) { 2463 targetCapacity = length; 2464 } 2465 } 2466 } 2467 2468 if (!cr[0].isError() && sourceArrayIndex < source.limit() && !target.hasRemaining()) { 2469 /* target is full */ 2470 cr[0] = CoderResult.OVERFLOW; 2471 } 2472 2473 /* set offsets since the start or the last 
callback */ 2474 if (offsets != null) { 2475 int count = sourceArrayIndex - lastSource; 2476 while (count > 0) { 2477 offsets.put(sourceIndex++); 2478 --count; 2479 } 2480 } 2481 2482 /* write back the updated pointers */ 2483 source.position(sourceArrayIndex); 2484 2485 return cr[0]; 2486 } 2487 2488 /* This version of cnvMBCSToUnicodeWithOffsets() is optimized for single-byte, single-state codepages. */ cnvMBCSSingleToUnicodeWithOffsets(ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush)2489 private CoderResult cnvMBCSSingleToUnicodeWithOffsets(ByteBuffer source, CharBuffer target, IntBuffer offsets, 2490 boolean flush) { 2491 CoderResult[] cr = { CoderResult.UNDERFLOW }; 2492 2493 int sourceArrayIndex; 2494 int[][] stateTable; 2495 2496 int sourceIndex; 2497 2498 int entry; 2499 char c; 2500 byte action; 2501 2502 /* set up the local pointers */ 2503 sourceArrayIndex = source.position(); 2504 2505 if ((options & UConverterConstants.OPTION_SWAP_LFNL) != 0) { 2506 stateTable = sharedData.mbcs.swapLFNLStateTable; 2507 } else { 2508 stateTable = sharedData.mbcs.stateTable; 2509 } 2510 2511 /* sourceIndex=-1 if the current character began in the previous buffer */ 2512 sourceIndex = 0; 2513 2514 /* conversion loop */ 2515 while (sourceArrayIndex < source.limit()) { 2516 /* 2517 * This following test is to see if available input would overflow the output. It does not catch output 2518 * of more than one code unit that overflows as a result of a surrogate pair or callback output from the 2519 * last source byte. Therefore, those situations also test for overflows and will then break the loop, 2520 * too. 
2521 */ 2522 if (!target.hasRemaining()) { 2523 /* target is full */ 2524 cr[0] = CoderResult.OVERFLOW; 2525 break; 2526 } 2527 2528 entry = stateTable[0][source.get(sourceArrayIndex++) & UConverterConstants.UNSIGNED_BYTE_MASK]; 2529 /* MBCS_ENTRY_IS_FINAL(entry) */ 2530 2531 /* test the most common case first */ 2532 if (MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) { 2533 /* output BMP code point */ 2534 target.put(MBCS_ENTRY_FINAL_VALUE_16(entry)); 2535 if (offsets != null) { 2536 offsets.put(sourceIndex); 2537 } 2538 2539 /* normal end of action codes: prepare for a new character */ 2540 ++sourceIndex; 2541 continue; 2542 } 2543 2544 /* 2545 * An if-else-if chain provides more reliable performance for the most common cases compared to a 2546 * switch. 2547 */ 2548 action = (byte) (MBCS_ENTRY_FINAL_ACTION(entry)); 2549 if (action == MBCS_STATE_VALID_DIRECT_20 2550 || (action == MBCS_STATE_FALLBACK_DIRECT_20 && isFallbackUsed())) { 2551 2552 entry = MBCS_ENTRY_FINAL_VALUE(entry); 2553 /* output surrogate pair */ 2554 target.put((char) (0xd800 | (char) (entry >>> 10))); 2555 if (offsets != null) { 2556 offsets.put(sourceIndex); 2557 } 2558 c = (char) (0xdc00 | (char) (entry & 0x3ff)); 2559 if (target.hasRemaining()) { 2560 target.put(c); 2561 if (offsets != null) { 2562 offsets.put(sourceIndex); 2563 } 2564 } else { 2565 /* target overflow */ 2566 charErrorBufferArray[0] = c; 2567 charErrorBufferLength = 1; 2568 cr[0] = CoderResult.OVERFLOW; 2569 break; 2570 } 2571 2572 ++sourceIndex; 2573 continue; 2574 } else if (action == MBCS_STATE_FALLBACK_DIRECT_16) { 2575 if (isFallbackUsed()) { 2576 /* output BMP code point */ 2577 target.put(MBCS_ENTRY_FINAL_VALUE_16(entry)); 2578 if (offsets != null) { 2579 offsets.put(sourceIndex); 2580 } 2581 2582 ++sourceIndex; 2583 continue; 2584 } 2585 } else if (action == MBCS_STATE_UNASSIGNED) { 2586 /* just fall through */ 2587 } else if (action == MBCS_STATE_ILLEGAL) { 2588 /* callback(illegal) */ 2589 cr[0] = 
CoderResult.malformedForLength(1); 2590 } else { 2591 /* reserved, must never occur */ 2592 ++sourceIndex; 2593 continue; 2594 } 2595 2596 if (cr[0].isError()) { 2597 /* callback(illegal) */ 2598 break; 2599 } else /* unassigned sequences indicated with byteIndex>0 */{ 2600 /* try an extension mapping */ 2601 int sourceBeginIndex = sourceArrayIndex; 2602 toUBytesArray[0] = source.get(sourceArrayIndex - 1); 2603 source.position(sourceArrayIndex); 2604 toULength = toU((byte) 1, source, target, offsets, sourceIndex, flush, cr); 2605 sourceArrayIndex = source.position(); 2606 sourceIndex += 1 + (sourceArrayIndex - sourceBeginIndex); 2607 2608 if (cr[0].isError()) { 2609 /* not mappable or buffer overflow */ 2610 break; 2611 } 2612 } 2613 } 2614 2615 /* write back the updated pointers */ 2616 source.position(sourceArrayIndex); 2617 2618 return cr[0]; 2619 } 2620 getFallback(UConverterMBCSTable mbcsTable, int offset)2621 private int getFallback(UConverterMBCSTable mbcsTable, int offset) { 2622 MBCSToUFallback[] toUFallbacks; 2623 int i, start, limit; 2624 2625 limit = mbcsTable.countToUFallbacks; 2626 if (limit > 0) { 2627 /* do a binary search for the fallback mapping */ 2628 toUFallbacks = mbcsTable.toUFallbacks; 2629 start = 0; 2630 while (start < limit - 1) { 2631 i = (start + limit) >>> 1; 2632 if (offset < toUFallbacks[i].offset) { 2633 limit = i; 2634 } else { 2635 start = i; 2636 } 2637 } 2638 2639 /* did we really find it? */ 2640 if (offset == toUFallbacks[start].offset) { 2641 return toUFallbacks[start].codePoint; 2642 } 2643 } 2644 2645 return 0xfffe; 2646 } 2647 2648 /** 2649 * This is a simple version of _MBCSGetNextUChar() that is used by other converter implementations. It only 2650 * returns an "assigned" result if it consumes the entire input. It does not use state from the converter, nor 2651 * error codes. It does not handle the EBCDIC swaplfnl option (set in UConverter). It handles conversion 2652 * extensions but not GB 18030. 
2653 * 2654 * @return U+fffe unassigned U+ffff illegal otherwise the Unicode code point 2655 */ simpleGetNextUChar(ByteBuffer source, boolean useFallback)2656 int simpleGetNextUChar(ByteBuffer source, boolean useFallback) { 2657 2658 // #if 0 2659 // /* 2660 // * Code disabled 2002dec09 (ICU 2.4) because it is not currently used in ICU. markus 2661 // * TODO In future releases, verify that this function is never called for SBCS 2662 // * conversions, i.e., that sharedData->mbcs.countStates==1 is still true. 2663 // * Removal improves code coverage. 2664 // */ 2665 // /* use optimized function if possible */ 2666 // if(sharedData->mbcs.countStates==1) { 2667 // if(length==1) { 2668 // return ucnv_MBCSSingleSimpleGetNextUChar(sharedData, (uint8_t)*source, useFallback); 2669 // } else { 2670 // return 0xffff; /* illegal: more than a single byte for an SBCS converter */ 2671 // } 2672 // } 2673 // #endif 2674 2675 /* set up the local pointers */ 2676 int[][] stateTable = sharedData.mbcs.stateTable; 2677 char[] unicodeCodeUnits = sharedData.mbcs.unicodeCodeUnits; 2678 2679 /* converter state */ 2680 int offset = 0; 2681 int state = sharedData.mbcs.dbcsOnlyState; 2682 2683 int action; 2684 int entry; 2685 int c; 2686 int i = source.position(); 2687 int length = source.limit() - i; 2688 2689 /* conversion loop */ 2690 while (true) { 2691 // entry=stateTable[state][(uint8_t)source[i++]]; 2692 entry = stateTable[state][source.get(i++) & UConverterConstants.UNSIGNED_BYTE_MASK]; 2693 2694 if (MBCS_ENTRY_IS_TRANSITION(entry)) { 2695 state = MBCS_ENTRY_TRANSITION_STATE(entry); 2696 offset += MBCS_ENTRY_TRANSITION_OFFSET(entry); 2697 2698 if (i == source.limit()) { 2699 return 0xffff; /* truncated character */ 2700 } 2701 } else { 2702 /* 2703 * An if-else-if chain provides more reliable performance for the most common cases compared to a 2704 * switch. 
2705 */ 2706 action = MBCS_ENTRY_FINAL_ACTION(entry); 2707 if (action == MBCS_STATE_VALID_16) { 2708 offset += MBCS_ENTRY_FINAL_VALUE_16(entry); 2709 c = unicodeCodeUnits[offset]; 2710 if (c != 0xfffe) { 2711 /* done */ 2712 } else if (isToUUseFallback()) { 2713 c = getFallback(sharedData.mbcs, offset); 2714 } 2715 /* else done with 0xfffe */ 2716 } else if (action == MBCS_STATE_VALID_DIRECT_16) { 2717 // /* output BMP code point */ 2718 c = MBCS_ENTRY_FINAL_VALUE_16(entry); 2719 } else if (action == MBCS_STATE_VALID_16_PAIR) { 2720 offset += MBCS_ENTRY_FINAL_VALUE_16(entry); 2721 c = unicodeCodeUnits[offset++]; 2722 if (c < 0xd800) { 2723 /* output BMP code point below 0xd800 */ 2724 } else if (isToUUseFallback() ? c <= 0xdfff : c <= 0xdbff) { 2725 /* output roundtrip or fallback supplementary code point */ 2726 c = (((c & 0x3ff) << 10) + unicodeCodeUnits[offset] + (0x10000 - 0xdc00)); 2727 } else if (isToUUseFallback() ? (c & 0xfffe) == 0xe000 : c == 0xe000) { 2728 /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */ 2729 c = unicodeCodeUnits[offset]; 2730 } else if (c == 0xffff) { 2731 return 0xffff; 2732 } else { 2733 c = 0xfffe; 2734 } 2735 } else if (action == MBCS_STATE_VALID_DIRECT_20) { 2736 /* output supplementary code point */ 2737 c = 0x10000 + MBCS_ENTRY_FINAL_VALUE(entry); 2738 } else if (action == MBCS_STATE_FALLBACK_DIRECT_16) { 2739 if (!isToUUseFallback(useFallback)) { 2740 c = 0xfffe; 2741 } else { 2742 /* output BMP code point */ 2743 c = MBCS_ENTRY_FINAL_VALUE_16(entry); 2744 } 2745 } else if (action == MBCS_STATE_FALLBACK_DIRECT_20) { 2746 if (!isToUUseFallback(useFallback)) { 2747 c = 0xfffe; 2748 } else { 2749 /* output supplementary code point */ 2750 c = 0x10000 + MBCS_ENTRY_FINAL_VALUE(entry); 2751 } 2752 } else if (action == MBCS_STATE_UNASSIGNED) { 2753 c = 0xfffe; 2754 } else { 2755 /* 2756 * forbid MBCS_STATE_CHANGE_ONLY for this function, and MBCS_STATE_ILLEGAL and reserved action 2757 * codes 2758 */ 2759 
return 0xffff; 2760 } 2761 break; 2762 } 2763 } 2764 2765 if (i != source.limit()) { 2766 /* illegal for this function: not all input consumed */ 2767 return 0xffff; 2768 } 2769 2770 if (c == 0xfffe) { 2771 /* try an extension mapping */ 2772 if (sharedData.mbcs.extIndexes != null) { 2773 /* Increase the limit for proper handling. Used in LMBCS. */ 2774 if (source.limit() > i + length) { 2775 source.limit(i + length); 2776 } 2777 return simpleMatchToU(source, useFallback); 2778 } 2779 } 2780 2781 return c; 2782 } hasValidTrailBytes(int[][] stateTable, short state)2783 private boolean hasValidTrailBytes(int[][] stateTable, short state) { 2784 int[] row = stateTable[state]; 2785 int b, entry; 2786 /* First test for final entries in this state for some commonly valid byte values. */ 2787 entry = row[0xa1]; 2788 if (!MBCS_ENTRY_IS_TRANSITION(entry) && MBCS_ENTRY_FINAL_ACTION(entry) != MBCS_STATE_ILLEGAL) { 2789 return true; 2790 } 2791 entry = row[0x41]; 2792 if (!MBCS_ENTRY_IS_TRANSITION(entry) && MBCS_ENTRY_FINAL_ACTION(entry) != MBCS_STATE_ILLEGAL) { 2793 return true; 2794 } 2795 /* Then test for final entries in this state. */ 2796 for (b = 0; b <= 0xff; b++) { 2797 entry = row[b]; 2798 if (!MBCS_ENTRY_IS_TRANSITION(entry) && MBCS_ENTRY_FINAL_ACTION(entry) != MBCS_STATE_ILLEGAL) { 2799 return true; 2800 } 2801 } 2802 /* Then recurse for transition entries. 
*/
            for (b = 0; b <= 0xff; b++) {
                entry = row[b];
                if (MBCS_ENTRY_IS_TRANSITION(entry) &&
                        hasValidTrailBytes(stateTable, (short)MBCS_ENTRY_TRANSITION_STATE(entry))) {
                    return true;
                }
            }
            return false;
        }

        /**
         * Tests whether byte {@code b}, looked up in the given state, is either a lead byte with valid trail bytes
         * or a single byte whose final action is not MBCS_STATE_ILLEGAL.
         *
         * @param stateTable
         *            the converter's state table, indexed [state][byte]
         * @param state
         *            the state (row) in which {@code b} is looked up
         * @param isDBCSOnly
         *            if true, a MBCS_STATE_CHANGE_ONLY entry (SI/SO) is rejected
         * @param b
         *            the byte value, used directly as the column index
         * @return true if {@code b} is acceptable as a single byte or as a lead byte
         */
        private boolean isSingleOrLead(int[][] stateTable, int state, boolean isDBCSOnly, int b) {
            int[] row = stateTable[state];
            int entry = row[b];
            if (MBCS_ENTRY_IS_TRANSITION(entry)) { /* lead byte */
                return hasValidTrailBytes(stateTable, (short)MBCS_ENTRY_TRANSITION_STATE(entry));
            } else {
                int action = MBCS_ENTRY_FINAL_ACTION(entry);
                if (action == MBCS_STATE_CHANGE_ONLY && isDBCSOnly) {
                    return false; /* SI/SO are illegal for DBCS-only conversion */
                } else {
                    return (action != MBCS_STATE_ILLEGAL);
                }
            }
        }


    }

    /**
     * Encoder for MBCS charsets: converts UTF-16 input to codepage bytes using the fromUnicode tables in
     * {@code sharedData.mbcs}.
     */
    class CharsetEncoderMBCS extends CharsetEncoderICU {
        /* false while the superclass constructor runs, true afterwards (see the constructor) */
        private boolean allowReplacementChanges = false;

        /**
         * Creates the encoder with the charset's {@code fromUSubstitution} bytes as replacement, then enables
         * replacement changes and resets the encoder state.
         */
        CharsetEncoderMBCS(CharsetICU cs) {
            super(cs, fromUSubstitution);
            allowReplacementChanges = true; // allow changes in implReplaceWith
            implReset();
        }

        /**
         * Resets the superclass state and sets {@code preFromUFirstCP} to U_SENTINEL, so that encodeLoop() does not
         * try to continue an extension match ({@code preFromUFirstCP >= 0}).
         */
        @Override
        protected void implReset() {
            super.implReset();
            preFromUFirstCP = UConverterConstants.U_SENTINEL;
        }

        /**
         * Converts UTF-16 code units from {@code source} into codepage bytes in {@code target}.
         *
         * MBCS_OUTPUT_1 tables without surrogate mappings and MBCS_OUTPUT_2 tables are delegated to the specialized
         * cnvMBCS...WithOffsets() methods; every other output type is handled by the general loop in this method.
         * Bytes that do not fit into {@code target} are stored in {@code errorBuffer}.
         *
         * @param source
         *            UTF-16 input; its position is advanced past the consumed code units
         * @param target
         *            output buffer for the codepage bytes
         * @param offsets
         *            if not null, receives one source index per byte written to {@code target}
         * @param flush
         *            true if this is the end of the input; needed to emit a final SI for MBCS_OUTPUT_2_SISO
         * @return the value left in cr[0]: UNDERFLOW, OVERFLOW (also when a BufferOverflowException is caught),
         *         a malformed-input result for an unmatched trail surrogate, or whatever the called helpers stored
         */
        @Override
        @SuppressWarnings("fallthrough")
        protected CoderResult encodeLoop(CharBuffer source, ByteBuffer target, IntBuffer offsets, boolean flush) {
            /* one-element array so that helper methods can modify the result */
            CoderResult[] cr = { CoderResult.UNDERFLOW };
            // if (!source.hasRemaining() && fromUChar32 == 0)
            // return cr[0];

            int sourceArrayIndex;
            char[] table;
            byte[] pArray, bytes;
            char[] chars;
            int[] ints;
            int pArrayIndex, outputType, c;
            int prevSourceIndex, sourceIndex, nextSourceIndex;
            int stage2Entry = 0, value = 0, length = 0, prevLength;
            short uniMask;
            // long asciiRoundtrips;

            byte[] si_value = new byte[2];
            byte[] so_value = new byte[2];
            int si_value_length = 0, so_value_length = 0;

            /* NOTE(review): never set to true inside this method, only tested and cleared below */
            boolean gotoUnassigned = false;

            try {

                if (!flush && preFromUFirstCP >= 0) {
                    /*
                     * pass sourceIndex=-1 because we continue from an earlier buffer; in the future, this may
                     * change with continuous offsets
                     */
                    cr[0] = continueMatchFromU(source, target, offsets, flush, -1);

                    if (cr[0].isError() || preFromULength < 0) {
                        return cr[0];
                    }
                }

                /* use optimized function if possible */
                outputType = sharedData.mbcs.outputType;
                uniMask = sharedData.mbcs.unicodeMask;
                if (outputType == MBCS_OUTPUT_1 && (uniMask & UConverterConstants.HAS_SURROGATES) == 0) {
                    if ((uniMask & UConverterConstants.HAS_SUPPLEMENTARY) == 0) {
                        cr[0] = cnvMBCSSingleFromBMPWithOffsets(source, target, offsets, flush);
                    } else {
                        cr[0] = cnvMBCSSingleFromUnicodeWithOffsets(source, target, offsets, flush);
                    }
                    return cr[0];
                } else if (outputType == MBCS_OUTPUT_2) {
                    cr[0] = cnvMBCSDoubleFromUnicodeWithOffsets(source, target, offsets, flush);
                    return cr[0];
                }

                table = sharedData.mbcs.fromUnicodeTable;
                int[] tableInts = sharedData.mbcs.fromUnicodeTableInts;
                sourceArrayIndex = source.position();

                bytes = sharedData.mbcs.fromUnicodeBytes;
                ints = sharedData.mbcs.fromUnicodeInts;
                if ((options & UConverterConstants.OPTION_SWAP_LFNL) != 0) {
                    chars = sharedData.mbcs.swapLFNLFromUnicodeChars;
                } else {
                    chars = sharedData.mbcs.fromUnicodeChars;
                }

                // asciiRoundtrips = sharedData.mbcs.asciiRoundtrips;

                /* get the converter state from UConverter */
                c = fromUChar32;

                if (outputType == MBCS_OUTPUT_2_SISO) {
                    prevLength = fromUnicodeStatus;
                    if (prevLength == 0) {
                        /* set the real value */
                        prevLength = 1;
                    }
                } else {
                    /* prevent fromUnicodeStatus from being set to something non-0 */
                    prevLength = 0;
                }

                /* sourceIndex=-1 if the current character began in the previous buffer */
                prevSourceIndex = -1;
                sourceIndex = c == 0 ? 0 : -1;
                nextSourceIndex = 0;

                /* Get the SI/SO character for the converter */
                si_value_length = getSISOBytes(SISO_Option.SI, options, si_value);
                so_value_length = getSISOBytes(SISO_Option.SO, options, so_value);

                /* conversion loop */
                /*
                 * This is another piece of ugly code: A goto into the loop if the converter state contains a first
                 * surrogate from the previous function call. It saves me to check in each loop iteration a check of
                 * if(c==0) and duplicating the trail-surrogate-handling code in the else branch of that check. I could
                 * not find any other way to get around this other than using a function call for the conversion and
                 * callback, which would be even more inefficient.
                 *
                 * Markus Scherer 2000-jul-19
                 *
                 * In this Java code the goto is replaced by the doloop/doread flags and the getTrail() helper.
                 */
                boolean doloop = true;
                boolean doread = true;
                if (c != 0 && target.hasRemaining()) {
                    if (UTF16.isLeadSurrogate((char) c) && (uniMask & UConverterConstants.HAS_SURROGATES) == 0) {
                        // c is a lead surrogate, read another input
                        SideEffects x = new SideEffects(c, sourceArrayIndex, sourceIndex, nextSourceIndex,
                                prevSourceIndex, prevLength);
                        doloop = getTrail(source, target, uniMask, x, flush, cr);
                        doread = x.doread;
                        c = x.c;
                        sourceArrayIndex = x.sourceArrayIndex;
                        sourceIndex = x.sourceIndex;
                        nextSourceIndex = x.nextSourceIndex;
                        prevSourceIndex = x.prevSourceIndex;
                        prevLength = x.prevLength;
                    } else {
                        // c is not a lead surrogate, do not read another input
                        doread = false;
                    }
                }

                if (doloop) {
                    while (!doread || sourceArrayIndex < source.limit()) {
                        /*
                         * This following test is to see if available input would overflow the output. It does not catch
                         * output of more than one byte that overflows as a result of a multi-byte character or callback
                         * output from the last source character. Therefore, those situations also test for overflows
                         * and will then break the loop, too.
                         */
                        if (target.hasRemaining()) {
                            /*
                             * Get a correct Unicode code point: a single UChar for a BMP code point or a matched
                             * surrogate pair for a "supplementary code point".
                             */

                            if (doread) {
                                // doread might be false only on the first looping

                                c = source.get(sourceArrayIndex++);
                                ++nextSourceIndex;

                                /*
                                 * This also tests if the codepage maps single surrogates. If it does, then surrogates
                                 * are not paired but mapped separately. Note that in this case unmatched surrogates are
                                 * not detected.
                                 */
                                if (UTF16.isSurrogate((char) c)
                                        && (uniMask & UConverterConstants.HAS_SURROGATES) == 0) {
                                    if (UTF16.isLeadSurrogate((char) c)) {
                                        // getTrail:
                                        // NOTE(review): unlike the call before the loop, x.prevLength is not copied
                                        // back here and x.doread is only tested, not stored - confirm intended
                                        SideEffects x = new SideEffects(c, sourceArrayIndex, sourceIndex,
                                                nextSourceIndex, prevSourceIndex, prevLength);
                                        doloop = getTrail(source, target, uniMask, x, flush, cr);
                                        c = x.c;
                                        sourceArrayIndex = x.sourceArrayIndex;
                                        sourceIndex = x.sourceIndex;
                                        nextSourceIndex = x.nextSourceIndex;
                                        prevSourceIndex = x.prevSourceIndex;

                                        if (x.doread) {
                                            if (doloop)
                                                continue;
                                            else
                                                break;
                                        }
                                    } else {
                                        /* this is an unmatched trail code unit (2nd surrogate) */
                                        /* callback(illegal) */
                                        cr[0] = CoderResult.malformedForLength(1);
                                        break;
                                    }
                                }
                            } else {
                                doread = true;
                            }
                            /* convert the Unicode code point in c into codepage bytes */

                            /*
                             * The basic lookup is a triple-stage compact array (trie) lookup. For details see the
                             * beginning of this file.
                             *
                             * Single-byte codepages are handled with a different data structure by _MBCSSingle...
                             * functions.
                             *
                             * The result consists of a 32-bit value from stage 2 and a pointer to as many bytes as are
                             * stored per character. The pointer points to the character's bytes in stage 3. Bits 15..0
                             * of the stage 2 entry contain the stage 3 index for that pointer, while bits 31..16 are
                             * flags for which of the 16 characters in the block are roundtrip-assigned.
                             *
                             * For 2-byte and 4-byte codepages, the bytes are stored as uint16_t respectively as
                             * uint32_t, in the platform encoding. For 3-byte codepages, the bytes are always stored in
                             * big-endian order.
                             *
                             * For EUC encodings that use only either 0x8e or 0x8f as the first byte of their longest
                             * byte sequences, the first two bytes in this third stage indicate with their 7th bits
                             * whether these bytes are to be written directly or actually need to be preceded by one of
                             * the two Single-Shift codes. With this, the third stage stores one byte fewer per
                             * character than the actual maximum length of EUC byte sequences.
                             *
                             * Other than that, leading zero bytes are removed and the other bytes output. A single zero
                             * byte may be output if the "assigned" bit in stage 2 was on. The data structure does not
                             * support zero byte output as a fallback, and also does not allow output of leading zeros.
                             */
                            stage2Entry = MBCS_STAGE_2_FROM_U(table, tableInts, c);

                            /* get the bytes and the length for the output */
                            switch (outputType) {
                            /* This is handled above with the method cnvMBCSDoubleFromUnicodeWithOffsets() */
                            /* case MBCS_OUTPUT_2:
                                value = MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);
                                if ((value & UConverterConstants.UNSIGNED_INT_MASK) <= 0xff) {
                                    length = 1;
                                } else {
                                    length = 2;
                                }
                                break; */
                            case MBCS_OUTPUT_2_SISO:
                                /* 1/2-byte stateful with Shift-In/Shift-Out */
                                /*
                                 * Save the old state in the converter object right here, then change the local
                                 * prevLength state variable if necessary. Then, if this character turns out to be
                                 * unassigned or a fallback that is not taken, the callback code must not save the new
                                 * state in the converter because the new state is for a character that is not output.
                                 * However, the callback must still restore the state from the converter in case the
                                 * callback function changed it for its output.
                                 */
                                fromUnicodeStatus = prevLength; /* save the old state */
                                value = MBCS_VALUE_2_FROM_STAGE_2(chars, stage2Entry, c);
                                if (value <= 0xff) {
                                    if (value == 0 && MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) == false) {
                                        /* no mapping, leave value==0 */
                                        length = 0;
                                    } else if (prevLength <= 1) {
                                        length = 1;
                                    } else {
                                        /* change from double-byte mode to single-byte */
                                        /* the SI byte(s) are placed above the single result byte */
                                        if (si_value_length == 1) {
                                            value|=si_value[0]<<8;
                                            length = 2;
                                        } else if (si_value_length == 2) {
                                            value|=si_value[1]<<8;
                                            value|=si_value[0]<<16;
                                            length = 3;
                                        }
                                        prevLength = 1;
                                    }
                                } else {
                                    if (prevLength == 2) {
                                        length = 2;
                                    } else {
                                        /* change from single-byte mode to double-byte */
                                        /* the SO byte(s) are placed above the two result bytes */
                                        if (so_value_length == 1) {
                                            value|=so_value[0]<<16;
                                            length = 3;
                                        } else if (so_value_length == 2) {
                                            value|=so_value[1]<<16;
                                            value|=so_value[0]<<24;
                                            length = 4;
                                        }
                                        prevLength = 2;
                                    }
                                }
                                break;
                            case MBCS_OUTPUT_DBCS_ONLY:
                                /* table with single-byte results, but only DBCS mappings used */
                                value = MBCS_VALUE_2_FROM_STAGE_2(chars, stage2Entry, c);
                                if (value <= 0xff) {
                                    /* no mapping or SBCS result, not taken for DBCS-only */
                                    value = stage2Entry = 0; /* stage2Entry=0 to reset roundtrip flags */
                                    length = 0;
                                } else {
                                    length = 2;
                                }
                                break;
                            case MBCS_OUTPUT_3:
                                pArray = bytes;
                                pArrayIndex = MBCS_POINTER_3_FROM_STAGE_2(bytes, stage2Entry, c);
                                /* assemble the three stored bytes, first byte in bits 23..16 */
                                value = ((pArray[pArrayIndex] & UConverterConstants.UNSIGNED_BYTE_MASK) << 16)
                                        | ((pArray[pArrayIndex + 1] & UConverterConstants.UNSIGNED_BYTE_MASK) << 8)
                                        | (pArray[pArrayIndex + 2] & UConverterConstants.UNSIGNED_BYTE_MASK);
                                if (value <= 0xff) {
                                    length = 1;
                                } else if (value <= 0xffff) {
                                    length = 2;
                                } else {
                                    length = 3;
                                }
                                break;
                            case MBCS_OUTPUT_4:
                                value = MBCS_VALUE_4_FROM_STAGE_2(ints, stage2Entry, c);
                                if (value < 0) {
                                    // Half of the 4-byte values look negative in a signed int.
                                    length = 4;
                                } else if (value <= 0xff) {
                                    length = 1;
                                } else if (value <= 0xffff) {
                                    length = 2;
                                } else if (value <= 0xffffff) {
                                    length = 3;
                                } else {
                                    length = 4;
                                }
                                break;
                            case MBCS_OUTPUT_3_EUC:
                                value = MBCS_VALUE_2_FROM_STAGE_2(chars, stage2Entry, c);
                                /* EUC 16-bit fixed-length representation */
                                if (value <= 0xff) {
                                    length = 1;
                                } else if ((value & 0x8000) == 0) {
                                    value |= 0x8e8000;
                                    length = 3;
                                } else if ((value & 0x80) == 0) {
                                    value |= 0x8f0080;
                                    length = 3;
                                } else {
                                    length = 2;
                                }
                                break;
                            case MBCS_OUTPUT_4_EUC:
                                pArray = bytes;
                                pArrayIndex = MBCS_POINTER_3_FROM_STAGE_2(bytes, stage2Entry, c);
                                /* assemble the three stored bytes, first byte in bits 23..16 */
                                value = ((pArray[pArrayIndex] & UConverterConstants.UNSIGNED_BYTE_MASK) << 16)
                                        | ((pArray[pArrayIndex + 1] & UConverterConstants.UNSIGNED_BYTE_MASK) << 8)
                                        | (pArray[pArrayIndex + 2] & UConverterConstants.UNSIGNED_BYTE_MASK);
                                /* EUC 16-bit fixed-length representation applied to the first two bytes */
                                if (value <= 0xff) {
                                    length = 1;
                                } else if (value <= 0xffff) {
                                    length = 2;
                                } else if ((value & 0x800000) == 0) {
                                    value |= 0x8e800000;
                                    length = 4;
                                } else if ((value & 0x8000) == 0) {
                                    value |= 0x8f008000;
                                    length = 4;
                                } else {
                                    length = 3;
                                }
                                break;
                            default:
                                /* must not occur */
                                /*
                                 * To avoid compiler warnings that value & length may be used without having been
                                 * initialized, we set them here. In reality, this is unreachable code. Not having a
                                 * default branch also causes warnings with some compilers.
                                 */
                                value = stage2Entry = 0; /* stage2Entry=0 to reset roundtrip flags */
                                length = 0;
                                break;
                            }

                            /* is this code point assigned, or do we use fallbacks? */
                            if (gotoUnassigned || (!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) || (isFromUUseFallback(c) && value != 0)))) {
                                gotoUnassigned = false;
                                /*
                                 * We allow a 0 byte output if the "assigned" bit is set for this entry. There is no way
                                 * with this data structure for fallback output to be a zero byte.
                                 */

                                // unassigned:
                                SideEffects x = new SideEffects(c, sourceArrayIndex, sourceIndex, nextSourceIndex,
                                        prevSourceIndex, prevLength);
                                doloop = unassigned(source, target, offsets, x, flush, cr);
                                c = x.c;
                                sourceArrayIndex = x.sourceArrayIndex;
                                sourceIndex = x.sourceIndex;
                                nextSourceIndex = x.nextSourceIndex;
                                prevSourceIndex = x.prevSourceIndex;
                                prevLength = x.prevLength;
                                if (doloop)
                                    continue;
                                else
                                    break;
                            }

                            /* write the output character bytes from value and length */
                            /* from the first if in the loop we know that targetCapacity>0 */
                            if (length <= target.remaining()) {
                                switch (length) {
                                /* each branch falls through to the next one */
                                case 4:
                                    target.put((byte) (value >>> 24));
                                    if (offsets != null) {
                                        offsets.put(sourceIndex);
                                    }
                                case 3:
                                    target.put((byte) (value >>> 16));
                                    if (offsets != null) {
                                        offsets.put(sourceIndex);
                                    }
                                case 2:
                                    target.put((byte) (value >>> 8));
                                    if (offsets != null) {
                                        offsets.put(sourceIndex);
                                    }
                                case 1:
                                    target.put((byte) value);
                                    if (offsets != null) {
                                        offsets.put(sourceIndex);
                                    }
                                default:
                                    /* will never occur */
                                    break;
                                }
                            } else {
                                int errorBufferArrayIndex;

                                /*
                                 * We actually do this backwards here: In order to save an intermediate variable, we
                                 * output first to the overflow buffer what does not fit into the regular target.
                                 */
                                /* we know that 1<=targetCapacity<length<=4 */
                                length -= target.remaining();

                                errorBufferArrayIndex = 0;
                                switch (length) {
                                /* each branch falls through to the next one */
                                case 3:
                                    errorBuffer[errorBufferArrayIndex++] = (byte) (value >>> 16);
                                case 2:
                                    errorBuffer[errorBufferArrayIndex++] = (byte) (value >>> 8);
                                case 1:
                                    errorBuffer[errorBufferArrayIndex] = (byte) value;
                                default:
                                    /* will never occur */
                                    break;
                                }
                                errorBufferLength = (byte) length;

                                /* now output what fits into the regular target */
                                value >>>= 8 * length; /* length was reduced by targetCapacity */
                                switch (target.remaining()) {
                                /* each branch falls through to the next one */
                                case 3:
                                    target.put((byte) (value >>> 16));
                                    if (offsets != null) {
                                        offsets.put(sourceIndex);
                                    }
                                case 2:
                                    target.put((byte) (value >>> 8));
                                    if (offsets != null) {
                                        offsets.put(sourceIndex);
                                    }
                                case 1:
                                    target.put((byte) value);
                                    if (offsets != null) {
                                        offsets.put(sourceIndex);
                                    }
                                default:
                                    /* will never occur */
                                    break;
                                }

                                /* target overflow */
                                cr[0] = CoderResult.OVERFLOW;
                                c = 0;
                                break;
                            }

                            /* normal end of conversion: prepare for a new character */
                            c = 0;
                            if (offsets != null) {
                                prevSourceIndex = sourceIndex;
                                sourceIndex = nextSourceIndex;
                            }
                            continue;
                        } else {
                            /* target is full */
                            cr[0] = CoderResult.OVERFLOW;
                            break;
                        }
                    }
                }

                /*
                 * the end of the input stream and detection of truncated input are handled by the framework, but for
                 * EBCDIC_STATEFUL conversion we need to emit an SI at the very end
                 *
                 * conditions: successful EBCDIC_STATEFUL in DBCS mode end of input and no truncated input
                 */
                if (outputType == MBCS_OUTPUT_2_SISO && prevLength == 2 && flush && sourceArrayIndex >= source.limit()
                        && c == 0) {

                    /* EBCDIC_STATEFUL ending with DBCS: emit an SI to return the output stream to SBCS */
                    if (target.hasRemaining()) {
                        target.put(si_value[0]);
                        if (si_value_length == 2) {
                            if (target.remaining() > 0) {
                                target.put(si_value[1]);
                            } else {
                                /* only the first SI byte fit; keep the second one for later output */
                                errorBuffer[0] = si_value[1];
                                errorBufferLength = 1;
                                cr[0] = CoderResult.OVERFLOW;
                            }
                        }
                        if (offsets != null) {
                            /* set the last source character's index (sourceIndex points at sourceLimit now) */
                            offsets.put(prevSourceIndex);
                        }
                    } else {
                        /* target is full */
                        errorBuffer[0] = si_value[0];
                        if (si_value_length == 2) {
                            errorBuffer[1] = si_value[1];
                        }
                        errorBufferLength = si_value_length;
                        cr[0] = CoderResult.OVERFLOW;
                    }
                    prevLength = 1; /* we switched into SBCS */
                }

                /* set the converter state back into UConverter */
                fromUChar32 = c;
                fromUnicodeStatus = prevLength;

                source.position(sourceArrayIndex);
            } catch (BufferOverflowException ex) {
                /* NOTE(review): on this path the converter state and source position above are not written back */
                cr[0] = CoderResult.OVERFLOW;
            }

            return cr[0];
        }

        /*
         * This is another simple conversion function for internal use by other conversion implementations. It does not
         * use the converter state nor call callbacks. It does not handle the EBCDIC swaplfnl option (set in
         * UConverter). It handles conversion extensions but not GB 18030.
         *
         * It converts one single Unicode code point into codepage bytes, encoded as one 32-bit value. The function
         * returns the number of bytes in *pValue: 1..4 the number of bytes in *pValue 0 unassigned (*pValue undefined)
         * -1 illegal (currently not used, *pValue undefined)
         *
         * *pValue will contain the resulting bytes with the last byte in bits 7..0, the second to last byte in bits
         * 15..8, etc. Currently, the function assumes but does not check that 0<=c<=0x10ffff.
3380 */ fromUChar32(int c, int[] pValue, boolean isUseFallback)3381 int fromUChar32(int c, int[] pValue, boolean isUseFallback) { 3382 // #if 0 3383 // /* #if 0 because this is not currently used in ICU - reduce code, increase code coverage */ 3384 // const uint8_t *p; 3385 // #endif 3386 3387 char[] table; 3388 int stage2Entry; 3389 int value; 3390 int length; 3391 int p; 3392 3393 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ 3394 if (c <= 0xffff || sharedData.mbcs.hasSupplementary()) { 3395 table = sharedData.mbcs.fromUnicodeTable; 3396 3397 /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */ 3398 if (sharedData.mbcs.outputType == MBCS_OUTPUT_1) { 3399 value = MBCS_SINGLE_RESULT_FROM_U(table, sharedData.mbcs.fromUnicodeChars, c); 3400 /* is this code point assigned, or do we use fallbacks? */ 3401 if (isUseFallback ? value >= 0x800 : value >= 0xc00) { 3402 pValue[0] = value & 0xff; 3403 return 1; 3404 } 3405 } else /* outputType!=MBCS_OUTPUT_1 */{ 3406 int[] tableInts = sharedData.mbcs.fromUnicodeTableInts; 3407 stage2Entry = MBCS_STAGE_2_FROM_U(table, tableInts, c); 3408 3409 /* get the bytes and the length for the output */ 3410 switch (sharedData.mbcs.outputType) { 3411 case MBCS_OUTPUT_2: 3412 value = MBCS_VALUE_2_FROM_STAGE_2(sharedData.mbcs.fromUnicodeChars, stage2Entry, c); 3413 if (value <= 0xff) { 3414 length = 1; 3415 } else { 3416 length = 2; 3417 } 3418 break; 3419 // #if 0 3420 // /* #if 0 because this is not currently used in ICU - reduce code, increase code coverage */ 3421 // case MBCS_OUTPUT_DBCS_ONLY: 3422 // /* table with single-byte results, but only DBCS mappings used */ 3423 // value=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); 3424 // if(value<=0xff) { 3425 // /* no mapping or SBCS result, not taken for DBCS-only */ 3426 // value=stage2Entry=0; /* stage2Entry=0 to reset roundtrip flags */ 3427 // length=0; 3428 // 
} else { 3429 // length=2; 3430 // } 3431 // break; 3432 case MBCS_OUTPUT_3: 3433 byte[] bytes = sharedData.mbcs.fromUnicodeBytes; 3434 p = CharsetMBCS.MBCS_POINTER_3_FROM_STAGE_2(bytes, stage2Entry, c); 3435 value = ((bytes[p] & UConverterConstants.UNSIGNED_BYTE_MASK)<<16) | 3436 ((bytes[p+1] & UConverterConstants.UNSIGNED_BYTE_MASK)<<8) | 3437 (bytes[p+2] & UConverterConstants.UNSIGNED_BYTE_MASK); 3438 if (value <= 0xff) { 3439 length = 1; 3440 } else if (value <= 0xffff) { 3441 length = 2; 3442 } else { 3443 length = 3; 3444 } 3445 break; 3446 // case MBCS_OUTPUT_4: 3447 // value=MBCS_VALUE_4_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); 3448 // if(value<=0xff) { 3449 // length=1; 3450 // } else if(value<=0xffff) { 3451 // length=2; 3452 // } else if(value<=0xffffff) { 3453 // length=3; 3454 // } else { 3455 // length=4; 3456 // } 3457 // break; 3458 // case MBCS_OUTPUT_3_EUC: 3459 // value=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); 3460 // /* EUC 16-bit fixed-length representation */ 3461 // if(value<=0xff) { 3462 // length=1; 3463 // } else if((value&0x8000)==0) { 3464 // value|=0x8e8000; 3465 // length=3; 3466 // } else if((value&0x80)==0) { 3467 // value|=0x8f0080; 3468 // length=3; 3469 // } else { 3470 // length=2; 3471 // } 3472 // break; 3473 // case MBCS_OUTPUT_4_EUC: 3474 // p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); 3475 // value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2]; 3476 // /* EUC 16-bit fixed-length representation applied to the first two bytes */ 3477 // if(value<=0xff) { 3478 // length=1; 3479 // } else if(value<=0xffff) { 3480 // length=2; 3481 // } else if((value&0x800000)==0) { 3482 // value|=0x8e800000; 3483 // length=4; 3484 // } else if((value&0x8000)==0) { 3485 // value|=0x8f008000; 3486 // length=4; 3487 // } else { 3488 // length=3; 3489 // } 3490 // break; 3491 // #endif 3492 default: 3493 /* must not occur */ 3494 return -1; 3495 } 3496 
3497 /* is this code point assigned, or do we use fallbacks? */ 3498 if (MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) 3499 || (CharsetEncoderICU.isFromUUseFallback(isUseFallback, c) && value != 0)) { 3500 /* 3501 * We allow a 0 byte output if the "assigned" bit is set for this entry. There is no way with 3502 * this data structure for fallback output to be a zero byte. 3503 */ 3504 /* assigned */ 3505 pValue[0] = value; 3506 return length; 3507 } 3508 } 3509 } 3510 3511 if (sharedData.mbcs.extIndexes != null) { 3512 length = simpleMatchFromU(c, pValue, isUseFallback); 3513 return length >= 0 ? length : -length; /* return abs(length); */ 3514 } 3515 3516 /* unassigned */ 3517 return 0; 3518 } 3519 3520 /* 3521 * continue partial match with new input, requires cnv->preFromUFirstCP>=0 never called for simple, 3522 * single-character conversion 3523 */ continueMatchFromU(CharBuffer source, ByteBuffer target, IntBuffer offsets, boolean flush, int srcIndex)3524 private CoderResult continueMatchFromU(CharBuffer source, ByteBuffer target, IntBuffer offsets, boolean flush, 3525 int srcIndex) { 3526 CoderResult cr = CoderResult.UNDERFLOW; 3527 int[] value = new int[1]; 3528 int match; 3529 3530 match = matchFromU(preFromUFirstCP, preFromUArray, preFromUBegin, preFromULength, source, value, useFallback, flush); 3531 if (match >= 2) { 3532 match -= 2; /* remove 2 for the initial code point */ 3533 3534 if (match >= preFromULength) { 3535 /* advance src pointer for the consumed input */ 3536 source.position(source.position() + match - preFromULength); 3537 preFromULength = 0; 3538 } else { 3539 /* the match did not use all of preFromU[] - keep the rest for replay */ 3540 int length = preFromULength - match; 3541 System.arraycopy(preFromUArray, preFromUBegin + match, preFromUArray, preFromUBegin, length); 3542 preFromULength = (byte) -length; 3543 } 3544 3545 /* finish the partial match */ 3546 preFromUFirstCP = UConverterConstants.U_SENTINEL; 3547 3548 /* write result */ 3549 
writeFromU(value[0], target, offsets, srcIndex); 3550 } else if (match < 0) { 3551 /* save state for partial match */ 3552 int sArrayIndex; 3553 int j; 3554 3555 /* just _append_ the newly consumed input to preFromU[] */ 3556 sArrayIndex = source.position(); 3557 match = -match - 2; /* remove 2 for the initial code point */ 3558 for (j = preFromULength; j < match; ++j) { 3559 preFromUArray[j] = source.get(sArrayIndex++); 3560 } 3561 source.position(sArrayIndex); /* same as *src=srcLimit; because we reached the end of input */ 3562 preFromULength = (byte) match; 3563 } else { /* match==0 or 1 */ 3564 /* 3565 * no match 3566 * 3567 * We need to split the previous input into two parts: 3568 * 3569 * 1. The first code point is unmappable - that's how we got into trying the extension data in the first 3570 * place. We need to move it from the preFromU buffer to the error buffer, set an error code, and 3571 * prepare the rest of the previous input for 2. 3572 * 3573 * 2. The rest of the previous input must be converted once we come back from the callback for the first 3574 * code point. At that time, we have to try again from scratch to convert these input characters. The 3575 * replay will be handled by the ucnv.c conversion code. 
3576 */ 3577 3578 if (match == 1) { 3579 /* matched, no mapping but request for <subchar1> */ 3580 useSubChar1 = true; 3581 } 3582 3583 /* move the first code point to the error field */ 3584 fromUChar32 = preFromUFirstCP; 3585 preFromUFirstCP = UConverterConstants.U_SENTINEL; 3586 3587 /* mark preFromU for replay */ 3588 preFromULength = (byte) -preFromULength; 3589 3590 /* set the error code for unassigned */ 3591 // TODO: figure out what the unmappable length really should be 3592 cr = CoderResult.unmappableForLength(1); 3593 } 3594 return cr; 3595 } 3596 3597 /** 3598 * @param cx 3599 * pointer to extension data; if NULL, returns 0 3600 * @param firstCP 3601 * the first code point before all the other UChars 3602 * @param pre 3603 * UChars that must match; !initialMatch: partial match with them 3604 * @param preLength 3605 * length of pre, >=0 3606 * @param src 3607 * UChars that can be used to complete a match 3608 * @param srcLength 3609 * length of src, >=0 3610 * @param pMatchValue 3611 * [out] output result value for the match from the data structure 3612 * @param useFallback 3613 * "use fallback" flag, usually from cnv->useFallback 3614 * @param flush 3615 * TRUE if the end of the input stream is reached 3616 * @return >1: matched, return value=total match length (number of input units matched) 1: matched, no mapping 3617 * but request for <subchar1> (only for the first code point) 0: no match <0: partial match, return 3618 * value=negative total match length (partial matches are never returned for flush==TRUE) (partial 3619 * matches are never returned as being longer than UCNV_EXT_MAX_UCHARS) the matchLength is 2 if only 3620 * firstCP matched, and >2 if firstCP and further code units matched 3621 */ 3622 // static int32_t ucnv_extMatchFromU(const int32_t *cx, UChar32 firstCP, const UChar *pre, int32_t preLength, 3623 // const UChar *src, int32_t srcLength, uint32_t *pMatchValue, UBool useFallback, UBool flush) matchFromU(int firstCP, char[] preArray, 
/*
 * Java port of ucnv_extMatchFromU(): looks up firstCP, optionally followed by the UChars in
 * preArray[preArrayBegin..preArrayBegin+preLength) and then the unread part of source, in the
 * from-Unicode extension data (sharedData.mbcs.extIndexes).
 *
 * Return values used by this code:
 *   >= 2  full match; pMatchValue[0] receives the value with the roundtrip flag masked off and the
 *         return value is 2 (for firstCP) plus the number of further units consumed
 *   1     firstCP matched an entry equal to FROM_U_SUBCHAR1
 *   0     no extension data, no match, fallback not taken, or reserved bits set in the value
 *   < 0   partial match with all input consumed: -(2 + units consumed so far)
 *
 * source is only read with absolute gets; its position is not changed here.
 */
private int matchFromU(int firstCP, char[] preArray, int preArrayBegin, int preLength, CharBuffer source,
        int[] pMatchValue, boolean isUseFallback, boolean flush) {
    ByteBuffer cx = sharedData.mbcs.extIndexes;

    CharBuffer stage12, stage3;
    IntBuffer stage3b;

    CharBuffer fromUTableUChars, fromUSectionUChars;
    IntBuffer fromUTableValues, fromUSectionValues;

    int value, matchValue;
    /* i counts units taken from preArray, j counts units taken from source */
    int i, j, index, length, matchLength;
    char c;

    if (cx == null) {
        return 0; /* no extension data, no match */
    }

    /* trie lookup of firstCP */
    index = firstCP >>> 10; /* stage 1 index */
    if (index >= cx.asIntBuffer().get(EXT_FROM_U_STAGE_1_LENGTH)) {
        return 0; /* the first code point is outside the trie */
    }

    stage12 = (CharBuffer) ARRAY(cx, EXT_FROM_U_STAGE_12_INDEX, char.class);
    stage3 = (CharBuffer) ARRAY(cx, EXT_FROM_U_STAGE_3_INDEX, char.class);
    index = FROM_U(stage12, stage3, index, firstCP);

    stage3b = (IntBuffer) ARRAY(cx, EXT_FROM_U_STAGE_3B_INDEX, int.class);
    value = stage3b.get(stage3b.position() + index);
    if (value == 0) {
        return 0; /* empty trie entry: firstCP has no extension mapping */
    }

    if (TO_U_IS_PARTIAL(value)) {
        /* partial match, enter the loop below */
        index = FROM_U_GET_PARTIAL_INDEX(value);

        /* initialize */
        fromUTableUChars = (CharBuffer) ARRAY(cx, EXT_FROM_U_UCHARS_INDEX, char.class);
        fromUTableValues = (IntBuffer) ARRAY(cx, EXT_FROM_U_VALUES_INDEX, int.class);

        matchValue = 0;
        i = j = matchLength = 0;

        /* we must not remember fallback matches when not using fallbacks */

        /* match input units until there is a full match or the input is consumed */
        for (;;) {
            /*
             * go to the next section: take a slice that starts at 'index' in each table and then
             * restore the table's position, so the tables can be sliced again on the next round
             */
            int oldpos = fromUTableUChars.position();
            fromUSectionUChars = ((CharBuffer) fromUTableUChars.position(index)).slice();
            fromUTableUChars.position(oldpos);
            oldpos = fromUTableValues.position();
            fromUSectionValues = ((IntBuffer) fromUTableValues.position(index)).slice();
            fromUTableValues.position(oldpos);

            /*
             * read first pair of the section; these relative gets advance both slices by one, and
             * 'length' is passed to findFromU() below as the search length
             */
            length = fromUSectionUChars.get();
            value = fromUSectionValues.get();
            if (value != 0 && (FROM_U_IS_ROUNDTRIP(value) || isFromUUseFallback(isUseFallback, firstCP))) {
                /* remember longest match so far */
                matchValue = value;
                matchLength = 2 + i + j;
            }

            /* match pre[] then src[] */
            if (i < preLength) {
                c = preArray[preArrayBegin + i++];
            } else if (source != null && j < source.remaining()) {
                /* absolute get: the caller advances source itself after a match */
                c = source.get(source.position() + j++);
            } else {
                /* all input consumed, partial match; 'length' is reused here for the consumed count */
                if (flush || (length = (i + j)) > MAX_UCHARS) {
                    /*
                     * end of the entire input stream, stop with the longest match so far or: partial match must
                     * not be longer than UCNV_EXT_MAX_UCHARS because it must fit into state buffers
                     */
                    break;
                } else {
                    /* continue with more input next time */
                    return -(2 + length);
                }
            }

            /* search for the current UChar */
            index = findFromU(fromUSectionUChars, length, c);
            if (index < 0) {
                /* no match here, stop with the longest match so far */
                break;
            } else {
                value = fromUSectionValues.get(fromUSectionValues.position() + index);
                if (FROM_U_IS_PARTIAL(value)) {
                    /* partial match, continue */
                    index = FROM_U_GET_PARTIAL_INDEX(value);
                } else {
                    if (FROM_U_IS_ROUNDTRIP(value) || isFromUUseFallback(isUseFallback, firstCP)) {
                        /* full match, stop with result */
                        matchValue = value;
                        matchLength = 2 + i + j;
                    } else {
                        /* full match on fallback not taken, stop with the longest match so far */
                    }
                    break;
                }
            }
        }

        if (matchLength == 0) {
            /* no match at all */
            return 0;
        }
    } else /* result from firstCP trie lookup */{
        if (FROM_U_IS_ROUNDTRIP(value) || isFromUUseFallback(isUseFallback, firstCP)) {
            /* full match, stop with result */
            matchValue = value;
            matchLength = 2;
        } else {
            /* fallback not taken */
            return 0;
        }
    }

    if ((matchValue & FROM_U_RESERVED_MASK) != 0) {
        /* do not interpret values with reserved bits used, for forward compatibility */
        return 0;
    }

    /* return result */
    if (matchValue == FROM_U_SUBCHAR1) {
        return 1; /* assert matchLength==2 */
    }

    /* hand back the value without its roundtrip flag */
    pMatchValue[0] = FROM_U_MASK_ROUNDTRIP(matchValue);
    return matchLength;
}
4 : -4; 3789 // #endif 3790 } 3791 } 3792 3793 /* 3794 * return no match because - match>1 && resultLength>4: result too long for simple conversion - match==1: no 3795 * match found, <subchar1> preferred - match==0: no match found in the first place - match<0: partial 3796 * match, not supported for simple conversion (and flush==TRUE) 3797 */ 3798 return 0; 3799 } 3800 3801 @SuppressWarnings("fallthrough") writeFromU(int value, ByteBuffer target, IntBuffer offsets, int srcIndex)3802 private CoderResult writeFromU(int value, ByteBuffer target, IntBuffer offsets, int srcIndex) { 3803 ByteBuffer cx = sharedData.mbcs.extIndexes; 3804 3805 byte bufferArray[] = new byte[1 + MAX_BYTES]; 3806 int bufferArrayIndex = 0; 3807 byte[] resultArray; 3808 int resultArrayIndex; 3809 int length, prevLength; 3810 3811 length = FROM_U_GET_LENGTH(value); 3812 value = FROM_U_GET_DATA(value); 3813 3814 /* output the result */ 3815 if (length <= FROM_U_MAX_DIRECT_LENGTH) { 3816 /* 3817 * Generate a byte array and then write it below. This is not the fastest possible way, but it should be 3818 * ok for extension mappings, and it is much simpler. Offset and overflow handling are only done once 3819 * this way. 
3820 */ 3821 int p = bufferArrayIndex + 1; /* reserve buffer[0] for shiftByte below */ 3822 switch (length) { 3823 case 3: 3824 bufferArray[p++] = (byte) (value >>> 16); 3825 case 2: 3826 bufferArray[p++] = (byte) (value >>> 8); 3827 case 1: 3828 bufferArray[p++] = (byte) value; 3829 default: 3830 break; /* will never occur */ 3831 } 3832 resultArray = bufferArray; 3833 resultArrayIndex = bufferArrayIndex + 1; 3834 } else { 3835 byte[] slice = new byte[length]; 3836 3837 ByteBuffer bb = ((ByteBuffer) ARRAY(cx, EXT_FROM_U_BYTES_INDEX, byte.class)); 3838 bb.position(value); 3839 bb.get(slice, 0, slice.length); 3840 3841 resultArray = slice; 3842 resultArrayIndex = 0; 3843 } 3844 3845 /* with correct data we have length>0 */ 3846 3847 if ((prevLength = fromUnicodeStatus) != 0) { 3848 /* handle SI/SO stateful output */ 3849 byte shiftByte; 3850 3851 if (prevLength > 1 && length == 1) { 3852 /* change from double-byte mode to single-byte */ 3853 shiftByte = (byte) UConverterConstants.SI; 3854 fromUnicodeStatus = 1; 3855 } else if (prevLength == 1 && length > 1) { 3856 /* change from single-byte mode to double-byte */ 3857 shiftByte = (byte) UConverterConstants.SO; 3858 fromUnicodeStatus = 2; 3859 } else { 3860 shiftByte = 0; 3861 } 3862 3863 if (shiftByte != 0) { 3864 /* prepend the shift byte to the result bytes */ 3865 bufferArray[0] = shiftByte; 3866 if (resultArray != bufferArray || resultArrayIndex != bufferArrayIndex + 1) { 3867 System.arraycopy(resultArray, resultArrayIndex, bufferArray, bufferArrayIndex + 1, length); 3868 } 3869 resultArray = bufferArray; 3870 resultArrayIndex = bufferArrayIndex; 3871 ++length; 3872 } 3873 } 3874 3875 return fromUWriteBytes(this, resultArray, resultArrayIndex, length, target, offsets, srcIndex); 3876 } 3877 3878 /* 3879 * @return if(U_FAILURE) return the code point for cnv->fromUChar32 else return 0 after output has been written 3880 * to the target 3881 */ fromU(int cp, CharBuffer source, ByteBuffer target, IntBuffer offsets, 
int sourceIndex, int length, boolean flush, CoderResult[] cr)3882 private int fromU(int cp, CharBuffer source, ByteBuffer target, IntBuffer offsets, int sourceIndex, 3883 int length, boolean flush, CoderResult[] cr) { 3884 // ByteBuffer cx; 3885 3886 useSubChar1 = false; 3887 3888 if (sharedData.mbcs.extIndexes != null 3889 && initialMatchFromU(cp, source, target, offsets, sourceIndex, flush, cr)) { 3890 return 0; /* an extension mapping handled the input */ 3891 } 3892 3893 /* GB 18030 */ 3894 if ((options & MBCS_OPTION_GB18030) != 0) { 3895 int[] range; 3896 int i; 3897 3898 for (i = 0; i < gb18030Ranges.length; ++i) { 3899 range = gb18030Ranges[i]; 3900 if (range[0] <= cp && cp <= range[1]) { 3901 /* found the Unicode code point, output the four-byte sequence for it */ 3902 int linear; 3903 byte bytes[] = new byte[4]; 3904 3905 /* get the linear value of the first GB 18030 code in this range */ 3906 linear = range[2] - LINEAR_18030_BASE; 3907 3908 /* add the offset from the beginning of the range */ 3909 linear += (cp - range[0]); 3910 3911 bytes[3] = (byte) (0x30 + linear % 10); 3912 linear /= 10; 3913 bytes[2] = (byte) (0x81 + linear % 126); 3914 linear /= 126; 3915 bytes[1] = (byte) (0x30 + linear % 10); 3916 linear /= 10; 3917 bytes[0] = (byte) (0x81 + linear); 3918 3919 /* output this sequence */ 3920 cr[0] = fromUWriteBytes(this, bytes, 0, 4, target, offsets, sourceIndex); 3921 return 0; 3922 } 3923 } 3924 } 3925 3926 /* no mapping */ 3927 cr[0] = CoderResult.unmappableForLength(length); 3928 return cp; 3929 } 3930 3931 /* 3932 * target<targetLimit; set error code for overflow 3933 */ initialMatchFromU(int cp, CharBuffer source, ByteBuffer target, IntBuffer offsets, int srcIndex, boolean flush, CoderResult[] cr)3934 private boolean initialMatchFromU(int cp, CharBuffer source, ByteBuffer target, IntBuffer offsets, 3935 int srcIndex, boolean flush, CoderResult[] cr) { 3936 int[] value = new int[1]; 3937 int match; 3938 3939 /* try to match */ 3940 match = 
matchFromU(cp, null, 0, 0, source, value, useFallback, flush); 3941 3942 /* reject a match if the result is a single byte for DBCS-only */ 3943 if (match >= 2 3944 && !(FROM_U_GET_LENGTH(value[0]) == 1 && sharedData.mbcs.outputType == MBCS_OUTPUT_DBCS_ONLY)) { 3945 /* advance src pointer for the consumed input */ 3946 source.position(source.position() + match - 2); /* remove 2 for the initial code point */ 3947 3948 /* write result to target */ 3949 cr[0] = writeFromU(value[0], target, offsets, srcIndex); 3950 return true; 3951 } else if (match < 0) { 3952 /* save state for partial match */ 3953 int sArrayIndex; 3954 int j; 3955 3956 /* copy the first code point */ 3957 preFromUFirstCP = cp; 3958 3959 /* now copy the newly consumed input */ 3960 sArrayIndex = source.position(); 3961 match = -match - 2; /* remove 2 for the initial code point */ 3962 for (j = 0; j < match; ++j) { 3963 preFromUArray[j] = source.get(sArrayIndex++); 3964 } 3965 source.position(sArrayIndex); /* same as *src=srcLimit; because we reached the end of input */ 3966 preFromULength = (byte) match; 3967 return true; 3968 } else if (match == 1) { 3969 /* matched, no mapping but request for <subchar1> */ 3970 useSubChar1 = true; 3971 return false; 3972 } else /* match==0 no match */{ 3973 return false; 3974 } 3975 } 3976 cnvMBCSFromUnicodeWithOffsets(CharBuffer source, ByteBuffer target, IntBuffer offsets, boolean flush)3977 CoderResult cnvMBCSFromUnicodeWithOffsets(CharBuffer source, ByteBuffer target, IntBuffer offsets, boolean flush) { 3978 // Just call encodeLoop to remove duplicate code. 3979 return encodeLoop(source, target, offsets, flush); 3980 } 3981 3982 /* 3983 * This version of ucnv_MBCSFromUnicode() is optimized for single-byte codepages that map only to and from the 3984 * BMP. In addition to single-byte/state optimizations, the offset calculations become much easier. 
/*
 * This version of ucnv_MBCSFromUnicode() is optimized for single-byte codepages that map only to and from the
 * BMP. In addition to single-byte/state optimizations, the offset calculations become much easier.
 *
 * Reads UChars from source (starting at its position) and writes one byte per assigned UChar to
 * target. Unassigned code points are passed to fromU() (extension data, GB 18030). Offsets are not
 * written per byte in the fast path; they are filled in in runs whenever an unassigned character
 * is hit and once more at the end.
 *
 * On return source.position() is the first unconsumed UChar and fromUChar32 holds the pending code
 * point (0 if none). Returns UNDERFLOW, OVERFLOW, or the malformed/unmappable result that stopped
 * the loop.
 */
private CoderResult cnvMBCSSingleFromBMPWithOffsets(CharBuffer source, ByteBuffer target, IntBuffer offsets,
        boolean flush) {

    /* one-element array so that helper methods can report a result */
    CoderResult[] cr = { CoderResult.UNDERFLOW };

    int sourceArrayIndex, lastSource;
    int targetCapacity, length;
    char[] table;
    char[] results;

    int c, sourceIndex;
    char value, minValue;

    /* set up the local pointers */
    sourceArrayIndex = source.position();
    targetCapacity = target.remaining();
    table = sharedData.mbcs.fromUnicodeTable;

    if ((options & UConverterConstants.OPTION_SWAP_LFNL) != 0) {
        results = sharedData.mbcs.swapLFNLFromUnicodeChars;
    } else {
        results = sharedData.mbcs.fromUnicodeChars;
    }

    if (useFallback) {
        /* use all roundtrip and fallback results */
        minValue = 0x800;
    } else {
        /* use only roundtrips and fallbacks from private-use characters */
        minValue = 0xc00;
    }

    /* get the converter state from UConverter */
    c = fromUChar32;

    /* sourceIndex=-1 if the current character began in the previous buffer */
    sourceIndex = c == 0 ? 0 : -1;
    lastSource = sourceArrayIndex;

    /*
     * since the conversion here is 1:1 UChar:uint8_t, we need only one counter for the minimum of the
     * sourceLength and targetCapacity
     */
    length = source.limit() - sourceArrayIndex;
    if (length < targetCapacity) {
        targetCapacity = length;
    }

    boolean doloop = true;
    if (c != 0 && targetCapacity > 0) {
        /* a code point is pending from the previous call: look for its trail surrogate */
        SideEffectsSingleBMP x = new SideEffectsSingleBMP(c, sourceArrayIndex);
        doloop = getTrailSingleBMP(source, x, cr);
        c = x.c;
        sourceArrayIndex = x.sourceArrayIndex;
    }

    if (doloop) {
        while (targetCapacity > 0) {
            /*
             * Get a correct Unicode code point: a single UChar for a BMP code point or a matched surrogate pair
             * for a "supplementary code point".
             */
            c = source.get(sourceArrayIndex++);
            /*
             * Do not immediately check for single surrogates: Assume that they are unassigned and check for
             * them in that case. This speeds up the conversion of assigned characters.
             */
            /* convert the Unicode code point in c into codepage bytes */
            value = MBCS_SINGLE_RESULT_FROM_U(table, results, c);

            /* is this code point assigned, or do we use fallbacks? */
            if (value >= minValue) {
                /* assigned, write the output character bytes from value and length */
                /* length==1 */
                /* this is easy because we know that there is enough space */
                target.put((byte) value);
                --targetCapacity;

                /* normal end of conversion: prepare for a new character */
                c = 0;
                continue;
            } else if (!UTF16.isSurrogate((char) c)) {
                /* normal, unassigned BMP character */
            } else if (UTF16.isLeadSurrogate((char) c)) {
                // getTrail:
                SideEffectsSingleBMP x = new SideEffectsSingleBMP(c, sourceArrayIndex);
                doloop = getTrailSingleBMP(source, x, cr);
                c = x.c;
                sourceArrayIndex = x.sourceArrayIndex;
                if (!doloop)
                    break;
            } else {
                /* this is an unmatched trail code unit (2nd surrogate) */
                /* callback(illegal) */
                cr[0] = CoderResult.malformedForLength(1);
                break;
            }

            /* c does not have a mapping */

            /* get the number of code units for c to correctly advance sourceIndex */
            length = UTF16.getCharCount(c);

            /* set offsets since the start or the last extension */
            if (offsets != null) {
                int count = sourceArrayIndex - lastSource;

                /* do not set the offset for this character */
                count -= length;

                while (count > 0) {
                    offsets.put(sourceIndex++);
                    --count;
                }
                /* offsets and sourceIndex are now set for the current character */
            }

            /* try an extension mapping; fromU() reads and advances source via its position */
            lastSource = sourceArrayIndex;
            source.position(sourceArrayIndex);
            c = fromU(c, source, target, offsets, sourceIndex, length, flush, cr);
            sourceArrayIndex = source.position();
            /* skip the current character plus whatever extra input the extension match consumed */
            sourceIndex += length + (sourceArrayIndex - lastSource);
            lastSource = sourceArrayIndex;

            if (cr[0].isError()) {
                /* not mappable or buffer overflow */
                break;
            } else {
                /* a mapping was written to the target, continue */

                /* recalculate the targetCapacity after an extension mapping */
                targetCapacity = target.remaining();
                length = source.limit() - sourceArrayIndex;
                if (length < targetCapacity) {
                    targetCapacity = length;
                }
            }
        }
    }

    /* NOTE(review): this assignment also replaces an error result set above in that situation */
    if (sourceArrayIndex < source.limit() && !target.hasRemaining()) {
        /* target is full */
        cr[0] = CoderResult.OVERFLOW;
    }

    /* set offsets since the start or the last callback */
    if (offsets != null) {
        int count = sourceArrayIndex - lastSource;
        while (count > 0) {
            offsets.put(sourceIndex++);
            --count;
        }
    }

    /* set the converter state back into UConverter */
    fromUChar32 = c;

    /* write back the updated pointers */
    source.position(sourceArrayIndex);

    return cr[0];
}
/*
 * This version of ucnv_MBCSFromUnicodeWithOffsets() is optimized for single-byte codepages.
 *
 * Converts UChars from source into single bytes in target, writing one offset per output byte when
 * offsets is non-null. Lead surrogates are combined with their trail via getTrailDouble();
 * unassigned code points go through unassignedDouble(), which tries the extension data.
 *
 * The doread flag tells the loop whether it must fetch the next UChar (true) or whether c already
 * holds a code point that still has to be looked up (false).
 *
 * On return source.position() is the first unconsumed UChar and fromUChar32 holds the pending code
 * point (0 if none).
 */
private CoderResult cnvMBCSSingleFromUnicodeWithOffsets(CharBuffer source, ByteBuffer target,
        IntBuffer offsets, boolean flush) {

    /* one-element array so that helper methods can report a result */
    CoderResult[] cr = { CoderResult.UNDERFLOW };

    int sourceArrayIndex;

    char[] table;
    char[] results;

    int c;
    /* sourceIndex: offset of the current character; nextSourceIndex: offset of the next one */
    int sourceIndex, nextSourceIndex;

    char value, minValue;

    /* set up the local pointers */
    short uniMask;
    sourceArrayIndex = source.position();

    table = sharedData.mbcs.fromUnicodeTable;

    if ((options & UConverterConstants.OPTION_SWAP_LFNL) != 0) {
        results = sharedData.mbcs.swapLFNLFromUnicodeChars;
    } else {
        results = sharedData.mbcs.fromUnicodeChars;
    }

    if (useFallback) {
        /* use all roundtrip and fallback results */
        minValue = 0x800;
    } else {
        /* use only roundtrips and fallbacks from private-use characters */
        minValue = 0xc00;
    }
    // agljport:comment hasSupplementary only used in getTrail block which now simply repeats the mask operation
    uniMask = sharedData.mbcs.unicodeMask;

    /* get the converter state from UConverter */
    c = fromUChar32;

    /* sourceIndex=-1 if the current character began in the previous buffer */
    sourceIndex = c == 0 ? 0 : -1;
    nextSourceIndex = 0;

    boolean doloop = true;
    boolean doread = true;
    if (c != 0 && target.hasRemaining()) {
        if (UTF16.isLeadSurrogate((char) c)) {
            /* pending lead surrogate from the previous call: try to complete the pair */
            SideEffectsDouble x = new SideEffectsDouble(c, sourceArrayIndex, sourceIndex, nextSourceIndex);
            doloop = getTrailDouble(source, target, uniMask, x, flush, cr);
            doread = x.doread;
            c = x.c;
            sourceArrayIndex = x.sourceArrayIndex;
            sourceIndex = x.sourceIndex;
            nextSourceIndex = x.nextSourceIndex;
        } else {
            /* pending non-surrogate code point: look it up without reading new input */
            doread = false;
        }
    }

    if (doloop) {
        while (!doread || sourceArrayIndex < source.limit()) {
            /*
             * This following test is to see if available input would overflow the output. It does not catch
             * output of more than one byte that overflows as a result of a multi-byte character or callback
             * output from the last source character. Therefore, those situations also test for overflows and
             * will then break the loop, too.
             */
            if (target.hasRemaining()) {
                /*
                 * Get a correct Unicode code point: a single UChar for a BMP code point or a matched surrogate
                 * pair for a "supplementary code point".
                 */

                if (doread) {
                    c = source.get(sourceArrayIndex++);
                    ++nextSourceIndex;
                    if (UTF16.isSurrogate((char) c)) {
                        if (UTF16.isLeadSurrogate((char) c)) {
                            // getTrail:
                            SideEffectsDouble x = new SideEffectsDouble(c, sourceArrayIndex, sourceIndex,
                                    nextSourceIndex);
                            doloop = getTrailDouble(source, target, uniMask, x, flush, cr);
                            c = x.c;
                            sourceArrayIndex = x.sourceArrayIndex;
                            sourceIndex = x.sourceIndex;
                            nextSourceIndex = x.nextSourceIndex;
                            /*
                             * x.doread==true: the helper is done with this character (handled, error,
                             * or out of input); x.doread==false: c is a supplementary code point that
                             * still needs the table lookup below
                             */
                            if (x.doread) {
                                if (doloop)
                                    continue;
                                else
                                    break;
                            }
                        } else {
                            /* this is an unmatched trail code unit (2nd surrogate) */
                            /* callback(illegal) */
                            cr[0] = CoderResult.malformedForLength(1);
                            break;
                        }
                    }
                } else {
                    /* c was supplied from the saved state; read normally from the next round on */
                    doread = true;
                }

                /* convert the Unicode code point in c into codepage bytes */
                value = MBCS_SINGLE_RESULT_FROM_U(table, results, c);

                /* is this code point assigned, or do we use fallbacks? */
                if (value >= minValue) {
                    /* assigned, write the output character bytes from value and length */
                    /* length==1 */
                    /* this is easy because we know that there is enough space */
                    target.put((byte) value);
                    if (offsets != null) {
                        offsets.put(sourceIndex);
                    }

                    /* normal end of conversion: prepare for a new character */
                    c = 0;
                    sourceIndex = nextSourceIndex;
                } else { /* unassigned */
                    /* try an extension mapping */
                    SideEffectsDouble x = new SideEffectsDouble(c, sourceArrayIndex, sourceIndex,
                            nextSourceIndex);
                    doloop = unassignedDouble(source, target, x, flush, cr);
                    c = x.c;
                    sourceArrayIndex = x.sourceArrayIndex;
                    sourceIndex = x.sourceIndex;
                    nextSourceIndex = x.nextSourceIndex;
                    if (!doloop)
                        break;
                }
            } else {
                /* target is full */
                cr[0] = CoderResult.OVERFLOW;
                break;
            }
        }
    }

    /* set the converter state back into UConverter */
    fromUChar32 = c;

    /* write back the updated pointers */
    source.position(sourceArrayIndex);

    return cr[0];
}
*/ cnvMBCSDoubleFromUnicodeWithOffsets(CharBuffer source, ByteBuffer target, IntBuffer offsets, boolean flush)4301 private CoderResult cnvMBCSDoubleFromUnicodeWithOffsets(CharBuffer source, ByteBuffer target, 4302 IntBuffer offsets, boolean flush) { 4303 CoderResult[] cr = { CoderResult.UNDERFLOW }; 4304 4305 int sourceArrayIndex; 4306 4307 char[] table; 4308 char[] chars; 4309 4310 int c, sourceIndex, nextSourceIndex; 4311 4312 int stage2Entry; 4313 int value; 4314 int length; 4315 short uniMask; 4316 4317 /* use optimized function if possible */ 4318 uniMask = sharedData.mbcs.unicodeMask; 4319 4320 /* set up the local pointers */ 4321 sourceArrayIndex = source.position(); 4322 4323 table = sharedData.mbcs.fromUnicodeTable; 4324 int[] tableInts = sharedData.mbcs.fromUnicodeTableInts; 4325 4326 if ((options & UConverterConstants.OPTION_SWAP_LFNL) != 0) { 4327 chars = sharedData.mbcs.swapLFNLFromUnicodeChars; 4328 } else { 4329 chars = sharedData.mbcs.fromUnicodeChars; 4330 } 4331 4332 /* get the converter state from UConverter */ 4333 c = fromUChar32; 4334 4335 /* sourceIndex=-1 if the current character began in the previous buffer */ 4336 sourceIndex = c == 0 ? 0 : -1; 4337 nextSourceIndex = 0; 4338 4339 /* conversion loop */ 4340 boolean doloop = true; 4341 boolean doread = true; 4342 if (c != 0 && target.hasRemaining()) { 4343 if (UTF16.isLeadSurrogate((char) c)) { 4344 SideEffectsDouble x = new SideEffectsDouble(c, sourceArrayIndex, sourceIndex, nextSourceIndex); 4345 doloop = getTrailDouble(source, target, uniMask, x, flush, cr); 4346 doread = x.doread; 4347 c = x.c; 4348 sourceArrayIndex = x.sourceArrayIndex; 4349 sourceIndex = x.sourceIndex; 4350 nextSourceIndex = x.nextSourceIndex; 4351 } else { 4352 doread = false; 4353 } 4354 } 4355 4356 if (doloop) { 4357 while (!doread || sourceArrayIndex < source.limit()) { 4358 /* 4359 * This following test is to see if available input would overflow the output. 
It does not catch 4360 * output of more than one byte that overflows as a result of a multi-byte character or callback 4361 * output from the last source character. Therefore, those situations also test for overflows and 4362 * will then break the loop, too. 4363 */ 4364 if (target.hasRemaining()) { 4365 if (doread) { 4366 /* 4367 * Get a correct Unicode code point: a single UChar for a BMP code point or a matched 4368 * surrogate pair for a "supplementary code point". 4369 */ 4370 c = source.get(sourceArrayIndex++); 4371 ++nextSourceIndex; 4372 /* 4373 * This also tests if the codepage maps single surrogates. If it does, then surrogates are 4374 * not paired but mapped separately. Note that in this case unmatched surrogates are not 4375 * detected. 4376 */ 4377 if (UTF16.isSurrogate((char) c) && (uniMask & UConverterConstants.HAS_SURROGATES) == 0) { 4378 if (UTF16.isLeadSurrogate((char) c)) { 4379 // getTrail: 4380 SideEffectsDouble x = new SideEffectsDouble(c, sourceArrayIndex, sourceIndex, 4381 nextSourceIndex); 4382 doloop = getTrailDouble(source, target, uniMask, x, flush, cr); 4383 c = x.c; 4384 sourceArrayIndex = x.sourceArrayIndex; 4385 sourceIndex = x.sourceIndex; 4386 nextSourceIndex = x.nextSourceIndex; 4387 4388 if (x.doread) { 4389 if (doloop) 4390 continue; 4391 else 4392 break; 4393 } 4394 } else { 4395 /* this is an unmatched trail code unit (2nd surrogate) */ 4396 /* callback(illegal) */ 4397 cr[0] = CoderResult.malformedForLength(1); 4398 break; 4399 } 4400 } 4401 } else { 4402 doread = true; 4403 } 4404 4405 /* convert the Unicode code point in c into codepage bytes */ 4406 stage2Entry = MBCS_STAGE_2_FROM_U(table, tableInts, c); 4407 4408 /* get the bytes and the length for the output */ 4409 /* MBCS_OUTPUT_2 */ 4410 value = MBCS_VALUE_2_FROM_STAGE_2(chars, stage2Entry, c); 4411 if (value <= 0xff) { 4412 length = 1; 4413 } else { 4414 length = 2; 4415 } 4416 4417 /* is this code point assigned, or do we use fallbacks? 
*/ 4418 if (!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) || (isFromUUseFallback(c) && value != 0))) { 4419 /* 4420 * We allow a 0 byte output if the "assigned" bit is set for this entry. There is no way 4421 * with this data structure for fallback output to be a zero byte. 4422 */ 4423 4424 // unassigned: 4425 SideEffectsDouble x = new SideEffectsDouble(c, sourceArrayIndex, sourceIndex, 4426 nextSourceIndex); 4427 4428 doloop = unassignedDouble(source, target, x, flush, cr); 4429 c = x.c; 4430 sourceArrayIndex = x.sourceArrayIndex; 4431 sourceIndex = x.sourceIndex; 4432 nextSourceIndex = x.nextSourceIndex; 4433 if (doloop) 4434 continue; 4435 else 4436 break; 4437 } 4438 4439 /* write the output character bytes from value and length */ 4440 /* from the first if in the loop we know that targetCapacity>0 */ 4441 if (length == 1) { 4442 /* this is easy because we know that there is enough space */ 4443 target.put((byte) value); 4444 if (offsets != null) { 4445 offsets.put(sourceIndex); 4446 } 4447 } else /* length==2 */{ 4448 target.put((byte) (value >>> 8)); 4449 if (2 <= target.remaining()) { 4450 target.put((byte) value); 4451 if (offsets != null) { 4452 offsets.put(sourceIndex); 4453 offsets.put(sourceIndex); 4454 } 4455 } else { 4456 if (offsets != null) { 4457 offsets.put(sourceIndex); 4458 } 4459 errorBuffer[0] = (byte) value; 4460 errorBufferLength = 1; 4461 4462 /* target overflow */ 4463 cr[0] = CoderResult.OVERFLOW; 4464 c = 0; 4465 break; 4466 } 4467 } 4468 4469 /* normal end of conversion: prepare for a new character */ 4470 c = 0; 4471 sourceIndex = nextSourceIndex; 4472 continue; 4473 } else { 4474 /* target is full */ 4475 cr[0] = CoderResult.OVERFLOW; 4476 break; 4477 } 4478 } 4479 } 4480 4481 /* set the converter state back into UConverter */ 4482 fromUChar32 = c; 4483 4484 /* write back the updated pointers */ 4485 source.position(sourceArrayIndex); 4486 4487 return cr[0]; 4488 } 4489 4490 private final class SideEffectsSingleBMP { 4491 int c, 
sourceArrayIndex; 4492 SideEffectsSingleBMP(int c_, int sourceArrayIndex_)4493 SideEffectsSingleBMP(int c_, int sourceArrayIndex_) { 4494 c = c_; 4495 sourceArrayIndex = sourceArrayIndex_; 4496 } 4497 } 4498 4499 // function made out of block labeled getTrail in ucnv_MBCSSingleFromUnicodeWithOffsets 4500 // assumes input c is lead surrogate getTrailSingleBMP(CharBuffer source, SideEffectsSingleBMP x, CoderResult[] cr)4501 private final boolean getTrailSingleBMP(CharBuffer source, SideEffectsSingleBMP x, CoderResult[] cr) { 4502 if (x.sourceArrayIndex < source.limit()) { 4503 /* test the following code unit */ 4504 char trail = source.get(x.sourceArrayIndex); 4505 if (UTF16.isTrailSurrogate(trail)) { 4506 ++x.sourceArrayIndex; 4507 x.c = UCharacter.getCodePoint((char) x.c, trail); 4508 /* this codepage does not map supplementary code points */ 4509 /* callback(unassigned) */ 4510 cr[0] = CoderResult.unmappableForLength(2); 4511 return false; 4512 } else { 4513 /* this is an unmatched lead code unit (1st surrogate) */ 4514 /* callback(illegal) */ 4515 cr[0] = CoderResult.malformedForLength(1); 4516 return false; 4517 } 4518 } else { 4519 /* no more input */ 4520 return false; 4521 } 4522 // return true; 4523 } 4524 4525 private final class SideEffects { 4526 int c, sourceArrayIndex, sourceIndex, nextSourceIndex, prevSourceIndex, prevLength; 4527 boolean doread = true; 4528 SideEffects(int c_, int sourceArrayIndex_, int sourceIndex_, int nextSourceIndex_, int prevSourceIndex_, int prevLength_)4529 SideEffects(int c_, int sourceArrayIndex_, int sourceIndex_, int nextSourceIndex_, int prevSourceIndex_, 4530 int prevLength_) { 4531 c = c_; 4532 sourceArrayIndex = sourceArrayIndex_; 4533 sourceIndex = sourceIndex_; 4534 nextSourceIndex = nextSourceIndex_; 4535 prevSourceIndex = prevSourceIndex_; 4536 prevLength = prevLength_; 4537 } 4538 } 4539 4540 // function made out of block labeled getTrail in ucnv_MBCSFromUnicodeWithOffsets 4541 // assumes input c is lead surrogate 
/**
 * Tries to pair the lead surrogate held in {@code x.c} with the next source code unit.
 * Function made out of the block labeled getTrail in ucnv_MBCSFromUnicodeWithOffsets.
 *
 * <p>On a matching trail surrogate, {@code x.c} becomes the supplementary code point and
 * {@code x.sourceArrayIndex}/{@code x.nextSourceIndex} are advanced by one. If {@code uniMask}
 * lacks {@code HAS_SUPPLEMENTARY}, the code point is handed straight to
 * {@link #unassigned} and {@code x.doread} is set to {@code true}; otherwise
 * {@code x.doread} is set to {@code false}.
 *
 * @param source  input characters; read at {@code x.sourceArrayIndex}
 * @param target  output bytes, passed through to {@link #unassigned}
 * @param uniMask unicode mask of the converter; only {@code HAS_SUPPLEMENTARY} is tested
 * @param x       mutable conversion state shared with the caller
 * @param flush   passed through to {@link #unassigned}
 * @param cr      one-element result holder; set to a malformed-input result of length 1
 *                when the next code unit is not a trail surrogate
 * @return {@code false} if there is no more input, if the lead surrogate is unmatched, or if
 *         {@link #unassigned} returned {@code false}; {@code true} otherwise
 */
private final boolean getTrail(CharBuffer source, ByteBuffer target, int uniMask, SideEffects x,
        boolean flush, CoderResult[] cr) {
    if (x.sourceArrayIndex >= source.limit()) {
        /* no more input */
        return false;
    }

    /* test the following code unit */
    char trail = source.get(x.sourceArrayIndex);
    if (!UTF16.isTrailSurrogate(trail)) {
        /* this is an unmatched lead code unit (1st surrogate) */
        /* callback(illegal) */
        cr[0] = CoderResult.malformedForLength(1);
        return false;
    }

    ++x.sourceArrayIndex;
    ++x.nextSourceIndex;
    /* convert this supplementary code point */
    x.c = UCharacter.getCodePoint((char) x.c, trail);

    if ((uniMask & UConverterConstants.HAS_SUPPLEMENTARY) != 0) {
        x.doread = false;
        return true;
    }

    /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
    fromUnicodeStatus = x.prevLength; /* save the old state */
    /* callback(unassigned) */
    x.doread = true;
    return unassigned(source, target, null, x, flush, cr);
}

/**
 * Tries an extension mapping for the code point in {@code x.c}.
 * Function made out of the block labeled unassigned in ucnv_MBCSFromUnicodeWithOffsets.
 *
 * <p>Positions {@code source} at {@code x.sourceArrayIndex}, calls {@code fromU}, then updates
 * {@code x.c}, {@code x.sourceArrayIndex}, {@code x.nextSourceIndex} and {@code x.prevLength}
 * from the outcome. Note that {@code fromU} is always called with {@code null} offsets;
 * the {@code offsets} argument here only decides whether the source indexes in {@code x}
 * are rolled forward on success.
 *
 * @return {@code false} if {@code cr[0]} reports an error (not mappable or buffer overflow),
 *         {@code true} if a mapping was written and conversion can continue
 */
private final boolean unassigned(CharBuffer source, ByteBuffer target, IntBuffer offsets, SideEffects x,
        boolean flush, CoderResult[] cr) {
    /* try an extension mapping */
    int sourceBegin = x.sourceArrayIndex;
    source.position(x.sourceArrayIndex);
    x.c = fromU(x.c, source, target, null, x.sourceIndex, x.nextSourceIndex, flush, cr);
    x.sourceArrayIndex = source.position();
    x.nextSourceIndex += x.sourceArrayIndex - sourceBegin;
    x.prevLength = fromUnicodeStatus;

    if (cr[0].isError()) {
        /* not mappable or buffer overflow */
        return false;
    }

    /* a mapping was written to the target, continue */
    /* normal end of conversion: prepare for a new character */
    if (offsets != null) {
        x.prevSourceIndex = x.sourceIndex;
        x.sourceIndex = x.nextSourceIndex;
    }
    return true;
}

/**
 * Mutable state passed between the double-byte conversion loop and its helper methods
 * ({@link #getTrailDouble} and {@link #unassignedDouble}), standing in for the local
 * variables that the helpers need to modify.
 */
private final class SideEffectsDouble {
    int c, sourceArrayIndex, sourceIndex, nextSourceIndex;
    boolean doread = true;

    SideEffectsDouble(int c_, int sourceArrayIndex_, int sourceIndex_, int nextSourceIndex_) {
        c = c_;
        sourceArrayIndex = sourceArrayIndex_;
        sourceIndex = sourceIndex_;
        nextSourceIndex = nextSourceIndex_;
    }
}

/**
 * Double-byte counterpart of {@link #getTrail}.
 * Function made out of the block labeled getTrail in ucnv_MBCSDoubleFromUnicodeWithOffsets;
 * assumes the input {@code x.c} is a lead surrogate.
 *
 * @param cr one-element result holder; set to a malformed-input result of length 1 when the
 *           next code unit is not a trail surrogate
 * @return {@code false} if there is no more input, if the lead surrogate is unmatched, or if
 *         {@link #unassignedDouble} returned {@code false}; {@code true} otherwise
 */
private final boolean getTrailDouble(CharBuffer source, ByteBuffer target, int uniMask,
        SideEffectsDouble x, boolean flush, CoderResult[] cr) {
    if (x.sourceArrayIndex >= source.limit()) {
        /* no more input */
        return false;
    }

    /* test the following code unit */
    char trail = source.get(x.sourceArrayIndex);
    if (!UTF16.isTrailSurrogate(trail)) {
        /* this is an unmatched lead code unit (1st surrogate) */
        /* callback(illegal) */
        cr[0] = CoderResult.malformedForLength(1);
        return false;
    }

    ++x.sourceArrayIndex;
    ++x.nextSourceIndex;
    /* convert this supplementary code point */
    x.c = UCharacter.getCodePoint((char) x.c, trail);

    if ((uniMask & UConverterConstants.HAS_SUPPLEMENTARY) != 0) {
        x.doread = false;
        return true;
    }

    /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
    /* callback(unassigned) */
    x.doread = true;
    return unassignedDouble(source, target, x, flush, cr);
}

/**
 * Double-byte counterpart of {@link #unassigned}: tries an extension mapping for {@code x.c}.
 * Function made out of the block labeled unassigned in ucnv_MBCSDoubleFromUnicodeWithOffsets.
 *
 * @return {@code false} if {@code cr[0]} reports an error (not mappable or buffer overflow),
 *         {@code true} if a mapping was written and conversion can continue
 */
private final boolean unassignedDouble(CharBuffer source, ByteBuffer target, SideEffectsDouble x,
        boolean flush, CoderResult[] cr) {
    /* try an extension mapping */
    int sourceBegin = x.sourceArrayIndex;
    source.position(x.sourceArrayIndex);
    x.c = fromU(x.c, source, target, null, x.sourceIndex, x.nextSourceIndex, flush, cr);
    x.sourceArrayIndex = source.position();
    x.nextSourceIndex += x.sourceArrayIndex - sourceBegin;

    if (cr[0].isError()) {
        /* not mappable or buffer overflow */
        return false;
    }

    /* a mapping was written to the target, continue */
    /* normal end of conversion: prepare for a new character */
    x.sourceIndex = x.nextSourceIndex;
    return true;
}

/**
 * Overrides the super class method: writes the substitution bytes for an unmappable
 * character to {@code target}.
 *
 * <p>{@code subChar1} is chosen when it is set (not 0) and either the converter has extension
 * data and {@code encoder.useSubChar1} is set, or it has no extension data and the first
 * invalid code unit is at most U+00ff; in every other case {@code subChar} is used. For
 * {@code MBCS_OUTPUT_2_SISO} converters an SI/SO byte is prepended when the substitution
 * length does not match the current mode kept in {@code encoder.fromUnicodeStatus}.
 *
 * @param encoder the encoder performing the conversion; its {@code useSubChar1} flag is reset
 * @param source  the source buffer; only its position is read
 * @param target  the buffer that receives the substitution bytes
 * @param offsets optional offsets buffer, passed through to {@code fromUWriteBytes}
 * @return the result of {@code CharsetEncoderICU.fromUWriteBytes}
 * @throws IllegalArgumentException for a SI/SO converter whose substitution length is
 *         neither 1 nor 2
 */
@Override
protected CoderResult cbFromUWriteSub(CharsetEncoderICU encoder, CharBuffer source, ByteBuffer target,
        IntBuffer offsets) {
    CharsetMBCS cs = (CharsetMBCS) encoder.charset();
    byte[] subchar;
    int length;

    if (cs.subChar1 != 0
            && (cs.sharedData.mbcs.extIndexes != null ? encoder.useSubChar1
                    : (encoder.invalidUCharBuffer[0] <= 0xff))) {
        /*
         * select subChar1 if it is set (not 0) and the unmappable Unicode code point is up to U+00ff (IBM MBCS
         * behavior)
         */
        subchar = new byte[] { cs.subChar1 };
        length = 1;
    } else {
        /* select subChar in all other cases */
        subchar = cs.subChar;
        length = cs.subCharLen;
    }

    /* reset the selector for the next code point */
    encoder.useSubChar1 = false;

    if (cs.sharedData.mbcs.outputType == MBCS_OUTPUT_2_SISO) {
        byte[] buffer = new byte[4];
        int i = 0;

        /* fromUnicodeStatus contains prevLength */
        switch (length) {
        case 1:
            if (encoder.fromUnicodeStatus == 2) {
                /* DBCS mode and SBCS sub char: change to SBCS */
                encoder.fromUnicodeStatus = 1;
                buffer[i++] = UConverterConstants.SI;
            }
            buffer[i++] = subchar[0];
            break;
        case 2:
            if (encoder.fromUnicodeStatus <= 1) {
                /* SBCS mode and DBCS sub char: change to DBCS */
                encoder.fromUnicodeStatus = 2;
                buffer[i++] = UConverterConstants.SO;
            }
            buffer[i++] = subchar[0];
            buffer[i++] = subchar[1];
            break;
        default:
            throw new IllegalArgumentException();
        }

        subchar = buffer;
        length = i;
    }
    return CharsetEncoderICU.fromUWriteBytes(encoder, subchar, 0, length, target, offsets, source.position());
}

/**
 * Gets called whenever CharsetEncoder.replaceWith gets called. allowReplacementChanges only allows subChar and
 * subChar1 to be modified outside construction (since replaceWith is called once during construction).
 *
 * <p>When changes are allowed, the replacement is copied into the charset's {@code subChar}
 * array, {@code subCharLen} is set to its length and {@code subChar1} is cleared.
 *
 * @param replacement
 *            The replacement for subchar.
 */
@Override
protected void implReplaceWith(byte[] replacement) {
    if (allowReplacementChanges) {
        CharsetMBCS cs = (CharsetMBCS) this.charset();

        System.arraycopy(replacement, 0, cs.subChar, 0, replacement.length);
        cs.subCharLen = (byte) replacement.length;
        cs.subChar1 = 0;
    }
}
} // closes the enclosing encoder class, whose header lies above this section

@Override
public CharsetDecoder newDecoder() {
    return new CharsetDecoderMBCS(this);
}

@Override
public CharsetEncoder newEncoder() {
    return new CharsetEncoderMBCS(this);
}

/**
 * Adds to {@code setFillIn} the code points that the from-Unicode tables of {@code data} can
 * convert, restricted by {@code filter}, and then delegates to {@code extGetUnicodeSet} for
 * the extension table.
 *
 * <p>The stage 1/2/3 tables are walked in code point order; {@code c} tracks the current code
 * point and is advanced by 1024 for an empty stage 2 block and by 16 for an empty stage 3
 * block. An unrecognized {@code filter} value makes the method return immediately, without
 * consulting the extension table.
 *
 * @param data      shared converter data whose {@code mbcs} table is enumerated
 * @param setFillIn set that receives the convertible code points
 * @param which     {@code ROUNDTRIP_SET} for roundtrip mappings only, or
 *                  {@code ROUNDTRIP_AND_FALLBACK_SET} to include fallbacks as well
 * @param filter    one of the {@code UCNV_SET_FILTER_*} constants
 */
@SuppressWarnings("fallthrough")
void MBCSGetFilteredUnicodeSetForUnicode(UConverterSharedData data, UnicodeSet setFillIn, int which, int filter) {
    UConverterMBCSTable mbcsTable = data.mbcs;
    char[] table = mbcsTable.fromUnicodeTable;
    char st1, maxStage1, st2;
    int st3;
    int c = 0; /* keep track of current code point while enumerating */

    if (mbcsTable.hasSupplementary()) {
        maxStage1 = 0x440;
    } else {
        maxStage1 = 0x40;
    }

    if (mbcsTable.outputType == MBCS_OUTPUT_1) {
        char stage2, stage3;
        char minValue;
        char[] results = mbcsTable.fromUnicodeChars;

        if (which == ROUNDTRIP_SET) {
            /* use only roundtrips */
            minValue = 0xf00;
        } else {
            /* use all roundtrip and fallback results */
            minValue = 0x800;
        }
        for (st1 = 0; st1 < maxStage1; ++st1) {
            st2 = table[st1];
            if (st2 > maxStage1) {
                stage2 = st2;
                for (st2 = 0; st2 < 64; ++st2) {
                    st3 = table[stage2 + st2];
                    if (st3 != 0) {
                        /* read the stage 3 block */
                        stage3 = (char) st3;
                        do {
                            if (results[stage3++] >= minValue) {
                                setFillIn.add(c);
                            }
                        } while ((++c & 0xf) != 0);
                    } else {
                        c += 16; /* empty stage 3 block */
                    }
                }
            } else {
                c += 1024; /* empty stage 2 block */
            }
        }
    } else {
        int[] tableInts = mbcsTable.fromUnicodeTableInts;
        int stage2, stage3;
        byte[] bytes = mbcsTable.fromUnicodeBytes;
        char[] chars = mbcsTable.fromUnicodeChars;
        int[] ints = mbcsTable.fromUnicodeInts;
        int st3Multiplier;
        int value;
        boolean useFallBack = (which == ROUNDTRIP_AND_FALLBACK_SET);

        /* number of result bytes stored per code point in stage 3 */
        switch (mbcsTable.outputType) {
        case MBCS_OUTPUT_3:
        case MBCS_OUTPUT_4_EUC:
            st3Multiplier = 3;
            break;
        case MBCS_OUTPUT_4:
            st3Multiplier = 4;
            break;
        default:
            st3Multiplier = 2;
            break;
        }

        for (st1 = 0; st1 < maxStage1; ++st1) {
            st2 = table[st1];
            if (st2 > (maxStage1 >> 1)) {
                stage2 = st2;
                for (st2 = 0; st2 < 64; ++st2) {
                    /* read the stage 3 block */
                    st3 = tableInts[stage2 + st2];
                    if (st3 != 0) {
                        /* byte offset of the first result of this 16-code-point block */
                        stage3 = st3Multiplier * 16 * (st3 & UConverterConstants.UNSIGNED_SHORT_MASK);

                        /* get the roundtrip flags for the stage 3 block */
                        st3 >>>= 16;
                        switch (filter) {
                        case UCNV_SET_FILTER_NONE:
                            do {
                                if ((st3 & 1) != 0) {
                                    setFillIn.add(c);
                                } else if (useFallBack) {
                                    int b = 0;
                                    switch (st3Multiplier) {
                                    case 4:
                                        b = ints[stage3 / 4];
                                        break;
                                    case 3:
                                        b |= bytes[stage3] | bytes[stage3 + 1] | bytes[stage3 + 2];
                                        break;
                                    case 2:
                                        b = chars[stage3 / 2];
                                        break;
                                    default:
                                        break;
                                    }
                                    if (b != 0) {
                                        setFillIn.add(c);
                                    }
                                }
                                /*
                                 * Step to the next code point's result on every iteration, also
                                 * after a roundtrip hit; otherwise later fallback lookups in this
                                 * block would read the result of an earlier code point.
                                 */
                                stage3 += st3Multiplier;
                                st3 >>= 1;
                            } while ((++c & 0xf) != 0);
                            break;
                        case UCNV_SET_FILTER_DBCS_ONLY:
                            /* Ignore single bytes results (<0x100). */
                            do {
                                if (((st3 & 1) != 0 || useFallBack) && chars[stage3 / 2] >= 0x100) {
                                    setFillIn.add(c);
                                }
                                st3 >>= 1;
                                stage3 += 2;
                            } while ((++c & 0xf) != 0);
                            break;
                        case UCNV_SET_FILTER_2022_CN:
                            /* only add code points that map to CNS 11643 planes 1&2 for non-EXT ISO-2022-CN. */
                            do {
                                if (((st3 & 1) != 0 || useFallBack)
                                        && ((value = (UConverterConstants.UNSIGNED_BYTE_MASK & bytes[stage3])) == 0x81
                                                || value == 0x82)) {
                                    setFillIn.add(c);
                                }
                                st3 >>= 1;
                                stage3 += 3;
                            } while ((++c & 0xf) != 0);
                            break;
                        case UCNV_SET_FILTER_SJIS:
                            /* only add code points that map to Shift-JIS codes corresponding to JIS X 0208. */
                            do {
                                if (((st3 & 1) != 0 || useFallBack) && (value = chars[stage3 / 2]) >= 0x8140
                                        && value <= 0xeffc) {
                                    setFillIn.add(c);
                                }
                                st3 >>= 1;
                                stage3 += 2;
                            } while ((++c & 0xf) != 0);
                            break;
                        case UCNV_SET_FILTER_GR94DBCS:
                            /* only add code points that map to ISO 2022 GR 94 DBCS codes */
                            do {
                                if (((st3 & 1) != 0 || useFallBack)
                                        && (UConverterConstants.UNSIGNED_SHORT_MASK
                                                & ((value = chars[stage3 / 2]) - 0xa1a1)) <= (0xfefe - 0xa1a1)
                                        && (UConverterConstants.UNSIGNED_BYTE_MASK & (value - 0xa1)) <= (0xfe - 0xa1)) {
                                    setFillIn.add(c);
                                }
                                st3 >>= 1;
                                stage3 += 2;
                            } while ((++c & 0xf) != 0);
                            break;
                        case UCNV_SET_FILTER_HZ:
                            /* only add code points that are suitable for HZ DBCS */
                            do {
                                if (((st3 & 1) != 0 || useFallBack)
                                        && (UConverterConstants.UNSIGNED_SHORT_MASK
                                                & ((value = chars[stage3 / 2]) - 0xa1a1)) <= (0xfdfe - 0xa1a1)
                                        && (UConverterConstants.UNSIGNED_BYTE_MASK & (value - 0xa1)) <= (0xfe - 0xa1)) {
                                    setFillIn.add(c);
                                }
                                st3 >>= 1;
                                stage3 += 2;
                            } while ((++c & 0xf) != 0);
                            break;
                        default:
                            /* unknown filter: give up without touching the extension table */
                            return;
                        }
                    } else {
                        c += 16; /* empty stage 3 block */
                    }
                }
            } else {
                c += 1024; /* empty stage 2 block */
            }
        }
    }
    extGetUnicodeSet(setFillIn, which, filter, data);
}

extGetUnicodeSetString(ByteBuffer cx,UnicodeSet setFillIn, boolean useFallback, int minLength, int c, char s[],int length,int sectionIndex)4954 static void extGetUnicodeSetString(ByteBuffer cx,UnicodeSet setFillIn, boolean useFallback, 4955 int minLength, int c, char s[],int length,int sectionIndex){ 4956 CharBuffer fromUSectionUChar; 4957 IntBuffer fromUSectionValues; 4958 fromUSectionUChar = (CharBuffer)ARRAY(cx, EXT_FROM_U_UCHARS_INDEX,char.class ); 4959 fromUSectionValues = (IntBuffer)ARRAY(cx, EXT_FROM_U_VALUES_INDEX,int.class ); 4960 int fromUSectionUCharIndex = fromUSectionUChar.position()+sectionIndex; 4961 int fromUSectionValuesIndex = fromUSectionValues.position()+sectionIndex; 4962 int value, i, count; 4963 4964 /* read first pair of the section */ 4965 count = fromUSectionUChar.get(fromUSectionUCharIndex++); 4966 value = fromUSectionValues.get(fromUSectionValuesIndex++); 4967 if(value!=0 && (FROM_U_IS_ROUNDTRIP(value) || useFallback) && FROM_U_GET_LENGTH(value)>=minLength) { 4968 if(c>=0){ 4969 setFillIn.add(c); 4970 } else { 4971 StringBuilder normalizedStringBuilder = new StringBuilder(); 4972 for(int j=0; j<length;j++){ 4973 normalizedStringBuilder.append(s[j]); 4974 } 4975 String normalizedString = normalizedStringBuilder.toString(); 4976 for(int j=0;j<length;j++){ 4977 setFillIn.add(normalizedString); 4978 } 4979 } 4980 } 4981 4982 for(i=0; i<count; ++i){ 4983 s[length] = fromUSectionUChar.get(fromUSectionUCharIndex + i); 4984 value = fromUSectionValues.get(fromUSectionValuesIndex + i); 4985 4986 if(value==0) { 4987 /* no mapping, do nothing */ 4988 } else if (FROM_U_IS_PARTIAL(value)) { 4989 extGetUnicodeSetString( cx, setFillIn, useFallback, minLength, UConverterConstants.U_SENTINEL, s, length+1, 4990 FROM_U_GET_PARTIAL_INDEX(value)); 4991 } else if ((useFallback ? 
(value&FROM_U_RESERVED_MASK)==0:((value&(FROM_U_ROUNDTRIP_FLAG|FROM_U_RESERVED_MASK))==FROM_U_ROUNDTRIP_FLAG)) 4992 && FROM_U_GET_LENGTH(value)>=minLength) { 4993 StringBuilder normalizedStringBuilder = new StringBuilder(); // String for composite characters 4994 for(int j=0; j<(length+1);j++){ 4995 normalizedStringBuilder.append(s[j]); 4996 } 4997 setFillIn.add(normalizedStringBuilder.toString()); 4998 } 4999 } 5000 5001 } 5002 5003 extGetUnicodeSet(UnicodeSet setFillIn, int which, int filter, UConverterSharedData Data)5004 static void extGetUnicodeSet(UnicodeSet setFillIn, int which, int filter, UConverterSharedData Data){ 5005 int st1, stage1Length, st2, st3, minLength; 5006 int ps2, ps3; 5007 5008 CharBuffer stage12, stage3; 5009 int value, length; 5010 IntBuffer stage3b; 5011 boolean useFallback; 5012 char s[] = new char[MAX_UCHARS]; 5013 int c; 5014 ByteBuffer cx = Data.mbcs.extIndexes; 5015 if(cx == null){ 5016 return; 5017 } 5018 stage12 = (CharBuffer)ARRAY(cx, EXT_FROM_U_STAGE_12_INDEX,char.class ); 5019 stage3 = (CharBuffer)ARRAY(cx, EXT_FROM_U_STAGE_3_INDEX,char.class ); 5020 stage3b = (IntBuffer)ARRAY(cx, EXT_FROM_U_STAGE_3B_INDEX,int.class ); 5021 5022 stage1Length = cx.asIntBuffer().get(EXT_FROM_U_STAGE_1_LENGTH); 5023 useFallback = (which==ROUNDTRIP_AND_FALLBACK_SET); 5024 5025 c = 0; 5026 if(filter == UCNV_SET_FILTER_2022_CN) { 5027 minLength = 3; 5028 } else if (Data.mbcs.outputType == MBCS_OUTPUT_DBCS_ONLY || filter != UCNV_SET_FILTER_NONE) { 5029 /* DBCS-only, ignore single-byte results */ 5030 minLength = 2; 5031 } else { 5032 minLength = 1; 5033 } 5034 5035 for(st1=0; st1< stage1Length; ++st1){ 5036 st2 = stage12.get(st1); 5037 if(st2>stage1Length) { 5038 ps2 = st2; 5039 for(st2=0;st2<64;++st2){ 5040 st3=(stage12.get(ps2+st2))<<STAGE_2_LEFT_SHIFT; 5041 if(st3!= 0){ 5042 ps3 = st3; 5043 do { 5044 value = stage3b.get(stage3.get(ps3++)); 5045 if(value==0){ 5046 /* no mapping do nothing */ 5047 }else if (FROM_U_IS_PARTIAL(value)){ 5048 length = 0; 
5049 length=UTF16.append(s, length, c); 5050 extGetUnicodeSetString(cx,setFillIn,useFallback,minLength,c,s,length,FROM_U_GET_PARTIAL_INDEX(value)); 5051 } else if ((useFallback ? (value&FROM_U_RESERVED_MASK)==0 :((value&(FROM_U_ROUNDTRIP_FLAG|FROM_U_RESERVED_MASK))== FROM_U_ROUNDTRIP_FLAG)) && 5052 FROM_U_GET_LENGTH(value)>=minLength){ 5053 5054 switch(filter) { 5055 case UCNV_SET_FILTER_2022_CN: 5056 if(!(FROM_U_GET_LENGTH(value)==3 && FROM_U_GET_DATA(value)<=0x82ffff)){ 5057 continue; 5058 } 5059 break; 5060 case UCNV_SET_FILTER_SJIS: 5061 if(!(FROM_U_GET_LENGTH(value)==2 && (value=FROM_U_GET_DATA(value))>=0x8140 && value<=0xeffc)){ 5062 continue; 5063 } 5064 break; 5065 case UCNV_SET_FILTER_GR94DBCS: 5066 if(!(FROM_U_GET_LENGTH(value)==2 && ((value=FROM_U_GET_DATA(value)) - 0xa1a1)<=(0xfefe - 0xa1a1) 5067 && (UConverterConstants.UNSIGNED_BYTE_MASK & (value - 0xa1))<= (0xfe - 0xa1))){ 5068 continue; 5069 } 5070 break; 5071 case UCNV_SET_FILTER_HZ: 5072 if(!(FROM_U_GET_LENGTH(value)==2 && ((value=FROM_U_GET_DATA(value)) - 0xa1a1)<=(0xfdfe - 0xa1a1) 5073 && (UConverterConstants.UNSIGNED_BYTE_MASK & (value - 0xa1))<= (0xfe - 0xa1))){ 5074 continue; 5075 } 5076 break; 5077 default: 5078 /* 5079 * UCNV_SET_FILTER_NONE, 5080 * or UCNV_SET_FILTER_DBCS_ONLY which is handled via minLength 5081 */ 5082 break; 5083 } 5084 setFillIn.add(c); 5085 5086 } 5087 }while((++c&0xf) != 0); 5088 5089 } else { 5090 c+=16; /* emplty stage3 block */ 5091 } 5092 } 5093 } else { 5094 c+=1024; /* empty stage 2 block*/ 5095 } 5096 } 5097 } 5098 MBCSGetUnicodeSetForUnicode(UConverterSharedData data, UnicodeSet setFillIn, int which)5099 void MBCSGetUnicodeSetForUnicode(UConverterSharedData data, UnicodeSet setFillIn, int which){ 5100 MBCSGetFilteredUnicodeSetForUnicode(data, setFillIn, which, 5101 this.sharedData.mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY ? 
UCNV_SET_FILTER_DBCS_ONLY : UCNV_SET_FILTER_NONE ); 5102 } 5103 5104 @Override getUnicodeSetImpl( UnicodeSet setFillIn, int which)5105 void getUnicodeSetImpl( UnicodeSet setFillIn, int which){ 5106 if((options & MBCS_OPTION_GB18030)!=0){ 5107 setFillIn.add(0, 0xd7ff); 5108 setFillIn.add(0xe000, 0x10ffff); 5109 } 5110 else { 5111 this.MBCSGetUnicodeSetForUnicode(sharedData, setFillIn, which); 5112 } 5113 } 5114 5115 } 5116