// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html#License
/**
 *******************************************************************************
 * Copyright (C) 2006-2013, International Business Machines Corporation and    *
 * others. All Rights Reserved.                                                *
 *******************************************************************************
 *
 *******************************************************************************
 */

package com.ibm.icu.charset;

import java.nio.BufferOverflowException;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.IntBuffer;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CoderResult;
import java.nio.charset.CodingErrorAction;

import com.ibm.icu.impl.Assert;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.UTF16;

/**
 * An abstract class that provides framework methods of encoding operations for concrete
 * subclasses.
 * In the future this class will contain API that will implement converter semantics of ICU4C.
 * @stable ICU 3.6
 */
public abstract class CharsetEncoderICU extends CharsetEncoder {

    /* this is used in fromUnicode DBCS tables as an "unassigned" marker */
    static final char MISSING_CHAR_MARKER = '\uFFFF';

    /* output bytes that did not fit into the target; flushed first by the next encode() call */
    byte[] errorBuffer = new byte[30];

    int errorBufferLength = 0;

    /** these are for encodeLoopICU */
    int fromUnicodeStatus;

    /* pending code point (e.g. an unpaired lead surrogate or the code point of an error); 0 if none */
    int fromUChar32;

    boolean useSubChar1;

    boolean useFallback;

    /* maximum number of indexed UChars */
    static final int EXT_MAX_UCHARS = 19;

    /* store previous UChars/chars to continue partial matches */
    int preFromUFirstCP; /* >=0: partial match */

    char[] preFromUArray = new char[EXT_MAX_UCHARS];

    int preFromUBegin;

    int preFromULength; /* negative: replay */

    /* UTF-16 form of the code point handed to the error callback */
    char[] invalidUCharBuffer = new char[2];

    int invalidUCharLength;

    /* opaque context object passed through to the error callbacks */
    Object fromUContext;

    private CharsetCallback.Encoder onUnmappableInput = CharsetCallback.FROM_U_CALLBACK_STOP;

    private CharsetCallback.Encoder onMalformedInput = CharsetCallback.FROM_U_CALLBACK_STOP;

    /*
     * Dispatches an error to the currently configured callback: the unmappable
     * callback for unmappable results, the malformed callback for everything else.
     */
    CharsetCallback.Encoder fromCharErrorBehaviour = new CharsetCallback.Encoder() {
        @Override
        public CoderResult call(CharsetEncoderICU encoder, Object context,
                CharBuffer source, ByteBuffer target, IntBuffer offsets,
                char[] buffer, int length, int cp, CoderResult cr) {
            if (cr.isUnmappable()) {
                return onUnmappableInput.call(encoder, context, source, target,
                        offsets, buffer, length, cp, cr);
            } else /* if (cr.isMalformed()) */ {
                return onMalformedInput.call(encoder, context, source, target,
                        offsets, buffer, length, cp, cr);
            }
        }
    };

    /*
     * Constructs a new encoder for the given charset.
     * The average bytes-per-char value passed to the superclass is the mean of
     * the charset's minimum and maximum bytes per char.
     *
     * @param cs
     *            for which the encoder is created
     * @param replacement
     *            the substitution bytes
     */
    CharsetEncoderICU(CharsetICU cs, byte[] replacement) {
        super(cs, (cs.minBytesPerChar + cs.maxBytesPerChar) / 2,
                cs.maxBytesPerChar, replacement);
    }

    /**
     * Is this Encoder allowed to use fallbacks? A fallback mapping is a mapping
     * that will convert a Unicode codepoint sequence to a byte sequence, but
     * the encoded byte sequence will round trip convert to a different
     * Unicode codepoint sequence.
     * @return true if the converter uses fallback, false otherwise.
     * @stable ICU 3.8
     */
    public boolean isFallbackUsed() {
        return useFallback;
    }

    /**
     * Sets whether this Encoder can use fallbacks.
     * @param usesFallback true if the user wants the converter to take
     *  advantage of the fallback mapping, false otherwise.
     * @stable ICU 3.8
     */
    public void setFallbackUsed(boolean usesFallback) {
        useFallback = usesFallback;
    }

    /*
     * Use fallbacks from Unicode to codepage when useFallback or for private-use code points
     * @param c A codepoint
     * @return true if a fallback mapping may be used for c
     */
    final boolean isFromUUseFallback(int c) {
        return (useFallback) || isUnicodePrivateUse(c);
    }

    /**
     * Use fallbacks from Unicode to codepage when iUseFallback or for private-use code points
     * @param iUseFallback whether fallbacks are enabled in general
     * @param c A codepoint
     * @return true if a fallback mapping may be used for c
     */
    static final boolean isFromUUseFallback(boolean iUseFallback, int c) {
        return (iUseFallback) || isUnicodePrivateUse(c);
    }

    /*
     * Returns true for U+E000..U+F8FF, U+F0000..U+FFFFD and U+100000..U+10FFFD.
     */
    private static final boolean isUnicodePrivateUse(int c) {
        // First test for U+E000 to optimize for the most common characters.
        return c >= 0xE000 && (c <= 0xF8FF ||
                c >= 0xF0000 && (c <= 0xFFFFD ||
                (c >= 0x100000 && c <= 0x10FFFD)));
    }

    /**
     * Sets the action to be taken if an illegal sequence is encountered.
     * REPLACE selects the substitute callback, IGNORE the skip callback, and
     * any other action the stop callback.
     *
     * @param newAction
     *            action to be taken
     * @stable ICU 3.6
     */
    @Override
    protected void implOnMalformedInput(CodingErrorAction newAction) {
        onMalformedInput = getCallback(newAction);
    }

    /**
     * Sets the action to be taken if an unmappable character is encountered.
     * REPLACE selects the substitute callback, IGNORE the skip callback, and
     * any other action the stop callback.
     *
     * @param newAction
     *            action to be taken
     * @stable ICU 3.6
     */
    @Override
    protected void implOnUnmappableCharacter(CodingErrorAction newAction) {
        onUnmappableInput = getCallback(newAction);
    }

    /**
     * Sets the callback encoder method and context to be used if an illegal sequence is encountered.
     * You would normally call this twice to set both the malformed and unmappable error. In this case,
     * newContext should remain the same since using a different newContext each time will negate the last
     * one used.
     * @param err CoderResult selecting which callback to replace (malformed or unmappable);
     *            any other kind of result leaves both callbacks unchanged
     * @param newCallback CharsetCallback.Encoder
     * @param newContext Object
     * @stable ICU 4.0
     */
    public final void setFromUCallback(CoderResult err, CharsetCallback.Encoder newCallback, Object newContext) {
        if (err.isMalformed()) {
            onMalformedInput = newCallback;
        } else if (err.isUnmappable()) {
            onUnmappableInput = newCallback;
        } else {
            /* Error: Only malformed and unmappable are handled. */
        }

        if (fromUContext == null || !fromUContext.equals(newContext)) {
            setFromUContext(newContext);
        }
    }

    /**
     * Sets fromUContext used in callbacks.
     *
     * @param newContext Object
     * @stable ICU 4.0
     */
    public final void setFromUContext(Object newContext) {
        fromUContext = newContext;
    }

    /* Maps a java.nio coding error action onto the matching ICU callback. */
    private static CharsetCallback.Encoder getCallback(CodingErrorAction action) {
        if (action == CodingErrorAction.REPLACE) {
            return CharsetCallback.FROM_U_CALLBACK_SUBSTITUTE;
        } else if (action == CodingErrorAction.IGNORE) {
            return CharsetCallback.FROM_U_CALLBACK_SKIP;
        } else /* if (action == CodingErrorAction.REPORT) */ {
            return CharsetCallback.FROM_U_CALLBACK_STOP;
        }
    }

    /* zero-capacity source used by implFlush() */
    private static final CharBuffer EMPTY = CharBuffer.allocate(0);

    /**
     * Flushes any characters saved in the converter's internal buffer and
     * resets the converter.
     * @param out the output byte buffer
     * @return result of the flushing action, which completes the encoding of all input.
     *         Returns CoderResult.UNDERFLOW if the action succeeds.
     * @stable ICU 3.6
     */
    @Override
    protected CoderResult implFlush(ByteBuffer out) {
        return encode(EMPTY, out, null, true);
    }

    /**
     * Resets the from Unicode mode of converter
     * @stable ICU 3.6
     */
    @Override
    protected void implReset() {
        errorBufferLength = 0;
        fromUnicodeStatus = 0;
        fromUChar32 = 0;
        fromUnicodeReset();
    }

    /* Clears the partial-match (m:n conversion) state. */
    private void fromUnicodeReset() {
        preFromUBegin = 0;
        preFromUFirstCP = UConverterConstants.U_SENTINEL;
        preFromULength = 0;
    }

    /**
     * Encodes one or more chars. The default behaviour of the
     * converter is stop and report if an error in input stream is encountered.
     * To set different behaviour use @see CharsetEncoder.onMalformedInput()
     * @param in buffer to encode
     * @param out buffer to populate with encoded result
     * @return result of encoding action. Returns CoderResult.UNDERFLOW if the encoding
     *         action succeeds or more input is needed for completing the encoding action.
     * @stable ICU 3.6
     */
    @Override
    protected CoderResult encodeLoop(CharBuffer in, ByteBuffer out) {
        if (!in.hasRemaining() && this.errorBufferLength == 0) { // make sure the errorBuffer is empty
            // The Java framework should have already substituted what was left.
            fromUChar32 = 0;
            return CoderResult.UNDERFLOW;
        }
        // Skip the chars that are already held in the converter state; they were
        // left in the buffer by setSourcePosition() at the end of the previous call.
        in.position(in.position() + fromUCountPending());
        /* do the conversion */
        CoderResult ret = encode(in, out, null, false);
        setSourcePosition(in);
        /* No reset here: the encoder state must be kept for the next call. */
        return ret;
    }

    /*
     * Implements ICU semantics of buffer management
     * @param source
     * @param target
     * @param offsets
     * @param flush
     * @return A CoderResult object that contains the error result when an error occurs.
     */
    abstract CoderResult encodeLoop(CharBuffer source, ByteBuffer target,
            IntBuffer offsets, boolean flush);

    /*
     * Implements ICU semantics for encoding the buffer
     * @param source The input character buffer
     * @param target The output byte buffer
     * @param offsets
     * @param flush true if, and only if, the invoker can provide no
     *       additional input bytes beyond those in the given buffer.
     * @return A CoderResult object that contains the error result when an error occurs.
     * @throws IllegalArgumentException if source or target is null
     */
    final CoderResult encode(CharBuffer source, ByteBuffer target,
            IntBuffer offsets, boolean flush) {

        /* check parameters */
        if (target == null || source == null) {
            throw new IllegalArgumentException();
        }

        /*
         * Make sure that the buffer sizes do not exceed the number range for
         * int32_t because some functions use the size (in units or bytes)
         * rather than comparing pointers, and because offsets are int32_t values.
         *
         * size_t is guaranteed to be unsigned and large enough for the job.
         *
         * Return with an error instead of adjusting the limits because we would
         * not be able to maintain the semantics that either the source must be
         * consumed or the target filled (unless an error occurs).
         * An adjustment would be targetLimit=t+0x7fffffff; for example.
         */

        /* flush the target overflow buffer */
        if (errorBufferLength > 0) {
            byte[] overflowArray = errorBuffer;
            int length = errorBufferLength;
            int i = 0;

            do {
                if (target.remaining() == 0) {
                    /* the overflow buffer contains too much, keep the rest */
                    int j = 0;

                    do {
                        overflowArray[j++] = overflowArray[i++];
                    } while (i < length);

                    errorBufferLength = j;
                    return CoderResult.OVERFLOW;
                }

                /* copy the overflow contents to the target */
                target.put(overflowArray[i++]);
                if (offsets != null) {
                    offsets.put(-1); /* no source index available for old output */
                }
            } while (i < length);

            /* the overflow buffer is completely copied to the target */
            errorBufferLength = 0;
        }

        if (!flush && source.remaining() == 0 && preFromULength >= 0) {
            /* the overflow buffer is emptied and there is no new input: we are done */
            return CoderResult.UNDERFLOW;
        }

        /*
         * Do not simply return with a buffer overflow error if
         * !flush && t==targetLimit
         * because it is possible that the source will not generate any output.
         * For example, the skip callback may be called;
         * it does not output anything.
         */

        return fromUnicodeWithCallback(source, target, offsets, flush);
    }

    /*
     * Copies the stored replay units (preFromULength is negative while a replay
     * is pending) from preFromUArray into replayArray, leaves replayArray
     * positioned to read exactly those units, and clears the pending-replay
     * marker.
     *
     * The buffer is cleared first: after an earlier replay in the same
     * conversion call its position and limit sit at the end of the old data,
     * and a relative put() would otherwise throw BufferOverflowException.
     *
     * @param replayArray scratch buffer with capacity EXT_MAX_UCHARS
     * @return replayArray, ready to be used as the conversion source
     */
    private CharBuffer beginReplay(CharBuffer replayArray) {
        int replayLength = -preFromULength; // preFromULength is negative, see declaration
        replayArray.clear();
        replayArray.put(preFromUArray, 0, replayLength);
        replayArray.flip(); // position = 0, limit = replayLength
        preFromULength = 0;
        return replayArray;
    }

    /*
     * Implementation note for m:n conversions
     *
     * While collecting source units to find the longest match for m:n conversion,
     * some source units may need to be stored for a partial match.
     * When a second buffer does not yield a match on all of the previously stored
     * source units, then they must be "replayed", i.e., fed back into the converter.
     *
     * The code relies on the fact that replaying will not nest -
     * converting a replay buffer will not result in a replay.
     * This is because a replay is necessary only after the _continuation_ of a
     * partial match failed, but a replay buffer is converted as a whole.
     * It may result in some of its units being stored again for a partial match,
     * but there will not be a continuation _during_ the replay which could fail.
     *
     * It is conceivable that a callback function could call the converter
     * recursively in a way that causes another replay to be stored, but that
     * would be an error in the callback function.
     * Such violations will cause assertion failures in a debug build,
     * and wrong output, but they will not cause a crash.
     */
    /*
     * Runs the subclass conversion loop, replays stored partial-match units when
     * necessary, and routes malformed/unmappable results through the configured
     * error callback.
     * @param source The input character buffer
     * @param target The output byte buffer
     * @param offsets
     * @param flush true if no further input will follow
     * @return the final CoderResult of the conversion
     */
    final CoderResult fromUnicodeWithCallback(CharBuffer source,
            ByteBuffer target, IntBuffer offsets, boolean flush) {
        int sBufferIndex;
        int sourceIndex;
        int errorInputLength;
        boolean converterSawEndOfInput, calledCallback;

        /* variables for m:n conversion */
        CharBuffer replayArray = CharBuffer.allocate(EXT_MAX_UCHARS);
        CharBuffer realSource;
        boolean realFlush;

        CoderResult cr = CoderResult.UNDERFLOW;

        sourceIndex = 0;

        if (preFromULength >= 0) {
            /* normal mode */
            realSource = null;
            realFlush = false;
        } else {
            /*
             * Previous m:n conversion stored source units from a partial match
             * and failed to consume all of them.
             * We need to "replay" them from a temporary buffer and convert them first.
             */
            realSource = source;
            realFlush = flush;

            source = beginReplay(replayArray);
            flush = false;
        }

        /*
         * loop for conversion and error handling
         *
         * loop {
         *   convert
         *   loop {
         *     update offsets
         *     handle end of input
         *     handle errors/call callback
         *   }
         * }
         */
        for (;;) {
            /* convert */
            cr = encodeLoop(source, target, offsets, flush);
            /*
             * set a flag for whether the converter
             * successfully processed the end of the input
             *
             * need not check cnv.preFromULength==0 because a replay (<0) will cause
             * s<sourceLimit before converterSawEndOfInput is checked
             */
            converterSawEndOfInput = (cr.isUnderflow() && flush
                    && source.remaining() == 0 && fromUChar32 == 0);

            /* no callback called yet for this iteration */
            calledCallback = false;

            /* no sourceIndex adjustment for conversion, only for callback output */
            errorInputLength = 0;

            /*
             * loop for offsets and error handling
             *
             * iterates at most 3 times:
             * 1. to clean up after the conversion function
             * 2. after the callback
             * 3. after the callback again if there was truncated input
             */
            for (;;) {
                /*
                 * Offsets are currently not being used in ICU4J, so the
                 * offset/sourceIndex update that ICU4C performs at this point
                 * is intentionally not done here.
                 */

                if (preFromULength < 0) {
                    /*
                     * switch the source to new replay units (cannot occur while replaying)
                     * after offset handling and before end-of-input and callback handling
                     */
                    if (realSource == null) {
                        realSource = source;
                        realFlush = flush;

                        if ((sourceIndex += preFromULength) < 0) {
                            sourceIndex = -1;
                        }

                        source = beginReplay(replayArray);
                        flush = false;
                    } else {
                        /*
                         * see implementation note before fromUnicodeWithCallback():
                         * a replay must not be requested while replaying, so this
                         * assertion is expected to fail when this branch is reached
                         */
                        Assert.assrt(realSource == null);
                    }
                }

                /* update pointers */
                sBufferIndex = source.position();
                if (cr.isUnderflow()) {
                    if (sBufferIndex < source.limit()) {
                        /*
                         * continue with the conversion loop while there is still input left
                         * (continue converting by breaking out of only the inner loop)
                         */
                        break;
                    } else if (realSource != null) {
                        /* switch back from replaying to the real source and continue */
                        source = realSource;
                        flush = realFlush;
                        sourceIndex = source.position();
                        realSource = null;
                        break;
                    } else if (flush && fromUChar32 != 0) {
                        /*
                         * the entire input stream is consumed
                         * and there is a partial, truncated input sequence left
                         */

                        /* inject an error and continue with callback handling */
                        cr = CoderResult.malformedForLength(1);
                        calledCallback = false; /* new error condition */
                    } else {
                        /* input consumed */
                        if (flush) {
                            /*
                             * return to the conversion loop once more if the flush
                             * flag is set and the conversion function has not
                             * successfully processed the end of the input yet
                             *
                             * (continue converting by breaking out of only the inner loop)
                             */
                            if (!converterSawEndOfInput) {
                                break;
                            }

                            /* reset the converter without calling the callback function */
                            implReset();
                        }

                        /* done successfully */
                        return cr;
                    }
                }

                /* an error result (or overflow) is pending from here on */
                if (calledCallback || cr.isOverflow()
                        || (!cr.isMalformed() && !cr.isUnmappable())) {
                    /*
                     * the callback did not or cannot resolve the error:
                     * set output pointers and return
                     *
                     * the check for buffer overflow is redundant but it is
                     * a high-runner case and hopefully documents the intent
                     * well
                     *
                     * if we were replaying, then the replay buffer must be
                     * copied back into the converter
                     * and the real arguments must be restored
                     */
                    if (realSource != null) {
                        int length = source.remaining();
                        if (length > 0) {
                            source.get(preFromUArray, 0, length);
                            preFromULength = -length; /* negative: replay on the next call */
                        }
                    }
                    return cr;
                }

                /* callback handling */
                {
                    /* get and write the code point */
                    int codePoint = fromUChar32;
                    errorInputLength = UTF16.append(invalidUCharBuffer, 0,
                            fromUChar32);
                    invalidUCharLength = errorInputLength;

                    /* set the converter state to deal with the next character */
                    fromUChar32 = 0;

                    /* call the callback function */
                    cr = fromCharErrorBehaviour.call(this, fromUContext,
                            source, target, offsets, invalidUCharBuffer,
                            invalidUCharLength, codePoint, cr);
                }

                /*
                 * loop back to the offset handling
                 *
                 * this flag will indicate after offset handling
                 * that a callback was called;
                 * if the callback did not resolve the error, then we return
                 */
                calledCallback = true;
            }
        }
    }

    /*
     * Ascertains if a given Unicode code point (32bit value for handling surrogates)
     * can be converted to the target encoding. If the caller wants to test if a
     * surrogate pair can be converted to target encoding then the
     * responsibility of assembling the int value lies with the caller.
     * For assembling a code point the caller can use UTF16 class of ICU4J and do something like:
     * <pre>
     *  while(i<mySource.length){
     *      if(UTF16.isLeadSurrogate(mySource[i])&& i+1< mySource.length){
     *          if(UTF16.isTrailSurrogate(mySource[i+1])){
     *              int temp = UTF16.charAt(mySource,i,i+1,0);
     *              if(!((CharsetEncoderICU) myConv).canEncode(temp)){
     *                  passed=false;
     *              }
     *              i++;
     *              i++;
     *          }
     *      }
     *  }
     * </pre>
     * or
     * <pre>
     *  String src = new String(mySource);
     *  int i,codepoint;
     *  boolean passed = false;
     *  while(i<src.length()){
     *      codepoint = UTF16.charAt(src,i);
     *      i+= (codepoint>0xffff)? 2:1;
     *      if(!((CharsetEncoderICU) myConv).canEncode(codepoint)){
     *          passed = false;
     *      }
     *  }
     * </pre>
     *
     * @param codepoint Unicode code point as int value
     * @return true if a character can be converted
     */
    /* TODO This is different from Java's canEncode(char) API.
     * ICU's API should implement getUnicodeSet,
     * and override canEncode(char) which queries getUnicodeSet.
     * The getUnicodeSet should return a frozen UnicodeSet or use a fillin parameter, like ICU4C.
     */
    /*public boolean canEncode(int codepoint) {
        return true;
    }*/

    /**
     * Overrides super class method. This implementation accepts every
     * replacement byte sequence.
     * @param repl the replacement byte sequence to test
     * @return always true
     * @stable ICU 3.6
     */
    @Override
    public boolean isLegalReplacement(byte[] repl) {
        return true;
    }

    /*
     * Writes out the specified output bytes to the target byte buffer or to converter internal buffers.
     * Bytes that do not fit into the target are stored in the converter's
     * errorBuffer and OVERFLOW is returned. One offset (sourceIndex) is recorded
     * for each byte that is actually written to the target.
     * @param cnv
     * @param bytesArray
     * @param bytesBegin
     * @param bytesLength
     * @param out
     * @param offsets may be null
     * @param sourceIndex
     * @return A CoderResult object that contains the error result when an error occurs.
     */
    static final CoderResult fromUWriteBytes(CharsetEncoderICU cnv,
            byte[] bytesArray, int bytesBegin, int bytesLength, ByteBuffer out,
            IntBuffer offsets, int sourceIndex) {

        int bytesLimit = bytesBegin + bytesLength;

        /* write as many bytes as the target can take */
        while (bytesBegin < bytesLimit && out.hasRemaining()) {
            out.put(bytesArray[bytesBegin++]);
            if (offsets != null) {
                offsets.put(sourceIndex);
            }
        }

        /* write overflow: keep the rest for the next encode() call */
        int overflowLength = bytesLimit - bytesBegin;
        cnv.errorBufferLength = overflowLength;
        if (overflowLength > 0) {
            System.arraycopy(bytesArray, bytesBegin, cnv.errorBuffer, 0, overflowLength);
            return CoderResult.OVERFLOW;
        }
        return CoderResult.UNDERFLOW;
    }

    /*
     * Returns the number of chars held in the converter's internal state
     * because more input is needed for completing the conversion. This function is
     * useful for mapping semantics of ICU's converter interface to those of iconv,
     * and this information is not needed for normal conversion.
     * @return The number of chars in the state.
     */
    /*public*/int fromUCountPending() {
        if (preFromULength > 0) {
            return UTF16.getCharCount(preFromUFirstCP) + preFromULength;
        } else if (preFromULength < 0) {
            return -preFromULength;
        } else if (fromUChar32 > 0) {
            return 1;
        } else if (preFromUFirstCP > 0) {
            return UTF16.getCharCount(preFromUFirstCP);
        }
        return 0;
    }

    /*
     * Moves the source position back by the number of chars that are still held
     * in the converter state, so that the caller sees them as unconsumed.
     * @param source The input character buffer
     */
    private final void setSourcePosition(CharBuffer source) {

        // ok was there input held in the previous invocation of encodeLoop
        // that resulted in output in this invocation?
        source.position(source.position() - fromUCountPending());
    }

    /*
     * Write the codepage substitution character.
     * Subclasses to override this method.
     * For stateful converters, it is typically necessary to handle this
     * specifically for the converter in order to properly maintain the state.
     * @param encoder The encoder whose replacement is written
     * @param source The input character buffer
     * @param target The output byte buffer
     * @param offsets
     * @return A CoderResult object that contains the error result when an error occurs.
     */
    CoderResult cbFromUWriteSub(CharsetEncoderICU encoder, CharBuffer source,
            ByteBuffer target, IntBuffer offsets) {
        CharsetICU cs = (CharsetICU) encoder.charset();
        byte[] sub = encoder.replacement();
        if (cs.subChar1 != 0 && encoder.invalidUCharBuffer[0] <= 0xff) {
            /* single-byte substitution for an offending char up to U+00FF */
            return CharsetEncoderICU.fromUWriteBytes(encoder,
                    new byte[] { cs.subChar1 }, 0, 1, target, offsets,
                    source.position());
        } else {
            return CharsetEncoderICU.fromUWriteBytes(encoder, sub, 0,
                    sub.length, target, offsets, source.position());
        }
    }

    /*
     * Write the characters to target.
     * @param encoder The encoder used to convert the characters
     * @param source The input character buffer
     * @param target The output byte buffer
     * @param offsets
     * @return A CoderResult object that contains the error result when an error occurs.
     */
    CoderResult cbFromUWriteUChars(CharsetEncoderICU encoder,
            CharBuffer source, ByteBuffer target, IntBuffer offsets) {
        /* This is a fun one. Recursion can occur - we're basically going to
         * just retry shoving data through the same converter. Note, if you got
         * here through some kind of invalid sequence, you maybe should emit a
         * reset sequence of some kind. Since this IS an actual conversion,
         * take care that you've changed the callback or the data, or you'll
         * get an infinite loop.
         */

        int oldTargetPosition = target.position();
        int offsetIndex = source.position();

        CoderResult cr = encoder.encode(source, target, null, false); /* no offsets and no flush */

        if (offsets != null) {
            /* one offset per byte that was written */
            while (target.position() != oldTargetPosition) {
                offsets.put(offsetIndex);
                oldTargetPosition++;
            }
        }

        /* Note, if you did something like used a stop subcallback, things would get interesting.
         * In fact, here's where we want to return the partially consumed in-source!
         */
        if (cr.isOverflow()) {
            /* Overflowed target. Now, we'll write into the charErrorBuffer.
             * It's a fixed size. If we overflow it...Hm
             */

            /* start the new target at the first free slot in the error buffer */
            int errBuffLen = encoder.errorBufferLength;
            ByteBuffer newTarget = ByteBuffer.wrap(encoder.errorBuffer);
            newTarget.position(errBuffLen); /* set the position at the end of the error buffer */
            encoder.errorBufferLength = 0;

            encoder.encode(source, newTarget, null, false);

            encoder.errorBuffer = newTarget.array();
            encoder.errorBufferLength = newTarget.position();
        }

        return cr;
    }

    /**
     * <p>
     * Handles a common situation where a character has been read and it may be
     * a lead surrogate followed by a trail surrogate. This method can change
     * the source position and will modify fromUChar32.
     * </p>
     *
     * <p>
     * If <code>null</code> is returned, then there was success in reading a
     * surrogate pair, the codepoint is stored in <code>fromUChar32</code> and
     * <code>fromUChar32</code> should be reset (to 0) after being read.
     * </p>
     *
     * @param source
     *            The encoding source.
     * @param lead
     *            A character that may be the first in a surrogate pair.
     * @return <code>CoderResult.malformedForLength(1)</code> or
     *         <code>CoderResult.UNDERFLOW</code> if there is a problem, or
     *         <code>null</code> if there isn't.
     * @see #handleSurrogates(char[], int, int, char)
     */
    final CoderResult handleSurrogates(CharBuffer source, char lead) {
        if (!UTF16.isLeadSurrogate(lead)) {
            fromUChar32 = lead;
            return CoderResult.malformedForLength(1);
        }

        if (!source.hasRemaining()) {
            /* the trail surrogate may arrive with the next buffer */
            fromUChar32 = lead;
            return CoderResult.UNDERFLOW;
        }

        char trail = source.get();

        if (!UTF16.isTrailSurrogate(trail)) {
            fromUChar32 = lead;
            /* un-read the char that turned out not to be a trail surrogate */
            source.position(source.position() - 1);
            return CoderResult.malformedForLength(1);
        }

        fromUChar32 = UCharacter.getCodePoint(lead, trail);
        return null;
    }

    /**
     * <p>
     * Same as <code>handleSurrogates(CharBuffer, char)</code>, but with arrays. As an added
     * requirement, the calling method must also increment the index if this method returns
     * <code>null</code>.
     * </p>
     *
     * @param sourceArray
     *            The encoding source.
     * @param sourceIndex
     *            Index in <code>sourceArray</code> of the char following <code>lead</code>.
     * @param sourceLimit
     *            Index one past the last usable char in <code>sourceArray</code>.
     * @param lead
     *            A character that may be the first in a surrogate pair.
     * @return <code>CoderResult.malformedForLength(1)</code> or
     *         <code>CoderResult.UNDERFLOW</code> if there is a problem, or <code>null</code> if
     *         there isn't.
     * @see #handleSurrogates(CharBuffer, char)
     */
    final CoderResult handleSurrogates(char[] sourceArray, int sourceIndex,
            int sourceLimit, char lead) {
        if (!UTF16.isLeadSurrogate(lead)) {
            fromUChar32 = lead;
            return CoderResult.malformedForLength(1);
        }

        if (sourceIndex >= sourceLimit) {
            fromUChar32 = lead;
            return CoderResult.UNDERFLOW;
        }

        char trail = sourceArray[sourceIndex];

        if (!UTF16.isTrailSurrogate(trail)) {
            fromUChar32 = lead;
            return CoderResult.malformedForLength(1);
        }

        fromUChar32 = UCharacter.getCodePoint(lead, trail);
        return null;
    }

    /**
     * Returns the maxCharsPerByte value for the Charset that created this encoder.
     * @return maxCharsPerByte
     * @stable ICU 4.8
     */
    public final float maxCharsPerByte() {
        return ((CharsetICU)(this.charset())).maxCharsPerByte;
    }

    /**
     * Calculates the size of a buffer for conversion from Unicode to a charset.
     * The calculated size is guaranteed to be sufficient for this conversion.
     *
     * It takes into account initial and final non-character bytes that are output
     * by some converters.
     * It does not take into account callbacks which output more than one charset
     * character sequence per call, like escape callbacks.
     * The default (substitution) callback only outputs one charset character sequence.
     *
     * @param length Number of chars to be converted.
     * @param maxCharSize Return value from maxBytesPerChar for the converter
     *                    that will be used.
     * @return Size of a buffer that will be large enough to hold the output of bytes
     *
     * @stable ICU 49
     */
    public static int getMaxBytesForString(int length, int maxCharSize) {
        return ((length + 10) * maxCharSize);
    }

}