1 /* GENERATED SOURCE. DO NOT MODIFY. */ 2 // © 2016 and later: Unicode, Inc. and others. 3 // License & terms of use: http://www.unicode.org/copyright.html#License 4 /* 5 ******************************************************************************* 6 * Copyright (C) 2009-2015, International Business Machines 7 * Corporation and others. All Rights Reserved. 8 ******************************************************************************* 9 */ 10 11 package ohos.global.icu.impl; 12 13 import java.io.IOException; 14 import java.nio.ByteBuffer; 15 import java.util.ArrayList; 16 17 import ohos.global.icu.text.UTF16; 18 import ohos.global.icu.text.UnicodeSet; 19 import ohos.global.icu.util.CodePointMap; 20 import ohos.global.icu.util.CodePointTrie; 21 import ohos.global.icu.util.ICUUncheckedIOException; 22 import ohos.global.icu.util.MutableCodePointTrie; 23 import ohos.global.icu.util.VersionInfo; 24 25 /** 26 * Low-level implementation of the Unicode Normalization Algorithm. 27 * For the data structure and details see the documentation at the end of 28 * C++ normalizer2impl.h and in the design doc at 29 * http://site.icu-project.org/design/normalization/custom 30 * @hide exposed on OHOS 31 */ 32 public final class Normalizer2Impl { 33 /** 34 * @hide exposed on OHOS 35 */ 36 public static final class Hangul { 37 /* Korean Hangul and Jamo constants */ 38 public static final int JAMO_L_BASE=0x1100; /* "lead" jamo */ 39 public static final int JAMO_L_END=0x1112; 40 public static final int JAMO_V_BASE=0x1161; /* "vowel" jamo */ 41 public static final int JAMO_V_END=0x1175; 42 public static final int JAMO_T_BASE=0x11a7; /* "trail" jamo */ 43 public static final int JAMO_T_END=0x11c2; 44 45 public static final int HANGUL_BASE=0xac00; 46 public static final int HANGUL_END=0xd7a3; 47 48 public static final int JAMO_L_COUNT=19; 49 public static final int JAMO_V_COUNT=21; 50 public static final int JAMO_T_COUNT=28; 51 52 public static final int 
JAMO_L_LIMIT=JAMO_L_BASE+JAMO_L_COUNT; 53 public static final int JAMO_V_LIMIT=JAMO_V_BASE+JAMO_V_COUNT; 54 55 public static final int JAMO_VT_COUNT=JAMO_V_COUNT*JAMO_T_COUNT; 56 57 public static final int HANGUL_COUNT=JAMO_L_COUNT*JAMO_V_COUNT*JAMO_T_COUNT; 58 public static final int HANGUL_LIMIT=HANGUL_BASE+HANGUL_COUNT; 59 isHangul(int c)60 public static boolean isHangul(int c) { 61 return HANGUL_BASE<=c && c<HANGUL_LIMIT; 62 } isHangulLV(int c)63 public static boolean isHangulLV(int c) { 64 c-=HANGUL_BASE; 65 return 0<=c && c<HANGUL_COUNT && c%JAMO_T_COUNT==0; 66 } isJamoL(int c)67 public static boolean isJamoL(int c) { 68 return JAMO_L_BASE<=c && c<JAMO_L_LIMIT; 69 } isJamoV(int c)70 public static boolean isJamoV(int c) { 71 return JAMO_V_BASE<=c && c<JAMO_V_LIMIT; 72 } isJamoT(int c)73 public static boolean isJamoT(int c) { 74 int t=c-JAMO_T_BASE; 75 return 0<t && t<JAMO_T_COUNT; // not JAMO_T_BASE itself 76 } isJamo(int c)77 public static boolean isJamo(int c) { 78 return JAMO_L_BASE<=c && c<=JAMO_T_END && 79 (c<=JAMO_L_END || (JAMO_V_BASE<=c && c<=JAMO_V_END) || JAMO_T_BASE<c); 80 } 81 82 /** 83 * Decomposes c, which must be a Hangul syllable, into buffer 84 * and returns the length of the decomposition (2 or 3). 85 */ decompose(int c, Appendable buffer)86 public static int decompose(int c, Appendable buffer) { 87 try { 88 c-=HANGUL_BASE; 89 int c2=c%JAMO_T_COUNT; 90 c/=JAMO_T_COUNT; 91 buffer.append((char)(JAMO_L_BASE+c/JAMO_V_COUNT)); 92 buffer.append((char)(JAMO_V_BASE+c%JAMO_V_COUNT)); 93 if(c2==0) { 94 return 2; 95 } else { 96 buffer.append((char)(JAMO_T_BASE+c2)); 97 return 3; 98 } 99 } catch(IOException e) { 100 // Will not occur because we do not write to I/O. 101 throw new ICUUncheckedIOException(e); 102 } 103 } 104 105 /** 106 * Decomposes c, which must be a Hangul syllable, into buffer. 107 * This is the raw, not recursive, decomposition. Its length is always 2. 
108 */ getRawDecomposition(int c, Appendable buffer)109 public static void getRawDecomposition(int c, Appendable buffer) { 110 try { 111 int orig=c; 112 c-=HANGUL_BASE; 113 int c2=c%JAMO_T_COUNT; 114 if(c2==0) { 115 c/=JAMO_T_COUNT; 116 buffer.append((char)(JAMO_L_BASE+c/JAMO_V_COUNT)); 117 buffer.append((char)(JAMO_V_BASE+c%JAMO_V_COUNT)); 118 } else { 119 buffer.append((char)(orig-c2)); // LV syllable 120 buffer.append((char)(JAMO_T_BASE+c2)); 121 } 122 } catch(IOException e) { 123 // Will not occur because we do not write to I/O. 124 throw new ICUUncheckedIOException(e); 125 } 126 } 127 } 128 129 /** 130 * Writable buffer that takes care of canonical ordering. 131 * Its Appendable methods behave like the C++ implementation's 132 * appendZeroCC() methods. 133 * <p> 134 * If dest is a StringBuilder, then the buffer writes directly to it. 135 * Otherwise, the buffer maintains a StringBuilder for intermediate text segments 136 * until no further changes are necessary and whole segments are appended. 137 * append() methods that take combining-class values always write to the StringBuilder. 138 * Other append() methods flush and append to the Appendable. 139 * @hide exposed on OHOS 140 */ 141 public static final class ReorderingBuffer implements Appendable { ReorderingBuffer(Normalizer2Impl ni, Appendable dest, int destCapacity)142 public ReorderingBuffer(Normalizer2Impl ni, Appendable dest, int destCapacity) { 143 impl=ni; 144 app=dest; 145 if(app instanceof StringBuilder) { 146 appIsStringBuilder=true; 147 str=(StringBuilder)dest; 148 // In Java, the constructor subsumes public void init(int destCapacity) { 149 str.ensureCapacity(destCapacity); 150 reorderStart=0; 151 if(str.length()==0) { 152 lastCC=0; 153 } else { 154 setIterator(); 155 lastCC=previousCC(); 156 // Set reorderStart after the last code point with cc<=1 if there is one. 
157 if(lastCC>1) { 158 while(previousCC()>1) {} 159 } 160 reorderStart=codePointLimit; 161 } 162 } else { 163 appIsStringBuilder=false; 164 str=new StringBuilder(); 165 reorderStart=0; 166 lastCC=0; 167 } 168 } 169 isEmpty()170 public boolean isEmpty() { return str.length()==0; } length()171 public int length() { return str.length(); } getLastCC()172 public int getLastCC() { return lastCC; } 173 getStringBuilder()174 public StringBuilder getStringBuilder() { return str; } 175 equals(CharSequence s, int start, int limit)176 public boolean equals(CharSequence s, int start, int limit) { 177 return UTF16Plus.equal(str, 0, str.length(), s, start, limit); 178 } 179 append(int c, int cc)180 public void append(int c, int cc) { 181 if(lastCC<=cc || cc==0) { 182 str.appendCodePoint(c); 183 lastCC=cc; 184 if(cc<=1) { 185 reorderStart=str.length(); 186 } 187 } else { 188 insert(c, cc); 189 } 190 } append(CharSequence s, int start, int limit, boolean isNFD, int leadCC, int trailCC)191 public void append(CharSequence s, int start, int limit, boolean isNFD, 192 int leadCC, int trailCC) { 193 if(start==limit) { 194 return; 195 } 196 if(lastCC<=leadCC || leadCC==0) { 197 if(trailCC<=1) { 198 reorderStart=str.length()+(limit-start); 199 } else if(leadCC<=1) { 200 reorderStart=str.length()+1; // Ok if not a code point boundary. 201 } 202 str.append(s, start, limit); 203 lastCC=trailCC; 204 } else { 205 int c=Character.codePointAt(s, start); 206 start+=Character.charCount(c); 207 insert(c, leadCC); // insert first code point 208 while(start<limit) { 209 c=Character.codePointAt(s, start); 210 start+=Character.charCount(c); 211 if(start<limit) { 212 if (isNFD) { 213 leadCC = getCCFromYesOrMaybe(impl.getNorm16(c)); 214 } else { 215 leadCC = impl.getCC(impl.getNorm16(c)); 216 } 217 } else { 218 leadCC=trailCC; 219 } 220 append(c, leadCC); 221 } 222 } 223 } 224 // The following append() methods work like C++ appendZeroCC(). 225 // They assume that the cc or trailCC of their input is 0. 
226 // Most of them implement Appendable interface methods. 227 @Override append(char c)228 public ReorderingBuffer append(char c) { 229 str.append(c); 230 lastCC=0; 231 reorderStart=str.length(); 232 return this; 233 } appendZeroCC(int c)234 public void appendZeroCC(int c) { 235 str.appendCodePoint(c); 236 lastCC=0; 237 reorderStart=str.length(); 238 } 239 @Override append(CharSequence s)240 public ReorderingBuffer append(CharSequence s) { 241 if(s.length()!=0) { 242 str.append(s); 243 lastCC=0; 244 reorderStart=str.length(); 245 } 246 return this; 247 } 248 @Override append(CharSequence s, int start, int limit)249 public ReorderingBuffer append(CharSequence s, int start, int limit) { 250 if(start!=limit) { 251 str.append(s, start, limit); 252 lastCC=0; 253 reorderStart=str.length(); 254 } 255 return this; 256 } 257 /** 258 * Flushes from the intermediate StringBuilder to the Appendable, 259 * if they are different objects. 260 * Used after recomposition. 261 * Must be called at the end when writing to a non-StringBuilder Appendable. 262 */ flush()263 public void flush() { 264 if(appIsStringBuilder) { 265 reorderStart=str.length(); 266 } else { 267 try { 268 app.append(str); 269 str.setLength(0); 270 reorderStart=0; 271 } catch(IOException e) { 272 throw new ICUUncheckedIOException(e); // Avoid declaring "throws IOException". 273 } 274 } 275 lastCC=0; 276 } 277 /** 278 * Flushes from the intermediate StringBuilder to the Appendable, 279 * if they are different objects. 280 * Then appends the new text to the Appendable or StringBuilder. 281 * Normally used after quick check loops find a non-empty sequence. 
282 */ flushAndAppendZeroCC(CharSequence s, int start, int limit)283 public ReorderingBuffer flushAndAppendZeroCC(CharSequence s, int start, int limit) { 284 if(appIsStringBuilder) { 285 str.append(s, start, limit); 286 reorderStart=str.length(); 287 } else { 288 try { 289 app.append(str).append(s, start, limit); 290 str.setLength(0); 291 reorderStart=0; 292 } catch(IOException e) { 293 throw new ICUUncheckedIOException(e); // Avoid declaring "throws IOException". 294 } 295 } 296 lastCC=0; 297 return this; 298 } remove()299 public void remove() { 300 str.setLength(0); 301 lastCC=0; 302 reorderStart=0; 303 } removeSuffix(int suffixLength)304 public void removeSuffix(int suffixLength) { 305 int oldLength=str.length(); 306 str.delete(oldLength-suffixLength, oldLength); 307 lastCC=0; 308 reorderStart=str.length(); 309 } 310 311 /* 312 * TODO: Revisit whether it makes sense to track reorderStart. 313 * It is set to after the last known character with cc<=1, 314 * which stops previousCC() before it reads that character and looks up its cc. 315 * previousCC() is normally only called from insert(). 316 * In other words, reorderStart speeds up the insertion of a combining mark 317 * into a multi-combining mark sequence where it does not belong at the end. 318 * This might not be worth the trouble. 319 * On the other hand, it's not a huge amount of trouble. 320 * 321 * We probably need it for UNORM_SIMPLE_APPEND. 322 */ 323 324 // Inserts c somewhere before the last character. 325 // Requires 0<cc<lastCC which implies reorderStart<limit. 
insert(int c, int cc)326 private void insert(int c, int cc) { 327 for(setIterator(), skipPrevious(); previousCC()>cc;) {} 328 // insert c at codePointLimit, after the character with prevCC<=cc 329 if(c<=0xffff) { 330 str.insert(codePointLimit, (char)c); 331 if(cc<=1) { 332 reorderStart=codePointLimit+1; 333 } 334 } else { 335 str.insert(codePointLimit, Character.toChars(c)); 336 if(cc<=1) { 337 reorderStart=codePointLimit+2; 338 } 339 } 340 } 341 342 private final Normalizer2Impl impl; 343 private final Appendable app; 344 private final StringBuilder str; 345 private final boolean appIsStringBuilder; 346 private int reorderStart; 347 private int lastCC; 348 349 // private backward iterator setIterator()350 private void setIterator() { codePointStart=str.length(); } skipPrevious()351 private void skipPrevious() { // Requires 0<codePointStart. 352 codePointLimit=codePointStart; 353 codePointStart=str.offsetByCodePoints(codePointStart, -1); 354 } previousCC()355 private int previousCC() { // Returns 0 if there is no previous character. 356 codePointLimit=codePointStart; 357 if(reorderStart>=codePointStart) { 358 return 0; 359 } 360 int c=str.codePointBefore(codePointStart); 361 codePointStart-=Character.charCount(c); 362 return impl.getCCFromYesOrMaybeCP(c); 363 } 364 365 private int codePointStart, codePointLimit; 366 } 367 368 // TODO: Propose as public API on the UTF16 class. 369 // TODO: Propose widening UTF16 methods that take char to take int. 370 // TODO: Propose widening UTF16 methods that take String to take CharSequence. 371 /** 372 * @hide exposed on OHOS 373 */ 374 public static final class UTF16Plus { 375 /** 376 * Is this code point a lead surrogate (U+d800..U+dbff)? 377 * @param c code unit or code point 378 * @return true or false 379 */ isLeadSurrogate(int c)380 public static boolean isLeadSurrogate(int c) { return (c & 0xfffffc00) == 0xd800; } 381 /** 382 * Is this code point a trail surrogate (U+dc00..U+dfff)? 
383 * @param c code unit or code point 384 * @return true or false 385 */ isTrailSurrogate(int c)386 public static boolean isTrailSurrogate(int c) { return (c & 0xfffffc00) == 0xdc00; } 387 /** 388 * Is this code point a surrogate (U+d800..U+dfff)? 389 * @param c code unit or code point 390 * @return true or false 391 */ isSurrogate(int c)392 public static boolean isSurrogate(int c) { return (c & 0xfffff800) == 0xd800; } 393 /** 394 * Assuming c is a surrogate code point (UTF16.isSurrogate(c)), 395 * is it a lead surrogate? 396 * @param c code unit or code point 397 * @return true or false 398 */ isSurrogateLead(int c)399 public static boolean isSurrogateLead(int c) { return (c&0x400)==0; } 400 /** 401 * Compares two CharSequence objects for binary equality. 402 * @param s1 first sequence 403 * @param s2 second sequence 404 * @return true if s1 contains the same text as s2 405 */ equal(CharSequence s1, CharSequence s2)406 public static boolean equal(CharSequence s1, CharSequence s2) { 407 if(s1==s2) { 408 return true; 409 } 410 int length=s1.length(); 411 if(length!=s2.length()) { 412 return false; 413 } 414 for(int i=0; i<length; ++i) { 415 if(s1.charAt(i)!=s2.charAt(i)) { 416 return false; 417 } 418 } 419 return true; 420 } 421 /** 422 * Compares two CharSequence subsequences for binary equality. 
423 * @param s1 first sequence 424 * @param start1 start offset in first sequence 425 * @param limit1 limit offset in first sequence 426 * @param s2 second sequence 427 * @param start2 start offset in second sequence 428 * @param limit2 limit offset in second sequence 429 * @return true if s1.subSequence(start1, limit1) contains the same text 430 * as s2.subSequence(start2, limit2) 431 */ equal(CharSequence s1, int start1, int limit1, CharSequence s2, int start2, int limit2)432 public static boolean equal(CharSequence s1, int start1, int limit1, 433 CharSequence s2, int start2, int limit2) { 434 if((limit1-start1)!=(limit2-start2)) { 435 return false; 436 } 437 if(s1==s2 && start1==start2) { 438 return true; 439 } 440 while(start1<limit1) { 441 if(s1.charAt(start1++)!=s2.charAt(start2++)) { 442 return false; 443 } 444 } 445 return true; 446 } 447 } 448 Normalizer2Impl()449 public Normalizer2Impl() {} 450 451 private static final class IsAcceptable implements ICUBinary.Authenticate { 452 @Override isDataVersionAcceptable(byte version[])453 public boolean isDataVersionAcceptable(byte version[]) { 454 return version[0]==4; 455 } 456 } 457 private static final IsAcceptable IS_ACCEPTABLE = new IsAcceptable(); 458 private static final int DATA_FORMAT = 0x4e726d32; // "Nrm2" 459 load(ByteBuffer bytes)460 public Normalizer2Impl load(ByteBuffer bytes) { 461 try { 462 dataVersion=ICUBinary.readHeaderAndDataVersion(bytes, DATA_FORMAT, IS_ACCEPTABLE); 463 int indexesLength=bytes.getInt()/4; // inIndexes[IX_NORM_TRIE_OFFSET]/4 464 if(indexesLength<=IX_MIN_LCCC_CP) { 465 throw new ICUUncheckedIOException("Normalizer2 data: not enough indexes"); 466 } 467 int[] inIndexes=new int[indexesLength]; 468 inIndexes[0]=indexesLength*4; 469 for(int i=1; i<indexesLength; ++i) { 470 inIndexes[i]=bytes.getInt(); 471 } 472 473 minDecompNoCP=inIndexes[IX_MIN_DECOMP_NO_CP]; 474 minCompNoMaybeCP=inIndexes[IX_MIN_COMP_NO_MAYBE_CP]; 475 minLcccCP=inIndexes[IX_MIN_LCCC_CP]; 476 477 
minYesNo=inIndexes[IX_MIN_YES_NO]; 478 minYesNoMappingsOnly=inIndexes[IX_MIN_YES_NO_MAPPINGS_ONLY]; 479 minNoNo=inIndexes[IX_MIN_NO_NO]; 480 minNoNoCompBoundaryBefore=inIndexes[IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE]; 481 minNoNoCompNoMaybeCC=inIndexes[IX_MIN_NO_NO_COMP_NO_MAYBE_CC]; 482 minNoNoEmpty=inIndexes[IX_MIN_NO_NO_EMPTY]; 483 limitNoNo=inIndexes[IX_LIMIT_NO_NO]; 484 minMaybeYes=inIndexes[IX_MIN_MAYBE_YES]; 485 assert((minMaybeYes&7)==0); // 8-aligned for noNoDelta bit fields 486 centerNoNoDelta=(minMaybeYes>>DELTA_SHIFT)-MAX_DELTA-1; 487 488 // Read the normTrie. 489 int offset=inIndexes[IX_NORM_TRIE_OFFSET]; 490 int nextOffset=inIndexes[IX_EXTRA_DATA_OFFSET]; 491 int triePosition = bytes.position(); 492 normTrie = CodePointTrie.Fast16.fromBinary(bytes); 493 int trieLength = bytes.position() - triePosition; 494 if(trieLength>(nextOffset-offset)) { 495 throw new ICUUncheckedIOException("Normalizer2 data: not enough bytes for normTrie"); 496 } 497 ICUBinary.skipBytes(bytes, (nextOffset-offset)-trieLength); // skip padding after trie bytes 498 499 // Read the composition and mapping data. 
500 offset=nextOffset; 501 nextOffset=inIndexes[IX_SMALL_FCD_OFFSET]; 502 int numChars=(nextOffset-offset)/2; 503 if(numChars!=0) { 504 maybeYesCompositions=ICUBinary.getString(bytes, numChars, 0); 505 extraData=maybeYesCompositions.substring((MIN_NORMAL_MAYBE_YES-minMaybeYes)>>OFFSET_SHIFT); 506 } 507 508 // smallFCD: new in formatVersion 2 509 offset=nextOffset; 510 smallFCD=new byte[0x100]; 511 bytes.get(smallFCD); 512 513 return this; 514 } catch(IOException e) { 515 throw new ICUUncheckedIOException(e); 516 } 517 } load(String name)518 public Normalizer2Impl load(String name) { 519 return load(ICUBinary.getRequiredData(name)); 520 } 521 addLcccChars(UnicodeSet set)522 public void addLcccChars(UnicodeSet set) { 523 int start = 0; 524 CodePointMap.Range range = new CodePointMap.Range(); 525 while (normTrie.getRange(start, CodePointMap.RangeOption.FIXED_LEAD_SURROGATES, INERT, 526 null, range)) { 527 int end = range.getEnd(); 528 int norm16 = range.getValue(); 529 if (norm16 > MIN_NORMAL_MAYBE_YES && norm16 != JAMO_VT) { 530 set.add(start, end); 531 } else if (minNoNoCompNoMaybeCC <= norm16 && norm16 < limitNoNo) { 532 int fcd16 = getFCD16(start); 533 if (fcd16 > 0xff) { set.add(start, end); } 534 } 535 start = end + 1; 536 } 537 } 538 addPropertyStarts(UnicodeSet set)539 public void addPropertyStarts(UnicodeSet set) { 540 // Add the start code point of each same-value range of the trie. 541 int start = 0; 542 CodePointMap.Range range = new CodePointMap.Range(); 543 while (normTrie.getRange(start, CodePointMap.RangeOption.FIXED_LEAD_SURROGATES, INERT, 544 null, range)) { 545 int end = range.getEnd(); 546 int value = range.getValue(); 547 set.add(start); 548 if (start != end && isAlgorithmicNoNo(value) && 549 (value & DELTA_TCCC_MASK) > DELTA_TCCC_1) { 550 // Range of code points with same-norm16-value algorithmic decompositions. 551 // They might have different non-zero FCD16 values. 
552 int prevFCD16 = getFCD16(start); 553 while (++start <= end) { 554 int fcd16 = getFCD16(start); 555 if (fcd16 != prevFCD16) { 556 set.add(start); 557 prevFCD16 = fcd16; 558 } 559 } 560 } 561 start = end + 1; 562 } 563 564 /* add Hangul LV syllables and LV+1 because of skippables */ 565 for(int c=Hangul.HANGUL_BASE; c<Hangul.HANGUL_LIMIT; c+=Hangul.JAMO_T_COUNT) { 566 set.add(c); 567 set.add(c+1); 568 } 569 set.add(Hangul.HANGUL_LIMIT); /* add Hangul+1 to continue with other properties */ 570 } 571 addCanonIterPropertyStarts(UnicodeSet set)572 public void addCanonIterPropertyStarts(UnicodeSet set) { 573 // Add the start code point of each same-value range of the canonical iterator data trie. 574 ensureCanonIterData(); 575 // Currently only used for the SEGMENT_STARTER property. 576 int start = 0; 577 CodePointMap.Range range = new CodePointMap.Range(); 578 while (canonIterData.getRange(start, segmentStarterMapper, range)) { 579 set.add(start); 580 start = range.getEnd() + 1; 581 } 582 } 583 private static final CodePointMap.ValueFilter segmentStarterMapper = 584 new CodePointMap.ValueFilter() { 585 @Override 586 public int apply(int value) { 587 return value & CANON_NOT_SEGMENT_STARTER; 588 } 589 }; 590 591 // low-level properties ------------------------------------------------ *** 592 593 // Note: Normalizer2Impl.java r30983 (2011-nov-27) 594 // still had getFCDTrie() which built and cached an FCD trie. 595 // That provided faster access to FCD data than getFCD16FromNormData() 596 // but required synchronization and consumed some 10kB of heap memory 597 // in any process that uses FCD (e.g., via collation). 598 // minDecompNoCP etc. and smallFCD[] are intended to help with any loss of performance, 599 // at least for ASCII & CJK. 600 601 /** 602 * Builds the canonical-iterator data for this instance. 603 * This is required before any of {@link #isCanonSegmentStarter(int)} or 604 * {@link #getCanonStartSet(int, UnicodeSet)} are called, 605 * or else they crash. 
/**
 * Builds the canonical-iterator data for this instance.
 * This is required before any of {@link #isCanonSegmentStarter(int)} or
 * {@link #getCanonStartSet(int, UnicodeSet)} are called,
 * or else they crash.
 * <p>The data is built only once: if canonIterData is already non-null this method
 * just returns. The method is synchronized on this instance.
 * @return this
 */
public synchronized Normalizer2Impl ensureCanonIterData() {
    if(canonIterData==null) {
        // Values are accumulated in a mutable trie and frozen at the end.
        MutableCodePointTrie mutableTrie = new MutableCodePointTrie(0, 0);
        canonStartSets=new ArrayList<UnicodeSet>();
        int start = 0;
        CodePointMap.Range range = new CodePointMap.Range();
        // Walk normTrie one same-value range [start, end] at a time.
        while (normTrie.getRange(start, CodePointMap.RangeOption.FIXED_LEAD_SURROGATES, INERT,
                null, range)) {
            final int end = range.getEnd();
            final int norm16 = range.getValue();
            if(isInert(norm16) || (minYesNo<=norm16 && norm16<minNoNo)) {
                // Inert, or 2-way mapping (including Hangul syllable).
                // We do not write a canonStartSet for any yesNo character.
                // Composites from 2-way mappings are added at runtime from the
                // starter's compositions list, and the other characters in
                // 2-way mappings get CANON_NOT_SEGMENT_STARTER set because they are
                // "maybe" characters.
                start = end + 1;
                continue;
            }
            for (int c = start; c <= end; ++c) {
                // c may already carry bits set while processing an earlier code point.
                final int oldValue = mutableTrie.get(c);
                int newValue=oldValue;
                if(isMaybeOrNonZeroCC(norm16)) {
                    // not a segment starter if it occurs in a decomposition or has cc!=0
                    newValue|=CANON_NOT_SEGMENT_STARTER;
                    if(norm16<MIN_NORMAL_MAYBE_YES) {
                        newValue|=CANON_HAS_COMPOSITIONS;
                    }
                } else if(norm16<minYesNo) {
                    newValue|=CANON_HAS_COMPOSITIONS;
                } else {
                    // c has a one-way decomposition
                    int c2=c;
                    // Do not modify the whole-range norm16 value.
                    int norm16_2=norm16;
                    if (isDecompNoAlgorithmic(norm16_2)) {
                        // Maps to an isCompYesAndZeroCC.
                        c2 = mapAlgorithmic(c2, norm16_2);
                        norm16_2 = getRawNorm16(c2);
                        // No compatibility mappings for the CanonicalIterator.
                        assert(!(isHangulLV(norm16_2) || isHangulLVT(norm16_2)));
                    }
                    if (norm16_2 > minYesNo) {
                        // c decomposes, get everything from the variable-length extra data
                        int mapping=norm16_2>>OFFSET_SHIFT;
                        int firstUnit=extraData.charAt(mapping);
                        int length=firstUnit&MAPPING_LENGTH_MASK;
                        if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
                            // The unit before firstUnit holds the ccc in its low byte.
                            // Only consulted when no algorithmic mapping replaced c (c==c2).
                            if(c==c2 && (extraData.charAt(mapping-1)&0xff)!=0) {
                                newValue|=CANON_NOT_SEGMENT_STARTER;  // original c has cc!=0
                            }
                        }
                        // Skip empty mappings (no characters in the decomposition).
                        if(length!=0) {
                            ++mapping;  // skip over the firstUnit
                            // add c to first code point's start set
                            int limit=mapping+length;
                            c2=extraData.codePointAt(mapping);
                            addToStartSet(mutableTrie, c, c2);
                            // Set CANON_NOT_SEGMENT_STARTER for each remaining code point of a
                            // one-way mapping. A 2-way mapping is possible here after
                            // intermediate algorithmic mapping.
                            if(norm16_2>=minNoNo) {
                                // The loop condition advances mapping past the previous c2.
                                while((mapping+=Character.charCount(c2))<limit) {
                                    c2=extraData.codePointAt(mapping);
                                    int c2Value = mutableTrie.get(c2);
                                    if((c2Value&CANON_NOT_SEGMENT_STARTER)==0) {
                                        mutableTrie.set(c2, c2Value|CANON_NOT_SEGMENT_STARTER);
                                    }
                                }
                            }
                        }
                    } else {
                        // c decomposed to c2 algorithmically; c has cc==0
                        addToStartSet(mutableTrie, c, c2);
                    }
                }
                // Write back only when a bit was actually added for c.
                if(newValue!=oldValue) {
                    mutableTrie.set(c, newValue);
                }
            }
            start = end + 1;
        }
        // Publish the finished data; a non-null canonIterData marks the work as done.
        canonIterData = mutableTrie.buildImmutable(
                CodePointTrie.Type.SMALL, CodePointTrie.ValueWidth.BITS_32);
    }
    return this;
}
INERT : normTrie.get(c); 702 } getRawNorm16(int c)703 public int getRawNorm16(int c) { return normTrie.get(c); } 704 getCompQuickCheck(int norm16)705 public int getCompQuickCheck(int norm16) { 706 if(norm16<minNoNo || MIN_YES_YES_WITH_CC<=norm16) { 707 return 1; // yes 708 } else if(minMaybeYes<=norm16) { 709 return 2; // maybe 710 } else { 711 return 0; // no 712 } 713 } isAlgorithmicNoNo(int norm16)714 public boolean isAlgorithmicNoNo(int norm16) { return limitNoNo<=norm16 && norm16<minMaybeYes; } isCompNo(int norm16)715 public boolean isCompNo(int norm16) { return minNoNo<=norm16 && norm16<minMaybeYes; } isDecompYes(int norm16)716 public boolean isDecompYes(int norm16) { return norm16<minYesNo || minMaybeYes<=norm16; } 717 getCC(int norm16)718 public int getCC(int norm16) { 719 if(norm16>=MIN_NORMAL_MAYBE_YES) { 720 return getCCFromNormalYesOrMaybe(norm16); 721 } 722 if(norm16<minNoNo || limitNoNo<=norm16) { 723 return 0; 724 } 725 return getCCFromNoNo(norm16); 726 } getCCFromNormalYesOrMaybe(int norm16)727 public static int getCCFromNormalYesOrMaybe(int norm16) { 728 return (norm16 >> OFFSET_SHIFT) & 0xff; 729 } getCCFromYesOrMaybe(int norm16)730 public static int getCCFromYesOrMaybe(int norm16) { 731 return norm16>=MIN_NORMAL_MAYBE_YES ? getCCFromNormalYesOrMaybe(norm16) : 0; 732 } getCCFromYesOrMaybeCP(int c)733 public int getCCFromYesOrMaybeCP(int c) { 734 if (c < minCompNoMaybeCP) { return 0; } 735 return getCCFromYesOrMaybe(getNorm16(c)); 736 } 737 738 /** 739 * Returns the FCD data for code point c. 740 * @param c A Unicode code point. 741 * @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0. 742 */ getFCD16(int c)743 public int getFCD16(int c) { 744 if(c<minDecompNoCP) { 745 return 0; 746 } else if(c<=0xffff) { 747 if(!singleLeadMightHaveNonZeroFCD16(c)) { return 0; } 748 } 749 return getFCD16FromNormData(c); 750 } 751 /** Returns true if the single-or-lead code unit c might have non-zero FCD data. 
*/ singleLeadMightHaveNonZeroFCD16(int lead)752 public boolean singleLeadMightHaveNonZeroFCD16(int lead) { 753 // 0<=lead<=0xffff 754 byte bits=smallFCD[lead>>8]; 755 if(bits==0) { return false; } 756 return ((bits>>((lead>>5)&7))&1)!=0; 757 } 758 759 /** Gets the FCD value from the regular normalization data. */ getFCD16FromNormData(int c)760 public int getFCD16FromNormData(int c) { 761 int norm16=getNorm16(c); 762 if (norm16 >= limitNoNo) { 763 if(norm16>=MIN_NORMAL_MAYBE_YES) { 764 // combining mark 765 norm16=getCCFromNormalYesOrMaybe(norm16); 766 return norm16|(norm16<<8); 767 } else if(norm16>=minMaybeYes) { 768 return 0; 769 } else { // isDecompNoAlgorithmic(norm16) 770 int deltaTrailCC = norm16 & DELTA_TCCC_MASK; 771 if (deltaTrailCC <= DELTA_TCCC_1) { 772 return deltaTrailCC >> OFFSET_SHIFT; 773 } 774 // Maps to an isCompYesAndZeroCC. 775 c=mapAlgorithmic(c, norm16); 776 norm16 = getRawNorm16(c); 777 } 778 } 779 if(norm16<=minYesNo || isHangulLVT(norm16)) { 780 // no decomposition or Hangul syllable, all zeros 781 return 0; 782 } 783 // c decomposes, get everything from the variable-length extra data 784 int mapping=norm16>>OFFSET_SHIFT; 785 int firstUnit=extraData.charAt(mapping); 786 int fcd16=firstUnit>>8; // tccc 787 if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) { 788 fcd16|=extraData.charAt(mapping-1)&0xff00; // lccc 789 } 790 return fcd16; 791 } 792 793 /** 794 * Gets the decomposition for one code point. 795 * @param c code point 796 * @return c's decomposition, if it has one; returns null if it does not have a decomposition 797 */ getDecomposition(int c)798 public String getDecomposition(int c) { 799 int norm16; 800 if(c<minDecompNoCP || isMaybeOrNonZeroCC(norm16=getNorm16(c))) { 801 // c does not decompose 802 return null; 803 } 804 int decomp = -1; 805 if(isDecompNoAlgorithmic(norm16)) { 806 // Maps to an isCompYesAndZeroCC. 807 decomp=c=mapAlgorithmic(c, norm16); 808 // The mapping might decompose further. 
809 norm16 = getRawNorm16(c); 810 } 811 if (norm16 < minYesNo) { 812 if(decomp<0) { 813 return null; 814 } else { 815 return UTF16.valueOf(decomp); 816 } 817 } else if(isHangulLV(norm16) || isHangulLVT(norm16)) { 818 // Hangul syllable: decompose algorithmically 819 StringBuilder buffer=new StringBuilder(); 820 Hangul.decompose(c, buffer); 821 return buffer.toString(); 822 } 823 // c decomposes, get everything from the variable-length extra data 824 int mapping=norm16>>OFFSET_SHIFT; 825 int length=extraData.charAt(mapping++)&MAPPING_LENGTH_MASK; 826 return extraData.substring(mapping, mapping+length); 827 } 828 829 /** 830 * Gets the raw decomposition for one code point. 831 * @param c code point 832 * @return c's raw decomposition, if it has one; returns null if it does not have a decomposition 833 */ getRawDecomposition(int c)834 public String getRawDecomposition(int c) { 835 int norm16; 836 if(c<minDecompNoCP || isDecompYes(norm16=getNorm16(c))) { 837 // c does not decompose 838 return null; 839 } else if(isHangulLV(norm16) || isHangulLVT(norm16)) { 840 // Hangul syllable: decompose algorithmically 841 StringBuilder buffer=new StringBuilder(); 842 Hangul.getRawDecomposition(c, buffer); 843 return buffer.toString(); 844 } else if(isDecompNoAlgorithmic(norm16)) { 845 return UTF16.valueOf(mapAlgorithmic(c, norm16)); 846 } 847 // c decomposes, get everything from the variable-length extra data 848 int mapping=norm16>>OFFSET_SHIFT; 849 int firstUnit=extraData.charAt(mapping); 850 int mLength=firstUnit&MAPPING_LENGTH_MASK; // length of normal mapping 851 if((firstUnit&MAPPING_HAS_RAW_MAPPING)!=0) { 852 // Read the raw mapping from before the firstUnit and before the optional ccc/lccc word. 
853 // Bit 7=MAPPING_HAS_CCC_LCCC_WORD 854 int rawMapping=mapping-((firstUnit>>7)&1)-1; 855 char rm0=extraData.charAt(rawMapping); 856 if(rm0<=MAPPING_LENGTH_MASK) { 857 return extraData.substring(rawMapping-rm0, rawMapping); 858 } else { 859 // Copy the normal mapping and replace its first two code units with rm0. 860 StringBuilder buffer=new StringBuilder(mLength-1).append(rm0); 861 mapping+=1+2; // skip over the firstUnit and the first two mapping code units 862 return buffer.append(extraData, mapping, mapping+mLength-2).toString(); 863 } 864 } else { 865 mapping+=1; // skip over the firstUnit 866 return extraData.substring(mapping, mapping+mLength); 867 } 868 } 869 870 /** 871 * Returns true if code point c starts a canonical-iterator string segment. 872 * <b>{@link #ensureCanonIterData()} must have been called before this method, 873 * or else this method will crash.</b> 874 * @param c A Unicode code point. 875 * @return true if c starts a canonical-iterator string segment. 876 */ isCanonSegmentStarter(int c)877 public boolean isCanonSegmentStarter(int c) { 878 return canonIterData.get(c)>=0; 879 } 880 /** 881 * Returns true if there are characters whose decomposition starts with c. 882 * If so, then the set is cleared and then filled with those characters. 883 * <b>{@link #ensureCanonIterData()} must have been called before this method, 884 * or else this method will crash.</b> 885 * @param c A Unicode code point. 886 * @param set A UnicodeSet to receive the characters whose decompositions 887 * start with c, if there are any. 888 * @return true if there are characters whose decomposition starts with c. 
889 */ getCanonStartSet(int c, UnicodeSet set)890 public boolean getCanonStartSet(int c, UnicodeSet set) { 891 int canonValue=canonIterData.get(c)&~CANON_NOT_SEGMENT_STARTER; 892 if(canonValue==0) { 893 return false; 894 } 895 set.clear(); 896 int value=canonValue&CANON_VALUE_MASK; 897 if((canonValue&CANON_HAS_SET)!=0) { 898 set.addAll(canonStartSets.get(value)); 899 } else if(value!=0) { 900 set.add(value); 901 } 902 if((canonValue&CANON_HAS_COMPOSITIONS)!=0) { 903 int norm16 = getRawNorm16(c); 904 if(norm16==JAMO_L) { 905 int syllable=Hangul.HANGUL_BASE+(c-Hangul.JAMO_L_BASE)*Hangul.JAMO_VT_COUNT; 906 set.add(syllable, syllable+Hangul.JAMO_VT_COUNT-1); 907 } else { 908 addComposites(getCompositionsList(norm16), set); 909 } 910 } 911 return true; 912 } 913 914 // Fixed norm16 values. 915 public static final int MIN_YES_YES_WITH_CC=0xfe02; 916 public static final int JAMO_VT=0xfe00; 917 public static final int MIN_NORMAL_MAYBE_YES=0xfc00; 918 public static final int JAMO_L=2; // offset=1 hasCompBoundaryAfter=FALSE 919 public static final int INERT=1; // offset=0 hasCompBoundaryAfter=TRUE 920 921 // norm16 bit 0 is comp-boundary-after. 922 public static final int HAS_COMP_BOUNDARY_AFTER=1; 923 public static final int OFFSET_SHIFT=1; 924 925 // For algorithmic one-way mappings, norm16 bits 2..1 indicate the 926 // tccc (0, 1, >1) for quick FCC boundary-after tests. 927 public static final int DELTA_TCCC_0=0; 928 public static final int DELTA_TCCC_1=2; 929 public static final int DELTA_TCCC_GT_1=4; 930 public static final int DELTA_TCCC_MASK=6; 931 public static final int DELTA_SHIFT=3; 932 933 public static final int MAX_DELTA=0x40; 934 935 // Byte offsets from the start of the data, after the generic header. 
936 public static final int IX_NORM_TRIE_OFFSET=0; 937 public static final int IX_EXTRA_DATA_OFFSET=1; 938 public static final int IX_SMALL_FCD_OFFSET=2; 939 public static final int IX_RESERVED3_OFFSET=3; 940 public static final int IX_TOTAL_SIZE=7; 941 942 // Code point thresholds for quick check codes. 943 public static final int IX_MIN_DECOMP_NO_CP=8; 944 public static final int IX_MIN_COMP_NO_MAYBE_CP=9; 945 946 // Norm16 value thresholds for quick check combinations and types of extra data. 947 948 /** Mappings & compositions in [minYesNo..minYesNoMappingsOnly[. */ 949 public static final int IX_MIN_YES_NO=10; 950 /** Mappings are comp-normalized. */ 951 public static final int IX_MIN_NO_NO=11; 952 public static final int IX_LIMIT_NO_NO=12; 953 public static final int IX_MIN_MAYBE_YES=13; 954 955 /** Mappings only in [minYesNoMappingsOnly..minNoNo[. */ 956 public static final int IX_MIN_YES_NO_MAPPINGS_ONLY=14; 957 /** Mappings are not comp-normalized but have a comp boundary before. */ 958 public static final int IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE=15; 959 /** Mappings do not have a comp boundary before. */ 960 public static final int IX_MIN_NO_NO_COMP_NO_MAYBE_CC=16; 961 /** Mappings to the empty string. 
*/ 962 public static final int IX_MIN_NO_NO_EMPTY=17; 963 964 public static final int IX_MIN_LCCC_CP=18; 965 public static final int IX_COUNT=20; 966 967 public static final int MAPPING_HAS_CCC_LCCC_WORD=0x80; 968 public static final int MAPPING_HAS_RAW_MAPPING=0x40; 969 // unused bit 0x20; 970 public static final int MAPPING_LENGTH_MASK=0x1f; 971 972 public static final int COMP_1_LAST_TUPLE=0x8000; 973 public static final int COMP_1_TRIPLE=1; 974 public static final int COMP_1_TRAIL_LIMIT=0x3400; 975 public static final int COMP_1_TRAIL_MASK=0x7ffe; 976 public static final int COMP_1_TRAIL_SHIFT=9; // 10-1 for the "triple" bit 977 public static final int COMP_2_TRAIL_SHIFT=6; 978 public static final int COMP_2_TRAIL_MASK=0xffc0; 979 980 // higher-level functionality ------------------------------------------ *** 981 982 // NFD without an NFD Normalizer2 instance. decompose(CharSequence s, StringBuilder dest)983 public Appendable decompose(CharSequence s, StringBuilder dest) { 984 decompose(s, 0, s.length(), dest, s.length()); 985 return dest; 986 } 987 /** 988 * Decomposes s[src, limit[ and writes the result to dest. 989 * limit can be NULL if src is NUL-terminated. 990 * destLengthEstimate is the initial dest buffer capacity and can be -1. 
991 */ decompose(CharSequence s, int src, int limit, StringBuilder dest, int destLengthEstimate)992 public void decompose(CharSequence s, int src, int limit, StringBuilder dest, 993 int destLengthEstimate) { 994 if(destLengthEstimate<0) { 995 destLengthEstimate=limit-src; 996 } 997 dest.setLength(0); 998 ReorderingBuffer buffer=new ReorderingBuffer(this, dest, destLengthEstimate); 999 decompose(s, src, limit, buffer); 1000 } 1001 1002 // Dual functionality: 1003 // buffer!=NULL: normalize 1004 // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes decompose(CharSequence s, int src, int limit, ReorderingBuffer buffer)1005 public int decompose(CharSequence s, int src, int limit, 1006 ReorderingBuffer buffer) { 1007 int minNoCP=minDecompNoCP; 1008 1009 int prevSrc; 1010 int c=0; 1011 int norm16=0; 1012 1013 // only for quick check 1014 int prevBoundary=src; 1015 int prevCC=0; 1016 1017 for(;;) { 1018 // count code units below the minimum or with irrelevant data for the quick check 1019 for(prevSrc=src; src!=limit;) { 1020 if( (c=s.charAt(src))<minNoCP || 1021 isMostDecompYesAndZeroCC(norm16=normTrie.bmpGet(c)) 1022 ) { 1023 ++src; 1024 } else if (!UTF16Plus.isLeadSurrogate(c)) { 1025 break; 1026 } else { 1027 char c2; 1028 if ((src + 1) != limit && Character.isLowSurrogate(c2 = s.charAt(src + 1))) { 1029 c = Character.toCodePoint((char)c, c2); 1030 norm16 = normTrie.suppGet(c); 1031 if (isMostDecompYesAndZeroCC(norm16)) { 1032 src += 2; 1033 } else { 1034 break; 1035 } 1036 } else { 1037 ++src; // unpaired lead surrogate: inert 1038 } 1039 } 1040 } 1041 // copy these code units all at once 1042 if(src!=prevSrc) { 1043 if(buffer!=null) { 1044 buffer.flushAndAppendZeroCC(s, prevSrc, src); 1045 } else { 1046 prevCC=0; 1047 prevBoundary=src; 1048 } 1049 } 1050 if(src==limit) { 1051 break; 1052 } 1053 1054 // Check one above-minimum, relevant code point. 
1055 src+=Character.charCount(c); 1056 if(buffer!=null) { 1057 decompose(c, norm16, buffer); 1058 } else { 1059 if(isDecompYes(norm16)) { 1060 int cc=getCCFromYesOrMaybe(norm16); 1061 if(prevCC<=cc || cc==0) { 1062 prevCC=cc; 1063 if(cc<=1) { 1064 prevBoundary=src; 1065 } 1066 continue; 1067 } 1068 } 1069 return prevBoundary; // "no" or cc out of order 1070 } 1071 } 1072 return src; 1073 } decomposeAndAppend(CharSequence s, boolean doDecompose, ReorderingBuffer buffer)1074 public void decomposeAndAppend(CharSequence s, boolean doDecompose, ReorderingBuffer buffer) { 1075 int limit=s.length(); 1076 if(limit==0) { 1077 return; 1078 } 1079 if(doDecompose) { 1080 decompose(s, 0, limit, buffer); 1081 return; 1082 } 1083 // Just merge the strings at the boundary. 1084 int c=Character.codePointAt(s, 0); 1085 int src=0; 1086 int firstCC, prevCC, cc; 1087 firstCC=prevCC=cc=getCC(getNorm16(c)); 1088 while(cc!=0) { 1089 prevCC=cc; 1090 src+=Character.charCount(c); 1091 if(src>=limit) { 1092 break; 1093 } 1094 c=Character.codePointAt(s, src); 1095 cc=getCC(getNorm16(c)); 1096 }; 1097 buffer.append(s, 0, src, false, firstCC, prevCC); 1098 buffer.append(s, src, limit); 1099 } 1100 1101 // Very similar to composeQuickCheck(): Make the same changes in both places if relevant. 1102 // doCompose: normalize 1103 // !doCompose: isNormalized (buffer must be empty and initialized) compose(CharSequence s, int src, int limit, boolean onlyContiguous, boolean doCompose, ReorderingBuffer buffer)1104 public boolean compose(CharSequence s, int src, int limit, 1105 boolean onlyContiguous, 1106 boolean doCompose, 1107 ReorderingBuffer buffer) { 1108 int prevBoundary=src; 1109 int minNoMaybeCP=minCompNoMaybeCP; 1110 1111 for (;;) { 1112 // Fast path: Scan over a sequence of characters below the minimum "no or maybe" code point, 1113 // or with (compYes && ccc==0) properties. 
1114 int prevSrc; 1115 int c = 0; 1116 int norm16 = 0; 1117 for (;;) { 1118 if (src == limit) { 1119 if (prevBoundary != limit && doCompose) { 1120 buffer.append(s, prevBoundary, limit); 1121 } 1122 return true; 1123 } 1124 if( (c=s.charAt(src))<minNoMaybeCP || 1125 isCompYesAndZeroCC(norm16=normTrie.bmpGet(c)) 1126 ) { 1127 ++src; 1128 } else { 1129 prevSrc = src++; 1130 if (!UTF16Plus.isLeadSurrogate(c)) { 1131 break; 1132 } else { 1133 char c2; 1134 if (src != limit && Character.isLowSurrogate(c2 = s.charAt(src))) { 1135 ++src; 1136 c = Character.toCodePoint((char)c, c2); 1137 norm16 = normTrie.suppGet(c); 1138 if (!isCompYesAndZeroCC(norm16)) { 1139 break; 1140 } 1141 } 1142 } 1143 } 1144 } 1145 // isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo. 1146 // The current character is either a "noNo" (has a mapping) 1147 // or a "maybeYes" (combines backward) 1148 // or a "yesYes" with ccc!=0. 1149 // It is not a Hangul syllable or Jamo L because those have "yes" properties. 1150 1151 // Medium-fast path: Handle cases that do not require full decomposition and recomposition. 1152 if (!isMaybeOrNonZeroCC(norm16)) { // minNoNo <= norm16 < minMaybeYes 1153 if (!doCompose) { 1154 return false; 1155 } 1156 // Fast path for mapping a character that is immediately surrounded by boundaries. 1157 // In this case, we need not decompose around the current character. 1158 if (isDecompNoAlgorithmic(norm16)) { 1159 // Maps to a single isCompYesAndZeroCC character 1160 // which also implies hasCompBoundaryBefore. 1161 if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) || 1162 hasCompBoundaryBefore(s, src, limit)) { 1163 if (prevBoundary != prevSrc) { 1164 buffer.append(s, prevBoundary, prevSrc); 1165 } 1166 buffer.append(mapAlgorithmic(c, norm16), 0); 1167 prevBoundary = src; 1168 continue; 1169 } 1170 } else if (norm16 < minNoNoCompBoundaryBefore) { 1171 // The mapping is comp-normalized which also implies hasCompBoundaryBefore. 
1172 if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) || 1173 hasCompBoundaryBefore(s, src, limit)) { 1174 if (prevBoundary != prevSrc) { 1175 buffer.append(s, prevBoundary, prevSrc); 1176 } 1177 int mapping = norm16 >> OFFSET_SHIFT; 1178 int length = extraData.charAt(mapping++) & MAPPING_LENGTH_MASK; 1179 buffer.append(extraData, mapping, mapping + length); 1180 prevBoundary = src; 1181 continue; 1182 } 1183 } else if (norm16 >= minNoNoEmpty) { 1184 // The current character maps to nothing. 1185 // Simply omit it from the output if there is a boundary before _or_ after it. 1186 // The character itself implies no boundaries. 1187 if (hasCompBoundaryBefore(s, src, limit) || 1188 hasCompBoundaryAfter(s, prevBoundary, prevSrc, onlyContiguous)) { 1189 if (prevBoundary != prevSrc) { 1190 buffer.append(s, prevBoundary, prevSrc); 1191 } 1192 prevBoundary = src; 1193 continue; 1194 } 1195 } 1196 // Other "noNo" type, or need to examine more text around this character: 1197 // Fall through to the slow path. 1198 } else if (isJamoVT(norm16) && prevBoundary != prevSrc) { 1199 char prev=s.charAt(prevSrc-1); 1200 if(c<Hangul.JAMO_T_BASE) { 1201 // The current character is a Jamo Vowel, 1202 // compose with previous Jamo L and following Jamo T. 1203 char l = (char)(prev-Hangul.JAMO_L_BASE); 1204 if(l<Hangul.JAMO_L_COUNT) { 1205 if (!doCompose) { 1206 return false; 1207 } 1208 int t; 1209 if (src != limit && 1210 0 < (t = (s.charAt(src) - Hangul.JAMO_T_BASE)) && 1211 t < Hangul.JAMO_T_COUNT) { 1212 // The next character is a Jamo T. 1213 ++src; 1214 } else if (hasCompBoundaryBefore(s, src, limit)) { 1215 // No Jamo T follows, not even via decomposition. 1216 t = 0; 1217 } else { 1218 t = -1; 1219 } 1220 if (t >= 0) { 1221 int syllable = Hangul.HANGUL_BASE + 1222 (l*Hangul.JAMO_V_COUNT + (c-Hangul.JAMO_V_BASE)) * 1223 Hangul.JAMO_T_COUNT + t; 1224 --prevSrc; // Replace the Jamo L as well. 
1225 if (prevBoundary != prevSrc) { 1226 buffer.append(s, prevBoundary, prevSrc); 1227 } 1228 buffer.append((char)syllable); 1229 prevBoundary = src; 1230 continue; 1231 } 1232 // If we see L+V+x where x!=T then we drop to the slow path, 1233 // decompose and recompose. 1234 // This is to deal with NFKC finding normal L and V but a 1235 // compatibility variant of a T. 1236 // We need to either fully compose that combination here 1237 // (which would complicate the code and may not work with strange custom data) 1238 // or use the slow path. 1239 } 1240 } else if (Hangul.isHangulLV(prev)) { 1241 // The current character is a Jamo Trailing consonant, 1242 // compose with previous Hangul LV that does not contain a Jamo T. 1243 if (!doCompose) { 1244 return false; 1245 } 1246 int syllable = prev + c - Hangul.JAMO_T_BASE; 1247 --prevSrc; // Replace the Hangul LV as well. 1248 if (prevBoundary != prevSrc) { 1249 buffer.append(s, prevBoundary, prevSrc); 1250 } 1251 buffer.append((char)syllable); 1252 prevBoundary = src; 1253 continue; 1254 } 1255 // No matching context, or may need to decompose surrounding text first: 1256 // Fall through to the slow path. 1257 } else if (norm16 > JAMO_VT) { // norm16 >= MIN_YES_YES_WITH_CC 1258 // One or more combining marks that do not combine-back: 1259 // Check for canonical order, copy unchanged if ok and 1260 // if followed by a character with a boundary-before. 1261 int cc = getCCFromNormalYesOrMaybe(norm16); // cc!=0 1262 if (onlyContiguous /* FCC */ && getPreviousTrailCC(s, prevBoundary, prevSrc) > cc) { 1263 // Fails FCD test, need to decompose and contiguously recompose. 1264 if (!doCompose) { 1265 return false; 1266 } 1267 } else { 1268 // If !onlyContiguous (not FCC), then we ignore the tccc of 1269 // the previous character which passed the quick check "yes && ccc==0" test. 
1270 int n16; 1271 for (;;) { 1272 if (src == limit) { 1273 if (doCompose) { 1274 buffer.append(s, prevBoundary, limit); 1275 } 1276 return true; 1277 } 1278 int prevCC = cc; 1279 c = Character.codePointAt(s, src); 1280 n16 = normTrie.get(c); 1281 if (n16 >= MIN_YES_YES_WITH_CC) { 1282 cc = getCCFromNormalYesOrMaybe(n16); 1283 if (prevCC > cc) { 1284 if (!doCompose) { 1285 return false; 1286 } 1287 break; 1288 } 1289 } else { 1290 break; 1291 } 1292 src += Character.charCount(c); 1293 } 1294 // p is after the last in-order combining mark. 1295 // If there is a boundary here, then we continue with no change. 1296 if (norm16HasCompBoundaryBefore(n16)) { 1297 if (isCompYesAndZeroCC(n16)) { 1298 src += Character.charCount(c); 1299 } 1300 continue; 1301 } 1302 // Use the slow path. There is no boundary in [prevSrc, src[. 1303 } 1304 } 1305 1306 // Slow path: Find the nearest boundaries around the current character, 1307 // decompose and recompose. 1308 if (prevBoundary != prevSrc && !norm16HasCompBoundaryBefore(norm16)) { 1309 c = Character.codePointBefore(s, prevSrc); 1310 norm16 = normTrie.get(c); 1311 if (!norm16HasCompBoundaryAfter(norm16, onlyContiguous)) { 1312 prevSrc -= Character.charCount(c); 1313 } 1314 } 1315 if (doCompose && prevBoundary != prevSrc) { 1316 buffer.append(s, prevBoundary, prevSrc); 1317 } 1318 int recomposeStartIndex=buffer.length(); 1319 // We know there is not a boundary here. 1320 decomposeShort(s, prevSrc, src, false /* !stopAtCompBoundary */, onlyContiguous, 1321 buffer); 1322 // Decompose until the next boundary. 1323 src = decomposeShort(s, src, limit, true /* stopAtCompBoundary */, onlyContiguous, 1324 buffer); 1325 recompose(buffer, recomposeStartIndex, onlyContiguous); 1326 if(!doCompose) { 1327 if(!buffer.equals(s, prevSrc, src)) { 1328 return false; 1329 } 1330 buffer.remove(); 1331 } 1332 prevBoundary=src; 1333 } 1334 } 1335 1336 /** 1337 * Very similar to compose(): Make the same changes in both places if relevant. 
1338 * doSpan: spanQuickCheckYes (ignore bit 0 of the return value) 1339 * !doSpan: quickCheck 1340 * @return bits 31..1: spanQuickCheckYes (==s.length() if "yes") and 1341 * bit 0: set if "maybe"; otherwise, if the span length<s.length() 1342 * then the quick check result is "no" 1343 */ composeQuickCheck(CharSequence s, int src, int limit, boolean onlyContiguous, boolean doSpan)1344 public int composeQuickCheck(CharSequence s, int src, int limit, 1345 boolean onlyContiguous, boolean doSpan) { 1346 int qcResult=0; 1347 int prevBoundary=src; 1348 int minNoMaybeCP=minCompNoMaybeCP; 1349 1350 for(;;) { 1351 // Fast path: Scan over a sequence of characters below the minimum "no or maybe" code point, 1352 // or with (compYes && ccc==0) properties. 1353 int prevSrc; 1354 int c = 0; 1355 int norm16 = 0; 1356 for (;;) { 1357 if(src==limit) { 1358 return (src<<1)|qcResult; // "yes" or "maybe" 1359 } 1360 if( (c=s.charAt(src))<minNoMaybeCP || 1361 isCompYesAndZeroCC(norm16=normTrie.bmpGet(c)) 1362 ) { 1363 ++src; 1364 } else { 1365 prevSrc = src++; 1366 if (!UTF16Plus.isLeadSurrogate(c)) { 1367 break; 1368 } else { 1369 char c2; 1370 if (src != limit && Character.isLowSurrogate(c2 = s.charAt(src))) { 1371 ++src; 1372 c = Character.toCodePoint((char)c, c2); 1373 norm16 = normTrie.suppGet(c); 1374 if (!isCompYesAndZeroCC(norm16)) { 1375 break; 1376 } 1377 } 1378 } 1379 } 1380 } 1381 // isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo. 1382 // The current character is either a "noNo" (has a mapping) 1383 // or a "maybeYes" (combines backward) 1384 // or a "yesYes" with ccc!=0. 1385 // It is not a Hangul syllable or Jamo L because those have "yes" properties. 
1386 1387 int prevNorm16 = INERT; 1388 if (prevBoundary != prevSrc) { 1389 prevBoundary = prevSrc; 1390 if (!norm16HasCompBoundaryBefore(norm16)) { 1391 c = Character.codePointBefore(s, prevSrc); 1392 int n16 = getNorm16(c); 1393 if (!norm16HasCompBoundaryAfter(n16, onlyContiguous)) { 1394 prevBoundary -= Character.charCount(c); 1395 prevNorm16 = n16; 1396 } 1397 } 1398 } 1399 1400 if(isMaybeOrNonZeroCC(norm16)) { 1401 int cc=getCCFromYesOrMaybe(norm16); 1402 if (onlyContiguous /* FCC */ && cc != 0 && 1403 getTrailCCFromCompYesAndZeroCC(prevNorm16) > cc) { 1404 // The [prevBoundary..prevSrc[ character 1405 // passed the quick check "yes && ccc==0" test 1406 // but is out of canonical order with the current combining mark. 1407 } else { 1408 // If !onlyContiguous (not FCC), then we ignore the tccc of 1409 // the previous character which passed the quick check "yes && ccc==0" test. 1410 for (;;) { 1411 if (norm16 < MIN_YES_YES_WITH_CC) { 1412 if (!doSpan) { 1413 qcResult = 1; 1414 } else { 1415 return prevBoundary << 1; // spanYes does not care to know it's "maybe" 1416 } 1417 } 1418 if (src == limit) { 1419 return (src<<1) | qcResult; // "yes" or "maybe" 1420 } 1421 int prevCC = cc; 1422 c = Character.codePointAt(s, src); 1423 norm16 = getNorm16(c); 1424 if (isMaybeOrNonZeroCC(norm16)) { 1425 cc = getCCFromYesOrMaybe(norm16); 1426 if (!(prevCC <= cc || cc == 0)) { 1427 break; 1428 } 1429 } else { 1430 break; 1431 } 1432 src += Character.charCount(c); 1433 } 1434 // src is after the last in-order combining mark. 
1435 if (isCompYesAndZeroCC(norm16)) { 1436 prevBoundary = src; 1437 src += Character.charCount(c); 1438 continue; 1439 } 1440 } 1441 } 1442 return prevBoundary<<1; // "no" 1443 } 1444 } composeAndAppend(CharSequence s, boolean doCompose, boolean onlyContiguous, ReorderingBuffer buffer)1445 public void composeAndAppend(CharSequence s, 1446 boolean doCompose, 1447 boolean onlyContiguous, 1448 ReorderingBuffer buffer) { 1449 int src=0, limit=s.length(); 1450 if(!buffer.isEmpty()) { 1451 int firstStarterInSrc=findNextCompBoundary(s, 0, limit, onlyContiguous); 1452 if(0!=firstStarterInSrc) { 1453 int lastStarterInDest=findPreviousCompBoundary(buffer.getStringBuilder(), 1454 buffer.length(), onlyContiguous); 1455 StringBuilder middle=new StringBuilder((buffer.length()-lastStarterInDest)+ 1456 firstStarterInSrc+16); 1457 middle.append(buffer.getStringBuilder(), lastStarterInDest, buffer.length()); 1458 buffer.removeSuffix(buffer.length()-lastStarterInDest); 1459 middle.append(s, 0, firstStarterInSrc); 1460 compose(middle, 0, middle.length(), onlyContiguous, true, buffer); 1461 src=firstStarterInSrc; 1462 } 1463 } 1464 if(doCompose) { 1465 compose(s, src, limit, onlyContiguous, true, buffer); 1466 } else { 1467 buffer.append(s, src, limit); 1468 } 1469 } 1470 // Dual functionality: 1471 // buffer!=NULL: normalize 1472 // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes makeFCD(CharSequence s, int src, int limit, ReorderingBuffer buffer)1473 public int makeFCD(CharSequence s, int src, int limit, ReorderingBuffer buffer) { 1474 // Note: In this function we use buffer->appendZeroCC() because we track 1475 // the lead and trail combining classes here, rather than leaving it to 1476 // the ReorderingBuffer. 1477 // The exception is the call to decomposeShort() which uses the buffer 1478 // in the normal way. 1479 1480 // Tracks the last FCD-safe boundary, before lccc=0 or after properly-ordered tccc<=1. 
1481 // Similar to the prevBoundary in the compose() implementation. 1482 int prevBoundary=src; 1483 int prevSrc; 1484 int c=0; 1485 int prevFCD16=0; 1486 int fcd16=0; 1487 1488 for(;;) { 1489 // count code units with lccc==0 1490 for(prevSrc=src; src!=limit;) { 1491 if((c=s.charAt(src))<minLcccCP) { 1492 prevFCD16=~c; 1493 ++src; 1494 } else if(!singleLeadMightHaveNonZeroFCD16(c)) { 1495 prevFCD16=0; 1496 ++src; 1497 } else { 1498 if (UTF16Plus.isLeadSurrogate(c)) { 1499 char c2; 1500 if ((src + 1) != limit && Character.isLowSurrogate(c2 = s.charAt(src + 1))) { 1501 c = Character.toCodePoint((char)c, c2); 1502 } 1503 } 1504 if((fcd16=getFCD16FromNormData(c))<=0xff) { 1505 prevFCD16=fcd16; 1506 src+=Character.charCount(c); 1507 } else { 1508 break; 1509 } 1510 } 1511 } 1512 // copy these code units all at once 1513 if(src!=prevSrc) { 1514 if(src==limit) { 1515 if(buffer!=null) { 1516 buffer.flushAndAppendZeroCC(s, prevSrc, src); 1517 } 1518 break; 1519 } 1520 prevBoundary=src; 1521 // We know that the previous character's lccc==0. 1522 if(prevFCD16<0) { 1523 // Fetching the fcd16 value was deferred for this below-minLcccCP code point. 1524 int prev=~prevFCD16; 1525 if(prev<minDecompNoCP) { 1526 prevFCD16=0; 1527 } else { 1528 prevFCD16=getFCD16FromNormData(prev); 1529 if(prevFCD16>1) { 1530 --prevBoundary; 1531 } 1532 } 1533 } else { 1534 int p=src-1; 1535 if( Character.isLowSurrogate(s.charAt(p)) && prevSrc<p && 1536 Character.isHighSurrogate(s.charAt(p-1)) 1537 ) { 1538 --p; 1539 // Need to fetch the previous character's FCD value because 1540 // prevFCD16 was just for the trail surrogate code point. 1541 prevFCD16=getFCD16FromNormData(Character.toCodePoint(s.charAt(p), s.charAt(p+1))); 1542 // Still known to have lccc==0 because its lead surrogate unit had lccc==0. 
1543 } 1544 if(prevFCD16>1) { 1545 prevBoundary=p; 1546 } 1547 } 1548 if(buffer!=null) { 1549 // The last lccc==0 character is excluded from the 1550 // flush-and-append call in case it needs to be modified. 1551 buffer.flushAndAppendZeroCC(s, prevSrc, prevBoundary); 1552 buffer.append(s, prevBoundary, src); 1553 } 1554 // The start of the current character (c). 1555 prevSrc=src; 1556 } else if(src==limit) { 1557 break; 1558 } 1559 1560 src+=Character.charCount(c); 1561 // The current character (c) at [prevSrc..src[ has a non-zero lead combining class. 1562 // Check for proper order, and decompose locally if necessary. 1563 if((prevFCD16&0xff)<=(fcd16>>8)) { 1564 // proper order: prev tccc <= current lccc 1565 if((fcd16&0xff)<=1) { 1566 prevBoundary=src; 1567 } 1568 if(buffer!=null) { 1569 buffer.appendZeroCC(c); 1570 } 1571 prevFCD16=fcd16; 1572 continue; 1573 } else if(buffer==null) { 1574 return prevBoundary; // quick check "no" 1575 } else { 1576 /* 1577 * Back out the part of the source that we copied or appended 1578 * already but is now going to be decomposed. 1579 * prevSrc is set to after what was copied/appended. 1580 */ 1581 buffer.removeSuffix(prevSrc-prevBoundary); 1582 /* 1583 * Find the part of the source that needs to be decomposed, 1584 * up to the next safe boundary. 1585 */ 1586 src=findNextFCDBoundary(s, src, limit); 1587 /* 1588 * The source text does not fulfill the conditions for FCD. 1589 * Decompose and reorder a limited piece of the text. 
1590 */ 1591 decomposeShort(s, prevBoundary, src, false, false, buffer); 1592 prevBoundary=src; 1593 prevFCD16=0; 1594 } 1595 } 1596 return src; 1597 } makeFCDAndAppend(CharSequence s, boolean doMakeFCD, ReorderingBuffer buffer)1598 public void makeFCDAndAppend(CharSequence s, boolean doMakeFCD, ReorderingBuffer buffer) { 1599 int src=0, limit=s.length(); 1600 if(!buffer.isEmpty()) { 1601 int firstBoundaryInSrc=findNextFCDBoundary(s, 0, limit); 1602 if(0!=firstBoundaryInSrc) { 1603 int lastBoundaryInDest=findPreviousFCDBoundary(buffer.getStringBuilder(), 1604 buffer.length()); 1605 StringBuilder middle=new StringBuilder((buffer.length()-lastBoundaryInDest)+ 1606 firstBoundaryInSrc+16); 1607 middle.append(buffer.getStringBuilder(), lastBoundaryInDest, buffer.length()); 1608 buffer.removeSuffix(buffer.length()-lastBoundaryInDest); 1609 middle.append(s, 0, firstBoundaryInSrc); 1610 makeFCD(middle, 0, middle.length(), buffer); 1611 src=firstBoundaryInSrc; 1612 } 1613 } 1614 if(doMakeFCD) { 1615 makeFCD(s, src, limit, buffer); 1616 } else { 1617 buffer.append(s, src, limit); 1618 } 1619 } 1620 hasDecompBoundaryBefore(int c)1621 public boolean hasDecompBoundaryBefore(int c) { 1622 return c < minLcccCP || (c <= 0xffff && !singleLeadMightHaveNonZeroFCD16(c)) || 1623 norm16HasDecompBoundaryBefore(getNorm16(c)); 1624 } norm16HasDecompBoundaryBefore(int norm16)1625 public boolean norm16HasDecompBoundaryBefore(int norm16) { 1626 if (norm16 < minNoNoCompNoMaybeCC) { 1627 return true; 1628 } 1629 if (norm16 >= limitNoNo) { 1630 return norm16 <= MIN_NORMAL_MAYBE_YES || norm16 == JAMO_VT; 1631 } 1632 // c decomposes, get everything from the variable-length extra data 1633 int mapping=norm16>>OFFSET_SHIFT; 1634 int firstUnit=extraData.charAt(mapping); 1635 // true if leadCC==0 (hasFCDBoundaryBefore()) 1636 return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (extraData.charAt(mapping-1)&0xff00)==0; 1637 } hasDecompBoundaryAfter(int c)1638 public boolean hasDecompBoundaryAfter(int c) 
{ 1639 if (c < minDecompNoCP) { 1640 return true; 1641 } 1642 if (c <= 0xffff && !singleLeadMightHaveNonZeroFCD16(c)) { 1643 return true; 1644 } 1645 return norm16HasDecompBoundaryAfter(getNorm16(c)); 1646 } norm16HasDecompBoundaryAfter(int norm16)1647 public boolean norm16HasDecompBoundaryAfter(int norm16) { 1648 if(norm16 <= minYesNo || isHangulLVT(norm16)) { 1649 return true; 1650 } 1651 if (norm16 >= limitNoNo) { 1652 if (isMaybeOrNonZeroCC(norm16)) { 1653 return norm16 <= MIN_NORMAL_MAYBE_YES || norm16 == JAMO_VT; 1654 } 1655 // Maps to an isCompYesAndZeroCC. 1656 return (norm16 & DELTA_TCCC_MASK) <= DELTA_TCCC_1; 1657 } 1658 // c decomposes, get everything from the variable-length extra data 1659 int mapping=norm16>>OFFSET_SHIFT; 1660 int firstUnit=extraData.charAt(mapping); 1661 // decomp after-boundary: same as hasFCDBoundaryAfter(), 1662 // fcd16<=1 || trailCC==0 1663 if(firstUnit>0x1ff) { 1664 return false; // trailCC>1 1665 } 1666 if(firstUnit<=0xff) { 1667 return true; // trailCC==0 1668 } 1669 // if(trailCC==1) test leadCC==0, same as checking for before-boundary 1670 // true if leadCC==0 (hasFCDBoundaryBefore()) 1671 return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (extraData.charAt(mapping-1)&0xff00)==0; 1672 } isDecompInert(int c)1673 public boolean isDecompInert(int c) { return isDecompYesAndZeroCC(getNorm16(c)); } 1674 hasCompBoundaryBefore(int c)1675 public boolean hasCompBoundaryBefore(int c) { 1676 return c<minCompNoMaybeCP || norm16HasCompBoundaryBefore(getNorm16(c)); 1677 } hasCompBoundaryAfter(int c, boolean onlyContiguous)1678 public boolean hasCompBoundaryAfter(int c, boolean onlyContiguous) { 1679 return norm16HasCompBoundaryAfter(getNorm16(c), onlyContiguous); 1680 } isCompInert(int c, boolean onlyContiguous)1681 public boolean isCompInert(int c, boolean onlyContiguous) { 1682 int norm16=getNorm16(c); 1683 return isCompYesAndZeroCC(norm16) && 1684 (norm16 & HAS_COMP_BOUNDARY_AFTER) != 0 && 1685 (!onlyContiguous || isInert(norm16) || 
extraData.charAt(norm16>>OFFSET_SHIFT) <= 0x1ff); 1686 } 1687 hasFCDBoundaryBefore(int c)1688 public boolean hasFCDBoundaryBefore(int c) { return hasDecompBoundaryBefore(c); } hasFCDBoundaryAfter(int c)1689 public boolean hasFCDBoundaryAfter(int c) { return hasDecompBoundaryAfter(c); } isFCDInert(int c)1690 public boolean isFCDInert(int c) { return getFCD16(c)<=1; } 1691 isMaybe(int norm16)1692 private boolean isMaybe(int norm16) { return minMaybeYes<=norm16 && norm16<=JAMO_VT; } isMaybeOrNonZeroCC(int norm16)1693 private boolean isMaybeOrNonZeroCC(int norm16) { return norm16>=minMaybeYes; } isInert(int norm16)1694 private static boolean isInert(int norm16) { return norm16==INERT; } isJamoL(int norm16)1695 private static boolean isJamoL(int norm16) { return norm16==JAMO_L; } isJamoVT(int norm16)1696 private static boolean isJamoVT(int norm16) { return norm16==JAMO_VT; } hangulLVT()1697 private int hangulLVT() { return minYesNoMappingsOnly|HAS_COMP_BOUNDARY_AFTER; } isHangulLV(int norm16)1698 private boolean isHangulLV(int norm16) { return norm16==minYesNo; } isHangulLVT(int norm16)1699 private boolean isHangulLVT(int norm16) { 1700 return norm16==hangulLVT(); 1701 } isCompYesAndZeroCC(int norm16)1702 private boolean isCompYesAndZeroCC(int norm16) { return norm16<minNoNo; } 1703 // UBool isCompYes(uint16_t norm16) const { 1704 // return norm16>=MIN_YES_YES_WITH_CC || norm16<minNoNo; 1705 // } 1706 // UBool isCompYesOrMaybe(uint16_t norm16) const { 1707 // return norm16<minNoNo || minMaybeYes<=norm16; 1708 // } 1709 // private boolean hasZeroCCFromDecompYes(int norm16) { 1710 // return norm16<=MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT; 1711 // } isDecompYesAndZeroCC(int norm16)1712 private boolean isDecompYesAndZeroCC(int norm16) { 1713 return norm16<minYesNo || 1714 norm16==JAMO_VT || 1715 (minMaybeYes<=norm16 && norm16<=MIN_NORMAL_MAYBE_YES); 1716 } 1717 /** 1718 * A little faster and simpler than isDecompYesAndZeroCC() but does not include 1719 * the MaybeYes which 
combine-forward and have ccc=0.
     * (Standard Unicode 10 normalization does not have such characters.)
     */
    private boolean isMostDecompYesAndZeroCC(int norm16) {
        return norm16<minYesNo || norm16==MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT;
    }
    private boolean isDecompNoAlgorithmic(int norm16) { return norm16>=limitNoNo; }

    // For use with isCompYes().
    // Perhaps the compiler can combine the two tests for MIN_YES_YES_WITH_CC.
    // static uint8_t getCCFromYes(uint16_t norm16) {
    //     return norm16>=MIN_YES_YES_WITH_CC ? getCCFromNormalYesOrMaybe(norm16) : 0;
    // }

    // Reads the low byte of the ccc/lccc word that precedes the mapping's first unit,
    // or returns 0 if the first unit does not flag such a word.
    private int getCCFromNoNo(int norm16) {
        int mapping=norm16>>OFFSET_SHIFT;
        if((extraData.charAt(mapping)&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
            return extraData.charAt(mapping-1)&0xff;
        } else {
            return 0;
        }
    }
    // Returns the trail combining class (high byte of the mapping's first unit)
    // for a norm16 that passed isCompYesAndZeroCC().
    int getTrailCCFromCompYesAndZeroCC(int norm16) {
        if(norm16<=minYesNo) {
            return 0;  // yesYes and Hangul LV have ccc=tccc=0
        } else {
            // For Hangul LVT we harmlessly fetch a firstUnit with tccc=0 here.
            return extraData.charAt(norm16>>OFFSET_SHIFT)>>8;  // tccc from yesNo
        }
    }

    // Requires algorithmic-NoNo.
    // The delta stored in norm16 is biased by centerNoNoDelta.
    private int mapAlgorithmic(int c, int norm16) {
        return c+(norm16>>DELTA_SHIFT)-centerNoNoDelta;
    }

    // Requires minYesNo<norm16<limitNoNo.
1755 // private int getMapping(int norm16) { return extraData+(norm16>>OFFSET_SHIFT); } 1756 1757 /** 1758 * @return index into maybeYesCompositions, or -1 1759 */ getCompositionsListForDecompYes(int norm16)1760 private int getCompositionsListForDecompYes(int norm16) { 1761 if(norm16<JAMO_L || MIN_NORMAL_MAYBE_YES<=norm16) { 1762 return -1; 1763 } else { 1764 if((norm16-=minMaybeYes)<0) { 1765 // norm16<minMaybeYes: index into extraData which is a substring at 1766 // maybeYesCompositions[MIN_NORMAL_MAYBE_YES-minMaybeYes] 1767 // same as (MIN_NORMAL_MAYBE_YES-minMaybeYes)+norm16 1768 norm16+=MIN_NORMAL_MAYBE_YES; // for yesYes; if Jamo L: harmless empty list 1769 } 1770 return norm16>>OFFSET_SHIFT; 1771 } 1772 } 1773 /** 1774 * @return index into maybeYesCompositions 1775 */ getCompositionsListForComposite(int norm16)1776 private int getCompositionsListForComposite(int norm16) { 1777 // A composite has both mapping & compositions list. 1778 int list=((MIN_NORMAL_MAYBE_YES-minMaybeYes)+norm16)>>OFFSET_SHIFT; 1779 int firstUnit=maybeYesCompositions.charAt(list); 1780 return list+ // mapping in maybeYesCompositions 1781 1+ // +1 to skip the first unit with the mapping length 1782 (firstUnit&MAPPING_LENGTH_MASK); // + mapping length 1783 } getCompositionsListForMaybe(int norm16)1784 private int getCompositionsListForMaybe(int norm16) { 1785 // minMaybeYes<=norm16<MIN_NORMAL_MAYBE_YES 1786 return (norm16-minMaybeYes)>>OFFSET_SHIFT; 1787 } 1788 /** 1789 * @param c code point must have compositions 1790 * @return index into maybeYesCompositions 1791 */ getCompositionsList(int norm16)1792 private int getCompositionsList(int norm16) { 1793 return isDecompYes(norm16) ? 1794 getCompositionsListForDecompYes(norm16) : 1795 getCompositionsListForComposite(norm16); 1796 } 1797 1798 // Decompose a short piece of text which is likely to contain characters that 1799 // fail the quick check loop and/or where the quick check loop's overhead 1800 // is unlikely to be amortized. 
1801 // Called by the compose() and makeFCD() implementations. 1802 // Public in Java for collation implementation code. decomposeShort( CharSequence s, int src, int limit, boolean stopAtCompBoundary, boolean onlyContiguous, ReorderingBuffer buffer)1803 private int decomposeShort( 1804 CharSequence s, int src, int limit, 1805 boolean stopAtCompBoundary, boolean onlyContiguous, 1806 ReorderingBuffer buffer) { 1807 while(src<limit) { 1808 int c=Character.codePointAt(s, src); 1809 if (stopAtCompBoundary && c < minCompNoMaybeCP) { 1810 return src; 1811 } 1812 int norm16 = getNorm16(c); 1813 if (stopAtCompBoundary && norm16HasCompBoundaryBefore(norm16)) { 1814 return src; 1815 } 1816 src+=Character.charCount(c); 1817 decompose(c, norm16, buffer); 1818 if (stopAtCompBoundary && norm16HasCompBoundaryAfter(norm16, onlyContiguous)) { 1819 return src; 1820 } 1821 } 1822 return src; 1823 } decompose(int c, int norm16, ReorderingBuffer buffer)1824 private void decompose(int c, int norm16, ReorderingBuffer buffer) { 1825 // get the decomposition and the lead and trail cc's 1826 if (norm16 >= limitNoNo) { 1827 if (isMaybeOrNonZeroCC(norm16)) { 1828 buffer.append(c, getCCFromYesOrMaybe(norm16)); 1829 return; 1830 } 1831 // Maps to an isCompYesAndZeroCC. 
1832 c=mapAlgorithmic(c, norm16); 1833 norm16 = getRawNorm16(c); 1834 } 1835 if (norm16 < minYesNo) { 1836 // c does not decompose 1837 buffer.append(c, 0); 1838 } else if(isHangulLV(norm16) || isHangulLVT(norm16)) { 1839 // Hangul syllable: decompose algorithmically 1840 Hangul.decompose(c, buffer); 1841 } else { 1842 // c decomposes, get everything from the variable-length extra data 1843 int mapping=norm16>>OFFSET_SHIFT; 1844 int firstUnit=extraData.charAt(mapping); 1845 int length=firstUnit&MAPPING_LENGTH_MASK; 1846 int leadCC, trailCC; 1847 trailCC=firstUnit>>8; 1848 if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) { 1849 leadCC=extraData.charAt(mapping-1)>>8; 1850 } else { 1851 leadCC=0; 1852 } 1853 ++mapping; // skip over the firstUnit 1854 buffer.append(extraData, mapping, mapping+length, true, leadCC, trailCC); 1855 } 1856 } 1857 1858 /** 1859 * Finds the recomposition result for 1860 * a forward-combining "lead" character, 1861 * specified with a pointer to its compositions list, 1862 * and a backward-combining "trail" character. 1863 * 1864 * <p>If the lead and trail characters combine, then this function returns 1865 * the following "compositeAndFwd" value: 1866 * <pre> 1867 * Bits 21..1 composite character 1868 * Bit 0 set if the composite is a forward-combining starter 1869 * </pre> 1870 * otherwise it returns -1. 1871 * 1872 * <p>The compositions list has (trail, compositeAndFwd) pair entries, 1873 * encoded as either pairs or triples of 16-bit units. 1874 * The last entry has the high bit of its first unit set. 1875 * 1876 * <p>The list is sorted by ascending trail characters (there are no duplicates). 1877 * A linear search is used. 1878 * 1879 * <p>See normalizer2impl.h for a more detailed description 1880 * of the compositions list format. 
1881 */ combine(String compositions, int list, int trail)1882 private static int combine(String compositions, int list, int trail) { 1883 int key1, firstUnit; 1884 if(trail<COMP_1_TRAIL_LIMIT) { 1885 // trail character is 0..33FF 1886 // result entry may have 2 or 3 units 1887 key1=(trail<<1); 1888 while(key1>(firstUnit=compositions.charAt(list))) { 1889 list+=2+(firstUnit&COMP_1_TRIPLE); 1890 } 1891 if(key1==(firstUnit&COMP_1_TRAIL_MASK)) { 1892 if((firstUnit&COMP_1_TRIPLE)!=0) { 1893 return (compositions.charAt(list+1)<<16)|compositions.charAt(list+2); 1894 } else { 1895 return compositions.charAt(list+1); 1896 } 1897 } 1898 } else { 1899 // trail character is 3400..10FFFF 1900 // result entry has 3 units 1901 key1=COMP_1_TRAIL_LIMIT+(((trail>>COMP_1_TRAIL_SHIFT))&~COMP_1_TRIPLE); 1902 int key2=(trail<<COMP_2_TRAIL_SHIFT)&0xffff; 1903 int secondUnit; 1904 for(;;) { 1905 if(key1>(firstUnit=compositions.charAt(list))) { 1906 list+=2+(firstUnit&COMP_1_TRIPLE); 1907 } else if(key1==(firstUnit&COMP_1_TRAIL_MASK)) { 1908 if(key2>(secondUnit=compositions.charAt(list+1))) { 1909 if((firstUnit&COMP_1_LAST_TUPLE)!=0) { 1910 break; 1911 } else { 1912 list+=3; 1913 } 1914 } else if(key2==(secondUnit&COMP_2_TRAIL_MASK)) { 1915 return ((secondUnit&~COMP_2_TRAIL_MASK)<<16)|compositions.charAt(list+2); 1916 } else { 1917 break; 1918 } 1919 } else { 1920 break; 1921 } 1922 } 1923 } 1924 return -1; 1925 } 1926 /** 1927 * @param list some character's compositions list 1928 * @param set recursively receives the composites from these compositions 1929 */ addComposites(int list, UnicodeSet set)1930 private void addComposites(int list, UnicodeSet set) { 1931 int firstUnit, compositeAndFwd; 1932 do { 1933 firstUnit=maybeYesCompositions.charAt(list); 1934 if((firstUnit&COMP_1_TRIPLE)==0) { 1935 compositeAndFwd=maybeYesCompositions.charAt(list+1); 1936 list+=2; 1937 } else { 1938 compositeAndFwd=((maybeYesCompositions.charAt(list+1)&~COMP_2_TRAIL_MASK)<<16)| 1939 
maybeYesCompositions.charAt(list+2); 1940 list+=3; 1941 } 1942 int composite=compositeAndFwd>>1; 1943 if((compositeAndFwd&1)!=0) { 1944 addComposites(getCompositionsListForComposite(getRawNorm16(composite)), set); 1945 } 1946 set.add(composite); 1947 } while((firstUnit&COMP_1_LAST_TUPLE)==0); 1948 } 1949 /* 1950 * Recomposes the buffer text starting at recomposeStartIndex 1951 * (which is in NFD - decomposed and canonically ordered), 1952 * and truncates the buffer contents. 1953 * 1954 * Note that recomposition never lengthens the text: 1955 * Any character consists of either one or two code units; 1956 * a composition may contain at most one more code unit than the original starter, 1957 * while the combining mark that is removed has at least one code unit. 1958 */ recompose(ReorderingBuffer buffer, int recomposeStartIndex, boolean onlyContiguous)1959 private void recompose(ReorderingBuffer buffer, int recomposeStartIndex, 1960 boolean onlyContiguous) { 1961 StringBuilder sb=buffer.getStringBuilder(); 1962 int p=recomposeStartIndex; 1963 if(p==sb.length()) { 1964 return; 1965 } 1966 1967 int starter, pRemove; 1968 int compositionsList; 1969 int c, compositeAndFwd; 1970 int norm16; 1971 int cc, prevCC; 1972 boolean starterIsSupplementary; 1973 1974 // Some of the following variables are not used until we have a forward-combining starter 1975 // and are only initialized now to avoid compiler warnings. 
1976 compositionsList=-1; // used as indicator for whether we have a forward-combining starter 1977 starter=-1; 1978 starterIsSupplementary=false; 1979 prevCC=0; 1980 1981 for(;;) { 1982 c=sb.codePointAt(p); 1983 p+=Character.charCount(c); 1984 norm16=getNorm16(c); 1985 cc=getCCFromYesOrMaybe(norm16); 1986 if( // this character combines backward and 1987 isMaybe(norm16) && 1988 // we have seen a starter that combines forward and 1989 compositionsList>=0 && 1990 // the backward-combining character is not blocked 1991 (prevCC<cc || prevCC==0) 1992 ) { 1993 if(isJamoVT(norm16)) { 1994 // c is a Jamo V/T, see if we can compose it with the previous character. 1995 if(c<Hangul.JAMO_T_BASE) { 1996 // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T. 1997 char prev=(char)(sb.charAt(starter)-Hangul.JAMO_L_BASE); 1998 if(prev<Hangul.JAMO_L_COUNT) { 1999 pRemove=p-1; 2000 char syllable=(char) 2001 (Hangul.HANGUL_BASE+ 2002 (prev*Hangul.JAMO_V_COUNT+(c-Hangul.JAMO_V_BASE))* 2003 Hangul.JAMO_T_COUNT); 2004 char t; 2005 if(p!=sb.length() && (t=(char)(sb.charAt(p)-Hangul.JAMO_T_BASE))<Hangul.JAMO_T_COUNT) { 2006 ++p; 2007 syllable+=t; // The next character was a Jamo T. 2008 } 2009 sb.setCharAt(starter, syllable); 2010 // remove the Jamo V/T 2011 sb.delete(pRemove, p); 2012 p=pRemove; 2013 } 2014 } 2015 /* 2016 * No "else" for Jamo T: 2017 * Since the input is in NFD, there are no Hangul LV syllables that 2018 * a Jamo T could combine with. 2019 * All Jamo Ts are combined above when handling Jamo Vs. 2020 */ 2021 if(p==sb.length()) { 2022 break; 2023 } 2024 compositionsList=-1; 2025 continue; 2026 } else if((compositeAndFwd=combine(maybeYesCompositions, compositionsList, c))>=0) { 2027 // The starter and the combining mark (c) do combine. 2028 int composite=compositeAndFwd>>1; 2029 2030 // Remove the combining mark. 
2031 pRemove=p-Character.charCount(c); // pRemove & p: start & limit of the combining mark 2032 sb.delete(pRemove, p); 2033 p=pRemove; 2034 // Replace the starter with the composite. 2035 if(starterIsSupplementary) { 2036 if(composite>0xffff) { 2037 // both are supplementary 2038 sb.setCharAt(starter, UTF16.getLeadSurrogate(composite)); 2039 sb.setCharAt(starter+1, UTF16.getTrailSurrogate(composite)); 2040 } else { 2041 sb.setCharAt(starter, (char)c); 2042 sb.deleteCharAt(starter+1); 2043 // The composite is shorter than the starter, 2044 // move the intermediate characters forward one. 2045 starterIsSupplementary=false; 2046 --p; 2047 } 2048 } else if(composite>0xffff) { 2049 // The composite is longer than the starter, 2050 // move the intermediate characters back one. 2051 starterIsSupplementary=true; 2052 sb.setCharAt(starter, UTF16.getLeadSurrogate(composite)); 2053 sb.insert(starter+1, UTF16.getTrailSurrogate(composite)); 2054 ++p; 2055 } else { 2056 // both are on the BMP 2057 sb.setCharAt(starter, (char)composite); 2058 } 2059 2060 // Keep prevCC because we removed the combining mark. 2061 2062 if(p==sb.length()) { 2063 break; 2064 } 2065 // Is the composite a starter that combines forward? 2066 if((compositeAndFwd&1)!=0) { 2067 compositionsList= 2068 getCompositionsListForComposite(getRawNorm16(composite)); 2069 } else { 2070 compositionsList=-1; 2071 } 2072 2073 // We combined; continue with looking for compositions. 2074 continue; 2075 } 2076 } 2077 2078 // no combination this time 2079 prevCC=cc; 2080 if(p==sb.length()) { 2081 break; 2082 } 2083 2084 // If c did not combine, then check if it is a starter. 2085 if(cc==0) { 2086 // Found a new starter. 2087 if((compositionsList=getCompositionsListForDecompYes(norm16))>=0) { 2088 // It may combine with something, prepare for it. 
2089 if(c<=0xffff) { 2090 starterIsSupplementary=false; 2091 starter=p-1; 2092 } else { 2093 starterIsSupplementary=true; 2094 starter=p-2; 2095 } 2096 } 2097 } else if(onlyContiguous) { 2098 // FCC: no discontiguous compositions; any intervening character blocks. 2099 compositionsList=-1; 2100 } 2101 } 2102 buffer.flush(); 2103 } 2104 composePair(int a, int b)2105 public int composePair(int a, int b) { 2106 int norm16=getNorm16(a); // maps an out-of-range 'a' to inert norm16 2107 int list; 2108 if(isInert(norm16)) { 2109 return -1; 2110 } else if(norm16<minYesNoMappingsOnly) { 2111 // a combines forward. 2112 if(isJamoL(norm16)) { 2113 b-=Hangul.JAMO_V_BASE; 2114 if(0<=b && b<Hangul.JAMO_V_COUNT) { 2115 return 2116 (Hangul.HANGUL_BASE+ 2117 ((a-Hangul.JAMO_L_BASE)*Hangul.JAMO_V_COUNT+b)* 2118 Hangul.JAMO_T_COUNT); 2119 } else { 2120 return -1; 2121 } 2122 } else if(isHangulLV(norm16)) { 2123 b-=Hangul.JAMO_T_BASE; 2124 if(0<b && b<Hangul.JAMO_T_COUNT) { // not b==0! 2125 return a+b; 2126 } else { 2127 return -1; 2128 } 2129 } else { 2130 // 'a' has a compositions list in extraData 2131 list=((MIN_NORMAL_MAYBE_YES-minMaybeYes)+norm16)>>OFFSET_SHIFT; 2132 if(norm16>minYesNo) { // composite 'a' has both mapping & compositions list 2133 list+= // mapping pointer 2134 1+ // +1 to skip the first unit with the mapping length 2135 (maybeYesCompositions.charAt(list)&MAPPING_LENGTH_MASK); // + mapping length 2136 } 2137 } 2138 } else if(norm16<minMaybeYes || MIN_NORMAL_MAYBE_YES<=norm16) { 2139 return -1; 2140 } else { 2141 list=getCompositionsListForMaybe(norm16); // offset into maybeYesCompositions 2142 } 2143 if(b<0 || 0x10ffff<b) { // combine(list, b) requires a valid code point b 2144 return -1; 2145 } 2146 return combine(maybeYesCompositions, list, b)>>1; 2147 } 2148 2149 /** 2150 * Does c have a composition boundary before it? 2151 * True if its decomposition begins with a character that has 2152 * ccc=0 && NFC_QC=Yes (isCompYesAndZeroCC()). 
2153 * As a shortcut, this is true if c itself has ccc=0 && NFC_QC=Yes 2154 * (isCompYesAndZeroCC()) so we need not decompose. 2155 */ hasCompBoundaryBefore(int c, int norm16)2156 private boolean hasCompBoundaryBefore(int c, int norm16) { 2157 return c<minCompNoMaybeCP || norm16HasCompBoundaryBefore(norm16); 2158 } norm16HasCompBoundaryBefore(int norm16)2159 private boolean norm16HasCompBoundaryBefore(int norm16) { 2160 return norm16 < minNoNoCompNoMaybeCC || isAlgorithmicNoNo(norm16); 2161 } hasCompBoundaryBefore(CharSequence s, int src, int limit)2162 private boolean hasCompBoundaryBefore(CharSequence s, int src, int limit) { 2163 return src == limit || hasCompBoundaryBefore(Character.codePointAt(s, src)); 2164 } norm16HasCompBoundaryAfter(int norm16, boolean onlyContiguous)2165 private boolean norm16HasCompBoundaryAfter(int norm16, boolean onlyContiguous) { 2166 return (norm16 & HAS_COMP_BOUNDARY_AFTER) != 0 && 2167 (!onlyContiguous || isTrailCC01ForCompBoundaryAfter(norm16)); 2168 } hasCompBoundaryAfter(CharSequence s, int start, int p, boolean onlyContiguous)2169 private boolean hasCompBoundaryAfter(CharSequence s, int start, int p, boolean onlyContiguous) { 2170 return start == p || hasCompBoundaryAfter(Character.codePointBefore(s, p), onlyContiguous); 2171 } 2172 /** For FCC: Given norm16 HAS_COMP_BOUNDARY_AFTER, does it have tccc<=1? */ isTrailCC01ForCompBoundaryAfter(int norm16)2173 private boolean isTrailCC01ForCompBoundaryAfter(int norm16) { 2174 return isInert(norm16) || (isDecompNoAlgorithmic(norm16) ? 
2175 (norm16 & DELTA_TCCC_MASK) <= DELTA_TCCC_1 : extraData.charAt(norm16 >> OFFSET_SHIFT) <= 0x1ff); 2176 } 2177 findPreviousCompBoundary(CharSequence s, int p, boolean onlyContiguous)2178 private int findPreviousCompBoundary(CharSequence s, int p, boolean onlyContiguous) { 2179 while(p>0) { 2180 int c=Character.codePointBefore(s, p); 2181 int norm16 = getNorm16(c); 2182 if (norm16HasCompBoundaryAfter(norm16, onlyContiguous)) { 2183 break; 2184 } 2185 p-=Character.charCount(c); 2186 if(hasCompBoundaryBefore(c, norm16)) { 2187 break; 2188 } 2189 } 2190 return p; 2191 } findNextCompBoundary(CharSequence s, int p, int limit, boolean onlyContiguous)2192 private int findNextCompBoundary(CharSequence s, int p, int limit, boolean onlyContiguous) { 2193 while(p<limit) { 2194 int c=Character.codePointAt(s, p); 2195 int norm16=normTrie.get(c); 2196 if(hasCompBoundaryBefore(c, norm16)) { 2197 break; 2198 } 2199 p+=Character.charCount(c); 2200 if (norm16HasCompBoundaryAfter(norm16, onlyContiguous)) { 2201 break; 2202 } 2203 } 2204 return p; 2205 } 2206 findPreviousFCDBoundary(CharSequence s, int p)2207 private int findPreviousFCDBoundary(CharSequence s, int p) { 2208 while(p>0) { 2209 int c=Character.codePointBefore(s, p); 2210 int norm16; 2211 if (c < minDecompNoCP || norm16HasDecompBoundaryAfter(norm16 = getNorm16(c))) { 2212 break; 2213 } 2214 p-=Character.charCount(c); 2215 if (norm16HasDecompBoundaryBefore(norm16)) { 2216 break; 2217 } 2218 } 2219 return p; 2220 } findNextFCDBoundary(CharSequence s, int p, int limit)2221 private int findNextFCDBoundary(CharSequence s, int p, int limit) { 2222 while(p<limit) { 2223 int c=Character.codePointAt(s, p); 2224 int norm16; 2225 if (c < minLcccCP || norm16HasDecompBoundaryBefore(norm16 = getNorm16(c))) { 2226 break; 2227 } 2228 p+=Character.charCount(c); 2229 if (norm16HasDecompBoundaryAfter(norm16)) { 2230 break; 2231 } 2232 } 2233 return p; 2234 } 2235 getPreviousTrailCC(CharSequence s, int start, int p)2236 private int 
getPreviousTrailCC(CharSequence s, int start, int p) { 2237 if (start == p) { 2238 return 0; 2239 } 2240 return getFCD16(Character.codePointBefore(s, p)); 2241 } 2242 addToStartSet(MutableCodePointTrie mutableTrie, int origin, int decompLead)2243 private void addToStartSet(MutableCodePointTrie mutableTrie, int origin, int decompLead) { 2244 int canonValue = mutableTrie.get(decompLead); 2245 if((canonValue&(CANON_HAS_SET|CANON_VALUE_MASK))==0 && origin!=0) { 2246 // origin is the first character whose decomposition starts with 2247 // the character for which we are setting the value. 2248 mutableTrie.set(decompLead, canonValue|origin); 2249 } else { 2250 // origin is not the first character, or it is U+0000. 2251 UnicodeSet set; 2252 if((canonValue&CANON_HAS_SET)==0) { 2253 int firstOrigin=canonValue&CANON_VALUE_MASK; 2254 canonValue=(canonValue&~CANON_VALUE_MASK)|CANON_HAS_SET|canonStartSets.size(); 2255 mutableTrie.set(decompLead, canonValue); 2256 canonStartSets.add(set=new UnicodeSet()); 2257 if(firstOrigin!=0) { 2258 set.add(firstOrigin); 2259 } 2260 } else { 2261 set=canonStartSets.get(canonValue&CANON_VALUE_MASK); 2262 } 2263 set.add(origin); 2264 } 2265 } 2266 2267 @SuppressWarnings("unused") 2268 private VersionInfo dataVersion; 2269 2270 // BMP code point thresholds for quick check loops looking at single UTF-16 code units. 2271 private int minDecompNoCP; 2272 private int minCompNoMaybeCP; 2273 private int minLcccCP; 2274 2275 // Norm16 value thresholds for quick check combinations and types of extra data. 
private int minYesNo;  // decompose() treats norm16 below this as "does not decompose"
    private int minYesNoMappingsOnly;  // composePair() treats non-inert norm16 below this as combining forward
    private int minNoNo;
    private int minNoNoCompBoundaryBefore;
    private int minNoNoCompNoMaybeCC;  // norm16 below this has a composition boundary before it
    private int minNoNoEmpty;
    private int limitNoNo;  // norm16 at or above this: see isDecompNoAlgorithmic()
    private int centerNoNoDelta;  // subtracted from the delta in mapAlgorithmic()
    private int minMaybeYes;  // base for offsets into maybeYesCompositions

    // Maps a code point to its norm16 value.
    private CodePointTrie.Fast16 normTrie;
    // Compositions lists; extraData is described in getCompositionsListForDecompYes() as
    // the substring starting at index MIN_NORMAL_MAYBE_YES-minMaybeYes.
    private String maybeYesCompositions;
    private String extraData;  // mappings and/or compositions for yesYes, yesNo & noNo characters
    private byte[] smallFCD;  // [0x100] one bit per 32 BMP code points, set if any FCD!=0

    private CodePointTrie canonIterData;
    // Sets of origins built by addToStartSet(); indexed by the CANON_VALUE_MASK bits of a
    // trie value that has CANON_HAS_SET.
    private ArrayList<UnicodeSet> canonStartSets;

    // bits in canonIterData
    // NOTE(review): the two flags below are not used in the visible part of the file;
    // their meaning is taken from their names only - confirm against the users.
    private static final int CANON_NOT_SEGMENT_STARTER = 0x80000000;
    private static final int CANON_HAS_COMPOSITIONS = 0x40000000;
    // Set when the low bits hold an index into canonStartSets rather than a single code point.
    private static final int CANON_HAS_SET = 0x200000;
    // Low 21 bits: either one origin code point or a canonStartSets index (see addToStartSet()).
    private static final int CANON_VALUE_MASK = 0x1fffff;
}