1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * Copyright (C) 1996-2016, International Business Machines Corporation and 6 * others. All Rights Reserved. 7 ******************************************************************************* 8 */ 9 10 package com.ibm.icu.impl; 11 12 import java.io.IOException; 13 import java.nio.ByteBuffer; 14 import java.util.Iterator; 15 import java.util.MissingResourceException; 16 17 import com.ibm.icu.lang.UCharacter; 18 import com.ibm.icu.lang.UCharacter.HangulSyllableType; 19 import com.ibm.icu.lang.UCharacter.NumericType; 20 import com.ibm.icu.lang.UCharacterCategory; 21 import com.ibm.icu.lang.UProperty; 22 import com.ibm.icu.lang.UScript; 23 import com.ibm.icu.text.Normalizer2; 24 import com.ibm.icu.text.UTF16; 25 import com.ibm.icu.text.UnicodeSet; 26 import com.ibm.icu.util.CodePointMap; 27 import com.ibm.icu.util.CodePointTrie; 28 import com.ibm.icu.util.ICUException; 29 import com.ibm.icu.util.ICUUncheckedIOException; 30 import com.ibm.icu.util.VersionInfo; 31 32 /** 33 * <p>Internal class used for Unicode character property database.</p> 34 * <p>This classes store binary data read from uprops.icu. 35 * It does not have the capability to parse the data into more high-level 36 * information. 
It only returns bytes of information when required.</p> 37 * <p>Due to the form most commonly used for retrieval, array of char is used 38 * to store the binary data.</p> 39 * <p>UCharacterPropertyDB also contains information on accessing indexes to 40 * significant points in the binary data.</p> 41 * <p>Responsibility for molding the binary data into more meaning form lies on 42 * <a href=UCharacter.html>UCharacter</a>.</p> 43 * @author Syn Wee Quek 44 * @since release 2.1, february 1st 2002 45 */ 46 47 public final class UCharacterProperty 48 { 49 // public data members ----------------------------------------------- 50 51 /* 52 * public singleton instance 53 */ 54 public static final UCharacterProperty INSTANCE; 55 56 /** 57 * Trie data 58 */ 59 public Trie2_16 m_trie_; 60 /** 61 * Unicode version 62 */ 63 public VersionInfo m_unicodeVersion_; 64 /** 65 * Latin capital letter i with dot above 66 */ 67 public static final char LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_ = 0x130; 68 /** 69 * Latin small letter i with dot above 70 */ 71 public static final char LATIN_SMALL_LETTER_DOTLESS_I_ = 0x131; 72 /** 73 * Latin lowercase i 74 */ 75 public static final char LATIN_SMALL_LETTER_I_ = 0x69; 76 /** 77 * Character type mask 78 */ 79 public static final int TYPE_MASK = 0x1F; 80 81 // uprops.h enum UPropertySource --------------------------------------- *** 82 83 /** No source, not a supported property. 
*/ 84 public static final int SRC_NONE=0; 85 /** From uchar.c/uprops.icu main trie */ 86 public static final int SRC_CHAR=1; 87 /** From uchar.c/uprops.icu properties vectors trie */ 88 public static final int SRC_PROPSVEC=2; 89 /** From unames.c/unames.icu */ 90 public static final int SRC_NAMES=3; 91 /** From ucase.c/ucase.icu */ 92 public static final int SRC_CASE=4; 93 /** From ubidi_props.c/ubidi.icu */ 94 public static final int SRC_BIDI=5; 95 /** From uchar.c/uprops.icu main trie as well as properties vectors trie */ 96 public static final int SRC_CHAR_AND_PROPSVEC=6; 97 /** From ucase.c/ucase.icu as well as unorm.cpp/unorm.icu */ 98 public static final int SRC_CASE_AND_NORM=7; 99 /** From normalizer2impl.cpp/nfc.nrm */ 100 public static final int SRC_NFC=8; 101 /** From normalizer2impl.cpp/nfkc.nrm */ 102 public static final int SRC_NFKC=9; 103 /** From normalizer2impl.cpp/nfkc_cf.nrm */ 104 public static final int SRC_NFKC_CF=10; 105 /** From normalizer2impl.cpp/nfc.nrm canonical iterator data */ 106 public static final int SRC_NFC_CANON_ITER=11; 107 // Text layout properties. 108 public static final int SRC_INPC=12; 109 public static final int SRC_INSC=13; 110 public static final int SRC_VO=14; 111 public static final int SRC_EMOJI=15; 112 /** One more than the highest UPropertySource (SRC_) constant. */ 113 public static final int SRC_COUNT=16; 114 115 private static final class LayoutProps { 116 private static final class IsAcceptable implements ICUBinary.Authenticate { 117 @Override isDataVersionAcceptable(byte version[])118 public boolean isDataVersionAcceptable(byte version[]) { 119 return version[0] == 1; 120 } 121 } 122 private static final IsAcceptable IS_ACCEPTABLE = new IsAcceptable(); 123 private static final int DATA_FORMAT = 0x4c61796f; // "Layo" 124 125 // indexes into indexes[] 126 // Element 0 stores the length of the indexes[] array. 
127 //ivate static final int IX_INDEXES_LENGTH = 0; 128 // Elements 1..7 store the tops of consecutive code point tries. 129 // No trie is stored if the difference between two of these is less than 16. 130 private static final int IX_INPC_TRIE_TOP = 1; 131 private static final int IX_INSC_TRIE_TOP = 2; 132 private static final int IX_VO_TRIE_TOP = 3; 133 //ivate static final int IX_RESERVED_TOP = 4; 134 135 //ivate static final int IX_TRIES_TOP = 7; 136 137 private static final int IX_MAX_VALUES = 9; 138 139 // Length of indexes[]. Multiple of 4 to 16-align the tries. 140 //ivate static final int IX_COUNT = 12; 141 142 private static final int MAX_INPC_SHIFT = 24; 143 private static final int MAX_INSC_SHIFT = 16; 144 private static final int MAX_VO_SHIFT = 8; 145 146 static final LayoutProps INSTANCE = new LayoutProps(); 147 148 CodePointTrie inpcTrie = null; // Indic_Positional_Category 149 CodePointTrie inscTrie = null; // Indic_Syllabic_Category 150 CodePointTrie voTrie = null; // Vertical_Orientation 151 152 int maxInpcValue = 0; 153 int maxInscValue = 0; 154 int maxVoValue = 0; 155 LayoutProps()156 LayoutProps() { 157 ByteBuffer bytes = ICUBinary.getRequiredData("ulayout.icu"); 158 try { 159 ICUBinary.readHeaderAndDataVersion(bytes, DATA_FORMAT, IS_ACCEPTABLE); 160 int startPos = bytes.position(); 161 int indexesLength = bytes.getInt(); // inIndexes[IX_INDEXES_LENGTH] 162 if (indexesLength < 12) { 163 throw new ICUUncheckedIOException( 164 "Text layout properties data: not enough indexes"); 165 } 166 int[] inIndexes = new int[indexesLength]; 167 inIndexes[0] = indexesLength; 168 for (int i = 1; i < indexesLength; ++i) { 169 inIndexes[i] = bytes.getInt(); 170 } 171 172 int offset = indexesLength * 4; 173 int top = inIndexes[IX_INPC_TRIE_TOP]; 174 int trieSize = top - offset; 175 if (trieSize >= 16) { 176 inpcTrie = CodePointTrie.fromBinary(null, null, bytes); 177 } 178 int pos = bytes.position() - startPos; 179 assert top >= pos; 180 ICUBinary.skipBytes(bytes, 
top - pos); // skip padding after trie bytes 181 offset = top; 182 top = inIndexes[IX_INSC_TRIE_TOP]; 183 trieSize = top - offset; 184 if (trieSize >= 16) { 185 inscTrie = CodePointTrie.fromBinary(null, null, bytes); 186 } 187 pos = bytes.position() - startPos; 188 assert top >= pos; 189 ICUBinary.skipBytes(bytes, top - pos); // skip padding after trie bytes 190 offset = top; 191 top = inIndexes[IX_VO_TRIE_TOP]; 192 trieSize = top - offset; 193 if (trieSize >= 16) { 194 voTrie = CodePointTrie.fromBinary(null, null, bytes); 195 } 196 pos = bytes.position() - startPos; 197 assert top >= pos; 198 ICUBinary.skipBytes(bytes, top - pos); // skip padding after trie bytes 199 200 int maxValues = inIndexes[IX_MAX_VALUES]; 201 maxInpcValue = maxValues >>> MAX_INPC_SHIFT; 202 maxInscValue = (maxValues >> MAX_INSC_SHIFT) & 0xff; 203 maxVoValue = (maxValues >> MAX_VO_SHIFT) & 0xff; 204 } catch(IOException e) { 205 throw new ICUUncheckedIOException(e); 206 } 207 } 208 addPropertyStarts(int src, UnicodeSet set)209 public UnicodeSet addPropertyStarts(int src, UnicodeSet set) { 210 CodePointTrie trie; 211 switch (src) { 212 case SRC_INPC: 213 trie = inpcTrie; 214 break; 215 case SRC_INSC: 216 trie = inscTrie; 217 break; 218 case SRC_VO: 219 trie = voTrie; 220 break; 221 default: 222 throw new IllegalStateException(); 223 } 224 225 if (trie == null) { 226 throw new MissingResourceException( 227 "no data for one of the text layout properties; src=" + src, 228 "LayoutProps", ""); 229 } 230 231 // Add the start code point of each same-value range of the trie. 232 CodePointMap.Range range = new CodePointMap.Range(); 233 int start = 0; 234 while (trie.getRange(start, null, range)) { 235 set.add(start); 236 start = range.getEnd() + 1; 237 } 238 return set; 239 } 240 } 241 242 // public methods ---------------------------------------------------- 243 244 /** 245 * Gets the main property value for code point ch. 
246 * @param ch code point whose property value is to be retrieved 247 * @return property value of code point 248 */ getProperty(int ch)249 public final int getProperty(int ch) 250 { 251 return m_trie_.get(ch); 252 } 253 254 /** 255 * Gets the unicode additional properties. 256 * Java version of C u_getUnicodeProperties(). 257 * @param codepoint codepoint whose additional properties is to be 258 * retrieved 259 * @param column The column index. 260 * @return unicode properties 261 */ getAdditional(int codepoint, int column)262 public int getAdditional(int codepoint, int column) { 263 assert column >= 0; 264 if (column >= m_additionalColumnsCount_) { 265 return 0; 266 } 267 return m_additionalVectors_[m_additionalTrie_.get(codepoint) + column]; 268 } 269 270 static final int MY_MASK = UCharacterProperty.TYPE_MASK 271 & ((1<<UCharacterCategory.UPPERCASE_LETTER) | 272 (1<<UCharacterCategory.LOWERCASE_LETTER) | 273 (1<<UCharacterCategory.TITLECASE_LETTER) | 274 (1<<UCharacterCategory.MODIFIER_LETTER) | 275 (1<<UCharacterCategory.OTHER_LETTER)); 276 277 278 /** 279 * <p>Get the "age" of the code point.</p> 280 * <p>The "age" is the Unicode version when the code point was first 281 * designated (as a non-character or for Private Use) or assigned a 282 * character.</p> 283 * <p>This can be useful to avoid emitting code points to receiving 284 * processes that do not accept newer characters.</p> 285 * <p>The data is from the UCD file DerivedAge.txt.</p> 286 * <p>This API does not check the validity of the codepoint.</p> 287 * @param codepoint The code point. 
288 * @return the Unicode version number 289 */ getAge(int codepoint)290 public VersionInfo getAge(int codepoint) 291 { 292 int version = getAdditional(codepoint, 0) >> AGE_SHIFT_; 293 return VersionInfo.getInstance( 294 (version >> FIRST_NIBBLE_SHIFT_) & LAST_NIBBLE_MASK_, 295 version & LAST_NIBBLE_MASK_, 0, 0); 296 } 297 298 private static final int GC_CN_MASK = getMask(UCharacter.UNASSIGNED); 299 private static final int GC_CC_MASK = getMask(UCharacter.CONTROL); 300 private static final int GC_CS_MASK = getMask(UCharacter.SURROGATE); 301 private static final int GC_ZS_MASK = getMask(UCharacter.SPACE_SEPARATOR); 302 private static final int GC_ZL_MASK = getMask(UCharacter.LINE_SEPARATOR); 303 private static final int GC_ZP_MASK = getMask(UCharacter.PARAGRAPH_SEPARATOR); 304 /** Mask constant for multiple UCharCategory bits (Z Separators). */ 305 private static final int GC_Z_MASK = GC_ZS_MASK|GC_ZL_MASK|GC_ZP_MASK; 306 307 /** 308 * Checks if c is in 309 * [^\p{space}\p{gc=Control}\p{gc=Surrogate}\p{gc=Unassigned}] 310 * with space=\p{Whitespace} and Control=Cc. 311 * Implements UCHAR_POSIX_GRAPH. 312 * @internal 313 */ isgraphPOSIX(int c)314 private static final boolean isgraphPOSIX(int c) { 315 /* \p{space}\p{gc=Control} == \p{gc=Z}\p{Control} */ 316 /* comparing ==0 returns FALSE for the categories mentioned */ 317 return (getMask(UCharacter.getType(c))& 318 (GC_CC_MASK|GC_CS_MASK|GC_CN_MASK|GC_Z_MASK)) 319 ==0; 320 } 321 322 // binary properties --------------------------------------------------- *** 323 324 private class BinaryProperty { 325 int column; // SRC_PROPSVEC column, or "source" if mask==0 326 int mask; BinaryProperty(int column, int mask)327 BinaryProperty(int column, int mask) { 328 this.column=column; 329 this.mask=mask; 330 } BinaryProperty(int source)331 BinaryProperty(int source) { 332 this.column=source; 333 this.mask=0; 334 } getSource()335 final int getSource() { 336 return mask==0 ? 
column : SRC_PROPSVEC; 337 } contains(int c)338 boolean contains(int c) { 339 // systematic, directly stored properties 340 return (getAdditional(c, column)&mask)!=0; 341 } 342 } 343 344 private class CaseBinaryProperty extends BinaryProperty { // case mapping properties 345 int which; CaseBinaryProperty(int which)346 CaseBinaryProperty(int which) { 347 super(SRC_CASE); 348 this.which=which; 349 } 350 @Override contains(int c)351 boolean contains(int c) { 352 return UCaseProps.INSTANCE.hasBinaryProperty(c, which); 353 } 354 } 355 356 private class EmojiBinaryProperty extends BinaryProperty { 357 int which; EmojiBinaryProperty(int which)358 EmojiBinaryProperty(int which) { 359 super(SRC_EMOJI); 360 this.which=which; 361 } 362 @Override contains(int c)363 boolean contains(int c) { 364 return EmojiProps.INSTANCE.hasBinaryProperty(c, which); 365 } 366 } 367 368 private class NormInertBinaryProperty extends BinaryProperty { // UCHAR_NF*_INERT properties 369 int which; NormInertBinaryProperty(int source, int which)370 NormInertBinaryProperty(int source, int which) { 371 super(source); 372 this.which=which; 373 } 374 @Override contains(int c)375 boolean contains(int c) { 376 return Norm2AllModes.getN2WithImpl(which-UProperty.NFD_INERT).isInert(c); 377 } 378 } 379 380 BinaryProperty[] binProps={ 381 /* 382 * Binary-property implementations must be in order of corresponding UProperty, 383 * and there must be exactly one entry per binary UProperty. 
384 */ 385 new BinaryProperty(1, (1<<ALPHABETIC_PROPERTY_)), 386 new BinaryProperty(1, (1<<ASCII_HEX_DIGIT_PROPERTY_)), 387 new BinaryProperty(SRC_BIDI) { // UCHAR_BIDI_CONTROL 388 @Override 389 boolean contains(int c) { 390 return UBiDiProps.INSTANCE.isBidiControl(c); 391 } 392 }, 393 new BinaryProperty(SRC_BIDI) { // UCHAR_BIDI_MIRRORED 394 @Override 395 boolean contains(int c) { 396 return UBiDiProps.INSTANCE.isMirrored(c); 397 } 398 }, 399 new BinaryProperty(1, (1<<DASH_PROPERTY_)), 400 new BinaryProperty(1, (1<<DEFAULT_IGNORABLE_CODE_POINT_PROPERTY_)), 401 new BinaryProperty(1, (1<<DEPRECATED_PROPERTY_)), 402 new BinaryProperty(1, (1<<DIACRITIC_PROPERTY_)), 403 new BinaryProperty(1, (1<<EXTENDER_PROPERTY_)), 404 new BinaryProperty(SRC_NFC) { // UCHAR_FULL_COMPOSITION_EXCLUSION 405 @Override 406 boolean contains(int c) { 407 // By definition, Full_Composition_Exclusion is the same as NFC_QC=No. 408 Normalizer2Impl impl=Norm2AllModes.getNFCInstance().impl; 409 return impl.isCompNo(impl.getNorm16(c)); 410 } 411 }, 412 new BinaryProperty(1, (1<<GRAPHEME_BASE_PROPERTY_)), 413 new BinaryProperty(1, (1<<GRAPHEME_EXTEND_PROPERTY_)), 414 new BinaryProperty(1, (1<<GRAPHEME_LINK_PROPERTY_)), 415 new BinaryProperty(1, (1<<HEX_DIGIT_PROPERTY_)), 416 new BinaryProperty(1, (1<<HYPHEN_PROPERTY_)), 417 new BinaryProperty(1, (1<<ID_CONTINUE_PROPERTY_)), 418 new BinaryProperty(1, (1<<ID_START_PROPERTY_)), 419 new BinaryProperty(1, (1<<IDEOGRAPHIC_PROPERTY_)), 420 new BinaryProperty(1, (1<<IDS_BINARY_OPERATOR_PROPERTY_)), 421 new BinaryProperty(1, (1<<IDS_TRINARY_OPERATOR_PROPERTY_)), 422 new BinaryProperty(SRC_BIDI) { // UCHAR_JOIN_CONTROL 423 @Override 424 boolean contains(int c) { 425 return UBiDiProps.INSTANCE.isJoinControl(c); 426 } 427 }, 428 new BinaryProperty(1, (1<<LOGICAL_ORDER_EXCEPTION_PROPERTY_)), 429 new CaseBinaryProperty(UProperty.LOWERCASE), 430 new BinaryProperty(1, (1<<MATH_PROPERTY_)), 431 new BinaryProperty(1, (1<<NONCHARACTER_CODE_POINT_PROPERTY_)), 432 new 
BinaryProperty(1, (1<<QUOTATION_MARK_PROPERTY_)), 433 new BinaryProperty(1, (1<<RADICAL_PROPERTY_)), 434 new CaseBinaryProperty(UProperty.SOFT_DOTTED), 435 new BinaryProperty(1, (1<<TERMINAL_PUNCTUATION_PROPERTY_)), 436 new BinaryProperty(1, (1<<UNIFIED_IDEOGRAPH_PROPERTY_)), 437 new CaseBinaryProperty(UProperty.UPPERCASE), 438 new BinaryProperty(1, (1<<WHITE_SPACE_PROPERTY_)), 439 new BinaryProperty(1, (1<<XID_CONTINUE_PROPERTY_)), 440 new BinaryProperty(1, (1<<XID_START_PROPERTY_)), 441 new CaseBinaryProperty(UProperty.CASE_SENSITIVE), 442 new BinaryProperty(1, (1<<S_TERM_PROPERTY_)), 443 new BinaryProperty(1, (1<<VARIATION_SELECTOR_PROPERTY_)), 444 new NormInertBinaryProperty(SRC_NFC, UProperty.NFD_INERT), 445 new NormInertBinaryProperty(SRC_NFKC, UProperty.NFKD_INERT), 446 new NormInertBinaryProperty(SRC_NFC, UProperty.NFC_INERT), 447 new NormInertBinaryProperty(SRC_NFKC, UProperty.NFKC_INERT), 448 new BinaryProperty(SRC_NFC_CANON_ITER) { // UCHAR_SEGMENT_STARTER 449 @Override 450 boolean contains(int c) { 451 return Norm2AllModes.getNFCInstance().impl. 
452 ensureCanonIterData().isCanonSegmentStarter(c); 453 } 454 }, 455 new BinaryProperty(1, (1<<PATTERN_SYNTAX)), 456 new BinaryProperty(1, (1<<PATTERN_WHITE_SPACE)), 457 new BinaryProperty(SRC_CHAR_AND_PROPSVEC) { // UCHAR_POSIX_ALNUM 458 @Override 459 boolean contains(int c) { 460 return UCharacter.isUAlphabetic(c) || UCharacter.isDigit(c); 461 } 462 }, 463 new BinaryProperty(SRC_CHAR) { // UCHAR_POSIX_BLANK 464 @Override 465 boolean contains(int c) { 466 // "horizontal space" 467 if(c<=0x9f) { 468 return c==9 || c==0x20; /* TAB or SPACE */ 469 } else { 470 /* Zs */ 471 return UCharacter.getType(c)==UCharacter.SPACE_SEPARATOR; 472 } 473 } 474 }, 475 new BinaryProperty(SRC_CHAR) { // UCHAR_POSIX_GRAPH 476 @Override 477 boolean contains(int c) { 478 return isgraphPOSIX(c); 479 } 480 }, 481 new BinaryProperty(SRC_CHAR) { // UCHAR_POSIX_PRINT 482 @Override 483 boolean contains(int c) { 484 /* 485 * Checks if codepoint is in \p{graph}\p{blank} - \p{cntrl}. 486 * 487 * The only cntrl character in graph+blank is TAB (in blank). 488 * Here we implement (blank-TAB)=Zs instead of calling u_isblank(). 
489 */ 490 return (UCharacter.getType(c)==UCharacter.SPACE_SEPARATOR) || isgraphPOSIX(c); 491 } 492 }, 493 new BinaryProperty(SRC_CHAR) { // UCHAR_POSIX_XDIGIT 494 @Override 495 boolean contains(int c) { 496 /* check ASCII and Fullwidth ASCII a-fA-F */ 497 if( 498 (c<=0x66 && c>=0x41 && (c<=0x46 || c>=0x61)) || 499 (c>=0xff21 && c<=0xff46 && (c<=0xff26 || c>=0xff41)) 500 ) { 501 return true; 502 } 503 return UCharacter.getType(c)==UCharacter.DECIMAL_DIGIT_NUMBER; 504 } 505 }, 506 new CaseBinaryProperty(UProperty.CASED), 507 new CaseBinaryProperty(UProperty.CASE_IGNORABLE), 508 new CaseBinaryProperty(UProperty.CHANGES_WHEN_LOWERCASED), 509 new CaseBinaryProperty(UProperty.CHANGES_WHEN_UPPERCASED), 510 new CaseBinaryProperty(UProperty.CHANGES_WHEN_TITLECASED), 511 new BinaryProperty(SRC_CASE_AND_NORM) { // UCHAR_CHANGES_WHEN_CASEFOLDED 512 @Override 513 boolean contains(int c) { 514 String nfd=Norm2AllModes.getNFCInstance().impl.getDecomposition(c); 515 if(nfd!=null) { 516 /* c has a decomposition */ 517 c=nfd.codePointAt(0); 518 if(Character.charCount(c)!=nfd.length()) { 519 /* multiple code points */ 520 c=-1; 521 } 522 } else if(c<0) { 523 return false; /* protect against bad input */ 524 } 525 if(c>=0) { 526 /* single code point */ 527 UCaseProps csp=UCaseProps.INSTANCE; 528 UCaseProps.dummyStringBuilder.setLength(0); 529 return csp.toFullFolding(c, UCaseProps.dummyStringBuilder, 530 UCharacter.FOLD_CASE_DEFAULT)>=0; 531 } else { 532 String folded=UCharacter.foldCase(nfd, true); 533 return !folded.equals(nfd); 534 } 535 } 536 }, 537 new CaseBinaryProperty(UProperty.CHANGES_WHEN_CASEMAPPED), 538 new BinaryProperty(SRC_NFKC_CF) { // UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED 539 @Override 540 boolean contains(int c) { 541 Normalizer2Impl kcf=Norm2AllModes.getNFKC_CFInstance().impl; 542 String src=UTF16.valueOf(c); 543 StringBuilder dest=new StringBuilder(); 544 // Small destCapacity for NFKC_CF(c). 
545 Normalizer2Impl.ReorderingBuffer buffer=new Normalizer2Impl.ReorderingBuffer(kcf, dest, 5); 546 kcf.compose(src, 0, src.length(), false, true, buffer); 547 return !Normalizer2Impl.UTF16Plus.equal(dest, src); 548 } 549 }, 550 new EmojiBinaryProperty(UProperty.EMOJI), 551 new EmojiBinaryProperty(UProperty.EMOJI_PRESENTATION), 552 new EmojiBinaryProperty(UProperty.EMOJI_MODIFIER), 553 new EmojiBinaryProperty(UProperty.EMOJI_MODIFIER_BASE), 554 new EmojiBinaryProperty(UProperty.EMOJI_COMPONENT), 555 new BinaryProperty(SRC_PROPSVEC) { // REGIONAL_INDICATOR 556 // Property starts are a subset of lb=RI etc. 557 @Override 558 boolean contains(int c) { 559 return 0x1F1E6<=c && c<=0x1F1FF; 560 } 561 }, 562 new BinaryProperty(1, 1<<PREPENDED_CONCATENATION_MARK), 563 new EmojiBinaryProperty(UProperty.EXTENDED_PICTOGRAPHIC), 564 new EmojiBinaryProperty(UProperty.BASIC_EMOJI), 565 new EmojiBinaryProperty(UProperty.EMOJI_KEYCAP_SEQUENCE), 566 new EmojiBinaryProperty(UProperty.RGI_EMOJI_MODIFIER_SEQUENCE), 567 new EmojiBinaryProperty(UProperty.RGI_EMOJI_FLAG_SEQUENCE), 568 new EmojiBinaryProperty(UProperty.RGI_EMOJI_TAG_SEQUENCE), 569 new EmojiBinaryProperty(UProperty.RGI_EMOJI_ZWJ_SEQUENCE), 570 new EmojiBinaryProperty(UProperty.RGI_EMOJI), 571 }; 572 hasBinaryProperty(int c, int which)573 public boolean hasBinaryProperty(int c, int which) { 574 if(which<UProperty.BINARY_START || UProperty.BINARY_LIMIT<=which) { 575 // not a known binary property 576 return false; 577 } else { 578 return binProps[which].contains(c); 579 } 580 } 581 582 // int-value and enumerated properties --------------------------------- *** 583 getType(int c)584 public int getType(int c) { 585 return getProperty(c)&TYPE_MASK; 586 } 587 588 /* 589 * Map some of the Grapheme Cluster Break values to Hangul Syllable Types. 590 * Hangul_Syllable_Type is fully redundant with a subset of Grapheme_Cluster_Break. 
591 */ 592 private static final int /* UHangulSyllableType */ gcbToHst[]={ 593 HangulSyllableType.NOT_APPLICABLE, /* U_GCB_OTHER */ 594 HangulSyllableType.NOT_APPLICABLE, /* U_GCB_CONTROL */ 595 HangulSyllableType.NOT_APPLICABLE, /* U_GCB_CR */ 596 HangulSyllableType.NOT_APPLICABLE, /* U_GCB_EXTEND */ 597 HangulSyllableType.LEADING_JAMO, /* U_GCB_L */ 598 HangulSyllableType.NOT_APPLICABLE, /* U_GCB_LF */ 599 HangulSyllableType.LV_SYLLABLE, /* U_GCB_LV */ 600 HangulSyllableType.LVT_SYLLABLE, /* U_GCB_LVT */ 601 HangulSyllableType.TRAILING_JAMO, /* U_GCB_T */ 602 HangulSyllableType.VOWEL_JAMO /* U_GCB_V */ 603 /* 604 * Omit GCB values beyond what we need for hst. 605 * The code below checks for the array length. 606 */ 607 }; 608 609 private class IntProperty { 610 int column; // SRC_PROPSVEC column, or "source" if mask==0 611 int mask; 612 int shift; IntProperty(int column, int mask, int shift)613 IntProperty(int column, int mask, int shift) { 614 this.column=column; 615 this.mask=mask; 616 this.shift=shift; 617 } IntProperty(int source)618 IntProperty(int source) { 619 this.column=source; 620 this.mask=0; 621 } getSource()622 final int getSource() { 623 return mask==0 ? 
column : SRC_PROPSVEC; 624 } getValue(int c)625 int getValue(int c) { 626 // systematic, directly stored properties 627 return (getAdditional(c, column)&mask)>>>shift; 628 } getMaxValue(int which)629 int getMaxValue(int which) { 630 return (getMaxValues(column)&mask)>>>shift; 631 } 632 } 633 634 private class BiDiIntProperty extends IntProperty { BiDiIntProperty()635 BiDiIntProperty() { 636 super(SRC_BIDI); 637 } 638 @Override getMaxValue(int which)639 int getMaxValue(int which) { 640 return UBiDiProps.INSTANCE.getMaxValue(which); 641 } 642 } 643 644 private class CombiningClassIntProperty extends IntProperty { CombiningClassIntProperty(int source)645 CombiningClassIntProperty(int source) { 646 super(source); 647 } 648 @Override getMaxValue(int which)649 int getMaxValue(int which) { 650 return 0xff; 651 } 652 } 653 654 private class NormQuickCheckIntProperty extends IntProperty { // UCHAR_NF*_QUICK_CHECK properties 655 int which; 656 int max; NormQuickCheckIntProperty(int source, int which, int max)657 NormQuickCheckIntProperty(int source, int which, int max) { 658 super(source); 659 this.which=which; 660 this.max=max; 661 } 662 @Override getValue(int c)663 int getValue(int c) { 664 return Norm2AllModes.getN2WithImpl(which-UProperty.NFD_QUICK_CHECK).getQuickCheck(c); 665 } 666 @Override getMaxValue(int which)667 int getMaxValue(int which) { 668 return max; 669 } 670 } 671 672 IntProperty intProps[]={ 673 new BiDiIntProperty() { // BIDI_CLASS 674 @Override 675 int getValue(int c) { 676 return UBiDiProps.INSTANCE.getClass(c); 677 } 678 }, 679 new IntProperty(0, BLOCK_MASK_, BLOCK_SHIFT_), 680 new CombiningClassIntProperty(SRC_NFC) { // CANONICAL_COMBINING_CLASS 681 @Override 682 int getValue(int c) { 683 return Normalizer2.getNFDInstance().getCombiningClass(c); 684 } 685 }, 686 new IntProperty(2, DECOMPOSITION_TYPE_MASK_, 0), 687 new IntProperty(0, EAST_ASIAN_MASK_, EAST_ASIAN_SHIFT_), 688 new IntProperty(SRC_CHAR) { // GENERAL_CATEGORY 689 @Override 690 int 
getValue(int c) { 691 return getType(c); 692 } 693 @Override 694 int getMaxValue(int which) { 695 return UCharacterCategory.CHAR_CATEGORY_COUNT-1; 696 } 697 }, 698 new BiDiIntProperty() { // JOINING_GROUP 699 @Override 700 int getValue(int c) { 701 return UBiDiProps.INSTANCE.getJoiningGroup(c); 702 } 703 }, 704 new BiDiIntProperty() { // JOINING_TYPE 705 @Override 706 int getValue(int c) { 707 return UBiDiProps.INSTANCE.getJoiningType(c); 708 } 709 }, 710 new IntProperty(2, LB_MASK, LB_SHIFT), // LINE_BREAK 711 new IntProperty(SRC_CHAR) { // NUMERIC_TYPE 712 @Override 713 int getValue(int c) { 714 return ntvGetType(getNumericTypeValue(getProperty(c))); 715 } 716 @Override 717 int getMaxValue(int which) { 718 return NumericType.COUNT-1; 719 } 720 }, 721 new IntProperty(SRC_PROPSVEC) { 722 @Override 723 int getValue(int c) { 724 return UScript.getScript(c); 725 } 726 @Override 727 int getMaxValue(int which) { 728 int scriptX=getMaxValues(0)&SCRIPT_X_MASK; 729 return mergeScriptCodeOrIndex(scriptX); 730 } 731 }, 732 new IntProperty(SRC_PROPSVEC) { // HANGUL_SYLLABLE_TYPE 733 @Override 734 int getValue(int c) { 735 /* see comments on gcbToHst[] above */ 736 int gcb=(getAdditional(c, 2)&GCB_MASK)>>>GCB_SHIFT; 737 if(gcb<gcbToHst.length) { 738 return gcbToHst[gcb]; 739 } else { 740 return HangulSyllableType.NOT_APPLICABLE; 741 } 742 } 743 @Override 744 int getMaxValue(int which) { 745 return HangulSyllableType.COUNT-1; 746 } 747 }, 748 // max=1=YES -- these are never "maybe", only "no" or "yes" 749 new NormQuickCheckIntProperty(SRC_NFC, UProperty.NFD_QUICK_CHECK, 1), 750 new NormQuickCheckIntProperty(SRC_NFKC, UProperty.NFKD_QUICK_CHECK, 1), 751 // max=2=MAYBE 752 new NormQuickCheckIntProperty(SRC_NFC, UProperty.NFC_QUICK_CHECK, 2), 753 new NormQuickCheckIntProperty(SRC_NFKC, UProperty.NFKC_QUICK_CHECK, 2), 754 new CombiningClassIntProperty(SRC_NFC) { // LEAD_CANONICAL_COMBINING_CLASS 755 @Override 756 int getValue(int c) { 757 return 
Norm2AllModes.getNFCInstance().impl.getFCD16(c)>>8; 758 } 759 }, 760 new CombiningClassIntProperty(SRC_NFC) { // TRAIL_CANONICAL_COMBINING_CLASS 761 @Override 762 int getValue(int c) { 763 return Norm2AllModes.getNFCInstance().impl.getFCD16(c)&0xff; 764 } 765 }, 766 new IntProperty(2, GCB_MASK, GCB_SHIFT), // GRAPHEME_CLUSTER_BREAK 767 new IntProperty(2, SB_MASK, SB_SHIFT), // SENTENCE_BREAK 768 new IntProperty(2, WB_MASK, WB_SHIFT), // WORD_BREAK 769 new BiDiIntProperty() { // BIDI_PAIRED_BRACKET_TYPE 770 @Override 771 int getValue(int c) { 772 return UBiDiProps.INSTANCE.getPairedBracketType(c); 773 } 774 }, 775 new IntProperty(SRC_INPC) { 776 @Override 777 int getValue(int c) { 778 CodePointTrie trie = LayoutProps.INSTANCE.inpcTrie; 779 return trie != null ? trie.get(c) : 0; 780 } 781 @Override 782 int getMaxValue(int which) { 783 return LayoutProps.INSTANCE.maxInpcValue; 784 } 785 }, 786 new IntProperty(SRC_INSC) { 787 @Override 788 int getValue(int c) { 789 CodePointTrie trie = LayoutProps.INSTANCE.inscTrie; 790 return trie != null ? trie.get(c) : 0; 791 } 792 @Override 793 int getMaxValue(int which) { 794 return LayoutProps.INSTANCE.maxInscValue; 795 } 796 }, 797 new IntProperty(SRC_VO) { 798 @Override 799 int getValue(int c) { 800 CodePointTrie trie = LayoutProps.INSTANCE.voTrie; 801 return trie != null ? trie.get(c) : 0; 802 } 803 @Override 804 int getMaxValue(int which) { 805 return LayoutProps.INSTANCE.maxVoValue; 806 } 807 }, 808 }; 809 getIntPropertyValue(int c, int which)810 public int getIntPropertyValue(int c, int which) { 811 if(which<UProperty.INT_START) { 812 if(UProperty.BINARY_START<=which && which<UProperty.BINARY_LIMIT) { 813 return binProps[which].contains(c) ? 
1 : 0; 814 } 815 } else if(which<UProperty.INT_LIMIT) { 816 return intProps[which-UProperty.INT_START].getValue(c); 817 } else if (which == UProperty.GENERAL_CATEGORY_MASK) { 818 return getMask(getType(c)); 819 } 820 return 0; // undefined 821 } 822 getIntPropertyMaxValue(int which)823 public int getIntPropertyMaxValue(int which) { 824 if(which<UProperty.INT_START) { 825 if(UProperty.BINARY_START<=which && which<UProperty.BINARY_LIMIT) { 826 return 1; // maximum TRUE for all binary properties 827 } 828 } else if(which<UProperty.INT_LIMIT) { 829 return intProps[which-UProperty.INT_START].getMaxValue(which); 830 } 831 return -1; // undefined 832 } 833 getSource(int which)834 final int getSource(int which) { 835 if(which<UProperty.BINARY_START) { 836 return SRC_NONE; /* undefined */ 837 } else if(which<UProperty.BINARY_LIMIT) { 838 return binProps[which].getSource(); 839 } else if(which<UProperty.INT_START) { 840 return SRC_NONE; /* undefined */ 841 } else if(which<UProperty.INT_LIMIT) { 842 return intProps[which-UProperty.INT_START].getSource(); 843 } else if(which<UProperty.STRING_START) { 844 switch(which) { 845 case UProperty.GENERAL_CATEGORY_MASK: 846 case UProperty.NUMERIC_VALUE: 847 return SRC_CHAR; 848 849 default: 850 return SRC_NONE; 851 } 852 } else if(which<UProperty.STRING_LIMIT) { 853 switch(which) { 854 case UProperty.AGE: 855 return SRC_PROPSVEC; 856 857 case UProperty.BIDI_MIRRORING_GLYPH: 858 return SRC_BIDI; 859 860 case UProperty.CASE_FOLDING: 861 case UProperty.LOWERCASE_MAPPING: 862 case UProperty.SIMPLE_CASE_FOLDING: 863 case UProperty.SIMPLE_LOWERCASE_MAPPING: 864 case UProperty.SIMPLE_TITLECASE_MAPPING: 865 case UProperty.SIMPLE_UPPERCASE_MAPPING: 866 case UProperty.TITLECASE_MAPPING: 867 case UProperty.UPPERCASE_MAPPING: 868 return SRC_CASE; 869 870 case UProperty.ISO_COMMENT: 871 case UProperty.NAME: 872 case UProperty.UNICODE_1_NAME: 873 return SRC_NAMES; 874 875 default: 876 return SRC_NONE; 877 } 878 } else { 879 switch(which) { 880 case 
UProperty.SCRIPT_EXTENSIONS: 881 return SRC_PROPSVEC; 882 default: 883 return SRC_NONE; /* undefined */ 884 } 885 } 886 } 887 888 /** 889 * <p> 890 * Unicode property names and property value names are compared 891 * "loosely". Property[Value]Aliases.txt say: 892 * <quote> 893 * "With loose matching of property names, the case distinctions, 894 * whitespace, and '_' are ignored." 895 * </quote> 896 * </p> 897 * <p> 898 * This function does just that, for ASCII (char *) name strings. 899 * It is almost identical to ucnv_compareNames() but also ignores 900 * ASCII White_Space characters (U+0009..U+000d). 901 * </p> 902 * @param name1 name to compare 903 * @param name2 name to compare 904 * @return 0 if names are equal, < 0 if name1 is less than name2 and > 0 905 * if name1 is greater than name2. 906 */ 907 /* to be implemented in 2.4 908 * public static int comparePropertyNames(String name1, String name2) 909 { 910 int result = 0; 911 int i1 = 0; 912 int i2 = 0; 913 while (true) { 914 char ch1 = 0; 915 char ch2 = 0; 916 // Ignore delimiters '-', '_', and ASCII White_Space 917 if (i1 < name1.length()) { 918 ch1 = name1.charAt(i1 ++); 919 } 920 while (ch1 == '-' || ch1 == '_' || ch1 == ' ' || ch1 == '\t' 921 || ch1 == '\n' // synwee what is || ch1 == '\v' 922 || ch1 == '\f' || ch1=='\r') { 923 if (i1 < name1.length()) { 924 ch1 = name1.charAt(i1 ++); 925 } 926 else { 927 ch1 = 0; 928 } 929 } 930 if (i2 < name2.length()) { 931 ch2 = name2.charAt(i2 ++); 932 } 933 while (ch2 == '-' || ch2 == '_' || ch2 == ' ' || ch2 == '\t' 934 || ch2 == '\n' // synwee what is || ch1 == '\v' 935 || ch2 == '\f' || ch2=='\r') { 936 if (i2 < name2.length()) { 937 ch2 = name2.charAt(i2 ++); 938 } 939 else { 940 ch2 = 0; 941 } 942 } 943 944 // If we reach the ends of both strings then they match 945 if (ch1 == 0 && ch2 == 0) { 946 return 0; 947 } 948 949 // Case-insensitive comparison 950 if (ch1 != ch2) { 951 result = Character.toLowerCase(ch1) 952 - Character.toLowerCase(ch2); 953 if 
(result != 0) { 954 return result; 955 } 956 } 957 } 958 } 959 */ 960 961 /** 962 * Get the the maximum values for some enum/int properties. 963 * @return maximum values for the integer properties. 964 */ getMaxValues(int column)965 public int getMaxValues(int column) 966 { 967 // return m_maxBlockScriptValue_; 968 969 switch(column) { 970 case 0: 971 return m_maxBlockScriptValue_; 972 case 2: 973 return m_maxJTGValue_; 974 default: 975 return 0; 976 } 977 } 978 979 /** 980 * Gets the type mask 981 * @param type character type 982 * @return mask 983 */ getMask(int type)984 public static final int getMask(int type) 985 { 986 return 1 << type; 987 } 988 989 990 /** 991 * Returns the digit values of characters like 'A' - 'Z', normal, 992 * half-width and full-width. This method assumes that the other digit 993 * characters are checked by the calling method. 994 * @param ch character to test 995 * @return -1 if ch is not a character of the form 'A' - 'Z', otherwise 996 * its corresponding digit will be returned. 997 */ getEuropeanDigit(int ch)998 public static int getEuropeanDigit(int ch) { 999 if ((ch > 0x7a && ch < 0xff21) 1000 || ch < 0x41 || (ch > 0x5a && ch < 0x61) 1001 || ch > 0xff5a || (ch > 0xff3a && ch < 0xff41)) { 1002 return -1; 1003 } 1004 if (ch <= 0x7a) { 1005 // ch >= 0x41 or ch < 0x61 1006 return ch + 10 - ((ch <= 0x5a) ? 
0x41 : 0x61); 1007 } 1008 // ch >= 0xff21 1009 if (ch <= 0xff3a) { 1010 return ch + 10 - 0xff21; 1011 } 1012 // ch >= 0xff41 && ch <= 0xff5a 1013 return ch + 10 - 0xff41; 1014 } 1015 digit(int c)1016 public int digit(int c) { 1017 int value = getNumericTypeValue(getProperty(c)) - NTV_DECIMAL_START_; 1018 if(value<=9) { 1019 return value; 1020 } else { 1021 return -1; 1022 } 1023 } 1024 getNumericValue(int c)1025 public int getNumericValue(int c) { 1026 // slightly pruned version of getUnicodeNumericValue(), plus getEuropeanDigit() 1027 int ntv = getNumericTypeValue(getProperty(c)); 1028 1029 if(ntv==NTV_NONE_) { 1030 return getEuropeanDigit(c); 1031 } else if(ntv<NTV_DIGIT_START_) { 1032 /* decimal digit */ 1033 return ntv-NTV_DECIMAL_START_; 1034 } else if(ntv<NTV_NUMERIC_START_) { 1035 /* other digit */ 1036 return ntv-NTV_DIGIT_START_; 1037 } else if(ntv<NTV_FRACTION_START_) { 1038 /* small integer */ 1039 return ntv-NTV_NUMERIC_START_; 1040 } else if(ntv<NTV_LARGE_START_) { 1041 /* fraction */ 1042 return -2; 1043 } else if(ntv<NTV_BASE60_START_) { 1044 /* large, single-significant-digit integer */ 1045 int mant=(ntv>>5)-14; 1046 int exp=(ntv&0x1f)+2; 1047 if(exp<9 || (exp==9 && mant<=2)) { 1048 int numValue=mant; 1049 do { 1050 numValue*=10; 1051 } while(--exp>0); 1052 return numValue; 1053 } else { 1054 return -2; 1055 } 1056 } else if(ntv<NTV_FRACTION20_START_) { 1057 /* sexagesimal (base 60) integer */ 1058 int numValue=(ntv>>2)-0xbf; 1059 int exp=(ntv&3)+1; 1060 1061 switch(exp) { 1062 case 4: 1063 numValue*=60*60*60*60; 1064 break; 1065 case 3: 1066 numValue*=60*60*60; 1067 break; 1068 case 2: 1069 numValue*=60*60; 1070 break; 1071 case 1: 1072 numValue*=60; 1073 break; 1074 case 0: 1075 default: 1076 break; 1077 } 1078 1079 return numValue; 1080 } else if(ntv<NTV_RESERVED_START_) { 1081 // fraction-20 e.g. 
3/80 1082 return -2; 1083 } else { 1084 /* reserved */ 1085 return -2; 1086 } 1087 } 1088 getUnicodeNumericValue(int c)1089 public double getUnicodeNumericValue(int c) { 1090 // equivalent to c version double u_getNumericValue(UChar32 c) 1091 int ntv = getNumericTypeValue(getProperty(c)); 1092 1093 if(ntv==NTV_NONE_) { 1094 return UCharacter.NO_NUMERIC_VALUE; 1095 } else if(ntv<NTV_DIGIT_START_) { 1096 /* decimal digit */ 1097 return ntv-NTV_DECIMAL_START_; 1098 } else if(ntv<NTV_NUMERIC_START_) { 1099 /* other digit */ 1100 return ntv-NTV_DIGIT_START_; 1101 } else if(ntv<NTV_FRACTION_START_) { 1102 /* small integer */ 1103 return ntv-NTV_NUMERIC_START_; 1104 } else if(ntv<NTV_LARGE_START_) { 1105 /* fraction */ 1106 int numerator=(ntv>>4)-12; 1107 int denominator=(ntv&0xf)+1; 1108 return (double)numerator/denominator; 1109 } else if(ntv<NTV_BASE60_START_) { 1110 /* large, single-significant-digit integer */ 1111 double numValue; 1112 int mant=(ntv>>5)-14; 1113 int exp=(ntv&0x1f)+2; 1114 numValue=mant; 1115 1116 /* multiply by 10^exp without math.h */ 1117 while(exp>=4) { 1118 numValue*=10000.; 1119 exp-=4; 1120 } 1121 switch(exp) { 1122 case 3: 1123 numValue*=1000.; 1124 break; 1125 case 2: 1126 numValue*=100.; 1127 break; 1128 case 1: 1129 numValue*=10.; 1130 break; 1131 case 0: 1132 default: 1133 break; 1134 } 1135 1136 return numValue; 1137 } else if(ntv<NTV_FRACTION20_START_) { 1138 /* sexagesimal (base 60) integer */ 1139 int numValue=(ntv>>2)-0xbf; 1140 int exp=(ntv&3)+1; 1141 1142 switch(exp) { 1143 case 4: 1144 numValue*=60*60*60*60; 1145 break; 1146 case 3: 1147 numValue*=60*60*60; 1148 break; 1149 case 2: 1150 numValue*=60*60; 1151 break; 1152 case 1: 1153 numValue*=60; 1154 break; 1155 case 0: 1156 default: 1157 break; 1158 } 1159 1160 return numValue; 1161 } else if(ntv<NTV_FRACTION32_START_) { 1162 // fraction-20 e.g. 
3/80 1163 int frac20=ntv-NTV_FRACTION20_START_; // 0..0x17 1164 int numerator=2*(frac20&3)+1; 1165 int denominator=20<<(frac20>>2); 1166 return (double)numerator/denominator; 1167 } else if(ntv<NTV_RESERVED_START_) { 1168 // fraction-32 e.g. 3/64 1169 int frac32=ntv-NTV_FRACTION32_START_; // 0..15 1170 int numerator=2*(frac32&3)+1; 1171 int denominator=32<<(frac32>>2); 1172 return (double)numerator/denominator; 1173 } else { 1174 /* reserved */ 1175 return UCharacter.NO_NUMERIC_VALUE; 1176 } 1177 } 1178 1179 // protected variables ----------------------------------------------- 1180 1181 /** 1182 * Extra property trie 1183 */ 1184 Trie2_16 m_additionalTrie_; 1185 /** 1186 * Extra property vectors, 1st column for age and second for binary 1187 * properties. 1188 */ 1189 int m_additionalVectors_[]; 1190 /** 1191 * Number of additional columns 1192 */ 1193 int m_additionalColumnsCount_; 1194 /** 1195 * Maximum values for block, bits used as in vector word 1196 * 0 1197 */ 1198 int m_maxBlockScriptValue_; 1199 /** 1200 * Maximum values for script, bits used as in vector word 1201 * 0 1202 */ 1203 int m_maxJTGValue_; 1204 1205 /** 1206 * Script_Extensions data 1207 */ 1208 public char[] m_scriptExtensions_; 1209 1210 // private variables ------------------------------------------------- 1211 1212 /** 1213 * Default name of the datafile 1214 */ 1215 private static final String DATA_FILE_NAME_ = "uprops.icu"; 1216 1217 // property data constants ------------------------------------------------- 1218 1219 /** 1220 * Numeric types and values in the main properties words. 1221 */ 1222 private static final int NUMERIC_TYPE_VALUE_SHIFT_ = 6; getNumericTypeValue(int props)1223 private static final int getNumericTypeValue(int props) { 1224 return props >> NUMERIC_TYPE_VALUE_SHIFT_; 1225 } 1226 /* constants for the storage form of numeric types and values */ 1227 /** No numeric value. 
*/ 1228 private static final int NTV_NONE_ = 0; 1229 /** Decimal digits: nv=0..9 */ 1230 private static final int NTV_DECIMAL_START_ = 1; 1231 /** Other digits: nv=0..9 */ 1232 private static final int NTV_DIGIT_START_ = 11; 1233 /** Small integers: nv=0..154 */ 1234 private static final int NTV_NUMERIC_START_ = 21; 1235 /** Fractions: ((ntv>>4)-12) / ((ntv&0xf)+1) = -1..17 / 1..16 */ 1236 private static final int NTV_FRACTION_START_ = 0xb0; 1237 /** 1238 * Large integers: 1239 * ((ntv>>5)-14) * 10^((ntv&0x1f)+2) = (1..9)*(10^2..10^33) 1240 * (only one significant decimal digit) 1241 */ 1242 private static final int NTV_LARGE_START_ = 0x1e0; 1243 /** 1244 * Sexagesimal numbers: 1245 * ((ntv>>2)-0xbf) * 60^((ntv&3)+1) = (1..9)*(60^1..60^4) 1246 */ 1247 private static final int NTV_BASE60_START_=0x300; 1248 /** 1249 * Fraction-20 values: 1250 * frac20 = ntv-0x324 = 0..0x17 -> 1|3|5|7 / 20|40|80|160|320|640 1251 * numerator: num = 2*(frac20&3)+1 1252 * denominator: den = 20<<(frac20>>2) 1253 */ 1254 private static final int NTV_FRACTION20_START_ = NTV_BASE60_START_ + 36; // 0x300+9*4=0x324 1255 /** 1256 * Fraction-32 values: 1257 * frac32 = ntv-0x34c = 0..15 -> 1|3|5|7 / 32|64|128|256 1258 * numerator: num = 2*(frac32&3)+1 1259 * denominator: den = 32<<(frac32>>2) 1260 */ 1261 private static final int NTV_FRACTION32_START_ = NTV_FRACTION20_START_ + 24; // 0x324+6*4=0x34c 1262 /** No numeric value (yet). */ 1263 private static final int NTV_RESERVED_START_ = NTV_FRACTION32_START_ + 16; // 0x34c+4*4=0x35c 1264 ntvGetType(int ntv)1265 private static final int ntvGetType(int ntv) { 1266 return 1267 (ntv==NTV_NONE_) ? NumericType.NONE : 1268 (ntv<NTV_DIGIT_START_) ? NumericType.DECIMAL : 1269 (ntv<NTV_NUMERIC_START_) ? 
NumericType.DIGIT : 1270 NumericType.NUMERIC; 1271 } 1272 1273 /* 1274 * Properties in vector word 0 1275 * Bits 1276 * 31..24 DerivedAge version major/minor one nibble each 1277 * 23..22 3..1: Bits 21..20 & 7..0 = Script_Extensions index 1278 * 3: Script value from Script_Extensions 1279 * 2: Script=Inherited 1280 * 1: Script=Common 1281 * 0: Script=bits 21..20 & 7..0 1282 * 21..20 Bits 9..8 of the UScriptCode, or index to Script_Extensions 1283 * 19..17 East Asian Width 1284 * 16.. 8 UBlockCode 1285 * 7.. 0 UScriptCode, or index to Script_Extensions 1286 */ 1287 1288 /** 1289 * Script_Extensions: mask includes Script 1290 */ 1291 public static final int SCRIPT_X_MASK = 0x00f000ff; 1292 //private static final int SCRIPT_X_SHIFT = 22; 1293 1294 // The UScriptCode or Script_Extensions index is split across two bit fields. 1295 // (Starting with Unicode 13/ICU 66/2019 due to more varied Script_Extensions.) 1296 // Shift the high bits right by 12 to assemble the full value. 1297 public static final int SCRIPT_HIGH_MASK = 0x00300000; 1298 public static final int SCRIPT_HIGH_SHIFT = 12; 1299 public static final int MAX_SCRIPT = 0x3ff; 1300 1301 /** 1302 * Integer properties mask and shift values for East Asian cell width. 1303 * Equivalent to icu4c UPROPS_EA_MASK 1304 */ 1305 private static final int EAST_ASIAN_MASK_ = 0x000e0000; 1306 /** 1307 * Integer properties mask and shift values for East Asian cell width. 1308 * Equivalent to icu4c UPROPS_EA_SHIFT 1309 */ 1310 private static final int EAST_ASIAN_SHIFT_ = 17; 1311 /** 1312 * Integer properties mask and shift values for blocks. 1313 * Equivalent to icu4c UPROPS_BLOCK_MASK 1314 */ 1315 private static final int BLOCK_MASK_ = 0x0001ff00; 1316 /** 1317 * Integer properties mask and shift values for blocks. 1318 * Equivalent to icu4c UPROPS_BLOCK_SHIFT 1319 */ 1320 private static final int BLOCK_SHIFT_ = 8; 1321 /** 1322 * Integer properties mask and shift values for scripts. 
1323 * Equivalent to icu4c UPROPS_SHIFT_LOW_MASK. 1324 */ 1325 public static final int SCRIPT_LOW_MASK = 0x000000ff; 1326 1327 /* SCRIPT_X_WITH_COMMON must be the lowest value that involves Script_Extensions. */ 1328 public static final int SCRIPT_X_WITH_COMMON = 0x400000; 1329 public static final int SCRIPT_X_WITH_INHERITED = 0x800000; 1330 public static final int SCRIPT_X_WITH_OTHER = 0xc00000; 1331 mergeScriptCodeOrIndex(int scriptX)1332 public static final int mergeScriptCodeOrIndex(int scriptX) { 1333 return 1334 ((scriptX & SCRIPT_HIGH_MASK) >> SCRIPT_HIGH_SHIFT) | 1335 (scriptX & SCRIPT_LOW_MASK); 1336 } 1337 1338 /** 1339 * Additional properties used in internal trie data 1340 */ 1341 /* 1342 * Properties in vector word 1 1343 * Each bit encodes one binary property. 1344 * The following constants represent the bit number, use 1<<UPROPS_XYZ. 1345 * UPROPS_BINARY_1_TOP<=32! 1346 * 1347 * Keep this list of property enums in sync with 1348 * propListNames[] in icu/source/tools/genprops/props2.c! 1349 * 1350 * ICU 2.6/uprops format version 3.2 stores full properties instead of "Other_". 
    /**
     * Additional properties used in internal trie data
     */
    /*
     * Properties in vector word 1
     * Each bit encodes one binary property.
     * The following constants represent the bit number, use 1<<UPROPS_XYZ.
     * UPROPS_BINARY_1_TOP<=32!
     *
     * Keep this list of property enums in sync with
     * propListNames[] in icu/source/tools/genprops/props2.c!
     *
     * ICU 2.6/uprops format version 3.2 stores full properties instead of "Other_".
     */
    // Bit numbers 0..31; the last constant uses the top bit of the 32-bit word.
    private static final int WHITE_SPACE_PROPERTY_ = 0;
    private static final int DASH_PROPERTY_ = 1;
    private static final int HYPHEN_PROPERTY_ = 2;
    private static final int QUOTATION_MARK_PROPERTY_ = 3;
    private static final int TERMINAL_PUNCTUATION_PROPERTY_ = 4;
    private static final int MATH_PROPERTY_ = 5;
    private static final int HEX_DIGIT_PROPERTY_ = 6;
    private static final int ASCII_HEX_DIGIT_PROPERTY_ = 7;
    private static final int ALPHABETIC_PROPERTY_ = 8;
    private static final int IDEOGRAPHIC_PROPERTY_ = 9;
    private static final int DIACRITIC_PROPERTY_ = 10;
    private static final int EXTENDER_PROPERTY_ = 11;
    private static final int NONCHARACTER_CODE_POINT_PROPERTY_ = 12;
    private static final int GRAPHEME_EXTEND_PROPERTY_ = 13;
    private static final int GRAPHEME_LINK_PROPERTY_ = 14;
    private static final int IDS_BINARY_OPERATOR_PROPERTY_ = 15;
    private static final int IDS_TRINARY_OPERATOR_PROPERTY_ = 16;
    private static final int RADICAL_PROPERTY_ = 17;
    private static final int UNIFIED_IDEOGRAPH_PROPERTY_ = 18;
    private static final int DEFAULT_IGNORABLE_CODE_POINT_PROPERTY_ = 19;
    private static final int DEPRECATED_PROPERTY_ = 20;
    private static final int LOGICAL_ORDER_EXCEPTION_PROPERTY_ = 21;
    private static final int XID_START_PROPERTY_ = 22;
    private static final int XID_CONTINUE_PROPERTY_ = 23;
    private static final int ID_START_PROPERTY_ = 24;
    private static final int ID_CONTINUE_PROPERTY_ = 25;
    private static final int GRAPHEME_BASE_PROPERTY_ = 26;
    private static final int S_TERM_PROPERTY_ = 27;
    private static final int VARIATION_SELECTOR_PROPERTY_ = 28;
    private static final int PATTERN_SYNTAX = 29; /* new in ICU 3.4 and Unicode 4.1 */
    private static final int PATTERN_WHITE_SPACE = 30;
    private static final int PREPENDED_CONCATENATION_MARK = 31; // new in ICU 60 and Unicode 10

    /*
     * Properties in vector word 2
     * Bits
     * 31..26   unused since ICU 70 added uemoji.icu;
     *          in ICU 57..69 stored emoji properties
     * 25..20   Line Break
     * 19..15   Sentence Break
     * 14..10   Word Break
     *  9.. 5   Grapheme Cluster Break
     *  4.. 0   Decomposition Type
     */
    // Retired emoji bit numbers, kept (disabled) as a record of the ICU 57..69 layout.
    //ivate static final int PROPS_2_EXTENDED_PICTOGRAPHIC=26; // ICU 62..69
    //ivate static final int PROPS_2_EMOJI_COMPONENT = 27; // ICU 60..69
    //ivate static final int PROPS_2_EMOJI = 28; // ICU 57..69
    //ivate static final int PROPS_2_EMOJI_PRESENTATION = 29; // ICU 57..69
    //ivate static final int PROPS_2_EMOJI_MODIFIER = 30; // ICU 57..69
    //ivate static final int PROPS_2_EMOJI_MODIFIER_BASE = 31; // ICU 57..69

    // Line Break: bits 25..20 of vector word 2
    private static final int LB_MASK = 0x03f00000;
    private static final int LB_SHIFT = 20;

    // Sentence Break: bits 19..15
    private static final int SB_MASK = 0x000f8000;
    private static final int SB_SHIFT = 15;

    // Word Break: bits 14..10
    private static final int WB_MASK = 0x00007c00;
    private static final int WB_SHIFT = 10;

    // Grapheme Cluster Break: bits 9..5
    private static final int GCB_MASK = 0x000003e0;
    private static final int GCB_SHIFT = 5;
1418 */ 1419 private static final int DECOMPOSITION_TYPE_MASK_ = 0x0000001f; 1420 1421 /** 1422 * First nibble shift 1423 */ 1424 private static final int FIRST_NIBBLE_SHIFT_ = 0x4; 1425 /** 1426 * Second nibble mask 1427 */ 1428 private static final int LAST_NIBBLE_MASK_ = 0xF; 1429 /** 1430 * Age value shift 1431 */ 1432 private static final int AGE_SHIFT_ = 24; 1433 1434 1435 // private constructors -------------------------------------------------- 1436 1437 /** 1438 * Constructor 1439 * @exception IOException thrown when data reading fails or data corrupted 1440 */ UCharacterProperty()1441 private UCharacterProperty() throws IOException 1442 { 1443 // consistency check 1444 if(binProps.length!=UProperty.BINARY_LIMIT) { 1445 throw new ICUException("binProps.length!=UProperty.BINARY_LIMIT"); 1446 } 1447 if(intProps.length!=(UProperty.INT_LIMIT-UProperty.INT_START)) { 1448 throw new ICUException("intProps.length!=(UProperty.INT_LIMIT-UProperty.INT_START)"); 1449 } 1450 1451 // jar access 1452 ByteBuffer bytes=ICUBinary.getRequiredData(DATA_FILE_NAME_); 1453 m_unicodeVersion_ = ICUBinary.readHeaderAndDataVersion(bytes, DATA_FORMAT, new IsAcceptable()); 1454 // Read or skip the 16 indexes. 
1455 int propertyOffset = bytes.getInt(); 1456 /* exceptionOffset = */ bytes.getInt(); 1457 /* caseOffset = */ bytes.getInt(); 1458 int additionalOffset = bytes.getInt(); 1459 int additionalVectorsOffset = bytes.getInt(); 1460 m_additionalColumnsCount_ = bytes.getInt(); 1461 int scriptExtensionsOffset = bytes.getInt(); 1462 int reservedOffset7 = bytes.getInt(); 1463 /* reservedOffset8 = */ bytes.getInt(); 1464 /* dataTopOffset = */ bytes.getInt(); 1465 m_maxBlockScriptValue_ = bytes.getInt(); 1466 m_maxJTGValue_ = bytes.getInt(); 1467 ICUBinary.skipBytes(bytes, (16 - 12) << 2); 1468 1469 // read the main properties trie 1470 m_trie_ = Trie2_16.createFromSerialized(bytes); 1471 int expectedTrieLength = (propertyOffset - 16) * 4; 1472 int trieLength = m_trie_.getSerializedLength(); 1473 if(trieLength > expectedTrieLength) { 1474 throw new IOException("uprops.icu: not enough bytes for main trie"); 1475 } 1476 // skip padding after trie bytes 1477 ICUBinary.skipBytes(bytes, expectedTrieLength - trieLength); 1478 1479 // skip unused intervening data structures 1480 ICUBinary.skipBytes(bytes, (additionalOffset - propertyOffset) * 4); 1481 1482 if(m_additionalColumnsCount_ > 0) { 1483 // reads the additional property block 1484 m_additionalTrie_ = Trie2_16.createFromSerialized(bytes); 1485 expectedTrieLength = (additionalVectorsOffset-additionalOffset)*4; 1486 trieLength = m_additionalTrie_.getSerializedLength(); 1487 if(trieLength > expectedTrieLength) { 1488 throw new IOException("uprops.icu: not enough bytes for additional-properties trie"); 1489 } 1490 // skip padding after trie bytes 1491 ICUBinary.skipBytes(bytes, expectedTrieLength - trieLength); 1492 1493 // additional properties 1494 int size = scriptExtensionsOffset - additionalVectorsOffset; 1495 m_additionalVectors_ = ICUBinary.getInts(bytes, size, 0); 1496 } 1497 1498 // Script_Extensions 1499 int numChars = (reservedOffset7 - scriptExtensionsOffset) * 2; 1500 if(numChars > 0) { 1501 m_scriptExtensions_ = 
ICUBinary.getChars(bytes, numChars, 0); 1502 } 1503 } 1504 1505 private static final class IsAcceptable implements ICUBinary.Authenticate { 1506 @Override isDataVersionAcceptable(byte version[])1507 public boolean isDataVersionAcceptable(byte version[]) { 1508 return version[0] == 7; 1509 } 1510 } 1511 private static final int DATA_FORMAT = 0x5550726F; // "UPro" 1512 1513 // private methods ------------------------------------------------------- 1514 1515 /* 1516 * Compare additional properties to see if it has argument type 1517 * @param property 32 bit properties 1518 * @param type character type 1519 * @return true if property has type 1520 */ 1521 /*private boolean compareAdditionalType(int property, int type) 1522 { 1523 return (property & (1 << type)) != 0; 1524 }*/ 1525 1526 // property starts for UnicodeSet -------------------------------------- *** 1527 1528 private static final int TAB = 0x0009; 1529 //private static final int LF = 0x000a; 1530 //private static final int FF = 0x000c; 1531 private static final int CR = 0x000d; 1532 private static final int U_A = 0x0041; 1533 private static final int U_F = 0x0046; 1534 private static final int U_Z = 0x005a; 1535 private static final int U_a = 0x0061; 1536 private static final int U_f = 0x0066; 1537 private static final int U_z = 0x007a; 1538 private static final int DEL = 0x007f; 1539 private static final int NL = 0x0085; 1540 private static final int NBSP = 0x00a0; 1541 private static final int CGJ = 0x034f; 1542 private static final int FIGURESP= 0x2007; 1543 private static final int HAIRSP = 0x200a; 1544 //private static final int ZWNJ = 0x200c; 1545 //private static final int ZWJ = 0x200d; 1546 private static final int RLM = 0x200f; 1547 private static final int NNBSP = 0x202f; 1548 private static final int WJ = 0x2060; 1549 private static final int INHSWAP = 0x206a; 1550 private static final int NOMDIG = 0x206f; 1551 private static final int U_FW_A = 0xff21; 1552 private static final int U_FW_F = 
0xff26; 1553 private static final int U_FW_Z = 0xff3a; 1554 private static final int U_FW_a = 0xff41; 1555 private static final int U_FW_f = 0xff46; 1556 private static final int U_FW_z = 0xff5a; 1557 private static final int ZWNBSP = 0xfeff; 1558 addPropertyStarts(UnicodeSet set)1559 public UnicodeSet addPropertyStarts(UnicodeSet set) { 1560 /* add the start code point of each same-value range of the main trie */ 1561 Iterator<Trie2.Range> trieIterator = m_trie_.iterator(); 1562 Trie2.Range range; 1563 while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) { 1564 set.add(range.startCodePoint); 1565 } 1566 1567 /* add code points with hardcoded properties, plus the ones following them */ 1568 1569 /* add for u_isblank() */ 1570 set.add(TAB); 1571 set.add(TAB+1); 1572 1573 /* add for IS_THAT_CONTROL_SPACE() */ 1574 set.add(CR+1); /* range TAB..CR */ 1575 set.add(0x1c); 1576 set.add(0x1f+1); 1577 set.add(NL); 1578 set.add(NL+1); 1579 1580 /* add for u_isIDIgnorable() what was not added above */ 1581 set.add(DEL); /* range DEL..NBSP-1, NBSP added below */ 1582 set.add(HAIRSP); 1583 set.add(RLM+1); 1584 set.add(INHSWAP); 1585 set.add(NOMDIG+1); 1586 set.add(ZWNBSP); 1587 set.add(ZWNBSP+1); 1588 1589 /* add no-break spaces for u_isWhitespace() what was not added above */ 1590 set.add(NBSP); 1591 set.add(NBSP+1); 1592 set.add(FIGURESP); 1593 set.add(FIGURESP+1); 1594 set.add(NNBSP); 1595 set.add(NNBSP+1); 1596 1597 /* add for u_charDigitValue() */ 1598 // TODO remove when UCharacter.getHanNumericValue() is changed to just return 1599 // Unicode numeric values 1600 set.add(0x3007); 1601 set.add(0x3008); 1602 set.add(0x4e00); 1603 set.add(0x4e01); 1604 set.add(0x4e8c); 1605 set.add(0x4e8d); 1606 set.add(0x4e09); 1607 set.add(0x4e0a); 1608 set.add(0x56db); 1609 set.add(0x56dc); 1610 set.add(0x4e94); 1611 set.add(0x4e95); 1612 set.add(0x516d); 1613 set.add(0x516e); 1614 set.add(0x4e03); 1615 set.add(0x4e04); 1616 set.add(0x516b); 1617 set.add(0x516c); 
1618 set.add(0x4e5d); 1619 set.add(0x4e5e); 1620 1621 /* add for u_digit() */ 1622 set.add(U_a); 1623 set.add(U_z+1); 1624 set.add(U_A); 1625 set.add(U_Z+1); 1626 set.add(U_FW_a); 1627 set.add(U_FW_z+1); 1628 set.add(U_FW_A); 1629 set.add(U_FW_Z+1); 1630 1631 /* add for u_isxdigit() */ 1632 set.add(U_f+1); 1633 set.add(U_F+1); 1634 set.add(U_FW_f+1); 1635 set.add(U_FW_F+1); 1636 1637 /* add for UCHAR_DEFAULT_IGNORABLE_CODE_POINT what was not added above */ 1638 set.add(WJ); /* range WJ..NOMDIG */ 1639 set.add(0xfff0); 1640 set.add(0xfffb+1); 1641 set.add(0xe0000); 1642 set.add(0xe0fff+1); 1643 1644 /* add for UCHAR_GRAPHEME_BASE and others */ 1645 set.add(CGJ); 1646 set.add(CGJ+1); 1647 1648 return set; // for chaining 1649 } 1650 upropsvec_addPropertyStarts(UnicodeSet set)1651 public void upropsvec_addPropertyStarts(UnicodeSet set) { 1652 /* add the start code point of each same-value range of the properties vectors trie */ 1653 if(m_additionalColumnsCount_>0) { 1654 /* if m_additionalColumnsCount_==0 then the properties vectors trie may not be there at all */ 1655 Iterator<Trie2.Range> trieIterator = m_additionalTrie_.iterator(); 1656 Trie2.Range range; 1657 while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) { 1658 set.add(range.startCodePoint); 1659 } 1660 } 1661 } 1662 ulayout_addPropertyStarts(int src, UnicodeSet set)1663 static UnicodeSet ulayout_addPropertyStarts(int src, UnicodeSet set) { 1664 return LayoutProps.INSTANCE.addPropertyStarts(src, set); 1665 } 1666 1667 // This static initializer block must be placed after 1668 // other static member initialization 1669 static { 1670 try { 1671 INSTANCE = new UCharacterProperty(); 1672 } 1673 catch (IOException e) { 1674 throw new MissingResourceException(e.getMessage(),"",""); 1675 } 1676 } 1677 1678 /*---------------------------------------------------------------- 1679 * Inclusions list 1680 *----------------------------------------------------------------*/ 1681 1682 /* 1683 * 
Return a set of characters for property enumeration. 1684 * The set implicitly contains 0x110000 as well, which is one more than the highest 1685 * Unicode code point. 1686 * 1687 * This set is used as an ordered list - its code points are ordered, and 1688 * consecutive code points (in Unicode code point order) in the set define a range. 1689 * For each two consecutive characters (start, limit) in the set, 1690 * all of the UCD/normalization and related properties for 1691 * all code points start..limit-1 are all the same, 1692 * except for character names and ISO comments. 1693 * 1694 * All Unicode code points U+0000..U+10ffff are covered by these ranges. 1695 * The ranges define a partition of the Unicode code space. 1696 * ICU uses the inclusions set to enumerate properties for generating 1697 * UnicodeSets containing all code points that have a certain property value. 1698 * 1699 * The Inclusion List is generated from the UCD. It is generated 1700 * by enumerating the data tries, and code points for hardcoded properties 1701 * are added as well. 1702 * 1703 * -------------------------------------------------------------------------- 1704 * 1705 * The following are ideas for getting properties-unique code point ranges, 1706 * with possible optimizations beyond the current implementation. 1707 * These optimizations would require more code and be more fragile. 1708 * The current implementation generates one single list (set) for all properties. 1709 * 1710 * To enumerate properties efficiently, one needs to know ranges of 1711 * repetitive values, so that the value of only each start code point 1712 * can be applied to the whole range. 1713 * This information is in principle available in the uprops.icu/unorm.icu data. 1714 * 1715 * There are two obstacles: 1716 * 1717 * 1. Some properties are computed from multiple data structures, 1718 * making it necessary to get repetitive ranges by intersecting 1719 * ranges from multiple tries. 1720 * 1721 * 2. 
It is not economical to write code for getting repetitive ranges 1722 * that are precise for each of some 50 properties. 1723 * 1724 * Compromise ideas: 1725 * 1726 * - Get ranges per trie, not per individual property. 1727 * Each range contains the same values for a whole group of properties. 1728 * This would generate currently five range sets, two for uprops.icu tries 1729 * and three for unorm.icu tries. 1730 * 1731 * - Combine sets of ranges for multiple tries to get sufficient sets 1732 * for properties, e.g., the uprops.icu main and auxiliary tries 1733 * for all non-normalization properties. 1734 * 1735 * Ideas for representing ranges and combining them: 1736 * 1737 * - A UnicodeSet could hold just the start code points of ranges. 1738 * Multiple sets are easily combined by or-ing them together. 1739 * 1740 * - Alternatively, a UnicodeSet could hold each even-numbered range. 1741 * All ranges could be enumerated by using each start code point 1742 * (for the even-numbered ranges) as well as each limit (end+1) code point 1743 * (for the odd-numbered ranges). 1744 * It should be possible to combine two such sets by xor-ing them, 1745 * but no more than two. 1746 * 1747 * The second way to represent ranges may(?!) yield smaller UnicodeSet arrays, 1748 * but the first one is certainly simpler and applicable for combining more than 1749 * two range sets. 1750 * 1751 * It is possible to combine all range sets for all uprops/unorm tries into one 1752 * set that can be used for all properties. 1753 * As an optimization, there could be less-combined range sets for certain 1754 * groups of properties. 1755 * The relationship of which less-combined range set to use for which property 1756 * depends on the implementation of the properties and must be hardcoded 1757 * - somewhat error-prone and higher maintenance but can be tested easily 1758 * by building property sets "the simple way" in test code. 
1759 * 1760 * --- 1761 * 1762 * Do not use a UnicodeSet pattern because that causes infinite recursion; 1763 * UnicodeSet depends on the inclusions set. 1764 * 1765 * --- 1766 * 1767 * getInclusions() is commented out starting 2005-feb-12 because 1768 * UnicodeSet now calls the uxyz_addPropertyStarts() directly, 1769 * and only for the relevant property source. 1770 */ 1771 /* 1772 public UnicodeSet getInclusions() { 1773 UnicodeSet set = new UnicodeSet(); 1774 NormalizerImpl.addPropertyStarts(set); 1775 addPropertyStarts(set); 1776 return set; 1777 } 1778 */ 1779 } 1780