1 /* GENERATED SOURCE. DO NOT MODIFY. */ 2 // © 2016 and later: Unicode, Inc. and others. 3 // License & terms of use: http://www.unicode.org/copyright.html#License 4 /* 5 ******************************************************************************* 6 * Copyright (C) 1996-2016, International Business Machines Corporation and 7 * others. All Rights Reserved. 8 ******************************************************************************* 9 */ 10 11 package ohos.global.icu.impl; 12 13 import java.io.IOException; 14 import java.nio.ByteBuffer; 15 import java.util.Iterator; 16 import java.util.MissingResourceException; 17 18 import ohos.global.icu.lang.UCharacter; 19 import ohos.global.icu.lang.UCharacter.HangulSyllableType; 20 import ohos.global.icu.lang.UCharacter.NumericType; 21 import ohos.global.icu.lang.UCharacterCategory; 22 import ohos.global.icu.lang.UProperty; 23 import ohos.global.icu.lang.UScript; 24 import ohos.global.icu.text.Normalizer2; 25 import ohos.global.icu.text.UTF16; 26 import ohos.global.icu.text.UnicodeSet; 27 import ohos.global.icu.util.CodePointMap; 28 import ohos.global.icu.util.CodePointTrie; 29 import ohos.global.icu.util.ICUException; 30 import ohos.global.icu.util.ICUUncheckedIOException; 31 import ohos.global.icu.util.VersionInfo; 32 33 /** 34 * <p>Internal class used for Unicode character property database.</p> 35 * <p>This classes store binary data read from uprops.icu. 36 * It does not have the capability to parse the data into more high-level 37 * information. 
It only returns bytes of information when required.</p>
 * <p>Due to the form most commonly used for retrieval, an array of char is used
 * to store the binary data.</p>
 * <p>UCharacterPropertyDB also contains information on accessing indexes to
 * significant points in the binary data.</p>
 * <p>Responsibility for molding the binary data into a more meaningful form
 * lies with <a href="UCharacter.html">UCharacter</a>.</p>
 * @author Syn Wee Quek
 * @hide exposed on OHOS
 */

public final class UCharacterProperty
{
    // public data members -----------------------------------------------

    /*
     * Public singleton instance.
     */
    public static final UCharacterProperty INSTANCE;

    /**
     * Main properties trie; {@link #getProperty(int)} looks code points up in it.
     */
    public Trie2_16 m_trie_;
    /**
     * Unicode version
     */
    public VersionInfo m_unicodeVersion_;
    /**
     * Latin capital letter I with dot above (U+0130)
     */
    public static final char LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_ = 0x130;
    /**
     * Latin small letter dotless i (U+0131)
     */
    public static final char LATIN_SMALL_LETTER_DOTLESS_I_ = 0x131;
    /**
     * Latin small letter i (U+0069)
     */
    public static final char LATIN_SMALL_LETTER_I_ = 0x69;
    /**
     * Character type mask: the low 5 bits of a main-trie property value,
     * as extracted by {@link #getType(int)}.
     */
    public static final int TYPE_MASK = 0x1F;

    // uprops.h enum UPropertySource --------------------------------------- ***

    /** No source, not a supported property.
*/ 85 public static final int SRC_NONE=0; 86 /** From uchar.c/uprops.icu main trie */ 87 public static final int SRC_CHAR=1; 88 /** From uchar.c/uprops.icu properties vectors trie */ 89 public static final int SRC_PROPSVEC=2; 90 /** From unames.c/unames.icu */ 91 public static final int SRC_NAMES=3; 92 /** From ucase.c/ucase.icu */ 93 public static final int SRC_CASE=4; 94 /** From ubidi_props.c/ubidi.icu */ 95 public static final int SRC_BIDI=5; 96 /** From uchar.c/uprops.icu main trie as well as properties vectors trie */ 97 public static final int SRC_CHAR_AND_PROPSVEC=6; 98 /** From ucase.c/ucase.icu as well as unorm.cpp/unorm.icu */ 99 public static final int SRC_CASE_AND_NORM=7; 100 /** From normalizer2impl.cpp/nfc.nrm */ 101 public static final int SRC_NFC=8; 102 /** From normalizer2impl.cpp/nfkc.nrm */ 103 public static final int SRC_NFKC=9; 104 /** From normalizer2impl.cpp/nfkc_cf.nrm */ 105 public static final int SRC_NFKC_CF=10; 106 /** From normalizer2impl.cpp/nfc.nrm canonical iterator data */ 107 public static final int SRC_NFC_CANON_ITER=11; 108 // Text layout properties. 109 public static final int SRC_INPC=12; 110 public static final int SRC_INSC=13; 111 public static final int SRC_VO=14; 112 /** One more than the highest UPropertySource (SRC_) constant. */ 113 public static final int SRC_COUNT=15; 114 115 private static final class LayoutProps { 116 private static final class IsAcceptable implements ICUBinary.Authenticate { 117 @Override isDataVersionAcceptable(byte version[])118 public boolean isDataVersionAcceptable(byte version[]) { 119 return version[0] == 1; 120 } 121 } 122 private static final IsAcceptable IS_ACCEPTABLE = new IsAcceptable(); 123 private static final int DATA_FORMAT = 0x4c61796f; // "Layo" 124 125 // indexes into indexes[] 126 // Element 0 stores the length of the indexes[] array. 127 //ivate static final int IX_INDEXES_LENGTH = 0; 128 // Elements 1..7 store the tops of consecutive code point tries. 
129 // No trie is stored if the difference between two of these is less than 16. 130 private static final int IX_INPC_TRIE_TOP = 1; 131 private static final int IX_INSC_TRIE_TOP = 2; 132 private static final int IX_VO_TRIE_TOP = 3; 133 //ivate static final int IX_RESERVED_TOP = 4; 134 135 //ivate static final int IX_TRIES_TOP = 7; 136 137 private static final int IX_MAX_VALUES = 9; 138 139 // Length of indexes[]. Multiple of 4 to 16-align the tries. 140 //ivate static final int IX_COUNT = 12; 141 142 private static final int MAX_INPC_SHIFT = 24; 143 private static final int MAX_INSC_SHIFT = 16; 144 private static final int MAX_VO_SHIFT = 8; 145 146 static final LayoutProps INSTANCE = new LayoutProps(); 147 148 CodePointTrie inpcTrie = null; // Indic_Positional_Category 149 CodePointTrie inscTrie = null; // Indic_Syllabic_Category 150 CodePointTrie voTrie = null; // Vertical_Orientation 151 152 int maxInpcValue = 0; 153 int maxInscValue = 0; 154 int maxVoValue = 0; 155 LayoutProps()156 LayoutProps() { 157 ByteBuffer bytes = ICUBinary.getRequiredData("ulayout.icu"); 158 try { 159 ICUBinary.readHeaderAndDataVersion(bytes, DATA_FORMAT, IS_ACCEPTABLE); 160 int startPos = bytes.position(); 161 int indexesLength = bytes.getInt(); // inIndexes[IX_INDEXES_LENGTH] 162 if (indexesLength < 12) { 163 throw new ICUUncheckedIOException( 164 "Text layout properties data: not enough indexes"); 165 } 166 int[] inIndexes = new int[indexesLength]; 167 inIndexes[0] = indexesLength; 168 for (int i = 1; i < indexesLength; ++i) { 169 inIndexes[i] = bytes.getInt(); 170 } 171 172 int offset = indexesLength * 4; 173 int top = inIndexes[IX_INPC_TRIE_TOP]; 174 int trieSize = top - offset; 175 if (trieSize >= 16) { 176 inpcTrie = CodePointTrie.fromBinary(null, null, bytes); 177 } 178 int pos = bytes.position() - startPos; 179 assert top >= pos; 180 ICUBinary.skipBytes(bytes, top - pos); // skip padding after trie bytes 181 offset = top; 182 top = inIndexes[IX_INSC_TRIE_TOP]; 183 trieSize = top 
- offset; 184 if (trieSize >= 16) { 185 inscTrie = CodePointTrie.fromBinary(null, null, bytes); 186 } 187 pos = bytes.position() - startPos; 188 assert top >= pos; 189 ICUBinary.skipBytes(bytes, top - pos); // skip padding after trie bytes 190 offset = top; 191 top = inIndexes[IX_VO_TRIE_TOP]; 192 trieSize = top - offset; 193 if (trieSize >= 16) { 194 voTrie = CodePointTrie.fromBinary(null, null, bytes); 195 } 196 pos = bytes.position() - startPos; 197 assert top >= pos; 198 ICUBinary.skipBytes(bytes, top - pos); // skip padding after trie bytes 199 200 int maxValues = inIndexes[IX_MAX_VALUES]; 201 maxInpcValue = maxValues >>> MAX_INPC_SHIFT; 202 maxInscValue = (maxValues >> MAX_INSC_SHIFT) & 0xff; 203 maxVoValue = (maxValues >> MAX_VO_SHIFT) & 0xff; 204 } catch(IOException e) { 205 throw new ICUUncheckedIOException(e); 206 } 207 } 208 addPropertyStarts(int src, UnicodeSet set)209 public UnicodeSet addPropertyStarts(int src, UnicodeSet set) { 210 CodePointTrie trie; 211 switch (src) { 212 case SRC_INPC: 213 trie = inpcTrie; 214 break; 215 case SRC_INSC: 216 trie = inscTrie; 217 break; 218 case SRC_VO: 219 trie = voTrie; 220 break; 221 default: 222 throw new IllegalStateException(); 223 } 224 225 if (trie == null) { 226 throw new MissingResourceException( 227 "no data for one of the text layout properties; src=" + src, 228 "LayoutProps", ""); 229 } 230 231 // Add the start code point of each same-value range of the trie. 232 CodePointMap.Range range = new CodePointMap.Range(); 233 int start = 0; 234 while (trie.getRange(start, null, range)) { 235 set.add(start); 236 start = range.getEnd() + 1; 237 } 238 return set; 239 } 240 } 241 242 // public methods ---------------------------------------------------- 243 244 /** 245 * Gets the main property value for code point ch. 
246 * @param ch code point whose property value is to be retrieved 247 * @return property value of code point 248 */ getProperty(int ch)249 public final int getProperty(int ch) 250 { 251 return m_trie_.get(ch); 252 } 253 254 /** 255 * Gets the unicode additional properties. 256 * Java version of C u_getUnicodeProperties(). 257 * @param codepoint codepoint whose additional properties is to be 258 * retrieved 259 * @param column The column index. 260 * @return unicode properties 261 */ getAdditional(int codepoint, int column)262 public int getAdditional(int codepoint, int column) { 263 assert column >= 0; 264 if (column >= m_additionalColumnsCount_) { 265 return 0; 266 } 267 return m_additionalVectors_[m_additionalTrie_.get(codepoint) + column]; 268 } 269 270 static final int MY_MASK = UCharacterProperty.TYPE_MASK 271 & ((1<<UCharacterCategory.UPPERCASE_LETTER) | 272 (1<<UCharacterCategory.LOWERCASE_LETTER) | 273 (1<<UCharacterCategory.TITLECASE_LETTER) | 274 (1<<UCharacterCategory.MODIFIER_LETTER) | 275 (1<<UCharacterCategory.OTHER_LETTER)); 276 277 278 /** 279 * <p>Get the "age" of the code point.</p> 280 * <p>The "age" is the Unicode version when the code point was first 281 * designated (as a non-character or for Private Use) or assigned a 282 * character.</p> 283 * <p>This can be useful to avoid emitting code points to receiving 284 * processes that do not accept newer characters.</p> 285 * <p>The data is from the UCD file DerivedAge.txt.</p> 286 * <p>This API does not check the validity of the codepoint.</p> 287 * @param codepoint The code point. 
* @return the Unicode version number
     */
    public VersionInfo getAge(int codepoint)
    {
        // The age is stored in column 0 of the additional properties,
        // above AGE_SHIFT_, as two nibbles: major then minor.
        int version = getAdditional(codepoint, 0) >> AGE_SHIFT_;
        return VersionInfo.getInstance(
                           (version >> FIRST_NIBBLE_SHIFT_) & LAST_NIBBLE_MASK_,
                           version & LAST_NIBBLE_MASK_, 0, 0);
    }

    // Single-bit masks (1 << category) for the general categories used below.
    private static final int GC_CN_MASK = getMask(UCharacter.UNASSIGNED);
    private static final int GC_CC_MASK = getMask(UCharacter.CONTROL);
    private static final int GC_CS_MASK = getMask(UCharacter.SURROGATE);
    private static final int GC_ZS_MASK = getMask(UCharacter.SPACE_SEPARATOR);
    private static final int GC_ZL_MASK = getMask(UCharacter.LINE_SEPARATOR);
    private static final int GC_ZP_MASK = getMask(UCharacter.PARAGRAPH_SEPARATOR);
    /** Mask constant for multiple UCharCategory bits (Z Separators). */
    private static final int GC_Z_MASK = GC_ZS_MASK|GC_ZL_MASK|GC_ZP_MASK;

    /**
     * Checks if c is in
     * [^\p{space}\p{gc=Control}\p{gc=Surrogate}\p{gc=Unassigned}]
     * with space=\p{Whitespace} and Control=Cc.
     * Implements UCHAR_POSIX_GRAPH.
     * @param c the code point to test
     * @return true if the general category of c is none of Cc, Cs, Cn, Zs, Zl, Zp
     * @hide draft / provisional / internal are hidden on OHOS
     */
    private static final boolean isgraphPOSIX(int c) {
        /* \p{space}\p{gc=Control} == \p{gc=Z}\p{Control} */
        /* comparing ==0 returns FALSE for the categories mentioned */
        return (getMask(UCharacter.getType(c))&
                (GC_CC_MASK|GC_CS_MASK|GC_CN_MASK|GC_Z_MASK))
               ==0;
    }

    // binary properties --------------------------------------------------- ***

    /**
     * One binary property. With a non-zero mask the property is a bit in an
     * additional-properties column; with mask==0 the "column" field holds the
     * SRC_ source constant and subclasses override contains().
     */
    private class BinaryProperty {
        int column;  // SRC_PROPSVEC column, or "source" if mask==0
        int mask;
        /** Property stored directly as bit(s) {@code mask} of properties-vector column {@code column}. */
        BinaryProperty(int column, int mask) {
            this.column=column;
            this.mask=mask;
        }
        /** Property computed from another data source; contains() is expected to be overridden. */
        BinaryProperty(int source) {
            this.column=source;
            this.mask=0;
        }
        /** Returns the SRC_ constant of the data this property is derived from. */
        final int getSource() {
            return mask==0 ? column : SRC_PROPSVEC;
        }
        /** Returns whether code point c has this property. */
        boolean contains(int c) {
            // systematic, directly stored properties
            return (getAdditional(c, column)&mask)!=0;
        }
    }

    /** Binary property answered by UCaseProps for the UProperty selector "which". */
    private class CaseBinaryProperty extends BinaryProperty {  // case mapping properties
        int which;
        CaseBinaryProperty(int which) {
            super(SRC_CASE);
            this.which=which;
        }
        @Override
        boolean contains(int c) {
            return UCaseProps.INSTANCE.hasBinaryProperty(c, which);
        }
    }

    /** Binary property answered by the normalizer selected by (which - NFD_INERT). */
    private class NormInertBinaryProperty extends BinaryProperty {  // UCHAR_NF*_INERT properties
        int which;
        NormInertBinaryProperty(int source, int which) {
            super(source);
            this.which=which;
        }
        @Override
        boolean contains(int c) {
            return Norm2AllModes.getN2WithImpl(which-UProperty.NFD_INERT).isInert(c);
        }
    }

    // Indexed directly by the UProperty binary property constant
    // (see hasBinaryProperty() and getIntPropertyValue()).
    BinaryProperty[] binProps={
        /*
         * Binary-property implementations must be in order of corresponding UProperty,
         * and there must be exactly one entry per binary UProperty.
         */
        new BinaryProperty(1, (1<<ALPHABETIC_PROPERTY_)),
        new BinaryProperty(1, (1<<ASCII_HEX_DIGIT_PROPERTY_)),
        new BinaryProperty(SRC_BIDI) {  // UCHAR_BIDI_CONTROL
            @Override
            boolean contains(int c) {
                return UBiDiProps.INSTANCE.isBidiControl(c);
            }
        },
        new BinaryProperty(SRC_BIDI) {  // UCHAR_BIDI_MIRRORED
            @Override
            boolean contains(int c) {
                return UBiDiProps.INSTANCE.isMirrored(c);
            }
        },
        new BinaryProperty(1, (1<<DASH_PROPERTY_)),
        new BinaryProperty(1, (1<<DEFAULT_IGNORABLE_CODE_POINT_PROPERTY_)),
        new BinaryProperty(1, (1<<DEPRECATED_PROPERTY_)),
        new BinaryProperty(1, (1<<DIACRITIC_PROPERTY_)),
        new BinaryProperty(1, (1<<EXTENDER_PROPERTY_)),
        new BinaryProperty(SRC_NFC) {  // UCHAR_FULL_COMPOSITION_EXCLUSION
            @Override
            boolean contains(int c) {
                // By definition, Full_Composition_Exclusion is the same as NFC_QC=No.
                Normalizer2Impl impl=Norm2AllModes.getNFCInstance().impl;
                return impl.isCompNo(impl.getNorm16(c));
            }
        },
        new BinaryProperty(1, (1<<GRAPHEME_BASE_PROPERTY_)),
        new BinaryProperty(1, (1<<GRAPHEME_EXTEND_PROPERTY_)),
        new BinaryProperty(1, (1<<GRAPHEME_LINK_PROPERTY_)),
        new BinaryProperty(1, (1<<HEX_DIGIT_PROPERTY_)),
        new BinaryProperty(1, (1<<HYPHEN_PROPERTY_)),
        new BinaryProperty(1, (1<<ID_CONTINUE_PROPERTY_)),
        new BinaryProperty(1, (1<<ID_START_PROPERTY_)),
        new BinaryProperty(1, (1<<IDEOGRAPHIC_PROPERTY_)),
        new BinaryProperty(1, (1<<IDS_BINARY_OPERATOR_PROPERTY_)),
        new BinaryProperty(1, (1<<IDS_TRINARY_OPERATOR_PROPERTY_)),
        new BinaryProperty(SRC_BIDI) {  // UCHAR_JOIN_CONTROL
            @Override
            boolean contains(int c) {
                return UBiDiProps.INSTANCE.isJoinControl(c);
            }
        },
        new BinaryProperty(1, (1<<LOGICAL_ORDER_EXCEPTION_PROPERTY_)),
        new CaseBinaryProperty(UProperty.LOWERCASE),
        new BinaryProperty(1, (1<<MATH_PROPERTY_)),
        new BinaryProperty(1, (1<<NONCHARACTER_CODE_POINT_PROPERTY_)),
        new BinaryProperty(1, (1<<QUOTATION_MARK_PROPERTY_)),
        new BinaryProperty(1, (1<<RADICAL_PROPERTY_)),
        new CaseBinaryProperty(UProperty.SOFT_DOTTED),
        new BinaryProperty(1, (1<<TERMINAL_PUNCTUATION_PROPERTY_)),
        new BinaryProperty(1, (1<<UNIFIED_IDEOGRAPH_PROPERTY_)),
        new CaseBinaryProperty(UProperty.UPPERCASE),
        new BinaryProperty(1, (1<<WHITE_SPACE_PROPERTY_)),
        new BinaryProperty(1, (1<<XID_CONTINUE_PROPERTY_)),
        new BinaryProperty(1, (1<<XID_START_PROPERTY_)),
        new CaseBinaryProperty(UProperty.CASE_SENSITIVE),
        new BinaryProperty(1, (1<<S_TERM_PROPERTY_)),
        new BinaryProperty(1, (1<<VARIATION_SELECTOR_PROPERTY_)),
        new NormInertBinaryProperty(SRC_NFC, UProperty.NFD_INERT),
        new NormInertBinaryProperty(SRC_NFKC, UProperty.NFKD_INERT),
        new NormInertBinaryProperty(SRC_NFC, UProperty.NFC_INERT),
        new NormInertBinaryProperty(SRC_NFKC, UProperty.NFKC_INERT),
        new BinaryProperty(SRC_NFC_CANON_ITER) {  // UCHAR_SEGMENT_STARTER
            @Override
            boolean contains(int c) {
                return Norm2AllModes.getNFCInstance().impl.
                    ensureCanonIterData().isCanonSegmentStarter(c);
            }
        },
        new BinaryProperty(1, (1<<PATTERN_SYNTAX)),
        new BinaryProperty(1, (1<<PATTERN_WHITE_SPACE)),
        new BinaryProperty(SRC_CHAR_AND_PROPSVEC) {  // UCHAR_POSIX_ALNUM
            @Override
            boolean contains(int c) {
                return UCharacter.isUAlphabetic(c) || UCharacter.isDigit(c);
            }
        },
        new BinaryProperty(SRC_CHAR) {  // UCHAR_POSIX_BLANK
            @Override
            boolean contains(int c) {
                // "horizontal space"
                if(c<=0x9f) {
                    return c==9 || c==0x20; /* TAB or SPACE */
                } else {
                    /* Zs */
                    return UCharacter.getType(c)==UCharacter.SPACE_SEPARATOR;
                }
            }
        },
        new BinaryProperty(SRC_CHAR) {  // UCHAR_POSIX_GRAPH
            @Override
            boolean contains(int c) {
                return isgraphPOSIX(c);
            }
        },
        new BinaryProperty(SRC_CHAR) {  // UCHAR_POSIX_PRINT
            @Override
            boolean contains(int c) {
                /*
                 * Checks if codepoint is in \p{graph}\p{blank} - \p{cntrl}.
                 *
                 * The only cntrl character in graph+blank is TAB (in blank).
                 * Here we implement (blank-TAB)=Zs instead of calling u_isblank().
                 */
                return (UCharacter.getType(c)==UCharacter.SPACE_SEPARATOR) || isgraphPOSIX(c);
            }
        },
        new BinaryProperty(SRC_CHAR) {  // UCHAR_POSIX_XDIGIT
            @Override
            boolean contains(int c) {
                /* check ASCII and Fullwidth ASCII a-fA-F */
                if(
                    (c<=0x66 && c>=0x41 && (c<=0x46 || c>=0x61)) ||
                    (c>=0xff21 && c<=0xff46 && (c<=0xff26 || c>=0xff41))
                ) {
                    return true;
                }
                // everything else: only decimal digits (gc=Nd) qualify
                return UCharacter.getType(c)==UCharacter.DECIMAL_DIGIT_NUMBER;
            }
        },
        new CaseBinaryProperty(UProperty.CASED),
        new CaseBinaryProperty(UProperty.CASE_IGNORABLE),
        new CaseBinaryProperty(UProperty.CHANGES_WHEN_LOWERCASED),
        new CaseBinaryProperty(UProperty.CHANGES_WHEN_UPPERCASED),
        new CaseBinaryProperty(UProperty.CHANGES_WHEN_TITLECASED),
        new BinaryProperty(SRC_CASE_AND_NORM) {  // UCHAR_CHANGES_WHEN_CASEFOLDED
            @Override
            boolean contains(int c) {
                // Case-fold the decomposition of c rather than c itself.
                String nfd=Norm2AllModes.getNFCInstance().impl.getDecomposition(c);
                if(nfd!=null) {
                    /* c has a decomposition */
                    c=nfd.codePointAt(0);
                    if(Character.charCount(c)!=nfd.length()) {
                        /* multiple code points */
                        c=-1;
                    }
                } else if(c<0) {
                    return false;  /* protect against bad input */
                }
                if(c>=0) {
                    /* single code point */
                    UCaseProps csp=UCaseProps.INSTANCE;
                    // NOTE(review): dummyStringBuilder is a shared static buffer;
                    // confirm whether concurrent callers are expected here.
                    UCaseProps.dummyStringBuilder.setLength(0);
                    return csp.toFullFolding(c, UCaseProps.dummyStringBuilder,
                                             UCharacter.FOLD_CASE_DEFAULT)>=0;
                } else {
                    String folded=UCharacter.foldCase(nfd, true);
                    return !folded.equals(nfd);
                }
            }
        },
        new CaseBinaryProperty(UProperty.CHANGES_WHEN_CASEMAPPED),
        new BinaryProperty(SRC_NFKC_CF) {  // UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED
            @Override
            boolean contains(int c) {
                // Run NFKC_Casefold on the single code point and see whether the text changed.
                Normalizer2Impl kcf=Norm2AllModes.getNFKC_CFInstance().impl;
                String src=UTF16.valueOf(c);
                StringBuilder dest=new StringBuilder();
                // Small destCapacity for NFKC_CF(c).
                Normalizer2Impl.ReorderingBuffer buffer=new Normalizer2Impl.ReorderingBuffer(kcf, dest, 5);
                kcf.compose(src, 0, src.length(), false, true, buffer);
                return !Normalizer2Impl.UTF16Plus.equal(dest, src);
            }
        },
        new BinaryProperty(2, 1<<PROPS_2_EMOJI),
        new BinaryProperty(2, 1<<PROPS_2_EMOJI_PRESENTATION),
        new BinaryProperty(2, 1<<PROPS_2_EMOJI_MODIFIER),
        new BinaryProperty(2, 1<<PROPS_2_EMOJI_MODIFIER_BASE),
        new BinaryProperty(2, 1<<PROPS_2_EMOJI_COMPONENT),
        new BinaryProperty(SRC_PROPSVEC) {  // REGIONAL_INDICATOR
            // Property starts are a subset of lb=RI etc.
            @Override
            boolean contains(int c) {
                return 0x1F1E6<=c && c<=0x1F1FF;
            }
        },
        new BinaryProperty(1, 1<<PREPENDED_CONCATENATION_MARK),
        new BinaryProperty(2, 1<<PROPS_2_EXTENDED_PICTOGRAPHIC),
    };

    /**
     * Checks a binary Unicode property for a code point.
     * @param c the code point to test
     * @param which the UProperty binary property selector
     * @return true if c has the property; false if it does not, or if
     *         which is outside [BINARY_START, BINARY_LIMIT)
     */
    public boolean hasBinaryProperty(int c, int which) {
        if(which<UProperty.BINARY_START || UProperty.BINARY_LIMIT<=which) {
            // not a known binary property
            return false;
        } else {
            // binProps is indexed by the selector itself
            // NOTE(review): presumably relies on BINARY_START being 0 - confirm
            return binProps[which].contains(c);
        }
    }

    // int-value and enumerated properties --------------------------------- ***

    /**
     * Gets the type of a code point: the TYPE_MASK bits of its main property value.
     * @param c the code point
     * @return getProperty(c) masked with TYPE_MASK
     */
    public int getType(int c) {
        return getProperty(c)&TYPE_MASK;
    }

    /*
     * Map some of the Grapheme Cluster Break values to Hangul Syllable Types.
     * Hangul_Syllable_Type is fully redundant with a subset of Grapheme_Cluster_Break.
*/
    private static final int /* UHangulSyllableType */ gcbToHst[]={
        HangulSyllableType.NOT_APPLICABLE,   /* U_GCB_OTHER */
        HangulSyllableType.NOT_APPLICABLE,   /* U_GCB_CONTROL */
        HangulSyllableType.NOT_APPLICABLE,   /* U_GCB_CR */
        HangulSyllableType.NOT_APPLICABLE,   /* U_GCB_EXTEND */
        HangulSyllableType.LEADING_JAMO,     /* U_GCB_L */
        HangulSyllableType.NOT_APPLICABLE,   /* U_GCB_LF */
        HangulSyllableType.LV_SYLLABLE,      /* U_GCB_LV */
        HangulSyllableType.LVT_SYLLABLE,     /* U_GCB_LVT */
        HangulSyllableType.TRAILING_JAMO,    /* U_GCB_T */
        HangulSyllableType.VOWEL_JAMO        /* U_GCB_V */
        /*
         * Omit GCB values beyond what we need for hst.
         * The code below checks for the array length.
         */
    };

    /**
     * One int-valued (enumerated) property. With a non-zero mask the value is a
     * bit field of an additional-properties column; with mask==0 the "column"
     * field holds the SRC_ source constant and subclasses override the getters.
     */
    private class IntProperty {
        int column;  // SRC_PROPSVEC column, or "source" if mask==0
        int mask;
        int shift;
        /** Property stored as the bit field (mask, shift) of properties-vector column {@code column}. */
        IntProperty(int column, int mask, int shift) {
            this.column=column;
            this.mask=mask;
            this.shift=shift;
        }
        /** Property computed from another data source; shift keeps its default of 0. */
        IntProperty(int source) {
            this.column=source;
            this.mask=0;
        }
        /** Returns the SRC_ constant of the data this property is derived from. */
        final int getSource() {
            return mask==0 ? column : SRC_PROPSVEC;
        }
        /** Returns the property value of code point c. */
        int getValue(int c) {
            // systematic, directly stored properties
            return (getAdditional(c, column)&mask)>>>shift;
        }
        /** Returns the maximum value of this property; "which" is unused in this base implementation. */
        int getMaxValue(int which) {
            return (getMaxValues(column)&mask)>>>shift;
        }
    }

    /** Int property whose maximum value is supplied by UBiDiProps. */
    private class BiDiIntProperty extends IntProperty {
        BiDiIntProperty() {
            super(SRC_BIDI);
        }
        @Override
        int getMaxValue(int which) {
            return UBiDiProps.INSTANCE.getMaxValue(which);
        }
    }

    /** Int property for combining classes; the maximum is fixed at 0xff. */
    private class CombiningClassIntProperty extends IntProperty {
        CombiningClassIntProperty(int source) {
            super(source);
        }
        @Override
        int getMaxValue(int which) {
            return 0xff;
        }
    }

    /** Int property answered by the quick check of the normalizer selected by (which - NFD_QUICK_CHECK). */
    private class NormQuickCheckIntProperty extends IntProperty {  // UCHAR_NF*_QUICK_CHECK properties
        int which;
        int max;
        NormQuickCheckIntProperty(int source, int which, int max) {
            super(source);
            this.which=which;
            this.max=max;
        }
        @Override
        int getValue(int c) {
            return Norm2AllModes.getN2WithImpl(which-UProperty.NFD_QUICK_CHECK).getQuickCheck(c);
        }
        @Override
        int getMaxValue(int which) {
            return max;
        }
    }

    // Indexed by (UProperty int property constant - UProperty.INT_START);
    // see getIntPropertyValue(), getIntPropertyMaxValue() and getSource().
    // The entries therefore have to stay in UProperty order.
    IntProperty intProps[]={
        new BiDiIntProperty() {  // BIDI_CLASS
            @Override
            int getValue(int c) {
                return UBiDiProps.INSTANCE.getClass(c);
            }
        },
        new IntProperty(0, BLOCK_MASK_, BLOCK_SHIFT_),
        new CombiningClassIntProperty(SRC_NFC) {  // CANONICAL_COMBINING_CLASS
            @Override
            int getValue(int c) {
                return Normalizer2.getNFDInstance().getCombiningClass(c);
            }
        },
        new IntProperty(2, DECOMPOSITION_TYPE_MASK_, 0),
        new IntProperty(0, EAST_ASIAN_MASK_, EAST_ASIAN_SHIFT_),
        new IntProperty(SRC_CHAR) {  // GENERAL_CATEGORY
            @Override
            int getValue(int c) {
                return getType(c);
            }
            @Override
            int getMaxValue(int which) {
                return UCharacterCategory.CHAR_CATEGORY_COUNT-1;
            }
        },
        new BiDiIntProperty() {  // JOINING_GROUP
            @Override
            int getValue(int c) {
                return UBiDiProps.INSTANCE.getJoiningGroup(c);
            }
        },
        new BiDiIntProperty() {  // JOINING_TYPE
            @Override
            int getValue(int c) {
                return UBiDiProps.INSTANCE.getJoiningType(c);
            }
        },
        new IntProperty(2, LB_MASK, LB_SHIFT),  // LINE_BREAK
        new IntProperty(SRC_CHAR) {  // NUMERIC_TYPE
            @Override
            int getValue(int c) {
                return ntvGetType(getNumericTypeValue(getProperty(c)));
            }
            @Override
            int getMaxValue(int which) {
                return NumericType.COUNT-1;
            }
        },
        new IntProperty(SRC_PROPSVEC) {  // script, via UScript.getScript()
            @Override
            int getValue(int c) {
                return UScript.getScript(c);
            }
            @Override
            int getMaxValue(int which) {
                int scriptX=getMaxValues(0)&SCRIPT_X_MASK;
                return mergeScriptCodeOrIndex(scriptX);
            }
        },
        new IntProperty(SRC_PROPSVEC) {  // HANGUL_SYLLABLE_TYPE
            @Override
            int getValue(int c) {
                /* see comments on gcbToHst[] above */
                int gcb=(getAdditional(c, 2)&GCB_MASK)>>>GCB_SHIFT;
                if(gcb<gcbToHst.length) {
                    return gcbToHst[gcb];
                } else {
                    return HangulSyllableType.NOT_APPLICABLE;
                }
            }
            @Override
            int getMaxValue(int which) {
                return HangulSyllableType.COUNT-1;
            }
        },
        // max=1=YES -- these are never "maybe", only "no" or "yes"
        new NormQuickCheckIntProperty(SRC_NFC, UProperty.NFD_QUICK_CHECK, 1),
        new NormQuickCheckIntProperty(SRC_NFKC, UProperty.NFKD_QUICK_CHECK, 1),
        // max=2=MAYBE
        new NormQuickCheckIntProperty(SRC_NFC, UProperty.NFC_QUICK_CHECK, 2),
        new NormQuickCheckIntProperty(SRC_NFKC, UProperty.NFKC_QUICK_CHECK, 2),
        new CombiningClassIntProperty(SRC_NFC) {  // LEAD_CANONICAL_COMBINING_CLASS
            @Override
            int getValue(int c) {
                // upper byte of the FCD16 value
                return Norm2AllModes.getNFCInstance().impl.getFCD16(c)>>8;
            }
        },
        new CombiningClassIntProperty(SRC_NFC) {  // TRAIL_CANONICAL_COMBINING_CLASS
            @Override
            int getValue(int c) {
                // lower byte of the FCD16 value
                return Norm2AllModes.getNFCInstance().impl.getFCD16(c)&0xff;
            }
        },
        new IntProperty(2, GCB_MASK, GCB_SHIFT),  // GRAPHEME_CLUSTER_BREAK
        new IntProperty(2, SB_MASK, SB_SHIFT),  // SENTENCE_BREAK
        new IntProperty(2, WB_MASK, WB_SHIFT),  // WORD_BREAK
        new BiDiIntProperty() {  // BIDI_PAIRED_BRACKET_TYPE
            @Override
            int getValue(int c) {
                return UBiDiProps.INSTANCE.getPairedBracketType(c);
            }
        },
        new IntProperty(SRC_INPC) {  // value from LayoutProps.inpcTrie; 0 when that trie is absent
            @Override
            int getValue(int c) {
                CodePointTrie trie = LayoutProps.INSTANCE.inpcTrie;
                return trie != null ? trie.get(c) : 0;
            }
            @Override
            int getMaxValue(int which) {
                return LayoutProps.INSTANCE.maxInpcValue;
            }
        },
        new IntProperty(SRC_INSC) {  // value from LayoutProps.inscTrie; 0 when that trie is absent
            @Override
            int getValue(int c) {
                CodePointTrie trie = LayoutProps.INSTANCE.inscTrie;
                return trie != null ? trie.get(c) : 0;
            }
            @Override
            int getMaxValue(int which) {
                return LayoutProps.INSTANCE.maxInscValue;
            }
        },
        new IntProperty(SRC_VO) {  // value from LayoutProps.voTrie; 0 when that trie is absent
            @Override
            int getValue(int c) {
                CodePointTrie trie = LayoutProps.INSTANCE.voTrie;
                return trie != null ? trie.get(c) : 0;
            }
            @Override
            int getMaxValue(int which) {
                return LayoutProps.INSTANCE.maxVoValue;
            }
        },
    };

    /**
     * Gets the value of a binary, int or mask property for a code point.
     * @param c the code point
     * @param which the UProperty selector
     * @return 0 or 1 for a binary property, the value for an int property,
     *         the category bit mask for GENERAL_CATEGORY_MASK, and 0 for
     *         any other selector
     */
    public int getIntPropertyValue(int c, int which) {
        if(which<UProperty.INT_START) {
            if(UProperty.BINARY_START<=which && which<UProperty.BINARY_LIMIT) {
                return binProps[which].contains(c) ? 1 : 0;
            }
        } else if(which<UProperty.INT_LIMIT) {
            return intProps[which-UProperty.INT_START].getValue(c);
        } else if (which == UProperty.GENERAL_CATEGORY_MASK) {
            return getMask(getType(c));
        }
        return 0; // undefined
    }

    /**
     * Gets the maximum value of a binary or int property.
     * @param which the UProperty selector
     * @return 1 for a binary property, the maximum for an int property,
     *         and -1 for any other selector
     */
    public int getIntPropertyMaxValue(int which) {
        if(which<UProperty.INT_START) {
            if(UProperty.BINARY_START<=which && which<UProperty.BINARY_LIMIT) {
                return 1;  // maximum TRUE for all binary properties
            }
        } else if(which<UProperty.INT_LIMIT) {
            return intProps[which-UProperty.INT_START].getMaxValue(which);
        }
        return -1; // undefined
    }

    /**
     * Gets the SRC_ constant naming the data source of a property.
     * @param which the UProperty selector
     * @return the source of the property, or SRC_NONE if the selector is
     *         not handled here
     */
    final int getSource(int which) {
        if(which<UProperty.BINARY_START) {
            return SRC_NONE; /* undefined */
        } else if(which<UProperty.BINARY_LIMIT) {
            return binProps[which].getSource();
        } else if(which<UProperty.INT_START) {
            return SRC_NONE; /* undefined */
        } else if(which<UProperty.INT_LIMIT) {
            return intProps[which-UProperty.INT_START].getSource();
        } else if(which<UProperty.STRING_START) {
            // selectors between the int and the string ranges
            switch(which) {
            case UProperty.GENERAL_CATEGORY_MASK:
            case UProperty.NUMERIC_VALUE:
                return SRC_CHAR;

            default:
                return SRC_NONE;
            }
        } else if(which<UProperty.STRING_LIMIT) {
            switch(which) {
            case UProperty.AGE:
                return SRC_PROPSVEC;

            case UProperty.BIDI_MIRRORING_GLYPH:
                return SRC_BIDI;

            case UProperty.CASE_FOLDING:
            case UProperty.LOWERCASE_MAPPING:
            case UProperty.SIMPLE_CASE_FOLDING:
            case UProperty.SIMPLE_LOWERCASE_MAPPING:
            case UProperty.SIMPLE_TITLECASE_MAPPING:
            case UProperty.SIMPLE_UPPERCASE_MAPPING:
            case UProperty.TITLECASE_MAPPING:
            case UProperty.UPPERCASE_MAPPING:
                return SRC_CASE;

            case UProperty.ISO_COMMENT:
            case UProperty.NAME:
            case UProperty.UNICODE_1_NAME:
                return SRC_NAMES;

            default:
                return SRC_NONE;
            }
        } else {
            // selectors at or beyond STRING_LIMIT
            switch(which) {
            case UProperty.SCRIPT_EXTENSIONS:
                return SRC_PROPSVEC;
            default:
                return SRC_NONE; /* undefined */
            }
        }
    }

    /**
     * <p>
     * Unicode property names and property value names are compared
     * "loosely". Property[Value]Aliases.txt say:
     * <quote>
     *   "With loose matching of property names, the case distinctions,
     *    whitespace, and '_' are ignored."
     * </quote>
     * </p>
     * <p>
     * This function does just that, for ASCII (char *) name strings.
     * It is almost identical to ucnv_compareNames() but also ignores
     * ASCII White_Space characters (U+0009..U+000d).
     * </p>
     * @param name1 name to compare
     * @param name2 name to compare
     * @return 0 if names are equal, < 0 if name1 is less than name2 and > 0
     *         if name1 is greater than name2.
     */
    /* to be implemented in 2.4
     * public static int comparePropertyNames(String name1, String name2)
    {
        int result = 0;
        int i1 = 0;
        int i2 = 0;
        while (true) {
            char ch1 = 0;
            char ch2 = 0;
            // Ignore delimiters '-', '_', and ASCII White_Space
            if (i1 < name1.length()) {
                ch1 = name1.charAt(i1 ++);
            }
            while (ch1 == '-' || ch1 == '_' || ch1 == ' ' || ch1 == '\t'
                   || ch1 == '\n' // synwee what is || ch1 == '\v'
                   || ch1 == '\f' || ch1=='\r') {
                if (i1 < name1.length()) {
                    ch1 = name1.charAt(i1 ++);
                }
                else {
                    ch1 = 0;
                }
            }
            if (i2 < name2.length()) {
                ch2 = name2.charAt(i2 ++);
            }
            while (ch2 == '-' || ch2 == '_' || ch2 == ' ' || ch2 == '\t'
                   || ch2 == '\n' // synwee what is || ch1 == '\v'
                   || ch2 == '\f' || ch2=='\r') {
                if (i2 < name2.length()) {
                    ch2 = name2.charAt(i2 ++);
                }
                else {
                    ch2 = 0;
                }
            }

            // If we reach the ends of both strings then they match
            if (ch1 == 0 && ch2 == 0) {
                return 0;
            }

            // Case-insensitive comparison
            if (ch1 != ch2) {
                result = Character.toLowerCase(ch1)
                                                - Character.toLowerCase(ch2);
                if
(result != 0) { 935 return result; 936 } 937 } 938 } 939 } 940 */ 941 942 /** 943 * Get the the maximum values for some enum/int properties. 944 * @return maximum values for the integer properties. 945 */ getMaxValues(int column)946 public int getMaxValues(int column) 947 { 948 // return m_maxBlockScriptValue_; 949 950 switch(column) { 951 case 0: 952 return m_maxBlockScriptValue_; 953 case 2: 954 return m_maxJTGValue_; 955 default: 956 return 0; 957 } 958 } 959 960 /** 961 * Gets the type mask 962 * @param type character type 963 * @return mask 964 */ getMask(int type)965 public static final int getMask(int type) 966 { 967 return 1 << type; 968 } 969 970 971 /** 972 * Returns the digit values of characters like 'A' - 'Z', normal, 973 * half-width and full-width. This method assumes that the other digit 974 * characters are checked by the calling method. 975 * @param ch character to test 976 * @return -1 if ch is not a character of the form 'A' - 'Z', otherwise 977 * its corresponding digit will be returned. 978 */ getEuropeanDigit(int ch)979 public static int getEuropeanDigit(int ch) { 980 if ((ch > 0x7a && ch < 0xff21) 981 || ch < 0x41 || (ch > 0x5a && ch < 0x61) 982 || ch > 0xff5a || (ch > 0xff3a && ch < 0xff41)) { 983 return -1; 984 } 985 if (ch <= 0x7a) { 986 // ch >= 0x41 or ch < 0x61 987 return ch + 10 - ((ch <= 0x5a) ? 
0x41 : 0x61); 988 } 989 // ch >= 0xff21 990 if (ch <= 0xff3a) { 991 return ch + 10 - 0xff21; 992 } 993 // ch >= 0xff41 && ch <= 0xff5a 994 return ch + 10 - 0xff41; 995 } 996 digit(int c)997 public int digit(int c) { 998 int value = getNumericTypeValue(getProperty(c)) - NTV_DECIMAL_START_; 999 if(value<=9) { 1000 return value; 1001 } else { 1002 return -1; 1003 } 1004 } 1005 getNumericValue(int c)1006 public int getNumericValue(int c) { 1007 // slightly pruned version of getUnicodeNumericValue(), plus getEuropeanDigit() 1008 int ntv = getNumericTypeValue(getProperty(c)); 1009 1010 if(ntv==NTV_NONE_) { 1011 return getEuropeanDigit(c); 1012 } else if(ntv<NTV_DIGIT_START_) { 1013 /* decimal digit */ 1014 return ntv-NTV_DECIMAL_START_; 1015 } else if(ntv<NTV_NUMERIC_START_) { 1016 /* other digit */ 1017 return ntv-NTV_DIGIT_START_; 1018 } else if(ntv<NTV_FRACTION_START_) { 1019 /* small integer */ 1020 return ntv-NTV_NUMERIC_START_; 1021 } else if(ntv<NTV_LARGE_START_) { 1022 /* fraction */ 1023 return -2; 1024 } else if(ntv<NTV_BASE60_START_) { 1025 /* large, single-significant-digit integer */ 1026 int mant=(ntv>>5)-14; 1027 int exp=(ntv&0x1f)+2; 1028 if(exp<9 || (exp==9 && mant<=2)) { 1029 int numValue=mant; 1030 do { 1031 numValue*=10; 1032 } while(--exp>0); 1033 return numValue; 1034 } else { 1035 return -2; 1036 } 1037 } else if(ntv<NTV_FRACTION20_START_) { 1038 /* sexagesimal (base 60) integer */ 1039 int numValue=(ntv>>2)-0xbf; 1040 int exp=(ntv&3)+1; 1041 1042 switch(exp) { 1043 case 4: 1044 numValue*=60*60*60*60; 1045 break; 1046 case 3: 1047 numValue*=60*60*60; 1048 break; 1049 case 2: 1050 numValue*=60*60; 1051 break; 1052 case 1: 1053 numValue*=60; 1054 break; 1055 case 0: 1056 default: 1057 break; 1058 } 1059 1060 return numValue; 1061 } else if(ntv<NTV_RESERVED_START_) { 1062 // fraction-20 e.g. 
3/80 1063 return -2; 1064 } else { 1065 /* reserved */ 1066 return -2; 1067 } 1068 } 1069 getUnicodeNumericValue(int c)1070 public double getUnicodeNumericValue(int c) { 1071 // equivalent to c version double u_getNumericValue(UChar32 c) 1072 int ntv = getNumericTypeValue(getProperty(c)); 1073 1074 if(ntv==NTV_NONE_) { 1075 return UCharacter.NO_NUMERIC_VALUE; 1076 } else if(ntv<NTV_DIGIT_START_) { 1077 /* decimal digit */ 1078 return ntv-NTV_DECIMAL_START_; 1079 } else if(ntv<NTV_NUMERIC_START_) { 1080 /* other digit */ 1081 return ntv-NTV_DIGIT_START_; 1082 } else if(ntv<NTV_FRACTION_START_) { 1083 /* small integer */ 1084 return ntv-NTV_NUMERIC_START_; 1085 } else if(ntv<NTV_LARGE_START_) { 1086 /* fraction */ 1087 int numerator=(ntv>>4)-12; 1088 int denominator=(ntv&0xf)+1; 1089 return (double)numerator/denominator; 1090 } else if(ntv<NTV_BASE60_START_) { 1091 /* large, single-significant-digit integer */ 1092 double numValue; 1093 int mant=(ntv>>5)-14; 1094 int exp=(ntv&0x1f)+2; 1095 numValue=mant; 1096 1097 /* multiply by 10^exp without math.h */ 1098 while(exp>=4) { 1099 numValue*=10000.; 1100 exp-=4; 1101 } 1102 switch(exp) { 1103 case 3: 1104 numValue*=1000.; 1105 break; 1106 case 2: 1107 numValue*=100.; 1108 break; 1109 case 1: 1110 numValue*=10.; 1111 break; 1112 case 0: 1113 default: 1114 break; 1115 } 1116 1117 return numValue; 1118 } else if(ntv<NTV_FRACTION20_START_) { 1119 /* sexagesimal (base 60) integer */ 1120 int numValue=(ntv>>2)-0xbf; 1121 int exp=(ntv&3)+1; 1122 1123 switch(exp) { 1124 case 4: 1125 numValue*=60*60*60*60; 1126 break; 1127 case 3: 1128 numValue*=60*60*60; 1129 break; 1130 case 2: 1131 numValue*=60*60; 1132 break; 1133 case 1: 1134 numValue*=60; 1135 break; 1136 case 0: 1137 default: 1138 break; 1139 } 1140 1141 return numValue; 1142 } else if(ntv<NTV_FRACTION32_START_) { 1143 // fraction-20 e.g. 
3/80 1144 int frac20=ntv-NTV_FRACTION20_START_; // 0..0x17 1145 int numerator=2*(frac20&3)+1; 1146 int denominator=20<<(frac20>>2); 1147 return (double)numerator/denominator; 1148 } else if(ntv<NTV_RESERVED_START_) { 1149 // fraction-32 e.g. 3/64 1150 int frac32=ntv-NTV_FRACTION32_START_; // 0..15 1151 int numerator=2*(frac32&3)+1; 1152 int denominator=32<<(frac32>>2); 1153 return (double)numerator/denominator; 1154 } else { 1155 /* reserved */ 1156 return UCharacter.NO_NUMERIC_VALUE; 1157 } 1158 } 1159 1160 // protected variables ----------------------------------------------- 1161 1162 /** 1163 * Extra property trie 1164 */ 1165 Trie2_16 m_additionalTrie_; 1166 /** 1167 * Extra property vectors, 1st column for age and second for binary 1168 * properties. 1169 */ 1170 int m_additionalVectors_[]; 1171 /** 1172 * Number of additional columns 1173 */ 1174 int m_additionalColumnsCount_; 1175 /** 1176 * Maximum values for block, bits used as in vector word 1177 * 0 1178 */ 1179 int m_maxBlockScriptValue_; 1180 /** 1181 * Maximum values for script, bits used as in vector word 1182 * 0 1183 */ 1184 int m_maxJTGValue_; 1185 1186 /** 1187 * Script_Extensions data 1188 */ 1189 public char[] m_scriptExtensions_; 1190 1191 // private variables ------------------------------------------------- 1192 1193 /** 1194 * Default name of the datafile 1195 */ 1196 private static final String DATA_FILE_NAME_ = "uprops.icu"; 1197 1198 // property data constants ------------------------------------------------- 1199 1200 /** 1201 * Numeric types and values in the main properties words. 1202 */ 1203 private static final int NUMERIC_TYPE_VALUE_SHIFT_ = 6; getNumericTypeValue(int props)1204 private static final int getNumericTypeValue(int props) { 1205 return props >> NUMERIC_TYPE_VALUE_SHIFT_; 1206 } 1207 /* constants for the storage form of numeric types and values */ 1208 /** No numeric value. 
*/ 1209 private static final int NTV_NONE_ = 0; 1210 /** Decimal digits: nv=0..9 */ 1211 private static final int NTV_DECIMAL_START_ = 1; 1212 /** Other digits: nv=0..9 */ 1213 private static final int NTV_DIGIT_START_ = 11; 1214 /** Small integers: nv=0..154 */ 1215 private static final int NTV_NUMERIC_START_ = 21; 1216 /** Fractions: ((ntv>>4)-12) / ((ntv&0xf)+1) = -1..17 / 1..16 */ 1217 private static final int NTV_FRACTION_START_ = 0xb0; 1218 /** 1219 * Large integers: 1220 * ((ntv>>5)-14) * 10^((ntv&0x1f)+2) = (1..9)*(10^2..10^33) 1221 * (only one significant decimal digit) 1222 */ 1223 private static final int NTV_LARGE_START_ = 0x1e0; 1224 /** 1225 * Sexagesimal numbers: 1226 * ((ntv>>2)-0xbf) * 60^((ntv&3)+1) = (1..9)*(60^1..60^4) 1227 */ 1228 private static final int NTV_BASE60_START_=0x300; 1229 /** 1230 * Fraction-20 values: 1231 * frac20 = ntv-0x324 = 0..0x17 -> 1|3|5|7 / 20|40|80|160|320|640 1232 * numerator: num = 2*(frac20&3)+1 1233 * denominator: den = 20<<(frac20>>2) 1234 */ 1235 private static final int NTV_FRACTION20_START_ = NTV_BASE60_START_ + 36; // 0x300+9*4=0x324 1236 /** 1237 * Fraction-32 values: 1238 * frac32 = ntv-0x34c = 0..15 -> 1|3|5|7 / 32|64|128|256 1239 * numerator: num = 2*(frac32&3)+1 1240 * denominator: den = 32<<(frac32>>2) 1241 */ 1242 private static final int NTV_FRACTION32_START_ = NTV_FRACTION20_START_ + 24; // 0x324+6*4=0x34c 1243 /** No numeric value (yet). */ 1244 private static final int NTV_RESERVED_START_ = NTV_FRACTION32_START_ + 16; // 0x34c+4*4=0x35c 1245 ntvGetType(int ntv)1246 private static final int ntvGetType(int ntv) { 1247 return 1248 (ntv==NTV_NONE_) ? NumericType.NONE : 1249 (ntv<NTV_DIGIT_START_) ? NumericType.DECIMAL : 1250 (ntv<NTV_NUMERIC_START_) ? 
NumericType.DIGIT : 1251 NumericType.NUMERIC; 1252 } 1253 1254 /* 1255 * Properties in vector word 0 1256 * Bits 1257 * 31..24 DerivedAge version major/minor one nibble each 1258 * 23..22 3..1: Bits 21..20 & 7..0 = Script_Extensions index 1259 * 3: Script value from Script_Extensions 1260 * 2: Script=Inherited 1261 * 1: Script=Common 1262 * 0: Script=bits 21..20 & 7..0 1263 * 21..20 Bits 9..8 of the UScriptCode, or index to Script_Extensions 1264 * 19..17 East Asian Width 1265 * 16.. 8 UBlockCode 1266 * 7.. 0 UScriptCode, or index to Script_Extensions 1267 */ 1268 1269 /** 1270 * Script_Extensions: mask includes Script 1271 */ 1272 public static final int SCRIPT_X_MASK = 0x00f000ff; 1273 //private static final int SCRIPT_X_SHIFT = 22; 1274 1275 // The UScriptCode or Script_Extensions index is split across two bit fields. 1276 // (Starting with Unicode 13/ICU 66/2019 due to more varied Script_Extensions.) 1277 // Shift the high bits right by 12 to assemble the full value. 1278 public static final int SCRIPT_HIGH_MASK = 0x00300000; 1279 public static final int SCRIPT_HIGH_SHIFT = 12; 1280 public static final int MAX_SCRIPT = 0x3ff; 1281 1282 /** 1283 * Integer properties mask and shift values for East Asian cell width. 1284 * Equivalent to icu4c UPROPS_EA_MASK 1285 */ 1286 private static final int EAST_ASIAN_MASK_ = 0x000e0000; 1287 /** 1288 * Integer properties mask and shift values for East Asian cell width. 1289 * Equivalent to icu4c UPROPS_EA_SHIFT 1290 */ 1291 private static final int EAST_ASIAN_SHIFT_ = 17; 1292 /** 1293 * Integer properties mask and shift values for blocks. 1294 * Equivalent to icu4c UPROPS_BLOCK_MASK 1295 */ 1296 private static final int BLOCK_MASK_ = 0x0001ff00; 1297 /** 1298 * Integer properties mask and shift values for blocks. 1299 * Equivalent to icu4c UPROPS_BLOCK_SHIFT 1300 */ 1301 private static final int BLOCK_SHIFT_ = 8; 1302 /** 1303 * Integer properties mask and shift values for scripts. 
1304 * Equivalent to icu4c UPROPS_SHIFT_LOW_MASK. 1305 */ 1306 public static final int SCRIPT_LOW_MASK = 0x000000ff; 1307 1308 /* SCRIPT_X_WITH_COMMON must be the lowest value that involves Script_Extensions. */ 1309 public static final int SCRIPT_X_WITH_COMMON = 0x400000; 1310 public static final int SCRIPT_X_WITH_INHERITED = 0x800000; 1311 public static final int SCRIPT_X_WITH_OTHER = 0xc00000; 1312 mergeScriptCodeOrIndex(int scriptX)1313 public static final int mergeScriptCodeOrIndex(int scriptX) { 1314 return 1315 ((scriptX & SCRIPT_HIGH_MASK) >> SCRIPT_HIGH_SHIFT) | 1316 (scriptX & SCRIPT_LOW_MASK); 1317 } 1318 1319 /** 1320 * Additional properties used in internal trie data 1321 */ 1322 /* 1323 * Properties in vector word 1 1324 * Each bit encodes one binary property. 1325 * The following constants represent the bit number, use 1<<UPROPS_XYZ. 1326 * UPROPS_BINARY_1_TOP<=32! 1327 * 1328 * Keep this list of property enums in sync with 1329 * propListNames[] in icu/source/tools/genprops/props2.c! 1330 * 1331 * ICU 2.6/uprops format version 3.2 stores full properties instead of "Other_". 
1332 */ 1333 private static final int WHITE_SPACE_PROPERTY_ = 0; 1334 private static final int DASH_PROPERTY_ = 1; 1335 private static final int HYPHEN_PROPERTY_ = 2; 1336 private static final int QUOTATION_MARK_PROPERTY_ = 3; 1337 private static final int TERMINAL_PUNCTUATION_PROPERTY_ = 4; 1338 private static final int MATH_PROPERTY_ = 5; 1339 private static final int HEX_DIGIT_PROPERTY_ = 6; 1340 private static final int ASCII_HEX_DIGIT_PROPERTY_ = 7; 1341 private static final int ALPHABETIC_PROPERTY_ = 8; 1342 private static final int IDEOGRAPHIC_PROPERTY_ = 9; 1343 private static final int DIACRITIC_PROPERTY_ = 10; 1344 private static final int EXTENDER_PROPERTY_ = 11; 1345 private static final int NONCHARACTER_CODE_POINT_PROPERTY_ = 12; 1346 private static final int GRAPHEME_EXTEND_PROPERTY_ = 13; 1347 private static final int GRAPHEME_LINK_PROPERTY_ = 14; 1348 private static final int IDS_BINARY_OPERATOR_PROPERTY_ = 15; 1349 private static final int IDS_TRINARY_OPERATOR_PROPERTY_ = 16; 1350 private static final int RADICAL_PROPERTY_ = 17; 1351 private static final int UNIFIED_IDEOGRAPH_PROPERTY_ = 18; 1352 private static final int DEFAULT_IGNORABLE_CODE_POINT_PROPERTY_ = 19; 1353 private static final int DEPRECATED_PROPERTY_ = 20; 1354 private static final int LOGICAL_ORDER_EXCEPTION_PROPERTY_ = 21; 1355 private static final int XID_START_PROPERTY_ = 22; 1356 private static final int XID_CONTINUE_PROPERTY_ = 23; 1357 private static final int ID_START_PROPERTY_ = 24; 1358 private static final int ID_CONTINUE_PROPERTY_ = 25; 1359 private static final int GRAPHEME_BASE_PROPERTY_ = 26; 1360 private static final int S_TERM_PROPERTY_ = 27; 1361 private static final int VARIATION_SELECTOR_PROPERTY_ = 28; 1362 private static final int PATTERN_SYNTAX = 29; /* new in ICU 3.4 and Unicode 4.1 */ 1363 private static final int PATTERN_WHITE_SPACE = 30; 1364 private static final int PREPENDED_CONCATENATION_MARK = 31; // new in ICU 60 and Unicode 10 1365 1366 /* 1367 * 
Properties in vector word 2 1368 * Bits 1369 * 31..26 http://www.unicode.org/reports/tr51/#Emoji_Properties 1370 * 25..20 Line Break 1371 * 19..15 Sentence Break 1372 * 14..10 Word Break 1373 * 9.. 5 Grapheme Cluster Break 1374 * 4.. 0 Decomposition Type 1375 */ 1376 private static final int PROPS_2_EXTENDED_PICTOGRAPHIC=26; 1377 private static final int PROPS_2_EMOJI_COMPONENT = 27; 1378 private static final int PROPS_2_EMOJI = 28; 1379 private static final int PROPS_2_EMOJI_PRESENTATION = 29; 1380 private static final int PROPS_2_EMOJI_MODIFIER = 30; 1381 private static final int PROPS_2_EMOJI_MODIFIER_BASE = 31; 1382 1383 private static final int LB_MASK = 0x03f00000; 1384 private static final int LB_SHIFT = 20; 1385 1386 private static final int SB_MASK = 0x000f8000; 1387 private static final int SB_SHIFT = 15; 1388 1389 private static final int WB_MASK = 0x00007c00; 1390 private static final int WB_SHIFT = 10; 1391 1392 private static final int GCB_MASK = 0x000003e0; 1393 private static final int GCB_SHIFT = 5; 1394 1395 /** 1396 * Integer properties mask for decomposition type. 1397 * Equivalent to icu4c UPROPS_DT_MASK. 
1398 */ 1399 private static final int DECOMPOSITION_TYPE_MASK_ = 0x0000001f; 1400 1401 /** 1402 * First nibble shift 1403 */ 1404 private static final int FIRST_NIBBLE_SHIFT_ = 0x4; 1405 /** 1406 * Second nibble mask 1407 */ 1408 private static final int LAST_NIBBLE_MASK_ = 0xF; 1409 /** 1410 * Age value shift 1411 */ 1412 private static final int AGE_SHIFT_ = 24; 1413 1414 1415 // private constructors -------------------------------------------------- 1416 1417 /** 1418 * Constructor 1419 * @exception IOException thrown when data reading fails or data corrupted 1420 */ UCharacterProperty()1421 private UCharacterProperty() throws IOException 1422 { 1423 // consistency check 1424 if(binProps.length!=UProperty.BINARY_LIMIT) { 1425 throw new ICUException("binProps.length!=UProperty.BINARY_LIMIT"); 1426 } 1427 if(intProps.length!=(UProperty.INT_LIMIT-UProperty.INT_START)) { 1428 throw new ICUException("intProps.length!=(UProperty.INT_LIMIT-UProperty.INT_START)"); 1429 } 1430 1431 // jar access 1432 ByteBuffer bytes=ICUBinary.getRequiredData(DATA_FILE_NAME_); 1433 m_unicodeVersion_ = ICUBinary.readHeaderAndDataVersion(bytes, DATA_FORMAT, new IsAcceptable()); 1434 // Read or skip the 16 indexes. 
1435 int propertyOffset = bytes.getInt(); 1436 /* exceptionOffset = */ bytes.getInt(); 1437 /* caseOffset = */ bytes.getInt(); 1438 int additionalOffset = bytes.getInt(); 1439 int additionalVectorsOffset = bytes.getInt(); 1440 m_additionalColumnsCount_ = bytes.getInt(); 1441 int scriptExtensionsOffset = bytes.getInt(); 1442 int reservedOffset7 = bytes.getInt(); 1443 /* reservedOffset8 = */ bytes.getInt(); 1444 /* dataTopOffset = */ bytes.getInt(); 1445 m_maxBlockScriptValue_ = bytes.getInt(); 1446 m_maxJTGValue_ = bytes.getInt(); 1447 ICUBinary.skipBytes(bytes, (16 - 12) << 2); 1448 1449 // read the main properties trie 1450 m_trie_ = Trie2_16.createFromSerialized(bytes); 1451 int expectedTrieLength = (propertyOffset - 16) * 4; 1452 int trieLength = m_trie_.getSerializedLength(); 1453 if(trieLength > expectedTrieLength) { 1454 throw new IOException("uprops.icu: not enough bytes for main trie"); 1455 } 1456 // skip padding after trie bytes 1457 ICUBinary.skipBytes(bytes, expectedTrieLength - trieLength); 1458 1459 // skip unused intervening data structures 1460 ICUBinary.skipBytes(bytes, (additionalOffset - propertyOffset) * 4); 1461 1462 if(m_additionalColumnsCount_ > 0) { 1463 // reads the additional property block 1464 m_additionalTrie_ = Trie2_16.createFromSerialized(bytes); 1465 expectedTrieLength = (additionalVectorsOffset-additionalOffset)*4; 1466 trieLength = m_additionalTrie_.getSerializedLength(); 1467 if(trieLength > expectedTrieLength) { 1468 throw new IOException("uprops.icu: not enough bytes for additional-properties trie"); 1469 } 1470 // skip padding after trie bytes 1471 ICUBinary.skipBytes(bytes, expectedTrieLength - trieLength); 1472 1473 // additional properties 1474 int size = scriptExtensionsOffset - additionalVectorsOffset; 1475 m_additionalVectors_ = ICUBinary.getInts(bytes, size, 0); 1476 } 1477 1478 // Script_Extensions 1479 int numChars = (reservedOffset7 - scriptExtensionsOffset) * 2; 1480 if(numChars > 0) { 1481 m_scriptExtensions_ = 
ICUBinary.getChars(bytes, numChars, 0); 1482 } 1483 } 1484 1485 private static final class IsAcceptable implements ICUBinary.Authenticate { 1486 @Override isDataVersionAcceptable(byte version[])1487 public boolean isDataVersionAcceptable(byte version[]) { 1488 return version[0] == 7; 1489 } 1490 } 1491 private static final int DATA_FORMAT = 0x5550726F; // "UPro" 1492 1493 // private methods ------------------------------------------------------- 1494 1495 /* 1496 * Compare additional properties to see if it has argument type 1497 * @param property 32 bit properties 1498 * @param type character type 1499 * @return true if property has type 1500 */ 1501 /*private boolean compareAdditionalType(int property, int type) 1502 { 1503 return (property & (1 << type)) != 0; 1504 }*/ 1505 1506 // property starts for UnicodeSet -------------------------------------- *** 1507 1508 private static final int TAB = 0x0009; 1509 //private static final int LF = 0x000a; 1510 //private static final int FF = 0x000c; 1511 private static final int CR = 0x000d; 1512 private static final int U_A = 0x0041; 1513 private static final int U_F = 0x0046; 1514 private static final int U_Z = 0x005a; 1515 private static final int U_a = 0x0061; 1516 private static final int U_f = 0x0066; 1517 private static final int U_z = 0x007a; 1518 private static final int DEL = 0x007f; 1519 private static final int NL = 0x0085; 1520 private static final int NBSP = 0x00a0; 1521 private static final int CGJ = 0x034f; 1522 private static final int FIGURESP= 0x2007; 1523 private static final int HAIRSP = 0x200a; 1524 //private static final int ZWNJ = 0x200c; 1525 //private static final int ZWJ = 0x200d; 1526 private static final int RLM = 0x200f; 1527 private static final int NNBSP = 0x202f; 1528 private static final int WJ = 0x2060; 1529 private static final int INHSWAP = 0x206a; 1530 private static final int NOMDIG = 0x206f; 1531 private static final int U_FW_A = 0xff21; 1532 private static final int U_FW_F = 
0xff26; 1533 private static final int U_FW_Z = 0xff3a; 1534 private static final int U_FW_a = 0xff41; 1535 private static final int U_FW_f = 0xff46; 1536 private static final int U_FW_z = 0xff5a; 1537 private static final int ZWNBSP = 0xfeff; 1538 addPropertyStarts(UnicodeSet set)1539 public UnicodeSet addPropertyStarts(UnicodeSet set) { 1540 /* add the start code point of each same-value range of the main trie */ 1541 Iterator<Trie2.Range> trieIterator = m_trie_.iterator(); 1542 Trie2.Range range; 1543 while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) { 1544 set.add(range.startCodePoint); 1545 } 1546 1547 /* add code points with hardcoded properties, plus the ones following them */ 1548 1549 /* add for u_isblank() */ 1550 set.add(TAB); 1551 set.add(TAB+1); 1552 1553 /* add for IS_THAT_CONTROL_SPACE() */ 1554 set.add(CR+1); /* range TAB..CR */ 1555 set.add(0x1c); 1556 set.add(0x1f+1); 1557 set.add(NL); 1558 set.add(NL+1); 1559 1560 /* add for u_isIDIgnorable() what was not added above */ 1561 set.add(DEL); /* range DEL..NBSP-1, NBSP added below */ 1562 set.add(HAIRSP); 1563 set.add(RLM+1); 1564 set.add(INHSWAP); 1565 set.add(NOMDIG+1); 1566 set.add(ZWNBSP); 1567 set.add(ZWNBSP+1); 1568 1569 /* add no-break spaces for u_isWhitespace() what was not added above */ 1570 set.add(NBSP); 1571 set.add(NBSP+1); 1572 set.add(FIGURESP); 1573 set.add(FIGURESP+1); 1574 set.add(NNBSP); 1575 set.add(NNBSP+1); 1576 1577 /* add for u_charDigitValue() */ 1578 // TODO remove when UCharacter.getHanNumericValue() is changed to just return 1579 // Unicode numeric values 1580 set.add(0x3007); 1581 set.add(0x3008); 1582 set.add(0x4e00); 1583 set.add(0x4e01); 1584 set.add(0x4e8c); 1585 set.add(0x4e8d); 1586 set.add(0x4e09); 1587 set.add(0x4e0a); 1588 set.add(0x56db); 1589 set.add(0x56dc); 1590 set.add(0x4e94); 1591 set.add(0x4e95); 1592 set.add(0x516d); 1593 set.add(0x516e); 1594 set.add(0x4e03); 1595 set.add(0x4e04); 1596 set.add(0x516b); 1597 set.add(0x516c); 
1598 set.add(0x4e5d); 1599 set.add(0x4e5e); 1600 1601 /* add for u_digit() */ 1602 set.add(U_a); 1603 set.add(U_z+1); 1604 set.add(U_A); 1605 set.add(U_Z+1); 1606 set.add(U_FW_a); 1607 set.add(U_FW_z+1); 1608 set.add(U_FW_A); 1609 set.add(U_FW_Z+1); 1610 1611 /* add for u_isxdigit() */ 1612 set.add(U_f+1); 1613 set.add(U_F+1); 1614 set.add(U_FW_f+1); 1615 set.add(U_FW_F+1); 1616 1617 /* add for UCHAR_DEFAULT_IGNORABLE_CODE_POINT what was not added above */ 1618 set.add(WJ); /* range WJ..NOMDIG */ 1619 set.add(0xfff0); 1620 set.add(0xfffb+1); 1621 set.add(0xe0000); 1622 set.add(0xe0fff+1); 1623 1624 /* add for UCHAR_GRAPHEME_BASE and others */ 1625 set.add(CGJ); 1626 set.add(CGJ+1); 1627 1628 return set; // for chaining 1629 } 1630 upropsvec_addPropertyStarts(UnicodeSet set)1631 public void upropsvec_addPropertyStarts(UnicodeSet set) { 1632 /* add the start code point of each same-value range of the properties vectors trie */ 1633 if(m_additionalColumnsCount_>0) { 1634 /* if m_additionalColumnsCount_==0 then the properties vectors trie may not be there at all */ 1635 Iterator<Trie2.Range> trieIterator = m_additionalTrie_.iterator(); 1636 Trie2.Range range; 1637 while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) { 1638 set.add(range.startCodePoint); 1639 } 1640 } 1641 } 1642 ulayout_addPropertyStarts(int src, UnicodeSet set)1643 static UnicodeSet ulayout_addPropertyStarts(int src, UnicodeSet set) { 1644 return LayoutProps.INSTANCE.addPropertyStarts(src, set); 1645 } 1646 1647 // This static initializer block must be placed after 1648 // other static member initialization 1649 static { 1650 try { 1651 INSTANCE = new UCharacterProperty(); 1652 } 1653 catch (IOException e) { 1654 throw new MissingResourceException(e.getMessage(),"",""); 1655 } 1656 } 1657 1658 /*---------------------------------------------------------------- 1659 * Inclusions list 1660 *----------------------------------------------------------------*/ 1661 1662 /* 1663 * 
Return a set of characters for property enumeration. 1664 * The set implicitly contains 0x110000 as well, which is one more than the highest 1665 * Unicode code point. 1666 * 1667 * This set is used as an ordered list - its code points are ordered, and 1668 * consecutive code points (in Unicode code point order) in the set define a range. 1669 * For each two consecutive characters (start, limit) in the set, 1670 * all of the UCD/normalization and related properties for 1671 * all code points start..limit-1 are all the same, 1672 * except for character names and ISO comments. 1673 * 1674 * All Unicode code points U+0000..U+10ffff are covered by these ranges. 1675 * The ranges define a partition of the Unicode code space. 1676 * ICU uses the inclusions set to enumerate properties for generating 1677 * UnicodeSets containing all code points that have a certain property value. 1678 * 1679 * The Inclusion List is generated from the UCD. It is generated 1680 * by enumerating the data tries, and code points for hardcoded properties 1681 * are added as well. 1682 * 1683 * -------------------------------------------------------------------------- 1684 * 1685 * The following are ideas for getting properties-unique code point ranges, 1686 * with possible optimizations beyond the current implementation. 1687 * These optimizations would require more code and be more fragile. 1688 * The current implementation generates one single list (set) for all properties. 1689 * 1690 * To enumerate properties efficiently, one needs to know ranges of 1691 * repetitive values, so that the value of only each start code point 1692 * can be applied to the whole range. 1693 * This information is in principle available in the uprops.icu/unorm.icu data. 1694 * 1695 * There are two obstacles: 1696 * 1697 * 1. Some properties are computed from multiple data structures, 1698 * making it necessary to get repetitive ranges by intersecting 1699 * ranges from multiple tries. 1700 * 1701 * 2. 
It is not economical to write code for getting repetitive ranges 1702 * that are precise for each of some 50 properties. 1703 * 1704 * Compromise ideas: 1705 * 1706 * - Get ranges per trie, not per individual property. 1707 * Each range contains the same values for a whole group of properties. 1708 * This would generate currently five range sets, two for uprops.icu tries 1709 * and three for unorm.icu tries. 1710 * 1711 * - Combine sets of ranges for multiple tries to get sufficient sets 1712 * for properties, e.g., the uprops.icu main and auxiliary tries 1713 * for all non-normalization properties. 1714 * 1715 * Ideas for representing ranges and combining them: 1716 * 1717 * - A UnicodeSet could hold just the start code points of ranges. 1718 * Multiple sets are easily combined by or-ing them together. 1719 * 1720 * - Alternatively, a UnicodeSet could hold each even-numbered range. 1721 * All ranges could be enumerated by using each start code point 1722 * (for the even-numbered ranges) as well as each limit (end+1) code point 1723 * (for the odd-numbered ranges). 1724 * It should be possible to combine two such sets by xor-ing them, 1725 * but no more than two. 1726 * 1727 * The second way to represent ranges may(?!) yield smaller UnicodeSet arrays, 1728 * but the first one is certainly simpler and applicable for combining more than 1729 * two range sets. 1730 * 1731 * It is possible to combine all range sets for all uprops/unorm tries into one 1732 * set that can be used for all properties. 1733 * As an optimization, there could be less-combined range sets for certain 1734 * groups of properties. 1735 * The relationship of which less-combined range set to use for which property 1736 * depends on the implementation of the properties and must be hardcoded 1737 * - somewhat error-prone and higher maintenance but can be tested easily 1738 * by building property sets "the simple way" in test code. 
1739 * 1740 * --- 1741 * 1742 * Do not use a UnicodeSet pattern because that causes infinite recursion; 1743 * UnicodeSet depends on the inclusions set. 1744 * 1745 * --- 1746 * 1747 * getInclusions() is commented out starting 2005-feb-12 because 1748 * UnicodeSet now calls the uxyz_addPropertyStarts() directly, 1749 * and only for the relevant property source. 1750 */ 1751 /* 1752 public UnicodeSet getInclusions() { 1753 UnicodeSet set = new UnicodeSet(); 1754 NormalizerImpl.addPropertyStarts(set); 1755 addPropertyStarts(set); 1756 return set; 1757 } 1758 */ 1759 } 1760