1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * Copyright (C) 1996-2016, International Business Machines Corporation and 6 * others. All Rights Reserved. 7 ******************************************************************************* 8 */ 9 10 package com.ibm.icu.impl; 11 12 import java.io.IOException; 13 import java.nio.ByteBuffer; 14 import java.util.Iterator; 15 import java.util.MissingResourceException; 16 17 import com.ibm.icu.lang.UCharacter; 18 import com.ibm.icu.lang.UCharacter.HangulSyllableType; 19 import com.ibm.icu.lang.UCharacter.NumericType; 20 import com.ibm.icu.lang.UCharacterCategory; 21 import com.ibm.icu.lang.UProperty; 22 import com.ibm.icu.lang.UScript; 23 import com.ibm.icu.text.Normalizer2; 24 import com.ibm.icu.text.UTF16; 25 import com.ibm.icu.text.UnicodeSet; 26 import com.ibm.icu.util.CodePointMap; 27 import com.ibm.icu.util.CodePointTrie; 28 import com.ibm.icu.util.ICUException; 29 import com.ibm.icu.util.ICUUncheckedIOException; 30 import com.ibm.icu.util.VersionInfo; 31 32 /** 33 * <p>Internal class used for Unicode character property database.</p> 34 * <p>This classes store binary data read from uprops.icu. 35 * It does not have the capability to parse the data into more high-level 36 * information. 
It only returns bytes of information when required.</p> 37 * <p>Due to the form most commonly used for retrieval, array of char is used 38 * to store the binary data.</p> 39 * <p>UCharacterPropertyDB also contains information on accessing indexes to 40 * significant points in the binary data.</p> 41 * <p>Responsibility for molding the binary data into more meaning form lies on 42 * <a href=UCharacter.html>UCharacter</a>.</p> 43 * @author Syn Wee Quek 44 * @since release 2.1, february 1st 2002 45 */ 46 47 public final class UCharacterProperty 48 { 49 // public data members ----------------------------------------------- 50 51 /* 52 * public singleton instance 53 */ 54 public static final UCharacterProperty INSTANCE; 55 56 /** 57 * Trie data 58 */ 59 public Trie2_16 m_trie_; 60 /** 61 * Unicode version 62 */ 63 public VersionInfo m_unicodeVersion_; 64 /** 65 * Latin capital letter i with dot above 66 */ 67 public static final char LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_ = 0x130; 68 /** 69 * Latin small letter i with dot above 70 */ 71 public static final char LATIN_SMALL_LETTER_DOTLESS_I_ = 0x131; 72 /** 73 * Latin lowercase i 74 */ 75 public static final char LATIN_SMALL_LETTER_I_ = 0x69; 76 /** 77 * Character type mask 78 */ 79 public static final int TYPE_MASK = 0x1F; 80 81 // uprops.h enum UPropertySource --------------------------------------- *** 82 83 /** No source, not a supported property. 
*/ 84 public static final int SRC_NONE=0; 85 /** From uchar.c/uprops.icu main trie */ 86 public static final int SRC_CHAR=1; 87 /** From uchar.c/uprops.icu properties vectors trie */ 88 public static final int SRC_PROPSVEC=2; 89 /** From unames.c/unames.icu */ 90 public static final int SRC_NAMES=3; 91 /** From ucase.c/ucase.icu */ 92 public static final int SRC_CASE=4; 93 /** From ubidi_props.c/ubidi.icu */ 94 public static final int SRC_BIDI=5; 95 /** From uchar.c/uprops.icu main trie as well as properties vectors trie */ 96 public static final int SRC_CHAR_AND_PROPSVEC=6; 97 /** From ucase.c/ucase.icu as well as unorm.cpp/unorm.icu */ 98 public static final int SRC_CASE_AND_NORM=7; 99 /** From normalizer2impl.cpp/nfc.nrm */ 100 public static final int SRC_NFC=8; 101 /** From normalizer2impl.cpp/nfkc.nrm */ 102 public static final int SRC_NFKC=9; 103 /** From normalizer2impl.cpp/nfkc_cf.nrm */ 104 public static final int SRC_NFKC_CF=10; 105 /** From normalizer2impl.cpp/nfc.nrm canonical iterator data */ 106 public static final int SRC_NFC_CANON_ITER=11; 107 // Text layout properties. 108 public static final int SRC_INPC=12; 109 public static final int SRC_INSC=13; 110 public static final int SRC_VO=14; 111 /** One more than the highest UPropertySource (SRC_) constant. */ 112 public static final int SRC_COUNT=15; 113 114 private static final class LayoutProps { 115 private static final class IsAcceptable implements ICUBinary.Authenticate { 116 @Override isDataVersionAcceptable(byte version[])117 public boolean isDataVersionAcceptable(byte version[]) { 118 return version[0] == 1; 119 } 120 } 121 private static final IsAcceptable IS_ACCEPTABLE = new IsAcceptable(); 122 private static final int DATA_FORMAT = 0x4c61796f; // "Layo" 123 124 // indexes into indexes[] 125 // Element 0 stores the length of the indexes[] array. 126 //ivate static final int IX_INDEXES_LENGTH = 0; 127 // Elements 1..7 store the tops of consecutive code point tries. 
128 // No trie is stored if the difference between two of these is less than 16. 129 private static final int IX_INPC_TRIE_TOP = 1; 130 private static final int IX_INSC_TRIE_TOP = 2; 131 private static final int IX_VO_TRIE_TOP = 3; 132 //ivate static final int IX_RESERVED_TOP = 4; 133 134 //ivate static final int IX_TRIES_TOP = 7; 135 136 private static final int IX_MAX_VALUES = 9; 137 138 // Length of indexes[]. Multiple of 4 to 16-align the tries. 139 //ivate static final int IX_COUNT = 12; 140 141 private static final int MAX_INPC_SHIFT = 24; 142 private static final int MAX_INSC_SHIFT = 16; 143 private static final int MAX_VO_SHIFT = 8; 144 145 static final LayoutProps INSTANCE = new LayoutProps(); 146 147 CodePointTrie inpcTrie = null; // Indic_Positional_Category 148 CodePointTrie inscTrie = null; // Indic_Syllabic_Category 149 CodePointTrie voTrie = null; // Vertical_Orientation 150 151 int maxInpcValue = 0; 152 int maxInscValue = 0; 153 int maxVoValue = 0; 154 LayoutProps()155 LayoutProps() { 156 ByteBuffer bytes = ICUBinary.getRequiredData("ulayout.icu"); 157 try { 158 ICUBinary.readHeaderAndDataVersion(bytes, DATA_FORMAT, IS_ACCEPTABLE); 159 int startPos = bytes.position(); 160 int indexesLength = bytes.getInt(); // inIndexes[IX_INDEXES_LENGTH] 161 if (indexesLength < 12) { 162 throw new ICUUncheckedIOException( 163 "Text layout properties data: not enough indexes"); 164 } 165 int[] inIndexes = new int[indexesLength]; 166 inIndexes[0] = indexesLength; 167 for (int i = 1; i < indexesLength; ++i) { 168 inIndexes[i] = bytes.getInt(); 169 } 170 171 int offset = indexesLength * 4; 172 int top = inIndexes[IX_INPC_TRIE_TOP]; 173 int trieSize = top - offset; 174 if (trieSize >= 16) { 175 inpcTrie = CodePointTrie.fromBinary(null, null, bytes); 176 } 177 int pos = bytes.position() - startPos; 178 assert top >= pos; 179 ICUBinary.skipBytes(bytes, top - pos); // skip padding after trie bytes 180 offset = top; 181 top = inIndexes[IX_INSC_TRIE_TOP]; 182 trieSize = top 
- offset; 183 if (trieSize >= 16) { 184 inscTrie = CodePointTrie.fromBinary(null, null, bytes); 185 } 186 pos = bytes.position() - startPos; 187 assert top >= pos; 188 ICUBinary.skipBytes(bytes, top - pos); // skip padding after trie bytes 189 offset = top; 190 top = inIndexes[IX_VO_TRIE_TOP]; 191 trieSize = top - offset; 192 if (trieSize >= 16) { 193 voTrie = CodePointTrie.fromBinary(null, null, bytes); 194 } 195 pos = bytes.position() - startPos; 196 assert top >= pos; 197 ICUBinary.skipBytes(bytes, top - pos); // skip padding after trie bytes 198 199 int maxValues = inIndexes[IX_MAX_VALUES]; 200 maxInpcValue = maxValues >>> MAX_INPC_SHIFT; 201 maxInscValue = (maxValues >> MAX_INSC_SHIFT) & 0xff; 202 maxVoValue = (maxValues >> MAX_VO_SHIFT) & 0xff; 203 } catch(IOException e) { 204 throw new ICUUncheckedIOException(e); 205 } 206 } 207 addPropertyStarts(int src, UnicodeSet set)208 public UnicodeSet addPropertyStarts(int src, UnicodeSet set) { 209 CodePointTrie trie; 210 switch (src) { 211 case SRC_INPC: 212 trie = inpcTrie; 213 break; 214 case SRC_INSC: 215 trie = inscTrie; 216 break; 217 case SRC_VO: 218 trie = voTrie; 219 break; 220 default: 221 throw new IllegalStateException(); 222 } 223 224 if (trie == null) { 225 throw new MissingResourceException( 226 "no data for one of the text layout properties; src=" + src, 227 "LayoutProps", ""); 228 } 229 230 // Add the start code point of each same-value range of the trie. 231 CodePointMap.Range range = new CodePointMap.Range(); 232 int start = 0; 233 while (trie.getRange(start, null, range)) { 234 set.add(start); 235 start = range.getEnd() + 1; 236 } 237 return set; 238 } 239 } 240 241 // public methods ---------------------------------------------------- 242 243 /** 244 * Gets the main property value for code point ch. 
245 * @param ch code point whose property value is to be retrieved 246 * @return property value of code point 247 */ getProperty(int ch)248 public final int getProperty(int ch) 249 { 250 return m_trie_.get(ch); 251 } 252 253 /** 254 * Gets the unicode additional properties. 255 * Java version of C u_getUnicodeProperties(). 256 * @param codepoint codepoint whose additional properties is to be 257 * retrieved 258 * @param column The column index. 259 * @return unicode properties 260 */ getAdditional(int codepoint, int column)261 public int getAdditional(int codepoint, int column) { 262 assert column >= 0; 263 if (column >= m_additionalColumnsCount_) { 264 return 0; 265 } 266 return m_additionalVectors_[m_additionalTrie_.get(codepoint) + column]; 267 } 268 269 static final int MY_MASK = UCharacterProperty.TYPE_MASK 270 & ((1<<UCharacterCategory.UPPERCASE_LETTER) | 271 (1<<UCharacterCategory.LOWERCASE_LETTER) | 272 (1<<UCharacterCategory.TITLECASE_LETTER) | 273 (1<<UCharacterCategory.MODIFIER_LETTER) | 274 (1<<UCharacterCategory.OTHER_LETTER)); 275 276 277 /** 278 * <p>Get the "age" of the code point.</p> 279 * <p>The "age" is the Unicode version when the code point was first 280 * designated (as a non-character or for Private Use) or assigned a 281 * character.</p> 282 * <p>This can be useful to avoid emitting code points to receiving 283 * processes that do not accept newer characters.</p> 284 * <p>The data is from the UCD file DerivedAge.txt.</p> 285 * <p>This API does not check the validity of the codepoint.</p> 286 * @param codepoint The code point. 
    /**
     * <p>Get the "age" of the code point.</p>
     * <p>The "age" is the Unicode version when the code point was first
     * designated (as a non-character or for Private Use) or assigned a
     * character.</p>
     * <p>This can be useful to avoid emitting code points to receiving
     * processes that do not accept newer characters.</p>
     * <p>The data is from the UCD file DerivedAge.txt.</p>
     * <p>This API does not check the validity of the codepoint.</p>
     * @param codepoint The code point.
     * @return the Unicode version number
     */
    public VersionInfo getAge(int codepoint)
    {
        // The age lives in the upper bits of additional-properties column 0;
        // after the shift, two nibbles hold the major and minor version numbers.
        int version = getAdditional(codepoint, 0) >> AGE_SHIFT_;
        return VersionInfo.getInstance(
                           (version >> FIRST_NIBBLE_SHIFT_) & LAST_NIBBLE_MASK_,
                           version & LAST_NIBBLE_MASK_, 0, 0);
    }

    // Single-bit masks (1 << category) for the general categories tested by isgraphPOSIX().
    private static final int GC_CN_MASK = getMask(UCharacter.UNASSIGNED);
    private static final int GC_CC_MASK = getMask(UCharacter.CONTROL);
    private static final int GC_CS_MASK = getMask(UCharacter.SURROGATE);
    private static final int GC_ZS_MASK = getMask(UCharacter.SPACE_SEPARATOR);
    private static final int GC_ZL_MASK = getMask(UCharacter.LINE_SEPARATOR);
    private static final int GC_ZP_MASK = getMask(UCharacter.PARAGRAPH_SEPARATOR);
    /** Mask constant for multiple UCharCategory bits (Z Separators). */
    private static final int GC_Z_MASK = GC_ZS_MASK|GC_ZL_MASK|GC_ZP_MASK;

    /**
     * Checks if c is in
     * [^\p{space}\p{gc=Control}\p{gc=Surrogate}\p{gc=Unassigned}]
     * with space=\p{Whitespace} and Control=Cc.
     * Implements UCHAR_POSIX_GRAPH.
     * @param c code point to test
     * @return true if the general category of c is none of Cc, Cs, Cn, Zs, Zl, Zp
     * @internal
     */
    private static final boolean isgraphPOSIX(int c) {
        /* \p{space}\p{gc=Control} == \p{gc=Z}\p{Control} */
        /* comparing ==0 returns FALSE for the categories mentioned */
        return (getMask(UCharacter.getType(c))&
                (GC_CC_MASK|GC_CS_MASK|GC_CN_MASK|GC_Z_MASK))
               ==0;
    }

    // binary properties --------------------------------------------------- ***

    /**
     * One binary property. The base implementation tests a bit mask against one
     * column of the additional properties vector; entries built with the
     * one-argument constructor carry only a data source and are expected to
     * override contains().
     */
    private class BinaryProperty {
        int column;  // SRC_PROPSVEC column, or "source" if mask==0
        int mask;
        /** Property stored directly as mask bit(s) in the given properties-vector column. */
        BinaryProperty(int column, int mask) {
            this.column=column;
            this.mask=mask;
        }
        /** Property computed elsewhere; only its data source (a SRC_ constant) is recorded. */
        BinaryProperty(int source) {
            this.column=source;
            this.mask=0;
        }
        /** @return the recorded source if mask==0, otherwise SRC_PROPSVEC */
        final int getSource() {
            return mask==0 ? column : SRC_PROPSVEC;
        }
        /** @return true if code point c has this property */
        boolean contains(int c) {
            // systematic, directly stored properties
            return (getAdditional(c, column)&mask)!=0;
        }
    }

    /** Binary property answered by UCaseProps for the stored UProperty selector. */
    private class CaseBinaryProperty extends BinaryProperty {  // case mapping properties
        int which;
        CaseBinaryProperty(int which) {
            super(SRC_CASE);
            this.which=which;
        }
        @Override
        boolean contains(int c) {
            return UCaseProps.INSTANCE.hasBinaryProperty(c, which);
        }
    }

    /** Binary property answered by the isInert() test of a normalizer instance. */
    private class NormInertBinaryProperty extends BinaryProperty {  // UCHAR_NF*_INERT properties
        int which;
        NormInertBinaryProperty(int source, int which) {
            super(source);
            this.which=which;
        }
        @Override
        boolean contains(int c) {
            // The offset from NFD_INERT selects the normalizer instance.
            return Norm2AllModes.getN2WithImpl(which-UProperty.NFD_INERT).isInert(c);
        }
    }

    // Indexed directly by the UProperty selector in hasBinaryProperty(),
    // getIntPropertyValue() and getSource(): do not reorder, insert or remove entries.
    BinaryProperty[] binProps={
        /*
         * Binary-property implementations must be in order of corresponding UProperty,
         * and there must be exactly one entry per binary UProperty.
         */
        new BinaryProperty(1, (1<<ALPHABETIC_PROPERTY_)),
        new BinaryProperty(1, (1<<ASCII_HEX_DIGIT_PROPERTY_)),
        new BinaryProperty(SRC_BIDI) {  // UCHAR_BIDI_CONTROL
            @Override
            boolean contains(int c) {
                return UBiDiProps.INSTANCE.isBidiControl(c);
            }
        },
        new BinaryProperty(SRC_BIDI) {  // UCHAR_BIDI_MIRRORED
            @Override
            boolean contains(int c) {
                return UBiDiProps.INSTANCE.isMirrored(c);
            }
        },
        new BinaryProperty(1, (1<<DASH_PROPERTY_)),
        new BinaryProperty(1, (1<<DEFAULT_IGNORABLE_CODE_POINT_PROPERTY_)),
        new BinaryProperty(1, (1<<DEPRECATED_PROPERTY_)),
        new BinaryProperty(1, (1<<DIACRITIC_PROPERTY_)),
        new BinaryProperty(1, (1<<EXTENDER_PROPERTY_)),
        new BinaryProperty(SRC_NFC) {  // UCHAR_FULL_COMPOSITION_EXCLUSION
            @Override
            boolean contains(int c) {
                // By definition, Full_Composition_Exclusion is the same as NFC_QC=No.
                Normalizer2Impl impl=Norm2AllModes.getNFCInstance().impl;
                return impl.isCompNo(impl.getNorm16(c));
            }
        },
        new BinaryProperty(1, (1<<GRAPHEME_BASE_PROPERTY_)),
        new BinaryProperty(1, (1<<GRAPHEME_EXTEND_PROPERTY_)),
        new BinaryProperty(1, (1<<GRAPHEME_LINK_PROPERTY_)),
        new BinaryProperty(1, (1<<HEX_DIGIT_PROPERTY_)),
        new BinaryProperty(1, (1<<HYPHEN_PROPERTY_)),
        new BinaryProperty(1, (1<<ID_CONTINUE_PROPERTY_)),
        new BinaryProperty(1, (1<<ID_START_PROPERTY_)),
        new BinaryProperty(1, (1<<IDEOGRAPHIC_PROPERTY_)),
        new BinaryProperty(1, (1<<IDS_BINARY_OPERATOR_PROPERTY_)),
        new BinaryProperty(1, (1<<IDS_TRINARY_OPERATOR_PROPERTY_)),
        new BinaryProperty(SRC_BIDI) {  // UCHAR_JOIN_CONTROL
            @Override
            boolean contains(int c) {
                return UBiDiProps.INSTANCE.isJoinControl(c);
            }
        },
        new BinaryProperty(1, (1<<LOGICAL_ORDER_EXCEPTION_PROPERTY_)),
        new CaseBinaryProperty(UProperty.LOWERCASE),
        new BinaryProperty(1, (1<<MATH_PROPERTY_)),
        new BinaryProperty(1, (1<<NONCHARACTER_CODE_POINT_PROPERTY_)),
        new BinaryProperty(1, (1<<QUOTATION_MARK_PROPERTY_)),
        new BinaryProperty(1, (1<<RADICAL_PROPERTY_)),
        new CaseBinaryProperty(UProperty.SOFT_DOTTED),
        new BinaryProperty(1, (1<<TERMINAL_PUNCTUATION_PROPERTY_)),
        new BinaryProperty(1, (1<<UNIFIED_IDEOGRAPH_PROPERTY_)),
        new CaseBinaryProperty(UProperty.UPPERCASE),
        new BinaryProperty(1, (1<<WHITE_SPACE_PROPERTY_)),
        new BinaryProperty(1, (1<<XID_CONTINUE_PROPERTY_)),
        new BinaryProperty(1, (1<<XID_START_PROPERTY_)),
        new CaseBinaryProperty(UProperty.CASE_SENSITIVE),
        new BinaryProperty(1, (1<<S_TERM_PROPERTY_)),
        new BinaryProperty(1, (1<<VARIATION_SELECTOR_PROPERTY_)),
        new NormInertBinaryProperty(SRC_NFC, UProperty.NFD_INERT),
        new NormInertBinaryProperty(SRC_NFKC, UProperty.NFKD_INERT),
        new NormInertBinaryProperty(SRC_NFC, UProperty.NFC_INERT),
        new NormInertBinaryProperty(SRC_NFKC, UProperty.NFKC_INERT),
        new BinaryProperty(SRC_NFC_CANON_ITER) {  // UCHAR_SEGMENT_STARTER
            @Override
            boolean contains(int c) {
                return Norm2AllModes.getNFCInstance().impl.
                    ensureCanonIterData().isCanonSegmentStarter(c);
            }
        },
        new BinaryProperty(1, (1<<PATTERN_SYNTAX)),
        new BinaryProperty(1, (1<<PATTERN_WHITE_SPACE)),
        new BinaryProperty(SRC_CHAR_AND_PROPSVEC) {  // UCHAR_POSIX_ALNUM
            @Override
            boolean contains(int c) {
                return UCharacter.isUAlphabetic(c) || UCharacter.isDigit(c);
            }
        },
        new BinaryProperty(SRC_CHAR) {  // UCHAR_POSIX_BLANK
            @Override
            boolean contains(int c) {
                // "horizontal space"
                if(c<=0x9f) {
                    return c==9 || c==0x20; /* TAB or SPACE */
                } else {
                    /* Zs */
                    return UCharacter.getType(c)==UCharacter.SPACE_SEPARATOR;
                }
            }
        },
        new BinaryProperty(SRC_CHAR) {  // UCHAR_POSIX_GRAPH
            @Override
            boolean contains(int c) {
                return isgraphPOSIX(c);
            }
        },
        new BinaryProperty(SRC_CHAR) {  // UCHAR_POSIX_PRINT
            @Override
            boolean contains(int c) {
                /*
                 * Checks if codepoint is in \p{graph}\p{blank} - \p{cntrl}.
                 *
                 * The only cntrl character in graph+blank is TAB (in blank).
                 * Here we implement (blank-TAB)=Zs instead of calling u_isblank().
                 */
                return (UCharacter.getType(c)==UCharacter.SPACE_SEPARATOR) || isgraphPOSIX(c);
            }
        },
        new BinaryProperty(SRC_CHAR) {  // UCHAR_POSIX_XDIGIT
            @Override
            boolean contains(int c) {
                /* check ASCII and Fullwidth ASCII a-fA-F */
                if(
                    (c<=0x66 && c>=0x41 && (c<=0x46 || c>=0x61)) ||
                    (c>=0xff21 && c<=0xff46 && (c<=0xff26 || c>=0xff41))
                ) {
                    return true;
                }
                return UCharacter.getType(c)==UCharacter.DECIMAL_DIGIT_NUMBER;
            }
        },
        new CaseBinaryProperty(UProperty.CASED),
        new CaseBinaryProperty(UProperty.CASE_IGNORABLE),
        new CaseBinaryProperty(UProperty.CHANGES_WHEN_LOWERCASED),
        new CaseBinaryProperty(UProperty.CHANGES_WHEN_UPPERCASED),
        new CaseBinaryProperty(UProperty.CHANGES_WHEN_TITLECASED),
        new BinaryProperty(SRC_CASE_AND_NORM) {  // UCHAR_CHANGES_WHEN_CASEFOLDED
            @Override
            boolean contains(int c) {
                // Case folding is tested on the decomposition of c when it has one.
                String nfd=Norm2AllModes.getNFCInstance().impl.getDecomposition(c);
                if(nfd!=null) {
                    /* c has a decomposition */
                    c=nfd.codePointAt(0);
                    if(Character.charCount(c)!=nfd.length()) {
                        /* multiple code points */
                        c=-1;
                    }
                } else if(c<0) {
                    return false;  /* protect against bad input */
                }
                if(c>=0) {
                    /* single code point */
                    UCaseProps csp=UCaseProps.INSTANCE;
                    // NOTE(review): dummyStringBuilder is a static buffer shared through
                    // UCaseProps — confirm that concurrent callers are acceptable.
                    UCaseProps.dummyStringBuilder.setLength(0);
                    return csp.toFullFolding(c, UCaseProps.dummyStringBuilder,
                                             UCharacter.FOLD_CASE_DEFAULT)>=0;
                } else {
                    // Multi-code point decomposition: compare the folded string with the original.
                    String folded=UCharacter.foldCase(nfd, true);
                    return !folded.equals(nfd);
                }
            }
        },
        new CaseBinaryProperty(UProperty.CHANGES_WHEN_CASEMAPPED),
        new BinaryProperty(SRC_NFKC_CF) {  // UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED
            @Override
            boolean contains(int c) {
                // True if running c through the NFKC_CF data yields something other than c itself.
                Normalizer2Impl kcf=Norm2AllModes.getNFKC_CFInstance().impl;
                String src=UTF16.valueOf(c);
                StringBuilder dest=new StringBuilder();
                // Small destCapacity for NFKC_CF(c).
                Normalizer2Impl.ReorderingBuffer buffer=new Normalizer2Impl.ReorderingBuffer(kcf, dest, 5);
                kcf.compose(src, 0, src.length(), false, true, buffer);
                return !Normalizer2Impl.UTF16Plus.equal(dest, src);
            }
        },
        new BinaryProperty(2, 1<<PROPS_2_EMOJI),
        new BinaryProperty(2, 1<<PROPS_2_EMOJI_PRESENTATION),
        new BinaryProperty(2, 1<<PROPS_2_EMOJI_MODIFIER),
        new BinaryProperty(2, 1<<PROPS_2_EMOJI_MODIFIER_BASE),
        new BinaryProperty(2, 1<<PROPS_2_EMOJI_COMPONENT),
        new BinaryProperty(SRC_PROPSVEC) {  // REGIONAL_INDICATOR
            // Property starts are a subset of lb=RI etc.
            @Override
            boolean contains(int c) {
                return 0x1F1E6<=c && c<=0x1F1FF;
            }
        },
        new BinaryProperty(1, 1<<PREPENDED_CONCATENATION_MARK),
        new BinaryProperty(2, 1<<PROPS_2_EXTENDED_PICTOGRAPHIC),
    };

    /**
     * Tests whether a code point has a binary property.
     * @param c code point to test
     * @param which UProperty selector of the binary property
     * @return the property value for c, or false if which is outside
     *         [UProperty.BINARY_START, UProperty.BINARY_LIMIT)
     */
    public boolean hasBinaryProperty(int c, int which) {
        if(which<UProperty.BINARY_START || UProperty.BINARY_LIMIT<=which) {
            // not a known binary property
            return false;
        } else {
            return binProps[which].contains(c);
        }
    }

    // int-value and enumerated properties --------------------------------- ***

    /**
     * Gets the general category of a code point.
     * @param c code point
     * @return the main property word of c masked with TYPE_MASK
     */
    public int getType(int c) {
        return getProperty(c)&TYPE_MASK;
    }
    /*
     * Map some of the Grapheme Cluster Break values to Hangul Syllable Types.
     * Hangul_Syllable_Type is fully redundant with a subset of Grapheme_Cluster_Break.
     */
    private static final int /* UHangulSyllableType */ gcbToHst[]={
        HangulSyllableType.NOT_APPLICABLE,   /* U_GCB_OTHER */
        HangulSyllableType.NOT_APPLICABLE,   /* U_GCB_CONTROL */
        HangulSyllableType.NOT_APPLICABLE,   /* U_GCB_CR */
        HangulSyllableType.NOT_APPLICABLE,   /* U_GCB_EXTEND */
        HangulSyllableType.LEADING_JAMO,     /* U_GCB_L */
        HangulSyllableType.NOT_APPLICABLE,   /* U_GCB_LF */
        HangulSyllableType.LV_SYLLABLE,      /* U_GCB_LV */
        HangulSyllableType.LVT_SYLLABLE,     /* U_GCB_LVT */
        HangulSyllableType.TRAILING_JAMO,    /* U_GCB_T */
        HangulSyllableType.VOWEL_JAMO        /* U_GCB_V */
        /*
         * Omit GCB values beyond what we need for hst.
         * The code below checks for the array length.
         */
    };

    /**
     * One int-valued (enumerated) property. The base implementation extracts a
     * masked, shifted bit field from one column of the additional properties
     * vector; entries built with the one-argument constructor carry only a data
     * source and are expected to override getValue() and/or getMaxValue().
     */
    private class IntProperty {
        int column;  // SRC_PROPSVEC column, or "source" if mask==0
        int mask;
        int shift;
        /** Property stored directly as a bit field in the given properties-vector column. */
        IntProperty(int column, int mask, int shift) {
            this.column=column;
            this.mask=mask;
            this.shift=shift;
        }
        /** Property computed elsewhere; only its data source (a SRC_ constant) is recorded. */
        IntProperty(int source) {
            this.column=source;
            this.mask=0;
        }
        /** @return the recorded source if mask==0, otherwise SRC_PROPSVEC */
        final int getSource() {
            return mask==0 ? column : SRC_PROPSVEC;
        }
        /** @return the property value of code point c */
        int getValue(int c) {
            // systematic, directly stored properties
            return (getAdditional(c, column)&mask)>>>shift;
        }
        /**
         * @param which UProperty selector; unused by this base implementation
         * @return the same bit field, extracted from the maximum-values word of the column
         */
        int getMaxValue(int which) {
            return (getMaxValues(column)&mask)>>>shift;
        }
    }

    /** Int property whose maximum value is supplied by UBiDiProps. */
    private class BiDiIntProperty extends IntProperty {
        BiDiIntProperty() {
            super(SRC_BIDI);
        }
        @Override
        int getMaxValue(int which) {
            return UBiDiProps.INSTANCE.getMaxValue(which);
        }
    }

    /** Int property with a fixed maximum value of 0xff. */
    private class CombiningClassIntProperty extends IntProperty {
        CombiningClassIntProperty(int source) {
            super(source);
        }
        @Override
        int getMaxValue(int which) {
            return 0xff;
        }
    }

    /** Int property answered by the getQuickCheck() test of a normalizer instance. */
    private class NormQuickCheckIntProperty extends IntProperty {  // UCHAR_NF*_QUICK_CHECK properties
        int which;
        int max;
        NormQuickCheckIntProperty(int source, int which, int max) {
            super(source);
            this.which=which;
            this.max=max;
        }
        @Override
        int getValue(int c) {
            // The offset from NFD_QUICK_CHECK selects the normalizer instance.
            return Norm2AllModes.getN2WithImpl(which-UProperty.NFD_QUICK_CHECK).getQuickCheck(c);
        }
        @Override
        int getMaxValue(int which) {
            return max;
        }
    }

    // Indexed by (which - UProperty.INT_START) in getIntPropertyValue(),
    // getIntPropertyMaxValue() and getSource(): do not reorder, insert or remove entries.
    IntProperty intProps[]={
        new BiDiIntProperty() {  // BIDI_CLASS
            @Override
            int getValue(int c) {
                return UBiDiProps.INSTANCE.getClass(c);
            }
        },
        new IntProperty(0, BLOCK_MASK_, BLOCK_SHIFT_),
        new CombiningClassIntProperty(SRC_NFC) {  // CANONICAL_COMBINING_CLASS
            @Override
            int getValue(int c) {
                return Normalizer2.getNFDInstance().getCombiningClass(c);
            }
        },
        new IntProperty(2, DECOMPOSITION_TYPE_MASK_, 0),
        new IntProperty(0, EAST_ASIAN_MASK_, EAST_ASIAN_SHIFT_),
        new IntProperty(SRC_CHAR) {  // GENERAL_CATEGORY
            @Override
            int getValue(int c) {
                return getType(c);
            }
            @Override
            int getMaxValue(int which) {
                return UCharacterCategory.CHAR_CATEGORY_COUNT-1;
            }
        },
        new BiDiIntProperty() {  // JOINING_GROUP
            @Override
            int getValue(int c) {
                return UBiDiProps.INSTANCE.getJoiningGroup(c);
            }
        },
        new BiDiIntProperty() {  // JOINING_TYPE
            @Override
            int getValue(int c) {
                return UBiDiProps.INSTANCE.getJoiningType(c);
            }
        },
        new IntProperty(2, LB_MASK, LB_SHIFT),  // LINE_BREAK
        new IntProperty(SRC_CHAR) {  // NUMERIC_TYPE
            @Override
            int getValue(int c) {
                return ntvGetType(getNumericTypeValue(getProperty(c)));
            }
            @Override
            int getMaxValue(int which) {
                return NumericType.COUNT-1;
            }
        },
        // presumably SCRIPT (the value comes from UScript.getScript) — TODO confirm
        new IntProperty(SRC_PROPSVEC) {
            @Override
            int getValue(int c) {
                return UScript.getScript(c);
            }
            @Override
            int getMaxValue(int which) {
                int scriptX=getMaxValues(0)&SCRIPT_X_MASK;
                return mergeScriptCodeOrIndex(scriptX);
            }
        },
        new IntProperty(SRC_PROPSVEC) {  // HANGUL_SYLLABLE_TYPE
            @Override
            int getValue(int c) {
                /* see comments on gcbToHst[] above */
                int gcb=(getAdditional(c, 2)&GCB_MASK)>>>GCB_SHIFT;
                if(gcb<gcbToHst.length) {
                    return gcbToHst[gcb];
                } else {
                    return HangulSyllableType.NOT_APPLICABLE;
                }
            }
            @Override
            int getMaxValue(int which) {
                return HangulSyllableType.COUNT-1;
            }
        },
        // max=1=YES -- these are never "maybe", only "no" or "yes"
        new NormQuickCheckIntProperty(SRC_NFC, UProperty.NFD_QUICK_CHECK, 1),
        new NormQuickCheckIntProperty(SRC_NFKC, UProperty.NFKD_QUICK_CHECK, 1),
        // max=2=MAYBE
        new NormQuickCheckIntProperty(SRC_NFC, UProperty.NFC_QUICK_CHECK, 2),
        new NormQuickCheckIntProperty(SRC_NFKC, UProperty.NFKC_QUICK_CHECK, 2),
        new CombiningClassIntProperty(SRC_NFC) {  // LEAD_CANONICAL_COMBINING_CLASS
            @Override
            int getValue(int c) {
                // lead combining class is the upper byte of the FCD16 value
                return Norm2AllModes.getNFCInstance().impl.getFCD16(c)>>8;
            }
        },
        new CombiningClassIntProperty(SRC_NFC) {  // TRAIL_CANONICAL_COMBINING_CLASS
            @Override
            int getValue(int c) {
                // trail combining class is the lower byte of the FCD16 value
                return Norm2AllModes.getNFCInstance().impl.getFCD16(c)&0xff;
            }
        },
        new IntProperty(2, GCB_MASK, GCB_SHIFT),  // GRAPHEME_CLUSTER_BREAK
        new IntProperty(2, SB_MASK, SB_SHIFT),  // SENTENCE_BREAK
        new IntProperty(2, WB_MASK, WB_SHIFT),  // WORD_BREAK
        new BiDiIntProperty() {  // BIDI_PAIRED_BRACKET_TYPE
            @Override
            int getValue(int c) {
                return UBiDiProps.INSTANCE.getPairedBracketType(c);
            }
        },
        // Text layout properties: the value is 0 when the data file has no trie for them.
        new IntProperty(SRC_INPC) {
            @Override
            int getValue(int c) {
                CodePointTrie trie = LayoutProps.INSTANCE.inpcTrie;
                return trie != null ? trie.get(c) : 0;
            }
            @Override
            int getMaxValue(int which) {
                return LayoutProps.INSTANCE.maxInpcValue;
            }
        },
        new IntProperty(SRC_INSC) {
            @Override
            int getValue(int c) {
                CodePointTrie trie = LayoutProps.INSTANCE.inscTrie;
                return trie != null ? trie.get(c) : 0;
            }
            @Override
            int getMaxValue(int which) {
                return LayoutProps.INSTANCE.maxInscValue;
            }
        },
        new IntProperty(SRC_VO) {
            @Override
            int getValue(int c) {
                CodePointTrie trie = LayoutProps.INSTANCE.voTrie;
                return trie != null ? trie.get(c) : 0;
            }
            @Override
            int getMaxValue(int which) {
                return LayoutProps.INSTANCE.maxVoValue;
            }
        },
    };

    /**
     * Gets the value of a binary or int property, or the general category mask.
     * @param c code point
     * @param which UProperty selector
     * @return 1/0 for a binary property, the value of an int property,
     *         the category bit mask for GENERAL_CATEGORY_MASK, or 0 for any other selector
     */
    public int getIntPropertyValue(int c, int which) {
        if(which<UProperty.INT_START) {
            if(UProperty.BINARY_START<=which && which<UProperty.BINARY_LIMIT) {
                return binProps[which].contains(c) ? 1 : 0;
            }
        } else if(which<UProperty.INT_LIMIT) {
            return intProps[which-UProperty.INT_START].getValue(c);
        } else if (which == UProperty.GENERAL_CATEGORY_MASK) {
            return getMask(getType(c));
        }
        return 0; // undefined
    }

    /**
     * Gets the maximum value of a binary or int property.
     * @param which UProperty selector
     * @return 1 for a binary property, the maximum of an int property,
     *         or -1 for any other selector
     */
    public int getIntPropertyMaxValue(int which) {
        if(which<UProperty.INT_START) {
            if(UProperty.BINARY_START<=which && which<UProperty.BINARY_LIMIT) {
                return 1;  // maximum TRUE for all binary properties
            }
        } else if(which<UProperty.INT_LIMIT) {
            return intProps[which-UProperty.INT_START].getMaxValue(which);
        }
        return -1; // undefined
    }

    /**
     * Gets the data source of a property.
     * @param which UProperty selector
     * @return one of the SRC_ constants; SRC_NONE if the selector is not handled here
     */
    final int getSource(int which) {
        if(which<UProperty.BINARY_START) {
            return SRC_NONE; /* undefined */
        } else if(which<UProperty.BINARY_LIMIT) {
            return binProps[which].getSource();
        } else if(which<UProperty.INT_START) {
            return SRC_NONE; /* undefined */
        } else if(which<UProperty.INT_LIMIT) {
            return intProps[which-UProperty.INT_START].getSource();
        } else if(which<UProperty.STRING_START) {
            switch(which) {
            case UProperty.GENERAL_CATEGORY_MASK:
            case UProperty.NUMERIC_VALUE:
                return SRC_CHAR;

            default:
                return SRC_NONE;
            }
        } else if(which<UProperty.STRING_LIMIT) {
            switch(which) {
            case UProperty.AGE:
                return SRC_PROPSVEC;

            case UProperty.BIDI_MIRRORING_GLYPH:
                return SRC_BIDI;

            case UProperty.CASE_FOLDING:
            case UProperty.LOWERCASE_MAPPING:
            case UProperty.SIMPLE_CASE_FOLDING:
            case UProperty.SIMPLE_LOWERCASE_MAPPING:
            case UProperty.SIMPLE_TITLECASE_MAPPING:
            case UProperty.SIMPLE_UPPERCASE_MAPPING:
            case UProperty.TITLECASE_MAPPING:
            case UProperty.UPPERCASE_MAPPING:
                return SRC_CASE;

            case UProperty.ISO_COMMENT:
            case UProperty.NAME:
            case UProperty.UNICODE_1_NAME:
                return SRC_NAMES;

            default:
                return SRC_NONE;
            }
        } else {
            switch(which) {
            case UProperty.SCRIPT_EXTENSIONS:
                return SRC_PROPSVEC;
            default:
                return SRC_NONE; /* undefined */
            }
        }
    }

    /**
     * <p>
     * Unicode property names and property value names are compared
     * "loosely". Property[Value]Aliases.txt say:
     * <quote>
     *   "With loose matching of property names, the case distinctions,
     *    whitespace, and '_' are ignored."
     * </quote>
     * </p>
     * <p>
     * This function does just that, for ASCII (char *) name strings.
     * It is almost identical to ucnv_compareNames() but also ignores
     * ASCII White_Space characters (U+0009..U+000d).
     * </p>
     * @param name1 name to compare
     * @param name2 name to compare
     * @return 0 if names are equal, < 0 if name1 is less than name2 and > 0
     *         if name1 is greater than name2.
     */
    /* to be implemented in 2.4
     * public static int comparePropertyNames(String name1, String name2)
    {
        int result = 0;
        int i1 = 0;
        int i2 = 0;
        while (true) {
            char ch1 = 0;
            char ch2 = 0;
            // Ignore delimiters '-', '_', and ASCII White_Space
            if (i1 < name1.length()) {
                ch1 = name1.charAt(i1 ++);
            }
            while (ch1 == '-' || ch1 == '_' || ch1 == ' ' || ch1 == '\t'
                   || ch1 == '\n' // synwee what is || ch1 == '\v'
                   || ch1 == '\f' || ch1=='\r') {
                if (i1 < name1.length()) {
                    ch1 = name1.charAt(i1 ++);
                }
                else {
                    ch1 = 0;
                }
            }
            if (i2 < name2.length()) {
                ch2 = name2.charAt(i2 ++);
            }
            while (ch2 == '-' || ch2 == '_' || ch2 == ' ' || ch2 == '\t'
                   || ch2 == '\n' // synwee what is || ch1 == '\v'
                   || ch2 == '\f' || ch2=='\r') {
                if (i2 < name2.length()) {
                    ch2 = name2.charAt(i2 ++);
                }
                else {
                    ch2 = 0;
                }
            }

            // If we reach the ends of both strings then they match
            if (ch1 == 0 && ch2 == 0) {
                return 0;
            }

            // Case-insensitive comparison
            if (ch1 != ch2) {
                result = Character.toLowerCase(ch1)
                                                - Character.toLowerCase(ch2);
                if (result != 0) {
                    return result;
                }
            }
        }
    }
    */
    /**
     * Get the maximum values for some enum/int properties.
     * @param column properties-vector column whose maximum-values word is wanted
     * @return the stored maximum-values word for column 0 or 2; 0 for any other column
     */
    public int getMaxValues(int column)
    {
       // return m_maxBlockScriptValue_;

        switch(column) {
        case 0:
            return m_maxBlockScriptValue_;
        case 2:
            return m_maxJTGValue_;
        default:
            return 0;
        }
    }

    /**
     * Gets the type mask
     * @param type character type
     * @return mask with only bit number type set (1 << type)
     */
    public static final int getMask(int type)
    {
        return 1 << type;
    }


    /**
     * Returns the digit values of characters like 'A' - 'Z', normal,
     * half-width and full-width. This method assumes that the other digit
     * characters are checked by the calling method.
     * Lowercase letters yield the same values as their uppercase counterparts
     * (e.g. 10 for both 'A' and 'a').
     * @param ch character to test
     * @return -1 if ch is not a character of the form 'A' - 'Z', otherwise
     *         its corresponding digit will be returned.
     */
    public static int getEuropeanDigit(int ch) {
        // Reject everything outside A-Z (0x41-0x5a), a-z (0x61-0x7a),
        // fullwidth A-Z (0xff21-0xff3a) and fullwidth a-z (0xff41-0xff5a).
        if ((ch > 0x7a && ch < 0xff21)
            || ch < 0x41 || (ch > 0x5a && ch < 0x61)
            || ch > 0xff5a || (ch > 0xff3a && ch < 0xff41)) {
            return -1;
        }
        if (ch <= 0x7a) {
            // ASCII letter: subtract 'A' for 0x41..0x5a, 'a' for 0x61..0x7a
            return ch + 10 - ((ch <= 0x5a) ? 0x41 : 0x61);
        }
        // ch >= 0xff21
        if (ch <= 0xff3a) {
            return ch + 10 - 0xff21;
        }
        // ch >= 0xff41 && ch <= 0xff5a
        return ch + 10 - 0xff41;
    }

    /**
     * Gets the decimal digit value of a code point.
     * @param c code point
     * @return the numeric type value of c minus NTV_DECIMAL_START_ if that
     *         difference is at most 9, otherwise -1
     */
    public int digit(int c) {
        int value = getNumericTypeValue(getProperty(c)) - NTV_DECIMAL_START_;
        // Only an upper bound is checked, so a negative difference is returned unchanged
        // (presumably -1 for a code point without numeric value — TODO confirm against
        // the values of NTV_NONE_ and NTV_DECIMAL_START_).
        if(value<=9) {
            return value;
        } else {
            return -1;
        }
    }

    /**
     * Gets the int numeric value of a code point, treating Latin letters as digits.
     * @param c code point
     * @return the integer value; the getEuropeanDigit() result if c has no numeric
     *         type value; -2 for fractions, reserved values and large values that
     *         fail the range check below
     */
    public int getNumericValue(int c) {
        // slightly pruned version of getUnicodeNumericValue(), plus getEuropeanDigit()
        int ntv = getNumericTypeValue(getProperty(c));

        if(ntv==NTV_NONE_) {
            return getEuropeanDigit(c);
        } else if(ntv<NTV_DIGIT_START_) {
            /* decimal digit */
            return ntv-NTV_DECIMAL_START_;
        } else if(ntv<NTV_NUMERIC_START_) {
            /* other digit */
            return ntv-NTV_DIGIT_START_;
        } else if(ntv<NTV_FRACTION_START_) {
            /* small integer */
            return ntv-NTV_NUMERIC_START_;
        } else if(ntv<NTV_LARGE_START_) {
            /* fraction */
            return -2;
        } else if(ntv<NTV_BASE60_START_) {
            /* large, single-significant-digit integer */
            int mant=(ntv>>5)-14;
            int exp=(ntv&0x1f)+2;
            // mant*10^exp is computed only for exp<9, or exp==9 with mant<=2
            // (keeps the result within int range).
            if(exp<9 || (exp==9 && mant<=2)) {
                int numValue=mant;
                do {
                    numValue*=10;
                } while(--exp>0);
                return numValue;
            } else {
                return -2;
            }
        } else if(ntv<NTV_FRACTION20_START_) {
            /* sexagesimal (base 60) integer */
            int numValue=(ntv>>2)-0xbf;
            int exp=(ntv&3)+1;

            switch(exp) {
            case 4:
                numValue*=60*60*60*60;
                break;
            case 3:
                numValue*=60*60*60;
                break;
            case 2:
                numValue*=60*60;
                break;
            case 1:
                numValue*=60;
                break;
            case 0:
            default:
                break;
            }

            return numValue;
        } else if(ntv<NTV_RESERVED_START_) {
            // fraction-20 e.g. 3/80
            return -2;
        } else {
            /* reserved */
            return -2;
        }
    }

    public double getUnicodeNumericValue(int c) {
        // equivalent to c version double u_getNumericValue(UChar32 c)
        int ntv = getNumericTypeValue(getProperty(c));

        if(ntv==NTV_NONE_) {
            return UCharacter.NO_NUMERIC_VALUE;
        } else if(ntv<NTV_DIGIT_START_) {
            /* decimal digit */
            return ntv-NTV_DECIMAL_START_;
        } else if(ntv<NTV_NUMERIC_START_) {
            /* other digit */
            return ntv-NTV_DIGIT_START_;
        } else if(ntv<NTV_FRACTION_START_) {
            /* small integer */
            return ntv-NTV_NUMERIC_START_;
        } else if(ntv<NTV_LARGE_START_) {
            /* fraction */
            int numerator=(ntv>>4)-12;
            int denominator=(ntv&0xf)+1;
            return (double)numerator/denominator;
        } else if(ntv<NTV_BASE60_START_) {
            /* large, single-significant-digit integer */
            double numValue;
            int mant=(ntv>>5)-14;
            int exp=(ntv&0x1f)+2;
            numValue=mant;

            /* multiply by 10^exp without math.h */
            while(exp>=4) {
                numValue*=10000.;
                exp-=4;
            }
            switch(exp) {
            case 3:
                numValue*=1000.;
                break;
            case 2:
                numValue*=100.;
                break;
            case 1:
                numValue*=10.;
                break;
            case 0:
            default:
                break;
            }

            return numValue;
        } else if(ntv<NTV_FRACTION20_START_) {
            /* sexagesimal (base 60) integer */
            int numValue=(ntv>>2)-0xbf;
            int exp=(ntv&3)+1;

            switch(exp) {
            case 4:
                numValue*=60*60*60*60;
                break;
            case 3:
                numValue*=60*60*60;
                break;
            case 2:
                numValue*=60*60;
                break;
            case 1:
                numValue*=60;
                break;
            case 0:
            default:
                break;
            }

            return numValue;
        } else if(ntv<NTV_FRACTION32_START_) {
            // fraction-20 e.g. 
3/80 1143 int frac20=ntv-NTV_FRACTION20_START_; // 0..0x17 1144 int numerator=2*(frac20&3)+1; 1145 int denominator=20<<(frac20>>2); 1146 return (double)numerator/denominator; 1147 } else if(ntv<NTV_RESERVED_START_) { 1148 // fraction-32 e.g. 3/64 1149 int frac32=ntv-NTV_FRACTION32_START_; // 0..15 1150 int numerator=2*(frac32&3)+1; 1151 int denominator=32<<(frac32>>2); 1152 return (double)numerator/denominator; 1153 } else { 1154 /* reserved */ 1155 return UCharacter.NO_NUMERIC_VALUE; 1156 } 1157 } 1158 1159 // protected variables ----------------------------------------------- 1160 1161 /** 1162 * Extra property trie 1163 */ 1164 Trie2_16 m_additionalTrie_; 1165 /** 1166 * Extra property vectors, 1st column for age and second for binary 1167 * properties. 1168 */ 1169 int m_additionalVectors_[]; 1170 /** 1171 * Number of additional columns 1172 */ 1173 int m_additionalColumnsCount_; 1174 /** 1175 * Maximum values for block, bits used as in vector word 1176 * 0 1177 */ 1178 int m_maxBlockScriptValue_; 1179 /** 1180 * Maximum values for script, bits used as in vector word 1181 * 0 1182 */ 1183 int m_maxJTGValue_; 1184 1185 /** 1186 * Script_Extensions data 1187 */ 1188 public char[] m_scriptExtensions_; 1189 1190 // private variables ------------------------------------------------- 1191 1192 /** 1193 * Default name of the datafile 1194 */ 1195 private static final String DATA_FILE_NAME_ = "uprops.icu"; 1196 1197 // property data constants ------------------------------------------------- 1198 1199 /** 1200 * Numeric types and values in the main properties words. 1201 */ 1202 private static final int NUMERIC_TYPE_VALUE_SHIFT_ = 6; getNumericTypeValue(int props)1203 private static final int getNumericTypeValue(int props) { 1204 return props >> NUMERIC_TYPE_VALUE_SHIFT_; 1205 } 1206 /* constants for the storage form of numeric types and values */ 1207 /** No numeric value. 
*/ 1208 private static final int NTV_NONE_ = 0; 1209 /** Decimal digits: nv=0..9 */ 1210 private static final int NTV_DECIMAL_START_ = 1; 1211 /** Other digits: nv=0..9 */ 1212 private static final int NTV_DIGIT_START_ = 11; 1213 /** Small integers: nv=0..154 */ 1214 private static final int NTV_NUMERIC_START_ = 21; 1215 /** Fractions: ((ntv>>4)-12) / ((ntv&0xf)+1) = -1..17 / 1..16 */ 1216 private static final int NTV_FRACTION_START_ = 0xb0; 1217 /** 1218 * Large integers: 1219 * ((ntv>>5)-14) * 10^((ntv&0x1f)+2) = (1..9)*(10^2..10^33) 1220 * (only one significant decimal digit) 1221 */ 1222 private static final int NTV_LARGE_START_ = 0x1e0; 1223 /** 1224 * Sexagesimal numbers: 1225 * ((ntv>>2)-0xbf) * 60^((ntv&3)+1) = (1..9)*(60^1..60^4) 1226 */ 1227 private static final int NTV_BASE60_START_=0x300; 1228 /** 1229 * Fraction-20 values: 1230 * frac20 = ntv-0x324 = 0..0x17 -> 1|3|5|7 / 20|40|80|160|320|640 1231 * numerator: num = 2*(frac20&3)+1 1232 * denominator: den = 20<<(frac20>>2) 1233 */ 1234 private static final int NTV_FRACTION20_START_ = NTV_BASE60_START_ + 36; // 0x300+9*4=0x324 1235 /** 1236 * Fraction-32 values: 1237 * frac32 = ntv-0x34c = 0..15 -> 1|3|5|7 / 32|64|128|256 1238 * numerator: num = 2*(frac32&3)+1 1239 * denominator: den = 32<<(frac32>>2) 1240 */ 1241 private static final int NTV_FRACTION32_START_ = NTV_FRACTION20_START_ + 24; // 0x324+6*4=0x34c 1242 /** No numeric value (yet). */ 1243 private static final int NTV_RESERVED_START_ = NTV_FRACTION32_START_ + 16; // 0x34c+4*4=0x35c 1244 ntvGetType(int ntv)1245 private static final int ntvGetType(int ntv) { 1246 return 1247 (ntv==NTV_NONE_) ? NumericType.NONE : 1248 (ntv<NTV_DIGIT_START_) ? NumericType.DECIMAL : 1249 (ntv<NTV_NUMERIC_START_) ? 
NumericType.DIGIT : 1250 NumericType.NUMERIC; 1251 } 1252 1253 /* 1254 * Properties in vector word 0 1255 * Bits 1256 * 31..24 DerivedAge version major/minor one nibble each 1257 * 23..22 3..1: Bits 21..20 & 7..0 = Script_Extensions index 1258 * 3: Script value from Script_Extensions 1259 * 2: Script=Inherited 1260 * 1: Script=Common 1261 * 0: Script=bits 21..20 & 7..0 1262 * 21..20 Bits 9..8 of the UScriptCode, or index to Script_Extensions 1263 * 19..17 East Asian Width 1264 * 16.. 8 UBlockCode 1265 * 7.. 0 UScriptCode, or index to Script_Extensions 1266 */ 1267 1268 /** 1269 * Script_Extensions: mask includes Script 1270 */ 1271 public static final int SCRIPT_X_MASK = 0x00f000ff; 1272 //private static final int SCRIPT_X_SHIFT = 22; 1273 1274 // The UScriptCode or Script_Extensions index is split across two bit fields. 1275 // (Starting with Unicode 13/ICU 66/2019 due to more varied Script_Extensions.) 1276 // Shift the high bits right by 12 to assemble the full value. 1277 public static final int SCRIPT_HIGH_MASK = 0x00300000; 1278 public static final int SCRIPT_HIGH_SHIFT = 12; 1279 public static final int MAX_SCRIPT = 0x3ff; 1280 1281 /** 1282 * Integer properties mask and shift values for East Asian cell width. 1283 * Equivalent to icu4c UPROPS_EA_MASK 1284 */ 1285 private static final int EAST_ASIAN_MASK_ = 0x000e0000; 1286 /** 1287 * Integer properties mask and shift values for East Asian cell width. 1288 * Equivalent to icu4c UPROPS_EA_SHIFT 1289 */ 1290 private static final int EAST_ASIAN_SHIFT_ = 17; 1291 /** 1292 * Integer properties mask and shift values for blocks. 1293 * Equivalent to icu4c UPROPS_BLOCK_MASK 1294 */ 1295 private static final int BLOCK_MASK_ = 0x0001ff00; 1296 /** 1297 * Integer properties mask and shift values for blocks. 1298 * Equivalent to icu4c UPROPS_BLOCK_SHIFT 1299 */ 1300 private static final int BLOCK_SHIFT_ = 8; 1301 /** 1302 * Integer properties mask and shift values for scripts. 
1303 * Equivalent to icu4c UPROPS_SHIFT_LOW_MASK. 1304 */ 1305 public static final int SCRIPT_LOW_MASK = 0x000000ff; 1306 1307 /* SCRIPT_X_WITH_COMMON must be the lowest value that involves Script_Extensions. */ 1308 public static final int SCRIPT_X_WITH_COMMON = 0x400000; 1309 public static final int SCRIPT_X_WITH_INHERITED = 0x800000; 1310 public static final int SCRIPT_X_WITH_OTHER = 0xc00000; 1311 mergeScriptCodeOrIndex(int scriptX)1312 public static final int mergeScriptCodeOrIndex(int scriptX) { 1313 return 1314 ((scriptX & SCRIPT_HIGH_MASK) >> SCRIPT_HIGH_SHIFT) | 1315 (scriptX & SCRIPT_LOW_MASK); 1316 } 1317 1318 /** 1319 * Additional properties used in internal trie data 1320 */ 1321 /* 1322 * Properties in vector word 1 1323 * Each bit encodes one binary property. 1324 * The following constants represent the bit number, use 1<<UPROPS_XYZ. 1325 * UPROPS_BINARY_1_TOP<=32! 1326 * 1327 * Keep this list of property enums in sync with 1328 * propListNames[] in icu/source/tools/genprops/props2.c! 1329 * 1330 * ICU 2.6/uprops format version 3.2 stores full properties instead of "Other_". 
1331 */ 1332 private static final int WHITE_SPACE_PROPERTY_ = 0; 1333 private static final int DASH_PROPERTY_ = 1; 1334 private static final int HYPHEN_PROPERTY_ = 2; 1335 private static final int QUOTATION_MARK_PROPERTY_ = 3; 1336 private static final int TERMINAL_PUNCTUATION_PROPERTY_ = 4; 1337 private static final int MATH_PROPERTY_ = 5; 1338 private static final int HEX_DIGIT_PROPERTY_ = 6; 1339 private static final int ASCII_HEX_DIGIT_PROPERTY_ = 7; 1340 private static final int ALPHABETIC_PROPERTY_ = 8; 1341 private static final int IDEOGRAPHIC_PROPERTY_ = 9; 1342 private static final int DIACRITIC_PROPERTY_ = 10; 1343 private static final int EXTENDER_PROPERTY_ = 11; 1344 private static final int NONCHARACTER_CODE_POINT_PROPERTY_ = 12; 1345 private static final int GRAPHEME_EXTEND_PROPERTY_ = 13; 1346 private static final int GRAPHEME_LINK_PROPERTY_ = 14; 1347 private static final int IDS_BINARY_OPERATOR_PROPERTY_ = 15; 1348 private static final int IDS_TRINARY_OPERATOR_PROPERTY_ = 16; 1349 private static final int RADICAL_PROPERTY_ = 17; 1350 private static final int UNIFIED_IDEOGRAPH_PROPERTY_ = 18; 1351 private static final int DEFAULT_IGNORABLE_CODE_POINT_PROPERTY_ = 19; 1352 private static final int DEPRECATED_PROPERTY_ = 20; 1353 private static final int LOGICAL_ORDER_EXCEPTION_PROPERTY_ = 21; 1354 private static final int XID_START_PROPERTY_ = 22; 1355 private static final int XID_CONTINUE_PROPERTY_ = 23; 1356 private static final int ID_START_PROPERTY_ = 24; 1357 private static final int ID_CONTINUE_PROPERTY_ = 25; 1358 private static final int GRAPHEME_BASE_PROPERTY_ = 26; 1359 private static final int S_TERM_PROPERTY_ = 27; 1360 private static final int VARIATION_SELECTOR_PROPERTY_ = 28; 1361 private static final int PATTERN_SYNTAX = 29; /* new in ICU 3.4 and Unicode 4.1 */ 1362 private static final int PATTERN_WHITE_SPACE = 30; 1363 private static final int PREPENDED_CONCATENATION_MARK = 31; // new in ICU 60 and Unicode 10 1364 1365 /* 1366 * 
Properties in vector word 2 1367 * Bits 1368 * 31..26 http://www.unicode.org/reports/tr51/#Emoji_Properties 1369 * 25..20 Line Break 1370 * 19..15 Sentence Break 1371 * 14..10 Word Break 1372 * 9.. 5 Grapheme Cluster Break 1373 * 4.. 0 Decomposition Type 1374 */ 1375 private static final int PROPS_2_EXTENDED_PICTOGRAPHIC=26; 1376 private static final int PROPS_2_EMOJI_COMPONENT = 27; 1377 private static final int PROPS_2_EMOJI = 28; 1378 private static final int PROPS_2_EMOJI_PRESENTATION = 29; 1379 private static final int PROPS_2_EMOJI_MODIFIER = 30; 1380 private static final int PROPS_2_EMOJI_MODIFIER_BASE = 31; 1381 1382 private static final int LB_MASK = 0x03f00000; 1383 private static final int LB_SHIFT = 20; 1384 1385 private static final int SB_MASK = 0x000f8000; 1386 private static final int SB_SHIFT = 15; 1387 1388 private static final int WB_MASK = 0x00007c00; 1389 private static final int WB_SHIFT = 10; 1390 1391 private static final int GCB_MASK = 0x000003e0; 1392 private static final int GCB_SHIFT = 5; 1393 1394 /** 1395 * Integer properties mask for decomposition type. 1396 * Equivalent to icu4c UPROPS_DT_MASK. 
/**
 * Constructor.
 * Loads the data file named by DATA_FILE_NAME_ and fills in the Unicode
 * version, the main properties trie, the additional-properties trie and
 * vectors (only when the column count is positive) and the
 * Script_Extensions array (only when that section is non-empty).
 * The buffer is consumed strictly sequentially, so the order of the reads
 * and skips below must not be changed.
 * @exception IOException thrown when data reading fails or data corrupted
 *            (in particular when a trie is longer than the space the index
 *            array reserves for it)
 */
private UCharacterProperty() throws IOException
{
    // consistency check: the binProps/intProps tables (declared outside this
    // chunk) must have exactly one entry per binary/int property constant
    if(binProps.length!=UProperty.BINARY_LIMIT) {
        throw new ICUException("binProps.length!=UProperty.BINARY_LIMIT");
    }
    if(intProps.length!=(UProperty.INT_LIMIT-UProperty.INT_START)) {
        throw new ICUException("intProps.length!=(UProperty.INT_LIMIT-UProperty.INT_START)");
    }

    // jar access
    ByteBuffer bytes=ICUBinary.getRequiredData(DATA_FILE_NAME_);
    // presumably validates the header against DATA_FORMAT and IsAcceptable;
    // the returned version is kept as the Unicode version -- TODO confirm
    m_unicodeVersion_ = ICUBinary.readHeaderAndDataVersion(bytes, DATA_FORMAT, new IsAcceptable());
    // Read or skip the 16 indexes.
    // The first 12 are consumed one int at a time (unused ones are read and
    // discarded); the remaining 4 are skipped in one step below.
    int propertyOffset = bytes.getInt();
    /* exceptionOffset = */ bytes.getInt();
    /* caseOffset = */ bytes.getInt();
    int additionalOffset = bytes.getInt();
    int additionalVectorsOffset = bytes.getInt();
    m_additionalColumnsCount_ = bytes.getInt();
    int scriptExtensionsOffset = bytes.getInt();
    int reservedOffset7 = bytes.getInt();
    /* reservedOffset8 = */ bytes.getInt();
    /* dataTopOffset = */ bytes.getInt();
    m_maxBlockScriptValue_ = bytes.getInt();
    m_maxJTGValue_ = bytes.getInt();
    // (16 - 12) ints * 4 bytes each = 16 bytes for the indexes not read above
    ICUBinary.skipBytes(bytes, (16 - 12) << 2);

    // read the main properties trie
    m_trie_ = Trie2_16.createFromSerialized(bytes);
    // offsets are multiplied by 4 to obtain byte counts, so they appear to be
    // in 32-bit units; the 16 subtracted here matches the 16 index ints that
    // precede the trie -- TODO confirm against the data format description
    int expectedTrieLength = (propertyOffset - 16) * 4;
    int trieLength = m_trie_.getSerializedLength();
    if(trieLength > expectedTrieLength) {
        throw new IOException("uprops.icu: not enough bytes for main trie");
    }
    // skip padding after trie bytes
    ICUBinary.skipBytes(bytes, expectedTrieLength - trieLength);

    // skip unused intervening data structures
    ICUBinary.skipBytes(bytes, (additionalOffset - propertyOffset) * 4);

    // NOTE(review): when m_additionalColumnsCount_ is 0 nothing between
    // additionalOffset and scriptExtensionsOffset is consumed or skipped
    // before the Script_Extensions read below -- confirm that the data
    // never has zero columns, or that the section is empty in that case.
    if(m_additionalColumnsCount_ > 0) {
        // reads the additional property block
        m_additionalTrie_ = Trie2_16.createFromSerialized(bytes);
        expectedTrieLength = (additionalVectorsOffset-additionalOffset)*4;
        trieLength = m_additionalTrie_.getSerializedLength();
        if(trieLength > expectedTrieLength) {
            throw new IOException("uprops.icu: not enough bytes for additional-properties trie");
        }
        // skip padding after trie bytes
        ICUBinary.skipBytes(bytes, expectedTrieLength - trieLength);

        // additional properties: the vectors fill the space up to the start
        // of the Script_Extensions section
        int size = scriptExtensionsOffset - additionalVectorsOffset;
        m_additionalVectors_ = ICUBinary.getInts(bytes, size, 0);
    }

    // Script_Extensions
    // the offset difference is doubled to get a count of 16-bit chars
    int numChars = (reservedOffset7 - scriptExtensionsOffset) * 2;
    if(numChars > 0) {
        m_scriptExtensions_ = ICUBinary.getChars(bytes, numChars, 0);
    }
}
0xff26; 1532 private static final int U_FW_Z = 0xff3a; 1533 private static final int U_FW_a = 0xff41; 1534 private static final int U_FW_f = 0xff46; 1535 private static final int U_FW_z = 0xff5a; 1536 private static final int ZWNBSP = 0xfeff; 1537 addPropertyStarts(UnicodeSet set)1538 public UnicodeSet addPropertyStarts(UnicodeSet set) { 1539 /* add the start code point of each same-value range of the main trie */ 1540 Iterator<Trie2.Range> trieIterator = m_trie_.iterator(); 1541 Trie2.Range range; 1542 while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) { 1543 set.add(range.startCodePoint); 1544 } 1545 1546 /* add code points with hardcoded properties, plus the ones following them */ 1547 1548 /* add for u_isblank() */ 1549 set.add(TAB); 1550 set.add(TAB+1); 1551 1552 /* add for IS_THAT_CONTROL_SPACE() */ 1553 set.add(CR+1); /* range TAB..CR */ 1554 set.add(0x1c); 1555 set.add(0x1f+1); 1556 set.add(NL); 1557 set.add(NL+1); 1558 1559 /* add for u_isIDIgnorable() what was not added above */ 1560 set.add(DEL); /* range DEL..NBSP-1, NBSP added below */ 1561 set.add(HAIRSP); 1562 set.add(RLM+1); 1563 set.add(INHSWAP); 1564 set.add(NOMDIG+1); 1565 set.add(ZWNBSP); 1566 set.add(ZWNBSP+1); 1567 1568 /* add no-break spaces for u_isWhitespace() what was not added above */ 1569 set.add(NBSP); 1570 set.add(NBSP+1); 1571 set.add(FIGURESP); 1572 set.add(FIGURESP+1); 1573 set.add(NNBSP); 1574 set.add(NNBSP+1); 1575 1576 /* add for u_charDigitValue() */ 1577 // TODO remove when UCharacter.getHanNumericValue() is changed to just return 1578 // Unicode numeric values 1579 set.add(0x3007); 1580 set.add(0x3008); 1581 set.add(0x4e00); 1582 set.add(0x4e01); 1583 set.add(0x4e8c); 1584 set.add(0x4e8d); 1585 set.add(0x4e09); 1586 set.add(0x4e0a); 1587 set.add(0x56db); 1588 set.add(0x56dc); 1589 set.add(0x4e94); 1590 set.add(0x4e95); 1591 set.add(0x516d); 1592 set.add(0x516e); 1593 set.add(0x4e03); 1594 set.add(0x4e04); 1595 set.add(0x516b); 1596 set.add(0x516c); 
// This static initializer block must be placed after
// other static member initialization
// It creates the singleton. A failure to read the data file is reported as
// a MissingResourceException with the same message, class name and key as
// before, but the original IOException is now attached as its cause so that
// the underlying I/O failure is not lost from the stack trace.
static {
    try {
        INSTANCE = new UCharacterProperty();
    }
    catch (IOException e) {
        // MissingResourceException has no public constructor taking a cause,
        // so the cause is attached with initCause() before throwing.
        MissingResourceException missing =
                new MissingResourceException(e.getMessage(), "", "");
        missing.initCause(e);
        throw missing;
    }
}
Return a set of characters for property enumeration. 1663 * The set implicitly contains 0x110000 as well, which is one more than the highest 1664 * Unicode code point. 1665 * 1666 * This set is used as an ordered list - its code points are ordered, and 1667 * consecutive code points (in Unicode code point order) in the set define a range. 1668 * For each two consecutive characters (start, limit) in the set, 1669 * all of the UCD/normalization and related properties for 1670 * all code points start..limit-1 are all the same, 1671 * except for character names and ISO comments. 1672 * 1673 * All Unicode code points U+0000..U+10ffff are covered by these ranges. 1674 * The ranges define a partition of the Unicode code space. 1675 * ICU uses the inclusions set to enumerate properties for generating 1676 * UnicodeSets containing all code points that have a certain property value. 1677 * 1678 * The Inclusion List is generated from the UCD. It is generated 1679 * by enumerating the data tries, and code points for hardcoded properties 1680 * are added as well. 1681 * 1682 * -------------------------------------------------------------------------- 1683 * 1684 * The following are ideas for getting properties-unique code point ranges, 1685 * with possible optimizations beyond the current implementation. 1686 * These optimizations would require more code and be more fragile. 1687 * The current implementation generates one single list (set) for all properties. 1688 * 1689 * To enumerate properties efficiently, one needs to know ranges of 1690 * repetitive values, so that the value of only each start code point 1691 * can be applied to the whole range. 1692 * This information is in principle available in the uprops.icu/unorm.icu data. 1693 * 1694 * There are two obstacles: 1695 * 1696 * 1. Some properties are computed from multiple data structures, 1697 * making it necessary to get repetitive ranges by intersecting 1698 * ranges from multiple tries. 1699 * 1700 * 2. 
It is not economical to write code for getting repetitive ranges 1701 * that are precise for each of some 50 properties. 1702 * 1703 * Compromise ideas: 1704 * 1705 * - Get ranges per trie, not per individual property. 1706 * Each range contains the same values for a whole group of properties. 1707 * This would generate currently five range sets, two for uprops.icu tries 1708 * and three for unorm.icu tries. 1709 * 1710 * - Combine sets of ranges for multiple tries to get sufficient sets 1711 * for properties, e.g., the uprops.icu main and auxiliary tries 1712 * for all non-normalization properties. 1713 * 1714 * Ideas for representing ranges and combining them: 1715 * 1716 * - A UnicodeSet could hold just the start code points of ranges. 1717 * Multiple sets are easily combined by or-ing them together. 1718 * 1719 * - Alternatively, a UnicodeSet could hold each even-numbered range. 1720 * All ranges could be enumerated by using each start code point 1721 * (for the even-numbered ranges) as well as each limit (end+1) code point 1722 * (for the odd-numbered ranges). 1723 * It should be possible to combine two such sets by xor-ing them, 1724 * but no more than two. 1725 * 1726 * The second way to represent ranges may(?!) yield smaller UnicodeSet arrays, 1727 * but the first one is certainly simpler and applicable for combining more than 1728 * two range sets. 1729 * 1730 * It is possible to combine all range sets for all uprops/unorm tries into one 1731 * set that can be used for all properties. 1732 * As an optimization, there could be less-combined range sets for certain 1733 * groups of properties. 1734 * The relationship of which less-combined range set to use for which property 1735 * depends on the implementation of the properties and must be hardcoded 1736 * - somewhat error-prone and higher maintenance but can be tested easily 1737 * by building property sets "the simple way" in test code. 
1738 * 1739 * --- 1740 * 1741 * Do not use a UnicodeSet pattern because that causes infinite recursion; 1742 * UnicodeSet depends on the inclusions set. 1743 * 1744 * --- 1745 * 1746 * getInclusions() is commented out starting 2005-feb-12 because 1747 * UnicodeSet now calls the uxyz_addPropertyStarts() directly, 1748 * and only for the relevant property source. 1749 */ 1750 /* 1751 public UnicodeSet getInclusions() { 1752 UnicodeSet set = new UnicodeSet(); 1753 NormalizerImpl.addPropertyStarts(set); 1754 addPropertyStarts(set); 1755 return set; 1756 } 1757 */ 1758 } 1759