1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * 6 * Copyright (C) 2002-2016, International Business Machines 7 * Corporation and others. All Rights Reserved. 8 * 9 ******************************************************************************* 10 * file name: uprops.h 11 * encoding: UTF-8 12 * tab size: 8 (not used) 13 * indentation:4 14 * 15 * created on: 2002feb24 16 * created by: Markus W. Scherer 17 * 18 * Constants for mostly non-core Unicode character properties 19 * stored in uprops.icu. 20 */ 21 22 #ifndef __UPROPS_H__ 23 #define __UPROPS_H__ 24 25 #include "unicode/utypes.h" 26 #include "unicode/uset.h" 27 #include "uset_imp.h" 28 #include "udataswp.h" 29 30 /* indexes[] entries */ 31 enum { 32 UPROPS_PROPS32_INDEX, 33 UPROPS_EXCEPTIONS_INDEX, 34 UPROPS_EXCEPTIONS_TOP_INDEX, 35 36 UPROPS_ADDITIONAL_TRIE_INDEX, 37 UPROPS_ADDITIONAL_VECTORS_INDEX, 38 UPROPS_ADDITIONAL_VECTORS_COLUMNS_INDEX, 39 40 UPROPS_SCRIPT_EXTENSIONS_INDEX, 41 42 UPROPS_BLOCK_TRIE_INDEX, 43 UPROPS_RESERVED_INDEX_8, 44 45 /** size of the data file (number of 32-bit units after the header) */ 46 UPROPS_DATA_TOP_INDEX, 47 48 /** maximum values for code values in vector word 0 */ 49 UPROPS_MAX_VALUES_INDEX=10, 50 /** maximum values for code values in vector word 2 */ 51 UPROPS_MAX_VALUES_2_INDEX, 52 /** maximum values for other code values */ 53 UPROPS_MAX_VALUES_OTHER_INDEX, 54 55 UPROPS_INDEX_COUNT=16 56 }; 57 58 /* definitions for the main properties words */ 59 enum { 60 /* general category shift==0 0 (5 bits) */ 61 /* reserved 5 (1 bit) */ 62 UPROPS_NUMERIC_TYPE_VALUE_SHIFT=6 /* 6 (10 bits) */ 63 }; 64 65 #define GET_CATEGORY(props) ((props)&0x1f) 66 #define CAT_MASK(props) U_MASK(GET_CATEGORY(props)) 67 68 #define GET_NUMERIC_TYPE_VALUE(props) ((props)>>UPROPS_NUMERIC_TYPE_VALUE_SHIFT) 69 70 /* constants for the storage form of numeric types and values */ 71 enum { 72 /** No numeric value. */ 73 UPROPS_NTV_NONE=0, 74 /** Decimal digits: nv=0..9 */ 75 UPROPS_NTV_DECIMAL_START=1, 76 /** Other digits: nv=0..9 */ 77 UPROPS_NTV_DIGIT_START=11, 78 /** Small integers: nv=0..154 */ 79 UPROPS_NTV_NUMERIC_START=21, 80 /** Fractions: ((ntv>>4)-12) / ((ntv&0xf)+1) = -1..17 / 1..16 */ 81 UPROPS_NTV_FRACTION_START=0xb0, 82 /** 83 * Large integers: 84 * ((ntv>>5)-14) * 10^((ntv&0x1f)+2) = (1..9)*(10^2..10^33) 85 * (only one significant decimal digit) 86 */ 87 UPROPS_NTV_LARGE_START=0x1e0, 88 /** 89 * Sexagesimal numbers: 90 * ((ntv>>2)-0xbf) * 60^((ntv&3)+1) = (1..9)*(60^1..60^4) 91 */ 92 UPROPS_NTV_BASE60_START=0x300, 93 /** 94 * Fraction-20 values: 95 * frac20 = ntv-0x324 = 0..0x17 -> 1|3|5|7 / 20|40|80|160|320|640 96 * numerator: num = 2*(frac20&3)+1 97 * denominator: den = 20<<(frac20>>2) 98 */ 99 UPROPS_NTV_FRACTION20_START=UPROPS_NTV_BASE60_START+36, // 0x300+9*4=0x324 100 /** 101 * Fraction-32 values: 102 * frac32 = ntv-0x34c = 0..15 -> 1|3|5|7 / 32|64|128|256 103 * numerator: num = 2*(frac32&3)+1 104 * denominator: den = 32<<(frac32>>2) 105 */ 106 UPROPS_NTV_FRACTION32_START=UPROPS_NTV_FRACTION20_START+24, // 0x324+6*4=0x34c 107 /** No numeric value (yet). */ 108 UPROPS_NTV_RESERVED_START=UPROPS_NTV_FRACTION32_START+16, // 0x34c+4*4=0x35c 109 110 UPROPS_NTV_MAX_SMALL_INT=UPROPS_NTV_FRACTION_START-UPROPS_NTV_NUMERIC_START-1 111 }; 112 113 #define UPROPS_NTV_GET_TYPE(ntv) \ 114 ((ntv==UPROPS_NTV_NONE) ? U_NT_NONE : \ 115 (ntv<UPROPS_NTV_DIGIT_START) ? U_NT_DECIMAL : \ 116 (ntv<UPROPS_NTV_NUMERIC_START) ? U_NT_DIGIT : \ 117 U_NT_NUMERIC) 118 119 /* number of properties vector words */ 120 #define UPROPS_VECTOR_WORDS 3 121 122 #ifdef __cplusplus 123 124 namespace { 125 126 // Properties in vector word 0 127 // Bits 128 // 31..26 Age major version (major=0..63) 129 // 25..24 Age minor version (minor=0..3) 130 // 23..17 reserved 131 // 16..15 Indic Conjunct Break 132 // 14..12 East Asian Width 133 // 11..10 3..1: Bits 9..0 = Script_Extensions index 134 // 3: Script value from Script_Extensions 135 // 2: Script=Inherited 136 // 1: Script=Common 137 // 0: Script=bits 9..0 138 // 9.. 0 UScriptCode, or index to Script_Extensions 139 140 // *Note*: If we need more than the available bits for new properties, 141 // then we could move the Age property out of the properties vectors. 142 // For example, we could store the Age property in its own trie. 143 // In a small, 8-bit-value-width CodePointTrie, it would be larger than 144 // the amount of data that we would save in the properties vectors and their trie, 145 // but the size increase would be a small percentage of the total uprops.icu size. 146 // It would certainly be a much smaller increase than widening the properties vectors. 147 // The savings in the properties vectors+trie from pulling out the Age property 148 // are partly from mediocre correlation between Age and other property values. 149 // (Adding new characters to existing scripts tends to split property vectors where 150 // new characters are similar to old ones.) 151 // See https://github.com/unicode-org/icu/pull/3025 for details. 152 153 inline constexpr uint32_t UPROPS_AGE_MASK = 0xff000000; 154 inline constexpr int32_t UPROPS_AGE_SHIFT = 24; 155 156 inline constexpr uint8_t UPROPS_AGE_MAJOR_MAX = 63; 157 inline constexpr uint8_t UPROPS_AGE_MINOR_MAX = 3; 158 159 inline constexpr uint32_t UPROPS_EA_MASK = 0x00007000; 160 inline constexpr int32_t UPROPS_EA_SHIFT = 12; 161 162 inline constexpr uint32_t UPROPS_INCB_MASK = 0x00018000; 163 inline constexpr int32_t UPROPS_INCB_SHIFT = 15; 164 165 /** Script_Extensions: mask includes Script */ 166 inline constexpr uint32_t UPROPS_SCRIPT_X_MASK = 0x00000fff; 167 168 // UPROPS_SCRIPT_X_WITH_COMMON must be the lowest value that involves Script_Extensions. 169 inline constexpr uint32_t UPROPS_SCRIPT_X_WITH_OTHER = 0xc00; 170 inline constexpr uint32_t UPROPS_SCRIPT_X_WITH_INHERITED = 0x800; 171 inline constexpr uint32_t UPROPS_SCRIPT_X_WITH_COMMON = 0x400; 172 inline constexpr int32_t UPROPS_MAX_SCRIPT = 0x3ff; 173 174 /* 175 * Properties in vector word 1 176 * Each bit encodes one binary property. 177 * The following constants represent the bit number, use 1<<UPROPS_XYZ. 178 * UPROPS_BINARY_1_TOP<=32! 179 * 180 * Keep this list of property enums in sync with 181 * propListNames[] in icu/source/tools/genprops/props2.c! 182 * 183 * ICU 2.6/uprops format version 3.2 stores full properties instead of "Other_". 184 */ 185 enum { 186 UPROPS_WHITE_SPACE, 187 UPROPS_DASH, 188 UPROPS_HYPHEN, 189 UPROPS_QUOTATION_MARK, 190 UPROPS_TERMINAL_PUNCTUATION, 191 UPROPS_MATH, 192 UPROPS_HEX_DIGIT, 193 UPROPS_ASCII_HEX_DIGIT, 194 UPROPS_ALPHABETIC, 195 UPROPS_IDEOGRAPHIC, 196 UPROPS_DIACRITIC, 197 UPROPS_EXTENDER, 198 UPROPS_NONCHARACTER_CODE_POINT, 199 UPROPS_GRAPHEME_EXTEND, 200 UPROPS_GRAPHEME_LINK, 201 UPROPS_IDS_BINARY_OPERATOR, 202 UPROPS_IDS_TRINARY_OPERATOR, 203 UPROPS_RADICAL, 204 UPROPS_UNIFIED_IDEOGRAPH, 205 UPROPS_DEFAULT_IGNORABLE_CODE_POINT, 206 UPROPS_DEPRECATED, 207 UPROPS_LOGICAL_ORDER_EXCEPTION, 208 UPROPS_XID_START, 209 UPROPS_XID_CONTINUE, 210 UPROPS_ID_START, /* ICU 2.6, uprops format version 3.2 */ 211 UPROPS_ID_CONTINUE, 212 UPROPS_GRAPHEME_BASE, 213 UPROPS_S_TERM, /* new in ICU 3.0 and Unicode 4.0.1 */ 214 UPROPS_VARIATION_SELECTOR, 215 UPROPS_PATTERN_SYNTAX, /* new in ICU 3.4 and Unicode 4.1 */ 216 UPROPS_PATTERN_WHITE_SPACE, 217 UPROPS_PREPENDED_CONCATENATION_MARK, // new in ICU 60 and Unicode 10 218 UPROPS_BINARY_1_TOP /* ==32 - full! */ 219 }; 220 221 /* 222 * Properties in vector word 2 223 * Bits 224 * 31..26 ICU 75: Identifier_Type bit set 225 * ICU 70..74: unused 226 * ICU 57..69: emoji properties; moved to uemoji.icu in ICU 70 227 * 25..20 Line Break 228 * 19..15 Sentence Break 229 * 14..10 Word Break 230 * 9.. 5 Grapheme Cluster Break 231 * 4.. 0 Decomposition Type 232 */ 233 234 // https://www.unicode.org/reports/tr39/#Identifier_Status_and_Type 235 // The Identifier_Type maps each code point to a *set* of one or more values. 236 // Some can be combined with others, some can only occur alone. 237 // Exclusion & Limited_Use are combinable bits, but cannot occur together. 238 // We use this forbidden combination for enumerated values. 239 // We use 6 bits for all possible combinations. 240 // If more combinable values are added, then we need to use more bits. 241 // 242 // We do not store separate data for Identifier_Status: 243 // We can derive that from the encoded Identifier_Type via a simple range check. 244 245 inline constexpr uint32_t UPROPS_2_ID_TYPE_MASK = 0xfc000000; 246 inline constexpr int32_t UPROPS_2_ID_TYPE_SHIFT = 26; 247 248 enum { 249 // A high bit for use in idTypeToEncoded[] but not used in the data 250 UPROPS_ID_TYPE_BIT = 0x80, 251 252 // Combinable bits 253 UPROPS_ID_TYPE_EXCLUSION = 0x20, 254 UPROPS_ID_TYPE_LIMITED_USE = 0x10, 255 UPROPS_ID_TYPE_UNCOMMON_USE = 8, 256 UPROPS_ID_TYPE_TECHNICAL = 4, 257 UPROPS_ID_TYPE_OBSOLETE = 2, 258 UPROPS_ID_TYPE_NOT_XID = 1, 259 260 // Exclusive values 261 UPROPS_ID_TYPE_NOT_CHARACTER = 0, 262 263 // Forbidden bit combination used for enumerating other exclusive values 264 UPROPS_ID_TYPE_FORBIDDEN = UPROPS_ID_TYPE_EXCLUSION | UPROPS_ID_TYPE_LIMITED_USE, // 0x30 265 UPROPS_ID_TYPE_DEPRECATED = UPROPS_ID_TYPE_FORBIDDEN, // 0x30 266 UPROPS_ID_TYPE_DEFAULT_IGNORABLE, // 0x31 267 UPROPS_ID_TYPE_NOT_NFKC, // 0x32 268 269 UPROPS_ID_TYPE_ALLOWED_MIN = UPROPS_ID_TYPE_FORBIDDEN + 0xc, // 0x3c 270 UPROPS_ID_TYPE_INCLUSION = UPROPS_ID_TYPE_FORBIDDEN + 0xe, // 0x3e 271 UPROPS_ID_TYPE_RECOMMENDED = UPROPS_ID_TYPE_FORBIDDEN + 0xf, // 0x3f 272 }; 273 274 /** 275 * Maps UIdentifierType to encoded bits. 276 * When UPROPS_ID_TYPE_BIT is set, then use "&" to test whether the value bit is set. 277 * When UPROPS_ID_TYPE_BIT is not set, then compare ("==") the array value with the data value. 278 */ 279 inline constexpr uint8_t uprops_idTypeToEncoded[] = { 280 UPROPS_ID_TYPE_NOT_CHARACTER, 281 UPROPS_ID_TYPE_DEPRECATED, 282 UPROPS_ID_TYPE_DEFAULT_IGNORABLE, 283 UPROPS_ID_TYPE_NOT_NFKC, 284 UPROPS_ID_TYPE_BIT | UPROPS_ID_TYPE_NOT_XID, 285 UPROPS_ID_TYPE_BIT | UPROPS_ID_TYPE_EXCLUSION, 286 UPROPS_ID_TYPE_BIT | UPROPS_ID_TYPE_OBSOLETE, 287 UPROPS_ID_TYPE_BIT | UPROPS_ID_TYPE_TECHNICAL, 288 UPROPS_ID_TYPE_BIT | UPROPS_ID_TYPE_UNCOMMON_USE, 289 UPROPS_ID_TYPE_BIT | UPROPS_ID_TYPE_LIMITED_USE, 290 UPROPS_ID_TYPE_INCLUSION, 291 UPROPS_ID_TYPE_RECOMMENDED 292 }; 293 294 } // namespace 295 296 #endif // __cplusplus 297 298 #define UPROPS_LB_MASK 0x03f00000 299 #define UPROPS_LB_SHIFT 20 300 301 #define UPROPS_SB_MASK 0x000f8000 302 #define UPROPS_SB_SHIFT 15 303 304 #define UPROPS_WB_MASK 0x00007c00 305 #define UPROPS_WB_SHIFT 10 306 307 #define UPROPS_GCB_MASK 0x000003e0 308 #define UPROPS_GCB_SHIFT 5 309 310 #define UPROPS_DT_MASK 0x0000001f 311 312 #ifdef __cplusplus 313 314 namespace { 315 316 // Bits 9..0 in UPROPS_MAX_VALUES_OTHER_INDEX 317 inline constexpr uint32_t UPROPS_MAX_BLOCK = 0x3ff; 318 319 } // namespace 320 321 #endif // __cplusplus 322 323 /** 324 * Gets the main properties value for a code point. 325 * Implemented in uchar.c for uprops.cpp. 326 */ 327 U_CFUNC uint32_t 328 u_getMainProperties(UChar32 c); 329 330 /** 331 * Get a properties vector word for a code point. 332 * Implemented in uchar.c for uprops.cpp. 333 * @return 0 if no data or illegal argument 334 */ 335 U_CFUNC uint32_t 336 u_getUnicodeProperties(UChar32 c, int32_t column); 337 338 /** 339 * Get the the maximum values for some enum/int properties. 340 * Use the same column numbers as for u_getUnicodeProperties(). 341 * The returned value will contain maximum values stored in the same bit fields 342 * as where the enum values are stored in the u_getUnicodeProperties() 343 * return values for the same columns. 344 * 345 * Valid columns are those for properties words that contain enumerated values. 346 * (ICU 2.6: columns 0 and 2) 347 * For other column numbers, this function will return 0. 348 * 349 * @internal 350 */ 351 U_CFUNC int32_t 352 uprv_getMaxValues(int32_t column); 353 354 /** 355 * Checks if c is alphabetic, or a decimal digit; implements UCHAR_POSIX_ALNUM. 356 * @internal 357 */ 358 U_CFUNC UBool 359 u_isalnumPOSIX(UChar32 c); 360 361 /** 362 * Checks if c is in 363 * [^\p{space}\p{gc=Control}\p{gc=Surrogate}\p{gc=Unassigned}] 364 * with space=\p{Whitespace} and Control=Cc. 365 * Implements UCHAR_POSIX_GRAPH. 366 * @internal 367 */ 368 U_CFUNC UBool 369 u_isgraphPOSIX(UChar32 c); 370 371 /** 372 * Checks if c is in \p{graph}\p{blank} - \p{cntrl}. 373 * Implements UCHAR_POSIX_PRINT. 374 * @internal 375 */ 376 U_CFUNC UBool 377 u_isprintPOSIX(UChar32 c); 378 379 /** Some code points. @internal */ 380 enum { 381 TAB =0x0009, 382 LF =0x000a, 383 FF =0x000c, 384 CR =0x000d, 385 NBSP =0x00a0, 386 CGJ =0x034f, 387 FIGURESP=0x2007, 388 HAIRSP =0x200a, 389 ZWNJ =0x200c, 390 ZWJ =0x200d, 391 RLM =0x200f, 392 NNBSP =0x202f, 393 ZWNBSP =0xfeff 394 }; 395 396 // TODO: Move these two functions into a different header file (new unames.h?) so that uprops.h 397 // need not be C-compatible any more. 398 /** 399 * Get the maximum length of a (regular/1.0/extended) character name. 400 * @return 0 if no character names available. 401 */ 402 U_CAPI int32_t U_EXPORT2 403 uprv_getMaxCharNameLength(void); 404 405 /** 406 * Fills set with characters that are used in Unicode character names. 407 * Includes all characters that are used in regular/Unicode 1.0/extended names. 408 * Just empties the set if no character names are available. 409 * @param sa USetAdder to receive characters. 410 */ 411 U_CAPI void U_EXPORT2 412 uprv_getCharNameCharacters(const USetAdder *sa); 413 414 /** 415 * Constants for which data and implementation files provide which properties. 416 * Used by UnicodeSet for service-specific property enumeration. 417 * @internal 418 */ 419 enum UPropertySource { 420 /** No source, not a supported property. */ 421 UPROPS_SRC_NONE, 422 /** From uchar.c/uprops.icu main trie */ 423 UPROPS_SRC_CHAR, 424 /** From uchar.c/uprops.icu properties vectors trie */ 425 UPROPS_SRC_PROPSVEC, 426 /** From unames.c/unames.icu */ 427 UPROPS_SRC_NAMES, 428 /** From ucase.c/ucase.icu */ 429 UPROPS_SRC_CASE, 430 /** From ubidi_props.c/ubidi.icu */ 431 UPROPS_SRC_BIDI, 432 /** From uchar.c/uprops.icu main trie as well as properties vectors trie */ 433 UPROPS_SRC_CHAR_AND_PROPSVEC, 434 /** From ucase.c/ucase.icu as well as unorm.cpp/unorm.icu */ 435 UPROPS_SRC_CASE_AND_NORM, 436 /** From normalizer2impl.cpp/nfc.nrm */ 437 UPROPS_SRC_NFC, 438 /** From normalizer2impl.cpp/nfkc.nrm */ 439 UPROPS_SRC_NFKC, 440 /** From normalizer2impl.cpp/nfkc_cf.nrm */ 441 UPROPS_SRC_NFKC_CF, 442 /** From normalizer2impl.cpp/nfc.nrm canonical iterator data */ 443 UPROPS_SRC_NFC_CANON_ITER, 444 // Text layout properties. 445 UPROPS_SRC_INPC, 446 UPROPS_SRC_INSC, 447 UPROPS_SRC_VO, 448 UPROPS_SRC_EMOJI, 449 UPROPS_SRC_IDSU, 450 UPROPS_SRC_ID_COMPAT_MATH, 451 UPROPS_SRC_BLOCK, 452 UPROPS_SRC_MCM, 453 /** One more than the highest UPropertySource (UPROPS_SRC_) constant. */ 454 UPROPS_SRC_COUNT 455 }; 456 typedef enum UPropertySource UPropertySource; 457 458 /** 459 * @see UPropertySource 460 * @internal 461 */ 462 U_CFUNC UPropertySource U_EXPORT2 463 uprops_getSource(UProperty which); 464 465 /** 466 * Enumerate uprops.icu's main data trie and add the 467 * start of each range of same properties to the set. 468 * @internal 469 */ 470 U_CFUNC void U_EXPORT2 471 uchar_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode); 472 473 /** 474 * Enumerate uprops.icu's properties vectors trie and add the 475 * start of each range of same properties to the set. 476 * @internal 477 */ 478 U_CFUNC void U_EXPORT2 479 upropsvec_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode); 480 481 U_CFUNC void U_EXPORT2 482 uprops_addPropertyStarts(UPropertySource src, const USetAdder *sa, UErrorCode *pErrorCode); 483 484 #ifdef __cplusplus 485 486 U_CFUNC void U_EXPORT2 487 ublock_addPropertyStarts(const USetAdder *sa, UErrorCode &errorCode); 488 489 #endif // __cplusplus 490 491 /** 492 * Return a set of characters for property enumeration. 493 * For each two consecutive characters (start, limit) in the set, 494 * all of the properties for start..limit-1 are all the same. 495 * 496 * @param sa USetAdder to receive result. Existing contents are lost. 497 * @internal 498 */ 499 /*U_CFUNC void U_EXPORT2 500 uprv_getInclusions(const USetAdder *sa, UErrorCode *pErrorCode); 501 */ 502 503 // TODO: Move this into a different header file (udataswp.h? new unames.h?) so that uprops.h 504 // need not be C-compatible any more. 505 /** 506 * Swap the ICU Unicode character names file. See uchar.c. 507 * @internal 508 */ 509 U_CAPI int32_t U_EXPORT2 510 uchar_swapNames(const UDataSwapper *ds, 511 const void *inData, int32_t length, void *outData, 512 UErrorCode *pErrorCode); 513 514 #ifdef __cplusplus 515 516 U_NAMESPACE_BEGIN 517 518 class UnicodeSet; 519 520 class CharacterProperties { 521 public: 522 CharacterProperties() = delete; 523 static const UnicodeSet *getInclusionsForProperty(UProperty prop, UErrorCode &errorCode); 524 static const UnicodeSet *getBinaryPropertySet(UProperty property, UErrorCode &errorCode); 525 }; 526 527 // implemented in uniset_props.cpp 528 U_CFUNC UnicodeSet * 529 uniset_getUnicode32Instance(UErrorCode &errorCode); 530 531 U_NAMESPACE_END 532 533 #endif 534 535 #endif 536