• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 *   Copyright (C) 2002-2016, International Business Machines
7 *   Corporation and others.  All Rights Reserved.
8 *
9 *******************************************************************************
10 *   file name:  uprops.h
11 *   encoding:   UTF-8
12 *   tab size:   8 (not used)
13 *   indentation:4
14 *
15 *   created on: 2002feb24
16 *   created by: Markus W. Scherer
17 *
18 *   Constants for mostly non-core Unicode character properties
19 *   stored in uprops.icu.
20 */
21 
22 #ifndef __UPROPS_H__
23 #define __UPROPS_H__
24 
25 #include "unicode/utypes.h"
26 #include "unicode/uset.h"
27 #include "uset_imp.h"
28 #include "udataswp.h"
29 
/*
 * Slots of the indexes[] array.
 * Every enumerator is spelled out with its numeric value so that the slot
 * numbers can be read off directly.
 */
enum {
    UPROPS_PROPS32_INDEX = 0,
    UPROPS_EXCEPTIONS_INDEX = 1,
    UPROPS_EXCEPTIONS_TOP_INDEX = 2,

    UPROPS_ADDITIONAL_TRIE_INDEX = 3,
    UPROPS_ADDITIONAL_VECTORS_INDEX = 4,
    UPROPS_ADDITIONAL_VECTORS_COLUMNS_INDEX = 5,

    UPROPS_SCRIPT_EXTENSIONS_INDEX = 6,

    UPROPS_BLOCK_TRIE_INDEX = 7,
    UPROPS_RESERVED_INDEX_8 = 8,

    /** Size of the data file, counted in 32-bit units after the header. */
    UPROPS_DATA_TOP_INDEX = 9,

    /** Maximum values for code values in vector word 0. */
    UPROPS_MAX_VALUES_INDEX = 10,
    /** Maximum values for code values in vector word 2. */
    UPROPS_MAX_VALUES_2_INDEX = 11,
    /** Maximum values for other code values. */
    UPROPS_MAX_VALUES_OTHER_INDEX = 12,

    /** Number of indexes[] slots; slots 13..15 have no name here. */
    UPROPS_INDEX_COUNT = 16
};
57 
/*
 * Layout of a main properties word:
 *   general category        starts at bit 0 (5 bits, shift 0)
 *   reserved                bit 5 (1 bit)
 *   numeric type and value  starts at bit 6 (10 bits)
 */
enum {
    UPROPS_NUMERIC_TYPE_VALUE_SHIFT = 6
};
64 
/* General category: the low 5 bits of a main properties word. */
#define GET_CATEGORY(props) ((props) & 0x1f)
/* Applies U_MASK() to the general category extracted from props. */
#define CAT_MASK(props) U_MASK(GET_CATEGORY(props))

/* Numeric type-and-value field: props shifted right by UPROPS_NUMERIC_TYPE_VALUE_SHIFT. */
#define GET_NUMERIC_TYPE_VALUE(props) ((props) >> UPROPS_NUMERIC_TYPE_VALUE_SHIFT)
69 
70 /* constants for the storage form of numeric types and values */
71 enum {
72     /** No numeric value. */
73     UPROPS_NTV_NONE=0,
74     /** Decimal digits: nv=0..9 */
75     UPROPS_NTV_DECIMAL_START=1,
76     /** Other digits: nv=0..9 */
77     UPROPS_NTV_DIGIT_START=11,
78     /** Small integers: nv=0..154 */
79     UPROPS_NTV_NUMERIC_START=21,
80     /** Fractions: ((ntv>>4)-12) / ((ntv&0xf)+1) = -1..17 / 1..16 */
81     UPROPS_NTV_FRACTION_START=0xb0,
82     /**
83      * Large integers:
84      * ((ntv>>5)-14) * 10^((ntv&0x1f)+2) = (1..9)*(10^2..10^33)
85      * (only one significant decimal digit)
86      */
87     UPROPS_NTV_LARGE_START=0x1e0,
88     /**
89      * Sexagesimal numbers:
90      * ((ntv>>2)-0xbf) * 60^((ntv&3)+1) = (1..9)*(60^1..60^4)
91      */
92     UPROPS_NTV_BASE60_START=0x300,
93     /**
94      * Fraction-20 values:
95      * frac20 = ntv-0x324 = 0..0x17 -> 1|3|5|7 / 20|40|80|160|320|640
96      * numerator: num = 2*(frac20&3)+1
97      * denominator: den = 20<<(frac20>>2)
98      */
99     UPROPS_NTV_FRACTION20_START=UPROPS_NTV_BASE60_START+36,  // 0x300+9*4=0x324
100     /**
101      * Fraction-32 values:
102      * frac32 = ntv-0x34c = 0..15 -> 1|3|5|7 / 32|64|128|256
103      * numerator: num = 2*(frac32&3)+1
104      * denominator: den = 32<<(frac32>>2)
105      */
106     UPROPS_NTV_FRACTION32_START=UPROPS_NTV_FRACTION20_START+24,  // 0x324+6*4=0x34c
107     /** No numeric value (yet). */
108     UPROPS_NTV_RESERVED_START=UPROPS_NTV_FRACTION32_START+16,  // 0x34c+4*4=0x35c
109 
110     UPROPS_NTV_MAX_SMALL_INT=UPROPS_NTV_FRACTION_START-UPROPS_NTV_NUMERIC_START-1
111 };
112 
/*
 * Maps a stored numeric type-and-value code to its numeric type:
 *   ntv == UPROPS_NTV_NONE           -> U_NT_NONE
 *   ntv <  UPROPS_NTV_DIGIT_START    -> U_NT_DECIMAL
 *   ntv <  UPROPS_NTV_NUMERIC_START  -> U_NT_DIGIT
 *   anything else                    -> U_NT_NUMERIC
 *
 * Every use of the argument is parenthesized so that an expression argument
 * (for example "a & b") is compared as a whole rather than being split by
 * operator precedence. The argument may be evaluated up to three times, so
 * it must not have side effects.
 */
#define UPROPS_NTV_GET_TYPE(ntv) \
    (((ntv)==UPROPS_NTV_NONE) ? U_NT_NONE : \
    ((ntv)<UPROPS_NTV_DIGIT_START) ?  U_NT_DECIMAL : \
    ((ntv)<UPROPS_NTV_NUMERIC_START) ? U_NT_DIGIT : \
    U_NT_NUMERIC)
118 
/* How many words make up one properties vector. */
#define UPROPS_VECTOR_WORDS 3
121 
122 #ifdef __cplusplus
123 
124 namespace {
125 
126 // Properties in vector word 0
127 // Bits
128 // 31..26   Age major version (major=0..63)
129 // 25..24   Age minor version (minor=0..3)
130 // 23..17   reserved
131 // 16..15   Indic Conjunct Break
132 // 14..12   East Asian Width
133 // 11..10   3..1: Bits 9..0 = Script_Extensions index
134 //             3: Script value from Script_Extensions
135 //             2: Script=Inherited
136 //             1: Script=Common
137 //             0: Script=bits 9..0
138 //  9.. 0   UScriptCode, or index to Script_Extensions
139 
140 // *Note*: If we need more than the available bits for new properties,
141 // then we could move the Age property out of the properties vectors.
142 // For example, we could store the Age property in its own trie.
143 // In a small, 8-bit-value-width CodePointTrie, it would be larger than
144 // the amount of data that we would save in the properties vectors and their trie,
145 // but the size increase would be a small percentage of the total uprops.icu size.
146 // It would certainly be a much smaller increase than widening the properties vectors.
147 // The savings in the properties vectors+trie from pulling out the Age property
148 // are partly from mediocre correlation between Age and other property values.
149 // (Adding new characters to existing scripts tends to split property vectors where
150 // new characters are similar to old ones.)
151 // See https://github.com/unicode-org/icu/pull/3025 for details.
152 
// Age: the top 8 bits of vector word 0.
inline constexpr int32_t UPROPS_AGE_SHIFT = 24;
inline constexpr uint32_t UPROPS_AGE_MASK = uint32_t{0xff} << UPROPS_AGE_SHIFT;  // 0xff000000

// Largest major/minor Age version numbers.
inline constexpr uint8_t UPROPS_AGE_MAJOR_MAX = 63;
inline constexpr uint8_t UPROPS_AGE_MINOR_MAX = 3;

// East Asian Width: 3 bits starting at bit 12.
inline constexpr int32_t UPROPS_EA_SHIFT = 12;
inline constexpr uint32_t UPROPS_EA_MASK = uint32_t{7} << UPROPS_EA_SHIFT;  // 0x00007000

// Indic Conjunct Break: 2 bits starting at bit 15.
inline constexpr int32_t UPROPS_INCB_SHIFT = 15;
inline constexpr uint32_t UPROPS_INCB_MASK = uint32_t{3} << UPROPS_INCB_SHIFT;  // 0x00018000

/** Script_Extensions: the mask covers the 2 selector bits as well as the Script bits. */
inline constexpr uint32_t UPROPS_SCRIPT_X_MASK = 0x00000fff;

// Selector values in bits 11..10, in ascending order.
// UPROPS_SCRIPT_X_WITH_COMMON must be the lowest value that involves Script_Extensions.
inline constexpr uint32_t UPROPS_SCRIPT_X_WITH_COMMON = uint32_t{1} << 10;     // 0x400
inline constexpr uint32_t UPROPS_SCRIPT_X_WITH_INHERITED = uint32_t{2} << 10;  // 0x800
inline constexpr uint32_t UPROPS_SCRIPT_X_WITH_OTHER = uint32_t{3} << 10;      // 0xc00
// Largest value that fits into bits 9..0.
inline constexpr int32_t UPROPS_MAX_SCRIPT = 0x3ff;
173 
/*
 * Properties in vector word 1
 * Each bit encodes one binary property.
 * The following constants represent the bit number, use 1<<UPROPS_XYZ.
 * UPROPS_BINARY_1_TOP<=32! (enforced by the static_assert after the enum)
 *
 * Keep this list of property enums in sync with
 * propListNames[] in icu/source/tools/genprops/props2.c!
 *
 * ICU 2.6/uprops format version 3.2 stores full properties instead of "Other_".
 */
enum {
    UPROPS_WHITE_SPACE,                         /* bit 0 */
    UPROPS_DASH,
    UPROPS_HYPHEN,
    UPROPS_QUOTATION_MARK,
    UPROPS_TERMINAL_PUNCTUATION,
    UPROPS_MATH,
    UPROPS_HEX_DIGIT,
    UPROPS_ASCII_HEX_DIGIT,
    UPROPS_ALPHABETIC,                          /* bit 8 */
    UPROPS_IDEOGRAPHIC,
    UPROPS_DIACRITIC,
    UPROPS_EXTENDER,
    UPROPS_NONCHARACTER_CODE_POINT,
    UPROPS_GRAPHEME_EXTEND,
    UPROPS_GRAPHEME_LINK,
    UPROPS_IDS_BINARY_OPERATOR,
    UPROPS_IDS_TRINARY_OPERATOR,                /* bit 16 */
    UPROPS_RADICAL,
    UPROPS_UNIFIED_IDEOGRAPH,
    UPROPS_DEFAULT_IGNORABLE_CODE_POINT,
    UPROPS_DEPRECATED,
    UPROPS_LOGICAL_ORDER_EXCEPTION,
    UPROPS_XID_START,
    UPROPS_XID_CONTINUE,
    UPROPS_ID_START,                            /* bit 24; ICU 2.6, uprops format version 3.2 */
    UPROPS_ID_CONTINUE,
    UPROPS_GRAPHEME_BASE,
    UPROPS_S_TERM,                              /* new in ICU 3.0 and Unicode 4.0.1 */
    UPROPS_VARIATION_SELECTOR,
    UPROPS_PATTERN_SYNTAX,                      /* new in ICU 3.4 and Unicode 4.1 */
    UPROPS_PATTERN_WHITE_SPACE,
    UPROPS_PREPENDED_CONCATENATION_MARK,        // bit 31; new in ICU 60 and Unicode 10
    UPROPS_BINARY_1_TOP                         /* ==32 - full! */
};

// Each enumerator is a bit number within one 32-bit vector word, so adding
// another property here must fail the build instead of silently overflowing.
static_assert(UPROPS_BINARY_1_TOP <= 32,
              "binary properties in vector word 1 must fit into 32 bits");
220 
221 /*
222  * Properties in vector word 2
223  * Bits
224  * 31..26   ICU 75: Identifier_Type bit set
225  *          ICU 70..74: unused
226  *          ICU 57..69: emoji properties; moved to uemoji.icu in ICU 70
227  * 25..20   Line Break
228  * 19..15   Sentence Break
229  * 14..10   Word Break
230  *  9.. 5   Grapheme Cluster Break
231  *  4.. 0   Decomposition Type
232  */
233 
234 // https://www.unicode.org/reports/tr39/#Identifier_Status_and_Type
235 // The Identifier_Type maps each code point to a *set* of one or more values.
236 // Some can be combined with others, some can only occur alone.
237 // Exclusion & Limited_Use are combinable bits, but cannot occur together.
238 // We use this forbidden combination for enumerated values.
239 // We use 6 bits for all possible combinations.
240 // If more combinable values are added, then we need to use more bits.
241 //
242 // We do not store separate data for Identifier_Status:
243 // We can derive that from the encoded Identifier_Type via a simple range check.
244 
// Identifier_Type bit set: the top 6 bits of vector word 2.
inline constexpr int32_t UPROPS_2_ID_TYPE_SHIFT = 26;
inline constexpr uint32_t UPROPS_2_ID_TYPE_MASK = uint32_t{0x3f} << UPROPS_2_ID_TYPE_SHIFT;  // 0xfc000000
247 
enum {
    // Marker bit for entries of idTypeToEncoded[]; it never appears in the data.
    UPROPS_ID_TYPE_BIT = 0x80,

    // Combinable bits, from the highest to the lowest.
    UPROPS_ID_TYPE_EXCLUSION = 0x20,
    UPROPS_ID_TYPE_LIMITED_USE = 0x10,
    UPROPS_ID_TYPE_UNCOMMON_USE = 0x08,
    UPROPS_ID_TYPE_TECHNICAL = 0x04,
    UPROPS_ID_TYPE_OBSOLETE = 0x02,
    UPROPS_ID_TYPE_NOT_XID = 0x01,

    // Exclusive values
    UPROPS_ID_TYPE_NOT_CHARACTER = 0x00,

    // Exclusion|Limited_Use is a forbidden bit combination; the values from
    // there upward enumerate the remaining exclusive values.
    UPROPS_ID_TYPE_FORBIDDEN = UPROPS_ID_TYPE_EXCLUSION | UPROPS_ID_TYPE_LIMITED_USE,  // 0x30
    UPROPS_ID_TYPE_DEPRECATED = UPROPS_ID_TYPE_FORBIDDEN + 0x0,                        // 0x30
    UPROPS_ID_TYPE_DEFAULT_IGNORABLE = UPROPS_ID_TYPE_FORBIDDEN + 0x1,                 // 0x31
    UPROPS_ID_TYPE_NOT_NFKC = UPROPS_ID_TYPE_FORBIDDEN + 0x2,                          // 0x32

    UPROPS_ID_TYPE_ALLOWED_MIN = UPROPS_ID_TYPE_FORBIDDEN + 0xc,                       // 0x3c
    UPROPS_ID_TYPE_INCLUSION = UPROPS_ID_TYPE_FORBIDDEN + 0xe,                         // 0x3e
    UPROPS_ID_TYPE_RECOMMENDED = UPROPS_ID_TYPE_FORBIDDEN + 0xf,                       // 0x3f
};
273 
274 /**
275  * Maps UIdentifierType to encoded bits.
276  * When UPROPS_ID_TYPE_BIT is set, then use "&" to test whether the value bit is set.
277  * When UPROPS_ID_TYPE_BIT is not set, then compare ("==") the array value with the data value.
278  */
279 inline constexpr uint8_t uprops_idTypeToEncoded[] = {
280     UPROPS_ID_TYPE_NOT_CHARACTER,
281     UPROPS_ID_TYPE_DEPRECATED,
282     UPROPS_ID_TYPE_DEFAULT_IGNORABLE,
283     UPROPS_ID_TYPE_NOT_NFKC,
284     UPROPS_ID_TYPE_BIT | UPROPS_ID_TYPE_NOT_XID,
285     UPROPS_ID_TYPE_BIT | UPROPS_ID_TYPE_EXCLUSION,
286     UPROPS_ID_TYPE_BIT | UPROPS_ID_TYPE_OBSOLETE,
287     UPROPS_ID_TYPE_BIT | UPROPS_ID_TYPE_TECHNICAL,
288     UPROPS_ID_TYPE_BIT | UPROPS_ID_TYPE_UNCOMMON_USE,
289     UPROPS_ID_TYPE_BIT | UPROPS_ID_TYPE_LIMITED_USE,
290     UPROPS_ID_TYPE_INCLUSION,
291     UPROPS_ID_TYPE_RECOMMENDED
292 };
293 
294 }  // namespace
295 
296 #endif  // __cplusplus
297 
/* Line Break: bits 25..20 of vector word 2 */
#define UPROPS_LB_MASK 0x03f00000
#define UPROPS_LB_SHIFT 20

/* Sentence Break: bits 19..15 */
#define UPROPS_SB_MASK 0x000f8000
#define UPROPS_SB_SHIFT 15

/* Word Break: bits 14..10 */
#define UPROPS_WB_MASK 0x00007c00
#define UPROPS_WB_SHIFT 10

/* Grapheme Cluster Break: bits 9..5 */
#define UPROPS_GCB_MASK 0x000003e0
#define UPROPS_GCB_SHIFT 5

/* Decomposition Type: bits 4..0, so no shift constant is defined */
#define UPROPS_DT_MASK 0x0000001f
311 
312 #ifdef __cplusplus
313 
314 namespace {
315 
// Mask for bits 9..0 of the UPROPS_MAX_VALUES_OTHER_INDEX entry (10 bits, 0x3ff).
inline constexpr uint32_t UPROPS_MAX_BLOCK = (uint32_t{1} << 10) - 1;
318 
319 }  // namespace
320 
321 #endif  // __cplusplus
322 
323 /**
324  * Gets the main properties value for a code point.
325  * Implemented in uchar.c for uprops.cpp.
326  */
327 U_CFUNC uint32_t
328 u_getMainProperties(UChar32 c);
329 
330 /**
331  * Get a properties vector word for a code point.
332  * Implemented in uchar.c for uprops.cpp.
333  * @return 0 if no data or illegal argument
334  */
335 U_CFUNC uint32_t
336 u_getUnicodeProperties(UChar32 c, int32_t column);
337 
338 /**
339  * Get the the maximum values for some enum/int properties.
340  * Use the same column numbers as for u_getUnicodeProperties().
341  * The returned value will contain maximum values stored in the same bit fields
342  * as where the enum values are stored in the u_getUnicodeProperties()
343  * return values for the same columns.
344  *
345  * Valid columns are those for properties words that contain enumerated values.
346  * (ICU 2.6: columns 0 and 2)
347  * For other column numbers, this function will return 0.
348  *
349  * @internal
350  */
351 U_CFUNC int32_t
352 uprv_getMaxValues(int32_t column);
353 
354 /**
355  * Checks if c is alphabetic, or a decimal digit; implements UCHAR_POSIX_ALNUM.
356  * @internal
357  */
358 U_CFUNC UBool
359 u_isalnumPOSIX(UChar32 c);
360 
361 /**
362  * Checks if c is in
363  * [^\p{space}\p{gc=Control}\p{gc=Surrogate}\p{gc=Unassigned}]
364  * with space=\p{Whitespace} and Control=Cc.
365  * Implements UCHAR_POSIX_GRAPH.
366  * @internal
367  */
368 U_CFUNC UBool
369 u_isgraphPOSIX(UChar32 c);
370 
371 /**
372  * Checks if c is in \p{graph}\p{blank} - \p{cntrl}.
373  * Implements UCHAR_POSIX_PRINT.
374  * @internal
375  */
376 U_CFUNC UBool
377 u_isprintPOSIX(UChar32 c);
378 
/** Named constants for a handful of code points. @internal */
enum {
    TAB = 0x0009,       // U+0009 CHARACTER TABULATION
    LF = 0x000a,        // U+000A LINE FEED
    FF = 0x000c,        // U+000C FORM FEED
    CR = 0x000d,        // U+000D CARRIAGE RETURN
    NBSP = 0x00a0,      // U+00A0 NO-BREAK SPACE
    CGJ = 0x034f,       // U+034F COMBINING GRAPHEME JOINER
    FIGURESP = 0x2007,  // U+2007 FIGURE SPACE
    HAIRSP = 0x200a,    // U+200A HAIR SPACE
    ZWNJ = 0x200c,      // U+200C ZERO WIDTH NON-JOINER
    ZWJ = 0x200d,       // U+200D ZERO WIDTH JOINER
    RLM = 0x200f,       // U+200F RIGHT-TO-LEFT MARK
    NNBSP = 0x202f,     // U+202F NARROW NO-BREAK SPACE
    ZWNBSP = 0xfeff     // U+FEFF ZERO WIDTH NO-BREAK SPACE
};
395 
396 // TODO: Move these two functions into a different header file (new unames.h?) so that uprops.h
397 // need not be C-compatible any more.
398 /**
399  * Get the maximum length of a (regular/1.0/extended) character name.
400  * @return 0 if no character names available.
401  */
402 U_CAPI int32_t U_EXPORT2
403 uprv_getMaxCharNameLength(void);
404 
405 /**
406  * Fills set with characters that are used in Unicode character names.
407  * Includes all characters that are used in regular/Unicode 1.0/extended names.
408  * Just empties the set if no character names are available.
409  * @param sa USetAdder to receive characters.
410  */
411 U_CAPI void U_EXPORT2
412 uprv_getCharNameCharacters(const USetAdder *sa);
413 
/**
 * Identifies which data and implementation files provide a given property.
 * UnicodeSet uses these constants for service-specific property enumeration.
 * The values are consecutive, starting at 0.
 * @internal
 */
typedef enum UPropertySource {
    /** No source, not a supported property. */
    UPROPS_SRC_NONE = 0,
    /** From uchar.c/uprops.icu main trie */
    UPROPS_SRC_CHAR = 1,
    /** From uchar.c/uprops.icu properties vectors trie */
    UPROPS_SRC_PROPSVEC = 2,
    /** From unames.c/unames.icu */
    UPROPS_SRC_NAMES = 3,
    /** From ucase.c/ucase.icu */
    UPROPS_SRC_CASE = 4,
    /** From ubidi_props.c/ubidi.icu */
    UPROPS_SRC_BIDI = 5,
    /** From uchar.c/uprops.icu main trie as well as properties vectors trie */
    UPROPS_SRC_CHAR_AND_PROPSVEC = 6,
    /** From ucase.c/ucase.icu as well as unorm.cpp/unorm.icu */
    UPROPS_SRC_CASE_AND_NORM = 7,
    /** From normalizer2impl.cpp/nfc.nrm */
    UPROPS_SRC_NFC = 8,
    /** From normalizer2impl.cpp/nfkc.nrm */
    UPROPS_SRC_NFKC = 9,
    /** From normalizer2impl.cpp/nfkc_cf.nrm */
    UPROPS_SRC_NFKC_CF = 10,
    /** From normalizer2impl.cpp/nfc.nrm canonical iterator data */
    UPROPS_SRC_NFC_CANON_ITER = 11,
    // Text layout properties.
    UPROPS_SRC_INPC = 12,
    UPROPS_SRC_INSC = 13,
    UPROPS_SRC_VO = 14,
    UPROPS_SRC_EMOJI = 15,
    UPROPS_SRC_IDSU = 16,
    UPROPS_SRC_ID_COMPAT_MATH = 17,
    UPROPS_SRC_BLOCK = 18,
    UPROPS_SRC_MCM = 19,
    /** One more than the highest UPropertySource (UPROPS_SRC_) constant. */
    UPROPS_SRC_COUNT = 20
} UPropertySource;
457 
458 /**
459  * @see UPropertySource
460  * @internal
461  */
462 U_CFUNC UPropertySource U_EXPORT2
463 uprops_getSource(UProperty which);
464 
465 /**
466  * Enumerate uprops.icu's main data trie and add the
467  * start of each range of same properties to the set.
468  * @internal
469  */
470 U_CFUNC void U_EXPORT2
471 uchar_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode);
472 
473 /**
474  * Enumerate uprops.icu's properties vectors trie and add the
475  * start of each range of same properties to the set.
476  * @internal
477  */
478 U_CFUNC void U_EXPORT2
479 upropsvec_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode);
480 
481 U_CFUNC void U_EXPORT2
482 uprops_addPropertyStarts(UPropertySource src, const USetAdder *sa, UErrorCode *pErrorCode);
483 
484 #ifdef __cplusplus
485 
486 U_CFUNC void U_EXPORT2
487 ublock_addPropertyStarts(const USetAdder *sa, UErrorCode &errorCode);
488 
489 #endif  // __cplusplus
490 
491 /**
492  * Return a set of characters for property enumeration.
493  * For each two consecutive characters (start, limit) in the set,
494  * all of the properties for start..limit-1 are all the same.
495  *
496  * @param sa USetAdder to receive result. Existing contents are lost.
497  * @internal
498  */
499 /*U_CFUNC void U_EXPORT2
500 uprv_getInclusions(const USetAdder *sa, UErrorCode *pErrorCode);
501 */
502 
503 // TODO: Move this into a different header file (udataswp.h? new unames.h?) so that uprops.h
504 // need not be C-compatible any more.
505 /**
506  * Swap the ICU Unicode character names file. See uchar.c.
507  * @internal
508  */
509 U_CAPI int32_t U_EXPORT2
510 uchar_swapNames(const UDataSwapper *ds,
511                 const void *inData, int32_t length, void *outData,
512                 UErrorCode *pErrorCode);
513 
514 #ifdef __cplusplus
515 
516 U_NAMESPACE_BEGIN
517 
518 class UnicodeSet;
519 
520 class CharacterProperties {
521 public:
522     CharacterProperties() = delete;
523     static const UnicodeSet *getInclusionsForProperty(UProperty prop, UErrorCode &errorCode);
524     static const UnicodeSet *getBinaryPropertySet(UProperty property, UErrorCode &errorCode);
525 };
526 
527 // implemented in uniset_props.cpp
528 U_CFUNC UnicodeSet *
529 uniset_getUnicode32Instance(UErrorCode &errorCode);
530 
531 U_NAMESPACE_END
532 
533 #endif
534 
535 #endif
536