1 // © 2018 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3
4 // characterproperties.cpp
5 // created: 2018sep03 Markus W. Scherer
6
7 #include "unicode/utypes.h"
8 #include "unicode/localpointer.h"
9 #include "unicode/uchar.h"
10 #include "unicode/ucpmap.h"
11 #include "unicode/ucptrie.h"
12 #include "unicode/umutablecptrie.h"
13 #include "unicode/uniset.h"
14 #include "unicode/uscript.h"
15 #include "unicode/uset.h"
16 #include "cmemory.h"
17 #include "emojiprops.h"
18 #include "mutex.h"
19 #include "normalizer2impl.h"
20 #include "uassert.h"
21 #include "ubidi_props.h"
22 #include "ucase.h"
23 #include "ucln_cmn.h"
24 #include "umutex.h"
25 #include "uprops.h"
26
27 using icu::LocalPointer;
28 #if !UCONFIG_NO_NORMALIZATION
29 using icu::Normalizer2Factory;
30 using icu::Normalizer2Impl;
31 #endif
32 using icu::UInitOnce;
33 using icu::UnicodeSet;
34
35 namespace {
36
37 UBool U_CALLCONV characterproperties_cleanup();
38
39 constexpr int32_t NUM_INCLUSIONS = UPROPS_SRC_COUNT + (UCHAR_INT_LIMIT - UCHAR_INT_START);
40
41 struct Inclusion {
42 UnicodeSet *fSet = nullptr;
43 UInitOnce fInitOnce {};
44 };
45 Inclusion gInclusions[NUM_INCLUSIONS]; // cached getInclusions()
46
47 UnicodeSet *sets[UCHAR_BINARY_LIMIT] = {};
48
49 UCPMap *maps[UCHAR_INT_LIMIT - UCHAR_INT_START] = {};
50
51 icu::UMutex cpMutex;
52
53 //----------------------------------------------------------------
54 // Inclusions list
55 //----------------------------------------------------------------
56
57 // USetAdder implementation
58 // Does not use uset.h to reduce code dependencies
59 void U_CALLCONV
_set_add(USet * set,UChar32 c)60 _set_add(USet *set, UChar32 c) {
61 reinterpret_cast<UnicodeSet*>(set)->add(c);
62 }
63
64 void U_CALLCONV
_set_addRange(USet * set,UChar32 start,UChar32 end)65 _set_addRange(USet *set, UChar32 start, UChar32 end) {
66 reinterpret_cast<UnicodeSet*>(set)->add(start, end);
67 }
68
69 void U_CALLCONV
_set_addString(USet * set,const char16_t * str,int32_t length)70 _set_addString(USet *set, const char16_t *str, int32_t length) {
71 reinterpret_cast<UnicodeSet*>(set)->add(icu::UnicodeString(static_cast<UBool>(length < 0), str, length));
72 }
73
characterproperties_cleanup()74 UBool U_CALLCONV characterproperties_cleanup() {
75 for (Inclusion &in: gInclusions) {
76 delete in.fSet;
77 in.fSet = nullptr;
78 in.fInitOnce.reset();
79 }
80 for (int32_t i = 0; i < UPRV_LENGTHOF(sets); ++i) {
81 delete sets[i];
82 sets[i] = nullptr;
83 }
84 for (int32_t i = 0; i < UPRV_LENGTHOF(maps); ++i) {
85 ucptrie_close(reinterpret_cast<UCPTrie *>(maps[i]));
86 maps[i] = nullptr;
87 }
88 return true;
89 }
90
initInclusion(UPropertySource src,UErrorCode & errorCode)91 void U_CALLCONV initInclusion(UPropertySource src, UErrorCode &errorCode) {
92 // This function is invoked only via umtx_initOnce().
93 U_ASSERT(0 <= src && src < UPROPS_SRC_COUNT);
94 if (src == UPROPS_SRC_NONE) {
95 errorCode = U_INTERNAL_PROGRAM_ERROR;
96 return;
97 }
98 U_ASSERT(gInclusions[src].fSet == nullptr);
99
100 LocalPointer<UnicodeSet> incl(new UnicodeSet());
101 if (incl.isNull()) {
102 errorCode = U_MEMORY_ALLOCATION_ERROR;
103 return;
104 }
105 USetAdder sa = {
106 reinterpret_cast<USet*>(incl.getAlias()),
107 _set_add,
108 _set_addRange,
109 _set_addString,
110 nullptr, // don't need remove()
111 nullptr // don't need removeRange()
112 };
113
114 switch(src) {
115 case UPROPS_SRC_CHAR:
116 uchar_addPropertyStarts(&sa, &errorCode);
117 break;
118 case UPROPS_SRC_PROPSVEC:
119 upropsvec_addPropertyStarts(&sa, &errorCode);
120 break;
121 case UPROPS_SRC_CHAR_AND_PROPSVEC:
122 uchar_addPropertyStarts(&sa, &errorCode);
123 upropsvec_addPropertyStarts(&sa, &errorCode);
124 break;
125 #if !UCONFIG_NO_NORMALIZATION
126 case UPROPS_SRC_CASE_AND_NORM: {
127 const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode);
128 if(U_SUCCESS(errorCode)) {
129 impl->addPropertyStarts(&sa, errorCode);
130 }
131 ucase_addPropertyStarts(&sa, &errorCode);
132 break;
133 }
134 case UPROPS_SRC_NFC: {
135 const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode);
136 if(U_SUCCESS(errorCode)) {
137 impl->addPropertyStarts(&sa, errorCode);
138 }
139 break;
140 }
141 case UPROPS_SRC_NFKC: {
142 const Normalizer2Impl *impl=Normalizer2Factory::getNFKCImpl(errorCode);
143 if(U_SUCCESS(errorCode)) {
144 impl->addPropertyStarts(&sa, errorCode);
145 }
146 break;
147 }
148 case UPROPS_SRC_NFKC_CF: {
149 const Normalizer2Impl *impl=Normalizer2Factory::getNFKC_CFImpl(errorCode);
150 if(U_SUCCESS(errorCode)) {
151 impl->addPropertyStarts(&sa, errorCode);
152 }
153 break;
154 }
155 case UPROPS_SRC_NFC_CANON_ITER: {
156 const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode);
157 if(U_SUCCESS(errorCode)) {
158 impl->addCanonIterPropertyStarts(&sa, errorCode);
159 }
160 break;
161 }
162 #endif
163 case UPROPS_SRC_CASE:
164 ucase_addPropertyStarts(&sa, &errorCode);
165 break;
166 case UPROPS_SRC_BIDI:
167 ubidi_addPropertyStarts(&sa, &errorCode);
168 break;
169 case UPROPS_SRC_INPC:
170 case UPROPS_SRC_INSC:
171 case UPROPS_SRC_VO:
172 uprops_addPropertyStarts(src, &sa, &errorCode);
173 break;
174 case UPROPS_SRC_EMOJI: {
175 const icu::EmojiProps *ep = icu::EmojiProps::getSingleton(errorCode);
176 if (U_SUCCESS(errorCode)) {
177 ep->addPropertyStarts(&sa, errorCode);
178 }
179 break;
180 }
181 case UPROPS_SRC_IDSU:
182 // New in Unicode 15.1 for just two characters.
183 sa.add(sa.set, 0x2FFE);
184 sa.add(sa.set, 0x2FFF + 1);
185 break;
186 case UPROPS_SRC_ID_COMPAT_MATH:
187 case UPROPS_SRC_MCM:
188 uprops_addPropertyStarts(src, &sa, &errorCode);
189 break;
190 case UPROPS_SRC_BLOCK:
191 ublock_addPropertyStarts(&sa, errorCode);
192 break;
193 default:
194 errorCode = U_INTERNAL_PROGRAM_ERROR;
195 break;
196 }
197
198 if (U_FAILURE(errorCode)) {
199 return;
200 }
201 if (incl->isBogus()) {
202 errorCode = U_MEMORY_ALLOCATION_ERROR;
203 return;
204 }
205 // Compact for caching.
206 incl->compact();
207 gInclusions[src].fSet = incl.orphan();
208 ucln_common_registerCleanup(UCLN_COMMON_CHARACTERPROPERTIES, characterproperties_cleanup);
209 }
210
getInclusionsForSource(UPropertySource src,UErrorCode & errorCode)211 const UnicodeSet *getInclusionsForSource(UPropertySource src, UErrorCode &errorCode) {
212 if (U_FAILURE(errorCode)) { return nullptr; }
213 if (src < 0 || UPROPS_SRC_COUNT <= src) {
214 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
215 return nullptr;
216 }
217 Inclusion &i = gInclusions[src];
218 umtx_initOnce(i.fInitOnce, &initInclusion, src, errorCode);
219 return i.fSet;
220 }
221
initIntPropInclusion(UProperty prop,UErrorCode & errorCode)222 void U_CALLCONV initIntPropInclusion(UProperty prop, UErrorCode &errorCode) {
223 // This function is invoked only via umtx_initOnce().
224 U_ASSERT(UCHAR_INT_START <= prop && prop < UCHAR_INT_LIMIT);
225 int32_t inclIndex = UPROPS_SRC_COUNT + (prop - UCHAR_INT_START);
226 U_ASSERT(gInclusions[inclIndex].fSet == nullptr);
227 UPropertySource src = uprops_getSource(prop);
228 const UnicodeSet *incl = getInclusionsForSource(src, errorCode);
229 if (U_FAILURE(errorCode)) {
230 return;
231 }
232
233 LocalPointer<UnicodeSet> intPropIncl(new UnicodeSet(0, 0));
234 if (intPropIncl.isNull()) {
235 errorCode = U_MEMORY_ALLOCATION_ERROR;
236 return;
237 }
238 int32_t numRanges = incl->getRangeCount();
239 int32_t prevValue = 0;
240 for (int32_t i = 0; i < numRanges; ++i) {
241 UChar32 rangeEnd = incl->getRangeEnd(i);
242 for (UChar32 c = incl->getRangeStart(i); c <= rangeEnd; ++c) {
243 // TODO: Get a UCharacterProperty.IntProperty to avoid the property dispatch.
244 int32_t value = u_getIntPropertyValue(c, prop);
245 if (value != prevValue) {
246 intPropIncl->add(c);
247 prevValue = value;
248 }
249 }
250 }
251
252 if (intPropIncl->isBogus()) {
253 errorCode = U_MEMORY_ALLOCATION_ERROR;
254 return;
255 }
256 // Compact for caching.
257 intPropIncl->compact();
258 gInclusions[inclIndex].fSet = intPropIncl.orphan();
259 ucln_common_registerCleanup(UCLN_COMMON_CHARACTERPROPERTIES, characterproperties_cleanup);
260 }
261
262 } // namespace
263
264 U_NAMESPACE_BEGIN
265
getInclusionsForProperty(UProperty prop,UErrorCode & errorCode)266 const UnicodeSet *CharacterProperties::getInclusionsForProperty(
267 UProperty prop, UErrorCode &errorCode) {
268 if (U_FAILURE(errorCode)) { return nullptr; }
269 if (UCHAR_INT_START <= prop && prop < UCHAR_INT_LIMIT) {
270 int32_t inclIndex = UPROPS_SRC_COUNT + (prop - UCHAR_INT_START);
271 Inclusion &i = gInclusions[inclIndex];
272 umtx_initOnce(i.fInitOnce, &initIntPropInclusion, prop, errorCode);
273 return i.fSet;
274 } else {
275 UPropertySource src = uprops_getSource(prop);
276 return getInclusionsForSource(src, errorCode);
277 }
278 }
279
280 U_NAMESPACE_END
281
282 namespace {
283
makeSet(UProperty property,UErrorCode & errorCode)284 UnicodeSet *makeSet(UProperty property, UErrorCode &errorCode) {
285 if (U_FAILURE(errorCode)) { return nullptr; }
286 LocalPointer<UnicodeSet> set(new UnicodeSet());
287 if (set.isNull()) {
288 errorCode = U_MEMORY_ALLOCATION_ERROR;
289 return nullptr;
290 }
291 if (UCHAR_BASIC_EMOJI <= property && property <= UCHAR_RGI_EMOJI) {
292 // property of strings
293 const icu::EmojiProps *ep = icu::EmojiProps::getSingleton(errorCode);
294 if (U_FAILURE(errorCode)) { return nullptr; }
295 USetAdder sa = {
296 reinterpret_cast<USet*>(set.getAlias()),
297 _set_add,
298 _set_addRange,
299 _set_addString,
300 nullptr, // don't need remove()
301 nullptr // don't need removeRange()
302 };
303 ep->addStrings(&sa, property, errorCode);
304 if (property != UCHAR_BASIC_EMOJI && property != UCHAR_RGI_EMOJI) {
305 // property of _only_ strings
306 set->freeze();
307 return set.orphan();
308 }
309 }
310
311 const UnicodeSet *inclusions =
312 icu::CharacterProperties::getInclusionsForProperty(property, errorCode);
313 if (U_FAILURE(errorCode)) { return nullptr; }
314 int32_t numRanges = inclusions->getRangeCount();
315 UChar32 startHasProperty = -1;
316
317 for (int32_t i = 0; i < numRanges; ++i) {
318 UChar32 rangeEnd = inclusions->getRangeEnd(i);
319 for (UChar32 c = inclusions->getRangeStart(i); c <= rangeEnd; ++c) {
320 // TODO: Get a UCharacterProperty.BinaryProperty to avoid the property dispatch.
321 if (u_hasBinaryProperty(c, property)) {
322 if (startHasProperty < 0) {
323 // Transition from false to true.
324 startHasProperty = c;
325 }
326 } else if (startHasProperty >= 0) {
327 // Transition from true to false.
328 set->add(startHasProperty, c - 1);
329 startHasProperty = -1;
330 }
331 }
332 }
333 if (startHasProperty >= 0) {
334 set->add(startHasProperty, 0x10FFFF);
335 }
336 set->freeze();
337 return set.orphan();
338 }
339
makeMap(UProperty property,UErrorCode & errorCode)340 UCPMap *makeMap(UProperty property, UErrorCode &errorCode) {
341 if (U_FAILURE(errorCode)) { return nullptr; }
342 uint32_t nullValue = property == UCHAR_SCRIPT ? USCRIPT_UNKNOWN : 0;
343 icu::LocalUMutableCPTriePointer mutableTrie(
344 umutablecptrie_open(nullValue, nullValue, &errorCode));
345 const UnicodeSet *inclusions =
346 icu::CharacterProperties::getInclusionsForProperty(property, errorCode);
347 if (U_FAILURE(errorCode)) { return nullptr; }
348 int32_t numRanges = inclusions->getRangeCount();
349 UChar32 start = 0;
350 uint32_t value = nullValue;
351
352 for (int32_t i = 0; i < numRanges; ++i) {
353 UChar32 rangeEnd = inclusions->getRangeEnd(i);
354 for (UChar32 c = inclusions->getRangeStart(i); c <= rangeEnd; ++c) {
355 // TODO: Get a UCharacterProperty.IntProperty to avoid the property dispatch.
356 uint32_t nextValue = u_getIntPropertyValue(c, property);
357 if (value != nextValue) {
358 if (value != nullValue) {
359 umutablecptrie_setRange(mutableTrie.getAlias(), start, c - 1, value, &errorCode);
360 }
361 start = c;
362 value = nextValue;
363 }
364 }
365 }
366 if (value != 0) {
367 umutablecptrie_setRange(mutableTrie.getAlias(), start, 0x10FFFF, value, &errorCode);
368 }
369
370 UCPTrieType type;
371 if (property == UCHAR_BIDI_CLASS || property == UCHAR_GENERAL_CATEGORY) {
372 type = UCPTRIE_TYPE_FAST;
373 } else {
374 type = UCPTRIE_TYPE_SMALL;
375 }
376 UCPTrieValueWidth valueWidth;
377 // TODO: UCharacterProperty.IntProperty
378 int32_t max = u_getIntPropertyMaxValue(property);
379 if (max <= 0xff) {
380 valueWidth = UCPTRIE_VALUE_BITS_8;
381 } else if (max <= 0xffff) {
382 valueWidth = UCPTRIE_VALUE_BITS_16;
383 } else {
384 valueWidth = UCPTRIE_VALUE_BITS_32;
385 }
386 return reinterpret_cast<UCPMap *>(
387 umutablecptrie_buildImmutable(mutableTrie.getAlias(), type, valueWidth, &errorCode));
388 }
389
390 } // namespace
391
392 U_NAMESPACE_BEGIN
393
getBinaryPropertySet(UProperty property,UErrorCode & errorCode)394 const UnicodeSet *CharacterProperties::getBinaryPropertySet(UProperty property, UErrorCode &errorCode) {
395 if (U_FAILURE(errorCode)) { return nullptr; }
396 if (property < 0 || UCHAR_BINARY_LIMIT <= property) {
397 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
398 return nullptr;
399 }
400 Mutex m(&cpMutex);
401 UnicodeSet *set = sets[property];
402 if (set == nullptr) {
403 sets[property] = set = makeSet(property, errorCode);
404 }
405 return set;
406 }
407
408 U_NAMESPACE_END
409
410 U_NAMESPACE_USE
411
412 U_CAPI const USet * U_EXPORT2
u_getBinaryPropertySet(UProperty property,UErrorCode * pErrorCode)413 u_getBinaryPropertySet(UProperty property, UErrorCode *pErrorCode) {
414 const UnicodeSet *set = CharacterProperties::getBinaryPropertySet(property, *pErrorCode);
415 return U_SUCCESS(*pErrorCode) ? set->toUSet() : nullptr;
416 }
417
418 U_CAPI const UCPMap * U_EXPORT2
u_getIntPropertyMap(UProperty property,UErrorCode * pErrorCode)419 u_getIntPropertyMap(UProperty property, UErrorCode *pErrorCode) {
420 if (U_FAILURE(*pErrorCode)) { return nullptr; }
421 if (property < UCHAR_INT_START || UCHAR_INT_LIMIT <= property) {
422 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
423 return nullptr;
424 }
425 Mutex m(&cpMutex);
426 UCPMap *map = maps[property - UCHAR_INT_START];
427 if (map == nullptr) {
428 maps[property - UCHAR_INT_START] = map = makeMap(property, *pErrorCode);
429 }
430 return map;
431 }
432