• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2018 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 
4 // characterproperties.cpp
5 // created: 2018sep03 Markus W. Scherer
6 
7 #include "unicode/utypes.h"
8 #include "unicode/localpointer.h"
9 #include "unicode/uchar.h"
10 #include "unicode/ucpmap.h"
11 #include "unicode/ucptrie.h"
12 #include "unicode/umutablecptrie.h"
13 #include "unicode/uniset.h"
14 #include "unicode/uscript.h"
15 #include "unicode/uset.h"
16 #include "cmemory.h"
17 #include "emojiprops.h"
18 #include "mutex.h"
19 #include "normalizer2impl.h"
20 #include "uassert.h"
21 #include "ubidi_props.h"
22 #include "ucase.h"
23 #include "ucln_cmn.h"
24 #include "umutex.h"
25 #include "uprops.h"
26 
27 using icu::LocalPointer;
28 #if !UCONFIG_NO_NORMALIZATION
29 using icu::Normalizer2Factory;
30 using icu::Normalizer2Impl;
31 #endif
32 using icu::UInitOnce;
33 using icu::UnicodeSet;
34 
35 namespace {
36 
37 UBool U_CALLCONV characterproperties_cleanup();
38 
39 constexpr int32_t NUM_INCLUSIONS = UPROPS_SRC_COUNT + (UCHAR_INT_LIMIT - UCHAR_INT_START);
40 
41 struct Inclusion {
42     UnicodeSet  *fSet = nullptr;
43     UInitOnce    fInitOnce {};
44 };
45 Inclusion gInclusions[NUM_INCLUSIONS]; // cached getInclusions()
46 
47 UnicodeSet *sets[UCHAR_BINARY_LIMIT] = {};
48 
49 UCPMap *maps[UCHAR_INT_LIMIT - UCHAR_INT_START] = {};
50 
51 icu::UMutex cpMutex;
52 
53 //----------------------------------------------------------------
54 // Inclusions list
55 //----------------------------------------------------------------
56 
57 // USetAdder implementation
58 // Does not use uset.h to reduce code dependencies
59 void U_CALLCONV
_set_add(USet * set,UChar32 c)60 _set_add(USet *set, UChar32 c) {
61     reinterpret_cast<UnicodeSet*>(set)->add(c);
62 }
63 
64 void U_CALLCONV
_set_addRange(USet * set,UChar32 start,UChar32 end)65 _set_addRange(USet *set, UChar32 start, UChar32 end) {
66     reinterpret_cast<UnicodeSet*>(set)->add(start, end);
67 }
68 
69 void U_CALLCONV
_set_addString(USet * set,const char16_t * str,int32_t length)70 _set_addString(USet *set, const char16_t *str, int32_t length) {
71     reinterpret_cast<UnicodeSet*>(set)->add(icu::UnicodeString(static_cast<UBool>(length < 0), str, length));
72 }
73 
characterproperties_cleanup()74 UBool U_CALLCONV characterproperties_cleanup() {
75     for (Inclusion &in: gInclusions) {
76         delete in.fSet;
77         in.fSet = nullptr;
78         in.fInitOnce.reset();
79     }
80     for (int32_t i = 0; i < UPRV_LENGTHOF(sets); ++i) {
81         delete sets[i];
82         sets[i] = nullptr;
83     }
84     for (int32_t i = 0; i < UPRV_LENGTHOF(maps); ++i) {
85         ucptrie_close(reinterpret_cast<UCPTrie *>(maps[i]));
86         maps[i] = nullptr;
87     }
88     return true;
89 }
90 
initInclusion(UPropertySource src,UErrorCode & errorCode)91 void U_CALLCONV initInclusion(UPropertySource src, UErrorCode &errorCode) {
92     // This function is invoked only via umtx_initOnce().
93     U_ASSERT(0 <= src && src < UPROPS_SRC_COUNT);
94     if (src == UPROPS_SRC_NONE) {
95         errorCode = U_INTERNAL_PROGRAM_ERROR;
96         return;
97     }
98     U_ASSERT(gInclusions[src].fSet == nullptr);
99 
100     LocalPointer<UnicodeSet> incl(new UnicodeSet());
101     if (incl.isNull()) {
102         errorCode = U_MEMORY_ALLOCATION_ERROR;
103         return;
104     }
105     USetAdder sa = {
106         reinterpret_cast<USet*>(incl.getAlias()),
107         _set_add,
108         _set_addRange,
109         _set_addString,
110         nullptr, // don't need remove()
111         nullptr // don't need removeRange()
112     };
113 
114     switch(src) {
115     case UPROPS_SRC_CHAR:
116         uchar_addPropertyStarts(&sa, &errorCode);
117         break;
118     case UPROPS_SRC_PROPSVEC:
119         upropsvec_addPropertyStarts(&sa, &errorCode);
120         break;
121     case UPROPS_SRC_CHAR_AND_PROPSVEC:
122         uchar_addPropertyStarts(&sa, &errorCode);
123         upropsvec_addPropertyStarts(&sa, &errorCode);
124         break;
125 #if !UCONFIG_NO_NORMALIZATION
126     case UPROPS_SRC_CASE_AND_NORM: {
127         const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode);
128         if(U_SUCCESS(errorCode)) {
129             impl->addPropertyStarts(&sa, errorCode);
130         }
131         ucase_addPropertyStarts(&sa, &errorCode);
132         break;
133     }
134     case UPROPS_SRC_NFC: {
135         const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode);
136         if(U_SUCCESS(errorCode)) {
137             impl->addPropertyStarts(&sa, errorCode);
138         }
139         break;
140     }
141     case UPROPS_SRC_NFKC: {
142         const Normalizer2Impl *impl=Normalizer2Factory::getNFKCImpl(errorCode);
143         if(U_SUCCESS(errorCode)) {
144             impl->addPropertyStarts(&sa, errorCode);
145         }
146         break;
147     }
148     case UPROPS_SRC_NFKC_CF: {
149         const Normalizer2Impl *impl=Normalizer2Factory::getNFKC_CFImpl(errorCode);
150         if(U_SUCCESS(errorCode)) {
151             impl->addPropertyStarts(&sa, errorCode);
152         }
153         break;
154     }
155     case UPROPS_SRC_NFC_CANON_ITER: {
156         const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode);
157         if(U_SUCCESS(errorCode)) {
158             impl->addCanonIterPropertyStarts(&sa, errorCode);
159         }
160         break;
161     }
162 #endif
163     case UPROPS_SRC_CASE:
164         ucase_addPropertyStarts(&sa, &errorCode);
165         break;
166     case UPROPS_SRC_BIDI:
167         ubidi_addPropertyStarts(&sa, &errorCode);
168         break;
169     case UPROPS_SRC_INPC:
170     case UPROPS_SRC_INSC:
171     case UPROPS_SRC_VO:
172         uprops_addPropertyStarts(src, &sa, &errorCode);
173         break;
174     case UPROPS_SRC_EMOJI: {
175         const icu::EmojiProps *ep = icu::EmojiProps::getSingleton(errorCode);
176         if (U_SUCCESS(errorCode)) {
177             ep->addPropertyStarts(&sa, errorCode);
178         }
179         break;
180     }
181     case UPROPS_SRC_IDSU:
182         // New in Unicode 15.1 for just two characters.
183         sa.add(sa.set, 0x2FFE);
184         sa.add(sa.set, 0x2FFF + 1);
185         break;
186     case UPROPS_SRC_ID_COMPAT_MATH:
187     case UPROPS_SRC_MCM:
188         uprops_addPropertyStarts(src, &sa, &errorCode);
189         break;
190     case UPROPS_SRC_BLOCK:
191         ublock_addPropertyStarts(&sa, errorCode);
192         break;
193     default:
194         errorCode = U_INTERNAL_PROGRAM_ERROR;
195         break;
196     }
197 
198     if (U_FAILURE(errorCode)) {
199         return;
200     }
201     if (incl->isBogus()) {
202         errorCode = U_MEMORY_ALLOCATION_ERROR;
203         return;
204     }
205     // Compact for caching.
206     incl->compact();
207     gInclusions[src].fSet = incl.orphan();
208     ucln_common_registerCleanup(UCLN_COMMON_CHARACTERPROPERTIES, characterproperties_cleanup);
209 }
210 
getInclusionsForSource(UPropertySource src,UErrorCode & errorCode)211 const UnicodeSet *getInclusionsForSource(UPropertySource src, UErrorCode &errorCode) {
212     if (U_FAILURE(errorCode)) { return nullptr; }
213     if (src < 0 || UPROPS_SRC_COUNT <= src) {
214         errorCode = U_ILLEGAL_ARGUMENT_ERROR;
215         return nullptr;
216     }
217     Inclusion &i = gInclusions[src];
218     umtx_initOnce(i.fInitOnce, &initInclusion, src, errorCode);
219     return i.fSet;
220 }
221 
initIntPropInclusion(UProperty prop,UErrorCode & errorCode)222 void U_CALLCONV initIntPropInclusion(UProperty prop, UErrorCode &errorCode) {
223     // This function is invoked only via umtx_initOnce().
224     U_ASSERT(UCHAR_INT_START <= prop && prop < UCHAR_INT_LIMIT);
225     int32_t inclIndex = UPROPS_SRC_COUNT + (prop - UCHAR_INT_START);
226     U_ASSERT(gInclusions[inclIndex].fSet == nullptr);
227     UPropertySource src = uprops_getSource(prop);
228     const UnicodeSet *incl = getInclusionsForSource(src, errorCode);
229     if (U_FAILURE(errorCode)) {
230         return;
231     }
232 
233     LocalPointer<UnicodeSet> intPropIncl(new UnicodeSet(0, 0));
234     if (intPropIncl.isNull()) {
235         errorCode = U_MEMORY_ALLOCATION_ERROR;
236         return;
237     }
238     int32_t numRanges = incl->getRangeCount();
239     int32_t prevValue = 0;
240     for (int32_t i = 0; i < numRanges; ++i) {
241         UChar32 rangeEnd = incl->getRangeEnd(i);
242         for (UChar32 c = incl->getRangeStart(i); c <= rangeEnd; ++c) {
243             // TODO: Get a UCharacterProperty.IntProperty to avoid the property dispatch.
244             int32_t value = u_getIntPropertyValue(c, prop);
245             if (value != prevValue) {
246                 intPropIncl->add(c);
247                 prevValue = value;
248             }
249         }
250     }
251 
252     if (intPropIncl->isBogus()) {
253         errorCode = U_MEMORY_ALLOCATION_ERROR;
254         return;
255     }
256     // Compact for caching.
257     intPropIncl->compact();
258     gInclusions[inclIndex].fSet = intPropIncl.orphan();
259     ucln_common_registerCleanup(UCLN_COMMON_CHARACTERPROPERTIES, characterproperties_cleanup);
260 }
261 
262 }  // namespace
263 
264 U_NAMESPACE_BEGIN
265 
getInclusionsForProperty(UProperty prop,UErrorCode & errorCode)266 const UnicodeSet *CharacterProperties::getInclusionsForProperty(
267         UProperty prop, UErrorCode &errorCode) {
268     if (U_FAILURE(errorCode)) { return nullptr; }
269     if (UCHAR_INT_START <= prop && prop < UCHAR_INT_LIMIT) {
270         int32_t inclIndex = UPROPS_SRC_COUNT + (prop - UCHAR_INT_START);
271         Inclusion &i = gInclusions[inclIndex];
272         umtx_initOnce(i.fInitOnce, &initIntPropInclusion, prop, errorCode);
273         return i.fSet;
274     } else {
275         UPropertySource src = uprops_getSource(prop);
276         return getInclusionsForSource(src, errorCode);
277     }
278 }
279 
280 U_NAMESPACE_END
281 
282 namespace {
283 
makeSet(UProperty property,UErrorCode & errorCode)284 UnicodeSet *makeSet(UProperty property, UErrorCode &errorCode) {
285     if (U_FAILURE(errorCode)) { return nullptr; }
286     LocalPointer<UnicodeSet> set(new UnicodeSet());
287     if (set.isNull()) {
288         errorCode = U_MEMORY_ALLOCATION_ERROR;
289         return nullptr;
290     }
291     if (UCHAR_BASIC_EMOJI <= property && property <= UCHAR_RGI_EMOJI) {
292         // property of strings
293         const icu::EmojiProps *ep = icu::EmojiProps::getSingleton(errorCode);
294         if (U_FAILURE(errorCode)) { return nullptr; }
295         USetAdder sa = {
296             reinterpret_cast<USet*>(set.getAlias()),
297             _set_add,
298             _set_addRange,
299             _set_addString,
300             nullptr, // don't need remove()
301             nullptr // don't need removeRange()
302         };
303         ep->addStrings(&sa, property, errorCode);
304         if (property != UCHAR_BASIC_EMOJI && property != UCHAR_RGI_EMOJI) {
305             // property of _only_ strings
306             set->freeze();
307             return set.orphan();
308         }
309     }
310 
311     const UnicodeSet *inclusions =
312         icu::CharacterProperties::getInclusionsForProperty(property, errorCode);
313     if (U_FAILURE(errorCode)) { return nullptr; }
314     int32_t numRanges = inclusions->getRangeCount();
315     UChar32 startHasProperty = -1;
316 
317     for (int32_t i = 0; i < numRanges; ++i) {
318         UChar32 rangeEnd = inclusions->getRangeEnd(i);
319         for (UChar32 c = inclusions->getRangeStart(i); c <= rangeEnd; ++c) {
320             // TODO: Get a UCharacterProperty.BinaryProperty to avoid the property dispatch.
321             if (u_hasBinaryProperty(c, property)) {
322                 if (startHasProperty < 0) {
323                     // Transition from false to true.
324                     startHasProperty = c;
325                 }
326             } else if (startHasProperty >= 0) {
327                 // Transition from true to false.
328                 set->add(startHasProperty, c - 1);
329                 startHasProperty = -1;
330             }
331         }
332     }
333     if (startHasProperty >= 0) {
334         set->add(startHasProperty, 0x10FFFF);
335     }
336     set->freeze();
337     return set.orphan();
338 }
339 
makeMap(UProperty property,UErrorCode & errorCode)340 UCPMap *makeMap(UProperty property, UErrorCode &errorCode) {
341     if (U_FAILURE(errorCode)) { return nullptr; }
342     uint32_t nullValue = property == UCHAR_SCRIPT ? USCRIPT_UNKNOWN : 0;
343     icu::LocalUMutableCPTriePointer mutableTrie(
344         umutablecptrie_open(nullValue, nullValue, &errorCode));
345     const UnicodeSet *inclusions =
346         icu::CharacterProperties::getInclusionsForProperty(property, errorCode);
347     if (U_FAILURE(errorCode)) { return nullptr; }
348     int32_t numRanges = inclusions->getRangeCount();
349     UChar32 start = 0;
350     uint32_t value = nullValue;
351 
352     for (int32_t i = 0; i < numRanges; ++i) {
353         UChar32 rangeEnd = inclusions->getRangeEnd(i);
354         for (UChar32 c = inclusions->getRangeStart(i); c <= rangeEnd; ++c) {
355             // TODO: Get a UCharacterProperty.IntProperty to avoid the property dispatch.
356             uint32_t nextValue = u_getIntPropertyValue(c, property);
357             if (value != nextValue) {
358                 if (value != nullValue) {
359                     umutablecptrie_setRange(mutableTrie.getAlias(), start, c - 1, value, &errorCode);
360                 }
361                 start = c;
362                 value = nextValue;
363             }
364         }
365     }
366     if (value != 0) {
367         umutablecptrie_setRange(mutableTrie.getAlias(), start, 0x10FFFF, value, &errorCode);
368     }
369 
370     UCPTrieType type;
371     if (property == UCHAR_BIDI_CLASS || property == UCHAR_GENERAL_CATEGORY) {
372         type = UCPTRIE_TYPE_FAST;
373     } else {
374         type = UCPTRIE_TYPE_SMALL;
375     }
376     UCPTrieValueWidth valueWidth;
377     // TODO: UCharacterProperty.IntProperty
378     int32_t max = u_getIntPropertyMaxValue(property);
379     if (max <= 0xff) {
380         valueWidth = UCPTRIE_VALUE_BITS_8;
381     } else if (max <= 0xffff) {
382         valueWidth = UCPTRIE_VALUE_BITS_16;
383     } else {
384         valueWidth = UCPTRIE_VALUE_BITS_32;
385     }
386     return reinterpret_cast<UCPMap *>(
387         umutablecptrie_buildImmutable(mutableTrie.getAlias(), type, valueWidth, &errorCode));
388 }
389 
390 }  // namespace
391 
392 U_NAMESPACE_BEGIN
393 
getBinaryPropertySet(UProperty property,UErrorCode & errorCode)394 const UnicodeSet *CharacterProperties::getBinaryPropertySet(UProperty property, UErrorCode &errorCode) {
395     if (U_FAILURE(errorCode)) { return nullptr; }
396     if (property < 0 || UCHAR_BINARY_LIMIT <= property) {
397         errorCode = U_ILLEGAL_ARGUMENT_ERROR;
398         return nullptr;
399     }
400     Mutex m(&cpMutex);
401     UnicodeSet *set = sets[property];
402     if (set == nullptr) {
403         sets[property] = set = makeSet(property, errorCode);
404     }
405     return set;
406 }
407 
408 U_NAMESPACE_END
409 
410 U_NAMESPACE_USE
411 
412 U_CAPI const USet * U_EXPORT2
u_getBinaryPropertySet(UProperty property,UErrorCode * pErrorCode)413 u_getBinaryPropertySet(UProperty property, UErrorCode *pErrorCode) {
414     const UnicodeSet *set = CharacterProperties::getBinaryPropertySet(property, *pErrorCode);
415     return U_SUCCESS(*pErrorCode) ? set->toUSet() : nullptr;
416 }
417 
418 U_CAPI const UCPMap * U_EXPORT2
u_getIntPropertyMap(UProperty property,UErrorCode * pErrorCode)419 u_getIntPropertyMap(UProperty property, UErrorCode *pErrorCode) {
420     if (U_FAILURE(*pErrorCode)) { return nullptr; }
421     if (property < UCHAR_INT_START || UCHAR_INT_LIMIT <= property) {
422         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
423         return nullptr;
424     }
425     Mutex m(&cpMutex);
426     UCPMap *map = maps[property - UCHAR_INT_START];
427     if (map == nullptr) {
428         maps[property - UCHAR_INT_START] = map = makeMap(property, *pErrorCode);
429     }
430     return map;
431 }
432