• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2018 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 
4 // characterproperties.cpp
5 // created: 2018sep03 Markus W. Scherer
6 
7 #include "unicode/utypes.h"
8 #include "unicode/localpointer.h"
9 #include "unicode/uchar.h"
10 #include "unicode/ucpmap.h"
11 #include "unicode/ucptrie.h"
12 #include "unicode/umutablecptrie.h"
13 #include "unicode/uniset.h"
14 #include "unicode/uscript.h"
15 #include "unicode/uset.h"
16 #include "cmemory.h"
17 #include "mutex.h"
18 #include "normalizer2impl.h"
19 #include "uassert.h"
20 #include "ubidi_props.h"
21 #include "ucase.h"
22 #include "ucln_cmn.h"
23 #include "umutex.h"
24 #include "uprops.h"
25 
26 using icu::LocalPointer;
27 using icu::Normalizer2Factory;
28 using icu::Normalizer2Impl;
29 using icu::UInitOnce;
30 using icu::UnicodeSet;
31 
32 namespace {
33 
34 UBool U_CALLCONV characterproperties_cleanup();
35 
36 constexpr int32_t NUM_INCLUSIONS = UPROPS_SRC_COUNT + UCHAR_INT_LIMIT - UCHAR_INT_START;
37 
38 struct Inclusion {
39     UnicodeSet  *fSet;
40     UInitOnce    fInitOnce;
41 };
42 Inclusion gInclusions[NUM_INCLUSIONS]; // cached getInclusions()
43 
44 UnicodeSet *sets[UCHAR_BINARY_LIMIT] = {};
45 
46 UCPMap *maps[UCHAR_INT_LIMIT - UCHAR_INT_START] = {};
47 
48 UMutex cpMutex = U_MUTEX_INITIALIZER;
49 
50 //----------------------------------------------------------------
51 // Inclusions list
52 //----------------------------------------------------------------
53 
54 // USetAdder implementation
55 // Does not use uset.h to reduce code dependencies
56 void U_CALLCONV
_set_add(USet * set,UChar32 c)57 _set_add(USet *set, UChar32 c) {
58     ((UnicodeSet *)set)->add(c);
59 }
60 
61 void U_CALLCONV
_set_addRange(USet * set,UChar32 start,UChar32 end)62 _set_addRange(USet *set, UChar32 start, UChar32 end) {
63     ((UnicodeSet *)set)->add(start, end);
64 }
65 
66 void U_CALLCONV
_set_addString(USet * set,const UChar * str,int32_t length)67 _set_addString(USet *set, const UChar *str, int32_t length) {
68     ((UnicodeSet *)set)->add(icu::UnicodeString((UBool)(length<0), str, length));
69 }
70 
characterproperties_cleanup()71 UBool U_CALLCONV characterproperties_cleanup() {
72     for (Inclusion &in: gInclusions) {
73         delete in.fSet;
74         in.fSet = nullptr;
75         in.fInitOnce.reset();
76     }
77     for (int32_t i = 0; i < UPRV_LENGTHOF(sets); ++i) {
78         delete sets[i];
79         sets[i] = nullptr;
80     }
81     for (int32_t i = 0; i < UPRV_LENGTHOF(maps); ++i) {
82         ucptrie_close(reinterpret_cast<UCPTrie *>(maps[i]));
83         maps[i] = nullptr;
84     }
85     return TRUE;
86 }
87 
initInclusion(UPropertySource src,UErrorCode & errorCode)88 void U_CALLCONV initInclusion(UPropertySource src, UErrorCode &errorCode) {
89     // This function is invoked only via umtx_initOnce().
90     U_ASSERT(0 <= src && src < UPROPS_SRC_COUNT);
91     if (src == UPROPS_SRC_NONE) {
92         errorCode = U_INTERNAL_PROGRAM_ERROR;
93         return;
94     }
95     U_ASSERT(gInclusions[src].fSet == nullptr);
96 
97     LocalPointer<UnicodeSet> incl(new UnicodeSet());
98     if (incl.isNull()) {
99         errorCode = U_MEMORY_ALLOCATION_ERROR;
100         return;
101     }
102     USetAdder sa = {
103         (USet *)incl.getAlias(),
104         _set_add,
105         _set_addRange,
106         _set_addString,
107         nullptr, // don't need remove()
108         nullptr // don't need removeRange()
109     };
110 
111     switch(src) {
112     case UPROPS_SRC_CHAR:
113         uchar_addPropertyStarts(&sa, &errorCode);
114         break;
115     case UPROPS_SRC_PROPSVEC:
116         upropsvec_addPropertyStarts(&sa, &errorCode);
117         break;
118     case UPROPS_SRC_CHAR_AND_PROPSVEC:
119         uchar_addPropertyStarts(&sa, &errorCode);
120         upropsvec_addPropertyStarts(&sa, &errorCode);
121         break;
122 #if !UCONFIG_NO_NORMALIZATION
123     case UPROPS_SRC_CASE_AND_NORM: {
124         const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode);
125         if(U_SUCCESS(errorCode)) {
126             impl->addPropertyStarts(&sa, errorCode);
127         }
128         ucase_addPropertyStarts(&sa, &errorCode);
129         break;
130     }
131     case UPROPS_SRC_NFC: {
132         const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode);
133         if(U_SUCCESS(errorCode)) {
134             impl->addPropertyStarts(&sa, errorCode);
135         }
136         break;
137     }
138     case UPROPS_SRC_NFKC: {
139         const Normalizer2Impl *impl=Normalizer2Factory::getNFKCImpl(errorCode);
140         if(U_SUCCESS(errorCode)) {
141             impl->addPropertyStarts(&sa, errorCode);
142         }
143         break;
144     }
145     case UPROPS_SRC_NFKC_CF: {
146         const Normalizer2Impl *impl=Normalizer2Factory::getNFKC_CFImpl(errorCode);
147         if(U_SUCCESS(errorCode)) {
148             impl->addPropertyStarts(&sa, errorCode);
149         }
150         break;
151     }
152     case UPROPS_SRC_NFC_CANON_ITER: {
153         const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode);
154         if(U_SUCCESS(errorCode)) {
155             impl->addCanonIterPropertyStarts(&sa, errorCode);
156         }
157         break;
158     }
159 #endif
160     case UPROPS_SRC_CASE:
161         ucase_addPropertyStarts(&sa, &errorCode);
162         break;
163     case UPROPS_SRC_BIDI:
164         ubidi_addPropertyStarts(&sa, &errorCode);
165         break;
166     case UPROPS_SRC_INPC:
167     case UPROPS_SRC_INSC:
168     case UPROPS_SRC_VO:
169         uprops_addPropertyStarts((UPropertySource)src, &sa, &errorCode);
170         break;
171     default:
172         errorCode = U_INTERNAL_PROGRAM_ERROR;
173         break;
174     }
175 
176     if (U_FAILURE(errorCode)) {
177         return;
178     }
179     if (incl->isBogus()) {
180         errorCode = U_MEMORY_ALLOCATION_ERROR;
181         return;
182     }
183     // Compact for caching.
184     incl->compact();
185     gInclusions[src].fSet = incl.orphan();
186     ucln_common_registerCleanup(UCLN_COMMON_CHARACTERPROPERTIES, characterproperties_cleanup);
187 }
188 
getInclusionsForSource(UPropertySource src,UErrorCode & errorCode)189 const UnicodeSet *getInclusionsForSource(UPropertySource src, UErrorCode &errorCode) {
190     if (U_FAILURE(errorCode)) { return nullptr; }
191     if (src < 0 || UPROPS_SRC_COUNT <= src) {
192         errorCode = U_ILLEGAL_ARGUMENT_ERROR;
193         return nullptr;
194     }
195     Inclusion &i = gInclusions[src];
196     umtx_initOnce(i.fInitOnce, &initInclusion, src, errorCode);
197     return i.fSet;
198 }
199 
initIntPropInclusion(UProperty prop,UErrorCode & errorCode)200 void U_CALLCONV initIntPropInclusion(UProperty prop, UErrorCode &errorCode) {
201     // This function is invoked only via umtx_initOnce().
202     U_ASSERT(UCHAR_INT_START <= prop && prop < UCHAR_INT_LIMIT);
203     int32_t inclIndex = UPROPS_SRC_COUNT + prop - UCHAR_INT_START;
204     U_ASSERT(gInclusions[inclIndex].fSet == nullptr);
205     UPropertySource src = uprops_getSource(prop);
206     const UnicodeSet *incl = getInclusionsForSource(src, errorCode);
207     if (U_FAILURE(errorCode)) {
208         return;
209     }
210 
211     LocalPointer<UnicodeSet> intPropIncl(new UnicodeSet(0, 0));
212     if (intPropIncl.isNull()) {
213         errorCode = U_MEMORY_ALLOCATION_ERROR;
214         return;
215     }
216     int32_t numRanges = incl->getRangeCount();
217     int32_t prevValue = 0;
218     for (int32_t i = 0; i < numRanges; ++i) {
219         UChar32 rangeEnd = incl->getRangeEnd(i);
220         for (UChar32 c = incl->getRangeStart(i); c <= rangeEnd; ++c) {
221             // TODO: Get a UCharacterProperty.IntProperty to avoid the property dispatch.
222             int32_t value = u_getIntPropertyValue(c, prop);
223             if (value != prevValue) {
224                 intPropIncl->add(c);
225                 prevValue = value;
226             }
227         }
228     }
229 
230     if (intPropIncl->isBogus()) {
231         errorCode = U_MEMORY_ALLOCATION_ERROR;
232         return;
233     }
234     // Compact for caching.
235     intPropIncl->compact();
236     gInclusions[inclIndex].fSet = intPropIncl.orphan();
237     ucln_common_registerCleanup(UCLN_COMMON_CHARACTERPROPERTIES, characterproperties_cleanup);
238 }
239 
240 }  // namespace
241 
242 U_NAMESPACE_BEGIN
243 
getInclusionsForProperty(UProperty prop,UErrorCode & errorCode)244 const UnicodeSet *CharacterProperties::getInclusionsForProperty(
245         UProperty prop, UErrorCode &errorCode) {
246     if (U_FAILURE(errorCode)) { return nullptr; }
247     if (UCHAR_INT_START <= prop && prop < UCHAR_INT_LIMIT) {
248         int32_t inclIndex = UPROPS_SRC_COUNT + prop - UCHAR_INT_START;
249         Inclusion &i = gInclusions[inclIndex];
250         umtx_initOnce(i.fInitOnce, &initIntPropInclusion, prop, errorCode);
251         return i.fSet;
252     } else {
253         UPropertySource src = uprops_getSource(prop);
254         return getInclusionsForSource(src, errorCode);
255     }
256 }
257 
258 U_NAMESPACE_END
259 
260 namespace {
261 
makeSet(UProperty property,UErrorCode & errorCode)262 UnicodeSet *makeSet(UProperty property, UErrorCode &errorCode) {
263     if (U_FAILURE(errorCode)) { return nullptr; }
264     LocalPointer<UnicodeSet> set(new UnicodeSet());
265     if (set.isNull()) {
266         errorCode = U_MEMORY_ALLOCATION_ERROR;
267         return nullptr;
268     }
269     const UnicodeSet *inclusions =
270         icu::CharacterProperties::getInclusionsForProperty(property, errorCode);
271     if (U_FAILURE(errorCode)) { return nullptr; }
272     int32_t numRanges = inclusions->getRangeCount();
273     UChar32 startHasProperty = -1;
274 
275     for (int32_t i = 0; i < numRanges; ++i) {
276         UChar32 rangeEnd = inclusions->getRangeEnd(i);
277         for (UChar32 c = inclusions->getRangeStart(i); c <= rangeEnd; ++c) {
278             // TODO: Get a UCharacterProperty.BinaryProperty to avoid the property dispatch.
279             if (u_hasBinaryProperty(c, property)) {
280                 if (startHasProperty < 0) {
281                     // Transition from false to true.
282                     startHasProperty = c;
283                 }
284             } else if (startHasProperty >= 0) {
285                 // Transition from true to false.
286                 set->add(startHasProperty, c - 1);
287                 startHasProperty = -1;
288             }
289         }
290     }
291     if (startHasProperty >= 0) {
292         set->add(startHasProperty, 0x10FFFF);
293     }
294     set->freeze();
295     return set.orphan();
296 }
297 
makeMap(UProperty property,UErrorCode & errorCode)298 UCPMap *makeMap(UProperty property, UErrorCode &errorCode) {
299     if (U_FAILURE(errorCode)) { return nullptr; }
300     uint32_t nullValue = property == UCHAR_SCRIPT ? USCRIPT_UNKNOWN : 0;
301     icu::LocalUMutableCPTriePointer mutableTrie(
302         umutablecptrie_open(nullValue, nullValue, &errorCode));
303     const UnicodeSet *inclusions =
304         icu::CharacterProperties::getInclusionsForProperty(property, errorCode);
305     if (U_FAILURE(errorCode)) { return nullptr; }
306     int32_t numRanges = inclusions->getRangeCount();
307     UChar32 start = 0;
308     uint32_t value = nullValue;
309 
310     for (int32_t i = 0; i < numRanges; ++i) {
311         UChar32 rangeEnd = inclusions->getRangeEnd(i);
312         for (UChar32 c = inclusions->getRangeStart(i); c <= rangeEnd; ++c) {
313             // TODO: Get a UCharacterProperty.IntProperty to avoid the property dispatch.
314             uint32_t nextValue = u_getIntPropertyValue(c, property);
315             if (value != nextValue) {
316                 if (value != nullValue) {
317                     umutablecptrie_setRange(mutableTrie.getAlias(), start, c - 1, value, &errorCode);
318                 }
319                 start = c;
320                 value = nextValue;
321             }
322         }
323     }
324     if (value != 0) {
325         umutablecptrie_setRange(mutableTrie.getAlias(), start, 0x10FFFF, value, &errorCode);
326     }
327 
328     UCPTrieType type;
329     if (property == UCHAR_BIDI_CLASS || property == UCHAR_GENERAL_CATEGORY) {
330         type = UCPTRIE_TYPE_FAST;
331     } else {
332         type = UCPTRIE_TYPE_SMALL;
333     }
334     UCPTrieValueWidth valueWidth;
335     // TODO: UCharacterProperty.IntProperty
336     int32_t max = u_getIntPropertyMaxValue(property);
337     if (max <= 0xff) {
338         valueWidth = UCPTRIE_VALUE_BITS_8;
339     } else if (max <= 0xffff) {
340         valueWidth = UCPTRIE_VALUE_BITS_16;
341     } else {
342         valueWidth = UCPTRIE_VALUE_BITS_32;
343     }
344     return reinterpret_cast<UCPMap *>(
345         umutablecptrie_buildImmutable(mutableTrie.getAlias(), type, valueWidth, &errorCode));
346 }
347 
348 }  // namespace
349 
350 U_NAMESPACE_USE
351 
352 U_CAPI const USet * U_EXPORT2
u_getBinaryPropertySet(UProperty property,UErrorCode * pErrorCode)353 u_getBinaryPropertySet(UProperty property, UErrorCode *pErrorCode) {
354     if (U_FAILURE(*pErrorCode)) { return nullptr; }
355     if (property < 0 || UCHAR_BINARY_LIMIT <= property) {
356         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
357         return nullptr;
358     }
359     Mutex m(&cpMutex);
360     UnicodeSet *set = sets[property];
361     if (set == nullptr) {
362         sets[property] = set = makeSet(property, *pErrorCode);
363     }
364     if (U_FAILURE(*pErrorCode)) { return nullptr; }
365     return set->toUSet();
366 }
367 
368 U_CAPI const UCPMap * U_EXPORT2
u_getIntPropertyMap(UProperty property,UErrorCode * pErrorCode)369 u_getIntPropertyMap(UProperty property, UErrorCode *pErrorCode) {
370     if (U_FAILURE(*pErrorCode)) { return nullptr; }
371     if (property < UCHAR_INT_START || UCHAR_INT_LIMIT <= property) {
372         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
373         return nullptr;
374     }
375     Mutex m(&cpMutex);
376     UCPMap *map = maps[property - UCHAR_INT_START];
377     if (map == nullptr) {
378         maps[property - UCHAR_INT_START] = map = makeMap(property, *pErrorCode);
379     }
380     return map;
381 }
382