1 // © 2018 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3
4 // characterproperties.cpp
5 // created: 2018sep03 Markus W. Scherer
6
7 #include "unicode/utypes.h"
8 #include "unicode/localpointer.h"
9 #include "unicode/uchar.h"
10 #include "unicode/ucpmap.h"
11 #include "unicode/ucptrie.h"
12 #include "unicode/umutablecptrie.h"
13 #include "unicode/uniset.h"
14 #include "unicode/uscript.h"
15 #include "unicode/uset.h"
16 #include "cmemory.h"
17 #include "mutex.h"
18 #include "normalizer2impl.h"
19 #include "uassert.h"
20 #include "ubidi_props.h"
21 #include "ucase.h"
22 #include "ucln_cmn.h"
23 #include "umutex.h"
24 #include "uprops.h"
25
26 using icu::LocalPointer;
27 using icu::Normalizer2Factory;
28 using icu::Normalizer2Impl;
29 using icu::UInitOnce;
30 using icu::UnicodeSet;
31
32 namespace {
33
34 UBool U_CALLCONV characterproperties_cleanup();
35
36 constexpr int32_t NUM_INCLUSIONS = UPROPS_SRC_COUNT + UCHAR_INT_LIMIT - UCHAR_INT_START;
37
38 struct Inclusion {
39 UnicodeSet *fSet;
40 UInitOnce fInitOnce;
41 };
42 Inclusion gInclusions[NUM_INCLUSIONS]; // cached getInclusions()
43
44 UnicodeSet *sets[UCHAR_BINARY_LIMIT] = {};
45
46 UCPMap *maps[UCHAR_INT_LIMIT - UCHAR_INT_START] = {};
47
48 UMutex cpMutex = U_MUTEX_INITIALIZER;
49
50 //----------------------------------------------------------------
51 // Inclusions list
52 //----------------------------------------------------------------
53
54 // USetAdder implementation
55 // Does not use uset.h to reduce code dependencies
56 void U_CALLCONV
_set_add(USet * set,UChar32 c)57 _set_add(USet *set, UChar32 c) {
58 ((UnicodeSet *)set)->add(c);
59 }
60
61 void U_CALLCONV
_set_addRange(USet * set,UChar32 start,UChar32 end)62 _set_addRange(USet *set, UChar32 start, UChar32 end) {
63 ((UnicodeSet *)set)->add(start, end);
64 }
65
66 void U_CALLCONV
_set_addString(USet * set,const UChar * str,int32_t length)67 _set_addString(USet *set, const UChar *str, int32_t length) {
68 ((UnicodeSet *)set)->add(icu::UnicodeString((UBool)(length<0), str, length));
69 }
70
characterproperties_cleanup()71 UBool U_CALLCONV characterproperties_cleanup() {
72 for (Inclusion &in: gInclusions) {
73 delete in.fSet;
74 in.fSet = nullptr;
75 in.fInitOnce.reset();
76 }
77 for (int32_t i = 0; i < UPRV_LENGTHOF(sets); ++i) {
78 delete sets[i];
79 sets[i] = nullptr;
80 }
81 for (int32_t i = 0; i < UPRV_LENGTHOF(maps); ++i) {
82 ucptrie_close(reinterpret_cast<UCPTrie *>(maps[i]));
83 maps[i] = nullptr;
84 }
85 return TRUE;
86 }
87
initInclusion(UPropertySource src,UErrorCode & errorCode)88 void U_CALLCONV initInclusion(UPropertySource src, UErrorCode &errorCode) {
89 // This function is invoked only via umtx_initOnce().
90 U_ASSERT(0 <= src && src < UPROPS_SRC_COUNT);
91 if (src == UPROPS_SRC_NONE) {
92 errorCode = U_INTERNAL_PROGRAM_ERROR;
93 return;
94 }
95 U_ASSERT(gInclusions[src].fSet == nullptr);
96
97 LocalPointer<UnicodeSet> incl(new UnicodeSet());
98 if (incl.isNull()) {
99 errorCode = U_MEMORY_ALLOCATION_ERROR;
100 return;
101 }
102 USetAdder sa = {
103 (USet *)incl.getAlias(),
104 _set_add,
105 _set_addRange,
106 _set_addString,
107 nullptr, // don't need remove()
108 nullptr // don't need removeRange()
109 };
110
111 switch(src) {
112 case UPROPS_SRC_CHAR:
113 uchar_addPropertyStarts(&sa, &errorCode);
114 break;
115 case UPROPS_SRC_PROPSVEC:
116 upropsvec_addPropertyStarts(&sa, &errorCode);
117 break;
118 case UPROPS_SRC_CHAR_AND_PROPSVEC:
119 uchar_addPropertyStarts(&sa, &errorCode);
120 upropsvec_addPropertyStarts(&sa, &errorCode);
121 break;
122 #if !UCONFIG_NO_NORMALIZATION
123 case UPROPS_SRC_CASE_AND_NORM: {
124 const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode);
125 if(U_SUCCESS(errorCode)) {
126 impl->addPropertyStarts(&sa, errorCode);
127 }
128 ucase_addPropertyStarts(&sa, &errorCode);
129 break;
130 }
131 case UPROPS_SRC_NFC: {
132 const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode);
133 if(U_SUCCESS(errorCode)) {
134 impl->addPropertyStarts(&sa, errorCode);
135 }
136 break;
137 }
138 case UPROPS_SRC_NFKC: {
139 const Normalizer2Impl *impl=Normalizer2Factory::getNFKCImpl(errorCode);
140 if(U_SUCCESS(errorCode)) {
141 impl->addPropertyStarts(&sa, errorCode);
142 }
143 break;
144 }
145 case UPROPS_SRC_NFKC_CF: {
146 const Normalizer2Impl *impl=Normalizer2Factory::getNFKC_CFImpl(errorCode);
147 if(U_SUCCESS(errorCode)) {
148 impl->addPropertyStarts(&sa, errorCode);
149 }
150 break;
151 }
152 case UPROPS_SRC_NFC_CANON_ITER: {
153 const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode);
154 if(U_SUCCESS(errorCode)) {
155 impl->addCanonIterPropertyStarts(&sa, errorCode);
156 }
157 break;
158 }
159 #endif
160 case UPROPS_SRC_CASE:
161 ucase_addPropertyStarts(&sa, &errorCode);
162 break;
163 case UPROPS_SRC_BIDI:
164 ubidi_addPropertyStarts(&sa, &errorCode);
165 break;
166 case UPROPS_SRC_INPC:
167 case UPROPS_SRC_INSC:
168 case UPROPS_SRC_VO:
169 uprops_addPropertyStarts((UPropertySource)src, &sa, &errorCode);
170 break;
171 default:
172 errorCode = U_INTERNAL_PROGRAM_ERROR;
173 break;
174 }
175
176 if (U_FAILURE(errorCode)) {
177 return;
178 }
179 if (incl->isBogus()) {
180 errorCode = U_MEMORY_ALLOCATION_ERROR;
181 return;
182 }
183 // Compact for caching.
184 incl->compact();
185 gInclusions[src].fSet = incl.orphan();
186 ucln_common_registerCleanup(UCLN_COMMON_CHARACTERPROPERTIES, characterproperties_cleanup);
187 }
188
getInclusionsForSource(UPropertySource src,UErrorCode & errorCode)189 const UnicodeSet *getInclusionsForSource(UPropertySource src, UErrorCode &errorCode) {
190 if (U_FAILURE(errorCode)) { return nullptr; }
191 if (src < 0 || UPROPS_SRC_COUNT <= src) {
192 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
193 return nullptr;
194 }
195 Inclusion &i = gInclusions[src];
196 umtx_initOnce(i.fInitOnce, &initInclusion, src, errorCode);
197 return i.fSet;
198 }
199
initIntPropInclusion(UProperty prop,UErrorCode & errorCode)200 void U_CALLCONV initIntPropInclusion(UProperty prop, UErrorCode &errorCode) {
201 // This function is invoked only via umtx_initOnce().
202 U_ASSERT(UCHAR_INT_START <= prop && prop < UCHAR_INT_LIMIT);
203 int32_t inclIndex = UPROPS_SRC_COUNT + prop - UCHAR_INT_START;
204 U_ASSERT(gInclusions[inclIndex].fSet == nullptr);
205 UPropertySource src = uprops_getSource(prop);
206 const UnicodeSet *incl = getInclusionsForSource(src, errorCode);
207 if (U_FAILURE(errorCode)) {
208 return;
209 }
210
211 LocalPointer<UnicodeSet> intPropIncl(new UnicodeSet(0, 0));
212 if (intPropIncl.isNull()) {
213 errorCode = U_MEMORY_ALLOCATION_ERROR;
214 return;
215 }
216 int32_t numRanges = incl->getRangeCount();
217 int32_t prevValue = 0;
218 for (int32_t i = 0; i < numRanges; ++i) {
219 UChar32 rangeEnd = incl->getRangeEnd(i);
220 for (UChar32 c = incl->getRangeStart(i); c <= rangeEnd; ++c) {
221 // TODO: Get a UCharacterProperty.IntProperty to avoid the property dispatch.
222 int32_t value = u_getIntPropertyValue(c, prop);
223 if (value != prevValue) {
224 intPropIncl->add(c);
225 prevValue = value;
226 }
227 }
228 }
229
230 if (intPropIncl->isBogus()) {
231 errorCode = U_MEMORY_ALLOCATION_ERROR;
232 return;
233 }
234 // Compact for caching.
235 intPropIncl->compact();
236 gInclusions[inclIndex].fSet = intPropIncl.orphan();
237 ucln_common_registerCleanup(UCLN_COMMON_CHARACTERPROPERTIES, characterproperties_cleanup);
238 }
239
240 } // namespace
241
242 U_NAMESPACE_BEGIN
243
getInclusionsForProperty(UProperty prop,UErrorCode & errorCode)244 const UnicodeSet *CharacterProperties::getInclusionsForProperty(
245 UProperty prop, UErrorCode &errorCode) {
246 if (U_FAILURE(errorCode)) { return nullptr; }
247 if (UCHAR_INT_START <= prop && prop < UCHAR_INT_LIMIT) {
248 int32_t inclIndex = UPROPS_SRC_COUNT + prop - UCHAR_INT_START;
249 Inclusion &i = gInclusions[inclIndex];
250 umtx_initOnce(i.fInitOnce, &initIntPropInclusion, prop, errorCode);
251 return i.fSet;
252 } else {
253 UPropertySource src = uprops_getSource(prop);
254 return getInclusionsForSource(src, errorCode);
255 }
256 }
257
258 U_NAMESPACE_END
259
260 namespace {
261
makeSet(UProperty property,UErrorCode & errorCode)262 UnicodeSet *makeSet(UProperty property, UErrorCode &errorCode) {
263 if (U_FAILURE(errorCode)) { return nullptr; }
264 LocalPointer<UnicodeSet> set(new UnicodeSet());
265 if (set.isNull()) {
266 errorCode = U_MEMORY_ALLOCATION_ERROR;
267 return nullptr;
268 }
269 const UnicodeSet *inclusions =
270 icu::CharacterProperties::getInclusionsForProperty(property, errorCode);
271 if (U_FAILURE(errorCode)) { return nullptr; }
272 int32_t numRanges = inclusions->getRangeCount();
273 UChar32 startHasProperty = -1;
274
275 for (int32_t i = 0; i < numRanges; ++i) {
276 UChar32 rangeEnd = inclusions->getRangeEnd(i);
277 for (UChar32 c = inclusions->getRangeStart(i); c <= rangeEnd; ++c) {
278 // TODO: Get a UCharacterProperty.BinaryProperty to avoid the property dispatch.
279 if (u_hasBinaryProperty(c, property)) {
280 if (startHasProperty < 0) {
281 // Transition from false to true.
282 startHasProperty = c;
283 }
284 } else if (startHasProperty >= 0) {
285 // Transition from true to false.
286 set->add(startHasProperty, c - 1);
287 startHasProperty = -1;
288 }
289 }
290 }
291 if (startHasProperty >= 0) {
292 set->add(startHasProperty, 0x10FFFF);
293 }
294 set->freeze();
295 return set.orphan();
296 }
297
makeMap(UProperty property,UErrorCode & errorCode)298 UCPMap *makeMap(UProperty property, UErrorCode &errorCode) {
299 if (U_FAILURE(errorCode)) { return nullptr; }
300 uint32_t nullValue = property == UCHAR_SCRIPT ? USCRIPT_UNKNOWN : 0;
301 icu::LocalUMutableCPTriePointer mutableTrie(
302 umutablecptrie_open(nullValue, nullValue, &errorCode));
303 const UnicodeSet *inclusions =
304 icu::CharacterProperties::getInclusionsForProperty(property, errorCode);
305 if (U_FAILURE(errorCode)) { return nullptr; }
306 int32_t numRanges = inclusions->getRangeCount();
307 UChar32 start = 0;
308 uint32_t value = nullValue;
309
310 for (int32_t i = 0; i < numRanges; ++i) {
311 UChar32 rangeEnd = inclusions->getRangeEnd(i);
312 for (UChar32 c = inclusions->getRangeStart(i); c <= rangeEnd; ++c) {
313 // TODO: Get a UCharacterProperty.IntProperty to avoid the property dispatch.
314 uint32_t nextValue = u_getIntPropertyValue(c, property);
315 if (value != nextValue) {
316 if (value != nullValue) {
317 umutablecptrie_setRange(mutableTrie.getAlias(), start, c - 1, value, &errorCode);
318 }
319 start = c;
320 value = nextValue;
321 }
322 }
323 }
324 if (value != 0) {
325 umutablecptrie_setRange(mutableTrie.getAlias(), start, 0x10FFFF, value, &errorCode);
326 }
327
328 UCPTrieType type;
329 if (property == UCHAR_BIDI_CLASS || property == UCHAR_GENERAL_CATEGORY) {
330 type = UCPTRIE_TYPE_FAST;
331 } else {
332 type = UCPTRIE_TYPE_SMALL;
333 }
334 UCPTrieValueWidth valueWidth;
335 // TODO: UCharacterProperty.IntProperty
336 int32_t max = u_getIntPropertyMaxValue(property);
337 if (max <= 0xff) {
338 valueWidth = UCPTRIE_VALUE_BITS_8;
339 } else if (max <= 0xffff) {
340 valueWidth = UCPTRIE_VALUE_BITS_16;
341 } else {
342 valueWidth = UCPTRIE_VALUE_BITS_32;
343 }
344 return reinterpret_cast<UCPMap *>(
345 umutablecptrie_buildImmutable(mutableTrie.getAlias(), type, valueWidth, &errorCode));
346 }
347
348 } // namespace
349
350 U_NAMESPACE_USE
351
352 U_CAPI const USet * U_EXPORT2
u_getBinaryPropertySet(UProperty property,UErrorCode * pErrorCode)353 u_getBinaryPropertySet(UProperty property, UErrorCode *pErrorCode) {
354 if (U_FAILURE(*pErrorCode)) { return nullptr; }
355 if (property < 0 || UCHAR_BINARY_LIMIT <= property) {
356 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
357 return nullptr;
358 }
359 Mutex m(&cpMutex);
360 UnicodeSet *set = sets[property];
361 if (set == nullptr) {
362 sets[property] = set = makeSet(property, *pErrorCode);
363 }
364 if (U_FAILURE(*pErrorCode)) { return nullptr; }
365 return set->toUSet();
366 }
367
368 U_CAPI const UCPMap * U_EXPORT2
u_getIntPropertyMap(UProperty property,UErrorCode * pErrorCode)369 u_getIntPropertyMap(UProperty property, UErrorCode *pErrorCode) {
370 if (U_FAILURE(*pErrorCode)) { return nullptr; }
371 if (property < UCHAR_INT_START || UCHAR_INT_LIMIT <= property) {
372 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
373 return nullptr;
374 }
375 Mutex m(&cpMutex);
376 UCPMap *map = maps[property - UCHAR_INT_START];
377 if (map == nullptr) {
378 maps[property - UCHAR_INT_START] = map = makeMap(property, *pErrorCode);
379 }
380 return map;
381 }
382