1 /*
2 *******************************************************************************
3 *
4 * Copyright (C) 2001-2013, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *
7 *******************************************************************************
8 * file name: ucol_bld.cpp
9 * encoding: US-ASCII
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created 02/22/2001
14 * created by: Vladimir Weinstein
15 *
16 * This module builds a collator based on the rule set.
17 *
18 */
19
20 #include "unicode/utypes.h"
21
22 #if !UCONFIG_NO_COLLATION
23
24 #include "unicode/ucoleitr.h"
25 #include "unicode/udata.h"
26 #include "unicode/uchar.h"
27 #include "unicode/uniset.h"
28 #include "unicode/uscript.h"
29 #include "unicode/ustring.h"
30 #include "unicode/utf16.h"
31 #include "normalizer2impl.h"
32 #include "uassert.h"
33 #include "ucol_bld.h"
34 #include "ucol_elm.h"
35 #include "ucol_cnt.h"
36 #include "ucln_in.h"
37 #include "umutex.h"
38 #include "cmemory.h"
39 #include "cstring.h"
40
41 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
42
43 static const InverseUCATableHeader* _staticInvUCA = NULL;
44 static UDataMemory* invUCA_DATA_MEM = NULL;
45 static icu::UInitOnce gStaticInvUCAInitOnce = U_INITONCE_INITIALIZER;
46
47 U_CDECL_BEGIN
48 static UBool U_CALLCONV
isAcceptableInvUCA(void *,const char *,const char *,const UDataInfo * pInfo)49 isAcceptableInvUCA(void * /*context*/,
50 const char * /*type*/, const char * /*name*/,
51 const UDataInfo *pInfo)
52 {
53 /* context, type & name are intentionally not used */
54 if( pInfo->size>=20 &&
55 pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
56 pInfo->charsetFamily==U_CHARSET_FAMILY &&
57 pInfo->dataFormat[0]==INVUCA_DATA_FORMAT_0 && /* dataFormat="InvC" */
58 pInfo->dataFormat[1]==INVUCA_DATA_FORMAT_1 &&
59 pInfo->dataFormat[2]==INVUCA_DATA_FORMAT_2 &&
60 pInfo->dataFormat[3]==INVUCA_DATA_FORMAT_3 &&
61 pInfo->formatVersion[0]==INVUCA_FORMAT_VERSION_0 &&
62 pInfo->formatVersion[1]>=INVUCA_FORMAT_VERSION_1 //&&
63 //pInfo->formatVersion[1]==INVUCA_FORMAT_VERSION_1 &&
64 //pInfo->formatVersion[2]==INVUCA_FORMAT_VERSION_2 &&
65 //pInfo->formatVersion[3]==INVUCA_FORMAT_VERSION_3 &&
66 )
67 {
68 // TODO: Check that the invuca data version (pInfo->dataVersion)
69 // matches the ucadata version.
70 return TRUE;
71 } else {
72 return FALSE;
73 }
74 }
75 U_CDECL_END
76
77 /*
78 * Takes two CEs (lead and continuation) and
79 * compares them as CEs should be compared:
80 * primary vs. primary, secondary vs. secondary
81 * tertiary vs. tertiary
82 */
compareCEs(uint32_t source0,uint32_t source1,uint32_t target0,uint32_t target1)83 static int32_t compareCEs(uint32_t source0, uint32_t source1, uint32_t target0, uint32_t target1) {
84 uint32_t s1 = source0, s2, t1 = target0, t2;
85 if(isContinuation(source1)) {
86 s2 = source1;
87 } else {
88 s2 = 0;
89 }
90 if(isContinuation(target1)) {
91 t2 = target1;
92 } else {
93 t2 = 0;
94 }
95
96 uint32_t s = 0, t = 0;
97 if(s1 == t1 && s2 == t2) {
98 return 0;
99 }
100 s = (s1 & 0xFFFF0000)|((s2 & 0xFFFF0000)>>16);
101 t = (t1 & 0xFFFF0000)|((t2 & 0xFFFF0000)>>16);
102 if(s < t) {
103 return -1;
104 } else if(s > t) {
105 return 1;
106 } else {
107 s = (s1 & 0x0000FF00) | (s2 & 0x0000FF00)>>8;
108 t = (t1 & 0x0000FF00) | (t2 & 0x0000FF00)>>8;
109 if(s < t) {
110 return -1;
111 } else if(s > t) {
112 return 1;
113 } else {
114 s = (s1 & 0x000000FF)<<8 | (s2 & 0x000000FF);
115 t = (t1 & 0x000000FF)<<8 | (t2 & 0x000000FF);
116 if(s < t) {
117 return -1;
118 } else {
119 return 1;
120 }
121 }
122 }
123 }
124
125 static
ucol_inv_findCE(const UColTokenParser * src,uint32_t CE,uint32_t SecondCE)126 int32_t ucol_inv_findCE(const UColTokenParser *src, uint32_t CE, uint32_t SecondCE) {
127 uint32_t bottom = 0, top = src->invUCA->tableSize;
128 uint32_t i = 0;
129 uint32_t first = 0, second = 0;
130 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
131 int32_t res = 0;
132
133 while(bottom < top-1) {
134 i = (top+bottom)/2;
135 first = *(CETable+3*i);
136 second = *(CETable+3*i+1);
137 res = compareCEs(first, second, CE, SecondCE);
138 if(res > 0) {
139 top = i;
140 } else if(res < 0) {
141 bottom = i;
142 } else {
143 break;
144 }
145 }
146
147 /* weiv: */
148 /* in searching for elements, I have removed the failure */
149 /* The reason for this is that the builder does not rely */
150 /* on search mechanism telling it that it didn't find an */
151 /* element. However, indirect positioning relies on being */
152 /* able to find the elements around any CE, even if it is */
153 /* not defined in the UCA. */
154 return i;
155 /*
156 if((first == CE && second == SecondCE)) {
157 return i;
158 } else {
159 return -1;
160 }
161 */
162 }
163
164 static const uint32_t strengthMask[UCOL_CE_STRENGTH_LIMIT] = {
165 0xFFFF0000,
166 0xFFFFFF00,
167 0xFFFFFFFF
168 };
169
ucol_inv_getNextCE(const UColTokenParser * src,uint32_t CE,uint32_t contCE,uint32_t * nextCE,uint32_t * nextContCE,uint32_t strength)170 U_CAPI int32_t U_EXPORT2 ucol_inv_getNextCE(const UColTokenParser *src,
171 uint32_t CE, uint32_t contCE,
172 uint32_t *nextCE, uint32_t *nextContCE,
173 uint32_t strength)
174 {
175 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
176 int32_t iCE;
177
178 iCE = ucol_inv_findCE(src, CE, contCE);
179
180 if(iCE<0) {
181 *nextCE = UCOL_NOT_FOUND;
182 return -1;
183 }
184
185 CE &= strengthMask[strength];
186 contCE &= strengthMask[strength];
187
188 *nextCE = CE;
189 *nextContCE = contCE;
190
191 while((*nextCE & strengthMask[strength]) == CE
192 && (*nextContCE & strengthMask[strength]) == contCE)
193 {
194 *nextCE = (*(CETable+3*(++iCE)));
195 *nextContCE = (*(CETable+3*(iCE)+1));
196 }
197
198 return iCE;
199 }
200
ucol_inv_getPrevCE(const UColTokenParser * src,uint32_t CE,uint32_t contCE,uint32_t * prevCE,uint32_t * prevContCE,uint32_t strength)201 U_CFUNC int32_t U_EXPORT2 ucol_inv_getPrevCE(const UColTokenParser *src,
202 uint32_t CE, uint32_t contCE,
203 uint32_t *prevCE, uint32_t *prevContCE,
204 uint32_t strength)
205 {
206 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
207 int32_t iCE;
208
209 iCE = ucol_inv_findCE(src, CE, contCE);
210
211 if(iCE<0) {
212 *prevCE = UCOL_NOT_FOUND;
213 return -1;
214 }
215
216 CE &= strengthMask[strength];
217 contCE &= strengthMask[strength];
218
219 *prevCE = CE;
220 *prevContCE = contCE;
221
222 while((*prevCE & strengthMask[strength]) == CE
223 && (*prevContCE & strengthMask[strength])== contCE
224 && iCE > 0) /* this condition should prevent falling off the edge of the world */
225 {
226 /* here, we end up in a singularity - zero */
227 *prevCE = (*(CETable+3*(--iCE)));
228 *prevContCE = (*(CETable+3*(iCE)+1));
229 }
230
231 return iCE;
232 }
233
ucol_getCEStrengthDifference(uint32_t CE,uint32_t contCE,uint32_t prevCE,uint32_t prevContCE)234 U_CFUNC uint32_t U_EXPORT2 ucol_getCEStrengthDifference(uint32_t CE, uint32_t contCE,
235 uint32_t prevCE, uint32_t prevContCE)
236 {
237 if(prevCE == CE && prevContCE == contCE) {
238 return UCOL_IDENTICAL;
239 }
240 if((prevCE & strengthMask[UCOL_PRIMARY]) != (CE & strengthMask[UCOL_PRIMARY])
241 || (prevContCE & strengthMask[UCOL_PRIMARY]) != (contCE & strengthMask[UCOL_PRIMARY]))
242 {
243 return UCOL_PRIMARY;
244 }
245 if((prevCE & strengthMask[UCOL_SECONDARY]) != (CE & strengthMask[UCOL_SECONDARY])
246 || (prevContCE & strengthMask[UCOL_SECONDARY]) != (contCE & strengthMask[UCOL_SECONDARY]))
247 {
248 return UCOL_SECONDARY;
249 }
250 return UCOL_TERTIARY;
251 }
252
253
254 /*static
255 inline int32_t ucol_inv_getPrevious(UColTokenParser *src, UColTokListHeader *lh, uint32_t strength) {
256
257 uint32_t CE = lh->baseCE;
258 uint32_t SecondCE = lh->baseContCE;
259
260 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
261 uint32_t previousCE, previousContCE;
262 int32_t iCE;
263
264 iCE = ucol_inv_findCE(src, CE, SecondCE);
265
266 if(iCE<0) {
267 return -1;
268 }
269
270 CE &= strengthMask[strength];
271 SecondCE &= strengthMask[strength];
272
273 previousCE = CE;
274 previousContCE = SecondCE;
275
276 while((previousCE & strengthMask[strength]) == CE && (previousContCE & strengthMask[strength])== SecondCE) {
277 previousCE = (*(CETable+3*(--iCE)));
278 previousContCE = (*(CETable+3*(iCE)+1));
279 }
280 lh->previousCE = previousCE;
281 lh->previousContCE = previousContCE;
282
283 return iCE;
284 }*/
285
286 static
ucol_inv_getNext(UColTokenParser * src,UColTokListHeader * lh,uint32_t strength)287 inline int32_t ucol_inv_getNext(UColTokenParser *src, UColTokListHeader *lh, uint32_t strength) {
288 uint32_t CE = lh->baseCE;
289 uint32_t SecondCE = lh->baseContCE;
290
291 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
292 uint32_t nextCE, nextContCE;
293 int32_t iCE;
294
295 iCE = ucol_inv_findCE(src, CE, SecondCE);
296
297 if(iCE<0) {
298 return -1;
299 }
300
301 CE &= strengthMask[strength];
302 SecondCE &= strengthMask[strength];
303
304 nextCE = CE;
305 nextContCE = SecondCE;
306
307 while((nextCE & strengthMask[strength]) == CE
308 && (nextContCE & strengthMask[strength]) == SecondCE)
309 {
310 nextCE = (*(CETable+3*(++iCE)));
311 nextContCE = (*(CETable+3*(iCE)+1));
312 }
313
314 lh->nextCE = nextCE;
315 lh->nextContCE = nextContCE;
316
317 return iCE;
318 }
319
ucol_inv_getGapPositions(UColTokenParser * src,UColTokListHeader * lh,UErrorCode * status)320 static void ucol_inv_getGapPositions(UColTokenParser *src, UColTokListHeader *lh, UErrorCode *status) {
321 /* reset all the gaps */
322 int32_t i = 0;
323 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
324 uint32_t st = 0;
325 uint32_t t1, t2;
326 int32_t pos;
327
328 UColToken *tok = lh->first;
329 uint32_t tokStrength = tok->strength;
330
331 for(i = 0; i<3; i++) {
332 lh->gapsHi[3*i] = 0;
333 lh->gapsHi[3*i+1] = 0;
334 lh->gapsHi[3*i+2] = 0;
335 lh->gapsLo[3*i] = 0;
336 lh->gapsLo[3*i+1] = 0;
337 lh->gapsLo[3*i+2] = 0;
338 lh->numStr[i] = 0;
339 lh->fStrToken[i] = NULL;
340 lh->lStrToken[i] = NULL;
341 lh->pos[i] = -1;
342 }
343
344 UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);
345
346 if((lh->baseCE & 0xFF000000)>= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && (lh->baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */
347 //if(lh->baseCE >= PRIMARY_IMPLICIT_MIN && lh->baseCE < PRIMARY_IMPLICIT_MAX ) { /* implicits - */
348 lh->pos[0] = 0;
349 t1 = lh->baseCE;
350 t2 = lh->baseContCE & UCOL_REMOVE_CONTINUATION;
351 lh->gapsLo[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
352 lh->gapsLo[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
353 lh->gapsLo[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
354 uint32_t primaryCE = (t1 & UCOL_PRIMARYMASK) | ((t2 & UCOL_PRIMARYMASK) >> 16);
355 primaryCE = uprv_uca_getImplicitFromRaw(uprv_uca_getRawFromImplicit(primaryCE)+1);
356
357 t1 = (primaryCE & UCOL_PRIMARYMASK) | 0x0505;
358 t2 = (primaryCE << 16) & UCOL_PRIMARYMASK; // | UCOL_CONTINUATION_MARKER;
359
360 lh->gapsHi[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
361 lh->gapsHi[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
362 lh->gapsHi[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
363 } else if(lh->indirect == TRUE && lh->nextCE != 0) {
364 //} else if(lh->baseCE == UCOL_RESET_TOP_VALUE && lh->baseContCE == 0) {
365 lh->pos[0] = 0;
366 t1 = lh->baseCE;
367 t2 = lh->baseContCE&UCOL_REMOVE_CONTINUATION;
368 lh->gapsLo[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
369 lh->gapsLo[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
370 lh->gapsLo[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
371 t1 = lh->nextCE;
372 t2 = lh->nextContCE&UCOL_REMOVE_CONTINUATION;
373 lh->gapsHi[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
374 lh->gapsHi[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
375 lh->gapsHi[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
376 } else {
377 for(;;) {
378 if(tokStrength < UCOL_CE_STRENGTH_LIMIT) {
379 if((lh->pos[tokStrength] = ucol_inv_getNext(src, lh, tokStrength)) >= 0) {
380 lh->fStrToken[tokStrength] = tok;
381 } else { /* The CE must be implicit, since it's not in the table */
382 /* Error */
383 *status = U_INTERNAL_PROGRAM_ERROR;
384 }
385 }
386
387 while(tok != NULL && tok->strength >= tokStrength) {
388 if(tokStrength < UCOL_CE_STRENGTH_LIMIT) {
389 lh->lStrToken[tokStrength] = tok;
390 }
391 tok = tok->next;
392 }
393 if(tokStrength < UCOL_CE_STRENGTH_LIMIT-1) {
394 /* check if previous interval is the same and merge the intervals if it is so */
395 if(lh->pos[tokStrength] == lh->pos[tokStrength+1]) {
396 lh->fStrToken[tokStrength] = lh->fStrToken[tokStrength+1];
397 lh->fStrToken[tokStrength+1] = NULL;
398 lh->lStrToken[tokStrength+1] = NULL;
399 lh->pos[tokStrength+1] = -1;
400 }
401 }
402 if(tok != NULL) {
403 tokStrength = tok->strength;
404 } else {
405 break;
406 }
407 }
408 for(st = 0; st < 3; st++) {
409 if((pos = lh->pos[st]) >= 0) {
410 t1 = *(CETable+3*(pos));
411 t2 = *(CETable+3*(pos)+1);
412 lh->gapsHi[3*st] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
413 lh->gapsHi[3*st+1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
414 //lh->gapsHi[3*st+2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
415 lh->gapsHi[3*st+2] = (t1&0x3f) << 24 | (t2&0x3f) << 16;
416 //pos--;
417 //t1 = *(CETable+3*(pos));
418 //t2 = *(CETable+3*(pos)+1);
419 t1 = lh->baseCE;
420 t2 = lh->baseContCE;
421 lh->gapsLo[3*st] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
422 lh->gapsLo[3*st+1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
423 lh->gapsLo[3*st+2] = (t1&0x3f) << 24 | (t2&0x3f) << 16;
424 }
425 }
426 }
427 }
428
429
430 #define ucol_countBytes(value, noOfBytes) \
431 { \
432 uint32_t mask = 0xFFFFFFFF; \
433 (noOfBytes) = 0; \
434 while(mask != 0) { \
435 if(((value) & mask) != 0) { \
436 (noOfBytes)++; \
437 } \
438 mask >>= 8; \
439 } \
440 }
441
ucol_getNextGenerated(ucolCEGenerator * g,UErrorCode * status)442 static uint32_t ucol_getNextGenerated(ucolCEGenerator *g, UErrorCode *status) {
443 if(U_SUCCESS(*status)) {
444 g->current = ucol_nextWeight(g->ranges, &g->noOfRanges);
445 }
446 return g->current;
447 }
448
ucol_getSimpleCEGenerator(ucolCEGenerator * g,UColToken * tok,uint32_t strength,UErrorCode * status)449 static uint32_t ucol_getSimpleCEGenerator(ucolCEGenerator *g, UColToken *tok, uint32_t strength, UErrorCode *status) {
450 /* TODO: rename to enum names */
451 uint32_t high, low, count=1;
452 uint32_t maxByte = (strength == UCOL_TERTIARY)?0x3F:0xFF;
453
454 if(strength == UCOL_SECONDARY) {
455 low = UCOL_COMMON_TOP2<<24;
456 high = 0xFFFFFFFF;
457 count = 0xFF - UCOL_COMMON_TOP2;
458 } else {
459 low = UCOL_BYTE_COMMON << 24; //0x05000000;
460 high = 0x40000000;
461 count = 0x40 - UCOL_BYTE_COMMON;
462 }
463
464 if(tok->next != NULL && tok->next->strength == strength) {
465 count = tok->next->toInsert;
466 }
467
468 g->noOfRanges = ucol_allocWeights(low, high, count, maxByte, g->ranges);
469 g->current = UCOL_BYTE_COMMON<<24;
470
471 if(g->noOfRanges == 0) {
472 *status = U_INTERNAL_PROGRAM_ERROR;
473 }
474 return g->current;
475 }
476
ucol_getCEGenerator(ucolCEGenerator * g,uint32_t * lows,uint32_t * highs,UColToken * tok,uint32_t fStrength,UErrorCode * status)477 static uint32_t ucol_getCEGenerator(ucolCEGenerator *g, uint32_t* lows, uint32_t* highs, UColToken *tok, uint32_t fStrength, UErrorCode *status) {
478 uint32_t strength = tok->strength;
479 uint32_t low = lows[fStrength*3+strength];
480 uint32_t high = highs[fStrength*3+strength];
481 uint32_t maxByte = 0;
482 if(strength == UCOL_TERTIARY) {
483 maxByte = 0x3F;
484 } else if(strength == UCOL_PRIMARY) {
485 maxByte = 0xFE;
486 } else {
487 maxByte = 0xFF;
488 }
489
490 uint32_t count = tok->toInsert;
491
492 if(low >= high && strength > UCOL_PRIMARY) {
493 int32_t s = strength;
494 for(;;) {
495 s--;
496 if(lows[fStrength*3+s] != highs[fStrength*3+s]) {
497 if(strength == UCOL_SECONDARY) {
498 if (low < UCOL_COMMON_TOP2<<24 ) {
499 // Override if low range is less than UCOL_COMMON_TOP2.
500 low = UCOL_COMMON_TOP2<<24;
501 }
502 high = 0xFFFFFFFF;
503 } else {
504 // Override if low range is less than UCOL_COMMON_BOT3.
505 if ( low < UCOL_COMMON_BOT3<<24 ) {
506 low = UCOL_COMMON_BOT3<<24;
507 }
508 high = 0x40000000;
509 }
510 break;
511 }
512 if(s<0) {
513 *status = U_INTERNAL_PROGRAM_ERROR;
514 return 0;
515 }
516 }
517 }
518
519 if(low < 0x02000000) {
520 // We must not use CE weight byte 02, so we set it as the minimum lower bound.
521 // See http://site.icu-project.org/design/collation/bytes
522 low = 0x02000000;
523 }
524
525 if(strength == UCOL_SECONDARY) { /* similar as simple */
526 if(low >= (UCOL_COMMON_BOT2<<24) && low < (uint32_t)(UCOL_COMMON_TOP2<<24)) {
527 low = UCOL_COMMON_TOP2<<24;
528 }
529 if(high > (UCOL_COMMON_BOT2<<24) && high < (uint32_t)(UCOL_COMMON_TOP2<<24)) {
530 high = UCOL_COMMON_TOP2<<24;
531 }
532 if(low < (UCOL_COMMON_BOT2<<24)) {
533 g->noOfRanges = ucol_allocWeights(UCOL_BYTE_UNSHIFTED_MIN<<24, high, count, maxByte, g->ranges);
534 g->current = ucol_nextWeight(g->ranges, &g->noOfRanges);
535 //g->current = UCOL_COMMON_BOT2<<24;
536 return g->current;
537 }
538 }
539
540 g->noOfRanges = ucol_allocWeights(low, high, count, maxByte, g->ranges);
541 if(g->noOfRanges == 0) {
542 *status = U_INTERNAL_PROGRAM_ERROR;
543 }
544 g->current = ucol_nextWeight(g->ranges, &g->noOfRanges);
545 return g->current;
546 }
547
548 static
u_toLargeKana(const UChar * source,const uint32_t sourceLen,UChar * resBuf,const uint32_t resLen,UErrorCode * status)549 uint32_t u_toLargeKana(const UChar *source, const uint32_t sourceLen, UChar *resBuf, const uint32_t resLen, UErrorCode *status) {
550 uint32_t i = 0;
551 UChar c;
552
553 if(U_FAILURE(*status)) {
554 return 0;
555 }
556
557 if(sourceLen > resLen) {
558 *status = U_MEMORY_ALLOCATION_ERROR;
559 return 0;
560 }
561
562 for(i = 0; i < sourceLen; i++) {
563 c = source[i];
564 if(0x3041 <= c && c <= 0x30FA) { /* Kana range */
565 switch(c - 0x3000) {
566 case 0x41: case 0x43: case 0x45: case 0x47: case 0x49: case 0x63: case 0x83: case 0x85: case 0x8E:
567 case 0xA1: case 0xA3: case 0xA5: case 0xA7: case 0xA9: case 0xC3: case 0xE3: case 0xE5: case 0xEE:
568 c++;
569 break;
570 case 0xF5:
571 c = 0x30AB;
572 break;
573 case 0xF6:
574 c = 0x30B1;
575 break;
576 }
577 }
578 resBuf[i] = c;
579 }
580 return sourceLen;
581 }
582
583 static
u_toSmallKana(const UChar * source,const uint32_t sourceLen,UChar * resBuf,const uint32_t resLen,UErrorCode * status)584 uint32_t u_toSmallKana(const UChar *source, const uint32_t sourceLen, UChar *resBuf, const uint32_t resLen, UErrorCode *status) {
585 uint32_t i = 0;
586 UChar c;
587
588 if(U_FAILURE(*status)) {
589 return 0;
590 }
591
592 if(sourceLen > resLen) {
593 *status = U_MEMORY_ALLOCATION_ERROR;
594 return 0;
595 }
596
597 for(i = 0; i < sourceLen; i++) {
598 c = source[i];
599 if(0x3041 <= c && c <= 0x30FA) { /* Kana range */
600 switch(c - 0x3000) {
601 case 0x42: case 0x44: case 0x46: case 0x48: case 0x4A: case 0x64: case 0x84: case 0x86: case 0x8F:
602 case 0xA2: case 0xA4: case 0xA6: case 0xA8: case 0xAA: case 0xC4: case 0xE4: case 0xE6: case 0xEF:
603 c--;
604 break;
605 case 0xAB:
606 c = 0x30F5;
607 break;
608 case 0xB1:
609 c = 0x30F6;
610 break;
611 }
612 }
613 resBuf[i] = c;
614 }
615 return sourceLen;
616 }
617
618 U_NAMESPACE_BEGIN
619
620 static
ucol_uprv_getCaseBits(const UCollator * UCA,const UChar * src,uint32_t len,UErrorCode * status)621 uint8_t ucol_uprv_getCaseBits(const UCollator *UCA, const UChar *src, uint32_t len, UErrorCode *status) {
622 uint32_t i = 0;
623 UChar n[128];
624 uint32_t nLen = 0;
625 uint32_t uCount = 0, lCount = 0;
626
627 collIterate s;
628 uint32_t order = 0;
629
630 if(U_FAILURE(*status)) {
631 return UCOL_LOWER_CASE;
632 }
633
634 nLen = unorm_normalize(src, len, UNORM_NFKD, 0, n, 128, status);
635 if(U_SUCCESS(*status)) {
636 for(i = 0; i < nLen; i++) {
637 uprv_init_collIterate(UCA, &n[i], 1, &s, status);
638 order = ucol_getNextCE(UCA, &s, status);
639 if(isContinuation(order)) {
640 *status = U_INTERNAL_PROGRAM_ERROR;
641 return UCOL_LOWER_CASE;
642 }
643 if((order&UCOL_CASE_BIT_MASK)== UCOL_UPPER_CASE) {
644 uCount++;
645 } else {
646 if(u_islower(n[i])) {
647 lCount++;
648 } else if(U_SUCCESS(*status)) {
649 UChar sk[1], lk[1];
650 u_toSmallKana(&n[i], 1, sk, 1, status);
651 u_toLargeKana(&n[i], 1, lk, 1, status);
652 if(sk[0] == n[i] && lk[0] != n[i]) {
653 lCount++;
654 }
655 }
656 }
657 }
658 }
659
660 if(uCount != 0 && lCount != 0) {
661 return UCOL_MIXED_CASE;
662 } else if(uCount != 0) {
663 return UCOL_UPPER_CASE;
664 } else {
665 return UCOL_LOWER_CASE;
666 }
667 }
668
669
ucol_doCE(UColTokenParser * src,uint32_t * CEparts,UColToken * tok,UErrorCode * status)670 U_CFUNC void ucol_doCE(UColTokenParser *src, uint32_t *CEparts, UColToken *tok, UErrorCode *status) {
671 /* this one makes the table and stuff */
672 uint32_t noOfBytes[3];
673 uint32_t i;
674
675 for(i = 0; i<3; i++) {
676 ucol_countBytes(CEparts[i], noOfBytes[i]);
677 }
678
679 /* Here we have to pack CEs from parts */
680
681 uint32_t CEi = 0;
682 uint32_t value = 0;
683
684 while(2*CEi<noOfBytes[0] || CEi<noOfBytes[1] || CEi<noOfBytes[2]) {
685 if(CEi > 0) {
686 value = UCOL_CONTINUATION_MARKER; /* Continuation marker */
687 } else {
688 value = 0;
689 }
690
691 if(2*CEi<noOfBytes[0]) {
692 value |= ((CEparts[0]>>(32-16*(CEi+1))) & 0xFFFF) << 16;
693 }
694 if(CEi<noOfBytes[1]) {
695 value |= ((CEparts[1]>>(32-8*(CEi+1))) & 0xFF) << 8;
696 }
697 if(CEi<noOfBytes[2]) {
698 value |= ((CEparts[2]>>(32-8*(CEi+1))) & 0x3F);
699 }
700 tok->CEs[CEi] = value;
701 CEi++;
702 }
703 if(CEi == 0) { /* totally ignorable */
704 tok->noOfCEs = 1;
705 tok->CEs[0] = 0;
706 } else { /* there is at least something */
707 tok->noOfCEs = CEi;
708 }
709
710
711 // we want to set case bits here and now, not later.
712 // Case bits handling
713 if(tok->CEs[0] != 0) { // case bits should be set only for non-ignorables
714 tok->CEs[0] &= 0xFFFFFF3F; // Clean the case bits field
715 int32_t cSize = (tok->source & 0xFF000000) >> 24;
716 UChar *cPoints = (tok->source & 0x00FFFFFF) + src->source;
717
718 if(cSize > 1) {
719 // Do it manually
720 tok->CEs[0] |= ucol_uprv_getCaseBits(src->UCA, cPoints, cSize, status);
721 } else {
722 // Copy it from the UCA
723 uint32_t caseCE = ucol_getFirstCE(src->UCA, cPoints[0], status);
724 tok->CEs[0] |= (caseCE & 0xC0);
725 }
726 }
727
728 #if UCOL_DEBUG==2
729 fprintf(stderr, "%04X str: %i, [%08X, %08X, %08X]: tok: ", tok->debugSource, tok->strength, CEparts[0] >> (32-8*noOfBytes[0]), CEparts[1] >> (32-8*noOfBytes[1]), CEparts[2]>> (32-8*noOfBytes[2]));
730 for(i = 0; i<tok->noOfCEs; i++) {
731 fprintf(stderr, "%08X ", tok->CEs[i]);
732 }
733 fprintf(stderr, "\n");
734 #endif
735 }
736
ucol_initBuffers(UColTokenParser * src,UColTokListHeader * lh,UErrorCode * status)737 U_CFUNC void ucol_initBuffers(UColTokenParser *src, UColTokListHeader *lh, UErrorCode *status) {
738 ucolCEGenerator Gens[UCOL_CE_STRENGTH_LIMIT];
739 uint32_t CEparts[UCOL_CE_STRENGTH_LIMIT];
740
741 UColToken *tok = lh->last;
742 uint32_t t[UCOL_STRENGTH_LIMIT];
743
744 uprv_memset(t, 0, UCOL_STRENGTH_LIMIT*sizeof(uint32_t));
745
746 /* must initialize ranges to avoid memory check warnings */
747 for (int i = 0; i < UCOL_CE_STRENGTH_LIMIT; i++) {
748 uprv_memset(Gens[i].ranges, 0, sizeof(Gens[i].ranges));
749 }
750
751 tok->toInsert = 1;
752 t[tok->strength] = 1;
753
754 while(tok->previous != NULL) {
755 if(tok->previous->strength < tok->strength) { /* going up */
756 t[tok->strength] = 0;
757 t[tok->previous->strength]++;
758 } else if(tok->previous->strength > tok->strength) { /* going down */
759 t[tok->previous->strength] = 1;
760 } else {
761 t[tok->strength]++;
762 }
763 tok=tok->previous;
764 tok->toInsert = t[tok->strength];
765 }
766
767 tok->toInsert = t[tok->strength];
768 ucol_inv_getGapPositions(src, lh, status);
769
770 #if UCOL_DEBUG
771 fprintf(stderr, "BaseCE: %08X %08X\n", lh->baseCE, lh->baseContCE);
772 int32_t j = 2;
773 for(j = 2; j >= 0; j--) {
774 fprintf(stderr, "gapsLo[%i] [%08X %08X %08X]\n", j, lh->gapsLo[j*3], lh->gapsLo[j*3+1], lh->gapsLo[j*3+2]);
775 fprintf(stderr, "gapsHi[%i] [%08X %08X %08X]\n", j, lh->gapsHi[j*3], lh->gapsHi[j*3+1], lh->gapsHi[j*3+2]);
776 }
777 tok=&lh->first[UCOL_TOK_POLARITY_POSITIVE];
778
779 do {
780 fprintf(stderr,"%i", tok->strength);
781 tok = tok->next;
782 } while(tok != NULL);
783 fprintf(stderr, "\n");
784
785 tok=&lh->first[UCOL_TOK_POLARITY_POSITIVE];
786
787 do {
788 fprintf(stderr,"%i", tok->toInsert);
789 tok = tok->next;
790 } while(tok != NULL);
791 #endif
792
793 tok = lh->first;
794 uint32_t fStrength = UCOL_IDENTICAL;
795 uint32_t initStrength = UCOL_IDENTICAL;
796
797
798 CEparts[UCOL_PRIMARY] = (lh->baseCE & UCOL_PRIMARYMASK) | (lh->baseContCE & UCOL_PRIMARYMASK) >> 16;
799 CEparts[UCOL_SECONDARY] = (lh->baseCE & UCOL_SECONDARYMASK) << 16 | (lh->baseContCE & UCOL_SECONDARYMASK) << 8;
800 CEparts[UCOL_TERTIARY] = (UCOL_TERTIARYORDER(lh->baseCE)) << 24 | (UCOL_TERTIARYORDER(lh->baseContCE)) << 16;
801
802 while (tok != NULL && U_SUCCESS(*status)) {
803 fStrength = tok->strength;
804 if(fStrength < initStrength) {
805 initStrength = fStrength;
806 if(lh->pos[fStrength] == -1) {
807 while(lh->pos[fStrength] == -1 && fStrength > 0) {
808 fStrength--;
809 }
810 if(lh->pos[fStrength] == -1) {
811 *status = U_INTERNAL_PROGRAM_ERROR;
812 return;
813 }
814 }
815 if(initStrength == UCOL_TERTIARY) { /* starting with tertiary */
816 CEparts[UCOL_PRIMARY] = lh->gapsLo[fStrength*3];
817 CEparts[UCOL_SECONDARY] = lh->gapsLo[fStrength*3+1];
818 /*CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[2], lh->gapsLo[fStrength*3+2], lh->gapsHi[fStrength*3+2], tok, UCOL_TERTIARY); */
819 CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[UCOL_TERTIARY], lh->gapsLo, lh->gapsHi, tok, fStrength, status);
820 } else if(initStrength == UCOL_SECONDARY) { /* secondaries */
821 CEparts[UCOL_PRIMARY] = lh->gapsLo[fStrength*3];
822 /*CEparts[1] = ucol_getCEGenerator(&Gens[1], lh->gapsLo[fStrength*3+1], lh->gapsHi[fStrength*3+1], tok, 1);*/
823 CEparts[UCOL_SECONDARY] = ucol_getCEGenerator(&Gens[UCOL_SECONDARY], lh->gapsLo, lh->gapsHi, tok, fStrength, status);
824 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
825 } else { /* primaries */
826 /*CEparts[UCOL_PRIMARY] = ucol_getCEGenerator(&Gens[0], lh->gapsLo[0], lh->gapsHi[0], tok, UCOL_PRIMARY);*/
827 CEparts[UCOL_PRIMARY] = ucol_getCEGenerator(&Gens[UCOL_PRIMARY], lh->gapsLo, lh->gapsHi, tok, fStrength, status);
828 CEparts[UCOL_SECONDARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_SECONDARY], tok, UCOL_SECONDARY, status);
829 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
830 }
831 } else {
832 if(tok->strength == UCOL_TERTIARY) {
833 CEparts[UCOL_TERTIARY] = ucol_getNextGenerated(&Gens[UCOL_TERTIARY], status);
834 } else if(tok->strength == UCOL_SECONDARY) {
835 CEparts[UCOL_SECONDARY] = ucol_getNextGenerated(&Gens[UCOL_SECONDARY], status);
836 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
837 } else if(tok->strength == UCOL_PRIMARY) {
838 CEparts[UCOL_PRIMARY] = ucol_getNextGenerated(&Gens[UCOL_PRIMARY], status);
839 CEparts[UCOL_SECONDARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_SECONDARY], tok, UCOL_SECONDARY, status);
840 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
841 }
842 }
843 ucol_doCE(src, CEparts, tok, status);
844 tok = tok->next;
845 }
846 }
847
ucol_createElements(UColTokenParser * src,tempUCATable * t,UColTokListHeader * lh,UErrorCode * status)848 U_CFUNC void ucol_createElements(UColTokenParser *src, tempUCATable *t, UColTokListHeader *lh, UErrorCode *status) {
849 UCAElements el;
850 UColToken *tok = lh->first;
851 UColToken *expt = NULL;
852 uint32_t i = 0, j = 0;
853 const Normalizer2Impl *nfcImpl = Normalizer2Factory::getNFCImpl(*status);
854
855 while(tok != NULL && U_SUCCESS(*status)) {
856 /* first, check if there are any expansions */
857 /* if there are expansions, we need to do a little bit more processing */
858 /* since parts of expansion can be tailored, while others are not */
859 if(tok->expansion != 0) {
860 uint32_t len = tok->expansion >> 24;
861 uint32_t currentSequenceLen = len;
862 uint32_t expOffset = tok->expansion & 0x00FFFFFF;
863 //uint32_t exp = currentSequenceLen | expOffset;
864 UColToken exp;
865 exp.source = currentSequenceLen | expOffset;
866 exp.rulesToParseHdl = &(src->source);
867
868 while(len > 0) {
869 currentSequenceLen = len;
870 while(currentSequenceLen > 0) {
871 exp.source = (currentSequenceLen << 24) | expOffset;
872 if((expt = (UColToken *)uhash_get(src->tailored, &exp)) != NULL && expt->strength != UCOL_TOK_RESET) { /* expansion is tailored */
873 uint32_t noOfCEsToCopy = expt->noOfCEs;
874 for(j = 0; j<noOfCEsToCopy; j++) {
875 tok->expCEs[tok->noOfExpCEs + j] = expt->CEs[j];
876 }
877 tok->noOfExpCEs += noOfCEsToCopy;
878 // Smart people never try to add codepoints and CEs.
879 // For some odd reason, it won't work.
880 expOffset += currentSequenceLen; //noOfCEsToCopy;
881 len -= currentSequenceLen; //noOfCEsToCopy;
882 break;
883 } else {
884 currentSequenceLen--;
885 }
886 }
887 if(currentSequenceLen == 0) { /* couldn't find any tailored subsequence */
888 /* will have to get one from UCA */
889 /* first, get the UChars from the rules */
890 /* then pick CEs out until there is no more and stuff them into expansion */
891 collIterate s;
892 uint32_t order = 0;
893 uprv_init_collIterate(src->UCA, expOffset + src->source, 1, &s, status);
894
895 for(;;) {
896 order = ucol_getNextCE(src->UCA, &s, status);
897 if(order == UCOL_NO_MORE_CES) {
898 break;
899 }
900 tok->expCEs[tok->noOfExpCEs++] = order;
901 }
902 expOffset++;
903 len--;
904 }
905 }
906 } else {
907 tok->noOfExpCEs = 0;
908 }
909
910 /* set the ucaelement with obtained values */
911 el.noOfCEs = tok->noOfCEs + tok->noOfExpCEs;
912 /* copy CEs */
913 for(i = 0; i<tok->noOfCEs; i++) {
914 el.CEs[i] = tok->CEs[i];
915 }
916 for(i = 0; i<tok->noOfExpCEs; i++) {
917 el.CEs[i+tok->noOfCEs] = tok->expCEs[i];
918 }
919
920 /* copy UChars */
921 // We kept prefix and source kind of together, as it is a kind of a contraction.
922 // However, now we have to slice the prefix off the main thing -
923 el.prefix = el.prefixChars;
924 el.cPoints = el.uchars;
925 if(tok->prefix != 0) { // we will just copy the prefix here, and adjust accordingly in the
926 // addPrefix function in ucol_elm. The reason is that we need to add both composed AND
927 // decomposed elements to the unsaf table.
928 el.prefixSize = tok->prefix>>24;
929 uprv_memcpy(el.prefix, src->source + (tok->prefix & 0x00FFFFFF), el.prefixSize*sizeof(UChar));
930
931 el.cSize = (tok->source >> 24)-(tok->prefix>>24);
932 uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF)+(tok->prefix>>24) + src->source, el.cSize*sizeof(UChar));
933 } else {
934 el.prefixSize = 0;
935 *el.prefix = 0;
936
937 el.cSize = (tok->source >> 24);
938 uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF) + src->source, el.cSize*sizeof(UChar));
939 }
940 if(src->UCA != NULL) {
941 for(i = 0; i<el.cSize; i++) {
942 if(UCOL_ISJAMO(el.cPoints[i])) {
943 t->image->jamoSpecial = TRUE;
944 }
945 }
946 if (!src->buildCCTabFlag && el.cSize > 0) {
947 // Check the trailing canonical combining class (tccc) of the last character.
948 const UChar *s = el.cPoints + el.cSize;
949 uint16_t fcd = nfcImpl->previousFCD16(el.cPoints, s);
950 if ((fcd & 0xff) != 0) {
951 src->buildCCTabFlag = TRUE;
952 }
953 }
954 }
955
956 /* and then, add it */
957 #if UCOL_DEBUG==2
958 fprintf(stderr, "Adding: %04X with %08X\n", el.cPoints[0], el.CEs[0]);
959 #endif
960 uprv_uca_addAnElement(t, &el, status);
961
962 #if UCOL_DEBUG_DUPLICATES
963 if(*status != U_ZERO_ERROR) {
964 fprintf(stderr, "replaced CE for %04X with CE for %04X\n", el.cPoints[0], tok->debugSource);
965 *status = U_ZERO_ERROR;
966 }
967 #endif
968
969 tok = tok->next;
970 }
971 }
972
973 U_CDECL_BEGIN
974 static UBool U_CALLCONV
_processUCACompleteIgnorables(const void * context,UChar32 start,UChar32 limit,uint32_t value)975 _processUCACompleteIgnorables(const void *context, UChar32 start, UChar32 limit, uint32_t value) {
976 UErrorCode status = U_ZERO_ERROR;
977 tempUCATable *t = (tempUCATable *)context;
978 if(value == 0) {
979 while(start < limit) {
980 uint32_t CE = utrie_get32(t->mapping, start, NULL);
981 if(CE == UCOL_NOT_FOUND) {
982 UCAElements el;
983 el.isThai = FALSE;
984 el.prefixSize = 0;
985 el.prefixChars[0] = 0;
986 el.prefix = el.prefixChars;
987 el.cPoints = el.uchars;
988
989 el.cSize = 0;
990 U16_APPEND_UNSAFE(el.uchars, el.cSize, start);
991
992 el.noOfCEs = 1;
993 el.CEs[0] = 0;
994 uprv_uca_addAnElement(t, &el, &status);
995
996 }
997 start++;
998 }
999 }
1000 if(U_FAILURE(status)) {
1001 return FALSE;
1002 } else {
1003 return TRUE;
1004 }
1005 }
1006 U_CDECL_END
1007
1008 static void
ucol_uprv_bld_copyRangeFromUCA(UColTokenParser * src,tempUCATable * t,UChar32 start,UChar32 end,UErrorCode * status)1009 ucol_uprv_bld_copyRangeFromUCA(UColTokenParser *src, tempUCATable *t,
1010 UChar32 start, UChar32 end,
1011 UErrorCode *status)
1012 {
1013 //UChar decomp[256];
1014 uint32_t CE = UCOL_NOT_FOUND;
1015 UChar32 u = 0;
1016 UCAElements el;
1017 el.isThai = FALSE;
1018 el.prefixSize = 0;
1019 el.prefixChars[0] = 0;
1020 collIterate colIt;
1021
1022 if(U_SUCCESS(*status)) {
1023 for(u = start; u<=end; u++) {
1024 if((CE = utrie_get32(t->mapping, u, NULL)) == UCOL_NOT_FOUND
1025 /* this test is for contractions that are missing the starting element. */
1026 || ((isCntTableElement(CE)) &&
1027 (uprv_cnttab_getCE(t->contractions, CE, 0, status) == UCOL_NOT_FOUND))
1028 )
1029 {
1030 el.cSize = 0;
1031 U16_APPEND_UNSAFE(el.uchars, el.cSize, u);
1032 //decomp[0] = (UChar)u;
1033 //el.uchars[0] = (UChar)u;
1034 el.cPoints = el.uchars;
1035 //el.cSize = 1;
1036 el.noOfCEs = 0;
1037 el.prefix = el.prefixChars;
1038 el.prefixSize = 0;
1039 //uprv_init_collIterate(src->UCA, decomp, 1, &colIt);
1040 // We actually want to check whether this element is a special
1041 // If it is an implicit element (hangul, CJK - we want to copy the
1042 // special, not the resolved CEs) - for hangul, copying resolved
1043 // would just make things the same (there is an expansion and it
1044 // takes approximately the same amount of time to resolve as
1045 // falling back to the UCA).
1046 /*
1047 UTRIE_GET32(src->UCA->mapping, u, CE);
1048 tag = getCETag(CE);
1049 if(tag == HANGUL_SYLLABLE_TAG || tag == CJK_IMPLICIT_TAG
1050 || tag == IMPLICIT_TAG || tag == TRAIL_SURROGATE_TAG
1051 || tag == LEAD_SURROGATE_TAG) {
1052 el.CEs[el.noOfCEs++] = CE;
1053 } else {
1054 */
1055 // It turns out that it does not make sense to keep implicits
1056 // unresolved. The cost of resolving them is big enough so that
1057 // it doesn't make any difference whether we have to go to the UCA
1058 // or not.
1059 {
1060 uprv_init_collIterate(src->UCA, el.uchars, el.cSize, &colIt, status);
1061 while(CE != UCOL_NO_MORE_CES) {
1062 CE = ucol_getNextCE(src->UCA, &colIt, status);
1063 if(CE != UCOL_NO_MORE_CES) {
1064 el.CEs[el.noOfCEs++] = CE;
1065 }
1066 }
1067 }
1068 uprv_uca_addAnElement(t, &el, status);
1069 }
1070 }
1071 }
1072 }
1073
1074 U_NAMESPACE_END
1075
1076 U_CFUNC UCATableHeader *
ucol_assembleTailoringTable(UColTokenParser * src,UErrorCode * status)1077 ucol_assembleTailoringTable(UColTokenParser *src, UErrorCode *status) {
1078 U_NAMESPACE_USE
1079
1080 uint32_t i = 0;
1081 if(U_FAILURE(*status)) {
1082 return NULL;
1083 }
1084 /*
1085 2. Eliminate the negative lists by doing the following for each non-null negative list:
1086 o if previousCE(baseCE, strongestN) != some ListHeader X's baseCE,
1087 create new ListHeader X
1088 o reverse the list, add to the end of X's positive list. Reset the strength of the
1089 first item you add, based on the stronger strength levels of the two lists.
1090 */
1091 /*
1092 3. For each ListHeader with a non-null positive list:
1093 */
1094 /*
1095 o Find all character strings with CEs between the baseCE and the
1096 next/previous CE, at the strength of the first token. Add these to the
1097 tailoring.
1098 ? That is, if UCA has ... x <<< X << x' <<< X' < y ..., and the
1099 tailoring has & x < z...
1100 ? Then we change the tailoring to & x <<< X << x' <<< X' < z ...
1101 */
1102 /* It is possible that this part should be done even while constructing list */
1103 /* The problem is that it is unknown what is going to be the strongest weight */
1104 /* So we might as well do it here */
1105
1106 /*
1107 o Allocate CEs for each token in the list, based on the total number N of the
1108 largest level difference, and the gap G between baseCE and nextCE at that
1109 level. The relation * between the last item and nextCE is the same as the
1110 strongest strength.
1111 o Example: baseCE < a << b <<< q << c < d < e * nextCE(X,1)
1112 ? There are 3 primary items: a, d, e. Fit them into the primary gap.
1113 Then fit b and c into the secondary gap between a and d, then fit q
1114 into the tertiary gap between b and c.
1115
1116 o Example: baseCE << b <<< q << c * nextCE(X,2)
1117 ? There are 2 secondary items: b, c. Fit them into the secondary gap.
1118 Then fit q into the tertiary gap between b and c.
1119 o When incrementing primary values, we will not cross high byte
1120 boundaries except where there is only a single-byte primary. That is to
1121 ensure that the script reordering will continue to work.
1122 */
1123 UCATableHeader *image = (UCATableHeader *)uprv_malloc(sizeof(UCATableHeader));
1124 /* test for NULL */
1125 if (image == NULL) {
1126 *status = U_MEMORY_ALLOCATION_ERROR;
1127 return NULL;
1128 }
1129 uprv_memcpy(image, src->UCA->image, sizeof(UCATableHeader));
1130
1131 for(i = 0; i<src->resultLen; i++) {
1132 /* now we need to generate the CEs */
1133 /* We stuff the initial value in the buffers, and increase the appropriate buffer */
1134 /* According to strength */
1135 if(U_SUCCESS(*status)) {
1136 if(src->lh[i].first) { // if there are any elements
1137 // due to the way parser works, subsequent tailorings
1138 // may remove all the elements from a sequence, therefore
1139 // leaving an empty tailoring sequence.
1140 ucol_initBuffers(src, &src->lh[i], status);
1141 }
1142 }
1143 if(U_FAILURE(*status)) {
1144 uprv_free(image);
1145 return NULL;
1146 }
1147 }
1148
1149 if(src->varTop != NULL) { /* stuff the variable top value */
1150 src->opts->variableTopValue = (*(src->varTop->CEs))>>16;
1151 /* remove it from the list */
1152 if(src->varTop->listHeader->first == src->varTop) { /* first in list */
1153 src->varTop->listHeader->first = src->varTop->next;
1154 }
1155 if(src->varTop->listHeader->last == src->varTop) { /* first in list */
1156 src->varTop->listHeader->last = src->varTop->previous;
1157 }
1158 if(src->varTop->next != NULL) {
1159 src->varTop->next->previous = src->varTop->previous;
1160 }
1161 if(src->varTop->previous != NULL) {
1162 src->varTop->previous->next = src->varTop->next;
1163 }
1164 }
1165
1166
1167 tempUCATable *t = uprv_uca_initTempTable(image, src->opts, src->UCA, NOT_FOUND_TAG, NOT_FOUND_TAG, status);
1168 if(U_FAILURE(*status)) {
1169 uprv_free(image);
1170 return NULL;
1171 }
1172
1173
1174 /* After this, we have assigned CE values to all regular CEs */
1175 /* now we will go through list once more and resolve expansions, */
1176 /* make UCAElements structs and add them to table */
1177 for(i = 0; i<src->resultLen; i++) {
1178 /* now we need to generate the CEs */
1179 /* We stuff the initial value in the buffers, and increase the appropriate buffer */
1180 /* According to strength */
1181 if(U_SUCCESS(*status)) {
1182 ucol_createElements(src, t, &src->lh[i], status);
1183 }
1184 }
1185
1186 UCAElements el;
1187 el.isThai = FALSE;
1188 el.prefixSize = 0;
1189 el.prefixChars[0] = 0;
1190
1191 /* add latin-1 stuff */
1192 ucol_uprv_bld_copyRangeFromUCA(src, t, 0, 0xFF, status);
1193
1194 /* add stuff for copying */
1195 if(src->copySet != NULL) {
1196 int32_t i = 0;
1197 UnicodeSet *set = (UnicodeSet *)src->copySet;
1198 for(i = 0; i < set->getRangeCount(); i++) {
1199 ucol_uprv_bld_copyRangeFromUCA(src, t, set->getRangeStart(i), set->getRangeEnd(i), status);
1200 }
1201 }
1202
1203 if(U_SUCCESS(*status)) {
1204 /* copy contractions from the UCA - this is felt mostly for cyrillic*/
1205
1206 uint32_t tailoredCE = UCOL_NOT_FOUND;
1207 UChar *conts = (UChar *)((uint8_t *)src->UCA->image + src->UCA->image->contractionUCACombos);
1208 int32_t maxUCAContractionLength = src->UCA->image->contractionUCACombosWidth;
1209 UCollationElements *ucaEl = ucol_openElements(src->UCA, NULL, 0, status);
1210 // Check for null pointer
1211 if (ucaEl == NULL) {
1212 *status = U_MEMORY_ALLOCATION_ERROR;
1213 return NULL;
1214 }
1215 while(*conts != 0) {
1216 // A continuation is NUL-terminated and NUL-padded
1217 // except if it has the maximum length.
1218 int32_t contractionLength = maxUCAContractionLength;
1219 while(contractionLength > 0 && conts[contractionLength - 1] == 0) {
1220 --contractionLength;
1221 }
1222 UChar32 first;
1223 int32_t firstLength = 0;
1224 U16_NEXT(conts, firstLength, contractionLength, first);
1225 tailoredCE = utrie_get32(t->mapping, first, NULL);
1226 if(tailoredCE != UCOL_NOT_FOUND) {
1227 UBool needToAdd = TRUE;
1228 if(isCntTableElement(tailoredCE)) {
1229 if(uprv_cnttab_isTailored(t->contractions, tailoredCE, conts+firstLength, status) == TRUE) {
1230 needToAdd = FALSE;
1231 }
1232 }
1233 if (!needToAdd && isPrefix(tailoredCE) && *(conts+1)==0) {
1234 UCAElements elm;
1235 elm.cPoints = el.uchars;
1236 elm.noOfCEs = 0;
1237 elm.uchars[0] = *conts;
1238 elm.uchars[1] = 0;
1239 elm.cSize = 1;
1240 elm.prefixChars[0] = *(conts+2);
1241 elm.isThai = FALSE;
1242 elm.prefix = elm.prefixChars;
1243 elm.prefixSize = 1;
1244 UCAElements *prefixEnt=(UCAElements *)uhash_get(t->prefixLookup, &elm);
1245 if ((prefixEnt==NULL) || *(prefixEnt->prefix)!=*(conts+2)) {
1246 needToAdd = TRUE;
1247 }
1248 }
1249 if(src->removeSet != NULL && uset_contains(src->removeSet, first)) {
1250 needToAdd = FALSE;
1251 }
1252
1253 if(needToAdd == TRUE) { // we need to add if this contraction is not tailored.
1254 if (*(conts+1) != 0) { // contractions
1255 el.prefix = el.prefixChars;
1256 el.prefixSize = 0;
1257 el.cPoints = el.uchars;
1258 el.noOfCEs = 0;
1259 u_memcpy(el.uchars, conts, contractionLength);
1260 el.cSize = contractionLength;
1261 ucol_setText(ucaEl, el.uchars, el.cSize, status);
1262 }
1263 else { // pre-context character
1264 UChar str[4] = { 0 };
1265 int32_t len=0;
1266 int32_t preKeyLen=0;
1267
1268 el.cPoints = el.uchars;
1269 el.noOfCEs = 0;
1270 el.uchars[0] = *conts;
1271 el.uchars[1] = 0;
1272 el.cSize = 1;
1273 el.prefixChars[0] = *(conts+2);
1274 el.prefix = el.prefixChars;
1275 el.prefixSize = 1;
1276 if (el.prefixChars[0]!=0) {
1277 // get CE of prefix character first
1278 str[0]=el.prefixChars[0];
1279 str[1]=0;
1280 ucol_setText(ucaEl, str, 1, status);
1281 while ((int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status))
1282 != UCOL_NULLORDER) {
1283 preKeyLen++; // count number of keys for prefix character
1284 }
1285 str[len++] = el.prefixChars[0];
1286 }
1287
1288 str[len++] = el.uchars[0];
1289 str[len]=0;
1290 ucol_setText(ucaEl, str, len, status);
1291 // Skip the keys for prefix character, then copy the rest to el.
1292 while ((preKeyLen-->0) &&
1293 (int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status)) != UCOL_NULLORDER) {
1294 continue;
1295 }
1296
1297 }
1298 while ((int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status)) != UCOL_NULLORDER) {
1299 el.noOfCEs++;
1300 }
1301 uprv_uca_addAnElement(t, &el, status);
1302 }
1303
1304 } else if(src->removeSet != NULL && uset_contains(src->removeSet, first)) {
1305 ucol_uprv_bld_copyRangeFromUCA(src, t, first, first, status);
1306 }
1307 conts+=maxUCAContractionLength;
1308 }
1309 ucol_closeElements(ucaEl);
1310 }
1311
1312 // Add completely ignorable elements
1313 utrie_enum(&t->UCA->mapping, NULL, _processUCACompleteIgnorables, t);
1314
1315 // add tailoring characters related canonical closures
1316 uprv_uca_canonicalClosure(t, src, NULL, status);
1317
1318 /* still need to produce compatibility closure */
1319
1320 UCATableHeader *myData = uprv_uca_assembleTable(t, status);
1321
1322 uprv_uca_closeTempTable(t);
1323 uprv_free(image);
1324
1325 return myData;
1326 }
1327
1328 U_CDECL_BEGIN
1329 static UBool U_CALLCONV
ucol_bld_cleanup(void)1330 ucol_bld_cleanup(void)
1331 {
1332 udata_close(invUCA_DATA_MEM);
1333 invUCA_DATA_MEM = NULL;
1334 _staticInvUCA = NULL;
1335 gStaticInvUCAInitOnce.reset();
1336 return TRUE;
1337 }
1338 U_CDECL_END
1339
initInverseUCA(UErrorCode & status)1340 static void U_CALLCONV initInverseUCA(UErrorCode &status) {
1341 U_ASSERT(invUCA_DATA_MEM == NULL);
1342 U_ASSERT(_staticInvUCA == NULL);
1343 ucln_i18n_registerCleanup(UCLN_I18N_UCOL_BLD, ucol_bld_cleanup);
1344 InverseUCATableHeader *newInvUCA = NULL;
1345 UDataMemory *result = udata_openChoice(U_ICUDATA_COLL, INVC_DATA_TYPE, INVC_DATA_NAME, isAcceptableInvUCA, NULL, &status);
1346
1347 if(U_FAILURE(status)) {
1348 if (result) {
1349 udata_close(result);
1350 }
1351 // This is not needed, as we are talking about
1352 // memory we got from UData
1353 //uprv_free(newInvUCA);
1354 return;
1355 }
1356
1357 if(result != NULL) { /* It looks like sometimes we can fail to find the data file */
1358 newInvUCA = (InverseUCATableHeader *)udata_getMemory(result);
1359 UCollator *UCA = ucol_initUCA(&status);
1360 // UCA versions of UCA and inverse UCA should match
1361 if(uprv_memcmp(newInvUCA->UCAVersion, UCA->image->UCAVersion, sizeof(UVersionInfo)) != 0) {
1362 status = U_INVALID_FORMAT_ERROR;
1363 udata_close(result);
1364 return;
1365 }
1366
1367 invUCA_DATA_MEM = result;
1368 _staticInvUCA = newInvUCA;
1369 }
1370 }
1371
1372
1373 U_CAPI const InverseUCATableHeader * U_EXPORT2
ucol_initInverseUCA(UErrorCode * status)1374 ucol_initInverseUCA(UErrorCode *status)
1375 {
1376 umtx_initOnce(gStaticInvUCAInitOnce, &initInverseUCA, *status);
1377 return _staticInvUCA;
1378 }
1379
1380 /* This is the data that is used for non-script reordering codes. These _must_ be kept
1381 * in order that they are to be applied as defaults and in synch with the UColReorderCode enum.
1382 */
1383 static const char * const ReorderingTokenNames[] = {
1384 "SPACE",
1385 "PUNCT",
1386 "SYMBOL",
1387 "CURRENCY",
1388 "DIGIT"
1389 };
1390
toUpper(const char * src,char * dst,uint32_t length)1391 static void toUpper(const char* src, char* dst, uint32_t length) {
1392 for (uint32_t i = 0; *src != '\0' && i < length - 1; ++src, ++dst, ++i) {
1393 *dst = uprv_toupper(*src);
1394 }
1395 *dst = '\0';
1396 }
1397
1398 U_INTERNAL int32_t U_EXPORT2
ucol_findReorderingEntry(const char * name)1399 ucol_findReorderingEntry(const char* name) {
1400 char buffer[32];
1401 toUpper(name, buffer, 32);
1402 for (uint32_t entry = 0; entry < LENGTHOF(ReorderingTokenNames); entry++) {
1403 if (uprv_strcmp(buffer, ReorderingTokenNames[entry]) == 0) {
1404 return entry + UCOL_REORDER_CODE_FIRST;
1405 }
1406 }
1407 return USCRIPT_INVALID_CODE;
1408 }
1409
1410 #endif /* #if !UCONFIG_NO_COLLATION */
1411