1 /*
2 *******************************************************************************
3 *
4 * Copyright (C) 2001-2012, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *
7 *******************************************************************************
8 * file name: ucol_bld.cpp
9 * encoding: US-ASCII
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created 02/22/2001
14 * created by: Vladimir Weinstein
15 *
16 * This module builds a collator based on the rule set.
17 *
18 */
19
20 #include "unicode/utypes.h"
21
22 #if !UCONFIG_NO_COLLATION
23
24 #include "unicode/ucoleitr.h"
25 #include "unicode/udata.h"
26 #include "unicode/uchar.h"
27 #include "unicode/uniset.h"
28 #include "unicode/uscript.h"
29 #include "unicode/ustring.h"
30 #include "unicode/utf16.h"
31 #include "normalizer2impl.h"
32 #include "ucol_bld.h"
33 #include "ucol_elm.h"
34 #include "ucol_cnt.h"
35 #include "ucln_in.h"
36 #include "umutex.h"
37 #include "cmemory.h"
38 #include "cstring.h"
39
40 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
41
42 static const InverseUCATableHeader* _staticInvUCA = NULL;
43 static UDataMemory* invUCA_DATA_MEM = NULL;
44
45 U_CDECL_BEGIN
46 static UBool U_CALLCONV
isAcceptableInvUCA(void *,const char *,const char *,const UDataInfo * pInfo)47 isAcceptableInvUCA(void * /*context*/,
48 const char * /*type*/, const char * /*name*/,
49 const UDataInfo *pInfo)
50 {
51 /* context, type & name are intentionally not used */
52 if( pInfo->size>=20 &&
53 pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
54 pInfo->charsetFamily==U_CHARSET_FAMILY &&
55 pInfo->dataFormat[0]==INVUCA_DATA_FORMAT_0 && /* dataFormat="InvC" */
56 pInfo->dataFormat[1]==INVUCA_DATA_FORMAT_1 &&
57 pInfo->dataFormat[2]==INVUCA_DATA_FORMAT_2 &&
58 pInfo->dataFormat[3]==INVUCA_DATA_FORMAT_3 &&
59 pInfo->formatVersion[0]==INVUCA_FORMAT_VERSION_0 &&
60 pInfo->formatVersion[1]>=INVUCA_FORMAT_VERSION_1 //&&
61 //pInfo->formatVersion[1]==INVUCA_FORMAT_VERSION_1 &&
62 //pInfo->formatVersion[2]==INVUCA_FORMAT_VERSION_2 &&
63 //pInfo->formatVersion[3]==INVUCA_FORMAT_VERSION_3 &&
64 )
65 {
66 UVersionInfo UCDVersion;
67 u_getUnicodeVersion(UCDVersion);
68 return (pInfo->dataVersion[0]==UCDVersion[0] &&
69 pInfo->dataVersion[1]==UCDVersion[1]);
70 //pInfo->dataVersion[1]==invUcaDataInfo.dataVersion[1] &&
71 //pInfo->dataVersion[2]==invUcaDataInfo.dataVersion[2] &&
72 //pInfo->dataVersion[3]==invUcaDataInfo.dataVersion[3]) {
73 } else {
74 return FALSE;
75 }
76 }
77 U_CDECL_END
78
79 /*
80 * Takes two CEs (lead and continuation) and
81 * compares them as CEs should be compared:
82 * primary vs. primary, secondary vs. secondary
83 * tertiary vs. tertiary
84 */
compareCEs(uint32_t source0,uint32_t source1,uint32_t target0,uint32_t target1)85 static int32_t compareCEs(uint32_t source0, uint32_t source1, uint32_t target0, uint32_t target1) {
86 uint32_t s1 = source0, s2, t1 = target0, t2;
87 if(isContinuation(source1)) {
88 s2 = source1;
89 } else {
90 s2 = 0;
91 }
92 if(isContinuation(target1)) {
93 t2 = target1;
94 } else {
95 t2 = 0;
96 }
97
98 uint32_t s = 0, t = 0;
99 if(s1 == t1 && s2 == t2) {
100 return 0;
101 }
102 s = (s1 & 0xFFFF0000)|((s2 & 0xFFFF0000)>>16);
103 t = (t1 & 0xFFFF0000)|((t2 & 0xFFFF0000)>>16);
104 if(s < t) {
105 return -1;
106 } else if(s > t) {
107 return 1;
108 } else {
109 s = (s1 & 0x0000FF00) | (s2 & 0x0000FF00)>>8;
110 t = (t1 & 0x0000FF00) | (t2 & 0x0000FF00)>>8;
111 if(s < t) {
112 return -1;
113 } else if(s > t) {
114 return 1;
115 } else {
116 s = (s1 & 0x000000FF)<<8 | (s2 & 0x000000FF);
117 t = (t1 & 0x000000FF)<<8 | (t2 & 0x000000FF);
118 if(s < t) {
119 return -1;
120 } else {
121 return 1;
122 }
123 }
124 }
125 }
126
127 static
ucol_inv_findCE(const UColTokenParser * src,uint32_t CE,uint32_t SecondCE)128 int32_t ucol_inv_findCE(const UColTokenParser *src, uint32_t CE, uint32_t SecondCE) {
129 uint32_t bottom = 0, top = src->invUCA->tableSize;
130 uint32_t i = 0;
131 uint32_t first = 0, second = 0;
132 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
133 int32_t res = 0;
134
135 while(bottom < top-1) {
136 i = (top+bottom)/2;
137 first = *(CETable+3*i);
138 second = *(CETable+3*i+1);
139 res = compareCEs(first, second, CE, SecondCE);
140 if(res > 0) {
141 top = i;
142 } else if(res < 0) {
143 bottom = i;
144 } else {
145 break;
146 }
147 }
148
149 /* weiv: */
150 /* in searching for elements, I have removed the failure */
151 /* The reason for this is that the builder does not rely */
152 /* on search mechanism telling it that it didn't find an */
153 /* element. However, indirect positioning relies on being */
154 /* able to find the elements around any CE, even if it is */
155 /* not defined in the UCA. */
156 return i;
157 /*
158 if((first == CE && second == SecondCE)) {
159 return i;
160 } else {
161 return -1;
162 }
163 */
164 }
165
166 static const uint32_t strengthMask[UCOL_CE_STRENGTH_LIMIT] = {
167 0xFFFF0000,
168 0xFFFFFF00,
169 0xFFFFFFFF
170 };
171
ucol_inv_getNextCE(const UColTokenParser * src,uint32_t CE,uint32_t contCE,uint32_t * nextCE,uint32_t * nextContCE,uint32_t strength)172 U_CAPI int32_t U_EXPORT2 ucol_inv_getNextCE(const UColTokenParser *src,
173 uint32_t CE, uint32_t contCE,
174 uint32_t *nextCE, uint32_t *nextContCE,
175 uint32_t strength)
176 {
177 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
178 int32_t iCE;
179
180 iCE = ucol_inv_findCE(src, CE, contCE);
181
182 if(iCE<0) {
183 *nextCE = UCOL_NOT_FOUND;
184 return -1;
185 }
186
187 CE &= strengthMask[strength];
188 contCE &= strengthMask[strength];
189
190 *nextCE = CE;
191 *nextContCE = contCE;
192
193 while((*nextCE & strengthMask[strength]) == CE
194 && (*nextContCE & strengthMask[strength]) == contCE)
195 {
196 *nextCE = (*(CETable+3*(++iCE)));
197 *nextContCE = (*(CETable+3*(iCE)+1));
198 }
199
200 return iCE;
201 }
202
ucol_inv_getPrevCE(const UColTokenParser * src,uint32_t CE,uint32_t contCE,uint32_t * prevCE,uint32_t * prevContCE,uint32_t strength)203 U_CFUNC int32_t U_EXPORT2 ucol_inv_getPrevCE(const UColTokenParser *src,
204 uint32_t CE, uint32_t contCE,
205 uint32_t *prevCE, uint32_t *prevContCE,
206 uint32_t strength)
207 {
208 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
209 int32_t iCE;
210
211 iCE = ucol_inv_findCE(src, CE, contCE);
212
213 if(iCE<0) {
214 *prevCE = UCOL_NOT_FOUND;
215 return -1;
216 }
217
218 CE &= strengthMask[strength];
219 contCE &= strengthMask[strength];
220
221 *prevCE = CE;
222 *prevContCE = contCE;
223
224 while((*prevCE & strengthMask[strength]) == CE
225 && (*prevContCE & strengthMask[strength])== contCE
226 && iCE > 0) /* this condition should prevent falling off the edge of the world */
227 {
228 /* here, we end up in a singularity - zero */
229 *prevCE = (*(CETable+3*(--iCE)));
230 *prevContCE = (*(CETable+3*(iCE)+1));
231 }
232
233 return iCE;
234 }
235
ucol_getCEStrengthDifference(uint32_t CE,uint32_t contCE,uint32_t prevCE,uint32_t prevContCE)236 U_CFUNC uint32_t U_EXPORT2 ucol_getCEStrengthDifference(uint32_t CE, uint32_t contCE,
237 uint32_t prevCE, uint32_t prevContCE)
238 {
239 if(prevCE == CE && prevContCE == contCE) {
240 return UCOL_IDENTICAL;
241 }
242 if((prevCE & strengthMask[UCOL_PRIMARY]) != (CE & strengthMask[UCOL_PRIMARY])
243 || (prevContCE & strengthMask[UCOL_PRIMARY]) != (contCE & strengthMask[UCOL_PRIMARY]))
244 {
245 return UCOL_PRIMARY;
246 }
247 if((prevCE & strengthMask[UCOL_SECONDARY]) != (CE & strengthMask[UCOL_SECONDARY])
248 || (prevContCE & strengthMask[UCOL_SECONDARY]) != (contCE & strengthMask[UCOL_SECONDARY]))
249 {
250 return UCOL_SECONDARY;
251 }
252 return UCOL_TERTIARY;
253 }
254
255
256 /*static
257 inline int32_t ucol_inv_getPrevious(UColTokenParser *src, UColTokListHeader *lh, uint32_t strength) {
258
259 uint32_t CE = lh->baseCE;
260 uint32_t SecondCE = lh->baseContCE;
261
262 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
263 uint32_t previousCE, previousContCE;
264 int32_t iCE;
265
266 iCE = ucol_inv_findCE(src, CE, SecondCE);
267
268 if(iCE<0) {
269 return -1;
270 }
271
272 CE &= strengthMask[strength];
273 SecondCE &= strengthMask[strength];
274
275 previousCE = CE;
276 previousContCE = SecondCE;
277
278 while((previousCE & strengthMask[strength]) == CE && (previousContCE & strengthMask[strength])== SecondCE) {
279 previousCE = (*(CETable+3*(--iCE)));
280 previousContCE = (*(CETable+3*(iCE)+1));
281 }
282 lh->previousCE = previousCE;
283 lh->previousContCE = previousContCE;
284
285 return iCE;
286 }*/
287
288 static
ucol_inv_getNext(UColTokenParser * src,UColTokListHeader * lh,uint32_t strength)289 inline int32_t ucol_inv_getNext(UColTokenParser *src, UColTokListHeader *lh, uint32_t strength) {
290 uint32_t CE = lh->baseCE;
291 uint32_t SecondCE = lh->baseContCE;
292
293 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
294 uint32_t nextCE, nextContCE;
295 int32_t iCE;
296
297 iCE = ucol_inv_findCE(src, CE, SecondCE);
298
299 if(iCE<0) {
300 return -1;
301 }
302
303 CE &= strengthMask[strength];
304 SecondCE &= strengthMask[strength];
305
306 nextCE = CE;
307 nextContCE = SecondCE;
308
309 while((nextCE & strengthMask[strength]) == CE
310 && (nextContCE & strengthMask[strength]) == SecondCE)
311 {
312 nextCE = (*(CETable+3*(++iCE)));
313 nextContCE = (*(CETable+3*(iCE)+1));
314 }
315
316 lh->nextCE = nextCE;
317 lh->nextContCE = nextContCE;
318
319 return iCE;
320 }
321
ucol_inv_getGapPositions(UColTokenParser * src,UColTokListHeader * lh,UErrorCode * status)322 static void ucol_inv_getGapPositions(UColTokenParser *src, UColTokListHeader *lh, UErrorCode *status) {
323 /* reset all the gaps */
324 int32_t i = 0;
325 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
326 uint32_t st = 0;
327 uint32_t t1, t2;
328 int32_t pos;
329
330 UColToken *tok = lh->first;
331 uint32_t tokStrength = tok->strength;
332
333 for(i = 0; i<3; i++) {
334 lh->gapsHi[3*i] = 0;
335 lh->gapsHi[3*i+1] = 0;
336 lh->gapsHi[3*i+2] = 0;
337 lh->gapsLo[3*i] = 0;
338 lh->gapsLo[3*i+1] = 0;
339 lh->gapsLo[3*i+2] = 0;
340 lh->numStr[i] = 0;
341 lh->fStrToken[i] = NULL;
342 lh->lStrToken[i] = NULL;
343 lh->pos[i] = -1;
344 }
345
346 UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);
347
348 if((lh->baseCE & 0xFF000000)>= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && (lh->baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */
349 //if(lh->baseCE >= PRIMARY_IMPLICIT_MIN && lh->baseCE < PRIMARY_IMPLICIT_MAX ) { /* implicits - */
350 lh->pos[0] = 0;
351 t1 = lh->baseCE;
352 t2 = lh->baseContCE & UCOL_REMOVE_CONTINUATION;
353 lh->gapsLo[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
354 lh->gapsLo[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
355 lh->gapsLo[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
356 uint32_t primaryCE = (t1 & UCOL_PRIMARYMASK) | ((t2 & UCOL_PRIMARYMASK) >> 16);
357 primaryCE = uprv_uca_getImplicitFromRaw(uprv_uca_getRawFromImplicit(primaryCE)+1);
358
359 t1 = (primaryCE & UCOL_PRIMARYMASK) | 0x0505;
360 t2 = (primaryCE << 16) & UCOL_PRIMARYMASK; // | UCOL_CONTINUATION_MARKER;
361
362 lh->gapsHi[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
363 lh->gapsHi[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
364 lh->gapsHi[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
365 } else if(lh->indirect == TRUE && lh->nextCE != 0) {
366 //} else if(lh->baseCE == UCOL_RESET_TOP_VALUE && lh->baseContCE == 0) {
367 lh->pos[0] = 0;
368 t1 = lh->baseCE;
369 t2 = lh->baseContCE&UCOL_REMOVE_CONTINUATION;
370 lh->gapsLo[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
371 lh->gapsLo[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
372 lh->gapsLo[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
373 t1 = lh->nextCE;
374 t2 = lh->nextContCE&UCOL_REMOVE_CONTINUATION;
375 lh->gapsHi[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
376 lh->gapsHi[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
377 lh->gapsHi[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
378 } else {
379 for(;;) {
380 if(tokStrength < UCOL_CE_STRENGTH_LIMIT) {
381 if((lh->pos[tokStrength] = ucol_inv_getNext(src, lh, tokStrength)) >= 0) {
382 lh->fStrToken[tokStrength] = tok;
383 } else { /* The CE must be implicit, since it's not in the table */
384 /* Error */
385 *status = U_INTERNAL_PROGRAM_ERROR;
386 }
387 }
388
389 while(tok != NULL && tok->strength >= tokStrength) {
390 if(tokStrength < UCOL_CE_STRENGTH_LIMIT) {
391 lh->lStrToken[tokStrength] = tok;
392 }
393 tok = tok->next;
394 }
395 if(tokStrength < UCOL_CE_STRENGTH_LIMIT-1) {
396 /* check if previous interval is the same and merge the intervals if it is so */
397 if(lh->pos[tokStrength] == lh->pos[tokStrength+1]) {
398 lh->fStrToken[tokStrength] = lh->fStrToken[tokStrength+1];
399 lh->fStrToken[tokStrength+1] = NULL;
400 lh->lStrToken[tokStrength+1] = NULL;
401 lh->pos[tokStrength+1] = -1;
402 }
403 }
404 if(tok != NULL) {
405 tokStrength = tok->strength;
406 } else {
407 break;
408 }
409 }
410 for(st = 0; st < 3; st++) {
411 if((pos = lh->pos[st]) >= 0) {
412 t1 = *(CETable+3*(pos));
413 t2 = *(CETable+3*(pos)+1);
414 lh->gapsHi[3*st] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
415 lh->gapsHi[3*st+1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
416 //lh->gapsHi[3*st+2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
417 lh->gapsHi[3*st+2] = (t1&0x3f) << 24 | (t2&0x3f) << 16;
418 //pos--;
419 //t1 = *(CETable+3*(pos));
420 //t2 = *(CETable+3*(pos)+1);
421 t1 = lh->baseCE;
422 t2 = lh->baseContCE;
423 lh->gapsLo[3*st] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
424 lh->gapsLo[3*st+1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
425 lh->gapsLo[3*st+2] = (t1&0x3f) << 24 | (t2&0x3f) << 16;
426 }
427 }
428 }
429 }
430
431
/*
 * Stores in noOfBytes how many of the masks 0xFFFFFFFF, 0x00FFFFFF,
 * 0x0000FFFF and 0x000000FF select a non-zero part of value. For a weight
 * whose bytes are packed from the top of the word this is the position of
 * its lowest non-zero byte, counted from the most significant byte
 * (0 for a zero value).
 *
 * Wrapped in do { } while(0) so that the invocation followed by ';' is a
 * single statement and can be used in an unbraced if/else.
 * Both arguments are evaluated more than once.
 */
#define ucol_countBytes(value, noOfBytes)               \
    do {                                                \
        uint32_t countBytesMask_ = 0xFFFFFFFF;          \
        (noOfBytes) = 0;                                \
        while(countBytesMask_ != 0) {                   \
            if(((value) & countBytesMask_) != 0) {      \
                (noOfBytes)++;                          \
            }                                           \
            countBytesMask_ >>= 8;                      \
        }                                               \
    } while(0)
443
ucol_getNextGenerated(ucolCEGenerator * g,UErrorCode * status)444 static uint32_t ucol_getNextGenerated(ucolCEGenerator *g, UErrorCode *status) {
445 if(U_SUCCESS(*status)) {
446 g->current = ucol_nextWeight(g->ranges, &g->noOfRanges);
447 }
448 return g->current;
449 }
450
ucol_getSimpleCEGenerator(ucolCEGenerator * g,UColToken * tok,uint32_t strength,UErrorCode * status)451 static uint32_t ucol_getSimpleCEGenerator(ucolCEGenerator *g, UColToken *tok, uint32_t strength, UErrorCode *status) {
452 /* TODO: rename to enum names */
453 uint32_t high, low, count=1;
454 uint32_t maxByte = (strength == UCOL_TERTIARY)?0x3F:0xFF;
455
456 if(strength == UCOL_SECONDARY) {
457 low = UCOL_COMMON_TOP2<<24;
458 high = 0xFFFFFFFF;
459 count = 0xFF - UCOL_COMMON_TOP2;
460 } else {
461 low = UCOL_BYTE_COMMON << 24; //0x05000000;
462 high = 0x40000000;
463 count = 0x40 - UCOL_BYTE_COMMON;
464 }
465
466 if(tok->next != NULL && tok->next->strength == strength) {
467 count = tok->next->toInsert;
468 }
469
470 g->noOfRanges = ucol_allocWeights(low, high, count, maxByte, g->ranges);
471 g->current = UCOL_BYTE_COMMON<<24;
472
473 if(g->noOfRanges == 0) {
474 *status = U_INTERNAL_PROGRAM_ERROR;
475 }
476 return g->current;
477 }
478
ucol_getCEGenerator(ucolCEGenerator * g,uint32_t * lows,uint32_t * highs,UColToken * tok,uint32_t fStrength,UErrorCode * status)479 static uint32_t ucol_getCEGenerator(ucolCEGenerator *g, uint32_t* lows, uint32_t* highs, UColToken *tok, uint32_t fStrength, UErrorCode *status) {
480 uint32_t strength = tok->strength;
481 uint32_t low = lows[fStrength*3+strength];
482 uint32_t high = highs[fStrength*3+strength];
483 uint32_t maxByte = 0;
484 if(strength == UCOL_TERTIARY) {
485 maxByte = 0x3F;
486 } else if(strength == UCOL_PRIMARY) {
487 maxByte = 0xFE;
488 } else {
489 maxByte = 0xFF;
490 }
491
492 uint32_t count = tok->toInsert;
493
494 if(low >= high && strength > UCOL_PRIMARY) {
495 int32_t s = strength;
496 for(;;) {
497 s--;
498 if(lows[fStrength*3+s] != highs[fStrength*3+s]) {
499 if(strength == UCOL_SECONDARY) {
500 if (low < UCOL_COMMON_TOP2<<24 ) {
501 // Override if low range is less than UCOL_COMMON_TOP2.
502 low = UCOL_COMMON_TOP2<<24;
503 }
504 high = 0xFFFFFFFF;
505 } else {
506 // Override if low range is less than UCOL_COMMON_BOT3.
507 if ( low < UCOL_COMMON_BOT3<<24 ) {
508 low = UCOL_COMMON_BOT3<<24;
509 }
510 high = 0x40000000;
511 }
512 break;
513 }
514 if(s<0) {
515 *status = U_INTERNAL_PROGRAM_ERROR;
516 return 0;
517 }
518 }
519 }
520
521 if(low < 0x02000000) {
522 // We must not use CE weight byte 02, so we set it as the minimum lower bound.
523 // See http://site.icu-project.org/design/collation/bytes
524 low = 0x02000000;
525 }
526
527 if(strength == UCOL_SECONDARY) { /* similar as simple */
528 if(low >= (UCOL_COMMON_BOT2<<24) && low < (uint32_t)(UCOL_COMMON_TOP2<<24)) {
529 low = UCOL_COMMON_TOP2<<24;
530 }
531 if(high > (UCOL_COMMON_BOT2<<24) && high < (uint32_t)(UCOL_COMMON_TOP2<<24)) {
532 high = UCOL_COMMON_TOP2<<24;
533 }
534 if(low < (UCOL_COMMON_BOT2<<24)) {
535 g->noOfRanges = ucol_allocWeights(UCOL_BYTE_UNSHIFTED_MIN<<24, high, count, maxByte, g->ranges);
536 g->current = ucol_nextWeight(g->ranges, &g->noOfRanges);
537 //g->current = UCOL_COMMON_BOT2<<24;
538 return g->current;
539 }
540 }
541
542 g->noOfRanges = ucol_allocWeights(low, high, count, maxByte, g->ranges);
543 if(g->noOfRanges == 0) {
544 *status = U_INTERNAL_PROGRAM_ERROR;
545 }
546 g->current = ucol_nextWeight(g->ranges, &g->noOfRanges);
547 return g->current;
548 }
549
550 static
u_toLargeKana(const UChar * source,const uint32_t sourceLen,UChar * resBuf,const uint32_t resLen,UErrorCode * status)551 uint32_t u_toLargeKana(const UChar *source, const uint32_t sourceLen, UChar *resBuf, const uint32_t resLen, UErrorCode *status) {
552 uint32_t i = 0;
553 UChar c;
554
555 if(U_FAILURE(*status)) {
556 return 0;
557 }
558
559 if(sourceLen > resLen) {
560 *status = U_MEMORY_ALLOCATION_ERROR;
561 return 0;
562 }
563
564 for(i = 0; i < sourceLen; i++) {
565 c = source[i];
566 if(0x3041 <= c && c <= 0x30FA) { /* Kana range */
567 switch(c - 0x3000) {
568 case 0x41: case 0x43: case 0x45: case 0x47: case 0x49: case 0x63: case 0x83: case 0x85: case 0x8E:
569 case 0xA1: case 0xA3: case 0xA5: case 0xA7: case 0xA9: case 0xC3: case 0xE3: case 0xE5: case 0xEE:
570 c++;
571 break;
572 case 0xF5:
573 c = 0x30AB;
574 break;
575 case 0xF6:
576 c = 0x30B1;
577 break;
578 }
579 }
580 resBuf[i] = c;
581 }
582 return sourceLen;
583 }
584
585 static
u_toSmallKana(const UChar * source,const uint32_t sourceLen,UChar * resBuf,const uint32_t resLen,UErrorCode * status)586 uint32_t u_toSmallKana(const UChar *source, const uint32_t sourceLen, UChar *resBuf, const uint32_t resLen, UErrorCode *status) {
587 uint32_t i = 0;
588 UChar c;
589
590 if(U_FAILURE(*status)) {
591 return 0;
592 }
593
594 if(sourceLen > resLen) {
595 *status = U_MEMORY_ALLOCATION_ERROR;
596 return 0;
597 }
598
599 for(i = 0; i < sourceLen; i++) {
600 c = source[i];
601 if(0x3041 <= c && c <= 0x30FA) { /* Kana range */
602 switch(c - 0x3000) {
603 case 0x42: case 0x44: case 0x46: case 0x48: case 0x4A: case 0x64: case 0x84: case 0x86: case 0x8F:
604 case 0xA2: case 0xA4: case 0xA6: case 0xA8: case 0xAA: case 0xC4: case 0xE4: case 0xE6: case 0xEF:
605 c--;
606 break;
607 case 0xAB:
608 c = 0x30F5;
609 break;
610 case 0xB1:
611 c = 0x30F6;
612 break;
613 }
614 }
615 resBuf[i] = c;
616 }
617 return sourceLen;
618 }
619
620 U_NAMESPACE_BEGIN
621
622 static
ucol_uprv_getCaseBits(const UCollator * UCA,const UChar * src,uint32_t len,UErrorCode * status)623 uint8_t ucol_uprv_getCaseBits(const UCollator *UCA, const UChar *src, uint32_t len, UErrorCode *status) {
624 uint32_t i = 0;
625 UChar n[128];
626 uint32_t nLen = 0;
627 uint32_t uCount = 0, lCount = 0;
628
629 collIterate s;
630 uint32_t order = 0;
631
632 if(U_FAILURE(*status)) {
633 return UCOL_LOWER_CASE;
634 }
635
636 nLen = unorm_normalize(src, len, UNORM_NFKD, 0, n, 128, status);
637 if(U_SUCCESS(*status)) {
638 for(i = 0; i < nLen; i++) {
639 uprv_init_collIterate(UCA, &n[i], 1, &s, status);
640 order = ucol_getNextCE(UCA, &s, status);
641 if(isContinuation(order)) {
642 *status = U_INTERNAL_PROGRAM_ERROR;
643 return UCOL_LOWER_CASE;
644 }
645 if((order&UCOL_CASE_BIT_MASK)== UCOL_UPPER_CASE) {
646 uCount++;
647 } else {
648 if(u_islower(n[i])) {
649 lCount++;
650 } else if(U_SUCCESS(*status)) {
651 UChar sk[1], lk[1];
652 u_toSmallKana(&n[i], 1, sk, 1, status);
653 u_toLargeKana(&n[i], 1, lk, 1, status);
654 if(sk[0] == n[i] && lk[0] != n[i]) {
655 lCount++;
656 }
657 }
658 }
659 }
660 }
661
662 if(uCount != 0 && lCount != 0) {
663 return UCOL_MIXED_CASE;
664 } else if(uCount != 0) {
665 return UCOL_UPPER_CASE;
666 } else {
667 return UCOL_LOWER_CASE;
668 }
669 }
670
671
ucol_doCE(UColTokenParser * src,uint32_t * CEparts,UColToken * tok,UErrorCode * status)672 U_CFUNC void ucol_doCE(UColTokenParser *src, uint32_t *CEparts, UColToken *tok, UErrorCode *status) {
673 /* this one makes the table and stuff */
674 uint32_t noOfBytes[3];
675 uint32_t i;
676
677 for(i = 0; i<3; i++) {
678 ucol_countBytes(CEparts[i], noOfBytes[i]);
679 }
680
681 /* Here we have to pack CEs from parts */
682
683 uint32_t CEi = 0;
684 uint32_t value = 0;
685
686 while(2*CEi<noOfBytes[0] || CEi<noOfBytes[1] || CEi<noOfBytes[2]) {
687 if(CEi > 0) {
688 value = UCOL_CONTINUATION_MARKER; /* Continuation marker */
689 } else {
690 value = 0;
691 }
692
693 if(2*CEi<noOfBytes[0]) {
694 value |= ((CEparts[0]>>(32-16*(CEi+1))) & 0xFFFF) << 16;
695 }
696 if(CEi<noOfBytes[1]) {
697 value |= ((CEparts[1]>>(32-8*(CEi+1))) & 0xFF) << 8;
698 }
699 if(CEi<noOfBytes[2]) {
700 value |= ((CEparts[2]>>(32-8*(CEi+1))) & 0x3F);
701 }
702 tok->CEs[CEi] = value;
703 CEi++;
704 }
705 if(CEi == 0) { /* totally ignorable */
706 tok->noOfCEs = 1;
707 tok->CEs[0] = 0;
708 } else { /* there is at least something */
709 tok->noOfCEs = CEi;
710 }
711
712
713 // we want to set case bits here and now, not later.
714 // Case bits handling
715 if(tok->CEs[0] != 0) { // case bits should be set only for non-ignorables
716 tok->CEs[0] &= 0xFFFFFF3F; // Clean the case bits field
717 int32_t cSize = (tok->source & 0xFF000000) >> 24;
718 UChar *cPoints = (tok->source & 0x00FFFFFF) + src->source;
719
720 if(cSize > 1) {
721 // Do it manually
722 tok->CEs[0] |= ucol_uprv_getCaseBits(src->UCA, cPoints, cSize, status);
723 } else {
724 // Copy it from the UCA
725 uint32_t caseCE = ucol_getFirstCE(src->UCA, cPoints[0], status);
726 tok->CEs[0] |= (caseCE & 0xC0);
727 }
728 }
729
730 #if UCOL_DEBUG==2
731 fprintf(stderr, "%04X str: %i, [%08X, %08X, %08X]: tok: ", tok->debugSource, tok->strength, CEparts[0] >> (32-8*noOfBytes[0]), CEparts[1] >> (32-8*noOfBytes[1]), CEparts[2]>> (32-8*noOfBytes[2]));
732 for(i = 0; i<tok->noOfCEs; i++) {
733 fprintf(stderr, "%08X ", tok->CEs[i]);
734 }
735 fprintf(stderr, "\n");
736 #endif
737 }
738
ucol_initBuffers(UColTokenParser * src,UColTokListHeader * lh,UErrorCode * status)739 U_CFUNC void ucol_initBuffers(UColTokenParser *src, UColTokListHeader *lh, UErrorCode *status) {
740 ucolCEGenerator Gens[UCOL_CE_STRENGTH_LIMIT];
741 uint32_t CEparts[UCOL_CE_STRENGTH_LIMIT];
742
743 UColToken *tok = lh->last;
744 uint32_t t[UCOL_STRENGTH_LIMIT];
745
746 uprv_memset(t, 0, UCOL_STRENGTH_LIMIT*sizeof(uint32_t));
747
748 /* must initialize ranges to avoid memory check warnings */
749 for (int i = 0; i < UCOL_CE_STRENGTH_LIMIT; i++) {
750 uprv_memset(Gens[i].ranges, 0, sizeof(Gens[i].ranges));
751 }
752
753 tok->toInsert = 1;
754 t[tok->strength] = 1;
755
756 while(tok->previous != NULL) {
757 if(tok->previous->strength < tok->strength) { /* going up */
758 t[tok->strength] = 0;
759 t[tok->previous->strength]++;
760 } else if(tok->previous->strength > tok->strength) { /* going down */
761 t[tok->previous->strength] = 1;
762 } else {
763 t[tok->strength]++;
764 }
765 tok=tok->previous;
766 tok->toInsert = t[tok->strength];
767 }
768
769 tok->toInsert = t[tok->strength];
770 ucol_inv_getGapPositions(src, lh, status);
771
772 #if UCOL_DEBUG
773 fprintf(stderr, "BaseCE: %08X %08X\n", lh->baseCE, lh->baseContCE);
774 int32_t j = 2;
775 for(j = 2; j >= 0; j--) {
776 fprintf(stderr, "gapsLo[%i] [%08X %08X %08X]\n", j, lh->gapsLo[j*3], lh->gapsLo[j*3+1], lh->gapsLo[j*3+2]);
777 fprintf(stderr, "gapsHi[%i] [%08X %08X %08X]\n", j, lh->gapsHi[j*3], lh->gapsHi[j*3+1], lh->gapsHi[j*3+2]);
778 }
779 tok=&lh->first[UCOL_TOK_POLARITY_POSITIVE];
780
781 do {
782 fprintf(stderr,"%i", tok->strength);
783 tok = tok->next;
784 } while(tok != NULL);
785 fprintf(stderr, "\n");
786
787 tok=&lh->first[UCOL_TOK_POLARITY_POSITIVE];
788
789 do {
790 fprintf(stderr,"%i", tok->toInsert);
791 tok = tok->next;
792 } while(tok != NULL);
793 #endif
794
795 tok = lh->first;
796 uint32_t fStrength = UCOL_IDENTICAL;
797 uint32_t initStrength = UCOL_IDENTICAL;
798
799
800 CEparts[UCOL_PRIMARY] = (lh->baseCE & UCOL_PRIMARYMASK) | (lh->baseContCE & UCOL_PRIMARYMASK) >> 16;
801 CEparts[UCOL_SECONDARY] = (lh->baseCE & UCOL_SECONDARYMASK) << 16 | (lh->baseContCE & UCOL_SECONDARYMASK) << 8;
802 CEparts[UCOL_TERTIARY] = (UCOL_TERTIARYORDER(lh->baseCE)) << 24 | (UCOL_TERTIARYORDER(lh->baseContCE)) << 16;
803
804 while (tok != NULL && U_SUCCESS(*status)) {
805 fStrength = tok->strength;
806 if(fStrength < initStrength) {
807 initStrength = fStrength;
808 if(lh->pos[fStrength] == -1) {
809 while(lh->pos[fStrength] == -1 && fStrength > 0) {
810 fStrength--;
811 }
812 if(lh->pos[fStrength] == -1) {
813 *status = U_INTERNAL_PROGRAM_ERROR;
814 return;
815 }
816 }
817 if(initStrength == UCOL_TERTIARY) { /* starting with tertiary */
818 CEparts[UCOL_PRIMARY] = lh->gapsLo[fStrength*3];
819 CEparts[UCOL_SECONDARY] = lh->gapsLo[fStrength*3+1];
820 /*CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[2], lh->gapsLo[fStrength*3+2], lh->gapsHi[fStrength*3+2], tok, UCOL_TERTIARY); */
821 CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[UCOL_TERTIARY], lh->gapsLo, lh->gapsHi, tok, fStrength, status);
822 } else if(initStrength == UCOL_SECONDARY) { /* secondaries */
823 CEparts[UCOL_PRIMARY] = lh->gapsLo[fStrength*3];
824 /*CEparts[1] = ucol_getCEGenerator(&Gens[1], lh->gapsLo[fStrength*3+1], lh->gapsHi[fStrength*3+1], tok, 1);*/
825 CEparts[UCOL_SECONDARY] = ucol_getCEGenerator(&Gens[UCOL_SECONDARY], lh->gapsLo, lh->gapsHi, tok, fStrength, status);
826 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
827 } else { /* primaries */
828 /*CEparts[UCOL_PRIMARY] = ucol_getCEGenerator(&Gens[0], lh->gapsLo[0], lh->gapsHi[0], tok, UCOL_PRIMARY);*/
829 CEparts[UCOL_PRIMARY] = ucol_getCEGenerator(&Gens[UCOL_PRIMARY], lh->gapsLo, lh->gapsHi, tok, fStrength, status);
830 CEparts[UCOL_SECONDARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_SECONDARY], tok, UCOL_SECONDARY, status);
831 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
832 }
833 } else {
834 if(tok->strength == UCOL_TERTIARY) {
835 CEparts[UCOL_TERTIARY] = ucol_getNextGenerated(&Gens[UCOL_TERTIARY], status);
836 } else if(tok->strength == UCOL_SECONDARY) {
837 CEparts[UCOL_SECONDARY] = ucol_getNextGenerated(&Gens[UCOL_SECONDARY], status);
838 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
839 } else if(tok->strength == UCOL_PRIMARY) {
840 CEparts[UCOL_PRIMARY] = ucol_getNextGenerated(&Gens[UCOL_PRIMARY], status);
841 CEparts[UCOL_SECONDARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_SECONDARY], tok, UCOL_SECONDARY, status);
842 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
843 }
844 }
845 ucol_doCE(src, CEparts, tok, status);
846 tok = tok->next;
847 }
848 }
849
ucol_createElements(UColTokenParser * src,tempUCATable * t,UColTokListHeader * lh,UErrorCode * status)850 U_CFUNC void ucol_createElements(UColTokenParser *src, tempUCATable *t, UColTokListHeader *lh, UErrorCode *status) {
851 UCAElements el;
852 UColToken *tok = lh->first;
853 UColToken *expt = NULL;
854 uint32_t i = 0, j = 0;
855 const Normalizer2Impl *nfcImpl = Normalizer2Factory::getNFCImpl(*status);
856
857 while(tok != NULL && U_SUCCESS(*status)) {
858 /* first, check if there are any expansions */
859 /* if there are expansions, we need to do a little bit more processing */
860 /* since parts of expansion can be tailored, while others are not */
861 if(tok->expansion != 0) {
862 uint32_t len = tok->expansion >> 24;
863 uint32_t currentSequenceLen = len;
864 uint32_t expOffset = tok->expansion & 0x00FFFFFF;
865 //uint32_t exp = currentSequenceLen | expOffset;
866 UColToken exp;
867 exp.source = currentSequenceLen | expOffset;
868 exp.rulesToParseHdl = &(src->source);
869
870 while(len > 0) {
871 currentSequenceLen = len;
872 while(currentSequenceLen > 0) {
873 exp.source = (currentSequenceLen << 24) | expOffset;
874 if((expt = (UColToken *)uhash_get(src->tailored, &exp)) != NULL && expt->strength != UCOL_TOK_RESET) { /* expansion is tailored */
875 uint32_t noOfCEsToCopy = expt->noOfCEs;
876 for(j = 0; j<noOfCEsToCopy; j++) {
877 tok->expCEs[tok->noOfExpCEs + j] = expt->CEs[j];
878 }
879 tok->noOfExpCEs += noOfCEsToCopy;
880 // Smart people never try to add codepoints and CEs.
881 // For some odd reason, it won't work.
882 expOffset += currentSequenceLen; //noOfCEsToCopy;
883 len -= currentSequenceLen; //noOfCEsToCopy;
884 break;
885 } else {
886 currentSequenceLen--;
887 }
888 }
889 if(currentSequenceLen == 0) { /* couldn't find any tailored subsequence */
890 /* will have to get one from UCA */
891 /* first, get the UChars from the rules */
892 /* then pick CEs out until there is no more and stuff them into expansion */
893 collIterate s;
894 uint32_t order = 0;
895 uprv_init_collIterate(src->UCA, expOffset + src->source, 1, &s, status);
896
897 for(;;) {
898 order = ucol_getNextCE(src->UCA, &s, status);
899 if(order == UCOL_NO_MORE_CES) {
900 break;
901 }
902 tok->expCEs[tok->noOfExpCEs++] = order;
903 }
904 expOffset++;
905 len--;
906 }
907 }
908 } else {
909 tok->noOfExpCEs = 0;
910 }
911
912 /* set the ucaelement with obtained values */
913 el.noOfCEs = tok->noOfCEs + tok->noOfExpCEs;
914 /* copy CEs */
915 for(i = 0; i<tok->noOfCEs; i++) {
916 el.CEs[i] = tok->CEs[i];
917 }
918 for(i = 0; i<tok->noOfExpCEs; i++) {
919 el.CEs[i+tok->noOfCEs] = tok->expCEs[i];
920 }
921
922 /* copy UChars */
923 // We kept prefix and source kind of together, as it is a kind of a contraction.
924 // However, now we have to slice the prefix off the main thing -
925 el.prefix = el.prefixChars;
926 el.cPoints = el.uchars;
927 if(tok->prefix != 0) { // we will just copy the prefix here, and adjust accordingly in the
928 // addPrefix function in ucol_elm. The reason is that we need to add both composed AND
929 // decomposed elements to the unsaf table.
930 el.prefixSize = tok->prefix>>24;
931 uprv_memcpy(el.prefix, src->source + (tok->prefix & 0x00FFFFFF), el.prefixSize*sizeof(UChar));
932
933 el.cSize = (tok->source >> 24)-(tok->prefix>>24);
934 uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF)+(tok->prefix>>24) + src->source, el.cSize*sizeof(UChar));
935 } else {
936 el.prefixSize = 0;
937 *el.prefix = 0;
938
939 el.cSize = (tok->source >> 24);
940 uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF) + src->source, el.cSize*sizeof(UChar));
941 }
942 if(src->UCA != NULL) {
943 for(i = 0; i<el.cSize; i++) {
944 if(UCOL_ISJAMO(el.cPoints[i])) {
945 t->image->jamoSpecial = TRUE;
946 }
947 }
948 if (!src->buildCCTabFlag && el.cSize > 0) {
949 // Check the trailing canonical combining class (tccc) of the last character.
950 const UChar *s = el.cPoints + el.cSize;
951 uint16_t fcd = nfcImpl->previousFCD16(el.cPoints, s);
952 if ((fcd & 0xff) != 0) {
953 src->buildCCTabFlag = TRUE;
954 }
955 }
956 }
957
958 /* and then, add it */
959 #if UCOL_DEBUG==2
960 fprintf(stderr, "Adding: %04X with %08X\n", el.cPoints[0], el.CEs[0]);
961 #endif
962 uprv_uca_addAnElement(t, &el, status);
963
964 #if UCOL_DEBUG_DUPLICATES
965 if(*status != U_ZERO_ERROR) {
966 fprintf(stderr, "replaced CE for %04X with CE for %04X\n", el.cPoints[0], tok->debugSource);
967 *status = U_ZERO_ERROR;
968 }
969 #endif
970
971 tok = tok->next;
972 }
973 }
974
975 U_CDECL_BEGIN
976 static UBool U_CALLCONV
_processUCACompleteIgnorables(const void * context,UChar32 start,UChar32 limit,uint32_t value)977 _processUCACompleteIgnorables(const void *context, UChar32 start, UChar32 limit, uint32_t value) {
978 UErrorCode status = U_ZERO_ERROR;
979 tempUCATable *t = (tempUCATable *)context;
980 if(value == 0) {
981 while(start < limit) {
982 uint32_t CE = utrie_get32(t->mapping, start, NULL);
983 if(CE == UCOL_NOT_FOUND) {
984 UCAElements el;
985 el.isThai = FALSE;
986 el.prefixSize = 0;
987 el.prefixChars[0] = 0;
988 el.prefix = el.prefixChars;
989 el.cPoints = el.uchars;
990
991 el.cSize = 0;
992 U16_APPEND_UNSAFE(el.uchars, el.cSize, start);
993
994 el.noOfCEs = 1;
995 el.CEs[0] = 0;
996 uprv_uca_addAnElement(t, &el, &status);
997
998 }
999 start++;
1000 }
1001 }
1002 if(U_FAILURE(status)) {
1003 return FALSE;
1004 } else {
1005 return TRUE;
1006 }
1007 }
1008 U_CDECL_END
1009
1010 static void
ucol_uprv_bld_copyRangeFromUCA(UColTokenParser * src,tempUCATable * t,UChar32 start,UChar32 end,UErrorCode * status)1011 ucol_uprv_bld_copyRangeFromUCA(UColTokenParser *src, tempUCATable *t,
1012 UChar32 start, UChar32 end,
1013 UErrorCode *status)
1014 {
1015 //UChar decomp[256];
1016 uint32_t CE = UCOL_NOT_FOUND;
1017 UChar32 u = 0;
1018 UCAElements el;
1019 el.isThai = FALSE;
1020 el.prefixSize = 0;
1021 el.prefixChars[0] = 0;
1022 collIterate colIt;
1023
1024 if(U_SUCCESS(*status)) {
1025 for(u = start; u<=end; u++) {
1026 if((CE = utrie_get32(t->mapping, u, NULL)) == UCOL_NOT_FOUND
1027 /* this test is for contractions that are missing the starting element. */
1028 || ((isCntTableElement(CE)) &&
1029 (uprv_cnttab_getCE(t->contractions, CE, 0, status) == UCOL_NOT_FOUND))
1030 )
1031 {
1032 el.cSize = 0;
1033 U16_APPEND_UNSAFE(el.uchars, el.cSize, u);
1034 //decomp[0] = (UChar)u;
1035 //el.uchars[0] = (UChar)u;
1036 el.cPoints = el.uchars;
1037 //el.cSize = 1;
1038 el.noOfCEs = 0;
1039 el.prefix = el.prefixChars;
1040 el.prefixSize = 0;
1041 //uprv_init_collIterate(src->UCA, decomp, 1, &colIt);
1042 // We actually want to check whether this element is a special
1043 // If it is an implicit element (hangul, CJK - we want to copy the
1044 // special, not the resolved CEs) - for hangul, copying resolved
1045 // would just make things the same (there is an expansion and it
1046 // takes approximately the same amount of time to resolve as
1047 // falling back to the UCA).
1048 /*
1049 UTRIE_GET32(src->UCA->mapping, u, CE);
1050 tag = getCETag(CE);
1051 if(tag == HANGUL_SYLLABLE_TAG || tag == CJK_IMPLICIT_TAG
1052 || tag == IMPLICIT_TAG || tag == TRAIL_SURROGATE_TAG
1053 || tag == LEAD_SURROGATE_TAG) {
1054 el.CEs[el.noOfCEs++] = CE;
1055 } else {
1056 */
1057 // It turns out that it does not make sense to keep implicits
1058 // unresolved. The cost of resolving them is big enough so that
1059 // it doesn't make any difference whether we have to go to the UCA
1060 // or not.
1061 {
1062 uprv_init_collIterate(src->UCA, el.uchars, el.cSize, &colIt, status);
1063 while(CE != UCOL_NO_MORE_CES) {
1064 CE = ucol_getNextCE(src->UCA, &colIt, status);
1065 if(CE != UCOL_NO_MORE_CES) {
1066 el.CEs[el.noOfCEs++] = CE;
1067 }
1068 }
1069 }
1070 uprv_uca_addAnElement(t, &el, status);
1071 }
1072 }
1073 }
1074 }
1075
1076 U_NAMESPACE_END
1077
1078 U_CFUNC UCATableHeader *
ucol_assembleTailoringTable(UColTokenParser * src,UErrorCode * status)1079 ucol_assembleTailoringTable(UColTokenParser *src, UErrorCode *status) {
1080 U_NAMESPACE_USE
1081
1082 uint32_t i = 0;
1083 if(U_FAILURE(*status)) {
1084 return NULL;
1085 }
1086 /*
1087 2. Eliminate the negative lists by doing the following for each non-null negative list:
1088 o if previousCE(baseCE, strongestN) != some ListHeader X's baseCE,
1089 create new ListHeader X
1090 o reverse the list, add to the end of X's positive list. Reset the strength of the
1091 first item you add, based on the stronger strength levels of the two lists.
1092 */
1093 /*
1094 3. For each ListHeader with a non-null positive list:
1095 */
1096 /*
1097 o Find all character strings with CEs between the baseCE and the
1098 next/previous CE, at the strength of the first token. Add these to the
1099 tailoring.
1100 ? That is, if UCA has ... x <<< X << x' <<< X' < y ..., and the
1101 tailoring has & x < z...
1102 ? Then we change the tailoring to & x <<< X << x' <<< X' < z ...
1103 */
1104 /* It is possible that this part should be done even while constructing list */
1105 /* The problem is that it is unknown what is going to be the strongest weight */
1106 /* So we might as well do it here */
1107
1108 /*
1109 o Allocate CEs for each token in the list, based on the total number N of the
1110 largest level difference, and the gap G between baseCE and nextCE at that
1111 level. The relation * between the last item and nextCE is the same as the
1112 strongest strength.
1113 o Example: baseCE < a << b <<< q << c < d < e * nextCE(X,1)
1114 ? There are 3 primary items: a, d, e. Fit them into the primary gap.
1115 Then fit b and c into the secondary gap between a and d, then fit q
1116 into the tertiary gap between b and c.
1117
1118 o Example: baseCE << b <<< q << c * nextCE(X,2)
1119 ? There are 2 secondary items: b, c. Fit them into the secondary gap.
1120 Then fit q into the tertiary gap between b and c.
1121 o When incrementing primary values, we will not cross high byte
1122 boundaries except where there is only a single-byte primary. That is to
1123 ensure that the script reordering will continue to work.
1124 */
1125 UCATableHeader *image = (UCATableHeader *)uprv_malloc(sizeof(UCATableHeader));
1126 /* test for NULL */
1127 if (image == NULL) {
1128 *status = U_MEMORY_ALLOCATION_ERROR;
1129 return NULL;
1130 }
1131 uprv_memcpy(image, src->UCA->image, sizeof(UCATableHeader));
1132
1133 for(i = 0; i<src->resultLen; i++) {
1134 /* now we need to generate the CEs */
1135 /* We stuff the initial value in the buffers, and increase the appropriate buffer */
1136 /* According to strength */
1137 if(U_SUCCESS(*status)) {
1138 if(src->lh[i].first) { // if there are any elements
1139 // due to the way parser works, subsequent tailorings
1140 // may remove all the elements from a sequence, therefore
1141 // leaving an empty tailoring sequence.
1142 ucol_initBuffers(src, &src->lh[i], status);
1143 }
1144 }
1145 if(U_FAILURE(*status)) {
1146 uprv_free(image);
1147 return NULL;
1148 }
1149 }
1150
1151 if(src->varTop != NULL) { /* stuff the variable top value */
1152 src->opts->variableTopValue = (*(src->varTop->CEs))>>16;
1153 /* remove it from the list */
1154 if(src->varTop->listHeader->first == src->varTop) { /* first in list */
1155 src->varTop->listHeader->first = src->varTop->next;
1156 }
1157 if(src->varTop->listHeader->last == src->varTop) { /* first in list */
1158 src->varTop->listHeader->last = src->varTop->previous;
1159 }
1160 if(src->varTop->next != NULL) {
1161 src->varTop->next->previous = src->varTop->previous;
1162 }
1163 if(src->varTop->previous != NULL) {
1164 src->varTop->previous->next = src->varTop->next;
1165 }
1166 }
1167
1168
1169 tempUCATable *t = uprv_uca_initTempTable(image, src->opts, src->UCA, NOT_FOUND_TAG, NOT_FOUND_TAG, status);
1170 if(U_FAILURE(*status)) {
1171 uprv_free(image);
1172 return NULL;
1173 }
1174
1175
1176 /* After this, we have assigned CE values to all regular CEs */
1177 /* now we will go through list once more and resolve expansions, */
1178 /* make UCAElements structs and add them to table */
1179 for(i = 0; i<src->resultLen; i++) {
1180 /* now we need to generate the CEs */
1181 /* We stuff the initial value in the buffers, and increase the appropriate buffer */
1182 /* According to strength */
1183 if(U_SUCCESS(*status)) {
1184 ucol_createElements(src, t, &src->lh[i], status);
1185 }
1186 }
1187
1188 UCAElements el;
1189 el.isThai = FALSE;
1190 el.prefixSize = 0;
1191 el.prefixChars[0] = 0;
1192
1193 /* add latin-1 stuff */
1194 ucol_uprv_bld_copyRangeFromUCA(src, t, 0, 0xFF, status);
1195
1196 /* add stuff for copying */
1197 if(src->copySet != NULL) {
1198 int32_t i = 0;
1199 UnicodeSet *set = (UnicodeSet *)src->copySet;
1200 for(i = 0; i < set->getRangeCount(); i++) {
1201 ucol_uprv_bld_copyRangeFromUCA(src, t, set->getRangeStart(i), set->getRangeEnd(i), status);
1202 }
1203 }
1204
1205 if(U_SUCCESS(*status)) {
1206 /* copy contractions from the UCA - this is felt mostly for cyrillic*/
1207
1208 uint32_t tailoredCE = UCOL_NOT_FOUND;
1209 UChar *conts = (UChar *)((uint8_t *)src->UCA->image + src->UCA->image->contractionUCACombos);
1210 int32_t maxUCAContractionLength = src->UCA->image->contractionUCACombosWidth;
1211 UCollationElements *ucaEl = ucol_openElements(src->UCA, NULL, 0, status);
1212 // Check for null pointer
1213 if (ucaEl == NULL) {
1214 *status = U_MEMORY_ALLOCATION_ERROR;
1215 return NULL;
1216 }
1217 while(*conts != 0) {
1218 // A continuation is NUL-terminated and NUL-padded
1219 // except if it has the maximum length.
1220 int32_t contractionLength = maxUCAContractionLength;
1221 while(contractionLength > 0 && conts[contractionLength - 1] == 0) {
1222 --contractionLength;
1223 }
1224 UChar32 first;
1225 int32_t firstLength = 0;
1226 U16_NEXT(conts, firstLength, contractionLength, first);
1227 tailoredCE = utrie_get32(t->mapping, first, NULL);
1228 if(tailoredCE != UCOL_NOT_FOUND) {
1229 UBool needToAdd = TRUE;
1230 if(isCntTableElement(tailoredCE)) {
1231 if(uprv_cnttab_isTailored(t->contractions, tailoredCE, conts+firstLength, status) == TRUE) {
1232 needToAdd = FALSE;
1233 }
1234 }
1235 if (!needToAdd && isPrefix(tailoredCE) && *(conts+1)==0) {
1236 UCAElements elm;
1237 elm.cPoints = el.uchars;
1238 elm.noOfCEs = 0;
1239 elm.uchars[0] = *conts;
1240 elm.uchars[1] = 0;
1241 elm.cSize = 1;
1242 elm.prefixChars[0] = *(conts+2);
1243 elm.isThai = FALSE;
1244 elm.prefix = elm.prefixChars;
1245 elm.prefixSize = 1;
1246 UCAElements *prefixEnt=(UCAElements *)uhash_get(t->prefixLookup, &elm);
1247 if ((prefixEnt==NULL) || *(prefixEnt->prefix)!=*(conts+2)) {
1248 needToAdd = TRUE;
1249 }
1250 }
1251 if(src->removeSet != NULL && uset_contains(src->removeSet, first)) {
1252 needToAdd = FALSE;
1253 }
1254
1255 if(needToAdd == TRUE) { // we need to add if this contraction is not tailored.
1256 if (*(conts+1) != 0) { // contractions
1257 el.prefix = el.prefixChars;
1258 el.prefixSize = 0;
1259 el.cPoints = el.uchars;
1260 el.noOfCEs = 0;
1261 u_memcpy(el.uchars, conts, contractionLength);
1262 el.cSize = contractionLength;
1263 ucol_setText(ucaEl, el.uchars, el.cSize, status);
1264 }
1265 else { // pre-context character
1266 UChar str[4] = { 0 };
1267 int32_t len=0;
1268 int32_t preKeyLen=0;
1269
1270 el.cPoints = el.uchars;
1271 el.noOfCEs = 0;
1272 el.uchars[0] = *conts;
1273 el.uchars[1] = 0;
1274 el.cSize = 1;
1275 el.prefixChars[0] = *(conts+2);
1276 el.prefix = el.prefixChars;
1277 el.prefixSize = 1;
1278 if (el.prefixChars[0]!=0) {
1279 // get CE of prefix character first
1280 str[0]=el.prefixChars[0];
1281 str[1]=0;
1282 ucol_setText(ucaEl, str, 1, status);
1283 while ((int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status))
1284 != UCOL_NULLORDER) {
1285 preKeyLen++; // count number of keys for prefix character
1286 }
1287 str[len++] = el.prefixChars[0];
1288 }
1289
1290 str[len++] = el.uchars[0];
1291 str[len]=0;
1292 ucol_setText(ucaEl, str, len, status);
1293 // Skip the keys for prefix character, then copy the rest to el.
1294 while ((preKeyLen-->0) &&
1295 (int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status)) != UCOL_NULLORDER) {
1296 continue;
1297 }
1298
1299 }
1300 while ((int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status)) != UCOL_NULLORDER) {
1301 el.noOfCEs++;
1302 }
1303 uprv_uca_addAnElement(t, &el, status);
1304 }
1305
1306 } else if(src->removeSet != NULL && uset_contains(src->removeSet, first)) {
1307 ucol_uprv_bld_copyRangeFromUCA(src, t, first, first, status);
1308 }
1309 conts+=maxUCAContractionLength;
1310 }
1311 ucol_closeElements(ucaEl);
1312 }
1313
1314 // Add completely ignorable elements
1315 utrie_enum(&t->UCA->mapping, NULL, _processUCACompleteIgnorables, t);
1316
1317 // add tailoring characters related canonical closures
1318 uprv_uca_canonicalClosure(t, src, NULL, status);
1319
1320 /* still need to produce compatibility closure */
1321
1322 UCATableHeader *myData = uprv_uca_assembleTable(t, status);
1323
1324 uprv_uca_closeTempTable(t);
1325 uprv_free(image);
1326
1327 return myData;
1328 }
1329
1330 U_CDECL_BEGIN
1331 static UBool U_CALLCONV
ucol_bld_cleanup(void)1332 ucol_bld_cleanup(void)
1333 {
1334 udata_close(invUCA_DATA_MEM);
1335 invUCA_DATA_MEM = NULL;
1336 _staticInvUCA = NULL;
1337 return TRUE;
1338 }
1339 U_CDECL_END
1340
1341 U_CAPI const InverseUCATableHeader * U_EXPORT2
ucol_initInverseUCA(UErrorCode * status)1342 ucol_initInverseUCA(UErrorCode *status)
1343 {
1344 if(U_FAILURE(*status)) return NULL;
1345
1346 UBool needsInit;
1347 UMTX_CHECK(NULL, (_staticInvUCA == NULL), needsInit);
1348
1349 if(needsInit) {
1350 InverseUCATableHeader *newInvUCA = NULL;
1351 UDataMemory *result = udata_openChoice(U_ICUDATA_COLL, INVC_DATA_TYPE, INVC_DATA_NAME, isAcceptableInvUCA, NULL, status);
1352
1353 if(U_FAILURE(*status)) {
1354 if (result) {
1355 udata_close(result);
1356 }
1357 // This is not needed, as we are talking about
1358 // memory we got from UData
1359 //uprv_free(newInvUCA);
1360 }
1361
1362 if(result != NULL) { /* It looks like sometimes we can fail to find the data file */
1363 newInvUCA = (InverseUCATableHeader *)udata_getMemory(result);
1364 UCollator *UCA = ucol_initUCA(status);
1365 // UCA versions of UCA and inverse UCA should match
1366 if(uprv_memcmp(newInvUCA->UCAVersion, UCA->image->UCAVersion, sizeof(UVersionInfo)) != 0) {
1367 *status = U_INVALID_FORMAT_ERROR;
1368 udata_close(result);
1369 return NULL;
1370 }
1371
1372 umtx_lock(NULL);
1373 if(_staticInvUCA == NULL) {
1374 invUCA_DATA_MEM = result;
1375 _staticInvUCA = newInvUCA;
1376 result = NULL;
1377 newInvUCA = NULL;
1378 }
1379 umtx_unlock(NULL);
1380
1381 if(newInvUCA != NULL) {
1382 udata_close(result);
1383 // This is not needed, as we are talking about
1384 // memory we got from UData
1385 //uprv_free(newInvUCA);
1386 }
1387 else {
1388 ucln_i18n_registerCleanup(UCLN_I18N_UCOL_BLD, ucol_bld_cleanup);
1389 }
1390 }
1391 }
1392 return _staticInvUCA;
1393 }
1394
/*
 * Names of the non-script reordering codes, as matched (after upper-casing)
 * by ucol_findReorderingEntry(). These _must_ be kept in the order in which
 * they are to be applied as defaults and in sync with the UColReorderCode
 * enum: the entry at index i is reported as UCOL_REORDER_CODE_FIRST + i.
 */
static const char * const ReorderingTokenNames[] = {
    "SPACE",
    "PUNCT",
    "SYMBOL",
    "CURRENCY",
    "DIGIT"
};
1405
toUpper(const char * src,char * dst,uint32_t length)1406 static void toUpper(const char* src, char* dst, uint32_t length) {
1407 for (uint32_t i = 0; *src != '\0' && i < length - 1; ++src, ++dst, ++i) {
1408 *dst = uprv_toupper(*src);
1409 }
1410 *dst = '\0';
1411 }
1412
1413 U_INTERNAL int32_t U_EXPORT2
ucol_findReorderingEntry(const char * name)1414 ucol_findReorderingEntry(const char* name) {
1415 char buffer[32];
1416 toUpper(name, buffer, 32);
1417 for (uint32_t entry = 0; entry < LENGTHOF(ReorderingTokenNames); entry++) {
1418 if (uprv_strcmp(buffer, ReorderingTokenNames[entry]) == 0) {
1419 return entry + UCOL_REORDER_CODE_FIRST;
1420 }
1421 }
1422 return USCRIPT_INVALID_CODE;
1423 }
1424
1425 #endif /* #if !UCONFIG_NO_COLLATION */
1426