1 /*
2 *******************************************************************************
3 *
4 * Copyright (C) 2001-2010, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *
7 *******************************************************************************
8 * file name: ucol_bld.cpp
9 * encoding: US-ASCII
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created 02/22/2001
14 * created by: Vladimir Weinstein
15 *
16 * This module builds a collator based on the rule set.
17 *
18 */
19
20 #include "unicode/utypes.h"
21
22 #if !UCONFIG_NO_COLLATION
23
24 #include "unicode/ucoleitr.h"
25 #include "unicode/udata.h"
26 #include "unicode/uchar.h"
27 #include "unicode/uniset.h"
28 #include "unicode/uscript.h"
29 #include "unicode/ustring.h"
30 #include "normalizer2impl.h"
31 #include "ucol_bld.h"
32 #include "ucol_elm.h"
33 #include "ucol_cnt.h"
34 #include "ucln_in.h"
35 #include "umutex.h"
36 #include "cmemory.h"
37 #include "cstring.h"
38
39 U_NAMESPACE_BEGIN
40
/* File-scope pointer to an inverse UCA table header. It starts out NULL and is
 * not assigned anywhere in the part of the file reviewed here. */
static const InverseUCATableHeader* _staticInvUCA = NULL;
/* File-scope UDataMemory handle, also initially NULL.
 * NOTE(review): presumably the data item backing _staticInvUCA - confirm against the loader. */
static UDataMemory* invUCA_DATA_MEM = NULL;
43
44 U_CDECL_BEGIN
45 static UBool U_CALLCONV
isAcceptableInvUCA(void *,const char *,const char *,const UDataInfo * pInfo)46 isAcceptableInvUCA(void * /*context*/,
47 const char * /*type*/, const char * /*name*/,
48 const UDataInfo *pInfo)
49 {
50 /* context, type & name are intentionally not used */
51 if( pInfo->size>=20 &&
52 pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
53 pInfo->charsetFamily==U_CHARSET_FAMILY &&
54 pInfo->dataFormat[0]==INVUCA_DATA_FORMAT_0 && /* dataFormat="InvC" */
55 pInfo->dataFormat[1]==INVUCA_DATA_FORMAT_1 &&
56 pInfo->dataFormat[2]==INVUCA_DATA_FORMAT_2 &&
57 pInfo->dataFormat[3]==INVUCA_DATA_FORMAT_3 &&
58 pInfo->formatVersion[0]==INVUCA_FORMAT_VERSION_0 &&
59 pInfo->formatVersion[1]>=INVUCA_FORMAT_VERSION_1 //&&
60 //pInfo->formatVersion[1]==INVUCA_FORMAT_VERSION_1 &&
61 //pInfo->formatVersion[2]==INVUCA_FORMAT_VERSION_2 &&
62 //pInfo->formatVersion[3]==INVUCA_FORMAT_VERSION_3 &&
63 )
64 {
65 UVersionInfo UCDVersion;
66 u_getUnicodeVersion(UCDVersion);
67 return (pInfo->dataVersion[0]==UCDVersion[0] &&
68 pInfo->dataVersion[1]==UCDVersion[1]);
69 //pInfo->dataVersion[1]==invUcaDataInfo.dataVersion[1] &&
70 //pInfo->dataVersion[2]==invUcaDataInfo.dataVersion[2] &&
71 //pInfo->dataVersion[3]==invUcaDataInfo.dataVersion[3]) {
72 } else {
73 return FALSE;
74 }
75 }
76 U_CDECL_END
77
78 /*
79 * Takes two CEs (lead and continuation) and
80 * compares them as CEs should be compared:
81 * primary vs. primary, secondary vs. secondary
82 * tertiary vs. tertiary
83 */
compareCEs(uint32_t source0,uint32_t source1,uint32_t target0,uint32_t target1)84 static int32_t compareCEs(uint32_t source0, uint32_t source1, uint32_t target0, uint32_t target1) {
85 uint32_t s1 = source0, s2, t1 = target0, t2;
86 if(isContinuation(source1)) {
87 s2 = source1;
88 } else {
89 s2 = 0;
90 }
91 if(isContinuation(target1)) {
92 t2 = target1;
93 } else {
94 t2 = 0;
95 }
96
97 uint32_t s = 0, t = 0;
98 if(s1 == t1 && s2 == t2) {
99 return 0;
100 }
101 s = (s1 & 0xFFFF0000)|((s2 & 0xFFFF0000)>>16);
102 t = (t1 & 0xFFFF0000)|((t2 & 0xFFFF0000)>>16);
103 if(s < t) {
104 return -1;
105 } else if(s > t) {
106 return 1;
107 } else {
108 s = (s1 & 0x0000FF00) | (s2 & 0x0000FF00)>>8;
109 t = (t1 & 0x0000FF00) | (t2 & 0x0000FF00)>>8;
110 if(s < t) {
111 return -1;
112 } else if(s > t) {
113 return 1;
114 } else {
115 s = (s1 & 0x000000FF)<<8 | (s2 & 0x000000FF);
116 t = (t1 & 0x000000FF)<<8 | (t2 & 0x000000FF);
117 if(s < t) {
118 return -1;
119 } else {
120 return 1;
121 }
122 }
123 }
124 }
125
126 static
ucol_inv_findCE(const UColTokenParser * src,uint32_t CE,uint32_t SecondCE)127 int32_t ucol_inv_findCE(const UColTokenParser *src, uint32_t CE, uint32_t SecondCE) {
128 uint32_t bottom = 0, top = src->invUCA->tableSize;
129 uint32_t i = 0;
130 uint32_t first = 0, second = 0;
131 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
132 int32_t res = 0;
133
134 while(bottom < top-1) {
135 i = (top+bottom)/2;
136 first = *(CETable+3*i);
137 second = *(CETable+3*i+1);
138 res = compareCEs(first, second, CE, SecondCE);
139 if(res > 0) {
140 top = i;
141 } else if(res < 0) {
142 bottom = i;
143 } else {
144 break;
145 }
146 }
147
148 /* weiv: */
149 /* in searching for elements, I have removed the failure */
150 /* The reason for this is that the builder does not rely */
151 /* on search mechanism telling it that it didn't find an */
152 /* element. However, indirect positioning relies on being */
153 /* able to find the elements around any CE, even if it is */
154 /* not defined in the UCA. */
155 return i;
156 /*
157 if((first == CE && second == SecondCE)) {
158 return i;
159 } else {
160 return -1;
161 }
162 */
163 }
164
/*
 * CE bit masks indexed by strength. Each entry keeps the bits of the previous
 * one and adds the next lower group. Code in this file indexes the table with
 * UCOL_PRIMARY, UCOL_SECONDARY and caller-supplied strength values.
 * NOTE(review): presumably the entries correspond to primary, secondary and
 * tertiary in that order - confirm against the UCOL_* enum values.
 */
static const uint32_t strengthMask[UCOL_CE_STRENGTH_LIMIT] = {
    0xFFFF0000,  /* upper 16 bits only */
    0xFFFFFF00,  /* adds bits 8..15 */
    0xFFFFFFFF   /* the whole 32-bit word */
};
170
ucol_inv_getNextCE(const UColTokenParser * src,uint32_t CE,uint32_t contCE,uint32_t * nextCE,uint32_t * nextContCE,uint32_t strength)171 U_CAPI int32_t U_EXPORT2 ucol_inv_getNextCE(const UColTokenParser *src,
172 uint32_t CE, uint32_t contCE,
173 uint32_t *nextCE, uint32_t *nextContCE,
174 uint32_t strength)
175 {
176 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
177 int32_t iCE;
178
179 iCE = ucol_inv_findCE(src, CE, contCE);
180
181 if(iCE<0) {
182 *nextCE = UCOL_NOT_FOUND;
183 return -1;
184 }
185
186 CE &= strengthMask[strength];
187 contCE &= strengthMask[strength];
188
189 *nextCE = CE;
190 *nextContCE = contCE;
191
192 while((*nextCE & strengthMask[strength]) == CE
193 && (*nextContCE & strengthMask[strength]) == contCE)
194 {
195 *nextCE = (*(CETable+3*(++iCE)));
196 *nextContCE = (*(CETable+3*(iCE)+1));
197 }
198
199 return iCE;
200 }
201
ucol_inv_getPrevCE(const UColTokenParser * src,uint32_t CE,uint32_t contCE,uint32_t * prevCE,uint32_t * prevContCE,uint32_t strength)202 U_CFUNC int32_t U_EXPORT2 ucol_inv_getPrevCE(const UColTokenParser *src,
203 uint32_t CE, uint32_t contCE,
204 uint32_t *prevCE, uint32_t *prevContCE,
205 uint32_t strength)
206 {
207 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
208 int32_t iCE;
209
210 iCE = ucol_inv_findCE(src, CE, contCE);
211
212 if(iCE<0) {
213 *prevCE = UCOL_NOT_FOUND;
214 return -1;
215 }
216
217 CE &= strengthMask[strength];
218 contCE &= strengthMask[strength];
219
220 *prevCE = CE;
221 *prevContCE = contCE;
222
223 while((*prevCE & strengthMask[strength]) == CE
224 && (*prevContCE & strengthMask[strength])== contCE
225 && iCE > 0) /* this condition should prevent falling off the edge of the world */
226 {
227 /* here, we end up in a singularity - zero */
228 *prevCE = (*(CETable+3*(--iCE)));
229 *prevContCE = (*(CETable+3*(iCE)+1));
230 }
231
232 return iCE;
233 }
234
ucol_getCEStrengthDifference(uint32_t CE,uint32_t contCE,uint32_t prevCE,uint32_t prevContCE)235 U_CFUNC uint32_t U_EXPORT2 ucol_getCEStrengthDifference(uint32_t CE, uint32_t contCE,
236 uint32_t prevCE, uint32_t prevContCE)
237 {
238 if(prevCE == CE && prevContCE == contCE) {
239 return UCOL_IDENTICAL;
240 }
241 if((prevCE & strengthMask[UCOL_PRIMARY]) != (CE & strengthMask[UCOL_PRIMARY])
242 || (prevContCE & strengthMask[UCOL_PRIMARY]) != (contCE & strengthMask[UCOL_PRIMARY]))
243 {
244 return UCOL_PRIMARY;
245 }
246 if((prevCE & strengthMask[UCOL_SECONDARY]) != (CE & strengthMask[UCOL_SECONDARY])
247 || (prevContCE & strengthMask[UCOL_SECONDARY]) != (contCE & strengthMask[UCOL_SECONDARY]))
248 {
249 return UCOL_SECONDARY;
250 }
251 return UCOL_TERTIARY;
252 }
253
254
255 /*static
256 inline int32_t ucol_inv_getPrevious(UColTokenParser *src, UColTokListHeader *lh, uint32_t strength) {
257
258 uint32_t CE = lh->baseCE;
259 uint32_t SecondCE = lh->baseContCE;
260
261 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
262 uint32_t previousCE, previousContCE;
263 int32_t iCE;
264
265 iCE = ucol_inv_findCE(src, CE, SecondCE);
266
267 if(iCE<0) {
268 return -1;
269 }
270
271 CE &= strengthMask[strength];
272 SecondCE &= strengthMask[strength];
273
274 previousCE = CE;
275 previousContCE = SecondCE;
276
277 while((previousCE & strengthMask[strength]) == CE && (previousContCE & strengthMask[strength])== SecondCE) {
278 previousCE = (*(CETable+3*(--iCE)));
279 previousContCE = (*(CETable+3*(iCE)+1));
280 }
281 lh->previousCE = previousCE;
282 lh->previousContCE = previousContCE;
283
284 return iCE;
285 }*/
286
287 static
ucol_inv_getNext(UColTokenParser * src,UColTokListHeader * lh,uint32_t strength)288 inline int32_t ucol_inv_getNext(UColTokenParser *src, UColTokListHeader *lh, uint32_t strength) {
289 uint32_t CE = lh->baseCE;
290 uint32_t SecondCE = lh->baseContCE;
291
292 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
293 uint32_t nextCE, nextContCE;
294 int32_t iCE;
295
296 iCE = ucol_inv_findCE(src, CE, SecondCE);
297
298 if(iCE<0) {
299 return -1;
300 }
301
302 CE &= strengthMask[strength];
303 SecondCE &= strengthMask[strength];
304
305 nextCE = CE;
306 nextContCE = SecondCE;
307
308 while((nextCE & strengthMask[strength]) == CE
309 && (nextContCE & strengthMask[strength]) == SecondCE)
310 {
311 nextCE = (*(CETable+3*(++iCE)));
312 nextContCE = (*(CETable+3*(iCE)+1));
313 }
314
315 lh->nextCE = nextCE;
316 lh->nextContCE = nextContCE;
317
318 return iCE;
319 }
320
/*
 * Computes the weight gaps for one token list.
 *
 * First clears lh->gapsHi, lh->gapsLo, lh->numStr, lh->fStrToken,
 * lh->lStrToken and sets every lh->pos[] to -1. Then fills gapsLo/gapsHi in
 * one of three ways:
 *   1. the top byte of lh->baseCE lies between UCA_PRIMARY_IMPLICIT_MIN and
 *      UCA_PRIMARY_IMPLICIT_MAX: the upper bound is derived from the next
 *      implicit primary;
 *   2. lh->indirect is TRUE and lh->nextCE is non-zero: the upper bound is
 *      taken from lh->nextCE / lh->nextContCE;
 *   3. otherwise: the upper bounds are looked up in the inverse UCA table via
 *      ucol_inv_getNext(), once per strength that occurs in the token list.
 *
 * Each gap is stored as three words (primary, secondary, tertiary), so the
 * values for slot st live at index 3*st .. 3*st+2.
 *
 * Sets *status to U_INTERNAL_PROGRAM_ERROR when ucol_inv_getNext() returns a
 * negative position.
 */
static void ucol_inv_getGapPositions(UColTokenParser *src, UColTokListHeader *lh, UErrorCode *status) {
    /* reset all the gaps */
    int32_t i = 0;
    uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
    uint32_t st = 0;
    uint32_t t1, t2;
    int32_t pos;

    UColToken *tok = lh->first;
    uint32_t tokStrength = tok->strength;

    for(i = 0; i<3; i++) {
        lh->gapsHi[3*i] = 0;
        lh->gapsHi[3*i+1] = 0;
        lh->gapsHi[3*i+2] = 0;
        lh->gapsLo[3*i] = 0;
        lh->gapsLo[3*i+1] = 0;
        lh->gapsLo[3*i+2] = 0;
        lh->numStr[i] = 0;
        lh->fStrToken[i] = NULL;
        lh->lStrToken[i] = NULL;
        lh->pos[i] = -1;
    }

    /* The UCA constants are located at an offset inside the UCA image. */
    UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);

    if((lh->baseCE & 0xFF000000)>= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && (lh->baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */
        //if(lh->baseCE >= PRIMARY_IMPLICIT_MIN && lh->baseCE < PRIMARY_IMPLICIT_MAX ) { /* implicits - */
        lh->pos[0] = 0;
        /* lower bound: the base CE itself, split into its three levels */
        t1 = lh->baseCE;
        t2 = lh->baseContCE & UCOL_REMOVE_CONTINUATION;
        lh->gapsLo[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
        lh->gapsLo[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
        lh->gapsLo[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
        /* upper bound: convert the implicit primary to its raw value, add one, convert back */
        uint32_t primaryCE = (t1 & UCOL_PRIMARYMASK) | ((t2 & UCOL_PRIMARYMASK) >> 16);
        primaryCE = uprv_uca_getImplicitFromRaw(uprv_uca_getRawFromImplicit(primaryCE)+1);

        /* rebuild a lead/continuation pair from the new primary; the lead word gets 0x0505 in its low 16 bits */
        t1 = (primaryCE & UCOL_PRIMARYMASK) | 0x0505;
        t2 = (primaryCE << 16) & UCOL_PRIMARYMASK; // | UCOL_CONTINUATION_MARKER;

        lh->gapsHi[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
        lh->gapsHi[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
        lh->gapsHi[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
    } else if(lh->indirect == TRUE && lh->nextCE != 0) {
        //} else if(lh->baseCE == UCOL_RESET_TOP_VALUE && lh->baseContCE == 0) {
        lh->pos[0] = 0;
        /* lower bound from the base CE, upper bound from the next CE already stored in lh */
        t1 = lh->baseCE;
        t2 = lh->baseContCE&UCOL_REMOVE_CONTINUATION;
        lh->gapsLo[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
        lh->gapsLo[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
        lh->gapsLo[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
        t1 = lh->nextCE;
        t2 = lh->nextContCE&UCOL_REMOVE_CONTINUATION;
        lh->gapsHi[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
        lh->gapsHi[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
        lh->gapsHi[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
    } else {
        /* Walk the token list one run at a time; tokStrength is the strength of the token that opens the run. */
        for(;;) {
            if(tokStrength < UCOL_CE_STRENGTH_LIMIT) {
                if((lh->pos[tokStrength] = ucol_inv_getNext(src, lh, tokStrength)) >= 0) {
                    lh->fStrToken[tokStrength] = tok;
                } else { /* The CE must be implicit, since it's not in the table */
                    /* Error */
                    *status = U_INTERNAL_PROGRAM_ERROR;
                }
            }

            /* skip over every following token whose strength value is >= tokStrength, remembering the last one */
            while(tok != NULL && tok->strength >= tokStrength) {
                if(tokStrength < UCOL_CE_STRENGTH_LIMIT) {
                    lh->lStrToken[tokStrength] = tok;
                }
                tok = tok->next;
            }
            if(tokStrength < UCOL_CE_STRENGTH_LIMIT-1) {
                /* check if previous interval is the same and merge the intervals if it is so */
                if(lh->pos[tokStrength] == lh->pos[tokStrength+1]) {
                    lh->fStrToken[tokStrength] = lh->fStrToken[tokStrength+1];
                    lh->fStrToken[tokStrength+1] = NULL;
                    lh->lStrToken[tokStrength+1] = NULL;
                    lh->pos[tokStrength+1] = -1;
                }
            }
            if(tok != NULL) {
                tokStrength = tok->strength;
            } else {
                break;
            }
        }
        /* For every strength that got a table position, the upper bound comes from that
         * table entry and the lower bound from the base CE. */
        for(st = 0; st < 3; st++) {
            if((pos = lh->pos[st]) >= 0) {
                t1 = *(CETable+3*(pos));
                t2 = *(CETable+3*(pos)+1);
                lh->gapsHi[3*st] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
                lh->gapsHi[3*st+1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
                //lh->gapsHi[3*st+2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
                /* tertiary here keeps only the low six bits (0x3f) of each word */
                lh->gapsHi[3*st+2] = (t1&0x3f) << 24 | (t2&0x3f) << 16;
                //pos--;
                //t1 = *(CETable+3*(pos));
                //t2 = *(CETable+3*(pos)+1);
                t1 = lh->baseCE;
                t2 = lh->baseContCE;
                lh->gapsLo[3*st] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
                lh->gapsLo[3*st+1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
                lh->gapsLo[3*st+2] = (t1&0x3f) << 24 | (t2&0x3f) << 16;
            }
        }
    }
}
429
430
/*
 * Stores in noOfBytes how many bytes of the 32-bit weight `value` are in use,
 * counted from the most significant byte down to the lowest non-zero byte:
 * 0 -> 0, 0x05000000 -> 1, 0x05060000 -> 2, 0x00000700 -> 3, 0x00000001 -> 4.
 *
 * The expansion is wrapped in do { } while(0) so the macro behaves as a single
 * statement (safe as the body of an unbraced if/else). `value` is evaluated
 * exactly once, and the internal names are prefixed so they cannot capture a
 * caller variable.
 */
#define ucol_countBytes(value, noOfBytes)                          \
    do {                                                           \
        const uint32_t ucolCB_value_ = (uint32_t)(value);          \
        uint32_t ucolCB_mask_ = 0xFFFFFFFF;                        \
        (noOfBytes) = 0;                                           \
        while(ucolCB_mask_ != 0) {                                 \
            if((ucolCB_value_ & ucolCB_mask_) != 0) {              \
                (noOfBytes)++;                                     \
            }                                                      \
            ucolCB_mask_ >>= 8;                                    \
        }                                                          \
    } while(0)
442
ucol_getNextGenerated(ucolCEGenerator * g,UErrorCode * status)443 static uint32_t ucol_getNextGenerated(ucolCEGenerator *g, UErrorCode *status) {
444 if(U_SUCCESS(*status)) {
445 g->current = ucol_nextWeight(g->ranges, &g->noOfRanges);
446 }
447 return g->current;
448 }
449
ucol_getSimpleCEGenerator(ucolCEGenerator * g,UColToken * tok,uint32_t strength,UErrorCode * status)450 static uint32_t ucol_getSimpleCEGenerator(ucolCEGenerator *g, UColToken *tok, uint32_t strength, UErrorCode *status) {
451 /* TODO: rename to enum names */
452 uint32_t high, low, count=1;
453 uint32_t maxByte = (strength == UCOL_TERTIARY)?0x3F:0xFF;
454
455 if(strength == UCOL_SECONDARY) {
456 low = UCOL_COMMON_TOP2<<24;
457 high = 0xFFFFFFFF;
458 count = 0xFF - UCOL_COMMON_TOP2;
459 } else {
460 low = UCOL_BYTE_COMMON << 24; //0x05000000;
461 high = 0x40000000;
462 count = 0x40 - UCOL_BYTE_COMMON;
463 }
464
465 if(tok->next != NULL && tok->next->strength == strength) {
466 count = tok->next->toInsert;
467 }
468
469 g->noOfRanges = ucol_allocWeights(low, high, count, maxByte, g->ranges);
470 g->current = UCOL_BYTE_COMMON<<24;
471
472 if(g->noOfRanges == 0) {
473 *status = U_INTERNAL_PROGRAM_ERROR;
474 }
475 return g->current;
476 }
477
ucol_getCEGenerator(ucolCEGenerator * g,uint32_t * lows,uint32_t * highs,UColToken * tok,uint32_t fStrength,UErrorCode * status)478 static uint32_t ucol_getCEGenerator(ucolCEGenerator *g, uint32_t* lows, uint32_t* highs, UColToken *tok, uint32_t fStrength, UErrorCode *status) {
479 uint32_t strength = tok->strength;
480 uint32_t low = lows[fStrength*3+strength];
481 uint32_t high = highs[fStrength*3+strength];
482 uint32_t maxByte = 0;
483 if(strength == UCOL_TERTIARY) {
484 maxByte = 0x3F;
485 } else if(strength == UCOL_PRIMARY) {
486 maxByte = 0xFE;
487 } else {
488 maxByte = 0xFF;
489 }
490
491 uint32_t count = tok->toInsert;
492
493 if(low >= high && strength > UCOL_PRIMARY) {
494 int32_t s = strength;
495 for(;;) {
496 s--;
497 if(lows[fStrength*3+s] != highs[fStrength*3+s]) {
498 if(strength == UCOL_SECONDARY) {
499 if (low < UCOL_COMMON_TOP2<<24 ) {
500 // Override if low range is less than UCOL_COMMON_TOP2.
501 low = UCOL_COMMON_TOP2<<24;
502 }
503 high = 0xFFFFFFFF;
504 } else {
505 // Override if low range is less than UCOL_COMMON_BOT3.
506 if ( low < UCOL_COMMON_BOT3<<24 ) {
507 low = UCOL_COMMON_BOT3<<24;
508 }
509 high = 0x40000000;
510 }
511 break;
512 }
513 if(s<0) {
514 *status = U_INTERNAL_PROGRAM_ERROR;
515 return 0;
516 }
517 }
518 }
519
520 if(low < 0x02000000) {
521 // We must not use CE weight byte 02, so we set it as the minimum lower bound.
522 // See http://site.icu-project.org/design/collation/bytes
523 low = 0x02000000;
524 }
525
526 if(strength == UCOL_SECONDARY) { /* similar as simple */
527 if(low >= (UCOL_COMMON_BOT2<<24) && low < (uint32_t)(UCOL_COMMON_TOP2<<24)) {
528 low = UCOL_COMMON_TOP2<<24;
529 }
530 if(high > (UCOL_COMMON_BOT2<<24) && high < (uint32_t)(UCOL_COMMON_TOP2<<24)) {
531 high = UCOL_COMMON_TOP2<<24;
532 }
533 if(low < (UCOL_COMMON_BOT2<<24)) {
534 g->noOfRanges = ucol_allocWeights(UCOL_BYTE_UNSHIFTED_MIN<<24, high, count, maxByte, g->ranges);
535 g->current = ucol_nextWeight(g->ranges, &g->noOfRanges);
536 //g->current = UCOL_COMMON_BOT2<<24;
537 return g->current;
538 }
539 }
540
541 g->noOfRanges = ucol_allocWeights(low, high, count, maxByte, g->ranges);
542 if(g->noOfRanges == 0) {
543 *status = U_INTERNAL_PROGRAM_ERROR;
544 }
545 g->current = ucol_nextWeight(g->ranges, &g->noOfRanges);
546 return g->current;
547 }
548
549 static
u_toLargeKana(const UChar * source,const uint32_t sourceLen,UChar * resBuf,const uint32_t resLen,UErrorCode * status)550 uint32_t u_toLargeKana(const UChar *source, const uint32_t sourceLen, UChar *resBuf, const uint32_t resLen, UErrorCode *status) {
551 uint32_t i = 0;
552 UChar c;
553
554 if(U_FAILURE(*status)) {
555 return 0;
556 }
557
558 if(sourceLen > resLen) {
559 *status = U_MEMORY_ALLOCATION_ERROR;
560 return 0;
561 }
562
563 for(i = 0; i < sourceLen; i++) {
564 c = source[i];
565 if(0x3041 <= c && c <= 0x30FA) { /* Kana range */
566 switch(c - 0x3000) {
567 case 0x41: case 0x43: case 0x45: case 0x47: case 0x49: case 0x63: case 0x83: case 0x85: case 0x8E:
568 case 0xA1: case 0xA3: case 0xA5: case 0xA7: case 0xA9: case 0xC3: case 0xE3: case 0xE5: case 0xEE:
569 c++;
570 break;
571 case 0xF5:
572 c = 0x30AB;
573 break;
574 case 0xF6:
575 c = 0x30B1;
576 break;
577 }
578 }
579 resBuf[i] = c;
580 }
581 return sourceLen;
582 }
583
584 static
u_toSmallKana(const UChar * source,const uint32_t sourceLen,UChar * resBuf,const uint32_t resLen,UErrorCode * status)585 uint32_t u_toSmallKana(const UChar *source, const uint32_t sourceLen, UChar *resBuf, const uint32_t resLen, UErrorCode *status) {
586 uint32_t i = 0;
587 UChar c;
588
589 if(U_FAILURE(*status)) {
590 return 0;
591 }
592
593 if(sourceLen > resLen) {
594 *status = U_MEMORY_ALLOCATION_ERROR;
595 return 0;
596 }
597
598 for(i = 0; i < sourceLen; i++) {
599 c = source[i];
600 if(0x3041 <= c && c <= 0x30FA) { /* Kana range */
601 switch(c - 0x3000) {
602 case 0x42: case 0x44: case 0x46: case 0x48: case 0x4A: case 0x64: case 0x84: case 0x86: case 0x8F:
603 case 0xA2: case 0xA4: case 0xA6: case 0xA8: case 0xAA: case 0xC4: case 0xE4: case 0xE6: case 0xEF:
604 c--;
605 break;
606 case 0xAB:
607 c = 0x30F5;
608 break;
609 case 0xB1:
610 c = 0x30F6;
611 break;
612 }
613 }
614 resBuf[i] = c;
615 }
616 return sourceLen;
617 }
618
619 static
ucol_uprv_getCaseBits(const UCollator * UCA,const UChar * src,uint32_t len,UErrorCode * status)620 uint8_t ucol_uprv_getCaseBits(const UCollator *UCA, const UChar *src, uint32_t len, UErrorCode *status) {
621 uint32_t i = 0;
622 UChar n[128];
623 uint32_t nLen = 0;
624 uint32_t uCount = 0, lCount = 0;
625
626 collIterate s;
627 uint32_t order = 0;
628
629 if(U_FAILURE(*status)) {
630 return UCOL_LOWER_CASE;
631 }
632
633 nLen = unorm_normalize(src, len, UNORM_NFKD, 0, n, 128, status);
634 if(U_SUCCESS(*status)) {
635 for(i = 0; i < nLen; i++) {
636 uprv_init_collIterate(UCA, &n[i], 1, &s, status);
637 order = ucol_getNextCE(UCA, &s, status);
638 if(isContinuation(order)) {
639 *status = U_INTERNAL_PROGRAM_ERROR;
640 return UCOL_LOWER_CASE;
641 }
642 if((order&UCOL_CASE_BIT_MASK)== UCOL_UPPER_CASE) {
643 uCount++;
644 } else {
645 if(u_islower(n[i])) {
646 lCount++;
647 } else if(U_SUCCESS(*status)) {
648 UChar sk[1], lk[1];
649 u_toSmallKana(&n[i], 1, sk, 1, status);
650 u_toLargeKana(&n[i], 1, lk, 1, status);
651 if(sk[0] == n[i] && lk[0] != n[i]) {
652 lCount++;
653 }
654 }
655 }
656 }
657 }
658
659 if(uCount != 0 && lCount != 0) {
660 return UCOL_MIXED_CASE;
661 } else if(uCount != 0) {
662 return UCOL_UPPER_CASE;
663 } else {
664 return UCOL_LOWER_CASE;
665 }
666 }
667
668
ucol_doCE(UColTokenParser * src,uint32_t * CEparts,UColToken * tok,UErrorCode * status)669 U_CFUNC void ucol_doCE(UColTokenParser *src, uint32_t *CEparts, UColToken *tok, UErrorCode *status) {
670 /* this one makes the table and stuff */
671 uint32_t noOfBytes[3];
672 uint32_t i;
673
674 for(i = 0; i<3; i++) {
675 ucol_countBytes(CEparts[i], noOfBytes[i]);
676 }
677
678 /* Here we have to pack CEs from parts */
679
680 uint32_t CEi = 0;
681 uint32_t value = 0;
682
683 while(2*CEi<noOfBytes[0] || CEi<noOfBytes[1] || CEi<noOfBytes[2]) {
684 if(CEi > 0) {
685 value = UCOL_CONTINUATION_MARKER; /* Continuation marker */
686 } else {
687 value = 0;
688 }
689
690 if(2*CEi<noOfBytes[0]) {
691 value |= ((CEparts[0]>>(32-16*(CEi+1))) & 0xFFFF) << 16;
692 }
693 if(CEi<noOfBytes[1]) {
694 value |= ((CEparts[1]>>(32-8*(CEi+1))) & 0xFF) << 8;
695 }
696 if(CEi<noOfBytes[2]) {
697 value |= ((CEparts[2]>>(32-8*(CEi+1))) & 0x3F);
698 }
699 tok->CEs[CEi] = value;
700 CEi++;
701 }
702 if(CEi == 0) { /* totally ignorable */
703 tok->noOfCEs = 1;
704 tok->CEs[0] = 0;
705 } else { /* there is at least something */
706 tok->noOfCEs = CEi;
707 }
708
709
710 // we want to set case bits here and now, not later.
711 // Case bits handling
712 if(tok->CEs[0] != 0) { // case bits should be set only for non-ignorables
713 tok->CEs[0] &= 0xFFFFFF3F; // Clean the case bits field
714 int32_t cSize = (tok->source & 0xFF000000) >> 24;
715 UChar *cPoints = (tok->source & 0x00FFFFFF) + src->source;
716
717 if(cSize > 1) {
718 // Do it manually
719 tok->CEs[0] |= ucol_uprv_getCaseBits(src->UCA, cPoints, cSize, status);
720 } else {
721 // Copy it from the UCA
722 uint32_t caseCE = ucol_getFirstCE(src->UCA, cPoints[0], status);
723 tok->CEs[0] |= (caseCE & 0xC0);
724 }
725 }
726
727 #if UCOL_DEBUG==2
728 fprintf(stderr, "%04X str: %i, [%08X, %08X, %08X]: tok: ", tok->debugSource, tok->strength, CEparts[0] >> (32-8*noOfBytes[0]), CEparts[1] >> (32-8*noOfBytes[1]), CEparts[2]>> (32-8*noOfBytes[2]));
729 for(i = 0; i<tok->noOfCEs; i++) {
730 fprintf(stderr, "%08X ", tok->CEs[i]);
731 }
732 fprintf(stderr, "\n");
733 #endif
734 }
735
/*
 * Assigns collation elements to every token of one token list.
 *
 * Works in three stages:
 *   1. walks the list backward from lh->last and stores in each token's
 *      toInsert a running per-strength counter;
 *   2. calls ucol_inv_getGapPositions() to compute the weight gaps for lh;
 *   3. walks the list forward from lh->first, keeping one weight generator
 *      per level, and calls ucol_doCE() on each token with the current
 *      primary/secondary/tertiary parts.
 *
 * Stage 3 stops as soon as *status signals a failure. *status is set to
 * U_INTERNAL_PROGRAM_ERROR when no gap position is available at or above the
 * strength of a token that starts a new, stronger run.
 */
U_CFUNC void ucol_initBuffers(UColTokenParser *src, UColTokListHeader *lh, UErrorCode *status) {
    ucolCEGenerator Gens[UCOL_CE_STRENGTH_LIMIT];
    uint32_t CEparts[UCOL_CE_STRENGTH_LIMIT];

    UColToken *tok = lh->last;
    /* t[s] is the running counter for strength s during the backward walk */
    uint32_t t[UCOL_STRENGTH_LIMIT];

    uprv_memset(t, 0, UCOL_STRENGTH_LIMIT*sizeof(uint32_t));

    tok->toInsert = 1;
    t[tok->strength] = 1;

    while(tok->previous != NULL) {
        if(tok->previous->strength < tok->strength) { /* going up */
            t[tok->strength] = 0;
            t[tok->previous->strength]++;
        } else if(tok->previous->strength > tok->strength) { /* going down */
            t[tok->previous->strength] = 1;
        } else {
            t[tok->strength]++;
        }
        tok=tok->previous;
        tok->toInsert = t[tok->strength];
    }

    tok->toInsert = t[tok->strength];
    ucol_inv_getGapPositions(src, lh, status);

#if UCOL_DEBUG
    fprintf(stderr, "BaseCE: %08X %08X\n", lh->baseCE, lh->baseContCE);
    int32_t j = 2;
    for(j = 2; j >= 0; j--) {
        fprintf(stderr, "gapsLo[%i] [%08X %08X %08X]\n", j, lh->gapsLo[j*3], lh->gapsLo[j*3+1], lh->gapsLo[j*3+2]);
        fprintf(stderr, "gapsHi[%i] [%08X %08X %08X]\n", j, lh->gapsHi[j*3], lh->gapsHi[j*3+1], lh->gapsHi[j*3+2]);
    }
    tok=&lh->first[UCOL_TOK_POLARITY_POSITIVE];

    do {
        fprintf(stderr,"%i", tok->strength);
        tok = tok->next;
    } while(tok != NULL);
    fprintf(stderr, "\n");

    tok=&lh->first[UCOL_TOK_POLARITY_POSITIVE];

    do {
        fprintf(stderr,"%i", tok->toInsert);
        tok = tok->next;
    } while(tok != NULL);
#endif

    tok = lh->first;
    /* fStrength: gap slot used for the current token; initStrength: strongest strength seen so far */
    uint32_t fStrength = UCOL_IDENTICAL;
    uint32_t initStrength = UCOL_IDENTICAL;


    /* start from the base CE, split into left-aligned per-level parts */
    CEparts[UCOL_PRIMARY] = (lh->baseCE & UCOL_PRIMARYMASK) | (lh->baseContCE & UCOL_PRIMARYMASK) >> 16;
    CEparts[UCOL_SECONDARY] = (lh->baseCE & UCOL_SECONDARYMASK) << 16 | (lh->baseContCE & UCOL_SECONDARYMASK) << 8;
    CEparts[UCOL_TERTIARY] = (UCOL_TERTIARYORDER(lh->baseCE)) << 24 | (UCOL_TERTIARYORDER(lh->baseContCE)) << 16;

    while (tok != NULL && U_SUCCESS(*status)) {
        fStrength = tok->strength;
        if(fStrength < initStrength) {
            /* first token of a strength stronger than anything seen so far: (re)start the generators from the gaps */
            initStrength = fStrength;
            if(lh->pos[fStrength] == -1) {
                /* no gap recorded for this strength: fall back to the nearest stronger slot that has one */
                while(lh->pos[fStrength] == -1 && fStrength > 0) {
                    fStrength--;
                }
                if(lh->pos[fStrength] == -1) {
                    *status = U_INTERNAL_PROGRAM_ERROR;
                    return;
                }
            }
            if(initStrength == UCOL_TERTIARY) { /* starting with tertiary */
                CEparts[UCOL_PRIMARY] = lh->gapsLo[fStrength*3];
                CEparts[UCOL_SECONDARY] = lh->gapsLo[fStrength*3+1];
                /*CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[2], lh->gapsLo[fStrength*3+2], lh->gapsHi[fStrength*3+2], tok, UCOL_TERTIARY); */
                CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[UCOL_TERTIARY], lh->gapsLo, lh->gapsHi, tok, fStrength, status);
            } else if(initStrength == UCOL_SECONDARY) { /* secondaries */
                CEparts[UCOL_PRIMARY] = lh->gapsLo[fStrength*3];
                /*CEparts[1] = ucol_getCEGenerator(&Gens[1], lh->gapsLo[fStrength*3+1], lh->gapsHi[fStrength*3+1], tok, 1);*/
                CEparts[UCOL_SECONDARY] = ucol_getCEGenerator(&Gens[UCOL_SECONDARY], lh->gapsLo, lh->gapsHi, tok, fStrength, status);
                CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
            } else { /* primaries */
                /*CEparts[UCOL_PRIMARY] = ucol_getCEGenerator(&Gens[0], lh->gapsLo[0], lh->gapsHi[0], tok, UCOL_PRIMARY);*/
                CEparts[UCOL_PRIMARY] = ucol_getCEGenerator(&Gens[UCOL_PRIMARY], lh->gapsLo, lh->gapsHi, tok, fStrength, status);
                CEparts[UCOL_SECONDARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_SECONDARY], tok, UCOL_SECONDARY, status);
                CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
            }
        } else {
            /* advance the generator of the token's own level and restart the generators of the weaker levels */
            if(tok->strength == UCOL_TERTIARY) {
                CEparts[UCOL_TERTIARY] = ucol_getNextGenerated(&Gens[UCOL_TERTIARY], status);
            } else if(tok->strength == UCOL_SECONDARY) {
                CEparts[UCOL_SECONDARY] = ucol_getNextGenerated(&Gens[UCOL_SECONDARY], status);
                CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
            } else if(tok->strength == UCOL_PRIMARY) {
                CEparts[UCOL_PRIMARY] = ucol_getNextGenerated(&Gens[UCOL_PRIMARY], status);
                CEparts[UCOL_SECONDARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_SECONDARY], tok, UCOL_SECONDARY, status);
                CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
            }
        }
        ucol_doCE(src, CEparts, tok, status);
        tok = tok->next;
    }
}
841
/*
 * Turns every token of the list lh into a UCAElements record and adds it to
 * the temporary table t with uprv_uca_addAnElement().
 *
 * For a token with an expansion, the expansion CEs are collected first: the
 * expansion text is consumed left to right, always trying the longest
 * remaining prefix that is itself tailored (found in src->tailored and not a
 * reset); where no tailored prefix exists the CEs of a single code unit are
 * taken from src->UCA. The element's CEs are the token's own CEs followed by
 * the expansion CEs.
 *
 * Side effects besides the table insertions: sets t->image->jamoSpecial when
 * a code unit satisfies UCOL_ISJAMO, and sets src->buildCCTabFlag when the
 * FCD value at the end of an element's text has a non-zero low byte.
 *
 * The loop stops as soon as *status signals a failure.
 */
U_CFUNC void ucol_createElements(UColTokenParser *src, tempUCATable *t, UColTokListHeader *lh, UErrorCode *status) {
    UCAElements el;
    UColToken *tok = lh->first;
    UColToken *expt = NULL;
    uint32_t i = 0, j = 0;
    UChar32 fcdHighStart;
    const uint16_t *fcdTrieIndex = unorm_getFCDTrieIndex(fcdHighStart, status);

    while(tok != NULL && U_SUCCESS(*status)) {
        /* first, check if there are any expansions */
        /* if there are expansions, we need to do a little bit more processing */
        /* since parts of expansion can be tailored, while others are not */
        if(tok->expansion != 0) {
            /* tok->expansion packs the length in its top byte and the offset into src->source below it */
            uint32_t len = tok->expansion >> 24;
            uint32_t currentSequenceLen = len;
            uint32_t expOffset = tok->expansion & 0x00FFFFFF;
            //uint32_t exp = currentSequenceLen | expOffset;
            /* exp is a lookup key for src->tailored; only source and rulesToParseHdl are filled in */
            UColToken exp;
            exp.source = currentSequenceLen | expOffset;
            exp.rulesToParseHdl = &(src->source);

            while(len > 0) {
                currentSequenceLen = len;
                /* try the longest remaining prefix first, then shorter ones */
                while(currentSequenceLen > 0) {
                    exp.source = (currentSequenceLen << 24) | expOffset;
                    if((expt = (UColToken *)uhash_get(src->tailored, &exp)) != NULL && expt->strength != UCOL_TOK_RESET) { /* expansion is tailored */
                        uint32_t noOfCEsToCopy = expt->noOfCEs;
                        for(j = 0; j<noOfCEsToCopy; j++) {
                            tok->expCEs[tok->noOfExpCEs + j] = expt->CEs[j];
                        }
                        tok->noOfExpCEs += noOfCEsToCopy;
                        // Smart people never try to add codepoints and CEs.
                        // For some odd reason, it won't work.
                        /* advance by the number of code units consumed, not by the number of CEs copied */
                        expOffset += currentSequenceLen; //noOfCEsToCopy;
                        len -= currentSequenceLen; //noOfCEsToCopy;
                        break;
                    } else {
                        currentSequenceLen--;
                    }
                }
                if(currentSequenceLen == 0) { /* couldn't find any tailored subsequence */
                    /* will have to get one from UCA */
                    /* first, get the UChars from the rules */
                    /* then pick CEs out until there is no more and stuff them into expansion */
                    collIterate s;
                    uint32_t order = 0;
                    uprv_init_collIterate(src->UCA, expOffset + src->source, 1, &s, status);

                    for(;;) {
                        order = ucol_getNextCE(src->UCA, &s, status);
                        if(order == UCOL_NO_MORE_CES) {
                            break;
                        }
                        tok->expCEs[tok->noOfExpCEs++] = order;
                    }
                    expOffset++;
                    len--;
                }
            }
        } else {
            tok->noOfExpCEs = 0;
        }

        /* set the ucaelement with obtained values */
        el.noOfCEs = tok->noOfCEs + tok->noOfExpCEs;
        /* copy CEs */
        for(i = 0; i<tok->noOfCEs; i++) {
            el.CEs[i] = tok->CEs[i];
        }
        for(i = 0; i<tok->noOfExpCEs; i++) {
            el.CEs[i+tok->noOfCEs] = tok->expCEs[i];
        }

        /* copy UChars */
        // We kept prefix and source kind of together, as it is a kind of a contraction.
        // However, now we have to slice the prefix off the main thing -
        el.prefix = el.prefixChars;
        el.cPoints = el.uchars;
        if(tok->prefix != 0) { // we will just copy the prefix here, and adjust accordingly in the
            // addPrefix function in ucol_elm. The reason is that we need to add both composed AND
            // decomposed elements to the unsaf table.
            el.prefixSize = tok->prefix>>24;
            uprv_memcpy(el.prefix, src->source + (tok->prefix & 0x00FFFFFF), el.prefixSize*sizeof(UChar));

            /* the token source includes the prefix, so skip the prefix length when copying the element text */
            el.cSize = (tok->source >> 24)-(tok->prefix>>24);
            uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF)+(tok->prefix>>24) + src->source, el.cSize*sizeof(UChar));
        } else {
            el.prefixSize = 0;
            *el.prefix = 0;

            el.cSize = (tok->source >> 24);
            uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF) + src->source, el.cSize*sizeof(UChar));
        }
        if(src->UCA != NULL) {
            for(i = 0; i<el.cSize; i++) {
                if(UCOL_ISJAMO(el.cPoints[i])) {
                    t->image->jamoSpecial = TRUE;
                }
            }
            if (!src->buildCCTabFlag && el.cSize > 0) {
                // Check the trailing canonical combining class (tccc) of the last character.
                const UChar *s = el.cPoints + el.cSize;
                uint16_t fcd = unorm_prevFCD16(fcdTrieIndex, fcdHighStart, el.cPoints, s);
                if ((fcd & 0xff) != 0) {
                    src->buildCCTabFlag = TRUE;
                }
            }
        }

        /* and then, add it */
#if UCOL_DEBUG==2
        fprintf(stderr, "Adding: %04X with %08X\n", el.cPoints[0], el.CEs[0]);
#endif
        uprv_uca_addAnElement(t, &el, status);

#if UCOL_DEBUG_DUPLICATES
        if(*status != U_ZERO_ERROR) {
            fprintf(stderr, "replaced CE for %04X with CE for %04X\n", el.cPoints[0], tok->debugSource);
            *status = U_ZERO_ERROR;
        }
#endif

        tok = tok->next;
    }
}
967
968 U_CDECL_BEGIN
969 static UBool U_CALLCONV
_processUCACompleteIgnorables(const void * context,UChar32 start,UChar32 limit,uint32_t value)970 _processUCACompleteIgnorables(const void *context, UChar32 start, UChar32 limit, uint32_t value) {
971 UErrorCode status = U_ZERO_ERROR;
972 tempUCATable *t = (tempUCATable *)context;
973 if(value == 0) {
974 while(start < limit) {
975 uint32_t CE = utrie_get32(t->mapping, start, NULL);
976 if(CE == UCOL_NOT_FOUND) {
977 UCAElements el;
978 el.isThai = FALSE;
979 el.prefixSize = 0;
980 el.prefixChars[0] = 0;
981 el.prefix = el.prefixChars;
982 el.cPoints = el.uchars;
983
984 el.cSize = 0;
985 UTF_APPEND_CHAR(el.uchars, el.cSize, 1024, start);
986
987 el.noOfCEs = 1;
988 el.CEs[0] = 0;
989 uprv_uca_addAnElement(t, &el, &status);
990
991 }
992 start++;
993 }
994 }
995 if(U_FAILURE(status)) {
996 return FALSE;
997 } else {
998 return TRUE;
999 }
1000 }
1001 U_CDECL_END
1002
1003 static void
ucol_uprv_bld_copyRangeFromUCA(UColTokenParser * src,tempUCATable * t,UChar32 start,UChar32 end,UErrorCode * status)1004 ucol_uprv_bld_copyRangeFromUCA(UColTokenParser *src, tempUCATable *t,
1005 UChar32 start, UChar32 end,
1006 UErrorCode *status)
1007 {
1008 //UChar decomp[256];
1009 uint32_t CE = UCOL_NOT_FOUND;
1010 UChar32 u = 0;
1011 UCAElements el;
1012 el.isThai = FALSE;
1013 el.prefixSize = 0;
1014 el.prefixChars[0] = 0;
1015 collIterate colIt;
1016
1017 if(U_SUCCESS(*status)) {
1018 for(u = start; u<=end; u++) {
1019 if((CE = utrie_get32(t->mapping, u, NULL)) == UCOL_NOT_FOUND
1020 /* this test is for contractions that are missing the starting element. */
1021 || ((isCntTableElement(CE)) &&
1022 (uprv_cnttab_getCE(t->contractions, CE, 0, status) == UCOL_NOT_FOUND))
1023 )
1024 {
1025 el.cSize = 0;
1026 U16_APPEND_UNSAFE(el.uchars, el.cSize, u);
1027 //decomp[0] = (UChar)u;
1028 //el.uchars[0] = (UChar)u;
1029 el.cPoints = el.uchars;
1030 //el.cSize = 1;
1031 el.noOfCEs = 0;
1032 el.prefix = el.prefixChars;
1033 el.prefixSize = 0;
1034 //uprv_init_collIterate(src->UCA, decomp, 1, &colIt);
1035 // We actually want to check whether this element is a special
1036 // If it is an implicit element (hangul, CJK - we want to copy the
1037 // special, not the resolved CEs) - for hangul, copying resolved
1038 // would just make things the same (there is an expansion and it
1039 // takes approximately the same amount of time to resolve as
1040 // falling back to the UCA).
1041 /*
1042 UTRIE_GET32(src->UCA->mapping, u, CE);
1043 tag = getCETag(CE);
1044 if(tag == HANGUL_SYLLABLE_TAG || tag == CJK_IMPLICIT_TAG
1045 || tag == IMPLICIT_TAG || tag == TRAIL_SURROGATE_TAG
1046 || tag == LEAD_SURROGATE_TAG) {
1047 el.CEs[el.noOfCEs++] = CE;
1048 } else {
1049 */
1050 // It turns out that it does not make sense to keep implicits
1051 // unresolved. The cost of resolving them is big enough so that
1052 // it doesn't make any difference whether we have to go to the UCA
1053 // or not.
1054 {
1055 uprv_init_collIterate(src->UCA, el.uchars, el.cSize, &colIt, status);
1056 while(CE != UCOL_NO_MORE_CES) {
1057 CE = ucol_getNextCE(src->UCA, &colIt, status);
1058 if(CE != UCOL_NO_MORE_CES) {
1059 el.CEs[el.noOfCEs++] = CE;
1060 }
1061 }
1062 }
1063 uprv_uca_addAnElement(t, &el, status);
1064 }
1065 }
1066 }
1067 }
1068
1069 U_CFUNC UCATableHeader *
ucol_assembleTailoringTable(UColTokenParser * src,UErrorCode * status)1070 ucol_assembleTailoringTable(UColTokenParser *src, UErrorCode *status) {
1071 U_NAMESPACE_USE
1072
1073 uint32_t i = 0;
1074 if(U_FAILURE(*status)) {
1075 return NULL;
1076 }
1077 /*
1078 2. Eliminate the negative lists by doing the following for each non-null negative list:
1079 o if previousCE(baseCE, strongestN) != some ListHeader X's baseCE,
1080 create new ListHeader X
1081 o reverse the list, add to the end of X's positive list. Reset the strength of the
1082 first item you add, based on the stronger strength levels of the two lists.
1083 */
1084 /*
1085 3. For each ListHeader with a non-null positive list:
1086 */
1087 /*
1088 o Find all character strings with CEs between the baseCE and the
1089 next/previous CE, at the strength of the first token. Add these to the
1090 tailoring.
1091 ? That is, if UCA has ... x <<< X << x' <<< X' < y ..., and the
1092 tailoring has & x < z...
1093 ? Then we change the tailoring to & x <<< X << x' <<< X' < z ...
1094 */
1095 /* It is possible that this part should be done even while constructing list */
1096 /* The problem is that it is unknown what is going to be the strongest weight */
1097 /* So we might as well do it here */
1098
1099 /*
1100 o Allocate CEs for each token in the list, based on the total number N of the
1101 largest level difference, and the gap G between baseCE and nextCE at that
1102 level. The relation * between the last item and nextCE is the same as the
1103 strongest strength.
1104 o Example: baseCE < a << b <<< q << c < d < e * nextCE(X,1)
1105 ? There are 3 primary items: a, d, e. Fit them into the primary gap.
1106 Then fit b and c into the secondary gap between a and d, then fit q
1107 into the tertiary gap between b and c.
1108
1109 o Example: baseCE << b <<< q << c * nextCE(X,2)
1110 ? There are 2 secondary items: b, c. Fit them into the secondary gap.
1111 Then fit q into the tertiary gap between b and c.
1112 o When incrementing primary values, we will not cross high byte
1113 boundaries except where there is only a single-byte primary. That is to
1114 ensure that the script reordering will continue to work.
1115 */
1116 UCATableHeader *image = (UCATableHeader *)uprv_malloc(sizeof(UCATableHeader));
1117 /* test for NULL */
1118 if (image == NULL) {
1119 *status = U_MEMORY_ALLOCATION_ERROR;
1120 return NULL;
1121 }
1122 uprv_memcpy(image, src->UCA->image, sizeof(UCATableHeader));
1123
1124 for(i = 0; i<src->resultLen; i++) {
1125 /* now we need to generate the CEs */
1126 /* We stuff the initial value in the buffers, and increase the appropriate buffer */
1127 /* According to strength */
1128 if(U_SUCCESS(*status)) {
1129 if(src->lh[i].first) { // if there are any elements
1130 // due to the way parser works, subsequent tailorings
1131 // may remove all the elements from a sequence, therefore
1132 // leaving an empty tailoring sequence.
1133 ucol_initBuffers(src, &src->lh[i], status);
1134 }
1135 }
1136 if(U_FAILURE(*status)) {
1137 uprv_free(image);
1138 return NULL;
1139 }
1140 }
1141
1142 if(src->varTop != NULL) { /* stuff the variable top value */
1143 src->opts->variableTopValue = (*(src->varTop->CEs))>>16;
1144 /* remove it from the list */
1145 if(src->varTop->listHeader->first == src->varTop) { /* first in list */
1146 src->varTop->listHeader->first = src->varTop->next;
1147 }
1148 if(src->varTop->listHeader->last == src->varTop) { /* first in list */
1149 src->varTop->listHeader->last = src->varTop->previous;
1150 }
1151 if(src->varTop->next != NULL) {
1152 src->varTop->next->previous = src->varTop->previous;
1153 }
1154 if(src->varTop->previous != NULL) {
1155 src->varTop->previous->next = src->varTop->next;
1156 }
1157 }
1158
1159
1160 tempUCATable *t = uprv_uca_initTempTable(image, src->opts, src->UCA, NOT_FOUND_TAG, NOT_FOUND_TAG, status);
1161 if(U_FAILURE(*status)) {
1162 uprv_free(image);
1163 return NULL;
1164 }
1165
1166
1167 /* After this, we have assigned CE values to all regular CEs */
1168 /* now we will go through list once more and resolve expansions, */
1169 /* make UCAElements structs and add them to table */
1170 for(i = 0; i<src->resultLen; i++) {
1171 /* now we need to generate the CEs */
1172 /* We stuff the initial value in the buffers, and increase the appropriate buffer */
1173 /* According to strength */
1174 if(U_SUCCESS(*status)) {
1175 ucol_createElements(src, t, &src->lh[i], status);
1176 }
1177 }
1178
1179 UCAElements el;
1180 el.isThai = FALSE;
1181 el.prefixSize = 0;
1182 el.prefixChars[0] = 0;
1183
1184 /* add latin-1 stuff */
1185 ucol_uprv_bld_copyRangeFromUCA(src, t, 0, 0xFF, status);
1186
1187 /* add stuff for copying */
1188 if(src->copySet != NULL) {
1189 int32_t i = 0;
1190 UnicodeSet *set = (UnicodeSet *)src->copySet;
1191 for(i = 0; i < set->getRangeCount(); i++) {
1192 ucol_uprv_bld_copyRangeFromUCA(src, t, set->getRangeStart(i), set->getRangeEnd(i), status);
1193 }
1194 }
1195
1196 if(U_SUCCESS(*status)) {
1197 /* copy contractions from the UCA - this is felt mostly for cyrillic*/
1198
1199 uint32_t tailoredCE = UCOL_NOT_FOUND;
1200 //UChar *conts = (UChar *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts+sizeof(UCAConstants));
1201 UChar *conts = (UChar *)((uint8_t *)src->UCA->image + src->UCA->image->contractionUCACombos);
1202 UCollationElements *ucaEl = ucol_openElements(src->UCA, NULL, 0, status);
1203 // Check for null pointer
1204 if (ucaEl == NULL) {
1205 *status = U_MEMORY_ALLOCATION_ERROR;
1206 return NULL;
1207 }
1208 while(*conts != 0) {
1209 /*tailoredCE = ucmpe32_get(t->mapping, *conts);*/
1210 tailoredCE = utrie_get32(t->mapping, *conts, NULL);
1211 if(tailoredCE != UCOL_NOT_FOUND) {
1212 UBool needToAdd = TRUE;
1213 if(isCntTableElement(tailoredCE)) {
1214 if(uprv_cnttab_isTailored(t->contractions, tailoredCE, conts+1, status) == TRUE) {
1215 needToAdd = FALSE;
1216 }
1217 }
1218 if (!needToAdd && isPrefix(tailoredCE) && *(conts+1)==0) {
1219 UCAElements elm;
1220 elm.cPoints = el.uchars;
1221 elm.noOfCEs = 0;
1222 elm.uchars[0] = *conts;
1223 elm.uchars[1] = 0;
1224 elm.cSize = 1;
1225 elm.prefixChars[0] = *(conts+2);
1226 elm.isThai = FALSE;
1227 elm.prefix = elm.prefixChars;
1228 elm.prefixSize = 1;
1229 UCAElements *prefixEnt=(UCAElements *)uhash_get(t->prefixLookup, &elm);
1230 if ((prefixEnt==NULL) || *(prefixEnt->prefix)!=*(conts+2)) {
1231 needToAdd = TRUE;
1232 }
1233 }
1234 if(src->removeSet != NULL && uset_contains(src->removeSet, *conts)) {
1235 needToAdd = FALSE;
1236 }
1237
1238 if(needToAdd == TRUE) { // we need to add if this contraction is not tailored.
1239 if (*(conts+1) != 0) { // contractions
1240 el.prefix = el.prefixChars;
1241 el.prefixSize = 0;
1242 el.cPoints = el.uchars;
1243 el.noOfCEs = 0;
1244 el.uchars[0] = *conts;
1245 el.uchars[1] = *(conts+1);
1246 if(*(conts+2)!=0) {
1247 el.uchars[2] = *(conts+2);
1248 el.cSize = 3;
1249 } else {
1250 el.cSize = 2;
1251 }
1252 ucol_setText(ucaEl, el.uchars, el.cSize, status);
1253 }
1254 else { // pre-context character
1255 UChar str[4] = { 0 };
1256 int32_t len=0;
1257 int32_t preKeyLen=0;
1258
1259 el.cPoints = el.uchars;
1260 el.noOfCEs = 0;
1261 el.uchars[0] = *conts;
1262 el.uchars[1] = 0;
1263 el.cSize = 1;
1264 el.prefixChars[0] = *(conts+2);
1265 el.prefix = el.prefixChars;
1266 el.prefixSize = 1;
1267 if (el.prefixChars[0]!=0) {
1268 // get CE of prefix character first
1269 str[0]=el.prefixChars[0];
1270 str[1]=0;
1271 ucol_setText(ucaEl, str, 1, status);
1272 while ((int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status))
1273 != UCOL_NULLORDER) {
1274 preKeyLen++; // count number of keys for prefix character
1275 }
1276 str[len++] = el.prefixChars[0];
1277 }
1278
1279 str[len++] = el.uchars[0];
1280 str[len]=0;
1281 ucol_setText(ucaEl, str, len, status);
1282 // Skip the keys for prefix character, then copy the rest to el.
1283 while ((preKeyLen-->0) &&
1284 (int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status)) != UCOL_NULLORDER) {
1285 continue;
1286 }
1287
1288 }
1289 while ((int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status)) != UCOL_NULLORDER) {
1290 el.noOfCEs++;
1291 }
1292 uprv_uca_addAnElement(t, &el, status);
1293 }
1294
1295 } else if(src->removeSet != NULL && uset_contains(src->removeSet, *conts)) {
1296 ucol_uprv_bld_copyRangeFromUCA(src, t, *conts, *conts, status);
1297 }
1298 conts+=3;
1299 }
1300 ucol_closeElements(ucaEl);
1301 }
1302
1303 // Add completely ignorable elements
1304 utrie_enum(&t->UCA->mapping, NULL, _processUCACompleteIgnorables, t);
1305
1306 // add tailoring characters related canonical closures
1307 uprv_uca_canonicalClosure(t, src, NULL, status);
1308
1309 /* still need to produce compatibility closure */
1310
1311 UCATableHeader *myData = uprv_uca_assembleTable(t, status);
1312
1313 uprv_uca_closeTempTable(t);
1314 uprv_free(image);
1315
1316 return myData;
1317 }
1318
1319 U_CDECL_BEGIN
1320 static UBool U_CALLCONV
ucol_bld_cleanup(void)1321 ucol_bld_cleanup(void)
1322 {
1323 udata_close(invUCA_DATA_MEM);
1324 invUCA_DATA_MEM = NULL;
1325 _staticInvUCA = NULL;
1326 return TRUE;
1327 }
1328 U_CDECL_END
1329
1330 U_CAPI const InverseUCATableHeader * U_EXPORT2
ucol_initInverseUCA(UErrorCode * status)1331 ucol_initInverseUCA(UErrorCode *status)
1332 {
1333 if(U_FAILURE(*status)) return NULL;
1334
1335 UBool needsInit;
1336 UMTX_CHECK(NULL, (_staticInvUCA == NULL), needsInit);
1337
1338 if(needsInit) {
1339 InverseUCATableHeader *newInvUCA = NULL;
1340 UDataMemory *result = udata_openChoice(U_ICUDATA_COLL, INVC_DATA_TYPE, INVC_DATA_NAME, isAcceptableInvUCA, NULL, status);
1341
1342 if(U_FAILURE(*status)) {
1343 if (result) {
1344 udata_close(result);
1345 }
1346 // This is not needed, as we are talking about
1347 // memory we got from UData
1348 //uprv_free(newInvUCA);
1349 }
1350
1351 if(result != NULL) { /* It looks like sometimes we can fail to find the data file */
1352 newInvUCA = (InverseUCATableHeader *)udata_getMemory(result);
1353 UCollator *UCA = ucol_initUCA(status);
1354 // UCA versions of UCA and inverse UCA should match
1355 if(uprv_memcmp(newInvUCA->UCAVersion, UCA->image->UCAVersion, sizeof(UVersionInfo)) != 0) {
1356 *status = U_INVALID_FORMAT_ERROR;
1357 udata_close(result);
1358 return NULL;
1359 }
1360
1361 umtx_lock(NULL);
1362 if(_staticInvUCA == NULL) {
1363 invUCA_DATA_MEM = result;
1364 _staticInvUCA = newInvUCA;
1365 result = NULL;
1366 newInvUCA = NULL;
1367 }
1368 umtx_unlock(NULL);
1369
1370 if(newInvUCA != NULL) {
1371 udata_close(result);
1372 // This is not needed, as we are talking about
1373 // memory we got from UData
1374 //uprv_free(newInvUCA);
1375 }
1376 else {
1377 ucln_i18n_registerCleanup(UCLN_I18N_UCOL_BLD, ucol_bld_cleanup);
1378 }
1379 }
1380 }
1381 return _staticInvUCA;
1382 }
1383
/* Names of the non-script reordering codes. The entries _must_ stay in the
 * order in which they are applied as defaults, and in sync with the
 * UColReorderCode enum. The list is terminated by NULL. Both the strings and
 * the table's pointers are const, so the table cannot be modified at run time.
 */
static const char* const ReorderingTokenNames[] = {
    "SPACE",
    "PUNCT",
    "SYMBOL",
    "CURRENCY",
    "DIGIT",
    NULL
};
1395
/**
 * Copies an upper-cased version of src into dst, writing at most length
 * bytes including the terminating NUL; longer input is truncated.
 *
 * @param src    NUL-terminated input string
 * @param dst    destination buffer of at least length bytes
 * @param length capacity of dst in bytes; if 0, dst is left untouched
 */
static void toUpper(const char* src, char* dst, uint32_t length) {
    if (length == 0) {
        /* No room even for the terminator; "length - 1" would wrap around. */
        return;
    }
    for (uint32_t i = 0; *src != '\0' && i < length - 1; ++src, ++dst, ++i) {
        /* toupper() requires a value representable as unsigned char. */
        *dst = (char)toupper((unsigned char)*src);
    }
    *dst = '\0';
}
1402
1403 U_INTERNAL int32_t U_EXPORT2
ucol_findReorderingEntry(const char * name)1404 ucol_findReorderingEntry(const char* name) {
1405 char buffer[32];
1406 toUpper(name, buffer, 32);
1407 for (uint32_t entry = 0; ReorderingTokenNames[entry] != NULL; entry++) {
1408 if (uprv_strcmp(buffer, ReorderingTokenNames[entry]) == 0) {
1409 return entry + UCOL_REORDER_CODE_FIRST;
1410 }
1411 }
1412 return USCRIPT_INVALID_CODE;
1413 }
1414
1415 U_NAMESPACE_END
1416
1417 #endif /* #if !UCONFIG_NO_COLLATION */
1418