1 /*
2 *******************************************************************************
3 * Copyright (C) 1996-2007, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 *******************************************************************************
6 * file name: ucol.cpp
7 * encoding: US-ASCII
8 * tab size: 8 (not used)
9 * indentation:4
10 *
11 * Modification history
12 * Date Name Comments
13 * 1996-1999 various members of ICU team maintained C API for collation framework
14 * 02/16/2001 synwee Added internal method getPrevSpecialCE
15 * 03/01/2001 synwee Added maxexpansion functionality.
16 * 03/16/2001 weiv Collation framework is rewritten in C and made UCA compliant
17 */
18
19 #include "unicode/utypes.h"
20 #include "uassert.h"
21
22 #if !UCONFIG_NO_COLLATION
23
24 #include "unicode/coleitr.h"
25 #include "unicode/unorm.h"
26 #include "unicode/udata.h"
27 #include "unicode/ustring.h"
28
29 #include "ucol_imp.h"
30 #include "ucol_elm.h"
31 #include "bocsu.h"
32
33 #include "unormimp.h"
34 #include "unorm_it.h"
35 #include "umutex.h"
36 #include "cmemory.h"
37 #include "ucln_in.h"
38 #include "cstring.h"
39 #include "utracimp.h"
40 #include "putilimp.h"
41
42 #ifdef UCOL_DEBUG
43 #include <stdio.h>
44 #endif
45
46 U_NAMESPACE_USE
47
/* Shift and mask constants added by synwee for trie manipulation. */
/* NOTE(review): none of these macros is referenced in the visible part of this file - confirm they are still used. */
#define STAGE_1_SHIFT_ 10
#define STAGE_2_SHIFT_ 4
#define STAGE_2_MASK_AFTER_SHIFT_ 0x3F
#define STAGE_3_MASK_ 0xF
#define LAST_BYTE_MASK_ 0xFF
#define SECOND_LAST_BYTE_SHIFT_ 8

/* NOTE(review): presumably the first code point that can have a non-zero combining class - confirm against users. */
#define ZERO_CC_LIMIT_ 0xC0

// The static UCA collator. There is only one, and collators do not use it directly.
// It is referenced only in ucol_initUCA and ucol_cleanup.
static UCollator* _staticUCA = NULL;
// Static pointer to the udata memory backing the UCA. Initialized in ucol_initUCA,
// kept so that ucol_cleanup can release it.
static UDataMemory* UCA_DATA_MEM = NULL;

// Static pointer to the normalizer's fcdTrieIndex.
// It is always the same between calls to u_cleanup,
// and therefore writing to it is not synchronized.
// It is cleaned in ucol_cleanup.
static const uint16_t *fcdTrieIndex=NULL;

// These are values from the UCA that are required for
// implicit generation and for suppressing sort key compression.
// They should normally come from the UCA data, but if one
// is running without the UCA these defaults are used instead.
static int32_t maxRegularPrimary = 0xA0;
static int32_t minImplicitPrimary = 0xE0;
static int32_t maxImplicitPrimary = 0xE4;
78
79 U_CDECL_BEGIN
80 static UBool U_CALLCONV
isAcceptableUCA(void *,const char *,const char *,const UDataInfo * pInfo)81 isAcceptableUCA(void * /*context*/,
82 const char * /*type*/, const char * /*name*/,
83 const UDataInfo *pInfo){
84 /* context, type & name are intentionally not used */
85 if( pInfo->size>=20 &&
86 pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
87 pInfo->charsetFamily==U_CHARSET_FAMILY &&
88 pInfo->dataFormat[0]==UCA_DATA_FORMAT_0 && /* dataFormat="UCol" */
89 pInfo->dataFormat[1]==UCA_DATA_FORMAT_1 &&
90 pInfo->dataFormat[2]==UCA_DATA_FORMAT_2 &&
91 pInfo->dataFormat[3]==UCA_DATA_FORMAT_3 &&
92 pInfo->formatVersion[0]==UCA_FORMAT_VERSION_0 &&
93 pInfo->formatVersion[1]>=UCA_FORMAT_VERSION_1// &&
94 //pInfo->formatVersion[1]==UCA_FORMAT_VERSION_1 &&
95 //pInfo->formatVersion[2]==UCA_FORMAT_VERSION_2 && // Too harsh
96 //pInfo->formatVersion[3]==UCA_FORMAT_VERSION_3 && // Too harsh
97 ) {
98 UVersionInfo UCDVersion;
99 u_getUnicodeVersion(UCDVersion);
100 return (UBool)(pInfo->dataVersion[0]==UCDVersion[0]
101 && pInfo->dataVersion[1]==UCDVersion[1]);
102 //&& pInfo->dataVersion[2]==ucaDataInfo.dataVersion[2]
103 //&& pInfo->dataVersion[3]==ucaDataInfo.dataVersion[3]);
104 } else {
105 return FALSE;
106 }
107 }
108
109
110 static int32_t U_CALLCONV
_getFoldingOffset(uint32_t data)111 _getFoldingOffset(uint32_t data) {
112 return (int32_t)(data&0xFFFFFF);
113 }
114
115 U_CDECL_END
116
117 static
IInit_collIterate(const UCollator * collator,const UChar * sourceString,int32_t sourceLen,collIterate * s)118 inline void IInit_collIterate(const UCollator *collator, const UChar *sourceString,
119 int32_t sourceLen, collIterate *s) {
120 (s)->string = (s)->pos = (UChar *)(sourceString);
121 (s)->origFlags = 0;
122 (s)->flags = 0;
123 if (sourceLen >= 0) {
124 s->flags |= UCOL_ITER_HASLEN;
125 (s)->endp = (UChar *)sourceString+sourceLen;
126 }
127 else {
128 /* change to enable easier checking for end of string for fcdpositon */
129 (s)->endp = NULL;
130 }
131 (s)->CEpos = (s)->toReturn = (s)->CEs;
132 (s)->writableBuffer = (s)->stackWritableBuffer;
133 (s)->writableBufSize = UCOL_WRITABLE_BUFFER_SIZE;
134 (s)->coll = (collator);
135 (s)->fcdPosition = 0;
136 if(collator->normalizationMode == UCOL_ON) {
137 (s)->flags |= UCOL_ITER_NORM;
138 }
139 if(collator->hiraganaQ == UCOL_ON && collator->strength >= UCOL_QUATERNARY) {
140 (s)->flags |= UCOL_HIRAGANA_Q;
141 }
142 (s)->iterator = NULL;
143 //(s)->iteratorIndex = 0;
144 }
145
/**
 * Exported, out-of-line entry point for IInit_collIterate().
 * @param collator     collator whose settings drive the iteration
 * @param sourceString text to iterate over
 * @param sourceLen    length of the text, or a negative value if it is NUL-terminated
 * @param s            iterator state to initialize
 */
U_CAPI void U_EXPORT2
uprv_init_collIterate(const UCollator *collator, const UChar *sourceString,
                      int32_t sourceLen, collIterate *s){
    /* Out-of-line version for use from other files. */
    IInit_collIterate(collator, sourceString, sourceLen, s);
}
152
153
154 /**
155 * Backup the state of the collIterate struct data
156 * @param data collIterate to backup
157 * @param backup storage
158 */
159 static
backupState(const collIterate * data,collIterateState * backup)160 inline void backupState(const collIterate *data, collIterateState *backup)
161 {
162 backup->fcdPosition = data->fcdPosition;
163 backup->flags = data->flags;
164 backup->origFlags = data->origFlags;
165 backup->pos = data->pos;
166 backup->bufferaddress = data->writableBuffer;
167 backup->buffersize = data->writableBufSize;
168 backup->iteratorMove = 0;
169 backup->iteratorIndex = 0;
170 if(data->iterator != NULL) {
171 //backup->iteratorIndex = data->iterator->getIndex(data->iterator, UITER_CURRENT);
172 backup->iteratorIndex = data->iterator->getState(data->iterator);
173 // no we try to fixup if we're using a normalizing iterator and we get UITER_NO_STATE
174 if(backup->iteratorIndex == UITER_NO_STATE) {
175 while((backup->iteratorIndex = data->iterator->getState(data->iterator)) == UITER_NO_STATE) {
176 backup->iteratorMove++;
177 data->iterator->move(data->iterator, -1, UITER_CURRENT);
178 }
179 data->iterator->move(data->iterator, backup->iteratorMove, UITER_CURRENT);
180 }
181 }
182 }
183
184 /**
185 * Loads the state into the collIterate struct data
186 * @param data collIterate to backup
187 * @param backup storage
188 * @param forwards boolean to indicate if forwards iteration is used,
189 * false indicates backwards iteration
190 */
191 static
loadState(collIterate * data,const collIterateState * backup,UBool forwards)192 inline void loadState(collIterate *data, const collIterateState *backup,
193 UBool forwards)
194 {
195 UErrorCode status = U_ZERO_ERROR;
196 data->flags = backup->flags;
197 data->origFlags = backup->origFlags;
198 if(data->iterator != NULL) {
199 //data->iterator->move(data->iterator, backup->iteratorIndex, UITER_ZERO);
200 data->iterator->setState(data->iterator, backup->iteratorIndex, &status);
201 if(backup->iteratorMove != 0) {
202 data->iterator->move(data->iterator, backup->iteratorMove, UITER_CURRENT);
203 }
204 }
205 data->pos = backup->pos;
206 if ((data->flags & UCOL_ITER_INNORMBUF) &&
207 data->writableBuffer != backup->bufferaddress) {
208 /*
209 this is when a new buffer has been reallocated and we'll have to
210 calculate the new position.
211 note the new buffer has to contain the contents of the old buffer.
212 */
213 if (forwards) {
214 data->pos = data->writableBuffer +
215 (data->pos - backup->bufferaddress);
216 }
217 else {
218 /* backwards direction */
219 uint32_t temp = backup->buffersize -
220 (data->pos - backup->bufferaddress);
221 data->pos = data->writableBuffer + (data->writableBufSize - temp);
222 }
223 }
224 if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
225 /*
226 this is alittle tricky.
227 if we are initially not in the normalization buffer, even if we
228 normalize in the later stage, the data in the buffer will be
229 ignored, since we skip back up to the data string.
230 however if we are already in the normalization buffer, any
231 further normalization will pull data into the normalization
232 buffer and modify the fcdPosition.
233 since we are keeping the data in the buffer for use, the
234 fcdPosition can not be reverted back.
235 arrgghh....
236 */
237 data->fcdPosition = backup->fcdPosition;
238 }
239 }
240
241
242 /*
243 * collIter_eos()
244 * Checks for a collIterate being positioned at the end of
245 * its source string.
246 *
247 */
248 static
collIter_eos(collIterate * s)249 inline UBool collIter_eos(collIterate *s) {
250 if(s->flags & UCOL_USE_ITERATOR) {
251 return !(s->iterator->hasNext(s->iterator));
252 }
253 if ((s->flags & UCOL_ITER_HASLEN) == 0 && *s->pos != 0) {
254 // Null terminated string, but not at null, so not at end.
255 // Whether in main or normalization buffer doesn't matter.
256 return FALSE;
257 }
258
259 // String with length. Can't be in normalization buffer, which is always
260 // null termintated.
261 if (s->flags & UCOL_ITER_HASLEN) {
262 return (s->pos == s->endp);
263 }
264
265 // We are at a null termination, could be either normalization buffer or main string.
266 if ((s->flags & UCOL_ITER_INNORMBUF) == 0) {
267 // At null at end of main string.
268 return TRUE;
269 }
270
271 // At null at end of normalization buffer. Need to check whether there there are
272 // any characters left in the main buffer.
273 if(s->origFlags & UCOL_USE_ITERATOR) {
274 return !(s->iterator->hasNext(s->iterator));
275 } else if ((s->origFlags & UCOL_ITER_HASLEN) == 0) {
276 // Null terminated main string. fcdPosition is the 'return' position into main buf.
277 return (*s->fcdPosition == 0);
278 }
279 else {
280 // Main string with an end pointer.
281 return s->fcdPosition == s->endp;
282 }
283 }
284
285 /*
286 * collIter_bos()
287 * Checks for a collIterate being positioned at the start of
288 * its source string.
289 *
290 */
291 static
collIter_bos(collIterate * source)292 inline UBool collIter_bos(collIterate *source) {
293 // if we're going backwards, we need to know whether there is more in the
294 // iterator, even if we are in the side buffer
295 if(source->flags & UCOL_USE_ITERATOR || source->origFlags & UCOL_USE_ITERATOR) {
296 return !source->iterator->hasPrevious(source->iterator);
297 }
298 if (source->pos <= source->string ||
299 ((source->flags & UCOL_ITER_INNORMBUF) &&
300 *(source->pos - 1) == 0 && source->fcdPosition == NULL)) {
301 return TRUE;
302 }
303 return FALSE;
304 }
305
306 /*static
307 inline UBool collIter_SimpleBos(collIterate *source) {
308 // if we're going backwards, we need to know whether there is more in the
309 // iterator, even if we are in the side buffer
310 if(source->flags & UCOL_USE_ITERATOR || source->origFlags & UCOL_USE_ITERATOR) {
311 return !source->iterator->hasPrevious(source->iterator);
312 }
313 if (source->pos == source->string) {
314 return TRUE;
315 }
316 return FALSE;
317 }*/
318 //return (data->pos == data->string) ||
319
320
321 /**
322 * Checks and free writable buffer if it is not the original stack buffer
323 * in collIterate. This function does not reassign the writable buffer.
324 * @param data collIterate struct to determine and free the writable buffer
325 */
326 static
freeHeapWritableBuffer(collIterate * data)327 inline void freeHeapWritableBuffer(collIterate *data)
328 {
329 if (data->writableBuffer != data->stackWritableBuffer) {
330 uprv_free(data->writableBuffer);
331 }
332 }
333
334
335 /****************************************************************************/
336 /* Following are the open/close functions */
337 /* */
338 /****************************************************************************/
339
340 static UCollator*
ucol_initFromBinary(const uint8_t * bin,int32_t length,const UCollator * base,UCollator * fillIn,UErrorCode * status)341 ucol_initFromBinary(const uint8_t *bin, int32_t length,
342 const UCollator *base,
343 UCollator *fillIn,
344 UErrorCode *status)
345 {
346 UCollator *result = fillIn;
347 if(U_FAILURE(*status)) {
348 return NULL;
349 }
350 /*
351 if(base == NULL) {
352 // we don't support null base yet
353 *status = U_ILLEGAL_ARGUMENT_ERROR;
354 return NULL;
355 }
356 */
357 // We need these and we could be running without UCA
358 uprv_uca_initImplicitConstants(0, 0, status);
359 UCATableHeader *colData = (UCATableHeader *)bin;
360 // do we want version check here? We're trying to figure out whether collators are compatible
361 if((base && (uprv_memcmp(colData->UCAVersion, base->image->UCAVersion, sizeof(UVersionInfo)) != 0 ||
362 uprv_memcmp(colData->UCDVersion, base->image->UCDVersion, sizeof(UVersionInfo)) != 0)) ||
363 colData->version[0] != UCOL_BUILDER_VERSION)
364 {
365 *status = U_COLLATOR_VERSION_MISMATCH;
366 return NULL;
367 }
368 else {
369 if((uint32_t)length > (paddedsize(sizeof(UCATableHeader)) + paddedsize(sizeof(UColOptionSet)))) {
370 result = ucol_initCollator((const UCATableHeader *)bin, result, base, status);
371 if(U_FAILURE(*status)){
372 return NULL;
373 }
374 result->hasRealData = TRUE;
375 }
376 else {
377 if(base) {
378 result = ucol_initCollator(base->image, result, base, status);
379 ucol_setOptionsFromHeader(result, (UColOptionSet *)(bin+((const UCATableHeader *)bin)->options), status);
380 if(U_FAILURE(*status)){
381 return NULL;
382 }
383 result->hasRealData = FALSE;
384 }
385 else {
386 *status = U_USELESS_COLLATOR_ERROR;
387 return NULL;
388 }
389 }
390 result->freeImageOnClose = FALSE;
391 }
392 result->validLocale = NULL;
393 result->requestedLocale = NULL;
394 result->rules = NULL;
395 result->rulesLength = 0;
396 result->freeRulesOnClose = FALSE;
397 result->rb = NULL;
398 result->elements = NULL;
399 return result;
400 }
401
/**
 * Opens a collator from a binary collation image.
 * Delegates to ucol_initFromBinary() with no caller-provided storage, so
 * the collator struct is allocated.
 * @param bin    binary collation image
 * @param length size of the image
 * @param base   collator the image was built against, may be NULL
 * @param status error code
 * @return the new collator, or NULL on failure
 */
U_CAPI UCollator* U_EXPORT2
ucol_openBinary(const uint8_t *bin, int32_t length,
                const UCollator *base,
                UErrorCode *status)
{
    return ucol_initFromBinary(bin, length, base, NULL, status);
}
409
410 U_CAPI UCollator* U_EXPORT2
ucol_safeClone(const UCollator * coll,void * stackBuffer,int32_t * pBufferSize,UErrorCode * status)411 ucol_safeClone(const UCollator *coll, void *stackBuffer, int32_t * pBufferSize, UErrorCode *status)
412 {
413 UCollator * localCollator;
414 int32_t bufferSizeNeeded = (int32_t)sizeof(UCollator);
415 char *stackBufferChars = (char *)stackBuffer;
416 int32_t imageSize = 0;
417 int32_t rulesSize = 0;
418 int32_t rulesPadding = 0;
419 uint8_t *image;
420 UChar *rules;
421 UBool colAllocated = FALSE;
422 UBool imageAllocated = FALSE;
423
424 if (status == NULL || U_FAILURE(*status)){
425 return 0;
426 }
427 if ((stackBuffer && !pBufferSize) || !coll){
428 *status = U_ILLEGAL_ARGUMENT_ERROR;
429 return 0;
430 }
431 if (coll->rules && coll->freeRulesOnClose) {
432 rulesSize = (int32_t)(coll->rulesLength + 1)*sizeof(UChar);
433 rulesPadding = (int32_t)(bufferSizeNeeded % sizeof(UChar));
434 bufferSizeNeeded += rulesSize + rulesPadding;
435 }
436
437 if (stackBuffer && *pBufferSize <= 0){ /* 'preflighting' request - set needed size into *pBufferSize */
438 *pBufferSize = bufferSizeNeeded;
439 return 0;
440 }
441
442 /* Pointers on 64-bit platforms need to be aligned
443 * on a 64-bit boundry in memory.
444 */
445 if (U_ALIGNMENT_OFFSET(stackBuffer) != 0) {
446 int32_t offsetUp = (int32_t)U_ALIGNMENT_OFFSET_UP(stackBufferChars);
447 if (*pBufferSize > offsetUp) {
448 *pBufferSize -= offsetUp;
449 stackBufferChars += offsetUp;
450 }
451 else {
452 /* prevent using the stack buffer but keep the size > 0 so that we do not just preflight */
453 *pBufferSize = 1;
454 }
455 }
456 stackBuffer = (void *)stackBufferChars;
457
458 if (stackBuffer == NULL || *pBufferSize < bufferSizeNeeded) {
459 /* allocate one here...*/
460 stackBufferChars = (char *)uprv_malloc(bufferSizeNeeded);
461 colAllocated = TRUE;
462 if (U_SUCCESS(*status)) {
463 *status = U_SAFECLONE_ALLOCATED_WARNING;
464 }
465 }
466 localCollator = (UCollator *)stackBufferChars;
467 rules = (UChar *)(stackBufferChars + sizeof(UCollator) + rulesPadding);
468 {
469 UErrorCode tempStatus = U_ZERO_ERROR;
470 imageSize = ucol_cloneBinary(coll, NULL, 0, &tempStatus);
471 }
472 if (coll->freeImageOnClose) {
473 image = (uint8_t *)uprv_malloc(imageSize);
474 ucol_cloneBinary(coll, image, imageSize, status);
475 imageAllocated = TRUE;
476 }
477 else {
478 image = (uint8_t *)coll->image;
479 }
480 localCollator = ucol_initFromBinary(image, imageSize, coll->UCA, localCollator, status);
481 if (U_FAILURE(*status)) {
482 return NULL;
483 }
484
485 if (coll->rules) {
486 if (coll->freeRulesOnClose) {
487 localCollator->rules = u_strcpy(rules, coll->rules);
488 //bufferEnd += rulesSize;
489 }
490 else {
491 localCollator->rules = coll->rules;
492 }
493 localCollator->freeRulesOnClose = FALSE;
494 localCollator->rulesLength = coll->rulesLength;
495 }
496
497 int32_t i;
498 for(i = 0; i < UCOL_ATTRIBUTE_COUNT; i++) {
499 ucol_setAttribute(localCollator, (UColAttribute)i, ucol_getAttribute(coll, (UColAttribute)i, status), status);
500 }
501 localCollator->requestedLocale = NULL; // zero copies of pointers
502 localCollator->validLocale = NULL;
503 localCollator->rb = NULL;
504 localCollator->elements = NULL;
505 localCollator->freeOnClose = colAllocated;
506 localCollator->freeImageOnClose = imageAllocated;
507 return localCollator;
508 }
509
510 U_CAPI void U_EXPORT2
ucol_close(UCollator * coll)511 ucol_close(UCollator *coll)
512 {
513 UTRACE_ENTRY_OC(UTRACE_UCOL_CLOSE);
514 UTRACE_DATA1(UTRACE_INFO, "coll = %p", coll);
515 if(coll != NULL) {
516 // these are always owned by each UCollator struct,
517 // so we always free them
518 if(coll->validLocale != NULL) {
519 uprv_free(coll->validLocale);
520 }
521 if(coll->requestedLocale != NULL) {
522 uprv_free(coll->requestedLocale);
523 }
524 if(coll->resCleaner != NULL) {
525 coll->resCleaner(coll);
526 }
527 if(coll->latinOneCEs != NULL) {
528 uprv_free(coll->latinOneCEs);
529 }
530 if(coll->options != NULL && coll->freeOptionsOnClose) {
531 uprv_free(coll->options);
532 }
533 if(coll->rules != NULL && coll->freeRulesOnClose) {
534 uprv_free((UChar *)coll->rules);
535 }
536 if(coll->image != NULL && coll->freeImageOnClose) {
537 uprv_free((UCATableHeader *)coll->image);
538 }
539
540 /* Here, it would be advisable to close: */
541 /* - UData for UCA (unless we stuff it in the root resb */
542 /* Again, do we need additional housekeeping... HMMM! */
543 UTRACE_DATA1(UTRACE_INFO, "coll->freeOnClose: %d", coll->freeOnClose);
544 if(coll->freeOnClose){
545 /* for safeClone, if freeOnClose is FALSE,
546 don't free the other instance data */
547 uprv_free(coll);
548 }
549 }
550 UTRACE_EXIT();
551 }
552
553 /* This one is currently used by genrb & tests. After constructing from rules (tailoring),*/
554 /* you should be able to get the binary chunk to write out... Doesn't look very full now */
555 U_CFUNC uint8_t* U_EXPORT2
ucol_cloneRuleData(const UCollator * coll,int32_t * length,UErrorCode * status)556 ucol_cloneRuleData(const UCollator *coll, int32_t *length, UErrorCode *status)
557 {
558 uint8_t *result = NULL;
559 if(U_FAILURE(*status)) {
560 return NULL;
561 }
562 if(coll->hasRealData == TRUE) {
563 *length = coll->image->size;
564 result = (uint8_t *)uprv_malloc(*length);
565 /* test for NULL */
566 if (result == NULL) {
567 *status = U_MEMORY_ALLOCATION_ERROR;
568 return NULL;
569 }
570 uprv_memcpy(result, coll->image, *length);
571 } else {
572 *length = (int32_t)(paddedsize(sizeof(UCATableHeader))+paddedsize(sizeof(UColOptionSet)));
573 result = (uint8_t *)uprv_malloc(*length);
574 /* test for NULL */
575 if (result == NULL) {
576 *status = U_MEMORY_ALLOCATION_ERROR;
577 return NULL;
578 }
579
580 /* build the UCATableHeader with minimal entries */
581 /* do not copy the header from the UCA file because its values are wrong! */
582 /* uprv_memcpy(result, UCA->image, sizeof(UCATableHeader)); */
583
584 /* reset everything */
585 uprv_memset(result, 0, *length);
586
587 /* set the tailoring-specific values */
588 UCATableHeader *myData = (UCATableHeader *)result;
589 myData->size = *length;
590
591 /* offset for the options, the only part of the data that is present after the header */
592 myData->options = sizeof(UCATableHeader);
593
594 /* need to always set the expansion value for an upper bound of the options */
595 myData->expansion = myData->options + sizeof(UColOptionSet);
596
597 myData->magic = UCOL_HEADER_MAGIC;
598 myData->isBigEndian = U_IS_BIG_ENDIAN;
599 myData->charSetFamily = U_CHARSET_FAMILY;
600
601 /* copy UCA's version; genrb will override all but the builder version with tailoring data */
602 uprv_memcpy(myData->version, coll->image->version, sizeof(UVersionInfo));
603
604 uprv_memcpy(myData->UCAVersion, coll->image->UCAVersion, sizeof(UVersionInfo));
605 uprv_memcpy(myData->UCDVersion, coll->image->UCDVersion, sizeof(UVersionInfo));
606 uprv_memcpy(myData->formatVersion, coll->image->formatVersion, sizeof(UVersionInfo));
607 myData->jamoSpecial = coll->image->jamoSpecial;
608
609 /* copy the collator options */
610 uprv_memcpy(result+paddedsize(sizeof(UCATableHeader)), coll->options, sizeof(UColOptionSet));
611 }
612 return result;
613 }
614
ucol_setOptionsFromHeader(UCollator * result,UColOptionSet * opts,UErrorCode * status)615 void ucol_setOptionsFromHeader(UCollator* result, UColOptionSet * opts, UErrorCode *status) {
616 if(U_FAILURE(*status)) {
617 return;
618 }
619 result->caseFirst = (UColAttributeValue)opts->caseFirst;
620 result->caseLevel = (UColAttributeValue)opts->caseLevel;
621 result->frenchCollation = (UColAttributeValue)opts->frenchCollation;
622 result->normalizationMode = (UColAttributeValue)opts->normalizationMode;
623 result->strength = (UColAttributeValue)opts->strength;
624 result->variableTopValue = opts->variableTopValue;
625 result->alternateHandling = (UColAttributeValue)opts->alternateHandling;
626 result->hiraganaQ = (UColAttributeValue)opts->hiraganaQ;
627 result->numericCollation = (UColAttributeValue)opts->numericCollation;
628
629 result->caseFirstisDefault = TRUE;
630 result->caseLevelisDefault = TRUE;
631 result->frenchCollationisDefault = TRUE;
632 result->normalizationModeisDefault = TRUE;
633 result->strengthisDefault = TRUE;
634 result->variableTopValueisDefault = TRUE;
635 result->hiraganaQisDefault = TRUE;
636 result->numericCollationisDefault = TRUE;
637
638 ucol_updateInternalState(result, status);
639
640 result->options = opts;
641 }
642
643
644 /**
645 * Approximate determination if a character is at a contraction end.
646 * Guaranteed to be TRUE if a character is at the end of a contraction,
647 * otherwise it is not deterministic.
648 * @param c character to be determined
649 * @param coll collator
650 */
651 static
ucol_contractionEndCP(UChar c,const UCollator * coll)652 inline UBool ucol_contractionEndCP(UChar c, const UCollator *coll) {
653 if (U16_IS_TRAIL(c)) {
654 return TRUE;
655 }
656
657 if (c < coll->minContrEndCP) {
658 return FALSE;
659 }
660
661 int32_t hash = c;
662 uint8_t htbyte;
663 if (hash >= UCOL_UNSAFECP_TABLE_SIZE*8) {
664 hash = (hash & UCOL_UNSAFECP_TABLE_MASK) + 256;
665 }
666 htbyte = coll->contrEndCP[hash>>3];
667 return (((htbyte >> (hash & 7)) & 1) == 1);
668 }
669
670
671
672 /*
673 * i_getCombiningClass()
674 * A fast, at least partly inline version of u_getCombiningClass()
675 * This is a candidate for further optimization. Used heavily
676 * in contraction processing.
677 */
678 static
i_getCombiningClass(UChar32 c,const UCollator * coll)679 inline uint8_t i_getCombiningClass(UChar32 c, const UCollator *coll) {
680 uint8_t sCC = 0;
681 if ((c >= 0x300 && ucol_unsafeCP(c, coll)) || c > 0xFFFF) {
682 sCC = u_getCombiningClass(c);
683 }
684 return sCC;
685 }
686
/**
 * Initializes a collator on top of a binary collation image.
 * The collator does not copy the image: its tables point into it.
 * @param image  collation image; NULL makes the function return NULL
 * @param fillIn caller-provided storage for the collator, or NULL to have
 *               the struct allocated (freeOnClose is set accordingly)
 * @param UCA    collator stored as the UCA of the new collator
 * @param status error code; set to U_MEMORY_ALLOCATION_ERROR if the struct
 *               can not be allocated
 * @return the initialized collator. If the mapping trie can not be
 *         unserialized, an allocated struct is freed and NULL is returned,
 *         while caller-provided storage is returned as is with the failure
 *         recorded in status.
 */
UCollator* ucol_initCollator(const UCATableHeader *image, UCollator *fillIn, const UCollator *UCA, UErrorCode *status) {
    UChar c;
    UCollator *result = fillIn;
    if(U_FAILURE(*status) || image == NULL) {
        return NULL;
    }

    if(result != NULL) {
        result->freeOnClose = FALSE;
    } else {
        result = (UCollator *)uprv_malloc(sizeof(UCollator));
        if(result == NULL) {
            *status = U_MEMORY_ALLOCATION_ERROR;
            return NULL;
        }
        result->freeOnClose = TRUE;
    }

    /* all offsets in the header are byte offsets from the start of the image */
    uint8_t *imageBytes = (uint8_t *)image;

    result->image = image;
    result->mapping.getFoldingOffset = _getFoldingOffset;
    const uint8_t *mapping = imageBytes + image->mappingPosition;
    utrie_unserialize(&result->mapping, mapping, image->endExpansionCE - image->mappingPosition, status);
    if(U_FAILURE(*status)) {
        if(result->freeOnClose == TRUE) {
            uprv_free(result);
            return NULL;
        }
        return result;
    }

    /*result->latinOneMapping = (uint32_t*)((uint8_t*)result->image+result->image->latinOneMapping);*/
    result->latinOneMapping = UTRIE_GET32_LATIN1(&result->mapping);
    result->contractionCEs = (uint32_t*)(imageBytes + image->contractionCEs);
    result->contractionIndex = (UChar*)(imageBytes + image->contractionIndex);
    result->expansion = (uint32_t*)(imageBytes + image->expansion);

    /* the option set lives inside the image and is therefore not owned */
    UColOptionSet *opts = (UColOptionSet*)(imageBytes + image->options);
    result->options = opts;
    result->freeOptionsOnClose = FALSE;

    /* set attributes */
    result->caseFirst = (UColAttributeValue)opts->caseFirst;
    result->caseLevel = (UColAttributeValue)opts->caseLevel;
    result->frenchCollation = (UColAttributeValue)opts->frenchCollation;
    result->normalizationMode = (UColAttributeValue)opts->normalizationMode;
    result->strength = (UColAttributeValue)opts->strength;
    result->variableTopValue = opts->variableTopValue;
    result->alternateHandling = (UColAttributeValue)opts->alternateHandling;
    result->hiraganaQ = (UColAttributeValue)opts->hiraganaQ;
    result->numericCollation = (UColAttributeValue)opts->numericCollation;

    result->caseFirstisDefault = TRUE;
    result->caseLevelisDefault = TRUE;
    result->frenchCollationisDefault = TRUE;
    result->normalizationModeisDefault = TRUE;
    result->strengthisDefault = TRUE;
    result->variableTopValueisDefault = TRUE;
    result->alternateHandlingisDefault = TRUE;
    result->hiraganaQisDefault = TRUE;
    result->numericCollationisDefault = TRUE;

    /*result->scriptOrder = NULL;*/

    result->rules = NULL;
    result->rulesLength = 0;

    /* get the version info from UCATableHeader and populate the Collator struct*/
    result->dataVersion[0] = image->version[0]; /* UCA Builder version*/
    result->dataVersion[1] = image->version[1]; /* UCA Tailoring rules version*/
    result->dataVersion[2] = 0;
    result->dataVersion[3] = 0;

    /* Find the smallest unsafe char. The minimum is zeroed first so that
       the lookup itself is not cut short by a stale value. */
    result->unsafeCP = imageBytes + image->unsafeCP;
    result->minUnsafeCP = 0;
    for (c=0; c<0x300; c++) {
        if (ucol_unsafeCP(c, result)) {
            break;
        }
    }
    result->minUnsafeCP = c;

    /* Same search for the smallest contraction-ending char. */
    result->contrEndCP = imageBytes + image->contrEndCP;
    result->minContrEndCP = 0;
    for (c=0; c<0x300; c++) {
        if (ucol_contractionEndCP(c, result)) {
            break;
        }
    }
    result->minContrEndCP = c;

    /* max expansion tables */
    result->endExpansionCE = (uint32_t*)(imageBytes + image->endExpansionCE);
    result->lastEndExpansionCE = result->endExpansionCE +
                                 image->endExpansionCECount - 1;
    result->expansionCESize = imageBytes + image->expansionCESize;

    //result->errorCode = *status;

    result->latinOneCEs = NULL;

    result->latinOneRegenTable = FALSE;
    result->latinOneFailed = FALSE;
    result->UCA = UCA;
    result->resCleaner = NULL;

    ucol_updateInternalState(result, status);

    /* Normally these will be set correctly later. This is the default if you use UCA or the default. */
    result->rb = NULL;
    result->elements = NULL;
    result->validLocale = NULL;
    result->requestedLocale = NULL;
    result->hasRealData = FALSE; // real data lives in .dat file...
    result->freeImageOnClose = FALSE;

    return result;
}
802
803 /* new Mark's code */
804
805 /**
806 * For generation of Implicit CEs
807 * @author Davis
808 *
809 * Cleaned up so that changes can be made more easily.
810 * Old values:
811 # First Implicit: E26A792D
812 # Last Implicit: E3DC70C0
813 # First CJK: E0030300
814 # Last CJK: E0A9DD00
815 # First CJK_A: E0A9DF00
816 # Last CJK_A: E0DE3100
817 */
818 /* Following is a port of Mark's code for new treatment of implicits.
819 * It is positioned here, since ucol_initUCA need to initialize the
820 * variables below according to the data in the fractional UCA.
821 */
822
823 /**
824 * Function used to:
825 * a) collapse the 2 different Han ranges from UCA into one (in the right order), and
826 * b) bump any non-CJK characters by 10FFFF.
827 * The relevant blocks are:
828 * A: 4E00..9FFF; CJK Unified Ideographs
829 * F900..FAFF; CJK Compatibility Ideographs
830 * B: 3400..4DBF; CJK Unified Ideographs Extension A
831 * 20000..XX; CJK Unified Ideographs Extension B (and others later on)
832 * As long as
833 * no new B characters are allocated between 4E00 and FAFF, and
834 * no new A characters are outside of this range,
835 * (very high probability) this simple code will work.
836 * The reordered blocks are:
837 * Block1 is CJK
838 * Block2 is CJK_COMPAT_USED
839 * Block3 is CJK_A
840 * (all contiguous)
841 * Any other CJK gets its normal code point
842 * Any non-CJK gets +10FFFF
843 * When we reorder Block1, we make sure that it is at the very start,
844 * so that it will use a 3-byte form.
 * Warning: we only pick up the compatibility characters that are
 * NOT decomposed, so that block is smaller!
847 */
848
// CONSTANTS
static const UChar32
    // added to every code point that is not remapped as CJK (see swapCJK)
    NON_CJK_OFFSET = 0x110000,
    UCOL_MAX_INPUT = 0x220001; // 2 * 0x110000 + 1

/**
 * Parameters of the implicit primary weight computation.
 * Precomputed by constructor
 * NOTE(review): presumably filled in by uprv_uca_initImplicitConstants(),
 * which is not part of the visible code - confirm.
 */
static int32_t
    final3Multiplier = 0,
    final4Multiplier = 0,
    final3Count = 0,
    final4Count = 0,
    medialCount = 0,
    min3Primary = 0,
    min4Primary = 0,
    max4Primary = 0,
    minTrail = 0,
    maxTrail = 0,
    max3Trail = 0,
    max4Trail = 0,
    min4Boundary = 0;

// Boundaries of the CJK blocks that swapCJK() reorders.
// Each *_LIMIT is one past the last code point of its block.
static const UChar32
    CJK_BASE = 0x4E00,
    CJK_LIMIT = 0x9FFF+1,
    CJK_COMPAT_USED_BASE = 0xFA0E,
    CJK_COMPAT_USED_LIMIT = 0xFA2F+1,
    CJK_A_BASE = 0x3400,
    CJK_A_LIMIT = 0x4DBF+1,
    CJK_B_BASE = 0x20000,
    CJK_B_LIMIT = 0x2A6DF+1;
881
swapCJK(UChar32 i)882 static UChar32 swapCJK(UChar32 i) {
883
884 if (i >= CJK_BASE) {
885 if (i < CJK_LIMIT) return i - CJK_BASE;
886
887 if (i < CJK_COMPAT_USED_BASE) return i + NON_CJK_OFFSET;
888
889 if (i < CJK_COMPAT_USED_LIMIT) return i - CJK_COMPAT_USED_BASE
890 + (CJK_LIMIT - CJK_BASE);
891 if (i < CJK_B_BASE) return i + NON_CJK_OFFSET;
892
893 if (i < CJK_B_LIMIT) return i; // non-BMP-CJK
894
895 return i + NON_CJK_OFFSET; // non-CJK
896 }
897 if (i < CJK_A_BASE) return i + NON_CJK_OFFSET;
898
899 if (i < CJK_A_LIMIT) return i - CJK_A_BASE
900 + (CJK_LIMIT - CJK_BASE)
901 + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE);
902 return i + NON_CJK_OFFSET; // non-CJK
903 }
904
/**
 * Converts a code point to its raw value for implicit weight generation:
 * the reordered value computed by swapCJK(), plus one.
 * @param i code point
 * @return raw value
 */
U_CAPI UChar32 U_EXPORT2
uprv_uca_getRawFromCodePoint(UChar32 i) {
    return swapCJK(i)+1;
}
909
910 U_CAPI UChar32 U_EXPORT2
uprv_uca_getCodePointFromRaw(UChar32 i)911 uprv_uca_getCodePointFromRaw(UChar32 i) {
912 i--;
913 UChar32 result = 0;
914 if(i >= NON_CJK_OFFSET) {
915 result = i - NON_CJK_OFFSET;
916 } else if(i >= CJK_B_BASE) {
917 result = i;
918 } else if(i < CJK_A_LIMIT + (CJK_LIMIT - CJK_BASE) + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE)) { // rest of CJKs, compacted
919 if(i < CJK_LIMIT - CJK_BASE) {
920 result = i + CJK_BASE;
921 } else if(i < (CJK_LIMIT - CJK_BASE) + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE)) {
922 result = i + CJK_COMPAT_USED_BASE - (CJK_LIMIT - CJK_BASE);
923 } else {
924 result = i + CJK_A_BASE - (CJK_LIMIT - CJK_BASE) - (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE);
925 }
926 } else {
927 result = -1;
928 }
929 return result;
930 }
931
932 // GET IMPLICIT PRIMARY WEIGHTS
933 // Return value is left justified primary key
934 U_CAPI uint32_t U_EXPORT2
uprv_uca_getImplicitFromRaw(UChar32 cp)935 uprv_uca_getImplicitFromRaw(UChar32 cp) {
936 /*
937 if (cp < 0 || cp > UCOL_MAX_INPUT) {
938 throw new IllegalArgumentException("Code point out of range " + Utility.hex(cp));
939 }
940 */
941 int32_t last0 = cp - min4Boundary;
942 if (last0 < 0) {
943 int32_t last1 = cp / final3Count;
944 last0 = cp % final3Count;
945
946 int32_t last2 = last1 / medialCount;
947 last1 %= medialCount;
948
949 last0 = minTrail + last0*final3Multiplier; // spread out, leaving gap at start
950 last1 = minTrail + last1; // offset
951 last2 = min3Primary + last2; // offset
952 /*
953 if (last2 >= min4Primary) {
954 throw new IllegalArgumentException("4-byte out of range: " + Utility.hex(cp) + ", " + Utility.hex(last2));
955 }
956 */
957 return (last2 << 24) + (last1 << 16) + (last0 << 8);
958 } else {
959 int32_t last1 = last0 / final4Count;
960 last0 %= final4Count;
961
962 int32_t last2 = last1 / medialCount;
963 last1 %= medialCount;
964
965 int32_t last3 = last2 / medialCount;
966 last2 %= medialCount;
967
968 last0 = minTrail + last0*final4Multiplier; // spread out, leaving gap at start
969 last1 = minTrail + last1; // offset
970 last2 = minTrail + last2; // offset
971 last3 = min4Primary + last3; // offset
972 /*
973 if (last3 > max4Primary) {
974 throw new IllegalArgumentException("4-byte out of range: " + Utility.hex(cp) + ", " + Utility.hex(last3));
975 }
976 */
977 return (last3 << 24) + (last2 << 16) + (last1 << 8) + last0;
978 }
979 }
980
981 static uint32_t U_EXPORT2
uprv_uca_getImplicitPrimary(UChar32 cp)982 uprv_uca_getImplicitPrimary(UChar32 cp) {
983 //if (DEBUG) System.out.println("Incoming: " + Utility.hex(cp));
984
985 cp = swapCJK(cp);
986 cp++;
987 // we now have a range of numbers from 0 to 21FFFF.
988
989 //if (DEBUG) System.out.println("CJK swapped: " + Utility.hex(cp));
990
991 return uprv_uca_getImplicitFromRaw(cp);
992 }
993
994 /**
995 * Converts implicit CE into raw integer ("code point")
996 * @param implicit
997 * @return -1 if illegal format
998 */
999 U_CAPI UChar32 U_EXPORT2
uprv_uca_getRawFromImplicit(uint32_t implicit)1000 uprv_uca_getRawFromImplicit(uint32_t implicit) {
1001 UChar32 result;
1002 UChar32 b3 = implicit & 0xFF;
1003 implicit >>= 8;
1004 UChar32 b2 = implicit & 0xFF;
1005 implicit >>= 8;
1006 UChar32 b1 = implicit & 0xFF;
1007 implicit >>= 8;
1008 UChar32 b0 = implicit & 0xFF;
1009
1010 // simple parameter checks
1011 if (b0 < min3Primary || b0 > max4Primary
1012 || b1 < minTrail || b1 > maxTrail) return -1;
1013 // normal offsets
1014 b1 -= minTrail;
1015
1016 // take care of the final values, and compose
1017 if (b0 < min4Primary) {
1018 if (b2 < minTrail || b2 > max3Trail || b3 != 0) return -1;
1019 b2 -= minTrail;
1020 UChar32 remainder = b2 % final3Multiplier;
1021 if (remainder != 0) return -1;
1022 b0 -= min3Primary;
1023 b2 /= final3Multiplier;
1024 result = ((b0 * medialCount) + b1) * final3Count + b2;
1025 } else {
1026 if (b2 < minTrail || b2 > maxTrail
1027 || b3 < minTrail || b3 > max4Trail) return -1;
1028 b2 -= minTrail;
1029 b3 -= minTrail;
1030 UChar32 remainder = b3 % final4Multiplier;
1031 if (remainder != 0) return -1;
1032 b3 /= final4Multiplier;
1033 b0 -= min4Primary;
1034 result = (((b0 * medialCount) + b1) * medialCount + b2) * final4Count + b3 + min4Boundary;
1035 }
1036 // final check
1037 if (result < 0 || result > UCOL_MAX_INPUT) return -1;
1038 return result;
1039 }
1040
1041
/**
 * Computes 1 + (a-1)/b with integer division. For positive a and b this is
 * a/b rounded up.
 */
static inline int32_t divideAndRoundUp(int a, int b) {
    const int wholeSteps = (a - 1) / b;
    return wholeSteps + 1;
}
1045
1046 /* this function is either called from initUCA or from genUCA before
1047 * doing canonical closure for the UCA.
1048 */
1049
1050 /**
1051 * Set up to generate implicits.
1052 * @param minPrimary
1053 * @param maxPrimary
1054 * @param minTrail final byte
1055 * @param maxTrail final byte
1056 * @param gap3 the gap we leave for tailoring for 3-byte forms
1057 * @param gap4 the gap we leave for tailoring for 4-byte forms
1058 */
initImplicitConstants(int minPrimary,int maxPrimary,int minTrailIn,int maxTrailIn,int gap3,int primaries3count,UErrorCode * status)1059 static void initImplicitConstants(int minPrimary, int maxPrimary,
1060 int minTrailIn, int maxTrailIn,
1061 int gap3, int primaries3count,
1062 UErrorCode *status) {
1063 // some simple parameter checks
1064 if (minPrimary < 0 || minPrimary >= maxPrimary || maxPrimary > 0xFF) {
1065 *status = U_ILLEGAL_ARGUMENT_ERROR;
1066 return;
1067 };
1068 if (minTrailIn < 0 || minTrailIn >= maxTrailIn || maxTrailIn > 0xFF) {
1069 *status = U_ILLEGAL_ARGUMENT_ERROR;
1070 return;
1071 };
1072 if (primaries3count < 1) {
1073 *status = U_ILLEGAL_ARGUMENT_ERROR;
1074 return;
1075 };
1076
1077 minTrail = minTrailIn;
1078 maxTrail = maxTrailIn;
1079
1080 min3Primary = minPrimary;
1081 max4Primary = maxPrimary;
1082 // compute constants for use later.
1083 // number of values we can use in trailing bytes
1084 // leave room for empty values between AND above, e.g. if gap = 2
1085 // range 3..7 => +3 -4 -5 -6 -7: so 1 value
1086 // range 3..8 => +3 -4 -5 +6 -7 -8: so 2 values
1087 // range 3..9 => +3 -4 -5 +6 -7 -8 -9: so 2 values
1088 final3Multiplier = gap3 + 1;
1089 final3Count = (maxTrail - minTrail + 1) / final3Multiplier;
1090 max3Trail = minTrail + (final3Count - 1) * final3Multiplier;
1091
1092 // medials can use full range
1093 medialCount = (maxTrail - minTrail + 1);
1094 // find out how many values fit in each form
1095 int32_t threeByteCount = medialCount * final3Count;
1096 // now determine where the 3/4 boundary is.
1097 // we use 3 bytes below the boundary, and 4 above
1098 int32_t primariesAvailable = maxPrimary - minPrimary + 1;
1099 int32_t primaries4count = primariesAvailable - primaries3count;
1100
1101
1102 int32_t min3ByteCoverage = primaries3count * threeByteCount;
1103 min4Primary = minPrimary + primaries3count;
1104 min4Boundary = min3ByteCoverage;
1105 // Now expand out the multiplier for the 4 bytes, and redo.
1106
1107 int32_t totalNeeded = UCOL_MAX_INPUT - min4Boundary;
1108 int32_t neededPerPrimaryByte = divideAndRoundUp(totalNeeded, primaries4count);
1109 //if (DEBUG) System.out.println("neededPerPrimaryByte: " + neededPerPrimaryByte);
1110 int32_t neededPerFinalByte = divideAndRoundUp(neededPerPrimaryByte, medialCount * medialCount);
1111 //if (DEBUG) System.out.println("neededPerFinalByte: " + neededPerFinalByte);
1112 int32_t gap4 = (maxTrail - minTrail - 1) / neededPerFinalByte;
1113 //if (DEBUG) System.out.println("expandedGap: " + gap4);
1114 if (gap4 < 1) {
1115 *status = U_ILLEGAL_ARGUMENT_ERROR;
1116 return;
1117 }
1118 final4Multiplier = gap4 + 1;
1119 final4Count = neededPerFinalByte;
1120 max4Trail = minTrail + (final4Count - 1) * final4Multiplier;
1121 /*
1122 if (DEBUG) {
1123 System.out.println("final4Count: " + final4Count);
1124 for (int counter = 0; counter <= final4Count; ++counter) {
1125 int value = minTrail + (1 + counter)*final4Multiplier;
1126 System.out.println(counter + "\t" + value + "\t" + Utility.hex(value));
1127 }
1128 }
1129 */
1130 }
1131
1132 /**
1133 * Supply parameters for generating implicit CEs
1134 */
1135 U_CAPI void U_EXPORT2
uprv_uca_initImplicitConstants(int32_t,int32_t,UErrorCode * status)1136 uprv_uca_initImplicitConstants(int32_t, int32_t, UErrorCode *status) {
1137 // 13 is the largest 4-byte gap we can use without getting 2 four-byte forms.
1138 //initImplicitConstants(minPrimary, maxPrimary, 0x04, 0xFE, 1, 1, status);
1139 initImplicitConstants(minImplicitPrimary, maxImplicitPrimary, 0x04, 0xFE, 1, 1, status);
1140 }
1141
1142 U_CDECL_BEGIN
1143 static UBool U_CALLCONV
ucol_cleanup(void)1144 ucol_cleanup(void)
1145 {
1146 if (UCA_DATA_MEM) {
1147 udata_close(UCA_DATA_MEM);
1148 UCA_DATA_MEM = NULL;
1149 }
1150 if (_staticUCA) {
1151 ucol_close(_staticUCA);
1152 _staticUCA = NULL;
1153 }
1154 fcdTrieIndex = NULL;
1155 return TRUE;
1156 }
1157 U_CDECL_END
1158
1159 /* do not close UCA returned by ucol_initUCA! */
1160 UCollator *
ucol_initUCA(UErrorCode * status)1161 ucol_initUCA(UErrorCode *status) {
1162 if(U_FAILURE(*status)) {
1163 return NULL;
1164 }
1165 UBool needsInit;
1166 UMTX_CHECK(NULL, (_staticUCA == NULL), needsInit);
1167
1168 if(needsInit) {
1169 UCollator *newUCA = NULL;
1170 UDataMemory *result = udata_openChoice(NULL, UCA_DATA_TYPE, UCA_DATA_NAME, isAcceptableUCA, NULL, status);
1171
1172 if(U_FAILURE(*status)) {
1173 if (result) {
1174 udata_close(result);
1175 }
1176 uprv_free(newUCA);
1177 }
1178
1179 // init FCD data
1180 if (fcdTrieIndex == NULL) {
1181 fcdTrieIndex = unorm_getFCDTrie(status);
1182 ucln_i18n_registerCleanup(UCLN_I18N_UCOL, ucol_cleanup);
1183 }
1184
1185 if(result != NULL) { /* It looks like sometimes we can fail to find the data file */
1186 newUCA = ucol_initCollator((const UCATableHeader *)udata_getMemory(result), newUCA, newUCA, status);
1187 if(U_SUCCESS(*status)){
1188 umtx_lock(NULL);
1189 if(_staticUCA == NULL) {
1190 _staticUCA = newUCA;
1191 UCA_DATA_MEM = result;
1192 result = NULL;
1193 newUCA = NULL;
1194 }
1195 umtx_unlock(NULL);
1196
1197 if(newUCA != NULL) {
1198 udata_close(result);
1199 uprv_free(newUCA);
1200 }
1201 else {
1202 ucln_i18n_registerCleanup(UCLN_I18N_UCOL, ucol_cleanup);
1203 }
1204 // Initalize variables for implicit generation
1205 const UCAConstants *UCAconsts = (UCAConstants *)((uint8_t *)_staticUCA->image + _staticUCA->image->UCAConsts);
1206 uprv_uca_initImplicitConstants(UCAconsts->UCA_PRIMARY_IMPLICIT_MIN, UCAconsts->UCA_PRIMARY_IMPLICIT_MAX, status);
1207 //_staticUCA->mapping.getFoldingOffset = _getFoldingOffset;
1208 }else{
1209 udata_close(result);
1210 uprv_free(newUCA);
1211 _staticUCA= NULL;
1212 }
1213 }
1214 }
1215 return _staticUCA;
1216 }
1217
1218
1219 /* collIterNormalize Incremental Normalization happens here. */
1220 /* pick up the range of chars identifed by FCD, */
1221 /* normalize it into the collIterate's writable buffer, */
1222 /* switch the collIterate's state to use the writable buffer. */
1223 /* */
1224 static
collIterNormalize(collIterate * collationSource)1225 void collIterNormalize(collIterate *collationSource)
1226 {
1227 UErrorCode status = U_ZERO_ERROR;
1228
1229 int32_t normLen;
1230 UChar *srcP = collationSource->pos - 1; /* Start of chars to normalize */
1231 UChar *endP = collationSource->fcdPosition; /* End of region to normalize+1 */
1232
1233 normLen = unorm_decompose(collationSource->writableBuffer, (int32_t)collationSource->writableBufSize,
1234 srcP, (int32_t)(endP - srcP),
1235 FALSE, 0,
1236 &status);
1237 if(status == U_BUFFER_OVERFLOW_ERROR || status == U_STRING_NOT_TERMINATED_WARNING) {
1238 // reallocate and terminate
1239 if(!u_growBufferFromStatic(collationSource->stackWritableBuffer,
1240 &collationSource->writableBuffer,
1241 (int32_t *)&collationSource->writableBufSize, normLen + 1,
1242 0)
1243 ) {
1244 #ifdef UCOL_DEBUG
1245 fprintf(stderr, "collIterNormalize(), out of memory\n");
1246 #endif
1247 return;
1248 }
1249 status = U_ZERO_ERROR;
1250 normLen = unorm_decompose(collationSource->writableBuffer, (int32_t)collationSource->writableBufSize,
1251 srcP, (int32_t)(endP - srcP),
1252 FALSE, 0,
1253 &status);
1254 }
1255 if (U_FAILURE(status)) {
1256 #ifdef UCOL_DEBUG
1257 fprintf(stderr, "collIterNormalize(), unorm_decompose() failed, status = %s\n", u_errorName(status));
1258 #endif
1259 return;
1260 }
1261
1262 if(collationSource->writableBuffer != collationSource->stackWritableBuffer) {
1263 collationSource->flags |= UCOL_ITER_ALLOCATED;
1264 }
1265 collationSource->pos = collationSource->writableBuffer;
1266 collationSource->origFlags = collationSource->flags;
1267 collationSource->flags |= UCOL_ITER_INNORMBUF;
1268 collationSource->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR);
1269 }
1270
1271
1272 // This function takes the iterator and extracts normalized stuff up to the next boundary
1273 // It is similar in the end results to the collIterNormalize, but for the cases when we
1274 // use an iterator
1275 /*static
1276 inline void normalizeIterator(collIterate *collationSource) {
1277 UErrorCode status = U_ZERO_ERROR;
1278 UBool wasNormalized = FALSE;
1279 //int32_t iterIndex = collationSource->iterator->getIndex(collationSource->iterator, UITER_CURRENT);
1280 uint32_t iterIndex = collationSource->iterator->getState(collationSource->iterator);
1281 int32_t normLen = unorm_next(collationSource->iterator, collationSource->writableBuffer,
1282 (int32_t)collationSource->writableBufSize, UNORM_FCD, 0, TRUE, &wasNormalized, &status);
1283 if(status == U_BUFFER_OVERFLOW_ERROR || normLen == (int32_t)collationSource->writableBufSize) {
1284 // reallocate and terminate
1285 if(!u_growBufferFromStatic(collationSource->stackWritableBuffer,
1286 &collationSource->writableBuffer,
1287 (int32_t *)&collationSource->writableBufSize, normLen + 1,
1288 0)
1289 ) {
1290 #ifdef UCOL_DEBUG
1291 fprintf(stderr, "normalizeIterator(), out of memory\n");
1292 #endif
1293 return;
1294 }
1295 status = U_ZERO_ERROR;
1296 //collationSource->iterator->move(collationSource->iterator, iterIndex, UITER_ZERO);
1297 collationSource->iterator->setState(collationSource->iterator, iterIndex, &status);
1298 normLen = unorm_next(collationSource->iterator, collationSource->writableBuffer,
1299 (int32_t)collationSource->writableBufSize, UNORM_FCD, 0, TRUE, &wasNormalized, &status);
1300 }
1301 // Terminate the buffer - we already checked that it is big enough
1302 collationSource->writableBuffer[normLen] = 0;
1303 if(collationSource->writableBuffer != collationSource->stackWritableBuffer) {
1304 collationSource->flags |= UCOL_ITER_ALLOCATED;
1305 }
1306 collationSource->pos = collationSource->writableBuffer;
1307 collationSource->origFlags = collationSource->flags;
1308 collationSource->flags |= UCOL_ITER_INNORMBUF;
1309 collationSource->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR);
1310 }*/
1311
1312
1313 /* Incremental FCD check and normalize */
1314 /* Called from getNextCE when normalization state is suspect. */
1315 /* When entering, the state is known to be this: */
1316 /* o We are working in the main buffer of the collIterate, not the side */
1317 /* writable buffer. When in the side buffer, normalization mode is always off, */
1318 /* so we won't get here. */
1319 /* o The leading combining class from the current character is 0 or */
1320 /* the trailing combining class of the previous char was zero. */
1321 /* True because the previous call to this function will have always exited */
1322 /* that way, and we get called for every char where cc might be non-zero. */
1323 static
collIterFCD(collIterate * collationSource)1324 inline UBool collIterFCD(collIterate *collationSource) {
1325 UChar c, c2;
1326 const UChar *srcP, *endP;
1327 uint8_t leadingCC;
1328 uint8_t prevTrailingCC = 0;
1329 uint16_t fcd;
1330 UBool needNormalize = FALSE;
1331
1332 srcP = collationSource->pos-1;
1333
1334 if (collationSource->flags & UCOL_ITER_HASLEN) {
1335 endP = collationSource->endp;
1336 } else {
1337 endP = NULL;
1338 }
1339
1340 // Get the trailing combining class of the current character. If it's zero,
1341 // we are OK.
1342 c = *srcP++;
1343 /* trie access */
1344 fcd = unorm_getFCD16(fcdTrieIndex, c);
1345 if (fcd != 0) {
1346 if (U16_IS_LEAD(c)) {
1347 if ((endP == NULL || srcP != endP) && U16_IS_TRAIL(c2=*srcP)) {
1348 ++srcP;
1349 fcd = unorm_getFCD16FromSurrogatePair(fcdTrieIndex, fcd, c2);
1350 } else {
1351 fcd = 0;
1352 }
1353 }
1354
1355 prevTrailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);
1356
1357 if (prevTrailingCC != 0) {
1358 // The current char has a non-zero trailing CC. Scan forward until we find
1359 // a char with a leading cc of zero.
1360 while (endP == NULL || srcP != endP)
1361 {
1362 const UChar *savedSrcP = srcP;
1363
1364 c = *srcP++;
1365 /* trie access */
1366 fcd = unorm_getFCD16(fcdTrieIndex, c);
1367 if (fcd != 0 && U16_IS_LEAD(c)) {
1368 if ((endP == NULL || srcP != endP) && U16_IS_TRAIL(c2=*srcP)) {
1369 ++srcP;
1370 fcd = unorm_getFCD16FromSurrogatePair(fcdTrieIndex, fcd, c2);
1371 } else {
1372 fcd = 0;
1373 }
1374 }
1375 leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_);
1376 if (leadingCC == 0) {
1377 srcP = savedSrcP; // Hit char that is not part of combining sequence.
1378 // back up over it. (Could be surrogate pair!)
1379 break;
1380 }
1381
1382 if (leadingCC < prevTrailingCC) {
1383 needNormalize = TRUE;
1384 }
1385
1386 prevTrailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);
1387 }
1388 }
1389 }
1390
1391 collationSource->fcdPosition = (UChar *)srcP;
1392
1393 return needNormalize;
1394 }
1395
1396 /****************************************************************************/
1397 /* Following are the CE retrieval functions */
1398 /* */
1399 /****************************************************************************/
1400
1401 static uint32_t getImplicit(UChar32 cp, collIterate *collationSource);
1402 static uint32_t getPrevImplicit(UChar32 cp, collIterate *collationSource);
1403
1404 /* there should be a macro version of this function in the header file */
1405 /* This is the first function that tries to fetch a collation element */
1406 /* If it's not succesfull or it encounters a more difficult situation */
1407 /* some more sofisticated and slower functions are invoked */
1408 static
ucol_IGetNextCE(const UCollator * coll,collIterate * collationSource,UErrorCode * status)1409 inline uint32_t ucol_IGetNextCE(const UCollator *coll, collIterate *collationSource, UErrorCode *status) {
1410 uint32_t order = 0;
1411 if (collationSource->CEpos > collationSource->toReturn) { /* Are there any CEs from previous expansions? */
1412 order = *(collationSource->toReturn++); /* if so, return them */
1413 if(collationSource->CEpos == collationSource->toReturn) {
1414 collationSource->CEpos = collationSource->toReturn = collationSource->CEs;
1415 }
1416 return order;
1417 }
1418
1419 UChar ch = 0;
1420
1421 for (;;) /* Loop handles case when incremental normalize switches */
1422 { /* to or from the side buffer / original string, and we */
1423 /* need to start again to get the next character. */
1424
1425 if ((collationSource->flags & (UCOL_ITER_HASLEN | UCOL_ITER_INNORMBUF | UCOL_ITER_NORM | UCOL_HIRAGANA_Q | UCOL_USE_ITERATOR)) == 0)
1426 {
1427 // The source string is null terminated and we're not working from the side buffer,
1428 // and we're not normalizing. This is the fast path.
1429 // (We can be in the side buffer for Thai pre-vowel reordering even when not normalizing.)
1430 ch = *collationSource->pos++;
1431 if (ch != 0) {
1432 break;
1433 }
1434 else {
1435 return UCOL_NO_MORE_CES;
1436 }
1437 }
1438
1439 if (collationSource->flags & UCOL_ITER_HASLEN) {
1440 // Normal path for strings when length is specified.
1441 // (We can't be in side buffer because it is always null terminated.)
1442 if (collationSource->pos >= collationSource->endp) {
1443 // Ran off of the end of the main source string. We're done.
1444 return UCOL_NO_MORE_CES;
1445 }
1446 ch = *collationSource->pos++;
1447 }
1448 else if(collationSource->flags & UCOL_USE_ITERATOR) {
1449 UChar32 iterCh = collationSource->iterator->next(collationSource->iterator);
1450 if(iterCh == U_SENTINEL) {
1451 return UCOL_NO_MORE_CES;
1452 }
1453 ch = (UChar)iterCh;
1454 }
1455 else
1456 {
1457 // Null terminated string.
1458 ch = *collationSource->pos++;
1459 if (ch == 0) {
1460 // Ran off end of buffer.
1461 if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) {
1462 // Ran off end of main string. backing up one character.
1463 collationSource->pos--;
1464 return UCOL_NO_MORE_CES;
1465 }
1466 else
1467 {
1468 // Hit null in the normalize side buffer.
1469 // Usually this means the end of the normalized data,
1470 // except for one odd case: a null followed by combining chars,
1471 // which is the case if we are at the start of the buffer.
1472 if (collationSource->pos == collationSource->writableBuffer+1) {
1473 break;
1474 }
1475
1476 // Null marked end of side buffer.
1477 // Revert to the main string and
1478 // loop back to top to try again to get a character.
1479 collationSource->pos = collationSource->fcdPosition;
1480 collationSource->flags = collationSource->origFlags;
1481 continue;
1482 }
1483 }
1484 }
1485
1486 if(collationSource->flags&UCOL_HIRAGANA_Q) {
1487 if((ch>=0x3040 && ch<=0x3094) || ch == 0x309d || ch == 0x309e) {
1488 collationSource->flags |= UCOL_WAS_HIRAGANA;
1489 } else {
1490 collationSource->flags &= ~UCOL_WAS_HIRAGANA;
1491 }
1492 }
1493
1494 // We've got a character. See if there's any fcd and/or normalization stuff to do.
1495 // Note that UCOL_ITER_NORM flag is always zero when we are in the side buffer.
1496 if ((collationSource->flags & UCOL_ITER_NORM) == 0) {
1497 break;
1498 }
1499
1500 if (collationSource->fcdPosition >= collationSource->pos) {
1501 // An earlier FCD check has already covered the current character.
1502 // We can go ahead and process this char.
1503 break;
1504 }
1505
1506 if (ch < ZERO_CC_LIMIT_ ) {
1507 // Fast fcd safe path. Trailing combining class == 0. This char is OK.
1508 break;
1509 }
1510
1511 if (ch < NFC_ZERO_CC_BLOCK_LIMIT_) {
1512 // We need to peek at the next character in order to tell if we are FCD
1513 if ((collationSource->flags & UCOL_ITER_HASLEN) && collationSource->pos >= collationSource->endp) {
1514 // We are at the last char of source string.
1515 // It is always OK for FCD check.
1516 break;
1517 }
1518
1519 // Not at last char of source string (or we'll check against terminating null). Do the FCD fast test
1520 if (*collationSource->pos < NFC_ZERO_CC_BLOCK_LIMIT_) {
1521 break;
1522 }
1523 }
1524
1525
1526 // Need a more complete FCD check and possible normalization.
1527 if (collIterFCD(collationSource)) {
1528 collIterNormalize(collationSource);
1529 }
1530 if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) {
1531 // No normalization was needed. Go ahead and process the char we already had.
1532 break;
1533 }
1534
1535 // Some normalization happened. Next loop iteration will pick up a char
1536 // from the normalization buffer.
1537
1538 } // end for (;;)
1539
1540
1541 if (ch <= 0xFF) {
1542 /* For latin-1 characters we never need to fall back to the UCA table */
1543 /* because all of the UCA data is replicated in the latinOneMapping array */
1544 order = coll->latinOneMapping[ch];
1545 if (order > UCOL_NOT_FOUND) {
1546 order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, status);
1547 }
1548 }
1549 else
1550 {
1551 order = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
1552 if(order > UCOL_NOT_FOUND) { /* if a CE is special */
1553 order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, status); /* and try to get the special CE */
1554 }
1555 if(order == UCOL_NOT_FOUND && coll->UCA) { /* We couldn't find a good CE in the tailoring */
1556 /* if we got here, the codepoint MUST be over 0xFF - so we look directly in the trie */
1557 order = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);
1558
1559 if(order > UCOL_NOT_FOUND) { /* UCA also gives us a special CE */
1560 order = ucol_prv_getSpecialCE(coll->UCA, ch, order, collationSource, status);
1561 }
1562 }
1563 }
1564 if(order == UCOL_NOT_FOUND) {
1565 order = getImplicit(ch, collationSource);
1566 }
1567 return order; /* return the CE */
1568 }
1569
1570 /* ucol_getNextCE, out-of-line version for use from other files. */
1571 U_CAPI uint32_t U_EXPORT2
ucol_getNextCE(const UCollator * coll,collIterate * collationSource,UErrorCode * status)1572 ucol_getNextCE(const UCollator *coll, collIterate *collationSource, UErrorCode *status) {
1573 return ucol_IGetNextCE(coll, collationSource, status);
1574 }
1575
1576
1577 /**
1578 * Incremental previous normalization happens here. Pick up the range of chars
1579 * identifed by FCD, normalize it into the collIterate's writable buffer,
1580 * switch the collIterate's state to use the writable buffer.
1581 * @param data collation iterator data
1582 */
1583 static
collPrevIterNormalize(collIterate * data)1584 void collPrevIterNormalize(collIterate *data)
1585 {
1586 UErrorCode status = U_ZERO_ERROR;
1587 UChar *pEnd = data->pos; /* End normalize + 1 */
1588 UChar *pStart;
1589 uint32_t normLen;
1590 UChar *pStartNorm;
1591
1592 /* Start normalize */
1593 if (data->fcdPosition == NULL) {
1594 pStart = data->string;
1595 }
1596 else {
1597 pStart = data->fcdPosition + 1;
1598 }
1599
1600 normLen = unorm_normalize(pStart, (pEnd - pStart) + 1, UNORM_NFD, 0,
1601 data->writableBuffer, 0, &status);
1602
1603 if (data->writableBufSize <= normLen) {
1604 freeHeapWritableBuffer(data);
1605 data->writableBuffer = (UChar *)uprv_malloc((normLen + 1) *
1606 sizeof(UChar));
1607 if(data->writableBuffer == NULL) { // something is wrong here, return
1608 return;
1609 }
1610 data->flags |= UCOL_ITER_ALLOCATED;
1611 /* to handle the zero termination */
1612 data->writableBufSize = normLen + 1;
1613 }
1614 status = U_ZERO_ERROR;
1615 /*
1616 this puts the null termination infront of the normalized string instead
1617 of the end
1618 */
1619 pStartNorm = data->writableBuffer + (data->writableBufSize - normLen);
1620 *(pStartNorm - 1) = 0;
1621 unorm_normalize(pStart, (pEnd - pStart) + 1, UNORM_NFD, 0, pStartNorm,
1622 normLen, &status);
1623
1624 data->pos = data->writableBuffer + data->writableBufSize;
1625 data->origFlags = data->flags;
1626 data->flags |= UCOL_ITER_INNORMBUF;
1627 data->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
1628 }
1629
1630
1631 /**
1632 * Incremental FCD check for previous iteration and normalize. Called from
1633 * getPrevCE when normalization state is suspect.
1634 * When entering, the state is known to be this:
1635 * o We are working in the main buffer of the collIterate, not the side
1636 * writable buffer. When in the side buffer, normalization mode is always
1637 * off, so we won't get here.
1638 * o The leading combining class from the current character is 0 or the
1639 * trailing combining class of the previous char was zero.
1640 * True because the previous call to this function will have always exited
1641 * that way, and we get called for every char where cc might be non-zero.
1642 * @param data collation iterate struct
1643 * @return normalization status, TRUE for normalization to be done, FALSE
1644 * otherwise
1645 */
1646 static
collPrevIterFCD(collIterate * data)1647 inline UBool collPrevIterFCD(collIterate *data)
1648 {
1649 const UChar *src, *start;
1650 UChar c, c2;
1651 uint8_t leadingCC;
1652 uint8_t trailingCC = 0;
1653 uint16_t fcd;
1654 UBool result = FALSE;
1655
1656 start = data->string;
1657 src = data->pos + 1;
1658
1659 /* Get the trailing combining class of the current character. */
1660 c = *--src;
1661 if (!U16_IS_SURROGATE(c)) {
1662 fcd = unorm_getFCD16(fcdTrieIndex, c);
1663 } else if (U16_IS_TRAIL(c) && start < src && U16_IS_LEAD(c2 = *(src - 1))) {
1664 --src;
1665 fcd = unorm_getFCD16(fcdTrieIndex, c2);
1666 if (fcd != 0) {
1667 fcd = unorm_getFCD16FromSurrogatePair(fcdTrieIndex, fcd, c);
1668 }
1669 } else /* unpaired surrogate */ {
1670 fcd = 0;
1671 }
1672
1673 leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_);
1674
1675 if (leadingCC != 0) {
1676 /*
1677 The current char has a non-zero leading combining class.
1678 Scan backward until we find a char with a trailing cc of zero.
1679 */
1680 for (;;)
1681 {
1682 if (start == src) {
1683 data->fcdPosition = NULL;
1684 return result;
1685 }
1686
1687 c = *--src;
1688 if (!U16_IS_SURROGATE(c)) {
1689 fcd = unorm_getFCD16(fcdTrieIndex, c);
1690 } else if (U16_IS_TRAIL(c) && start < src && U16_IS_LEAD(c2 = *(src - 1))) {
1691 --src;
1692 fcd = unorm_getFCD16(fcdTrieIndex, c2);
1693 if (fcd != 0) {
1694 fcd = unorm_getFCD16FromSurrogatePair(fcdTrieIndex, fcd, c);
1695 }
1696 } else /* unpaired surrogate */ {
1697 fcd = 0;
1698 }
1699
1700 trailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);
1701
1702 if (trailingCC == 0) {
1703 break;
1704 }
1705
1706 if (leadingCC < trailingCC) {
1707 result = TRUE;
1708 }
1709
1710 leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_);
1711 }
1712 }
1713
1714 data->fcdPosition = (UChar *)src;
1715
1716 return result;
1717 }
1718
1719 /** gets a character from the string at a given offset
1720 * Handles both normal and iterative cases.
1721 * No error checking - caller beware!
1722 */
1723 inline static
peekCharacter(collIterate * source,int32_t offset)1724 UChar peekCharacter(collIterate *source, int32_t offset) {
1725 if(source->pos != NULL) {
1726 return *(source->pos + offset);
1727 } else if(source->iterator != NULL) {
1728 if(offset != 0) {
1729 source->iterator->move(source->iterator, offset, UITER_CURRENT);
1730 UChar toReturn = (UChar)source->iterator->next(source->iterator);
1731 source->iterator->move(source->iterator, -offset-1, UITER_CURRENT);
1732 return toReturn;
1733 } else {
1734 return (UChar)source->iterator->current(source->iterator);
1735 }
1736 } else {
1737 return (UChar)U_SENTINEL;
1738 }
1739 }
1740
1741 /**
1742 * Determines if we are at the start of the data string in the backwards
1743 * collation iterator
1744 * @param data collation iterator
1745 * @return TRUE if we are at the start
1746 */
1747 static
isAtStartPrevIterate(collIterate * data)1748 inline UBool isAtStartPrevIterate(collIterate *data) {
1749 if(data->pos == NULL && data->iterator != NULL) {
1750 return !data->iterator->hasPrevious(data->iterator);
1751 }
1752 //return (collIter_bos(data)) ||
1753 return (data->pos == data->string) ||
1754 ((data->flags & UCOL_ITER_INNORMBUF) &&
1755 *(data->pos - 1) == 0 && data->fcdPosition == NULL);
1756 }
1757
1758 static
goBackOne(collIterate * data)1759 inline void goBackOne(collIterate *data) {
1760 # if 0
1761 // somehow, it looks like we need to keep iterator synced up
1762 // at all times, as above.
1763 if(data->pos) {
1764 data->pos--;
1765 }
1766 if(data->iterator) {
1767 data->iterator->previous(data->iterator);
1768 }
1769 #endif
1770 if(data->iterator && (data->flags & UCOL_USE_ITERATOR)) {
1771 data->iterator->previous(data->iterator);
1772 }
1773 if(data->pos) {
1774 data->pos --;
1775 }
1776 }
1777
/**
 * Inline function that gets a simple CE while iterating backwards.
 * It first checks the expansion buffer: if collation elements are still
 * pending there (toReturn is above the start of the CE buffer), toReturn is
 * decremented and the collation element it then points at is returned.
 * Otherwise the previous code unit is fetched from the source (string with
 * length, character iterator, or null-delimited buffer), FCD-checked and
 * normalized when required, and mapped to a CE. Special CEs are resolved
 * through ucol_prv_getSpecialPrevCE, and code units that stay unmapped go
 * through getPrevImplicit.
 * @param coll collator data
 * @param data collation iterator struct
 * @param status error status
 * @return the previous collation element, or UCOL_NO_MORE_CES when the start
 *         of the source has been reached
 */
static
inline uint32_t ucol_IGetPrevCE(const UCollator *coll, collIterate *data,
                               UErrorCode *status)
{
    uint32_t result = (uint32_t)UCOL_NULLORDER;
    if (data->toReturn > data->CEs) {
        /* CEs are still pending in the expansion buffer: hand out the next one. */
        data->toReturn --;
        result = *(data->toReturn);
        if (data->CEs == data->toReturn) {
            /* Buffer drained: reset the write position to its start. */
            data->CEpos = data->toReturn;
        }
    }
    else {
        UChar ch = 0;
        /*
        Loop handles case when incremental normalize switches to or from the
        side buffer / original string, and we need to start again to get the
        next character.
        */
        for (;;) {
            if (data->flags & UCOL_ITER_HASLEN) {
                /*
                Normal path for strings when length is specified.
                Not in side buffer because it is always null terminated.
                */
                if (data->pos <= data->string) {
                    /* End of the main source string */
                    return UCOL_NO_MORE_CES;
                }
                data->pos --;
                ch = *data->pos;
            }
            // we are using an iterator to go back. Pray for us!
            else if (data->flags & UCOL_USE_ITERATOR) {
                UChar32 iterCh = data->iterator->previous(data->iterator);
                if(iterCh == U_SENTINEL) {
                    /* The iterator has nothing before its current position. */
                    return UCOL_NO_MORE_CES;
                } else {
                    ch = (UChar)iterCh;
                }
            }
            else {
                data->pos --;
                ch = *data->pos;
                /* we are in the side buffer. */
                if (ch == 0) {
                    /*
                    At the start of the normalize side buffer.
                    Go back to string.
                    Because pointer points to the last accessed character,
                    hence we have to increment it by one here.
                    */
                    if (data->fcdPosition == NULL) {
                        data->pos = data->string;
                        return UCOL_NO_MORE_CES;
                    }
                    else {
                        data->pos = data->fcdPosition + 1;
                    }
                    /* Restore the flags saved when the side buffer was entered. */
                    data->flags = data->origFlags;
                    continue;
                }
            }

            /* Track whether the code unit just read lies in 0x3040..0x309f. */
            if(data->flags&UCOL_HIRAGANA_Q) {
                if(ch>=0x3040 && ch<=0x309f) {
                    data->flags |= UCOL_WAS_HIRAGANA;
                } else {
                    data->flags &= ~UCOL_WAS_HIRAGANA;
                }
            }

            /*
            * got a character to determine if there's fcd and/or normalization
            * stuff to do.
            * if the current character is not fcd.
            * if current character is at the start of the string
            * Trailing combining class == 0.
            * Note if pos is in the writablebuffer, norm is always 0
            */
            if (ch < ZERO_CC_LIMIT_ ||
                // this should propel us out of the loop in the iterator case
                (data->flags & UCOL_ITER_NORM) == 0 ||
                (data->fcdPosition != NULL && data->fcdPosition <= data->pos)
                || data->string == data->pos) {
                break;
            }

            if (ch < NFC_ZERO_CC_BLOCK_LIMIT_) {
                /* if next character is FCD */
                if (data->pos == data->string) {
                    /* First char of string is always OK for FCD check */
                    break;
                }

                /* Not first char of string, do the FCD fast test */
                if (*(data->pos - 1) < NFC_ZERO_CC_BLOCK_LIMIT_) {
                    break;
                }
            }

            /* Need a more complete FCD check and possible normalization. */
            if (collPrevIterFCD(data)) {
                collPrevIterNormalize(data);
            }

            if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
                /* No normalization. Go ahead and process the char. */
                break;
            }

            /*
            Some normalization happened.
            Next loop picks up a char from the normalization buffer.
            */
        }

        /* attempt to handle contractions, after removal of the backwards
        contraction
        */
        if (ucol_contractionEndCP(ch, coll) && !isAtStartPrevIterate(data)) {
            result = ucol_prv_getSpecialPrevCE(coll, ch, UCOL_CONTRACTION, data, status);
        } else {
            /* Code units up to 0xFF use the latinOneMapping table, the rest the trie. */
            if (ch <= 0xFF) {
                result = coll->latinOneMapping[ch];
            }
            else {
                result = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
            }
            /* Values above UCOL_NOT_FOUND are special CEs that need resolving. */
            if (result > UCOL_NOT_FOUND) {
                result = ucol_prv_getSpecialPrevCE(coll, ch, result, data, status);
            }
            if (result == UCOL_NOT_FOUND) { // Not found in master list
                if (!isAtStartPrevIterate(data) &&
                    ucol_contractionEndCP(ch, data->coll)) {
                    result = UCOL_CONTRACTION;
                } else {
                    /* Fall back to the mapping of coll->UCA when there is one. */
                    if(coll->UCA) {
                        result = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);
                    }
                }

                if (result > UCOL_NOT_FOUND) {
                    if(coll->UCA) {
                        result = ucol_prv_getSpecialPrevCE(coll->UCA, ch, result, data, status);
                    }
                }
            }
        }
        /* Still unmapped: let getPrevImplicit produce the CE. */
        if(result == UCOL_NOT_FOUND) {
            result = getPrevImplicit(ch, data);
        }
    }
    return result;
}
1944
1945
1946 /* ucol_getPrevCE, out-of-line version for use from other files. */
1947 U_CFUNC uint32_t U_EXPORT2
ucol_getPrevCE(const UCollator * coll,collIterate * data,UErrorCode * status)1948 ucol_getPrevCE(const UCollator *coll, collIterate *data,
1949 UErrorCode *status) {
1950 return ucol_IGetPrevCE(coll, data, status);
1951 }
1952
1953
1954 /* this should be connected to special Jamo handling */
1955 U_CFUNC uint32_t U_EXPORT2
ucol_getFirstCE(const UCollator * coll,UChar u,UErrorCode * status)1956 ucol_getFirstCE(const UCollator *coll, UChar u, UErrorCode *status) {
1957 collIterate colIt;
1958 uint32_t order;
1959 IInit_collIterate(coll, &u, 1, &colIt);
1960 order = ucol_IGetNextCE(coll, &colIt, status);
1961 /*UCOL_GETNEXTCE(order, coll, colIt, status);*/
1962 return order;
1963 }
1964
1965 /**
1966 * Inserts the argument character into the end of the buffer pushing back the
1967 * null terminator.
1968 * @param data collIterate struct data
1969 * @param pNull pointer to the null termination
1970 * @param ch character to be appended
1971 * @return the position of the new addition
1972 */
1973 static
insertBufferEnd(collIterate * data,UChar * pNull,UChar ch)1974 inline UChar * insertBufferEnd(collIterate *data, UChar *pNull, UChar ch)
1975 {
1976 uint32_t size = data->writableBufSize;
1977 UChar *newbuffer;
1978 const uint32_t incsize = 5;
1979
1980 if ((data->writableBuffer + size) > (pNull + 1)) {
1981 *pNull = ch;
1982 *(pNull + 1) = 0;
1983 return pNull;
1984 }
1985
1986 /*
1987 buffer will always be null terminated at the end.
1988 giving extra space since it is likely that more characters will be added.
1989 */
1990 size += incsize;
1991 newbuffer = (UChar *)uprv_malloc(sizeof(UChar) * size);
1992 if(newbuffer != NULL) { // something wrong, but no status
1993 uprv_memcpy(newbuffer, data->writableBuffer,
1994 data->writableBufSize * sizeof(UChar));
1995
1996 freeHeapWritableBuffer(data);
1997 data->writableBufSize = size;
1998 data->writableBuffer = newbuffer;
1999
2000 newbuffer = newbuffer + data->writableBufSize;
2001 *newbuffer = ch;
2002 *(newbuffer + 1) = 0;
2003 }
2004 return newbuffer;
2005 }
2006
2007 /**
2008 * Inserts the argument string into the end of the buffer pushing back the
2009 * null terminator.
2010 * @param data collIterate struct data
2011 * @param pNull pointer to the null termination
2012 * @param string to be appended
2013 * @param length of the string to be appended
2014 * @return the position of the new addition
2015 */
2016 static
insertBufferEnd(collIterate * data,UChar * pNull,UChar * str,int32_t length)2017 inline UChar * insertBufferEnd(collIterate *data, UChar *pNull, UChar *str,
2018 int32_t length)
2019 {
2020 uint32_t size = pNull - data->writableBuffer;
2021 UChar *newbuffer;
2022
2023 if (data->writableBuffer + data->writableBufSize > pNull + length + 1) {
2024 uprv_memcpy(pNull, str, length * sizeof(UChar));
2025 *(pNull + length) = 0;
2026 return pNull;
2027 }
2028
2029 /*
2030 buffer will always be null terminated at the end.
2031 giving extra space since it is likely that more characters will be added.
2032 */
2033 newbuffer = (UChar *)uprv_malloc(sizeof(UChar) * (size + length + 1));
2034 if(newbuffer != NULL) {
2035 uprv_memcpy(newbuffer, data->writableBuffer, size * sizeof(UChar));
2036 uprv_memcpy(newbuffer + size, str, length * sizeof(UChar));
2037
2038 freeHeapWritableBuffer(data);
2039 data->writableBufSize = size + length + 1;
2040 data->writableBuffer = newbuffer;
2041 }
2042
2043 return newbuffer;
2044 }
2045
2046 /**
2047 * Special normalization function for contraction in the forwards iterator.
2048 * This normalization sequence will place the current character at source->pos
2049 * and its following normalized sequence into the buffer.
2050 * The fcd position, pos will be changed.
2051 * pos will now point to positions in the buffer.
2052 * Flags will be changed accordingly.
2053 * @param data collation iterator data
2054 */
2055 static
normalizeNextContraction(collIterate * data)2056 inline void normalizeNextContraction(collIterate *data)
2057 {
2058 UChar *buffer = data->writableBuffer;
2059 uint32_t buffersize = data->writableBufSize;
2060 uint32_t strsize;
2061 UErrorCode status = U_ZERO_ERROR;
2062 /* because the pointer points to the next character */
2063 UChar *pStart = data->pos - 1;
2064 UChar *pEnd;
2065 uint32_t normLen;
2066 UChar *pStartNorm;
2067
2068 if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
2069 *data->writableBuffer = *(pStart - 1);
2070 strsize = 1;
2071 }
2072 else {
2073 strsize = u_strlen(data->writableBuffer);
2074 }
2075
2076 pEnd = data->fcdPosition;
2077
2078 normLen = unorm_normalize(pStart, pEnd - pStart, UNORM_NFD, 0, buffer, 0,
2079 &status);
2080
2081 if (buffersize <= normLen + strsize) {
2082 uint32_t size = strsize + normLen + 1;
2083 UChar *temp = (UChar *)uprv_malloc(size * sizeof(UChar));
2084 if(temp != NULL) {
2085 uprv_memcpy(temp, buffer, sizeof(UChar) * strsize);
2086 freeHeapWritableBuffer(data);
2087 data->writableBuffer = temp;
2088 data->writableBufSize = size;
2089 data->flags |= UCOL_ITER_ALLOCATED;
2090 }
2091 }
2092
2093 status = U_ZERO_ERROR;
2094 pStartNorm = buffer + strsize;
2095 /* null-termination will be added here */
2096 unorm_normalize(pStart, pEnd - pStart, UNORM_NFD, 0, pStartNorm,
2097 normLen + 1, &status);
2098
2099 data->pos = data->writableBuffer + strsize;
2100 data->origFlags = data->flags;
2101 data->flags |= UCOL_ITER_INNORMBUF;
2102 data->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
2103 }
2104
/**
 * Contraction character management function that returns the next character
 * for the forwards iterator.
 * Does nothing if the next character is in buffer and not the first character
 * in it.
 * Else it checks next character in data string to see if it is normalizable.
 * If it is not, the character is simply copied into the buffer, else
 * the whole normalized substring is copied into the buffer, including the
 * current character.
 * The position (data->pos, or the character iterator when it is in use) is
 * advanced past the returned character.
 * @param data collation element iterator data
 * @return next character
 */
static
inline UChar getNextNormalizedChar(collIterate *data)
{
    UChar  nextch;
    UChar  ch;
    // Here we need to add the iterator code. One problem is the way
    // end of string is handled. If we just return next char, it could
    // be the sentinel. Most of the cases already check for this, but we
    // need to be sure.
    if ((data->flags & (UCOL_ITER_NORM | UCOL_ITER_INNORMBUF)) == 0 ) {
        /* if no normalization and not in buffer. */
        if(data->flags & UCOL_USE_ITERATOR) {
            return (UChar)data->iterator->next(data->iterator);
        } else {
            return *(data->pos ++);
        }
    }

    //if (data->flags & UCOL_ITER_NORM && data->flags & UCOL_USE_ITERATOR) {
    //normalizeIterator(data);
    //}

    /* Set below to the end of the buffered text when reading moves back to the string. */
    UChar  *pEndWritableBuffer = NULL;
    UBool  innormbuf = (UBool)(data->flags & UCOL_ITER_INNORMBUF);
    if ((innormbuf && *data->pos != 0) ||
        (data->fcdPosition != NULL && !innormbuf &&
        data->pos < data->fcdPosition)) {
        /*
        if next character is in normalized buffer, no further normalization
        is required
        */
        return *(data->pos ++);
    }

    if (data->flags & UCOL_ITER_HASLEN) {
        /* in data string */
        if (data->pos + 1 == data->endp) {
            /* Last unit of the string: nothing can follow it. */
            return *(data->pos ++);
        }
    }
    else {
        if (innormbuf) {
            // inside the normalization buffer, but at the end
            // (since we encountered zero). This means, in the
            // case we're using char iterator, that we need to
            // do another round of normalization.
            //if(data->origFlags & UCOL_USE_ITERATOR) {
            // we need to restore original flags,
            // otherwise, we'll lose them
            //data->flags = data->origFlags;
            //normalizeIterator(data);
            //return *(data->pos++);
            //} else {
            /*
            in writable buffer, at this point fcdPosition can not be
            pointing to the end of the data string. see contracting tag.
            */
            if(data->fcdPosition) {
                if (*(data->fcdPosition + 1) == 0 ||
                    data->fcdPosition + 1 == data->endp) {
                    /* at the end of the string, dump it into the normalizer */
                    data->pos = insertBufferEnd(data, data->pos,
                                                *(data->fcdPosition)) + 1;
                    return *(data->fcdPosition ++);
                }
                /* Remember the buffer end, then continue reading from the string. */
                pEndWritableBuffer = data->pos;
                data->pos = data->fcdPosition;
            } else if(data->origFlags & UCOL_USE_ITERATOR) {
                // if we are here, we're using a normalizing iterator.
                // we should just continue further.
                data->flags = data->origFlags;
                data->pos = NULL;
                return (UChar)data->iterator->next(data->iterator);
            }
            //}
        }
        else {
            if (*(data->pos + 1) == 0) {
                /* Last unit before the terminator: nothing can follow it. */
                return *(data->pos ++);
            }
        }
    }

    ch = *data->pos ++;
    nextch = *data->pos;

    /*
    * if the current character is not fcd.
    * Trailing combining class == 0.
    */
    if ((data->fcdPosition == NULL || data->fcdPosition < data->pos) &&
        (nextch >= NFC_ZERO_CC_BLOCK_LIMIT_ ||
        ch >= NFC_ZERO_CC_BLOCK_LIMIT_)) {
        /*
        Need a more complete FCD check and possible normalization.
        normalize substring will be appended to buffer
        */
        if (collIterFCD(data)) {
            normalizeNextContraction(data);
            return *(data->pos ++);
        }
        else if (innormbuf) {
            /* fcdposition shifted even when there's no normalization, if we
            don't input the rest into this, we'll get the wrong position when
            we reach the end of the writableBuffer */
            int32_t length = data->fcdPosition - data->pos + 1;
            data->pos = insertBufferEnd(data, pEndWritableBuffer,
                                        data->pos - 1, length);
            return *(data->pos ++);
        }
    }

    if (innormbuf) {
        /*
        no normalization is to be done hence only one character will be
        appended to the buffer.
        */
        data->pos = insertBufferEnd(data, pEndWritableBuffer, ch) + 1;
    }

    /* points back to the pos in string */
    return ch;
}
2240
2241
2242
2243 /**
2244 * Function to copy the buffer into writableBuffer and sets the fcd position to
2245 * the correct position
2246 * @param source data string source
2247 * @param buffer character buffer
2248 * @param tempdb current position in buffer that has been used up
2249 */
2250 static
setDiscontiguosAttribute(collIterate * source,UChar * buffer,UChar * tempdb)2251 inline void setDiscontiguosAttribute(collIterate *source, UChar *buffer,
2252 UChar *tempdb)
2253 {
2254 /* okay confusing part here. to ensure that the skipped characters are
2255 considered later, we need to place it in the appropriate position in the
2256 normalization buffer and reassign the pos pointer. simple case if pos
2257 reside in string, simply copy to normalization buffer and
2258 fcdposition = pos, pos = start of normalization buffer. if pos in
2259 normalization buffer, we'll insert the copy infront of pos and point pos
2260 to the start of the normalization buffer. why am i doing these copies?
2261 well, so that the whole chunk of codes in the getNextCE, ucol_prv_getSpecialCE does
2262 not require any changes, which be really painful. */
2263 uint32_t length = u_strlen(buffer);;
2264 if (source->flags & UCOL_ITER_INNORMBUF) {
2265 u_strcpy(tempdb, source->pos);
2266 }
2267 else {
2268 source->fcdPosition = source->pos;
2269 source->origFlags = source->flags;
2270 source->flags |= UCOL_ITER_INNORMBUF;
2271 source->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR);
2272 }
2273
2274 if (length >= source->writableBufSize) {
2275 freeHeapWritableBuffer(source);
2276 source->writableBuffer =
2277 (UChar *)uprv_malloc((length + 1) * sizeof(UChar));
2278 if(source->writableBuffer == NULL) {
2279 return;
2280 }
2281 source->writableBufSize = length;
2282 }
2283
2284 u_strcpy(source->writableBuffer, buffer);
2285 source->pos = source->writableBuffer;
2286 }
2287
/**
 * Function to get the discontiguos collation element within the source.
 * Note this function will set the position to the appropriate places.
 * Characters that do not take part in the match are collected in a local
 * buffer; when a match is found they are handed to setDiscontiguosAttribute
 * so that they are read again later. When nothing matches, the iterator state
 * saved on entry is restored and the CE stored for constart is returned.
 * @param coll current collator used
 * @param source data string source
 * @param constart index to the start character in the contraction table
 * @return discontiguos collation element offset
 */
static
uint32_t getDiscontiguous(const UCollator *coll, collIterate *source,
                          const UChar *constart)
{
    /* source->pos currently points to the second combining character after
    the start character */
    UChar          *temppos      = source->pos;
    /* Collects the skipped characters; tempdb is the write position in it. */
    UChar           buffer[4*UCOL_MAX_BUFFER];
    UChar          *tempdb       = buffer;
    const UChar    *tempconstart = constart;
    uint8_t         tempflags    = source->flags;
    UBool           multicontraction = FALSE;
    UChar          *tempbufferpos = 0;
    collIterateState discState;

    /* Saved so that a failed search can be undone at the end of the function. */
    backupState(source, &discState);

    //*tempdb = *(source->pos - 1);
    *tempdb = peekCharacter(source, -1);
    tempdb++;
    for (;;) {
        UChar    *UCharOffset;
        UChar     schar,
                  tchar;
        uint32_t  result;

        if (((source->flags & UCOL_ITER_HASLEN) && source->pos >= source->endp)
            || (peekCharacter(source, 0) == 0  &&
            //|| (*source->pos == 0  &&
            ((source->flags & UCOL_ITER_INNORMBUF) == 0 ||
            source->fcdPosition == NULL ||
            source->fcdPosition == source->endp ||
            *(source->fcdPosition) == 0 ||
            u_getCombiningClass(*(source->fcdPosition)) == 0)) ||
            /* end of string in null terminated string or stopped by a
            null character, note fcd does not always point to a base
            character after the discontiguos change */
            u_getCombiningClass(peekCharacter(source, 0)) == 0) {
            //u_getCombiningClass(*(source->pos)) == 0) {
            //constart = (UChar *)coll->image + getContractOffset(CE);
            if (multicontraction) {
                /* Use the multi-contraction recorded earlier: cut the buffer
                and move pos back to where it was recorded. */
                *tempbufferpos = 0;
                source->pos    = temppos - 1;
                setDiscontiguosAttribute(source, buffer, tempdb);
                return *(coll->contractionCEs +
                    (tempconstart - coll->contractionIndex));
            }
            constart = tempconstart;
            break;
        }

        UCharOffset = (UChar *)(tempconstart + 1); /* skip the backward offset*/
        schar = getNextNormalizedChar(source);

        /* Advance through the table while schar is greater than the entry. */
        while (schar > (tchar = *UCharOffset)) {
            UCharOffset++;
        }

        if (schar != tchar) {
            /* not the correct codepoint. we stuff the current codepoint into
            the discontiguos buffer and try the next character */
            *tempdb = schar;
            tempdb ++;
            continue;
        }
        else {
            /* Same combining class as the character two positions back:
            treat the character as skipped as well. */
            if (u_getCombiningClass(schar) ==
                u_getCombiningClass(peekCharacter(source, -2))) {
                //u_getCombiningClass(*(source->pos - 2))) {
                *tempdb = schar;
                tempdb ++;
                continue;
            }
            result = *(coll->contractionCEs +
                (UCharOffset - coll->contractionIndex));
        }
        /* Terminate the skipped characters collected so far. */
        *tempdb = 0;

        if (result == UCOL_NOT_FOUND) {
            break;
        } else if (isContraction(result)) {
            /* this is a multi-contraction*/
            tempconstart = (UChar *)coll->image + getContractOffset(result);
            if (*(coll->contractionCEs + (constart - coll->contractionIndex))
                != UCOL_NOT_FOUND) {
                multicontraction = TRUE;
                temppos       = source->pos + 1;
                tempbufferpos = buffer + u_strlen(buffer);
            }
        } else {
            setDiscontiguosAttribute(source, buffer, tempdb);
            return result;
        }
    }

    /* no problems simply reverting just like that,
    if we are in string before getting into this function, points back to
    string hence no problem.
    if we are in normalization buffer before getting into this function,
    since we'll never use another normalization within this function, we
    know that fcdposition points to a base character. the normalization buffer
    never change, hence this revert works. */
    loadState(source, &discState, TRUE);
    goBackOne(source);

    //source->pos   = temppos - 1;
    source->flags = tempflags;
    return *(coll->contractionCEs + (constart - coll->contractionIndex));
}
2405
2406 static
isNonChar(UChar32 cp)2407 inline UBool isNonChar(UChar32 cp) {
2408 if ((cp & 0xFFFE) == 0xFFFE || (0xFDD0 <= cp && cp <= 0xFDEF) || (0xD800 <= cp && cp <= 0xDFFF)) {
2409 return TRUE;
2410 }
2411 return FALSE;
2412 }
2413
2414 /* now uses Mark's getImplicitPrimary code */
2415 static
getImplicit(UChar32 cp,collIterate * collationSource)2416 inline uint32_t getImplicit(UChar32 cp, collIterate *collationSource) {
2417 if(isNonChar(cp)) {
2418 return 0;
2419 }
2420 uint32_t r = uprv_uca_getImplicitPrimary(cp);
2421 *(collationSource->CEpos++) = ((r & 0x0000FFFF)<<16) | 0x000000C0;
2422 return (r & UCOL_PRIMARYMASK) | 0x00000505; // This was 'order'
2423 }
2424
2425 /**
2426 * Inserts the argument character into the front of the buffer replacing the
2427 * front null terminator.
2428 * @param data collation element iterator data
2429 * @param pNull pointer to the null terminator
2430 * @param ch character to be appended
2431 * @return positon of added character
2432 */
2433 static
insertBufferFront(collIterate * data,UChar * pNull,UChar ch)2434 inline UChar * insertBufferFront(collIterate *data, UChar *pNull, UChar ch)
2435 {
2436 uint32_t size = data->writableBufSize;
2437 UChar *end;
2438 UChar *newbuffer;
2439 const uint32_t incsize = 5;
2440
2441 if (pNull > data->writableBuffer + 1) {
2442 *pNull = ch;
2443 *(pNull - 1) = 0;
2444 return pNull;
2445 }
2446
2447 /*
2448 buffer will always be null terminated infront.
2449 giving extra space since it is likely that more characters will be added.
2450 */
2451 size += incsize;
2452 newbuffer = (UChar *)uprv_malloc(sizeof(UChar) * size);
2453 if(newbuffer == NULL) {
2454 return NULL;
2455 }
2456 end = newbuffer + incsize;
2457 uprv_memcpy(end, data->writableBuffer,
2458 data->writableBufSize * sizeof(UChar));
2459 *end = ch;
2460 *(end - 1) = 0;
2461
2462 freeHeapWritableBuffer(data);
2463
2464 data->writableBufSize = size;
2465 data->writableBuffer = newbuffer;
2466 return end;
2467 }
2468
2469 /**
2470 * Special normalization function for contraction in the previous iterator.
2471 * This normalization sequence will place the current character at source->pos
2472 * and its following normalized sequence into the buffer.
2473 * The fcd position, pos will be changed.
2474 * pos will now point to positions in the buffer.
2475 * Flags will be changed accordingly.
2476 * @param data collation iterator data
2477 */
2478 static
normalizePrevContraction(collIterate * data,UErrorCode * status)2479 inline void normalizePrevContraction(collIterate *data, UErrorCode *status)
2480 {
2481 UChar *buffer = data->writableBuffer;
2482 uint32_t buffersize = data->writableBufSize;
2483 uint32_t nulltermsize;
2484 UErrorCode localstatus = U_ZERO_ERROR;
2485 UChar *pEnd = data->pos + 1; /* End normalize + 1 */
2486 UChar *pStart;
2487 uint32_t normLen;
2488 UChar *pStartNorm;
2489
2490 if (data->flags & UCOL_ITER_HASLEN) {
2491 /*
2492 normalization buffer not used yet, we'll pull down the next
2493 character into the end of the buffer
2494 */
2495 *(buffer + (buffersize - 1)) = *(data->pos + 1);
2496 nulltermsize = buffersize - 1;
2497 }
2498 else {
2499 nulltermsize = buffersize;
2500 UChar *temp = buffer + (nulltermsize - 1);
2501 while (*(temp --) != 0) {
2502 nulltermsize --;
2503 }
2504 }
2505
2506 /* Start normalize */
2507 if (data->fcdPosition == NULL) {
2508 pStart = data->string;
2509 }
2510 else {
2511 pStart = data->fcdPosition + 1;
2512 }
2513
2514 normLen = unorm_normalize(pStart, pEnd - pStart, UNORM_NFD, 0, buffer, 0,
2515 &localstatus);
2516
2517 if (nulltermsize <= normLen) {
2518 uint32_t size = buffersize - nulltermsize + normLen + 1;
2519 UChar *temp = (UChar *)uprv_malloc(size * sizeof(UChar));
2520 if (temp == NULL) {
2521 *status = U_MEMORY_ALLOCATION_ERROR;
2522 return;
2523 }
2524 nulltermsize = normLen + 1;
2525 uprv_memcpy(temp + normLen, buffer,
2526 sizeof(UChar) * (buffersize - nulltermsize));
2527 freeHeapWritableBuffer(data);
2528 data->writableBuffer = temp;
2529 data->writableBufSize = size;
2530 }
2531
2532 /*
2533 this puts the null termination infront of the normalized string instead
2534 of the end
2535 */
2536 pStartNorm = buffer + (nulltermsize - normLen);
2537 *(pStartNorm - 1) = 0;
2538 unorm_normalize(pStart, pEnd - pStart, UNORM_NFD, 0, pStartNorm, normLen,
2539 status);
2540
2541 data->pos = data->writableBuffer + nulltermsize;
2542 data->origFlags = data->flags;
2543 data->flags |= UCOL_ITER_INNORMBUF;
2544 data->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
2545 }
2546
2547 /**
2548 * Contraction character management function that returns the previous character
2549 * for the backwards iterator.
2550 * Does nothing if the previous character is in buffer and not the first
2551 * character in it.
2552 * Else it checks previous character in data string to see if it is
2553 * normalizable.
2554 * If it is not, the character is simply copied into the buffer, else
2555 * the whole normalized substring is copied into the buffer, including the
2556 * current character.
2557 * @param data collation element iterator data
2558 * @return previous character
2559 */
2560 static
getPrevNormalizedChar(collIterate * data,UErrorCode * status)2561 inline UChar getPrevNormalizedChar(collIterate *data, UErrorCode *status)
2562 {
2563 UChar prevch;
2564 UChar ch;
2565 UChar *start;
2566 UBool innormbuf = (UBool)(data->flags & UCOL_ITER_INNORMBUF);
2567 UChar *pNull = NULL;
2568 if ((data->flags & (UCOL_ITER_NORM | UCOL_ITER_INNORMBUF)) == 0 ||
2569 (innormbuf && *(data->pos - 1) != 0)) {
2570 /*
2571 if no normalization.
2572 if previous character is in normalized buffer, no further normalization
2573 is required
2574 */
2575 if(data->flags & UCOL_USE_ITERATOR) {
2576 data->iterator->move(data->iterator, -1, UITER_CURRENT);
2577 return (UChar)data->iterator->next(data->iterator);
2578 } else {
2579 return *(data->pos - 1);
2580 }
2581 }
2582
2583 start = data->pos;
2584 if (data->flags & UCOL_ITER_HASLEN) {
2585 /* in data string */
2586 if ((start - 1) == data->string) {
2587 return *(start - 1);
2588 }
2589 start --;
2590 ch = *start;
2591 prevch = *(start - 1);
2592 }
2593 else {
2594 /*
2595 in writable buffer, at this point fcdPosition can not be NULL.
2596 see contracting tag.
2597 */
2598 if (data->fcdPosition == data->string) {
2599 /* at the start of the string, just dump it into the normalizer */
2600 insertBufferFront(data, data->pos - 1, *(data->fcdPosition));
2601 data->fcdPosition = NULL;
2602 return *(data->pos - 1);
2603 }
2604 pNull = data->pos - 1;
2605 start = data->fcdPosition;
2606 ch = *start;
2607 prevch = *(start - 1);
2608 }
2609 /*
2610 * if the current character is not fcd.
2611 * Trailing combining class == 0.
2612 */
2613 if (data->fcdPosition > start &&
2614 (ch >= NFC_ZERO_CC_BLOCK_LIMIT_ || prevch >= NFC_ZERO_CC_BLOCK_LIMIT_))
2615 {
2616 /*
2617 Need a more complete FCD check and possible normalization.
2618 normalize substring will be appended to buffer
2619 */
2620 UChar *backuppos = data->pos;
2621 data->pos = start;
2622 if (collPrevIterFCD(data)) {
2623 normalizePrevContraction(data, status);
2624 return *(data->pos - 1);
2625 }
2626 data->pos = backuppos;
2627 data->fcdPosition ++;
2628 }
2629
2630 if (innormbuf) {
2631 /*
2632 no normalization is to be done hence only one character will be
2633 appended to the buffer.
2634 */
2635 insertBufferFront(data, pNull, ch);
2636 data->fcdPosition --;
2637 }
2638
2639 return ch;
2640 }
2641
2642 /* This function handles the special CEs like contractions, expansions, surrogates, Thai */
2643 /* It is called by getNextCE */
2644
ucol_prv_getSpecialCE(const UCollator * coll,UChar ch,uint32_t CE,collIterate * source,UErrorCode * status)2645 uint32_t ucol_prv_getSpecialCE(const UCollator *coll, UChar ch, uint32_t CE, collIterate *source, UErrorCode *status) {
2646 collIterateState entryState;
2647 backupState(source, &entryState);
2648 UChar32 cp = ch;
2649
2650 for (;;) {
2651 // This loop will repeat only in the case of contractions, and only when a contraction
2652 // is found and the first CE resulting from that contraction is itself a special
2653 // (an expansion, for example.) All other special CE types are fully handled the
2654 // first time through, and the loop exits.
2655
2656 const uint32_t *CEOffset = NULL;
2657 switch(getCETag(CE)) {
2658 case NOT_FOUND_TAG:
2659 /* This one is not found, and we'll let somebody else bother about it... no more games */
2660 return CE;
2661 case SURROGATE_TAG:
2662 /* we encountered a leading surrogate. We shall get the CE by using the following code unit */
2663 /* two things can happen here: next code point can be a trailing surrogate - we will use it */
2664 /* to retrieve the CE, or it is not a trailing surrogate (or the string is done). In that case */
2665 /* we return 0 (completely ignorable - per UCA specification */
2666 {
2667 UChar trail;
2668 collIterateState state;
2669 backupState(source, &state);
2670 if (collIter_eos(source) || !(U16_IS_TRAIL((trail = getNextNormalizedChar(source))))) {
2671 // we chould have stepped one char forward and it might have turned that it
2672 // was not a trail surrogate. In that case, we have to backup.
2673 loadState(source, &state, TRUE);
2674 return 0;
2675 } else {
2676 /* TODO: CE contain the data from the previous CE + the mask. It should at least be unmasked */
2677 CE = UTRIE_GET32_FROM_OFFSET_TRAIL(&coll->mapping, CE&0xFFFFFF, trail);
2678 if(CE == UCOL_NOT_FOUND) { // there are tailored surrogates in this block, but not this one.
2679 // We need to backup
2680 loadState(source, &state, TRUE);
2681 return CE;
2682 }
2683 // calculate the supplementary code point value, if surrogate was not tailored
2684 cp = ((((uint32_t)ch)<<10UL)+(trail)-(((uint32_t)0xd800<<10UL)+0xdc00-0x10000));
2685 }
2686 }
2687 break;
2688 case SPEC_PROC_TAG:
2689 {
2690 // Special processing is getting a CE that is preceded by a certain prefix
2691 // Currently this is only needed for optimizing Japanese length and iteration marks.
2692 // When we encouter a special processing tag, we go backwards and try to see if
2693 // we have a match.
2694 // Contraction tables are used - so the whole process is not unlike contraction.
2695 // prefix data is stored backwards in the table.
2696 const UChar *UCharOffset;
2697 UChar schar, tchar;
2698 collIterateState prefixState;
2699 backupState(source, &prefixState);
2700 loadState(source, &entryState, TRUE);
2701 goBackOne(source); // We want to look at the point where we entered - actually one
2702 // before that...
2703
2704 for(;;) {
2705 // This loop will run once per source string character, for as long as we
2706 // are matching a potential contraction sequence
2707
2708 // First we position ourselves at the begining of contraction sequence
2709 const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE);
2710 if (collIter_bos(source)) {
2711 CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
2712 break;
2713 }
2714 schar = getPrevNormalizedChar(source, status);
2715 goBackOne(source);
2716
2717 while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
2718 UCharOffset++;
2719 }
2720
2721 if (schar == tchar) {
2722 // Found the source string char in the table.
2723 // Pick up the corresponding CE from the table.
2724 CE = *(coll->contractionCEs +
2725 (UCharOffset - coll->contractionIndex));
2726 }
2727 else
2728 {
2729 // Source string char was not in the table.
2730 // We have not found the prefix.
2731 CE = *(coll->contractionCEs +
2732 (ContractionStart - coll->contractionIndex));
2733 }
2734
2735 if(!isPrefix(CE)) {
2736 // The source string char was in the contraction table, and the corresponding
2737 // CE is not a prefix CE. We found the prefix, break
2738 // out of loop, this CE will end up being returned. This is the normal
2739 // way out of prefix handling when the source actually contained
2740 // the prefix.
2741 break;
2742 }
2743 }
2744 if(CE != UCOL_NOT_FOUND) { // we found something and we can merilly continue
2745 loadState(source, &prefixState, TRUE);
2746 if(source->origFlags & UCOL_USE_ITERATOR) {
2747 source->flags = source->origFlags;
2748 }
2749 } else { // prefix search was a failure, we have to backup all the way to the start
2750 loadState(source, &entryState, TRUE);
2751 }
2752 break;
2753 }
2754 case CONTRACTION_TAG:
2755 {
2756 /* This should handle contractions */
2757 collIterateState state;
2758 backupState(source, &state);
2759 uint32_t firstCE = *(coll->contractionCEs + ((UChar *)coll->image+getContractOffset(CE) - coll->contractionIndex)); //UCOL_NOT_FOUND;
2760 const UChar *UCharOffset;
2761 UChar schar, tchar;
2762
2763 for (;;) {
2764 /* This loop will run once per source string character, for as long as we */
2765 /* are matching a potential contraction sequence */
2766
2767 /* First we position ourselves at the begining of contraction sequence */
2768 const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE);
2769
2770 if (collIter_eos(source)) {
2771 // Ran off the end of the source string.
2772 CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
2773 // So we'll pick whatever we have at the point...
2774 if (CE == UCOL_NOT_FOUND) {
2775 // back up the source over all the chars we scanned going into this contraction.
2776 CE = firstCE;
2777 loadState(source, &state, TRUE);
2778 if(source->origFlags & UCOL_USE_ITERATOR) {
2779 source->flags = source->origFlags;
2780 }
2781 }
2782 break;
2783 }
2784
2785 uint8_t maxCC = (uint8_t)(*(UCharOffset)&0xFF); /*get the discontiguos stuff */ /* skip the backward offset, see above */
2786 uint8_t allSame = (uint8_t)(*(UCharOffset++)>>8);
2787
2788 schar = getNextNormalizedChar(source);
2789 while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
2790 UCharOffset++;
2791 }
2792
2793 if (schar == tchar) {
2794 // Found the source string char in the contraction table.
2795 // Pick up the corresponding CE from the table.
2796 CE = *(coll->contractionCEs +
2797 (UCharOffset - coll->contractionIndex));
2798 }
2799 else
2800 {
2801 // Source string char was not in contraction table.
2802 // Unless we have a discontiguous contraction, we have finished
2803 // with this contraction.
2804 UChar32 miss = schar;
2805 if(U16_IS_LEAD(schar)) { // in order to do the proper detection, we
2806 // need to see if we're dealing with a supplementary
2807 miss = U16_GET_SUPPLEMENTARY(schar, getNextNormalizedChar(source));
2808 }
2809
2810 uint8_t sCC;
2811 if (miss < 0x300 ||
2812 maxCC == 0 ||
2813 (sCC = i_getCombiningClass(miss, coll)) == 0 ||
2814 sCC>maxCC ||
2815 (allSame != 0 && sCC == maxCC) ||
2816 collIter_eos(source)) {
2817 // Contraction can not be discontiguous.
2818 goBackOne(source); // back up the source string by one,
2819 // because the character we just looked at was
2820 // not part of the contraction. */
2821 if(U_IS_SUPPLEMENTARY(miss)) {
2822 goBackOne(source);
2823 }
2824 CE = *(coll->contractionCEs +
2825 (ContractionStart - coll->contractionIndex));
2826 } else {
2827 //
2828 // Contraction is possibly discontiguous.
2829 // Scan more of source string looking for a match
2830 //
2831 UChar tempchar;
2832 /* find the next character if schar is not a base character
2833 and we are not yet at the end of the string */
2834 tempchar = getNextNormalizedChar(source);
2835 // probably need another supplementary thingie here
2836 goBackOne(source);
2837 if (i_getCombiningClass(tempchar, coll) == 0) {
2838 goBackOne(source);
2839 if(U_IS_SUPPLEMENTARY(miss)) {
2840 goBackOne(source);
2841 }
2842 /* Spit out the last char of the string, wasn't tasty enough */
2843 CE = *(coll->contractionCEs +
2844 (ContractionStart - coll->contractionIndex));
2845 } else {
2846 CE = getDiscontiguous(coll, source, ContractionStart);
2847 }
2848 }
2849 } // else after if(schar == tchar)
2850
2851 if(CE == UCOL_NOT_FOUND) {
2852 /* The Source string did not match the contraction that we were checking. */
2853 /* Back up the source position to undo the effects of having partially */
2854 /* scanned through what ultimately proved to not be a contraction. */
2855 loadState(source, &state, TRUE);
2856 CE = firstCE;
2857 break;
2858 }
2859
2860 if(!isContraction(CE)) {
2861 // The source string char was in the contraction table, and the corresponding
2862 // CE is not a contraction CE. We completed the contraction, break
2863 // out of loop, this CE will end up being returned. This is the normal
2864 // way out of contraction handling when the source actually contained
2865 // the contraction.
2866 break;
2867 }
2868
2869
2870 // The source string char was in the contraction table, and the corresponding
2871 // CE is IS a contraction CE. We will continue looping to check the source
2872 // string for the remaining chars in the contraction.
2873 uint32_t tempCE = *(coll->contractionCEs + (ContractionStart - coll->contractionIndex));
2874 if(tempCE != UCOL_NOT_FOUND) {
2875 // We have scanned a a section of source string for which there is a
2876 // CE from the contraction table. Remember the CE and scan position, so
2877 // that we can return to this point if further scanning fails to
2878 // match a longer contraction sequence.
2879 firstCE = tempCE;
2880
2881 goBackOne(source);
2882 backupState(source, &state);
2883 getNextNormalizedChar(source);
2884
2885 // Another way to do this is:
2886 //collIterateState tempState;
2887 //backupState(source, &tempState);
2888 //goBackOne(source);
2889 //backupState(source, &state);
2890 //loadState(source, &tempState, TRUE);
2891
2892 // The problem is that for incomplete contractions we have to remember the previous
2893 // position. Before, the only thing I needed to do was state.pos--;
2894 // After iterator introduction and especially after introduction of normalizing
2895 // iterators, it became much more difficult to decrease the saved state.
2896 // I'm not yet sure which of the two methods above is faster.
2897 }
2898 } // for(;;)
2899 break;
2900 } // case CONTRACTION_TAG:
2901 case LONG_PRIMARY_TAG:
2902 {
2903 *(source->CEpos++) = ((CE & 0xFF)<<24)|UCOL_CONTINUATION_MARKER;
2904 CE = ((CE & 0xFFFF00) << 8) | (UCOL_BYTE_COMMON << 8) | UCOL_BYTE_COMMON;
2905 return CE;
2906 }
2907 case EXPANSION_TAG:
2908 {
2909 /* This should handle expansion. */
2910 /* NOTE: we can encounter both continuations and expansions in an expansion! */
2911 /* I have to decide where continuations are going to be dealt with */
2912 uint32_t size;
2913 uint32_t i; /* general counter */
2914 CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */
2915 size = getExpansionCount(CE);
2916 CE = *CEOffset++;
2917 if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */
2918 for(i = 1; i<size; i++) {
2919 *(source->CEpos++) = *CEOffset++;
2920 }
2921 } else { /* else, we do */
2922 while(*CEOffset != 0) {
2923 *(source->CEpos++) = *CEOffset++;
2924 }
2925 }
2926 return CE;
2927 }
2928 case DIGIT_TAG:
2929 {
2930 /*
2931 We do a check to see if we want to collate digits as numbers; if so we generate
2932 a custom collation key. Otherwise we pull out the value stored in the expansion table.
2933 */
2934 //uint32_t size;
2935 uint32_t i; /* general counter */
2936
2937 if (source->coll->numericCollation == UCOL_ON){
2938 collIterateState digitState = {0,0,0,0,0,0,0,0};
2939 UChar32 char32 = 0;
2940
2941 uint32_t digIndx = 0;
2942 uint32_t endIndex = 0;
2943 uint32_t trailingZeroIndex = 0;
2944
2945 uint32_t primWeight = 0;
2946
2947 int32_t digVal = 0;
2948 uint8_t collateVal = 0;
2949
2950 UBool nonZeroValReached = FALSE;
2951
2952 uint8_t *numTempBuf;
2953 uint8_t stackNumTempBuf[UCOL_MAX_BUFFER]; // I just need a temporary place to store my generated CEs.
2954 uint32_t numTempBufSize = UCOL_MAX_BUFFER;
2955
2956 numTempBuf = stackNumTempBuf;
2957 /*
2958 We parse the source string until we hit a char that's NOT a digit.
2959 Use this u_charDigitValue. This might be slow because we have to
2960 handle surrogates...
2961 */
2962 /*
2963 if (U16_IS_LEAD(ch)){
2964 if (!collIter_eos(source)) {
2965 backupState(source, &digitState);
2966 UChar trail = getNextNormalizedChar(source);
2967 if(U16_IS_TRAIL(trail)) {
2968 char32 = U16_GET_SUPPLEMENTARY(ch, trail);
2969 } else {
2970 loadState(source, &digitState, TRUE);
2971 char32 = ch;
2972 }
2973 } else {
2974 char32 = ch;
2975 }
2976 } else {
2977 char32 = ch;
2978 }
2979 digVal = u_charDigitValue(char32);
2980 */
2981 digVal = u_charDigitValue(cp); // if we have arrived here, we have
2982 // already processed possible supplementaries that trigered the digit tag -
2983 // all supplementaries are marked in the UCA.
2984 /*
2985 We pad a zero in front of the first element anyways. This takes
2986 care of the (probably) most common case where people are sorting things followed
2987 by a single digit
2988 */
2989 digIndx++;
2990 for(;;){
2991 // Make sure we have enough space.
2992 if (digIndx >= ((numTempBufSize - 2) * 2) + 1)
2993 {
2994 numTempBufSize *= 2;
2995 if (numTempBuf == stackNumTempBuf){
2996 numTempBuf = (uint8_t *)uprv_malloc(sizeof(uint8_t) * numTempBufSize);
2997 uprv_memcpy(numTempBuf, stackNumTempBuf, UCOL_MAX_BUFFER);
2998 } else {
2999 uprv_realloc(numTempBuf, numTempBufSize);
3000 }
3001 }
3002
3003 // Skipping over leading zeroes.
3004 if (digVal != 0) {
3005 nonZeroValReached = TRUE;
3006 }
3007 if (nonZeroValReached) {
3008 /*
3009 We parse the digit string into base 100 numbers (this fits into a byte).
3010 We only add to the buffer in twos, thus if we are parsing an odd character,
3011 that serves as the 'tens' digit while the if we are parsing an even one, that
3012 is the 'ones' digit. We dumped the parsed base 100 value (collateVal) into
3013 a buffer. We multiply each collateVal by 2 (to give us room) and add 5 (to avoid
3014 overlapping magic CE byte values). The last byte we subtract 1 to ensure it is less
3015 than all the other bytes.
3016 */
3017
3018 if (digIndx % 2 == 1){
3019 collateVal += (uint8_t)digVal;
3020
3021 // We don't enter the low-order-digit case unless we've already seen
3022 // the high order, or for the first digit, which is always non-zero.
3023 if (collateVal != 0)
3024 trailingZeroIndex = 0;
3025
3026 numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6;
3027 collateVal = 0;
3028 }
3029 else{
3030 // We drop the collation value into the buffer so if we need to do
3031 // a "front patch" we don't have to check to see if we're hitting the
3032 // last element.
3033 collateVal = (uint8_t)(digVal * 10);
3034
3035 // Check for trailing zeroes.
3036 if (collateVal == 0)
3037 {
3038 if (!trailingZeroIndex)
3039 trailingZeroIndex = (digIndx/2) + 2;
3040 }
3041 else
3042 trailingZeroIndex = 0;
3043
3044 numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6;
3045 }
3046 digIndx++;
3047 }
3048
3049 // Get next character.
3050 if (!collIter_eos(source)){
3051 ch = getNextNormalizedChar(source);
3052 if (U16_IS_LEAD(ch)){
3053 if (!collIter_eos(source)) {
3054 backupState(source, &digitState);
3055 UChar trail = getNextNormalizedChar(source);
3056 if(U16_IS_TRAIL(trail)) {
3057 char32 = U16_GET_SUPPLEMENTARY(ch, trail);
3058 } else {
3059 loadState(source, &digitState, TRUE);
3060 char32 = ch;
3061 }
3062 }
3063 } else {
3064 char32 = ch;
3065 }
3066
3067 if ((digVal = u_charDigitValue(char32)) == -1){
3068 // Resetting position to point to the next unprocessed char. We
3069 // overshot it when doing our test/set for numbers.
3070 if (char32 > 0xFFFF) { // For surrogates.
3071 loadState(source, &digitState, TRUE);
3072 //goBackOne(source);
3073 }
3074 goBackOne(source);
3075 break;
3076 }
3077 } else {
3078 break;
3079 }
3080 }
3081
3082 if (nonZeroValReached == FALSE){
3083 digIndx = 2;
3084 numTempBuf[2] = 6;
3085 }
3086
3087 endIndex = trailingZeroIndex ? trailingZeroIndex : ((digIndx/2) + 2) ;
3088 if (digIndx % 2 != 0){
3089 /*
3090 We missed a value. Since digIndx isn't even, stuck too many values into the buffer (this is what
3091 we get for padding the first byte with a zero). "Front-patch" now by pushing all nybbles forward.
3092 Doing it this way ensures that at least 50% of the time (statistically speaking) we'll only be doing a
3093 single pass and optimizes for strings with single digits. I'm just assuming that's the more common case.
3094 */
3095
3096 for(i = 2; i < endIndex; i++){
3097 numTempBuf[i] = (((((numTempBuf[i] - 6)/2) % 10) * 10) +
3098 (((numTempBuf[i+1])-6)/2) / 10) * 2 + 6;
3099 }
3100 --digIndx;
3101 }
3102
3103 // Subtract one off of the last byte.
3104 numTempBuf[endIndex-1] -= 1;
3105
3106 /*
3107 We want to skip over the first two slots in the buffer. The first slot
3108 is reserved for the header byte UCOL_CODAN_PLACEHOLDER. The second slot is for the
3109 sign/exponent byte: 0x80 + (decimalPos/2) & 7f.
3110 */
3111 numTempBuf[0] = UCOL_CODAN_PLACEHOLDER;
3112 numTempBuf[1] = (uint8_t)(0x80 + ((digIndx/2) & 0x7F));
3113
3114 // Now transfer the collation key to our collIterate struct.
3115 // The total size for our collation key is endIndx bumped up to the next largest even value divided by two.
3116 //size = ((endIndex+1) & ~1)/2;
3117 CE = (((numTempBuf[0] << 8) | numTempBuf[1]) << UCOL_PRIMARYORDERSHIFT) | //Primary weight
3118 (UCOL_BYTE_COMMON << UCOL_SECONDARYORDERSHIFT) | // Secondary weight
3119 UCOL_BYTE_COMMON; // Tertiary weight.
3120 i = 2; // Reset the index into the buffer.
3121 while(i < endIndex)
3122 {
3123 primWeight = numTempBuf[i++] << 8;
3124 if ( i < endIndex)
3125 primWeight |= numTempBuf[i++];
3126 *(source->CEpos++) = (primWeight << UCOL_PRIMARYORDERSHIFT) | UCOL_CONTINUATION_MARKER;
3127 }
3128
3129 if (numTempBuf != stackNumTempBuf)
3130 uprv_free(numTempBuf);
3131 } else {
3132 // no numeric mode, we'll just switch to whatever we stashed and continue
3133 CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */
3134 CE = *CEOffset++;
3135 break;
3136 }
3137 return CE;
3138 }
3139 /* various implicits optimization */
3140 // TODO: remove CJK_IMPLICIT_TAG completely - handled by the getImplicit
3141 case CJK_IMPLICIT_TAG: /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D*/
3142 //return getImplicit(cp, source, 0x04000000);
3143 return getImplicit(cp, source);
3144 case IMPLICIT_TAG: /* everything that is not defined otherwise */
3145 /* UCA is filled with these. Tailorings are NOT_FOUND */
3146 //return getImplicit(cp, source, 0);
3147 return getImplicit(cp, source);
3148 case TRAIL_SURROGATE_TAG: /* DC00-DFFF*/
3149 return 0; /* broken surrogate sequence */
3150 case LEAD_SURROGATE_TAG: /* D800-DBFF*/
3151 UChar nextChar;
3152 if( source->flags & UCOL_USE_ITERATOR) {
3153 if(U_IS_TRAIL(nextChar = (UChar)source->iterator->current(source->iterator))) {
3154 cp = U16_GET_SUPPLEMENTARY(ch, nextChar);
3155 source->iterator->next(source->iterator);
3156 return getImplicit(cp, source);
3157 } else {
3158 return 0;
3159 }
3160 } else if((((source->flags & UCOL_ITER_HASLEN) == 0 ) || (source->pos<source->endp)) &&
3161 U_IS_TRAIL((nextChar=*source->pos))) {
3162 cp = U16_GET_SUPPLEMENTARY(ch, nextChar);
3163 source->pos++;
3164 return getImplicit(cp, source);
3165 } else {
3166 return 0; /* completely ignorable */
3167 }
3168 case HANGUL_SYLLABLE_TAG: /* AC00-D7AF*/
3169 {
3170 const uint32_t
3171 SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7;
3172 //const uint32_t LCount = 19;
3173 const uint32_t VCount = 21;
3174 const uint32_t TCount = 28;
3175 //const uint32_t NCount = VCount * TCount; // 588
3176 //const uint32_t SCount = LCount * NCount; // 11172
3177 uint32_t L = ch - SBase;
3178
3179 // divide into pieces
3180
3181 uint32_t T = L % TCount; // we do it in this order since some compilers can do % and / in one operation
3182 L /= TCount;
3183 uint32_t V = L % VCount;
3184 L /= VCount;
3185
3186 // offset them
3187
3188 L += LBase;
3189 V += VBase;
3190 T += TBase;
3191
3192 // return the first CE, but first put the rest into the expansion buffer
3193 if (!source->coll->image->jamoSpecial) { // FAST PATH
3194
3195 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, V);
3196 if (T != TBase) {
3197 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, T);
3198 }
3199
3200 return UTRIE_GET32_FROM_LEAD(&coll->mapping, L);
3201
3202 } else { // Jamo is Special
3203 // Since Hanguls pass the FCD check, it is
3204 // guaranteed that we won't be in
3205 // the normalization buffer if something like this happens
3206 // However, if we are using a uchar iterator and normalization
3207 // is ON, the Hangul that lead us here is going to be in that
3208 // normalization buffer. Here we want to restore the uchar
3209 // iterator state and pull out of the normalization buffer
3210 if(source->iterator != NULL && source->flags & UCOL_ITER_INNORMBUF) {
3211 source->flags = source->origFlags; // restore the iterator
3212 source->pos = NULL;
3213 }
3214 // Move Jamos into normalization buffer
3215 source->writableBuffer[0] = (UChar)L;
3216 source->writableBuffer[1] = (UChar)V;
3217 if (T != TBase) {
3218 source->writableBuffer[2] = (UChar)T;
3219 source->writableBuffer[3] = 0;
3220 } else {
3221 source->writableBuffer[2] = 0;
3222 }
3223
3224 source->fcdPosition = source->pos; // Indicate where to continue in main input string
3225 // after exhausting the writableBuffer
3226 source->pos = source->writableBuffer;
3227 source->origFlags = source->flags;
3228 source->flags |= UCOL_ITER_INNORMBUF;
3229 source->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
3230
3231 return(UCOL_IGNORABLE);
3232 }
3233 }
3234 case CHARSET_TAG:
3235 /* not yet implemented */
3236 /* probably after 1.8 */
3237 return UCOL_NOT_FOUND;
3238 default:
3239 *status = U_INTERNAL_PROGRAM_ERROR;
3240 CE=0;
3241 break;
3242 }
3243 if (CE <= UCOL_NOT_FOUND) break;
3244 }
3245 return CE;
3246 }
3247
3248
3249 /* now uses Mark's getImplicitPrimary code */
3250 static
getPrevImplicit(UChar32 cp,collIterate * collationSource)3251 inline uint32_t getPrevImplicit(UChar32 cp, collIterate *collationSource) {
3252 if(isNonChar(cp)) {
3253 return 0;
3254 }
3255
3256 uint32_t r = uprv_uca_getImplicitPrimary(cp);
3257
3258 *(collationSource->CEpos++) = (r & UCOL_PRIMARYMASK) | 0x00000505;
3259 collationSource->toReturn = collationSource->CEpos;
3260 return ((r & 0x0000FFFF)<<16) | 0x000000C0;
3261 }
3262
/**
 * This function handles the special CEs like contractions, expansions,
 * surrogates, Thai.
 * It is called by getPrevCE.
 */
ucol_prv_getSpecialPrevCE(const UCollator * coll,UChar ch,uint32_t CE,collIterate * source,UErrorCode * status)3268 uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
3269 collIterate *source,
3270 UErrorCode *status)
3271 {
3272 const uint32_t *CEOffset = NULL;
3273 UChar *UCharOffset = NULL;
3274 UChar schar;
3275 const UChar *constart = NULL;
3276 uint32_t size;
3277 UChar buffer[UCOL_MAX_BUFFER];
3278 uint32_t *endCEBuffer;
3279 UChar *strbuffer;
3280 int32_t noChars = 0;
3281
3282 for(;;)
3283 {
3284 /* the only ces that loops are thai and contractions */
3285 switch (getCETag(CE))
3286 {
3287 case NOT_FOUND_TAG: /* this tag always returns */
3288 return CE;
3289 case SURROGATE_TAG: /* This is a surrogate pair */
3290 /* essentialy an engaged lead surrogate. */
3291 /* if you have encountered it here, it means that a */
3292 /* broken sequence was encountered and this is an error */
3293 return 0;
3294 case SPEC_PROC_TAG:
3295 {
3296 // Special processing is getting a CE that is preceded by a certain prefix
3297 // Currently this is only needed for optimizing Japanese length and iteration marks.
3298 // When we encouter a special processing tag, we go backwards and try to see if
3299 // we have a match.
3300 // Contraction tables are used - so the whole process is not unlike contraction.
3301 // prefix data is stored backwards in the table.
3302 const UChar *UCharOffset;
3303 UChar schar, tchar;
3304 collIterateState prefixState;
3305 backupState(source, &prefixState);
3306 for(;;) {
3307 // This loop will run once per source string character, for as long as we
3308 // are matching a potential contraction sequence
3309
3310 // First we position ourselves at the begining of contraction sequence
3311 const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE);
3312
3313 if (collIter_bos(source)) {
3314 CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
3315 break;
3316 }
3317 schar = getPrevNormalizedChar(source, status);
3318 goBackOne(source);
3319
3320 while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
3321 UCharOffset++;
3322 }
3323
3324 if (schar == tchar) {
3325 // Found the source string char in the table.
3326 // Pick up the corresponding CE from the table.
3327 CE = *(coll->contractionCEs +
3328 (UCharOffset - coll->contractionIndex));
3329 }
3330 else
3331 {
3332 // if there is a completely ignorable code point in the middle of
3333 // a prefix, we need to act as if it's not there
3334 // assumption: 'real' noncharacters (*fffe, *ffff, fdd0-fdef are set to zero)
3335 // lone surrogates cannot be set to zero as it would break other processing
3336 uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, schar);
3337 // it's easy for BMP code points
3338 if(isZeroCE == 0) {
3339 continue;
3340 } else if(U16_IS_TRAIL(schar) || U16_IS_LEAD(schar)) {
3341 // for supplementary code points, we have to check the next one
3342 // situations where we are going to ignore
3343 // 1. beginning of the string: schar is a lone surrogate
3344 // 2. schar is a lone surrogate
3345 // 3. schar is a trail surrogate in a valid surrogate sequence
3346 // that is explicitly set to zero.
3347 if (!collIter_bos(source)) {
3348 UChar lead;
3349 if(U16_IS_LEAD(lead = getPrevNormalizedChar(source, status))) {
3350 isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, lead);
3351 if(getCETag(isZeroCE) == SURROGATE_TAG) {
3352 uint32_t finalCE = UTRIE_GET32_FROM_OFFSET_TRAIL(&coll->mapping, isZeroCE&0xFFFFFF, schar);
3353 if(finalCE == 0) {
3354 // this is a real, assigned completely ignorable code point
3355 goBackOne(source);
3356 continue;
3357 }
3358 }
3359 } else {
3360 // lone surrogate, completely ignorable
3361 continue;
3362 }
3363 } else {
3364 // lone surrogate at the beggining, completely ignorable
3365 continue;
3366 }
3367 }
3368 // Source string char was not in the table.
3369 // We have not found the prefix.
3370 CE = *(coll->contractionCEs +
3371 (ContractionStart - coll->contractionIndex));
3372 }
3373
3374 if(!isPrefix(CE)) {
3375 // The source string char was in the contraction table, and the corresponding
3376 // CE is not a prefix CE. We found the prefix, break
3377 // out of loop, this CE will end up being returned. This is the normal
3378 // way out of prefix handling when the source actually contained
3379 // the prefix.
3380 break;
3381 }
3382 }
3383 loadState(source, &prefixState, TRUE);
3384 break;
3385 }
3386
3387 case CONTRACTION_TAG:
3388 /* to ensure that the backwards and forwards iteration matches, we
3389 take the current region of most possible match and pass it through
3390 the forward iteration. this will ensure that the obstinate problem of
3391 overlapping contractions will not occur.
3392 */
3393 schar = peekCharacter(source, 0);
3394 constart = (UChar *)coll->image + getContractOffset(CE);
3395 if (isAtStartPrevIterate(source)
3396 /* commented away contraction end checks after adding the checks
3397 in getPrevCE */) {
3398 /* start of string or this is not the end of any contraction */
3399 CE = *(coll->contractionCEs +
3400 (constart - coll->contractionIndex));
3401 break;
3402 }
3403 strbuffer = buffer;
3404 UCharOffset = strbuffer + (UCOL_MAX_BUFFER - 1);
3405 *(UCharOffset --) = 0;
3406 noChars = 0;
3407 // have to swap thai characters
3408 while (ucol_unsafeCP(schar, coll)) {
3409 *(UCharOffset) = schar;
3410 noChars++;
3411 UCharOffset --;
3412 schar = getPrevNormalizedChar(source, status);
3413 goBackOne(source);
3414 // TODO: when we exhaust the contraction buffer,
3415 // it needs to get reallocated. The problem is
3416 // that the size depends on the string which is
3417 // not iterated over. However, since we're travelling
3418 // backwards, we already had to set the iterator at
3419 // the end - so we might as well know where we are?
3420 if (UCharOffset + 1 == buffer) {
3421 /* we have exhausted the buffer */
3422 int32_t newsize = 0;
3423 if(source->pos) { // actually dealing with a position
3424 newsize = source->pos - source->string + 1;
3425 } else { // iterator
3426 newsize = 4 * UCOL_MAX_BUFFER;
3427 }
3428 strbuffer = (UChar *)uprv_malloc(sizeof(UChar) *
3429 (newsize + UCOL_MAX_BUFFER));
3430 /* test for NULL */
3431 if (strbuffer == NULL) {
3432 *status = U_MEMORY_ALLOCATION_ERROR;
3433 return UCOL_NO_MORE_CES;
3434 }
3435 UCharOffset = strbuffer + newsize;
3436 uprv_memcpy(UCharOffset, buffer,
3437 UCOL_MAX_BUFFER * sizeof(UChar));
3438 UCharOffset --;
3439 }
3440 if ((source->pos && (source->pos == source->string ||
3441 ((source->flags & UCOL_ITER_INNORMBUF) &&
3442 *(source->pos - 1) == 0 && source->fcdPosition == NULL)))
3443 || (source->iterator && !source->iterator->hasPrevious(source->iterator))) {
3444 break;
3445 }
3446 }
3447 /* adds the initial base character to the string */
3448 *(UCharOffset) = schar;
3449 noChars++;
3450
3451 /* a new collIterate is used to simplify things, since using the current
3452 collIterate will mean that the forward and backwards iteration will
3453 share and change the same buffers. we don't want to get into that. */
3454 collIterate temp;
3455 //IInit_collIterate(coll, UCharOffset, -1, &temp);
3456 IInit_collIterate(coll, UCharOffset, noChars, &temp);
3457 temp.flags &= ~UCOL_ITER_NORM;
3458
3459 CE = ucol_IGetNextCE(coll, &temp, status);
3460 endCEBuffer = source->CEs + UCOL_EXPAND_CE_BUFFER_SIZE;
3461 while (CE != UCOL_NO_MORE_CES) {
3462 *(source->CEpos ++) = CE;
3463 if (source->CEpos == endCEBuffer) {
3464 /* ran out of CE space, bail.
3465 there's no guarantee of the right character position after
3466 this bail*/
3467 *status = U_BUFFER_OVERFLOW_ERROR;
3468 source->CEpos = source->CEs;
3469 freeHeapWritableBuffer(&temp);
3470 if (strbuffer != buffer) {
3471 uprv_free(strbuffer);
3472 }
3473 return (uint32_t)UCOL_NULLORDER;
3474 }
3475 CE = ucol_IGetNextCE(coll, &temp, status);
3476 }
3477 freeHeapWritableBuffer(&temp);
3478 if (strbuffer != buffer) {
3479 uprv_free(strbuffer);
3480 }
3481 source->toReturn = source->CEpos - 1;
3482 if (source->toReturn == source->CEs) {
3483 source->CEpos = source->CEs;
3484 }
3485 return *(source->toReturn);
3486 case LONG_PRIMARY_TAG:
3487 {
3488 *(source->CEpos++) = ((CE & 0xFFFF00) << 8) | (UCOL_BYTE_COMMON << 8) | UCOL_BYTE_COMMON;
3489 *(source->CEpos++) = ((CE & 0xFF)<<24)|UCOL_CONTINUATION_MARKER;
3490 source->toReturn = source->CEpos - 1;
3491 return *(source->toReturn);
3492 }
3493 case EXPANSION_TAG: /* this tag always returns */
3494 /*
3495 This should handle expansion.
3496 NOTE: we can encounter both continuations and expansions in an expansion!
3497 I have to decide where continuations are going to be dealt with
3498 */
3499 /* find the offset to expansion table */
3500 CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE);
3501 size = getExpansionCount(CE);
3502 if (size != 0) {
3503 /*
3504 if there are less than 16 elements in expansion, we don't terminate
3505 */
3506 uint32_t count;
3507 for (count = 0; count < size; count++) {
3508 *(source->CEpos ++) = *CEOffset++;
3509 }
3510 }
3511 else {
3512 /* else, we do */
3513 while (*CEOffset != 0) {
3514 *(source->CEpos ++) = *CEOffset ++;
3515 }
3516 }
3517 source->toReturn = source->CEpos - 1;
3518 // in case of one element expansion, we
3519 // want to immediately return CEpos
3520 if(source->toReturn == source->CEs) {
3521 source->CEpos = source->CEs;
3522 }
3523 return *(source->toReturn);
3524 case DIGIT_TAG:
3525 {
3526 /*
3527 We do a check to see if we want to collate digits as numbers; if so we generate
3528 a custom collation key. Otherwise we pull out the value stored in the expansion table.
3529 */
3530 //uint32_t size;
3531 uint32_t i; /* general counter */
3532
3533 if (source->coll->numericCollation == UCOL_ON){
3534 collIterateState state = {0,0,0,0,0,0,0,0};
3535 UChar32 char32 = 0;
3536
3537 uint32_t digIndx = 0;
3538 uint32_t endIndex = 0;
3539 uint32_t leadingZeroIndex = 0;
3540 uint32_t trailingZeroCount = 0;
3541
3542 uint32_t primWeight = 0;
3543
3544 int32_t digVal = 0;
3545 uint8_t collateVal = 0;
3546
3547 UBool nonZeroValReached = FALSE;
3548
3549 uint8_t *numTempBuf;
3550 uint8_t stackNumTempBuf[UCOL_MAX_BUFFER]; // I just need a temporary place to store my generated CEs.
3551 uint32_t numTempBufSize = UCOL_MAX_BUFFER;
3552
3553 numTempBuf = stackNumTempBuf;
3554 /*
3555 We parse the source string until we hit a char that's NOT a digit.
3556 Use this u_charDigitValue. This might be slow because we have to
3557 handle surrogates...
3558 */
3559
3560 if (U16_IS_TRAIL (ch)){
3561 if (!collIter_bos(source)){
3562 UChar lead = getPrevNormalizedChar(source, status);
3563 if(U16_IS_LEAD(lead)) {
3564 char32 = U16_GET_SUPPLEMENTARY(lead,ch);
3565 goBackOne(source);
3566 } else {
3567 char32 = ch;
3568 }
3569 } else {
3570 char32 = ch;
3571 }
3572 } else {
3573 char32 = ch;
3574 }
3575 digVal = u_charDigitValue(char32);
3576
3577 for(;;){
3578 // Make sure we have enough space.
3579 if (digIndx >= ((numTempBufSize - 2) * 2) + 1)
3580 {
3581 numTempBufSize *= 2;
3582 if (numTempBuf == stackNumTempBuf){
3583 numTempBuf = (uint8_t *)uprv_malloc(sizeof(uint8_t) * numTempBufSize);
3584 uprv_memcpy(numTempBuf, stackNumTempBuf, UCOL_MAX_BUFFER);
3585 }else
3586 uprv_realloc(numTempBuf, numTempBufSize);
3587 }
3588
3589 // Skip over trailing zeroes, and keep a count of them.
3590 if (digVal != 0)
3591 nonZeroValReached = TRUE;
3592 if (nonZeroValReached){
3593 /*
3594 We parse the digit string into base 100 numbers (this fits into a byte).
3595 We only add to the buffer in twos, thus if we are parsing an odd character,
3596 that serves as the 'tens' digit while the if we are parsing an even one, that
3597 is the 'ones' digit. We dumped the parsed base 100 value (collateVal) into
3598 a buffer. We multiply each collateVal by 2 (to give us room) and add 5 (to avoid
3599 overlapping magic CE byte values). The last byte we subtract 1 to ensure it is less
3600 than all the other bytes.
3601
3602 Since we're doing in this reverse we want to put the first digit encountered into the
3603 ones place and the second digit encountered into the tens place.
3604 */
3605
3606 if ((digIndx + trailingZeroCount) % 2 == 1){
3607 // High-order digit case (tens place)
3608 collateVal += (uint8_t)(digVal * 10);
3609
3610 // We cannot set leadingZeroIndex unless it has been set for the
3611 // low-order digit. Therefore, all we can do for the high-order
3612 // digit is turn it off, never on.
3613 // The only time we will have a high digit without a low is for
3614 // the very first non-zero digit, so no zero check is necessary.
3615 if (collateVal != 0)
3616 leadingZeroIndex = 0;
3617
3618 numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6;
3619 collateVal = 0;
3620 }
3621 else{
3622 // Low-order digit case (ones place)
3623 collateVal = (uint8_t)digVal;
3624
3625 // Check for leading zeroes.
3626 if (collateVal == 0)
3627 {
3628 if (!leadingZeroIndex)
3629 leadingZeroIndex = (digIndx/2) + 2;
3630 }
3631 else
3632 leadingZeroIndex = 0;
3633
3634 // No need to write to buffer; the case of a last odd digit
3635 // is handled below.
3636 }
3637 ++digIndx;
3638 }
3639 else
3640 ++trailingZeroCount;
3641
3642 if (!collIter_bos(source)){
3643 ch = getPrevNormalizedChar(source, status);
3644 //goBackOne(source);
3645 if (U16_IS_TRAIL(ch)){
3646 backupState(source, &state);
3647 if (!collIter_bos(source))
3648 {
3649 goBackOne(source);
3650 UChar lead = getPrevNormalizedChar(source, status);
3651 if(U16_IS_LEAD(lead)) {
3652 char32 = U16_GET_SUPPLEMENTARY(lead,ch);
3653 } else {
3654 loadState(source, &state, FALSE);
3655 char32 = ch;
3656 }
3657 }
3658 }
3659 else
3660 char32 = ch;
3661
3662 if ((digVal = u_charDigitValue(char32)) == -1){
3663 if (char32 > 0xFFFF) {// For surrogates.
3664 loadState(source, &state, FALSE);
3665 }
3666 // Don't need to "reverse" the goBackOne call,
3667 // as this points to the next position to process..
3668 //if (char32 > 0xFFFF) // For surrogates.
3669 //getNextNormalizedChar(source);
3670 break;
3671 }
3672 goBackOne(source);
3673 }else
3674 break;
3675 }
3676
3677 if (nonZeroValReached == FALSE){
3678 digIndx = 2;
3679 trailingZeroCount = 0;
3680 numTempBuf[2] = 6;
3681 }
3682
3683 if ((digIndx + trailingZeroCount) % 2 != 0){
3684 numTempBuf[((digIndx)/2) + 2] = collateVal*2 + 6;
3685 digIndx += 1; // The implicit leading zero
3686 }
3687 if (trailingZeroCount % 2 != 0){
3688 // We had to consume one trailing zero for the low digit
3689 // of the least significant byte
3690 digIndx += 1; // The trailing zero not in the exponent
3691 trailingZeroCount -= 1;
3692 }
3693
3694 endIndex = leadingZeroIndex ? leadingZeroIndex : ((digIndx/2) + 2) ;
3695
3696 // Subtract one off of the last byte. Really the first byte here, but it's reversed...
3697 numTempBuf[2] -= 1;
3698
3699 /*
3700 We want to skip over the first two slots in the buffer. The first slot
3701 is reserved for the header byte UCOL_CODAN_PLACEHOLDER. The second slot is for the
3702 sign/exponent byte: 0x80 + (decimalPos/2) & 7f.
3703 The exponent must be adjusted by the number of leading zeroes, and the number of
3704 trailing zeroes.
3705 */
3706 numTempBuf[0] = UCOL_CODAN_PLACEHOLDER;
3707 uint32_t exponent = (digIndx+trailingZeroCount)/2;
3708 if (leadingZeroIndex)
3709 exponent -= ((digIndx/2) + 2 - leadingZeroIndex);
3710 numTempBuf[1] = (uint8_t)(0x80 + (exponent & 0x7F));
3711
3712 // Now transfer the collation key to our collIterate struct.
3713 // The total size for our collation key is endIndx bumped up to the next largest even value divided by two.
3714 //size = ((endIndex+1) & ~1)/2;
3715 *(source->CEpos++) = (((numTempBuf[0] << 8) | numTempBuf[1]) << UCOL_PRIMARYORDERSHIFT) | //Primary weight
3716 (UCOL_BYTE_COMMON << UCOL_SECONDARYORDERSHIFT) | // Secondary weight
3717 UCOL_BYTE_COMMON; // Tertiary weight.
3718 i = endIndex - 1; // Reset the index into the buffer.
3719 while(i >= 2)
3720 {
3721 primWeight = numTempBuf[i--] << 8;
3722 if ( i >= 2)
3723 primWeight |= numTempBuf[i--];
3724 *(source->CEpos++) = (primWeight << UCOL_PRIMARYORDERSHIFT) | UCOL_CONTINUATION_MARKER;
3725 }
3726 if (numTempBuf != stackNumTempBuf)
3727 uprv_free(numTempBuf);
3728
3729 source->toReturn = source->CEpos -1;
3730 return *(source->toReturn);
3731 }
3732 else {
3733 CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE);
3734 CE = *(CEOffset++);
3735 break;
3736 }
3737 }
3738 case HANGUL_SYLLABLE_TAG: /* AC00-D7AF*/
3739 {
3740 const uint32_t
3741 SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7;
3742 //const uint32_t LCount = 19;
3743 const uint32_t VCount = 21;
3744 const uint32_t TCount = 28;
3745 //const uint32_t NCount = VCount * TCount; /* 588 */
3746 //const uint32_t SCount = LCount * NCount; /* 11172 */
3747
3748 uint32_t L = ch - SBase;
3749 /*
3750 divide into pieces.
3751 we do it in this order since some compilers can do % and / in one
3752 operation
3753 */
3754 uint32_t T = L % TCount;
3755 L /= TCount;
3756 uint32_t V = L % VCount;
3757 L /= VCount;
3758
3759 /* offset them */
3760 L += LBase;
3761 V += VBase;
3762 T += TBase;
3763
3764 /*
3765 return the first CE, but first put the rest into the expansion buffer
3766 */
3767 if (!source->coll->image->jamoSpecial)
3768 {
3769 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, L);
3770 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, V);
3771 if (T != TBase)
3772 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, T);
3773
3774 source->toReturn = source->CEpos - 1;
3775 return *(source->toReturn);
3776 } else {
3777 // Since Hanguls pass the FCD check, it is
3778 // guaranteed that we won't be in
3779 // the normalization buffer if something like this happens
3780 // Move Jamos into normalization buffer
3781 /*
3782 Move the Jamos into the
3783 normalization buffer
3784 */
3785 UChar *tempbuffer = source->writableBuffer +
3786 (source->writableBufSize - 1);
3787 *(tempbuffer) = 0;
3788 if (T != TBase) {
3789 *(tempbuffer - 1) = (UChar)T;
3790 *(tempbuffer - 2) = (UChar)V;
3791 *(tempbuffer - 3) = (UChar)L;
3792 *(tempbuffer - 4) = 0;
3793 } else {
3794 *(tempbuffer - 1) = (UChar)V;
3795 *(tempbuffer - 2) = (UChar)L;
3796 *(tempbuffer - 3) = 0;
3797 }
3798
3799 /*
3800 Indicate where to continue in main input string after exhausting
3801 the writableBuffer
3802 */
3803 if (source->pos == source->string) {
3804 source->fcdPosition = NULL;
3805 } else {
3806 source->fcdPosition = source->pos-1;
3807 }
3808
3809 source->pos = tempbuffer;
3810 source->origFlags = source->flags;
3811 source->flags |= UCOL_ITER_INNORMBUF;
3812 source->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
3813
3814 return(UCOL_IGNORABLE);
3815 }
3816 }
3817 case LEAD_SURROGATE_TAG: /* D800-DBFF*/
3818 return 0; /* broken surrogate sequence */
3819 case TRAIL_SURROGATE_TAG: /* DC00-DFFF*/
3820 {
3821 UChar32 cp = 0;
3822 UChar prevChar;
3823 UChar *prev;
3824 if (isAtStartPrevIterate(source)) {
3825 /* we are at the start of the string, wrong place to be at */
3826 return 0;
3827 }
3828 if (source->pos != source->writableBuffer) {
3829 prev = source->pos - 1;
3830 } else {
3831 prev = source->fcdPosition;
3832 }
3833 prevChar = *prev;
3834
3835 /* Handles Han and Supplementary characters here.*/
3836 if (U16_IS_LEAD(prevChar)) {
3837 cp = ((((uint32_t)prevChar)<<10UL)+(ch)-(((uint32_t)0xd800<<10UL)+0xdc00-0x10000));
3838 source->pos = prev;
3839 } else {
3840 return 0; /* completely ignorable */
3841 }
3842 return getPrevImplicit(cp, source);
3843 }
3844 // TODO: Remove CJK implicits as they are handled by the getImplicitPrimary function
3845 case CJK_IMPLICIT_TAG: /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D*/
3846 return getPrevImplicit(ch, source);
3847 case IMPLICIT_TAG: /* everything that is not defined otherwise */
3848 return getPrevImplicit(ch, source);
3849 /* UCA is filled with these. Tailorings are NOT_FOUND */
3850 /* not yet implemented */
3851 case CHARSET_TAG: /* this tag always returns */
3852 /* probably after 1.8 */
3853 return UCOL_NOT_FOUND;
3854 default: /* this tag always returns */
3855 *status = U_INTERNAL_PROGRAM_ERROR;
3856 CE=0;
3857 break;
3858 }
3859 if (CE <= UCOL_NOT_FOUND) {
3860 break;
3861 }
3862 }
3863 return CE;
3864 }
3865
3866 /* This should really be a macro */
3867 /* However, it is used only when stack buffers are not sufficiently big, and then we're messed up performance wise */
3868 /* anyway */
3869 static
reallocateBuffer(uint8_t ** secondaries,uint8_t * secStart,uint8_t * second,uint32_t * secSize,uint32_t newSize,UErrorCode * status)3870 uint8_t *reallocateBuffer(uint8_t **secondaries, uint8_t *secStart, uint8_t *second, uint32_t *secSize, uint32_t newSize, UErrorCode *status) {
3871 #ifdef UCOL_DEBUG
3872 fprintf(stderr, ".");
3873 #endif
3874 uint8_t *newStart = NULL;
3875 uint32_t offset = *secondaries-secStart;
3876
3877 if(secStart==second) {
3878 newStart=(uint8_t*)uprv_malloc(newSize);
3879 if(newStart==NULL) {
3880 *status = U_MEMORY_ALLOCATION_ERROR;
3881 return NULL;
3882 }
3883 uprv_memcpy(newStart, secStart, *secondaries-secStart);
3884 } else {
3885 newStart=(uint8_t*)uprv_realloc(secStart, newSize);
3886 if(newStart==NULL) {
3887 *status = U_MEMORY_ALLOCATION_ERROR;
3888 return NULL;
3889 }
3890 }
3891 *secondaries=newStart+offset;
3892 *secSize=newSize;
3893 return newStart;
3894 }
3895
3896
/*
 * Reverses, in place, the elements lying between start and end (both
 * inclusive). This operation is needed when doing continuation secondaries
 * in French.
 *
 * TYPE   element type the two pointers point to
 * start  pointer to the first element of the range
 * end    pointer to the last element of the range
 *
 * This is a macro rather than a function, so start and end must be modifiable
 * pointer lvalues: the macro advances start and moves end backwards while it
 * swaps, and both are left modified when it finishes. Each argument is
 * evaluated more than once; do not pass expressions with side effects.
 *
 * The body is wrapped in do { } while(0) so that an invocation followed by a
 * semicolon forms exactly one statement and can be used safely as the
 * unbraced body of an if/else.
 */
#define uprv_ucol_reverse_buffer(TYPE, start, end) do { \
    TYPE tempA; \
    while((start)<(end)) { \
        tempA = *(start); \
        *(start)++ = *(end); \
        *(end)-- = tempA; \
    } \
} while(0)
3919
3920 /****************************************************************************/
3921 /* Following are the sortkey generation functions */
3922 /* */
3923 /****************************************************************************/
3924
3925 /**
3926 * Merge two sort keys.
3927 * This is useful, for example, to combine sort keys from first and last names
3928 * to sort such pairs.
3929 * Merged sort keys consider on each collation level the first part first entirely,
3930 * then the second one.
3931 * It is possible to merge multiple sort keys by consecutively merging
3932 * another one with the intermediate result.
3933 *
3934 * The length of the merge result is the sum of the lengths of the input sort keys
3935 * minus 1.
3936 *
3937 * @param src1 the first sort key
3938 * @param src1Length the length of the first sort key, including the zero byte at the end;
3939 * can be -1 if the function is to find the length
3940 * @param src2 the second sort key
3941 * @param src2Length the length of the second sort key, including the zero byte at the end;
3942 * can be -1 if the function is to find the length
3943 * @param dest the buffer where the merged sort key is written,
3944 * can be NULL if destCapacity==0
3945 * @param destCapacity the number of bytes in the dest buffer
3946 * @return the length of the merged sort key, src1Length+src2Length-1;
3947 * can be larger than destCapacity, or 0 if an error occurs (only for illegal arguments),
3948 * in which cases the contents of dest is undefined
3949 *
3950 * @draft
3951 */
3952 U_CAPI int32_t U_EXPORT2
ucol_mergeSortkeys(const uint8_t * src1,int32_t src1Length,const uint8_t * src2,int32_t src2Length,uint8_t * dest,int32_t destCapacity)3953 ucol_mergeSortkeys(const uint8_t *src1, int32_t src1Length,
3954 const uint8_t *src2, int32_t src2Length,
3955 uint8_t *dest, int32_t destCapacity) {
3956 int32_t destLength;
3957 uint8_t b;
3958
3959 /* check arguments */
3960 if( src1==NULL || src1Length<-2 || src1Length==0 || (src1Length>0 && src1[src1Length-1]!=0) ||
3961 src2==NULL || src2Length<-2 || src2Length==0 || (src2Length>0 && src2[src2Length-1]!=0) ||
3962 destCapacity<0 || (destCapacity>0 && dest==NULL)
3963 ) {
3964 /* error, attempt to write a zero byte and return 0 */
3965 if(dest!=NULL && destCapacity>0) {
3966 *dest=0;
3967 }
3968 return 0;
3969 }
3970
3971 /* check lengths and capacity */
3972 if(src1Length<0) {
3973 src1Length=(int32_t)uprv_strlen((const char *)src1)+1;
3974 }
3975 if(src2Length<0) {
3976 src2Length=(int32_t)uprv_strlen((const char *)src2)+1;
3977 }
3978
3979 destLength=src1Length+src2Length-1;
3980 if(destLength>destCapacity) {
3981 /* the merged sort key does not fit into the destination */
3982 return destLength;
3983 }
3984
3985 /* merge the sort keys with the same number of levels */
3986 while(*src1!=0 && *src2!=0) { /* while both have another level */
3987 /* copy level from src1 not including 00 or 01 */
3988 while((b=*src1)>=2) {
3989 ++src1;
3990 *dest++=b;
3991 }
3992
3993 /* add a 02 merge separator */
3994 *dest++=2;
3995
3996 /* copy level from src2 not including 00 or 01 */
3997 while((b=*src2)>=2) {
3998 ++src2;
3999 *dest++=b;
4000 }
4001
4002 /* if both sort keys have another level, then add a 01 level separator and continue */
4003 if(*src1==1 && *src2==1) {
4004 ++src1;
4005 ++src2;
4006 *dest++=1;
4007 }
4008 }
4009
4010 /*
4011 * here, at least one sort key is finished now, but the other one
4012 * might have some contents left from containing more levels;
4013 * that contents is just appended to the result
4014 */
4015 if(*src1!=0) {
4016 /* src1 is not finished, therefore *src2==0, and src1 is appended */
4017 src2=src1;
4018 }
4019 /* append src2, "the other, unfinished sort key" */
4020 uprv_strcpy((char *)dest, (const char *)src2);
4021
4022 /* trust that neither sort key contained illegally embedded zero bytes */
4023 return destLength;
4024 }
4025
4026 /* sortkey API */
4027 U_CAPI int32_t U_EXPORT2
ucol_getSortKey(const UCollator * coll,const UChar * source,int32_t sourceLength,uint8_t * result,int32_t resultLength)4028 ucol_getSortKey(const UCollator *coll,
4029 const UChar *source,
4030 int32_t sourceLength,
4031 uint8_t *result,
4032 int32_t resultLength)
4033 {
4034 UTRACE_ENTRY(UTRACE_UCOL_GET_SORTKEY);
4035 if (UTRACE_LEVEL(UTRACE_VERBOSE)) {
4036 UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source string = %vh ", coll, source,
4037 ((sourceLength==-1 && source!=NULL) ? u_strlen(source) : sourceLength));
4038 }
4039
4040 UErrorCode status = U_ZERO_ERROR;
4041 int32_t keySize = 0;
4042
4043 if(source != NULL) {
4044 // source == NULL is actually an error situation, but we would need to
4045 // have an error code to return it. Until we introduce a new
4046 // API, it stays like this
4047
4048 /* this uses the function pointer that is set in updateinternalstate */
4049 /* currently, there are two funcs: */
4050 /*ucol_calcSortKey(...);*/
4051 /*ucol_calcSortKeySimpleTertiary(...);*/
4052
4053 keySize = coll->sortKeyGen(coll, source, sourceLength, &result, resultLength, FALSE, &status);
4054 //((UCollator *)coll)->errorCode = status; /*semantically const */
4055 }
4056 UTRACE_DATA2(UTRACE_VERBOSE, "Sort Key = %vb", result, keySize);
4057 UTRACE_EXIT_STATUS(status);
4058 return keySize;
4059 }
4060
4061 /* this function is called by the C++ API for sortkey generation */
4062 U_CFUNC int32_t
ucol_getSortKeyWithAllocation(const UCollator * coll,const UChar * source,int32_t sourceLength,uint8_t ** pResult,UErrorCode * pErrorCode)4063 ucol_getSortKeyWithAllocation(const UCollator *coll,
4064 const UChar *source, int32_t sourceLength,
4065 uint8_t **pResult,
4066 UErrorCode *pErrorCode) {
4067 *pResult = 0;
4068 return coll->sortKeyGen(coll, source, sourceLength, pResult, 0, TRUE, pErrorCode);
4069 }
4070
4071 #define UCOL_FSEC_BUF_SIZE 256
4072
4073 /* This function tries to get the size of a sortkey. It will be invoked if the size of resulting buffer is 0 */
4074 /* or if we run out of space while making a sortkey and want to return ASAP */
ucol_getSortKeySize(const UCollator * coll,collIterate * s,int32_t currentSize,UColAttributeValue strength,int32_t len)4075 int32_t ucol_getSortKeySize(const UCollator *coll, collIterate *s, int32_t currentSize, UColAttributeValue strength, int32_t len) {
4076 UErrorCode status = U_ZERO_ERROR;
4077 //const UCAConstants *UCAconsts = (UCAConstants *)((uint8_t *)coll->UCA->image + coll->image->UCAConsts);
4078 uint8_t compareSec = (uint8_t)((strength >= UCOL_SECONDARY)?0:0xFF);
4079 uint8_t compareTer = (uint8_t)((strength >= UCOL_TERTIARY)?0:0xFF);
4080 uint8_t compareQuad = (uint8_t)((strength >= UCOL_QUATERNARY)?0:0xFF);
4081 UBool compareIdent = (strength == UCOL_IDENTICAL);
4082 UBool doCase = (coll->caseLevel == UCOL_ON);
4083 UBool shifted = (coll->alternateHandling == UCOL_SHIFTED);
4084 //UBool qShifted = shifted && (compareQuad == 0);
4085 UBool doHiragana = (coll->hiraganaQ == UCOL_ON) && (compareQuad == 0);
4086 UBool isFrenchSec = (coll->frenchCollation == UCOL_ON) && (compareSec == 0);
4087 uint8_t fSecsBuff[UCOL_FSEC_BUF_SIZE];
4088 uint8_t *fSecs = fSecsBuff;
4089 uint32_t fSecsLen = 0, fSecsMaxLen = UCOL_FSEC_BUF_SIZE;
4090 uint8_t *frenchStartPtr = NULL, *frenchEndPtr = NULL;
4091
4092 uint32_t variableTopValue = coll->variableTopValue;
4093 uint8_t UCOL_COMMON_BOT4 = (uint8_t)((coll->variableTopValue>>8)+1);
4094 if(doHiragana) {
4095 UCOL_COMMON_BOT4++;
4096 /* allocate one more space for hiragana */
4097 }
4098 uint8_t UCOL_BOT_COUNT4 = (uint8_t)(0xFF - UCOL_COMMON_BOT4);
4099
4100 uint32_t order = UCOL_NO_MORE_CES;
4101 uint8_t primary1 = 0;
4102 uint8_t primary2 = 0;
4103 uint8_t secondary = 0;
4104 uint8_t tertiary = 0;
4105 int32_t caseShift = 0;
4106 uint32_t c2 = 0, c3 = 0, c4 = 0; /* variables for compression */
4107
4108 uint8_t caseSwitch = coll->caseSwitch;
4109 uint8_t tertiaryMask = coll->tertiaryMask;
4110 uint8_t tertiaryCommon = coll->tertiaryCommon;
4111
4112 UBool wasShifted = FALSE;
4113 UBool notIsContinuation = FALSE;
4114 uint8_t leadPrimary = 0;
4115
4116
4117 for(;;) {
4118 order = ucol_IGetNextCE(coll, s, &status);
4119 if(order == UCOL_NO_MORE_CES) {
4120 break;
4121 }
4122
4123 if(order == 0) {
4124 continue;
4125 }
4126
4127 notIsContinuation = !isContinuation(order);
4128
4129
4130 if(notIsContinuation) {
4131 tertiary = (uint8_t)((order & UCOL_BYTE_SIZE_MASK));
4132 } else {
4133 tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION));
4134 }
4135 secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
4136 primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
4137 primary1 = (uint8_t)(order >> 8);
4138
4139
4140 if(shifted && ((notIsContinuation && order <= variableTopValue && primary1 > 0)
4141 || (!notIsContinuation && wasShifted))
4142 || (wasShifted && primary1 == 0)) { /* amendment to the UCA says that primary ignorables */
4143 /* and other ignorables should be removed if following a shifted code point */
4144 if(primary1 == 0) { /* if we were shifted and we got an ignorable code point */
4145 /* we should just completely ignore it */
4146 continue;
4147 }
4148 if(compareQuad == 0) {
4149 if(c4 > 0) {
4150 currentSize += (c2/UCOL_BOT_COUNT4)+1;
4151 c4 = 0;
4152 }
4153 currentSize++;
4154 if(primary2 != 0) {
4155 currentSize++;
4156 }
4157 }
4158 wasShifted = TRUE;
4159 } else {
4160 wasShifted = FALSE;
4161 /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */
4162 /* Usually, we'll have non-zero primary1 & primary2, except in cases of LatinOne and friends, when primary2 will */
4163 /* calculate sortkey size */
4164 if(primary1 != UCOL_IGNORABLE) {
4165 if(notIsContinuation) {
4166 if(leadPrimary == primary1) {
4167 currentSize++;
4168 } else {
4169 if(leadPrimary != 0) {
4170 currentSize++;
4171 }
4172 if(primary2 == UCOL_IGNORABLE) {
4173 /* one byter, not compressed */
4174 currentSize++;
4175 leadPrimary = 0;
4176 } else if(primary1<UCOL_BYTE_FIRST_NON_LATIN_PRIMARY ||
4177 //(primary1 > (UCOL_RESET_TOP_VALUE>>24) && primary1 < (UCOL_NEXT_TOP_VALUE>>24))) {
4178 //(primary1 > (*UCAconsts->UCA_LAST_NON_VARIABLE>>24) && primary1 < (*UCAconsts->UCA_FIRST_IMPLICIT>>24))) {
4179 (primary1 > maxRegularPrimary && primary1 < minImplicitPrimary)) {
4180 /* not compressible */
4181 leadPrimary = 0;
4182 currentSize+=2;
4183 } else { /* compress */
4184 leadPrimary = primary1;
4185 currentSize+=2;
4186 }
4187 }
4188 } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */
4189 currentSize++;
4190 if(primary2 != UCOL_IGNORABLE) {
4191 currentSize++;
4192 }
4193 }
4194 }
4195
4196 if(secondary > compareSec) { /* I think that != 0 test should be != IGNORABLE */
4197 if(!isFrenchSec){
4198 if (secondary == UCOL_COMMON2 && notIsContinuation) {
4199 c2++;
4200 } else {
4201 if(c2 > 0) {
4202 if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
4203 currentSize += (c2/(uint32_t)UCOL_TOP_COUNT2)+1;
4204 } else {
4205 currentSize += (c2/(uint32_t)UCOL_BOT_COUNT2)+1;
4206 }
4207 c2 = 0;
4208 }
4209 currentSize++;
4210 }
4211 } else {
4212 fSecs[fSecsLen++] = secondary;
4213 if(fSecsLen == fSecsMaxLen) {
4214 if(fSecs == fSecsBuff) {
4215 fSecs = (uint8_t *)uprv_malloc(2*fSecsLen);
4216 } else {
4217 fSecs = (uint8_t *)uprv_realloc(fSecs, 2*fSecsLen);
4218 }
4219 if(fSecs == NULL) {
4220 status = U_MEMORY_ALLOCATION_ERROR;
4221 return -1;
4222 }
4223 fSecsMaxLen *= 2;
4224 }
4225 if(notIsContinuation) {
4226 if (frenchStartPtr != NULL) {
4227 /* reverse secondaries from frenchStartPtr up to frenchEndPtr */
4228 uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr);
4229 frenchStartPtr = NULL;
4230 }
4231 } else {
4232 if (frenchStartPtr == NULL) {
4233 frenchStartPtr = fSecs+fSecsLen-2;
4234 }
4235 frenchEndPtr = fSecs+fSecsLen-1;
4236 }
4237 }
4238 }
4239
4240 if(doCase && (primary1 > 0 || strength >= UCOL_SECONDARY)) {
4241 // do the case level if we need to do it. We don't want to calculate
4242 // case level for primary ignorables if we have only primary strength and case level
4243 // otherwise we would break well formedness of CEs
4244 if (caseShift == 0) {
4245 currentSize++;
4246 caseShift = UCOL_CASE_SHIFT_START;
4247 }
4248 if((tertiary&0x3F) > 0 && notIsContinuation) {
4249 caseShift--;
4250 if((tertiary &0xC0) != 0) {
4251 if (caseShift == 0) {
4252 currentSize++;
4253 caseShift = UCOL_CASE_SHIFT_START;
4254 }
4255 caseShift--;
4256 }
4257 }
4258 } else {
4259 if(notIsContinuation) {
4260 tertiary ^= caseSwitch;
4261 }
4262 }
4263
4264 tertiary &= tertiaryMask;
4265 if(tertiary > compareTer) { /* I think that != 0 test should be != IGNORABLE */
4266 if (tertiary == tertiaryCommon && notIsContinuation) {
4267 c3++;
4268 } else {
4269 if(c3 > 0) {
4270 if((tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL)
4271 || (tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST)) {
4272 currentSize += (c3/(uint32_t)coll->tertiaryTopCount)+1;
4273 } else {
4274 currentSize += (c3/(uint32_t)coll->tertiaryBottomCount)+1;
4275 }
4276 c3 = 0;
4277 }
4278 currentSize++;
4279 }
4280 }
4281
4282 if(/*qShifted*/(compareQuad==0) && notIsContinuation) {
4283 if(s->flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we need to note it
4284 if(c4>0) { // Close this part
4285 currentSize += (c4/UCOL_BOT_COUNT4)+1;
4286 c4 = 0;
4287 }
4288 currentSize++; // Add the Hiragana
4289 } else { // This wasn't Hiragana, so we can continue adding stuff
4290 c4++;
4291 }
4292 }
4293
4294 }
4295 }
4296
4297 if(!isFrenchSec){
4298 if(c2 > 0) {
4299 currentSize += (c2/(uint32_t)UCOL_BOT_COUNT2)+((c2%(uint32_t)UCOL_BOT_COUNT2 != 0)?1:0);
4300 }
4301 } else {
4302 uint32_t i = 0;
4303 if(frenchStartPtr != NULL) {
4304 uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr);
4305 }
4306 for(i = 0; i<fSecsLen; i++) {
4307 secondary = *(fSecs+fSecsLen-i-1);
4308 /* This is compression code. */
4309 if (secondary == UCOL_COMMON2) {
4310 ++c2;
4311 } else {
4312 if(c2 > 0) {
4313 if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
4314 currentSize += (c2/(uint32_t)UCOL_TOP_COUNT2)+((c2%(uint32_t)UCOL_TOP_COUNT2 != 0)?1:0);
4315 } else {
4316 currentSize += (c2/(uint32_t)UCOL_BOT_COUNT2)+((c2%(uint32_t)UCOL_BOT_COUNT2 != 0)?1:0);
4317 }
4318 c2 = 0;
4319 }
4320 currentSize++;
4321 }
4322 }
4323 if(c2 > 0) {
4324 currentSize += (c2/(uint32_t)UCOL_BOT_COUNT2)+((c2%(uint32_t)UCOL_BOT_COUNT2 != 0)?1:0);
4325 }
4326 if(fSecs != fSecsBuff) {
4327 uprv_free(fSecs);
4328 }
4329 }
4330
4331 if(c3 > 0) {
4332 currentSize += (c3/(uint32_t)coll->tertiaryBottomCount) + ((c3%(uint32_t)coll->tertiaryBottomCount != 0)?1:0);
4333 }
4334
4335 if(c4 > 0 && compareQuad == 0) {
4336 currentSize += (c4/(uint32_t)UCOL_BOT_COUNT4)+((c4%(uint32_t)UCOL_BOT_COUNT4 != 0)?1:0);
4337 }
4338
4339 if(compareIdent) {
4340 currentSize += u_lengthOfIdenticalLevelRun(s->string, len);
4341 }
4342 return currentSize;
4343
4344 }
4345
4346 static
doCaseShift(uint8_t ** cases,uint32_t & caseShift)4347 inline void doCaseShift(uint8_t **cases, uint32_t &caseShift) {
4348 if (caseShift == 0) {
4349 *(*cases)++ = UCOL_CASE_BYTE_START;
4350 caseShift = UCOL_CASE_SHIFT_START;
4351 }
4352 }
4353
// Appends value at primaries and advances primaries, but only while primaries
// is still below limit. size is incremented on every call regardless, so it
// ends up holding the number of values we wanted to add, even if they did
// not all fit.
static
inline void addWithIncrement(uint8_t *&primaries, uint8_t *limit, uint32_t &size, const uint8_t value) {
    ++size;
    if(primaries >= limit) {
        // no room left: the value is counted but dropped
        return;
    }
    *primaries = value;
    ++primaries;
}
4363
4364 // Packs the secondary buffer when processing French locale. Adds the terminator.
4365 static
packFrench(uint8_t * primaries,uint8_t * primEnd,uint8_t * secondaries,uint32_t * secsize,uint8_t * frenchStartPtr,uint8_t * frenchEndPtr)4366 inline uint8_t *packFrench(uint8_t *primaries, uint8_t *primEnd, uint8_t *secondaries, uint32_t *secsize, uint8_t *frenchStartPtr, uint8_t *frenchEndPtr) {
4367 uint8_t secondary;
4368 int32_t count2 = 0;
4369 uint32_t i = 0, size = 0;
4370 // we use i here since the key size already accounts for terminators, so we'll discard the increment
4371 addWithIncrement(primaries, primEnd, i, UCOL_LEVELTERMINATOR);
4372 /* If there are any unresolved continuation secondaries, reverse them here so that we can reverse the whole secondary thing */
4373 if(frenchStartPtr != NULL) {
4374 uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr);
4375 }
4376 for(i = 0; i<*secsize; i++) {
4377 secondary = *(secondaries-i-1);
4378 /* This is compression code. */
4379 if (secondary == UCOL_COMMON2) {
4380 ++count2;
4381 } else {
4382 if (count2 > 0) {
4383 if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
4384 while (count2 > UCOL_TOP_COUNT2) {
4385 addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2));
4386 count2 -= (uint32_t)UCOL_TOP_COUNT2;
4387 }
4388 addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_TOP2 - (count2-1)));
4389 } else {
4390 while (count2 > UCOL_BOT_COUNT2) {
4391 addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2));
4392 count2 -= (uint32_t)UCOL_BOT_COUNT2;
4393 }
4394 addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_BOT2 + (count2-1)));
4395 }
4396 count2 = 0;
4397 }
4398 addWithIncrement(primaries, primEnd, size, secondary);
4399 }
4400 }
4401 if (count2 > 0) {
4402 while (count2 > UCOL_BOT_COUNT2) {
4403 addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2));
4404 count2 -= (uint32_t)UCOL_BOT_COUNT2;
4405 }
4406 addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_BOT2 + (count2-1)));
4407 }
4408 *secsize = size;
4409 return primaries;
4410 }
4411
4412 /* This is the sortkey work horse function */
4413 U_CFUNC int32_t U_CALLCONV
ucol_calcSortKey(const UCollator * coll,const UChar * source,int32_t sourceLength,uint8_t ** result,uint32_t resultLength,UBool allocateSKBuffer,UErrorCode * status)4414 ucol_calcSortKey(const UCollator *coll,
4415 const UChar *source,
4416 int32_t sourceLength,
4417 uint8_t **result,
4418 uint32_t resultLength,
4419 UBool allocateSKBuffer,
4420 UErrorCode *status)
4421 {
4422 //const UCAConstants *UCAconsts = (UCAConstants *)((uint8_t *)coll->UCA->image + coll->image->UCAConsts);
4423
4424 uint32_t i = 0; /* general purpose counter */
4425
4426 /* Stack allocated buffers for buffers we use */
4427 uint8_t prim[UCOL_PRIMARY_MAX_BUFFER], second[UCOL_SECONDARY_MAX_BUFFER], tert[UCOL_TERTIARY_MAX_BUFFER], caseB[UCOL_CASE_MAX_BUFFER], quad[UCOL_QUAD_MAX_BUFFER];
4428
4429 uint8_t *primaries = *result, *secondaries = second, *tertiaries = tert, *cases = caseB, *quads = quad;
4430
4431 if(U_FAILURE(*status)) {
4432 return 0;
4433 }
4434
4435 if(primaries == NULL && allocateSKBuffer == TRUE) {
4436 primaries = *result = prim;
4437 resultLength = UCOL_PRIMARY_MAX_BUFFER;
4438 }
4439
4440 uint32_t secSize = UCOL_SECONDARY_MAX_BUFFER, terSize = UCOL_TERTIARY_MAX_BUFFER,
4441 caseSize = UCOL_CASE_MAX_BUFFER, quadSize = UCOL_QUAD_MAX_BUFFER;
4442
4443 uint32_t sortKeySize = 1; /* it is always \0 terminated */
4444
4445 UChar normBuffer[UCOL_NORMALIZATION_MAX_BUFFER];
4446 UChar *normSource = normBuffer;
4447 int32_t normSourceLen = UCOL_NORMALIZATION_MAX_BUFFER;
4448
4449 int32_t len = (sourceLength == -1 ? u_strlen(source) : sourceLength);
4450
4451 UColAttributeValue strength = coll->strength;
4452
4453 uint8_t compareSec = (uint8_t)((strength >= UCOL_SECONDARY)?0:0xFF);
4454 uint8_t compareTer = (uint8_t)((strength >= UCOL_TERTIARY)?0:0xFF);
4455 uint8_t compareQuad = (uint8_t)((strength >= UCOL_QUATERNARY)?0:0xFF);
4456 UBool compareIdent = (strength == UCOL_IDENTICAL);
4457 UBool doCase = (coll->caseLevel == UCOL_ON);
4458 UBool isFrenchSec = (coll->frenchCollation == UCOL_ON) && (compareSec == 0);
4459 UBool shifted = (coll->alternateHandling == UCOL_SHIFTED);
4460 //UBool qShifted = shifted && (compareQuad == 0);
4461 UBool doHiragana = (coll->hiraganaQ == UCOL_ON) && (compareQuad == 0);
4462 /*const uint8_t *scriptOrder = coll->scriptOrder;*/
4463
4464 uint32_t variableTopValue = coll->variableTopValue;
4465 // TODO: UCOL_COMMON_BOT4 should be a function of qShifted. If we have no
4466 // qShifted, we don't need to set UCOL_COMMON_BOT4 so high.
4467 uint8_t UCOL_COMMON_BOT4 = (uint8_t)((coll->variableTopValue>>8)+1);
4468 uint8_t UCOL_HIRAGANA_QUAD = 0;
4469 if(doHiragana) {
4470 UCOL_HIRAGANA_QUAD=UCOL_COMMON_BOT4++;
4471 /* allocate one more space for hiragana, value for hiragana */
4472 }
4473 uint8_t UCOL_BOT_COUNT4 = (uint8_t)(0xFF - UCOL_COMMON_BOT4);
4474
4475 /* support for special features like caselevel and funky secondaries */
4476 uint8_t *frenchStartPtr = NULL;
4477 uint8_t *frenchEndPtr = NULL;
4478 uint32_t caseShift = 0;
4479
4480 sortKeySize += ((compareSec?0:1) + (compareTer?0:1) + (doCase?1:0) + /*(qShifted?1:0)*/(compareQuad?0:1) + (compareIdent?1:0));
4481
4482 /* If we need to normalize, we'll do it all at once at the beginning! */
4483 UNormalizationMode normMode;
4484 if(compareIdent) {
4485 normMode = UNORM_NFD;
4486 } else if(coll->normalizationMode != UCOL_OFF) {
4487 normMode = UNORM_FCD;
4488 } else {
4489 normMode = UNORM_NONE;
4490 }
4491
4492 if(normMode != UNORM_NONE && UNORM_YES != unorm_quickCheck(source, len, normMode, status)) {
4493 len = unorm_internalNormalize(normSource, normSourceLen,
4494 source, len,
4495 normMode, FALSE,
4496 status);
4497 if(*status == U_BUFFER_OVERFLOW_ERROR) {
4498 normSourceLen = len;
4499 normSource = (UChar *)uprv_malloc(len*U_SIZEOF_UCHAR);
4500 if(normSource == NULL) {
4501 *status = U_MEMORY_ALLOCATION_ERROR;
4502 return 0;
4503 }
4504 *status = U_ZERO_ERROR;
4505 len = unorm_internalNormalize(normSource, normSourceLen,
4506 source, len,
4507 normMode, FALSE,
4508 status);
4509 }
4510
4511 if(U_FAILURE(*status)) {
4512 return 0;
4513 }
4514 source = normSource;
4515 }
4516
4517 collIterate s;
4518 IInit_collIterate(coll, (UChar *)source, len, &s);
4519 if(source == normSource) {
4520 s.flags &= ~UCOL_ITER_NORM;
4521 }
4522
4523 if(resultLength == 0 || primaries == NULL) {
4524 int32_t keyLen = ucol_getSortKeySize(coll, &s, sortKeySize, strength, len);
4525 if(normSource != normBuffer) {
4526 uprv_free(normSource);
4527 }
4528 return keyLen;
4529 }
4530 uint8_t *primarySafeEnd = primaries + resultLength - 1;
4531 if(strength > UCOL_PRIMARY) {
4532 primarySafeEnd--;
4533 }
4534
4535 uint32_t minBufferSize = UCOL_MAX_BUFFER;
4536
4537 uint8_t *primStart = primaries;
4538 uint8_t *secStart = secondaries;
4539 uint8_t *terStart = tertiaries;
4540 uint8_t *caseStart = cases;
4541 uint8_t *quadStart = quads;
4542
4543 uint32_t order = 0;
4544
4545 uint8_t primary1 = 0;
4546 uint8_t primary2 = 0;
4547 uint8_t secondary = 0;
4548 uint8_t tertiary = 0;
4549 uint8_t caseSwitch = coll->caseSwitch;
4550 uint8_t tertiaryMask = coll->tertiaryMask;
4551 int8_t tertiaryAddition = (int8_t)coll->tertiaryAddition;
4552 uint8_t tertiaryTop = coll->tertiaryTop;
4553 uint8_t tertiaryBottom = coll->tertiaryBottom;
4554 uint8_t tertiaryCommon = coll->tertiaryCommon;
4555 uint8_t caseBits = 0;
4556
4557 UBool finished = FALSE;
4558 UBool wasShifted = FALSE;
4559 UBool notIsContinuation = FALSE;
4560
4561 uint32_t prevBuffSize = 0;
4562
4563 uint32_t count2 = 0, count3 = 0, count4 = 0;
4564 uint8_t leadPrimary = 0;
4565
4566 for(;;) {
4567 for(i=prevBuffSize; i<minBufferSize; ++i) {
4568
4569 order = ucol_IGetNextCE(coll, &s, status);
4570 if(order == UCOL_NO_MORE_CES) {
4571 finished = TRUE;
4572 break;
4573 }
4574
4575 if(order == 0) {
4576 continue;
4577 }
4578
4579 notIsContinuation = !isContinuation(order);
4580
4581 if(notIsContinuation) {
4582 tertiary = (uint8_t)(order & UCOL_BYTE_SIZE_MASK);
4583 } else {
4584 tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION));
4585 }
4586
4587 secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
4588 primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
4589 primary1 = (uint8_t)(order >> 8);
4590
4591 /*if(notIsContinuation && scriptOrder != NULL) {
4592 primary1 = scriptOrder[primary1];
4593 }*/
4594
4595 if(shifted && ((notIsContinuation && order <= variableTopValue && primary1 > 0)
4596 || (!notIsContinuation && wasShifted))
4597 || (wasShifted && primary1 == 0)) { /* amendment to the UCA says that primary ignorables */
4598 /* and other ignorables should be removed if following a shifted code point */
4599 if(primary1 == 0) { /* if we were shifted and we got an ignorable code point */
4600 /* we should just completely ignore it */
4601 continue;
4602 }
4603 if(compareQuad == 0) {
4604 if(count4 > 0) {
4605 while (count4 > UCOL_BOT_COUNT4) {
4606 *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4);
4607 count4 -= UCOL_BOT_COUNT4;
4608 }
4609 *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + (count4-1));
4610 count4 = 0;
4611 }
4612 /* We are dealing with a variable and we're treating them as shifted */
4613 /* This is a shifted ignorable */
4614 if(primary1 != 0) { /* we need to check this since we could be in continuation */
4615 *quads++ = primary1;
4616 }
4617 if(primary2 != 0) {
4618 *quads++ = primary2;
4619 }
4620 }
4621 wasShifted = TRUE;
4622 } else {
4623 wasShifted = FALSE;
4624 /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */
4625 /* Usually, we'll have non-zero primary1 & primary2, except in cases of LatinOne and friends, when primary2 will */
4626 /* regular and simple sortkey calc */
4627 if(primary1 != UCOL_IGNORABLE) {
4628 if(notIsContinuation) {
4629 if(leadPrimary == primary1) {
4630 *primaries++ = primary2;
4631 } else {
4632 if(leadPrimary != 0) {
4633 *primaries++ = (uint8_t)((primary1 > leadPrimary) ? UCOL_BYTE_UNSHIFTED_MAX : UCOL_BYTE_UNSHIFTED_MIN);
4634 }
4635 if(primary2 == UCOL_IGNORABLE) {
4636 /* one byter, not compressed */
4637 *primaries++ = primary1;
4638 leadPrimary = 0;
4639 } else if(primary1<UCOL_BYTE_FIRST_NON_LATIN_PRIMARY ||
4640 //(primary1 > (*UCAconsts->UCA_LAST_NON_VARIABLE>>24) && primary1 < (*UCAconsts->UCA_FIRST_IMPLICIT>>24))) {
4641 (primary1 > maxRegularPrimary && primary1 < minImplicitPrimary)) {
4642 /* not compressible */
4643 leadPrimary = 0;
4644 *primaries++ = primary1;
4645 *primaries++ = primary2;
4646 } else { /* compress */
4647 *primaries++ = leadPrimary = primary1;
4648 *primaries++ = primary2;
4649 }
4650 }
4651 } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */
4652 *primaries++ = primary1;
4653 if(primary2 != UCOL_IGNORABLE) {
4654 *primaries++ = primary2; /* second part */
4655 }
4656 }
4657 }
4658
4659 if(secondary > compareSec) {
4660 if(!isFrenchSec) {
4661 /* This is compression code. */
4662 if (secondary == UCOL_COMMON2 && notIsContinuation) {
4663 ++count2;
4664 } else {
4665 if (count2 > 0) {
4666 if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
4667 while (count2 > UCOL_TOP_COUNT2) {
4668 *secondaries++ = (uint8_t)(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2);
4669 count2 -= (uint32_t)UCOL_TOP_COUNT2;
4670 }
4671 *secondaries++ = (uint8_t)(UCOL_COMMON_TOP2 - (count2-1));
4672 } else {
4673 while (count2 > UCOL_BOT_COUNT2) {
4674 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
4675 count2 -= (uint32_t)UCOL_BOT_COUNT2;
4676 }
4677 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + (count2-1));
4678 }
4679 count2 = 0;
4680 }
4681 *secondaries++ = secondary;
4682 }
4683 } else {
4684 *secondaries++ = secondary;
4685 /* Do the special handling for French secondaries */
4686 /* We need to get continuation elements and do intermediate restore */
4687 /* abc1c2c3de with french secondaries need to be edc1c2c3ba NOT edc3c2c1ba */
4688 if(notIsContinuation) {
4689 if (frenchStartPtr != NULL) {
4690 /* reverse secondaries from frenchStartPtr up to frenchEndPtr */
4691 uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr);
4692 frenchStartPtr = NULL;
4693 }
4694 } else {
4695 if (frenchStartPtr == NULL) {
4696 frenchStartPtr = secondaries - 2;
4697 }
4698 frenchEndPtr = secondaries-1;
4699 }
4700 }
4701 }
4702
4703 if(doCase && (primary1 > 0 || strength >= UCOL_SECONDARY)) {
4704 // do the case level if we need to do it. We don't want to calculate
4705 // case level for primary ignorables if we have only primary strength and case level
4706 // otherwise we would break well formedness of CEs
4707 doCaseShift(&cases, caseShift);
4708 if(notIsContinuation) {
4709 caseBits = (uint8_t)(tertiary & 0xC0);
4710
4711 if(tertiary != 0) {
4712 if(coll->caseFirst == UCOL_UPPER_FIRST) {
4713 if((caseBits & 0xC0) == 0) {
4714 *(cases-1) |= 1 << (--caseShift);
4715 } else {
4716 *(cases-1) |= 0 << (--caseShift);
4717 /* second bit */
4718 doCaseShift(&cases, caseShift);
4719 *(cases-1) |= ((caseBits>>6)&1) << (--caseShift);
4720 }
4721 } else {
4722 if((caseBits & 0xC0) == 0) {
4723 *(cases-1) |= 0 << (--caseShift);
4724 } else {
4725 *(cases-1) |= 1 << (--caseShift);
4726 /* second bit */
4727 doCaseShift(&cases, caseShift);
4728 *(cases-1) |= ((caseBits>>7)&1) << (--caseShift);
4729 }
4730 }
4731 }
4732
4733 }
4734 } else {
4735 if(notIsContinuation) {
4736 tertiary ^= caseSwitch;
4737 }
4738 }
4739
4740 tertiary &= tertiaryMask;
4741 if(tertiary > compareTer) {
4742 /* This is compression code. */
4743 /* sequence size check is included in the if clause */
4744 if (tertiary == tertiaryCommon && notIsContinuation) {
4745 ++count3;
4746 } else {
4747 if(tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL) {
4748 tertiary += tertiaryAddition;
4749 } else if(tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST) {
4750 tertiary -= tertiaryAddition;
4751 }
4752 if (count3 > 0) {
4753 if ((tertiary > tertiaryCommon)) {
4754 while (count3 > coll->tertiaryTopCount) {
4755 *tertiaries++ = (uint8_t)(tertiaryTop - coll->tertiaryTopCount);
4756 count3 -= (uint32_t)coll->tertiaryTopCount;
4757 }
4758 *tertiaries++ = (uint8_t)(tertiaryTop - (count3-1));
4759 } else {
4760 while (count3 > coll->tertiaryBottomCount) {
4761 *tertiaries++ = (uint8_t)(tertiaryBottom + coll->tertiaryBottomCount);
4762 count3 -= (uint32_t)coll->tertiaryBottomCount;
4763 }
4764 *tertiaries++ = (uint8_t)(tertiaryBottom + (count3-1));
4765 }
4766 count3 = 0;
4767 }
4768 *tertiaries++ = tertiary;
4769 }
4770 }
4771
4772 if(/*qShifted*/(compareQuad==0) && notIsContinuation) {
4773 if(s.flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we need to note it
4774 if(count4>0) { // Close this part
4775 while (count4 > UCOL_BOT_COUNT4) {
4776 *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4);
4777 count4 -= UCOL_BOT_COUNT4;
4778 }
4779 *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + (count4-1));
4780 count4 = 0;
4781 }
4782 *quads++ = UCOL_HIRAGANA_QUAD; // Add the Hiragana
4783 } else { // This wasn't Hiragana, so we can continue adding stuff
4784 count4++;
4785 }
4786 }
4787 }
4788
4789 if(primaries > primarySafeEnd) { /* We have stepped over the primary buffer */
4790 if(allocateSKBuffer == FALSE) { /* need to save our butts if we cannot reallocate */
4791 IInit_collIterate(coll, (UChar *)source, len, &s);
4792 if(source == normSource) {
4793 s.flags &= ~UCOL_ITER_NORM;
4794 }
4795 sortKeySize = ucol_getSortKeySize(coll, &s, sortKeySize, strength, len);
4796 *status = U_BUFFER_OVERFLOW_ERROR;
4797 finished = TRUE;
4798 break;
4799 } else { /* It's much nicer if we can actually reallocate */
4800 int32_t sks = sortKeySize+(primaries - primStart)+(secondaries - secStart)+(tertiaries - terStart)+(cases-caseStart)+(quads-quadStart);
4801 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sks, status);
4802 if(U_SUCCESS(*status)) {
4803 *result = primStart;
4804 primarySafeEnd = primStart + resultLength - 1;
4805 if(strength > UCOL_PRIMARY) {
4806 primarySafeEnd--;
4807 }
4808 } else {
4809 IInit_collIterate(coll, (UChar *)source, len, &s);
4810 if(source == normSource) {
4811 s.flags &= ~UCOL_ITER_NORM;
4812 }
4813 sortKeySize = ucol_getSortKeySize(coll, &s, sortKeySize, strength, len);
4814 finished = TRUE;
4815 break;
4816 }
4817 }
4818 }
4819 }
4820 if(finished) {
4821 break;
4822 } else {
4823 prevBuffSize = minBufferSize;
4824
4825 uint32_t frenchStartOffset = 0, frenchEndOffset = 0;
4826 if (frenchStartPtr != NULL) {
4827 frenchStartOffset = frenchStartPtr - secStart;
4828 frenchEndOffset = frenchEndPtr - secStart;
4829 }
4830 secStart = reallocateBuffer(&secondaries, secStart, second, &secSize, 2*secSize, status);
4831 if (frenchStartPtr != NULL) {
4832 frenchStartPtr = secStart + frenchStartOffset;
4833 frenchEndPtr = secStart + frenchEndOffset;
4834 }
4835
4836 terStart = reallocateBuffer(&tertiaries, terStart, tert, &terSize, 2*terSize, status);
4837 caseStart = reallocateBuffer(&cases, caseStart, caseB, &caseSize, 2*caseSize, status);
4838 quadStart = reallocateBuffer(&quads, quadStart, quad, &quadSize, 2*quadSize, status);
4839 minBufferSize *= 2;
4840 if(U_FAILURE(*status)) { // if we cannot reallocate buffers, we can at least give the sortkey size
4841 IInit_collIterate(coll, (UChar *)source, len, &s);
4842 if(source == normSource) {
4843 s.flags &= ~UCOL_ITER_NORM;
4844 }
4845 sortKeySize = ucol_getSortKeySize(coll, &s, sortKeySize, strength, len);
4846 break;
4847 }
4848 }
4849 }
4850
4851 /* Here, we are generally done with processing */
4852 /* bailing out would not be too productive */
4853
4854 if(U_SUCCESS(*status)) {
4855 sortKeySize += (primaries - primStart);
4856 /* we have done all the CE's, now let's put them together to form a key */
4857 if(compareSec == 0) {
4858 if (count2 > 0) {
4859 while (count2 > UCOL_BOT_COUNT2) {
4860 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
4861 count2 -= (uint32_t)UCOL_BOT_COUNT2;
4862 }
4863 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + (count2-1));
4864 }
4865 uint32_t secsize = secondaries-secStart;
4866 if(!isFrenchSec) { // Regular situation, we know the length of secondaries
4867 sortKeySize += secsize;
4868 if(sortKeySize <= resultLength) {
4869 *(primaries++) = UCOL_LEVELTERMINATOR;
4870 uprv_memcpy(primaries, secStart, secsize);
4871 primaries += secsize;
4872 } else {
4873 if(allocateSKBuffer == TRUE) { /* need to save our butts if we cannot reallocate */
4874 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
4875 if(U_SUCCESS(*status)) {
4876 *result = primStart;
4877 *(primaries++) = UCOL_LEVELTERMINATOR;
4878 uprv_memcpy(primaries, secStart, secsize);
4879 primaries += secsize;
4880 }
4881 } else {
4882 *status = U_BUFFER_OVERFLOW_ERROR;
4883 }
4884 }
4885 } else { // French secondary is on. We will need to pack French. packFrench will add the level terminator
4886 uint8_t *newPrim = packFrench(primaries, primStart+resultLength, secondaries, &secsize, frenchStartPtr, frenchEndPtr);
4887 sortKeySize += secsize;
4888 if(sortKeySize <= resultLength) { // if we managed to pack fine
4889 primaries = newPrim; // update the primary pointer
4890 } else { // overflow, need to reallocate and redo
4891 if(allocateSKBuffer == TRUE) { /* need to save our butts if we cannot reallocate */
4892 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
4893 if(U_SUCCESS(*status)) {
4894 primaries = packFrench(primaries, primStart+resultLength, secondaries, &secsize, frenchStartPtr, frenchEndPtr);
4895 }
4896 } else {
4897 *status = U_BUFFER_OVERFLOW_ERROR;
4898 }
4899 }
4900 }
4901 }
4902
4903 if(doCase) {
4904 uint32_t casesize = cases - caseStart;
4905 sortKeySize += casesize;
4906 if(sortKeySize <= resultLength) {
4907 *(primaries++) = UCOL_LEVELTERMINATOR;
4908 uprv_memcpy(primaries, caseStart, casesize);
4909 primaries += casesize;
4910 } else {
4911 if(allocateSKBuffer == TRUE) {
4912 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
4913 if(U_SUCCESS(*status)) {
4914 *result = primStart;
4915 *(primaries++) = UCOL_LEVELTERMINATOR;
4916 uprv_memcpy(primaries, caseStart, casesize);
4917 }
4918 } else {
4919 *status = U_BUFFER_OVERFLOW_ERROR;
4920 }
4921 }
4922 }
4923
4924 if(compareTer == 0) {
4925 if (count3 > 0) {
4926 if (coll->tertiaryCommon != UCOL_COMMON_BOT3) {
4927 while (count3 >= coll->tertiaryTopCount) {
4928 *tertiaries++ = (uint8_t)(tertiaryTop - coll->tertiaryTopCount);
4929 count3 -= (uint32_t)coll->tertiaryTopCount;
4930 }
4931 *tertiaries++ = (uint8_t)(tertiaryTop - count3);
4932 } else {
4933 while (count3 > coll->tertiaryBottomCount) {
4934 *tertiaries++ = (uint8_t)(tertiaryBottom + coll->tertiaryBottomCount);
4935 count3 -= (uint32_t)coll->tertiaryBottomCount;
4936 }
4937 *tertiaries++ = (uint8_t)(tertiaryBottom + (count3-1));
4938 }
4939 }
4940 uint32_t tersize = tertiaries - terStart;
4941 sortKeySize += tersize;
4942 if(sortKeySize <= resultLength) {
4943 *(primaries++) = UCOL_LEVELTERMINATOR;
4944 uprv_memcpy(primaries, terStart, tersize);
4945 primaries += tersize;
4946 } else {
4947 if(allocateSKBuffer == TRUE) {
4948 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
4949 if(U_SUCCESS(*status)) {
4950 *result = primStart;
4951 *(primaries++) = UCOL_LEVELTERMINATOR;
4952 uprv_memcpy(primaries, terStart, tersize);
4953 }
4954 } else {
4955 *status = U_BUFFER_OVERFLOW_ERROR;
4956 }
4957 }
4958
4959 if(compareQuad == 0/*qShifted == TRUE*/) {
4960 if(count4 > 0) {
4961 while (count4 > UCOL_BOT_COUNT4) {
4962 *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4);
4963 count4 -= UCOL_BOT_COUNT4;
4964 }
4965 *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + (count4-1));
4966 }
4967 uint32_t quadsize = quads - quadStart;
4968 sortKeySize += quadsize;
4969 if(sortKeySize <= resultLength) {
4970 *(primaries++) = UCOL_LEVELTERMINATOR;
4971 uprv_memcpy(primaries, quadStart, quadsize);
4972 primaries += quadsize;
4973 } else {
4974 if(allocateSKBuffer == TRUE) {
4975 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
4976 if(U_SUCCESS(*status)) {
4977 *result = primStart;
4978 *(primaries++) = UCOL_LEVELTERMINATOR;
4979 uprv_memcpy(primaries, quadStart, quadsize);
4980 }
4981 } else {
4982 *status = U_BUFFER_OVERFLOW_ERROR;
4983 }
4984 }
4985 }
4986
4987 if(compareIdent) {
4988 sortKeySize += u_lengthOfIdenticalLevelRun(s.string, len);
4989 if(sortKeySize <= resultLength) {
4990 *(primaries++) = UCOL_LEVELTERMINATOR;
4991 primaries += u_writeIdenticalLevelRun(s.string, len, primaries);
4992 } else {
4993 if(allocateSKBuffer == TRUE) {
4994 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, sortKeySize, status);
4995 if(U_SUCCESS(*status)) {
4996 *result = primStart;
4997 *(primaries++) = UCOL_LEVELTERMINATOR;
4998 u_writeIdenticalLevelRun(s.string, len, primaries);
4999 }
5000 } else {
5001 *status = U_BUFFER_OVERFLOW_ERROR;
5002 }
5003 }
5004 }
5005 }
5006 *(primaries++) = '\0';
5007 }
5008
5009 if(terStart != tert) {
5010 uprv_free(terStart);
5011 uprv_free(secStart);
5012 uprv_free(caseStart);
5013 uprv_free(quadStart);
5014 }
5015
5016 if(normSource != normBuffer) {
5017 uprv_free(normSource);
5018 }
5019
5020 if(allocateSKBuffer == TRUE) {
5021 *result = (uint8_t*)uprv_malloc(sortKeySize);
5022 /* test for NULL */
5023 if (*result == NULL) {
5024 *status = U_MEMORY_ALLOCATION_ERROR;
5025 return sortKeySize;
5026 }
5027 uprv_memcpy(*result, primStart, sortKeySize);
5028 if(primStart != prim) {
5029 uprv_free(primStart);
5030 }
5031 }
5032
5033 return sortKeySize;
5034 }
5035
5036
5037 U_CFUNC int32_t U_CALLCONV
ucol_calcSortKeySimpleTertiary(const UCollator * coll,const UChar * source,int32_t sourceLength,uint8_t ** result,uint32_t resultLength,UBool allocateSKBuffer,UErrorCode * status)5038 ucol_calcSortKeySimpleTertiary(const UCollator *coll,
5039 const UChar *source,
5040 int32_t sourceLength,
5041 uint8_t **result,
5042 uint32_t resultLength,
5043 UBool allocateSKBuffer,
5044 UErrorCode *status)
5045 {
5046 U_ALIGN_CODE(16);
5047
5048 //const UCAConstants *UCAconsts = (UCAConstants *)((uint8_t *)coll->UCA->image + coll->image->UCAConsts);
5049 uint32_t i = 0; /* general purpose counter */
5050
5051 /* Stack allocated buffers for buffers we use */
5052 uint8_t prim[UCOL_PRIMARY_MAX_BUFFER], second[UCOL_SECONDARY_MAX_BUFFER], tert[UCOL_TERTIARY_MAX_BUFFER];
5053
5054 uint8_t *primaries = *result, *secondaries = second, *tertiaries = tert;
5055
5056 if(U_FAILURE(*status)) {
5057 return 0;
5058 }
5059
5060 if(primaries == NULL && allocateSKBuffer == TRUE) {
5061 primaries = *result = prim;
5062 resultLength = UCOL_PRIMARY_MAX_BUFFER;
5063 }
5064
5065 uint32_t secSize = UCOL_SECONDARY_MAX_BUFFER, terSize = UCOL_TERTIARY_MAX_BUFFER;
5066
5067 uint32_t sortKeySize = 3; /* it is always \0 terminated plus separators for secondary and tertiary */
5068
5069 UChar normBuffer[UCOL_NORMALIZATION_MAX_BUFFER];
5070 UChar *normSource = normBuffer;
5071 int32_t normSourceLen = UCOL_NORMALIZATION_MAX_BUFFER;
5072
5073 int32_t len = sourceLength;
5074
5075 /* If we need to normalize, we'll do it all at once at the beginning! */
5076 if(coll->normalizationMode != UCOL_OFF && UNORM_YES != unorm_quickCheck(source, len, UNORM_FCD, status)) {
5077 len = unorm_internalNormalize(normSource, normSourceLen,
5078 source, len,
5079 UNORM_FCD, FALSE,
5080 status);
5081 if(*status == U_BUFFER_OVERFLOW_ERROR) {
5082 normSourceLen = len;
5083 normSource = (UChar *)uprv_malloc(len*U_SIZEOF_UCHAR);
5084 if(normSource == NULL) {
5085 *status = U_MEMORY_ALLOCATION_ERROR;
5086 return 0;
5087 }
5088 *status = U_ZERO_ERROR;
5089 len = unorm_internalNormalize(normSource, normSourceLen,
5090 source, len,
5091 UNORM_FCD, FALSE,
5092 status);
5093 }
5094
5095 if(U_FAILURE(*status)) {
5096 return 0;
5097 }
5098 source = normSource;
5099 }
5100
5101 collIterate s;
5102 IInit_collIterate(coll, (UChar *)source, len, &s);
5103 if(source == normSource) {
5104 s.flags &= ~UCOL_ITER_NORM;
5105 }
5106
5107 if(resultLength == 0 || primaries == NULL) {
5108 int32_t t = ucol_getSortKeySize(coll, &s, sortKeySize, coll->strength, len);
5109 if(normSource != normBuffer) {
5110 uprv_free(normSource);
5111 }
5112 return t;
5113 }
5114
5115 uint8_t *primarySafeEnd = primaries + resultLength - 2;
5116
5117 uint32_t minBufferSize = UCOL_MAX_BUFFER;
5118
5119 uint8_t *primStart = primaries;
5120 uint8_t *secStart = secondaries;
5121 uint8_t *terStart = tertiaries;
5122
5123 uint32_t order = 0;
5124
5125 uint8_t primary1 = 0;
5126 uint8_t primary2 = 0;
5127 uint8_t secondary = 0;
5128 uint8_t tertiary = 0;
5129 uint8_t caseSwitch = coll->caseSwitch;
5130 uint8_t tertiaryMask = coll->tertiaryMask;
5131 int8_t tertiaryAddition = (int8_t)coll->tertiaryAddition;
5132 uint8_t tertiaryTop = coll->tertiaryTop;
5133 uint8_t tertiaryBottom = coll->tertiaryBottom;
5134 uint8_t tertiaryCommon = coll->tertiaryCommon;
5135
5136 uint32_t prevBuffSize = 0;
5137
5138 UBool finished = FALSE;
5139 UBool notIsContinuation = FALSE;
5140
5141 uint32_t count2 = 0, count3 = 0;
5142 uint8_t leadPrimary = 0;
5143
5144 for(;;) {
5145 for(i=prevBuffSize; i<minBufferSize; ++i) {
5146
5147 order = ucol_IGetNextCE(coll, &s, status);
5148
5149 if(order == 0) {
5150 continue;
5151 }
5152
5153 if(order == UCOL_NO_MORE_CES) {
5154 finished = TRUE;
5155 break;
5156 }
5157
5158 notIsContinuation = !isContinuation(order);
5159
5160 if(notIsContinuation) {
5161 tertiary = (uint8_t)((order & tertiaryMask));
5162 } else {
5163 tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION));
5164 }
5165 secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
5166 primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
5167 primary1 = (uint8_t)(order >> 8);
5168
5169 /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */
5170 /* Usually, we'll have non-zero primary1 & primary2, except in cases of LatinOne and friends, when primary2 will */
5171 /* be zero with non zero primary1. primary3 is different than 0 only for long primaries - see above. */
5172 /* regular and simple sortkey calc */
5173 if(primary1 != UCOL_IGNORABLE) {
5174 if(notIsContinuation) {
5175 if(leadPrimary == primary1) {
5176 *primaries++ = primary2;
5177 } else {
5178 if(leadPrimary != 0) {
5179 *primaries++ = (uint8_t)((primary1 > leadPrimary) ? UCOL_BYTE_UNSHIFTED_MAX : UCOL_BYTE_UNSHIFTED_MIN);
5180 }
5181 if(primary2 == UCOL_IGNORABLE) {
5182 /* one byter, not compressed */
5183 *primaries++ = primary1;
5184 leadPrimary = 0;
5185 } else if(primary1<UCOL_BYTE_FIRST_NON_LATIN_PRIMARY ||
5186 //(primary1 > (UCOL_RESET_TOP_VALUE>>24) && primary1 < (UCOL_NEXT_TOP_VALUE>>24)))
5187 //(primary1 > (*UCAconsts->UCA_LAST_NON_VARIABLE>>24) && primary1 < (*UCAconsts->UCA_FIRST_IMPLICIT>>24))) {
5188 (primary1 > maxRegularPrimary && primary1 < minImplicitPrimary)) {
5189 /* not compressible */
5190 leadPrimary = 0;
5191 *primaries++ = primary1;
5192 *primaries++ = primary2;
5193 } else { /* compress */
5194 *primaries++ = leadPrimary = primary1;
5195 *primaries++ = primary2;
5196 }
5197 }
5198 } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */
5199 *primaries++ = primary1;
5200 if(primary2 != UCOL_IGNORABLE) {
5201 *primaries++ = primary2; /* second part */
5202 }
5203 }
5204 }
5205
5206 if(secondary > 0) { /* I think that != 0 test should be != IGNORABLE */
5207 /* This is compression code. */
5208 if (secondary == UCOL_COMMON2 && notIsContinuation) {
5209 ++count2;
5210 } else {
5211 if (count2 > 0) {
5212 if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
5213 while (count2 > UCOL_TOP_COUNT2) {
5214 *secondaries++ = (uint8_t)(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2);
5215 count2 -= (uint32_t)UCOL_TOP_COUNT2;
5216 }
5217 *secondaries++ = (uint8_t)(UCOL_COMMON_TOP2 - (count2-1));
5218 } else {
5219 while (count2 > UCOL_BOT_COUNT2) {
5220 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
5221 count2 -= (uint32_t)UCOL_BOT_COUNT2;
5222 }
5223 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + (count2-1));
5224 }
5225 count2 = 0;
5226 }
5227 *secondaries++ = secondary;
5228 }
5229 }
5230
5231 if(notIsContinuation) {
5232 tertiary ^= caseSwitch;
5233 }
5234
5235 if(tertiary > 0) {
5236 /* This is compression code. */
5237 /* sequence size check is included in the if clause */
5238 if (tertiary == tertiaryCommon && notIsContinuation) {
5239 ++count3;
5240 } else {
5241 if(tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL) {
5242 tertiary += tertiaryAddition;
5243 } else if (tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST) {
5244 tertiary -= tertiaryAddition;
5245 }
5246 if (count3 > 0) {
5247 if ((tertiary > tertiaryCommon)) {
5248 while (count3 > coll->tertiaryTopCount) {
5249 *tertiaries++ = (uint8_t)(tertiaryTop - coll->tertiaryTopCount);
5250 count3 -= (uint32_t)coll->tertiaryTopCount;
5251 }
5252 *tertiaries++ = (uint8_t)(tertiaryTop - (count3-1));
5253 } else {
5254 while (count3 > coll->tertiaryBottomCount) {
5255 *tertiaries++ = (uint8_t)(tertiaryBottom + coll->tertiaryBottomCount);
5256 count3 -= (uint32_t)coll->tertiaryBottomCount;
5257 }
5258 *tertiaries++ = (uint8_t)(tertiaryBottom + (count3-1));
5259 }
5260 count3 = 0;
5261 }
5262 *tertiaries++ = tertiary;
5263 }
5264 }
5265
5266 if(primaries > primarySafeEnd) { /* We have stepped over the primary buffer */
5267 if(allocateSKBuffer == FALSE) { /* need to save our butts if we cannot reallocate */
5268 IInit_collIterate(coll, (UChar *)source, len, &s);
5269 if(source == normSource) {
5270 s.flags &= ~UCOL_ITER_NORM;
5271 }
5272 sortKeySize = ucol_getSortKeySize(coll, &s, sortKeySize, coll->strength, len);
5273 *status = U_BUFFER_OVERFLOW_ERROR;
5274 finished = TRUE;
5275 break;
5276 } else { /* It's much nicer if we can actually reallocate */
5277 int32_t sks = sortKeySize+(primaries - primStart)+(secondaries - secStart)+(tertiaries - terStart);
5278 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sks, status);
5279 if(U_SUCCESS(*status)) {
5280 *result = primStart;
5281 primarySafeEnd = primStart + resultLength - 2;
5282 } else {
5283 IInit_collIterate(coll, (UChar *)source, len, &s);
5284 if(source == normSource) {
5285 s.flags &= ~UCOL_ITER_NORM;
5286 }
5287 sortKeySize = ucol_getSortKeySize(coll, &s, sortKeySize, coll->strength, len);
5288 finished = TRUE;
5289 break;
5290 }
5291 }
5292 }
5293 }
5294 if(finished) {
5295 break;
5296 } else {
5297 prevBuffSize = minBufferSize;
5298 secStart = reallocateBuffer(&secondaries, secStart, second, &secSize, 2*secSize, status);
5299 terStart = reallocateBuffer(&tertiaries, terStart, tert, &terSize, 2*terSize, status);
5300 minBufferSize *= 2;
5301 if(U_FAILURE(*status)) { // if we cannot reallocate buffers, we can at least give the sortkey size
5302 IInit_collIterate(coll, (UChar *)source, len, &s);
5303 if(source == normSource) {
5304 s.flags &= ~UCOL_ITER_NORM;
5305 }
5306 sortKeySize = ucol_getSortKeySize(coll, &s, sortKeySize, coll->strength, len);
5307 break;
5308 }
5309 }
5310 }
5311
5312 if(U_SUCCESS(*status)) {
5313 sortKeySize += (primaries - primStart);
5314 /* we have done all the CE's, now let's put them together to form a key */
5315 if (count2 > 0) {
5316 while (count2 > UCOL_BOT_COUNT2) {
5317 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
5318 count2 -= (uint32_t)UCOL_BOT_COUNT2;
5319 }
5320 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + (count2-1));
5321 }
5322 uint32_t secsize = secondaries-secStart;
5323 sortKeySize += secsize;
5324 if(sortKeySize <= resultLength) {
5325 *(primaries++) = UCOL_LEVELTERMINATOR;
5326 uprv_memcpy(primaries, secStart, secsize);
5327 primaries += secsize;
5328 } else {
5329 if(allocateSKBuffer == TRUE) {
5330 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
5331 if(U_SUCCESS(*status)) {
5332 *(primaries++) = UCOL_LEVELTERMINATOR;
5333 *result = primStart;
5334 uprv_memcpy(primaries, secStart, secsize);
5335 }
5336 } else {
5337 *status = U_BUFFER_OVERFLOW_ERROR;
5338 }
5339 }
5340
5341 if (count3 > 0) {
5342 if (coll->tertiaryCommon != UCOL_COMMON3_NORMAL) {
5343 while (count3 >= coll->tertiaryTopCount) {
5344 *tertiaries++ = (uint8_t)(tertiaryTop - coll->tertiaryTopCount);
5345 count3 -= (uint32_t)coll->tertiaryTopCount;
5346 }
5347 *tertiaries++ = (uint8_t)(tertiaryTop - count3);
5348 } else {
5349 while (count3 > coll->tertiaryBottomCount) {
5350 *tertiaries++ = (uint8_t)(tertiaryBottom + coll->tertiaryBottomCount);
5351 count3 -= (uint32_t)coll->tertiaryBottomCount;
5352 }
5353 *tertiaries++ = (uint8_t)(tertiaryBottom + (count3-1));
5354 }
5355 }
5356 uint32_t tersize = tertiaries - terStart;
5357 sortKeySize += tersize;
5358 if(sortKeySize <= resultLength) {
5359 *(primaries++) = UCOL_LEVELTERMINATOR;
5360 uprv_memcpy(primaries, terStart, tersize);
5361 primaries += tersize;
5362 } else {
5363 if(allocateSKBuffer == TRUE) {
5364 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
5365 if(U_SUCCESS(*status)) {
5366 *result = primStart;
5367 *(primaries++) = UCOL_LEVELTERMINATOR;
5368 uprv_memcpy(primaries, terStart, tersize);
5369 }
5370 } else {
5371 *status = U_MEMORY_ALLOCATION_ERROR;
5372 }
5373 }
5374
5375 *(primaries++) = '\0';
5376 }
5377
5378 if(terStart != tert) {
5379 uprv_free(terStart);
5380 uprv_free(secStart);
5381 }
5382
5383 if(normSource != normBuffer) {
5384 uprv_free(normSource);
5385 }
5386
5387 if(allocateSKBuffer == TRUE) {
5388 *result = (uint8_t*)uprv_malloc(sortKeySize);
5389 /* test for NULL */
5390 if (*result == NULL) {
5391 *status = U_MEMORY_ALLOCATION_ERROR;
5392 return sortKeySize;
5393 }
5394 uprv_memcpy(*result, primStart, sortKeySize);
5395 if(primStart != prim) {
5396 uprv_free(primStart);
5397 }
5398 }
5399
5400 return sortKeySize;
5401 }
5402
5403 static inline
isShiftedCE(uint32_t CE,uint32_t LVT,UBool * wasShifted)5404 UBool isShiftedCE(uint32_t CE, uint32_t LVT, UBool *wasShifted) {
5405 UBool notIsContinuation = !isContinuation(CE);
5406 uint8_t primary1 = (uint8_t)((CE >> 24) & 0xFF);
5407 if(LVT && ((notIsContinuation && (CE & 0xFFFF0000)<= LVT && primary1 > 0)
5408 || (!notIsContinuation && *wasShifted))
5409 || (*wasShifted && primary1 == 0)) { /* amendment to the UCA says that primary ignorables */
5410 // The stuff below should probably be in the sortkey code... maybe not...
5411 if(primary1 != 0) { /* if we were shifted and we got an ignorable code point */
5412 /* we should just completely ignore it */
5413 *wasShifted = TRUE;
5414 //continue;
5415 }
5416 //*wasShifted = TRUE;
5417 return TRUE;
5418 } else {
5419 *wasShifted = FALSE;
5420 return FALSE;
5421 }
5422 }
5423 static inline
terminatePSKLevel(int32_t level,int32_t maxLevel,int32_t & i,uint8_t * dest)5424 void terminatePSKLevel(int32_t level, int32_t maxLevel, int32_t &i, uint8_t *dest) {
5425 if(level < maxLevel) {
5426 dest[i++] = UCOL_LEVELTERMINATOR;
5427 } else {
5428 dest[i++] = 0;
5429 }
5430 }
5431
/**
 * Level identifiers for partial sort key generation.
 * The identifiers 0..7 fit into three bits.
 */
enum {
    UCOL_PSK_PRIMARY    = 0,
    UCOL_PSK_SECONDARY  = 1,
    UCOL_PSK_CASE       = 2,
    UCOL_PSK_TERTIARY   = 3,
    UCOL_PSK_QUATERNARY = 4,
    /** Extra level, not used - but there are three bits available */
    UCOL_PSK_QUIN       = 5,
    UCOL_PSK_IDENTICAL  = 6,
    /** Level for the end of the sort key. Will just produce zeros */
    UCOL_PSK_NULL       = 7,
    /** One past the last level identifier */
    UCOL_PSK_LIMIT      = UCOL_PSK_NULL + 1
};
5444
/**
 * Layout of the collation processing state kept in the state[1] field.
 * For every piece there is a *_SHIFT constant (how far to shift the state
 * word right to bring the piece to the low bits) and a *_MASK constant
 * (to be ANDed with the shifted value).
 */
enum {
    /** Level identifier: one of the UCOL_PSK_* level values. Three bits. */
    UCOL_PSK_LEVEL_SHIFT = 0,
    UCOL_PSK_LEVEL_MASK = 0x7,

    /**
     * Number of bytes of a primary or quaternary already written. Can only
     * be 0 or 1, since we get up to two bytes from primary or quaternary.
     * This field is also used to denote that the French secondary level
     * is finished.
     */
    UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT = 3,
    UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK = 0x1,

    /** Was the last value shifted? Boolean, 0 or 1. */
    UCOL_PSK_WAS_SHIFTED_SHIFT = 4,
    UCOL_PSK_WAS_SHIFTED_MASK = 0x1,

    /**
     * How many French bytes have already been written (up to 4 bytes).
     * When we do French we need to reverse secondary values. However,
     * continuations need to stay the same. So if you had abc1c2c3de,
     * you need to have edc1c2c3ba.
     */
    UCOL_PSK_USED_FRENCH_SHIFT = 5,
    UCOL_PSK_USED_FRENCH_MASK = 0x3,

    UCOL_PSK_BOCSU_BYTES_SHIFT = 7,
    UCOL_PSK_BOCSU_BYTES_MASK = 0x3,

    UCOL_PSK_CONSUMED_CES_SHIFT = 9,
    UCOL_PSK_CONSUMED_CES_MASK = 0x7FFFF
};
5470
// Macro calculating the number of expansion CEs available in s, i.e. the
// difference between its CEpos and toReturn members. The replacement list is
// fully parenthesized so the macro can be used safely inside larger
// expressions (e.g. multiplied, subtracted from, or compared).
#define uprv_numAvailableExpCEs(s) ((s).CEpos - (s).toReturn)
5473
5474
5475 /** main sortkey part procedure. On the first call,
5476 * you should pass in a collator, an iterator, empty state
5477 * state[0] == state[1] == 0, a buffer to hold results
5478 * number of bytes you need and an error code pointer.
5479 * Make sure your buffer is big enough to hold the wanted
5480 * number of sortkey bytes. I don't check.
5481 * The only meaningful status you can get back is
5482 * U_BUFFER_OVERFLOW_ERROR, which basically means that you
5483 * have been dealt a raw deal and that you probably won't
5484 * be able to use partial sortkey generation for this
5485 * particular combination of string and collator. This
5486 * is highly unlikely, but you should still check the error code.
5487 * Any other status means that you're not in a sane situation
5488 * anymore. After the first call, preserve state values and
5489 * use them on subsequent calls to obtain more bytes of a sortkey.
5490 * Use until the number of bytes written is smaller than the requested
5491 * number of bytes. Generated sortkey is not compatible with the
5492 * one generated by ucol_getSortKey, as we don't do any compression.
5493 * However, levels are still terminated by a 1 (one) and the sortkey
5494 * is terminated by a 0 (zero). Identical level is the same as in the
5495 * regular sortkey - internal bocu-1 implementation is used.
5496 * For curious, although you cannot do much about this, here is
5497 * the structure of state words.
5498 * state[0] - iterator state. Depends on the iterator implementation,
5499 * but allows the iterator to continue where it stopped in
5500 * the last iteration.
5501 * state[1] - collation processing state. Here is the distribution
5502 * of the bits:
5503 * 0, 1, 2 - level of the sortkey - primary, secondary, case, tertiary
5504 * quaternary, quin (we don't use this one), identical and
5505 * null (producing only zeroes - first one to terminate the
5506 * sortkey and subsequent to fill the buffer).
5507 * 3 - byte count. Number of bytes written on the primary level.
5508 * 4 - was shifted. Whether the previous iteration finished in the
5509 * shifted state.
5510 * 5, 6 - French continuation bytes written. See the comment in the enum
5511 * 7,8 - Bocsu bytes used. Number of bytes from a bocu sequence on
5512 * the identical level.
5513 * 9..31 - CEs consumed. Number of getCE or next32 operations performed
5514 * since thes last successful update of the iterator state.
5515 */
5516 U_CAPI int32_t U_EXPORT2
ucol_nextSortKeyPart(const UCollator * coll,UCharIterator * iter,uint32_t state[2],uint8_t * dest,int32_t count,UErrorCode * status)5517 ucol_nextSortKeyPart(const UCollator *coll,
5518 UCharIterator *iter,
5519 uint32_t state[2],
5520 uint8_t *dest, int32_t count,
5521 UErrorCode *status) {
5522 /* error checking */
5523 if(status==NULL || U_FAILURE(*status)) {
5524 return 0;
5525 }
5526 UTRACE_ENTRY(UTRACE_UCOL_NEXTSORTKEYPART);
5527 if( coll==NULL || iter==NULL ||
5528 state==NULL ||
5529 count<0 || (count>0 && dest==NULL)
5530 ) {
5531 *status=U_ILLEGAL_ARGUMENT_ERROR;
5532 UTRACE_EXIT_STATUS(status);
5533 return 0;
5534 }
5535
5536 UTRACE_DATA6(UTRACE_VERBOSE, "coll=%p, iter=%p, state=%d %d, dest=%p, count=%d",
5537 coll, iter, state[0], state[1], dest, count);
5538
5539 if(count==0) {
5540 /* nothing to do */
5541 UTRACE_EXIT_VALUE(0);
5542 return 0;
5543 }
5544 /** Setting up situation according to the state we got from the previous iteration */
5545 // The state of the iterator from the previous invocation
5546 uint32_t iterState = state[0];
5547 // Has the last iteration ended in the shifted state
5548 UBool wasShifted = ((state[1] >> UCOL_PSK_WAS_SHIFTED_SHIFT) & UCOL_PSK_WAS_SHIFTED_MASK)?TRUE:FALSE;
5549 // What is the current level of the sortkey?
5550 int32_t level= (state[1] >> UCOL_PSK_LEVEL_SHIFT) & UCOL_PSK_LEVEL_MASK;
5551 // Have we written only one byte from a two byte primary in the previous iteration?
5552 // Also on secondary level - have we finished with the French secondary?
5553 int32_t byteCountOrFrenchDone = (state[1] >> UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT) & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK;
5554 // number of bytes in the continuation buffer for French
5555 int32_t usedFrench = (state[1] >> UCOL_PSK_USED_FRENCH_SHIFT) & UCOL_PSK_USED_FRENCH_MASK;
5556 // Number of bytes already written from a bocsu sequence. Since
5557 // the longes bocsu sequence is 4 long, this can be up to 3.
5558 int32_t bocsuBytesUsed = (state[1] >> UCOL_PSK_BOCSU_BYTES_SHIFT) & UCOL_PSK_BOCSU_BYTES_MASK;
5559 // Number of elements that need to be consumed in this iteration because
5560 // the iterator returned UITER_NO_STATE at the end of the last iteration,
5561 // so we had to save the last valid state.
5562 int32_t cces = (state[1] >> UCOL_PSK_CONSUMED_CES_SHIFT) & UCOL_PSK_CONSUMED_CES_MASK;
5563
5564 /** values that depend on the collator attributes */
5565 // strength of the collator.
5566 int32_t strength = ucol_getAttribute(coll, UCOL_STRENGTH, status);
5567 // maximal level of the partial sortkey. Need to take whether case level is done
5568 int32_t maxLevel = 0;
5569 if(strength < UCOL_TERTIARY) {
5570 if(ucol_getAttribute(coll, UCOL_CASE_LEVEL, status) == UCOL_ON) {
5571 maxLevel = UCOL_PSK_CASE;
5572 } else {
5573 maxLevel = strength;
5574 }
5575 } else {
5576 if(strength == UCOL_TERTIARY) {
5577 maxLevel = UCOL_PSK_TERTIARY;
5578 } else if(strength == UCOL_QUATERNARY) {
5579 maxLevel = UCOL_PSK_QUATERNARY;
5580 } else { // identical
5581 maxLevel = UCOL_IDENTICAL;
5582 }
5583 }
5584 // value for the quaternary level if Hiragana is encountered. Used for JIS X 4061 collation
5585 uint8_t UCOL_HIRAGANA_QUAD =
5586 (ucol_getAttribute(coll, UCOL_HIRAGANA_QUATERNARY_MODE, status) == UCOL_ON)?0xFE:0xFF;
5587 // Boundary value that decides whether a CE is shifted or not
5588 uint32_t LVT = (coll->alternateHandling == UCOL_SHIFTED)?(coll->variableTopValue<<16):0;
5589 // Are we doing French collation?
5590 UBool doingFrench = (ucol_getAttribute(coll, UCOL_FRENCH_COLLATION, status) == UCOL_ON);
5591
5592 /** initializing the collation state */
5593 UBool notIsContinuation = FALSE;
5594 uint32_t CE = UCOL_NO_MORE_CES;
5595
5596 collIterate s;
5597 IInit_collIterate(coll, NULL, -1, &s);
5598 s.iterator = iter;
5599 s.flags |= UCOL_USE_ITERATOR;
5600 // This variable tells us whether we have produced some other levels in this iteration
5601 // before we moved to the identical level. In that case, we need to switch the
5602 // type of the iterator.
5603 UBool doingIdenticalFromStart = FALSE;
5604 // Normalizing iterator
5605 // The division for the array length may truncate the array size to
5606 // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
5607 // for all platforms anyway.
5608 UAlignedMemory stackNormIter[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
5609 UNormIterator *normIter = NULL;
5610 // If the normalization is turned on for the collator and we are below identical level
5611 // we will use a FCD normalizing iterator
5612 if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON && level < UCOL_PSK_IDENTICAL) {
5613 normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status);
5614 s.iterator = unorm_setIter(normIter, iter, UNORM_FCD, status);
5615 s.flags &= ~UCOL_ITER_NORM;
5616 if(U_FAILURE(*status)) {
5617 UTRACE_EXIT_STATUS(*status);
5618 return 0;
5619 }
5620 } else if(level == UCOL_PSK_IDENTICAL) {
5621 // for identical level, we need a NFD iterator. We need to instantiate it here, since we
5622 // will be updating the state - and this cannot be done on an ordinary iterator.
5623 normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status);
5624 s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status);
5625 s.flags &= ~UCOL_ITER_NORM;
5626 if(U_FAILURE(*status)) {
5627 UTRACE_EXIT_STATUS(*status);
5628 return 0;
5629 }
5630 doingIdenticalFromStart = TRUE;
5631 }
5632
5633 // This is the tentative new state of the iterator. The problem
5634 // is that the iterator might return an undefined state, in
5635 // which case we should save the last valid state and increase
5636 // the iterator skip value.
5637 uint32_t newState = 0;
5638
5639 // First, we set the iterator to the last valid position
5640 // from the last iteration. This was saved in state[0].
5641 if(iterState == 0) {
5642 /* initial state */
5643 if(level == UCOL_PSK_SECONDARY && doingFrench && !byteCountOrFrenchDone) {
5644 s.iterator->move(s.iterator, 0, UITER_LIMIT);
5645 } else {
5646 s.iterator->move(s.iterator, 0, UITER_START);
5647 }
5648 } else {
5649 /* reset to previous state */
5650 s.iterator->setState(s.iterator, iterState, status);
5651 if(U_FAILURE(*status)) {
5652 UTRACE_EXIT_STATUS(*status);
5653 return 0;
5654 }
5655 }
5656
5657
5658
5659 // This variable tells us whether we can attempt to update the state
5660 // of iterator. Situations where we don't want to update iterator state
5661 // are the existence of expansion CEs that are not yet processed, and
5662 // finishing the case level without enough space in the buffer to insert
5663 // a level terminator.
5664 UBool canUpdateState = TRUE;
5665
5666 // Consume all the CEs that were consumed at the end of the previous
5667 // iteration without updating the iterator state. On identical level,
5668 // consume the code points.
5669 int32_t counter = cces;
5670 if(level < UCOL_PSK_IDENTICAL) {
5671 while(counter-->0) {
5672 // If we're doing French and we are on the secondary level,
5673 // we go backwards.
5674 if(level == UCOL_PSK_SECONDARY && doingFrench) {
5675 CE = ucol_IGetPrevCE(coll, &s, status);
5676 } else {
5677 CE = ucol_IGetNextCE(coll, &s, status);
5678 }
5679 if(CE==UCOL_NO_MORE_CES) {
5680 /* should not happen */
5681 *status=U_INTERNAL_PROGRAM_ERROR;
5682 UTRACE_EXIT_STATUS(*status);
5683 return 0;
5684 }
5685 if(uprv_numAvailableExpCEs(s)) {
5686 canUpdateState = FALSE;
5687 }
5688 }
5689 } else {
5690 while(counter-->0) {
5691 uiter_next32(s.iterator);
5692 }
5693 }
5694
5695 // French secondary needs to know whether the iterator state of zero came from previous level OR
5696 // from a new invocation...
5697 UBool wasDoingPrimary = FALSE;
5698 // destination buffer byte counter. When this guy
5699 // gets to count, we're done with the iteration
5700 int32_t i = 0;
5701 // used to count the zero bytes written after we
5702 // have finished with the sort key
5703 int32_t j = 0;
5704
5705
5706 // Hm.... I think we're ready to plunge in. Basic story is as following:
5707 // we have a fall through case based on level. This is used for initial
5708 // positioning on iteration start. Every level processor contains a
5709 // for(;;) which will be broken when we exhaust all the CEs. Other
5710 // way to exit is a goto saveState, which happens when we have filled
5711 // out our buffer.
5712 switch(level) {
5713 case UCOL_PSK_PRIMARY:
5714 wasDoingPrimary = TRUE;
5715 for(;;) {
5716 if(i==count) {
5717 goto saveState;
5718 }
5719 // We should save the state only if we
5720 // are sure that we are done with the
5721 // previous iterator state
5722 if(canUpdateState && byteCountOrFrenchDone == 0) {
5723 newState = s.iterator->getState(s.iterator);
5724 if(newState != UITER_NO_STATE) {
5725 iterState = newState;
5726 cces = 0;
5727 }
5728 }
5729 CE = ucol_IGetNextCE(coll, &s, status);
5730 cces++;
5731 if(CE==UCOL_NO_MORE_CES) {
5732 // Add the level separator
5733 terminatePSKLevel(level, maxLevel, i, dest);
5734 byteCountOrFrenchDone=0;
5735 // Restart the iteration an move to the
5736 // second level
5737 s.iterator->move(s.iterator, 0, UITER_START);
5738 cces = 0;
5739 level = UCOL_PSK_SECONDARY;
5740 break;
5741 }
5742 if(!isShiftedCE(CE, LVT, &wasShifted)) {
5743 CE >>= UCOL_PRIMARYORDERSHIFT; /* get primary */
5744 if(CE != 0) {
5745 if(byteCountOrFrenchDone == 0) {
5746 // get the second byte of primary
5747 dest[i++]=(uint8_t)(CE >> 8);
5748 } else {
5749 byteCountOrFrenchDone = 0;
5750 }
5751 if((CE &=0xff)!=0) {
5752 if(i==count) {
5753 /* overflow */
5754 byteCountOrFrenchDone = 1;
5755 cces--;
5756 goto saveState;
5757 }
5758 dest[i++]=(uint8_t)CE;
5759 }
5760 }
5761 }
5762 if(uprv_numAvailableExpCEs(s)) {
5763 canUpdateState = FALSE;
5764 } else {
5765 canUpdateState = TRUE;
5766 }
5767 }
5768 /* fall through to next level */
5769 case UCOL_PSK_SECONDARY:
5770 if(strength >= UCOL_SECONDARY) {
5771 if(!doingFrench) {
5772 for(;;) {
5773 if(i == count) {
5774 goto saveState;
5775 }
5776 // We should save the state only if we
5777 // are sure that we are done with the
5778 // previous iterator state
5779 if(canUpdateState) {
5780 newState = s.iterator->getState(s.iterator);
5781 if(newState != UITER_NO_STATE) {
5782 iterState = newState;
5783 cces = 0;
5784 }
5785 }
5786 CE = ucol_IGetNextCE(coll, &s, status);
5787 cces++;
5788 if(CE==UCOL_NO_MORE_CES) {
5789 // Add the level separator
5790 terminatePSKLevel(level, maxLevel, i, dest);
5791 byteCountOrFrenchDone = 0;
5792 // Restart the iteration an move to the
5793 // second level
5794 s.iterator->move(s.iterator, 0, UITER_START);
5795 cces = 0;
5796 level = UCOL_PSK_CASE;
5797 break;
5798 }
5799 if(!isShiftedCE(CE, LVT, &wasShifted)) {
5800 CE >>= 8; /* get secondary */
5801 if(CE != 0) {
5802 dest[i++]=(uint8_t)CE;
5803 }
5804 }
5805 if(uprv_numAvailableExpCEs(s)) {
5806 canUpdateState = FALSE;
5807 } else {
5808 canUpdateState = TRUE;
5809 }
5810 }
5811 } else { // French secondary processing
5812 uint8_t frenchBuff[UCOL_MAX_BUFFER];
5813 int32_t frenchIndex = 0;
5814 // Here we are going backwards.
5815 // If the iterator is at the beggining, it should be
5816 // moved to end.
5817 if(wasDoingPrimary) {
5818 s.iterator->move(s.iterator, 0, UITER_LIMIT);
5819 cces = 0;
5820 }
5821 for(;;) {
5822 if(i == count) {
5823 goto saveState;
5824 }
5825 if(canUpdateState) {
5826 newState = s.iterator->getState(s.iterator);
5827 if(newState != UITER_NO_STATE) {
5828 iterState = newState;
5829 cces = 0;
5830 }
5831 }
5832 CE = ucol_IGetPrevCE(coll, &s, status);
5833 cces++;
5834 if(CE==UCOL_NO_MORE_CES) {
5835 // Add the level separator
5836 terminatePSKLevel(level, maxLevel, i, dest);
5837 byteCountOrFrenchDone = 0;
5838 // Restart the iteration an move to the next level
5839 s.iterator->move(s.iterator, 0, UITER_START);
5840 level = UCOL_PSK_CASE;
5841 break;
5842 }
5843 if(isContinuation(CE)) { // if it's a continuation, we want to save it and
5844 // reverse when we get a first non-continuation CE.
5845 CE >>= 8;
5846 frenchBuff[frenchIndex++] = (uint8_t)CE;
5847 } else if(!isShiftedCE(CE, LVT, &wasShifted)) {
5848 CE >>= 8; /* get secondary */
5849 if(!frenchIndex) {
5850 if(CE != 0) {
5851 dest[i++]=(uint8_t)CE;
5852 }
5853 } else {
5854 frenchBuff[frenchIndex++] = (uint8_t)CE;
5855 frenchIndex -= usedFrench;
5856 usedFrench = 0;
5857 while(i < count && frenchIndex) {
5858 dest[i++] = frenchBuff[--frenchIndex];
5859 usedFrench++;
5860 }
5861 }
5862 }
5863 if(uprv_numAvailableExpCEs(s)) {
5864 canUpdateState = FALSE;
5865 } else {
5866 canUpdateState = TRUE;
5867 }
5868 }
5869 }
5870 } else {
5871 level = UCOL_PSK_CASE;
5872 }
5873 /* fall through to next level */
5874 case UCOL_PSK_CASE:
5875 if(ucol_getAttribute(coll, UCOL_CASE_LEVEL, status) == UCOL_ON) {
5876 uint32_t caseShift = UCOL_CASE_SHIFT_START;
5877 uint8_t caseByte = UCOL_CASE_BYTE_START;
5878 uint8_t caseBits = 0;
5879
5880 for(;;) {
5881 if(i == count) {
5882 goto saveState;
5883 }
5884 // We should save the state only if we
5885 // are sure that we are done with the
5886 // previous iterator state
5887 if(canUpdateState) {
5888 newState = s.iterator->getState(s.iterator);
5889 if(newState != UITER_NO_STATE) {
5890 iterState = newState;
5891 cces = 0;
5892 }
5893 }
5894 CE = ucol_IGetNextCE(coll, &s, status);
5895 cces++;
5896 if(CE==UCOL_NO_MORE_CES) {
5897 // On the case level we might have an unfinished
5898 // case byte. Add one if it's started.
5899 if(caseShift != UCOL_CASE_SHIFT_START) {
5900 dest[i++] = caseByte;
5901 }
5902 cces = 0;
5903 // We have finished processing CEs on this level.
5904 // However, we don't know if we have enough space
5905 // to add a case level terminator.
5906 if(i < count) {
5907 // Add the level separator
5908 terminatePSKLevel(level, maxLevel, i, dest);
5909 // Restart the iteration and move to the
5910 // next level
5911 s.iterator->move(s.iterator, 0, UITER_START);
5912 level = UCOL_PSK_TERTIARY;
5913 } else {
5914 canUpdateState = FALSE;
5915 }
5916 break;
5917 }
5918
5919 if(!isShiftedCE(CE, LVT, &wasShifted)) {
5920 if(!isContinuation(CE) && ((CE & UCOL_PRIMARYMASK) != 0 || strength > UCOL_PRIMARY)) {
5921 // do the case level if we need to do it. We don't want to calculate
5922 // case level for primary ignorables if we have only primary strength and case level
5923 // otherwise we would break well formedness of CEs
5924 CE = (uint8_t)(CE & UCOL_BYTE_SIZE_MASK);
5925 caseBits = (uint8_t)(CE & 0xC0);
5926 // this copies the case level logic from the
5927 // sort key generation code
5928 if(CE != 0) {
5929 if(coll->caseFirst == UCOL_UPPER_FIRST) {
5930 if((caseBits & 0xC0) == 0) {
5931 caseByte |= 1 << (--caseShift);
5932 } else {
5933 caseByte |= 0 << (--caseShift);
5934 /* second bit */
5935 if(caseShift == 0) {
5936 dest[i++] = caseByte;
5937 caseShift = UCOL_CASE_SHIFT_START;
5938 caseByte = UCOL_CASE_BYTE_START;
5939 }
5940 caseByte |= ((caseBits>>6)&1) << (--caseShift);
5941 }
5942 } else {
5943 if((caseBits & 0xC0) == 0) {
5944 caseByte |= 0 << (--caseShift);
5945 } else {
5946 caseByte |= 1 << (--caseShift);
5947 /* second bit */
5948 if(caseShift == 0) {
5949 dest[i++] = caseByte;
5950 caseShift = UCOL_CASE_SHIFT_START;
5951 caseByte = UCOL_CASE_BYTE_START;
5952 }
5953 caseByte |= ((caseBits>>7)&1) << (--caseShift);
5954 }
5955 }
5956 }
5957
5958 }
5959 }
5960 // Not sure this is correct for the case level - revisit
5961 if(uprv_numAvailableExpCEs(s)) {
5962 canUpdateState = FALSE;
5963 } else {
5964 canUpdateState = TRUE;
5965 }
5966 }
5967 } else {
5968 level = UCOL_PSK_TERTIARY;
5969 }
5970 /* fall through to next level */
5971 case UCOL_PSK_TERTIARY:
5972 if(strength >= UCOL_TERTIARY) {
5973 for(;;) {
5974 if(i == count) {
5975 goto saveState;
5976 }
5977 // We should save the state only if we
5978 // are sure that we are done with the
5979 // previous iterator state
5980 if(canUpdateState) {
5981 newState = s.iterator->getState(s.iterator);
5982 if(newState != UITER_NO_STATE) {
5983 iterState = newState;
5984 cces = 0;
5985 }
5986 }
5987 CE = ucol_IGetNextCE(coll, &s, status);
5988 cces++;
5989 if(CE==UCOL_NO_MORE_CES) {
5990 // Add the level separator
5991 terminatePSKLevel(level, maxLevel, i, dest);
5992 byteCountOrFrenchDone = 0;
5993 // Restart the iteration an move to the
5994 // second level
5995 s.iterator->move(s.iterator, 0, UITER_START);
5996 cces = 0;
5997 level = UCOL_PSK_QUATERNARY;
5998 break;
5999 }
6000 if(!isShiftedCE(CE, LVT, &wasShifted)) {
6001 notIsContinuation = !isContinuation(CE);
6002
6003 if(notIsContinuation) {
6004 CE = (uint8_t)(CE & UCOL_BYTE_SIZE_MASK);
6005 CE ^= coll->caseSwitch;
6006 CE &= coll->tertiaryMask;
6007 } else {
6008 CE = (uint8_t)((CE & UCOL_REMOVE_CONTINUATION));
6009 }
6010
6011 if(CE != 0) {
6012 dest[i++]=(uint8_t)CE;
6013 }
6014 }
6015 if(uprv_numAvailableExpCEs(s)) {
6016 canUpdateState = FALSE;
6017 } else {
6018 canUpdateState = TRUE;
6019 }
6020 }
6021 } else {
6022 // if we're not doing tertiary
6023 // skip to the end
6024 level = UCOL_PSK_NULL;
6025 }
6026 /* fall through to next level */
6027 case UCOL_PSK_QUATERNARY:
6028 if(strength >= UCOL_QUATERNARY) {
6029 for(;;) {
6030 if(i == count) {
6031 goto saveState;
6032 }
6033 // We should save the state only if we
6034 // are sure that we are done with the
6035 // previous iterator state
6036 if(canUpdateState) {
6037 newState = s.iterator->getState(s.iterator);
6038 if(newState != UITER_NO_STATE) {
6039 iterState = newState;
6040 cces = 0;
6041 }
6042 }
6043 CE = ucol_IGetNextCE(coll, &s, status);
6044 cces++;
6045 if(CE==UCOL_NO_MORE_CES) {
6046 // Add the level separator
6047 terminatePSKLevel(level, maxLevel, i, dest);
6048 //dest[i++] = UCOL_LEVELTERMINATOR;
6049 byteCountOrFrenchDone = 0;
6050 // Restart the iteration an move to the
6051 // second level
6052 s.iterator->move(s.iterator, 0, UITER_START);
6053 cces = 0;
6054 level = UCOL_PSK_QUIN;
6055 break;
6056 }
6057 if(CE==0)
6058 continue;
6059 if(isShiftedCE(CE, LVT, &wasShifted)) {
6060 CE >>= 16; /* get primary */
6061 if(CE != 0) {
6062 if(byteCountOrFrenchDone == 0) {
6063 dest[i++]=(uint8_t)(CE >> 8);
6064 } else {
6065 byteCountOrFrenchDone = 0;
6066 }
6067 if((CE &=0xff)!=0) {
6068 if(i==count) {
6069 /* overflow */
6070 byteCountOrFrenchDone = 1;
6071 goto saveState;
6072 }
6073 dest[i++]=(uint8_t)CE;
6074 }
6075 }
6076 } else {
6077 notIsContinuation = !isContinuation(CE);
6078 if(notIsContinuation) {
6079 if(s.flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we need to note it
6080 dest[i++] = UCOL_HIRAGANA_QUAD;
6081 } else {
6082 dest[i++] = 0xFF;
6083 }
6084 }
6085 }
6086 if(uprv_numAvailableExpCEs(s)) {
6087 canUpdateState = FALSE;
6088 } else {
6089 canUpdateState = TRUE;
6090 }
6091 }
6092 } else {
6093 // if we're not doing quaternary
6094 // skip to the end
6095 level = UCOL_PSK_NULL;
6096 }
6097 /* fall through to next level */
6098 case UCOL_PSK_QUIN:
6099 level = UCOL_PSK_IDENTICAL;
6100 /* fall through to next level */
6101 case UCOL_PSK_IDENTICAL:
6102 if(strength >= UCOL_IDENTICAL) {
6103 UChar32 first, second;
6104 int32_t bocsuBytesWritten = 0;
6105 // We always need to do identical on
6106 // the NFD form of the string.
6107 if(normIter == NULL) {
6108 // we arrived from the level below and
6109 // normalization was not turned on.
6110 // therefore, we need to make a fresh NFD iterator
6111 normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status);
6112 s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status);
6113 } else if(!doingIdenticalFromStart) {
6114 // there is an iterator, but we did some other levels.
6115 // therefore, we have a FCD iterator - need to make
6116 // a NFD one.
6117 // normIter being at the beginning does not guarantee
6118 // that the underlying iterator is at the beginning
6119 iter->move(iter, 0, UITER_START);
6120 s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status);
6121 }
6122 // At this point we have a NFD iterator that is positioned
6123 // in the right place
6124 if(U_FAILURE(*status)) {
6125 UTRACE_EXIT_STATUS(*status);
6126 return 0;
6127 }
6128 first = uiter_previous32(s.iterator);
6129 // maybe we're at the start of the string
6130 if(first == U_SENTINEL) {
6131 first = 0;
6132 } else {
6133 uiter_next32(s.iterator);
6134 }
6135
6136 j = 0;
6137 for(;;) {
6138 if(i == count) {
6139 if(j+1 < bocsuBytesWritten) {
6140 bocsuBytesUsed = j+1;
6141 }
6142 goto saveState;
6143 }
6144
6145 // On identical level, we will always save
6146 // the state if we reach this point, since
6147 // we don't depend on getNextCE for content
6148 // all the content is in our buffer and we
6149 // already either stored the full buffer OR
6150 // otherwise we won't arrive here.
6151 newState = s.iterator->getState(s.iterator);
6152 if(newState != UITER_NO_STATE) {
6153 iterState = newState;
6154 cces = 0;
6155 }
6156
6157 uint8_t buff[4];
6158 second = uiter_next32(s.iterator);
6159 cces++;
6160
6161 // end condition for identical level
6162 if(second == U_SENTINEL) {
6163 terminatePSKLevel(level, maxLevel, i, dest);
6164 level = UCOL_PSK_NULL;
6165 break;
6166 }
6167 bocsuBytesWritten = u_writeIdenticalLevelRunTwoChars(first, second, buff);
6168 first = second;
6169
6170 j = 0;
6171 if(bocsuBytesUsed != 0) {
6172 while(bocsuBytesUsed-->0) {
6173 j++;
6174 }
6175 }
6176
6177 while(i < count && j < bocsuBytesWritten) {
6178 dest[i++] = buff[j++];
6179 }
6180 }
6181
6182 } else {
6183 level = UCOL_PSK_NULL;
6184 }
6185 /* fall through to next level */
6186 case UCOL_PSK_NULL:
6187 j = i;
6188 while(j<count) {
6189 dest[j++]=0;
6190 }
6191 break;
6192 default:
6193 *status = U_INTERNAL_PROGRAM_ERROR;
6194 UTRACE_EXIT_STATUS(*status);
6195 return 0;
6196 }
6197
6198 saveState:
6199 // Now we need to return stuff. First we want to see whether we have
6200 // done everything for the current state of iterator.
6201 if(byteCountOrFrenchDone
6202 || canUpdateState == FALSE
6203 || (newState = s.iterator->getState(s.iterator)) == UITER_NO_STATE) {
6204 // Any of above mean that the previous transaction
6205 // wasn't finished and that we should store the
6206 // previous iterator state.
6207 state[0] = iterState;
6208 } else {
6209 // The transaction is complete. We will continue in the next iteration.
6210 state[0] = s.iterator->getState(s.iterator);
6211 cces = 0;
6212 }
6213 // Store the number of bocsu bytes written.
6214 if((bocsuBytesUsed & UCOL_PSK_BOCSU_BYTES_MASK) != bocsuBytesUsed) {
6215 *status = U_INDEX_OUTOFBOUNDS_ERROR;
6216 }
6217 state[1] = (bocsuBytesUsed & UCOL_PSK_BOCSU_BYTES_MASK) << UCOL_PSK_BOCSU_BYTES_SHIFT;
6218
6219 // Next we put in the level of comparison
6220 state[1] |= ((level & UCOL_PSK_LEVEL_MASK) << UCOL_PSK_LEVEL_SHIFT);
6221
6222 // If we are doing French, we need to store whether we have just finished the French level
6223 if(level == UCOL_PSK_SECONDARY && doingFrench) {
6224 state[1] |= (((state[0] == 0) & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK) << UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT);
6225 } else {
6226 state[1] |= ((byteCountOrFrenchDone & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK) << UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT);
6227 }
6228
6229 // Was the latest CE shifted
6230 if(wasShifted) {
6231 state[1] |= 1 << UCOL_PSK_WAS_SHIFTED_SHIFT;
6232 }
6233 // Check for cces overflow
6234 if((cces & UCOL_PSK_CONSUMED_CES_MASK) != cces) {
6235 *status = U_INDEX_OUTOFBOUNDS_ERROR;
6236 }
6237 // Store cces
6238 state[1] |= ((cces & UCOL_PSK_CONSUMED_CES_MASK) << UCOL_PSK_CONSUMED_CES_SHIFT);
6239
6240 // Check for French overflow
6241 if((usedFrench & UCOL_PSK_USED_FRENCH_MASK) != usedFrench) {
6242 *status = U_INDEX_OUTOFBOUNDS_ERROR;
6243 }
6244 // Store number of bytes written in the French secondary continuation sequence
6245 state[1] |= ((usedFrench & UCOL_PSK_USED_FRENCH_MASK) << UCOL_PSK_USED_FRENCH_SHIFT);
6246
6247
6248 // If we have used normalizing iterator, get rid of it
6249 if(normIter != NULL) {
6250 unorm_closeIter(normIter);
6251 }
6252
6253 // Return number of meaningful sortkey bytes.
6254 UTRACE_DATA4(UTRACE_VERBOSE, "dest = %vb, state=%d %d",
6255 dest,i, state[0], state[1]);
6256 UTRACE_EXIT_VALUE(i);
6257 return i;
6258 }
6259
6260 /**
6261 * Produce a bound for a given sortkey and a number of levels.
6262 */
6263 U_CAPI int32_t U_EXPORT2
ucol_getBound(const uint8_t * source,int32_t sourceLength,UColBoundMode boundType,uint32_t noOfLevels,uint8_t * result,int32_t resultLength,UErrorCode * status)6264 ucol_getBound(const uint8_t *source,
6265 int32_t sourceLength,
6266 UColBoundMode boundType,
6267 uint32_t noOfLevels,
6268 uint8_t *result,
6269 int32_t resultLength,
6270 UErrorCode *status) {
6271 // consistency checks
6272 if(status == NULL || U_FAILURE(*status)) {
6273 return 0;
6274 }
6275 if(source == NULL) {
6276 *status = U_ILLEGAL_ARGUMENT_ERROR;
6277 return 0;
6278 }
6279
6280 int32_t sourceIndex = 0;
6281 // Scan the string until we skip enough of the key OR reach the end of the key
6282 do {
6283 sourceIndex++;
6284 if(source[sourceIndex] == UCOL_LEVELTERMINATOR) {
6285 noOfLevels--;
6286 }
6287 } while (noOfLevels > 0
6288 && (source[sourceIndex] != 0 || sourceIndex < sourceLength));
6289
6290 if((source[sourceIndex] == 0 || sourceIndex == sourceLength)
6291 && noOfLevels > 0) {
6292 *status = U_SORT_KEY_TOO_SHORT_WARNING;
6293 }
6294
6295
6296 // READ ME: this code assumes that the values for boundType
6297 // enum will not changes. They are set so that the enum value
6298 // corresponds to the number of extra bytes each bound type
6299 // needs.
6300 if(result != NULL && resultLength >= sourceIndex+boundType) {
6301 uprv_memcpy(result, source, sourceIndex);
6302 switch(boundType) {
6303 // Lower bound just gets terminated. No extra bytes
6304 case UCOL_BOUND_LOWER: // = 0
6305 break;
6306 // Upper bound needs one extra byte
6307 case UCOL_BOUND_UPPER: // = 1
6308 result[sourceIndex++] = 2;
6309 break;
6310 // Upper long bound needs two extra bytes
6311 case UCOL_BOUND_UPPER_LONG: // = 2
6312 result[sourceIndex++] = 0xFF;
6313 result[sourceIndex++] = 0xFF;
6314 break;
6315 default:
6316 *status = U_ILLEGAL_ARGUMENT_ERROR;
6317 return 0;
6318 }
6319 result[sourceIndex++] = 0;
6320
6321 return sourceIndex;
6322 } else {
6323 return sourceIndex+boundType+1;
6324 }
6325 }
6326
6327 /****************************************************************************/
6328 /* Following are the functions that deal with the properties of a collator */
6329 /* there are new APIs and some compatibility APIs */
6330 /****************************************************************************/
6331
6332 static inline void
ucol_addLatinOneEntry(UCollator * coll,UChar ch,uint32_t CE,int32_t * primShift,int32_t * secShift,int32_t * terShift)6333 ucol_addLatinOneEntry(UCollator *coll, UChar ch, uint32_t CE,
6334 int32_t *primShift, int32_t *secShift, int32_t *terShift) {
6335 uint8_t primary1 = 0, primary2 = 0, secondary = 0, tertiary = 0;
6336 UBool reverseSecondary = FALSE;
6337 if(!isContinuation(CE)) {
6338 tertiary = (uint8_t)((CE & coll->tertiaryMask));
6339 tertiary ^= coll->caseSwitch;
6340 reverseSecondary = TRUE;
6341 } else {
6342 tertiary = (uint8_t)((CE & UCOL_REMOVE_CONTINUATION));
6343 tertiary &= UCOL_REMOVE_CASE;
6344 reverseSecondary = FALSE;
6345 }
6346
6347 secondary = (uint8_t)((CE >>= 8) & UCOL_BYTE_SIZE_MASK);
6348 primary2 = (uint8_t)((CE >>= 8) & UCOL_BYTE_SIZE_MASK);
6349 primary1 = (uint8_t)(CE >> 8);
6350
6351 if(primary1 != 0) {
6352 coll->latinOneCEs[ch] |= (primary1 << *primShift);
6353 *primShift -= 8;
6354 }
6355 if(primary2 != 0) {
6356 if(*primShift < 0) {
6357 coll->latinOneCEs[ch] = UCOL_BAIL_OUT_CE;
6358 coll->latinOneCEs[coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
6359 coll->latinOneCEs[2*coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
6360 return;
6361 }
6362 coll->latinOneCEs[ch] |= (primary2 << *primShift);
6363 *primShift -= 8;
6364 }
6365 if(secondary != 0) {
6366 if(reverseSecondary && coll->frenchCollation == UCOL_ON) { // reverse secondary
6367 coll->latinOneCEs[coll->latinOneTableLen+ch] >>= 8; // make space for secondary
6368 coll->latinOneCEs[coll->latinOneTableLen+ch] |= (secondary << 24);
6369 } else { // normal case
6370 coll->latinOneCEs[coll->latinOneTableLen+ch] |= (secondary << *secShift);
6371 }
6372 *secShift -= 8;
6373 }
6374 if(tertiary != 0) {
6375 coll->latinOneCEs[2*coll->latinOneTableLen+ch] |= (tertiary << *terShift);
6376 *terShift -= 8;
6377 }
6378 }
6379
6380 static inline UBool
ucol_resizeLatinOneTable(UCollator * coll,int32_t size,UErrorCode * status)6381 ucol_resizeLatinOneTable(UCollator *coll, int32_t size, UErrorCode *status) {
6382 uint32_t *newTable = (uint32_t *)uprv_malloc(size*sizeof(uint32_t)*3);
6383 if(newTable == NULL) {
6384 *status = U_MEMORY_ALLOCATION_ERROR;
6385 coll->latinOneFailed = TRUE;
6386 return FALSE;
6387 }
6388 int32_t sizeToCopy = ((size<coll->latinOneTableLen)?size:coll->latinOneTableLen)*sizeof(uint32_t);
6389 uprv_memset(newTable, 0, size*sizeof(uint32_t)*3);
6390 uprv_memcpy(newTable, coll->latinOneCEs, sizeToCopy);
6391 uprv_memcpy(newTable+size, coll->latinOneCEs+coll->latinOneTableLen, sizeToCopy);
6392 uprv_memcpy(newTable+2*size, coll->latinOneCEs+2*coll->latinOneTableLen, sizeToCopy);
6393 coll->latinOneTableLen = size;
6394 uprv_free(coll->latinOneCEs);
6395 coll->latinOneCEs = newTable;
6396 return TRUE;
6397 }
6398
6399 static UBool
ucol_setUpLatinOne(UCollator * coll,UErrorCode * status)6400 ucol_setUpLatinOne(UCollator *coll, UErrorCode *status) {
6401 UBool result = TRUE;
6402 if(coll->latinOneCEs == NULL) {
6403 coll->latinOneCEs = (uint32_t *)uprv_malloc(sizeof(uint32_t)*UCOL_LATINONETABLELEN*3);
6404 if(coll->latinOneCEs == NULL) {
6405 *status = U_MEMORY_ALLOCATION_ERROR;
6406 return FALSE;
6407 }
6408 coll->latinOneTableLen = UCOL_LATINONETABLELEN;
6409 }
6410 UChar ch = 0;
6411 UCollationElements *it = ucol_openElements(coll, &ch, 1, status);
6412 uprv_memset(coll->latinOneCEs, 0, sizeof(uint32_t)*coll->latinOneTableLen*3);
6413
6414 int32_t primShift = 24, secShift = 24, terShift = 24;
6415 uint32_t CE = 0;
6416 int32_t contractionOffset = UCOL_ENDOFLATINONERANGE+1;
6417
6418 // TODO: make safe if you get more than you wanted...
6419 for(ch = 0; ch <= UCOL_ENDOFLATINONERANGE; ch++) {
6420 primShift = 24; secShift = 24; terShift = 24;
6421 if(ch < 0x100) {
6422 CE = coll->latinOneMapping[ch];
6423 } else {
6424 CE = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
6425 if(CE == UCOL_NOT_FOUND && coll->UCA) {
6426 CE = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);
6427 }
6428 }
6429 if(CE < UCOL_NOT_FOUND) {
6430 ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift);
6431 } else {
6432 switch (getCETag(CE)) {
6433 case EXPANSION_TAG:
6434 case DIGIT_TAG:
6435 ucol_setText(it, &ch, 1, status);
6436 while((int32_t)(CE = ucol_next(it, status)) != UCOL_NULLORDER) {
6437 if(primShift < 0 || secShift < 0 || terShift < 0) {
6438 coll->latinOneCEs[ch] = UCOL_BAIL_OUT_CE;
6439 coll->latinOneCEs[coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
6440 coll->latinOneCEs[2*coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
6441 break;
6442 }
6443 ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift);
6444 }
6445 break;
6446 case CONTRACTION_TAG:
6447 // here is the trick
6448 // F2 is contraction. We do something very similar to contractions
6449 // but have two indices, one in the real contraction table and the
6450 // other to where we stuffed things. This hopes that we don't have
6451 // many contractions (this should work for latin-1 tables).
6452 {
6453 if((CE & 0x00FFF000) != 0) {
6454 *status = U_UNSUPPORTED_ERROR;
6455 goto cleanup_after_failure;
6456 }
6457
6458 const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE);
6459
6460 CE |= (contractionOffset & 0xFFF) << 12; // insert the offset in latin-1 table
6461
6462 coll->latinOneCEs[ch] = CE;
6463 coll->latinOneCEs[coll->latinOneTableLen+ch] = CE;
6464 coll->latinOneCEs[2*coll->latinOneTableLen+ch] = CE;
6465
6466 // We're going to jump into contraction table, pick the elements
6467 // and use them
6468 do {
6469 CE = *(coll->contractionCEs +
6470 (UCharOffset - coll->contractionIndex));
6471 if(CE > UCOL_NOT_FOUND && getCETag(CE) == EXPANSION_TAG) {
6472 uint32_t size;
6473 uint32_t i; /* general counter */
6474 uint32_t *CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */
6475 size = getExpansionCount(CE);
6476 //CE = *CEOffset++;
6477 if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */
6478 for(i = 0; i<size; i++) {
6479 if(primShift < 0 || secShift < 0 || terShift < 0) {
6480 coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6481 coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6482 coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6483 break;
6484 }
6485 ucol_addLatinOneEntry(coll, (UChar)contractionOffset, *CEOffset++, &primShift, &secShift, &terShift);
6486 }
6487 } else { /* else, we do */
6488 while(*CEOffset != 0) {
6489 if(primShift < 0 || secShift < 0 || terShift < 0) {
6490 coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6491 coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6492 coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6493 break;
6494 }
6495 ucol_addLatinOneEntry(coll, (UChar)contractionOffset, *CEOffset++, &primShift, &secShift, &terShift);
6496 }
6497 }
6498 contractionOffset++;
6499 } else if(CE < UCOL_NOT_FOUND) {
6500 ucol_addLatinOneEntry(coll, (UChar)contractionOffset++, CE, &primShift, &secShift, &terShift);
6501 } else {
6502 coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6503 coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6504 coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6505 contractionOffset++;
6506 }
6507 UCharOffset++;
6508 primShift = 24; secShift = 24; terShift = 24;
6509 if(contractionOffset == coll->latinOneTableLen) { // we need to reallocate
6510 if(!ucol_resizeLatinOneTable(coll, 2*coll->latinOneTableLen, status)) {
6511 goto cleanup_after_failure;
6512 }
6513 }
6514 } while(*UCharOffset != 0xFFFF);
6515 }
6516 break;
6517 default:
6518 goto cleanup_after_failure;
6519 }
6520 }
6521 }
6522 // compact table
6523 if(contractionOffset < coll->latinOneTableLen) {
6524 if(!ucol_resizeLatinOneTable(coll, contractionOffset, status)) {
6525 goto cleanup_after_failure;
6526 }
6527 }
6528 ucol_closeElements(it);
6529 return result;
6530
6531 cleanup_after_failure:
6532 // status should already be set before arriving here.
6533 coll->latinOneFailed = TRUE;
6534 ucol_closeElements(it);
6535 return FALSE;
6536 }
6537
ucol_updateInternalState(UCollator * coll,UErrorCode * status)6538 void ucol_updateInternalState(UCollator *coll, UErrorCode *status) {
6539 if(U_SUCCESS(*status)) {
6540 if(coll->caseFirst == UCOL_UPPER_FIRST) {
6541 coll->caseSwitch = UCOL_CASE_SWITCH;
6542 } else {
6543 coll->caseSwitch = UCOL_NO_CASE_SWITCH;
6544 }
6545
6546 if(coll->caseLevel == UCOL_ON || coll->caseFirst == UCOL_OFF) {
6547 coll->tertiaryMask = UCOL_REMOVE_CASE;
6548 coll->tertiaryCommon = UCOL_COMMON3_NORMAL;
6549 coll->tertiaryAddition = UCOL_FLAG_BIT_MASK_CASE_SW_OFF;
6550 coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_OFF;
6551 coll->tertiaryBottom = UCOL_COMMON_BOT3;
6552 } else {
6553 coll->tertiaryMask = UCOL_KEEP_CASE;
6554 coll->tertiaryAddition = UCOL_FLAG_BIT_MASK_CASE_SW_ON;
6555 if(coll->caseFirst == UCOL_UPPER_FIRST) {
6556 coll->tertiaryCommon = UCOL_COMMON3_UPPERFIRST;
6557 coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_UPPER;
6558 coll->tertiaryBottom = UCOL_COMMON_BOTTOM3_CASE_SW_UPPER;
6559 } else {
6560 coll->tertiaryCommon = UCOL_COMMON3_NORMAL;
6561 coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_LOWER;
6562 coll->tertiaryBottom = UCOL_COMMON_BOTTOM3_CASE_SW_LOWER;
6563 }
6564 }
6565
6566 /* Set the compression values */
6567 uint8_t tertiaryTotal = (uint8_t)(coll->tertiaryTop - UCOL_COMMON_BOT3-1);
6568 coll->tertiaryTopCount = (uint8_t)(UCOL_PROPORTION3*tertiaryTotal); /* we multilply double with int, but need only int */
6569 coll->tertiaryBottomCount = (uint8_t)(tertiaryTotal - coll->tertiaryTopCount);
6570
6571 if(coll->caseLevel == UCOL_OFF && coll->strength == UCOL_TERTIARY
6572 && coll->frenchCollation == UCOL_OFF && coll->alternateHandling == UCOL_NON_IGNORABLE) {
6573 coll->sortKeyGen = ucol_calcSortKeySimpleTertiary;
6574 } else {
6575 coll->sortKeyGen = ucol_calcSortKey;
6576 }
6577 if(coll->caseLevel == UCOL_OFF && coll->strength <= UCOL_TERTIARY && coll->numericCollation == UCOL_OFF
6578 && coll->alternateHandling == UCOL_NON_IGNORABLE && !coll->latinOneFailed) {
6579 if(coll->latinOneCEs == NULL || coll->latinOneRegenTable) {
6580 if(ucol_setUpLatinOne(coll, status)) { // if we succeed in building latin1 table, we'll use it
6581 //fprintf(stderr, "F");
6582 coll->latinOneUse = TRUE;
6583 } else {
6584 coll->latinOneUse = FALSE;
6585 }
6586 if(*status == U_UNSUPPORTED_ERROR) {
6587 *status = U_ZERO_ERROR;
6588 }
6589 } else { // latin1Table exists and it doesn't need to be regenerated, just use it
6590 coll->latinOneUse = TRUE;
6591 }
6592 } else {
6593 coll->latinOneUse = FALSE;
6594 }
6595 }
6596 }
6597
6598 U_CAPI uint32_t U_EXPORT2
ucol_setVariableTop(UCollator * coll,const UChar * varTop,int32_t len,UErrorCode * status)6599 ucol_setVariableTop(UCollator *coll, const UChar *varTop, int32_t len, UErrorCode *status) {
6600 if(U_FAILURE(*status) || coll == NULL) {
6601 return 0;
6602 }
6603 if(len == -1) {
6604 len = u_strlen(varTop);
6605 }
6606 if(len == 0) {
6607 *status = U_ILLEGAL_ARGUMENT_ERROR;
6608 return 0;
6609 }
6610
6611 collIterate s;
6612 IInit_collIterate(coll, varTop, len, &s);
6613
6614 uint32_t CE = ucol_IGetNextCE(coll, &s, status);
6615
6616 /* here we check if we have consumed all characters */
6617 /* you can put in either one character or a contraction */
6618 /* you shouldn't put more... */
6619 if(s.pos != s.endp || CE == UCOL_NO_MORE_CES) {
6620 *status = U_CE_NOT_FOUND_ERROR;
6621 return 0;
6622 }
6623
6624 uint32_t nextCE = ucol_IGetNextCE(coll, &s, status);
6625
6626 if(isContinuation(nextCE) && (nextCE & UCOL_PRIMARYMASK) != 0) {
6627 *status = U_PRIMARY_TOO_LONG_ERROR;
6628 return 0;
6629 }
6630 if(coll->variableTopValue != (CE & UCOL_PRIMARYMASK)>>16) {
6631 coll->variableTopValueisDefault = FALSE;
6632 coll->variableTopValue = (CE & UCOL_PRIMARYMASK)>>16;
6633 }
6634
6635 return CE & UCOL_PRIMARYMASK;
6636 }
6637
ucol_getVariableTop(const UCollator * coll,UErrorCode * status)6638 U_CAPI uint32_t U_EXPORT2 ucol_getVariableTop(const UCollator *coll, UErrorCode *status) {
6639 if(U_FAILURE(*status) || coll == NULL) {
6640 return 0;
6641 }
6642 return coll->variableTopValue<<16;
6643 }
6644
6645 U_CAPI void U_EXPORT2
ucol_restoreVariableTop(UCollator * coll,const uint32_t varTop,UErrorCode * status)6646 ucol_restoreVariableTop(UCollator *coll, const uint32_t varTop, UErrorCode *status) {
6647 if(U_FAILURE(*status) || coll == NULL) {
6648 return;
6649 }
6650
6651 if(coll->variableTopValue != (varTop & UCOL_PRIMARYMASK)>>16) {
6652 coll->variableTopValueisDefault = FALSE;
6653 coll->variableTopValue = (varTop & UCOL_PRIMARYMASK)>>16;
6654 }
6655 }
6656 /* Attribute setter API */
6657 U_CAPI void U_EXPORT2
ucol_setAttribute(UCollator * coll,UColAttribute attr,UColAttributeValue value,UErrorCode * status)6658 ucol_setAttribute(UCollator *coll, UColAttribute attr, UColAttributeValue value, UErrorCode *status) {
6659 if(U_FAILURE(*status) || coll == NULL) {
6660 return;
6661 }
6662 UColAttributeValue oldFrench = coll->frenchCollation;
6663 UColAttributeValue oldCaseFirst = coll->caseFirst;
6664 switch(attr) {
6665 case UCOL_NUMERIC_COLLATION: /* sort substrings of digits as numbers */
6666 if(value == UCOL_ON) {
6667 coll->numericCollation = UCOL_ON;
6668 coll->numericCollationisDefault = FALSE;
6669 } else if (value == UCOL_OFF) {
6670 coll->numericCollation = UCOL_OFF;
6671 coll->numericCollationisDefault = FALSE;
6672 } else if (value == UCOL_DEFAULT) {
6673 coll->numericCollationisDefault = TRUE;
6674 coll->numericCollation = (UColAttributeValue)coll->options->numericCollation;
6675 } else {
6676 *status = U_ILLEGAL_ARGUMENT_ERROR;
6677 }
6678 break;
6679 case UCOL_HIRAGANA_QUATERNARY_MODE: /* special quaternary values for Hiragana */
6680 if(value == UCOL_ON) {
6681 coll->hiraganaQ = UCOL_ON;
6682 coll->hiraganaQisDefault = FALSE;
6683 } else if (value == UCOL_OFF) {
6684 coll->hiraganaQ = UCOL_OFF;
6685 coll->hiraganaQisDefault = FALSE;
6686 } else if (value == UCOL_DEFAULT) {
6687 coll->hiraganaQisDefault = TRUE;
6688 coll->hiraganaQ = (UColAttributeValue)coll->options->hiraganaQ;
6689 } else {
6690 *status = U_ILLEGAL_ARGUMENT_ERROR;
6691 }
6692 break;
6693 case UCOL_FRENCH_COLLATION: /* attribute for direction of secondary weights*/
6694 if(value == UCOL_ON) {
6695 coll->frenchCollation = UCOL_ON;
6696 coll->frenchCollationisDefault = FALSE;
6697 } else if (value == UCOL_OFF) {
6698 coll->frenchCollation = UCOL_OFF;
6699 coll->frenchCollationisDefault = FALSE;
6700 } else if (value == UCOL_DEFAULT) {
6701 coll->frenchCollationisDefault = TRUE;
6702 coll->frenchCollation = (UColAttributeValue)coll->options->frenchCollation;
6703 } else {
6704 *status = U_ILLEGAL_ARGUMENT_ERROR ;
6705 }
6706 break;
6707 case UCOL_ALTERNATE_HANDLING: /* attribute for handling variable elements*/
6708 if(value == UCOL_SHIFTED) {
6709 coll->alternateHandling = UCOL_SHIFTED;
6710 coll->alternateHandlingisDefault = FALSE;
6711 } else if (value == UCOL_NON_IGNORABLE) {
6712 coll->alternateHandling = UCOL_NON_IGNORABLE;
6713 coll->alternateHandlingisDefault = FALSE;
6714 } else if (value == UCOL_DEFAULT) {
6715 coll->alternateHandlingisDefault = TRUE;
6716 coll->alternateHandling = (UColAttributeValue)coll->options->alternateHandling ;
6717 } else {
6718 *status = U_ILLEGAL_ARGUMENT_ERROR ;
6719 }
6720 break;
6721 case UCOL_CASE_FIRST: /* who goes first, lower case or uppercase */
6722 if(value == UCOL_LOWER_FIRST) {
6723 coll->caseFirst = UCOL_LOWER_FIRST;
6724 coll->caseFirstisDefault = FALSE;
6725 } else if (value == UCOL_UPPER_FIRST) {
6726 coll->caseFirst = UCOL_UPPER_FIRST;
6727 coll->caseFirstisDefault = FALSE;
6728 } else if (value == UCOL_OFF) {
6729 coll->caseFirst = UCOL_OFF;
6730 coll->caseFirstisDefault = FALSE;
6731 } else if (value == UCOL_DEFAULT) {
6732 coll->caseFirst = (UColAttributeValue)coll->options->caseFirst;
6733 coll->caseFirstisDefault = TRUE;
6734 } else {
6735 *status = U_ILLEGAL_ARGUMENT_ERROR ;
6736 }
6737 break;
6738 case UCOL_CASE_LEVEL: /* do we have an extra case level */
6739 if(value == UCOL_ON) {
6740 coll->caseLevel = UCOL_ON;
6741 coll->caseLevelisDefault = FALSE;
6742 } else if (value == UCOL_OFF) {
6743 coll->caseLevel = UCOL_OFF;
6744 coll->caseLevelisDefault = FALSE;
6745 } else if (value == UCOL_DEFAULT) {
6746 coll->caseLevel = (UColAttributeValue)coll->options->caseLevel;
6747 coll->caseLevelisDefault = TRUE;
6748 } else {
6749 *status = U_ILLEGAL_ARGUMENT_ERROR ;
6750 }
6751 break;
6752 case UCOL_NORMALIZATION_MODE: /* attribute for normalization */
6753 if(value == UCOL_ON) {
6754 coll->normalizationMode = UCOL_ON;
6755 coll->normalizationModeisDefault = FALSE;
6756 } else if (value == UCOL_OFF) {
6757 coll->normalizationMode = UCOL_OFF;
6758 coll->normalizationModeisDefault = FALSE;
6759 } else if (value == UCOL_DEFAULT) {
6760 coll->normalizationModeisDefault = TRUE;
6761 coll->normalizationMode = (UColAttributeValue)coll->options->normalizationMode;
6762 } else {
6763 *status = U_ILLEGAL_ARGUMENT_ERROR ;
6764 }
6765 break;
6766 case UCOL_STRENGTH: /* attribute for strength */
6767 if (value == UCOL_DEFAULT) {
6768 coll->strengthisDefault = TRUE;
6769 coll->strength = (UColAttributeValue)coll->options->strength;
6770 } else if (value <= UCOL_IDENTICAL) {
6771 coll->strengthisDefault = FALSE;
6772 coll->strength = value;
6773 } else {
6774 *status = U_ILLEGAL_ARGUMENT_ERROR ;
6775 }
6776 break;
6777 case UCOL_ATTRIBUTE_COUNT:
6778 default:
6779 *status = U_ILLEGAL_ARGUMENT_ERROR;
6780 break;
6781 }
6782 if(oldFrench != coll->frenchCollation || oldCaseFirst != coll->caseFirst) {
6783 coll->latinOneRegenTable = TRUE;
6784 } else {
6785 coll->latinOneRegenTable = FALSE;
6786 }
6787 ucol_updateInternalState(coll, status);
6788 }
6789
6790 U_CAPI UColAttributeValue U_EXPORT2
ucol_getAttribute(const UCollator * coll,UColAttribute attr,UErrorCode * status)6791 ucol_getAttribute(const UCollator *coll, UColAttribute attr, UErrorCode *status) {
6792 if(U_FAILURE(*status) || coll == NULL) {
6793 return UCOL_DEFAULT;
6794 }
6795 switch(attr) {
6796 case UCOL_NUMERIC_COLLATION:
6797 return coll->numericCollation;
6798 case UCOL_HIRAGANA_QUATERNARY_MODE:
6799 return coll->hiraganaQ;
6800 case UCOL_FRENCH_COLLATION: /* attribute for direction of secondary weights*/
6801 return coll->frenchCollation;
6802 case UCOL_ALTERNATE_HANDLING: /* attribute for handling variable elements*/
6803 return coll->alternateHandling;
6804 case UCOL_CASE_FIRST: /* who goes first, lower case or uppercase */
6805 return coll->caseFirst;
6806 case UCOL_CASE_LEVEL: /* do we have an extra case level */
6807 return coll->caseLevel;
6808 case UCOL_NORMALIZATION_MODE: /* attribute for normalization */
6809 return coll->normalizationMode;
6810 case UCOL_STRENGTH: /* attribute for strength */
6811 return coll->strength;
6812 case UCOL_ATTRIBUTE_COUNT:
6813 default:
6814 *status = U_ILLEGAL_ARGUMENT_ERROR;
6815 break;
6816 }
6817 return UCOL_DEFAULT;
6818 }
6819
6820 U_CAPI void U_EXPORT2
ucol_setStrength(UCollator * coll,UCollationStrength strength)6821 ucol_setStrength( UCollator *coll,
6822 UCollationStrength strength)
6823 {
6824 UErrorCode status = U_ZERO_ERROR;
6825 ucol_setAttribute(coll, UCOL_STRENGTH, strength, &status);
6826 }
6827
6828 U_CAPI UCollationStrength U_EXPORT2
ucol_getStrength(const UCollator * coll)6829 ucol_getStrength(const UCollator *coll)
6830 {
6831 UErrorCode status = U_ZERO_ERROR;
6832 return ucol_getAttribute(coll, UCOL_STRENGTH, &status);
6833 }
6834
6835 /****************************************************************************/
6836 /* Following are misc functions */
6837 /* there are new APIs and some compatibility APIs */
6838 /****************************************************************************/
6839
6840 U_CAPI void U_EXPORT2
ucol_getVersion(const UCollator * coll,UVersionInfo versionInfo)6841 ucol_getVersion(const UCollator* coll,
6842 UVersionInfo versionInfo)
6843 {
6844 /* RunTime version */
6845 uint8_t rtVersion = UCOL_RUNTIME_VERSION;
6846 /* Builder version*/
6847 uint8_t bdVersion = coll->image->version[0];
6848
6849 /* Charset Version. Need to get the version from cnv files
6850 * makeconv should populate cnv files with version and
6851 * an api has to be provided in ucnv.h to obtain this version
6852 */
6853 uint8_t csVersion = 0;
6854
6855 /* combine the version info */
6856 uint16_t cmbVersion = (uint16_t)((rtVersion<<11) | (bdVersion<<6) | (csVersion));
6857
6858 /* Tailoring rules */
6859 versionInfo[0] = (uint8_t)(cmbVersion>>8);
6860 versionInfo[1] = (uint8_t)cmbVersion;
6861 versionInfo[2] = coll->image->version[1];
6862 if(coll->UCA) {
6863 versionInfo[3] = coll->UCA->image->UCAVersion[0];
6864 } else {
6865 versionInfo[3] = 0;
6866 }
6867 }
6868
6869
6870 /* This internal API checks whether a character is tailored or not */
6871 U_CAPI UBool U_EXPORT2
ucol_isTailored(const UCollator * coll,const UChar u,UErrorCode * status)6872 ucol_isTailored(const UCollator *coll, const UChar u, UErrorCode *status) {
6873 if(U_FAILURE(*status) || coll == NULL || coll == coll->UCA) {
6874 return FALSE;
6875 }
6876
6877 uint32_t CE = UCOL_NOT_FOUND;
6878 const UChar *ContractionStart = NULL;
6879 if(u < 0x100) { /* latin-1 */
6880 CE = coll->latinOneMapping[u];
6881 if(coll->UCA && CE == coll->UCA->latinOneMapping[u]) {
6882 return FALSE;
6883 }
6884 } else { /* regular */
6885 CE = UTRIE_GET32_FROM_LEAD(&coll->mapping, u);
6886 }
6887
6888 if(isContraction(CE)) {
6889 ContractionStart = (UChar *)coll->image+getContractOffset(CE);
6890 CE = *(coll->contractionCEs + (ContractionStart- coll->contractionIndex));
6891 }
6892
6893 return (UBool)(CE != UCOL_NOT_FOUND);
6894 }
6895
6896
6897 /****************************************************************************/
6898 /* Following are the string compare functions */
6899 /* */
6900 /****************************************************************************/
6901
6902
6903 /* ucol_checkIdent internal function. Does byte level string compare. */
6904 /* Used by strcoll if strength == identical and strings */
6905 /* are otherwise equal. Moved out-of-line because this */
6906 /* is a rare case. */
6907 /* */
6908 /* Comparison must be done on NFD normalized strings. */
6909 /* FCD is not good enough. */
6910 /* */
6911 /* TODO: make an incremental NFD Comparison function, which could */
6912 /* be of general use */
6913
6914 static
ucol_checkIdent(collIterate * sColl,collIterate * tColl,UBool normalize,UErrorCode * status)6915 UCollationResult ucol_checkIdent(collIterate *sColl, collIterate *tColl, UBool normalize, UErrorCode *status)
6916 {
6917
6918 // TODO: When we have an UChar iterator, we need to access the whole string. One
6919 // useful modification would be a UChar iterator extract API, since reset next next...
6920 // is not optimal.
6921 // TODO: Handle long strings. Do the same in compareUsingSortKeys.
6922
6923 // When we arrive here, we can have normal strings or UCharIterators. Currently they are both
6924 // of same type, but that doesn't really mean that it will stay that way.
6925
6926 // The division for the array length may truncate the array size to
6927 // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
6928 // for all platforms anyway.
6929 UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
6930 UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
6931 //UChar sStackBuf[256], tStackBuf[256];
6932 //int32_t sBufSize = 256, tBufSize = 256;
6933 int32_t comparison;
6934 int32_t sLen = 0;
6935 UChar *sBuf = NULL;
6936 int32_t tLen = 0;
6937 UChar *tBuf = NULL;
6938 UBool freeSBuf = FALSE, freeTBuf = FALSE;
6939
6940 if (sColl->flags & UCOL_USE_ITERATOR) {
6941 UNormIterator *sNIt = NULL, *tNIt = NULL;
6942 sNIt = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), status);
6943 tNIt = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), status);
6944 sColl->iterator->move(sColl->iterator, 0, UITER_START);
6945 tColl->iterator->move(tColl->iterator, 0, UITER_START);
6946 UCharIterator *sIt = unorm_setIter(sNIt, sColl->iterator, UNORM_NFD, status);
6947 UCharIterator *tIt = unorm_setIter(tNIt, tColl->iterator, UNORM_NFD, status);
6948 comparison = u_strCompareIter(sIt, tIt, TRUE);
6949 unorm_closeIter(sNIt);
6950 unorm_closeIter(tNIt);
6951 } else {
6952 sLen = (sColl->flags & UCOL_ITER_HASLEN) ? sColl->endp - sColl->string : -1;
6953 sBuf = sColl->string;
6954 tLen = (tColl->flags & UCOL_ITER_HASLEN) ? tColl->endp - tColl->string : -1;
6955 tBuf = tColl->string;
6956
6957 if (normalize) {
6958 *status = U_ZERO_ERROR;
6959 if (unorm_quickCheck(sBuf, sLen, UNORM_NFD, status) != UNORM_YES) {
6960 sLen = unorm_decompose(sColl->writableBuffer, (int32_t)sColl->writableBufSize,
6961 sBuf, sLen,
6962 FALSE, 0,
6963 status);
6964 if(*status == U_BUFFER_OVERFLOW_ERROR) {
6965 if(!u_growBufferFromStatic(sColl->stackWritableBuffer,
6966 &sColl->writableBuffer,
6967 (int32_t *)&sColl->writableBufSize, sLen,
6968 0)
6969 ) {
6970 *status = U_MEMORY_ALLOCATION_ERROR;
6971 return UCOL_LESS; /* TODO set *status = U_MEMORY_ALLOCATION_ERROR; */
6972 }
6973 *status = U_ZERO_ERROR;
6974 sLen = unorm_decompose(sColl->writableBuffer, (int32_t)sColl->writableBufSize,
6975 sBuf, sLen,
6976 FALSE, 0,
6977 status);
6978 }
6979 if(freeSBuf) {
6980 uprv_free(sBuf);
6981 freeSBuf = FALSE;
6982 }
6983 sBuf = sColl->writableBuffer;
6984 if (sBuf != sColl->stackWritableBuffer) {
6985 sColl->flags |= UCOL_ITER_ALLOCATED;
6986 }
6987 }
6988
6989 *status = U_ZERO_ERROR;
6990 if (unorm_quickCheck(tBuf, tLen, UNORM_NFD, status) != UNORM_YES) {
6991 tLen = unorm_decompose(tColl->writableBuffer, (int32_t)tColl->writableBufSize,
6992 tBuf, tLen,
6993 FALSE, 0,
6994 status);
6995 if(*status == U_BUFFER_OVERFLOW_ERROR) {
6996 if(!u_growBufferFromStatic(tColl->stackWritableBuffer,
6997 &tColl->writableBuffer,
6998 (int32_t *)&tColl->writableBufSize, tLen,
6999 0)
7000 ) {
7001 *status = U_MEMORY_ALLOCATION_ERROR;
7002 return UCOL_LESS; /* TODO set *status = U_MEMORY_ALLOCATION_ERROR; */
7003 }
7004 *status = U_ZERO_ERROR;
7005 tLen = unorm_decompose(tColl->writableBuffer, (int32_t)tColl->writableBufSize,
7006 tBuf, tLen,
7007 FALSE, 0,
7008 status);
7009 }
7010 if(freeTBuf) {
7011 uprv_free(tBuf);
7012 freeTBuf = FALSE;
7013 }
7014 tBuf = tColl->writableBuffer;
7015 if (tBuf != tColl->stackWritableBuffer) {
7016 tColl->flags |= UCOL_ITER_ALLOCATED;
7017 }
7018 }
7019 }
7020
7021 if (sLen == -1 && tLen == -1) {
7022 comparison = u_strcmpCodePointOrder(sBuf, tBuf);
7023 } else {
7024 if (sLen == -1) {
7025 sLen = u_strlen(sBuf);
7026 }
7027 if (tLen == -1) {
7028 tLen = u_strlen(tBuf);
7029 }
7030 comparison = u_memcmpCodePointOrder(sBuf, tBuf, uprv_min(sLen, tLen));
7031 if (comparison == 0) {
7032 comparison = sLen - tLen;
7033 }
7034 }
7035 }
7036
7037 if (comparison < 0) {
7038 return UCOL_LESS;
7039 } else if (comparison == 0) {
7040 return UCOL_EQUAL;
7041 } else /* comparison > 0 */ {
7042 return UCOL_GREATER;
7043 }
7044 }
7045
/* CEBuf - a growable buffer of CEs, together with some inline functions, */
/*         used to save CEs within ucol_strcoll                           */

/* number of CEs that fit into the array embedded in a ucol_CEBuf */
#define UCOL_CEBUF_SIZE 512

typedef struct ucol_CEBuf ucol_CEBuf;

struct ucol_CEBuf {
    uint32_t    *buf;                           /* start of the storage in use */
    uint32_t    *endp;                          /* one past the last slot of the storage */
    uint32_t    *pos;                           /* slot the next CE is written to */
    uint32_t     localArray[UCOL_CEBUF_SIZE];   /* embedded initial storage */
};
7056
7057
7058 static
UCOL_INIT_CEBUF(ucol_CEBuf * b)7059 inline void UCOL_INIT_CEBUF(ucol_CEBuf *b) {
7060 (b)->buf = (b)->pos = (b)->localArray;
7061 (b)->endp = (b)->buf + UCOL_CEBUF_SIZE;
7062 }
7063
7064 static
ucol_CEBuf_Expand(ucol_CEBuf * b,collIterate * ci)7065 void ucol_CEBuf_Expand(ucol_CEBuf *b, collIterate *ci) {
7066 uint32_t oldSize;
7067 uint32_t newSize;
7068 uint32_t *newBuf;
7069
7070 ci->flags |= UCOL_ITER_ALLOCATED;
7071 oldSize = b->pos - b->buf;
7072 newSize = oldSize * 2;
7073 newBuf = (uint32_t *)uprv_malloc(newSize * sizeof(uint32_t));
7074 if(newBuf != NULL) {
7075 uprv_memcpy(newBuf, b->buf, oldSize * sizeof(uint32_t));
7076 if (b->buf != b->localArray) {
7077 uprv_free(b->buf);
7078 }
7079 b->buf = newBuf;
7080 b->endp = b->buf + newSize;
7081 b->pos = b->buf + oldSize;
7082 }
7083 }
7084
7085 static
UCOL_CEBUF_PUT(ucol_CEBuf * b,uint32_t ce,collIterate * ci)7086 inline void UCOL_CEBUF_PUT(ucol_CEBuf *b, uint32_t ce, collIterate *ci) {
7087 if (b->pos == b->endp) {
7088 ucol_CEBuf_Expand(b, ci);
7089 }
7090 *(b)->pos++ = ce;
7091 }
7092
7093 /* This is a trick string compare function that goes in and uses sortkeys to compare */
7094 /* It is used when compare gets in trouble and needs to bail out */
ucol_compareUsingSortKeys(collIterate * sColl,collIterate * tColl,UErrorCode * status)7095 static UCollationResult ucol_compareUsingSortKeys(collIterate *sColl,
7096 collIterate *tColl,
7097 UErrorCode *status)
7098 {
7099 uint8_t sourceKey[UCOL_MAX_BUFFER], targetKey[UCOL_MAX_BUFFER];
7100 uint8_t *sourceKeyP = sourceKey;
7101 uint8_t *targetKeyP = targetKey;
7102 int32_t sourceKeyLen = UCOL_MAX_BUFFER, targetKeyLen = UCOL_MAX_BUFFER;
7103 const UCollator *coll = sColl->coll;
7104 UChar *source = NULL;
7105 UChar *target = NULL;
7106 int32_t result = UCOL_EQUAL;
7107 UChar sStackBuf[256], tStackBuf[256];
7108 int32_t sourceLength = (sColl->flags&UCOL_ITER_HASLEN)?(sColl->endp-sColl->string):-1;
7109 int32_t targetLength = (tColl->flags&UCOL_ITER_HASLEN)?(tColl->endp-tColl->string):-1;
7110
7111 // TODO: Handle long strings. Do the same in ucol_checkIdent.
7112 if(sColl->flags & UCOL_USE_ITERATOR) {
7113 sColl->iterator->move(sColl->iterator, 0, UITER_START);
7114 tColl->iterator->move(tColl->iterator, 0, UITER_START);
7115 source = sStackBuf;
7116 UChar *sBufp = source;
7117 target = tStackBuf;
7118 UChar *tBufp = target;
7119 while(sColl->iterator->hasNext(sColl->iterator)) {
7120 *sBufp++ = (UChar)sColl->iterator->next(sColl->iterator);
7121 }
7122 while(tColl->iterator->hasNext(tColl->iterator)) {
7123 *tBufp++ = (UChar)tColl->iterator->next(tColl->iterator);
7124 }
7125 sourceLength = sBufp - source;
7126 targetLength = tBufp - target;
7127 } else { // no iterators
7128 sourceLength = (sColl->flags&UCOL_ITER_HASLEN)?(sColl->endp-sColl->string):-1;
7129 targetLength = (tColl->flags&UCOL_ITER_HASLEN)?(tColl->endp-tColl->string):-1;
7130 source = sColl->string;
7131 target = tColl->string;
7132 }
7133
7134
7135
7136 sourceKeyLen = ucol_getSortKey(coll, source, sourceLength, sourceKeyP, sourceKeyLen);
7137 if(sourceKeyLen > UCOL_MAX_BUFFER) {
7138 sourceKeyP = (uint8_t*)uprv_malloc(sourceKeyLen*sizeof(uint8_t));
7139 if(sourceKeyP == NULL) {
7140 *status = U_MEMORY_ALLOCATION_ERROR;
7141 goto cleanup_and_do_compare;
7142 }
7143 sourceKeyLen = ucol_getSortKey(coll, source, sourceLength, sourceKeyP, sourceKeyLen);
7144 }
7145
7146 targetKeyLen = ucol_getSortKey(coll, target, targetLength, targetKeyP, targetKeyLen);
7147 if(targetKeyLen > UCOL_MAX_BUFFER) {
7148 targetKeyP = (uint8_t*)uprv_malloc(targetKeyLen*sizeof(uint8_t));
7149 if(targetKeyP == NULL) {
7150 *status = U_MEMORY_ALLOCATION_ERROR;
7151 goto cleanup_and_do_compare;
7152 }
7153 targetKeyLen = ucol_getSortKey(coll, target, targetLength, targetKeyP, targetKeyLen);
7154 }
7155
7156 result = uprv_strcmp((const char*)sourceKeyP, (const char*)targetKeyP);
7157
7158 cleanup_and_do_compare:
7159 if(sourceKeyP != NULL && sourceKeyP != sourceKey) {
7160 uprv_free(sourceKeyP);
7161 }
7162
7163 if(targetKeyP != NULL && targetKeyP != targetKey) {
7164 uprv_free(targetKeyP);
7165 }
7166
7167 if(result<0) {
7168 return UCOL_LESS;
7169 } else if(result>0) {
7170 return UCOL_GREATER;
7171 } else {
7172 return UCOL_EQUAL;
7173 }
7174 }
7175
7176
7177 static inline UCollationResult
ucol_strcollRegular(collIterate * sColl,collIterate * tColl,UErrorCode * status)7178 ucol_strcollRegular( collIterate *sColl, collIterate *tColl,
7179 // const UCollator *coll,
7180 // const UChar *source,
7181 // int32_t sourceLength,
7182 // const UChar *target,
7183 // int32_t targetLength,
7184 UErrorCode *status)
7185 {
7186 U_ALIGN_CODE(16);
7187
7188 const UCollator *coll = sColl->coll;
7189
7190
7191 // setting up the collator parameters
7192 UColAttributeValue strength = coll->strength;
7193 UBool initialCheckSecTer = (strength >= UCOL_SECONDARY);
7194
7195 UBool checkSecTer = initialCheckSecTer;
7196 UBool checkTertiary = (strength >= UCOL_TERTIARY);
7197 UBool checkQuad = (strength >= UCOL_QUATERNARY);
7198 UBool checkIdent = (strength == UCOL_IDENTICAL);
7199 UBool checkCase = (coll->caseLevel == UCOL_ON);
7200 UBool isFrenchSec = (coll->frenchCollation == UCOL_ON) && checkSecTer;
7201 UBool shifted = (coll->alternateHandling == UCOL_SHIFTED);
7202 UBool qShifted = shifted && checkQuad;
7203 UBool doHiragana = (coll->hiraganaQ == UCOL_ON) && checkQuad;
7204
7205 if(doHiragana && shifted) {
7206 return (ucol_compareUsingSortKeys(sColl, tColl, status));
7207 }
7208 uint8_t caseSwitch = coll->caseSwitch;
7209 uint8_t tertiaryMask = coll->tertiaryMask;
7210
7211 // This is the lowest primary value that will not be ignored if shifted
7212 uint32_t LVT = (shifted)?(coll->variableTopValue<<16):0;
7213
7214 UCollationResult result = UCOL_EQUAL;
7215 UCollationResult hirResult = UCOL_EQUAL;
7216
7217 // Preparing the CE buffers. They will be filled during the primary phase
7218 ucol_CEBuf sCEs;
7219 ucol_CEBuf tCEs;
7220 UCOL_INIT_CEBUF(&sCEs);
7221 UCOL_INIT_CEBUF(&tCEs);
7222
7223 uint32_t secS = 0, secT = 0;
7224 uint32_t sOrder=0, tOrder=0;
7225
7226 // Non shifted primary processing is quite simple
7227 if(!shifted) {
7228 for(;;) {
7229
7230 // We fetch CEs until we hit a non ignorable primary or end.
7231 do {
7232 // We get the next CE
7233 sOrder = ucol_IGetNextCE(coll, sColl, status);
7234 // Stuff it in the buffer
7235 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl);
7236 // And keep just the primary part.
7237 sOrder &= UCOL_PRIMARYMASK;
7238 } while(sOrder == 0);
7239
7240 // see the comments on the above block
7241 do {
7242 tOrder = ucol_IGetNextCE(coll, tColl, status);
7243 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl);
7244 tOrder &= UCOL_PRIMARYMASK;
7245 } while(tOrder == 0);
7246
7247 // if both primaries are the same
7248 if(sOrder == tOrder) {
7249 // and there are no more CEs, we advance to the next level
7250 if(sOrder == UCOL_NO_MORE_CES_PRIMARY) {
7251 break;
7252 }
7253 if(doHiragana && hirResult == UCOL_EQUAL) {
7254 if((sColl->flags & UCOL_WAS_HIRAGANA) != (tColl->flags & UCOL_WAS_HIRAGANA)) {
7255 hirResult = ((sColl->flags & UCOL_WAS_HIRAGANA) > (tColl->flags & UCOL_WAS_HIRAGANA))
7256 ? UCOL_LESS:UCOL_GREATER;
7257 }
7258 }
7259 } else {
7260 // if two primaries are different, we are done
7261 result = (sOrder < tOrder) ? UCOL_LESS: UCOL_GREATER;
7262 goto commonReturn;
7263 }
7264 } // no primary difference... do the rest from the buffers
7265 } else { // shifted - do a slightly more complicated processing :)
7266 for(;;) {
7267 UBool sInShifted = FALSE;
7268 UBool tInShifted = FALSE;
7269 // This version of code can be refactored. However, it seems easier to understand this way.
            // Source loop. Same as the target loop.
7271 for(;;) {
7272 sOrder = ucol_IGetNextCE(coll, sColl, status);
7273 if(sOrder == UCOL_NO_MORE_CES) {
7274 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl);
7275 break;
7276 } else if(sOrder == 0
7277 || (sInShifted && (sOrder & UCOL_PRIMARYMASK) == 0)) {
7278 /* UCA amendment - ignore ignorables that follow shifted code points */
7279 continue;
7280 } else if(isContinuation(sOrder)) {
7281 if((sOrder & UCOL_PRIMARYMASK) > 0) { /* There is primary value */
7282 if(sInShifted) {
7283 sOrder = (sOrder & UCOL_PRIMARYMASK) | 0xC0; /* preserve interesting continuation */
7284 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl);
7285 continue;
7286 } else {
7287 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl);
7288 break;
7289 }
7290 } else { /* Just lower level values */
7291 if(sInShifted) {
7292 continue;
7293 } else {
7294 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl);
7295 continue;
7296 }
7297 }
7298 } else { /* regular */
7299 if((sOrder & UCOL_PRIMARYMASK) > LVT) {
7300 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl);
7301 break;
7302 } else {
7303 if((sOrder & UCOL_PRIMARYMASK) > 0) {
7304 sInShifted = TRUE;
7305 sOrder &= UCOL_PRIMARYMASK;
7306 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl);
7307 continue;
7308 } else {
7309 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl);
7310 sInShifted = FALSE;
7311 continue;
7312 }
7313 }
7314 }
7315 }
7316 sOrder &= UCOL_PRIMARYMASK;
7317 sInShifted = FALSE;
7318
7319 for(;;) {
7320 tOrder = ucol_IGetNextCE(coll, tColl, status);
7321 if(tOrder == UCOL_NO_MORE_CES) {
7322 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl);
7323 break;
7324 } else if(tOrder == 0
7325 || (tInShifted && (tOrder & UCOL_PRIMARYMASK) == 0)) {
7326 /* UCA amendment - ignore ignorables that follow shifted code points */
7327 continue;
7328 } else if(isContinuation(tOrder)) {
7329 if((tOrder & UCOL_PRIMARYMASK) > 0) { /* There is primary value */
7330 if(tInShifted) {
7331 tOrder = (tOrder & UCOL_PRIMARYMASK) | 0xC0; /* preserve interesting continuation */
7332 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl);
7333 continue;
7334 } else {
7335 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl);
7336 break;
7337 }
7338 } else { /* Just lower level values */
7339 if(tInShifted) {
7340 continue;
7341 } else {
7342 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl);
7343 continue;
7344 }
7345 }
7346 } else { /* regular */
7347 if((tOrder & UCOL_PRIMARYMASK) > LVT) {
7348 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl);
7349 break;
7350 } else {
7351 if((tOrder & UCOL_PRIMARYMASK) > 0) {
7352 tInShifted = TRUE;
7353 tOrder &= UCOL_PRIMARYMASK;
7354 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl);
7355 continue;
7356 } else {
7357 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl);
7358 tInShifted = FALSE;
7359 continue;
7360 }
7361 }
7362 }
7363 }
7364 tOrder &= UCOL_PRIMARYMASK;
7365 tInShifted = FALSE;
7366
7367 if(sOrder == tOrder) {
7368 /*
7369 if(doHiragana && hirResult == UCOL_EQUAL) {
7370 if((sColl.flags & UCOL_WAS_HIRAGANA) != (tColl.flags & UCOL_WAS_HIRAGANA)) {
7371 hirResult = ((sColl.flags & UCOL_WAS_HIRAGANA) > (tColl.flags & UCOL_WAS_HIRAGANA))
7372 ? UCOL_LESS:UCOL_GREATER;
7373 }
7374 }
7375 */
7376 if(sOrder == UCOL_NO_MORE_CES_PRIMARY) {
7377 break;
7378 } else {
7379 sOrder = 0; tOrder = 0;
7380 continue;
7381 }
7382 } else {
7383 result = (sOrder < tOrder) ? UCOL_LESS : UCOL_GREATER;
7384 goto commonReturn;
7385 }
7386 } /* no primary difference... do the rest from the buffers */
7387 }
7388
7389 /* now, we're gonna reexamine collected CEs */
7390 uint32_t *sCE;
7391 uint32_t *tCE;
7392
7393 /* This is the secondary level of comparison */
7394 if(checkSecTer) {
7395 if(!isFrenchSec) { /* normal */
7396 sCE = sCEs.buf;
7397 tCE = tCEs.buf;
7398 for(;;) {
7399 while (secS == 0) {
7400 secS = *(sCE++) & UCOL_SECONDARYMASK;
7401 }
7402
7403 while(secT == 0) {
7404 secT = *(tCE++) & UCOL_SECONDARYMASK;
7405 }
7406
7407 if(secS == secT) {
7408 if(secS == UCOL_NO_MORE_CES_SECONDARY) {
7409 break;
7410 } else {
7411 secS = 0; secT = 0;
7412 continue;
7413 }
7414 } else {
7415 result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
7416 goto commonReturn;
7417 }
7418 }
7419 } else { /* do the French */
7420 uint32_t *sCESave = NULL;
7421 uint32_t *tCESave = NULL;
7422 sCE = sCEs.pos-2; /* this could also be sCEs-- if needs to be optimized */
7423 tCE = tCEs.pos-2;
7424 for(;;) {
7425 while (secS == 0 && sCE >= sCEs.buf) {
7426 if(sCESave == 0) {
7427 secS = *(sCE--);
7428 if(isContinuation(secS)) {
7429 while(isContinuation(secS = *(sCE--)));
7430 /* after this, secS has the start of continuation, and sCEs points before that */
7431 sCESave = sCE; /* we save it, so that we know where to come back AND that we need to go forward */
7432 sCE+=2; /* need to point to the first continuation CP */
7433 /* However, now you can just continue doing stuff */
7434 }
7435 } else {
7436 secS = *(sCE++);
7437 if(!isContinuation(secS)) { /* This means we have finished with this cont */
7438 sCE = sCESave; /* reset the pointer to before continuation */
7439 sCESave = 0;
7440 continue;
7441 }
7442 }
7443 secS &= UCOL_SECONDARYMASK; /* remove the continuation bit */
7444 }
7445
7446 while(secT == 0 && tCE >= tCEs.buf) {
7447 if(tCESave == 0) {
7448 secT = *(tCE--);
7449 if(isContinuation(secT)) {
7450 while(isContinuation(secT = *(tCE--)));
7451 /* after this, secS has the start of continuation, and sCEs points before that */
7452 tCESave = tCE; /* we save it, so that we know where to come back AND that we need to go forward */
7453 tCE+=2; /* need to point to the first continuation CP */
7454 /* However, now you can just continue doing stuff */
7455 }
7456 } else {
7457 secT = *(tCE++);
7458 if(!isContinuation(secT)) { /* This means we have finished with this cont */
7459 tCE = tCESave; /* reset the pointer to before continuation */
7460 tCESave = 0;
7461 continue;
7462 }
7463 }
7464 secT &= UCOL_SECONDARYMASK; /* remove the continuation bit */
7465 }
7466
7467 if(secS == secT) {
7468 if(secS == UCOL_NO_MORE_CES_SECONDARY || (sCE < sCEs.buf && tCE < tCEs.buf)) {
7469 break;
7470 } else {
7471 secS = 0; secT = 0;
7472 continue;
7473 }
7474 } else {
7475 result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
7476 goto commonReturn;
7477 }
7478 }
7479 }
7480 }
7481
7482 /* doing the case bit */
7483 if(checkCase) {
7484 sCE = sCEs.buf;
7485 tCE = tCEs.buf;
7486 for(;;) {
7487 while((secS & UCOL_REMOVE_CASE) == 0) {
7488 if(!isContinuation(*sCE++)) {
7489 secS =*(sCE-1);
7490 if(((secS & UCOL_PRIMARYMASK) != 0) || strength > UCOL_PRIMARY) {
7491 // primary ignorables should not be considered on the case level when the strength is primary
7492 // otherwise, the CEs stop being well-formed
7493 secS &= UCOL_TERT_CASE_MASK;
7494 secS ^= caseSwitch;
7495 } else {
7496 secS = 0;
7497 }
7498 } else {
7499 secS = 0;
7500 }
7501 }
7502
7503 while((secT & UCOL_REMOVE_CASE) == 0) {
7504 if(!isContinuation(*tCE++)) {
7505 secT = *(tCE-1);
7506 if(((secT & UCOL_PRIMARYMASK) != 0) || strength > UCOL_PRIMARY) {
7507 // primary ignorables should not be considered on the case level when the strength is primary
7508 // otherwise, the CEs stop being well-formed
7509 secT &= UCOL_TERT_CASE_MASK;
7510 secT ^= caseSwitch;
7511 } else {
7512 secT = 0;
7513 }
7514 } else {
7515 secT = 0;
7516 }
7517 }
7518
7519 if((secS & UCOL_CASE_BIT_MASK) < (secT & UCOL_CASE_BIT_MASK)) {
7520 result = UCOL_LESS;
7521 goto commonReturn;
7522 } else if((secS & UCOL_CASE_BIT_MASK) > (secT & UCOL_CASE_BIT_MASK)) {
7523 result = UCOL_GREATER;
7524 goto commonReturn;
7525 }
7526
7527 if((secS & UCOL_REMOVE_CASE) == UCOL_NO_MORE_CES_TERTIARY || (secT & UCOL_REMOVE_CASE) == UCOL_NO_MORE_CES_TERTIARY ) {
7528 break;
7529 } else {
7530 secS = 0;
7531 secT = 0;
7532 }
7533 }
7534 }
7535
7536 /* Tertiary level */
7537 if(checkTertiary) {
7538 secS = 0;
7539 secT = 0;
7540 sCE = sCEs.buf;
7541 tCE = tCEs.buf;
7542 for(;;) {
7543 while((secS & UCOL_REMOVE_CASE) == 0) {
7544 secS = *(sCE++) & tertiaryMask;
7545 if(!isContinuation(secS)) {
7546 secS ^= caseSwitch;
7547 } else {
7548 secS &= UCOL_REMOVE_CASE;
7549 }
7550 }
7551
7552 while((secT & UCOL_REMOVE_CASE) == 0) {
7553 secT = *(tCE++) & tertiaryMask;
7554 if(!isContinuation(secT)) {
7555 secT ^= caseSwitch;
7556 } else {
7557 secT &= UCOL_REMOVE_CASE;
7558 }
7559 }
7560
7561 if(secS == secT) {
7562 if((secS & UCOL_REMOVE_CASE) == 1) {
7563 break;
7564 } else {
7565 secS = 0; secT = 0;
7566 continue;
7567 }
7568 } else {
7569 result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
7570 goto commonReturn;
7571 }
7572 }
7573 }
7574
7575
7576 if(qShifted /*checkQuad*/) {
7577 UBool sInShifted = TRUE;
7578 UBool tInShifted = TRUE;
7579 secS = 0;
7580 secT = 0;
7581 sCE = sCEs.buf;
7582 tCE = tCEs.buf;
7583 for(;;) {
7584 while(secS == 0 && secS != UCOL_NO_MORE_CES || (isContinuation(secS) && !sInShifted)) {
7585 secS = *(sCE++);
7586 if(isContinuation(secS)) {
7587 if(!sInShifted) {
7588 continue;
7589 }
7590 } else if(secS > LVT || (secS & UCOL_PRIMARYMASK) == 0) { /* non continuation */
7591 secS = UCOL_PRIMARYMASK;
7592 sInShifted = FALSE;
7593 } else {
7594 sInShifted = TRUE;
7595 }
7596 }
7597 secS &= UCOL_PRIMARYMASK;
7598
7599
7600 while(secT == 0 && secT != UCOL_NO_MORE_CES || (isContinuation(secT) && !tInShifted)) {
7601 secT = *(tCE++);
7602 if(isContinuation(secT)) {
7603 if(!tInShifted) {
7604 continue;
7605 }
7606 } else if(secT > LVT || (secT & UCOL_PRIMARYMASK) == 0) {
7607 secT = UCOL_PRIMARYMASK;
7608 tInShifted = FALSE;
7609 } else {
7610 tInShifted = TRUE;
7611 }
7612 }
7613 secT &= UCOL_PRIMARYMASK;
7614
7615 if(secS == secT) {
7616 if(secS == UCOL_NO_MORE_CES_PRIMARY) {
7617 break;
7618 } else {
7619 secS = 0; secT = 0;
7620 continue;
7621 }
7622 } else {
7623 result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
7624 goto commonReturn;
7625 }
7626 }
7627 } else if(doHiragana && hirResult != UCOL_EQUAL) {
7628 // If we're fine on quaternaries, we might be different
7629 // on Hiragana. This, however, might fail us in shifted.
7630 result = hirResult;
7631 goto commonReturn;
7632 }
7633
7634 /* For IDENTICAL comparisons, we use a bitwise character comparison */
7635 /* as a tiebreaker if all else is equal. */
7636 /* Getting here should be quite rare - strings are not identical - */
7637 /* that is checked first, but compared == through all other checks. */
7638 if(checkIdent)
7639 {
7640 //result = ucol_checkIdent(&sColl, &tColl, coll->normalizationMode == UCOL_ON);
7641 result = ucol_checkIdent(sColl, tColl, TRUE, status);
7642 }
7643
7644 commonReturn:
7645 if ((sColl->flags | tColl->flags) & UCOL_ITER_ALLOCATED) {
7646 freeHeapWritableBuffer(sColl);
7647 freeHeapWritableBuffer(tColl);
7648
7649 if (sCEs.buf != sCEs.localArray ) {
7650 uprv_free(sCEs.buf);
7651 }
7652 if (tCEs.buf != tCEs.localArray ) {
7653 uprv_free(tCEs.buf);
7654 }
7655 }
7656
7657 return result;
7658 }
7659
7660
7661 static inline uint32_t
ucol_getLatinOneContraction(const UCollator * coll,int32_t strength,uint32_t CE,const UChar * s,int32_t * index,int32_t len)7662 ucol_getLatinOneContraction(const UCollator *coll, int32_t strength,
7663 uint32_t CE, const UChar *s, int32_t *index, int32_t len) {
7664 const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE&0xFFF);
7665 int32_t latinOneOffset = (CE & 0x00FFF000) >> 12;
7666 int32_t offset = 1;
7667 UChar schar = 0, tchar = 0;
7668
7669 for(;;) {
7670 if(len == -1) {
7671 if(s[*index] == 0) { // end of string
7672 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
7673 } else {
7674 schar = s[*index];
7675 }
7676 } else {
7677 if(*index == len) {
7678 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
7679 } else {
7680 schar = s[*index];
7681 }
7682 }
7683
7684 while(schar > (tchar = *(UCharOffset+offset))) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
7685 offset++;
7686 }
7687
7688 if (schar == tchar) {
7689 (*index)++;
7690 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset+offset]);
7691 }
7692 else
7693 {
7694 if(schar & 0xFF00 /*> UCOL_ENDOFLATIN1RANGE*/) {
7695 return UCOL_BAIL_OUT_CE;
7696 }
7697 // skip completely ignorables
7698 uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, schar);
7699 if(isZeroCE == 0) { // we have to ignore completely ignorables
7700 (*index)++;
7701 continue;
7702 }
7703
7704 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
7705 }
7706 }
7707 }
7708
7709
/**
 * This is a fast strcoll, geared towards text in Latin-1.
 * It supports contractions of size two, French secondaries
 * and case switching. You can use it with strengths primary
 * to tertiary. It does not support shifted and case level.
 * It relies on the table built by setupLatin1Table. If it
 * doesn't understand something, it will go to the regular
 * strcoll.
 *
 * The comparison runs one level at a time (primary, then secondary, then
 * tertiary, as far as coll->strength allows). Each level reads its values
 * from coll->latinOneCEs, advancing the table pointer by
 * coll->latinOneTableLen between levels.
 *
 * @param coll   collator providing strength, frenchCollation and latinOneCEs
 * @param source first string
 * @param sLen   length of source, or -1 if it is NUL-terminated
 * @param target second string
 * @param tLen   length of target, or -1 if it is NUL-terminated
 * @param status passed through to ucol_strcollRegular on the fallback path
 * @return UCOL_LESS, UCOL_EQUAL or UCOL_GREATER
 */
static inline UCollationResult
ucol_strcollUseLatin1( const UCollator    *coll,
                       const UChar        *source,
                       int32_t            sLen,
                       const UChar        *target,
                       int32_t            tLen,
                       UErrorCode *status)
{
    U_ALIGN_CODE(16);
    int32_t strength = coll->strength;

    int32_t sIndex = 0, tIndex = 0;
    UChar sChar = 0, tChar = 0;
    uint32_t sOrder=0, tOrder=0;

    UBool endOfSource = FALSE;

    // start of the table section for the current level (primary first)
    uint32_t *elements = coll->latinOneCEs;

    UBool haveContractions = FALSE; // if we have contractions in our string
    // we cannot do French secondary

    // Do the primary level
    for(;;) {
        while(sOrder==0) { // this loop skips primary ignorables
            // sOrder=getNextlatinOneCE(source);
            if(sLen==-1) {   // handling zero terminated strings
                sChar=source[sIndex++];
                if(sChar==0) {
                    endOfSource = TRUE;
                    break;
                }
            } else {        // handling strings with known length
                if(sIndex==sLen) {
                    endOfSource = TRUE;
                    break;
                }
                sChar=source[sIndex++];
            }
            if(sChar&0xFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32)
                //fprintf(stderr, "R");
                goto returnRegular;
                //return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
            }
            sOrder = elements[sChar];
            if(sOrder >= UCOL_NOT_FOUND) { // if we got a special
                // specials can basically be either contractions or bail-out signs. If we get anything
                // else, we'll bail out anyway
                if(getCETag(sOrder) == CONTRACTION_TAG) {
                    sOrder = ucol_getLatinOneContraction(coll, UCOL_PRIMARY, sOrder, source, &sIndex, sLen);
                    haveContractions = TRUE; // if there are contractions, we cannot do French secondary
                    // However, if there are contractions in the table, but we always use just one char,
                    // we might be able to do French. This should be checked out.
                }
                if(sOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) {
                    //fprintf(stderr, "S");
                    goto returnRegular;
                    //return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
                }
            }
        }

        while(tOrder==0) {  // this loop skips primary ignorables
            // tOrder=getNextlatinOneCE(target);
            if(tLen==-1) {    // handling zero terminated strings
                tChar=target[tIndex++];
                if(tChar==0) {
                    if(endOfSource) { // this is different than source loop,
                        // as we already know that source loop is done here,
                        // so we can either finish the primary loop if both
                        // strings are done or announce the result if only
                        // target is done. Same below.
                        goto endOfPrimLoop;
                    } else {
                        return UCOL_GREATER;
                    }
                }
            } else {          // handling strings with known length
                if(tIndex==tLen) {
                    if(endOfSource) {
                        goto endOfPrimLoop;
                    } else {
                        return UCOL_GREATER;
                    }
                }
                tChar=target[tIndex++];
            }
            if(tChar&0xFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32)
                //fprintf(stderr, "R");
                goto returnRegular;
                //return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
            }
            tOrder = elements[tChar];
            if(tOrder >= UCOL_NOT_FOUND) {
                // Handling specials, see the comments for source
                if(getCETag(tOrder) == CONTRACTION_TAG) {
                    tOrder = ucol_getLatinOneContraction(coll, UCOL_PRIMARY, tOrder, target, &tIndex, tLen);
                    haveContractions = TRUE;
                }
                if(tOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) {
                    //fprintf(stderr, "S");
                    goto returnRegular;
                    //return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
                }
            }
        }
        if(endOfSource) { // source is finished, but target is not, say the result.
            return UCOL_LESS;
        }

        if(sOrder == tOrder) { // if we have same CEs, we continue the loop
            sOrder = 0; tOrder = 0;
            continue;
        } else {
            // compare current top bytes
            if(((sOrder^tOrder)&0xFF000000)!=0) {
                // top bytes differ, return difference
                if(sOrder < tOrder) {
                    return UCOL_LESS;
                } else if(sOrder > tOrder) {
                    return UCOL_GREATER;
                }
                // instead of return (int32_t)(sOrder>>24)-(int32_t)(tOrder>>24);
                // since we must return enum value
            }

            // top bytes match, continue with following bytes
            // (a value that becomes 0 after the shift makes the corresponding
            // skip loop above fetch the next element for that string)
            sOrder<<=8;
            tOrder<<=8;
        }
    }

endOfPrimLoop:
    // after primary loop, we definitely know the sizes of strings,
    // so we set it and use simpler loop for secondaries and tertiaries
    // NOTE(review): for a NUL-terminated input the index was post-incremented
    // when the terminator was read, so the length stored here appears to
    // include the terminator; presumably its table entry is 0 and it is
    // skipped by the loops below - confirm.
    sLen = sIndex; tLen = tIndex;
    if(strength >= UCOL_SECONDARY) {
        // adjust the table beginning
        elements += coll->latinOneTableLen;
        endOfSource = FALSE;

        if(coll->frenchCollation == UCOL_OFF) { // non French
            // This loop is a simplified copy of primary loop
            // at this point we know that whole strings are latin-1, so we don't
            // check for that. We also know that we only have contractions as
            // specials.
            sIndex = 0; tIndex = 0;
            for(;;) {
                while(sOrder==0) {
                    if(sIndex==sLen) {
                        endOfSource = TRUE;
                        break;
                    }
                    sChar=source[sIndex++];
                    sOrder = elements[sChar];
                    if(sOrder > UCOL_NOT_FOUND) {
                        sOrder = ucol_getLatinOneContraction(coll, UCOL_SECONDARY, sOrder, source, &sIndex, sLen);
                    }
                }

                while(tOrder==0) {
                    if(tIndex==tLen) {
                        if(endOfSource) {
                            goto endOfSecLoop;
                        } else {
                            return UCOL_GREATER;
                        }
                    }
                    tChar=target[tIndex++];
                    tOrder = elements[tChar];
                    if(tOrder > UCOL_NOT_FOUND) {
                        tOrder = ucol_getLatinOneContraction(coll, UCOL_SECONDARY, tOrder, target, &tIndex, tLen);
                    }
                }
                if(endOfSource) {
                    return UCOL_LESS;
                }

                if(sOrder == tOrder) {
                    sOrder = 0; tOrder = 0;
                    continue;
                } else {
                    // see primary loop for comments on this
                    if(((sOrder^tOrder)&0xFF000000)!=0) {
                        if(sOrder < tOrder) {
                            return UCOL_LESS;
                        } else if(sOrder > tOrder) {
                            return UCOL_GREATER;
                        }
                    }
                    sOrder<<=8;
                    tOrder<<=8;
                }
            }
        } else { // French
            if(haveContractions) { // if we have contractions, we have to bail out
                // since we don't really know how to handle them here
                goto returnRegular;
                //return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
            }
            // For French, we go backwards
            sIndex = sLen; tIndex = tLen;
            for(;;) {
                while(sOrder==0) {
                    if(sIndex==0) {
                        endOfSource = TRUE;
                        break;
                    }
                    sChar=source[--sIndex];
                    sOrder = elements[sChar];
                    // don't even look for contractions
                }

                while(tOrder==0) {
                    if(tIndex==0) {
                        if(endOfSource) {
                            goto endOfSecLoop;
                        } else {
                            return UCOL_GREATER;
                        }
                    }
                    tChar=target[--tIndex];
                    tOrder = elements[tChar];
                    // don't even look for contractions
                }
                if(endOfSource) {
                    return UCOL_LESS;
                }

                if(sOrder == tOrder) {
                    sOrder = 0; tOrder = 0;
                    continue;
                } else {
                    // see the primary loop for comments
                    if(((sOrder^tOrder)&0xFF000000)!=0) {
                        if(sOrder < tOrder) {
                            return UCOL_LESS;
                        } else if(sOrder > tOrder) {
                            return UCOL_GREATER;
                        }
                    }
                    sOrder<<=8;
                    tOrder<<=8;
                }
            }
        }
    }

endOfSecLoop:
    if(strength >= UCOL_TERTIARY) {
        // tertiary loop is the same as secondary (except no French)
        elements += coll->latinOneTableLen;
        sIndex = 0; tIndex = 0;
        endOfSource = FALSE;
        for(;;) {
            while(sOrder==0) {
                if(sIndex==sLen) {
                    endOfSource = TRUE;
                    break;
                }
                sChar=source[sIndex++];
                sOrder = elements[sChar];
                if(sOrder > UCOL_NOT_FOUND) {
                    sOrder = ucol_getLatinOneContraction(coll, UCOL_TERTIARY, sOrder, source, &sIndex, sLen);
                }
            }
            while(tOrder==0) {
                if(tIndex==tLen) {
                    if(endOfSource) {
                        return UCOL_EQUAL; // if both strings are at the end, they are equal
                    } else {
                        return UCOL_GREATER;
                    }
                }
                tChar=target[tIndex++];
                tOrder = elements[tChar];
                if(tOrder > UCOL_NOT_FOUND) {
                    tOrder = ucol_getLatinOneContraction(coll, UCOL_TERTIARY, tOrder, target, &tIndex, tLen);
                }
            }
            if(endOfSource) {
                return UCOL_LESS;
            }
            if(sOrder == tOrder) {
                sOrder = 0; tOrder = 0;
                continue;
            } else {
                if(((sOrder^tOrder)&0xff000000)!=0) {
                    if(sOrder < tOrder) {
                        return UCOL_LESS;
                    } else if(sOrder > tOrder) {
                        return UCOL_GREATER;
                    }
                }
                sOrder<<=8;
                tOrder<<=8;
            }
        }
    }
    // no difference found on any level that was checked
    return UCOL_EQUAL;

returnRegular:
    // Fallback: the fast path met something it does not handle, so the
    // comparison is redone by the regular implementation.
    // Preparing the context objects for iterating over strings
    collIterate sColl, tColl;

    IInit_collIterate(coll, source, sLen, &sColl);
    IInit_collIterate(coll, target, tLen, &tColl);
    return ucol_strcollRegular(&sColl, &tColl, status);
}
8028
8029
8030 U_CAPI UCollationResult U_EXPORT2
ucol_strcollIter(const UCollator * coll,UCharIterator * sIter,UCharIterator * tIter,UErrorCode * status)8031 ucol_strcollIter( const UCollator *coll,
8032 UCharIterator *sIter,
8033 UCharIterator *tIter,
8034 UErrorCode *status) {
8035 if(!status || U_FAILURE(*status)) {
8036 return UCOL_EQUAL;
8037 }
8038
8039 UTRACE_ENTRY(UTRACE_UCOL_STRCOLLITER);
8040 UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, sIter=%p, tIter=%p", coll, sIter, tIter);
8041
8042 if (sIter == tIter) {
8043 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status)
8044 return UCOL_EQUAL;
8045 }
8046 if(sIter == NULL || tIter == NULL || coll == NULL) {
8047 *status = U_ILLEGAL_ARGUMENT_ERROR;
8048 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status)
8049 return UCOL_EQUAL;
8050 }
8051
8052 UCollationResult result = UCOL_EQUAL;
8053
8054 // Preparing the context objects for iterating over strings
8055 collIterate sColl, tColl;
8056 // The division for the array length may truncate the array size to
8057 // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
8058 // for all platforms anyway.
8059 UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
8060 UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
8061 UNormIterator *sNormIter = NULL, *tNormIter = NULL;
8062
8063 IInit_collIterate(coll, NULL, -1, &sColl);
8064 sColl.iterator = sIter;
8065 sColl.flags |= UCOL_USE_ITERATOR;
8066 IInit_collIterate(coll, NULL, -1, &tColl);
8067 tColl.flags |= UCOL_USE_ITERATOR;
8068 tColl.iterator = tIter;
8069
8070 if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON) {
8071 sNormIter = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), status);
8072 sColl.iterator = unorm_setIter(sNormIter, sIter, UNORM_FCD, status);
8073 sColl.flags &= ~UCOL_ITER_NORM;
8074
8075 tNormIter = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), status);
8076 tColl.iterator = unorm_setIter(tNormIter, tIter, UNORM_FCD, status);
8077 tColl.flags &= ~UCOL_ITER_NORM;
8078 }
8079
8080 UChar32 sChar = U_SENTINEL, tChar = U_SENTINEL;
8081
8082 while((sChar = sColl.iterator->next(sColl.iterator)) ==
8083 (tChar = tColl.iterator->next(tColl.iterator))) {
8084 if(sChar == U_SENTINEL) {
8085 result = UCOL_EQUAL;
8086 goto end_compare;
8087 }
8088 }
8089
8090 if(sChar == U_SENTINEL) {
8091 tChar = tColl.iterator->previous(tColl.iterator);
8092 }
8093
8094 if(tChar == U_SENTINEL) {
8095 sChar = sColl.iterator->previous(sColl.iterator);
8096 }
8097
8098 sChar = sColl.iterator->previous(sColl.iterator);
8099 tChar = tColl.iterator->previous(tColl.iterator);
8100
8101 if (ucol_unsafeCP((UChar)sChar, coll) || ucol_unsafeCP((UChar)tChar, coll))
8102 {
8103 // We are stopped in the middle of a contraction.
8104 // Scan backwards through the == part of the string looking for the start of the contraction.
8105 // It doesn't matter which string we scan, since they are the same in this region.
8106 do
8107 {
8108 sChar = sColl.iterator->previous(sColl.iterator);
8109 tChar = tColl.iterator->previous(tColl.iterator);
8110 }
8111 while (sChar != U_SENTINEL && ucol_unsafeCP((UChar)sChar, coll));
8112 }
8113
8114
8115 if(U_SUCCESS(*status)) {
8116 result = ucol_strcollRegular(&sColl, &tColl, status);
8117 }
8118
8119 end_compare:
8120 if(sNormIter || tNormIter) {
8121 unorm_closeIter(sNormIter);
8122 unorm_closeIter(tNormIter);
8123 }
8124
8125 UTRACE_EXIT_VALUE_STATUS(result, *status)
8126 return result;
8127 }
8128
8129
8130
8131 /* */
8132 /* ucol_strcoll Main public API string comparison function */
8133 /* */
8134 U_CAPI UCollationResult U_EXPORT2
ucol_strcoll(const UCollator * coll,const UChar * source,int32_t sourceLength,const UChar * target,int32_t targetLength)8135 ucol_strcoll( const UCollator *coll,
8136 const UChar *source,
8137 int32_t sourceLength,
8138 const UChar *target,
8139 int32_t targetLength) {
8140 U_ALIGN_CODE(16);
8141
8142 UTRACE_ENTRY(UTRACE_UCOL_STRCOLL);
8143 if (UTRACE_LEVEL(UTRACE_VERBOSE)) {
8144 UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source=%p, target=%p", coll, source, target);
8145 UTRACE_DATA2(UTRACE_VERBOSE, "source string = %vh ", source, sourceLength);
8146 UTRACE_DATA2(UTRACE_VERBOSE, "target string = %vh ", target, targetLength);
8147 }
8148
8149 UErrorCode status = U_ZERO_ERROR;
8150 if(source == NULL || target == NULL) {
8151 // do not crash, but return. Should have
8152 // status argument to return error.
8153 UTRACE_EXIT_VALUE(UTRACE_UCOL_STRCOLL);
8154 return UCOL_EQUAL;
8155 }
8156 collIterate sColl, tColl;
8157
8158 /* Scan the strings. Find: */
8159 /* The length of any leading portion that is equal */
8160 /* Whether they are exactly equal. (in which case we just return) */
8161 const UChar *pSrc = source;
8162 const UChar *pTarg = target;
8163 int32_t equalLength;
8164
8165 if (sourceLength == -1 && targetLength == -1) {
8166 // Both strings are null terminated.
8167 // Check for them being the same string, and scan through
8168 // any leading equal portion.
8169 if (source==target) {
8170 UTRACE_EXIT_VALUE(UCOL_EQUAL);
8171 return UCOL_EQUAL;
8172 }
8173
8174 for (;;) {
8175 if ( *pSrc != *pTarg || *pSrc == 0) {
8176 break;
8177 }
8178 pSrc++;
8179 pTarg++;
8180 }
8181 if (*pSrc == 0 && *pTarg == 0) {
8182 UTRACE_EXIT_VALUE(UCOL_EQUAL);
8183 return UCOL_EQUAL;
8184 }
8185 equalLength = pSrc - source;
8186 }
8187 else
8188 {
8189 // One or both strings has an explicit length.
8190 /* check if source and target are same strings */
8191
8192 if (source==target && sourceLength==targetLength) {
8193 UTRACE_EXIT_VALUE(UCOL_EQUAL);
8194 return UCOL_EQUAL;
8195 }
8196 const UChar *pSrcEnd = source + sourceLength;
8197 const UChar *pTargEnd = target + targetLength;
8198
8199
8200 // Scan while the strings are bitwise ==, or until one is exhausted.
8201 for (;;) {
8202 if (pSrc == pSrcEnd || pTarg == pTargEnd) {
8203 break;
8204 }
8205 if ((*pSrc == 0 && sourceLength == -1) || (*pTarg == 0 && targetLength == -1)) {
8206 break;
8207 }
8208 if (*pSrc != *pTarg) {
8209 break;
8210 }
8211 pSrc++;
8212 pTarg++;
8213 }
8214 equalLength = pSrc - source;
8215
8216 // If we made it all the way through both strings, we are done. They are ==
8217 if ((pSrc ==pSrcEnd || (pSrcEnd <pSrc && *pSrc==0)) && /* At end of src string, however it was specified. */
8218 (pTarg==pTargEnd || (pTargEnd<pTarg && *pTarg==0))) { /* and also at end of dest string */
8219 UTRACE_EXIT_VALUE(UCOL_EQUAL);
8220 return UCOL_EQUAL;
8221 }
8222 }
8223 if (equalLength > 0) {
8224 /* There is an identical portion at the beginning of the two strings. */
8225 /* If the identical portion ends within a contraction or a comibining */
8226 /* character sequence, back up to the start of that sequence. */
8227 pSrc = source + equalLength; /* point to the first differing chars */
8228 pTarg = target + equalLength;
8229 if (pSrc != source+sourceLength && ucol_unsafeCP(*pSrc, coll) ||
8230 pTarg != target+targetLength && ucol_unsafeCP(*pTarg, coll))
8231 {
8232 // We are stopped in the middle of a contraction.
8233 // Scan backwards through the == part of the string looking for the start of the contraction.
8234 // It doesn't matter which string we scan, since they are the same in this region.
8235 do
8236 {
8237 equalLength--;
8238 pSrc--;
8239 }
8240 while (equalLength>0 && ucol_unsafeCP(*pSrc, coll));
8241 }
8242
8243 source += equalLength;
8244 target += equalLength;
8245 if (sourceLength > 0) {
8246 sourceLength -= equalLength;
8247 }
8248 if (targetLength > 0) {
8249 targetLength -= equalLength;
8250 }
8251 }
8252
8253 UCollationResult returnVal;
8254 if(!coll->latinOneUse || (sourceLength > 0 && *source&0xff00) || (targetLength > 0 && *target&0xff00)) {
8255 // Preparing the context objects for iterating over strings
8256 IInit_collIterate(coll, source, sourceLength, &sColl);
8257 IInit_collIterate(coll, target, targetLength, &tColl);
8258 returnVal = ucol_strcollRegular(&sColl, &tColl, &status);
8259 } else {
8260 returnVal = ucol_strcollUseLatin1(coll, source, sourceLength, target, targetLength, &status);
8261 }
8262 UTRACE_EXIT_VALUE(returnVal);
8263 return returnVal;
8264 }
8265
8266 /* convenience function for comparing strings */
8267 U_CAPI UBool U_EXPORT2
ucol_greater(const UCollator * coll,const UChar * source,int32_t sourceLength,const UChar * target,int32_t targetLength)8268 ucol_greater( const UCollator *coll,
8269 const UChar *source,
8270 int32_t sourceLength,
8271 const UChar *target,
8272 int32_t targetLength)
8273 {
8274 return (ucol_strcoll(coll, source, sourceLength, target, targetLength)
8275 == UCOL_GREATER);
8276 }
8277
8278 /* convenience function for comparing strings */
8279 U_CAPI UBool U_EXPORT2
ucol_greaterOrEqual(const UCollator * coll,const UChar * source,int32_t sourceLength,const UChar * target,int32_t targetLength)8280 ucol_greaterOrEqual( const UCollator *coll,
8281 const UChar *source,
8282 int32_t sourceLength,
8283 const UChar *target,
8284 int32_t targetLength)
8285 {
8286 return (ucol_strcoll(coll, source, sourceLength, target, targetLength)
8287 != UCOL_LESS);
8288 }
8289
8290 /* convenience function for comparing strings */
8291 U_CAPI UBool U_EXPORT2
ucol_equal(const UCollator * coll,const UChar * source,int32_t sourceLength,const UChar * target,int32_t targetLength)8292 ucol_equal( const UCollator *coll,
8293 const UChar *source,
8294 int32_t sourceLength,
8295 const UChar *target,
8296 int32_t targetLength)
8297 {
8298 return (ucol_strcoll(coll, source, sourceLength, target, targetLength)
8299 == UCOL_EQUAL);
8300 }
8301
8302 U_CAPI void U_EXPORT2
ucol_getUCAVersion(const UCollator * coll,UVersionInfo info)8303 ucol_getUCAVersion(const UCollator* coll, UVersionInfo info) {
8304 if(coll && coll->UCA) {
8305 uprv_memcpy(info, coll->UCA->image->UCAVersion, sizeof(UVersionInfo));
8306 }
8307 }
8308
8309 U_CAPI int32_t U_EXPORT2
ucol_cloneBinary(const UCollator * coll,uint8_t * buffer,int32_t capacity,UErrorCode * status)8310 ucol_cloneBinary(const UCollator *coll,
8311 uint8_t *buffer, int32_t capacity,
8312 UErrorCode *status)
8313 {
8314 int32_t length = 0;
8315 if(U_FAILURE(*status)) {
8316 return length;
8317 }
8318 if(capacity < 0) {
8319 *status = U_ILLEGAL_ARGUMENT_ERROR;
8320 return length;
8321 }
8322 if(coll->hasRealData == TRUE) {
8323 length = coll->image->size;
8324 if(length <= capacity) {
8325 uprv_memcpy(buffer, coll->image, length);
8326 } else {
8327 *status = U_BUFFER_OVERFLOW_ERROR;
8328 }
8329 } else {
8330 length = (int32_t)(paddedsize(sizeof(UCATableHeader))+paddedsize(sizeof(UColOptionSet)));
8331 if(length <= capacity) {
8332 /* build the UCATableHeader with minimal entries */
8333 /* do not copy the header from the UCA file because its values are wrong! */
8334 /* uprv_memcpy(result, UCA->image, sizeof(UCATableHeader)); */
8335
8336 /* reset everything */
8337 uprv_memset(buffer, 0, length);
8338
8339 /* set the tailoring-specific values */
8340 UCATableHeader *myData = (UCATableHeader *)buffer;
8341 myData->size = length;
8342
8343 /* offset for the options, the only part of the data that is present after the header */
8344 myData->options = sizeof(UCATableHeader);
8345
8346 /* need to always set the expansion value for an upper bound of the options */
8347 myData->expansion = myData->options + sizeof(UColOptionSet);
8348
8349 myData->magic = UCOL_HEADER_MAGIC;
8350 myData->isBigEndian = U_IS_BIG_ENDIAN;
8351 myData->charSetFamily = U_CHARSET_FAMILY;
8352
8353 /* copy UCA's version; genrb will override all but the builder version with tailoring data */
8354 uprv_memcpy(myData->version, coll->image->version, sizeof(UVersionInfo));
8355
8356 uprv_memcpy(myData->UCAVersion, coll->image->UCAVersion, sizeof(UVersionInfo));
8357 uprv_memcpy(myData->UCDVersion, coll->image->UCDVersion, sizeof(UVersionInfo));
8358 uprv_memcpy(myData->formatVersion, coll->image->formatVersion, sizeof(UVersionInfo));
8359 myData->jamoSpecial = coll->image->jamoSpecial;
8360
8361 /* copy the collator options */
8362 uprv_memcpy(buffer+paddedsize(sizeof(UCATableHeader)), coll->options, sizeof(UColOptionSet));
8363 } else {
8364 *status = U_BUFFER_OVERFLOW_ERROR;
8365 }
8366 }
8367 return length;
8368 }
8369
8370 U_CAPI void U_EXPORT2
ucol_forgetUCA(void)8371 ucol_forgetUCA(void)
8372 {
8373 _staticUCA = NULL;
8374 UCA_DATA_MEM = NULL;
8375 }
8376
8377 #endif /* #if !UCONFIG_NO_COLLATION */
8378
8379