1 /*
2 *******************************************************************************
3 * Copyright (C) 1996-2010, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 *******************************************************************************
6 * file name: ucol.cpp
7 * encoding: US-ASCII
8 * tab size: 8 (not used)
9 * indentation:4
10 *
11 * Modification history
12 * Date Name Comments
13 * 1996-1999 various members of ICU team maintained C API for collation framework
14 * 02/16/2001 synwee Added internal method getPrevSpecialCE
15 * 03/01/2001 synwee Added maxexpansion functionality.
16 * 03/16/2001 weiv Collation framework is rewritten in C and made UCA compliant
17 */
18
19 #include "unicode/utypes.h"
20
21 #if !UCONFIG_NO_COLLATION
22
23 #include "unicode/coleitr.h"
24 #include "unicode/unorm.h"
25 #include "unicode/udata.h"
26 #include "unicode/ustring.h"
27
28 #include "ucol_imp.h"
29 #include "bocsu.h"
30
31 #include "normalizer2impl.h"
32 #include "unorm_it.h"
33 #include "umutex.h"
34 #include "cmemory.h"
35 #include "ucln_in.h"
36 #include "cstring.h"
37 #include "utracimp.h"
38 #include "putilimp.h"
39 #include "uassert.h"
40
41 #ifdef UCOL_DEBUG
42 #include <stdio.h>
43 #endif
44
45 U_NAMESPACE_USE
46
47 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
48
49 #define LAST_BYTE_MASK_ 0xFF
50 #define SECOND_LAST_BYTE_SHIFT_ 8
51
52 #define ZERO_CC_LIMIT_ 0xC0
53
54 // this is static pointer to the normalizer fcdTrieIndex
55 // it is always the same between calls to u_cleanup
56 // and therefore writing to it is not synchronized.
57 // It is cleaned in ucol_cleanup
58 static const uint16_t *fcdTrieIndex=NULL;
59 // Code points at fcdHighStart and above have a zero FCD value.
60 static UChar32 fcdHighStart = 0;
61
// These are values from the UCA that are required for
// implicit generation and for suppressing sort key compression.
// They should normally come from the UCA data, but if one
// is running without the UCA, that could be a problem.
66 static const int32_t maxRegularPrimary = 0xA0;
67 static const int32_t minImplicitPrimary = 0xE0;
68 static const int32_t maxImplicitPrimary = 0xE4;
69
70 U_CDECL_BEGIN
71 static UBool U_CALLCONV
ucol_cleanup(void)72 ucol_cleanup(void)
73 {
74 fcdTrieIndex = NULL;
75 return TRUE;
76 }
77
78 static int32_t U_CALLCONV
_getFoldingOffset(uint32_t data)79 _getFoldingOffset(uint32_t data) {
80 return (int32_t)(data&0xFFFFFF);
81 }
82
83 U_CDECL_END
84
85 static
IInit_collIterate(const UCollator * collator,const UChar * sourceString,int32_t sourceLen,collIterate * s,UErrorCode * status)86 inline void IInit_collIterate(const UCollator *collator, const UChar *sourceString,
87 int32_t sourceLen, collIterate *s,
88 UErrorCode *status)
89 {
90 (s)->string = (s)->pos = sourceString;
91 (s)->origFlags = 0;
92 (s)->flags = 0;
93 if (sourceLen >= 0) {
94 s->flags |= UCOL_ITER_HASLEN;
95 (s)->endp = (UChar *)sourceString+sourceLen;
96 }
97 else {
98 /* change to enable easier checking for end of string for fcdpositon */
99 (s)->endp = NULL;
100 }
101 (s)->extendCEs = NULL;
102 (s)->extendCEsSize = 0;
103 (s)->CEpos = (s)->toReturn = (s)->CEs;
104 (s)->offsetBuffer = NULL;
105 (s)->offsetBufferSize = 0;
106 (s)->offsetReturn = (s)->offsetStore = NULL;
107 (s)->offsetRepeatCount = (s)->offsetRepeatValue = 0;
108 (s)->coll = (collator);
109 (s)->nfd = Normalizer2Factory::getNFDInstance(*status);
110 (s)->fcdPosition = 0;
111 if(collator->normalizationMode == UCOL_ON) {
112 (s)->flags |= UCOL_ITER_NORM;
113 }
114 if(collator->hiraganaQ == UCOL_ON && collator->strength >= UCOL_QUATERNARY) {
115 (s)->flags |= UCOL_HIRAGANA_Q;
116 }
117 (s)->iterator = NULL;
118 //(s)->iteratorIndex = 0;
119 }
120
121 U_CAPI void U_EXPORT2
uprv_init_collIterate(const UCollator * collator,const UChar * sourceString,int32_t sourceLen,collIterate * s,UErrorCode * status)122 uprv_init_collIterate(const UCollator *collator, const UChar *sourceString,
123 int32_t sourceLen, collIterate *s,
124 UErrorCode *status) {
125 /* Out-of-line version for use from other files. */
126 IInit_collIterate(collator, sourceString, sourceLen, s, status);
127 }
128
129 U_CAPI collIterate * U_EXPORT2
uprv_new_collIterate(UErrorCode * status)130 uprv_new_collIterate(UErrorCode *status) {
131 if(U_FAILURE(*status)) {
132 return NULL;
133 }
134 collIterate *s = new collIterate;
135 if(s == NULL) {
136 *status = U_MEMORY_ALLOCATION_ERROR;
137 return NULL;
138 }
139 return s;
140 }
141
142 U_CAPI void U_EXPORT2
uprv_delete_collIterate(collIterate * s)143 uprv_delete_collIterate(collIterate *s) {
144 delete s;
145 }
146
147 U_CAPI UBool U_EXPORT2
uprv_collIterateAtEnd(collIterate * s)148 uprv_collIterateAtEnd(collIterate *s) {
149 return s == NULL || s->pos == s->endp;
150 }
151
152 /**
153 * Backup the state of the collIterate struct data
154 * @param data collIterate to backup
155 * @param backup storage
156 */
157 static
backupState(const collIterate * data,collIterateState * backup)158 inline void backupState(const collIterate *data, collIterateState *backup)
159 {
160 backup->fcdPosition = data->fcdPosition;
161 backup->flags = data->flags;
162 backup->origFlags = data->origFlags;
163 backup->pos = data->pos;
164 backup->bufferaddress = data->writableBuffer.getBuffer();
165 backup->buffersize = data->writableBuffer.length();
166 backup->iteratorMove = 0;
167 backup->iteratorIndex = 0;
168 if(data->iterator != NULL) {
169 //backup->iteratorIndex = data->iterator->getIndex(data->iterator, UITER_CURRENT);
170 backup->iteratorIndex = data->iterator->getState(data->iterator);
171 // no we try to fixup if we're using a normalizing iterator and we get UITER_NO_STATE
172 if(backup->iteratorIndex == UITER_NO_STATE) {
173 while((backup->iteratorIndex = data->iterator->getState(data->iterator)) == UITER_NO_STATE) {
174 backup->iteratorMove++;
175 data->iterator->move(data->iterator, -1, UITER_CURRENT);
176 }
177 data->iterator->move(data->iterator, backup->iteratorMove, UITER_CURRENT);
178 }
179 }
180 }
181
182 /**
183 * Loads the state into the collIterate struct data
184 * @param data collIterate to backup
185 * @param backup storage
186 * @param forwards boolean to indicate if forwards iteration is used,
187 * false indicates backwards iteration
188 */
189 static
loadState(collIterate * data,const collIterateState * backup,UBool forwards)190 inline void loadState(collIterate *data, const collIterateState *backup,
191 UBool forwards)
192 {
193 UErrorCode status = U_ZERO_ERROR;
194 data->flags = backup->flags;
195 data->origFlags = backup->origFlags;
196 if(data->iterator != NULL) {
197 //data->iterator->move(data->iterator, backup->iteratorIndex, UITER_ZERO);
198 data->iterator->setState(data->iterator, backup->iteratorIndex, &status);
199 if(backup->iteratorMove != 0) {
200 data->iterator->move(data->iterator, backup->iteratorMove, UITER_CURRENT);
201 }
202 }
203 data->pos = backup->pos;
204
205 if ((data->flags & UCOL_ITER_INNORMBUF) &&
206 data->writableBuffer.getBuffer() != backup->bufferaddress) {
207 /*
208 this is when a new buffer has been reallocated and we'll have to
209 calculate the new position.
210 note the new buffer has to contain the contents of the old buffer.
211 */
212 if (forwards) {
213 data->pos = data->writableBuffer.getTerminatedBuffer() +
214 (data->pos - backup->bufferaddress);
215 }
216 else {
217 /* backwards direction */
218 int32_t temp = backup->buffersize -
219 (int32_t)(data->pos - backup->bufferaddress);
220 data->pos = data->writableBuffer.getTerminatedBuffer() + (data->writableBuffer.length() - temp);
221 }
222 }
223 if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
224 /*
225 this is alittle tricky.
226 if we are initially not in the normalization buffer, even if we
227 normalize in the later stage, the data in the buffer will be
228 ignored, since we skip back up to the data string.
229 however if we are already in the normalization buffer, any
230 further normalization will pull data into the normalization
231 buffer and modify the fcdPosition.
232 since we are keeping the data in the buffer for use, the
233 fcdPosition can not be reverted back.
234 arrgghh....
235 */
236 data->fcdPosition = backup->fcdPosition;
237 }
238 }
239
240 static UBool
reallocCEs(collIterate * data,int32_t newCapacity)241 reallocCEs(collIterate *data, int32_t newCapacity) {
242 uint32_t *oldCEs = data->extendCEs;
243 if(oldCEs == NULL) {
244 oldCEs = data->CEs;
245 }
246 int32_t length = data->CEpos - oldCEs;
247 uint32_t *newCEs = (uint32_t *)uprv_malloc(newCapacity * 4);
248 if(newCEs == NULL) {
249 return FALSE;
250 }
251 uprv_memcpy(newCEs, oldCEs, length * 4);
252 uprv_free(data->extendCEs);
253 data->extendCEs = newCEs;
254 data->extendCEsSize = newCapacity;
255 data->CEpos = newCEs + length;
256 return TRUE;
257 }
258
259 static UBool
increaseCEsCapacity(collIterate * data)260 increaseCEsCapacity(collIterate *data) {
261 int32_t oldCapacity;
262 if(data->extendCEs != NULL) {
263 oldCapacity = data->extendCEsSize;
264 } else {
265 oldCapacity = LENGTHOF(data->CEs);
266 }
267 return reallocCEs(data, 2 * oldCapacity);
268 }
269
270 static UBool
ensureCEsCapacity(collIterate * data,int32_t minCapacity)271 ensureCEsCapacity(collIterate *data, int32_t minCapacity) {
272 int32_t oldCapacity;
273 if(data->extendCEs != NULL) {
274 oldCapacity = data->extendCEsSize;
275 } else {
276 oldCapacity = LENGTHOF(data->CEs);
277 }
278 if(minCapacity <= oldCapacity) {
279 return TRUE;
280 }
281 oldCapacity *= 2;
282 return reallocCEs(data, minCapacity > oldCapacity ? minCapacity : oldCapacity);
283 }
284
285 /*
286 * collIter_eos()
287 * Checks for a collIterate being positioned at the end of
288 * its source string.
289 *
290 */
291 static
collIter_eos(collIterate * s)292 inline UBool collIter_eos(collIterate *s) {
293 if(s->flags & UCOL_USE_ITERATOR) {
294 return !(s->iterator->hasNext(s->iterator));
295 }
296 if ((s->flags & UCOL_ITER_HASLEN) == 0 && *s->pos != 0) {
297 // Null terminated string, but not at null, so not at end.
298 // Whether in main or normalization buffer doesn't matter.
299 return FALSE;
300 }
301
302 // String with length. Can't be in normalization buffer, which is always
303 // null termintated.
304 if (s->flags & UCOL_ITER_HASLEN) {
305 return (s->pos == s->endp);
306 }
307
308 // We are at a null termination, could be either normalization buffer or main string.
309 if ((s->flags & UCOL_ITER_INNORMBUF) == 0) {
310 // At null at end of main string.
311 return TRUE;
312 }
313
314 // At null at end of normalization buffer. Need to check whether there there are
315 // any characters left in the main buffer.
316 if(s->origFlags & UCOL_USE_ITERATOR) {
317 return !(s->iterator->hasNext(s->iterator));
318 } else if ((s->origFlags & UCOL_ITER_HASLEN) == 0) {
319 // Null terminated main string. fcdPosition is the 'return' position into main buf.
320 return (*s->fcdPosition == 0);
321 }
322 else {
323 // Main string with an end pointer.
324 return s->fcdPosition == s->endp;
325 }
326 }
327
328 /*
329 * collIter_bos()
330 * Checks for a collIterate being positioned at the start of
331 * its source string.
332 *
333 */
334 static
collIter_bos(collIterate * source)335 inline UBool collIter_bos(collIterate *source) {
336 // if we're going backwards, we need to know whether there is more in the
337 // iterator, even if we are in the side buffer
338 if(source->flags & UCOL_USE_ITERATOR || source->origFlags & UCOL_USE_ITERATOR) {
339 return !source->iterator->hasPrevious(source->iterator);
340 }
341 if (source->pos <= source->string ||
342 ((source->flags & UCOL_ITER_INNORMBUF) &&
343 *(source->pos - 1) == 0 && source->fcdPosition == NULL)) {
344 return TRUE;
345 }
346 return FALSE;
347 }
348
349 /*static
350 inline UBool collIter_SimpleBos(collIterate *source) {
351 // if we're going backwards, we need to know whether there is more in the
352 // iterator, even if we are in the side buffer
353 if(source->flags & UCOL_USE_ITERATOR || source->origFlags & UCOL_USE_ITERATOR) {
354 return !source->iterator->hasPrevious(source->iterator);
355 }
356 if (source->pos == source->string) {
357 return TRUE;
358 }
359 return FALSE;
360 }*/
361 //return (data->pos == data->string) ||
362
363
364 /****************************************************************************/
365 /* Following are the open/close functions */
366 /* */
367 /****************************************************************************/
368
369 static UCollator*
ucol_initFromBinary(const uint8_t * bin,int32_t length,const UCollator * base,UCollator * fillIn,UErrorCode * status)370 ucol_initFromBinary(const uint8_t *bin, int32_t length,
371 const UCollator *base,
372 UCollator *fillIn,
373 UErrorCode *status)
374 {
375 UCollator *result = fillIn;
376 if(U_FAILURE(*status)) {
377 return NULL;
378 }
379 /*
380 if(base == NULL) {
381 // we don't support null base yet
382 *status = U_ILLEGAL_ARGUMENT_ERROR;
383 return NULL;
384 }
385 */
386 // We need these and we could be running without UCA
387 uprv_uca_initImplicitConstants(status);
388 UCATableHeader *colData = (UCATableHeader *)bin;
389 // do we want version check here? We're trying to figure out whether collators are compatible
390 if((base && (uprv_memcmp(colData->UCAVersion, base->image->UCAVersion, sizeof(UVersionInfo)) != 0 ||
391 uprv_memcmp(colData->UCDVersion, base->image->UCDVersion, sizeof(UVersionInfo)) != 0)) ||
392 colData->version[0] != UCOL_BUILDER_VERSION)
393 {
394 *status = U_COLLATOR_VERSION_MISMATCH;
395 return NULL;
396 }
397 else {
398 if((uint32_t)length > (paddedsize(sizeof(UCATableHeader)) + paddedsize(sizeof(UColOptionSet)))) {
399 result = ucol_initCollator((const UCATableHeader *)bin, result, base, status);
400 if(U_FAILURE(*status)){
401 return NULL;
402 }
403 result->hasRealData = TRUE;
404 }
405 else {
406 if(base) {
407 result = ucol_initCollator(base->image, result, base, status);
408 ucol_setOptionsFromHeader(result, (UColOptionSet *)(bin+((const UCATableHeader *)bin)->options), status);
409 if(U_FAILURE(*status)){
410 return NULL;
411 }
412 result->hasRealData = FALSE;
413 }
414 else {
415 *status = U_USELESS_COLLATOR_ERROR;
416 return NULL;
417 }
418 }
419 result->freeImageOnClose = FALSE;
420 }
421 result->actualLocale = NULL;
422 result->validLocale = NULL;
423 result->requestedLocale = NULL;
424 result->rules = NULL;
425 result->rulesLength = 0;
426 result->freeRulesOnClose = FALSE;
427 result->ucaRules = NULL;
428 return result;
429 }
430
431 U_CAPI UCollator* U_EXPORT2
ucol_openBinary(const uint8_t * bin,int32_t length,const UCollator * base,UErrorCode * status)432 ucol_openBinary(const uint8_t *bin, int32_t length,
433 const UCollator *base,
434 UErrorCode *status)
435 {
436 return ucol_initFromBinary(bin, length, base, NULL, status);
437 }
438
439 U_CAPI int32_t U_EXPORT2
ucol_cloneBinary(const UCollator * coll,uint8_t * buffer,int32_t capacity,UErrorCode * status)440 ucol_cloneBinary(const UCollator *coll,
441 uint8_t *buffer, int32_t capacity,
442 UErrorCode *status)
443 {
444 int32_t length = 0;
445 if(U_FAILURE(*status)) {
446 return length;
447 }
448 if(capacity < 0) {
449 *status = U_ILLEGAL_ARGUMENT_ERROR;
450 return length;
451 }
452 if(coll->hasRealData == TRUE) {
453 length = coll->image->size;
454 if(length <= capacity) {
455 uprv_memcpy(buffer, coll->image, length);
456 } else {
457 *status = U_BUFFER_OVERFLOW_ERROR;
458 }
459 } else {
460 length = (int32_t)(paddedsize(sizeof(UCATableHeader))+paddedsize(sizeof(UColOptionSet)));
461 if(length <= capacity) {
462 /* build the UCATableHeader with minimal entries */
463 /* do not copy the header from the UCA file because its values are wrong! */
464 /* uprv_memcpy(result, UCA->image, sizeof(UCATableHeader)); */
465
466 /* reset everything */
467 uprv_memset(buffer, 0, length);
468
469 /* set the tailoring-specific values */
470 UCATableHeader *myData = (UCATableHeader *)buffer;
471 myData->size = length;
472
473 /* offset for the options, the only part of the data that is present after the header */
474 myData->options = sizeof(UCATableHeader);
475
476 /* need to always set the expansion value for an upper bound of the options */
477 myData->expansion = myData->options + sizeof(UColOptionSet);
478
479 myData->magic = UCOL_HEADER_MAGIC;
480 myData->isBigEndian = U_IS_BIG_ENDIAN;
481 myData->charSetFamily = U_CHARSET_FAMILY;
482
483 /* copy UCA's version; genrb will override all but the builder version with tailoring data */
484 uprv_memcpy(myData->version, coll->image->version, sizeof(UVersionInfo));
485
486 uprv_memcpy(myData->UCAVersion, coll->image->UCAVersion, sizeof(UVersionInfo));
487 uprv_memcpy(myData->UCDVersion, coll->image->UCDVersion, sizeof(UVersionInfo));
488 uprv_memcpy(myData->formatVersion, coll->image->formatVersion, sizeof(UVersionInfo));
489 myData->jamoSpecial = coll->image->jamoSpecial;
490
491 /* copy the collator options */
492 uprv_memcpy(buffer+paddedsize(sizeof(UCATableHeader)), coll->options, sizeof(UColOptionSet));
493 } else {
494 *status = U_BUFFER_OVERFLOW_ERROR;
495 }
496 }
497 return length;
498 }
499
500 U_CAPI UCollator* U_EXPORT2
ucol_safeClone(const UCollator * coll,void * stackBuffer,int32_t * pBufferSize,UErrorCode * status)501 ucol_safeClone(const UCollator *coll, void *stackBuffer, int32_t * pBufferSize, UErrorCode *status)
502 {
503 UCollator * localCollator;
504 int32_t bufferSizeNeeded = (int32_t)sizeof(UCollator);
505 char *stackBufferChars = (char *)stackBuffer;
506 int32_t imageSize = 0;
507 int32_t rulesSize = 0;
508 int32_t rulesPadding = 0;
509 uint8_t *image;
510 UChar *rules;
511 UBool colAllocated = FALSE;
512 UBool imageAllocated = FALSE;
513
514 if (status == NULL || U_FAILURE(*status)){
515 return 0;
516 }
517 if ((stackBuffer && !pBufferSize) || !coll){
518 *status = U_ILLEGAL_ARGUMENT_ERROR;
519 return 0;
520 }
521 if (coll->rules && coll->freeRulesOnClose) {
522 rulesSize = (int32_t)(coll->rulesLength + 1)*sizeof(UChar);
523 rulesPadding = (int32_t)(bufferSizeNeeded % sizeof(UChar));
524 bufferSizeNeeded += rulesSize + rulesPadding;
525 }
526
527 if (stackBuffer && *pBufferSize <= 0){ /* 'preflighting' request - set needed size into *pBufferSize */
528 *pBufferSize = bufferSizeNeeded;
529 return 0;
530 }
531
532 /* Pointers on 64-bit platforms need to be aligned
533 * on a 64-bit boundry in memory.
534 */
535 if (U_ALIGNMENT_OFFSET(stackBuffer) != 0) {
536 int32_t offsetUp = (int32_t)U_ALIGNMENT_OFFSET_UP(stackBufferChars);
537 if (*pBufferSize > offsetUp) {
538 *pBufferSize -= offsetUp;
539 stackBufferChars += offsetUp;
540 }
541 else {
542 /* prevent using the stack buffer but keep the size > 0 so that we do not just preflight */
543 *pBufferSize = 1;
544 }
545 }
546 stackBuffer = (void *)stackBufferChars;
547
548 if (stackBuffer == NULL || *pBufferSize < bufferSizeNeeded) {
549 /* allocate one here...*/
550 stackBufferChars = (char *)uprv_malloc(bufferSizeNeeded);
551 // Null pointer check.
552 if (stackBufferChars == NULL) {
553 *status = U_MEMORY_ALLOCATION_ERROR;
554 return NULL;
555 }
556 colAllocated = TRUE;
557 if (U_SUCCESS(*status)) {
558 *status = U_SAFECLONE_ALLOCATED_WARNING;
559 }
560 }
561 localCollator = (UCollator *)stackBufferChars;
562 rules = (UChar *)(stackBufferChars + sizeof(UCollator) + rulesPadding);
563 {
564 UErrorCode tempStatus = U_ZERO_ERROR;
565 imageSize = ucol_cloneBinary(coll, NULL, 0, &tempStatus);
566 }
567 if (coll->freeImageOnClose) {
568 image = (uint8_t *)uprv_malloc(imageSize);
569 // Null pointer check
570 if (image == NULL) {
571 *status = U_MEMORY_ALLOCATION_ERROR;
572 return NULL;
573 }
574 ucol_cloneBinary(coll, image, imageSize, status);
575 imageAllocated = TRUE;
576 }
577 else {
578 image = (uint8_t *)coll->image;
579 }
580 localCollator = ucol_initFromBinary(image, imageSize, coll->UCA, localCollator, status);
581 if (U_FAILURE(*status)) {
582 return NULL;
583 }
584
585 if (coll->rules) {
586 if (coll->freeRulesOnClose) {
587 localCollator->rules = u_strcpy(rules, coll->rules);
588 //bufferEnd += rulesSize;
589 }
590 else {
591 localCollator->rules = coll->rules;
592 }
593 localCollator->freeRulesOnClose = FALSE;
594 localCollator->rulesLength = coll->rulesLength;
595 }
596
597 int32_t i;
598 for(i = 0; i < UCOL_ATTRIBUTE_COUNT; i++) {
599 ucol_setAttribute(localCollator, (UColAttribute)i, ucol_getAttribute(coll, (UColAttribute)i, status), status);
600 }
601 // zero copies of pointers
602 localCollator->actualLocale = NULL;
603 localCollator->validLocale = NULL;
604 localCollator->requestedLocale = NULL;
605 localCollator->ucaRules = coll->ucaRules; // There should only be one copy here.
606 localCollator->freeOnClose = colAllocated;
607 localCollator->freeImageOnClose = imageAllocated;
608 return localCollator;
609 }
610
611 U_CAPI void U_EXPORT2
ucol_close(UCollator * coll)612 ucol_close(UCollator *coll)
613 {
614 UTRACE_ENTRY_OC(UTRACE_UCOL_CLOSE);
615 UTRACE_DATA1(UTRACE_INFO, "coll = %p", coll);
616 if(coll != NULL) {
617 // these are always owned by each UCollator struct,
618 // so we always free them
619 if(coll->validLocale != NULL) {
620 uprv_free(coll->validLocale);
621 }
622 if(coll->actualLocale != NULL) {
623 uprv_free(coll->actualLocale);
624 }
625 if(coll->requestedLocale != NULL) {
626 uprv_free(coll->requestedLocale);
627 }
628 if(coll->latinOneCEs != NULL) {
629 uprv_free(coll->latinOneCEs);
630 }
631 if(coll->options != NULL && coll->freeOptionsOnClose) {
632 uprv_free(coll->options);
633 }
634 if(coll->rules != NULL && coll->freeRulesOnClose) {
635 uprv_free((UChar *)coll->rules);
636 }
637 if(coll->image != NULL && coll->freeImageOnClose) {
638 uprv_free((UCATableHeader *)coll->image);
639 }
640
641 /* Here, it would be advisable to close: */
642 /* - UData for UCA (unless we stuff it in the root resb */
643 /* Again, do we need additional housekeeping... HMMM! */
644 UTRACE_DATA1(UTRACE_INFO, "coll->freeOnClose: %d", coll->freeOnClose);
645 if(coll->freeOnClose){
646 /* for safeClone, if freeOnClose is FALSE,
647 don't free the other instance data */
648 uprv_free(coll);
649 }
650 }
651 UTRACE_EXIT();
652 }
653
654 /* This one is currently used by genrb & tests. After constructing from rules (tailoring),*/
655 /* you should be able to get the binary chunk to write out... Doesn't look very full now */
656 U_CFUNC uint8_t* U_EXPORT2
ucol_cloneRuleData(const UCollator * coll,int32_t * length,UErrorCode * status)657 ucol_cloneRuleData(const UCollator *coll, int32_t *length, UErrorCode *status)
658 {
659 uint8_t *result = NULL;
660 if(U_FAILURE(*status)) {
661 return NULL;
662 }
663 if(coll->hasRealData == TRUE) {
664 *length = coll->image->size;
665 result = (uint8_t *)uprv_malloc(*length);
666 /* test for NULL */
667 if (result == NULL) {
668 *status = U_MEMORY_ALLOCATION_ERROR;
669 return NULL;
670 }
671 uprv_memcpy(result, coll->image, *length);
672 } else {
673 *length = (int32_t)(paddedsize(sizeof(UCATableHeader))+paddedsize(sizeof(UColOptionSet)));
674 result = (uint8_t *)uprv_malloc(*length);
675 /* test for NULL */
676 if (result == NULL) {
677 *status = U_MEMORY_ALLOCATION_ERROR;
678 return NULL;
679 }
680
681 /* build the UCATableHeader with minimal entries */
682 /* do not copy the header from the UCA file because its values are wrong! */
683 /* uprv_memcpy(result, UCA->image, sizeof(UCATableHeader)); */
684
685 /* reset everything */
686 uprv_memset(result, 0, *length);
687
688 /* set the tailoring-specific values */
689 UCATableHeader *myData = (UCATableHeader *)result;
690 myData->size = *length;
691
692 /* offset for the options, the only part of the data that is present after the header */
693 myData->options = sizeof(UCATableHeader);
694
695 /* need to always set the expansion value for an upper bound of the options */
696 myData->expansion = myData->options + sizeof(UColOptionSet);
697
698 myData->magic = UCOL_HEADER_MAGIC;
699 myData->isBigEndian = U_IS_BIG_ENDIAN;
700 myData->charSetFamily = U_CHARSET_FAMILY;
701
702 /* copy UCA's version; genrb will override all but the builder version with tailoring data */
703 uprv_memcpy(myData->version, coll->image->version, sizeof(UVersionInfo));
704
705 uprv_memcpy(myData->UCAVersion, coll->image->UCAVersion, sizeof(UVersionInfo));
706 uprv_memcpy(myData->UCDVersion, coll->image->UCDVersion, sizeof(UVersionInfo));
707 uprv_memcpy(myData->formatVersion, coll->image->formatVersion, sizeof(UVersionInfo));
708 myData->jamoSpecial = coll->image->jamoSpecial;
709
710 /* copy the collator options */
711 uprv_memcpy(result+paddedsize(sizeof(UCATableHeader)), coll->options, sizeof(UColOptionSet));
712 }
713 return result;
714 }
715
ucol_setOptionsFromHeader(UCollator * result,UColOptionSet * opts,UErrorCode * status)716 void ucol_setOptionsFromHeader(UCollator* result, UColOptionSet * opts, UErrorCode *status) {
717 if(U_FAILURE(*status)) {
718 return;
719 }
720 result->caseFirst = (UColAttributeValue)opts->caseFirst;
721 result->caseLevel = (UColAttributeValue)opts->caseLevel;
722 result->frenchCollation = (UColAttributeValue)opts->frenchCollation;
723 result->normalizationMode = (UColAttributeValue)opts->normalizationMode;
724 result->strength = (UColAttributeValue)opts->strength;
725 result->variableTopValue = opts->variableTopValue;
726 result->alternateHandling = (UColAttributeValue)opts->alternateHandling;
727 result->hiraganaQ = (UColAttributeValue)opts->hiraganaQ;
728 result->numericCollation = (UColAttributeValue)opts->numericCollation;
729
730 result->caseFirstisDefault = TRUE;
731 result->caseLevelisDefault = TRUE;
732 result->frenchCollationisDefault = TRUE;
733 result->normalizationModeisDefault = TRUE;
734 result->strengthisDefault = TRUE;
735 result->variableTopValueisDefault = TRUE;
736 result->hiraganaQisDefault = TRUE;
737 result->numericCollationisDefault = TRUE;
738
739 ucol_updateInternalState(result, status);
740
741 result->options = opts;
742 }
743
744
745 /**
746 * Approximate determination if a character is at a contraction end.
747 * Guaranteed to be TRUE if a character is at the end of a contraction,
748 * otherwise it is not deterministic.
749 * @param c character to be determined
750 * @param coll collator
751 */
752 static
ucol_contractionEndCP(UChar c,const UCollator * coll)753 inline UBool ucol_contractionEndCP(UChar c, const UCollator *coll) {
754 if (c < coll->minContrEndCP) {
755 return FALSE;
756 }
757
758 int32_t hash = c;
759 uint8_t htbyte;
760 if (hash >= UCOL_UNSAFECP_TABLE_SIZE*8) {
761 if (U16_IS_TRAIL(c)) {
762 return TRUE;
763 }
764 hash = (hash & UCOL_UNSAFECP_TABLE_MASK) + 256;
765 }
766 htbyte = coll->contrEndCP[hash>>3];
767 return (((htbyte >> (hash & 7)) & 1) == 1);
768 }
769
770
771
772 /*
773 * i_getCombiningClass()
774 * A fast, at least partly inline version of u_getCombiningClass()
775 * This is a candidate for further optimization. Used heavily
776 * in contraction processing.
777 */
778 static
i_getCombiningClass(UChar32 c,const UCollator * coll)779 inline uint8_t i_getCombiningClass(UChar32 c, const UCollator *coll) {
780 uint8_t sCC = 0;
781 if ((c >= 0x300 && ucol_unsafeCP(c, coll)) || c > 0xFFFF) {
782 sCC = u_getCombiningClass(c);
783 }
784 return sCC;
785 }
786
/**
 * Initializes a UCollator from a binary collation image.
 *
 * The collator does not copy the image: its table pointers (mapping trie,
 * contractions, expansions, options, unsafe and contraction-end tables, max
 * expansion tables) all point into the image.
 *
 * @param image  binary image starting with a UCATableHeader
 * @param fillIn UCollator to initialize, or NULL to allocate one with
 *               uprv_malloc(); freeOnClose records which of the two happened
 * @param UCA    stored in the collator's UCA field
 * @param status in/out error code
 * @return the initialized collator. NULL if status indicated a failure on
 *         entry, if image is NULL, or if the allocation failed. If
 *         unserializing the mapping trie fails, an allocated collator is
 *         freed and NULL is returned, whereas a fillIn collator is returned
 *         only partly initialized, with status set to the failure.
 */
UCollator* ucol_initCollator(const UCATableHeader *image, UCollator *fillIn, const UCollator *UCA, UErrorCode *status) {
    UChar c;
    if(U_FAILURE(*status) || image == NULL) {
        return NULL;
    }

    UCollator *result = fillIn;
    if(result != NULL) {
        result->freeOnClose = FALSE;
    } else {
        result = (UCollator *)uprv_malloc(sizeof(UCollator));
        if(result == NULL) {
            *status = U_MEMORY_ALLOCATION_ERROR;
            return NULL;
        }
        result->freeOnClose = TRUE;
    }

    // init FCD data
    if (fcdTrieIndex == NULL) {
        // The result is constant, until the library is reloaded.
        fcdTrieIndex = unorm_getFCDTrieIndex(fcdHighStart, status);
        ucln_i18n_registerCleanup(UCLN_I18N_UCOL, ucol_cleanup);
    }

    /* every table below is addressed as a byte offset from the image start */
    uint8_t *imageBytes = (uint8_t *)image;

    result->image = image;
    result->mapping.getFoldingOffset = _getFoldingOffset;
    const uint8_t *mapping = imageBytes + image->mappingPosition;
    utrie_unserialize(&result->mapping, mapping, image->endExpansionCE - image->mappingPosition, status);
    if(U_FAILURE(*status)) {
        if(result->freeOnClose == TRUE) {
            uprv_free(result);
            result = NULL;
        }
        return result;
    }

    result->latinOneMapping = UTRIE_GET32_LATIN1(&result->mapping);
    result->contractionCEs = (uint32_t*)(imageBytes + image->contractionCEs);
    result->contractionIndex = (UChar*)(imageBytes + image->contractionIndex);
    result->expansion = (uint32_t*)(imageBytes + image->expansion);

    result->options = (UColOptionSet*)(imageBytes + image->options);
    result->freeOptionsOnClose = FALSE;

    /* set attributes from the option set stored in the image */
    result->caseFirst = (UColAttributeValue)result->options->caseFirst;
    result->caseLevel = (UColAttributeValue)result->options->caseLevel;
    result->frenchCollation = (UColAttributeValue)result->options->frenchCollation;
    result->normalizationMode = (UColAttributeValue)result->options->normalizationMode;
    result->strength = (UColAttributeValue)result->options->strength;
    result->variableTopValue = result->options->variableTopValue;
    result->alternateHandling = (UColAttributeValue)result->options->alternateHandling;
    result->hiraganaQ = (UColAttributeValue)result->options->hiraganaQ;
    result->numericCollation = (UColAttributeValue)result->options->numericCollation;

    /* all attributes come straight from the image, so all are defaults */
    result->caseFirstisDefault = TRUE;
    result->caseLevelisDefault = TRUE;
    result->frenchCollationisDefault = TRUE;
    result->normalizationModeisDefault = TRUE;
    result->strengthisDefault = TRUE;
    result->variableTopValueisDefault = TRUE;
    result->alternateHandlingisDefault = TRUE;
    result->hiraganaQisDefault = TRUE;
    result->numericCollationisDefault = TRUE;

    result->rules = NULL;
    result->rulesLength = 0;
    result->freeRulesOnClose = FALSE;

    /* get the version info from UCATableHeader and populate the Collator struct */
    result->dataVersion[0] = image->version[0]; /* UCA Builder version */
    result->dataVersion[1] = image->version[1]; /* UCA Tailoring rules version */
    result->dataVersion[2] = 0;
    result->dataVersion[3] = 0;

    /* minUnsafeCP is reset to 0 before the scan below and set to the
       scan result afterwards; keep this order */
    result->unsafeCP = imageBytes + image->unsafeCP;
    result->minUnsafeCP = 0;
    for (c=0; c<0x300; c++) {  // Find the smallest unsafe char.
        if (ucol_unsafeCP(c, result)) {
            break;
        }
    }
    result->minUnsafeCP = c;

    /* same scheme for the smallest contraction-ending char:
       ucol_contractionEndCP() compares against minContrEndCP */
    result->contrEndCP = imageBytes + image->contrEndCP;
    result->minContrEndCP = 0;
    for (c=0; c<0x300; c++) {  // Find the Contraction-ending char.
        if (ucol_contractionEndCP(c, result)) {
            break;
        }
    }
    result->minContrEndCP = c;

    /* max expansion tables */
    result->endExpansionCE = (uint32_t*)(imageBytes + image->endExpansionCE);
    result->lastEndExpansionCE = result->endExpansionCE +
                                 image->endExpansionCECount - 1;
    result->expansionCESize = imageBytes + image->expansionCESize;

    result->latinOneCEs = NULL;

    result->latinOneRegenTable = FALSE;
    result->latinOneFailed = FALSE;
    result->UCA = UCA;

    ucol_updateInternalState(result, status);

    /* Normally these will be set correctly later. This is the default if you use UCA or the default. */
    result->ucaRules = NULL;
    result->actualLocale = NULL;
    result->validLocale = NULL;
    result->requestedLocale = NULL;
    result->hasRealData = FALSE; // real data lives in .dat file...
    result->freeImageOnClose = FALSE;

    return result;
}
909
910 /* new Mark's code */
911
912 /**
913 * For generation of Implicit CEs
914 * @author Davis
915 *
916 * Cleaned up so that changes can be made more easily.
917 * Old values:
918 # First Implicit: E26A792D
919 # Last Implicit: E3DC70C0
920 # First CJK: E0030300
921 # Last CJK: E0A9DD00
922 # First CJK_A: E0A9DF00
923 # Last CJK_A: E0DE3100
924 */
/* Following is a port of Mark's code for the new treatment of implicits.
 * It is positioned here, since ucol_initUCA needs to initialize the
 * variables below according to the data in the fractional UCA.
 */
929
930 /**
931 * Function used to:
932 * a) collapse the 2 different Han ranges from UCA into one (in the right order), and
933 * b) bump any non-CJK characters by 10FFFF.
934 * The relevant blocks are:
935 * A: 4E00..9FFF; CJK Unified Ideographs
936 * F900..FAFF; CJK Compatibility Ideographs
937 * B: 3400..4DBF; CJK Unified Ideographs Extension A
938 * 20000..XX; CJK Unified Ideographs Extension B (and others later on)
939 * As long as
940 * no new B characters are allocated between 4E00 and FAFF, and
941 * no new A characters are outside of this range,
942 * (very high probability) this simple code will work.
943 * The reordered blocks are:
944 * Block1 is CJK
945 * Block2 is CJK_COMPAT_USED
946 * Block3 is CJK_A
947 * (all contiguous)
948 * Any other CJK gets its normal code point
949 * Any non-CJK gets +10FFFF
950 * When we reorder Block1, we make sure that it is at the very start,
951 * so that it will use a 3-byte form.
 * Warning: we only pick up the compatibility characters that are
 * NOT decomposed, so that block is smaller!
954 */
955
956 // CONSTANTS
957 static const UChar32
958 NON_CJK_OFFSET = 0x110000,
959 UCOL_MAX_INPUT = 0x220001; // 2 * Unicode range + 2
960
/**
 * Parameters of the implicit-weight byte layout.
 * All of them start out as 0 and are filled in by initImplicitConstants().
 */
static int32_t final3Multiplier = 0;  // step between used final bytes, 3-byte form
static int32_t final4Multiplier = 0;  // step between used final bytes, 4-byte form
static int32_t final3Count = 0;       // number of final byte values, 3-byte form
static int32_t final4Count = 0;       // number of final byte values, 4-byte form
static int32_t medialCount = 0;       // number of values per medial byte
static int32_t min3Primary = 0;       // first lead byte of the 3-byte form
static int32_t min4Primary = 0;       // first lead byte of the 4-byte form
static int32_t max4Primary = 0;       // last lead byte of the 4-byte form
static int32_t minTrail = 0;          // smallest trail byte value
static int32_t maxTrail = 0;          // largest trail byte value
static int32_t max3Trail = 0;         // largest final byte, 3-byte form
static int32_t max4Trail = 0;         // largest final byte, 4-byte form
static int32_t min4Boundary = 0;      // first raw value encoded with 4 bytes
978
979 static const UChar32
980 CJK_BASE = 0x4E00,
981 CJK_LIMIT = 0x9FFF+1,
982 CJK_COMPAT_USED_BASE = 0xFA0E,
983 CJK_COMPAT_USED_LIMIT = 0xFA2F+1,
984 CJK_A_BASE = 0x3400,
985 CJK_A_LIMIT = 0x4DBF+1,
986 CJK_B_BASE = 0x20000,
987 CJK_B_LIMIT = 0x2A6DF+1;
988
swapCJK(UChar32 i)989 static UChar32 swapCJK(UChar32 i) {
990
991 if (i >= CJK_BASE) {
992 if (i < CJK_LIMIT) return i - CJK_BASE;
993
994 if (i < CJK_COMPAT_USED_BASE) return i + NON_CJK_OFFSET;
995
996 if (i < CJK_COMPAT_USED_LIMIT) return i - CJK_COMPAT_USED_BASE
997 + (CJK_LIMIT - CJK_BASE);
998 if (i < CJK_B_BASE) return i + NON_CJK_OFFSET;
999
1000 if (i < CJK_B_LIMIT) return i; // non-BMP-CJK
1001
1002 return i + NON_CJK_OFFSET; // non-CJK
1003 }
1004 if (i < CJK_A_BASE) return i + NON_CJK_OFFSET;
1005
1006 if (i < CJK_A_LIMIT) return i - CJK_A_BASE
1007 + (CJK_LIMIT - CJK_BASE)
1008 + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE);
1009 return i + NON_CJK_OFFSET; // non-CJK
1010 }
1011
1012 U_CAPI UChar32 U_EXPORT2
uprv_uca_getRawFromCodePoint(UChar32 i)1013 uprv_uca_getRawFromCodePoint(UChar32 i) {
1014 return swapCJK(i)+1;
1015 }
1016
1017 U_CAPI UChar32 U_EXPORT2
uprv_uca_getCodePointFromRaw(UChar32 i)1018 uprv_uca_getCodePointFromRaw(UChar32 i) {
1019 i--;
1020 UChar32 result = 0;
1021 if(i >= NON_CJK_OFFSET) {
1022 result = i - NON_CJK_OFFSET;
1023 } else if(i >= CJK_B_BASE) {
1024 result = i;
1025 } else if(i < CJK_A_LIMIT + (CJK_LIMIT - CJK_BASE) + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE)) { // rest of CJKs, compacted
1026 if(i < CJK_LIMIT - CJK_BASE) {
1027 result = i + CJK_BASE;
1028 } else if(i < (CJK_LIMIT - CJK_BASE) + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE)) {
1029 result = i + CJK_COMPAT_USED_BASE - (CJK_LIMIT - CJK_BASE);
1030 } else {
1031 result = i + CJK_A_BASE - (CJK_LIMIT - CJK_BASE) - (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE);
1032 }
1033 } else {
1034 result = -1;
1035 }
1036 return result;
1037 }
1038
1039 // GET IMPLICIT PRIMARY WEIGHTS
1040 // Return value is left justified primary key
1041 U_CAPI uint32_t U_EXPORT2
uprv_uca_getImplicitFromRaw(UChar32 cp)1042 uprv_uca_getImplicitFromRaw(UChar32 cp) {
1043 /*
1044 if (cp < 0 || cp > UCOL_MAX_INPUT) {
1045 throw new IllegalArgumentException("Code point out of range " + Utility.hex(cp));
1046 }
1047 */
1048 int32_t last0 = cp - min4Boundary;
1049 if (last0 < 0) {
1050 int32_t last1 = cp / final3Count;
1051 last0 = cp % final3Count;
1052
1053 int32_t last2 = last1 / medialCount;
1054 last1 %= medialCount;
1055
1056 last0 = minTrail + last0*final3Multiplier; // spread out, leaving gap at start
1057 last1 = minTrail + last1; // offset
1058 last2 = min3Primary + last2; // offset
1059 /*
1060 if (last2 >= min4Primary) {
1061 throw new IllegalArgumentException("4-byte out of range: " + Utility.hex(cp) + ", " + Utility.hex(last2));
1062 }
1063 */
1064 return (last2 << 24) + (last1 << 16) + (last0 << 8);
1065 } else {
1066 int32_t last1 = last0 / final4Count;
1067 last0 %= final4Count;
1068
1069 int32_t last2 = last1 / medialCount;
1070 last1 %= medialCount;
1071
1072 int32_t last3 = last2 / medialCount;
1073 last2 %= medialCount;
1074
1075 last0 = minTrail + last0*final4Multiplier; // spread out, leaving gap at start
1076 last1 = minTrail + last1; // offset
1077 last2 = minTrail + last2; // offset
1078 last3 = min4Primary + last3; // offset
1079 /*
1080 if (last3 > max4Primary) {
1081 throw new IllegalArgumentException("4-byte out of range: " + Utility.hex(cp) + ", " + Utility.hex(last3));
1082 }
1083 */
1084 return (last3 << 24) + (last2 << 16) + (last1 << 8) + last0;
1085 }
1086 }
1087
1088 static uint32_t U_EXPORT2
uprv_uca_getImplicitPrimary(UChar32 cp)1089 uprv_uca_getImplicitPrimary(UChar32 cp) {
1090 //if (DEBUG) System.out.println("Incoming: " + Utility.hex(cp));
1091
1092 cp = swapCJK(cp);
1093 cp++;
1094 // we now have a range of numbers from 0 to 21FFFF.
1095
1096 //if (DEBUG) System.out.println("CJK swapped: " + Utility.hex(cp));
1097
1098 return uprv_uca_getImplicitFromRaw(cp);
1099 }
1100
1101 /**
1102 * Converts implicit CE into raw integer ("code point")
1103 * @param implicit
1104 * @return -1 if illegal format
1105 */
1106 U_CAPI UChar32 U_EXPORT2
uprv_uca_getRawFromImplicit(uint32_t implicit)1107 uprv_uca_getRawFromImplicit(uint32_t implicit) {
1108 UChar32 result;
1109 UChar32 b3 = implicit & 0xFF;
1110 UChar32 b2 = (implicit >> 8) & 0xFF;
1111 UChar32 b1 = (implicit >> 16) & 0xFF;
1112 UChar32 b0 = (implicit >> 24) & 0xFF;
1113
1114 // simple parameter checks
1115 if (b0 < min3Primary || b0 > max4Primary
1116 || b1 < minTrail || b1 > maxTrail)
1117 return -1;
1118 // normal offsets
1119 b1 -= minTrail;
1120
1121 // take care of the final values, and compose
1122 if (b0 < min4Primary) {
1123 if (b2 < minTrail || b2 > max3Trail || b3 != 0)
1124 return -1;
1125 b2 -= minTrail;
1126 UChar32 remainder = b2 % final3Multiplier;
1127 if (remainder != 0)
1128 return -1;
1129 b0 -= min3Primary;
1130 b2 /= final3Multiplier;
1131 result = ((b0 * medialCount) + b1) * final3Count + b2;
1132 } else {
1133 if (b2 < minTrail || b2 > maxTrail
1134 || b3 < minTrail || b3 > max4Trail)
1135 return -1;
1136 b2 -= minTrail;
1137 b3 -= minTrail;
1138 UChar32 remainder = b3 % final4Multiplier;
1139 if (remainder != 0)
1140 return -1;
1141 b3 /= final4Multiplier;
1142 b0 -= min4Primary;
1143 result = (((b0 * medialCount) + b1) * medialCount + b2) * final4Count + b3 + min4Boundary;
1144 }
1145 // final check
1146 if (result < 0 || result > UCOL_MAX_INPUT)
1147 return -1;
1148 return result;
1149 }
1150
1151
/**
 * Returns 1 + (a-1)/b, which is a/b rounded up for positive a and b.
 */
static inline int32_t divideAndRoundUp(int a, int b) {
    const int32_t fullStepsBelow = (a - 1) / b;
    return fullStepsBelow + 1;
}
1155
1156 /* this function is either called from initUCA or from genUCA before
1157 * doing canonical closure for the UCA.
1158 */
1159
1160 /**
1161 * Set up to generate implicits.
1162 * Maintenance Note: this function may end up being called more than once, due
1163 * to threading races during initialization. Make sure that
1164 * none of the Constants is ever transiently assigned an
1165 * incorrect value.
1166 * @param minPrimary
1167 * @param maxPrimary
1168 * @param minTrail final byte
1169 * @param maxTrail final byte
1170 * @param gap3 the gap we leave for tailoring for 3-byte forms
1171 * @param gap4 the gap we leave for tailoring for 4-byte forms
1172 */
initImplicitConstants(int minPrimary,int maxPrimary,int minTrailIn,int maxTrailIn,int gap3,int primaries3count,UErrorCode * status)1173 static void initImplicitConstants(int minPrimary, int maxPrimary,
1174 int minTrailIn, int maxTrailIn,
1175 int gap3, int primaries3count,
1176 UErrorCode *status) {
1177 // some simple parameter checks
1178 if ((minPrimary < 0 || minPrimary >= maxPrimary || maxPrimary > 0xFF)
1179 || (minTrailIn < 0 || minTrailIn >= maxTrailIn || maxTrailIn > 0xFF)
1180 || (primaries3count < 1))
1181 {
1182 *status = U_ILLEGAL_ARGUMENT_ERROR;
1183 return;
1184 };
1185
1186 minTrail = minTrailIn;
1187 maxTrail = maxTrailIn;
1188
1189 min3Primary = minPrimary;
1190 max4Primary = maxPrimary;
1191 // compute constants for use later.
1192 // number of values we can use in trailing bytes
1193 // leave room for empty values between AND above, e.g. if gap = 2
1194 // range 3..7 => +3 -4 -5 -6 -7: so 1 value
1195 // range 3..8 => +3 -4 -5 +6 -7 -8: so 2 values
1196 // range 3..9 => +3 -4 -5 +6 -7 -8 -9: so 2 values
1197 final3Multiplier = gap3 + 1;
1198 final3Count = (maxTrail - minTrail + 1) / final3Multiplier;
1199 max3Trail = minTrail + (final3Count - 1) * final3Multiplier;
1200
1201 // medials can use full range
1202 medialCount = (maxTrail - minTrail + 1);
1203 // find out how many values fit in each form
1204 int32_t threeByteCount = medialCount * final3Count;
1205 // now determine where the 3/4 boundary is.
1206 // we use 3 bytes below the boundary, and 4 above
1207 int32_t primariesAvailable = maxPrimary - minPrimary + 1;
1208 int32_t primaries4count = primariesAvailable - primaries3count;
1209
1210
1211 int32_t min3ByteCoverage = primaries3count * threeByteCount;
1212 min4Primary = minPrimary + primaries3count;
1213 min4Boundary = min3ByteCoverage;
1214 // Now expand out the multiplier for the 4 bytes, and redo.
1215
1216 int32_t totalNeeded = UCOL_MAX_INPUT - min4Boundary;
1217 int32_t neededPerPrimaryByte = divideAndRoundUp(totalNeeded, primaries4count);
1218 int32_t neededPerFinalByte = divideAndRoundUp(neededPerPrimaryByte, medialCount * medialCount);
1219 int32_t gap4 = (maxTrail - minTrail - 1) / neededPerFinalByte;
1220 if (gap4 < 1) {
1221 *status = U_ILLEGAL_ARGUMENT_ERROR;
1222 return;
1223 }
1224 final4Multiplier = gap4 + 1;
1225 final4Count = neededPerFinalByte;
1226 max4Trail = minTrail + (final4Count - 1) * final4Multiplier;
1227 }
1228
1229 /**
1230 * Supply parameters for generating implicit CEs
1231 */
1232 U_CAPI void U_EXPORT2
uprv_uca_initImplicitConstants(UErrorCode * status)1233 uprv_uca_initImplicitConstants(UErrorCode *status) {
1234 // 13 is the largest 4-byte gap we can use without getting 2 four-byte forms.
1235 //initImplicitConstants(minPrimary, maxPrimary, 0x04, 0xFE, 1, 1, status);
1236 initImplicitConstants(minImplicitPrimary, maxImplicitPrimary, 0x04, 0xFE, 1, 1, status);
1237 }
1238
1239
1240 /* collIterNormalize Incremental Normalization happens here. */
1241 /* pick up the range of chars identifed by FCD, */
1242 /* normalize it into the collIterate's writable buffer, */
1243 /* switch the collIterate's state to use the writable buffer. */
1244 /* */
1245 static
collIterNormalize(collIterate * collationSource)1246 void collIterNormalize(collIterate *collationSource)
1247 {
1248 UErrorCode status = U_ZERO_ERROR;
1249 const UChar *srcP = collationSource->pos - 1; /* Start of chars to normalize */
1250 const UChar *endP = collationSource->fcdPosition; /* End of region to normalize+1 */
1251
1252 collationSource->nfd->normalize(UnicodeString(FALSE, srcP, (int32_t)(endP - srcP)),
1253 collationSource->writableBuffer,
1254 status);
1255 if (U_FAILURE(status)) {
1256 #ifdef UCOL_DEBUG
1257 fprintf(stderr, "collIterNormalize(), NFD failed, status = %s\n", u_errorName(status));
1258 #endif
1259 return;
1260 }
1261
1262 collationSource->pos = collationSource->writableBuffer.getTerminatedBuffer();
1263 collationSource->origFlags = collationSource->flags;
1264 collationSource->flags |= UCOL_ITER_INNORMBUF;
1265 collationSource->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR);
1266 }
1267
1268
1269 // This function takes the iterator and extracts normalized stuff up to the next boundary
1270 // It is similar in the end results to the collIterNormalize, but for the cases when we
1271 // use an iterator
1272 /*static
1273 inline void normalizeIterator(collIterate *collationSource) {
1274 UErrorCode status = U_ZERO_ERROR;
1275 UBool wasNormalized = FALSE;
1276 //int32_t iterIndex = collationSource->iterator->getIndex(collationSource->iterator, UITER_CURRENT);
1277 uint32_t iterIndex = collationSource->iterator->getState(collationSource->iterator);
1278 int32_t normLen = unorm_next(collationSource->iterator, collationSource->writableBuffer,
1279 (int32_t)collationSource->writableBufSize, UNORM_FCD, 0, TRUE, &wasNormalized, &status);
1280 if(status == U_BUFFER_OVERFLOW_ERROR || normLen == (int32_t)collationSource->writableBufSize) {
1281 // reallocate and terminate
1282 if(!u_growBufferFromStatic(collationSource->stackWritableBuffer,
1283 &collationSource->writableBuffer,
1284 (int32_t *)&collationSource->writableBufSize, normLen + 1,
1285 0)
1286 ) {
1287 #ifdef UCOL_DEBUG
1288 fprintf(stderr, "normalizeIterator(), out of memory\n");
1289 #endif
1290 return;
1291 }
1292 status = U_ZERO_ERROR;
1293 //collationSource->iterator->move(collationSource->iterator, iterIndex, UITER_ZERO);
1294 collationSource->iterator->setState(collationSource->iterator, iterIndex, &status);
1295 normLen = unorm_next(collationSource->iterator, collationSource->writableBuffer,
1296 (int32_t)collationSource->writableBufSize, UNORM_FCD, 0, TRUE, &wasNormalized, &status);
1297 }
1298 // Terminate the buffer - we already checked that it is big enough
1299 collationSource->writableBuffer[normLen] = 0;
1300 if(collationSource->writableBuffer != collationSource->stackWritableBuffer) {
1301 collationSource->flags |= UCOL_ITER_ALLOCATED;
1302 }
1303 collationSource->pos = collationSource->writableBuffer;
1304 collationSource->origFlags = collationSource->flags;
1305 collationSource->flags |= UCOL_ITER_INNORMBUF;
1306 collationSource->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR);
1307 }*/
1308
1309
1310 /* Incremental FCD check and normalize */
1311 /* Called from getNextCE when normalization state is suspect. */
1312 /* When entering, the state is known to be this: */
1313 /* o We are working in the main buffer of the collIterate, not the side */
1314 /* writable buffer. When in the side buffer, normalization mode is always off, */
1315 /* so we won't get here. */
1316 /* o The leading combining class from the current character is 0 or */
1317 /* the trailing combining class of the previous char was zero. */
1318 /* True because the previous call to this function will have always exited */
1319 /* that way, and we get called for every char where cc might be non-zero. */
1320 static
collIterFCD(collIterate * collationSource)1321 inline UBool collIterFCD(collIterate *collationSource) {
1322 const UChar *srcP, *endP;
1323 uint8_t leadingCC;
1324 uint8_t prevTrailingCC = 0;
1325 uint16_t fcd;
1326 UBool needNormalize = FALSE;
1327
1328 srcP = collationSource->pos-1;
1329
1330 if (collationSource->flags & UCOL_ITER_HASLEN) {
1331 endP = collationSource->endp;
1332 } else {
1333 endP = NULL;
1334 }
1335
1336 // Get the trailing combining class of the current character. If it's zero,
1337 // we are OK.
1338 /* trie access */
1339 fcd = unorm_nextFCD16(fcdTrieIndex, fcdHighStart, srcP, endP);
1340 if (fcd != 0) {
1341 prevTrailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);
1342
1343 if (prevTrailingCC != 0) {
1344 // The current char has a non-zero trailing CC. Scan forward until we find
1345 // a char with a leading cc of zero.
1346 while (endP == NULL || srcP != endP)
1347 {
1348 const UChar *savedSrcP = srcP;
1349
1350 /* trie access */
1351 fcd = unorm_nextFCD16(fcdTrieIndex, fcdHighStart, srcP, endP);
1352 leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_);
1353 if (leadingCC == 0) {
1354 srcP = savedSrcP; // Hit char that is not part of combining sequence.
1355 // back up over it. (Could be surrogate pair!)
1356 break;
1357 }
1358
1359 if (leadingCC < prevTrailingCC) {
1360 needNormalize = TRUE;
1361 }
1362
1363 prevTrailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);
1364 }
1365 }
1366 }
1367
1368 collationSource->fcdPosition = (UChar *)srcP;
1369
1370 return needNormalize;
1371 }
1372
1373 /****************************************************************************/
1374 /* Following are the CE retrieval functions */
1375 /* */
1376 /****************************************************************************/
1377
1378 static uint32_t getImplicit(UChar32 cp, collIterate *collationSource);
1379 static uint32_t getPrevImplicit(UChar32 cp, collIterate *collationSource);
1380
1381 /* there should be a macro version of this function in the header file */
1382 /* This is the first function that tries to fetch a collation element */
1383 /* If it's not succesfull or it encounters a more difficult situation */
1384 /* some more sofisticated and slower functions are invoked */
1385 static
ucol_IGetNextCE(const UCollator * coll,collIterate * collationSource,UErrorCode * status)1386 inline uint32_t ucol_IGetNextCE(const UCollator *coll, collIterate *collationSource, UErrorCode *status) {
1387 uint32_t order = 0;
1388 if (collationSource->CEpos > collationSource->toReturn) { /* Are there any CEs from previous expansions? */
1389 order = *(collationSource->toReturn++); /* if so, return them */
1390 if(collationSource->CEpos == collationSource->toReturn) {
1391 collationSource->CEpos = collationSource->toReturn = collationSource->extendCEs ? collationSource->extendCEs : collationSource->CEs;
1392 }
1393 return order;
1394 }
1395
1396 UChar ch = 0;
1397 collationSource->offsetReturn = NULL;
1398
1399 for (;;) /* Loop handles case when incremental normalize switches */
1400 { /* to or from the side buffer / original string, and we */
1401 /* need to start again to get the next character. */
1402
1403 if ((collationSource->flags & (UCOL_ITER_HASLEN | UCOL_ITER_INNORMBUF | UCOL_ITER_NORM | UCOL_HIRAGANA_Q | UCOL_USE_ITERATOR)) == 0)
1404 {
1405 // The source string is null terminated and we're not working from the side buffer,
1406 // and we're not normalizing. This is the fast path.
1407 // (We can be in the side buffer for Thai pre-vowel reordering even when not normalizing.)
1408 ch = *collationSource->pos++;
1409 if (ch != 0) {
1410 break;
1411 }
1412 else {
1413 return UCOL_NO_MORE_CES;
1414 }
1415 }
1416
1417 if (collationSource->flags & UCOL_ITER_HASLEN) {
1418 // Normal path for strings when length is specified.
1419 // (We can't be in side buffer because it is always null terminated.)
1420 if (collationSource->pos >= collationSource->endp) {
1421 // Ran off of the end of the main source string. We're done.
1422 return UCOL_NO_MORE_CES;
1423 }
1424 ch = *collationSource->pos++;
1425 }
1426 else if(collationSource->flags & UCOL_USE_ITERATOR) {
1427 UChar32 iterCh = collationSource->iterator->next(collationSource->iterator);
1428 if(iterCh == U_SENTINEL) {
1429 return UCOL_NO_MORE_CES;
1430 }
1431 ch = (UChar)iterCh;
1432 }
1433 else
1434 {
1435 // Null terminated string.
1436 ch = *collationSource->pos++;
1437 if (ch == 0) {
1438 // Ran off end of buffer.
1439 if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) {
1440 // Ran off end of main string. backing up one character.
1441 collationSource->pos--;
1442 return UCOL_NO_MORE_CES;
1443 }
1444 else
1445 {
1446 // Hit null in the normalize side buffer.
1447 // Usually this means the end of the normalized data,
1448 // except for one odd case: a null followed by combining chars,
1449 // which is the case if we are at the start of the buffer.
1450 if (collationSource->pos == collationSource->writableBuffer.getBuffer()+1) {
1451 break;
1452 }
1453
1454 // Null marked end of side buffer.
1455 // Revert to the main string and
1456 // loop back to top to try again to get a character.
1457 collationSource->pos = collationSource->fcdPosition;
1458 collationSource->flags = collationSource->origFlags;
1459 continue;
1460 }
1461 }
1462 }
1463
1464 if(collationSource->flags&UCOL_HIRAGANA_Q) {
1465 /* Codepoints \u3099-\u309C are both Hiragana and Katakana. Set the flag
1466 * based on whether the previous codepoint was Hiragana or Katakana.
1467 */
1468 if(((ch>=0x3040 && ch<=0x3096) || (ch >= 0x309d && ch <= 0x309f)) ||
1469 ((collationSource->flags & UCOL_WAS_HIRAGANA) && (ch >= 0x3099 && ch <= 0x309C))) {
1470 collationSource->flags |= UCOL_WAS_HIRAGANA;
1471 } else {
1472 collationSource->flags &= ~UCOL_WAS_HIRAGANA;
1473 }
1474 }
1475
1476 // We've got a character. See if there's any fcd and/or normalization stuff to do.
1477 // Note that UCOL_ITER_NORM flag is always zero when we are in the side buffer.
1478 if ((collationSource->flags & UCOL_ITER_NORM) == 0) {
1479 break;
1480 }
1481
1482 if (collationSource->fcdPosition >= collationSource->pos) {
1483 // An earlier FCD check has already covered the current character.
1484 // We can go ahead and process this char.
1485 break;
1486 }
1487
1488 if (ch < ZERO_CC_LIMIT_ ) {
1489 // Fast fcd safe path. Trailing combining class == 0. This char is OK.
1490 break;
1491 }
1492
1493 if (ch < NFC_ZERO_CC_BLOCK_LIMIT_) {
1494 // We need to peek at the next character in order to tell if we are FCD
1495 if ((collationSource->flags & UCOL_ITER_HASLEN) && collationSource->pos >= collationSource->endp) {
1496 // We are at the last char of source string.
1497 // It is always OK for FCD check.
1498 break;
1499 }
1500
1501 // Not at last char of source string (or we'll check against terminating null). Do the FCD fast test
1502 if (*collationSource->pos < NFC_ZERO_CC_BLOCK_LIMIT_) {
1503 break;
1504 }
1505 }
1506
1507
1508 // Need a more complete FCD check and possible normalization.
1509 if (collIterFCD(collationSource)) {
1510 collIterNormalize(collationSource);
1511 }
1512 if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) {
1513 // No normalization was needed. Go ahead and process the char we already had.
1514 break;
1515 }
1516
1517 // Some normalization happened. Next loop iteration will pick up a char
1518 // from the normalization buffer.
1519
1520 } // end for (;;)
1521
1522
1523 if (ch <= 0xFF) {
1524 /* For latin-1 characters we never need to fall back to the UCA table */
1525 /* because all of the UCA data is replicated in the latinOneMapping array */
1526 order = coll->latinOneMapping[ch];
1527 if (order > UCOL_NOT_FOUND) {
1528 order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, status);
1529 }
1530 }
1531 else
1532 {
1533 // Always use UCA for Han, Hangul
1534 // (Han extension A is before main Han block)
1535 // **** Han compatibility chars ?? ****
1536 if ((collationSource->flags & UCOL_FORCE_HAN_IMPLICIT) != 0 &&
1537 (ch >= UCOL_FIRST_HAN_A && ch <= UCOL_LAST_HANGUL)) {
1538 if (ch > UCOL_LAST_HAN && ch < UCOL_FIRST_HANGUL) {
1539 // between the two target ranges; do normal lookup
1540 // **** this range is YI, Modifier tone letters, ****
1541 // **** Latin-D, Syloti Nagari, Phagas-pa. ****
1542 // **** Latin-D might be tailored, so we need to ****
1543 // **** do the normal lookup for these guys. ****
1544 order = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
1545 } else {
1546 // in one of the target ranges; use UCA
1547 order = UCOL_NOT_FOUND;
1548 }
1549 } else {
1550 order = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
1551 }
1552
1553 if(order > UCOL_NOT_FOUND) { /* if a CE is special */
1554 order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, status); /* and try to get the special CE */
1555 }
1556
1557 if(order == UCOL_NOT_FOUND && coll->UCA) { /* We couldn't find a good CE in the tailoring */
1558 /* if we got here, the codepoint MUST be over 0xFF - so we look directly in the trie */
1559 order = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);
1560
1561 if(order > UCOL_NOT_FOUND) { /* UCA also gives us a special CE */
1562 order = ucol_prv_getSpecialCE(coll->UCA, ch, order, collationSource, status);
1563 }
1564 }
1565 }
1566 if(order == UCOL_NOT_FOUND) {
1567 order = getImplicit(ch, collationSource);
1568 }
1569 return order; /* return the CE */
1570 }
1571
1572 /* ucol_getNextCE, out-of-line version for use from other files. */
1573 U_CAPI uint32_t U_EXPORT2
ucol_getNextCE(const UCollator * coll,collIterate * collationSource,UErrorCode * status)1574 ucol_getNextCE(const UCollator *coll, collIterate *collationSource, UErrorCode *status) {
1575 return ucol_IGetNextCE(coll, collationSource, status);
1576 }
1577
1578
1579 /**
1580 * Incremental previous normalization happens here. Pick up the range of chars
1581 * identifed by FCD, normalize it into the collIterate's writable buffer,
1582 * switch the collIterate's state to use the writable buffer.
1583 * @param data collation iterator data
1584 */
1585 static
collPrevIterNormalize(collIterate * data)1586 void collPrevIterNormalize(collIterate *data)
1587 {
1588 UErrorCode status = U_ZERO_ERROR;
1589 const UChar *pEnd = data->pos; /* End normalize + 1 */
1590 const UChar *pStart;
1591
1592 /* Start normalize */
1593 if (data->fcdPosition == NULL) {
1594 pStart = data->string;
1595 }
1596 else {
1597 pStart = data->fcdPosition + 1;
1598 }
1599
1600 int32_t normLen =
1601 data->nfd->normalize(UnicodeString(FALSE, pStart, (int32_t)((pEnd - pStart) + 1)),
1602 data->writableBuffer,
1603 status).
1604 length();
1605 if(U_FAILURE(status)) {
1606 return;
1607 }
1608 /*
1609 this puts the null termination infront of the normalized string instead
1610 of the end
1611 */
1612 data->writableBuffer.insert(0, (UChar)0);
1613
1614 if (data->offsetBuffer == NULL) {
1615 int32_t len = normLen >= UCOL_EXPAND_CE_BUFFER_SIZE ? normLen + 1 : UCOL_EXPAND_CE_BUFFER_SIZE;
1616
1617 data->offsetBufferSize = len;
1618 data->offsetBuffer = (int32_t *) uprv_malloc(sizeof(int32_t) * len);
1619 data->offsetStore = data->offsetBuffer;
1620 } else if(data->offsetBufferSize < normLen) {
1621 int32_t storeIX = (int32_t)(data->offsetStore - data->offsetBuffer);
1622 int32_t *tob = (int32_t *) uprv_realloc(data->offsetBuffer, sizeof(int32_t) * (normLen + 1));
1623
1624 if (tob != NULL) {
1625 data->offsetBuffer = tob;
1626 data->offsetStore = &data->offsetBuffer[storeIX];
1627 data->offsetBufferSize = normLen + 1;
1628 }
1629 }
1630
1631 /*
1632 * The usual case at this point is that we've got a base
1633 * character followed by marks that were normalized. If
1634 * fcdPosition is NULL, that means that we backed up to
1635 * the beginning of the string and there's no base character.
1636 *
1637 * Forward processing will usually normalize when it sees
1638 * the first mark, so that mark will get it's natural offset
1639 * and the rest will get the offset of the character following
1640 * the marks. The base character will also get its natural offset.
1641 *
1642 * We write the offset of the base character, if there is one,
1643 * followed by the offset of the first mark and then the offsets
1644 * of the rest of the marks.
1645 */
1646 int32_t firstMarkOffset = 0;
1647 int32_t trailOffset = (int32_t)(data->pos - data->string + 1);
1648 int32_t trailCount = normLen - 1;
1649
1650 if (data->fcdPosition != NULL) {
1651 int32_t baseOffset = (int32_t)(data->fcdPosition - data->string);
1652 UChar baseChar = *data->fcdPosition;
1653
1654 firstMarkOffset = baseOffset + 1;
1655
1656 /*
1657 * If the base character is the start of a contraction, forward processing
1658 * will normalize the marks while checking for the contraction, which means
1659 * that the offset of the first mark will the same as the other marks.
1660 *
1661 * **** THIS IS PROBABLY NOT A COMPLETE TEST ****
1662 */
1663 if (baseChar >= 0x100) {
1664 uint32_t baseOrder = UTRIE_GET32_FROM_LEAD(&data->coll->mapping, baseChar);
1665
1666 if (baseOrder == UCOL_NOT_FOUND && data->coll->UCA) {
1667 baseOrder = UTRIE_GET32_FROM_LEAD(&data->coll->UCA->mapping, baseChar);
1668 }
1669
1670 if (baseOrder > UCOL_NOT_FOUND && getCETag(baseOrder) == CONTRACTION_TAG) {
1671 firstMarkOffset = trailOffset;
1672 }
1673 }
1674
1675 *(data->offsetStore++) = baseOffset;
1676 }
1677
1678 *(data->offsetStore++) = firstMarkOffset;
1679
1680 for (int32_t i = 0; i < trailCount; i += 1) {
1681 *(data->offsetStore++) = trailOffset;
1682 }
1683
1684 data->offsetRepeatValue = trailOffset;
1685
1686 data->offsetReturn = data->offsetStore - 1;
1687 if (data->offsetReturn == data->offsetBuffer) {
1688 data->offsetStore = data->offsetBuffer;
1689 }
1690
1691 data->pos = data->writableBuffer.getTerminatedBuffer() + 1 + normLen;
1692 data->origFlags = data->flags;
1693 data->flags |= UCOL_ITER_INNORMBUF;
1694 data->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
1695 }
1696
1697
1698 /**
1699 * Incremental FCD check for previous iteration and normalize. Called from
1700 * getPrevCE when normalization state is suspect.
1701 * When entering, the state is known to be this:
1702 * o We are working in the main buffer of the collIterate, not the side
1703 * writable buffer. When in the side buffer, normalization mode is always
1704 * off, so we won't get here.
1705 * o The leading combining class from the current character is 0 or the
1706 * trailing combining class of the previous char was zero.
1707 * True because the previous call to this function will have always exited
1708 * that way, and we get called for every char where cc might be non-zero.
1709 * @param data collation iterate struct
1710 * @return normalization status, TRUE for normalization to be done, FALSE
1711 * otherwise
1712 */
1713 static
collPrevIterFCD(collIterate * data)1714 inline UBool collPrevIterFCD(collIterate *data)
1715 {
1716 const UChar *src, *start;
1717 uint8_t leadingCC;
1718 uint8_t trailingCC = 0;
1719 uint16_t fcd;
1720 UBool result = FALSE;
1721
1722 start = data->string;
1723 src = data->pos + 1;
1724
1725 /* Get the trailing combining class of the current character. */
1726 fcd = unorm_prevFCD16(fcdTrieIndex, fcdHighStart, start, src);
1727
1728 leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_);
1729
1730 if (leadingCC != 0) {
1731 /*
1732 The current char has a non-zero leading combining class.
1733 Scan backward until we find a char with a trailing cc of zero.
1734 */
1735 for (;;)
1736 {
1737 if (start == src) {
1738 data->fcdPosition = NULL;
1739 return result;
1740 }
1741
1742 fcd = unorm_prevFCD16(fcdTrieIndex, fcdHighStart, start, src);
1743
1744 trailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);
1745
1746 if (trailingCC == 0) {
1747 break;
1748 }
1749
1750 if (leadingCC < trailingCC) {
1751 result = TRUE;
1752 }
1753
1754 leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_);
1755 }
1756 }
1757
1758 data->fcdPosition = (UChar *)src;
1759
1760 return result;
1761 }
1762
1763 /** gets a character from the string at a given offset
1764 * Handles both normal and iterative cases.
1765 * No error checking - caller beware!
1766 */
1767 inline static
peekCharacter(collIterate * source,int32_t offset)1768 UChar peekCharacter(collIterate *source, int32_t offset) {
1769 if(source->pos != NULL) {
1770 return *(source->pos + offset);
1771 } else if(source->iterator != NULL) {
1772 if(offset != 0) {
1773 source->iterator->move(source->iterator, offset, UITER_CURRENT);
1774 UChar toReturn = (UChar)source->iterator->next(source->iterator);
1775 source->iterator->move(source->iterator, -offset-1, UITER_CURRENT);
1776 return toReturn;
1777 } else {
1778 return (UChar)source->iterator->current(source->iterator);
1779 }
1780 } else {
1781 return (UChar)U_SENTINEL;
1782 }
1783 }
1784
1785 /**
1786 * Determines if we are at the start of the data string in the backwards
1787 * collation iterator
1788 * @param data collation iterator
1789 * @return TRUE if we are at the start
1790 */
1791 static
isAtStartPrevIterate(collIterate * data)1792 inline UBool isAtStartPrevIterate(collIterate *data) {
1793 if(data->pos == NULL && data->iterator != NULL) {
1794 return !data->iterator->hasPrevious(data->iterator);
1795 }
1796 //return (collIter_bos(data)) ||
1797 return (data->pos == data->string) ||
1798 ((data->flags & UCOL_ITER_INNORMBUF) &&
1799 *(data->pos - 1) == 0 && data->fcdPosition == NULL);
1800 }
1801
1802 static
goBackOne(collIterate * data)1803 inline void goBackOne(collIterate *data) {
1804 # if 0
1805 // somehow, it looks like we need to keep iterator synced up
1806 // at all times, as above.
1807 if(data->pos) {
1808 data->pos--;
1809 }
1810 if(data->iterator) {
1811 data->iterator->previous(data->iterator);
1812 }
1813 #endif
1814 if(data->iterator && (data->flags & UCOL_USE_ITERATOR)) {
1815 data->iterator->previous(data->iterator);
1816 }
1817 if(data->pos) {
1818 data->pos --;
1819 }
1820 }
1821
/**
 * Inline function that gets the previous collation element (CE) when
 * iterating backwards.
 * It first updates the offset-buffer bookkeeping. Then, if CEs are still
 * buffered (toReturn is above the start of extendCEs, or of CEs when there is
 * no extendCEs), it decrements toReturn and returns the CE found there.
 * Otherwise it fetches the previous code unit (from the length-delimited
 * string, from the UCharIterator, or through the raw pointer, which may be
 * inside the normalization side buffer), runs the FCD check / incremental
 * normalization when required, and maps the code unit to a CE.
 * Special CEs are resolved by ucol_prv_getSpecialPrevCE, unmapped code units
 * fall back to the UCA mapping and finally to getPrevImplicit.
 * @param coll collator data
 * @param data collation iterator struct
 * @param status error status, passed through to ucol_prv_getSpecialPrevCE
 * @return the previous CE, or UCOL_NO_MORE_CES when the start of the text
 *         has been reached
 */
static
inline uint32_t ucol_IGetPrevCE(const UCollator *coll, collIterate *data,
                               UErrorCode *status)
{
    uint32_t result = (uint32_t)UCOL_NULLORDER;

    /* Offset bookkeeping: consume a pending repeat first, otherwise step the
       return pointer back, resetting the buffer once its start is reached. */
    if (data->offsetReturn != NULL) {
        if (data->offsetRepeatCount > 0) {
            data->offsetRepeatCount -= 1;
        } else {
            if (data->offsetReturn == data->offsetBuffer) {
                data->offsetReturn = NULL;
                data->offsetStore  = data->offsetBuffer;
            } else {
                data->offsetReturn -= 1;
            }
        }
    }

    /* CEs are still buffered: hand out the one just below toReturn. */
    if ((data->extendCEs && data->toReturn > data->extendCEs) ||
            (!data->extendCEs && data->toReturn > data->CEs))
    {
        data->toReturn -= 1;
        result = *(data->toReturn);
        /* Buffer drained: reset the write position to the buffer start. */
        if (data->CEs == data->toReturn || data->extendCEs == data->toReturn) {
            data->CEpos = data->toReturn;
        }
    }
    else {
        UChar ch = 0;

        /*
        Loop handles case when incremental normalize switches to or from the
        side buffer / original string, and we need to start again to get the
        next character.
        */
        for (;;) {
            if (data->flags & UCOL_ITER_HASLEN) {
                /*
                Normal path for strings when length is specified.
                Not in side buffer because it is always null terminated.
                */
                if (data->pos <= data->string) {
                    /* End of the main source string */
                    return UCOL_NO_MORE_CES;
                }
                data->pos --;
                ch = *data->pos;
            }
            // we are using an iterator to go back. Pray for us!
            else if (data->flags & UCOL_USE_ITERATOR) {
                UChar32 iterCh = data->iterator->previous(data->iterator);
                if(iterCh == U_SENTINEL) {
                    return UCOL_NO_MORE_CES;
                } else {
                    ch = (UChar)iterCh;
                }
            }
            else {
                data->pos --;
                ch = *data->pos;
                /* we are in the side buffer. */
                if (ch == 0) {
                    /*
                    At the start of the normalize side buffer.
                    Go back to string.
                    Because pointer points to the last accessed character,
                    hence we have to increment it by one here.
                    */
                    data->flags = data->origFlags;
                    data->offsetRepeatValue = 0;

                    if (data->fcdPosition == NULL) {
                        /* The side buffer covered the text from its very start. */
                        data->pos = data->string;
                        return UCOL_NO_MORE_CES;
                    }
                    else {
                        data->pos   = data->fcdPosition + 1;
                    }

                    continue;
                }
            }

            /* Track whether the code unit just read is in U+3040..U+309F. */
            if(data->flags&UCOL_HIRAGANA_Q) {
                if(ch>=0x3040 && ch<=0x309f) {
                    data->flags |= UCOL_WAS_HIRAGANA;
                } else {
                    data->flags &= ~UCOL_WAS_HIRAGANA;
                }
            }

            /*
            * got a character to determine if there's fcd and/or normalization
            * stuff to do.
            * if the current character is not fcd.
            * if current character is at the start of the string
            * Trailing combining class == 0.
            * Note if pos is in the writablebuffer, norm is always 0
            */
            if (ch < ZERO_CC_LIMIT_ ||
              // this should propel us out of the loop in the iterator case
                (data->flags & UCOL_ITER_NORM) == 0 ||
                (data->fcdPosition != NULL && data->fcdPosition <= data->pos)
                || data->string == data->pos) {
                break;
            }

            if (ch < NFC_ZERO_CC_BLOCK_LIMIT_) {
                /* if next character is FCD */
                if (data->pos == data->string) {
                    /* First char of string is always OK for FCD check */
                    break;
                }

                /* Not first char of string, do the FCD fast test */
                if (*(data->pos - 1) < NFC_ZERO_CC_BLOCK_LIMIT_) {
                    break;
                }
            }

            /* Need a more complete FCD check and possible normalization. */
            if (collPrevIterFCD(data)) {
                collPrevIterNormalize(data);
            }

            if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
                /*  No normalization. Go ahead and process the char. */
                break;
            }

            /*
            Some normalization happened.
            Next loop picks up a char from the normalization buffer.
            */
        }

        /* attempt to handle contractions, after removal of the backwards
        contraction
        */
        if (ucol_contractionEndCP(ch, coll) && !isAtStartPrevIterate(data)) {
            result = ucol_prv_getSpecialPrevCE(coll, ch, UCOL_CONTRACTION, data, status);
        } else {
            if (ch <= 0xFF) {
                /* Code units up to 0xFF use the direct latinOneMapping table. */
                result = coll->latinOneMapping[ch];
            }
            else {
                // Always use UCA for [3400..9FFF], [AC00..D7AF]
                // **** [FA0E..FA2F] ?? ****
                if ((data->flags & UCOL_FORCE_HAN_IMPLICIT) != 0 &&
                    (ch >= 0x3400 && ch <= 0xD7AF)) {
                    if (ch > 0x9FFF && ch < 0xAC00) {
                        // between the two target ranges; do normal lookup
                        // **** this range is YI, Modifier tone letters, ****
                        // **** Latin-D, Syloti Nagari, Phagas-pa.       ****
                        // **** Latin-D might be tailored, so we need to ****
                        // **** do the normal lookup for these guys.     ****
                        result = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
                    } else {
                        result = UCOL_NOT_FOUND;
                    }
                } else {
                    result = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
                }
            }
            /* Values above UCOL_NOT_FOUND are special CEs that need resolving. */
            if (result > UCOL_NOT_FOUND) {
                result = ucol_prv_getSpecialPrevCE(coll, ch, result, data, status);
            }
            if (result == UCOL_NOT_FOUND) { // Not found in master list
                if (!isAtStartPrevIterate(data) &&
                    ucol_contractionEndCP(ch, data->coll))
                {
                    result = UCOL_CONTRACTION;
                } else {
                    /* Fall back to the UCA mapping when a UCA collator is attached. */
                    if(coll->UCA) {
                        result = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);
                    }
                }

                if (result > UCOL_NOT_FOUND) {
                    if(coll->UCA) {
                        result = ucol_prv_getSpecialPrevCE(coll->UCA, ch, result, data, status);
                    }
                }
            }
        }

        /* Still unmapped: generate an implicit CE. */
        if(result == UCOL_NOT_FOUND) {
            result = getPrevImplicit(ch, data);
        }
    }

    return result;
}
2027
2028
2029 /* ucol_getPrevCE, out-of-line version for use from other files. */
2030 U_CFUNC uint32_t U_EXPORT2
ucol_getPrevCE(const UCollator * coll,collIterate * data,UErrorCode * status)2031 ucol_getPrevCE(const UCollator *coll, collIterate *data,
2032 UErrorCode *status) {
2033 return ucol_IGetPrevCE(coll, data, status);
2034 }
2035
2036
2037 /* this should be connected to special Jamo handling */
2038 U_CFUNC uint32_t U_EXPORT2
ucol_getFirstCE(const UCollator * coll,UChar u,UErrorCode * status)2039 ucol_getFirstCE(const UCollator *coll, UChar u, UErrorCode *status) {
2040 collIterate colIt;
2041 IInit_collIterate(coll, &u, 1, &colIt, status);
2042 if(U_FAILURE(*status)) {
2043 return 0;
2044 }
2045 return ucol_IGetNextCE(coll, &colIt, status);
2046 }
2047
2048 /**
2049 * Inserts the argument character into the end of the buffer pushing back the
2050 * null terminator.
2051 * @param data collIterate struct data
2052 * @param ch character to be appended
2053 * @return the position of the new addition
2054 */
2055 static
insertBufferEnd(collIterate * data,UChar ch)2056 inline const UChar * insertBufferEnd(collIterate *data, UChar ch)
2057 {
2058 int32_t oldLength = data->writableBuffer.length();
2059 return data->writableBuffer.append(ch).getTerminatedBuffer() + oldLength;
2060 }
2061
2062 /**
2063 * Inserts the argument string into the end of the buffer pushing back the
2064 * null terminator.
2065 * @param data collIterate struct data
2066 * @param string to be appended
2067 * @param length of the string to be appended
2068 * @return the position of the new addition
2069 */
2070 static
insertBufferEnd(collIterate * data,const UChar * str,int32_t length)2071 inline const UChar * insertBufferEnd(collIterate *data, const UChar *str, int32_t length)
2072 {
2073 int32_t oldLength = data->writableBuffer.length();
2074 return data->writableBuffer.append(str, length).getTerminatedBuffer() + oldLength;
2075 }
2076
2077 /**
2078 * Special normalization function for contraction in the forwards iterator.
2079 * This normalization sequence will place the current character at source->pos
2080 * and its following normalized sequence into the buffer.
2081 * The fcd position, pos will be changed.
2082 * pos will now point to positions in the buffer.
2083 * Flags will be changed accordingly.
2084 * @param data collation iterator data
2085 */
2086 static
normalizeNextContraction(collIterate * data)2087 inline void normalizeNextContraction(collIterate *data)
2088 {
2089 int32_t strsize;
2090 UErrorCode status = U_ZERO_ERROR;
2091 /* because the pointer points to the next character */
2092 const UChar *pStart = data->pos - 1;
2093 const UChar *pEnd;
2094
2095 if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
2096 data->writableBuffer.setTo(*(pStart - 1));
2097 strsize = 1;
2098 }
2099 else {
2100 strsize = data->writableBuffer.length();
2101 }
2102
2103 pEnd = data->fcdPosition;
2104
2105 data->writableBuffer.append(
2106 data->nfd->normalize(UnicodeString(FALSE, pStart, (int32_t)(pEnd - pStart)), status));
2107 if(U_FAILURE(status)) {
2108 return;
2109 }
2110
2111 data->pos = data->writableBuffer.getTerminatedBuffer() + strsize;
2112 data->origFlags = data->flags;
2113 data->flags |= UCOL_ITER_INNORMBUF;
2114 data->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
2115 }
2116
2117 /**
2118 * Contraction character management function that returns the next character
2119 * for the forwards iterator.
2120 * Does nothing if the next character is in buffer and not the first character
2121 * in it.
2122 * Else it checks next character in data string to see if it is normalizable.
2123 * If it is not, the character is simply copied into the buffer, else
2124 * the whole normalized substring is copied into the buffer, including the
2125 * current character.
2126 * @param data collation element iterator data
2127 * @return next character
2128 */
2129 static
getNextNormalizedChar(collIterate * data)2130 inline UChar getNextNormalizedChar(collIterate *data)
2131 {
2132 UChar nextch;
2133 UChar ch;
2134 // Here we need to add the iterator code. One problem is the way
2135 // end of string is handled. If we just return next char, it could
2136 // be the sentinel. Most of the cases already check for this, but we
2137 // need to be sure.
2138 if ((data->flags & (UCOL_ITER_NORM | UCOL_ITER_INNORMBUF)) == 0 ) {
2139 /* if no normalization and not in buffer. */
2140 if(data->flags & UCOL_USE_ITERATOR) {
2141 return (UChar)data->iterator->next(data->iterator);
2142 } else {
2143 return *(data->pos ++);
2144 }
2145 }
2146
2147 //if (data->flags & UCOL_ITER_NORM && data->flags & UCOL_USE_ITERATOR) {
2148 //normalizeIterator(data);
2149 //}
2150
2151 UBool innormbuf = (UBool)(data->flags & UCOL_ITER_INNORMBUF);
2152 if ((innormbuf && *data->pos != 0) ||
2153 (data->fcdPosition != NULL && !innormbuf &&
2154 data->pos < data->fcdPosition)) {
2155 /*
2156 if next character is in normalized buffer, no further normalization
2157 is required
2158 */
2159 return *(data->pos ++);
2160 }
2161
2162 if (data->flags & UCOL_ITER_HASLEN) {
2163 /* in data string */
2164 if (data->pos + 1 == data->endp) {
2165 return *(data->pos ++);
2166 }
2167 }
2168 else {
2169 if (innormbuf) {
2170 // inside the normalization buffer, but at the end
2171 // (since we encountered zero). This means, in the
2172 // case we're using char iterator, that we need to
2173 // do another round of normalization.
2174 //if(data->origFlags & UCOL_USE_ITERATOR) {
2175 // we need to restore original flags,
2176 // otherwise, we'll lose them
2177 //data->flags = data->origFlags;
2178 //normalizeIterator(data);
2179 //return *(data->pos++);
2180 //} else {
2181 /*
2182 in writable buffer, at this point fcdPosition can not be
2183 pointing to the end of the data string. see contracting tag.
2184 */
2185 if(data->fcdPosition) {
2186 if (*(data->fcdPosition + 1) == 0 ||
2187 data->fcdPosition + 1 == data->endp) {
2188 /* at the end of the string, dump it into the normalizer */
2189 data->pos = insertBufferEnd(data, *(data->fcdPosition)) + 1;
2190 // Check if data->pos received a null pointer
2191 if (data->pos == NULL) {
2192 return (UChar)-1; // Return to indicate error.
2193 }
2194 return *(data->fcdPosition ++);
2195 }
2196 data->pos = data->fcdPosition;
2197 } else if(data->origFlags & UCOL_USE_ITERATOR) {
2198 // if we are here, we're using a normalizing iterator.
2199 // we should just continue further.
2200 data->flags = data->origFlags;
2201 data->pos = NULL;
2202 return (UChar)data->iterator->next(data->iterator);
2203 }
2204 //}
2205 }
2206 else {
2207 if (*(data->pos + 1) == 0) {
2208 return *(data->pos ++);
2209 }
2210 }
2211 }
2212
2213 ch = *data->pos ++;
2214 nextch = *data->pos;
2215
2216 /*
2217 * if the current character is not fcd.
2218 * Trailing combining class == 0.
2219 */
2220 if ((data->fcdPosition == NULL || data->fcdPosition < data->pos) &&
2221 (nextch >= NFC_ZERO_CC_BLOCK_LIMIT_ ||
2222 ch >= NFC_ZERO_CC_BLOCK_LIMIT_)) {
2223 /*
2224 Need a more complete FCD check and possible normalization.
2225 normalize substring will be appended to buffer
2226 */
2227 if (collIterFCD(data)) {
2228 normalizeNextContraction(data);
2229 return *(data->pos ++);
2230 }
2231 else if (innormbuf) {
2232 /* fcdposition shifted even when there's no normalization, if we
2233 don't input the rest into this, we'll get the wrong position when
2234 we reach the end of the writableBuffer */
2235 int32_t length = (int32_t)(data->fcdPosition - data->pos + 1);
2236 data->pos = insertBufferEnd(data, data->pos - 1, length);
2237 // Check if data->pos received a null pointer
2238 if (data->pos == NULL) {
2239 return (UChar)-1; // Return to indicate error.
2240 }
2241 return *(data->pos ++);
2242 }
2243 }
2244
2245 if (innormbuf) {
2246 /*
2247 no normalization is to be done hence only one character will be
2248 appended to the buffer.
2249 */
2250 data->pos = insertBufferEnd(data, ch) + 1;
2251 // Check if data->pos received a null pointer
2252 if (data->pos == NULL) {
2253 return (UChar)-1; // Return to indicate error.
2254 }
2255 }
2256
2257 /* points back to the pos in string */
2258 return ch;
2259 }
2260
2261
2262
2263 /**
2264 * Function to copy the buffer into writableBuffer and sets the fcd position to
2265 * the correct position
2266 * @param source data string source
2267 * @param buffer character buffer
2268 */
2269 static
setDiscontiguosAttribute(collIterate * source,const UnicodeString & buffer)2270 inline void setDiscontiguosAttribute(collIterate *source, const UnicodeString &buffer)
2271 {
2272 /* okay confusing part here. to ensure that the skipped characters are
2273 considered later, we need to place it in the appropriate position in the
2274 normalization buffer and reassign the pos pointer. simple case if pos
2275 reside in string, simply copy to normalization buffer and
2276 fcdposition = pos, pos = start of normalization buffer. if pos in
2277 normalization buffer, we'll insert the copy infront of pos and point pos
2278 to the start of the normalization buffer. why am i doing these copies?
2279 well, so that the whole chunk of codes in the getNextCE, ucol_prv_getSpecialCE does
2280 not require any changes, which be really painful. */
2281 if (source->flags & UCOL_ITER_INNORMBUF) {
2282 int32_t replaceLength = source->pos - source->writableBuffer.getBuffer();
2283 source->writableBuffer.replace(0, replaceLength, buffer);
2284 }
2285 else {
2286 source->fcdPosition = source->pos;
2287 source->origFlags = source->flags;
2288 source->flags |= UCOL_ITER_INNORMBUF;
2289 source->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR);
2290 source->writableBuffer = buffer;
2291 }
2292
2293 source->pos = source->writableBuffer.getTerminatedBuffer();
2294 }
2295
/**
 * Function to get the discontiguous collation element within the source.
 * Reads following combining characters one at a time and looks each one up
 * in the contraction table; characters that do not take part in the match
 * are collected in a local buffer. On a successful match that buffer is
 * handed to setDiscontiguosAttribute so the skipped characters are processed
 * afterwards. If no match is found, the iterator state saved on entry is
 * restored, stepped back by one, and the CE stored for the contraction start
 * is returned.
 * Note this function will set the position to the appropriate places.
 * @param coll current collator used
 * @param source data string source
 * @param constart index to the start character in the contraction table
 * @return discontiguous collation element offset
 */
static
uint32_t getDiscontiguous(const UCollator *coll, collIterate *source,
                                const UChar *constart)
{
    /* source->pos currently points to the second combining character after
       the start character */
    const UChar *temppos      = source->pos;
    UnicodeString buffer;
    const UChar   *tempconstart = constart;
    uint8_t  tempflags    = source->flags;
    UBool    multicontraction = FALSE;
    collIterateState discState;

    /* Snapshot the iterator so a failed match can be undone at the end. */
    backupState(source, &discState);

    /* The skipped-characters buffer starts with the character before pos. */
    buffer.setTo(peekCharacter(source, -1));
    for (;;) {
        UChar    *UCharOffset;
        UChar     schar,
                  tchar;
        uint32_t  result;

        if (((source->flags & UCOL_ITER_HASLEN) && source->pos >= source->endp)
            || (peekCharacter(source, 0) == 0  &&
            //|| (*source->pos == 0  &&
            ((source->flags & UCOL_ITER_INNORMBUF) == 0 ||
            source->fcdPosition == NULL ||
            source->fcdPosition == source->endp ||
            *(source->fcdPosition) == 0 ||
            u_getCombiningClass(*(source->fcdPosition)) == 0)) ||
            /* end of string in null terminated string or stopped by a
            null character, note fcd does not always point to a base
            character after the discontiguous change */
            u_getCombiningClass(peekCharacter(source, 0)) == 0) {
            //u_getCombiningClass(*(source->pos)) == 0) {
            //constart = (UChar *)coll->image + getContractOffset(CE);
            if (multicontraction) {
                /* A shorter match was recorded earlier: commit to it. */
                source->pos    = temppos - 1;
                setDiscontiguosAttribute(source, buffer);
                return *(coll->contractionCEs +
                    (tempconstart - coll->contractionIndex));
            }
            constart = tempconstart;
            break;
        }

        UCharOffset = (UChar *)(tempconstart + 1); /* skip the backward offset*/
        schar = getNextNormalizedChar(source);

        /* Table entries are scanned in ascending order; skip the smaller ones. */
        while (schar > (tchar = *UCharOffset)) {
            UCharOffset++;
        }

        if (schar != tchar) {
            /* not the correct codepoint. we stuff the current codepoint into
            the discontiguous buffer and try the next character */
            buffer.append(schar);
            continue;
        }
        else {
            /* Same combining class as the character two positions back:
               treat the character as skipped instead of matched. */
            if (u_getCombiningClass(schar) ==
                u_getCombiningClass(peekCharacter(source, -2))) {
                //u_getCombiningClass(*(source->pos - 2))) {
                buffer.append(schar);
                continue;
            }
            result = *(coll->contractionCEs +
                (UCharOffset - coll->contractionIndex));
        }

        if (result == UCOL_NOT_FOUND) {
            break;
        } else if (isContraction(result)) {
            /* this is a multi-contraction*/
            tempconstart = (UChar *)coll->image + getContractOffset(result);
            if (*(coll->contractionCEs + (constart - coll->contractionIndex))
                != UCOL_NOT_FOUND) {
                multicontraction = TRUE;
                temppos       = source->pos + 1;
            }
        } else {
            /* Final CE found: queue the skipped characters and return it. */
            setDiscontiguosAttribute(source, buffer);
            return result;
        }
    }

    /* no problems simply reverting just like that,
    if we are in string before getting into this function, points back to
    string hence no problem.
    if we are in normalization buffer before getting into this function,
    since we'll never use another normalization within this function, we
    know that fcdposition points to a base character. the normalization buffer
    never change, hence this revert works. */
    loadState(source, &discState, TRUE);
    goBackOne(source);

    //source->pos   = temppos - 1;
    source->flags = tempflags;
    return *(coll->contractionCEs + (constart - coll->contractionIndex));
}
2404
2405 static
isNonChar(UChar32 cp)2406 inline UBool isNonChar(UChar32 cp) {
2407 return (UBool)((cp & 0xFFFE) == 0xFFFE || (0xFDD0 <= cp && cp <= 0xFDEF) || (0xD800 <= cp && cp <= 0xDFFF));
2408 }
2409
2410 /* now uses Mark's getImplicitPrimary code */
2411 static
getImplicit(UChar32 cp,collIterate * collationSource)2412 inline uint32_t getImplicit(UChar32 cp, collIterate *collationSource) {
2413 if(isNonChar(cp)) {
2414 return 0;
2415 }
2416 uint32_t r = uprv_uca_getImplicitPrimary(cp);
2417 *(collationSource->CEpos++) = ((r & 0x0000FFFF)<<16) | 0x000000C0;
2418 collationSource->offsetRepeatCount += 1;
2419 return (r & UCOL_PRIMARYMASK) | 0x00000505; // This was 'order'
2420 }
2421
2422 /**
2423 * Inserts the argument character into the front of the buffer replacing the
2424 * front null terminator.
2425 * @param data collation element iterator data
2426 * @param ch character to be appended
2427 */
2428 static
insertBufferFront(collIterate * data,UChar ch)2429 inline void insertBufferFront(collIterate *data, UChar ch)
2430 {
2431 data->pos = data->writableBuffer.setCharAt(0, ch).insert(0, (UChar)0).getTerminatedBuffer() + 2;
2432 }
2433
2434 /**
2435 * Special normalization function for contraction in the previous iterator.
2436 * This normalization sequence will place the current character at source->pos
2437 * and its following normalized sequence into the buffer.
2438 * The fcd position, pos will be changed.
2439 * pos will now point to positions in the buffer.
2440 * Flags will be changed accordingly.
2441 * @param data collation iterator data
2442 */
2443 static
normalizePrevContraction(collIterate * data,UErrorCode * status)2444 inline void normalizePrevContraction(collIterate *data, UErrorCode *status)
2445 {
2446 const UChar *pEnd = data->pos + 1; /* End normalize + 1 */
2447 const UChar *pStart;
2448
2449 UnicodeString endOfBuffer;
2450 if (data->flags & UCOL_ITER_HASLEN) {
2451 /*
2452 normalization buffer not used yet, we'll pull down the next
2453 character into the end of the buffer
2454 */
2455 endOfBuffer.setTo(*pEnd);
2456 }
2457 else {
2458 endOfBuffer.setTo(data->writableBuffer, 1); // after the leading NUL
2459 }
2460
2461 if (data->fcdPosition == NULL) {
2462 pStart = data->string;
2463 }
2464 else {
2465 pStart = data->fcdPosition + 1;
2466 }
2467 int32_t normLen =
2468 data->nfd->normalize(UnicodeString(FALSE, pStart, (int32_t)(pEnd - pStart)),
2469 data->writableBuffer,
2470 *status).
2471 length();
2472 if(U_FAILURE(*status)) {
2473 return;
2474 }
2475 /*
2476 this puts the null termination infront of the normalized string instead
2477 of the end
2478 */
2479 data->pos =
2480 data->writableBuffer.insert(0, (UChar)0).append(endOfBuffer).getTerminatedBuffer() +
2481 1 + normLen;
2482 data->origFlags = data->flags;
2483 data->flags |= UCOL_ITER_INNORMBUF;
2484 data->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
2485 }
2486
/**
 * Contraction character management function that returns the previous character
 * for the backwards iterator.
 * Does nothing special if the previous character is in the buffer and is not
 * the first character in it.
 * Else it checks the previous character in the data string to see if it is
 * normalizable.
 * If it is not, the character is simply copied into the buffer, else
 * the whole normalized substring is copied into the buffer, including the
 * current character.
 * @param data collation element iterator data
 * @param status error status, passed on to normalizePrevContraction
 * @return previous character
 */
static
inline UChar getPrevNormalizedChar(collIterate *data, UErrorCode *status)
{
    UChar  prevch;
    UChar  ch;
    const UChar *start;
    UBool  innormbuf = (UBool)(data->flags & UCOL_ITER_INNORMBUF);
    if ((data->flags & (UCOL_ITER_NORM | UCOL_ITER_INNORMBUF)) == 0 ||
        (innormbuf && *(data->pos - 1) != 0)) {
        /*
        if no normalization.
        if previous character is in normalized buffer, no further normalization
        is required
        */
        if(data->flags & UCOL_USE_ITERATOR) {
            /* Step back one and read forward again, so the iterator ends up
               at the position it started from. */
            data->iterator->move(data->iterator, -1, UITER_CURRENT);
            return (UChar)data->iterator->next(data->iterator);
        } else {
            return *(data->pos - 1);
        }
    }

    start = data->pos;
    if ((data->fcdPosition==NULL)||(data->flags & UCOL_ITER_HASLEN)) {
        /* in data string */
        if ((start - 1) == data->string) {
            /* Only the first character of the string is left. */
            return *(start - 1);
        }
        start --;
        ch     = *start;
        prevch = *(start - 1);
    }
    else {
        /*
        in writable buffer, at this point fcdPosition can not be NULL.
        see contracting tag.
        */
        if (data->fcdPosition == data->string) {
            /* at the start of the string, just dump it into the normalizer */
            insertBufferFront(data, *(data->fcdPosition));
            data->fcdPosition = NULL;
            return *(data->pos - 1);
        }
        start  = data->fcdPosition;
        ch     = *start;
        prevch = *(start - 1);
    }
    /*
    * if the current character is not fcd.
    * Trailing combining class == 0.
    */
    if (data->fcdPosition > start &&
        (ch >= NFC_ZERO_CC_BLOCK_LIMIT_ || prevch >= NFC_ZERO_CC_BLOCK_LIMIT_))
    {
        /*
        Need a more complete FCD check and possible normalization.
        normalize substring will be appended to buffer
        */
        const UChar *backuppos = data->pos;
        /* collPrevIterFCD works from data->pos, so point it at the candidate. */
        data->pos = start;
        if (collPrevIterFCD(data)) {
            normalizePrevContraction(data, status);
            return *(data->pos - 1);
        }
        /* FCD check passed: restore pos and adjust the boundary it recorded. */
        data->pos = backuppos;
        data->fcdPosition ++;
    }

    if (innormbuf) {
    /*
    no normalization is to be done hence only one character will be
    appended to the buffer.
    */
        insertBufferFront(data, ch);
        data->fcdPosition --;
    }

    return ch;
}
2579
2580 /* This function handles the special CEs like contractions, expansions, surrogates, Thai */
2581 /* It is called by getNextCE */
2582
2583 /* The following should be even */
2584 #define UCOL_MAX_DIGITS_FOR_NUMBER 254
2585
ucol_prv_getSpecialCE(const UCollator * coll,UChar ch,uint32_t CE,collIterate * source,UErrorCode * status)2586 uint32_t ucol_prv_getSpecialCE(const UCollator *coll, UChar ch, uint32_t CE, collIterate *source, UErrorCode *status) {
2587 collIterateState entryState;
2588 backupState(source, &entryState);
2589 UChar32 cp = ch;
2590
2591 for (;;) {
2592 // This loop will repeat only in the case of contractions, and only when a contraction
2593 // is found and the first CE resulting from that contraction is itself a special
2594 // (an expansion, for example.) All other special CE types are fully handled the
2595 // first time through, and the loop exits.
2596
2597 const uint32_t *CEOffset = NULL;
2598 switch(getCETag(CE)) {
2599 case NOT_FOUND_TAG:
2600 /* This one is not found, and we'll let somebody else bother about it... no more games */
2601 return CE;
2602 case SPEC_PROC_TAG:
2603 {
2604 // Special processing is getting a CE that is preceded by a certain prefix
2605 // Currently this is only needed for optimizing Japanese length and iteration marks.
2606 // When we encouter a special processing tag, we go backwards and try to see if
2607 // we have a match.
2608 // Contraction tables are used - so the whole process is not unlike contraction.
2609 // prefix data is stored backwards in the table.
2610 const UChar *UCharOffset;
2611 UChar schar, tchar;
2612 collIterateState prefixState;
2613 backupState(source, &prefixState);
2614 loadState(source, &entryState, TRUE);
2615 goBackOne(source); // We want to look at the point where we entered - actually one
2616 // before that...
2617
2618 for(;;) {
2619 // This loop will run once per source string character, for as long as we
2620 // are matching a potential contraction sequence
2621
2622 // First we position ourselves at the begining of contraction sequence
2623 const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE);
2624 if (collIter_bos(source)) {
2625 CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
2626 break;
2627 }
2628 schar = getPrevNormalizedChar(source, status);
2629 goBackOne(source);
2630
2631 while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
2632 UCharOffset++;
2633 }
2634
2635 if (schar == tchar) {
2636 // Found the source string char in the table.
2637 // Pick up the corresponding CE from the table.
2638 CE = *(coll->contractionCEs +
2639 (UCharOffset - coll->contractionIndex));
2640 }
2641 else
2642 {
2643 // Source string char was not in the table.
2644 // We have not found the prefix.
2645 CE = *(coll->contractionCEs +
2646 (ContractionStart - coll->contractionIndex));
2647 }
2648
2649 if(!isPrefix(CE)) {
2650 // The source string char was in the contraction table, and the corresponding
2651 // CE is not a prefix CE. We found the prefix, break
2652 // out of loop, this CE will end up being returned. This is the normal
2653 // way out of prefix handling when the source actually contained
2654 // the prefix.
2655 break;
2656 }
2657 }
2658 if(CE != UCOL_NOT_FOUND) { // we found something and we can merilly continue
2659 loadState(source, &prefixState, TRUE);
2660 if(source->origFlags & UCOL_USE_ITERATOR) {
2661 source->flags = source->origFlags;
2662 }
2663 } else { // prefix search was a failure, we have to backup all the way to the start
2664 loadState(source, &entryState, TRUE);
2665 }
2666 break;
2667 }
2668 case CONTRACTION_TAG:
2669 {
2670 /* This should handle contractions */
2671 collIterateState state;
2672 backupState(source, &state);
2673 uint32_t firstCE = *(coll->contractionCEs + ((UChar *)coll->image+getContractOffset(CE) - coll->contractionIndex)); //UCOL_NOT_FOUND;
2674 const UChar *UCharOffset;
2675 UChar schar, tchar;
2676
2677 for (;;) {
2678 /* This loop will run once per source string character, for as long as we */
2679 /* are matching a potential contraction sequence */
2680
2681 /* First we position ourselves at the begining of contraction sequence */
2682 const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE);
2683
2684 if (collIter_eos(source)) {
2685 // Ran off the end of the source string.
2686 CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
2687 // So we'll pick whatever we have at the point...
2688 if (CE == UCOL_NOT_FOUND) {
2689 // back up the source over all the chars we scanned going into this contraction.
2690 CE = firstCE;
2691 loadState(source, &state, TRUE);
2692 if(source->origFlags & UCOL_USE_ITERATOR) {
2693 source->flags = source->origFlags;
2694 }
2695 }
2696 break;
2697 }
2698
2699 uint8_t maxCC = (uint8_t)(*(UCharOffset)&0xFF); /*get the discontiguos stuff */ /* skip the backward offset, see above */
2700 uint8_t allSame = (uint8_t)(*(UCharOffset++)>>8);
2701
2702 schar = getNextNormalizedChar(source);
2703 while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
2704 UCharOffset++;
2705 }
2706
2707 if (schar == tchar) {
2708 // Found the source string char in the contraction table.
2709 // Pick up the corresponding CE from the table.
2710 CE = *(coll->contractionCEs +
2711 (UCharOffset - coll->contractionIndex));
2712 }
2713 else
2714 {
2715 // Source string char was not in contraction table.
2716 // Unless we have a discontiguous contraction, we have finished
2717 // with this contraction.
2718 // in order to do the proper detection, we
2719 // need to see if we're dealing with a supplementary
2720 /* We test whether the next two char are surrogate pairs.
2721 * This test is done if the iterator is not NULL.
2722 * If there is no surrogate pair, the iterator
2723 * goes back one if needed. */
2724 UChar32 miss = schar;
2725 if (source->iterator) {
2726 UChar32 surrNextChar; /* the next char in the iteration to test */
2727 int32_t prevPos; /* holds the previous position before move forward of the source iterator */
2728 if(U16_IS_LEAD(schar) && source->iterator->hasNext(source->iterator)) {
2729 prevPos = source->iterator->index;
2730 surrNextChar = getNextNormalizedChar(source);
2731 if (U16_IS_TRAIL(surrNextChar)) {
2732 miss = U16_GET_SUPPLEMENTARY(schar, surrNextChar);
2733 } else if (prevPos < source->iterator->index){
2734 goBackOne(source);
2735 }
2736 }
2737 } else if (U16_IS_LEAD(schar)) {
2738 miss = U16_GET_SUPPLEMENTARY(schar, getNextNormalizedChar(source));
2739 }
2740
2741 uint8_t sCC;
2742 if (miss < 0x300 ||
2743 maxCC == 0 ||
2744 (sCC = i_getCombiningClass(miss, coll)) == 0 ||
2745 sCC>maxCC ||
2746 (allSame != 0 && sCC == maxCC) ||
2747 collIter_eos(source))
2748 {
2749 // Contraction can not be discontiguous.
2750 goBackOne(source); // back up the source string by one,
2751 // because the character we just looked at was
2752 // not part of the contraction. */
2753 if(U_IS_SUPPLEMENTARY(miss)) {
2754 goBackOne(source);
2755 }
2756 CE = *(coll->contractionCEs +
2757 (ContractionStart - coll->contractionIndex));
2758 } else {
2759 //
2760 // Contraction is possibly discontiguous.
2761 // Scan more of source string looking for a match
2762 //
2763 UChar tempchar;
2764 /* find the next character if schar is not a base character
2765 and we are not yet at the end of the string */
2766 tempchar = getNextNormalizedChar(source);
2767 // probably need another supplementary thingie here
2768 goBackOne(source);
2769 if (i_getCombiningClass(tempchar, coll) == 0) {
2770 goBackOne(source);
2771 if(U_IS_SUPPLEMENTARY(miss)) {
2772 goBackOne(source);
2773 }
2774 /* Spit out the last char of the string, wasn't tasty enough */
2775 CE = *(coll->contractionCEs +
2776 (ContractionStart - coll->contractionIndex));
2777 } else {
2778 CE = getDiscontiguous(coll, source, ContractionStart);
2779 }
2780 }
2781 } // else after if(schar == tchar)
2782
2783 if(CE == UCOL_NOT_FOUND) {
2784 /* The Source string did not match the contraction that we were checking. */
2785 /* Back up the source position to undo the effects of having partially */
2786 /* scanned through what ultimately proved to not be a contraction. */
2787 loadState(source, &state, TRUE);
2788 CE = firstCE;
2789 break;
2790 }
2791
2792 if(!isContraction(CE)) {
2793 // The source string char was in the contraction table, and the corresponding
2794 // CE is not a contraction CE. We completed the contraction, break
2795 // out of loop, this CE will end up being returned. This is the normal
2796 // way out of contraction handling when the source actually contained
2797 // the contraction.
2798 break;
2799 }
2800
2801
2802 // The source string char was in the contraction table, and the corresponding
2803 // CE is IS a contraction CE. We will continue looping to check the source
2804 // string for the remaining chars in the contraction.
2805 uint32_t tempCE = *(coll->contractionCEs + (ContractionStart - coll->contractionIndex));
2806 if(tempCE != UCOL_NOT_FOUND) {
2807 // We have scanned a a section of source string for which there is a
2808 // CE from the contraction table. Remember the CE and scan position, so
2809 // that we can return to this point if further scanning fails to
2810 // match a longer contraction sequence.
2811 firstCE = tempCE;
2812
2813 goBackOne(source);
2814 backupState(source, &state);
2815 getNextNormalizedChar(source);
2816
2817 // Another way to do this is:
2818 //collIterateState tempState;
2819 //backupState(source, &tempState);
2820 //goBackOne(source);
2821 //backupState(source, &state);
2822 //loadState(source, &tempState, TRUE);
2823
2824 // The problem is that for incomplete contractions we have to remember the previous
2825 // position. Before, the only thing I needed to do was state.pos--;
2826 // After iterator introduction and especially after introduction of normalizing
2827 // iterators, it became much more difficult to decrease the saved state.
2828 // I'm not yet sure which of the two methods above is faster.
2829 }
2830 } // for(;;)
2831 break;
2832 } // case CONTRACTION_TAG:
2833 case LONG_PRIMARY_TAG:
2834 {
2835 *(source->CEpos++) = ((CE & 0xFF)<<24)|UCOL_CONTINUATION_MARKER;
2836 CE = ((CE & 0xFFFF00) << 8) | (UCOL_BYTE_COMMON << 8) | UCOL_BYTE_COMMON;
2837 source->offsetRepeatCount += 1;
2838 return CE;
2839 }
2840 case EXPANSION_TAG:
2841 {
2842 /* This should handle expansion. */
2843 /* NOTE: we can encounter both continuations and expansions in an expansion! */
2844 /* I have to decide where continuations are going to be dealt with */
2845 uint32_t size;
2846 uint32_t i; /* general counter */
2847
2848 CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */
2849 size = getExpansionCount(CE);
2850 CE = *CEOffset++;
2851 //source->offsetRepeatCount = -1;
2852
2853 if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */
2854 for(i = 1; i<size; i++) {
2855 *(source->CEpos++) = *CEOffset++;
2856 source->offsetRepeatCount += 1;
2857 }
2858 } else { /* else, we do */
2859 while(*CEOffset != 0) {
2860 *(source->CEpos++) = *CEOffset++;
2861 source->offsetRepeatCount += 1;
2862 }
2863 }
2864
2865 return CE;
2866 }
2867 case DIGIT_TAG:
2868 {
2869 /*
2870 We do a check to see if we want to collate digits as numbers; if so we generate
2871 a custom collation key. Otherwise we pull out the value stored in the expansion table.
2872 */
2873 //uint32_t size;
2874 uint32_t i; /* general counter */
2875
2876 if (source->coll->numericCollation == UCOL_ON){
2877 collIterateState digitState = {0,0,0,0,0,0,0,0,0};
2878 UChar32 char32 = 0;
2879 int32_t digVal = 0;
2880
2881 uint32_t digIndx = 0;
2882 uint32_t endIndex = 0;
2883 uint32_t trailingZeroIndex = 0;
2884
2885 uint8_t collateVal = 0;
2886
2887 UBool nonZeroValReached = FALSE;
2888
2889 uint8_t numTempBuf[UCOL_MAX_DIGITS_FOR_NUMBER/2 + 3]; // I just need a temporary place to store my generated CEs.
2890 /*
2891 We parse the source string until we hit a char that's NOT a digit.
2892 Use this u_charDigitValue. This might be slow because we have to
2893 handle surrogates...
2894 */
2895 /*
2896 if (U16_IS_LEAD(ch)){
2897 if (!collIter_eos(source)) {
2898 backupState(source, &digitState);
2899 UChar trail = getNextNormalizedChar(source);
2900 if(U16_IS_TRAIL(trail)) {
2901 char32 = U16_GET_SUPPLEMENTARY(ch, trail);
2902 } else {
2903 loadState(source, &digitState, TRUE);
2904 char32 = ch;
2905 }
2906 } else {
2907 char32 = ch;
2908 }
2909 } else {
2910 char32 = ch;
2911 }
2912 digVal = u_charDigitValue(char32);
2913 */
2914 digVal = u_charDigitValue(cp); // if we have arrived here, we have
2915 // already processed possible supplementaries that trigered the digit tag -
2916 // all supplementaries are marked in the UCA.
2917 /*
2918 We pad a zero in front of the first element anyways. This takes
2919 care of the (probably) most common case where people are sorting things followed
2920 by a single digit
2921 */
2922 digIndx++;
2923 for(;;){
2924 // Make sure we have enough space. No longer needed;
2925 // at this point digIndx now has a max value of UCOL_MAX_DIGITS_FOR_NUMBER
2926 // (it has been pre-incremented) so we just ensure that numTempBuf is big enough
2927 // (UCOL_MAX_DIGITS_FOR_NUMBER/2 + 3).
2928
2929 // Skipping over leading zeroes.
2930 if (digVal != 0) {
2931 nonZeroValReached = TRUE;
2932 }
2933 if (nonZeroValReached) {
2934 /*
2935 We parse the digit string into base 100 numbers (this fits into a byte).
2936 We only add to the buffer in twos, thus if we are parsing an odd character,
2937 that serves as the 'tens' digit while the if we are parsing an even one, that
2938 is the 'ones' digit. We dumped the parsed base 100 value (collateVal) into
2939 a buffer. We multiply each collateVal by 2 (to give us room) and add 5 (to avoid
2940 overlapping magic CE byte values). The last byte we subtract 1 to ensure it is less
2941 than all the other bytes.
2942 */
2943
2944 if (digIndx % 2 == 1){
2945 collateVal += (uint8_t)digVal;
2946
2947 // We don't enter the low-order-digit case unless we've already seen
2948 // the high order, or for the first digit, which is always non-zero.
2949 if (collateVal != 0)
2950 trailingZeroIndex = 0;
2951
2952 numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6;
2953 collateVal = 0;
2954 }
2955 else{
2956 // We drop the collation value into the buffer so if we need to do
2957 // a "front patch" we don't have to check to see if we're hitting the
2958 // last element.
2959 collateVal = (uint8_t)(digVal * 10);
2960
2961 // Check for trailing zeroes.
2962 if (collateVal == 0)
2963 {
2964 if (!trailingZeroIndex)
2965 trailingZeroIndex = (digIndx/2) + 2;
2966 }
2967 else
2968 trailingZeroIndex = 0;
2969
2970 numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6;
2971 }
2972 digIndx++;
2973 }
2974
2975 // Get next character.
2976 if (!collIter_eos(source)){
2977 ch = getNextNormalizedChar(source);
2978 if (U16_IS_LEAD(ch)){
2979 if (!collIter_eos(source)) {
2980 backupState(source, &digitState);
2981 UChar trail = getNextNormalizedChar(source);
2982 if(U16_IS_TRAIL(trail)) {
2983 char32 = U16_GET_SUPPLEMENTARY(ch, trail);
2984 } else {
2985 loadState(source, &digitState, TRUE);
2986 char32 = ch;
2987 }
2988 }
2989 } else {
2990 char32 = ch;
2991 }
2992
2993 if ((digVal = u_charDigitValue(char32)) == -1 || digIndx > UCOL_MAX_DIGITS_FOR_NUMBER){
2994 // Resetting position to point to the next unprocessed char. We
2995 // overshot it when doing our test/set for numbers.
2996 if (char32 > 0xFFFF) { // For surrogates.
2997 loadState(source, &digitState, TRUE);
2998 //goBackOne(source);
2999 }
3000 goBackOne(source);
3001 break;
3002 }
3003 } else {
3004 break;
3005 }
3006 }
3007
3008 if (nonZeroValReached == FALSE){
3009 digIndx = 2;
3010 numTempBuf[2] = 6;
3011 }
3012
3013 endIndex = trailingZeroIndex ? trailingZeroIndex : ((digIndx/2) + 2) ;
3014 if (digIndx % 2 != 0){
3015 /*
3016 We missed a value. Since digIndx isn't even, stuck too many values into the buffer (this is what
3017 we get for padding the first byte with a zero). "Front-patch" now by pushing all nybbles forward.
3018 Doing it this way ensures that at least 50% of the time (statistically speaking) we'll only be doing a
3019 single pass and optimizes for strings with single digits. I'm just assuming that's the more common case.
3020 */
3021
3022 for(i = 2; i < endIndex; i++){
3023 numTempBuf[i] = (((((numTempBuf[i] - 6)/2) % 10) * 10) +
3024 (((numTempBuf[i+1])-6)/2) / 10) * 2 + 6;
3025 }
3026 --digIndx;
3027 }
3028
3029 // Subtract one off of the last byte.
3030 numTempBuf[endIndex-1] -= 1;
3031
3032 /*
3033 We want to skip over the first two slots in the buffer. The first slot
3034 is reserved for the header byte UCOL_CODAN_PLACEHOLDER. The second slot is for the
3035 sign/exponent byte: 0x80 + (decimalPos/2) & 7f.
3036 */
3037 numTempBuf[0] = UCOL_CODAN_PLACEHOLDER;
3038 numTempBuf[1] = (uint8_t)(0x80 + ((digIndx/2) & 0x7F));
3039
3040 // Now transfer the collation key to our collIterate struct.
3041 // The total size for our collation key is endIndx bumped up to the next largest even value divided by two.
3042 //size = ((endIndex+1) & ~1)/2;
3043 CE = (((numTempBuf[0] << 8) | numTempBuf[1]) << UCOL_PRIMARYORDERSHIFT) | //Primary weight
3044 (UCOL_BYTE_COMMON << UCOL_SECONDARYORDERSHIFT) | // Secondary weight
3045 UCOL_BYTE_COMMON; // Tertiary weight.
3046 i = 2; // Reset the index into the buffer.
3047 while(i < endIndex)
3048 {
3049 uint32_t primWeight = numTempBuf[i++] << 8;
3050 if ( i < endIndex)
3051 primWeight |= numTempBuf[i++];
3052 *(source->CEpos++) = (primWeight << UCOL_PRIMARYORDERSHIFT) | UCOL_CONTINUATION_MARKER;
3053 }
3054
3055 } else {
3056 // no numeric mode, we'll just switch to whatever we stashed and continue
3057 CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */
3058 CE = *CEOffset++;
3059 break;
3060 }
3061 return CE;
3062 }
3063 /* various implicits optimization */
3064 case IMPLICIT_TAG: /* everything that is not defined otherwise */
3065 /* UCA is filled with these. Tailorings are NOT_FOUND */
3066 return getImplicit(cp, source);
3067 case CJK_IMPLICIT_TAG: /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D*/
3068 // TODO: remove CJK_IMPLICIT_TAG completely - handled by the getImplicit
3069 return getImplicit(cp, source);
3070 case HANGUL_SYLLABLE_TAG: /* AC00-D7AF*/
3071 {
3072 static const uint32_t
3073 SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7;
3074 //const uint32_t LCount = 19;
3075 static const uint32_t VCount = 21;
3076 static const uint32_t TCount = 28;
3077 //const uint32_t NCount = VCount * TCount; // 588
3078 //const uint32_t SCount = LCount * NCount; // 11172
3079 uint32_t L = ch - SBase;
3080
3081 // divide into pieces
3082
3083 uint32_t T = L % TCount; // we do it in this order since some compilers can do % and / in one operation
3084 L /= TCount;
3085 uint32_t V = L % VCount;
3086 L /= VCount;
3087
3088 // offset them
3089
3090 L += LBase;
3091 V += VBase;
3092 T += TBase;
3093
3094 // return the first CE, but first put the rest into the expansion buffer
3095 if (!source->coll->image->jamoSpecial) { // FAST PATH
3096
3097 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, V);
3098 if (T != TBase) {
3099 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, T);
3100 }
3101
3102 return UTRIE_GET32_FROM_LEAD(&coll->mapping, L);
3103
3104 } else { // Jamo is Special
3105 // Since Hanguls pass the FCD check, it is
3106 // guaranteed that we won't be in
3107 // the normalization buffer if something like this happens
3108 // However, if we are using a uchar iterator and normalization
3109 // is ON, the Hangul that lead us here is going to be in that
3110 // normalization buffer. Here we want to restore the uchar
3111 // iterator state and pull out of the normalization buffer
3112 if(source->iterator != NULL && source->flags & UCOL_ITER_INNORMBUF) {
3113 source->flags = source->origFlags; // restore the iterator
3114 source->pos = NULL;
3115 }
3116 // Move Jamos into normalization buffer
3117 UChar *buffer = source->writableBuffer.getBuffer(4);
3118 int32_t bufferLength;
3119 buffer[0] = (UChar)L;
3120 buffer[1] = (UChar)V;
3121 if (T != TBase) {
3122 buffer[2] = (UChar)T;
3123 bufferLength = 3;
3124 } else {
3125 bufferLength = 2;
3126 }
3127 source->writableBuffer.releaseBuffer(bufferLength);
3128
3129 source->fcdPosition = source->pos; // Indicate where to continue in main input string
3130 // after exhausting the writableBuffer
3131 source->pos = source->writableBuffer.getTerminatedBuffer();
3132 source->origFlags = source->flags;
3133 source->flags |= UCOL_ITER_INNORMBUF;
3134 source->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
3135
3136 return(UCOL_IGNORABLE);
3137 }
3138 }
3139 case SURROGATE_TAG:
3140 /* we encountered a leading surrogate. We shall get the CE by using the following code unit */
3141 /* two things can happen here: next code point can be a trailing surrogate - we will use it */
3142 /* to retrieve the CE, or it is not a trailing surrogate (or the string is done). In that case */
3143 /* we return 0 (completely ignorable - per UCA specification */
3144 {
3145 UChar trail;
3146 collIterateState state;
3147 backupState(source, &state);
3148 if (collIter_eos(source) || !(U16_IS_TRAIL((trail = getNextNormalizedChar(source))))) {
3149 // we chould have stepped one char forward and it might have turned that it
3150 // was not a trail surrogate. In that case, we have to backup.
3151 loadState(source, &state, TRUE);
3152 return 0;
3153 } else {
3154 /* TODO: CE contain the data from the previous CE + the mask. It should at least be unmasked */
3155 CE = UTRIE_GET32_FROM_OFFSET_TRAIL(&coll->mapping, CE&0xFFFFFF, trail);
3156 if(CE == UCOL_NOT_FOUND) { // there are tailored surrogates in this block, but not this one.
3157 // We need to backup
3158 loadState(source, &state, TRUE);
3159 return CE;
3160 }
3161 // calculate the supplementary code point value, if surrogate was not tailored
3162 cp = ((((uint32_t)ch)<<10UL)+(trail)-(((uint32_t)0xd800<<10UL)+0xdc00-0x10000));
3163 }
3164 }
3165 break;
3166 case LEAD_SURROGATE_TAG: /* D800-DBFF*/
3167 UChar nextChar;
3168 if( source->flags & UCOL_USE_ITERATOR) {
3169 if(U_IS_TRAIL(nextChar = (UChar)source->iterator->current(source->iterator))) {
3170 cp = U16_GET_SUPPLEMENTARY(ch, nextChar);
3171 source->iterator->next(source->iterator);
3172 return getImplicit(cp, source);
3173 } else {
3174 return 0;
3175 }
3176 } else if((((source->flags & UCOL_ITER_HASLEN) == 0 ) || (source->pos<source->endp)) &&
3177 U_IS_TRAIL((nextChar=*source->pos))) {
3178 cp = U16_GET_SUPPLEMENTARY(ch, nextChar);
3179 source->pos++;
3180 return getImplicit(cp, source);
3181 } else {
3182 return 0; /* completely ignorable */
3183 }
3184 case TRAIL_SURROGATE_TAG: /* DC00-DFFF*/
3185 return 0; /* broken surrogate sequence */
3186 case CHARSET_TAG:
3187 /* not yet implemented */
3188 /* probably after 1.8 */
3189 return UCOL_NOT_FOUND;
3190 default:
3191 *status = U_INTERNAL_PROGRAM_ERROR;
3192 CE=0;
3193 break;
3194 }
3195 if (CE <= UCOL_NOT_FOUND) break;
3196 }
3197 return CE;
3198 }
3199
3200
3201 /* now uses Mark's getImplicitPrimary code */
3202 static
getPrevImplicit(UChar32 cp,collIterate * collationSource)3203 inline uint32_t getPrevImplicit(UChar32 cp, collIterate *collationSource) {
3204 if(isNonChar(cp)) {
3205 return 0;
3206 }
3207
3208 uint32_t r = uprv_uca_getImplicitPrimary(cp);
3209
3210 *(collationSource->CEpos++) = (r & UCOL_PRIMARYMASK) | 0x00000505;
3211 collationSource->toReturn = collationSource->CEpos;
3212
3213 if (collationSource->offsetBuffer == NULL) {
3214 collationSource->offsetBufferSize = UCOL_EXPAND_CE_BUFFER_SIZE;
3215 collationSource->offsetBuffer = (int32_t *) uprv_malloc(sizeof(int32_t) * UCOL_EXPAND_CE_BUFFER_SIZE);
3216 collationSource->offsetStore = collationSource->offsetBuffer;
3217 }
3218
3219 // **** doesn't work if using iterator ****
3220 if (collationSource->flags & UCOL_ITER_INNORMBUF) {
3221 collationSource->offsetRepeatCount = 1;
3222 } else {
3223 int32_t firstOffset = (int32_t)(collationSource->pos - collationSource->string);
3224
3225 *(collationSource->offsetStore++) = firstOffset;
3226 *(collationSource->offsetStore++) = firstOffset + 1;
3227
3228 collationSource->offsetReturn = collationSource->offsetStore - 1;
3229 *(collationSource->offsetBuffer) = firstOffset;
3230 if (collationSource->offsetReturn == collationSource->offsetBuffer) {
3231 collationSource->offsetStore = collationSource->offsetBuffer;
3232 }
3233 }
3234
3235 return ((r & 0x0000FFFF)<<16) | 0x000000C0;
3236 }
3237
/**
 * This function handles the special CEs like contractions, expansions,
 * surrogates, Thai.
 * It is called by getPrevCE.
 */
ucol_prv_getSpecialPrevCE(const UCollator * coll,UChar ch,uint32_t CE,collIterate * source,UErrorCode * status)3243 uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
3244 collIterate *source,
3245 UErrorCode *status)
3246 {
3247 const uint32_t *CEOffset = NULL;
3248 UChar *UCharOffset = NULL;
3249 UChar schar;
3250 const UChar *constart = NULL;
3251 uint32_t size;
3252 UChar buffer[UCOL_MAX_BUFFER];
3253 uint32_t *endCEBuffer;
3254 UChar *strbuffer;
3255 int32_t noChars = 0;
3256 int32_t CECount = 0;
3257
3258 for(;;)
3259 {
3260 /* the only ces that loops are thai and contractions */
3261 switch (getCETag(CE))
3262 {
3263 case NOT_FOUND_TAG: /* this tag always returns */
3264 return CE;
3265
3266 case SPEC_PROC_TAG:
3267 {
3268 // Special processing is getting a CE that is preceded by a certain prefix
3269 // Currently this is only needed for optimizing Japanese length and iteration marks.
3270 // When we encouter a special processing tag, we go backwards and try to see if
3271 // we have a match.
3272 // Contraction tables are used - so the whole process is not unlike contraction.
3273 // prefix data is stored backwards in the table.
3274 const UChar *UCharOffset;
3275 UChar schar, tchar;
3276 collIterateState prefixState;
3277 backupState(source, &prefixState);
3278 for(;;) {
3279 // This loop will run once per source string character, for as long as we
3280 // are matching a potential contraction sequence
3281
3282 // First we position ourselves at the begining of contraction sequence
3283 const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE);
3284
3285 if (collIter_bos(source)) {
3286 CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
3287 break;
3288 }
3289 schar = getPrevNormalizedChar(source, status);
3290 goBackOne(source);
3291
3292 while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
3293 UCharOffset++;
3294 }
3295
3296 if (schar == tchar) {
3297 // Found the source string char in the table.
3298 // Pick up the corresponding CE from the table.
3299 CE = *(coll->contractionCEs +
3300 (UCharOffset - coll->contractionIndex));
3301 }
3302 else
3303 {
3304 // if there is a completely ignorable code point in the middle of
3305 // a prefix, we need to act as if it's not there
3306 // assumption: 'real' noncharacters (*fffe, *ffff, fdd0-fdef are set to zero)
3307 // lone surrogates cannot be set to zero as it would break other processing
3308 uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, schar);
3309 // it's easy for BMP code points
3310 if(isZeroCE == 0) {
3311 continue;
3312 } else if(U16_IS_TRAIL(schar) || U16_IS_LEAD(schar)) {
3313 // for supplementary code points, we have to check the next one
3314 // situations where we are going to ignore
3315 // 1. beginning of the string: schar is a lone surrogate
3316 // 2. schar is a lone surrogate
3317 // 3. schar is a trail surrogate in a valid surrogate sequence
3318 // that is explicitly set to zero.
3319 if (!collIter_bos(source)) {
3320 UChar lead;
3321 if(U16_IS_LEAD(lead = getPrevNormalizedChar(source, status))) {
3322 isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, lead);
3323 if(getCETag(isZeroCE) == SURROGATE_TAG) {
3324 uint32_t finalCE = UTRIE_GET32_FROM_OFFSET_TRAIL(&coll->mapping, isZeroCE&0xFFFFFF, schar);
3325 if(finalCE == 0) {
3326 // this is a real, assigned completely ignorable code point
3327 goBackOne(source);
3328 continue;
3329 }
3330 }
3331 } else {
3332 // lone surrogate, completely ignorable
3333 continue;
3334 }
3335 } else {
3336 // lone surrogate at the beggining, completely ignorable
3337 continue;
3338 }
3339 }
3340 // Source string char was not in the table.
3341 // We have not found the prefix.
3342 CE = *(coll->contractionCEs +
3343 (ContractionStart - coll->contractionIndex));
3344 }
3345
3346 if(!isPrefix(CE)) {
3347 // The source string char was in the contraction table, and the corresponding
3348 // CE is not a prefix CE. We found the prefix, break
3349 // out of loop, this CE will end up being returned. This is the normal
3350 // way out of prefix handling when the source actually contained
3351 // the prefix.
3352 break;
3353 }
3354 }
3355 loadState(source, &prefixState, TRUE);
3356 break;
3357 }
3358
3359 case CONTRACTION_TAG: {
3360 /* to ensure that the backwards and forwards iteration matches, we
3361 take the current region of most possible match and pass it through
3362 the forward iteration. this will ensure that the obstinate problem of
3363 overlapping contractions will not occur.
3364 */
3365 schar = peekCharacter(source, 0);
3366 constart = (UChar *)coll->image + getContractOffset(CE);
3367 if (isAtStartPrevIterate(source)
3368 /* commented away contraction end checks after adding the checks
3369 in getPrevCE */) {
3370 /* start of string or this is not the end of any contraction */
3371 CE = *(coll->contractionCEs +
3372 (constart - coll->contractionIndex));
3373 break;
3374 }
3375 strbuffer = buffer;
3376 UCharOffset = strbuffer + (UCOL_MAX_BUFFER - 1);
3377 *(UCharOffset --) = 0;
3378 noChars = 0;
3379 // have to swap thai characters
3380 while (ucol_unsafeCP(schar, coll)) {
3381 *(UCharOffset) = schar;
3382 noChars++;
3383 UCharOffset --;
3384 schar = getPrevNormalizedChar(source, status);
3385 goBackOne(source);
3386 // TODO: when we exhaust the contraction buffer,
3387 // it needs to get reallocated. The problem is
3388 // that the size depends on the string which is
3389 // not iterated over. However, since we're travelling
3390 // backwards, we already had to set the iterator at
3391 // the end - so we might as well know where we are?
3392 if (UCharOffset + 1 == buffer) {
3393 /* we have exhausted the buffer */
3394 int32_t newsize = 0;
3395 if(source->pos) { // actually dealing with a position
3396 newsize = (int32_t)(source->pos - source->string + 1);
3397 } else { // iterator
3398 newsize = 4 * UCOL_MAX_BUFFER;
3399 }
3400 strbuffer = (UChar *)uprv_malloc(sizeof(UChar) *
3401 (newsize + UCOL_MAX_BUFFER));
3402 /* test for NULL */
3403 if (strbuffer == NULL) {
3404 *status = U_MEMORY_ALLOCATION_ERROR;
3405 return UCOL_NO_MORE_CES;
3406 }
3407 UCharOffset = strbuffer + newsize;
3408 uprv_memcpy(UCharOffset, buffer,
3409 UCOL_MAX_BUFFER * sizeof(UChar));
3410 UCharOffset --;
3411 }
3412 if ((source->pos && (source->pos == source->string ||
3413 ((source->flags & UCOL_ITER_INNORMBUF) &&
3414 *(source->pos - 1) == 0 && source->fcdPosition == NULL)))
3415 || (source->iterator && !source->iterator->hasPrevious(source->iterator))) {
3416 break;
3417 }
3418 }
3419 /* adds the initial base character to the string */
3420 *(UCharOffset) = schar;
3421 noChars++;
3422
3423 int32_t offsetBias;
3424
3425 // **** doesn't work if using iterator ****
3426 if (source->flags & UCOL_ITER_INNORMBUF) {
3427 offsetBias = -1;
3428 } else {
3429 offsetBias = (int32_t)(source->pos - source->string);
3430 }
3431
3432 /* a new collIterate is used to simplify things, since using the current
3433 collIterate will mean that the forward and backwards iteration will
3434 share and change the same buffers. we don't want to get into that. */
3435 collIterate temp;
3436 int32_t rawOffset;
3437
3438 IInit_collIterate(coll, UCharOffset, noChars, &temp, status);
3439 if(U_FAILURE(*status)) {
3440 return UCOL_NULLORDER;
3441 }
3442 temp.flags &= ~UCOL_ITER_NORM;
3443 temp.flags |= source->flags & UCOL_FORCE_HAN_IMPLICIT;
3444
3445 rawOffset = (int32_t)(temp.pos - temp.string); // should always be zero?
3446 CE = ucol_IGetNextCE(coll, &temp, status);
3447
3448 if (source->extendCEs) {
3449 endCEBuffer = source->extendCEs + source->extendCEsSize;
3450 CECount = (int32_t)((source->CEpos - source->extendCEs)/sizeof(uint32_t));
3451 } else {
3452 endCEBuffer = source->CEs + UCOL_EXPAND_CE_BUFFER_SIZE;
3453 CECount = (int32_t)((source->CEpos - source->CEs)/sizeof(uint32_t));
3454 }
3455
3456 if (source->offsetBuffer == NULL) {
3457 source->offsetBufferSize = UCOL_EXPAND_CE_BUFFER_SIZE;
3458 source->offsetBuffer = (int32_t *) uprv_malloc(sizeof(int32_t) * UCOL_EXPAND_CE_BUFFER_SIZE);
3459 source->offsetStore = source->offsetBuffer;
3460 }
3461
3462 while (CE != UCOL_NO_MORE_CES) {
3463 *(source->CEpos ++) = CE;
3464
3465 if (offsetBias >= 0) {
3466 *(source->offsetStore ++) = rawOffset + offsetBias;
3467 }
3468
3469 CECount++;
3470 if (source->CEpos == endCEBuffer) {
3471 /* ran out of CE space, reallocate to new buffer.
3472 If reallocation fails, reset pointers and bail out,
3473 there's no guarantee of the right character position after
3474 this bail*/
3475 if (!increaseCEsCapacity(source)) {
3476 *status = U_MEMORY_ALLOCATION_ERROR;
3477 if (strbuffer != buffer) {
3478 uprv_free(strbuffer);
3479 }
3480
3481 return (uint32_t)UCOL_NULLORDER;
3482 }
3483
3484 endCEBuffer = source->extendCEs + source->extendCEsSize;
3485 }
3486
3487 if (offsetBias >= 0 && source->offsetStore >= &source->offsetBuffer[source->offsetBufferSize]) {
3488 int32_t storeIX = (int32_t)(source->offsetStore - source->offsetBuffer);
3489 int32_t *tob = (int32_t *) uprv_realloc(source->offsetBuffer,
3490 sizeof(int32_t) * (source->offsetBufferSize + UCOL_EXPAND_CE_BUFFER_EXTEND_SIZE));
3491
3492 if (tob != NULL) {
3493 source->offsetBuffer = tob;
3494 source->offsetStore = &source->offsetBuffer[storeIX];
3495 source->offsetBufferSize += UCOL_EXPAND_CE_BUFFER_EXTEND_SIZE;
3496 } else {
3497 // memory error...
3498 *status = U_MEMORY_ALLOCATION_ERROR;
3499 source->CEpos = source->CEs;
3500
3501 if (strbuffer != buffer) {
3502 uprv_free(strbuffer);
3503 }
3504
3505 return (uint32_t) UCOL_NULLORDER;
3506 }
3507 }
3508
3509 if ((temp.flags & UCOL_ITER_INNORMBUF) != 0) {
3510 rawOffset = (int32_t)(temp.fcdPosition - temp.string);
3511 } else {
3512 rawOffset = (int32_t)(temp.pos - temp.string);
3513 }
3514
3515 CE = ucol_IGetNextCE(coll, &temp, status);
3516 }
3517
3518 if (source->offsetRepeatValue != 0) {
3519 if (CECount > noChars) {
3520 source->offsetRepeatCount += temp.offsetRepeatCount;
3521 } else {
3522 // **** does this really skip the right offsets? ****
3523 source->offsetReturn -= (noChars - CECount);
3524 }
3525 }
3526
3527 if (strbuffer != buffer) {
3528 uprv_free(strbuffer);
3529 }
3530
3531 if (offsetBias >= 0) {
3532 source->offsetReturn = source->offsetStore - 1;
3533 if (source->offsetReturn == source->offsetBuffer) {
3534 source->offsetStore = source->offsetBuffer;
3535 }
3536 }
3537
3538 source->toReturn = source->CEpos - 1;
3539 if (source->toReturn == source->CEs) {
3540 source->CEpos = source->CEs;
3541 }
3542
3543 return *(source->toReturn);
3544 }
3545 case LONG_PRIMARY_TAG:
3546 {
3547 *(source->CEpos++) = ((CE & 0xFFFF00) << 8) | (UCOL_BYTE_COMMON << 8) | UCOL_BYTE_COMMON;
3548 *(source->CEpos++) = ((CE & 0xFF)<<24)|UCOL_CONTINUATION_MARKER;
3549 source->toReturn = source->CEpos - 1;
3550
3551 if (source->offsetBuffer == NULL) {
3552 source->offsetBufferSize = UCOL_EXPAND_CE_BUFFER_SIZE;
3553 source->offsetBuffer = (int32_t *) uprv_malloc(sizeof(int32_t) * UCOL_EXPAND_CE_BUFFER_SIZE);
3554 source->offsetStore = source->offsetBuffer;
3555 }
3556
3557 if (source->flags & UCOL_ITER_INNORMBUF) {
3558 source->offsetRepeatCount = 1;
3559 } else {
3560 int32_t firstOffset = (int32_t)(source->pos - source->string);
3561
3562 *(source->offsetStore++) = firstOffset;
3563 *(source->offsetStore++) = firstOffset + 1;
3564
3565 source->offsetReturn = source->offsetStore - 1;
3566 *(source->offsetBuffer) = firstOffset;
3567 if (source->offsetReturn == source->offsetBuffer) {
3568 source->offsetStore = source->offsetBuffer;
3569 }
3570 }
3571
3572
3573 return *(source->toReturn);
3574 }
3575
3576 case EXPANSION_TAG: /* this tag always returns */
3577 {
3578 /*
3579 This should handle expansion.
3580 NOTE: we can encounter both continuations and expansions in an expansion!
3581 I have to decide where continuations are going to be dealt with
3582 */
3583 int32_t firstOffset = (int32_t)(source->pos - source->string);
3584
3585 // **** doesn't work if using iterator ****
3586 if (source->offsetReturn != NULL) {
3587 if (! (source->flags & UCOL_ITER_INNORMBUF) && source->offsetReturn == source->offsetBuffer) {
3588 source->offsetStore = source->offsetBuffer;
3589 }else {
3590 firstOffset = -1;
3591 }
3592 }
3593
3594 if (source->offsetBuffer == NULL) {
3595 source->offsetBufferSize = UCOL_EXPAND_CE_BUFFER_SIZE;
3596 source->offsetBuffer = (int32_t *) uprv_malloc(sizeof(int32_t) * UCOL_EXPAND_CE_BUFFER_SIZE);
3597 source->offsetStore = source->offsetBuffer;
3598 }
3599
3600 /* find the offset to expansion table */
3601 CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE);
3602 size = getExpansionCount(CE);
3603 if (size != 0) {
3604 /*
3605 if there are less than 16 elements in expansion, we don't terminate
3606 */
3607 uint32_t count;
3608
3609 for (count = 0; count < size; count++) {
3610 *(source->CEpos ++) = *CEOffset++;
3611
3612 if (firstOffset >= 0) {
3613 *(source->offsetStore ++) = firstOffset + 1;
3614 }
3615 }
3616 } else {
3617 /* else, we do */
3618 while (*CEOffset != 0) {
3619 *(source->CEpos ++) = *CEOffset ++;
3620
3621 if (firstOffset >= 0) {
3622 *(source->offsetStore ++) = firstOffset + 1;
3623 }
3624 }
3625 }
3626
3627 if (firstOffset >= 0) {
3628 source->offsetReturn = source->offsetStore - 1;
3629 *(source->offsetBuffer) = firstOffset;
3630 if (source->offsetReturn == source->offsetBuffer) {
3631 source->offsetStore = source->offsetBuffer;
3632 }
3633 } else {
3634 source->offsetRepeatCount += size - 1;
3635 }
3636
3637 source->toReturn = source->CEpos - 1;
3638 // in case of one element expansion, we
3639 // want to immediately return CEpos
3640 if(source->toReturn == source->CEs) {
3641 source->CEpos = source->CEs;
3642 }
3643
3644 return *(source->toReturn);
3645 }
3646
3647 case DIGIT_TAG:
3648 {
3649 /*
3650 We do a check to see if we want to collate digits as numbers; if so we generate
3651 a custom collation key. Otherwise we pull out the value stored in the expansion table.
3652 */
3653 uint32_t i; /* general counter */
3654
3655 if (source->coll->numericCollation == UCOL_ON){
3656 uint32_t digIndx = 0;
3657 uint32_t endIndex = 0;
3658 uint32_t leadingZeroIndex = 0;
3659 uint32_t trailingZeroCount = 0;
3660
3661 uint8_t collateVal = 0;
3662
3663 UBool nonZeroValReached = FALSE;
3664
3665 uint8_t numTempBuf[UCOL_MAX_DIGITS_FOR_NUMBER/2 + 2]; // I just need a temporary place to store my generated CEs.
3666 /*
3667 We parse the source string until we hit a char that's NOT a digit.
3668 Use this u_charDigitValue. This might be slow because we have to
3669 handle surrogates...
3670 */
3671 /*
3672 We need to break up the digit string into collection elements of UCOL_MAX_DIGITS_FOR_NUMBER or less,
3673 with any chunks smaller than that being on the right end of the digit string - i.e. the first collation
3674 element we process when going backward. To determine how long that chunk might be, we may need to make
3675 two passes through the loop that collects digits - one to see how long the string is (and how much is
3676 leading zeros) to determine the length of that right-hand chunk, and a second (if the whole string has
3677 more than UCOL_MAX_DIGITS_FOR_NUMBER non-leading-zero digits) to actually process that collation
3678 element chunk after resetting the state to the initialState at the right side of the digit string.
3679 */
3680 uint32_t ceLimit = 0;
3681 UChar initial_ch = ch;
3682 collIterateState initialState = {0,0,0,0,0,0,0,0,0};
3683 backupState(source, &initialState);
3684
3685 for(;;) {
3686 collIterateState state = {0,0,0,0,0,0,0,0,0};
3687 UChar32 char32 = 0;
3688 int32_t digVal = 0;
3689
3690 if (U16_IS_TRAIL (ch)) {
3691 if (!collIter_bos(source)){
3692 UChar lead = getPrevNormalizedChar(source, status);
3693 if(U16_IS_LEAD(lead)) {
3694 char32 = U16_GET_SUPPLEMENTARY(lead,ch);
3695 goBackOne(source);
3696 } else {
3697 char32 = ch;
3698 }
3699 } else {
3700 char32 = ch;
3701 }
3702 } else {
3703 char32 = ch;
3704 }
3705 digVal = u_charDigitValue(char32);
3706
3707 for(;;) {
3708 // Make sure we have enough space. No longer needed;
3709 // at this point the largest value of digIndx when we need to save data in numTempBuf
3710 // is UCOL_MAX_DIGITS_FOR_NUMBER-1 (digIndx is post-incremented) so we just ensure
3711 // that numTempBuf is big enough (UCOL_MAX_DIGITS_FOR_NUMBER/2 + 2).
3712
3713 // Skip over trailing zeroes, and keep a count of them.
3714 if (digVal != 0)
3715 nonZeroValReached = TRUE;
3716
3717 if (nonZeroValReached) {
3718 /*
3719 We parse the digit string into base 100 numbers (this fits into a byte).
3720 We only add to the buffer in twos, thus if we are parsing an odd character,
3721 that serves as the 'tens' digit while the if we are parsing an even one, that
3722 is the 'ones' digit. We dumped the parsed base 100 value (collateVal) into
3723 a buffer. We multiply each collateVal by 2 (to give us room) and add 5 (to avoid
3724 overlapping magic CE byte values). The last byte we subtract 1 to ensure it is less
3725 than all the other bytes.
3726
3727 Since we're doing in this reverse we want to put the first digit encountered into the
3728 ones place and the second digit encountered into the tens place.
3729 */
3730
3731 if ((digIndx + trailingZeroCount) % 2 == 1) {
3732 // High-order digit case (tens place)
3733 collateVal += (uint8_t)(digVal * 10);
3734
3735 // We cannot set leadingZeroIndex unless it has been set for the
3736 // low-order digit. Therefore, all we can do for the high-order
3737 // digit is turn it off, never on.
3738 // The only time we will have a high digit without a low is for
3739 // the very first non-zero digit, so no zero check is necessary.
3740 if (collateVal != 0)
3741 leadingZeroIndex = 0;
3742
3743 // The first pass through, digIndx may exceed the limit, but in that case
3744 // we no longer care about numTempBuf contents since they will be discarded
3745 if ( digIndx < UCOL_MAX_DIGITS_FOR_NUMBER ) {
3746 numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6;
3747 }
3748 collateVal = 0;
3749 } else {
3750 // Low-order digit case (ones place)
3751 collateVal = (uint8_t)digVal;
3752
3753 // Check for leading zeroes.
3754 if (collateVal == 0) {
3755 if (!leadingZeroIndex)
3756 leadingZeroIndex = (digIndx/2) + 2;
3757 } else
3758 leadingZeroIndex = 0;
3759
3760 // No need to write to buffer; the case of a last odd digit
3761 // is handled below.
3762 }
3763 ++digIndx;
3764 } else
3765 ++trailingZeroCount;
3766
3767 if (!collIter_bos(source)) {
3768 ch = getPrevNormalizedChar(source, status);
3769 //goBackOne(source);
3770 if (U16_IS_TRAIL(ch)) {
3771 backupState(source, &state);
3772 if (!collIter_bos(source)) {
3773 goBackOne(source);
3774 UChar lead = getPrevNormalizedChar(source, status);
3775
3776 if(U16_IS_LEAD(lead)) {
3777 char32 = U16_GET_SUPPLEMENTARY(lead,ch);
3778 } else {
3779 loadState(source, &state, FALSE);
3780 char32 = ch;
3781 }
3782 }
3783 } else
3784 char32 = ch;
3785
3786 if ((digVal = u_charDigitValue(char32)) == -1 || (ceLimit > 0 && (digIndx + trailingZeroCount) >= ceLimit)) {
3787 if (char32 > 0xFFFF) {// For surrogates.
3788 loadState(source, &state, FALSE);
3789 }
3790 // Don't need to "reverse" the goBackOne call,
3791 // as this points to the next position to process..
3792 //if (char32 > 0xFFFF) // For surrogates.
3793 //getNextNormalizedChar(source);
3794 break;
3795 }
3796
3797 goBackOne(source);
3798 }else
3799 break;
3800 }
3801
3802 if (digIndx + trailingZeroCount <= UCOL_MAX_DIGITS_FOR_NUMBER) {
3803 // our collation element is not too big, go ahead and finish with it
3804 break;
3805 }
3806 // our digit string is too long for a collation element;
3807 // set the limit for it, reset the state and begin again
3808 ceLimit = (digIndx + trailingZeroCount) % UCOL_MAX_DIGITS_FOR_NUMBER;
3809 if ( ceLimit == 0 ) {
3810 ceLimit = UCOL_MAX_DIGITS_FOR_NUMBER;
3811 }
3812 ch = initial_ch;
3813 loadState(source, &initialState, FALSE);
3814 digIndx = endIndex = leadingZeroIndex = trailingZeroCount = 0;
3815 collateVal = 0;
3816 nonZeroValReached = FALSE;
3817 }
3818
3819 if (! nonZeroValReached) {
3820 digIndx = 2;
3821 trailingZeroCount = 0;
3822 numTempBuf[2] = 6;
3823 }
3824
3825 if ((digIndx + trailingZeroCount) % 2 != 0) {
3826 numTempBuf[((digIndx)/2) + 2] = collateVal*2 + 6;
3827 digIndx += 1; // The implicit leading zero
3828 }
3829 if (trailingZeroCount % 2 != 0) {
3830 // We had to consume one trailing zero for the low digit
3831 // of the least significant byte
3832 digIndx += 1; // The trailing zero not in the exponent
3833 trailingZeroCount -= 1;
3834 }
3835
3836 endIndex = leadingZeroIndex ? leadingZeroIndex : ((digIndx/2) + 2) ;
3837
3838 // Subtract one off of the last byte. Really the first byte here, but it's reversed...
3839 numTempBuf[2] -= 1;
3840
3841 /*
3842 We want to skip over the first two slots in the buffer. The first slot
3843 is reserved for the header byte UCOL_CODAN_PLACEHOLDER. The second slot is for the
3844 sign/exponent byte: 0x80 + (decimalPos/2) & 7f.
3845 The exponent must be adjusted by the number of leading zeroes, and the number of
3846 trailing zeroes.
3847 */
3848 numTempBuf[0] = UCOL_CODAN_PLACEHOLDER;
3849 uint32_t exponent = (digIndx+trailingZeroCount)/2;
3850 if (leadingZeroIndex)
3851 exponent -= ((digIndx/2) + 2 - leadingZeroIndex);
3852 numTempBuf[1] = (uint8_t)(0x80 + (exponent & 0x7F));
3853
3854 // Now transfer the collation key to our collIterate struct.
3855 // The total size for our collation key is half of endIndex, rounded up.
3856 int32_t size = (endIndex+1)/2;
3857 if(!ensureCEsCapacity(source, size)) {
3858 return UCOL_NULLORDER;
3859 }
3860 *(source->CEpos++) = (((numTempBuf[0] << 8) | numTempBuf[1]) << UCOL_PRIMARYORDERSHIFT) | //Primary weight
3861 (UCOL_BYTE_COMMON << UCOL_SECONDARYORDERSHIFT) | // Secondary weight
3862 UCOL_BYTE_COMMON; // Tertiary weight.
3863 i = endIndex - 1; // Reset the index into the buffer.
3864 while(i >= 2) {
3865 uint32_t primWeight = numTempBuf[i--] << 8;
3866 if ( i >= 2)
3867 primWeight |= numTempBuf[i--];
3868 *(source->CEpos++) = (primWeight << UCOL_PRIMARYORDERSHIFT) | UCOL_CONTINUATION_MARKER;
3869 }
3870
3871 source->toReturn = source->CEpos -1;
3872 return *(source->toReturn);
3873 } else {
3874 CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE);
3875 CE = *(CEOffset++);
3876 break;
3877 }
3878 }
3879
3880 case HANGUL_SYLLABLE_TAG: /* AC00-D7AF*/
3881 {
3882 static const uint32_t
3883 SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7;
3884 //const uint32_t LCount = 19;
3885 static const uint32_t VCount = 21;
3886 static const uint32_t TCount = 28;
3887 //const uint32_t NCount = VCount * TCount; /* 588 */
3888 //const uint32_t SCount = LCount * NCount; /* 11172 */
3889
3890 uint32_t L = ch - SBase;
3891 /*
3892 divide into pieces.
3893 we do it in this order since some compilers can do % and / in one
3894 operation
3895 */
3896 uint32_t T = L % TCount;
3897 L /= TCount;
3898 uint32_t V = L % VCount;
3899 L /= VCount;
3900
3901 /* offset them */
3902 L += LBase;
3903 V += VBase;
3904 T += TBase;
3905
3906 if (source->offsetBuffer == NULL) {
3907 source->offsetBufferSize = UCOL_EXPAND_CE_BUFFER_SIZE;
3908 source->offsetBuffer = (int32_t *) uprv_malloc(sizeof(int32_t) * UCOL_EXPAND_CE_BUFFER_SIZE);
3909 source->offsetStore = source->offsetBuffer;
3910 }
3911
3912 int32_t firstOffset = (int32_t)(source->pos - source->string);
3913
3914 *(source->offsetStore++) = firstOffset;
3915
3916 /*
3917 * return the first CE, but first put the rest into the expansion buffer
3918 */
3919 if (!source->coll->image->jamoSpecial) {
3920 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, L);
3921 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, V);
3922 *(source->offsetStore++) = firstOffset + 1;
3923
3924 if (T != TBase) {
3925 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, T);
3926 *(source->offsetStore++) = firstOffset + 1;
3927 }
3928
3929 source->toReturn = source->CEpos - 1;
3930
3931 source->offsetReturn = source->offsetStore - 1;
3932 if (source->offsetReturn == source->offsetBuffer) {
3933 source->offsetStore = source->offsetBuffer;
3934 }
3935
3936 return *(source->toReturn);
3937 } else {
3938 // Since Hanguls pass the FCD check, it is
3939 // guaranteed that we won't be in
3940 // the normalization buffer if something like this happens
3941 // Move Jamos into normalization buffer
3942 /*
3943 Move the Jamos into the
3944 normalization buffer
3945 */
3946 UChar *tempbuffer = source->writableBuffer.getBuffer(5);
3947 int32_t tempbufferLength;
3948 tempbuffer[0] = 0;
3949 tempbuffer[1] = (UChar)L;
3950 tempbuffer[2] = (UChar)V;
3951 if (T != TBase) {
3952 tempbuffer[3] = (UChar)T;
3953 tempbufferLength = 4;
3954 } else {
3955 tempbufferLength = 3;
3956 }
3957 source->writableBuffer.releaseBuffer(tempbufferLength);
3958
3959 /*
3960 Indicate where to continue in main input string after exhausting
3961 the writableBuffer
3962 */
3963 if (source->pos == source->string) {
3964 source->fcdPosition = NULL;
3965 } else {
3966 source->fcdPosition = source->pos-1;
3967 }
3968
3969 source->pos = source->writableBuffer.getTerminatedBuffer() + tempbufferLength;
3970 source->origFlags = source->flags;
3971 source->flags |= UCOL_ITER_INNORMBUF;
3972 source->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
3973
3974 return(UCOL_IGNORABLE);
3975 }
3976 }
3977
3978 case IMPLICIT_TAG: /* everything that is not defined otherwise */
3979 return getPrevImplicit(ch, source);
3980
3981 // TODO: Remove CJK implicits as they are handled by the getImplicitPrimary function
3982 case CJK_IMPLICIT_TAG: /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D*/
3983 return getPrevImplicit(ch, source);
3984
3985 case SURROGATE_TAG: /* This is a surrogate pair */
3986 /* essentialy an engaged lead surrogate. */
3987 /* if you have encountered it here, it means that a */
3988 /* broken sequence was encountered and this is an error */
3989 return 0;
3990
3991 case LEAD_SURROGATE_TAG: /* D800-DBFF*/
3992 return 0; /* broken surrogate sequence */
3993
3994 case TRAIL_SURROGATE_TAG: /* DC00-DFFF*/
3995 {
3996 UChar32 cp = 0;
3997 UChar prevChar;
3998 const UChar *prev;
3999 if (isAtStartPrevIterate(source)) {
4000 /* we are at the start of the string, wrong place to be at */
4001 return 0;
4002 }
4003 if (source->pos != source->writableBuffer.getBuffer()) {
4004 prev = source->pos - 1;
4005 } else {
4006 prev = source->fcdPosition;
4007 }
4008 prevChar = *prev;
4009
4010 /* Handles Han and Supplementary characters here.*/
4011 if (U16_IS_LEAD(prevChar)) {
4012 cp = ((((uint32_t)prevChar)<<10UL)+(ch)-(((uint32_t)0xd800<<10UL)+0xdc00-0x10000));
4013 source->pos = prev;
4014 } else {
4015 return 0; /* completely ignorable */
4016 }
4017
4018 return getPrevImplicit(cp, source);
4019 }
4020
4021 /* UCA is filled with these. Tailorings are NOT_FOUND */
4022 /* not yet implemented */
4023 case CHARSET_TAG: /* this tag always returns */
4024 /* probably after 1.8 */
4025 return UCOL_NOT_FOUND;
4026
4027 default: /* this tag always returns */
4028 *status = U_INTERNAL_PROGRAM_ERROR;
4029 CE=0;
4030 break;
4031 }
4032
4033 if (CE <= UCOL_NOT_FOUND) {
4034 break;
4035 }
4036 }
4037
4038 return CE;
4039 }
4040
4041 /* This should really be a macro */
4042 /* However, it is used only when stack buffers are not sufficiently big, and then we're messed up performance wise */
4043 /* anyway */
4044 static
reallocateBuffer(uint8_t ** secondaries,uint8_t * secStart,uint8_t * second,uint32_t * secSize,uint32_t newSize,UErrorCode * status)4045 uint8_t *reallocateBuffer(uint8_t **secondaries, uint8_t *secStart, uint8_t *second, uint32_t *secSize, uint32_t newSize, UErrorCode *status) {
4046 #ifdef UCOL_DEBUG
4047 fprintf(stderr, ".");
4048 #endif
4049 uint8_t *newStart = NULL;
4050 uint32_t offset = (uint32_t)(*secondaries-secStart);
4051
4052 if(secStart==second) {
4053 newStart=(uint8_t*)uprv_malloc(newSize);
4054 if(newStart==NULL) {
4055 *status = U_MEMORY_ALLOCATION_ERROR;
4056 return NULL;
4057 }
4058 uprv_memcpy(newStart, secStart, *secondaries-secStart);
4059 } else {
4060 newStart=(uint8_t*)uprv_realloc(secStart, newSize);
4061 if(newStart==NULL) {
4062 *status = U_MEMORY_ALLOCATION_ERROR;
4063 /* Since we're reallocating, return original reference so we don't loose it. */
4064 return secStart;
4065 }
4066 }
4067 *secondaries=newStart+offset;
4068 *secSize=newSize;
4069 return newStart;
4070 }
4071
4072
4073 /* This should really be a macro */
4074 /* This function is used to reverse parts of a buffer. We need this operation when doing continuation */
4075 /* secondaries in French */
4076 /*
4077 void uprv_ucol_reverse_buffer(uint8_t *start, uint8_t *end) {
4078 uint8_t temp;
4079 while(start<end) {
4080 temp = *start;
4081 *start++ = *end;
4082 *end-- = temp;
4083 }
4084 }
4085 */
4086
/*
 * Reverses, in place, the elements of type TYPE between the pointers start and end
 * (both inclusive).
 * NOTE: start and end must be modifiable pointer variables; the macro advances start and
 * moves end backwards while swapping, so both arguments are changed by the time it finishes.
 * The expansion is a plain brace block (not do/while), so it declares its own tempA.
 */
#define uprv_ucol_reverse_buffer(TYPE, start, end) { \
    TYPE tempA; \
    while((start)<(end)) { \
        tempA = *(start); \
        *(start)++ = *(end); \
        *(end)-- = tempA; \
    } \
}
4095
4096 /****************************************************************************/
4097 /* Following are the sortkey generation functions */
4098 /* */
4099 /****************************************************************************/
4100
4101 /**
4102 * Merge two sort keys.
4103 * This is useful, for example, to combine sort keys from first and last names
4104 * to sort such pairs.
4105 * Merged sort keys consider on each collation level the first part first entirely,
4106 * then the second one.
4107 * It is possible to merge multiple sort keys by consecutively merging
4108 * another one with the intermediate result.
4109 *
4110 * The length of the merge result is the sum of the lengths of the input sort keys
4111 * minus 1.
4112 *
4113 * @param src1 the first sort key
4114 * @param src1Length the length of the first sort key, including the zero byte at the end;
4115 * can be -1 if the function is to find the length
4116 * @param src2 the second sort key
4117 * @param src2Length the length of the second sort key, including the zero byte at the end;
4118 * can be -1 if the function is to find the length
4119 * @param dest the buffer where the merged sort key is written,
4120 * can be NULL if destCapacity==0
4121 * @param destCapacity the number of bytes in the dest buffer
4122 * @return the length of the merged sort key, src1Length+src2Length-1;
4123 * can be larger than destCapacity, or 0 if an error occurs (only for illegal arguments),
4124 * in which cases the contents of dest is undefined
4125 *
4126 * @draft
4127 */
4128 U_CAPI int32_t U_EXPORT2
ucol_mergeSortkeys(const uint8_t * src1,int32_t src1Length,const uint8_t * src2,int32_t src2Length,uint8_t * dest,int32_t destCapacity)4129 ucol_mergeSortkeys(const uint8_t *src1, int32_t src1Length,
4130 const uint8_t *src2, int32_t src2Length,
4131 uint8_t *dest, int32_t destCapacity) {
4132 int32_t destLength;
4133 uint8_t b;
4134
4135 /* check arguments */
4136 if( src1==NULL || src1Length<-2 || src1Length==0 || (src1Length>0 && src1[src1Length-1]!=0) ||
4137 src2==NULL || src2Length<-2 || src2Length==0 || (src2Length>0 && src2[src2Length-1]!=0) ||
4138 destCapacity<0 || (destCapacity>0 && dest==NULL)
4139 ) {
4140 /* error, attempt to write a zero byte and return 0 */
4141 if(dest!=NULL && destCapacity>0) {
4142 *dest=0;
4143 }
4144 return 0;
4145 }
4146
4147 /* check lengths and capacity */
4148 if(src1Length<0) {
4149 src1Length=(int32_t)uprv_strlen((const char *)src1)+1;
4150 }
4151 if(src2Length<0) {
4152 src2Length=(int32_t)uprv_strlen((const char *)src2)+1;
4153 }
4154
4155 destLength=src1Length+src2Length-1;
4156 if(destLength>destCapacity) {
4157 /* the merged sort key does not fit into the destination */
4158 return destLength;
4159 }
4160
4161 /* merge the sort keys with the same number of levels */
4162 while(*src1!=0 && *src2!=0) { /* while both have another level */
4163 /* copy level from src1 not including 00 or 01 */
4164 while((b=*src1)>=2) {
4165 ++src1;
4166 *dest++=b;
4167 }
4168
4169 /* add a 02 merge separator */
4170 *dest++=2;
4171
4172 /* copy level from src2 not including 00 or 01 */
4173 while((b=*src2)>=2) {
4174 ++src2;
4175 *dest++=b;
4176 }
4177
4178 /* if both sort keys have another level, then add a 01 level separator and continue */
4179 if(*src1==1 && *src2==1) {
4180 ++src1;
4181 ++src2;
4182 *dest++=1;
4183 }
4184 }
4185
4186 /*
4187 * here, at least one sort key is finished now, but the other one
4188 * might have some contents left from containing more levels;
4189 * that contents is just appended to the result
4190 */
4191 if(*src1!=0) {
4192 /* src1 is not finished, therefore *src2==0, and src1 is appended */
4193 src2=src1;
4194 }
4195 /* append src2, "the other, unfinished sort key" */
4196 uprv_strcpy((char *)dest, (const char *)src2);
4197
4198 /* trust that neither sort key contained illegally embedded zero bytes */
4199 return destLength;
4200 }
4201
4202 /* sortkey API */
4203 U_CAPI int32_t U_EXPORT2
ucol_getSortKey(const UCollator * coll,const UChar * source,int32_t sourceLength,uint8_t * result,int32_t resultLength)4204 ucol_getSortKey(const UCollator *coll,
4205 const UChar *source,
4206 int32_t sourceLength,
4207 uint8_t *result,
4208 int32_t resultLength)
4209 {
4210 UTRACE_ENTRY(UTRACE_UCOL_GET_SORTKEY);
4211 if (UTRACE_LEVEL(UTRACE_VERBOSE)) {
4212 UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source string = %vh ", coll, source,
4213 ((sourceLength==-1 && source!=NULL) ? u_strlen(source) : sourceLength));
4214 }
4215
4216 UErrorCode status = U_ZERO_ERROR;
4217 int32_t keySize = 0;
4218
4219 if(source != NULL) {
4220 // source == NULL is actually an error situation, but we would need to
4221 // have an error code to return it. Until we introduce a new
4222 // API, it stays like this
4223
4224 /* this uses the function pointer that is set in updateinternalstate */
4225 /* currently, there are two funcs: */
4226 /*ucol_calcSortKey(...);*/
4227 /*ucol_calcSortKeySimpleTertiary(...);*/
4228
4229 keySize = coll->sortKeyGen(coll, source, sourceLength, &result, resultLength, FALSE, &status);
4230 //if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR && result && resultLength > 0) {
4231 // That's not good. Something unusual happened.
4232 // We don't know how much we initialized before we failed.
4233 // NULL terminate for safety.
4234 // We have no way say that we have generated a partial sort key.
4235 //result[0] = 0;
4236 //keySize = 0;
4237 //}
4238 }
4239 UTRACE_DATA2(UTRACE_VERBOSE, "Sort Key = %vb", result, keySize);
4240 UTRACE_EXIT_STATUS(status);
4241 return keySize;
4242 }
4243
4244 /* this function is called by the C++ API for sortkey generation */
4245 U_CFUNC int32_t
ucol_getSortKeyWithAllocation(const UCollator * coll,const UChar * source,int32_t sourceLength,uint8_t ** pResult,UErrorCode * pErrorCode)4246 ucol_getSortKeyWithAllocation(const UCollator *coll,
4247 const UChar *source, int32_t sourceLength,
4248 uint8_t **pResult,
4249 UErrorCode *pErrorCode) {
4250 *pResult = 0;
4251 return coll->sortKeyGen(coll, source, sourceLength, pResult, 0, TRUE, pErrorCode);
4252 }
4253
4254 #define UCOL_FSEC_BUF_SIZE 256
4255
4256 /* This function tries to get the size of a sortkey. It will be invoked if the size of resulting buffer is 0 */
4257 /* or if we run out of space while making a sortkey and want to return ASAP */
ucol_getSortKeySize(const UCollator * coll,collIterate * s,int32_t currentSize,UColAttributeValue strength,int32_t len)4258 int32_t ucol_getSortKeySize(const UCollator *coll, collIterate *s, int32_t currentSize, UColAttributeValue strength, int32_t len) {
4259 UErrorCode status = U_ZERO_ERROR;
4260 //const UCAConstants *UCAconsts = (UCAConstants *)((uint8_t *)coll->UCA->image + coll->image->UCAConsts);
4261 uint8_t compareSec = (uint8_t)((strength >= UCOL_SECONDARY)?0:0xFF);
4262 uint8_t compareTer = (uint8_t)((strength >= UCOL_TERTIARY)?0:0xFF);
4263 uint8_t compareQuad = (uint8_t)((strength >= UCOL_QUATERNARY)?0:0xFF);
4264 UBool compareIdent = (strength == UCOL_IDENTICAL);
4265 UBool doCase = (coll->caseLevel == UCOL_ON);
4266 UBool shifted = (coll->alternateHandling == UCOL_SHIFTED);
4267 //UBool qShifted = shifted && (compareQuad == 0);
4268 UBool doHiragana = (coll->hiraganaQ == UCOL_ON) && (compareQuad == 0);
4269 UBool isFrenchSec = (coll->frenchCollation == UCOL_ON) && (compareSec == 0);
4270 uint8_t fSecsBuff[UCOL_FSEC_BUF_SIZE];
4271 uint8_t *fSecs = fSecsBuff;
4272 uint32_t fSecsLen = 0, fSecsMaxLen = UCOL_FSEC_BUF_SIZE;
4273 uint8_t *frenchStartPtr = NULL, *frenchEndPtr = NULL;
4274
4275 uint32_t variableTopValue = coll->variableTopValue;
4276 uint8_t UCOL_COMMON_BOT4 = (uint8_t)((coll->variableTopValue>>8)+1);
4277 if(doHiragana) {
4278 UCOL_COMMON_BOT4++;
4279 /* allocate one more space for hiragana */
4280 }
4281 uint8_t UCOL_BOT_COUNT4 = (uint8_t)(0xFF - UCOL_COMMON_BOT4);
4282
4283 uint32_t order = UCOL_NO_MORE_CES;
4284 uint8_t primary1 = 0;
4285 uint8_t primary2 = 0;
4286 uint8_t secondary = 0;
4287 uint8_t tertiary = 0;
4288 int32_t caseShift = 0;
4289 uint32_t c2 = 0, c3 = 0, c4 = 0; /* variables for compression */
4290
4291 uint8_t caseSwitch = coll->caseSwitch;
4292 uint8_t tertiaryMask = coll->tertiaryMask;
4293 uint8_t tertiaryCommon = coll->tertiaryCommon;
4294
4295 UBool wasShifted = FALSE;
4296 UBool notIsContinuation = FALSE;
4297 uint8_t leadPrimary = 0;
4298
4299
4300 for(;;) {
4301 order = ucol_IGetNextCE(coll, s, &status);
4302 if(order == UCOL_NO_MORE_CES) {
4303 break;
4304 }
4305
4306 if(order == 0) {
4307 continue;
4308 }
4309
4310 notIsContinuation = !isContinuation(order);
4311
4312
4313 if(notIsContinuation) {
4314 tertiary = (uint8_t)((order & UCOL_BYTE_SIZE_MASK));
4315 } else {
4316 tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION));
4317 }
4318 secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
4319 primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
4320 primary1 = (uint8_t)(order >> 8);
4321
4322
4323 if(shifted && ((notIsContinuation && order <= variableTopValue && primary1 > 0)
4324 || (!notIsContinuation && wasShifted))
4325 || (wasShifted && primary1 == 0)) { /* amendment to the UCA says that primary ignorables */
4326 /* and other ignorables should be removed if following a shifted code point */
4327 if(primary1 == 0) { /* if we were shifted and we got an ignorable code point */
4328 /* we should just completely ignore it */
4329 continue;
4330 }
4331 if(compareQuad == 0) {
4332 if(c4 > 0) {
4333 currentSize += (c2/UCOL_BOT_COUNT4)+1;
4334 c4 = 0;
4335 }
4336 currentSize++;
4337 if(primary2 != 0) {
4338 currentSize++;
4339 }
4340 }
4341 wasShifted = TRUE;
4342 } else {
4343 wasShifted = FALSE;
4344 /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */
4345 /* Usually, we'll have non-zero primary1 & primary2, except in cases of LatinOne and friends, when primary2 will */
4346 /* calculate sortkey size */
4347 if(primary1 != UCOL_IGNORABLE) {
4348 if(notIsContinuation) {
4349 if(leadPrimary == primary1) {
4350 currentSize++;
4351 } else {
4352 if(leadPrimary != 0) {
4353 currentSize++;
4354 }
4355 if(primary2 == UCOL_IGNORABLE) {
4356 /* one byter, not compressed */
4357 currentSize++;
4358 leadPrimary = 0;
4359 }
4360 else if(primary1<UCOL_BYTE_FIRST_NON_LATIN_PRIMARY ||
4361 //(primary1 > (UCOL_RESET_TOP_VALUE>>24) && primary1 < (UCOL_NEXT_TOP_VALUE>>24))) {
4362 //(primary1 > (*UCAconsts->UCA_LAST_NON_VARIABLE>>24) && primary1 < (*UCAconsts->UCA_FIRST_IMPLICIT>>24))) {
4363 (primary1 > maxRegularPrimary && primary1 < minImplicitPrimary))
4364 {
4365 /* not compressible */
4366 leadPrimary = 0;
4367 currentSize+=2;
4368 }
4369 else { /* compress */
4370 leadPrimary = primary1;
4371 currentSize+=2;
4372 }
4373 }
4374 } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */
4375 currentSize++;
4376 if(primary2 != UCOL_IGNORABLE) {
4377 currentSize++;
4378 }
4379 }
4380 }
4381
4382 if(secondary > compareSec) { /* I think that != 0 test should be != IGNORABLE */
4383 if(!isFrenchSec){
4384 if (secondary == UCOL_COMMON2 && notIsContinuation) {
4385 c2++;
4386 } else {
4387 if(c2 > 0) {
4388 if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
4389 currentSize += (c2/(uint32_t)UCOL_TOP_COUNT2)+1;
4390 } else {
4391 currentSize += (c2/(uint32_t)UCOL_BOT_COUNT2)+1;
4392 }
4393 c2 = 0;
4394 }
4395 currentSize++;
4396 }
4397 } else {
4398 fSecs[fSecsLen++] = secondary;
4399 if(fSecsLen == fSecsMaxLen) {
4400 uint8_t *fSecsTemp;
4401 if(fSecs == fSecsBuff) {
4402 fSecsTemp = (uint8_t *)uprv_malloc(2*fSecsLen);
4403 } else {
4404 fSecsTemp = (uint8_t *)uprv_realloc(fSecs, 2*fSecsLen);
4405 }
4406 if(fSecsTemp == NULL) {
4407 status = U_MEMORY_ALLOCATION_ERROR;
4408 return 0;
4409 }
4410 fSecs = fSecsTemp;
4411 fSecsMaxLen *= 2;
4412 }
4413 if(notIsContinuation) {
4414 if (frenchStartPtr != NULL) {
4415 /* reverse secondaries from frenchStartPtr up to frenchEndPtr */
4416 uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr);
4417 frenchStartPtr = NULL;
4418 }
4419 } else {
4420 if (frenchStartPtr == NULL) {
4421 frenchStartPtr = fSecs+fSecsLen-2;
4422 }
4423 frenchEndPtr = fSecs+fSecsLen-1;
4424 }
4425 }
4426 }
4427
4428 if(doCase && (primary1 > 0 || strength >= UCOL_SECONDARY)) {
4429 // do the case level if we need to do it. We don't want to calculate
4430 // case level for primary ignorables if we have only primary strength and case level
4431 // otherwise we would break well formedness of CEs
4432 if (caseShift == 0) {
4433 currentSize++;
4434 caseShift = UCOL_CASE_SHIFT_START;
4435 }
4436 if((tertiary&0x3F) > 0 && notIsContinuation) {
4437 caseShift--;
4438 if((tertiary &0xC0) != 0) {
4439 if (caseShift == 0) {
4440 currentSize++;
4441 caseShift = UCOL_CASE_SHIFT_START;
4442 }
4443 caseShift--;
4444 }
4445 }
4446 } else {
4447 if(notIsContinuation) {
4448 tertiary ^= caseSwitch;
4449 }
4450 }
4451
4452 tertiary &= tertiaryMask;
4453 if(tertiary > compareTer) { /* I think that != 0 test should be != IGNORABLE */
4454 if (tertiary == tertiaryCommon && notIsContinuation) {
4455 c3++;
4456 } else {
4457 if(c3 > 0) {
4458 if((tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL)
4459 || (tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST)) {
4460 currentSize += (c3/(uint32_t)coll->tertiaryTopCount)+1;
4461 } else {
4462 currentSize += (c3/(uint32_t)coll->tertiaryBottomCount)+1;
4463 }
4464 c3 = 0;
4465 }
4466 currentSize++;
4467 }
4468 }
4469
4470 if(/*qShifted*/(compareQuad==0) && notIsContinuation) {
4471 if(s->flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we need to note it
4472 if(c4>0) { // Close this part
4473 currentSize += (c4/UCOL_BOT_COUNT4)+1;
4474 c4 = 0;
4475 }
4476 currentSize++; // Add the Hiragana
4477 } else { // This wasn't Hiragana, so we can continue adding stuff
4478 c4++;
4479 }
4480 }
4481 }
4482 }
4483
4484 if(!isFrenchSec){
4485 if(c2 > 0) {
4486 currentSize += (c2/(uint32_t)UCOL_BOT_COUNT2)+((c2%(uint32_t)UCOL_BOT_COUNT2 != 0)?1:0);
4487 }
4488 } else {
4489 uint32_t i = 0;
4490 if(frenchStartPtr != NULL) {
4491 uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr);
4492 }
4493 for(i = 0; i<fSecsLen; i++) {
4494 secondary = *(fSecs+fSecsLen-i-1);
4495 /* This is compression code. */
4496 if (secondary == UCOL_COMMON2) {
4497 ++c2;
4498 } else {
4499 if(c2 > 0) {
4500 if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
4501 currentSize += (c2/(uint32_t)UCOL_TOP_COUNT2)+((c2%(uint32_t)UCOL_TOP_COUNT2 != 0)?1:0);
4502 } else {
4503 currentSize += (c2/(uint32_t)UCOL_BOT_COUNT2)+((c2%(uint32_t)UCOL_BOT_COUNT2 != 0)?1:0);
4504 }
4505 c2 = 0;
4506 }
4507 currentSize++;
4508 }
4509 }
4510 if(c2 > 0) {
4511 currentSize += (c2/(uint32_t)UCOL_BOT_COUNT2)+((c2%(uint32_t)UCOL_BOT_COUNT2 != 0)?1:0);
4512 }
4513 if(fSecs != fSecsBuff) {
4514 uprv_free(fSecs);
4515 }
4516 }
4517
4518 if(c3 > 0) {
4519 currentSize += (c3/(uint32_t)coll->tertiaryBottomCount) + ((c3%(uint32_t)coll->tertiaryBottomCount != 0)?1:0);
4520 }
4521
4522 if(c4 > 0 && compareQuad == 0) {
4523 currentSize += (c4/(uint32_t)UCOL_BOT_COUNT4)+((c4%(uint32_t)UCOL_BOT_COUNT4 != 0)?1:0);
4524 }
4525
4526 if(compareIdent) {
4527 currentSize += u_lengthOfIdenticalLevelRun(s->string, len);
4528 }
4529 return currentSize;
4530 }
4531
4532 static
doCaseShift(uint8_t ** cases,uint32_t & caseShift)4533 inline void doCaseShift(uint8_t **cases, uint32_t &caseShift) {
4534 if (caseShift == 0) {
4535 *(*cases)++ = UCOL_CASE_BYTE_START;
4536 caseShift = UCOL_CASE_SHIFT_START;
4537 }
4538 }
4539
// Appends one byte to the output if there is still room before 'limit'.
// 'size' is bumped unconditionally, so after a series of calls it holds the
// number of bytes the caller wanted to write, even when some were dropped.
static
inline void addWithIncrement(uint8_t *&primaries, uint8_t *limit, uint32_t &size, const uint8_t value) {
    ++size;
    if(primaries >= limit) {
        // buffer exhausted: only the requested size is tracked
        return;
    }
    *primaries = value;
    ++primaries;
}
4549
4550 // Packs the secondary buffer when processing French locale. Adds the terminator.
4551 static
packFrench(uint8_t * primaries,uint8_t * primEnd,uint8_t * secondaries,uint32_t * secsize,uint8_t * frenchStartPtr,uint8_t * frenchEndPtr)4552 inline uint8_t *packFrench(uint8_t *primaries, uint8_t *primEnd, uint8_t *secondaries, uint32_t *secsize, uint8_t *frenchStartPtr, uint8_t *frenchEndPtr) {
4553 uint8_t secondary;
4554 int32_t count2 = 0;
4555 uint32_t i = 0, size = 0;
4556 // we use i here since the key size already accounts for terminators, so we'll discard the increment
4557 addWithIncrement(primaries, primEnd, i, UCOL_LEVELTERMINATOR);
4558 /* If there are any unresolved continuation secondaries, reverse them here so that we can reverse the whole secondary thing */
4559 if(frenchStartPtr != NULL) {
4560 uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr);
4561 }
4562 for(i = 0; i<*secsize; i++) {
4563 secondary = *(secondaries-i-1);
4564 /* This is compression code. */
4565 if (secondary == UCOL_COMMON2) {
4566 ++count2;
4567 } else {
4568 if (count2 > 0) {
4569 if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
4570 while (count2 > UCOL_TOP_COUNT2) {
4571 addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2));
4572 count2 -= (uint32_t)UCOL_TOP_COUNT2;
4573 }
4574 addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_TOP2 - (count2-1)));
4575 } else {
4576 while (count2 > UCOL_BOT_COUNT2) {
4577 addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2));
4578 count2 -= (uint32_t)UCOL_BOT_COUNT2;
4579 }
4580 addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_BOT2 + (count2-1)));
4581 }
4582 count2 = 0;
4583 }
4584 addWithIncrement(primaries, primEnd, size, secondary);
4585 }
4586 }
4587 if (count2 > 0) {
4588 while (count2 > UCOL_BOT_COUNT2) {
4589 addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2));
4590 count2 -= (uint32_t)UCOL_BOT_COUNT2;
4591 }
4592 addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_BOT2 + (count2-1)));
4593 }
4594 *secsize = size;
4595 return primaries;
4596 }
4597
4598 #define DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY 0
4599
4600 /* This is the sortkey work horse function */
4601 U_CFUNC int32_t U_CALLCONV
ucol_calcSortKey(const UCollator * coll,const UChar * source,int32_t sourceLength,uint8_t ** result,uint32_t resultLength,UBool allocateSKBuffer,UErrorCode * status)4602 ucol_calcSortKey(const UCollator *coll,
4603 const UChar *source,
4604 int32_t sourceLength,
4605 uint8_t **result,
4606 uint32_t resultLength,
4607 UBool allocateSKBuffer,
4608 UErrorCode *status)
4609 {
4610 //const UCAConstants *UCAconsts = (UCAConstants *)((uint8_t *)coll->UCA->image + coll->image->UCAConsts);
4611
4612 uint32_t i = 0; /* general purpose counter */
4613
4614 /* Stack allocated buffers for buffers we use */
4615 uint8_t prim[UCOL_PRIMARY_MAX_BUFFER], second[UCOL_SECONDARY_MAX_BUFFER], tert[UCOL_TERTIARY_MAX_BUFFER], caseB[UCOL_CASE_MAX_BUFFER], quad[UCOL_QUAD_MAX_BUFFER];
4616
4617 uint8_t *primaries = *result, *secondaries = second, *tertiaries = tert, *cases = caseB, *quads = quad;
4618
4619 if(U_FAILURE(*status)) {
4620 return 0;
4621 }
4622
4623 if(primaries == NULL && allocateSKBuffer == TRUE) {
4624 primaries = *result = prim;
4625 resultLength = UCOL_PRIMARY_MAX_BUFFER;
4626 }
4627
4628 uint32_t secSize = UCOL_SECONDARY_MAX_BUFFER, terSize = UCOL_TERTIARY_MAX_BUFFER,
4629 caseSize = UCOL_CASE_MAX_BUFFER, quadSize = UCOL_QUAD_MAX_BUFFER;
4630
4631 uint32_t sortKeySize = 1; /* it is always \0 terminated */
4632
4633 UnicodeString normSource;
4634
4635 int32_t len = (sourceLength == -1 ? u_strlen(source) : sourceLength);
4636
4637 UColAttributeValue strength = coll->strength;
4638
4639 uint8_t compareSec = (uint8_t)((strength >= UCOL_SECONDARY)?0:0xFF);
4640 uint8_t compareTer = (uint8_t)((strength >= UCOL_TERTIARY)?0:0xFF);
4641 uint8_t compareQuad = (uint8_t)((strength >= UCOL_QUATERNARY)?0:0xFF);
4642 UBool compareIdent = (strength == UCOL_IDENTICAL);
4643 UBool doCase = (coll->caseLevel == UCOL_ON);
4644 UBool isFrenchSec = (coll->frenchCollation == UCOL_ON) && (compareSec == 0);
4645 UBool shifted = (coll->alternateHandling == UCOL_SHIFTED);
4646 //UBool qShifted = shifted && (compareQuad == 0);
4647 UBool doHiragana = (coll->hiraganaQ == UCOL_ON) && (compareQuad == 0);
4648 /*const uint8_t *scriptOrder = coll->scriptOrder;*/
4649
4650 uint32_t variableTopValue = coll->variableTopValue;
4651 // TODO: UCOL_COMMON_BOT4 should be a function of qShifted. If we have no
4652 // qShifted, we don't need to set UCOL_COMMON_BOT4 so high.
4653 uint8_t UCOL_COMMON_BOT4 = (uint8_t)((coll->variableTopValue>>8)+1);
4654 uint8_t UCOL_HIRAGANA_QUAD = 0;
4655 if(doHiragana) {
4656 UCOL_HIRAGANA_QUAD=UCOL_COMMON_BOT4++;
4657 /* allocate one more space for hiragana, value for hiragana */
4658 }
4659 uint8_t UCOL_BOT_COUNT4 = (uint8_t)(0xFF - UCOL_COMMON_BOT4);
4660
4661 /* support for special features like caselevel and funky secondaries */
4662 uint8_t *frenchStartPtr = NULL;
4663 uint8_t *frenchEndPtr = NULL;
4664 uint32_t caseShift = 0;
4665
4666 sortKeySize += ((compareSec?0:1) + (compareTer?0:1) + (doCase?1:0) + /*(qShifted?1:0)*/(compareQuad?0:1) + (compareIdent?1:0));
4667
4668 /* If we need to normalize, we'll do it all at once at the beginning! */
4669 const Normalizer2 *norm2;
4670 if(compareIdent) {
4671 norm2 = Normalizer2Factory::getNFDInstance(*status);
4672 } else if(coll->normalizationMode != UCOL_OFF) {
4673 norm2 = Normalizer2Factory::getFCDInstance(*status);
4674 } else {
4675 norm2 = NULL;
4676 }
4677 if(norm2 != NULL) {
4678 normSource.setTo(FALSE, source, len);
4679 int32_t qcYesLength = norm2->spanQuickCheckYes(normSource, *status);
4680 if(qcYesLength != len) {
4681 UnicodeString unnormalized = normSource.tempSubString(qcYesLength);
4682 normSource.truncate(qcYesLength);
4683 norm2->normalizeSecondAndAppend(normSource, unnormalized, *status);
4684 source = normSource.getBuffer();
4685 len = normSource.length();
4686 }
4687 }
4688 collIterate s;
4689 IInit_collIterate(coll, source, len, &s, status);
4690 if(U_FAILURE(*status)) {
4691 return 0;
4692 }
4693 if(source == normSource.getBuffer()) {
4694 s.flags &= ~UCOL_ITER_NORM;
4695 }
4696
4697 if(resultLength == 0 || primaries == NULL) {
4698 return ucol_getSortKeySize(coll, &s, sortKeySize, strength, len);
4699 }
4700 uint8_t *primarySafeEnd = primaries + resultLength - 1;
4701 if(strength > UCOL_PRIMARY) {
4702 primarySafeEnd--;
4703 }
4704
4705 uint32_t minBufferSize = UCOL_MAX_BUFFER;
4706
4707 uint8_t *primStart = primaries;
4708 uint8_t *secStart = secondaries;
4709 uint8_t *terStart = tertiaries;
4710 uint8_t *caseStart = cases;
4711 uint8_t *quadStart = quads;
4712
4713 uint32_t order = 0;
4714
4715 uint8_t primary1 = 0;
4716 uint8_t primary2 = 0;
4717 uint8_t secondary = 0;
4718 uint8_t tertiary = 0;
4719 uint8_t caseSwitch = coll->caseSwitch;
4720 uint8_t tertiaryMask = coll->tertiaryMask;
4721 int8_t tertiaryAddition = coll->tertiaryAddition;
4722 uint8_t tertiaryTop = coll->tertiaryTop;
4723 uint8_t tertiaryBottom = coll->tertiaryBottom;
4724 uint8_t tertiaryCommon = coll->tertiaryCommon;
4725 uint8_t caseBits = 0;
4726
4727 UBool finished = FALSE;
4728 UBool wasShifted = FALSE;
4729 UBool notIsContinuation = FALSE;
4730
4731 uint32_t prevBuffSize = 0;
4732
4733 uint32_t count2 = 0, count3 = 0, count4 = 0;
4734 uint8_t leadPrimary = 0;
4735
4736 for(;;) {
4737 for(i=prevBuffSize; i<minBufferSize; ++i) {
4738
4739 order = ucol_IGetNextCE(coll, &s, status);
4740 if(order == UCOL_NO_MORE_CES) {
4741 finished = TRUE;
4742 break;
4743 }
4744
4745 if(order == 0) {
4746 continue;
4747 }
4748
4749 notIsContinuation = !isContinuation(order);
4750
4751 if(notIsContinuation) {
4752 tertiary = (uint8_t)(order & UCOL_BYTE_SIZE_MASK);
4753 } else {
4754 tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION));
4755 }
4756
4757 secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
4758 primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
4759 primary1 = (uint8_t)(order >> 8);
4760
4761 /*if(notIsContinuation && scriptOrder != NULL) {
4762 primary1 = scriptOrder[primary1];
4763 }*/
4764
4765 if(shifted && ((notIsContinuation && order <= variableTopValue && primary1 > 0)
4766 || (!notIsContinuation && wasShifted))
4767 || (wasShifted && primary1 == 0)) /* amendment to the UCA says that primary ignorables */
4768 {
4769 /* and other ignorables should be removed if following a shifted code point */
4770 if(primary1 == 0) { /* if we were shifted and we got an ignorable code point */
4771 /* we should just completely ignore it */
4772 continue;
4773 }
4774 if(compareQuad == 0) {
4775 if(count4 > 0) {
4776 while (count4 > UCOL_BOT_COUNT4) {
4777 *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4);
4778 count4 -= UCOL_BOT_COUNT4;
4779 }
4780 *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + (count4-1));
4781 count4 = 0;
4782 }
4783 /* We are dealing with a variable and we're treating them as shifted */
4784 /* This is a shifted ignorable */
4785 if(primary1 != 0) { /* we need to check this since we could be in continuation */
4786 *quads++ = primary1;
4787 }
4788 if(primary2 != 0) {
4789 *quads++ = primary2;
4790 }
4791 }
4792 wasShifted = TRUE;
4793 } else {
4794 wasShifted = FALSE;
4795 /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */
4796 /* Usually, we'll have non-zero primary1 & primary2, except in cases of LatinOne and friends, when primary2 will */
4797 /* regular and simple sortkey calc */
4798 if(primary1 != UCOL_IGNORABLE) {
4799 if(notIsContinuation) {
4800 if(leadPrimary == primary1) {
4801 *primaries++ = primary2;
4802 } else {
4803 if(leadPrimary != 0) {
4804 *primaries++ = (uint8_t)((primary1 > leadPrimary) ? UCOL_BYTE_UNSHIFTED_MAX : UCOL_BYTE_UNSHIFTED_MIN);
4805 }
4806 if(primary2 == UCOL_IGNORABLE) {
4807 /* one byter, not compressed */
4808 *primaries++ = primary1;
4809 leadPrimary = 0;
4810 } else if(primary1<UCOL_BYTE_FIRST_NON_LATIN_PRIMARY ||
4811 //(primary1 > (*UCAconsts->UCA_LAST_NON_VARIABLE>>24) && primary1 < (*UCAconsts->UCA_FIRST_IMPLICIT>>24))) {
4812 (primary1 > maxRegularPrimary && primary1 < minImplicitPrimary)) {
4813 /* not compressible */
4814 leadPrimary = 0;
4815 *primaries++ = primary1;
4816 if(primaries <= primarySafeEnd) {
4817 *primaries++ = primary2;
4818 }
4819 } else { /* compress */
4820 *primaries++ = leadPrimary = primary1;
4821 if(primaries <= primarySafeEnd) {
4822 *primaries++ = primary2;
4823 }
4824 }
4825 }
4826 } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */
4827 *primaries++ = primary1;
4828 if((primary2 != UCOL_IGNORABLE) && (primaries <= primarySafeEnd)) {
4829 *primaries++ = primary2; /* second part */
4830 }
4831 }
4832 }
4833
4834 if(secondary > compareSec) {
4835 if(!isFrenchSec) {
4836 /* This is compression code. */
4837 if (secondary == UCOL_COMMON2 && notIsContinuation) {
4838 ++count2;
4839 } else {
4840 if (count2 > 0) {
4841 if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
4842 while (count2 > UCOL_TOP_COUNT2) {
4843 *secondaries++ = (uint8_t)(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2);
4844 count2 -= (uint32_t)UCOL_TOP_COUNT2;
4845 }
4846 *secondaries++ = (uint8_t)(UCOL_COMMON_TOP2 - (count2-1));
4847 } else {
4848 while (count2 > UCOL_BOT_COUNT2) {
4849 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
4850 count2 -= (uint32_t)UCOL_BOT_COUNT2;
4851 }
4852 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + (count2-1));
4853 }
4854 count2 = 0;
4855 }
4856 *secondaries++ = secondary;
4857 }
4858 } else {
4859 *secondaries++ = secondary;
4860 /* Do the special handling for French secondaries */
4861 /* We need to get continuation elements and do intermediate restore */
4862 /* abc1c2c3de with french secondaries need to be edc1c2c3ba NOT edc3c2c1ba */
4863 if(notIsContinuation) {
4864 if (frenchStartPtr != NULL) {
4865 /* reverse secondaries from frenchStartPtr up to frenchEndPtr */
4866 uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr);
4867 frenchStartPtr = NULL;
4868 }
4869 } else {
4870 if (frenchStartPtr == NULL) {
4871 frenchStartPtr = secondaries - 2;
4872 }
4873 frenchEndPtr = secondaries-1;
4874 }
4875 }
4876 }
4877
4878 if(doCase && (primary1 > 0 || strength >= UCOL_SECONDARY)) {
4879 // do the case level if we need to do it. We don't want to calculate
4880 // case level for primary ignorables if we have only primary strength and case level
4881 // otherwise we would break well formedness of CEs
4882 doCaseShift(&cases, caseShift);
4883 if(notIsContinuation) {
4884 caseBits = (uint8_t)(tertiary & 0xC0);
4885
4886 if(tertiary != 0) {
4887 if(coll->caseFirst == UCOL_UPPER_FIRST) {
4888 if((caseBits & 0xC0) == 0) {
4889 *(cases-1) |= 1 << (--caseShift);
4890 } else {
4891 *(cases-1) |= 0 << (--caseShift);
4892 /* second bit */
4893 doCaseShift(&cases, caseShift);
4894 *(cases-1) |= ((caseBits>>6)&1) << (--caseShift);
4895 }
4896 } else {
4897 if((caseBits & 0xC0) == 0) {
4898 *(cases-1) |= 0 << (--caseShift);
4899 } else {
4900 *(cases-1) |= 1 << (--caseShift);
4901 /* second bit */
4902 doCaseShift(&cases, caseShift);
4903 *(cases-1) |= ((caseBits>>7)&1) << (--caseShift);
4904 }
4905 }
4906 }
4907
4908 }
4909 } else {
4910 if(notIsContinuation) {
4911 tertiary ^= caseSwitch;
4912 }
4913 }
4914
4915 tertiary &= tertiaryMask;
4916 if(tertiary > compareTer) {
4917 /* This is compression code. */
4918 /* sequence size check is included in the if clause */
4919 if (tertiary == tertiaryCommon && notIsContinuation) {
4920 ++count3;
4921 } else {
4922 if(tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL) {
4923 tertiary += tertiaryAddition;
4924 } else if(tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST) {
4925 tertiary -= tertiaryAddition;
4926 }
4927 if (count3 > 0) {
4928 if ((tertiary > tertiaryCommon)) {
4929 while (count3 > coll->tertiaryTopCount) {
4930 *tertiaries++ = (uint8_t)(tertiaryTop - coll->tertiaryTopCount);
4931 count3 -= (uint32_t)coll->tertiaryTopCount;
4932 }
4933 *tertiaries++ = (uint8_t)(tertiaryTop - (count3-1));
4934 } else {
4935 while (count3 > coll->tertiaryBottomCount) {
4936 *tertiaries++ = (uint8_t)(tertiaryBottom + coll->tertiaryBottomCount);
4937 count3 -= (uint32_t)coll->tertiaryBottomCount;
4938 }
4939 *tertiaries++ = (uint8_t)(tertiaryBottom + (count3-1));
4940 }
4941 count3 = 0;
4942 }
4943 *tertiaries++ = tertiary;
4944 }
4945 }
4946
4947 if(/*qShifted*/(compareQuad==0) && notIsContinuation) {
4948 if(s.flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we need to note it
4949 if(count4>0) { // Close this part
4950 while (count4 > UCOL_BOT_COUNT4) {
4951 *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4);
4952 count4 -= UCOL_BOT_COUNT4;
4953 }
4954 *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + (count4-1));
4955 count4 = 0;
4956 }
4957 *quads++ = UCOL_HIRAGANA_QUAD; // Add the Hiragana
4958 } else { // This wasn't Hiragana, so we can continue adding stuff
4959 count4++;
4960 }
4961 }
4962 }
4963
4964 if(primaries > primarySafeEnd) { /* We have stepped over the primary buffer */
4965 if(allocateSKBuffer == FALSE) { /* need to save our butts if we cannot reallocate */
4966 IInit_collIterate(coll, (UChar *)source, len, &s, status);
4967 if(U_FAILURE(*status)) {
4968 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
4969 finished = TRUE;
4970 break;
4971 }
4972 if(source == normSource.getBuffer()) {
4973 s.flags &= ~UCOL_ITER_NORM;
4974 }
4975 sortKeySize = ucol_getSortKeySize(coll, &s, sortKeySize, strength, len);
4976 *status = U_BUFFER_OVERFLOW_ERROR;
4977 finished = TRUE;
4978 break;
4979 } else { /* It's much nicer if we can actually reallocate */
4980 int32_t sks = sortKeySize+(int32_t)((primaries - primStart)+(secondaries - secStart)+(tertiaries - terStart)+(cases-caseStart)+(quads-quadStart));
4981 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sks, status);
4982 if(U_SUCCESS(*status)) {
4983 *result = primStart;
4984 primarySafeEnd = primStart + resultLength - 1;
4985 if(strength > UCOL_PRIMARY) {
4986 primarySafeEnd--;
4987 }
4988 } else {
4989 /* We ran out of memory!? We can't recover. */
4990 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
4991 finished = TRUE;
4992 break;
4993 }
4994 }
4995 }
4996 }
4997 if(finished) {
4998 break;
4999 } else {
5000 prevBuffSize = minBufferSize;
5001
5002 uint32_t frenchStartOffset = 0, frenchEndOffset = 0;
5003 if (frenchStartPtr != NULL) {
5004 frenchStartOffset = (uint32_t)(frenchStartPtr - secStart);
5005 frenchEndOffset = (uint32_t)(frenchEndPtr - secStart);
5006 }
5007 secStart = reallocateBuffer(&secondaries, secStart, second, &secSize, 2*secSize, status);
5008 terStart = reallocateBuffer(&tertiaries, terStart, tert, &terSize, 2*terSize, status);
5009 caseStart = reallocateBuffer(&cases, caseStart, caseB, &caseSize, 2*caseSize, status);
5010 quadStart = reallocateBuffer(&quads, quadStart, quad, &quadSize, 2*quadSize, status);
5011 if(U_FAILURE(*status)) {
5012 /* We ran out of memory!? We can't recover. */
5013 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
5014 break;
5015 }
5016 if (frenchStartPtr != NULL) {
5017 frenchStartPtr = secStart + frenchStartOffset;
5018 frenchEndPtr = secStart + frenchEndOffset;
5019 }
5020 minBufferSize *= 2;
5021 }
5022 }
5023
5024 /* Here, we are generally done with processing */
5025 /* bailing out would not be too productive */
5026
5027 if(U_SUCCESS(*status)) {
5028 sortKeySize += (uint32_t)(primaries - primStart);
5029 /* we have done all the CE's, now let's put them together to form a key */
5030 if(compareSec == 0) {
5031 if (count2 > 0) {
5032 while (count2 > UCOL_BOT_COUNT2) {
5033 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
5034 count2 -= (uint32_t)UCOL_BOT_COUNT2;
5035 }
5036 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + (count2-1));
5037 }
5038 uint32_t secsize = (uint32_t)(secondaries-secStart);
5039 if(!isFrenchSec) { // Regular situation, we know the length of secondaries
5040 sortKeySize += secsize;
5041 if(sortKeySize <= resultLength) {
5042 *(primaries++) = UCOL_LEVELTERMINATOR;
5043 uprv_memcpy(primaries, secStart, secsize);
5044 primaries += secsize;
5045 } else {
5046 if(allocateSKBuffer == TRUE) { /* need to save our butts if we cannot reallocate */
5047 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
5048 if(U_SUCCESS(*status)) {
5049 *result = primStart;
5050 *(primaries++) = UCOL_LEVELTERMINATOR;
5051 uprv_memcpy(primaries, secStart, secsize);
5052 primaries += secsize;
5053 }
5054 else {
5055 /* We ran out of memory!? We can't recover. */
5056 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
5057 goto cleanup;
5058 }
5059 } else {
5060 *status = U_BUFFER_OVERFLOW_ERROR;
5061 }
5062 }
5063 } else { // French secondary is on. We will need to pack French. packFrench will add the level terminator
5064 uint8_t *newPrim = packFrench(primaries, primStart+resultLength, secondaries, &secsize, frenchStartPtr, frenchEndPtr);
5065 sortKeySize += secsize;
5066 if(sortKeySize <= resultLength) { // if we managed to pack fine
5067 primaries = newPrim; // update the primary pointer
5068 } else { // overflow, need to reallocate and redo
5069 if(allocateSKBuffer == TRUE) { /* need to save our butts if we cannot reallocate */
5070 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
5071 if(U_SUCCESS(*status)) {
5072 primaries = packFrench(primaries, primStart+resultLength, secondaries, &secsize, frenchStartPtr, frenchEndPtr);
5073 }
5074 else {
5075 /* We ran out of memory!? We can't recover. */
5076 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
5077 goto cleanup;
5078 }
5079 } else {
5080 *status = U_BUFFER_OVERFLOW_ERROR;
5081 }
5082 }
5083 }
5084 }
5085
5086 if(doCase) {
5087 uint32_t casesize = (uint32_t)(cases - caseStart);
5088 sortKeySize += casesize;
5089 if(sortKeySize <= resultLength) {
5090 *(primaries++) = UCOL_LEVELTERMINATOR;
5091 uprv_memcpy(primaries, caseStart, casesize);
5092 primaries += casesize;
5093 } else {
5094 if(allocateSKBuffer == TRUE) {
5095 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
5096 if(U_SUCCESS(*status)) {
5097 *result = primStart;
5098 *(primaries++) = UCOL_LEVELTERMINATOR;
5099 uprv_memcpy(primaries, caseStart, casesize);
5100 }
5101 else {
5102 /* We ran out of memory!? We can't recover. */
5103 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
5104 goto cleanup;
5105 }
5106 } else {
5107 *status = U_BUFFER_OVERFLOW_ERROR;
5108 }
5109 }
5110 }
5111
5112 if(compareTer == 0) {
5113 if (count3 > 0) {
5114 if (coll->tertiaryCommon != UCOL_COMMON_BOT3) {
5115 while (count3 >= coll->tertiaryTopCount) {
5116 *tertiaries++ = (uint8_t)(tertiaryTop - coll->tertiaryTopCount);
5117 count3 -= (uint32_t)coll->tertiaryTopCount;
5118 }
5119 *tertiaries++ = (uint8_t)(tertiaryTop - count3);
5120 } else {
5121 while (count3 > coll->tertiaryBottomCount) {
5122 *tertiaries++ = (uint8_t)(tertiaryBottom + coll->tertiaryBottomCount);
5123 count3 -= (uint32_t)coll->tertiaryBottomCount;
5124 }
5125 *tertiaries++ = (uint8_t)(tertiaryBottom + (count3-1));
5126 }
5127 }
5128 uint32_t tersize = (uint32_t)(tertiaries - terStart);
5129 sortKeySize += tersize;
5130 if(sortKeySize <= resultLength) {
5131 *(primaries++) = UCOL_LEVELTERMINATOR;
5132 uprv_memcpy(primaries, terStart, tersize);
5133 primaries += tersize;
5134 } else {
5135 if(allocateSKBuffer == TRUE) {
5136 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
5137 if(U_SUCCESS(*status)) {
5138 *result = primStart;
5139 *(primaries++) = UCOL_LEVELTERMINATOR;
5140 uprv_memcpy(primaries, terStart, tersize);
5141 }
5142 else {
5143 /* We ran out of memory!? We can't recover. */
5144 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
5145 goto cleanup;
5146 }
5147 } else {
5148 *status = U_BUFFER_OVERFLOW_ERROR;
5149 }
5150 }
5151
5152 if(compareQuad == 0/*qShifted == TRUE*/) {
5153 if(count4 > 0) {
5154 while (count4 > UCOL_BOT_COUNT4) {
5155 *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4);
5156 count4 -= UCOL_BOT_COUNT4;
5157 }
5158 *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + (count4-1));
5159 }
5160 uint32_t quadsize = (uint32_t)(quads - quadStart);
5161 sortKeySize += quadsize;
5162 if(sortKeySize <= resultLength) {
5163 *(primaries++) = UCOL_LEVELTERMINATOR;
5164 uprv_memcpy(primaries, quadStart, quadsize);
5165 primaries += quadsize;
5166 } else {
5167 if(allocateSKBuffer == TRUE) {
5168 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
5169 if(U_SUCCESS(*status)) {
5170 *result = primStart;
5171 *(primaries++) = UCOL_LEVELTERMINATOR;
5172 uprv_memcpy(primaries, quadStart, quadsize);
5173 }
5174 else {
5175 /* We ran out of memory!? We can't recover. */
5176 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
5177 goto cleanup;
5178 }
5179 } else {
5180 *status = U_BUFFER_OVERFLOW_ERROR;
5181 }
5182 }
5183 }
5184
5185 if(compareIdent) {
5186 sortKeySize += u_lengthOfIdenticalLevelRun(s.string, len);
5187 if(sortKeySize <= resultLength) {
5188 *(primaries++) = UCOL_LEVELTERMINATOR;
5189 primaries += u_writeIdenticalLevelRun(s.string, len, primaries);
5190 } else {
5191 if(allocateSKBuffer == TRUE) {
5192 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, sortKeySize, status);
5193 if(U_SUCCESS(*status)) {
5194 *result = primStart;
5195 *(primaries++) = UCOL_LEVELTERMINATOR;
5196 u_writeIdenticalLevelRun(s.string, len, primaries);
5197 }
5198 else {
5199 /* We ran out of memory!? We can't recover. */
5200 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
5201 goto cleanup;
5202 }
5203 } else {
5204 *status = U_BUFFER_OVERFLOW_ERROR;
5205 }
5206 }
5207 }
5208 }
5209 *(primaries++) = '\0';
5210 }
5211
5212 if(allocateSKBuffer == TRUE) {
5213 *result = (uint8_t*)uprv_malloc(sortKeySize);
5214 /* test for NULL */
5215 if (*result == NULL) {
5216 *status = U_MEMORY_ALLOCATION_ERROR;
5217 goto cleanup;
5218 }
5219 uprv_memcpy(*result, primStart, sortKeySize);
5220 if(primStart != prim) {
5221 uprv_free(primStart);
5222 }
5223 }
5224
5225 cleanup:
5226 if (allocateSKBuffer == FALSE && resultLength > 0 && U_FAILURE(*status) && *status != U_BUFFER_OVERFLOW_ERROR) {
5227 /* NULL terminate for safety */
5228 **result = 0;
5229 }
5230 if(terStart != tert) {
5231 uprv_free(terStart);
5232 uprv_free(secStart);
5233 uprv_free(caseStart);
5234 uprv_free(quadStart);
5235 }
5236
5237 /* To avoid memory leak, free the offset buffer if necessary. */
5238 ucol_freeOffsetBuffer(&s);
5239
5240 return sortKeySize;
5241 }
5242
5243
5244 U_CFUNC int32_t U_CALLCONV
ucol_calcSortKeySimpleTertiary(const UCollator * coll,const UChar * source,int32_t sourceLength,uint8_t ** result,uint32_t resultLength,UBool allocateSKBuffer,UErrorCode * status)5245 ucol_calcSortKeySimpleTertiary(const UCollator *coll,
5246 const UChar *source,
5247 int32_t sourceLength,
5248 uint8_t **result,
5249 uint32_t resultLength,
5250 UBool allocateSKBuffer,
5251 UErrorCode *status)
5252 {
5253 U_ALIGN_CODE(16);
5254
5255 //const UCAConstants *UCAconsts = (UCAConstants *)((uint8_t *)coll->UCA->image + coll->image->UCAConsts);
5256 uint32_t i = 0; /* general purpose counter */
5257
5258 /* Stack allocated buffers for buffers we use */
5259 uint8_t prim[UCOL_PRIMARY_MAX_BUFFER], second[UCOL_SECONDARY_MAX_BUFFER], tert[UCOL_TERTIARY_MAX_BUFFER];
5260
5261 uint8_t *primaries = *result, *secondaries = second, *tertiaries = tert;
5262
5263 if(U_FAILURE(*status)) {
5264 return 0;
5265 }
5266
5267 if(primaries == NULL && allocateSKBuffer == TRUE) {
5268 primaries = *result = prim;
5269 resultLength = UCOL_PRIMARY_MAX_BUFFER;
5270 }
5271
5272 uint32_t secSize = UCOL_SECONDARY_MAX_BUFFER, terSize = UCOL_TERTIARY_MAX_BUFFER;
5273
5274 uint32_t sortKeySize = 3; /* it is always \0 terminated plus separators for secondary and tertiary */
5275
5276 UnicodeString normSource;
5277
5278 int32_t len = sourceLength;
5279
5280 /* If we need to normalize, we'll do it all at once at the beginning! */
5281 if(coll->normalizationMode != UCOL_OFF) {
5282 normSource.setTo(len < 0, source, len);
5283 const Normalizer2 *norm2 = Normalizer2Factory::getFCDInstance(*status);
5284 int32_t qcYesLength = norm2->spanQuickCheckYes(normSource, *status);
5285 if(qcYesLength != normSource.length()) {
5286 UnicodeString unnormalized = normSource.tempSubString(qcYesLength);
5287 normSource.truncate(qcYesLength);
5288 norm2->normalizeSecondAndAppend(normSource, unnormalized, *status);
5289 source = normSource.getBuffer();
5290 len = normSource.length();
5291 }
5292 }
5293 collIterate s;
5294 IInit_collIterate(coll, (UChar *)source, len, &s, status);
5295 if(U_FAILURE(*status)) {
5296 return 0;
5297 }
5298 if(source == normSource.getBuffer()) {
5299 s.flags &= ~UCOL_ITER_NORM;
5300 }
5301
5302 if(resultLength == 0 || primaries == NULL) {
5303 return ucol_getSortKeySize(coll, &s, sortKeySize, coll->strength, len);
5304 }
5305
5306 uint8_t *primarySafeEnd = primaries + resultLength - 2;
5307
5308 uint32_t minBufferSize = UCOL_MAX_BUFFER;
5309
5310 uint8_t *primStart = primaries;
5311 uint8_t *secStart = secondaries;
5312 uint8_t *terStart = tertiaries;
5313
5314 uint32_t order = 0;
5315
5316 uint8_t primary1 = 0;
5317 uint8_t primary2 = 0;
5318 uint8_t secondary = 0;
5319 uint8_t tertiary = 0;
5320 uint8_t caseSwitch = coll->caseSwitch;
5321 uint8_t tertiaryMask = coll->tertiaryMask;
5322 int8_t tertiaryAddition = coll->tertiaryAddition;
5323 uint8_t tertiaryTop = coll->tertiaryTop;
5324 uint8_t tertiaryBottom = coll->tertiaryBottom;
5325 uint8_t tertiaryCommon = coll->tertiaryCommon;
5326
5327 uint32_t prevBuffSize = 0;
5328
5329 UBool finished = FALSE;
5330 UBool notIsContinuation = FALSE;
5331
5332 uint32_t count2 = 0, count3 = 0;
5333 uint8_t leadPrimary = 0;
5334
5335 for(;;) {
5336 for(i=prevBuffSize; i<minBufferSize; ++i) {
5337
5338 order = ucol_IGetNextCE(coll, &s, status);
5339
5340 if(order == 0) {
5341 continue;
5342 }
5343
5344 if(order == UCOL_NO_MORE_CES) {
5345 finished = TRUE;
5346 break;
5347 }
5348
5349 notIsContinuation = !isContinuation(order);
5350
5351 if(notIsContinuation) {
5352 tertiary = (uint8_t)((order & tertiaryMask));
5353 } else {
5354 tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION));
5355 }
5356 secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
5357 primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
5358 primary1 = (uint8_t)(order >> 8);
5359
5360 /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */
5361 /* Usually, we'll have non-zero primary1 & primary2, except in cases of LatinOne and friends, when primary2 will */
5362 /* be zero with non zero primary1. primary3 is different than 0 only for long primaries - see above. */
5363 /* regular and simple sortkey calc */
5364 if(primary1 != UCOL_IGNORABLE) {
5365 if(notIsContinuation) {
5366 if(leadPrimary == primary1) {
5367 *primaries++ = primary2;
5368 } else {
5369 if(leadPrimary != 0) {
5370 *primaries++ = (uint8_t)((primary1 > leadPrimary) ? UCOL_BYTE_UNSHIFTED_MAX : UCOL_BYTE_UNSHIFTED_MIN);
5371 }
5372 if(primary2 == UCOL_IGNORABLE) {
5373 /* one byter, not compressed */
5374 *primaries++ = primary1;
5375 leadPrimary = 0;
5376 } else if(primary1<UCOL_BYTE_FIRST_NON_LATIN_PRIMARY ||
5377 //(primary1 > (UCOL_RESET_TOP_VALUE>>24) && primary1 < (UCOL_NEXT_TOP_VALUE>>24)))
5378 //(primary1 > (*UCAconsts->UCA_LAST_NON_VARIABLE>>24) && primary1 < (*UCAconsts->UCA_FIRST_IMPLICIT>>24))) {
5379 (primary1 > maxRegularPrimary && primary1 < minImplicitPrimary)) {
5380 /* not compressible */
5381 leadPrimary = 0;
5382 *primaries++ = primary1;
5383 *primaries++ = primary2;
5384 } else { /* compress */
5385 *primaries++ = leadPrimary = primary1;
5386 *primaries++ = primary2;
5387 }
5388 }
5389 } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */
5390 *primaries++ = primary1;
5391 if(primary2 != UCOL_IGNORABLE) {
5392 *primaries++ = primary2; /* second part */
5393 }
5394 }
5395 }
5396
5397 if(secondary > 0) { /* I think that != 0 test should be != IGNORABLE */
5398 /* This is compression code. */
5399 if (secondary == UCOL_COMMON2 && notIsContinuation) {
5400 ++count2;
5401 } else {
5402 if (count2 > 0) {
5403 if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
5404 while (count2 > UCOL_TOP_COUNT2) {
5405 *secondaries++ = (uint8_t)(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2);
5406 count2 -= (uint32_t)UCOL_TOP_COUNT2;
5407 }
5408 *secondaries++ = (uint8_t)(UCOL_COMMON_TOP2 - (count2-1));
5409 } else {
5410 while (count2 > UCOL_BOT_COUNT2) {
5411 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
5412 count2 -= (uint32_t)UCOL_BOT_COUNT2;
5413 }
5414 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + (count2-1));
5415 }
5416 count2 = 0;
5417 }
5418 *secondaries++ = secondary;
5419 }
5420 }
5421
5422 if(notIsContinuation) {
5423 tertiary ^= caseSwitch;
5424 }
5425
5426 if(tertiary > 0) {
5427 /* This is compression code. */
5428 /* sequence size check is included in the if clause */
5429 if (tertiary == tertiaryCommon && notIsContinuation) {
5430 ++count3;
5431 } else {
5432 if(tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL) {
5433 tertiary += tertiaryAddition;
5434 } else if (tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST) {
5435 tertiary -= tertiaryAddition;
5436 }
5437 if (count3 > 0) {
5438 if ((tertiary > tertiaryCommon)) {
5439 while (count3 > coll->tertiaryTopCount) {
5440 *tertiaries++ = (uint8_t)(tertiaryTop - coll->tertiaryTopCount);
5441 count3 -= (uint32_t)coll->tertiaryTopCount;
5442 }
5443 *tertiaries++ = (uint8_t)(tertiaryTop - (count3-1));
5444 } else {
5445 while (count3 > coll->tertiaryBottomCount) {
5446 *tertiaries++ = (uint8_t)(tertiaryBottom + coll->tertiaryBottomCount);
5447 count3 -= (uint32_t)coll->tertiaryBottomCount;
5448 }
5449 *tertiaries++ = (uint8_t)(tertiaryBottom + (count3-1));
5450 }
5451 count3 = 0;
5452 }
5453 *tertiaries++ = tertiary;
5454 }
5455 }
5456
5457 if(primaries > primarySafeEnd) { /* We have stepped over the primary buffer */
5458 if(allocateSKBuffer == FALSE) { /* need to save our butts if we cannot reallocate */
5459 IInit_collIterate(coll, (UChar *)source, len, &s, status);
5460 if(U_FAILURE(*status)) {
5461 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
5462 finished = TRUE;
5463 break;
5464 }
5465 if(source == normSource.getBuffer()) {
5466 s.flags &= ~UCOL_ITER_NORM;
5467 }
5468 sortKeySize = ucol_getSortKeySize(coll, &s, sortKeySize, coll->strength, len);
5469 *status = U_BUFFER_OVERFLOW_ERROR;
5470 finished = TRUE;
5471 break;
5472 } else { /* It's much nicer if we can actually reallocate */
5473 int32_t sks = sortKeySize+(int32_t)((primaries - primStart)+(secondaries - secStart)+(tertiaries - terStart));
5474 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sks, status);
5475 if(U_SUCCESS(*status)) {
5476 *result = primStart;
5477 primarySafeEnd = primStart + resultLength - 2;
5478 } else {
5479 /* We ran out of memory!? We can't recover. */
5480 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
5481 finished = TRUE;
5482 break;
5483 }
5484 }
5485 }
5486 }
5487 if(finished) {
5488 break;
5489 } else {
5490 prevBuffSize = minBufferSize;
5491 secStart = reallocateBuffer(&secondaries, secStart, second, &secSize, 2*secSize, status);
5492 terStart = reallocateBuffer(&tertiaries, terStart, tert, &terSize, 2*terSize, status);
5493 minBufferSize *= 2;
5494 if(U_FAILURE(*status)) { // if we cannot reallocate buffers, we can at least give the sortkey size
5495 /* We ran out of memory!? We can't recover. */
5496 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
5497 break;
5498 }
5499 }
5500 }
5501
5502 if(U_SUCCESS(*status)) {
5503 sortKeySize += (uint32_t)(primaries - primStart);
5504 /* we have done all the CE's, now let's put them together to form a key */
5505 if (count2 > 0) {
5506 while (count2 > UCOL_BOT_COUNT2) {
5507 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
5508 count2 -= (uint32_t)UCOL_BOT_COUNT2;
5509 }
5510 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + (count2-1));
5511 }
5512 uint32_t secsize = (uint32_t)(secondaries-secStart);
5513 sortKeySize += secsize;
5514 if(sortKeySize <= resultLength) {
5515 *(primaries++) = UCOL_LEVELTERMINATOR;
5516 uprv_memcpy(primaries, secStart, secsize);
5517 primaries += secsize;
5518 } else {
5519 if(allocateSKBuffer == TRUE) {
5520 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
5521 if(U_SUCCESS(*status)) {
5522 *(primaries++) = UCOL_LEVELTERMINATOR;
5523 *result = primStart;
5524 uprv_memcpy(primaries, secStart, secsize);
5525 }
5526 else {
5527 /* We ran out of memory!? We can't recover. */
5528 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
5529 goto cleanup;
5530 }
5531 } else {
5532 *status = U_BUFFER_OVERFLOW_ERROR;
5533 }
5534 }
5535
5536 if (count3 > 0) {
5537 if (coll->tertiaryCommon != UCOL_COMMON3_NORMAL) {
5538 while (count3 >= coll->tertiaryTopCount) {
5539 *tertiaries++ = (uint8_t)(tertiaryTop - coll->tertiaryTopCount);
5540 count3 -= (uint32_t)coll->tertiaryTopCount;
5541 }
5542 *tertiaries++ = (uint8_t)(tertiaryTop - count3);
5543 } else {
5544 while (count3 > coll->tertiaryBottomCount) {
5545 *tertiaries++ = (uint8_t)(tertiaryBottom + coll->tertiaryBottomCount);
5546 count3 -= (uint32_t)coll->tertiaryBottomCount;
5547 }
5548 *tertiaries++ = (uint8_t)(tertiaryBottom + (count3-1));
5549 }
5550 }
5551 uint32_t tersize = (uint32_t)(tertiaries - terStart);
5552 sortKeySize += tersize;
5553 if(sortKeySize <= resultLength) {
5554 *(primaries++) = UCOL_LEVELTERMINATOR;
5555 uprv_memcpy(primaries, terStart, tersize);
5556 primaries += tersize;
5557 } else {
5558 if(allocateSKBuffer == TRUE) {
5559 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
5560 if(U_SUCCESS(*status)) {
5561 *result = primStart;
5562 *(primaries++) = UCOL_LEVELTERMINATOR;
5563 uprv_memcpy(primaries, terStart, tersize);
5564 }
5565 else {
5566 /* We ran out of memory!? We can't recover. */
5567 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
5568 goto cleanup;
5569 }
5570 } else {
5571 *status = U_MEMORY_ALLOCATION_ERROR;
5572 }
5573 }
5574
5575 *(primaries++) = '\0';
5576 }
5577
5578 if(allocateSKBuffer == TRUE) {
5579 *result = (uint8_t*)uprv_malloc(sortKeySize);
5580 /* test for NULL */
5581 if (*result == NULL) {
5582 *status = U_MEMORY_ALLOCATION_ERROR;
5583 goto cleanup;
5584 }
5585 uprv_memcpy(*result, primStart, sortKeySize);
5586 if(primStart != prim) {
5587 uprv_free(primStart);
5588 }
5589 }
5590
5591 cleanup:
5592 if (allocateSKBuffer == FALSE && resultLength > 0 && U_FAILURE(*status) && *status != U_BUFFER_OVERFLOW_ERROR) {
5593 /* NULL terminate for safety */
5594 **result = 0;
5595 }
5596 if(terStart != tert) {
5597 uprv_free(terStart);
5598 uprv_free(secStart);
5599 }
5600
5601 /* To avoid memory leak, free the offset buffer if necessary. */
5602 ucol_freeOffsetBuffer(&s);
5603
5604 return sortKeySize;
5605 }
5606
5607 static inline
isShiftedCE(uint32_t CE,uint32_t LVT,UBool * wasShifted)5608 UBool isShiftedCE(uint32_t CE, uint32_t LVT, UBool *wasShifted) {
5609 UBool notIsContinuation = !isContinuation(CE);
5610 uint8_t primary1 = (uint8_t)((CE >> 24) & 0xFF);
5611 if(LVT && ((notIsContinuation && (CE & 0xFFFF0000)<= LVT && primary1 > 0)
5612 || (!notIsContinuation && *wasShifted))
5613 || (*wasShifted && primary1 == 0)) /* amendment to the UCA says that primary ignorables */
5614 {
5615 // The stuff below should probably be in the sortkey code... maybe not...
5616 if(primary1 != 0) { /* if we were shifted and we got an ignorable code point */
5617 /* we should just completely ignore it */
5618 *wasShifted = TRUE;
5619 //continue;
5620 }
5621 //*wasShifted = TRUE;
5622 return TRUE;
5623 } else {
5624 *wasShifted = FALSE;
5625 return FALSE;
5626 }
5627 }
5628 static inline
terminatePSKLevel(int32_t level,int32_t maxLevel,int32_t & i,uint8_t * dest)5629 void terminatePSKLevel(int32_t level, int32_t maxLevel, int32_t &i, uint8_t *dest) {
5630 if(level < maxLevel) {
5631 dest[i++] = UCOL_LEVELTERMINATOR;
5632 } else {
5633 dest[i++] = 0;
5634 }
5635 }
5636
/**
 * Level identifiers for partial sort key generation.
 * The enumerators are numbered consecutively starting at zero.
 */
enum {
    /** 0 */
    UCOL_PSK_PRIMARY,
    /** 1 */
    UCOL_PSK_SECONDARY,
    /** 2 */
    UCOL_PSK_CASE,
    /** 3 */
    UCOL_PSK_TERTIARY,
    /** 4 */
    UCOL_PSK_QUATERNARY,
    /** 5 - an extra level, not used - but we have three bits to blow */
    UCOL_PSK_QUIN,
    /** 6 */
    UCOL_PSK_IDENTICAL,
    /** 7 - level for the end of the sort key. Will just produce zeros */
    UCOL_PSK_NULL,
    /** 8 - number of level identifiers */
    UCOL_PSK_LIMIT
};
5649
/**
 * Layout of the collation state word, which is stored in the state[1] field.
 * For every field, shift the state right by the *_SHIFT value and then AND
 * the result with the matching *_MASK value to extract the field.
 */
enum {
    /** Level identifier: one of the UCOL_PSK_* level values. Three bits. */
    UCOL_PSK_LEVEL_SHIFT = 0,
    UCOL_PSK_LEVEL_MASK = 0x7,

    /**
     * Number of bytes of a primary or quaternary already written. One bit:
     * can be only 0 or 1, since we get up to two bytes from primary or
     * quaternary. This field is also used to denote that the French
     * secondary level is finished.
     */
    UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT = 3,
    UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK = 0x1,

    /** Was the last value shifted? One bit (Boolean). */
    UCOL_PSK_WAS_SHIFTED_SHIFT = 4,
    UCOL_PSK_WAS_SHIFTED_MASK = 0x1,

    /**
     * How many French bytes have we already written. Two bits.
     * When we do French we need to reverse secondary values. However,
     * continuations need to stay the same. So if you had abc1c2c3de,
     * you need to have edc1c2c3ba.
     * NOTE(review): the original remark here said "up to 4 bytes"; a two-bit
     * field can only hold the values 0..3 - confirm the intended range.
     */
    UCOL_PSK_USED_FRENCH_SHIFT = 5,
    UCOL_PSK_USED_FRENCH_MASK = 0x3,

    /** Bocsu bytes field. Two bits. */
    UCOL_PSK_BOCSU_BYTES_SHIFT = 7,
    UCOL_PSK_BOCSU_BYTES_MASK = 0x3,

    /** Consumed CEs counter. The mask is 19 bits wide. */
    UCOL_PSK_CONSUMED_CES_SHIFT = 9,
    UCOL_PSK_CONSUMED_CES_MASK = 0x7FFFF
};
5675
// Macro calculating the number of expansion CEs available in a collIterate:
// the distance between its CEpos and toReturn members.
// The whole expansion is parenthesized so that the macro behaves as a single
// operand next to other operators (e.g. !uprv_numAvailableExpCEs(s) or a
// comparison); without the parentheses those would bind to (s).CEpos only.
// Note that the argument is evaluated twice, so it must not have side effects.
#define uprv_numAvailableExpCEs(s) ((s).CEpos - (s).toReturn)
5678
5679
5680 /** main sortkey part procedure. On the first call,
5681 * you should pass in a collator, an iterator, empty state
5682 * state[0] == state[1] == 0, a buffer to hold results
5683 * number of bytes you need and an error code pointer.
5684 * Make sure your buffer is big enough to hold the wanted
5685 * number of sortkey bytes. I don't check.
5686 * The only meaningful status you can get back is
5687 * U_BUFFER_OVERFLOW_ERROR, which basically means that you
5688 * have been dealt a raw deal and that you probably won't
5689 * be able to use partial sortkey generation for this
5690 * particular combination of string and collator. This
5691 * is highly unlikely, but you should still check the error code.
5692 * Any other status means that you're not in a sane situation
5693 * anymore. After the first call, preserve state values and
5694 * use them on subsequent calls to obtain more bytes of a sortkey.
5695 * Use until the number of bytes written is smaller than the requested
5696 * number of bytes. Generated sortkey is not compatible with the
5697 * one generated by ucol_getSortKey, as we don't do any compression.
5698 * However, levels are still terminated by a 1 (one) and the sortkey
5699 * is terminated by a 0 (zero). Identical level is the same as in the
5700 * regular sortkey - internal bocu-1 implementation is used.
5701 * For curious, although you cannot do much about this, here is
5702 * the structure of state words.
5703 * state[0] - iterator state. Depends on the iterator implementation,
5704 * but allows the iterator to continue where it stopped in
5705 * the last iteration.
5706 * state[1] - collation processing state. Here is the distribution
5707 * of the bits:
5708 * 0, 1, 2 - level of the sortkey - primary, secondary, case, tertiary
5709 * quaternary, quin (we don't use this one), identical and
5710 * null (producing only zeroes - first one to terminate the
5711 * sortkey and subsequent to fill the buffer).
5712 * 3 - byte count. Number of bytes written on the primary level.
5713 * 4 - was shifted. Whether the previous iteration finished in the
5714 * shifted state.
5715 * 5, 6 - French continuation bytes written. See the comment in the enum
5716 * 7,8 - Bocsu bytes used. Number of bytes from a bocu sequence on
5717 * the identical level.
5718 * 9..31 - CEs consumed. Number of getCE or next32 operations performed
 *              since the last successful update of the iterator state.
5720 */
5721 U_CAPI int32_t U_EXPORT2
ucol_nextSortKeyPart(const UCollator * coll,UCharIterator * iter,uint32_t state[2],uint8_t * dest,int32_t count,UErrorCode * status)5722 ucol_nextSortKeyPart(const UCollator *coll,
5723 UCharIterator *iter,
5724 uint32_t state[2],
5725 uint8_t *dest, int32_t count,
5726 UErrorCode *status)
5727 {
5728 /* error checking */
5729 if(status==NULL || U_FAILURE(*status)) {
5730 return 0;
5731 }
5732 UTRACE_ENTRY(UTRACE_UCOL_NEXTSORTKEYPART);
5733 if( coll==NULL || iter==NULL ||
5734 state==NULL ||
5735 count<0 || (count>0 && dest==NULL)
5736 ) {
5737 *status=U_ILLEGAL_ARGUMENT_ERROR;
5738 UTRACE_EXIT_STATUS(status);
5739 return 0;
5740 }
5741
5742 UTRACE_DATA6(UTRACE_VERBOSE, "coll=%p, iter=%p, state=%d %d, dest=%p, count=%d",
5743 coll, iter, state[0], state[1], dest, count);
5744
5745 if(count==0) {
5746 /* nothing to do */
5747 UTRACE_EXIT_VALUE(0);
5748 return 0;
5749 }
5750 /** Setting up situation according to the state we got from the previous iteration */
5751 // The state of the iterator from the previous invocation
5752 uint32_t iterState = state[0];
5753 // Has the last iteration ended in the shifted state
5754 UBool wasShifted = ((state[1] >> UCOL_PSK_WAS_SHIFTED_SHIFT) & UCOL_PSK_WAS_SHIFTED_MASK)?TRUE:FALSE;
5755 // What is the current level of the sortkey?
5756 int32_t level= (state[1] >> UCOL_PSK_LEVEL_SHIFT) & UCOL_PSK_LEVEL_MASK;
5757 // Have we written only one byte from a two byte primary in the previous iteration?
5758 // Also on secondary level - have we finished with the French secondary?
5759 int32_t byteCountOrFrenchDone = (state[1] >> UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT) & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK;
5760 // number of bytes in the continuation buffer for French
5761 int32_t usedFrench = (state[1] >> UCOL_PSK_USED_FRENCH_SHIFT) & UCOL_PSK_USED_FRENCH_MASK;
5762 // Number of bytes already written from a bocsu sequence. Since
5763 // the longes bocsu sequence is 4 long, this can be up to 3.
5764 int32_t bocsuBytesUsed = (state[1] >> UCOL_PSK_BOCSU_BYTES_SHIFT) & UCOL_PSK_BOCSU_BYTES_MASK;
5765 // Number of elements that need to be consumed in this iteration because
5766 // the iterator returned UITER_NO_STATE at the end of the last iteration,
5767 // so we had to save the last valid state.
5768 int32_t cces = (state[1] >> UCOL_PSK_CONSUMED_CES_SHIFT) & UCOL_PSK_CONSUMED_CES_MASK;
5769
5770 /** values that depend on the collator attributes */
5771 // strength of the collator.
5772 int32_t strength = ucol_getAttribute(coll, UCOL_STRENGTH, status);
5773 // maximal level of the partial sortkey. Need to take whether case level is done
5774 int32_t maxLevel = 0;
5775 if(strength < UCOL_TERTIARY) {
5776 if(ucol_getAttribute(coll, UCOL_CASE_LEVEL, status) == UCOL_ON) {
5777 maxLevel = UCOL_PSK_CASE;
5778 } else {
5779 maxLevel = strength;
5780 }
5781 } else {
5782 if(strength == UCOL_TERTIARY) {
5783 maxLevel = UCOL_PSK_TERTIARY;
5784 } else if(strength == UCOL_QUATERNARY) {
5785 maxLevel = UCOL_PSK_QUATERNARY;
5786 } else { // identical
5787 maxLevel = UCOL_IDENTICAL;
5788 }
5789 }
5790 // value for the quaternary level if Hiragana is encountered. Used for JIS X 4061 collation
5791 uint8_t UCOL_HIRAGANA_QUAD =
5792 (ucol_getAttribute(coll, UCOL_HIRAGANA_QUATERNARY_MODE, status) == UCOL_ON)?0xFE:0xFF;
5793 // Boundary value that decides whether a CE is shifted or not
5794 uint32_t LVT = (coll->alternateHandling == UCOL_SHIFTED)?(coll->variableTopValue<<16):0;
5795 // Are we doing French collation?
5796 UBool doingFrench = (ucol_getAttribute(coll, UCOL_FRENCH_COLLATION, status) == UCOL_ON);
5797
5798 /** initializing the collation state */
5799 UBool notIsContinuation = FALSE;
5800 uint32_t CE = UCOL_NO_MORE_CES;
5801
5802 collIterate s;
5803 IInit_collIterate(coll, NULL, -1, &s, status);
5804 if(U_FAILURE(*status)) {
5805 UTRACE_EXIT_STATUS(*status);
5806 return 0;
5807 }
5808 s.iterator = iter;
5809 s.flags |= UCOL_USE_ITERATOR;
5810 // This variable tells us whether we have produced some other levels in this iteration
5811 // before we moved to the identical level. In that case, we need to switch the
5812 // type of the iterator.
5813 UBool doingIdenticalFromStart = FALSE;
5814 // Normalizing iterator
5815 // The division for the array length may truncate the array size to
5816 // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
5817 // for all platforms anyway.
5818 UAlignedMemory stackNormIter[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
5819 UNormIterator *normIter = NULL;
5820 // If the normalization is turned on for the collator and we are below identical level
5821 // we will use a FCD normalizing iterator
5822 if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON && level < UCOL_PSK_IDENTICAL) {
5823 normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status);
5824 s.iterator = unorm_setIter(normIter, iter, UNORM_FCD, status);
5825 s.flags &= ~UCOL_ITER_NORM;
5826 if(U_FAILURE(*status)) {
5827 UTRACE_EXIT_STATUS(*status);
5828 return 0;
5829 }
5830 } else if(level == UCOL_PSK_IDENTICAL) {
5831 // for identical level, we need a NFD iterator. We need to instantiate it here, since we
5832 // will be updating the state - and this cannot be done on an ordinary iterator.
5833 normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status);
5834 s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status);
5835 s.flags &= ~UCOL_ITER_NORM;
5836 if(U_FAILURE(*status)) {
5837 UTRACE_EXIT_STATUS(*status);
5838 return 0;
5839 }
5840 doingIdenticalFromStart = TRUE;
5841 }
5842
5843 // This is the tentative new state of the iterator. The problem
5844 // is that the iterator might return an undefined state, in
5845 // which case we should save the last valid state and increase
5846 // the iterator skip value.
5847 uint32_t newState = 0;
5848
5849 // First, we set the iterator to the last valid position
5850 // from the last iteration. This was saved in state[0].
5851 if(iterState == 0) {
5852 /* initial state */
5853 if(level == UCOL_PSK_SECONDARY && doingFrench && !byteCountOrFrenchDone) {
5854 s.iterator->move(s.iterator, 0, UITER_LIMIT);
5855 } else {
5856 s.iterator->move(s.iterator, 0, UITER_START);
5857 }
5858 } else {
5859 /* reset to previous state */
5860 s.iterator->setState(s.iterator, iterState, status);
5861 if(U_FAILURE(*status)) {
5862 UTRACE_EXIT_STATUS(*status);
5863 return 0;
5864 }
5865 }
5866
5867
5868
5869 // This variable tells us whether we can attempt to update the state
5870 // of iterator. Situations where we don't want to update iterator state
5871 // are the existence of expansion CEs that are not yet processed, and
5872 // finishing the case level without enough space in the buffer to insert
5873 // a level terminator.
5874 UBool canUpdateState = TRUE;
5875
5876 // Consume all the CEs that were consumed at the end of the previous
5877 // iteration without updating the iterator state. On identical level,
5878 // consume the code points.
5879 int32_t counter = cces;
5880 if(level < UCOL_PSK_IDENTICAL) {
5881 while(counter-->0) {
5882 // If we're doing French and we are on the secondary level,
5883 // we go backwards.
5884 if(level == UCOL_PSK_SECONDARY && doingFrench) {
5885 CE = ucol_IGetPrevCE(coll, &s, status);
5886 } else {
5887 CE = ucol_IGetNextCE(coll, &s, status);
5888 }
5889 if(CE==UCOL_NO_MORE_CES) {
5890 /* should not happen */
5891 *status=U_INTERNAL_PROGRAM_ERROR;
5892 UTRACE_EXIT_STATUS(*status);
5893 return 0;
5894 }
5895 if(uprv_numAvailableExpCEs(s)) {
5896 canUpdateState = FALSE;
5897 }
5898 }
5899 } else {
5900 while(counter-->0) {
5901 uiter_next32(s.iterator);
5902 }
5903 }
5904
5905 // French secondary needs to know whether the iterator state of zero came from previous level OR
5906 // from a new invocation...
5907 UBool wasDoingPrimary = FALSE;
5908 // destination buffer byte counter. When this guy
5909 // gets to count, we're done with the iteration
5910 int32_t i = 0;
5911 // used to count the zero bytes written after we
5912 // have finished with the sort key
5913 int32_t j = 0;
5914
5915
5916 // Hm.... I think we're ready to plunge in. Basic story is as following:
5917 // we have a fall through case based on level. This is used for initial
5918 // positioning on iteration start. Every level processor contains a
5919 // for(;;) which will be broken when we exhaust all the CEs. Other
5920 // way to exit is a goto saveState, which happens when we have filled
5921 // out our buffer.
5922 switch(level) {
5923 case UCOL_PSK_PRIMARY:
5924 wasDoingPrimary = TRUE;
5925 for(;;) {
5926 if(i==count) {
5927 goto saveState;
5928 }
5929 // We should save the state only if we
5930 // are sure that we are done with the
5931 // previous iterator state
5932 if(canUpdateState && byteCountOrFrenchDone == 0) {
5933 newState = s.iterator->getState(s.iterator);
5934 if(newState != UITER_NO_STATE) {
5935 iterState = newState;
5936 cces = 0;
5937 }
5938 }
5939 CE = ucol_IGetNextCE(coll, &s, status);
5940 cces++;
5941 if(CE==UCOL_NO_MORE_CES) {
5942 // Add the level separator
5943 terminatePSKLevel(level, maxLevel, i, dest);
5944 byteCountOrFrenchDone=0;
5945 // Restart the iteration an move to the
5946 // second level
5947 s.iterator->move(s.iterator, 0, UITER_START);
5948 cces = 0;
5949 level = UCOL_PSK_SECONDARY;
5950 break;
5951 }
5952 if(!isShiftedCE(CE, LVT, &wasShifted)) {
5953 CE >>= UCOL_PRIMARYORDERSHIFT; /* get primary */
5954 if(CE != 0) {
5955 if(byteCountOrFrenchDone == 0) {
5956 // get the second byte of primary
5957 dest[i++]=(uint8_t)(CE >> 8);
5958 } else {
5959 byteCountOrFrenchDone = 0;
5960 }
5961 if((CE &=0xff)!=0) {
5962 if(i==count) {
5963 /* overflow */
5964 byteCountOrFrenchDone = 1;
5965 cces--;
5966 goto saveState;
5967 }
5968 dest[i++]=(uint8_t)CE;
5969 }
5970 }
5971 }
5972 if(uprv_numAvailableExpCEs(s)) {
5973 canUpdateState = FALSE;
5974 } else {
5975 canUpdateState = TRUE;
5976 }
5977 }
5978 /* fall through to next level */
5979 case UCOL_PSK_SECONDARY:
5980 if(strength >= UCOL_SECONDARY) {
5981 if(!doingFrench) {
5982 for(;;) {
5983 if(i == count) {
5984 goto saveState;
5985 }
5986 // We should save the state only if we
5987 // are sure that we are done with the
5988 // previous iterator state
5989 if(canUpdateState) {
5990 newState = s.iterator->getState(s.iterator);
5991 if(newState != UITER_NO_STATE) {
5992 iterState = newState;
5993 cces = 0;
5994 }
5995 }
5996 CE = ucol_IGetNextCE(coll, &s, status);
5997 cces++;
5998 if(CE==UCOL_NO_MORE_CES) {
5999 // Add the level separator
6000 terminatePSKLevel(level, maxLevel, i, dest);
6001 byteCountOrFrenchDone = 0;
6002 // Restart the iteration an move to the
6003 // second level
6004 s.iterator->move(s.iterator, 0, UITER_START);
6005 cces = 0;
6006 level = UCOL_PSK_CASE;
6007 break;
6008 }
6009 if(!isShiftedCE(CE, LVT, &wasShifted)) {
6010 CE >>= 8; /* get secondary */
6011 if(CE != 0) {
6012 dest[i++]=(uint8_t)CE;
6013 }
6014 }
6015 if(uprv_numAvailableExpCEs(s)) {
6016 canUpdateState = FALSE;
6017 } else {
6018 canUpdateState = TRUE;
6019 }
6020 }
6021 } else { // French secondary processing
6022 uint8_t frenchBuff[UCOL_MAX_BUFFER];
6023 int32_t frenchIndex = 0;
6024 // Here we are going backwards.
6025 // If the iterator is at the beggining, it should be
6026 // moved to end.
6027 if(wasDoingPrimary) {
6028 s.iterator->move(s.iterator, 0, UITER_LIMIT);
6029 cces = 0;
6030 }
6031 for(;;) {
6032 if(i == count) {
6033 goto saveState;
6034 }
6035 if(canUpdateState) {
6036 newState = s.iterator->getState(s.iterator);
6037 if(newState != UITER_NO_STATE) {
6038 iterState = newState;
6039 cces = 0;
6040 }
6041 }
6042 CE = ucol_IGetPrevCE(coll, &s, status);
6043 cces++;
6044 if(CE==UCOL_NO_MORE_CES) {
6045 // Add the level separator
6046 terminatePSKLevel(level, maxLevel, i, dest);
6047 byteCountOrFrenchDone = 0;
6048 // Restart the iteration an move to the next level
6049 s.iterator->move(s.iterator, 0, UITER_START);
6050 level = UCOL_PSK_CASE;
6051 break;
6052 }
6053 if(isContinuation(CE)) { // if it's a continuation, we want to save it and
6054 // reverse when we get a first non-continuation CE.
6055 CE >>= 8;
6056 frenchBuff[frenchIndex++] = (uint8_t)CE;
6057 } else if(!isShiftedCE(CE, LVT, &wasShifted)) {
6058 CE >>= 8; /* get secondary */
6059 if(!frenchIndex) {
6060 if(CE != 0) {
6061 dest[i++]=(uint8_t)CE;
6062 }
6063 } else {
6064 frenchBuff[frenchIndex++] = (uint8_t)CE;
6065 frenchIndex -= usedFrench;
6066 usedFrench = 0;
6067 while(i < count && frenchIndex) {
6068 dest[i++] = frenchBuff[--frenchIndex];
6069 usedFrench++;
6070 }
6071 }
6072 }
6073 if(uprv_numAvailableExpCEs(s)) {
6074 canUpdateState = FALSE;
6075 } else {
6076 canUpdateState = TRUE;
6077 }
6078 }
6079 }
6080 } else {
6081 level = UCOL_PSK_CASE;
6082 }
6083 /* fall through to next level */
6084 case UCOL_PSK_CASE:
6085 if(ucol_getAttribute(coll, UCOL_CASE_LEVEL, status) == UCOL_ON) {
6086 uint32_t caseShift = UCOL_CASE_SHIFT_START;
6087 uint8_t caseByte = UCOL_CASE_BYTE_START;
6088 uint8_t caseBits = 0;
6089
6090 for(;;) {
6091 U_ASSERT(caseShift <= UCOL_CASE_SHIFT_START);
6092 if(i == count) {
6093 goto saveState;
6094 }
6095 // We should save the state only if we
6096 // are sure that we are done with the
6097 // previous iterator state
6098 if(canUpdateState) {
6099 newState = s.iterator->getState(s.iterator);
6100 if(newState != UITER_NO_STATE) {
6101 iterState = newState;
6102 cces = 0;
6103 }
6104 }
6105 CE = ucol_IGetNextCE(coll, &s, status);
6106 cces++;
6107 if(CE==UCOL_NO_MORE_CES) {
6108 // On the case level we might have an unfinished
6109 // case byte. Add one if it's started.
6110 if(caseShift != UCOL_CASE_SHIFT_START) {
6111 dest[i++] = caseByte;
6112 }
6113 cces = 0;
6114 // We have finished processing CEs on this level.
6115 // However, we don't know if we have enough space
6116 // to add a case level terminator.
6117 if(i < count) {
6118 // Add the level separator
6119 terminatePSKLevel(level, maxLevel, i, dest);
6120 // Restart the iteration and move to the
6121 // next level
6122 s.iterator->move(s.iterator, 0, UITER_START);
6123 level = UCOL_PSK_TERTIARY;
6124 } else {
6125 canUpdateState = FALSE;
6126 }
6127 break;
6128 }
6129
6130 if(!isShiftedCE(CE, LVT, &wasShifted)) {
6131 if(!isContinuation(CE) && ((CE & UCOL_PRIMARYMASK) != 0 || strength > UCOL_PRIMARY)) {
6132 // do the case level if we need to do it. We don't want to calculate
6133 // case level for primary ignorables if we have only primary strength and case level
6134 // otherwise we would break well formedness of CEs
6135 CE = (uint8_t)(CE & UCOL_BYTE_SIZE_MASK);
6136 caseBits = (uint8_t)(CE & 0xC0);
6137 // this copies the case level logic from the
6138 // sort key generation code
6139 if(CE != 0) {
6140 if (caseShift == 0) {
6141 dest[i++] = caseByte;
6142 caseShift = UCOL_CASE_SHIFT_START;
6143 caseByte = UCOL_CASE_BYTE_START;
6144 }
6145 if(coll->caseFirst == UCOL_UPPER_FIRST) {
6146 if((caseBits & 0xC0) == 0) {
6147 caseByte |= 1 << (--caseShift);
6148 } else {
6149 caseByte |= 0 << (--caseShift);
6150 /* second bit */
6151 if(caseShift == 0) {
6152 dest[i++] = caseByte;
6153 caseShift = UCOL_CASE_SHIFT_START;
6154 caseByte = UCOL_CASE_BYTE_START;
6155 }
6156 caseByte |= ((caseBits>>6)&1) << (--caseShift);
6157 }
6158 } else {
6159 if((caseBits & 0xC0) == 0) {
6160 caseByte |= 0 << (--caseShift);
6161 } else {
6162 caseByte |= 1 << (--caseShift);
6163 /* second bit */
6164 if(caseShift == 0) {
6165 dest[i++] = caseByte;
6166 caseShift = UCOL_CASE_SHIFT_START;
6167 caseByte = UCOL_CASE_BYTE_START;
6168 }
6169 caseByte |= ((caseBits>>7)&1) << (--caseShift);
6170 }
6171 }
6172 }
6173
6174 }
6175 }
6176 // Not sure this is correct for the case level - revisit
6177 if(uprv_numAvailableExpCEs(s)) {
6178 canUpdateState = FALSE;
6179 } else {
6180 canUpdateState = TRUE;
6181 }
6182 }
6183 } else {
6184 level = UCOL_PSK_TERTIARY;
6185 }
6186 /* fall through to next level */
6187 case UCOL_PSK_TERTIARY:
6188 if(strength >= UCOL_TERTIARY) {
6189 for(;;) {
6190 if(i == count) {
6191 goto saveState;
6192 }
6193 // We should save the state only if we
6194 // are sure that we are done with the
6195 // previous iterator state
6196 if(canUpdateState) {
6197 newState = s.iterator->getState(s.iterator);
6198 if(newState != UITER_NO_STATE) {
6199 iterState = newState;
6200 cces = 0;
6201 }
6202 }
6203 CE = ucol_IGetNextCE(coll, &s, status);
6204 cces++;
6205 if(CE==UCOL_NO_MORE_CES) {
6206 // Add the level separator
6207 terminatePSKLevel(level, maxLevel, i, dest);
6208 byteCountOrFrenchDone = 0;
6209 // Restart the iteration an move to the
6210 // second level
6211 s.iterator->move(s.iterator, 0, UITER_START);
6212 cces = 0;
6213 level = UCOL_PSK_QUATERNARY;
6214 break;
6215 }
6216 if(!isShiftedCE(CE, LVT, &wasShifted)) {
6217 notIsContinuation = !isContinuation(CE);
6218
6219 if(notIsContinuation) {
6220 CE = (uint8_t)(CE & UCOL_BYTE_SIZE_MASK);
6221 CE ^= coll->caseSwitch;
6222 CE &= coll->tertiaryMask;
6223 } else {
6224 CE = (uint8_t)((CE & UCOL_REMOVE_CONTINUATION));
6225 }
6226
6227 if(CE != 0) {
6228 dest[i++]=(uint8_t)CE;
6229 }
6230 }
6231 if(uprv_numAvailableExpCEs(s)) {
6232 canUpdateState = FALSE;
6233 } else {
6234 canUpdateState = TRUE;
6235 }
6236 }
6237 } else {
6238 // if we're not doing tertiary
6239 // skip to the end
6240 level = UCOL_PSK_NULL;
6241 }
6242 /* fall through to next level */
6243 case UCOL_PSK_QUATERNARY:
6244 if(strength >= UCOL_QUATERNARY) {
6245 for(;;) {
6246 if(i == count) {
6247 goto saveState;
6248 }
6249 // We should save the state only if we
6250 // are sure that we are done with the
6251 // previous iterator state
6252 if(canUpdateState) {
6253 newState = s.iterator->getState(s.iterator);
6254 if(newState != UITER_NO_STATE) {
6255 iterState = newState;
6256 cces = 0;
6257 }
6258 }
6259 CE = ucol_IGetNextCE(coll, &s, status);
6260 cces++;
6261 if(CE==UCOL_NO_MORE_CES) {
6262 // Add the level separator
6263 terminatePSKLevel(level, maxLevel, i, dest);
6264 //dest[i++] = UCOL_LEVELTERMINATOR;
6265 byteCountOrFrenchDone = 0;
6266 // Restart the iteration an move to the
6267 // second level
6268 s.iterator->move(s.iterator, 0, UITER_START);
6269 cces = 0;
6270 level = UCOL_PSK_QUIN;
6271 break;
6272 }
6273 if(CE==0)
6274 continue;
6275 if(isShiftedCE(CE, LVT, &wasShifted)) {
6276 CE >>= 16; /* get primary */
6277 if(CE != 0) {
6278 if(byteCountOrFrenchDone == 0) {
6279 dest[i++]=(uint8_t)(CE >> 8);
6280 } else {
6281 byteCountOrFrenchDone = 0;
6282 }
6283 if((CE &=0xff)!=0) {
6284 if(i==count) {
6285 /* overflow */
6286 byteCountOrFrenchDone = 1;
6287 goto saveState;
6288 }
6289 dest[i++]=(uint8_t)CE;
6290 }
6291 }
6292 } else {
6293 notIsContinuation = !isContinuation(CE);
6294 if(notIsContinuation) {
6295 if(s.flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we need to note it
6296 dest[i++] = UCOL_HIRAGANA_QUAD;
6297 } else {
6298 dest[i++] = 0xFF;
6299 }
6300 }
6301 }
6302 if(uprv_numAvailableExpCEs(s)) {
6303 canUpdateState = FALSE;
6304 } else {
6305 canUpdateState = TRUE;
6306 }
6307 }
6308 } else {
6309 // if we're not doing quaternary
6310 // skip to the end
6311 level = UCOL_PSK_NULL;
6312 }
6313 /* fall through to next level */
6314 case UCOL_PSK_QUIN:
6315 level = UCOL_PSK_IDENTICAL;
6316 /* fall through to next level */
6317 case UCOL_PSK_IDENTICAL:
6318 if(strength >= UCOL_IDENTICAL) {
6319 UChar32 first, second;
6320 int32_t bocsuBytesWritten = 0;
6321 // We always need to do identical on
6322 // the NFD form of the string.
6323 if(normIter == NULL) {
6324 // we arrived from the level below and
6325 // normalization was not turned on.
6326 // therefore, we need to make a fresh NFD iterator
6327 normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status);
6328 s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status);
6329 } else if(!doingIdenticalFromStart) {
6330 // there is an iterator, but we did some other levels.
6331 // therefore, we have a FCD iterator - need to make
6332 // a NFD one.
6333 // normIter being at the beginning does not guarantee
6334 // that the underlying iterator is at the beginning
6335 iter->move(iter, 0, UITER_START);
6336 s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status);
6337 }
6338 // At this point we have a NFD iterator that is positioned
6339 // in the right place
6340 if(U_FAILURE(*status)) {
6341 UTRACE_EXIT_STATUS(*status);
6342 return 0;
6343 }
6344 first = uiter_previous32(s.iterator);
6345 // maybe we're at the start of the string
6346 if(first == U_SENTINEL) {
6347 first = 0;
6348 } else {
6349 uiter_next32(s.iterator);
6350 }
6351
6352 j = 0;
6353 for(;;) {
6354 if(i == count) {
6355 if(j+1 < bocsuBytesWritten) {
6356 bocsuBytesUsed = j+1;
6357 }
6358 goto saveState;
6359 }
6360
6361 // On identical level, we will always save
6362 // the state if we reach this point, since
6363 // we don't depend on getNextCE for content
6364 // all the content is in our buffer and we
6365 // already either stored the full buffer OR
6366 // otherwise we won't arrive here.
6367 newState = s.iterator->getState(s.iterator);
6368 if(newState != UITER_NO_STATE) {
6369 iterState = newState;
6370 cces = 0;
6371 }
6372
6373 uint8_t buff[4];
6374 second = uiter_next32(s.iterator);
6375 cces++;
6376
6377 // end condition for identical level
6378 if(second == U_SENTINEL) {
6379 terminatePSKLevel(level, maxLevel, i, dest);
6380 level = UCOL_PSK_NULL;
6381 break;
6382 }
6383 bocsuBytesWritten = u_writeIdenticalLevelRunTwoChars(first, second, buff);
6384 first = second;
6385
6386 j = 0;
6387 if(bocsuBytesUsed != 0) {
6388 while(bocsuBytesUsed-->0) {
6389 j++;
6390 }
6391 }
6392
6393 while(i < count && j < bocsuBytesWritten) {
6394 dest[i++] = buff[j++];
6395 }
6396 }
6397
6398 } else {
6399 level = UCOL_PSK_NULL;
6400 }
6401 /* fall through to next level */
6402 case UCOL_PSK_NULL:
6403 j = i;
6404 while(j<count) {
6405 dest[j++]=0;
6406 }
6407 break;
6408 default:
6409 *status = U_INTERNAL_PROGRAM_ERROR;
6410 UTRACE_EXIT_STATUS(*status);
6411 return 0;
6412 }
6413
6414 saveState:
6415 // Now we need to return stuff. First we want to see whether we have
6416 // done everything for the current state of iterator.
6417 if(byteCountOrFrenchDone
6418 || canUpdateState == FALSE
6419 || (newState = s.iterator->getState(s.iterator)) == UITER_NO_STATE)
6420 {
6421 // Any of above mean that the previous transaction
6422 // wasn't finished and that we should store the
6423 // previous iterator state.
6424 state[0] = iterState;
6425 } else {
6426 // The transaction is complete. We will continue in the next iteration.
6427 state[0] = s.iterator->getState(s.iterator);
6428 cces = 0;
6429 }
6430 // Store the number of bocsu bytes written.
6431 if((bocsuBytesUsed & UCOL_PSK_BOCSU_BYTES_MASK) != bocsuBytesUsed) {
6432 *status = U_INDEX_OUTOFBOUNDS_ERROR;
6433 }
6434 state[1] = (bocsuBytesUsed & UCOL_PSK_BOCSU_BYTES_MASK) << UCOL_PSK_BOCSU_BYTES_SHIFT;
6435
6436 // Next we put in the level of comparison
6437 state[1] |= ((level & UCOL_PSK_LEVEL_MASK) << UCOL_PSK_LEVEL_SHIFT);
6438
6439 // If we are doing French, we need to store whether we have just finished the French level
6440 if(level == UCOL_PSK_SECONDARY && doingFrench) {
6441 state[1] |= (((state[0] == 0) & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK) << UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT);
6442 } else {
6443 state[1] |= ((byteCountOrFrenchDone & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK) << UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT);
6444 }
6445
6446 // Was the latest CE shifted
6447 if(wasShifted) {
6448 state[1] |= 1 << UCOL_PSK_WAS_SHIFTED_SHIFT;
6449 }
6450 // Check for cces overflow
6451 if((cces & UCOL_PSK_CONSUMED_CES_MASK) != cces) {
6452 *status = U_INDEX_OUTOFBOUNDS_ERROR;
6453 }
6454 // Store cces
6455 state[1] |= ((cces & UCOL_PSK_CONSUMED_CES_MASK) << UCOL_PSK_CONSUMED_CES_SHIFT);
6456
6457 // Check for French overflow
6458 if((usedFrench & UCOL_PSK_USED_FRENCH_MASK) != usedFrench) {
6459 *status = U_INDEX_OUTOFBOUNDS_ERROR;
6460 }
6461 // Store number of bytes written in the French secondary continuation sequence
6462 state[1] |= ((usedFrench & UCOL_PSK_USED_FRENCH_MASK) << UCOL_PSK_USED_FRENCH_SHIFT);
6463
6464
6465 // If we have used normalizing iterator, get rid of it
6466 if(normIter != NULL) {
6467 unorm_closeIter(normIter);
6468 }
6469
6470 /* To avoid memory leak, free the offset buffer if necessary. */
6471 ucol_freeOffsetBuffer(&s);
6472
6473 // Return number of meaningful sortkey bytes.
6474 UTRACE_DATA4(UTRACE_VERBOSE, "dest = %vb, state=%d %d",
6475 dest,i, state[0], state[1]);
6476 UTRACE_EXIT_VALUE(i);
6477 return i;
6478 }
6479
6480 /**
6481 * Produce a bound for a given sortkey and a number of levels.
6482 */
6483 U_CAPI int32_t U_EXPORT2
ucol_getBound(const uint8_t * source,int32_t sourceLength,UColBoundMode boundType,uint32_t noOfLevels,uint8_t * result,int32_t resultLength,UErrorCode * status)6484 ucol_getBound(const uint8_t *source,
6485 int32_t sourceLength,
6486 UColBoundMode boundType,
6487 uint32_t noOfLevels,
6488 uint8_t *result,
6489 int32_t resultLength,
6490 UErrorCode *status)
6491 {
6492 // consistency checks
6493 if(status == NULL || U_FAILURE(*status)) {
6494 return 0;
6495 }
6496 if(source == NULL) {
6497 *status = U_ILLEGAL_ARGUMENT_ERROR;
6498 return 0;
6499 }
6500
6501 int32_t sourceIndex = 0;
6502 // Scan the string until we skip enough of the key OR reach the end of the key
6503 do {
6504 sourceIndex++;
6505 if(source[sourceIndex] == UCOL_LEVELTERMINATOR) {
6506 noOfLevels--;
6507 }
6508 } while (noOfLevels > 0
6509 && (source[sourceIndex] != 0 || sourceIndex < sourceLength));
6510
6511 if((source[sourceIndex] == 0 || sourceIndex == sourceLength)
6512 && noOfLevels > 0) {
6513 *status = U_SORT_KEY_TOO_SHORT_WARNING;
6514 }
6515
6516
6517 // READ ME: this code assumes that the values for boundType
6518 // enum will not changes. They are set so that the enum value
6519 // corresponds to the number of extra bytes each bound type
6520 // needs.
6521 if(result != NULL && resultLength >= sourceIndex+boundType) {
6522 uprv_memcpy(result, source, sourceIndex);
6523 switch(boundType) {
6524 // Lower bound just gets terminated. No extra bytes
6525 case UCOL_BOUND_LOWER: // = 0
6526 break;
6527 // Upper bound needs one extra byte
6528 case UCOL_BOUND_UPPER: // = 1
6529 result[sourceIndex++] = 2;
6530 break;
6531 // Upper long bound needs two extra bytes
6532 case UCOL_BOUND_UPPER_LONG: // = 2
6533 result[sourceIndex++] = 0xFF;
6534 result[sourceIndex++] = 0xFF;
6535 break;
6536 default:
6537 *status = U_ILLEGAL_ARGUMENT_ERROR;
6538 return 0;
6539 }
6540 result[sourceIndex++] = 0;
6541
6542 return sourceIndex;
6543 } else {
6544 return sourceIndex+boundType+1;
6545 }
6546 }
6547
6548 /****************************************************************************/
6549 /* Following are the functions that deal with the properties of a collator */
6550 /* there are new APIs and some compatibility APIs */
6551 /****************************************************************************/
6552
6553 static inline void
ucol_addLatinOneEntry(UCollator * coll,UChar ch,uint32_t CE,int32_t * primShift,int32_t * secShift,int32_t * terShift)6554 ucol_addLatinOneEntry(UCollator *coll, UChar ch, uint32_t CE,
6555 int32_t *primShift, int32_t *secShift, int32_t *terShift)
6556 {
6557 uint8_t primary1 = 0, primary2 = 0, secondary = 0, tertiary = 0;
6558 UBool reverseSecondary = FALSE;
6559 if(!isContinuation(CE)) {
6560 tertiary = (uint8_t)((CE & coll->tertiaryMask));
6561 tertiary ^= coll->caseSwitch;
6562 reverseSecondary = TRUE;
6563 } else {
6564 tertiary = (uint8_t)((CE & UCOL_REMOVE_CONTINUATION));
6565 tertiary &= UCOL_REMOVE_CASE;
6566 reverseSecondary = FALSE;
6567 }
6568
6569 secondary = (uint8_t)((CE >>= 8) & UCOL_BYTE_SIZE_MASK);
6570 primary2 = (uint8_t)((CE >>= 8) & UCOL_BYTE_SIZE_MASK);
6571 primary1 = (uint8_t)(CE >> 8);
6572
6573 if(primary1 != 0) {
6574 coll->latinOneCEs[ch] |= (primary1 << *primShift);
6575 *primShift -= 8;
6576 }
6577 if(primary2 != 0) {
6578 if(*primShift < 0) {
6579 coll->latinOneCEs[ch] = UCOL_BAIL_OUT_CE;
6580 coll->latinOneCEs[coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
6581 coll->latinOneCEs[2*coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
6582 return;
6583 }
6584 coll->latinOneCEs[ch] |= (primary2 << *primShift);
6585 *primShift -= 8;
6586 }
6587 if(secondary != 0) {
6588 if(reverseSecondary && coll->frenchCollation == UCOL_ON) { // reverse secondary
6589 coll->latinOneCEs[coll->latinOneTableLen+ch] >>= 8; // make space for secondary
6590 coll->latinOneCEs[coll->latinOneTableLen+ch] |= (secondary << 24);
6591 } else { // normal case
6592 coll->latinOneCEs[coll->latinOneTableLen+ch] |= (secondary << *secShift);
6593 }
6594 *secShift -= 8;
6595 }
6596 if(tertiary != 0) {
6597 coll->latinOneCEs[2*coll->latinOneTableLen+ch] |= (tertiary << *terShift);
6598 *terShift -= 8;
6599 }
6600 }
6601
6602 static inline UBool
ucol_resizeLatinOneTable(UCollator * coll,int32_t size,UErrorCode * status)6603 ucol_resizeLatinOneTable(UCollator *coll, int32_t size, UErrorCode *status) {
6604 uint32_t *newTable = (uint32_t *)uprv_malloc(size*sizeof(uint32_t)*3);
6605 if(newTable == NULL) {
6606 *status = U_MEMORY_ALLOCATION_ERROR;
6607 coll->latinOneFailed = TRUE;
6608 return FALSE;
6609 }
6610 int32_t sizeToCopy = ((size<coll->latinOneTableLen)?size:coll->latinOneTableLen)*sizeof(uint32_t);
6611 uprv_memset(newTable, 0, size*sizeof(uint32_t)*3);
6612 uprv_memcpy(newTable, coll->latinOneCEs, sizeToCopy);
6613 uprv_memcpy(newTable+size, coll->latinOneCEs+coll->latinOneTableLen, sizeToCopy);
6614 uprv_memcpy(newTable+2*size, coll->latinOneCEs+2*coll->latinOneTableLen, sizeToCopy);
6615 coll->latinOneTableLen = size;
6616 uprv_free(coll->latinOneCEs);
6617 coll->latinOneCEs = newTable;
6618 return TRUE;
6619 }
6620
6621 static UBool
ucol_setUpLatinOne(UCollator * coll,UErrorCode * status)6622 ucol_setUpLatinOne(UCollator *coll, UErrorCode *status) {
6623 UBool result = TRUE;
6624 if(coll->latinOneCEs == NULL) {
6625 coll->latinOneCEs = (uint32_t *)uprv_malloc(sizeof(uint32_t)*UCOL_LATINONETABLELEN*3);
6626 if(coll->latinOneCEs == NULL) {
6627 *status = U_MEMORY_ALLOCATION_ERROR;
6628 return FALSE;
6629 }
6630 coll->latinOneTableLen = UCOL_LATINONETABLELEN;
6631 }
6632 UChar ch = 0;
6633 UCollationElements *it = ucol_openElements(coll, &ch, 1, status);
6634 // Check for null pointer
6635 if (U_FAILURE(*status)) {
6636 return FALSE;
6637 }
6638 uprv_memset(coll->latinOneCEs, 0, sizeof(uint32_t)*coll->latinOneTableLen*3);
6639
6640 int32_t primShift = 24, secShift = 24, terShift = 24;
6641 uint32_t CE = 0;
6642 int32_t contractionOffset = UCOL_ENDOFLATINONERANGE+1;
6643
6644 // TODO: make safe if you get more than you wanted...
6645 for(ch = 0; ch <= UCOL_ENDOFLATINONERANGE; ch++) {
6646 primShift = 24; secShift = 24; terShift = 24;
6647 if(ch < 0x100) {
6648 CE = coll->latinOneMapping[ch];
6649 } else {
6650 CE = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
6651 if(CE == UCOL_NOT_FOUND && coll->UCA) {
6652 CE = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);
6653 }
6654 }
6655 if(CE < UCOL_NOT_FOUND) {
6656 ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift);
6657 } else {
6658 switch (getCETag(CE)) {
6659 case EXPANSION_TAG:
6660 case DIGIT_TAG:
6661 ucol_setText(it, &ch, 1, status);
6662 while((int32_t)(CE = ucol_next(it, status)) != UCOL_NULLORDER) {
6663 if(primShift < 0 || secShift < 0 || terShift < 0) {
6664 coll->latinOneCEs[ch] = UCOL_BAIL_OUT_CE;
6665 coll->latinOneCEs[coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
6666 coll->latinOneCEs[2*coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
6667 break;
6668 }
6669 ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift);
6670 }
6671 break;
6672 case CONTRACTION_TAG:
6673 // here is the trick
6674 // F2 is contraction. We do something very similar to contractions
6675 // but have two indices, one in the real contraction table and the
6676 // other to where we stuffed things. This hopes that we don't have
6677 // many contractions (this should work for latin-1 tables).
6678 {
6679 if((CE & 0x00FFF000) != 0) {
6680 *status = U_UNSUPPORTED_ERROR;
6681 goto cleanup_after_failure;
6682 }
6683
6684 const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE);
6685
6686 CE |= (contractionOffset & 0xFFF) << 12; // insert the offset in latin-1 table
6687
6688 coll->latinOneCEs[ch] = CE;
6689 coll->latinOneCEs[coll->latinOneTableLen+ch] = CE;
6690 coll->latinOneCEs[2*coll->latinOneTableLen+ch] = CE;
6691
6692 // We're going to jump into contraction table, pick the elements
6693 // and use them
6694 do {
6695 CE = *(coll->contractionCEs +
6696 (UCharOffset - coll->contractionIndex));
6697 if(CE > UCOL_NOT_FOUND && getCETag(CE) == EXPANSION_TAG) {
6698 uint32_t size;
6699 uint32_t i; /* general counter */
6700 uint32_t *CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */
6701 size = getExpansionCount(CE);
6702 //CE = *CEOffset++;
6703 if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */
6704 for(i = 0; i<size; i++) {
6705 if(primShift < 0 || secShift < 0 || terShift < 0) {
6706 coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6707 coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6708 coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6709 break;
6710 }
6711 ucol_addLatinOneEntry(coll, (UChar)contractionOffset, *CEOffset++, &primShift, &secShift, &terShift);
6712 }
6713 } else { /* else, we do */
6714 while(*CEOffset != 0) {
6715 if(primShift < 0 || secShift < 0 || terShift < 0) {
6716 coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6717 coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6718 coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6719 break;
6720 }
6721 ucol_addLatinOneEntry(coll, (UChar)contractionOffset, *CEOffset++, &primShift, &secShift, &terShift);
6722 }
6723 }
6724 contractionOffset++;
6725 } else if(CE < UCOL_NOT_FOUND) {
6726 ucol_addLatinOneEntry(coll, (UChar)contractionOffset++, CE, &primShift, &secShift, &terShift);
6727 } else {
6728 coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6729 coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6730 coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6731 contractionOffset++;
6732 }
6733 UCharOffset++;
6734 primShift = 24; secShift = 24; terShift = 24;
6735 if(contractionOffset == coll->latinOneTableLen) { // we need to reallocate
6736 if(!ucol_resizeLatinOneTable(coll, 2*coll->latinOneTableLen, status)) {
6737 goto cleanup_after_failure;
6738 }
6739 }
6740 } while(*UCharOffset != 0xFFFF);
6741 }
6742 break;;
6743 case SPEC_PROC_TAG:
6744 {
6745 // 0xB7 is a precontext character defined in UCA5.1, a special
6746 // handle is implemeted in order to save LatinOne table for
6747 // most locales.
6748 if (ch==0xb7) {
6749 ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift);
6750 }
6751 else {
6752 goto cleanup_after_failure;
6753 }
6754 }
6755 break;
6756 default:
6757 goto cleanup_after_failure;
6758 }
6759 }
6760 }
6761 // compact table
6762 if(contractionOffset < coll->latinOneTableLen) {
6763 if(!ucol_resizeLatinOneTable(coll, contractionOffset, status)) {
6764 goto cleanup_after_failure;
6765 }
6766 }
6767 ucol_closeElements(it);
6768 return result;
6769
6770 cleanup_after_failure:
6771 // status should already be set before arriving here.
6772 coll->latinOneFailed = TRUE;
6773 ucol_closeElements(it);
6774 return FALSE;
6775 }
6776
ucol_updateInternalState(UCollator * coll,UErrorCode * status)6777 void ucol_updateInternalState(UCollator *coll, UErrorCode *status) {
6778 if(U_SUCCESS(*status)) {
6779 if(coll->caseFirst == UCOL_UPPER_FIRST) {
6780 coll->caseSwitch = UCOL_CASE_SWITCH;
6781 } else {
6782 coll->caseSwitch = UCOL_NO_CASE_SWITCH;
6783 }
6784
6785 if(coll->caseLevel == UCOL_ON || coll->caseFirst == UCOL_OFF) {
6786 coll->tertiaryMask = UCOL_REMOVE_CASE;
6787 coll->tertiaryCommon = UCOL_COMMON3_NORMAL;
6788 coll->tertiaryAddition = (int8_t)UCOL_FLAG_BIT_MASK_CASE_SW_OFF; /* Should be 0x80 */
6789 coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_OFF;
6790 coll->tertiaryBottom = UCOL_COMMON_BOT3;
6791 } else {
6792 coll->tertiaryMask = UCOL_KEEP_CASE;
6793 coll->tertiaryAddition = UCOL_FLAG_BIT_MASK_CASE_SW_ON;
6794 if(coll->caseFirst == UCOL_UPPER_FIRST) {
6795 coll->tertiaryCommon = UCOL_COMMON3_UPPERFIRST;
6796 coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_UPPER;
6797 coll->tertiaryBottom = UCOL_COMMON_BOTTOM3_CASE_SW_UPPER;
6798 } else {
6799 coll->tertiaryCommon = UCOL_COMMON3_NORMAL;
6800 coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_LOWER;
6801 coll->tertiaryBottom = UCOL_COMMON_BOTTOM3_CASE_SW_LOWER;
6802 }
6803 }
6804
6805 /* Set the compression values */
6806 uint8_t tertiaryTotal = (uint8_t)(coll->tertiaryTop - UCOL_COMMON_BOT3-1);
6807 coll->tertiaryTopCount = (uint8_t)(UCOL_PROPORTION3*tertiaryTotal); /* we multilply double with int, but need only int */
6808 coll->tertiaryBottomCount = (uint8_t)(tertiaryTotal - coll->tertiaryTopCount);
6809
6810 if(coll->caseLevel == UCOL_OFF && coll->strength == UCOL_TERTIARY
6811 && coll->frenchCollation == UCOL_OFF && coll->alternateHandling == UCOL_NON_IGNORABLE)
6812 {
6813 coll->sortKeyGen = ucol_calcSortKeySimpleTertiary;
6814 } else {
6815 coll->sortKeyGen = ucol_calcSortKey;
6816 }
6817 if(coll->caseLevel == UCOL_OFF && coll->strength <= UCOL_TERTIARY && coll->numericCollation == UCOL_OFF
6818 && coll->alternateHandling == UCOL_NON_IGNORABLE && !coll->latinOneFailed)
6819 {
6820 if(coll->latinOneCEs == NULL || coll->latinOneRegenTable) {
6821 if(ucol_setUpLatinOne(coll, status)) { // if we succeed in building latin1 table, we'll use it
6822 //fprintf(stderr, "F");
6823 coll->latinOneUse = TRUE;
6824 } else {
6825 coll->latinOneUse = FALSE;
6826 }
6827 if(*status == U_UNSUPPORTED_ERROR) {
6828 *status = U_ZERO_ERROR;
6829 }
6830 } else { // latin1Table exists and it doesn't need to be regenerated, just use it
6831 coll->latinOneUse = TRUE;
6832 }
6833 } else {
6834 coll->latinOneUse = FALSE;
6835 }
6836 }
6837 }
6838
6839 U_CAPI uint32_t U_EXPORT2
ucol_setVariableTop(UCollator * coll,const UChar * varTop,int32_t len,UErrorCode * status)6840 ucol_setVariableTop(UCollator *coll, const UChar *varTop, int32_t len, UErrorCode *status) {
6841 if(U_FAILURE(*status) || coll == NULL) {
6842 return 0;
6843 }
6844 if(len == -1) {
6845 len = u_strlen(varTop);
6846 }
6847 if(len == 0) {
6848 *status = U_ILLEGAL_ARGUMENT_ERROR;
6849 return 0;
6850 }
6851
6852 collIterate s;
6853 IInit_collIterate(coll, varTop, len, &s, status);
6854 if(U_FAILURE(*status)) {
6855 return 0;
6856 }
6857
6858 uint32_t CE = ucol_IGetNextCE(coll, &s, status);
6859
6860 /* here we check if we have consumed all characters */
6861 /* you can put in either one character or a contraction */
6862 /* you shouldn't put more... */
6863 if(s.pos != s.endp || CE == UCOL_NO_MORE_CES) {
6864 *status = U_CE_NOT_FOUND_ERROR;
6865 return 0;
6866 }
6867
6868 uint32_t nextCE = ucol_IGetNextCE(coll, &s, status);
6869
6870 if(isContinuation(nextCE) && (nextCE & UCOL_PRIMARYMASK) != 0) {
6871 *status = U_PRIMARY_TOO_LONG_ERROR;
6872 return 0;
6873 }
6874 if(coll->variableTopValue != (CE & UCOL_PRIMARYMASK)>>16) {
6875 coll->variableTopValueisDefault = FALSE;
6876 coll->variableTopValue = (CE & UCOL_PRIMARYMASK)>>16;
6877 }
6878
6879 /* To avoid memory leak, free the offset buffer if necessary. */
6880 ucol_freeOffsetBuffer(&s);
6881
6882 return CE & UCOL_PRIMARYMASK;
6883 }
6884
ucol_getVariableTop(const UCollator * coll,UErrorCode * status)6885 U_CAPI uint32_t U_EXPORT2 ucol_getVariableTop(const UCollator *coll, UErrorCode *status) {
6886 if(U_FAILURE(*status) || coll == NULL) {
6887 return 0;
6888 }
6889 return coll->variableTopValue<<16;
6890 }
6891
6892 U_CAPI void U_EXPORT2
ucol_restoreVariableTop(UCollator * coll,const uint32_t varTop,UErrorCode * status)6893 ucol_restoreVariableTop(UCollator *coll, const uint32_t varTop, UErrorCode *status) {
6894 if(U_FAILURE(*status) || coll == NULL) {
6895 return;
6896 }
6897
6898 if(coll->variableTopValue != (varTop & UCOL_PRIMARYMASK)>>16) {
6899 coll->variableTopValueisDefault = FALSE;
6900 coll->variableTopValue = (varTop & UCOL_PRIMARYMASK)>>16;
6901 }
6902 }
6903 /* Attribute setter API */
6904 U_CAPI void U_EXPORT2
ucol_setAttribute(UCollator * coll,UColAttribute attr,UColAttributeValue value,UErrorCode * status)6905 ucol_setAttribute(UCollator *coll, UColAttribute attr, UColAttributeValue value, UErrorCode *status) {
6906 if(U_FAILURE(*status) || coll == NULL) {
6907 return;
6908 }
6909 UColAttributeValue oldFrench = coll->frenchCollation;
6910 UColAttributeValue oldCaseFirst = coll->caseFirst;
6911 switch(attr) {
6912 case UCOL_NUMERIC_COLLATION: /* sort substrings of digits as numbers */
6913 if(value == UCOL_ON) {
6914 coll->numericCollation = UCOL_ON;
6915 coll->numericCollationisDefault = FALSE;
6916 } else if (value == UCOL_OFF) {
6917 coll->numericCollation = UCOL_OFF;
6918 coll->numericCollationisDefault = FALSE;
6919 } else if (value == UCOL_DEFAULT) {
6920 coll->numericCollationisDefault = TRUE;
6921 coll->numericCollation = (UColAttributeValue)coll->options->numericCollation;
6922 } else {
6923 *status = U_ILLEGAL_ARGUMENT_ERROR;
6924 }
6925 break;
6926 case UCOL_HIRAGANA_QUATERNARY_MODE: /* special quaternary values for Hiragana */
6927 if(value == UCOL_ON) {
6928 coll->hiraganaQ = UCOL_ON;
6929 coll->hiraganaQisDefault = FALSE;
6930 } else if (value == UCOL_OFF) {
6931 coll->hiraganaQ = UCOL_OFF;
6932 coll->hiraganaQisDefault = FALSE;
6933 } else if (value == UCOL_DEFAULT) {
6934 coll->hiraganaQisDefault = TRUE;
6935 coll->hiraganaQ = (UColAttributeValue)coll->options->hiraganaQ;
6936 } else {
6937 *status = U_ILLEGAL_ARGUMENT_ERROR;
6938 }
6939 break;
6940 case UCOL_FRENCH_COLLATION: /* attribute for direction of secondary weights*/
6941 if(value == UCOL_ON) {
6942 coll->frenchCollation = UCOL_ON;
6943 coll->frenchCollationisDefault = FALSE;
6944 } else if (value == UCOL_OFF) {
6945 coll->frenchCollation = UCOL_OFF;
6946 coll->frenchCollationisDefault = FALSE;
6947 } else if (value == UCOL_DEFAULT) {
6948 coll->frenchCollationisDefault = TRUE;
6949 coll->frenchCollation = (UColAttributeValue)coll->options->frenchCollation;
6950 } else {
6951 *status = U_ILLEGAL_ARGUMENT_ERROR ;
6952 }
6953 break;
6954 case UCOL_ALTERNATE_HANDLING: /* attribute for handling variable elements*/
6955 if(value == UCOL_SHIFTED) {
6956 coll->alternateHandling = UCOL_SHIFTED;
6957 coll->alternateHandlingisDefault = FALSE;
6958 } else if (value == UCOL_NON_IGNORABLE) {
6959 coll->alternateHandling = UCOL_NON_IGNORABLE;
6960 coll->alternateHandlingisDefault = FALSE;
6961 } else if (value == UCOL_DEFAULT) {
6962 coll->alternateHandlingisDefault = TRUE;
6963 coll->alternateHandling = (UColAttributeValue)coll->options->alternateHandling ;
6964 } else {
6965 *status = U_ILLEGAL_ARGUMENT_ERROR ;
6966 }
6967 break;
6968 case UCOL_CASE_FIRST: /* who goes first, lower case or uppercase */
6969 if(value == UCOL_LOWER_FIRST) {
6970 coll->caseFirst = UCOL_LOWER_FIRST;
6971 coll->caseFirstisDefault = FALSE;
6972 } else if (value == UCOL_UPPER_FIRST) {
6973 coll->caseFirst = UCOL_UPPER_FIRST;
6974 coll->caseFirstisDefault = FALSE;
6975 } else if (value == UCOL_OFF) {
6976 coll->caseFirst = UCOL_OFF;
6977 coll->caseFirstisDefault = FALSE;
6978 } else if (value == UCOL_DEFAULT) {
6979 coll->caseFirst = (UColAttributeValue)coll->options->caseFirst;
6980 coll->caseFirstisDefault = TRUE;
6981 } else {
6982 *status = U_ILLEGAL_ARGUMENT_ERROR ;
6983 }
6984 break;
6985 case UCOL_CASE_LEVEL: /* do we have an extra case level */
6986 if(value == UCOL_ON) {
6987 coll->caseLevel = UCOL_ON;
6988 coll->caseLevelisDefault = FALSE;
6989 } else if (value == UCOL_OFF) {
6990 coll->caseLevel = UCOL_OFF;
6991 coll->caseLevelisDefault = FALSE;
6992 } else if (value == UCOL_DEFAULT) {
6993 coll->caseLevel = (UColAttributeValue)coll->options->caseLevel;
6994 coll->caseLevelisDefault = TRUE;
6995 } else {
6996 *status = U_ILLEGAL_ARGUMENT_ERROR ;
6997 }
6998 break;
6999 case UCOL_NORMALIZATION_MODE: /* attribute for normalization */
7000 if(value == UCOL_ON) {
7001 coll->normalizationMode = UCOL_ON;
7002 coll->normalizationModeisDefault = FALSE;
7003 } else if (value == UCOL_OFF) {
7004 coll->normalizationMode = UCOL_OFF;
7005 coll->normalizationModeisDefault = FALSE;
7006 } else if (value == UCOL_DEFAULT) {
7007 coll->normalizationModeisDefault = TRUE;
7008 coll->normalizationMode = (UColAttributeValue)coll->options->normalizationMode;
7009 } else {
7010 *status = U_ILLEGAL_ARGUMENT_ERROR ;
7011 }
7012 break;
7013 case UCOL_STRENGTH: /* attribute for strength */
7014 if (value == UCOL_DEFAULT) {
7015 coll->strengthisDefault = TRUE;
7016 coll->strength = (UColAttributeValue)coll->options->strength;
7017 } else if (value <= UCOL_IDENTICAL) {
7018 coll->strengthisDefault = FALSE;
7019 coll->strength = value;
7020 } else {
7021 *status = U_ILLEGAL_ARGUMENT_ERROR ;
7022 }
7023 break;
7024 case UCOL_ATTRIBUTE_COUNT:
7025 default:
7026 *status = U_ILLEGAL_ARGUMENT_ERROR;
7027 break;
7028 }
7029 if(oldFrench != coll->frenchCollation || oldCaseFirst != coll->caseFirst) {
7030 coll->latinOneRegenTable = TRUE;
7031 } else {
7032 coll->latinOneRegenTable = FALSE;
7033 }
7034 ucol_updateInternalState(coll, status);
7035 }
7036
7037 U_CAPI UColAttributeValue U_EXPORT2
ucol_getAttribute(const UCollator * coll,UColAttribute attr,UErrorCode * status)7038 ucol_getAttribute(const UCollator *coll, UColAttribute attr, UErrorCode *status) {
7039 if(U_FAILURE(*status) || coll == NULL) {
7040 return UCOL_DEFAULT;
7041 }
7042 switch(attr) {
7043 case UCOL_NUMERIC_COLLATION:
7044 return coll->numericCollation;
7045 case UCOL_HIRAGANA_QUATERNARY_MODE:
7046 return coll->hiraganaQ;
7047 case UCOL_FRENCH_COLLATION: /* attribute for direction of secondary weights*/
7048 return coll->frenchCollation;
7049 case UCOL_ALTERNATE_HANDLING: /* attribute for handling variable elements*/
7050 return coll->alternateHandling;
7051 case UCOL_CASE_FIRST: /* who goes first, lower case or uppercase */
7052 return coll->caseFirst;
7053 case UCOL_CASE_LEVEL: /* do we have an extra case level */
7054 return coll->caseLevel;
7055 case UCOL_NORMALIZATION_MODE: /* attribute for normalization */
7056 return coll->normalizationMode;
7057 case UCOL_STRENGTH: /* attribute for strength */
7058 return coll->strength;
7059 case UCOL_ATTRIBUTE_COUNT:
7060 default:
7061 *status = U_ILLEGAL_ARGUMENT_ERROR;
7062 break;
7063 }
7064 return UCOL_DEFAULT;
7065 }
7066
7067 U_CAPI void U_EXPORT2
ucol_setStrength(UCollator * coll,UCollationStrength strength)7068 ucol_setStrength( UCollator *coll,
7069 UCollationStrength strength)
7070 {
7071 UErrorCode status = U_ZERO_ERROR;
7072 ucol_setAttribute(coll, UCOL_STRENGTH, strength, &status);
7073 }
7074
7075 U_CAPI UCollationStrength U_EXPORT2
ucol_getStrength(const UCollator * coll)7076 ucol_getStrength(const UCollator *coll)
7077 {
7078 UErrorCode status = U_ZERO_ERROR;
7079 return ucol_getAttribute(coll, UCOL_STRENGTH, &status);
7080 }
7081
7082 /****************************************************************************/
7083 /* Following are misc functions */
7084 /* there are new APIs and some compatibility APIs */
7085 /****************************************************************************/
7086
7087 U_CAPI void U_EXPORT2
ucol_getVersion(const UCollator * coll,UVersionInfo versionInfo)7088 ucol_getVersion(const UCollator* coll,
7089 UVersionInfo versionInfo)
7090 {
7091 /* RunTime version */
7092 uint8_t rtVersion = UCOL_RUNTIME_VERSION;
7093 /* Builder version*/
7094 uint8_t bdVersion = coll->image->version[0];
7095
7096 /* Charset Version. Need to get the version from cnv files
7097 * makeconv should populate cnv files with version and
7098 * an api has to be provided in ucnv.h to obtain this version
7099 */
7100 uint8_t csVersion = 0;
7101
7102 /* combine the version info */
7103 uint16_t cmbVersion = (uint16_t)((rtVersion<<11) | (bdVersion<<6) | (csVersion));
7104
7105 /* Tailoring rules */
7106 versionInfo[0] = (uint8_t)(cmbVersion>>8);
7107 versionInfo[1] = (uint8_t)cmbVersion;
7108 versionInfo[2] = coll->image->version[1];
7109 if(coll->UCA) {
7110 /* Include the minor number when getting the UCA version. (major & 1f) << 3 | (minor & 7) */
7111 versionInfo[3] = (coll->UCA->image->UCAVersion[0] & 0x1f) << 3 | (coll->UCA->image->UCAVersion[1] & 0x07);
7112 } else {
7113 versionInfo[3] = 0;
7114 }
7115 }
7116
7117
7118 /* This internal API checks whether a character is tailored or not */
7119 U_CAPI UBool U_EXPORT2
ucol_isTailored(const UCollator * coll,const UChar u,UErrorCode * status)7120 ucol_isTailored(const UCollator *coll, const UChar u, UErrorCode *status) {
7121 if(U_FAILURE(*status) || coll == NULL || coll == coll->UCA) {
7122 return FALSE;
7123 }
7124
7125 uint32_t CE = UCOL_NOT_FOUND;
7126 const UChar *ContractionStart = NULL;
7127 if(u < 0x100) { /* latin-1 */
7128 CE = coll->latinOneMapping[u];
7129 if(coll->UCA && CE == coll->UCA->latinOneMapping[u]) {
7130 return FALSE;
7131 }
7132 } else { /* regular */
7133 CE = UTRIE_GET32_FROM_LEAD(&coll->mapping, u);
7134 }
7135
7136 if(isContraction(CE)) {
7137 ContractionStart = (UChar *)coll->image+getContractOffset(CE);
7138 CE = *(coll->contractionCEs + (ContractionStart- coll->contractionIndex));
7139 }
7140
7141 return (UBool)(CE != UCOL_NOT_FOUND);
7142 }
7143
7144
7145 /****************************************************************************/
7146 /* Following are the string compare functions */
7147 /* */
7148 /****************************************************************************/
7149
7150
7151 /* ucol_checkIdent internal function. Does byte level string compare. */
7152 /* Used by strcoll if strength == identical and strings */
7153 /* are otherwise equal. */
7154 /* */
7155 /* Comparison must be done on NFD normalized strings. */
7156 /* FCD is not good enough. */
7157
7158 static
ucol_checkIdent(collIterate * sColl,collIterate * tColl,UBool normalize,UErrorCode * status)7159 UCollationResult ucol_checkIdent(collIterate *sColl, collIterate *tColl, UBool normalize, UErrorCode *status)
7160 {
7161 // When we arrive here, we can have normal strings or UCharIterators. Currently they are both
7162 // of same type, but that doesn't really mean that it will stay that way.
7163 int32_t comparison;
7164
7165 if (sColl->flags & UCOL_USE_ITERATOR) {
7166 // The division for the array length may truncate the array size to
7167 // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
7168 // for all platforms anyway.
7169 UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
7170 UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
7171 UNormIterator *sNIt = NULL, *tNIt = NULL;
7172 sNIt = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), status);
7173 tNIt = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), status);
7174 sColl->iterator->move(sColl->iterator, 0, UITER_START);
7175 tColl->iterator->move(tColl->iterator, 0, UITER_START);
7176 UCharIterator *sIt = unorm_setIter(sNIt, sColl->iterator, UNORM_NFD, status);
7177 UCharIterator *tIt = unorm_setIter(tNIt, tColl->iterator, UNORM_NFD, status);
7178 comparison = u_strCompareIter(sIt, tIt, TRUE);
7179 unorm_closeIter(sNIt);
7180 unorm_closeIter(tNIt);
7181 } else {
7182 int32_t sLen = (sColl->flags & UCOL_ITER_HASLEN) ? (int32_t)(sColl->endp - sColl->string) : -1;
7183 const UChar *sBuf = sColl->string;
7184 int32_t tLen = (tColl->flags & UCOL_ITER_HASLEN) ? (int32_t)(tColl->endp - tColl->string) : -1;
7185 const UChar *tBuf = tColl->string;
7186
7187 if (normalize) {
7188 *status = U_ZERO_ERROR;
7189 // Note: We could use Normalizer::compare() or similar, but for short strings
7190 // which may not be in FCD it might be faster to just NFD them.
7191 // Note: spanQuickCheckYes() + normalizeSecondAndAppend() rather than
7192 // NFD'ing immediately might be faster for long strings,
7193 // but string comparison is usually done on relatively short strings.
7194 sColl->nfd->normalize(UnicodeString((sColl->flags & UCOL_ITER_HASLEN) == 0, sBuf, sLen),
7195 sColl->writableBuffer,
7196 *status);
7197 tColl->nfd->normalize(UnicodeString((tColl->flags & UCOL_ITER_HASLEN) == 0, tBuf, tLen),
7198 tColl->writableBuffer,
7199 *status);
7200 if(U_FAILURE(*status)) {
7201 return UCOL_LESS;
7202 }
7203 comparison = sColl->writableBuffer.compareCodePointOrder(tColl->writableBuffer);
7204 } else {
7205 comparison = u_strCompare(sBuf, sLen, tBuf, tLen, TRUE);
7206 }
7207 }
7208
7209 if (comparison < 0) {
7210 return UCOL_LESS;
7211 } else if (comparison == 0) {
7212 return UCOL_EQUAL;
7213 } else /* comparison > 0 */ {
7214 return UCOL_GREATER;
7215 }
7216 }
7217
/* CEBuf - a struct and some inline functions that handle the saving */
/*         of CEs in a buffer within ucol_strcoll                    */

/* Number of CEs that fit into the in-struct array of a ucol_CEBuf. */
#define UCOL_CEBUF_SIZE 512
typedef struct ucol_CEBuf {
    uint32_t    *buf;                           /* start of the storage currently in use */
    uint32_t    *endp;                          /* one past the end of that storage */
    uint32_t    *pos;                           /* where the next CE gets written */
    uint32_t     localArray[UCOL_CEBUF_SIZE];   /* initial storage, part of the struct itself */
} ucol_CEBuf;


/* Resets a CE buffer: empty, and backed by its own localArray. */
static
inline void UCOL_INIT_CEBUF(ucol_CEBuf *b) {
    uint32_t *start = b->localArray;

    b->buf  = start;
    b->pos  = start;
    b->endp = start + UCOL_CEBUF_SIZE;
}
7235
7236 static
ucol_CEBuf_Expand(ucol_CEBuf * b,collIterate * ci,UErrorCode * status)7237 void ucol_CEBuf_Expand(ucol_CEBuf *b, collIterate *ci, UErrorCode *status) {
7238 uint32_t oldSize;
7239 uint32_t newSize;
7240 uint32_t *newBuf;
7241
7242 ci->flags |= UCOL_ITER_ALLOCATED;
7243 oldSize = (uint32_t)(b->pos - b->buf);
7244 newSize = oldSize * 2;
7245 newBuf = (uint32_t *)uprv_malloc(newSize * sizeof(uint32_t));
7246 if(newBuf == NULL) {
7247 *status = U_MEMORY_ALLOCATION_ERROR;
7248 }
7249 else {
7250 uprv_memcpy(newBuf, b->buf, oldSize * sizeof(uint32_t));
7251 if (b->buf != b->localArray) {
7252 uprv_free(b->buf);
7253 }
7254 b->buf = newBuf;
7255 b->endp = b->buf + newSize;
7256 b->pos = b->buf + oldSize;
7257 }
7258 }
7259
7260 static
UCOL_CEBUF_PUT(ucol_CEBuf * b,uint32_t ce,collIterate * ci,UErrorCode * status)7261 inline void UCOL_CEBUF_PUT(ucol_CEBuf *b, uint32_t ce, collIterate *ci, UErrorCode *status) {
7262 if (b->pos == b->endp) {
7263 ucol_CEBuf_Expand(b, ci, status);
7264 }
7265 if (U_SUCCESS(*status)) {
7266 *(b)->pos++ = ce;
7267 }
7268 }
7269
7270 /* This is a trick string compare function that goes in and uses sortkeys to compare */
7271 /* It is used when compare gets in trouble and needs to bail out */
ucol_compareUsingSortKeys(collIterate * sColl,collIterate * tColl,UErrorCode * status)7272 static UCollationResult ucol_compareUsingSortKeys(collIterate *sColl,
7273 collIterate *tColl,
7274 UErrorCode *status)
7275 {
7276 uint8_t sourceKey[UCOL_MAX_BUFFER], targetKey[UCOL_MAX_BUFFER];
7277 uint8_t *sourceKeyP = sourceKey;
7278 uint8_t *targetKeyP = targetKey;
7279 int32_t sourceKeyLen = UCOL_MAX_BUFFER, targetKeyLen = UCOL_MAX_BUFFER;
7280 const UCollator *coll = sColl->coll;
7281 const UChar *source = NULL;
7282 const UChar *target = NULL;
7283 int32_t result = UCOL_EQUAL;
7284 UnicodeString sourceString, targetString;
7285 int32_t sourceLength;
7286 int32_t targetLength;
7287
7288 if(sColl->flags & UCOL_USE_ITERATOR) {
7289 sColl->iterator->move(sColl->iterator, 0, UITER_START);
7290 tColl->iterator->move(tColl->iterator, 0, UITER_START);
7291 UChar32 c;
7292 while((c=sColl->iterator->next(sColl->iterator))>=0) {
7293 sourceString.append((UChar)c);
7294 }
7295 while((c=tColl->iterator->next(tColl->iterator))>=0) {
7296 targetString.append((UChar)c);
7297 }
7298 source = sourceString.getBuffer();
7299 sourceLength = sourceString.length();
7300 target = targetString.getBuffer();
7301 targetLength = targetString.length();
7302 } else { // no iterators
7303 sourceLength = (sColl->flags&UCOL_ITER_HASLEN)?(int32_t)(sColl->endp-sColl->string):-1;
7304 targetLength = (tColl->flags&UCOL_ITER_HASLEN)?(int32_t)(tColl->endp-tColl->string):-1;
7305 source = sColl->string;
7306 target = tColl->string;
7307 }
7308
7309
7310
7311 sourceKeyLen = ucol_getSortKey(coll, source, sourceLength, sourceKeyP, sourceKeyLen);
7312 if(sourceKeyLen > UCOL_MAX_BUFFER) {
7313 sourceKeyP = (uint8_t*)uprv_malloc(sourceKeyLen*sizeof(uint8_t));
7314 if(sourceKeyP == NULL) {
7315 *status = U_MEMORY_ALLOCATION_ERROR;
7316 goto cleanup_and_do_compare;
7317 }
7318 sourceKeyLen = ucol_getSortKey(coll, source, sourceLength, sourceKeyP, sourceKeyLen);
7319 }
7320
7321 targetKeyLen = ucol_getSortKey(coll, target, targetLength, targetKeyP, targetKeyLen);
7322 if(targetKeyLen > UCOL_MAX_BUFFER) {
7323 targetKeyP = (uint8_t*)uprv_malloc(targetKeyLen*sizeof(uint8_t));
7324 if(targetKeyP == NULL) {
7325 *status = U_MEMORY_ALLOCATION_ERROR;
7326 goto cleanup_and_do_compare;
7327 }
7328 targetKeyLen = ucol_getSortKey(coll, target, targetLength, targetKeyP, targetKeyLen);
7329 }
7330
7331 result = uprv_strcmp((const char*)sourceKeyP, (const char*)targetKeyP);
7332
7333 cleanup_and_do_compare:
7334 if(sourceKeyP != NULL && sourceKeyP != sourceKey) {
7335 uprv_free(sourceKeyP);
7336 }
7337
7338 if(targetKeyP != NULL && targetKeyP != targetKey) {
7339 uprv_free(targetKeyP);
7340 }
7341
7342 if(result<0) {
7343 return UCOL_LESS;
7344 } else if(result>0) {
7345 return UCOL_GREATER;
7346 } else {
7347 return UCOL_EQUAL;
7348 }
7349 }
7350
7351
/*
 * ucol_strcollRegular - general string comparison on two collIterate contexts.
 *
 * Collation elements (CEs) are fetched from both contexts during the primary
 * pass and saved in two ucol_CEBuf buffers.  When no primary difference is
 * found, the saved CEs are re-examined for every further level that the
 * collator settings ask for: secondary (forward, or backward for French),
 * case level, tertiary and shifted quaternary.  For identical strength,
 * ucol_checkIdent() is the final tiebreaker.
 *
 * When the Hiragana quaternary option is on together with shifted alternate
 * handling, the comparison is handed over to ucol_compareUsingSortKeys().
 *
 * sColl, tColl  initialized iteration contexts for the source and the target
 * status        error code, passed to the CE fetching and buffering calls
 *
 * Returns UCOL_LESS, UCOL_EQUAL or UCOL_GREATER.
 */
static UCollationResult
ucol_strcollRegular(collIterate *sColl, collIterate *tColl, UErrorCode *status)
{
    U_ALIGN_CODE(16);

    const UCollator *coll = sColl->coll;


    // setting up the collator parameters
    UColAttributeValue strength = coll->strength;
    UBool initialCheckSecTer = (strength  >= UCOL_SECONDARY);

    UBool checkSecTer = initialCheckSecTer;
    UBool checkTertiary = (strength  >= UCOL_TERTIARY);
    UBool checkQuad = (strength  >= UCOL_QUATERNARY);
    UBool checkIdent = (strength == UCOL_IDENTICAL);
    UBool checkCase = (coll->caseLevel == UCOL_ON);
    UBool isFrenchSec = (coll->frenchCollation == UCOL_ON) && checkSecTer;
    UBool shifted = (coll->alternateHandling == UCOL_SHIFTED);
    UBool qShifted = shifted && checkQuad;
    UBool doHiragana = (coll->hiraganaQ == UCOL_ON) && checkQuad;

    // Hiragana quaternary combined with shifted is not handled below;
    // compare through sort keys instead.
    if(doHiragana && shifted) {
        return (ucol_compareUsingSortKeys(sColl, tColl, status));
    }
    uint8_t caseSwitch = coll->caseSwitch;
    uint8_t tertiaryMask = coll->tertiaryMask;

    // This is the lowest primary value that will not be ignored if shifted
    uint32_t LVT = (shifted)?(coll->variableTopValue<<16):0;

    UCollationResult result = UCOL_EQUAL;
    UCollationResult hirResult = UCOL_EQUAL;

    // Preparing the CE buffers. They will be filled during the primary phase
    ucol_CEBuf   sCEs;
    ucol_CEBuf   tCEs;
    UCOL_INIT_CEBUF(&sCEs);
    UCOL_INIT_CEBUF(&tCEs);

    uint32_t secS = 0, secT = 0;
    uint32_t sOrder=0, tOrder=0;

    // Non shifted primary processing is quite simple
    if(!shifted) {
        for(;;) {

            // We fetch CEs until we hit a non ignorable primary or end.
            do {
                // We get the next CE
                sOrder = ucol_IGetNextCE(coll, sColl, status);
                // Stuff it in the buffer
                UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
                // And keep just the primary part.
                sOrder &= UCOL_PRIMARYMASK;
            } while(sOrder == 0);

            // see the comments on the above block
            do {
                tOrder = ucol_IGetNextCE(coll, tColl, status);
                UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
                tOrder &= UCOL_PRIMARYMASK;
            } while(tOrder == 0);

            // if both primaries are the same
            if(sOrder == tOrder) {
                // and there are no more CEs, we advance to the next level
                if(sOrder == UCOL_NO_MORE_CES_PRIMARY) {
                    break;
                }
                // Remember the first position where only one side has the
                // UCOL_WAS_HIRAGANA flag set; the side with the flag sorts first.
                if(doHiragana && hirResult == UCOL_EQUAL) {
                    if((sColl->flags & UCOL_WAS_HIRAGANA) != (tColl->flags & UCOL_WAS_HIRAGANA)) {
                        hirResult = ((sColl->flags & UCOL_WAS_HIRAGANA) > (tColl->flags & UCOL_WAS_HIRAGANA))
                            ? UCOL_LESS:UCOL_GREATER;
                    }
                }
            } else {
                // if two primaries are different, we are done
                result = (sOrder < tOrder) ?  UCOL_LESS: UCOL_GREATER;
                goto commonReturn;
            }
        } // no primary difference... do the rest from the buffers
    } else { // shifted - do a slightly more complicated processing :)
        for(;;) {
            // sInShifted/tInShifted: the last regular CE seen had a non-zero
            // primary that is not above LVT, i.e. it was shifted
            UBool sInShifted = FALSE;
            UBool tInShifted = FALSE;
            // This version of code can be refactored. However, it seems easier to understand this way.
            // Source loop. Same as the target loop.
            for(;;) {
                sOrder = ucol_IGetNextCE(coll, sColl, status);
                if(sOrder == UCOL_NO_MORE_CES) {
                    UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
                    break;
                } else if(sOrder == 0 || (sInShifted && (sOrder & UCOL_PRIMARYMASK) == 0)) {
                    /* UCA amendment - ignore ignorables that follow shifted code points */
                    continue;
                } else if(isContinuation(sOrder)) {
                    if((sOrder & UCOL_PRIMARYMASK) > 0) { /* There is primary value */
                        if(sInShifted) {
                            sOrder = (sOrder & UCOL_PRIMARYMASK) | 0xC0; /* preserve interesting continuation */
                            UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
                            continue;
                        } else {
                            UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
                            break;
                        }
                    } else { /* Just lower level values */
                        if(sInShifted) {
                            continue;
                        } else {
                            UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
                            continue;
                        }
                    }
                } else { /* regular */
                    if((sOrder & UCOL_PRIMARYMASK) > LVT) {
                        /* primary above the variable top: this is the CE to compare */
                        UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
                        break;
                    } else {
                        if((sOrder & UCOL_PRIMARYMASK) > 0) {
                            /* shifted: only the primary bits are stored */
                            sInShifted = TRUE;
                            sOrder &= UCOL_PRIMARYMASK;
                            UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
                            continue;
                        } else {
                            UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
                            sInShifted = FALSE;
                            continue;
                        }
                    }
                }
            }
            sOrder &= UCOL_PRIMARYMASK;
            sInShifted = FALSE;

            // Target loop. See the comments in the source loop.
            for(;;) {
                tOrder = ucol_IGetNextCE(coll, tColl, status);
                if(tOrder == UCOL_NO_MORE_CES) {
                    UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
                    break;
                } else if(tOrder == 0 || (tInShifted && (tOrder & UCOL_PRIMARYMASK) == 0)) {
                    /* UCA amendment - ignore ignorables that follow shifted code points */
                    continue;
                } else if(isContinuation(tOrder)) {
                    if((tOrder & UCOL_PRIMARYMASK) > 0) { /* There is primary value */
                        if(tInShifted) {
                            tOrder = (tOrder & UCOL_PRIMARYMASK) | 0xC0; /* preserve interesting continuation */
                            UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
                            continue;
                        } else {
                            UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
                            break;
                        }
                    } else { /* Just lower level values */
                        if(tInShifted) {
                            continue;
                        } else {
                            UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
                            continue;
                        }
                    }
                } else { /* regular */
                    if((tOrder & UCOL_PRIMARYMASK) > LVT) {
                        UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
                        break;
                    } else {
                        if((tOrder & UCOL_PRIMARYMASK) > 0) {
                            tInShifted = TRUE;
                            tOrder &= UCOL_PRIMARYMASK;
                            UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
                            continue;
                        } else {
                            UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
                            tInShifted = FALSE;
                            continue;
                        }
                    }
                }
            }
            tOrder &= UCOL_PRIMARYMASK;
            tInShifted = FALSE;

            if(sOrder == tOrder) {
                /*
                if(doHiragana && hirResult == UCOL_EQUAL) {
                if((sColl.flags & UCOL_WAS_HIRAGANA) != (tColl.flags & UCOL_WAS_HIRAGANA)) {
                hirResult = ((sColl.flags & UCOL_WAS_HIRAGANA) > (tColl.flags & UCOL_WAS_HIRAGANA))
                ? UCOL_LESS:UCOL_GREATER;
                }
                }
                */
                if(sOrder == UCOL_NO_MORE_CES_PRIMARY) {
                    break;
                } else {
                    sOrder = 0;
                    tOrder = 0;
                    continue;
                }
            } else {
                result = (sOrder < tOrder) ? UCOL_LESS : UCOL_GREATER;
                goto commonReturn;
            }
        } /* no primary difference... do the rest from the buffers */
    }

    /* now, we're gonna reexamine collected CEs */
    uint32_t    *sCE;
    uint32_t    *tCE;

    /* This is the secondary level of comparison */
    if(checkSecTer) {
        if(!isFrenchSec) { /* normal */
            sCE = sCEs.buf;
            tCE = tCEs.buf;
            for(;;) {
                /* skip over CEs that have no secondary bits */
                while (secS == 0) {
                    secS = *(sCE++) & UCOL_SECONDARYMASK;
                }

                while(secT == 0) {
                    secT = *(tCE++) & UCOL_SECONDARYMASK;
                }

                if(secS == secT) {
                    if(secS == UCOL_NO_MORE_CES_SECONDARY) {
                        break;
                    } else {
                        secS = 0; secT = 0;
                        continue;
                    }
                } else {
                    result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
                    goto commonReturn;
                }
            }
        } else { /* do the French */
            /* The buffers are walked backwards. A run of continuation CEs is  */
            /* read forwards; sCESave/tCESave remember where to resume the     */
            /* backward walk once the run is done.                             */
            uint32_t *sCESave = NULL;
            uint32_t *tCESave = NULL;
            sCE = sCEs.pos-2; /* this could also be sCEs-- if needs to be optimized */
            tCE = tCEs.pos-2;
            for(;;) {
                while (secS == 0 && sCE >= sCEs.buf) {
                    if(sCESave == 0) {
                        secS = *(sCE--);
                        if(isContinuation(secS)) {
                            while(isContinuation(secS = *(sCE--)))
                                ;
                            /* after this, secS has the start of continuation, and sCEs points before that */
                            sCESave = sCE; /* we save it, so that we know where to come back AND that we need to go forward */
                            sCE+=2;  /* need to point to the first continuation CP */
                            /* However, now you can just continue doing stuff */
                        }
                    } else {
                        secS = *(sCE++);
                        if(!isContinuation(secS)) { /* This means we have finished with this cont */
                            sCE = sCESave;            /* reset the pointer to before continuation */
                            sCESave = 0;
                            continue;
                        }
                    }
                    secS &= UCOL_SECONDARYMASK; /* remove the continuation bit */
                }

                while(secT == 0 && tCE >= tCEs.buf) {
                    if(tCESave == 0) {
                        secT = *(tCE--);
                        if(isContinuation(secT)) {
                            while(isContinuation(secT = *(tCE--)))
                                ;
                            /* after this, secT has the start of continuation, and tCE points before that */
                            tCESave = tCE; /* we save it, so that we know where to come back AND that we need to go forward */
                            tCE+=2;  /* need to point to the first continuation CP */
                            /* However, now you can just continue doing stuff */
                        }
                    } else {
                        secT = *(tCE++);
                        if(!isContinuation(secT)) { /* This means we have finished with this cont */
                            tCE = tCESave;          /* reset the pointer to before continuation */
                            tCESave = 0;
                            continue;
                        }
                    }
                    secT &= UCOL_SECONDARYMASK; /* remove the continuation bit */
                }

                if(secS == secT) {
                    if(secS == UCOL_NO_MORE_CES_SECONDARY || (sCE < sCEs.buf && tCE < tCEs.buf)) {
                        break;
                    } else {
                        secS = 0; secT = 0;
                        continue;
                    }
                } else {
                    result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
                    goto commonReturn;
                }
            }
        }
    }

    /* doing the case bit */
    if(checkCase) {
        sCE = sCEs.buf;
        tCE = tCEs.buf;
        for(;;) {
            while((secS & UCOL_REMOVE_CASE) == 0) {
                if(!isContinuation(*sCE++)) {
                    secS =*(sCE-1);
                    if(((secS & UCOL_PRIMARYMASK) != 0) || strength > UCOL_PRIMARY) {
                        // primary ignorables should not be considered on the case level when the strength is primary
                        // otherwise, the CEs stop being well-formed
                        secS &= UCOL_TERT_CASE_MASK;
                        secS ^= caseSwitch;
                    } else {
                        secS = 0;
                    }
                } else {
                    secS = 0;
                }
            }

            while((secT & UCOL_REMOVE_CASE) == 0) {
                if(!isContinuation(*tCE++)) {
                    secT = *(tCE-1);
                    if(((secT & UCOL_PRIMARYMASK) != 0) || strength > UCOL_PRIMARY) {
                        // primary ignorables should not be considered on the case level when the strength is primary
                        // otherwise, the CEs stop being well-formed
                        secT &= UCOL_TERT_CASE_MASK;
                        secT ^= caseSwitch;
                    } else {
                        secT = 0;
                    }
                } else {
                    secT = 0;
                }
            }

            if((secS & UCOL_CASE_BIT_MASK) < (secT & UCOL_CASE_BIT_MASK)) {
                result = UCOL_LESS;
                goto commonReturn;
            } else if((secS & UCOL_CASE_BIT_MASK) > (secT & UCOL_CASE_BIT_MASK)) {
                result = UCOL_GREATER;
                goto commonReturn;
            }

            // the level is done as soon as either side has reached the terminating CE
            if((secS & UCOL_REMOVE_CASE) == UCOL_NO_MORE_CES_TERTIARY || (secT & UCOL_REMOVE_CASE) == UCOL_NO_MORE_CES_TERTIARY ) {
                break;
            } else {
                secS = 0;
                secT = 0;
            }
        }
    }

    /* Tertiary level */
    if(checkTertiary) {
        secS = 0;
        secT = 0;
        sCE = sCEs.buf;
        tCE = tCEs.buf;
        for(;;) {
            /* caseSwitch is applied to non-continuation CEs only; */
            /* continuation CEs have their case bits removed       */
            while((secS & UCOL_REMOVE_CASE) == 0) {
                secS = *(sCE++) & tertiaryMask;
                if(!isContinuation(secS)) {
                    secS ^= caseSwitch;
                } else {
                    secS &= UCOL_REMOVE_CASE;
                }
            }

            while((secT & UCOL_REMOVE_CASE)  == 0) {
                secT = *(tCE++) & tertiaryMask;
                if(!isContinuation(secT)) {
                    secT ^= caseSwitch;
                } else {
                    secT &= UCOL_REMOVE_CASE;
                }
            }

            if(secS == secT) {
                if((secS & UCOL_REMOVE_CASE) == 1) {
                    break;
                } else {
                    secS = 0; secT = 0;
                    continue;
                }
            } else {
                result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
                goto commonReturn;
            }
        }
    }


    /* Quaternary level, done only for shifted alternate handling. */
    if(qShifted /*checkQuad*/) {
        UBool sInShifted = TRUE;
        UBool tInShifted = TRUE;
        secS = 0;
        secT = 0;
        sCE = sCEs.buf;
        tCE = tCEs.buf;
        for(;;) {
            while(secS == 0 && secS != UCOL_NO_MORE_CES || (isContinuation(secS) && !sInShifted)) {
                secS = *(sCE++);
                if(isContinuation(secS)) {
                    if(!sInShifted) {
                        continue;
                    }
                } else if(secS > LVT || (secS & UCOL_PRIMARYMASK) == 0) { /* non continuation */
                    /* not shifted: every such CE is replaced by the same value */
                    secS = UCOL_PRIMARYMASK;
                    sInShifted = FALSE;
                } else {
                    sInShifted = TRUE;
                }
            }
            secS &= UCOL_PRIMARYMASK;


            while(secT == 0 && secT != UCOL_NO_MORE_CES || (isContinuation(secT) && !tInShifted)) {
                secT = *(tCE++);
                if(isContinuation(secT)) {
                    if(!tInShifted) {
                        continue;
                    }
                } else if(secT > LVT || (secT & UCOL_PRIMARYMASK) == 0) {
                    secT = UCOL_PRIMARYMASK;
                    tInShifted = FALSE;
                } else {
                    tInShifted = TRUE;
                }
            }
            secT &= UCOL_PRIMARYMASK;

            if(secS == secT) {
                if(secS == UCOL_NO_MORE_CES_PRIMARY) {
                    break;
                } else {
                    secS = 0; secT = 0;
                    continue;
                }
            } else {
                result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
                goto commonReturn;
            }
        }
    } else if(doHiragana && hirResult != UCOL_EQUAL) {
        // If we're fine on quaternaries, we might be different
        // on Hiragana. This, however, might fail us in shifted.
        result = hirResult;
        goto commonReturn;
    }

    /*  For IDENTICAL comparisons, we use a bitwise character comparison */
    /*  as a tiebreaker if all else is equal.                                */
    /*  Getting here  should be quite rare - strings are not identical -     */
    /*     that is checked first, but compared == through all other checks.  */
    if(checkIdent)
    {
        //result = ucol_checkIdent(&sColl, &tColl, coll->normalizationMode == UCOL_ON);
        result = ucol_checkIdent(sColl, tColl, TRUE, status);
    }

commonReturn:
    // Release CE buffers that were moved to the heap by ucol_CEBuf_Expand().
    if ((sColl->flags | tColl->flags) & UCOL_ITER_ALLOCATED) {
        if (sCEs.buf != sCEs.localArray ) {
            uprv_free(sCEs.buf);
        }
        if (tCEs.buf != tCEs.localArray ) {
            uprv_free(tCEs.buf);
        }
    }

    return result;
}
7826
7827 static UCollationResult
ucol_strcollRegular(const UCollator * coll,const UChar * source,int32_t sourceLength,const UChar * target,int32_t targetLength,UErrorCode * status)7828 ucol_strcollRegular(const UCollator *coll,
7829 const UChar *source, int32_t sourceLength,
7830 const UChar *target, int32_t targetLength,
7831 UErrorCode *status) {
7832 collIterate sColl, tColl;
7833 // Preparing the context objects for iterating over strings
7834 IInit_collIterate(coll, source, sourceLength, &sColl, status);
7835 IInit_collIterate(coll, target, targetLength, &tColl, status);
7836 if(U_FAILURE(*status)) {
7837 return UCOL_LESS;
7838 }
7839 return ucol_strcollRegular(&sColl, &tColl, status);
7840 }
7841
7842 static inline uint32_t
ucol_getLatinOneContraction(const UCollator * coll,int32_t strength,uint32_t CE,const UChar * s,int32_t * index,int32_t len)7843 ucol_getLatinOneContraction(const UCollator *coll, int32_t strength,
7844 uint32_t CE, const UChar *s, int32_t *index, int32_t len)
7845 {
7846 const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE&0xFFF);
7847 int32_t latinOneOffset = (CE & 0x00FFF000) >> 12;
7848 int32_t offset = 1;
7849 UChar schar = 0, tchar = 0;
7850
7851 for(;;) {
7852 if(len == -1) {
7853 if(s[*index] == 0) { // end of string
7854 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
7855 } else {
7856 schar = s[*index];
7857 }
7858 } else {
7859 if(*index == len) {
7860 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
7861 } else {
7862 schar = s[*index];
7863 }
7864 }
7865
7866 while(schar > (tchar = *(UCharOffset+offset))) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
7867 offset++;
7868 }
7869
7870 if (schar == tchar) {
7871 (*index)++;
7872 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset+offset]);
7873 }
7874 else
7875 {
7876 if(schar & 0xFF00 /*> UCOL_ENDOFLATIN1RANGE*/) {
7877 return UCOL_BAIL_OUT_CE;
7878 }
7879 // skip completely ignorables
7880 uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, schar);
7881 if(isZeroCE == 0) { // we have to ignore completely ignorables
7882 (*index)++;
7883 continue;
7884 }
7885
7886 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
7887 }
7888 }
7889 }
7890
7891
/**
 * This is a fast strcoll, geared towards text in Latin-1.
 * It supports contractions of size two, French secondaries
 * and case switching. You can use it with strengths primary
 * to tertiary. It does not support shifted and case level.
 * It relies on the table built by setupLatin1Table. If it
 * doesn't understand something, it will go to the regular
 * strcoll.
 *
 * Each level reads its values from its own section of coll->latinOneCEs;
 * the sections are coll->latinOneTableLen entries apart. Two differing
 * values are compared top byte first and are shifted left by 8 bits as long
 * as the top bytes are the same.
 *
 * @param coll   collator to use
 * @param source source string
 * @param sLen   length of source, or -1 if it is zero terminated
 * @param target target string
 * @param tLen   length of target, or -1 if it is zero terminated
 * @param status error code, passed on to the regular strcoll when bailing out
 * @return UCOL_LESS, UCOL_EQUAL or UCOL_GREATER
 */
static UCollationResult
ucol_strcollUseLatin1( const UCollator    *coll,
              const UChar        *source,
              int32_t            sLen,
              const UChar        *target,
              int32_t            tLen,
              UErrorCode *status)
{
    U_ALIGN_CODE(16);
    int32_t strength = coll->strength;

    int32_t sIndex = 0, tIndex = 0;
    UChar sChar = 0, tChar = 0;
    uint32_t sOrder=0, tOrder=0;

    UBool endOfSource = FALSE;

    uint32_t *elements = coll->latinOneCEs;

    UBool haveContractions = FALSE; // if we have contractions in our string
    // we cannot do French secondary

    // Do the primary level
    for(;;) {
        while(sOrder==0) { // this loop skips primary ignorables
            // sOrder=getNextlatinOneCE(source);
            if(sLen==-1) {   // handling zero terminated strings
                sChar=source[sIndex++];
                if(sChar==0) {
                    endOfSource = TRUE;
                    break;
                }
            } else {        // handling strings with known length
                if(sIndex==sLen) {
                    endOfSource = TRUE;
                    break;
                }
                sChar=source[sIndex++];
            }
            if(sChar&0xFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32)
                //fprintf(stderr, "R");
                return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
            }
            sOrder = elements[sChar];
            if(sOrder >= UCOL_NOT_FOUND) { // if we got a special
                // specials can basically be either contractions or bail-out signs. If we get anything
                // else, we'll bail out anyway
                if(getCETag(sOrder) == CONTRACTION_TAG) {
                    sOrder = ucol_getLatinOneContraction(coll, UCOL_PRIMARY, sOrder, source, &sIndex, sLen);
                    haveContractions = TRUE; // if there are contractions, we cannot do French secondary
                    // However, if there are contractions in the table, but we always use just one char,
                    // we might be able to do French. This should be checked out.
                }
                if(sOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) {
                    //fprintf(stderr, "S");
                    return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
                }
            }
        }

        while(tOrder==0) {  // this loop skips primary ignorables
            // tOrder=getNextlatinOneCE(target);
            if(tLen==-1) {    // handling zero terminated strings
                tChar=target[tIndex++];
                if(tChar==0) {
                    if(endOfSource) { // this is different than source loop,
                        // as we already know that source loop is done here,
                        // so we can either finish the primary loop if both
                        // strings are done or announce the result if only
                        // target is done. Same below.
                        goto endOfPrimLoop;
                    } else {
                        return UCOL_GREATER;
                    }
                }
            } else {          // handling strings with known length
                if(tIndex==tLen) {
                    if(endOfSource) {
                        goto endOfPrimLoop;
                    } else {
                        return UCOL_GREATER;
                    }
                }
                tChar=target[tIndex++];
            }
            if(tChar&0xFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32)
                //fprintf(stderr, "R");
                return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
            }
            tOrder = elements[tChar];
            if(tOrder >= UCOL_NOT_FOUND) {
                // Handling specials, see the comments for source
                if(getCETag(tOrder) == CONTRACTION_TAG) {
                    tOrder = ucol_getLatinOneContraction(coll, UCOL_PRIMARY, tOrder, target, &tIndex, tLen);
                    haveContractions = TRUE;
                }
                if(tOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) {
                    //fprintf(stderr, "S");
                    return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
                }
            }
        }
        if(endOfSource) { // source is finished, but target is not, say the result.
            return UCOL_LESS;
        }

        if(sOrder == tOrder) { // if we have same CEs, we continue the loop
            sOrder = 0; tOrder = 0;
            continue;
        } else {
            // compare current top bytes
            if(((sOrder^tOrder)&0xFF000000)!=0) {
                // top bytes differ, return difference
                if(sOrder < tOrder) {
                    return UCOL_LESS;
                } else if(sOrder > tOrder) {
                    return UCOL_GREATER;
                }
                // instead of return (int32_t)(sOrder>>24)-(int32_t)(tOrder>>24);
                // since we must return enum value
            }

            // top bytes match, continue with following bytes
            sOrder<<=8;
            tOrder<<=8;
        }
    }

endOfPrimLoop:
    // after primary loop, we definitely know the sizes of strings,
    // so we set it and use simpler loop for secondaries and tertiaries
    sLen = sIndex; tLen = tIndex;
    if(strength >= UCOL_SECONDARY) {
        // adjust the table beginning
        elements += coll->latinOneTableLen;
        endOfSource = FALSE;

        if(coll->frenchCollation == UCOL_OFF) { // non French
            // This loop is a simplified copy of primary loop
            // at this point we know that whole strings are latin-1, so we don't
            // check for that. We also know that we only have contractions as
            // specials.
            sIndex = 0; tIndex = 0;
            for(;;) {
                while(sOrder==0) {
                    if(sIndex==sLen) {
                        endOfSource = TRUE;
                        break;
                    }
                    sChar=source[sIndex++];
                    sOrder = elements[sChar];
                    if(sOrder > UCOL_NOT_FOUND) {
                        sOrder = ucol_getLatinOneContraction(coll, UCOL_SECONDARY, sOrder, source, &sIndex, sLen);
                    }
                }

                while(tOrder==0) {
                    if(tIndex==tLen) {
                        if(endOfSource) {
                            goto endOfSecLoop;
                        } else {
                            return UCOL_GREATER;
                        }
                    }
                    tChar=target[tIndex++];
                    tOrder = elements[tChar];
                    if(tOrder > UCOL_NOT_FOUND) {
                        tOrder = ucol_getLatinOneContraction(coll, UCOL_SECONDARY, tOrder, target, &tIndex, tLen);
                    }
                }
                if(endOfSource) {
                    return UCOL_LESS;
                }

                if(sOrder == tOrder) {
                    sOrder = 0; tOrder = 0;
                    continue;
                } else {
                    // see primary loop for comments on this
                    if(((sOrder^tOrder)&0xFF000000)!=0) {
                        if(sOrder < tOrder) {
                            return UCOL_LESS;
                        } else if(sOrder > tOrder) {
                            return UCOL_GREATER;
                        }
                    }
                    sOrder<<=8;
                    tOrder<<=8;
                }
            }
        } else { // French
            if(haveContractions) { // if we have contractions, we have to bail out
                // since we don't really know how to handle them here
                return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
            }
            // For French, we go backwards
            sIndex = sLen; tIndex = tLen;
            for(;;) {
                while(sOrder==0) {
                    if(sIndex==0) {
                        endOfSource = TRUE;
                        break;
                    }
                    sChar=source[--sIndex];
                    sOrder = elements[sChar];
                    // don't even look for contractions
                }

                while(tOrder==0) {
                    if(tIndex==0) {
                        if(endOfSource) {
                            goto endOfSecLoop;
                        } else {
                            return UCOL_GREATER;
                        }
                    }
                    tChar=target[--tIndex];
                    tOrder = elements[tChar];
                    // don't even look for contractions
                }
                if(endOfSource) {
                    return UCOL_LESS;
                }

                if(sOrder == tOrder) {
                    sOrder = 0; tOrder = 0;
                    continue;
                } else {
                    // see the primary loop for comments
                    if(((sOrder^tOrder)&0xFF000000)!=0) {
                        if(sOrder < tOrder) {
                            return UCOL_LESS;
                        } else if(sOrder > tOrder) {
                            return UCOL_GREATER;
                        }
                    }
                    sOrder<<=8;
                    tOrder<<=8;
                }
            }
        }
    }

endOfSecLoop:
    if(strength >= UCOL_TERTIARY) {
        // tertiary loop is the same as secondary (except no French)
        elements += coll->latinOneTableLen;
        sIndex = 0; tIndex = 0;
        endOfSource = FALSE;
        for(;;) {
            while(sOrder==0) {
                if(sIndex==sLen) {
                    endOfSource = TRUE;
                    break;
                }
                sChar=source[sIndex++];
                sOrder = elements[sChar];
                if(sOrder > UCOL_NOT_FOUND) {
                    sOrder = ucol_getLatinOneContraction(coll, UCOL_TERTIARY, sOrder, source, &sIndex, sLen);
                }
            }
            while(tOrder==0) {
                if(tIndex==tLen) {
                    if(endOfSource) {
                        return UCOL_EQUAL; // if both strings are at the end, they are equal
                    } else {
                        return UCOL_GREATER;
                    }
                }
                tChar=target[tIndex++];
                tOrder = elements[tChar];
                if(tOrder > UCOL_NOT_FOUND) {
                    tOrder = ucol_getLatinOneContraction(coll, UCOL_TERTIARY, tOrder, target, &tIndex, tLen);
                }
            }
            if(endOfSource) {
                return UCOL_LESS;
            }
            if(sOrder == tOrder) {
                sOrder = 0; tOrder = 0;
                continue;
            } else {
                // see the primary loop for comments
                if(((sOrder^tOrder)&0xff000000)!=0) {
                    if(sOrder < tOrder) {
                        return UCOL_LESS;
                    } else if(sOrder > tOrder) {
                        return UCOL_GREATER;
                    }
                }
                sOrder<<=8;
                tOrder<<=8;
            }
        }
    }
    return UCOL_EQUAL;
}
8197
8198
8199 U_CAPI UCollationResult U_EXPORT2
ucol_strcollIter(const UCollator * coll,UCharIterator * sIter,UCharIterator * tIter,UErrorCode * status)8200 ucol_strcollIter( const UCollator *coll,
8201 UCharIterator *sIter,
8202 UCharIterator *tIter,
8203 UErrorCode *status)
8204 {
8205 if(!status || U_FAILURE(*status)) {
8206 return UCOL_EQUAL;
8207 }
8208
8209 UTRACE_ENTRY(UTRACE_UCOL_STRCOLLITER);
8210 UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, sIter=%p, tIter=%p", coll, sIter, tIter);
8211
8212 if (sIter == tIter) {
8213 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status)
8214 return UCOL_EQUAL;
8215 }
8216 if(sIter == NULL || tIter == NULL || coll == NULL) {
8217 *status = U_ILLEGAL_ARGUMENT_ERROR;
8218 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status)
8219 return UCOL_EQUAL;
8220 }
8221
8222 UCollationResult result = UCOL_EQUAL;
8223
8224 // Preparing the context objects for iterating over strings
8225 collIterate sColl, tColl;
8226 IInit_collIterate(coll, NULL, -1, &sColl, status);
8227 IInit_collIterate(coll, NULL, -1, &tColl, status);
8228 if(U_FAILURE(*status)) {
8229 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status)
8230 return UCOL_EQUAL;
8231 }
8232 // The division for the array length may truncate the array size to
8233 // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
8234 // for all platforms anyway.
8235 UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
8236 UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
8237 UNormIterator *sNormIter = NULL, *tNormIter = NULL;
8238
8239 sColl.iterator = sIter;
8240 sColl.flags |= UCOL_USE_ITERATOR;
8241 tColl.flags |= UCOL_USE_ITERATOR;
8242 tColl.iterator = tIter;
8243
8244 if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON) {
8245 sNormIter = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), status);
8246 sColl.iterator = unorm_setIter(sNormIter, sIter, UNORM_FCD, status);
8247 sColl.flags &= ~UCOL_ITER_NORM;
8248
8249 tNormIter = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), status);
8250 tColl.iterator = unorm_setIter(tNormIter, tIter, UNORM_FCD, status);
8251 tColl.flags &= ~UCOL_ITER_NORM;
8252 }
8253
8254 UChar32 sChar = U_SENTINEL, tChar = U_SENTINEL;
8255
8256 while((sChar = sColl.iterator->next(sColl.iterator)) ==
8257 (tChar = tColl.iterator->next(tColl.iterator))) {
8258 if(sChar == U_SENTINEL) {
8259 result = UCOL_EQUAL;
8260 goto end_compare;
8261 }
8262 }
8263
8264 if(sChar == U_SENTINEL) {
8265 tChar = tColl.iterator->previous(tColl.iterator);
8266 }
8267
8268 if(tChar == U_SENTINEL) {
8269 sChar = sColl.iterator->previous(sColl.iterator);
8270 }
8271
8272 sChar = sColl.iterator->previous(sColl.iterator);
8273 tChar = tColl.iterator->previous(tColl.iterator);
8274
8275 if (ucol_unsafeCP((UChar)sChar, coll) || ucol_unsafeCP((UChar)tChar, coll))
8276 {
8277 // We are stopped in the middle of a contraction.
8278 // Scan backwards through the == part of the string looking for the start of the contraction.
8279 // It doesn't matter which string we scan, since they are the same in this region.
8280 do
8281 {
8282 sChar = sColl.iterator->previous(sColl.iterator);
8283 tChar = tColl.iterator->previous(tColl.iterator);
8284 }
8285 while (sChar != U_SENTINEL && ucol_unsafeCP((UChar)sChar, coll));
8286 }
8287
8288
8289 if(U_SUCCESS(*status)) {
8290 result = ucol_strcollRegular(&sColl, &tColl, status);
8291 }
8292
8293 end_compare:
8294 if(sNormIter || tNormIter) {
8295 unorm_closeIter(sNormIter);
8296 unorm_closeIter(tNormIter);
8297 }
8298
8299 UTRACE_EXIT_VALUE_STATUS(result, *status)
8300 return result;
8301 }
8302
8303
8304 /* */
8305 /* ucol_strcoll Main public API string comparison function */
8306 /* */
8307 U_CAPI UCollationResult U_EXPORT2
ucol_strcoll(const UCollator * coll,const UChar * source,int32_t sourceLength,const UChar * target,int32_t targetLength)8308 ucol_strcoll( const UCollator *coll,
8309 const UChar *source,
8310 int32_t sourceLength,
8311 const UChar *target,
8312 int32_t targetLength)
8313 {
8314 U_ALIGN_CODE(16);
8315
8316 UTRACE_ENTRY(UTRACE_UCOL_STRCOLL);
8317 if (UTRACE_LEVEL(UTRACE_VERBOSE)) {
8318 UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source=%p, target=%p", coll, source, target);
8319 UTRACE_DATA2(UTRACE_VERBOSE, "source string = %vh ", source, sourceLength);
8320 UTRACE_DATA2(UTRACE_VERBOSE, "target string = %vh ", target, targetLength);
8321 }
8322
8323 if(source == NULL || target == NULL) {
8324 // do not crash, but return. Should have
8325 // status argument to return error.
8326 UTRACE_EXIT_VALUE(UCOL_EQUAL);
8327 return UCOL_EQUAL;
8328 }
8329
8330 /* Quick check if source and target are same strings. */
8331 /* They should either both be NULL terminated or the explicit length should be set on both. */
8332 if (source==target && sourceLength==targetLength) {
8333 UTRACE_EXIT_VALUE(UCOL_EQUAL);
8334 return UCOL_EQUAL;
8335 }
8336
8337 /* Scan the strings. Find: */
8338 /* The length of any leading portion that is equal */
8339 /* Whether they are exactly equal. (in which case we just return) */
8340 const UChar *pSrc = source;
8341 const UChar *pTarg = target;
8342 int32_t equalLength;
8343
8344 if (sourceLength == -1 && targetLength == -1) {
8345 // Both strings are null terminated.
8346 // Scan through any leading equal portion.
8347 while (*pSrc == *pTarg && *pSrc != 0) {
8348 pSrc++;
8349 pTarg++;
8350 }
8351 if (*pSrc == 0 && *pTarg == 0) {
8352 UTRACE_EXIT_VALUE(UCOL_EQUAL);
8353 return UCOL_EQUAL;
8354 }
8355 equalLength = (int32_t)(pSrc - source);
8356 }
8357 else
8358 {
8359 // One or both strings has an explicit length.
8360 const UChar *pSrcEnd = source + sourceLength;
8361 const UChar *pTargEnd = target + targetLength;
8362
8363 // Scan while the strings are bitwise ==, or until one is exhausted.
8364 for (;;) {
8365 if (pSrc == pSrcEnd || pTarg == pTargEnd) {
8366 break;
8367 }
8368 if ((*pSrc == 0 && sourceLength == -1) || (*pTarg == 0 && targetLength == -1)) {
8369 break;
8370 }
8371 if (*pSrc != *pTarg) {
8372 break;
8373 }
8374 pSrc++;
8375 pTarg++;
8376 }
8377 equalLength = (int32_t)(pSrc - source);
8378
8379 // If we made it all the way through both strings, we are done. They are ==
8380 if ((pSrc ==pSrcEnd || (pSrcEnd <pSrc && *pSrc==0)) && /* At end of src string, however it was specified. */
8381 (pTarg==pTargEnd || (pTargEnd<pTarg && *pTarg==0))) /* and also at end of dest string */
8382 {
8383 UTRACE_EXIT_VALUE(UCOL_EQUAL);
8384 return UCOL_EQUAL;
8385 }
8386 }
8387 if (equalLength > 0) {
8388 /* There is an identical portion at the beginning of the two strings. */
8389 /* If the identical portion ends within a contraction or a comibining */
8390 /* character sequence, back up to the start of that sequence. */
8391
8392 // These values should already be set by the code above.
8393 //pSrc = source + equalLength; /* point to the first differing chars */
8394 //pTarg = target + equalLength;
8395 if (pSrc != source+sourceLength && ucol_unsafeCP(*pSrc, coll) ||
8396 pTarg != target+targetLength && ucol_unsafeCP(*pTarg, coll))
8397 {
8398 // We are stopped in the middle of a contraction.
8399 // Scan backwards through the == part of the string looking for the start of the contraction.
8400 // It doesn't matter which string we scan, since they are the same in this region.
8401 do
8402 {
8403 equalLength--;
8404 pSrc--;
8405 }
8406 while (equalLength>0 && ucol_unsafeCP(*pSrc, coll));
8407 }
8408
8409 source += equalLength;
8410 target += equalLength;
8411 if (sourceLength > 0) {
8412 sourceLength -= equalLength;
8413 }
8414 if (targetLength > 0) {
8415 targetLength -= equalLength;
8416 }
8417 }
8418
8419 UErrorCode status = U_ZERO_ERROR;
8420 UCollationResult returnVal;
8421 if(!coll->latinOneUse || (sourceLength > 0 && *source&0xff00) || (targetLength > 0 && *target&0xff00)) {
8422 returnVal = ucol_strcollRegular(coll, source, sourceLength, target, targetLength, &status);
8423 } else {
8424 returnVal = ucol_strcollUseLatin1(coll, source, sourceLength, target, targetLength, &status);
8425 }
8426 UTRACE_EXIT_VALUE(returnVal);
8427 return returnVal;
8428 }
8429
8430 /* convenience function for comparing strings */
8431 U_CAPI UBool U_EXPORT2
ucol_greater(const UCollator * coll,const UChar * source,int32_t sourceLength,const UChar * target,int32_t targetLength)8432 ucol_greater( const UCollator *coll,
8433 const UChar *source,
8434 int32_t sourceLength,
8435 const UChar *target,
8436 int32_t targetLength)
8437 {
8438 return (ucol_strcoll(coll, source, sourceLength, target, targetLength)
8439 == UCOL_GREATER);
8440 }
8441
8442 /* convenience function for comparing strings */
8443 U_CAPI UBool U_EXPORT2
ucol_greaterOrEqual(const UCollator * coll,const UChar * source,int32_t sourceLength,const UChar * target,int32_t targetLength)8444 ucol_greaterOrEqual( const UCollator *coll,
8445 const UChar *source,
8446 int32_t sourceLength,
8447 const UChar *target,
8448 int32_t targetLength)
8449 {
8450 return (ucol_strcoll(coll, source, sourceLength, target, targetLength)
8451 != UCOL_LESS);
8452 }
8453
8454 /* convenience function for comparing strings */
8455 U_CAPI UBool U_EXPORT2
ucol_equal(const UCollator * coll,const UChar * source,int32_t sourceLength,const UChar * target,int32_t targetLength)8456 ucol_equal( const UCollator *coll,
8457 const UChar *source,
8458 int32_t sourceLength,
8459 const UChar *target,
8460 int32_t targetLength)
8461 {
8462 return (ucol_strcoll(coll, source, sourceLength, target, targetLength)
8463 == UCOL_EQUAL);
8464 }
8465
8466 U_CAPI void U_EXPORT2
ucol_getUCAVersion(const UCollator * coll,UVersionInfo info)8467 ucol_getUCAVersion(const UCollator* coll, UVersionInfo info) {
8468 if(coll && coll->UCA) {
8469 uprv_memcpy(info, coll->UCA->image->UCAVersion, sizeof(UVersionInfo));
8470 }
8471 }
8472
8473 #endif /* #if !UCONFIG_NO_COLLATION */
8474