1 /*
2 *******************************************************************************
3 * Copyright (C) 1996-2013, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 *******************************************************************************
6 * file name: ucol.cpp
7 * encoding: US-ASCII
8 * tab size: 8 (not used)
9 * indentation:4
10 *
11 * Modification history
12 * Date Name Comments
13 * 1996-1999 various members of ICU team maintained C API for collation framework
14 * 02/16/2001 synwee Added internal method getPrevSpecialCE
15 * 03/01/2001 synwee Added maxexpansion functionality.
16 * 03/16/2001 weiv Collation framework is rewritten in C and made UCA compliant
17 */
18
19 #include "unicode/utypes.h"
20
21 #if !UCONFIG_NO_COLLATION
22
23 #include "unicode/bytestream.h"
24 #include "unicode/coleitr.h"
25 #include "unicode/unorm.h"
26 #include "unicode/udata.h"
27 #include "unicode/ustring.h"
28 #include "unicode/utf8.h"
29
30 #include "ucol_imp.h"
31 #include "bocsu.h"
32
33 #include "normalizer2impl.h"
34 #include "unorm_it.h"
35 #include "umutex.h"
36 #include "cmemory.h"
37 #include "ucln_in.h"
38 #include "cstring.h"
39 #include "utracimp.h"
40 #include "putilimp.h"
41 #include "uassert.h"
42 #include "unicode/coll.h"
43
44 #ifdef UCOL_DEBUG
45 #include <stdio.h>
46 #endif
47
48 U_NAMESPACE_USE
49
50 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
51
52 #define LAST_BYTE_MASK_ 0xFF
53 #define SECOND_LAST_BYTE_SHIFT_ 8
54
55 #define ZERO_CC_LIMIT_ 0xC0
56
57 // These are static pointers to the NFC/NFD implementation instance.
58 // Each of them is always the same between calls to u_cleanup
59 // and therefore writing to it is not synchronized.
60 // They are cleaned in ucol_cleanup
61 static const Normalizer2 *g_nfd = NULL;
62 static const Normalizer2Impl *g_nfcImpl = NULL;
63
// These are values from UCA required for
// implicit generation and suppressing sort key compression.
// They should normally be in the UCA, but if one
// is running without UCA, it could be a problem.
68 static const int32_t maxRegularPrimary = 0x7A;
69 static const int32_t minImplicitPrimary = 0xE0;
70 static const int32_t maxImplicitPrimary = 0xE4;
71
72 U_CDECL_BEGIN
73 static UBool U_CALLCONV
ucol_cleanup(void)74 ucol_cleanup(void)
75 {
76 g_nfd = NULL;
77 g_nfcImpl = NULL;
78 return TRUE;
79 }
80
81 static int32_t U_CALLCONV
_getFoldingOffset(uint32_t data)82 _getFoldingOffset(uint32_t data) {
83 return (int32_t)(data&0xFFFFFF);
84 }
85
86 U_CDECL_END
87
88 static inline
initializeNFD(UErrorCode * status)89 UBool initializeNFD(UErrorCode *status) {
90 if (g_nfd != NULL) {
91 return TRUE;
92 } else {
93 // The result is constant, until the library is reloaded.
94 g_nfd = Normalizer2Factory::getNFDInstance(*status);
95 ucln_i18n_registerCleanup(UCLN_I18N_UCOL, ucol_cleanup);
96 return U_SUCCESS(*status);
97 }
98 }
99
100 // init FCD data
101 static inline
initializeFCD(UErrorCode * status)102 UBool initializeFCD(UErrorCode *status) {
103 if (g_nfcImpl != NULL) {
104 return TRUE;
105 } else {
106 // The result is constant, until the library is reloaded.
107 g_nfcImpl = Normalizer2Factory::getNFCImpl(*status);
108 // Note: Alternatively, we could also store this pointer in each collIterate struct,
109 // same as Normalizer2Factory::getImpl(collIterate->nfd).
110 ucln_i18n_registerCleanup(UCLN_I18N_UCOL, ucol_cleanup);
111 return U_SUCCESS(*status);
112 }
113 }
114
115 static
IInit_collIterate(const UCollator * collator,const UChar * sourceString,int32_t sourceLen,collIterate * s,UErrorCode * status)116 inline void IInit_collIterate(const UCollator *collator, const UChar *sourceString,
117 int32_t sourceLen, collIterate *s,
118 UErrorCode *status)
119 {
120 (s)->string = (s)->pos = sourceString;
121 (s)->origFlags = 0;
122 (s)->flags = 0;
123 if (sourceLen >= 0) {
124 s->flags |= UCOL_ITER_HASLEN;
125 (s)->endp = (UChar *)sourceString+sourceLen;
126 }
127 else {
128 /* change to enable easier checking for end of string for fcdpositon */
129 (s)->endp = NULL;
130 }
131 (s)->extendCEs = NULL;
132 (s)->extendCEsSize = 0;
133 (s)->CEpos = (s)->toReturn = (s)->CEs;
134 (s)->offsetBuffer = NULL;
135 (s)->offsetBufferSize = 0;
136 (s)->offsetReturn = (s)->offsetStore = NULL;
137 (s)->offsetRepeatCount = (s)->offsetRepeatValue = 0;
138 (s)->coll = (collator);
139 if (initializeNFD(status)) {
140 (s)->nfd = g_nfd;
141 } else {
142 return;
143 }
144 (s)->fcdPosition = 0;
145 if(collator->normalizationMode == UCOL_ON) {
146 (s)->flags |= UCOL_ITER_NORM;
147 }
148 if(collator->hiraganaQ == UCOL_ON && collator->strength >= UCOL_QUATERNARY) {
149 (s)->flags |= UCOL_HIRAGANA_Q;
150 }
151 (s)->iterator = NULL;
152 //(s)->iteratorIndex = 0;
153 }
154
155 U_CAPI void U_EXPORT2
uprv_init_collIterate(const UCollator * collator,const UChar * sourceString,int32_t sourceLen,collIterate * s,UErrorCode * status)156 uprv_init_collIterate(const UCollator *collator, const UChar *sourceString,
157 int32_t sourceLen, collIterate *s,
158 UErrorCode *status) {
159 /* Out-of-line version for use from other files. */
160 IInit_collIterate(collator, sourceString, sourceLen, s, status);
161 }
162
163 U_CAPI collIterate * U_EXPORT2
uprv_new_collIterate(UErrorCode * status)164 uprv_new_collIterate(UErrorCode *status) {
165 if(U_FAILURE(*status)) {
166 return NULL;
167 }
168 collIterate *s = new collIterate;
169 if(s == NULL) {
170 *status = U_MEMORY_ALLOCATION_ERROR;
171 return NULL;
172 }
173 return s;
174 }
175
176 U_CAPI void U_EXPORT2
uprv_delete_collIterate(collIterate * s)177 uprv_delete_collIterate(collIterate *s) {
178 delete s;
179 }
180
181 U_CAPI UBool U_EXPORT2
uprv_collIterateAtEnd(collIterate * s)182 uprv_collIterateAtEnd(collIterate *s) {
183 return s == NULL || s->pos == s->endp;
184 }
185
186 /**
187 * Backup the state of the collIterate struct data
188 * @param data collIterate to backup
189 * @param backup storage
190 */
191 static
backupState(const collIterate * data,collIterateState * backup)192 inline void backupState(const collIterate *data, collIterateState *backup)
193 {
194 backup->fcdPosition = data->fcdPosition;
195 backup->flags = data->flags;
196 backup->origFlags = data->origFlags;
197 backup->pos = data->pos;
198 backup->bufferaddress = data->writableBuffer.getBuffer();
199 backup->buffersize = data->writableBuffer.length();
200 backup->iteratorMove = 0;
201 backup->iteratorIndex = 0;
202 if(data->iterator != NULL) {
203 //backup->iteratorIndex = data->iterator->getIndex(data->iterator, UITER_CURRENT);
204 backup->iteratorIndex = data->iterator->getState(data->iterator);
205 // no we try to fixup if we're using a normalizing iterator and we get UITER_NO_STATE
206 if(backup->iteratorIndex == UITER_NO_STATE) {
207 while((backup->iteratorIndex = data->iterator->getState(data->iterator)) == UITER_NO_STATE) {
208 backup->iteratorMove++;
209 data->iterator->move(data->iterator, -1, UITER_CURRENT);
210 }
211 data->iterator->move(data->iterator, backup->iteratorMove, UITER_CURRENT);
212 }
213 }
214 }
215
216 /**
217 * Loads the state into the collIterate struct data
218 * @param data collIterate to backup
219 * @param backup storage
220 * @param forwards boolean to indicate if forwards iteration is used,
221 * false indicates backwards iteration
222 */
223 static
loadState(collIterate * data,const collIterateState * backup,UBool forwards)224 inline void loadState(collIterate *data, const collIterateState *backup,
225 UBool forwards)
226 {
227 UErrorCode status = U_ZERO_ERROR;
228 data->flags = backup->flags;
229 data->origFlags = backup->origFlags;
230 if(data->iterator != NULL) {
231 //data->iterator->move(data->iterator, backup->iteratorIndex, UITER_ZERO);
232 data->iterator->setState(data->iterator, backup->iteratorIndex, &status);
233 if(backup->iteratorMove != 0) {
234 data->iterator->move(data->iterator, backup->iteratorMove, UITER_CURRENT);
235 }
236 }
237 data->pos = backup->pos;
238
239 if ((data->flags & UCOL_ITER_INNORMBUF) &&
240 data->writableBuffer.getBuffer() != backup->bufferaddress) {
241 /*
242 this is when a new buffer has been reallocated and we'll have to
243 calculate the new position.
244 note the new buffer has to contain the contents of the old buffer.
245 */
246 if (forwards) {
247 data->pos = data->writableBuffer.getTerminatedBuffer() +
248 (data->pos - backup->bufferaddress);
249 }
250 else {
251 /* backwards direction */
252 int32_t temp = backup->buffersize -
253 (int32_t)(data->pos - backup->bufferaddress);
254 data->pos = data->writableBuffer.getTerminatedBuffer() + (data->writableBuffer.length() - temp);
255 }
256 }
257 if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
258 /*
259 this is alittle tricky.
260 if we are initially not in the normalization buffer, even if we
261 normalize in the later stage, the data in the buffer will be
262 ignored, since we skip back up to the data string.
263 however if we are already in the normalization buffer, any
264 further normalization will pull data into the normalization
265 buffer and modify the fcdPosition.
266 since we are keeping the data in the buffer for use, the
267 fcdPosition can not be reverted back.
268 arrgghh....
269 */
270 data->fcdPosition = backup->fcdPosition;
271 }
272 }
273
274 static UBool
reallocCEs(collIterate * data,int32_t newCapacity)275 reallocCEs(collIterate *data, int32_t newCapacity) {
276 uint32_t *oldCEs = data->extendCEs;
277 if(oldCEs == NULL) {
278 oldCEs = data->CEs;
279 }
280 int32_t length = data->CEpos - oldCEs;
281 uint32_t *newCEs = (uint32_t *)uprv_malloc(newCapacity * 4);
282 if(newCEs == NULL) {
283 return FALSE;
284 }
285 uprv_memcpy(newCEs, oldCEs, length * 4);
286 uprv_free(data->extendCEs);
287 data->extendCEs = newCEs;
288 data->extendCEsSize = newCapacity;
289 data->CEpos = newCEs + length;
290 return TRUE;
291 }
292
293 static UBool
increaseCEsCapacity(collIterate * data)294 increaseCEsCapacity(collIterate *data) {
295 int32_t oldCapacity;
296 if(data->extendCEs != NULL) {
297 oldCapacity = data->extendCEsSize;
298 } else {
299 oldCapacity = LENGTHOF(data->CEs);
300 }
301 return reallocCEs(data, 2 * oldCapacity);
302 }
303
304 static UBool
ensureCEsCapacity(collIterate * data,int32_t minCapacity)305 ensureCEsCapacity(collIterate *data, int32_t minCapacity) {
306 int32_t oldCapacity;
307 if(data->extendCEs != NULL) {
308 oldCapacity = data->extendCEsSize;
309 } else {
310 oldCapacity = LENGTHOF(data->CEs);
311 }
312 if(minCapacity <= oldCapacity) {
313 return TRUE;
314 }
315 oldCapacity *= 2;
316 return reallocCEs(data, minCapacity > oldCapacity ? minCapacity : oldCapacity);
317 }
318
appendOffset(int32_t offset,UErrorCode & errorCode)319 void collIterate::appendOffset(int32_t offset, UErrorCode &errorCode) {
320 if(U_FAILURE(errorCode)) {
321 return;
322 }
323 int32_t length = offsetStore == NULL ? 0 : (int32_t)(offsetStore - offsetBuffer);
324 U_ASSERT(length >= offsetBufferSize || offsetStore != NULL);
325 if(length >= offsetBufferSize) {
326 int32_t newCapacity = 2 * offsetBufferSize + UCOL_EXPAND_CE_BUFFER_SIZE;
327 int32_t *newBuffer = static_cast<int32_t *>(uprv_malloc(newCapacity * 4));
328 if(newBuffer == NULL) {
329 errorCode = U_MEMORY_ALLOCATION_ERROR;
330 return;
331 }
332 if(length > 0) {
333 uprv_memcpy(newBuffer, offsetBuffer, length * 4);
334 }
335 uprv_free(offsetBuffer);
336 offsetBuffer = newBuffer;
337 offsetStore = offsetBuffer + length;
338 offsetBufferSize = newCapacity;
339 }
340 *offsetStore++ = offset;
341 }
342
343 /*
344 * collIter_eos()
345 * Checks for a collIterate being positioned at the end of
346 * its source string.
347 *
348 */
349 static
collIter_eos(collIterate * s)350 inline UBool collIter_eos(collIterate *s) {
351 if(s->flags & UCOL_USE_ITERATOR) {
352 return !(s->iterator->hasNext(s->iterator));
353 }
354 if ((s->flags & UCOL_ITER_HASLEN) == 0 && *s->pos != 0) {
355 // Null terminated string, but not at null, so not at end.
356 // Whether in main or normalization buffer doesn't matter.
357 return FALSE;
358 }
359
360 // String with length. Can't be in normalization buffer, which is always
361 // null termintated.
362 if (s->flags & UCOL_ITER_HASLEN) {
363 return (s->pos == s->endp);
364 }
365
366 // We are at a null termination, could be either normalization buffer or main string.
367 if ((s->flags & UCOL_ITER_INNORMBUF) == 0) {
368 // At null at end of main string.
369 return TRUE;
370 }
371
372 // At null at end of normalization buffer. Need to check whether there there are
373 // any characters left in the main buffer.
374 if(s->origFlags & UCOL_USE_ITERATOR) {
375 return !(s->iterator->hasNext(s->iterator));
376 } else if ((s->origFlags & UCOL_ITER_HASLEN) == 0) {
377 // Null terminated main string. fcdPosition is the 'return' position into main buf.
378 return (*s->fcdPosition == 0);
379 }
380 else {
381 // Main string with an end pointer.
382 return s->fcdPosition == s->endp;
383 }
384 }
385
386 /*
387 * collIter_bos()
388 * Checks for a collIterate being positioned at the start of
389 * its source string.
390 *
391 */
392 static
collIter_bos(collIterate * source)393 inline UBool collIter_bos(collIterate *source) {
394 // if we're going backwards, we need to know whether there is more in the
395 // iterator, even if we are in the side buffer
396 if(source->flags & UCOL_USE_ITERATOR || source->origFlags & UCOL_USE_ITERATOR) {
397 return !source->iterator->hasPrevious(source->iterator);
398 }
399 if (source->pos <= source->string ||
400 ((source->flags & UCOL_ITER_INNORMBUF) &&
401 *(source->pos - 1) == 0 && source->fcdPosition == NULL)) {
402 return TRUE;
403 }
404 return FALSE;
405 }
406
407 /*static
408 inline UBool collIter_SimpleBos(collIterate *source) {
409 // if we're going backwards, we need to know whether there is more in the
410 // iterator, even if we are in the side buffer
411 if(source->flags & UCOL_USE_ITERATOR || source->origFlags & UCOL_USE_ITERATOR) {
412 return !source->iterator->hasPrevious(source->iterator);
413 }
414 if (source->pos == source->string) {
415 return TRUE;
416 }
417 return FALSE;
418 }*/
419 //return (data->pos == data->string) ||
420
421
422 /****************************************************************************/
423 /* Following are the open/close functions */
424 /* */
425 /****************************************************************************/
426
427 static UCollator*
ucol_initFromBinary(const uint8_t * bin,int32_t length,const UCollator * base,UCollator * fillIn,UErrorCode * status)428 ucol_initFromBinary(const uint8_t *bin, int32_t length,
429 const UCollator *base,
430 UCollator *fillIn,
431 UErrorCode *status)
432 {
433 UCollator *result = fillIn;
434 if(U_FAILURE(*status)) {
435 return NULL;
436 }
437 /*
438 if(base == NULL) {
439 // we don't support null base yet
440 *status = U_ILLEGAL_ARGUMENT_ERROR;
441 return NULL;
442 }
443 */
444 // We need these and we could be running without UCA
445 uprv_uca_initImplicitConstants(status);
446 UCATableHeader *colData = (UCATableHeader *)bin;
447 // do we want version check here? We're trying to figure out whether collators are compatible
448 if((base && (uprv_memcmp(colData->UCAVersion, base->image->UCAVersion, sizeof(UVersionInfo)) != 0 ||
449 uprv_memcmp(colData->UCDVersion, base->image->UCDVersion, sizeof(UVersionInfo)) != 0)) ||
450 colData->version[0] != UCOL_BUILDER_VERSION)
451 {
452 *status = U_COLLATOR_VERSION_MISMATCH;
453 return NULL;
454 }
455 else {
456 if((uint32_t)length > (paddedsize(sizeof(UCATableHeader)) + paddedsize(sizeof(UColOptionSet)))) {
457 result = ucol_initCollator((const UCATableHeader *)bin, result, base, status);
458 if(U_FAILURE(*status)){
459 return NULL;
460 }
461 result->hasRealData = TRUE;
462 }
463 else {
464 if(base) {
465 result = ucol_initCollator(base->image, result, base, status);
466 ucol_setOptionsFromHeader(result, (UColOptionSet *)(bin+((const UCATableHeader *)bin)->options), status);
467 if(U_FAILURE(*status)){
468 return NULL;
469 }
470 result->hasRealData = FALSE;
471 }
472 else {
473 *status = U_USELESS_COLLATOR_ERROR;
474 return NULL;
475 }
476 }
477 result->freeImageOnClose = FALSE;
478 }
479 result->actualLocale = NULL;
480 result->validLocale = NULL;
481 result->requestedLocale = NULL;
482 result->rules = NULL;
483 result->rulesLength = 0;
484 result->freeRulesOnClose = FALSE;
485 result->ucaRules = NULL;
486 return result;
487 }
488
489 U_CAPI UCollator* U_EXPORT2
ucol_openBinary(const uint8_t * bin,int32_t length,const UCollator * base,UErrorCode * status)490 ucol_openBinary(const uint8_t *bin, int32_t length,
491 const UCollator *base,
492 UErrorCode *status)
493 {
494 return ucol_initFromBinary(bin, length, base, NULL, status);
495 }
496
497 U_CAPI int32_t U_EXPORT2
ucol_cloneBinary(const UCollator * coll,uint8_t * buffer,int32_t capacity,UErrorCode * status)498 ucol_cloneBinary(const UCollator *coll,
499 uint8_t *buffer, int32_t capacity,
500 UErrorCode *status)
501 {
502 int32_t length = 0;
503 if(U_FAILURE(*status)) {
504 return length;
505 }
506 if(capacity < 0) {
507 *status = U_ILLEGAL_ARGUMENT_ERROR;
508 return length;
509 }
510 if(coll->hasRealData == TRUE) {
511 length = coll->image->size;
512 if(length <= capacity) {
513 uprv_memcpy(buffer, coll->image, length);
514 } else {
515 *status = U_BUFFER_OVERFLOW_ERROR;
516 }
517 } else {
518 length = (int32_t)(paddedsize(sizeof(UCATableHeader))+paddedsize(sizeof(UColOptionSet)));
519 if(length <= capacity) {
520 /* build the UCATableHeader with minimal entries */
521 /* do not copy the header from the UCA file because its values are wrong! */
522 /* uprv_memcpy(result, UCA->image, sizeof(UCATableHeader)); */
523
524 /* reset everything */
525 uprv_memset(buffer, 0, length);
526
527 /* set the tailoring-specific values */
528 UCATableHeader *myData = (UCATableHeader *)buffer;
529 myData->size = length;
530
531 /* offset for the options, the only part of the data that is present after the header */
532 myData->options = sizeof(UCATableHeader);
533
534 /* need to always set the expansion value for an upper bound of the options */
535 myData->expansion = myData->options + sizeof(UColOptionSet);
536
537 myData->magic = UCOL_HEADER_MAGIC;
538 myData->isBigEndian = U_IS_BIG_ENDIAN;
539 myData->charSetFamily = U_CHARSET_FAMILY;
540
541 /* copy UCA's version; genrb will override all but the builder version with tailoring data */
542 uprv_memcpy(myData->version, coll->image->version, sizeof(UVersionInfo));
543
544 uprv_memcpy(myData->UCAVersion, coll->image->UCAVersion, sizeof(UVersionInfo));
545 uprv_memcpy(myData->UCDVersion, coll->image->UCDVersion, sizeof(UVersionInfo));
546 uprv_memcpy(myData->formatVersion, coll->image->formatVersion, sizeof(UVersionInfo));
547 myData->jamoSpecial = coll->image->jamoSpecial;
548
549 /* copy the collator options */
550 uprv_memcpy(buffer+paddedsize(sizeof(UCATableHeader)), coll->options, sizeof(UColOptionSet));
551 } else {
552 *status = U_BUFFER_OVERFLOW_ERROR;
553 }
554 }
555 return length;
556 }
557
558 U_CAPI UCollator* U_EXPORT2
ucol_safeClone(const UCollator * coll,void *,int32_t * pBufferSize,UErrorCode * status)559 ucol_safeClone(const UCollator *coll, void * /*stackBuffer*/, int32_t * pBufferSize, UErrorCode *status)
560 {
561 UCollator * localCollator;
562 int32_t bufferSizeNeeded = (int32_t)sizeof(UCollator);
563 int32_t imageSize = 0;
564 int32_t rulesSize = 0;
565 int32_t rulesPadding = 0;
566 int32_t defaultReorderCodesSize = 0;
567 int32_t reorderCodesSize = 0;
568 uint8_t *image;
569 UChar *rules;
570 int32_t* defaultReorderCodes;
571 int32_t* reorderCodes;
572 uint8_t* leadBytePermutationTable;
573 UBool imageAllocated = FALSE;
574
575 if (status == NULL || U_FAILURE(*status)){
576 return NULL;
577 }
578 if (coll == NULL) {
579 *status = U_ILLEGAL_ARGUMENT_ERROR;
580 return NULL;
581 }
582
583 if (coll->rules && coll->freeRulesOnClose) {
584 rulesSize = (int32_t)(coll->rulesLength + 1)*sizeof(UChar);
585 rulesPadding = (int32_t)(bufferSizeNeeded % sizeof(UChar));
586 bufferSizeNeeded += rulesSize + rulesPadding;
587 }
588 // no padding for alignment needed from here since the next two are 4 byte quantities
589 if (coll->defaultReorderCodes) {
590 defaultReorderCodesSize = coll->defaultReorderCodesLength * sizeof(int32_t);
591 bufferSizeNeeded += defaultReorderCodesSize;
592 }
593 if (coll->reorderCodes) {
594 reorderCodesSize = coll->reorderCodesLength * sizeof(int32_t);
595 bufferSizeNeeded += reorderCodesSize;
596 }
597 if (coll->leadBytePermutationTable) {
598 bufferSizeNeeded += 256 * sizeof(uint8_t);
599 }
600
601 if (pBufferSize != NULL) {
602 int32_t inputSize = *pBufferSize;
603 *pBufferSize = 1;
604 if (inputSize == 0) {
605 return NULL; // preflighting for deprecated functionality
606 }
607 }
608
609 char *stackBufferChars = (char *)uprv_malloc(bufferSizeNeeded);
610 // Null pointer check.
611 if (stackBufferChars == NULL) {
612 *status = U_MEMORY_ALLOCATION_ERROR;
613 return NULL;
614 }
615 *status = U_SAFECLONE_ALLOCATED_WARNING;
616
617 localCollator = (UCollator *)stackBufferChars;
618 rules = (UChar *)(stackBufferChars + sizeof(UCollator) + rulesPadding);
619 defaultReorderCodes = (int32_t*)((uint8_t*)rules + rulesSize);
620 reorderCodes = (int32_t*)((uint8_t*)defaultReorderCodes + defaultReorderCodesSize);
621 leadBytePermutationTable = (uint8_t*)reorderCodes + reorderCodesSize;
622
623 {
624 UErrorCode tempStatus = U_ZERO_ERROR;
625 imageSize = ucol_cloneBinary(coll, NULL, 0, &tempStatus);
626 }
627 if (coll->freeImageOnClose) {
628 image = (uint8_t *)uprv_malloc(imageSize);
629 // Null pointer check
630 if (image == NULL) {
631 *status = U_MEMORY_ALLOCATION_ERROR;
632 return NULL;
633 }
634 ucol_cloneBinary(coll, image, imageSize, status);
635 imageAllocated = TRUE;
636 }
637 else {
638 image = (uint8_t *)coll->image;
639 }
640 localCollator = ucol_initFromBinary(image, imageSize, coll->UCA, localCollator, status);
641 if (U_FAILURE(*status)) {
642 return NULL;
643 }
644
645 if (coll->rules) {
646 if (coll->freeRulesOnClose) {
647 localCollator->rules = u_strcpy(rules, coll->rules);
648 //bufferEnd += rulesSize;
649 }
650 else {
651 localCollator->rules = coll->rules;
652 }
653 localCollator->freeRulesOnClose = FALSE;
654 localCollator->rulesLength = coll->rulesLength;
655 }
656
657 // collator reordering
658 if (coll->defaultReorderCodes) {
659 localCollator->defaultReorderCodes =
660 (int32_t*) uprv_memcpy(defaultReorderCodes, coll->defaultReorderCodes, coll->defaultReorderCodesLength * sizeof(int32_t));
661 localCollator->defaultReorderCodesLength = coll->defaultReorderCodesLength;
662 localCollator->freeDefaultReorderCodesOnClose = FALSE;
663 }
664 if (coll->reorderCodes) {
665 localCollator->reorderCodes =
666 (int32_t*)uprv_memcpy(reorderCodes, coll->reorderCodes, coll->reorderCodesLength * sizeof(int32_t));
667 localCollator->reorderCodesLength = coll->reorderCodesLength;
668 localCollator->freeReorderCodesOnClose = FALSE;
669 }
670 if (coll->leadBytePermutationTable) {
671 localCollator->leadBytePermutationTable =
672 (uint8_t*) uprv_memcpy(leadBytePermutationTable, coll->leadBytePermutationTable, 256);
673 localCollator->freeLeadBytePermutationTableOnClose = FALSE;
674 }
675
676 int32_t i;
677 for(i = 0; i < UCOL_ATTRIBUTE_COUNT; i++) {
678 ucol_setAttribute(localCollator, (UColAttribute)i, ucol_getAttribute(coll, (UColAttribute)i, status), status);
679 }
680 // zero copies of pointers
681 localCollator->actualLocale = NULL;
682 localCollator->validLocale = NULL;
683 localCollator->requestedLocale = NULL;
684 localCollator->ucaRules = coll->ucaRules; // There should only be one copy here.
685 localCollator->freeOnClose = TRUE;
686 localCollator->freeImageOnClose = imageAllocated;
687 return localCollator;
688 }
689
690 U_CAPI void U_EXPORT2
ucol_close(UCollator * coll)691 ucol_close(UCollator *coll)
692 {
693 UTRACE_ENTRY_OC(UTRACE_UCOL_CLOSE);
694 UTRACE_DATA1(UTRACE_INFO, "coll = %p", coll);
695 if(coll != NULL) {
696 // these are always owned by each UCollator struct,
697 // so we always free them
698 if(coll->validLocale != NULL) {
699 uprv_free(coll->validLocale);
700 }
701 if(coll->actualLocale != NULL) {
702 uprv_free(coll->actualLocale);
703 }
704 if(coll->requestedLocale != NULL) {
705 uprv_free(coll->requestedLocale);
706 }
707 if(coll->latinOneCEs != NULL) {
708 uprv_free(coll->latinOneCEs);
709 }
710 if(coll->options != NULL && coll->freeOptionsOnClose) {
711 uprv_free(coll->options);
712 }
713 if(coll->rules != NULL && coll->freeRulesOnClose) {
714 uprv_free((UChar *)coll->rules);
715 }
716 if(coll->image != NULL && coll->freeImageOnClose) {
717 uprv_free((UCATableHeader *)coll->image);
718 }
719
720 if(coll->leadBytePermutationTable != NULL && coll->freeLeadBytePermutationTableOnClose == TRUE) {
721 uprv_free(coll->leadBytePermutationTable);
722 }
723 if(coll->defaultReorderCodes != NULL && coll->freeDefaultReorderCodesOnClose == TRUE) {
724 uprv_free(coll->defaultReorderCodes);
725 }
726 if(coll->reorderCodes != NULL && coll->freeReorderCodesOnClose == TRUE) {
727 uprv_free(coll->reorderCodes);
728 }
729
730 if(coll->delegate != NULL) {
731 delete (Collator*)coll->delegate;
732 }
733
734 /* Here, it would be advisable to close: */
735 /* - UData for UCA (unless we stuff it in the root resb */
736 /* Again, do we need additional housekeeping... HMMM! */
737 UTRACE_DATA1(UTRACE_INFO, "coll->freeOnClose: %d", coll->freeOnClose);
738 if(coll->freeOnClose){
739 /* for safeClone, if freeOnClose is FALSE,
740 don't free the other instance data */
741 uprv_free(coll);
742 }
743 }
744 UTRACE_EXIT();
745 }
746
ucol_setOptionsFromHeader(UCollator * result,UColOptionSet * opts,UErrorCode * status)747 void ucol_setOptionsFromHeader(UCollator* result, UColOptionSet * opts, UErrorCode *status) {
748 if(U_FAILURE(*status)) {
749 return;
750 }
751 result->caseFirst = (UColAttributeValue)opts->caseFirst;
752 result->caseLevel = (UColAttributeValue)opts->caseLevel;
753 result->frenchCollation = (UColAttributeValue)opts->frenchCollation;
754 result->normalizationMode = (UColAttributeValue)opts->normalizationMode;
755 if(result->normalizationMode == UCOL_ON && !initializeFCD(status)) {
756 return;
757 }
758 result->strength = (UColAttributeValue)opts->strength;
759 result->variableTopValue = opts->variableTopValue;
760 result->alternateHandling = (UColAttributeValue)opts->alternateHandling;
761 result->hiraganaQ = (UColAttributeValue)opts->hiraganaQ;
762 result->numericCollation = (UColAttributeValue)opts->numericCollation;
763 result->caseFirstisDefault = TRUE;
764 result->caseLevelisDefault = TRUE;
765 result->frenchCollationisDefault = TRUE;
766 result->normalizationModeisDefault = TRUE;
767 result->strengthisDefault = TRUE;
768 result->variableTopValueisDefault = TRUE;
769 result->alternateHandlingisDefault = TRUE;
770 result->hiraganaQisDefault = TRUE;
771 result->numericCollationisDefault = TRUE;
772
773 ucol_updateInternalState(result, status);
774
775 result->options = opts;
776 }
777
778
779 /**
780 * Approximate determination if a character is at a contraction end.
781 * Guaranteed to be TRUE if a character is at the end of a contraction,
782 * otherwise it is not deterministic.
783 * @param c character to be determined
784 * @param coll collator
785 */
786 static
ucol_contractionEndCP(UChar c,const UCollator * coll)787 inline UBool ucol_contractionEndCP(UChar c, const UCollator *coll) {
788 if (c < coll->minContrEndCP) {
789 return FALSE;
790 }
791
792 int32_t hash = c;
793 uint8_t htbyte;
794 if (hash >= UCOL_UNSAFECP_TABLE_SIZE*8) {
795 if (U16_IS_TRAIL(c)) {
796 return TRUE;
797 }
798 hash = (hash & UCOL_UNSAFECP_TABLE_MASK) + 256;
799 }
800 htbyte = coll->contrEndCP[hash>>3];
801 return (((htbyte >> (hash & 7)) & 1) == 1);
802 }
803
804
805
806 /*
807 * i_getCombiningClass()
808 * A fast, at least partly inline version of u_getCombiningClass()
809 * This is a candidate for further optimization. Used heavily
810 * in contraction processing.
811 */
812 static
i_getCombiningClass(UChar32 c,const UCollator * coll)813 inline uint8_t i_getCombiningClass(UChar32 c, const UCollator *coll) {
814 uint8_t sCC = 0;
815 if ((c >= 0x300 && ucol_unsafeCP(c, coll)) || c > 0xFFFF) {
816 sCC = u_getCombiningClass(c);
817 }
818 return sCC;
819 }
820
ucol_initCollator(const UCATableHeader * image,UCollator * fillIn,const UCollator * UCA,UErrorCode * status)821 UCollator* ucol_initCollator(const UCATableHeader *image, UCollator *fillIn, const UCollator *UCA, UErrorCode *status) {
822 UChar c;
823 UCollator *result = fillIn;
824 if(U_FAILURE(*status) || image == NULL) {
825 return NULL;
826 }
827
828 if(result == NULL) {
829 result = (UCollator *)uprv_malloc(sizeof(UCollator));
830 if(result == NULL) {
831 *status = U_MEMORY_ALLOCATION_ERROR;
832 return result;
833 }
834 result->freeOnClose = TRUE;
835 } else {
836 result->freeOnClose = FALSE;
837 }
838
839 result->delegate = NULL;
840
841 result->image = image;
842 result->mapping.getFoldingOffset = _getFoldingOffset;
843 const uint8_t *mapping = (uint8_t*)result->image+result->image->mappingPosition;
844 utrie_unserialize(&result->mapping, mapping, result->image->endExpansionCE - result->image->mappingPosition, status);
845 if(U_FAILURE(*status)) {
846 if(result->freeOnClose == TRUE) {
847 uprv_free(result);
848 result = NULL;
849 }
850 return result;
851 }
852
853 result->latinOneMapping = UTRIE_GET32_LATIN1(&result->mapping);
854 result->contractionCEs = (uint32_t*)((uint8_t*)result->image+result->image->contractionCEs);
855 result->contractionIndex = (UChar*)((uint8_t*)result->image+result->image->contractionIndex);
856 result->expansion = (uint32_t*)((uint8_t*)result->image+result->image->expansion);
857 result->rules = NULL;
858 result->rulesLength = 0;
859 result->freeRulesOnClose = FALSE;
860 result->defaultReorderCodes = NULL;
861 result->defaultReorderCodesLength = 0;
862 result->freeDefaultReorderCodesOnClose = FALSE;
863 result->reorderCodes = NULL;
864 result->reorderCodesLength = 0;
865 result->freeReorderCodesOnClose = FALSE;
866 result->leadBytePermutationTable = NULL;
867 result->freeLeadBytePermutationTableOnClose = FALSE;
868
869 /* get the version info from UCATableHeader and populate the Collator struct*/
870 result->dataVersion[0] = result->image->version[0]; /* UCA Builder version*/
871 result->dataVersion[1] = result->image->version[1]; /* UCA Tailoring rules version*/
872 result->dataVersion[2] = 0;
873 result->dataVersion[3] = 0;
874
875 result->unsafeCP = (uint8_t *)result->image + result->image->unsafeCP;
876 result->minUnsafeCP = 0;
877 for (c=0; c<0x300; c++) { // Find the smallest unsafe char.
878 if (ucol_unsafeCP(c, result)) break;
879 }
880 result->minUnsafeCP = c;
881
882 result->contrEndCP = (uint8_t *)result->image + result->image->contrEndCP;
883 result->minContrEndCP = 0;
884 for (c=0; c<0x300; c++) { // Find the Contraction-ending char.
885 if (ucol_contractionEndCP(c, result)) break;
886 }
887 result->minContrEndCP = c;
888
889 /* max expansion tables */
890 result->endExpansionCE = (uint32_t*)((uint8_t*)result->image +
891 result->image->endExpansionCE);
892 result->lastEndExpansionCE = result->endExpansionCE +
893 result->image->endExpansionCECount - 1;
894 result->expansionCESize = (uint8_t*)result->image +
895 result->image->expansionCESize;
896
897
898 //result->errorCode = *status;
899
900 result->latinOneCEs = NULL;
901
902 result->latinOneRegenTable = FALSE;
903 result->latinOneFailed = FALSE;
904 result->UCA = UCA;
905
906 /* Normally these will be set correctly later. This is the default if you use UCA or the default. */
907 result->ucaRules = NULL;
908 result->actualLocale = NULL;
909 result->validLocale = NULL;
910 result->requestedLocale = NULL;
911 result->hasRealData = FALSE; // real data lives in .dat file...
912 result->freeImageOnClose = FALSE;
913
914 /* set attributes */
915 ucol_setOptionsFromHeader(
916 result,
917 (UColOptionSet*)((uint8_t*)result->image+result->image->options),
918 status);
919 result->freeOptionsOnClose = FALSE;
920
921 return result;
922 }
923
924 /* new Mark's code */
925
926 /**
927 * For generation of Implicit CEs
928 * @author Davis
929 *
930 * Cleaned up so that changes can be made more easily.
931 * Old values:
932 # First Implicit: E26A792D
933 # Last Implicit: E3DC70C0
934 # First CJK: E0030300
935 # Last CJK: E0A9DD00
936 # First CJK_A: E0A9DF00
937 # Last CJK_A: E0DE3100
938 */
939 /* Following is a port of Mark's code for new treatment of implicits.
940 * It is positioned here, since ucol_initUCA need to initialize the
941 * variables below according to the data in the fractional UCA.
942 */
943
944 /**
945 * Function used to:
946 * a) collapse the 2 different Han ranges from UCA into one (in the right order), and
947 * b) bump any non-CJK characters by 10FFFF.
948 * The relevant blocks are:
949 * A: 4E00..9FFF; CJK Unified Ideographs
950 * F900..FAFF; CJK Compatibility Ideographs
951 * B: 3400..4DBF; CJK Unified Ideographs Extension A
952 * 20000..XX; CJK Unified Ideographs Extension B (and others later on)
953 * As long as
954 * no new B characters are allocated between 4E00 and FAFF, and
955 * no new A characters are outside of this range,
956 * (very high probability) this simple code will work.
957 * The reordered blocks are:
958 * Block1 is CJK
959 * Block2 is CJK_COMPAT_USED
960 * Block3 is CJK_A
961 * (all contiguous)
962 * Any other CJK gets its normal code point
963 * Any non-CJK gets +10FFFF
964 * When we reorder Block1, we make sure that it is at the very start,
965 * so that it will use a 3-byte form.
966 * Warning: we only pick up the compatibility characters that are
967 * NOT decomposed, so that block is smaller!
968 */
969
970 // CONSTANTS
971 static const UChar32
972 NON_CJK_OFFSET = 0x110000,
973 UCOL_MAX_INPUT = 0x220001; // 2 * Unicode range + 2
974
/**
 * Layout parameters for implicit weights.
 * Every value starts out as zero and is precomputed by initImplicitConstants().
 */
static int32_t final3Multiplier = 0;   // spacing between used final bytes, 3-byte form
static int32_t final4Multiplier = 0;   // spacing between used final bytes, 4-byte form
static int32_t final3Count = 0;        // number of used final bytes, 3-byte form
static int32_t final4Count = 0;        // number of used final bytes, 4-byte form
static int32_t medialCount = 0;        // number of values available to a middle byte
static int32_t min3Primary = 0;        // first lead byte of the 3-byte form
static int32_t min4Primary = 0;        // first lead byte of the 4-byte form
static int32_t max4Primary = 0;        // last lead byte of the 4-byte form
static int32_t minTrail = 0;           // smallest trail byte
static int32_t maxTrail = 0;           // largest trail byte
static int32_t max3Trail = 0;          // largest final byte actually used, 3-byte form
static int32_t max4Trail = 0;          // largest final byte actually used, 4-byte form
static int32_t min4Boundary = 0;       // first raw value that needs the 4-byte form
992
993 static const UChar32
994 // 4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
995 // 9FCC;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;; (Unicode 6.1)
996 CJK_BASE = 0x4E00,
997 CJK_LIMIT = 0x9FCC+1,
998 // Unified CJK ideographs in the compatibility ideographs block.
999 CJK_COMPAT_USED_BASE = 0xFA0E,
1000 CJK_COMPAT_USED_LIMIT = 0xFA2F+1,
1001 // 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
1002 // 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
1003 CJK_A_BASE = 0x3400,
1004 CJK_A_LIMIT = 0x4DB5+1,
1005 // 20000;<CJK Ideograph Extension B, First>;Lo;0;L;;;;;N;;;;;
1006 // 2A6D6;<CJK Ideograph Extension B, Last>;Lo;0;L;;;;;N;;;;;
1007 CJK_B_BASE = 0x20000,
1008 CJK_B_LIMIT = 0x2A6D6+1,
1009 // 2A700;<CJK Ideograph Extension C, First>;Lo;0;L;;;;;N;;;;;
1010 // 2B734;<CJK Ideograph Extension C, Last>;Lo;0;L;;;;;N;;;;;
1011 CJK_C_BASE = 0x2A700,
1012 CJK_C_LIMIT = 0x2B734+1,
1013 // 2B740;<CJK Ideograph Extension D, First>;Lo;0;L;;;;;N;;;;;
1014 // 2B81D;<CJK Ideograph Extension D, Last>;Lo;0;L;;;;;N;;;;;
1015 CJK_D_BASE = 0x2B740,
1016 CJK_D_LIMIT = 0x2B81D+1;
1017 // when adding to this list, look for all occurrences (in project)
1018 // of CJK_C_BASE and CJK_C_LIMIT, etc. to check for code that needs changing!!!!
1019
swapCJK(UChar32 i)1020 static UChar32 swapCJK(UChar32 i) {
1021 if (i < CJK_A_BASE) {
1022 // non-CJK
1023 } else if (i < CJK_A_LIMIT) {
1024 // Extension A has lower code points than the original Unihan+compat
1025 // but sorts higher.
1026 return i - CJK_A_BASE
1027 + (CJK_LIMIT - CJK_BASE)
1028 + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE);
1029 } else if (i < CJK_BASE) {
1030 // non-CJK
1031 } else if (i < CJK_LIMIT) {
1032 return i - CJK_BASE;
1033 } else if (i < CJK_COMPAT_USED_BASE) {
1034 // non-CJK
1035 } else if (i < CJK_COMPAT_USED_LIMIT) {
1036 return i - CJK_COMPAT_USED_BASE
1037 + (CJK_LIMIT - CJK_BASE);
1038 } else if (i < CJK_B_BASE) {
1039 // non-CJK
1040 } else if (i < CJK_B_LIMIT) {
1041 return i; // non-BMP-CJK
1042 } else if (i < CJK_C_BASE) {
1043 // non-CJK
1044 } else if (i < CJK_C_LIMIT) {
1045 return i; // non-BMP-CJK
1046 } else if (i < CJK_D_BASE) {
1047 // non-CJK
1048 } else if (i < CJK_D_LIMIT) {
1049 return i; // non-BMP-CJK
1050 }
1051 return i + NON_CJK_OFFSET; // non-CJK
1052 }
1053
1054 U_CAPI UChar32 U_EXPORT2
uprv_uca_getRawFromCodePoint(UChar32 i)1055 uprv_uca_getRawFromCodePoint(UChar32 i) {
1056 return swapCJK(i)+1;
1057 }
1058
1059 U_CAPI UChar32 U_EXPORT2
uprv_uca_getCodePointFromRaw(UChar32 i)1060 uprv_uca_getCodePointFromRaw(UChar32 i) {
1061 i--;
1062 UChar32 result = 0;
1063 if(i >= NON_CJK_OFFSET) {
1064 result = i - NON_CJK_OFFSET;
1065 } else if(i >= CJK_B_BASE) {
1066 result = i;
1067 } else if(i < CJK_A_LIMIT + (CJK_LIMIT - CJK_BASE) + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE)) { // rest of CJKs, compacted
1068 if(i < CJK_LIMIT - CJK_BASE) {
1069 result = i + CJK_BASE;
1070 } else if(i < (CJK_LIMIT - CJK_BASE) + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE)) {
1071 result = i + CJK_COMPAT_USED_BASE - (CJK_LIMIT - CJK_BASE);
1072 } else {
1073 result = i + CJK_A_BASE - (CJK_LIMIT - CJK_BASE) - (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE);
1074 }
1075 } else {
1076 result = -1;
1077 }
1078 return result;
1079 }
1080
1081 // GET IMPLICIT PRIMARY WEIGHTS
1082 // Return value is left justified primary key
1083 U_CAPI uint32_t U_EXPORT2
uprv_uca_getImplicitFromRaw(UChar32 cp)1084 uprv_uca_getImplicitFromRaw(UChar32 cp) {
1085 /*
1086 if (cp < 0 || cp > UCOL_MAX_INPUT) {
1087 throw new IllegalArgumentException("Code point out of range " + Utility.hex(cp));
1088 }
1089 */
1090 int32_t last0 = cp - min4Boundary;
1091 if (last0 < 0) {
1092 int32_t last1 = cp / final3Count;
1093 last0 = cp % final3Count;
1094
1095 int32_t last2 = last1 / medialCount;
1096 last1 %= medialCount;
1097
1098 last0 = minTrail + last0*final3Multiplier; // spread out, leaving gap at start
1099 last1 = minTrail + last1; // offset
1100 last2 = min3Primary + last2; // offset
1101 /*
1102 if (last2 >= min4Primary) {
1103 throw new IllegalArgumentException("4-byte out of range: " + Utility.hex(cp) + ", " + Utility.hex(last2));
1104 }
1105 */
1106 return (last2 << 24) + (last1 << 16) + (last0 << 8);
1107 } else {
1108 int32_t last1 = last0 / final4Count;
1109 last0 %= final4Count;
1110
1111 int32_t last2 = last1 / medialCount;
1112 last1 %= medialCount;
1113
1114 int32_t last3 = last2 / medialCount;
1115 last2 %= medialCount;
1116
1117 last0 = minTrail + last0*final4Multiplier; // spread out, leaving gap at start
1118 last1 = minTrail + last1; // offset
1119 last2 = minTrail + last2; // offset
1120 last3 = min4Primary + last3; // offset
1121 /*
1122 if (last3 > max4Primary) {
1123 throw new IllegalArgumentException("4-byte out of range: " + Utility.hex(cp) + ", " + Utility.hex(last3));
1124 }
1125 */
1126 return (last3 << 24) + (last2 << 16) + (last1 << 8) + last0;
1127 }
1128 }
1129
1130 static uint32_t U_EXPORT2
uprv_uca_getImplicitPrimary(UChar32 cp)1131 uprv_uca_getImplicitPrimary(UChar32 cp) {
1132 //fprintf(stdout, "Incoming: %04x\n", cp);
1133 //if (DEBUG) System.out.println("Incoming: " + Utility.hex(cp));
1134
1135 cp = swapCJK(cp);
1136 cp++;
1137 // we now have a range of numbers from 0 to 21FFFF.
1138
1139 //if (DEBUG) System.out.println("CJK swapped: " + Utility.hex(cp));
1140 //fprintf(stdout, "CJK swapped: %04x\n", cp);
1141
1142 return uprv_uca_getImplicitFromRaw(cp);
1143 }
1144
1145 /**
1146 * Converts implicit CE into raw integer ("code point")
1147 * @param implicit
1148 * @return -1 if illegal format
1149 */
1150 U_CAPI UChar32 U_EXPORT2
uprv_uca_getRawFromImplicit(uint32_t implicit)1151 uprv_uca_getRawFromImplicit(uint32_t implicit) {
1152 UChar32 result;
1153 UChar32 b3 = implicit & 0xFF;
1154 UChar32 b2 = (implicit >> 8) & 0xFF;
1155 UChar32 b1 = (implicit >> 16) & 0xFF;
1156 UChar32 b0 = (implicit >> 24) & 0xFF;
1157
1158 // simple parameter checks
1159 if (b0 < min3Primary || b0 > max4Primary
1160 || b1 < minTrail || b1 > maxTrail)
1161 return -1;
1162 // normal offsets
1163 b1 -= minTrail;
1164
1165 // take care of the final values, and compose
1166 if (b0 < min4Primary) {
1167 if (b2 < minTrail || b2 > max3Trail || b3 != 0)
1168 return -1;
1169 b2 -= minTrail;
1170 UChar32 remainder = b2 % final3Multiplier;
1171 if (remainder != 0)
1172 return -1;
1173 b0 -= min3Primary;
1174 b2 /= final3Multiplier;
1175 result = ((b0 * medialCount) + b1) * final3Count + b2;
1176 } else {
1177 if (b2 < minTrail || b2 > maxTrail
1178 || b3 < minTrail || b3 > max4Trail)
1179 return -1;
1180 b2 -= minTrail;
1181 b3 -= minTrail;
1182 UChar32 remainder = b3 % final4Multiplier;
1183 if (remainder != 0)
1184 return -1;
1185 b3 /= final4Multiplier;
1186 b0 -= min4Primary;
1187 result = (((b0 * medialCount) + b1) * medialCount + b2) * final4Count + b3 + min4Boundary;
1188 }
1189 // final check
1190 if (result < 0 || result > UCOL_MAX_INPUT)
1191 return -1;
1192 return result;
1193 }
1194
1195
/**
 * Computes 1 + (a-1)/b using truncating integer division.
 * For positive a and b this is a/b rounded up.
 */
static inline int32_t divideAndRoundUp(int a, int b) {
    const int wholeBelow = (a - 1) / b;
    return wholeBelow + 1;
}
1199
1200 /* this function is either called from initUCA or from genUCA before
1201 * doing canonical closure for the UCA.
1202 */
1203
1204 /**
1205 * Set up to generate implicits.
1206 * Maintenance Note: this function may end up being called more than once, due
1207 * to threading races during initialization. Make sure that
1208 * none of the Constants is ever transiently assigned an
1209 * incorrect value.
1210 * @param minPrimary
1211 * @param maxPrimary
1212 * @param minTrail final byte
1213 * @param maxTrail final byte
1214 * @param gap3 the gap we leave for tailoring for 3-byte forms
1215 * @param gap4 the gap we leave for tailoring for 4-byte forms
1216 */
initImplicitConstants(int minPrimary,int maxPrimary,int minTrailIn,int maxTrailIn,int gap3,int primaries3count,UErrorCode * status)1217 static void initImplicitConstants(int minPrimary, int maxPrimary,
1218 int minTrailIn, int maxTrailIn,
1219 int gap3, int primaries3count,
1220 UErrorCode *status) {
1221 // some simple parameter checks
1222 if ((minPrimary < 0 || minPrimary >= maxPrimary || maxPrimary > 0xFF)
1223 || (minTrailIn < 0 || minTrailIn >= maxTrailIn || maxTrailIn > 0xFF)
1224 || (primaries3count < 1))
1225 {
1226 *status = U_ILLEGAL_ARGUMENT_ERROR;
1227 return;
1228 };
1229
1230 minTrail = minTrailIn;
1231 maxTrail = maxTrailIn;
1232
1233 min3Primary = minPrimary;
1234 max4Primary = maxPrimary;
1235 // compute constants for use later.
1236 // number of values we can use in trailing bytes
1237 // leave room for empty values between AND above, e.g. if gap = 2
1238 // range 3..7 => +3 -4 -5 -6 -7: so 1 value
1239 // range 3..8 => +3 -4 -5 +6 -7 -8: so 2 values
1240 // range 3..9 => +3 -4 -5 +6 -7 -8 -9: so 2 values
1241 final3Multiplier = gap3 + 1;
1242 final3Count = (maxTrail - minTrail + 1) / final3Multiplier;
1243 max3Trail = minTrail + (final3Count - 1) * final3Multiplier;
1244
1245 // medials can use full range
1246 medialCount = (maxTrail - minTrail + 1);
1247 // find out how many values fit in each form
1248 int32_t threeByteCount = medialCount * final3Count;
1249 // now determine where the 3/4 boundary is.
1250 // we use 3 bytes below the boundary, and 4 above
1251 int32_t primariesAvailable = maxPrimary - minPrimary + 1;
1252 int32_t primaries4count = primariesAvailable - primaries3count;
1253
1254
1255 int32_t min3ByteCoverage = primaries3count * threeByteCount;
1256 min4Primary = minPrimary + primaries3count;
1257 min4Boundary = min3ByteCoverage;
1258 // Now expand out the multiplier for the 4 bytes, and redo.
1259
1260 int32_t totalNeeded = UCOL_MAX_INPUT - min4Boundary;
1261 int32_t neededPerPrimaryByte = divideAndRoundUp(totalNeeded, primaries4count);
1262 int32_t neededPerFinalByte = divideAndRoundUp(neededPerPrimaryByte, medialCount * medialCount);
1263 int32_t gap4 = (maxTrail - minTrail - 1) / neededPerFinalByte;
1264 if (gap4 < 1) {
1265 *status = U_ILLEGAL_ARGUMENT_ERROR;
1266 return;
1267 }
1268 final4Multiplier = gap4 + 1;
1269 final4Count = neededPerFinalByte;
1270 max4Trail = minTrail + (final4Count - 1) * final4Multiplier;
1271 }
1272
1273 /**
1274 * Supply parameters for generating implicit CEs
1275 */
1276 U_CAPI void U_EXPORT2
uprv_uca_initImplicitConstants(UErrorCode * status)1277 uprv_uca_initImplicitConstants(UErrorCode *status) {
1278 // 13 is the largest 4-byte gap we can use without getting 2 four-byte forms.
1279 //initImplicitConstants(minPrimary, maxPrimary, 0x04, 0xFE, 1, 1, status);
1280 initImplicitConstants(minImplicitPrimary, maxImplicitPrimary, 0x04, 0xFE, 1, 1, status);
1281 }
1282
1283
1284 /* collIterNormalize Incremental Normalization happens here. */
1285 /* pick up the range of chars identifed by FCD, */
1286 /* normalize it into the collIterate's writable buffer, */
1287 /* switch the collIterate's state to use the writable buffer. */
1288 /* */
1289 static
collIterNormalize(collIterate * collationSource)1290 void collIterNormalize(collIterate *collationSource)
1291 {
1292 UErrorCode status = U_ZERO_ERROR;
1293 const UChar *srcP = collationSource->pos - 1; /* Start of chars to normalize */
1294 const UChar *endP = collationSource->fcdPosition; /* End of region to normalize+1 */
1295
1296 collationSource->nfd->normalize(UnicodeString(FALSE, srcP, (int32_t)(endP - srcP)),
1297 collationSource->writableBuffer,
1298 status);
1299 if (U_FAILURE(status)) {
1300 #ifdef UCOL_DEBUG
1301 fprintf(stderr, "collIterNormalize(), NFD failed, status = %s\n", u_errorName(status));
1302 #endif
1303 return;
1304 }
1305
1306 collationSource->pos = collationSource->writableBuffer.getTerminatedBuffer();
1307 collationSource->origFlags = collationSource->flags;
1308 collationSource->flags |= UCOL_ITER_INNORMBUF;
1309 collationSource->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR);
1310 }
1311
1312
1313 // This function takes the iterator and extracts normalized stuff up to the next boundary
1314 // It is similar in the end results to the collIterNormalize, but for the cases when we
1315 // use an iterator
1316 /*static
1317 inline void normalizeIterator(collIterate *collationSource) {
1318 UErrorCode status = U_ZERO_ERROR;
1319 UBool wasNormalized = FALSE;
1320 //int32_t iterIndex = collationSource->iterator->getIndex(collationSource->iterator, UITER_CURRENT);
1321 uint32_t iterIndex = collationSource->iterator->getState(collationSource->iterator);
1322 int32_t normLen = unorm_next(collationSource->iterator, collationSource->writableBuffer,
1323 (int32_t)collationSource->writableBufSize, UNORM_FCD, 0, TRUE, &wasNormalized, &status);
1324 if(status == U_BUFFER_OVERFLOW_ERROR || normLen == (int32_t)collationSource->writableBufSize) {
1325 // reallocate and terminate
1326 if(!u_growBufferFromStatic(collationSource->stackWritableBuffer,
1327 &collationSource->writableBuffer,
1328 (int32_t *)&collationSource->writableBufSize, normLen + 1,
1329 0)
1330 ) {
1331 #ifdef UCOL_DEBUG
1332 fprintf(stderr, "normalizeIterator(), out of memory\n");
1333 #endif
1334 return;
1335 }
1336 status = U_ZERO_ERROR;
1337 //collationSource->iterator->move(collationSource->iterator, iterIndex, UITER_ZERO);
1338 collationSource->iterator->setState(collationSource->iterator, iterIndex, &status);
1339 normLen = unorm_next(collationSource->iterator, collationSource->writableBuffer,
1340 (int32_t)collationSource->writableBufSize, UNORM_FCD, 0, TRUE, &wasNormalized, &status);
1341 }
1342 // Terminate the buffer - we already checked that it is big enough
1343 collationSource->writableBuffer[normLen] = 0;
1344 if(collationSource->writableBuffer != collationSource->stackWritableBuffer) {
1345 collationSource->flags |= UCOL_ITER_ALLOCATED;
1346 }
1347 collationSource->pos = collationSource->writableBuffer;
1348 collationSource->origFlags = collationSource->flags;
1349 collationSource->flags |= UCOL_ITER_INNORMBUF;
1350 collationSource->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR);
1351 }*/
1352
1353
1354 /* Incremental FCD check and normalize */
1355 /* Called from getNextCE when normalization state is suspect. */
1356 /* When entering, the state is known to be this: */
1357 /* o We are working in the main buffer of the collIterate, not the side */
1358 /* writable buffer. When in the side buffer, normalization mode is always off, */
1359 /* so we won't get here. */
1360 /* o The leading combining class from the current character is 0 or */
1361 /* the trailing combining class of the previous char was zero. */
1362 /* True because the previous call to this function will have always exited */
1363 /* that way, and we get called for every char where cc might be non-zero. */
1364 static
collIterFCD(collIterate * collationSource)1365 inline UBool collIterFCD(collIterate *collationSource) {
1366 const UChar *srcP, *endP;
1367 uint8_t leadingCC;
1368 uint8_t prevTrailingCC = 0;
1369 uint16_t fcd;
1370 UBool needNormalize = FALSE;
1371
1372 srcP = collationSource->pos-1;
1373
1374 if (collationSource->flags & UCOL_ITER_HASLEN) {
1375 endP = collationSource->endp;
1376 } else {
1377 endP = NULL;
1378 }
1379
1380 // Get the trailing combining class of the current character. If it's zero, we are OK.
1381 fcd = g_nfcImpl->nextFCD16(srcP, endP);
1382 if (fcd != 0) {
1383 prevTrailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);
1384
1385 if (prevTrailingCC != 0) {
1386 // The current char has a non-zero trailing CC. Scan forward until we find
1387 // a char with a leading cc of zero.
1388 while (endP == NULL || srcP != endP)
1389 {
1390 const UChar *savedSrcP = srcP;
1391
1392 fcd = g_nfcImpl->nextFCD16(srcP, endP);
1393 leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_);
1394 if (leadingCC == 0) {
1395 srcP = savedSrcP; // Hit char that is not part of combining sequence.
1396 // back up over it. (Could be surrogate pair!)
1397 break;
1398 }
1399
1400 if (leadingCC < prevTrailingCC) {
1401 needNormalize = TRUE;
1402 }
1403
1404 prevTrailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);
1405 }
1406 }
1407 }
1408
1409 collationSource->fcdPosition = (UChar *)srcP;
1410
1411 return needNormalize;
1412 }
1413
1414 /****************************************************************************/
1415 /* Following are the CE retrieval functions */
1416 /* */
1417 /****************************************************************************/
1418
1419 static uint32_t getImplicit(UChar32 cp, collIterate *collationSource);
1420 static uint32_t getPrevImplicit(UChar32 cp, collIterate *collationSource);
1421
1422 /* there should be a macro version of this function in the header file */
1423 /* This is the first function that tries to fetch a collation element */
1424 /* If it's not succesfull or it encounters a more difficult situation */
1425 /* some more sofisticated and slower functions are invoked */
1426 static
ucol_IGetNextCE(const UCollator * coll,collIterate * collationSource,UErrorCode * status)1427 inline uint32_t ucol_IGetNextCE(const UCollator *coll, collIterate *collationSource, UErrorCode *status) {
1428 uint32_t order = 0;
1429 if (collationSource->CEpos > collationSource->toReturn) { /* Are there any CEs from previous expansions? */
1430 order = *(collationSource->toReturn++); /* if so, return them */
1431 if(collationSource->CEpos == collationSource->toReturn) {
1432 collationSource->CEpos = collationSource->toReturn = collationSource->extendCEs ? collationSource->extendCEs : collationSource->CEs;
1433 }
1434 return order;
1435 }
1436
1437 UChar ch = 0;
1438 collationSource->offsetReturn = NULL;
1439
1440 do {
1441 for (;;) /* Loop handles case when incremental normalize switches */
1442 { /* to or from the side buffer / original string, and we */
1443 /* need to start again to get the next character. */
1444
1445 if ((collationSource->flags & (UCOL_ITER_HASLEN | UCOL_ITER_INNORMBUF | UCOL_ITER_NORM | UCOL_HIRAGANA_Q | UCOL_USE_ITERATOR)) == 0)
1446 {
1447 // The source string is null terminated and we're not working from the side buffer,
1448 // and we're not normalizing. This is the fast path.
1449 // (We can be in the side buffer for Thai pre-vowel reordering even when not normalizing.)
1450 ch = *collationSource->pos++;
1451 if (ch != 0) {
1452 break;
1453 }
1454 else {
1455 return UCOL_NO_MORE_CES;
1456 }
1457 }
1458
1459 if (collationSource->flags & UCOL_ITER_HASLEN) {
1460 // Normal path for strings when length is specified.
1461 // (We can't be in side buffer because it is always null terminated.)
1462 if (collationSource->pos >= collationSource->endp) {
1463 // Ran off of the end of the main source string. We're done.
1464 return UCOL_NO_MORE_CES;
1465 }
1466 ch = *collationSource->pos++;
1467 }
1468 else if(collationSource->flags & UCOL_USE_ITERATOR) {
1469 UChar32 iterCh = collationSource->iterator->next(collationSource->iterator);
1470 if(iterCh == U_SENTINEL) {
1471 return UCOL_NO_MORE_CES;
1472 }
1473 ch = (UChar)iterCh;
1474 }
1475 else
1476 {
1477 // Null terminated string.
1478 ch = *collationSource->pos++;
1479 if (ch == 0) {
1480 // Ran off end of buffer.
1481 if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) {
1482 // Ran off end of main string. backing up one character.
1483 collationSource->pos--;
1484 return UCOL_NO_MORE_CES;
1485 }
1486 else
1487 {
1488 // Hit null in the normalize side buffer.
1489 // Usually this means the end of the normalized data,
1490 // except for one odd case: a null followed by combining chars,
1491 // which is the case if we are at the start of the buffer.
1492 if (collationSource->pos == collationSource->writableBuffer.getBuffer()+1) {
1493 break;
1494 }
1495
1496 // Null marked end of side buffer.
1497 // Revert to the main string and
1498 // loop back to top to try again to get a character.
1499 collationSource->pos = collationSource->fcdPosition;
1500 collationSource->flags = collationSource->origFlags;
1501 continue;
1502 }
1503 }
1504 }
1505
1506 if(collationSource->flags&UCOL_HIRAGANA_Q) {
1507 /* Codepoints \u3099-\u309C are both Hiragana and Katakana. Set the flag
1508 * based on whether the previous codepoint was Hiragana or Katakana.
1509 */
1510 if(((ch>=0x3040 && ch<=0x3096) || (ch >= 0x309d && ch <= 0x309f)) ||
1511 ((collationSource->flags & UCOL_WAS_HIRAGANA) && (ch >= 0x3099 && ch <= 0x309C))) {
1512 collationSource->flags |= UCOL_WAS_HIRAGANA;
1513 } else {
1514 collationSource->flags &= ~UCOL_WAS_HIRAGANA;
1515 }
1516 }
1517
1518 // We've got a character. See if there's any fcd and/or normalization stuff to do.
1519 // Note that UCOL_ITER_NORM flag is always zero when we are in the side buffer.
1520 if ((collationSource->flags & UCOL_ITER_NORM) == 0) {
1521 break;
1522 }
1523
1524 if (collationSource->fcdPosition >= collationSource->pos) {
1525 // An earlier FCD check has already covered the current character.
1526 // We can go ahead and process this char.
1527 break;
1528 }
1529
1530 if (ch < ZERO_CC_LIMIT_ ) {
1531 // Fast fcd safe path. Trailing combining class == 0. This char is OK.
1532 break;
1533 }
1534
1535 if (ch < NFC_ZERO_CC_BLOCK_LIMIT_) {
1536 // We need to peek at the next character in order to tell if we are FCD
1537 if ((collationSource->flags & UCOL_ITER_HASLEN) && collationSource->pos >= collationSource->endp) {
1538 // We are at the last char of source string.
1539 // It is always OK for FCD check.
1540 break;
1541 }
1542
1543 // Not at last char of source string (or we'll check against terminating null). Do the FCD fast test
1544 if (*collationSource->pos < NFC_ZERO_CC_BLOCK_LIMIT_) {
1545 break;
1546 }
1547 }
1548
1549
1550 // Need a more complete FCD check and possible normalization.
1551 if (collIterFCD(collationSource)) {
1552 collIterNormalize(collationSource);
1553 }
1554 if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) {
1555 // No normalization was needed. Go ahead and process the char we already had.
1556 break;
1557 }
1558
1559 // Some normalization happened. Next loop iteration will pick up a char
1560 // from the normalization buffer.
1561
1562 } // end for (;;)
1563
1564
1565 if (ch <= 0xFF) {
1566 /* For latin-1 characters we never need to fall back to the UCA table */
1567 /* because all of the UCA data is replicated in the latinOneMapping array */
1568 order = coll->latinOneMapping[ch];
1569 if (order > UCOL_NOT_FOUND) {
1570 order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, status);
1571 }
1572 }
1573 else
1574 {
1575 // Always use UCA for Han, Hangul
1576 // (Han extension A is before main Han block)
1577 // **** Han compatibility chars ?? ****
1578 if ((collationSource->flags & UCOL_FORCE_HAN_IMPLICIT) != 0 &&
1579 (ch >= UCOL_FIRST_HAN_A && ch <= UCOL_LAST_HANGUL)) {
1580 if (ch > UCOL_LAST_HAN && ch < UCOL_FIRST_HANGUL) {
1581 // between the two target ranges; do normal lookup
1582 // **** this range is YI, Modifier tone letters, ****
1583 // **** Latin-D, Syloti Nagari, Phagas-pa. ****
1584 // **** Latin-D might be tailored, so we need to ****
1585 // **** do the normal lookup for these guys. ****
1586 order = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
1587 } else {
1588 // in one of the target ranges; use UCA
1589 order = UCOL_NOT_FOUND;
1590 }
1591 } else {
1592 order = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
1593 }
1594
1595 if(order > UCOL_NOT_FOUND) { /* if a CE is special */
1596 order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, status); /* and try to get the special CE */
1597 }
1598
1599 if(order == UCOL_NOT_FOUND && coll->UCA) { /* We couldn't find a good CE in the tailoring */
1600 /* if we got here, the codepoint MUST be over 0xFF - so we look directly in the trie */
1601 order = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);
1602
1603 if(order > UCOL_NOT_FOUND) { /* UCA also gives us a special CE */
1604 order = ucol_prv_getSpecialCE(coll->UCA, ch, order, collationSource, status);
1605 }
1606 }
1607 }
1608 } while ( order == UCOL_IGNORABLE && ch >= UCOL_FIRST_HANGUL && ch <= UCOL_LAST_HANGUL );
1609
1610 if(order == UCOL_NOT_FOUND) {
1611 order = getImplicit(ch, collationSource);
1612 }
1613 return order; /* return the CE */
1614 }
1615
1616 /* ucol_getNextCE, out-of-line version for use from other files. */
1617 U_CAPI uint32_t U_EXPORT2
ucol_getNextCE(const UCollator * coll,collIterate * collationSource,UErrorCode * status)1618 ucol_getNextCE(const UCollator *coll, collIterate *collationSource, UErrorCode *status) {
1619 return ucol_IGetNextCE(coll, collationSource, status);
1620 }
1621
1622
1623 /**
1624 * Incremental previous normalization happens here. Pick up the range of chars
1625 * identifed by FCD, normalize it into the collIterate's writable buffer,
1626 * switch the collIterate's state to use the writable buffer.
1627 * @param data collation iterator data
1628 */
1629 static
collPrevIterNormalize(collIterate * data)1630 void collPrevIterNormalize(collIterate *data)
1631 {
1632 UErrorCode status = U_ZERO_ERROR;
1633 const UChar *pEnd = data->pos; /* End normalize + 1 */
1634 const UChar *pStart;
1635
1636 /* Start normalize */
1637 if (data->fcdPosition == NULL) {
1638 pStart = data->string;
1639 }
1640 else {
1641 pStart = data->fcdPosition + 1;
1642 }
1643
1644 int32_t normLen =
1645 data->nfd->normalize(UnicodeString(FALSE, pStart, (int32_t)((pEnd - pStart) + 1)),
1646 data->writableBuffer,
1647 status).
1648 length();
1649 if(U_FAILURE(status)) {
1650 return;
1651 }
1652 /*
1653 this puts the null termination infront of the normalized string instead
1654 of the end
1655 */
1656 data->writableBuffer.insert(0, (UChar)0);
1657
1658 /*
1659 * The usual case at this point is that we've got a base
1660 * character followed by marks that were normalized. If
1661 * fcdPosition is NULL, that means that we backed up to
1662 * the beginning of the string and there's no base character.
1663 *
1664 * Forward processing will usually normalize when it sees
1665 * the first mark, so that mark will get it's natural offset
1666 * and the rest will get the offset of the character following
1667 * the marks. The base character will also get its natural offset.
1668 *
1669 * We write the offset of the base character, if there is one,
1670 * followed by the offset of the first mark and then the offsets
1671 * of the rest of the marks.
1672 */
1673 int32_t firstMarkOffset = 0;
1674 int32_t trailOffset = (int32_t)(data->pos - data->string + 1);
1675 int32_t trailCount = normLen - 1;
1676
1677 if (data->fcdPosition != NULL) {
1678 int32_t baseOffset = (int32_t)(data->fcdPosition - data->string);
1679 UChar baseChar = *data->fcdPosition;
1680
1681 firstMarkOffset = baseOffset + 1;
1682
1683 /*
1684 * If the base character is the start of a contraction, forward processing
1685 * will normalize the marks while checking for the contraction, which means
1686 * that the offset of the first mark will the same as the other marks.
1687 *
1688 * **** THIS IS PROBABLY NOT A COMPLETE TEST ****
1689 */
1690 if (baseChar >= 0x100) {
1691 uint32_t baseOrder = UTRIE_GET32_FROM_LEAD(&data->coll->mapping, baseChar);
1692
1693 if (baseOrder == UCOL_NOT_FOUND && data->coll->UCA) {
1694 baseOrder = UTRIE_GET32_FROM_LEAD(&data->coll->UCA->mapping, baseChar);
1695 }
1696
1697 if (baseOrder > UCOL_NOT_FOUND && getCETag(baseOrder) == CONTRACTION_TAG) {
1698 firstMarkOffset = trailOffset;
1699 }
1700 }
1701
1702 data->appendOffset(baseOffset, status);
1703 }
1704
1705 data->appendOffset(firstMarkOffset, status);
1706
1707 for (int32_t i = 0; i < trailCount; i += 1) {
1708 data->appendOffset(trailOffset, status);
1709 }
1710
1711 data->offsetRepeatValue = trailOffset;
1712
1713 data->offsetReturn = data->offsetStore - 1;
1714 if (data->offsetReturn == data->offsetBuffer) {
1715 data->offsetStore = data->offsetBuffer;
1716 }
1717
1718 data->pos = data->writableBuffer.getTerminatedBuffer() + 1 + normLen;
1719 data->origFlags = data->flags;
1720 data->flags |= UCOL_ITER_INNORMBUF;
1721 data->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
1722 }
1723
1724
1725 /**
1726 * Incremental FCD check for previous iteration and normalize. Called from
1727 * getPrevCE when normalization state is suspect.
1728 * When entering, the state is known to be this:
1729 * o We are working in the main buffer of the collIterate, not the side
1730 * writable buffer. When in the side buffer, normalization mode is always
1731 * off, so we won't get here.
1732 * o The leading combining class from the current character is 0 or the
1733 * trailing combining class of the previous char was zero.
1734 * True because the previous call to this function will have always exited
1735 * that way, and we get called for every char where cc might be non-zero.
1736 * @param data collation iterate struct
1737 * @return normalization status, TRUE for normalization to be done, FALSE
1738 * otherwise
1739 */
1740 static
collPrevIterFCD(collIterate * data)1741 inline UBool collPrevIterFCD(collIterate *data)
1742 {
1743 const UChar *src, *start;
1744 uint8_t leadingCC;
1745 uint8_t trailingCC = 0;
1746 uint16_t fcd;
1747 UBool result = FALSE;
1748
1749 start = data->string;
1750 src = data->pos + 1;
1751
1752 /* Get the trailing combining class of the current character. */
1753 fcd = g_nfcImpl->previousFCD16(start, src);
1754
1755 leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_);
1756
1757 if (leadingCC != 0) {
1758 /*
1759 The current char has a non-zero leading combining class.
1760 Scan backward until we find a char with a trailing cc of zero.
1761 */
1762 for (;;)
1763 {
1764 if (start == src) {
1765 data->fcdPosition = NULL;
1766 return result;
1767 }
1768
1769 fcd = g_nfcImpl->previousFCD16(start, src);
1770
1771 trailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);
1772
1773 if (trailingCC == 0) {
1774 break;
1775 }
1776
1777 if (leadingCC < trailingCC) {
1778 result = TRUE;
1779 }
1780
1781 leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_);
1782 }
1783 }
1784
1785 data->fcdPosition = (UChar *)src;
1786
1787 return result;
1788 }
1789
1790 /** gets a code unit from the string at a given offset
1791 * Handles both normal and iterative cases.
1792 * No error checking - caller beware!
1793 */
1794 static inline
peekCodeUnit(collIterate * source,int32_t offset)1795 UChar peekCodeUnit(collIterate *source, int32_t offset) {
1796 if(source->pos != NULL) {
1797 return *(source->pos + offset);
1798 } else if(source->iterator != NULL) {
1799 UChar32 c;
1800 if(offset != 0) {
1801 source->iterator->move(source->iterator, offset, UITER_CURRENT);
1802 c = source->iterator->next(source->iterator);
1803 source->iterator->move(source->iterator, -offset-1, UITER_CURRENT);
1804 } else {
1805 c = source->iterator->current(source->iterator);
1806 }
1807 return c >= 0 ? (UChar)c : 0xfffd; // If the caller works properly, we should never see c<0.
1808 } else {
1809 return 0xfffd;
1810 }
1811 }
1812
1813 // Code point version. Treats the offset as a _code point_ delta.
1814 // We cannot use U16_FWD_1_UNSAFE and similar because we might not have well-formed UTF-16.
1815 // We cannot use U16_FWD_1 and similar because we do not know the start and limit of the buffer.
1816 static inline
peekCodePoint(collIterate * source,int32_t offset)1817 UChar32 peekCodePoint(collIterate *source, int32_t offset) {
1818 UChar32 c;
1819 if(source->pos != NULL) {
1820 const UChar *p = source->pos;
1821 if(offset >= 0) {
1822 // Skip forward over (offset-1) code points.
1823 while(--offset >= 0) {
1824 if(U16_IS_LEAD(*p++) && U16_IS_TRAIL(*p)) {
1825 ++p;
1826 }
1827 }
1828 // Read the code point there.
1829 c = *p++;
1830 UChar trail;
1831 if(U16_IS_LEAD(c) && U16_IS_TRAIL(trail = *p)) {
1832 c = U16_GET_SUPPLEMENTARY(c, trail);
1833 }
1834 } else /* offset<0 */ {
1835 // Skip backward over (offset-1) code points.
1836 while(++offset < 0) {
1837 if(U16_IS_TRAIL(*--p) && U16_IS_LEAD(*(p - 1))) {
1838 --p;
1839 }
1840 }
1841 // Read the code point before that.
1842 c = *--p;
1843 UChar lead;
1844 if(U16_IS_TRAIL(c) && U16_IS_LEAD(lead = *(p - 1))) {
1845 c = U16_GET_SUPPLEMENTARY(lead, c);
1846 }
1847 }
1848 } else if(source->iterator != NULL) {
1849 if(offset >= 0) {
1850 // Skip forward over (offset-1) code points.
1851 int32_t fwd = offset;
1852 while(fwd-- > 0) {
1853 uiter_next32(source->iterator);
1854 }
1855 // Read the code point there.
1856 c = uiter_current32(source->iterator);
1857 // Return to the starting point, skipping backward over (offset-1) code points.
1858 while(offset-- > 0) {
1859 uiter_previous32(source->iterator);
1860 }
1861 } else /* offset<0 */ {
1862 // Read backward, reading offset code points, remember only the last-read one.
1863 int32_t back = offset;
1864 do {
1865 c = uiter_previous32(source->iterator);
1866 } while(++back < 0);
1867 // Return to the starting position, skipping forward over offset code points.
1868 do {
1869 uiter_next32(source->iterator);
1870 } while(++offset < 0);
1871 }
1872 } else {
1873 c = U_SENTINEL;
1874 }
1875 return c;
1876 }
1877
1878 /**
1879 * Determines if we are at the start of the data string in the backwards
1880 * collation iterator
1881 * @param data collation iterator
1882 * @return TRUE if we are at the start
1883 */
1884 static
isAtStartPrevIterate(collIterate * data)1885 inline UBool isAtStartPrevIterate(collIterate *data) {
1886 if(data->pos == NULL && data->iterator != NULL) {
1887 return !data->iterator->hasPrevious(data->iterator);
1888 }
1889 //return (collIter_bos(data)) ||
1890 return (data->pos == data->string) ||
1891 ((data->flags & UCOL_ITER_INNORMBUF) && (data->pos != NULL) &&
1892 *(data->pos - 1) == 0 && data->fcdPosition == NULL);
1893 }
1894
1895 static
goBackOne(collIterate * data)1896 inline void goBackOne(collIterate *data) {
1897 # if 0
1898 // somehow, it looks like we need to keep iterator synced up
1899 // at all times, as above.
1900 if(data->pos) {
1901 data->pos--;
1902 }
1903 if(data->iterator) {
1904 data->iterator->previous(data->iterator);
1905 }
1906 #endif
1907 if(data->iterator && (data->flags & UCOL_USE_ITERATOR)) {
1908 data->iterator->previous(data->iterator);
1909 }
1910 if(data->pos) {
1911 data->pos --;
1912 }
1913 }
1914
1915 /**
1916 * Inline function that gets a simple CE.
1917 * So what it does is that it will first check the expansion buffer. If the
1918 * expansion buffer is not empty, ie the end pointer to the expansion buffer
1919 * is different from the string pointer, we return the collation element at the
1920 * return pointer and decrement it.
1921 * For more complicated CEs it resorts to getComplicatedCE.
1922 * @param coll collator data
1923 * @param data collation iterator struct
1924 * @param status error status
1925 */
1926 static
ucol_IGetPrevCE(const UCollator * coll,collIterate * data,UErrorCode * status)1927 inline uint32_t ucol_IGetPrevCE(const UCollator *coll, collIterate *data,
1928 UErrorCode *status)
1929 {
1930 uint32_t result = (uint32_t)UCOL_NULLORDER;
1931
1932 if (data->offsetReturn != NULL) {
1933 if (data->offsetRepeatCount > 0) {
1934 data->offsetRepeatCount -= 1;
1935 } else {
1936 if (data->offsetReturn == data->offsetBuffer) {
1937 data->offsetReturn = NULL;
1938 data->offsetStore = data->offsetBuffer;
1939 } else {
1940 data->offsetReturn -= 1;
1941 }
1942 }
1943 }
1944
1945 if ((data->extendCEs && data->toReturn > data->extendCEs) ||
1946 (!data->extendCEs && data->toReturn > data->CEs))
1947 {
1948 data->toReturn -= 1;
1949 result = *(data->toReturn);
1950 if (data->CEs == data->toReturn || data->extendCEs == data->toReturn) {
1951 data->CEpos = data->toReturn;
1952 }
1953 }
1954 else {
1955 UChar ch = 0;
1956
1957 do {
1958 /*
1959 Loop handles case when incremental normalize switches to or from the
1960 side buffer / original string, and we need to start again to get the
1961 next character.
1962 */
1963 for (;;) {
1964 if (data->flags & UCOL_ITER_HASLEN) {
1965 /*
1966 Normal path for strings when length is specified.
1967 Not in side buffer because it is always null terminated.
1968 */
1969 if (data->pos <= data->string) {
1970 /* End of the main source string */
1971 return UCOL_NO_MORE_CES;
1972 }
1973 data->pos --;
1974 ch = *data->pos;
1975 }
1976 // we are using an iterator to go back. Pray for us!
1977 else if (data->flags & UCOL_USE_ITERATOR) {
1978 UChar32 iterCh = data->iterator->previous(data->iterator);
1979 if(iterCh == U_SENTINEL) {
1980 return UCOL_NO_MORE_CES;
1981 } else {
1982 ch = (UChar)iterCh;
1983 }
1984 }
1985 else {
1986 data->pos --;
1987 ch = *data->pos;
1988 /* we are in the side buffer. */
1989 if (ch == 0) {
1990 /*
1991 At the start of the normalize side buffer.
1992 Go back to string.
1993 Because pointer points to the last accessed character,
1994 hence we have to increment it by one here.
1995 */
1996 data->flags = data->origFlags;
1997 data->offsetRepeatValue = 0;
1998
1999 if (data->fcdPosition == NULL) {
2000 data->pos = data->string;
2001 return UCOL_NO_MORE_CES;
2002 }
2003 else {
2004 data->pos = data->fcdPosition + 1;
2005 }
2006
2007 continue;
2008 }
2009 }
2010
2011 if(data->flags&UCOL_HIRAGANA_Q) {
2012 if(ch>=0x3040 && ch<=0x309f) {
2013 data->flags |= UCOL_WAS_HIRAGANA;
2014 } else {
2015 data->flags &= ~UCOL_WAS_HIRAGANA;
2016 }
2017 }
2018
2019 /*
2020 * got a character to determine if there's fcd and/or normalization
2021 * stuff to do.
2022 * if the current character is not fcd.
2023 * if current character is at the start of the string
2024 * Trailing combining class == 0.
2025 * Note if pos is in the writablebuffer, norm is always 0
2026 */
2027 if (ch < ZERO_CC_LIMIT_ ||
2028 // this should propel us out of the loop in the iterator case
2029 (data->flags & UCOL_ITER_NORM) == 0 ||
2030 (data->fcdPosition != NULL && data->fcdPosition <= data->pos)
2031 || data->string == data->pos) {
2032 break;
2033 }
2034
2035 if (ch < NFC_ZERO_CC_BLOCK_LIMIT_) {
2036 /* if next character is FCD */
2037 if (data->pos == data->string) {
2038 /* First char of string is always OK for FCD check */
2039 break;
2040 }
2041
2042 /* Not first char of string, do the FCD fast test */
2043 if (*(data->pos - 1) < NFC_ZERO_CC_BLOCK_LIMIT_) {
2044 break;
2045 }
2046 }
2047
2048 /* Need a more complete FCD check and possible normalization. */
2049 if (collPrevIterFCD(data)) {
2050 collPrevIterNormalize(data);
2051 }
2052
2053 if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
2054 /* No normalization. Go ahead and process the char. */
2055 break;
2056 }
2057
2058 /*
2059 Some normalization happened.
2060 Next loop picks up a char from the normalization buffer.
2061 */
2062 }
2063
2064 /* attempt to handle contractions, after removal of the backwards
2065 contraction
2066 */
2067 if (ucol_contractionEndCP(ch, coll) && !isAtStartPrevIterate(data)) {
2068 result = ucol_prv_getSpecialPrevCE(coll, ch, UCOL_CONTRACTION, data, status);
2069 } else {
2070 if (ch <= 0xFF) {
2071 result = coll->latinOneMapping[ch];
2072 }
2073 else {
2074 // Always use UCA for [3400..9FFF], [AC00..D7AF]
2075 // **** [FA0E..FA2F] ?? ****
2076 if ((data->flags & UCOL_FORCE_HAN_IMPLICIT) != 0 &&
2077 (ch >= 0x3400 && ch <= 0xD7AF)) {
2078 if (ch > 0x9FFF && ch < 0xAC00) {
2079 // between the two target ranges; do normal lookup
2080 // **** this range is YI, Modifier tone letters, ****
2081 // **** Latin-D, Syloti Nagari, Phagas-pa. ****
2082 // **** Latin-D might be tailored, so we need to ****
2083 // **** do the normal lookup for these guys. ****
2084 result = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
2085 } else {
2086 result = UCOL_NOT_FOUND;
2087 }
2088 } else {
2089 result = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
2090 }
2091 }
2092 if (result > UCOL_NOT_FOUND) {
2093 result = ucol_prv_getSpecialPrevCE(coll, ch, result, data, status);
2094 }
2095 if (result == UCOL_NOT_FOUND) { // Not found in master list
2096 if (!isAtStartPrevIterate(data) &&
2097 ucol_contractionEndCP(ch, data->coll))
2098 {
2099 result = UCOL_CONTRACTION;
2100 } else {
2101 if(coll->UCA) {
2102 result = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);
2103 }
2104 }
2105
2106 if (result > UCOL_NOT_FOUND) {
2107 if(coll->UCA) {
2108 result = ucol_prv_getSpecialPrevCE(coll->UCA, ch, result, data, status);
2109 }
2110 }
2111 }
2112 }
2113 } while ( result == UCOL_IGNORABLE && ch >= UCOL_FIRST_HANGUL && ch <= UCOL_LAST_HANGUL );
2114
2115 if(result == UCOL_NOT_FOUND) {
2116 result = getPrevImplicit(ch, data);
2117 }
2118 }
2119
2120 return result;
2121 }
2122
2123
2124 /* ucol_getPrevCE, out-of-line version for use from other files. */
2125 U_CFUNC uint32_t U_EXPORT2
ucol_getPrevCE(const UCollator * coll,collIterate * data,UErrorCode * status)2126 ucol_getPrevCE(const UCollator *coll, collIterate *data,
2127 UErrorCode *status) {
2128 return ucol_IGetPrevCE(coll, data, status);
2129 }
2130
2131
2132 /* this should be connected to special Jamo handling */
2133 U_CFUNC uint32_t U_EXPORT2
ucol_getFirstCE(const UCollator * coll,UChar u,UErrorCode * status)2134 ucol_getFirstCE(const UCollator *coll, UChar u, UErrorCode *status) {
2135 collIterate colIt;
2136 IInit_collIterate(coll, &u, 1, &colIt, status);
2137 if(U_FAILURE(*status)) {
2138 return 0;
2139 }
2140 return ucol_IGetNextCE(coll, &colIt, status);
2141 }
2142
2143 /**
2144 * Inserts the argument character into the end of the buffer pushing back the
2145 * null terminator.
2146 * @param data collIterate struct data
2147 * @param ch character to be appended
2148 * @return the position of the new addition
2149 */
2150 static
insertBufferEnd(collIterate * data,UChar ch)2151 inline const UChar * insertBufferEnd(collIterate *data, UChar ch)
2152 {
2153 int32_t oldLength = data->writableBuffer.length();
2154 return data->writableBuffer.append(ch).getTerminatedBuffer() + oldLength;
2155 }
2156
2157 /**
2158 * Inserts the argument string into the end of the buffer pushing back the
2159 * null terminator.
2160 * @param data collIterate struct data
2161 * @param string to be appended
2162 * @param length of the string to be appended
2163 * @return the position of the new addition
2164 */
2165 static
insertBufferEnd(collIterate * data,const UChar * str,int32_t length)2166 inline const UChar * insertBufferEnd(collIterate *data, const UChar *str, int32_t length)
2167 {
2168 int32_t oldLength = data->writableBuffer.length();
2169 return data->writableBuffer.append(str, length).getTerminatedBuffer() + oldLength;
2170 }
2171
2172 /**
2173 * Special normalization function for contraction in the forwards iterator.
2174 * This normalization sequence will place the current character at source->pos
2175 * and its following normalized sequence into the buffer.
2176 * The fcd position, pos will be changed.
2177 * pos will now point to positions in the buffer.
2178 * Flags will be changed accordingly.
2179 * @param data collation iterator data
2180 */
2181 static
normalizeNextContraction(collIterate * data)2182 inline void normalizeNextContraction(collIterate *data)
2183 {
2184 int32_t strsize;
2185 UErrorCode status = U_ZERO_ERROR;
2186 /* because the pointer points to the next character */
2187 const UChar *pStart = data->pos - 1;
2188 const UChar *pEnd;
2189
2190 if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
2191 data->writableBuffer.setTo(*(pStart - 1));
2192 strsize = 1;
2193 }
2194 else {
2195 strsize = data->writableBuffer.length();
2196 }
2197
2198 pEnd = data->fcdPosition;
2199
2200 data->writableBuffer.append(
2201 data->nfd->normalize(UnicodeString(FALSE, pStart, (int32_t)(pEnd - pStart)), status));
2202 if(U_FAILURE(status)) {
2203 return;
2204 }
2205
2206 data->pos = data->writableBuffer.getTerminatedBuffer() + strsize;
2207 data->origFlags = data->flags;
2208 data->flags |= UCOL_ITER_INNORMBUF;
2209 data->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
2210 }
2211
2212 /**
2213 * Contraction character management function that returns the next character
2214 * for the forwards iterator.
2215 * Does nothing if the next character is in buffer and not the first character
2216 * in it.
2217 * Else it checks next character in data string to see if it is normalizable.
2218 * If it is not, the character is simply copied into the buffer, else
2219 * the whole normalized substring is copied into the buffer, including the
2220 * current character.
2221 * @param data collation element iterator data
2222 * @return next character
2223 */
2224 static
getNextNormalizedChar(collIterate * data)2225 inline UChar getNextNormalizedChar(collIterate *data)
2226 {
2227 UChar nextch;
2228 UChar ch;
2229 // Here we need to add the iterator code. One problem is the way
2230 // end of string is handled. If we just return next char, it could
2231 // be the sentinel. Most of the cases already check for this, but we
2232 // need to be sure.
2233 if ((data->flags & (UCOL_ITER_NORM | UCOL_ITER_INNORMBUF)) == 0 ) {
2234 /* if no normalization and not in buffer. */
2235 if(data->flags & UCOL_USE_ITERATOR) {
2236 return (UChar)data->iterator->next(data->iterator);
2237 } else {
2238 return *(data->pos ++);
2239 }
2240 }
2241
2242 //if (data->flags & UCOL_ITER_NORM && data->flags & UCOL_USE_ITERATOR) {
2243 //normalizeIterator(data);
2244 //}
2245
2246 UBool innormbuf = (UBool)(data->flags & UCOL_ITER_INNORMBUF);
2247 if ((innormbuf && *data->pos != 0) ||
2248 (data->fcdPosition != NULL && !innormbuf &&
2249 data->pos < data->fcdPosition)) {
2250 /*
2251 if next character is in normalized buffer, no further normalization
2252 is required
2253 */
2254 return *(data->pos ++);
2255 }
2256
2257 if (data->flags & UCOL_ITER_HASLEN) {
2258 /* in data string */
2259 if (data->pos + 1 == data->endp) {
2260 return *(data->pos ++);
2261 }
2262 }
2263 else {
2264 if (innormbuf) {
2265 // inside the normalization buffer, but at the end
2266 // (since we encountered zero). This means, in the
2267 // case we're using char iterator, that we need to
2268 // do another round of normalization.
2269 //if(data->origFlags & UCOL_USE_ITERATOR) {
2270 // we need to restore original flags,
2271 // otherwise, we'll lose them
2272 //data->flags = data->origFlags;
2273 //normalizeIterator(data);
2274 //return *(data->pos++);
2275 //} else {
2276 /*
2277 in writable buffer, at this point fcdPosition can not be
2278 pointing to the end of the data string. see contracting tag.
2279 */
2280 if(data->fcdPosition) {
2281 if (*(data->fcdPosition + 1) == 0 ||
2282 data->fcdPosition + 1 == data->endp) {
2283 /* at the end of the string, dump it into the normalizer */
2284 data->pos = insertBufferEnd(data, *(data->fcdPosition)) + 1;
2285 // Check if data->pos received a null pointer
2286 if (data->pos == NULL) {
2287 return (UChar)-1; // Return to indicate error.
2288 }
2289 return *(data->fcdPosition ++);
2290 }
2291 data->pos = data->fcdPosition;
2292 } else if(data->origFlags & UCOL_USE_ITERATOR) {
2293 // if we are here, we're using a normalizing iterator.
2294 // we should just continue further.
2295 data->flags = data->origFlags;
2296 data->pos = NULL;
2297 return (UChar)data->iterator->next(data->iterator);
2298 }
2299 //}
2300 }
2301 else {
2302 if (*(data->pos + 1) == 0) {
2303 return *(data->pos ++);
2304 }
2305 }
2306 }
2307
2308 ch = *data->pos ++;
2309 nextch = *data->pos;
2310
2311 /*
2312 * if the current character is not fcd.
2313 * Trailing combining class == 0.
2314 */
2315 if ((data->fcdPosition == NULL || data->fcdPosition < data->pos) &&
2316 (nextch >= NFC_ZERO_CC_BLOCK_LIMIT_ ||
2317 ch >= NFC_ZERO_CC_BLOCK_LIMIT_)) {
2318 /*
2319 Need a more complete FCD check and possible normalization.
2320 normalize substring will be appended to buffer
2321 */
2322 if (collIterFCD(data)) {
2323 normalizeNextContraction(data);
2324 return *(data->pos ++);
2325 }
2326 else if (innormbuf) {
2327 /* fcdposition shifted even when there's no normalization, if we
2328 don't input the rest into this, we'll get the wrong position when
2329 we reach the end of the writableBuffer */
2330 int32_t length = (int32_t)(data->fcdPosition - data->pos + 1);
2331 data->pos = insertBufferEnd(data, data->pos - 1, length);
2332 // Check if data->pos received a null pointer
2333 if (data->pos == NULL) {
2334 return (UChar)-1; // Return to indicate error.
2335 }
2336 return *(data->pos ++);
2337 }
2338 }
2339
2340 if (innormbuf) {
2341 /*
2342 no normalization is to be done hence only one character will be
2343 appended to the buffer.
2344 */
2345 data->pos = insertBufferEnd(data, ch) + 1;
2346 // Check if data->pos received a null pointer
2347 if (data->pos == NULL) {
2348 return (UChar)-1; // Return to indicate error.
2349 }
2350 }
2351
2352 /* points back to the pos in string */
2353 return ch;
2354 }
2355
2356
2357
2358 /**
2359 * Function to copy the buffer into writableBuffer and sets the fcd position to
2360 * the correct position
2361 * @param source data string source
2362 * @param buffer character buffer
2363 */
2364 static
setDiscontiguosAttribute(collIterate * source,const UnicodeString & buffer)2365 inline void setDiscontiguosAttribute(collIterate *source, const UnicodeString &buffer)
2366 {
2367 /* okay confusing part here. to ensure that the skipped characters are
2368 considered later, we need to place it in the appropriate position in the
2369 normalization buffer and reassign the pos pointer. simple case if pos
2370 reside in string, simply copy to normalization buffer and
2371 fcdposition = pos, pos = start of normalization buffer. if pos in
2372 normalization buffer, we'll insert the copy infront of pos and point pos
2373 to the start of the normalization buffer. why am i doing these copies?
2374 well, so that the whole chunk of codes in the getNextCE, ucol_prv_getSpecialCE does
2375 not require any changes, which be really painful. */
2376 if (source->flags & UCOL_ITER_INNORMBUF) {
2377 int32_t replaceLength = source->pos - source->writableBuffer.getBuffer();
2378 source->writableBuffer.replace(0, replaceLength, buffer);
2379 }
2380 else {
2381 source->fcdPosition = source->pos;
2382 source->origFlags = source->flags;
2383 source->flags |= UCOL_ITER_INNORMBUF;
2384 source->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR);
2385 source->writableBuffer = buffer;
2386 }
2387
2388 source->pos = source->writableBuffer.getTerminatedBuffer();
2389 }
2390
2391 /**
2392 * Function to get the discontiguos collation element within the source.
2393 * Note this function will set the position to the appropriate places.
2394 * @param coll current collator used
2395 * @param source data string source
2396 * @param constart index to the start character in the contraction table
2397 * @return discontiguos collation element offset
2398 */
2399 static
getDiscontiguous(const UCollator * coll,collIterate * source,const UChar * constart)2400 uint32_t getDiscontiguous(const UCollator *coll, collIterate *source,
2401 const UChar *constart)
2402 {
2403 /* source->pos currently points to the second combining character after
2404 the start character */
2405 const UChar *temppos = source->pos;
2406 UnicodeString buffer;
2407 const UChar *tempconstart = constart;
2408 uint8_t tempflags = source->flags;
2409 UBool multicontraction = FALSE;
2410 collIterateState discState;
2411
2412 backupState(source, &discState);
2413
2414 buffer.setTo(peekCodePoint(source, -1));
2415 for (;;) {
2416 UChar *UCharOffset;
2417 UChar schar,
2418 tchar;
2419 uint32_t result;
2420
2421 if (((source->flags & UCOL_ITER_HASLEN) && source->pos >= source->endp)
2422 || (peekCodeUnit(source, 0) == 0 &&
2423 //|| (*source->pos == 0 &&
2424 ((source->flags & UCOL_ITER_INNORMBUF) == 0 ||
2425 source->fcdPosition == NULL ||
2426 source->fcdPosition == source->endp ||
2427 *(source->fcdPosition) == 0 ||
2428 u_getCombiningClass(*(source->fcdPosition)) == 0)) ||
2429 /* end of string in null terminated string or stopped by a
2430 null character, note fcd does not always point to a base
2431 character after the discontiguos change */
2432 u_getCombiningClass(peekCodePoint(source, 0)) == 0) {
2433 //u_getCombiningClass(*(source->pos)) == 0) {
2434 //constart = (UChar *)coll->image + getContractOffset(CE);
2435 if (multicontraction) {
2436 source->pos = temppos - 1;
2437 setDiscontiguosAttribute(source, buffer);
2438 return *(coll->contractionCEs +
2439 (tempconstart - coll->contractionIndex));
2440 }
2441 constart = tempconstart;
2442 break;
2443 }
2444
2445 UCharOffset = (UChar *)(tempconstart + 1); /* skip the backward offset*/
2446 schar = getNextNormalizedChar(source);
2447
2448 while (schar > (tchar = *UCharOffset)) {
2449 UCharOffset++;
2450 }
2451
2452 if (schar != tchar) {
2453 /* not the correct codepoint. we stuff the current codepoint into
2454 the discontiguos buffer and try the next character */
2455 buffer.append(schar);
2456 continue;
2457 }
2458 else {
2459 if (u_getCombiningClass(schar) ==
2460 u_getCombiningClass(peekCodePoint(source, -2))) {
2461 buffer.append(schar);
2462 continue;
2463 }
2464 result = *(coll->contractionCEs +
2465 (UCharOffset - coll->contractionIndex));
2466 }
2467
2468 if (result == UCOL_NOT_FOUND) {
2469 break;
2470 } else if (isContraction(result)) {
2471 /* this is a multi-contraction*/
2472 tempconstart = (UChar *)coll->image + getContractOffset(result);
2473 if (*(coll->contractionCEs + (constart - coll->contractionIndex))
2474 != UCOL_NOT_FOUND) {
2475 multicontraction = TRUE;
2476 temppos = source->pos + 1;
2477 }
2478 } else {
2479 setDiscontiguosAttribute(source, buffer);
2480 return result;
2481 }
2482 }
2483
2484 /* no problems simply reverting just like that,
2485 if we are in string before getting into this function, points back to
2486 string hence no problem.
2487 if we are in normalization buffer before getting into this function,
2488 since we'll never use another normalization within this function, we
2489 know that fcdposition points to a base character. the normalization buffer
2490 never change, hence this revert works. */
2491 loadState(source, &discState, TRUE);
2492 goBackOne(source);
2493
2494 //source->pos = temppos - 1;
2495 source->flags = tempflags;
2496 return *(coll->contractionCEs + (constart - coll->contractionIndex));
2497 }
2498
2499 /* now uses Mark's getImplicitPrimary code */
2500 static
getImplicit(UChar32 cp,collIterate * collationSource)2501 inline uint32_t getImplicit(UChar32 cp, collIterate *collationSource) {
2502 uint32_t r = uprv_uca_getImplicitPrimary(cp);
2503 *(collationSource->CEpos++) = ((r & 0x0000FFFF)<<16) | 0x000000C0;
2504 collationSource->offsetRepeatCount += 1;
2505 return (r & UCOL_PRIMARYMASK) | 0x00000505; // This was 'order'
2506 }
2507
2508 /**
2509 * Inserts the argument character into the front of the buffer replacing the
2510 * front null terminator.
2511 * @param data collation element iterator data
2512 * @param ch character to be appended
2513 */
2514 static
insertBufferFront(collIterate * data,UChar ch)2515 inline void insertBufferFront(collIterate *data, UChar ch)
2516 {
2517 data->pos = data->writableBuffer.setCharAt(0, ch).insert(0, (UChar)0).getTerminatedBuffer() + 2;
2518 }
2519
2520 /**
2521 * Special normalization function for contraction in the previous iterator.
2522 * This normalization sequence will place the current character at source->pos
2523 * and its following normalized sequence into the buffer.
2524 * The fcd position, pos will be changed.
2525 * pos will now point to positions in the buffer.
2526 * Flags will be changed accordingly.
2527 * @param data collation iterator data
2528 */
2529 static
normalizePrevContraction(collIterate * data,UErrorCode * status)2530 inline void normalizePrevContraction(collIterate *data, UErrorCode *status)
2531 {
2532 const UChar *pEnd = data->pos + 1; /* End normalize + 1 */
2533 const UChar *pStart;
2534
2535 UnicodeString endOfBuffer;
2536 if (data->flags & UCOL_ITER_HASLEN) {
2537 /*
2538 normalization buffer not used yet, we'll pull down the next
2539 character into the end of the buffer
2540 */
2541 endOfBuffer.setTo(*pEnd);
2542 }
2543 else {
2544 endOfBuffer.setTo(data->writableBuffer, 1); // after the leading NUL
2545 }
2546
2547 if (data->fcdPosition == NULL) {
2548 pStart = data->string;
2549 }
2550 else {
2551 pStart = data->fcdPosition + 1;
2552 }
2553 int32_t normLen =
2554 data->nfd->normalize(UnicodeString(FALSE, pStart, (int32_t)(pEnd - pStart)),
2555 data->writableBuffer,
2556 *status).
2557 length();
2558 if(U_FAILURE(*status)) {
2559 return;
2560 }
2561 /*
2562 this puts the null termination infront of the normalized string instead
2563 of the end
2564 */
2565 data->pos =
2566 data->writableBuffer.insert(0, (UChar)0).append(endOfBuffer).getTerminatedBuffer() +
2567 1 + normLen;
2568 data->origFlags = data->flags;
2569 data->flags |= UCOL_ITER_INNORMBUF;
2570 data->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
2571 }
2572
2573 /**
2574 * Contraction character management function that returns the previous character
2575 * for the backwards iterator.
2576 * Does nothing if the previous character is in buffer and not the first
2577 * character in it.
2578 * Else it checks previous character in data string to see if it is
2579 * normalizable.
2580 * If it is not, the character is simply copied into the buffer, else
2581 * the whole normalized substring is copied into the buffer, including the
2582 * current character.
2583 * @param data collation element iterator data
2584 * @return previous character
2585 */
2586 static
getPrevNormalizedChar(collIterate * data,UErrorCode * status)2587 inline UChar getPrevNormalizedChar(collIterate *data, UErrorCode *status)
2588 {
2589 UChar prevch;
2590 UChar ch;
2591 const UChar *start;
2592 UBool innormbuf = (UBool)(data->flags & UCOL_ITER_INNORMBUF);
2593 if ((data->flags & (UCOL_ITER_NORM | UCOL_ITER_INNORMBUF)) == 0 ||
2594 (innormbuf && *(data->pos - 1) != 0)) {
2595 /*
2596 if no normalization.
2597 if previous character is in normalized buffer, no further normalization
2598 is required
2599 */
2600 if(data->flags & UCOL_USE_ITERATOR) {
2601 data->iterator->move(data->iterator, -1, UITER_CURRENT);
2602 return (UChar)data->iterator->next(data->iterator);
2603 } else {
2604 return *(data->pos - 1);
2605 }
2606 }
2607
2608 start = data->pos;
2609 if ((data->fcdPosition==NULL)||(data->flags & UCOL_ITER_HASLEN)) {
2610 /* in data string */
2611 if ((start - 1) == data->string) {
2612 return *(start - 1);
2613 }
2614 start --;
2615 ch = *start;
2616 prevch = *(start - 1);
2617 }
2618 else {
2619 /*
2620 in writable buffer, at this point fcdPosition can not be NULL.
2621 see contracting tag.
2622 */
2623 if (data->fcdPosition == data->string) {
2624 /* at the start of the string, just dump it into the normalizer */
2625 insertBufferFront(data, *(data->fcdPosition));
2626 data->fcdPosition = NULL;
2627 return *(data->pos - 1);
2628 }
2629 start = data->fcdPosition;
2630 ch = *start;
2631 prevch = *(start - 1);
2632 }
2633 /*
2634 * if the current character is not fcd.
2635 * Trailing combining class == 0.
2636 */
2637 if (data->fcdPosition > start &&
2638 (ch >= NFC_ZERO_CC_BLOCK_LIMIT_ || prevch >= NFC_ZERO_CC_BLOCK_LIMIT_))
2639 {
2640 /*
2641 Need a more complete FCD check and possible normalization.
2642 normalize substring will be appended to buffer
2643 */
2644 const UChar *backuppos = data->pos;
2645 data->pos = start;
2646 if (collPrevIterFCD(data)) {
2647 normalizePrevContraction(data, status);
2648 return *(data->pos - 1);
2649 }
2650 data->pos = backuppos;
2651 data->fcdPosition ++;
2652 }
2653
2654 if (innormbuf) {
2655 /*
2656 no normalization is to be done hence only one character will be
2657 appended to the buffer.
2658 */
2659 insertBufferFront(data, ch);
2660 data->fcdPosition --;
2661 }
2662
2663 return ch;
2664 }
2665
2666 /* This function handles the special CEs like contractions, expansions, surrogates, Thai */
2667 /* It is called by getNextCE */
2668
2669 /* The following should be even */
2670 #define UCOL_MAX_DIGITS_FOR_NUMBER 254
2671
ucol_prv_getSpecialCE(const UCollator * coll,UChar ch,uint32_t CE,collIterate * source,UErrorCode * status)2672 uint32_t ucol_prv_getSpecialCE(const UCollator *coll, UChar ch, uint32_t CE, collIterate *source, UErrorCode *status) {
2673 collIterateState entryState;
2674 backupState(source, &entryState);
2675 UChar32 cp = ch;
2676
2677 for (;;) {
2678 // This loop will repeat only in the case of contractions, and only when a contraction
2679 // is found and the first CE resulting from that contraction is itself a special
2680 // (an expansion, for example.) All other special CE types are fully handled the
2681 // first time through, and the loop exits.
2682
2683 const uint32_t *CEOffset = NULL;
2684 switch(getCETag(CE)) {
2685 case NOT_FOUND_TAG:
2686 /* This one is not found, and we'll let somebody else bother about it... no more games */
2687 return CE;
2688 case SPEC_PROC_TAG:
2689 {
2690 // Special processing is getting a CE that is preceded by a certain prefix
2691 // Currently this is only needed for optimizing Japanese length and iteration marks.
2692 // When we encouter a special processing tag, we go backwards and try to see if
2693 // we have a match.
2694 // Contraction tables are used - so the whole process is not unlike contraction.
2695 // prefix data is stored backwards in the table.
2696 const UChar *UCharOffset;
2697 UChar schar, tchar;
2698 collIterateState prefixState;
2699 backupState(source, &prefixState);
2700 loadState(source, &entryState, TRUE);
2701 goBackOne(source); // We want to look at the point where we entered - actually one
2702 // before that...
2703
2704 for(;;) {
2705 // This loop will run once per source string character, for as long as we
2706 // are matching a potential contraction sequence
2707
2708 // First we position ourselves at the begining of contraction sequence
2709 const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE);
2710 if (collIter_bos(source)) {
2711 CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
2712 break;
2713 }
2714 schar = getPrevNormalizedChar(source, status);
2715 goBackOne(source);
2716
2717 while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
2718 UCharOffset++;
2719 }
2720
2721 if (schar == tchar) {
2722 // Found the source string char in the table.
2723 // Pick up the corresponding CE from the table.
2724 CE = *(coll->contractionCEs +
2725 (UCharOffset - coll->contractionIndex));
2726 }
2727 else
2728 {
2729 // Source string char was not in the table.
2730 // We have not found the prefix.
2731 CE = *(coll->contractionCEs +
2732 (ContractionStart - coll->contractionIndex));
2733 }
2734
2735 if(!isPrefix(CE)) {
2736 // The source string char was in the contraction table, and the corresponding
2737 // CE is not a prefix CE. We found the prefix, break
2738 // out of loop, this CE will end up being returned. This is the normal
2739 // way out of prefix handling when the source actually contained
2740 // the prefix.
2741 break;
2742 }
2743 }
2744 if(CE != UCOL_NOT_FOUND) { // we found something and we can merilly continue
2745 loadState(source, &prefixState, TRUE);
2746 if(source->origFlags & UCOL_USE_ITERATOR) {
2747 source->flags = source->origFlags;
2748 }
2749 } else { // prefix search was a failure, we have to backup all the way to the start
2750 loadState(source, &entryState, TRUE);
2751 }
2752 break;
2753 }
2754 case CONTRACTION_TAG:
2755 {
2756 /* This should handle contractions */
2757 collIterateState state;
2758 backupState(source, &state);
2759 uint32_t firstCE = *(coll->contractionCEs + ((UChar *)coll->image+getContractOffset(CE) - coll->contractionIndex)); //UCOL_NOT_FOUND;
2760 const UChar *UCharOffset;
2761 UChar schar, tchar;
2762
2763 for (;;) {
2764 /* This loop will run once per source string character, for as long as we */
2765 /* are matching a potential contraction sequence */
2766
2767 /* First we position ourselves at the begining of contraction sequence */
2768 const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE);
2769
2770 if (collIter_eos(source)) {
2771 // Ran off the end of the source string.
2772 CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
2773 // So we'll pick whatever we have at the point...
2774 if (CE == UCOL_NOT_FOUND) {
2775 // back up the source over all the chars we scanned going into this contraction.
2776 CE = firstCE;
2777 loadState(source, &state, TRUE);
2778 if(source->origFlags & UCOL_USE_ITERATOR) {
2779 source->flags = source->origFlags;
2780 }
2781 }
2782 break;
2783 }
2784
2785 uint8_t maxCC = (uint8_t)(*(UCharOffset)&0xFF); /*get the discontiguos stuff */ /* skip the backward offset, see above */
2786 uint8_t allSame = (uint8_t)(*(UCharOffset++)>>8);
2787
2788 schar = getNextNormalizedChar(source);
2789 while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
2790 UCharOffset++;
2791 }
2792
2793 if (schar == tchar) {
2794 // Found the source string char in the contraction table.
2795 // Pick up the corresponding CE from the table.
2796 CE = *(coll->contractionCEs +
2797 (UCharOffset - coll->contractionIndex));
2798 }
2799 else
2800 {
2801 // Source string char was not in contraction table.
2802 // Unless we have a discontiguous contraction, we have finished
2803 // with this contraction.
2804 // in order to do the proper detection, we
2805 // need to see if we're dealing with a supplementary
2806 /* We test whether the next two char are surrogate pairs.
2807 * This test is done if the iterator is not NULL.
2808 * If there is no surrogate pair, the iterator
2809 * goes back one if needed. */
2810 UChar32 miss = schar;
2811 if (source->iterator) {
2812 UChar32 surrNextChar; /* the next char in the iteration to test */
2813 int32_t prevPos; /* holds the previous position before move forward of the source iterator */
2814 if(U16_IS_LEAD(schar) && source->iterator->hasNext(source->iterator)) {
2815 prevPos = source->iterator->index;
2816 surrNextChar = getNextNormalizedChar(source);
2817 if (U16_IS_TRAIL(surrNextChar)) {
2818 miss = U16_GET_SUPPLEMENTARY(schar, surrNextChar);
2819 } else if (prevPos < source->iterator->index){
2820 goBackOne(source);
2821 }
2822 }
2823 } else if (U16_IS_LEAD(schar)) {
2824 miss = U16_GET_SUPPLEMENTARY(schar, getNextNormalizedChar(source));
2825 }
2826
2827 uint8_t sCC;
2828 if (miss < 0x300 ||
2829 maxCC == 0 ||
2830 (sCC = i_getCombiningClass(miss, coll)) == 0 ||
2831 sCC>maxCC ||
2832 (allSame != 0 && sCC == maxCC) ||
2833 collIter_eos(source))
2834 {
2835 // Contraction can not be discontiguous.
2836 goBackOne(source); // back up the source string by one,
2837 // because the character we just looked at was
2838 // not part of the contraction. */
2839 if(U_IS_SUPPLEMENTARY(miss)) {
2840 goBackOne(source);
2841 }
2842 CE = *(coll->contractionCEs +
2843 (ContractionStart - coll->contractionIndex));
2844 } else {
2845 //
2846 // Contraction is possibly discontiguous.
2847 // Scan more of source string looking for a match
2848 //
2849 UChar tempchar;
2850 /* find the next character if schar is not a base character
2851 and we are not yet at the end of the string */
2852 tempchar = getNextNormalizedChar(source);
2853 // probably need another supplementary thingie here
2854 goBackOne(source);
2855 if (i_getCombiningClass(tempchar, coll) == 0) {
2856 goBackOne(source);
2857 if(U_IS_SUPPLEMENTARY(miss)) {
2858 goBackOne(source);
2859 }
2860 /* Spit out the last char of the string, wasn't tasty enough */
2861 CE = *(coll->contractionCEs +
2862 (ContractionStart - coll->contractionIndex));
2863 } else {
2864 CE = getDiscontiguous(coll, source, ContractionStart);
2865 }
2866 }
2867 } // else after if(schar == tchar)
2868
2869 if(CE == UCOL_NOT_FOUND) {
2870 /* The Source string did not match the contraction that we were checking. */
2871 /* Back up the source position to undo the effects of having partially */
2872 /* scanned through what ultimately proved to not be a contraction. */
2873 loadState(source, &state, TRUE);
2874 CE = firstCE;
2875 break;
2876 }
2877
2878 if(!isContraction(CE)) {
2879 // The source string char was in the contraction table, and the corresponding
2880 // CE is not a contraction CE. We completed the contraction, break
2881 // out of loop, this CE will end up being returned. This is the normal
2882 // way out of contraction handling when the source actually contained
2883 // the contraction.
2884 break;
2885 }
2886
2887
2888 // The source string char was in the contraction table, and the corresponding
2889 // CE is IS a contraction CE. We will continue looping to check the source
2890 // string for the remaining chars in the contraction.
2891 uint32_t tempCE = *(coll->contractionCEs + (ContractionStart - coll->contractionIndex));
2892 if(tempCE != UCOL_NOT_FOUND) {
2893 // We have scanned a a section of source string for which there is a
2894 // CE from the contraction table. Remember the CE and scan position, so
2895 // that we can return to this point if further scanning fails to
2896 // match a longer contraction sequence.
2897 firstCE = tempCE;
2898
2899 goBackOne(source);
2900 backupState(source, &state);
2901 getNextNormalizedChar(source);
2902
2903 // Another way to do this is:
2904 //collIterateState tempState;
2905 //backupState(source, &tempState);
2906 //goBackOne(source);
2907 //backupState(source, &state);
2908 //loadState(source, &tempState, TRUE);
2909
2910 // The problem is that for incomplete contractions we have to remember the previous
2911 // position. Before, the only thing I needed to do was state.pos--;
2912 // After iterator introduction and especially after introduction of normalizing
2913 // iterators, it became much more difficult to decrease the saved state.
2914 // I'm not yet sure which of the two methods above is faster.
2915 }
2916 } // for(;;)
2917 break;
2918 } // case CONTRACTION_TAG:
2919 case LONG_PRIMARY_TAG:
2920 {
2921 *(source->CEpos++) = ((CE & 0xFF)<<24)|UCOL_CONTINUATION_MARKER;
2922 CE = ((CE & 0xFFFF00) << 8) | (UCOL_BYTE_COMMON << 8) | UCOL_BYTE_COMMON;
2923 source->offsetRepeatCount += 1;
2924 return CE;
2925 }
2926 case EXPANSION_TAG:
2927 {
2928 /* This should handle expansion. */
2929 /* NOTE: we can encounter both continuations and expansions in an expansion! */
2930 /* I have to decide where continuations are going to be dealt with */
2931 uint32_t size;
2932 uint32_t i; /* general counter */
2933
2934 CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */
2935 size = getExpansionCount(CE);
2936 CE = *CEOffset++;
2937 //source->offsetRepeatCount = -1;
2938
2939 if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */
2940 for(i = 1; i<size; i++) {
2941 *(source->CEpos++) = *CEOffset++;
2942 source->offsetRepeatCount += 1;
2943 }
2944 } else { /* else, we do */
2945 while(*CEOffset != 0) {
2946 *(source->CEpos++) = *CEOffset++;
2947 source->offsetRepeatCount += 1;
2948 }
2949 }
2950
2951 return CE;
2952 }
2953 case DIGIT_TAG:
2954 {
2955 /*
2956 We do a check to see if we want to collate digits as numbers; if so we generate
2957 a custom collation key. Otherwise we pull out the value stored in the expansion table.
2958 */
2959 //uint32_t size;
2960 uint32_t i; /* general counter */
2961
2962 if (source->coll->numericCollation == UCOL_ON){
2963 collIterateState digitState = {0,0,0,0,0,0,0,0,0};
2964 UChar32 char32 = 0;
2965 int32_t digVal = 0;
2966
2967 uint32_t digIndx = 0;
2968 uint32_t endIndex = 0;
2969 uint32_t trailingZeroIndex = 0;
2970
2971 uint8_t collateVal = 0;
2972
2973 UBool nonZeroValReached = FALSE;
2974
2975 uint8_t numTempBuf[UCOL_MAX_DIGITS_FOR_NUMBER/2 + 3]; // I just need a temporary place to store my generated CEs.
2976 /*
2977 We parse the source string until we hit a char that's NOT a digit.
2978 Use this u_charDigitValue. This might be slow because we have to
2979 handle surrogates...
2980 */
2981 /*
2982 if (U16_IS_LEAD(ch)){
2983 if (!collIter_eos(source)) {
2984 backupState(source, &digitState);
2985 UChar trail = getNextNormalizedChar(source);
2986 if(U16_IS_TRAIL(trail)) {
2987 char32 = U16_GET_SUPPLEMENTARY(ch, trail);
2988 } else {
2989 loadState(source, &digitState, TRUE);
2990 char32 = ch;
2991 }
2992 } else {
2993 char32 = ch;
2994 }
2995 } else {
2996 char32 = ch;
2997 }
2998 digVal = u_charDigitValue(char32);
2999 */
3000 digVal = u_charDigitValue(cp); // if we have arrived here, we have
3001 // already processed possible supplementaries that trigered the digit tag -
3002 // all supplementaries are marked in the UCA.
3003 /*
3004 We pad a zero in front of the first element anyways. This takes
3005 care of the (probably) most common case where people are sorting things followed
3006 by a single digit
3007 */
3008 digIndx++;
3009 for(;;){
3010 // Make sure we have enough space. No longer needed;
3011 // at this point digIndx now has a max value of UCOL_MAX_DIGITS_FOR_NUMBER
3012 // (it has been pre-incremented) so we just ensure that numTempBuf is big enough
3013 // (UCOL_MAX_DIGITS_FOR_NUMBER/2 + 3).
3014
3015 // Skipping over leading zeroes.
3016 if (digVal != 0) {
3017 nonZeroValReached = TRUE;
3018 }
3019 if (nonZeroValReached) {
3020 /*
3021 We parse the digit string into base 100 numbers (this fits into a byte).
3022 We only add to the buffer in twos, thus if we are parsing an odd character,
3023 that serves as the 'tens' digit while the if we are parsing an even one, that
3024 is the 'ones' digit. We dumped the parsed base 100 value (collateVal) into
3025 a buffer. We multiply each collateVal by 2 (to give us room) and add 5 (to avoid
3026 overlapping magic CE byte values). The last byte we subtract 1 to ensure it is less
3027 than all the other bytes.
3028 */
3029
3030 if (digIndx % 2 == 1){
3031 collateVal += (uint8_t)digVal;
3032
3033 // We don't enter the low-order-digit case unless we've already seen
3034 // the high order, or for the first digit, which is always non-zero.
3035 if (collateVal != 0)
3036 trailingZeroIndex = 0;
3037
3038 numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6;
3039 collateVal = 0;
3040 }
3041 else{
3042 // We drop the collation value into the buffer so if we need to do
3043 // a "front patch" we don't have to check to see if we're hitting the
3044 // last element.
3045 collateVal = (uint8_t)(digVal * 10);
3046
3047 // Check for trailing zeroes.
3048 if (collateVal == 0)
3049 {
3050 if (!trailingZeroIndex)
3051 trailingZeroIndex = (digIndx/2) + 2;
3052 }
3053 else
3054 trailingZeroIndex = 0;
3055
3056 numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6;
3057 }
3058 digIndx++;
3059 }
3060
3061 // Get next character.
3062 if (!collIter_eos(source)){
3063 ch = getNextNormalizedChar(source);
3064 if (U16_IS_LEAD(ch)){
3065 if (!collIter_eos(source)) {
3066 backupState(source, &digitState);
3067 UChar trail = getNextNormalizedChar(source);
3068 if(U16_IS_TRAIL(trail)) {
3069 char32 = U16_GET_SUPPLEMENTARY(ch, trail);
3070 } else {
3071 loadState(source, &digitState, TRUE);
3072 char32 = ch;
3073 }
3074 }
3075 } else {
3076 char32 = ch;
3077 }
3078
3079 if ((digVal = u_charDigitValue(char32)) == -1 || digIndx > UCOL_MAX_DIGITS_FOR_NUMBER){
3080 // Resetting position to point to the next unprocessed char. We
3081 // overshot it when doing our test/set for numbers.
3082 if (char32 > 0xFFFF) { // For surrogates.
3083 loadState(source, &digitState, TRUE);
3084 //goBackOne(source);
3085 }
3086 goBackOne(source);
3087 break;
3088 }
3089 } else {
3090 break;
3091 }
3092 }
3093
3094 if (nonZeroValReached == FALSE){
3095 digIndx = 2;
3096 numTempBuf[2] = 6;
3097 }
3098
3099 endIndex = trailingZeroIndex ? trailingZeroIndex : ((digIndx/2) + 2) ;
3100 if (digIndx % 2 != 0){
3101 /*
3102 We missed a value. Since digIndx isn't even, stuck too many values into the buffer (this is what
3103 we get for padding the first byte with a zero). "Front-patch" now by pushing all nybbles forward.
3104 Doing it this way ensures that at least 50% of the time (statistically speaking) we'll only be doing a
3105 single pass and optimizes for strings with single digits. I'm just assuming that's the more common case.
3106 */
3107
3108 for(i = 2; i < endIndex; i++){
3109 numTempBuf[i] = (((((numTempBuf[i] - 6)/2) % 10) * 10) +
3110 (((numTempBuf[i+1])-6)/2) / 10) * 2 + 6;
3111 }
3112 --digIndx;
3113 }
3114
3115 // Subtract one off of the last byte.
3116 numTempBuf[endIndex-1] -= 1;
3117
3118 /*
3119 We want to skip over the first two slots in the buffer. The first slot
3120 is reserved for the header byte UCOL_CODAN_PLACEHOLDER. The second slot is for the
3121 sign/exponent byte: 0x80 + (decimalPos/2) & 7f.
3122 */
3123 numTempBuf[0] = UCOL_CODAN_PLACEHOLDER;
3124 numTempBuf[1] = (uint8_t)(0x80 + ((digIndx/2) & 0x7F));
3125
3126 // Now transfer the collation key to our collIterate struct.
3127 // The total size for our collation key is endIndx bumped up to the next largest even value divided by two.
3128 //size = ((endIndex+1) & ~1)/2;
3129 CE = (((numTempBuf[0] << 8) | numTempBuf[1]) << UCOL_PRIMARYORDERSHIFT) | //Primary weight
3130 (UCOL_BYTE_COMMON << UCOL_SECONDARYORDERSHIFT) | // Secondary weight
3131 UCOL_BYTE_COMMON; // Tertiary weight.
3132 i = 2; // Reset the index into the buffer.
3133 while(i < endIndex)
3134 {
3135 uint32_t primWeight = numTempBuf[i++] << 8;
3136 if ( i < endIndex)
3137 primWeight |= numTempBuf[i++];
3138 *(source->CEpos++) = (primWeight << UCOL_PRIMARYORDERSHIFT) | UCOL_CONTINUATION_MARKER;
3139 }
3140
3141 } else {
3142 // no numeric mode, we'll just switch to whatever we stashed and continue
3143 CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */
3144 CE = *CEOffset++;
3145 break;
3146 }
3147 return CE;
3148 }
3149 /* various implicits optimization */
3150 case IMPLICIT_TAG: /* everything that is not defined otherwise */
3151 /* UCA is filled with these. Tailorings are NOT_FOUND */
3152 return getImplicit(cp, source);
3153 case CJK_IMPLICIT_TAG: /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D*/
3154 // TODO: remove CJK_IMPLICIT_TAG completely - handled by the getImplicit
3155 return getImplicit(cp, source);
3156 case HANGUL_SYLLABLE_TAG: /* AC00-D7AF*/
3157 {
3158 static const uint32_t
3159 SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7;
3160 //const uint32_t LCount = 19;
3161 static const uint32_t VCount = 21;
3162 static const uint32_t TCount = 28;
3163 //const uint32_t NCount = VCount * TCount; // 588
3164 //const uint32_t SCount = LCount * NCount; // 11172
3165 uint32_t L = ch - SBase;
3166
3167 // divide into pieces
3168
3169 uint32_t T = L % TCount; // we do it in this order since some compilers can do % and / in one operation
3170 L /= TCount;
3171 uint32_t V = L % VCount;
3172 L /= VCount;
3173
3174 // offset them
3175
3176 L += LBase;
3177 V += VBase;
3178 T += TBase;
3179
3180 // return the first CE, but first put the rest into the expansion buffer
3181 if (!source->coll->image->jamoSpecial) { // FAST PATH
3182
3183 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, V);
3184 if (T != TBase) {
3185 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, T);
3186 }
3187
3188 return UTRIE_GET32_FROM_LEAD(&coll->mapping, L);
3189
3190 } else { // Jamo is Special
3191 // Since Hanguls pass the FCD check, it is
3192 // guaranteed that we won't be in
3193 // the normalization buffer if something like this happens
3194
3195 // However, if we are using a uchar iterator and normalization
3196 // is ON, the Hangul that lead us here is going to be in that
3197 // normalization buffer. Here we want to restore the uchar
3198 // iterator state and pull out of the normalization buffer
3199 if(source->iterator != NULL && source->flags & UCOL_ITER_INNORMBUF) {
3200 source->flags = source->origFlags; // restore the iterator
3201 source->pos = NULL;
3202 }
3203
3204 // Move Jamos into normalization buffer
3205 UChar *buffer = source->writableBuffer.getBuffer(4);
3206 int32_t bufferLength;
3207 buffer[0] = (UChar)L;
3208 buffer[1] = (UChar)V;
3209 if (T != TBase) {
3210 buffer[2] = (UChar)T;
3211 bufferLength = 3;
3212 } else {
3213 bufferLength = 2;
3214 }
3215 source->writableBuffer.releaseBuffer(bufferLength);
3216
3217 // Indicate where to continue in main input string after exhausting the writableBuffer
3218 source->fcdPosition = source->pos;
3219
3220 source->pos = source->writableBuffer.getTerminatedBuffer();
3221 source->origFlags = source->flags;
3222 source->flags |= UCOL_ITER_INNORMBUF;
3223 source->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
3224
3225 return(UCOL_IGNORABLE);
3226 }
3227 }
3228 case SURROGATE_TAG:
3229 /* we encountered a leading surrogate. We shall get the CE by using the following code unit */
3230 /* two things can happen here: next code point can be a trailing surrogate - we will use it */
3231 /* to retrieve the CE, or it is not a trailing surrogate (or the string is done). In that case */
3232 /* we treat it like an unassigned code point. */
3233 {
3234 UChar trail;
3235 collIterateState state;
3236 backupState(source, &state);
3237 if (collIter_eos(source) || !(U16_IS_TRAIL((trail = getNextNormalizedChar(source))))) {
3238 // we chould have stepped one char forward and it might have turned that it
3239 // was not a trail surrogate. In that case, we have to backup.
3240 loadState(source, &state, TRUE);
3241 return UCOL_NOT_FOUND;
3242 } else {
3243 /* TODO: CE contain the data from the previous CE + the mask. It should at least be unmasked */
3244 CE = UTRIE_GET32_FROM_OFFSET_TRAIL(&coll->mapping, CE&0xFFFFFF, trail);
3245 if(CE == UCOL_NOT_FOUND) { // there are tailored surrogates in this block, but not this one.
3246 // We need to backup
3247 loadState(source, &state, TRUE);
3248 return CE;
3249 }
3250 // calculate the supplementary code point value, if surrogate was not tailored
3251 cp = ((((uint32_t)ch)<<10UL)+(trail)-(((uint32_t)0xd800<<10UL)+0xdc00-0x10000));
3252 }
3253 }
3254 break;
3255 case LEAD_SURROGATE_TAG: /* D800-DBFF*/
3256 UChar nextChar;
3257 if( source->flags & UCOL_USE_ITERATOR) {
3258 if(U_IS_TRAIL(nextChar = (UChar)source->iterator->current(source->iterator))) {
3259 cp = U16_GET_SUPPLEMENTARY(ch, nextChar);
3260 source->iterator->next(source->iterator);
3261 return getImplicit(cp, source);
3262 }
3263 } else if((((source->flags & UCOL_ITER_HASLEN) == 0 ) || (source->pos<source->endp)) &&
3264 U_IS_TRAIL((nextChar=*source->pos))) {
3265 cp = U16_GET_SUPPLEMENTARY(ch, nextChar);
3266 source->pos++;
3267 return getImplicit(cp, source);
3268 }
3269 return UCOL_NOT_FOUND;
3270 case TRAIL_SURROGATE_TAG: /* DC00-DFFF*/
3271 return UCOL_NOT_FOUND; /* broken surrogate sequence */
3272 case CHARSET_TAG:
3273 /* not yet implemented */
3274 /* probably after 1.8 */
3275 return UCOL_NOT_FOUND;
3276 default:
3277 *status = U_INTERNAL_PROGRAM_ERROR;
3278 CE=0;
3279 break;
3280 }
3281 if (CE <= UCOL_NOT_FOUND) break;
3282 }
3283 return CE;
3284 }
3285
3286
3287 /* now uses Mark's getImplicitPrimary code */
3288 static
getPrevImplicit(UChar32 cp,collIterate * collationSource)3289 inline uint32_t getPrevImplicit(UChar32 cp, collIterate *collationSource) {
3290 uint32_t r = uprv_uca_getImplicitPrimary(cp);
3291
3292 *(collationSource->CEpos++) = (r & UCOL_PRIMARYMASK) | 0x00000505;
3293 collationSource->toReturn = collationSource->CEpos;
3294
3295 // **** doesn't work if using iterator ****
3296 if (collationSource->flags & UCOL_ITER_INNORMBUF) {
3297 collationSource->offsetRepeatCount = 1;
3298 } else {
3299 int32_t firstOffset = (int32_t)(collationSource->pos - collationSource->string);
3300
3301 UErrorCode errorCode = U_ZERO_ERROR;
3302 collationSource->appendOffset(firstOffset, errorCode);
3303 collationSource->appendOffset(firstOffset + 1, errorCode);
3304
3305 collationSource->offsetReturn = collationSource->offsetStore - 1;
3306 *(collationSource->offsetBuffer) = firstOffset;
3307 if (collationSource->offsetReturn == collationSource->offsetBuffer) {
3308 collationSource->offsetStore = collationSource->offsetBuffer;
3309 }
3310 }
3311
3312 return ((r & 0x0000FFFF)<<16) | 0x000000C0;
3313 }
3314
3315 /**
3316 * This function handles the special CEs like contractions, expansions,
3317 * surrogates, Thai.
3318 * It is called by both getPrevCE
3319 */
ucol_prv_getSpecialPrevCE(const UCollator * coll,UChar ch,uint32_t CE,collIterate * source,UErrorCode * status)3320 uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
3321 collIterate *source,
3322 UErrorCode *status)
3323 {
3324 const uint32_t *CEOffset = NULL;
3325 UChar *UCharOffset = NULL;
3326 UChar schar;
3327 const UChar *constart = NULL;
3328 uint32_t size;
3329 UChar buffer[UCOL_MAX_BUFFER];
3330 uint32_t *endCEBuffer;
3331 UChar *strbuffer;
3332 int32_t noChars = 0;
3333 int32_t CECount = 0;
3334
3335 for(;;)
3336 {
3337 /* the only ces that loops are thai and contractions */
3338 switch (getCETag(CE))
3339 {
3340 case NOT_FOUND_TAG: /* this tag always returns */
3341 return CE;
3342
3343 case SPEC_PROC_TAG:
3344 {
3345 // Special processing is getting a CE that is preceded by a certain prefix
3346 // Currently this is only needed for optimizing Japanese length and iteration marks.
3347 // When we encouter a special processing tag, we go backwards and try to see if
3348 // we have a match.
3349 // Contraction tables are used - so the whole process is not unlike contraction.
3350 // prefix data is stored backwards in the table.
3351 const UChar *UCharOffset;
3352 UChar schar, tchar;
3353 collIterateState prefixState;
3354 backupState(source, &prefixState);
3355 for(;;) {
3356 // This loop will run once per source string character, for as long as we
3357 // are matching a potential contraction sequence
3358
3359 // First we position ourselves at the begining of contraction sequence
3360 const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE);
3361
3362 if (collIter_bos(source)) {
3363 CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
3364 break;
3365 }
3366 schar = getPrevNormalizedChar(source, status);
3367 goBackOne(source);
3368
3369 while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
3370 UCharOffset++;
3371 }
3372
3373 if (schar == tchar) {
3374 // Found the source string char in the table.
3375 // Pick up the corresponding CE from the table.
3376 CE = *(coll->contractionCEs +
3377 (UCharOffset - coll->contractionIndex));
3378 }
3379 else
3380 {
3381 // if there is a completely ignorable code point in the middle of
3382 // a prefix, we need to act as if it's not there
3383 // assumption: 'real' noncharacters (*fffe, *ffff, fdd0-fdef are set to zero)
3384 // lone surrogates cannot be set to zero as it would break other processing
3385 uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, schar);
3386 // it's easy for BMP code points
3387 if(isZeroCE == 0) {
3388 continue;
3389 } else if(U16_IS_SURROGATE(schar)) {
3390 // for supplementary code points, we have to check the next one
3391 // situations where we are going to ignore
3392 // 1. beginning of the string: schar is a lone surrogate
3393 // 2. schar is a lone surrogate
3394 // 3. schar is a trail surrogate in a valid surrogate sequence
3395 // that is explicitly set to zero.
3396 if (!collIter_bos(source)) {
3397 UChar lead;
3398 if(!U16_IS_SURROGATE_LEAD(schar) && U16_IS_LEAD(lead = getPrevNormalizedChar(source, status))) {
3399 isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, lead);
3400 if(isSpecial(isZeroCE) && getCETag(isZeroCE) == SURROGATE_TAG) {
3401 uint32_t finalCE = UTRIE_GET32_FROM_OFFSET_TRAIL(&coll->mapping, isZeroCE&0xFFFFFF, schar);
3402 if(finalCE == 0) {
3403 // this is a real, assigned completely ignorable code point
3404 goBackOne(source);
3405 continue;
3406 }
3407 }
3408 } else {
3409 // lone surrogate, treat like unassigned
3410 return UCOL_NOT_FOUND;
3411 }
3412 } else {
3413 // lone surrogate at the beggining, treat like unassigned
3414 return UCOL_NOT_FOUND;
3415 }
3416 }
3417 // Source string char was not in the table.
3418 // We have not found the prefix.
3419 CE = *(coll->contractionCEs +
3420 (ContractionStart - coll->contractionIndex));
3421 }
3422
3423 if(!isPrefix(CE)) {
3424 // The source string char was in the contraction table, and the corresponding
3425 // CE is not a prefix CE. We found the prefix, break
3426 // out of loop, this CE will end up being returned. This is the normal
3427 // way out of prefix handling when the source actually contained
3428 // the prefix.
3429 break;
3430 }
3431 }
3432 loadState(source, &prefixState, TRUE);
3433 break;
3434 }
3435
3436 case CONTRACTION_TAG: {
3437 /* to ensure that the backwards and forwards iteration matches, we
3438 take the current region of most possible match and pass it through
3439 the forward iteration. this will ensure that the obstinate problem of
3440 overlapping contractions will not occur.
3441 */
3442 schar = peekCodeUnit(source, 0);
3443 constart = (UChar *)coll->image + getContractOffset(CE);
3444 if (isAtStartPrevIterate(source)
3445 /* commented away contraction end checks after adding the checks
3446 in getPrevCE */) {
3447 /* start of string or this is not the end of any contraction */
3448 CE = *(coll->contractionCEs +
3449 (constart - coll->contractionIndex));
3450 break;
3451 }
3452 strbuffer = buffer;
3453 UCharOffset = strbuffer + (UCOL_MAX_BUFFER - 1);
3454 *(UCharOffset --) = 0;
3455 noChars = 0;
3456 // have to swap thai characters
3457 while (ucol_unsafeCP(schar, coll)) {
3458 *(UCharOffset) = schar;
3459 noChars++;
3460 UCharOffset --;
3461 schar = getPrevNormalizedChar(source, status);
3462 goBackOne(source);
3463 // TODO: when we exhaust the contraction buffer,
3464 // it needs to get reallocated. The problem is
3465 // that the size depends on the string which is
3466 // not iterated over. However, since we're travelling
3467 // backwards, we already had to set the iterator at
3468 // the end - so we might as well know where we are?
3469 if (UCharOffset + 1 == buffer) {
3470 /* we have exhausted the buffer */
3471 int32_t newsize = 0;
3472 if(source->pos) { // actually dealing with a position
3473 newsize = (int32_t)(source->pos - source->string + 1);
3474 } else { // iterator
3475 newsize = 4 * UCOL_MAX_BUFFER;
3476 }
3477 strbuffer = (UChar *)uprv_malloc(sizeof(UChar) *
3478 (newsize + UCOL_MAX_BUFFER));
3479 /* test for NULL */
3480 if (strbuffer == NULL) {
3481 *status = U_MEMORY_ALLOCATION_ERROR;
3482 return UCOL_NO_MORE_CES;
3483 }
3484 UCharOffset = strbuffer + newsize;
3485 uprv_memcpy(UCharOffset, buffer,
3486 UCOL_MAX_BUFFER * sizeof(UChar));
3487 UCharOffset --;
3488 }
3489 if ((source->pos && (source->pos == source->string ||
3490 ((source->flags & UCOL_ITER_INNORMBUF) &&
3491 *(source->pos - 1) == 0 && source->fcdPosition == NULL)))
3492 || (source->iterator && !source->iterator->hasPrevious(source->iterator))) {
3493 break;
3494 }
3495 }
3496 /* adds the initial base character to the string */
3497 *(UCharOffset) = schar;
3498 noChars++;
3499
3500 int32_t offsetBias;
3501
3502 // **** doesn't work if using iterator ****
3503 if (source->flags & UCOL_ITER_INNORMBUF) {
3504 offsetBias = -1;
3505 } else {
3506 offsetBias = (int32_t)(source->pos - source->string);
3507 }
3508
3509 /* a new collIterate is used to simplify things, since using the current
3510 collIterate will mean that the forward and backwards iteration will
3511 share and change the same buffers. we don't want to get into that. */
3512 collIterate temp;
3513 int32_t rawOffset;
3514
3515 IInit_collIterate(coll, UCharOffset, noChars, &temp, status);
3516 if(U_FAILURE(*status)) {
3517 return (uint32_t)UCOL_NULLORDER;
3518 }
3519 temp.flags &= ~UCOL_ITER_NORM;
3520 temp.flags |= source->flags & UCOL_FORCE_HAN_IMPLICIT;
3521
3522 rawOffset = (int32_t)(temp.pos - temp.string); // should always be zero?
3523 CE = ucol_IGetNextCE(coll, &temp, status);
3524
3525 if (source->extendCEs) {
3526 endCEBuffer = source->extendCEs + source->extendCEsSize;
3527 CECount = (int32_t)((source->CEpos - source->extendCEs)/sizeof(uint32_t));
3528 } else {
3529 endCEBuffer = source->CEs + UCOL_EXPAND_CE_BUFFER_SIZE;
3530 CECount = (int32_t)((source->CEpos - source->CEs)/sizeof(uint32_t));
3531 }
3532
3533 while (CE != UCOL_NO_MORE_CES) {
3534 *(source->CEpos ++) = CE;
3535
3536 if (offsetBias >= 0) {
3537 source->appendOffset(rawOffset + offsetBias, *status);
3538 }
3539
3540 CECount++;
3541 if (source->CEpos == endCEBuffer) {
3542 /* ran out of CE space, reallocate to new buffer.
3543 If reallocation fails, reset pointers and bail out,
3544 there's no guarantee of the right character position after
3545 this bail*/
3546 if (!increaseCEsCapacity(source)) {
3547 *status = U_MEMORY_ALLOCATION_ERROR;
3548 break;
3549 }
3550
3551 endCEBuffer = source->extendCEs + source->extendCEsSize;
3552 }
3553
3554 if ((temp.flags & UCOL_ITER_INNORMBUF) != 0) {
3555 rawOffset = (int32_t)(temp.fcdPosition - temp.string);
3556 } else {
3557 rawOffset = (int32_t)(temp.pos - temp.string);
3558 }
3559
3560 CE = ucol_IGetNextCE(coll, &temp, status);
3561 }
3562
3563 if (strbuffer != buffer) {
3564 uprv_free(strbuffer);
3565 }
3566 if (U_FAILURE(*status)) {
3567 return (uint32_t)UCOL_NULLORDER;
3568 }
3569
3570 if (source->offsetRepeatValue != 0) {
3571 if (CECount > noChars) {
3572 source->offsetRepeatCount += temp.offsetRepeatCount;
3573 } else {
3574 // **** does this really skip the right offsets? ****
3575 source->offsetReturn -= (noChars - CECount);
3576 }
3577 }
3578
3579 if (offsetBias >= 0) {
3580 source->offsetReturn = source->offsetStore - 1;
3581 if (source->offsetReturn == source->offsetBuffer) {
3582 source->offsetStore = source->offsetBuffer;
3583 }
3584 }
3585
3586 source->toReturn = source->CEpos - 1;
3587 if (source->toReturn == source->CEs) {
3588 source->CEpos = source->CEs;
3589 }
3590
3591 return *(source->toReturn);
3592 }
3593 case LONG_PRIMARY_TAG:
3594 {
3595 *(source->CEpos++) = ((CE & 0xFFFF00) << 8) | (UCOL_BYTE_COMMON << 8) | UCOL_BYTE_COMMON;
3596 *(source->CEpos++) = ((CE & 0xFF)<<24)|UCOL_CONTINUATION_MARKER;
3597 source->toReturn = source->CEpos - 1;
3598
3599 if (source->flags & UCOL_ITER_INNORMBUF) {
3600 source->offsetRepeatCount = 1;
3601 } else {
3602 int32_t firstOffset = (int32_t)(source->pos - source->string);
3603
3604 source->appendOffset(firstOffset, *status);
3605 source->appendOffset(firstOffset + 1, *status);
3606
3607 source->offsetReturn = source->offsetStore - 1;
3608 *(source->offsetBuffer) = firstOffset;
3609 if (source->offsetReturn == source->offsetBuffer) {
3610 source->offsetStore = source->offsetBuffer;
3611 }
3612 }
3613
3614
3615 return *(source->toReturn);
3616 }
3617
3618 case EXPANSION_TAG: /* this tag always returns */
3619 {
3620 /*
3621 This should handle expansion.
3622 NOTE: we can encounter both continuations and expansions in an expansion!
3623 I have to decide where continuations are going to be dealt with
3624 */
3625 int32_t firstOffset = (int32_t)(source->pos - source->string);
3626
3627 // **** doesn't work if using iterator ****
3628 if (source->offsetReturn != NULL) {
3629 if (! (source->flags & UCOL_ITER_INNORMBUF) && source->offsetReturn == source->offsetBuffer) {
3630 source->offsetStore = source->offsetBuffer;
3631 }else {
3632 firstOffset = -1;
3633 }
3634 }
3635
3636 /* find the offset to expansion table */
3637 CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE);
3638 size = getExpansionCount(CE);
3639 if (size != 0) {
3640 /*
3641 if there are less than 16 elements in expansion, we don't terminate
3642 */
3643 uint32_t count;
3644
3645 for (count = 0; count < size; count++) {
3646 *(source->CEpos ++) = *CEOffset++;
3647
3648 if (firstOffset >= 0) {
3649 source->appendOffset(firstOffset + 1, *status);
3650 }
3651 }
3652 } else {
3653 /* else, we do */
3654 while (*CEOffset != 0) {
3655 *(source->CEpos ++) = *CEOffset ++;
3656
3657 if (firstOffset >= 0) {
3658 source->appendOffset(firstOffset + 1, *status);
3659 }
3660 }
3661 }
3662
3663 if (firstOffset >= 0) {
3664 source->offsetReturn = source->offsetStore - 1;
3665 *(source->offsetBuffer) = firstOffset;
3666 if (source->offsetReturn == source->offsetBuffer) {
3667 source->offsetStore = source->offsetBuffer;
3668 }
3669 } else {
3670 source->offsetRepeatCount += size - 1;
3671 }
3672
3673 source->toReturn = source->CEpos - 1;
3674 // in case of one element expansion, we
3675 // want to immediately return CEpos
3676 if(source->toReturn == source->CEs) {
3677 source->CEpos = source->CEs;
3678 }
3679
3680 return *(source->toReturn);
3681 }
3682
3683 case DIGIT_TAG:
3684 {
3685 /*
3686 We do a check to see if we want to collate digits as numbers; if so we generate
3687 a custom collation key. Otherwise we pull out the value stored in the expansion table.
3688 */
3689 uint32_t i; /* general counter */
3690
3691 if (source->coll->numericCollation == UCOL_ON){
3692 uint32_t digIndx = 0;
3693 uint32_t endIndex = 0;
3694 uint32_t leadingZeroIndex = 0;
3695 uint32_t trailingZeroCount = 0;
3696
3697 uint8_t collateVal = 0;
3698
3699 UBool nonZeroValReached = FALSE;
3700
3701 uint8_t numTempBuf[UCOL_MAX_DIGITS_FOR_NUMBER/2 + 2]; // I just need a temporary place to store my generated CEs.
3702 /*
3703 We parse the source string until we hit a char that's NOT a digit.
3704 Use this u_charDigitValue. This might be slow because we have to
3705 handle surrogates...
3706 */
3707 /*
3708 We need to break up the digit string into collection elements of UCOL_MAX_DIGITS_FOR_NUMBER or less,
3709 with any chunks smaller than that being on the right end of the digit string - i.e. the first collation
3710 element we process when going backward. To determine how long that chunk might be, we may need to make
3711 two passes through the loop that collects digits - one to see how long the string is (and how much is
3712 leading zeros) to determine the length of that right-hand chunk, and a second (if the whole string has
3713 more than UCOL_MAX_DIGITS_FOR_NUMBER non-leading-zero digits) to actually process that collation
3714 element chunk after resetting the state to the initialState at the right side of the digit string.
3715 */
3716 uint32_t ceLimit = 0;
3717 UChar initial_ch = ch;
3718 collIterateState initialState = {0,0,0,0,0,0,0,0,0};
3719 backupState(source, &initialState);
3720
3721 for(;;) {
3722 collIterateState state = {0,0,0,0,0,0,0,0,0};
3723 UChar32 char32 = 0;
3724 int32_t digVal = 0;
3725
3726 if (U16_IS_TRAIL (ch)) {
3727 if (!collIter_bos(source)){
3728 UChar lead = getPrevNormalizedChar(source, status);
3729 if(U16_IS_LEAD(lead)) {
3730 char32 = U16_GET_SUPPLEMENTARY(lead,ch);
3731 goBackOne(source);
3732 } else {
3733 char32 = ch;
3734 }
3735 } else {
3736 char32 = ch;
3737 }
3738 } else {
3739 char32 = ch;
3740 }
3741 digVal = u_charDigitValue(char32);
3742
3743 for(;;) {
3744 // Make sure we have enough space. No longer needed;
3745 // at this point the largest value of digIndx when we need to save data in numTempBuf
3746 // is UCOL_MAX_DIGITS_FOR_NUMBER-1 (digIndx is post-incremented) so we just ensure
3747 // that numTempBuf is big enough (UCOL_MAX_DIGITS_FOR_NUMBER/2 + 2).
3748
3749 // Skip over trailing zeroes, and keep a count of them.
3750 if (digVal != 0)
3751 nonZeroValReached = TRUE;
3752
3753 if (nonZeroValReached) {
3754 /*
3755 We parse the digit string into base 100 numbers (this fits into a byte).
3756 We only add to the buffer in twos, thus if we are parsing an odd character,
3757 that serves as the 'tens' digit while the if we are parsing an even one, that
3758 is the 'ones' digit. We dumped the parsed base 100 value (collateVal) into
3759 a buffer. We multiply each collateVal by 2 (to give us room) and add 5 (to avoid
3760 overlapping magic CE byte values). The last byte we subtract 1 to ensure it is less
3761 than all the other bytes.
3762
3763 Since we're doing in this reverse we want to put the first digit encountered into the
3764 ones place and the second digit encountered into the tens place.
3765 */
3766
3767 if ((digIndx + trailingZeroCount) % 2 == 1) {
3768 // High-order digit case (tens place)
3769 collateVal += (uint8_t)(digVal * 10);
3770
3771 // We cannot set leadingZeroIndex unless it has been set for the
3772 // low-order digit. Therefore, all we can do for the high-order
3773 // digit is turn it off, never on.
3774 // The only time we will have a high digit without a low is for
3775 // the very first non-zero digit, so no zero check is necessary.
3776 if (collateVal != 0)
3777 leadingZeroIndex = 0;
3778
3779 // The first pass through, digIndx may exceed the limit, but in that case
3780 // we no longer care about numTempBuf contents since they will be discarded
3781 if ( digIndx < UCOL_MAX_DIGITS_FOR_NUMBER ) {
3782 numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6;
3783 }
3784 collateVal = 0;
3785 } else {
3786 // Low-order digit case (ones place)
3787 collateVal = (uint8_t)digVal;
3788
3789 // Check for leading zeroes.
3790 if (collateVal == 0) {
3791 if (!leadingZeroIndex)
3792 leadingZeroIndex = (digIndx/2) + 2;
3793 } else
3794 leadingZeroIndex = 0;
3795
3796 // No need to write to buffer; the case of a last odd digit
3797 // is handled below.
3798 }
3799 ++digIndx;
3800 } else
3801 ++trailingZeroCount;
3802
3803 if (!collIter_bos(source)) {
3804 ch = getPrevNormalizedChar(source, status);
3805 //goBackOne(source);
3806 if (U16_IS_TRAIL(ch)) {
3807 backupState(source, &state);
3808 if (!collIter_bos(source)) {
3809 goBackOne(source);
3810 UChar lead = getPrevNormalizedChar(source, status);
3811
3812 if(U16_IS_LEAD(lead)) {
3813 char32 = U16_GET_SUPPLEMENTARY(lead,ch);
3814 } else {
3815 loadState(source, &state, FALSE);
3816 char32 = ch;
3817 }
3818 }
3819 } else
3820 char32 = ch;
3821
3822 if ((digVal = u_charDigitValue(char32)) == -1 || (ceLimit > 0 && (digIndx + trailingZeroCount) >= ceLimit)) {
3823 if (char32 > 0xFFFF) {// For surrogates.
3824 loadState(source, &state, FALSE);
3825 }
3826 // Don't need to "reverse" the goBackOne call,
3827 // as this points to the next position to process..
3828 //if (char32 > 0xFFFF) // For surrogates.
3829 //getNextNormalizedChar(source);
3830 break;
3831 }
3832
3833 goBackOne(source);
3834 }else
3835 break;
3836 }
3837
3838 if (digIndx + trailingZeroCount <= UCOL_MAX_DIGITS_FOR_NUMBER) {
3839 // our collation element is not too big, go ahead and finish with it
3840 break;
3841 }
3842 // our digit string is too long for a collation element;
3843 // set the limit for it, reset the state and begin again
3844 ceLimit = (digIndx + trailingZeroCount) % UCOL_MAX_DIGITS_FOR_NUMBER;
3845 if ( ceLimit == 0 ) {
3846 ceLimit = UCOL_MAX_DIGITS_FOR_NUMBER;
3847 }
3848 ch = initial_ch;
3849 loadState(source, &initialState, FALSE);
3850 digIndx = endIndex = leadingZeroIndex = trailingZeroCount = 0;
3851 collateVal = 0;
3852 nonZeroValReached = FALSE;
3853 }
3854
3855 if (! nonZeroValReached) {
3856 digIndx = 2;
3857 trailingZeroCount = 0;
3858 numTempBuf[2] = 6;
3859 }
3860
3861 if ((digIndx + trailingZeroCount) % 2 != 0) {
3862 numTempBuf[((digIndx)/2) + 2] = collateVal*2 + 6;
3863 digIndx += 1; // The implicit leading zero
3864 }
3865 if (trailingZeroCount % 2 != 0) {
3866 // We had to consume one trailing zero for the low digit
3867 // of the least significant byte
3868 digIndx += 1; // The trailing zero not in the exponent
3869 trailingZeroCount -= 1;
3870 }
3871
3872 endIndex = leadingZeroIndex ? leadingZeroIndex : ((digIndx/2) + 2) ;
3873
3874 // Subtract one off of the last byte. Really the first byte here, but it's reversed...
3875 numTempBuf[2] -= 1;
3876
3877 /*
3878 We want to skip over the first two slots in the buffer. The first slot
3879 is reserved for the header byte UCOL_CODAN_PLACEHOLDER. The second slot is for the
3880 sign/exponent byte: 0x80 + (decimalPos/2) & 7f.
3881 The exponent must be adjusted by the number of leading zeroes, and the number of
3882 trailing zeroes.
3883 */
3884 numTempBuf[0] = UCOL_CODAN_PLACEHOLDER;
3885 uint32_t exponent = (digIndx+trailingZeroCount)/2;
3886 if (leadingZeroIndex)
3887 exponent -= ((digIndx/2) + 2 - leadingZeroIndex);
3888 numTempBuf[1] = (uint8_t)(0x80 + (exponent & 0x7F));
3889
3890 // Now transfer the collation key to our collIterate struct.
3891 // The total size for our collation key is half of endIndex, rounded up.
3892 int32_t size = (endIndex+1)/2;
3893 if(!ensureCEsCapacity(source, size)) {
3894 return (uint32_t)UCOL_NULLORDER;
3895 }
3896 *(source->CEpos++) = (((numTempBuf[0] << 8) | numTempBuf[1]) << UCOL_PRIMARYORDERSHIFT) | //Primary weight
3897 (UCOL_BYTE_COMMON << UCOL_SECONDARYORDERSHIFT) | // Secondary weight
3898 UCOL_BYTE_COMMON; // Tertiary weight.
3899 i = endIndex - 1; // Reset the index into the buffer.
3900 while(i >= 2) {
3901 uint32_t primWeight = numTempBuf[i--] << 8;
3902 if ( i >= 2)
3903 primWeight |= numTempBuf[i--];
3904 *(source->CEpos++) = (primWeight << UCOL_PRIMARYORDERSHIFT) | UCOL_CONTINUATION_MARKER;
3905 }
3906
3907 source->toReturn = source->CEpos -1;
3908 return *(source->toReturn);
3909 } else {
3910 CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE);
3911 CE = *(CEOffset++);
3912 break;
3913 }
3914 }
3915
3916 case HANGUL_SYLLABLE_TAG: /* AC00-D7AF*/
3917 {
3918 static const uint32_t
3919 SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7;
3920 //const uint32_t LCount = 19;
3921 static const uint32_t VCount = 21;
3922 static const uint32_t TCount = 28;
3923 //const uint32_t NCount = VCount * TCount; /* 588 */
3924 //const uint32_t SCount = LCount * NCount; /* 11172 */
3925
3926 uint32_t L = ch - SBase;
3927 /*
3928 divide into pieces.
3929 we do it in this order since some compilers can do % and / in one
3930 operation
3931 */
3932 uint32_t T = L % TCount;
3933 L /= TCount;
3934 uint32_t V = L % VCount;
3935 L /= VCount;
3936
3937 /* offset them */
3938 L += LBase;
3939 V += VBase;
3940 T += TBase;
3941
3942 int32_t firstOffset = (int32_t)(source->pos - source->string);
3943 source->appendOffset(firstOffset, *status);
3944
3945 /*
3946 * return the first CE, but first put the rest into the expansion buffer
3947 */
3948 if (!source->coll->image->jamoSpecial) {
3949 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, L);
3950 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, V);
3951 source->appendOffset(firstOffset + 1, *status);
3952
3953 if (T != TBase) {
3954 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, T);
3955 source->appendOffset(firstOffset + 1, *status);
3956 }
3957
3958 source->toReturn = source->CEpos - 1;
3959
3960 source->offsetReturn = source->offsetStore - 1;
3961 if (source->offsetReturn == source->offsetBuffer) {
3962 source->offsetStore = source->offsetBuffer;
3963 }
3964
3965 return *(source->toReturn);
3966 } else {
3967 // Since Hanguls pass the FCD check, it is
3968 // guaranteed that we won't be in
3969 // the normalization buffer if something like this happens
3970
3971 // Move Jamos into normalization buffer
3972 UChar *tempbuffer = source->writableBuffer.getBuffer(5);
3973 int32_t tempbufferLength, jamoOffset;
3974 tempbuffer[0] = 0;
3975 tempbuffer[1] = (UChar)L;
3976 tempbuffer[2] = (UChar)V;
3977 if (T != TBase) {
3978 tempbuffer[3] = (UChar)T;
3979 tempbufferLength = 4;
3980 } else {
3981 tempbufferLength = 3;
3982 }
3983 source->writableBuffer.releaseBuffer(tempbufferLength);
3984
3985 // Indicate where to continue in main input string after exhausting the writableBuffer
3986 if (source->pos == source->string) {
3987 jamoOffset = 0;
3988 source->fcdPosition = NULL;
3989 } else {
3990 jamoOffset = source->pos - source->string;
3991 source->fcdPosition = source->pos-1;
3992 }
3993
3994 // Append offsets for the additional chars
3995 // (not the 0, and not the L whose offsets match the original Hangul)
3996 int32_t jamoRemaining = tempbufferLength - 2;
3997 jamoOffset++; // appended offsets should match end of original Hangul
3998 while (jamoRemaining-- > 0) {
3999 source->appendOffset(jamoOffset, *status);
4000 }
4001
4002 source->offsetRepeatValue = jamoOffset;
4003
4004 source->offsetReturn = source->offsetStore - 1;
4005 if (source->offsetReturn == source->offsetBuffer) {
4006 source->offsetStore = source->offsetBuffer;
4007 }
4008
4009 source->pos = source->writableBuffer.getTerminatedBuffer() + tempbufferLength;
4010 source->origFlags = source->flags;
4011 source->flags |= UCOL_ITER_INNORMBUF;
4012 source->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
4013
4014 return(UCOL_IGNORABLE);
4015 }
4016 }
4017
4018 case IMPLICIT_TAG: /* everything that is not defined otherwise */
4019 return getPrevImplicit(ch, source);
4020
4021 // TODO: Remove CJK implicits as they are handled by the getImplicitPrimary function
4022 case CJK_IMPLICIT_TAG: /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D*/
4023 return getPrevImplicit(ch, source);
4024
4025 case SURROGATE_TAG: /* This is a surrogate pair */
4026 /* essentially an engaged lead surrogate. */
4027 /* if you have encountered it here, it means that a */
4028 /* broken sequence was encountered and this is an error */
4029 return UCOL_NOT_FOUND;
4030
4031 case LEAD_SURROGATE_TAG: /* D800-DBFF*/
4032 return UCOL_NOT_FOUND; /* broken surrogate sequence */
4033
4034 case TRAIL_SURROGATE_TAG: /* DC00-DFFF*/
4035 {
4036 UChar32 cp = 0;
4037 UChar prevChar;
4038 const UChar *prev;
4039 if (isAtStartPrevIterate(source)) {
4040 /* we are at the start of the string, wrong place to be at */
4041 return UCOL_NOT_FOUND;
4042 }
4043 if (source->pos != source->writableBuffer.getBuffer()) {
4044 prev = source->pos - 1;
4045 } else {
4046 prev = source->fcdPosition;
4047 }
4048 prevChar = *prev;
4049
4050 /* Handles Han and Supplementary characters here.*/
4051 if (U16_IS_LEAD(prevChar)) {
4052 cp = ((((uint32_t)prevChar)<<10UL)+(ch)-(((uint32_t)0xd800<<10UL)+0xdc00-0x10000));
4053 source->pos = prev;
4054 } else {
4055 return UCOL_NOT_FOUND; /* like unassigned */
4056 }
4057
4058 return getPrevImplicit(cp, source);
4059 }
4060
4061 /* UCA is filled with these. Tailorings are NOT_FOUND */
4062 /* not yet implemented */
4063 case CHARSET_TAG: /* this tag always returns */
4064 /* probably after 1.8 */
4065 return UCOL_NOT_FOUND;
4066
4067 default: /* this tag always returns */
4068 *status = U_INTERNAL_PROGRAM_ERROR;
4069 CE=0;
4070 break;
4071 }
4072
4073 if (CE <= UCOL_NOT_FOUND) {
4074 break;
4075 }
4076 }
4077
4078 return CE;
4079 }
4080
4081 /* This should really be a macro */
4082 /* This function is used to reverse parts of a buffer. We need this operation when doing continuation */
4083 /* secondaries in French */
4084 /*
4085 void uprv_ucol_reverse_buffer(uint8_t *start, uint8_t *end) {
4086 uint8_t temp;
4087 while(start<end) {
4088 temp = *start;
4089 *start++ = *end;
4090 *end-- = temp;
4091 }
4092 }
4093 */
4094
/*
 * Reverses, in place, the elements lying between two pointers (both ends
 * inclusive). TYPE is the element type used for the swap temporary.
 * start and end must be modifiable pointer lvalues: the macro advances start
 * and retreats end until they meet or cross, so both are changed on exit.
 * Every argument is expanded more than once.
 */
#define uprv_ucol_reverse_buffer(TYPE, start, end) { \
    while((start)<(end)) { \
        TYPE swapped = *(start); \
        *(start)++ = *(end); \
        *(end)-- = swapped; \
    } \
}
4103
4104 /****************************************************************************/
4105 /* Following are the sortkey generation functions */
4106 /* */
4107 /****************************************************************************/
4108
4109 U_CAPI int32_t U_EXPORT2
ucol_mergeSortkeys(const uint8_t * src1,int32_t src1Length,const uint8_t * src2,int32_t src2Length,uint8_t * dest,int32_t destCapacity)4110 ucol_mergeSortkeys(const uint8_t *src1, int32_t src1Length,
4111 const uint8_t *src2, int32_t src2Length,
4112 uint8_t *dest, int32_t destCapacity) {
4113 /* check arguments */
4114 if( src1==NULL || src1Length<-1 || src1Length==0 || (src1Length>0 && src1[src1Length-1]!=0) ||
4115 src2==NULL || src2Length<-1 || src2Length==0 || (src2Length>0 && src2[src2Length-1]!=0) ||
4116 destCapacity<0 || (destCapacity>0 && dest==NULL)
4117 ) {
4118 /* error, attempt to write a zero byte and return 0 */
4119 if(dest!=NULL && destCapacity>0) {
4120 *dest=0;
4121 }
4122 return 0;
4123 }
4124
4125 /* check lengths and capacity */
4126 if(src1Length<0) {
4127 src1Length=(int32_t)uprv_strlen((const char *)src1)+1;
4128 }
4129 if(src2Length<0) {
4130 src2Length=(int32_t)uprv_strlen((const char *)src2)+1;
4131 }
4132
4133 int32_t destLength=src1Length+src2Length;
4134 if(destLength>destCapacity) {
4135 /* the merged sort key does not fit into the destination */
4136 return destLength;
4137 }
4138
4139 /* merge the sort keys with the same number of levels */
4140 uint8_t *p=dest;
4141 for(;;) {
4142 /* copy level from src1 not including 00 or 01 */
4143 uint8_t b;
4144 while((b=*src1)>=2) {
4145 ++src1;
4146 *p++=b;
4147 }
4148
4149 /* add a 02 merge separator */
4150 *p++=2;
4151
4152 /* copy level from src2 not including 00 or 01 */
4153 while((b=*src2)>=2) {
4154 ++src2;
4155 *p++=b;
4156 }
4157
4158 /* if both sort keys have another level, then add a 01 level separator and continue */
4159 if(*src1==1 && *src2==1) {
4160 ++src1;
4161 ++src2;
4162 *p++=1;
4163 } else {
4164 break;
4165 }
4166 }
4167
4168 /*
4169 * here, at least one sort key is finished now, but the other one
4170 * might have some contents left from containing more levels;
4171 * that contents is just appended to the result
4172 */
4173 if(*src1!=0) {
4174 /* src1 is not finished, therefore *src2==0, and src1 is appended */
4175 src2=src1;
4176 }
4177 /* append src2, "the other, unfinished sort key" */
4178 while((*p++=*src2++)!=0) {}
4179
4180 /* the actual length might be less than destLength if either sort key contained illegally embedded zero bytes */
4181 return (int32_t)(p-dest);
4182 }
4183
4184 U_NAMESPACE_BEGIN
4185
4186 class SortKeyByteSink : public ByteSink {
4187 public:
SortKeyByteSink(char * dest,int32_t destCapacity)4188 SortKeyByteSink(char *dest, int32_t destCapacity)
4189 : buffer_(dest), capacity_(destCapacity),
4190 appended_(0) {
4191 if (buffer_ == NULL) {
4192 capacity_ = 0;
4193 } else if(capacity_ < 0) {
4194 buffer_ = NULL;
4195 capacity_ = 0;
4196 }
4197 }
4198 virtual ~SortKeyByteSink();
4199
4200 virtual void Append(const char *bytes, int32_t n);
Append(uint32_t b)4201 void Append(uint32_t b) {
4202 if (appended_ < capacity_ || Resize(1, appended_)) {
4203 buffer_[appended_] = (char)b;
4204 }
4205 ++appended_;
4206 }
Append(uint32_t b1,uint32_t b2)4207 void Append(uint32_t b1, uint32_t b2) {
4208 int32_t a2 = appended_ + 2;
4209 if (a2 <= capacity_ || Resize(2, appended_)) {
4210 buffer_[appended_] = (char)b1;
4211 buffer_[appended_ + 1] = (char)b2;
4212 } else if(appended_ < capacity_) {
4213 buffer_[appended_] = (char)b1;
4214 }
4215 appended_ = a2;
4216 }
4217 virtual char *GetAppendBuffer(int32_t min_capacity,
4218 int32_t desired_capacity_hint,
4219 char *scratch, int32_t scratch_capacity,
4220 int32_t *result_capacity);
NumberOfBytesAppended() const4221 int32_t NumberOfBytesAppended() const { return appended_; }
4222 /** @return FALSE if memory allocation failed */
IsOk() const4223 UBool IsOk() const { return buffer_ != NULL; }
4224
4225 protected:
4226 virtual void AppendBeyondCapacity(const char *bytes, int32_t n, int32_t length) = 0;
4227 virtual UBool Resize(int32_t appendCapacity, int32_t length) = 0;
4228
SetNotOk()4229 void SetNotOk() {
4230 buffer_ = NULL;
4231 capacity_ = 0;
4232 }
4233
4234 char *buffer_;
4235 int32_t capacity_;
4236 int32_t appended_;
4237
4238 private:
4239 SortKeyByteSink(const SortKeyByteSink &); // copy constructor not implemented
4240 SortKeyByteSink &operator=(const SortKeyByteSink &); // assignment operator not implemented
4241 };
4242
~SortKeyByteSink()4243 SortKeyByteSink::~SortKeyByteSink() {}
4244
4245 void
Append(const char * bytes,int32_t n)4246 SortKeyByteSink::Append(const char *bytes, int32_t n) {
4247 if (n <= 0 || bytes == NULL) {
4248 return;
4249 }
4250 int32_t length = appended_;
4251 appended_ += n;
4252 if ((buffer_ + length) == bytes) {
4253 return; // the caller used GetAppendBuffer() and wrote the bytes already
4254 }
4255 int32_t available = capacity_ - length;
4256 if (n <= available) {
4257 uprv_memcpy(buffer_ + length, bytes, n);
4258 } else {
4259 AppendBeyondCapacity(bytes, n, length);
4260 }
4261 }
4262
4263 char *
GetAppendBuffer(int32_t min_capacity,int32_t desired_capacity_hint,char * scratch,int32_t scratch_capacity,int32_t * result_capacity)4264 SortKeyByteSink::GetAppendBuffer(int32_t min_capacity,
4265 int32_t desired_capacity_hint,
4266 char *scratch,
4267 int32_t scratch_capacity,
4268 int32_t *result_capacity) {
4269 if (min_capacity < 1 || scratch_capacity < min_capacity) {
4270 *result_capacity = 0;
4271 return NULL;
4272 }
4273 int32_t available = capacity_ - appended_;
4274 if (available >= min_capacity) {
4275 *result_capacity = available;
4276 return buffer_ + appended_;
4277 } else if (Resize(desired_capacity_hint, appended_)) {
4278 *result_capacity = capacity_ - appended_;
4279 return buffer_ + appended_;
4280 } else {
4281 *result_capacity = scratch_capacity;
4282 return scratch;
4283 }
4284 }
4285
4286 class FixedSortKeyByteSink : public SortKeyByteSink {
4287 public:
FixedSortKeyByteSink(char * dest,int32_t destCapacity)4288 FixedSortKeyByteSink(char *dest, int32_t destCapacity)
4289 : SortKeyByteSink(dest, destCapacity) {}
4290 virtual ~FixedSortKeyByteSink();
4291
4292 private:
4293 virtual void AppendBeyondCapacity(const char *bytes, int32_t n, int32_t length);
4294 virtual UBool Resize(int32_t appendCapacity, int32_t length);
4295 };
4296
~FixedSortKeyByteSink()4297 FixedSortKeyByteSink::~FixedSortKeyByteSink() {}
4298
4299 void
AppendBeyondCapacity(const char * bytes,int32_t,int32_t length)4300 FixedSortKeyByteSink::AppendBeyondCapacity(const char *bytes, int32_t /*n*/, int32_t length) {
4301 // buffer_ != NULL && bytes != NULL && n > 0 && appended_ > capacity_
4302 // Fill the buffer completely.
4303 int32_t available = capacity_ - length;
4304 if (available > 0) {
4305 uprv_memcpy(buffer_ + length, bytes, available);
4306 }
4307 }
4308
4309 UBool
Resize(int32_t,int32_t)4310 FixedSortKeyByteSink::Resize(int32_t /*appendCapacity*/, int32_t /*length*/) {
4311 return FALSE;
4312 }
4313
4314 class CollationKeyByteSink : public SortKeyByteSink {
4315 public:
CollationKeyByteSink(CollationKey & key)4316 CollationKeyByteSink(CollationKey &key)
4317 : SortKeyByteSink(reinterpret_cast<char *>(key.getBytes()), key.getCapacity()),
4318 key_(key) {}
4319 virtual ~CollationKeyByteSink();
4320
4321 private:
4322 virtual void AppendBeyondCapacity(const char *bytes, int32_t n, int32_t length);
4323 virtual UBool Resize(int32_t appendCapacity, int32_t length);
4324
4325 CollationKey &key_;
4326 };
4327
~CollationKeyByteSink()4328 CollationKeyByteSink::~CollationKeyByteSink() {}
4329
4330 void
AppendBeyondCapacity(const char * bytes,int32_t n,int32_t length)4331 CollationKeyByteSink::AppendBeyondCapacity(const char *bytes, int32_t n, int32_t length) {
4332 // buffer_ != NULL && bytes != NULL && n > 0 && appended_ > capacity_
4333 if (Resize(n, length)) {
4334 uprv_memcpy(buffer_ + length, bytes, n);
4335 }
4336 }
4337
4338 UBool
Resize(int32_t appendCapacity,int32_t length)4339 CollationKeyByteSink::Resize(int32_t appendCapacity, int32_t length) {
4340 if (buffer_ == NULL) {
4341 return FALSE; // allocation failed before already
4342 }
4343 int32_t newCapacity = 2 * capacity_;
4344 int32_t altCapacity = length + 2 * appendCapacity;
4345 if (newCapacity < altCapacity) {
4346 newCapacity = altCapacity;
4347 }
4348 if (newCapacity < 200) {
4349 newCapacity = 200;
4350 }
4351 uint8_t *newBuffer = key_.reallocate(newCapacity, length);
4352 if (newBuffer == NULL) {
4353 SetNotOk();
4354 return FALSE;
4355 }
4356 buffer_ = reinterpret_cast<char *>(newBuffer);
4357 capacity_ = newCapacity;
4358 return TRUE;
4359 }
4360
4361 /**
4362 * uint8_t byte buffer, similar to CharString but simpler.
4363 */
4364 class SortKeyLevel : public UMemory {
4365 public:
SortKeyLevel()4366 SortKeyLevel() : len(0), ok(TRUE) {}
~SortKeyLevel()4367 ~SortKeyLevel() {}
4368
4369 /** @return FALSE if memory allocation failed */
isOk() const4370 UBool isOk() const { return ok; }
isEmpty() const4371 UBool isEmpty() const { return len == 0; }
length() const4372 int32_t length() const { return len; }
data() const4373 const uint8_t *data() const { return buffer.getAlias(); }
operator [](int32_t index) const4374 uint8_t operator[](int32_t index) const { return buffer[index]; }
4375
4376 void appendByte(uint32_t b);
4377
appendTo(ByteSink & sink) const4378 void appendTo(ByteSink &sink) const {
4379 sink.Append(reinterpret_cast<const char *>(buffer.getAlias()), len);
4380 }
4381
lastByte()4382 uint8_t &lastByte() {
4383 U_ASSERT(len > 0);
4384 return buffer[len - 1];
4385 }
4386
getLastFewBytes(int32_t n)4387 uint8_t *getLastFewBytes(int32_t n) {
4388 if (ok && len >= n) {
4389 return buffer.getAlias() + len - n;
4390 } else {
4391 return NULL;
4392 }
4393 }
4394
4395 private:
4396 MaybeStackArray<uint8_t, 40> buffer;
4397 int32_t len;
4398 UBool ok;
4399
4400 UBool ensureCapacity(int32_t appendCapacity);
4401
4402 SortKeyLevel(const SortKeyLevel &other); // forbid copying of this class
4403 SortKeyLevel &operator=(const SortKeyLevel &other); // forbid copying of this class
4404 };
4405
appendByte(uint32_t b)4406 void SortKeyLevel::appendByte(uint32_t b) {
4407 if(len < buffer.getCapacity() || ensureCapacity(1)) {
4408 buffer[len++] = (uint8_t)b;
4409 }
4410 }
4411
ensureCapacity(int32_t appendCapacity)4412 UBool SortKeyLevel::ensureCapacity(int32_t appendCapacity) {
4413 if(!ok) {
4414 return FALSE;
4415 }
4416 int32_t newCapacity = 2 * buffer.getCapacity();
4417 int32_t altCapacity = len + 2 * appendCapacity;
4418 if (newCapacity < altCapacity) {
4419 newCapacity = altCapacity;
4420 }
4421 if (newCapacity < 200) {
4422 newCapacity = 200;
4423 }
4424 if(buffer.resize(newCapacity, len)==NULL) {
4425 return ok = FALSE;
4426 }
4427 return TRUE;
4428 }
4429
4430 U_NAMESPACE_END
4431
4432 /* sortkey API */
4433 U_CAPI int32_t U_EXPORT2
ucol_getSortKey(const UCollator * coll,const UChar * source,int32_t sourceLength,uint8_t * result,int32_t resultLength)4434 ucol_getSortKey(const UCollator *coll,
4435 const UChar *source,
4436 int32_t sourceLength,
4437 uint8_t *result,
4438 int32_t resultLength)
4439 {
4440 UTRACE_ENTRY(UTRACE_UCOL_GET_SORTKEY);
4441 if (UTRACE_LEVEL(UTRACE_VERBOSE)) {
4442 UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source string = %vh ", coll, source,
4443 ((sourceLength==-1 && source!=NULL) ? u_strlen(source) : sourceLength));
4444 }
4445
4446 if(coll->delegate != NULL) {
4447 return ((const Collator*)coll->delegate)->getSortKey(source, sourceLength, result, resultLength);
4448 }
4449
4450 UErrorCode status = U_ZERO_ERROR;
4451 int32_t keySize = 0;
4452
4453 if(source != NULL) {
4454 // source == NULL is actually an error situation, but we would need to
4455 // have an error code to return it. Until we introduce a new
4456 // API, it stays like this
4457
4458 /* this uses the function pointer that is set in updateinternalstate */
4459 /* currently, there are two funcs: */
4460 /*ucol_calcSortKey(...);*/
4461 /*ucol_calcSortKeySimpleTertiary(...);*/
4462
4463 uint8_t noDest[1] = { 0 };
4464 if(result == NULL) {
4465 // Distinguish pure preflighting from an allocation error.
4466 result = noDest;
4467 resultLength = 0;
4468 }
4469 FixedSortKeyByteSink sink(reinterpret_cast<char *>(result), resultLength);
4470 coll->sortKeyGen(coll, source, sourceLength, sink, &status);
4471 if(U_SUCCESS(status)) {
4472 keySize = sink.NumberOfBytesAppended();
4473 }
4474 }
4475 UTRACE_DATA2(UTRACE_VERBOSE, "Sort Key = %vb", result, keySize);
4476 UTRACE_EXIT_STATUS(status);
4477 return keySize;
4478 }
4479
4480 U_CFUNC int32_t
ucol_getCollationKey(const UCollator * coll,const UChar * source,int32_t sourceLength,CollationKey & key,UErrorCode & errorCode)4481 ucol_getCollationKey(const UCollator *coll,
4482 const UChar *source, int32_t sourceLength,
4483 CollationKey &key,
4484 UErrorCode &errorCode) {
4485 CollationKeyByteSink sink(key);
4486 coll->sortKeyGen(coll, source, sourceLength, sink, &errorCode);
4487 return sink.NumberOfBytesAppended();
4488 }
4489
4490 // Is this primary weight compressible?
4491 // Returns false for multi-lead-byte scripts (digits, Latin, Han, implicit).
4492 // TODO: This should use per-lead-byte flags from FractionalUCA.txt.
4493 static inline UBool
isCompressible(const UCollator *,uint8_t primary1)4494 isCompressible(const UCollator * /*coll*/, uint8_t primary1) {
4495 return UCOL_BYTE_FIRST_NON_LATIN_PRIMARY <= primary1 && primary1 <= maxRegularPrimary;
4496 }
4497
4498 static
doCaseShift(SortKeyLevel & cases,uint32_t & caseShift)4499 inline void doCaseShift(SortKeyLevel &cases, uint32_t &caseShift) {
4500 if (caseShift == 0) {
4501 cases.appendByte(UCOL_CASE_BYTE_START);
4502 caseShift = UCOL_CASE_SHIFT_START;
4503 }
4504 }
4505
4506 // Packs the secondary buffer when processing French locale.
4507 static void
packFrench(const uint8_t * secondaries,int32_t secsize,SortKeyByteSink & result)4508 packFrench(const uint8_t *secondaries, int32_t secsize, SortKeyByteSink &result) {
4509 secondaries += secsize; // We read the secondary-level bytes back to front.
4510 uint8_t secondary;
4511 int32_t count2 = 0;
4512 int32_t i = 0;
4513 // we use i here since the key size already accounts for terminators, so we'll discard the increment
4514 for(i = 0; i<secsize; i++) {
4515 secondary = *(secondaries-i-1);
4516 /* This is compression code. */
4517 if (secondary == UCOL_COMMON2) {
4518 ++count2;
4519 } else {
4520 if (count2 > 0) {
4521 if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
4522 while (count2 > UCOL_TOP_COUNT2) {
4523 result.Append(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2);
4524 count2 -= (uint32_t)UCOL_TOP_COUNT2;
4525 }
4526 result.Append(UCOL_COMMON_TOP2 - (count2-1));
4527 } else {
4528 while (count2 > UCOL_BOT_COUNT2) {
4529 result.Append(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
4530 count2 -= (uint32_t)UCOL_BOT_COUNT2;
4531 }
4532 result.Append(UCOL_COMMON_BOT2 + (count2-1));
4533 }
4534 count2 = 0;
4535 }
4536 result.Append(secondary);
4537 }
4538 }
4539 if (count2 > 0) {
4540 while (count2 > UCOL_BOT_COUNT2) {
4541 result.Append(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
4542 count2 -= (uint32_t)UCOL_BOT_COUNT2;
4543 }
4544 result.Append(UCOL_COMMON_BOT2 + (count2-1));
4545 }
4546 }
4547
/* NOTE(review): not referenced in the visible part of this file - confirm whether it is still needed. */
#define DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY 0
4549
4550 /* This is the sortkey work horse function */
4551 U_CFUNC void U_CALLCONV
ucol_calcSortKey(const UCollator * coll,const UChar * source,int32_t sourceLength,SortKeyByteSink & result,UErrorCode * status)4552 ucol_calcSortKey(const UCollator *coll,
4553 const UChar *source,
4554 int32_t sourceLength,
4555 SortKeyByteSink &result,
4556 UErrorCode *status)
4557 {
4558 if(U_FAILURE(*status)) {
4559 return;
4560 }
4561
4562 SortKeyByteSink &primaries = result;
4563 SortKeyLevel secondaries;
4564 SortKeyLevel tertiaries;
4565 SortKeyLevel cases;
4566 SortKeyLevel quads;
4567
4568 UnicodeString normSource;
4569
4570 int32_t len = (sourceLength == -1 ? u_strlen(source) : sourceLength);
4571
4572 UColAttributeValue strength = coll->strength;
4573
4574 uint8_t compareSec = (uint8_t)((strength >= UCOL_SECONDARY)?0:0xFF);
4575 uint8_t compareTer = (uint8_t)((strength >= UCOL_TERTIARY)?0:0xFF);
4576 uint8_t compareQuad = (uint8_t)((strength >= UCOL_QUATERNARY)?0:0xFF);
4577 UBool compareIdent = (strength == UCOL_IDENTICAL);
4578 UBool doCase = (coll->caseLevel == UCOL_ON);
4579 UBool isFrenchSec = (coll->frenchCollation == UCOL_ON) && (compareSec == 0);
4580 UBool shifted = (coll->alternateHandling == UCOL_SHIFTED);
4581 //UBool qShifted = shifted && (compareQuad == 0);
4582 UBool doHiragana = (coll->hiraganaQ == UCOL_ON) && (compareQuad == 0);
4583
4584 uint32_t variableTopValue = coll->variableTopValue;
4585 // TODO: UCOL_COMMON_BOT4 should be a function of qShifted. If we have no
4586 // qShifted, we don't need to set UCOL_COMMON_BOT4 so high.
4587 uint8_t UCOL_COMMON_BOT4 = (uint8_t)((coll->variableTopValue>>8)+1);
4588 uint8_t UCOL_HIRAGANA_QUAD = 0;
4589 if(doHiragana) {
4590 UCOL_HIRAGANA_QUAD=UCOL_COMMON_BOT4++;
4591 /* allocate one more space for hiragana, value for hiragana */
4592 }
4593 uint8_t UCOL_BOT_COUNT4 = (uint8_t)(0xFF - UCOL_COMMON_BOT4);
4594
4595 /* support for special features like caselevel and funky secondaries */
4596 int32_t lastSecondaryLength = 0;
4597 uint32_t caseShift = 0;
4598
4599 /* If we need to normalize, we'll do it all at once at the beginning! */
4600 const Normalizer2 *norm2;
4601 if(compareIdent) {
4602 norm2 = Normalizer2Factory::getNFDInstance(*status);
4603 } else if(coll->normalizationMode != UCOL_OFF) {
4604 norm2 = Normalizer2Factory::getFCDInstance(*status);
4605 } else {
4606 norm2 = NULL;
4607 }
4608 if(norm2 != NULL) {
4609 normSource.setTo(FALSE, source, len);
4610 int32_t qcYesLength = norm2->spanQuickCheckYes(normSource, *status);
4611 if(qcYesLength != len) {
4612 UnicodeString unnormalized = normSource.tempSubString(qcYesLength);
4613 normSource.truncate(qcYesLength);
4614 norm2->normalizeSecondAndAppend(normSource, unnormalized, *status);
4615 source = normSource.getBuffer();
4616 len = normSource.length();
4617 }
4618 }
4619 collIterate s;
4620 IInit_collIterate(coll, source, len, &s, status);
4621 if(U_FAILURE(*status)) {
4622 return;
4623 }
4624 s.flags &= ~UCOL_ITER_NORM; // source passed the FCD test or else was normalized.
4625
4626 uint32_t order = 0;
4627
4628 uint8_t primary1 = 0;
4629 uint8_t primary2 = 0;
4630 uint8_t secondary = 0;
4631 uint8_t tertiary = 0;
4632 uint8_t caseSwitch = coll->caseSwitch;
4633 uint8_t tertiaryMask = coll->tertiaryMask;
4634 int8_t tertiaryAddition = coll->tertiaryAddition;
4635 uint8_t tertiaryTop = coll->tertiaryTop;
4636 uint8_t tertiaryBottom = coll->tertiaryBottom;
4637 uint8_t tertiaryCommon = coll->tertiaryCommon;
4638 uint8_t caseBits = 0;
4639
4640 UBool wasShifted = FALSE;
4641 UBool notIsContinuation = FALSE;
4642
4643 uint32_t count2 = 0, count3 = 0, count4 = 0;
4644 uint8_t leadPrimary = 0;
4645
4646 for(;;) {
4647 order = ucol_IGetNextCE(coll, &s, status);
4648 if(order == UCOL_NO_MORE_CES) {
4649 break;
4650 }
4651
4652 if(order == 0) {
4653 continue;
4654 }
4655
4656 notIsContinuation = !isContinuation(order);
4657
4658 if(notIsContinuation) {
4659 tertiary = (uint8_t)(order & UCOL_BYTE_SIZE_MASK);
4660 } else {
4661 tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION));
4662 }
4663
4664 secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
4665 primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
4666 primary1 = (uint8_t)(order >> 8);
4667
4668 uint8_t originalPrimary1 = primary1;
4669 if(notIsContinuation && coll->leadBytePermutationTable != NULL) {
4670 primary1 = coll->leadBytePermutationTable[primary1];
4671 }
4672
4673 if((shifted && ((notIsContinuation && order <= variableTopValue && primary1 > 0)
4674 || (!notIsContinuation && wasShifted)))
4675 || (wasShifted && primary1 == 0)) /* amendment to the UCA says that primary ignorables */
4676 {
4677 /* and other ignorables should be removed if following a shifted code point */
4678 if(primary1 == 0) { /* if we were shifted and we got an ignorable code point */
4679 /* we should just completely ignore it */
4680 continue;
4681 }
4682 if(compareQuad == 0) {
4683 if(count4 > 0) {
4684 while (count4 > UCOL_BOT_COUNT4) {
4685 quads.appendByte(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4);
4686 count4 -= UCOL_BOT_COUNT4;
4687 }
4688 quads.appendByte(UCOL_COMMON_BOT4 + (count4-1));
4689 count4 = 0;
4690 }
4691 /* We are dealing with a variable and we're treating them as shifted */
4692 /* This is a shifted ignorable */
4693 if(primary1 != 0) { /* we need to check this since we could be in continuation */
4694 quads.appendByte(primary1);
4695 }
4696 if(primary2 != 0) {
4697 quads.appendByte(primary2);
4698 }
4699 }
4700 wasShifted = TRUE;
4701 } else {
4702 wasShifted = FALSE;
4703 /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */
4704 /* Usually, we'll have non-zero primary1 & primary2, except in cases of a-z and friends, when primary2 will */
4705 /* regular and simple sortkey calc */
4706 if(primary1 != UCOL_IGNORABLE) {
4707 if(notIsContinuation) {
4708 if(leadPrimary == primary1) {
4709 primaries.Append(primary2);
4710 } else {
4711 if(leadPrimary != 0) {
4712 primaries.Append((primary1 > leadPrimary) ? UCOL_BYTE_UNSHIFTED_MAX : UCOL_BYTE_UNSHIFTED_MIN);
4713 }
4714 if(primary2 == UCOL_IGNORABLE) {
4715 /* one byter, not compressed */
4716 primaries.Append(primary1);
4717 leadPrimary = 0;
4718 } else if(isCompressible(coll, originalPrimary1)) {
4719 /* compress */
4720 primaries.Append(leadPrimary = primary1, primary2);
4721 } else {
4722 leadPrimary = 0;
4723 primaries.Append(primary1, primary2);
4724 }
4725 }
4726 } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */
4727 if(primary2 == UCOL_IGNORABLE) {
4728 primaries.Append(primary1);
4729 } else {
4730 primaries.Append(primary1, primary2);
4731 }
4732 }
4733 }
4734
4735 if(secondary > compareSec) {
4736 if(!isFrenchSec) {
4737 /* This is compression code. */
4738 if (secondary == UCOL_COMMON2 && notIsContinuation) {
4739 ++count2;
4740 } else {
4741 if (count2 > 0) {
4742 if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
4743 while (count2 > UCOL_TOP_COUNT2) {
4744 secondaries.appendByte(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2);
4745 count2 -= (uint32_t)UCOL_TOP_COUNT2;
4746 }
4747 secondaries.appendByte(UCOL_COMMON_TOP2 - (count2-1));
4748 } else {
4749 while (count2 > UCOL_BOT_COUNT2) {
4750 secondaries.appendByte(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
4751 count2 -= (uint32_t)UCOL_BOT_COUNT2;
4752 }
4753 secondaries.appendByte(UCOL_COMMON_BOT2 + (count2-1));
4754 }
4755 count2 = 0;
4756 }
4757 secondaries.appendByte(secondary);
4758 }
4759 } else {
4760 /* Do the special handling for French secondaries */
4761 /* We need to get continuation elements and do intermediate restore */
4762 /* abc1c2c3de with french secondaries need to be edc1c2c3ba NOT edc3c2c1ba */
4763 if(notIsContinuation) {
4764 if (lastSecondaryLength > 1) {
4765 uint8_t *frenchStartPtr = secondaries.getLastFewBytes(lastSecondaryLength);
4766 if (frenchStartPtr != NULL) {
4767 /* reverse secondaries from frenchStartPtr up to frenchEndPtr */
4768 uint8_t *frenchEndPtr = frenchStartPtr + lastSecondaryLength - 1;
4769 uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr);
4770 }
4771 }
4772 lastSecondaryLength = 1;
4773 } else {
4774 ++lastSecondaryLength;
4775 }
4776 secondaries.appendByte(secondary);
4777 }
4778 }
4779
4780 if(doCase && (primary1 > 0 || strength >= UCOL_SECONDARY)) {
4781 // do the case level if we need to do it. We don't want to calculate
4782 // case level for primary ignorables if we have only primary strength and case level
4783 // otherwise we would break well formedness of CEs
4784 doCaseShift(cases, caseShift);
4785 if(notIsContinuation) {
4786 caseBits = (uint8_t)(tertiary & 0xC0);
4787
4788 if(tertiary != 0) {
4789 if(coll->caseFirst == UCOL_UPPER_FIRST) {
4790 if((caseBits & 0xC0) == 0) {
4791 cases.lastByte() |= 1 << (--caseShift);
4792 } else {
4793 cases.lastByte() |= 0 << (--caseShift);
4794 /* second bit */
4795 doCaseShift(cases, caseShift);
4796 cases.lastByte() |= ((caseBits>>6)&1) << (--caseShift);
4797 }
4798 } else {
4799 if((caseBits & 0xC0) == 0) {
4800 cases.lastByte() |= 0 << (--caseShift);
4801 } else {
4802 cases.lastByte() |= 1 << (--caseShift);
4803 /* second bit */
4804 doCaseShift(cases, caseShift);
4805 cases.lastByte() |= ((caseBits>>7)&1) << (--caseShift);
4806 }
4807 }
4808 }
4809 }
4810 } else {
4811 if(notIsContinuation) {
4812 tertiary ^= caseSwitch;
4813 }
4814 }
4815
4816 tertiary &= tertiaryMask;
4817 if(tertiary > compareTer) {
4818 /* This is compression code. */
4819 /* sequence size check is included in the if clause */
4820 if (tertiary == tertiaryCommon && notIsContinuation) {
4821 ++count3;
4822 } else {
4823 if(tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL) {
4824 tertiary += tertiaryAddition;
4825 } else if(tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST) {
4826 tertiary -= tertiaryAddition;
4827 }
4828 if (count3 > 0) {
4829 if ((tertiary > tertiaryCommon)) {
4830 while (count3 > coll->tertiaryTopCount) {
4831 tertiaries.appendByte(tertiaryTop - coll->tertiaryTopCount);
4832 count3 -= (uint32_t)coll->tertiaryTopCount;
4833 }
4834 tertiaries.appendByte(tertiaryTop - (count3-1));
4835 } else {
4836 while (count3 > coll->tertiaryBottomCount) {
4837 tertiaries.appendByte(tertiaryBottom + coll->tertiaryBottomCount);
4838 count3 -= (uint32_t)coll->tertiaryBottomCount;
4839 }
4840 tertiaries.appendByte(tertiaryBottom + (count3-1));
4841 }
4842 count3 = 0;
4843 }
4844 tertiaries.appendByte(tertiary);
4845 }
4846 }
4847
4848 if(/*qShifted*/(compareQuad==0) && notIsContinuation) {
4849 if(s.flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we need to note it
4850 if(count4>0) { // Close this part
4851 while (count4 > UCOL_BOT_COUNT4) {
4852 quads.appendByte(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4);
4853 count4 -= UCOL_BOT_COUNT4;
4854 }
4855 quads.appendByte(UCOL_COMMON_BOT4 + (count4-1));
4856 count4 = 0;
4857 }
4858 quads.appendByte(UCOL_HIRAGANA_QUAD); // Add the Hiragana
4859 } else { // This wasn't Hiragana, so we can continue adding stuff
4860 count4++;
4861 }
4862 }
4863 }
4864 }
4865
4866 /* Here, we are generally done with processing */
4867 /* bailing out would not be too productive */
4868
4869 UBool ok = TRUE;
4870 if(U_SUCCESS(*status)) {
4871 /* we have done all the CE's, now let's put them together to form a key */
4872 if(compareSec == 0) {
4873 if (count2 > 0) {
4874 while (count2 > UCOL_BOT_COUNT2) {
4875 secondaries.appendByte(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
4876 count2 -= (uint32_t)UCOL_BOT_COUNT2;
4877 }
4878 secondaries.appendByte(UCOL_COMMON_BOT2 + (count2-1));
4879 }
4880 result.Append(UCOL_LEVELTERMINATOR);
4881 if(!secondaries.isOk()) {
4882 ok = FALSE;
4883 } else if(!isFrenchSec) {
4884 secondaries.appendTo(result);
4885 } else {
4886 // If there are any unresolved continuation secondaries,
4887 // reverse them here so that we can reverse the whole secondary thing.
4888 if (lastSecondaryLength > 1) {
4889 uint8_t *frenchStartPtr = secondaries.getLastFewBytes(lastSecondaryLength);
4890 if (frenchStartPtr != NULL) {
4891 /* reverse secondaries from frenchStartPtr up to frenchEndPtr */
4892 uint8_t *frenchEndPtr = frenchStartPtr + lastSecondaryLength - 1;
4893 uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr);
4894 }
4895 }
4896 packFrench(secondaries.data(), secondaries.length(), result);
4897 }
4898 }
4899
4900 if(doCase) {
4901 ok &= cases.isOk();
4902 result.Append(UCOL_LEVELTERMINATOR);
4903 cases.appendTo(result);
4904 }
4905
4906 if(compareTer == 0) {
4907 if (count3 > 0) {
4908 if (coll->tertiaryCommon != UCOL_COMMON_BOT3) {
4909 while (count3 >= coll->tertiaryTopCount) {
4910 tertiaries.appendByte(tertiaryTop - coll->tertiaryTopCount);
4911 count3 -= (uint32_t)coll->tertiaryTopCount;
4912 }
4913 tertiaries.appendByte(tertiaryTop - count3);
4914 } else {
4915 while (count3 > coll->tertiaryBottomCount) {
4916 tertiaries.appendByte(tertiaryBottom + coll->tertiaryBottomCount);
4917 count3 -= (uint32_t)coll->tertiaryBottomCount;
4918 }
4919 tertiaries.appendByte(tertiaryBottom + (count3-1));
4920 }
4921 }
4922 ok &= tertiaries.isOk();
4923 result.Append(UCOL_LEVELTERMINATOR);
4924 tertiaries.appendTo(result);
4925
4926 if(compareQuad == 0/*qShifted == TRUE*/) {
4927 if(count4 > 0) {
4928 while (count4 > UCOL_BOT_COUNT4) {
4929 quads.appendByte(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4);
4930 count4 -= UCOL_BOT_COUNT4;
4931 }
4932 quads.appendByte(UCOL_COMMON_BOT4 + (count4-1));
4933 }
4934 ok &= quads.isOk();
4935 result.Append(UCOL_LEVELTERMINATOR);
4936 quads.appendTo(result);
4937 }
4938
4939 if(compareIdent) {
4940 result.Append(UCOL_LEVELTERMINATOR);
4941 u_writeIdenticalLevelRun(s.string, len, result);
4942 }
4943 }
4944 result.Append(0);
4945 }
4946
4947 /* To avoid memory leak, free the offset buffer if necessary. */
4948 ucol_freeOffsetBuffer(&s);
4949
4950 ok &= result.IsOk();
4951 if(!ok && U_SUCCESS(*status)) { *status = U_MEMORY_ALLOCATION_ERROR; }
4952 }
4953
4954
4955 U_CFUNC void U_CALLCONV
ucol_calcSortKeySimpleTertiary(const UCollator * coll,const UChar * source,int32_t sourceLength,SortKeyByteSink & result,UErrorCode * status)4956 ucol_calcSortKeySimpleTertiary(const UCollator *coll,
4957 const UChar *source,
4958 int32_t sourceLength,
4959 SortKeyByteSink &result,
4960 UErrorCode *status)
4961 {
4962 U_ALIGN_CODE(16);
4963
4964 if(U_FAILURE(*status)) {
4965 return;
4966 }
4967
4968 SortKeyByteSink &primaries = result;
4969 SortKeyLevel secondaries;
4970 SortKeyLevel tertiaries;
4971
4972 UnicodeString normSource;
4973
4974 int32_t len = sourceLength;
4975
4976 /* If we need to normalize, we'll do it all at once at the beginning! */
4977 if(coll->normalizationMode != UCOL_OFF) {
4978 normSource.setTo(len < 0, source, len);
4979 const Normalizer2 *norm2 = Normalizer2Factory::getFCDInstance(*status);
4980 int32_t qcYesLength = norm2->spanQuickCheckYes(normSource, *status);
4981 if(qcYesLength != normSource.length()) {
4982 UnicodeString unnormalized = normSource.tempSubString(qcYesLength);
4983 normSource.truncate(qcYesLength);
4984 norm2->normalizeSecondAndAppend(normSource, unnormalized, *status);
4985 source = normSource.getBuffer();
4986 len = normSource.length();
4987 }
4988 }
4989 collIterate s;
4990 IInit_collIterate(coll, (UChar *)source, len, &s, status);
4991 if(U_FAILURE(*status)) {
4992 return;
4993 }
4994 s.flags &= ~UCOL_ITER_NORM; // source passed the FCD test or else was normalized.
4995
4996 uint32_t order = 0;
4997
4998 uint8_t primary1 = 0;
4999 uint8_t primary2 = 0;
5000 uint8_t secondary = 0;
5001 uint8_t tertiary = 0;
5002 uint8_t caseSwitch = coll->caseSwitch;
5003 uint8_t tertiaryMask = coll->tertiaryMask;
5004 int8_t tertiaryAddition = coll->tertiaryAddition;
5005 uint8_t tertiaryTop = coll->tertiaryTop;
5006 uint8_t tertiaryBottom = coll->tertiaryBottom;
5007 uint8_t tertiaryCommon = coll->tertiaryCommon;
5008
5009 UBool notIsContinuation = FALSE;
5010
5011 uint32_t count2 = 0, count3 = 0;
5012 uint8_t leadPrimary = 0;
5013
5014 for(;;) {
5015 order = ucol_IGetNextCE(coll, &s, status);
5016
5017 if(order == 0) {
5018 continue;
5019 }
5020
5021 if(order == UCOL_NO_MORE_CES) {
5022 break;
5023 }
5024
5025 notIsContinuation = !isContinuation(order);
5026
5027 if(notIsContinuation) {
5028 tertiary = (uint8_t)((order & tertiaryMask));
5029 } else {
5030 tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION));
5031 }
5032
5033 secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
5034 primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
5035 primary1 = (uint8_t)(order >> 8);
5036
5037 uint8_t originalPrimary1 = primary1;
5038 if (coll->leadBytePermutationTable != NULL && notIsContinuation) {
5039 primary1 = coll->leadBytePermutationTable[primary1];
5040 }
5041
5042 /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */
5043 /* Usually, we'll have non-zero primary1 & primary2, except in cases of a-z and friends, when primary2 will */
5044 /* be zero with non zero primary1. primary3 is different than 0 only for long primaries - see above. */
5045 /* regular and simple sortkey calc */
5046 if(primary1 != UCOL_IGNORABLE) {
5047 if(notIsContinuation) {
5048 if(leadPrimary == primary1) {
5049 primaries.Append(primary2);
5050 } else {
5051 if(leadPrimary != 0) {
5052 primaries.Append((primary1 > leadPrimary) ? UCOL_BYTE_UNSHIFTED_MAX : UCOL_BYTE_UNSHIFTED_MIN);
5053 }
5054 if(primary2 == UCOL_IGNORABLE) {
5055 /* one byter, not compressed */
5056 primaries.Append(primary1);
5057 leadPrimary = 0;
5058 } else if(isCompressible(coll, originalPrimary1)) {
5059 /* compress */
5060 primaries.Append(leadPrimary = primary1, primary2);
5061 } else {
5062 leadPrimary = 0;
5063 primaries.Append(primary1, primary2);
5064 }
5065 }
5066 } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */
5067 if(primary2 == UCOL_IGNORABLE) {
5068 primaries.Append(primary1);
5069 } else {
5070 primaries.Append(primary1, primary2);
5071 }
5072 }
5073 }
5074
5075 if(secondary > 0) { /* I think that != 0 test should be != IGNORABLE */
5076 /* This is compression code. */
5077 if (secondary == UCOL_COMMON2 && notIsContinuation) {
5078 ++count2;
5079 } else {
5080 if (count2 > 0) {
5081 if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
5082 while (count2 > UCOL_TOP_COUNT2) {
5083 secondaries.appendByte(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2);
5084 count2 -= (uint32_t)UCOL_TOP_COUNT2;
5085 }
5086 secondaries.appendByte(UCOL_COMMON_TOP2 - (count2-1));
5087 } else {
5088 while (count2 > UCOL_BOT_COUNT2) {
5089 secondaries.appendByte(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
5090 count2 -= (uint32_t)UCOL_BOT_COUNT2;
5091 }
5092 secondaries.appendByte(UCOL_COMMON_BOT2 + (count2-1));
5093 }
5094 count2 = 0;
5095 }
5096 secondaries.appendByte(secondary);
5097 }
5098 }
5099
5100 if(notIsContinuation) {
5101 tertiary ^= caseSwitch;
5102 }
5103
5104 if(tertiary > 0) {
5105 /* This is compression code. */
5106 /* sequence size check is included in the if clause */
5107 if (tertiary == tertiaryCommon && notIsContinuation) {
5108 ++count3;
5109 } else {
5110 if(tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL) {
5111 tertiary += tertiaryAddition;
5112 } else if (tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST) {
5113 tertiary -= tertiaryAddition;
5114 }
5115 if (count3 > 0) {
5116 if ((tertiary > tertiaryCommon)) {
5117 while (count3 > coll->tertiaryTopCount) {
5118 tertiaries.appendByte(tertiaryTop - coll->tertiaryTopCount);
5119 count3 -= (uint32_t)coll->tertiaryTopCount;
5120 }
5121 tertiaries.appendByte(tertiaryTop - (count3-1));
5122 } else {
5123 while (count3 > coll->tertiaryBottomCount) {
5124 tertiaries.appendByte(tertiaryBottom + coll->tertiaryBottomCount);
5125 count3 -= (uint32_t)coll->tertiaryBottomCount;
5126 }
5127 tertiaries.appendByte(tertiaryBottom + (count3-1));
5128 }
5129 count3 = 0;
5130 }
5131 tertiaries.appendByte(tertiary);
5132 }
5133 }
5134 }
5135
5136 UBool ok = TRUE;
5137 if(U_SUCCESS(*status)) {
5138 /* we have done all the CE's, now let's put them together to form a key */
5139 if (count2 > 0) {
5140 while (count2 > UCOL_BOT_COUNT2) {
5141 secondaries.appendByte(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
5142 count2 -= (uint32_t)UCOL_BOT_COUNT2;
5143 }
5144 secondaries.appendByte(UCOL_COMMON_BOT2 + (count2-1));
5145 }
5146 ok &= secondaries.isOk();
5147 result.Append(UCOL_LEVELTERMINATOR);
5148 secondaries.appendTo(result);
5149
5150 if (count3 > 0) {
5151 if (coll->tertiaryCommon != UCOL_COMMON3_NORMAL) {
5152 while (count3 >= coll->tertiaryTopCount) {
5153 tertiaries.appendByte(tertiaryTop - coll->tertiaryTopCount);
5154 count3 -= (uint32_t)coll->tertiaryTopCount;
5155 }
5156 tertiaries.appendByte(tertiaryTop - count3);
5157 } else {
5158 while (count3 > coll->tertiaryBottomCount) {
5159 tertiaries.appendByte(tertiaryBottom + coll->tertiaryBottomCount);
5160 count3 -= (uint32_t)coll->tertiaryBottomCount;
5161 }
5162 tertiaries.appendByte(tertiaryBottom + (count3-1));
5163 }
5164 }
5165 ok &= tertiaries.isOk();
5166 result.Append(UCOL_LEVELTERMINATOR);
5167 tertiaries.appendTo(result);
5168
5169 result.Append(0);
5170 }
5171
5172 /* To avoid memory leak, free the offset buffer if necessary. */
5173 ucol_freeOffsetBuffer(&s);
5174
5175 ok &= result.IsOk();
5176 if(!ok && U_SUCCESS(*status)) { *status = U_MEMORY_ALLOCATION_ERROR; }
5177 }
5178
5179 static inline
isShiftedCE(uint32_t CE,uint32_t LVT,UBool * wasShifted)5180 UBool isShiftedCE(uint32_t CE, uint32_t LVT, UBool *wasShifted) {
5181 UBool notIsContinuation = !isContinuation(CE);
5182 uint8_t primary1 = (uint8_t)((CE >> 24) & 0xFF);
5183 if((LVT && ((notIsContinuation && (CE & 0xFFFF0000)<= LVT && primary1 > 0)
5184 || (!notIsContinuation && *wasShifted)))
5185 || (*wasShifted && primary1 == 0)) /* amendment to the UCA says that primary ignorables */
5186 {
5187 // The stuff below should probably be in the sortkey code... maybe not...
5188 if(primary1 != 0) { /* if we were shifted and we got an ignorable code point */
5189 /* we should just completely ignore it */
5190 *wasShifted = TRUE;
5191 //continue;
5192 }
5193 //*wasShifted = TRUE;
5194 return TRUE;
5195 } else {
5196 *wasShifted = FALSE;
5197 return FALSE;
5198 }
5199 }
5200 static inline
terminatePSKLevel(int32_t level,int32_t maxLevel,int32_t & i,uint8_t * dest)5201 void terminatePSKLevel(int32_t level, int32_t maxLevel, int32_t &i, uint8_t *dest) {
5202 if(level < maxLevel) {
5203 dest[i++] = UCOL_LEVELTERMINATOR;
5204 } else {
5205 dest[i++] = 0;
5206 }
5207 }
5208
/** enumeration of level identifiers for partial sort key generation */
enum {
    UCOL_PSK_PRIMARY = 0,
    UCOL_PSK_SECONDARY = 1,
    UCOL_PSK_CASE = 2,
    UCOL_PSK_TERTIARY = 3,
    UCOL_PSK_QUATERNARY = 4,
    UCOL_PSK_QUIN = 5,      /** This is an extra level, not used - but we have three bits to blow */
    UCOL_PSK_IDENTICAL = 6,
    UCOL_PSK_NULL = 7,      /** level for the end of sort key. Will just produce zeros */
    UCOL_PSK_LIMIT          /** one past the last level identifier */
};
5221
/** collation state enum. *_SHIFT value is how much to shift right
 *  to get the state piece to the right. *_MASK value should be
 *  ANDed with the shifted state. This data is stored in state[1]
 *  field.
 */
enum {
    UCOL_PSK_LEVEL_SHIFT = 0,      /** level identifier. stores an enum value from above */
    UCOL_PSK_LEVEL_MASK = 7,       /** three bits */
    UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT = 3, /** number of bytes of primary or quaternary already written */
    UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK = 1,
    /** can be only 0 or 1, since we get up to two bytes from primary or quaternary
     *  This field is also used to denote that the French secondary level is finished
     */
    UCOL_PSK_WAS_SHIFTED_SHIFT = 4,/** was the last value shifted */
    UCOL_PSK_WAS_SHIFTED_MASK = 1, /** can be 0 or 1 (Boolean) */
    UCOL_PSK_USED_FRENCH_SHIFT = 5,/** how many French bytes have we already written */
    UCOL_PSK_USED_FRENCH_MASK = 3, /** two bits, values 0..3. See comment just below */
    /** When we do French we need to reverse secondary values. However, continuations
     *  need to stay the same. So if you had abc1c2c3de, you need to have edc1c2c3ba
     */
    UCOL_PSK_BOCSU_BYTES_SHIFT = 7,
    UCOL_PSK_BOCSU_BYTES_MASK = 3, /** two bits, values 0..3 */
    UCOL_PSK_CONSUMED_CES_SHIFT = 9,
    UCOL_PSK_CONSUMED_CES_MASK = 0x7FFFF /** 19 bits: with the shift of 9 this covers state bits 9..27 */
};
5247
// macro calculating the number of expansion CEs available:
// the distance between the CEpos and toReturn members of s.
// The expansion is parenthesized as a whole so that it keeps its meaning
// inside larger expressions (comparisons, arithmetic, negation).
#define uprv_numAvailableExpCEs(s) ((s).CEpos - (s).toReturn)
5250
5251
/** main sortkey part procedure. On the first call,
 * you should pass in a collator, an iterator, empty state
 * state[0] == state[1] == 0, a buffer to hold results,
 * the number of bytes you need and an error code pointer.
 * Make sure your buffer is big enough to hold the wanted
 * number of sortkey bytes. I don't check.
 * The only meaningful status you can get back is
 * U_BUFFER_OVERFLOW_ERROR, which basically means that you
 * have been dealt a raw deal and that you probably won't
 * be able to use partial sortkey generation for this
 * particular combination of string and collator. This
 * is highly unlikely, but you should still check the error code.
 * Any other status means that you're not in a sane situation
 * anymore. After the first call, preserve state values and
 * use them on subsequent calls to obtain more bytes of a sortkey.
 * Use until the number of bytes written is smaller than the requested
 * number of bytes. Generated sortkey is not compatible with the
 * one generated by ucol_getSortKey, as we don't do any compression.
 * However, levels are still terminated by a 1 (one) and the sortkey
 * is terminated by a 0 (zero). Identical level is the same as in the
 * regular sortkey - internal bocu-1 implementation is used.
 * For the curious, although you cannot do much about this, here is
 * the structure of state words.
 * state[0] - iterator state. Depends on the iterator implementation,
 *            but allows the iterator to continue where it stopped in
 *            the last iteration.
 * state[1] - collation processing state. Here is the distribution
 *            of the bits:
 *   0, 1, 2 - level of the sortkey - primary, secondary, case, tertiary
 *             quaternary, quin (we don't use this one), identical and
 *             null (producing only zeroes - first one to terminate the
 *             sortkey and subsequent to fill the buffer).
 *   3       - byte count. Number of bytes written on the primary level.
 *   4       - was shifted. Whether the previous iteration finished in the
 *             shifted state.
 *   5, 6    - French continuation bytes written. See the comment in the enum
 *   7, 8    - Bocsu bytes used. Number of bytes from a bocu sequence on
 *             the identical level.
 *   9..31   - CEs consumed. Number of getCE or next32 operations performed
 *             since the last successful update of the iterator state.
 *             (UCOL_PSK_CONSUMED_CES_MASK is 0x7FFFF, so only 19 bits of
 *             this field, state bits 9..27, are actually extracted.)
 */
5293 U_CAPI int32_t U_EXPORT2
ucol_nextSortKeyPart(const UCollator * coll,UCharIterator * iter,uint32_t state[2],uint8_t * dest,int32_t count,UErrorCode * status)5294 ucol_nextSortKeyPart(const UCollator *coll,
5295 UCharIterator *iter,
5296 uint32_t state[2],
5297 uint8_t *dest, int32_t count,
5298 UErrorCode *status)
5299 {
5300 /* error checking */
5301 if(status==NULL || U_FAILURE(*status)) {
5302 return 0;
5303 }
5304 UTRACE_ENTRY(UTRACE_UCOL_NEXTSORTKEYPART);
5305 if( coll==NULL || iter==NULL ||
5306 state==NULL ||
5307 count<0 || (count>0 && dest==NULL)
5308 ) {
5309 *status=U_ILLEGAL_ARGUMENT_ERROR;
5310 UTRACE_EXIT_STATUS(status);
5311 return 0;
5312 }
5313
5314 UTRACE_DATA6(UTRACE_VERBOSE, "coll=%p, iter=%p, state=%d %d, dest=%p, count=%d",
5315 coll, iter, state[0], state[1], dest, count);
5316
5317 if(count==0) {
5318 /* nothing to do */
5319 UTRACE_EXIT_VALUE(0);
5320 return 0;
5321 }
5322 /** Setting up situation according to the state we got from the previous iteration */
5323 // The state of the iterator from the previous invocation
5324 uint32_t iterState = state[0];
5325 // Has the last iteration ended in the shifted state
5326 UBool wasShifted = ((state[1] >> UCOL_PSK_WAS_SHIFTED_SHIFT) & UCOL_PSK_WAS_SHIFTED_MASK)?TRUE:FALSE;
5327 // What is the current level of the sortkey?
5328 int32_t level= (state[1] >> UCOL_PSK_LEVEL_SHIFT) & UCOL_PSK_LEVEL_MASK;
5329 // Have we written only one byte from a two byte primary in the previous iteration?
5330 // Also on secondary level - have we finished with the French secondary?
5331 int32_t byteCountOrFrenchDone = (state[1] >> UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT) & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK;
5332 // number of bytes in the continuation buffer for French
5333 int32_t usedFrench = (state[1] >> UCOL_PSK_USED_FRENCH_SHIFT) & UCOL_PSK_USED_FRENCH_MASK;
5334 // Number of bytes already written from a bocsu sequence. Since
5335 // the longes bocsu sequence is 4 long, this can be up to 3.
5336 int32_t bocsuBytesUsed = (state[1] >> UCOL_PSK_BOCSU_BYTES_SHIFT) & UCOL_PSK_BOCSU_BYTES_MASK;
5337 // Number of elements that need to be consumed in this iteration because
5338 // the iterator returned UITER_NO_STATE at the end of the last iteration,
5339 // so we had to save the last valid state.
5340 int32_t cces = (state[1] >> UCOL_PSK_CONSUMED_CES_SHIFT) & UCOL_PSK_CONSUMED_CES_MASK;
5341
5342 /** values that depend on the collator attributes */
5343 // strength of the collator.
5344 int32_t strength = ucol_getAttribute(coll, UCOL_STRENGTH, status);
5345 // maximal level of the partial sortkey. Need to take whether case level is done
5346 int32_t maxLevel = 0;
5347 if(strength < UCOL_TERTIARY) {
5348 if(ucol_getAttribute(coll, UCOL_CASE_LEVEL, status) == UCOL_ON) {
5349 maxLevel = UCOL_PSK_CASE;
5350 } else {
5351 maxLevel = strength;
5352 }
5353 } else {
5354 if(strength == UCOL_TERTIARY) {
5355 maxLevel = UCOL_PSK_TERTIARY;
5356 } else if(strength == UCOL_QUATERNARY) {
5357 maxLevel = UCOL_PSK_QUATERNARY;
5358 } else { // identical
5359 maxLevel = UCOL_IDENTICAL;
5360 }
5361 }
5362 // value for the quaternary level if Hiragana is encountered. Used for JIS X 4061 collation
5363 uint8_t UCOL_HIRAGANA_QUAD =
5364 (ucol_getAttribute(coll, UCOL_HIRAGANA_QUATERNARY_MODE, status) == UCOL_ON)?0xFE:0xFF;
5365 // Boundary value that decides whether a CE is shifted or not
5366 uint32_t LVT = (coll->alternateHandling == UCOL_SHIFTED)?(coll->variableTopValue<<16):0;
5367 // Are we doing French collation?
5368 UBool doingFrench = (ucol_getAttribute(coll, UCOL_FRENCH_COLLATION, status) == UCOL_ON);
5369
5370 /** initializing the collation state */
5371 UBool notIsContinuation = FALSE;
5372 uint32_t CE = UCOL_NO_MORE_CES;
5373
5374 collIterate s;
5375 IInit_collIterate(coll, NULL, -1, &s, status);
5376 if(U_FAILURE(*status)) {
5377 UTRACE_EXIT_STATUS(*status);
5378 return 0;
5379 }
5380 s.iterator = iter;
5381 s.flags |= UCOL_USE_ITERATOR;
5382 // This variable tells us whether we have produced some other levels in this iteration
5383 // before we moved to the identical level. In that case, we need to switch the
5384 // type of the iterator.
5385 UBool doingIdenticalFromStart = FALSE;
5386 // Normalizing iterator
5387 // The division for the array length may truncate the array size to
5388 // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
5389 // for all platforms anyway.
5390 UAlignedMemory stackNormIter[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
5391 UNormIterator *normIter = NULL;
5392 // If the normalization is turned on for the collator and we are below identical level
5393 // we will use a FCD normalizing iterator
5394 if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON && level < UCOL_PSK_IDENTICAL) {
5395 normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status);
5396 s.iterator = unorm_setIter(normIter, iter, UNORM_FCD, status);
5397 s.flags &= ~UCOL_ITER_NORM;
5398 if(U_FAILURE(*status)) {
5399 UTRACE_EXIT_STATUS(*status);
5400 return 0;
5401 }
5402 } else if(level == UCOL_PSK_IDENTICAL) {
5403 // for identical level, we need a NFD iterator. We need to instantiate it here, since we
5404 // will be updating the state - and this cannot be done on an ordinary iterator.
5405 normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status);
5406 s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status);
5407 s.flags &= ~UCOL_ITER_NORM;
5408 if(U_FAILURE(*status)) {
5409 UTRACE_EXIT_STATUS(*status);
5410 return 0;
5411 }
5412 doingIdenticalFromStart = TRUE;
5413 }
5414
5415 // This is the tentative new state of the iterator. The problem
5416 // is that the iterator might return an undefined state, in
5417 // which case we should save the last valid state and increase
5418 // the iterator skip value.
5419 uint32_t newState = 0;
5420
5421 // First, we set the iterator to the last valid position
5422 // from the last iteration. This was saved in state[0].
5423 if(iterState == 0) {
5424 /* initial state */
5425 if(level == UCOL_PSK_SECONDARY && doingFrench && !byteCountOrFrenchDone) {
5426 s.iterator->move(s.iterator, 0, UITER_LIMIT);
5427 } else {
5428 s.iterator->move(s.iterator, 0, UITER_START);
5429 }
5430 } else {
5431 /* reset to previous state */
5432 s.iterator->setState(s.iterator, iterState, status);
5433 if(U_FAILURE(*status)) {
5434 UTRACE_EXIT_STATUS(*status);
5435 return 0;
5436 }
5437 }
5438
5439
5440
5441 // This variable tells us whether we can attempt to update the state
5442 // of iterator. Situations where we don't want to update iterator state
5443 // are the existence of expansion CEs that are not yet processed, and
5444 // finishing the case level without enough space in the buffer to insert
5445 // a level terminator.
5446 UBool canUpdateState = TRUE;
5447
5448 // Consume all the CEs that were consumed at the end of the previous
5449 // iteration without updating the iterator state. On identical level,
5450 // consume the code points.
5451 int32_t counter = cces;
5452 if(level < UCOL_PSK_IDENTICAL) {
5453 while(counter-->0) {
5454 // If we're doing French and we are on the secondary level,
5455 // we go backwards.
5456 if(level == UCOL_PSK_SECONDARY && doingFrench) {
5457 CE = ucol_IGetPrevCE(coll, &s, status);
5458 } else {
5459 CE = ucol_IGetNextCE(coll, &s, status);
5460 }
5461 if(CE==UCOL_NO_MORE_CES) {
5462 /* should not happen */
5463 *status=U_INTERNAL_PROGRAM_ERROR;
5464 UTRACE_EXIT_STATUS(*status);
5465 return 0;
5466 }
5467 if(uprv_numAvailableExpCEs(s)) {
5468 canUpdateState = FALSE;
5469 }
5470 }
5471 } else {
5472 while(counter-->0) {
5473 uiter_next32(s.iterator);
5474 }
5475 }
5476
5477 // French secondary needs to know whether the iterator state of zero came from previous level OR
5478 // from a new invocation...
5479 UBool wasDoingPrimary = FALSE;
5480 // destination buffer byte counter. When this guy
5481 // gets to count, we're done with the iteration
5482 int32_t i = 0;
5483 // used to count the zero bytes written after we
5484 // have finished with the sort key
5485 int32_t j = 0;
5486
5487
5488 // Hm.... I think we're ready to plunge in. Basic story is as following:
5489 // we have a fall through case based on level. This is used for initial
5490 // positioning on iteration start. Every level processor contains a
5491 // for(;;) which will be broken when we exhaust all the CEs. Other
5492 // way to exit is a goto saveState, which happens when we have filled
5493 // out our buffer.
5494 switch(level) {
5495 case UCOL_PSK_PRIMARY:
5496 wasDoingPrimary = TRUE;
5497 for(;;) {
5498 if(i==count) {
5499 goto saveState;
5500 }
5501 // We should save the state only if we
5502 // are sure that we are done with the
5503 // previous iterator state
5504 if(canUpdateState && byteCountOrFrenchDone == 0) {
5505 newState = s.iterator->getState(s.iterator);
5506 if(newState != UITER_NO_STATE) {
5507 iterState = newState;
5508 cces = 0;
5509 }
5510 }
5511 CE = ucol_IGetNextCE(coll, &s, status);
5512 cces++;
5513 if(CE==UCOL_NO_MORE_CES) {
5514 // Add the level separator
5515 terminatePSKLevel(level, maxLevel, i, dest);
5516 byteCountOrFrenchDone=0;
5517 // Restart the iteration an move to the
5518 // second level
5519 s.iterator->move(s.iterator, 0, UITER_START);
5520 cces = 0;
5521 level = UCOL_PSK_SECONDARY;
5522 break;
5523 }
5524 if(!isContinuation(CE)){
5525 if(coll->leadBytePermutationTable != NULL){
5526 CE = (coll->leadBytePermutationTable[CE>>24] << 24) | (CE & 0x00FFFFFF);
5527 }
5528 }
5529 if(!isShiftedCE(CE, LVT, &wasShifted)) {
5530 CE >>= UCOL_PRIMARYORDERSHIFT; /* get primary */
5531 if(CE != 0) {
5532 if(byteCountOrFrenchDone == 0) {
5533 // get the second byte of primary
5534 dest[i++]=(uint8_t)(CE >> 8);
5535 } else {
5536 byteCountOrFrenchDone = 0;
5537 }
5538 if((CE &=0xff)!=0) {
5539 if(i==count) {
5540 /* overflow */
5541 byteCountOrFrenchDone = 1;
5542 cces--;
5543 goto saveState;
5544 }
5545 dest[i++]=(uint8_t)CE;
5546 }
5547 }
5548 }
5549 if(uprv_numAvailableExpCEs(s)) {
5550 canUpdateState = FALSE;
5551 } else {
5552 canUpdateState = TRUE;
5553 }
5554 }
5555 /* fall through to next level */
5556 case UCOL_PSK_SECONDARY:
5557 if(strength >= UCOL_SECONDARY) {
5558 if(!doingFrench) {
5559 for(;;) {
5560 if(i == count) {
5561 goto saveState;
5562 }
5563 // We should save the state only if we
5564 // are sure that we are done with the
5565 // previous iterator state
5566 if(canUpdateState) {
5567 newState = s.iterator->getState(s.iterator);
5568 if(newState != UITER_NO_STATE) {
5569 iterState = newState;
5570 cces = 0;
5571 }
5572 }
5573 CE = ucol_IGetNextCE(coll, &s, status);
5574 cces++;
5575 if(CE==UCOL_NO_MORE_CES) {
5576 // Add the level separator
5577 terminatePSKLevel(level, maxLevel, i, dest);
5578 byteCountOrFrenchDone = 0;
5579 // Restart the iteration an move to the
5580 // second level
5581 s.iterator->move(s.iterator, 0, UITER_START);
5582 cces = 0;
5583 level = UCOL_PSK_CASE;
5584 break;
5585 }
5586 if(!isShiftedCE(CE, LVT, &wasShifted)) {
5587 CE >>= 8; /* get secondary */
5588 if(CE != 0) {
5589 dest[i++]=(uint8_t)CE;
5590 }
5591 }
5592 if(uprv_numAvailableExpCEs(s)) {
5593 canUpdateState = FALSE;
5594 } else {
5595 canUpdateState = TRUE;
5596 }
5597 }
5598 } else { // French secondary processing
5599 uint8_t frenchBuff[UCOL_MAX_BUFFER];
5600 int32_t frenchIndex = 0;
5601 // Here we are going backwards.
5602 // If the iterator is at the beggining, it should be
5603 // moved to end.
5604 if(wasDoingPrimary) {
5605 s.iterator->move(s.iterator, 0, UITER_LIMIT);
5606 cces = 0;
5607 }
5608 for(;;) {
5609 if(i == count) {
5610 goto saveState;
5611 }
5612 if(canUpdateState) {
5613 newState = s.iterator->getState(s.iterator);
5614 if(newState != UITER_NO_STATE) {
5615 iterState = newState;
5616 cces = 0;
5617 }
5618 }
5619 CE = ucol_IGetPrevCE(coll, &s, status);
5620 cces++;
5621 if(CE==UCOL_NO_MORE_CES) {
5622 // Add the level separator
5623 terminatePSKLevel(level, maxLevel, i, dest);
5624 byteCountOrFrenchDone = 0;
5625 // Restart the iteration an move to the next level
5626 s.iterator->move(s.iterator, 0, UITER_START);
5627 level = UCOL_PSK_CASE;
5628 break;
5629 }
5630 if(isContinuation(CE)) { // if it's a continuation, we want to save it and
5631 // reverse when we get a first non-continuation CE.
5632 CE >>= 8;
5633 frenchBuff[frenchIndex++] = (uint8_t)CE;
5634 } else if(!isShiftedCE(CE, LVT, &wasShifted)) {
5635 CE >>= 8; /* get secondary */
5636 if(!frenchIndex) {
5637 if(CE != 0) {
5638 dest[i++]=(uint8_t)CE;
5639 }
5640 } else {
5641 frenchBuff[frenchIndex++] = (uint8_t)CE;
5642 frenchIndex -= usedFrench;
5643 usedFrench = 0;
5644 while(i < count && frenchIndex) {
5645 dest[i++] = frenchBuff[--frenchIndex];
5646 usedFrench++;
5647 }
5648 }
5649 }
5650 if(uprv_numAvailableExpCEs(s)) {
5651 canUpdateState = FALSE;
5652 } else {
5653 canUpdateState = TRUE;
5654 }
5655 }
5656 }
5657 } else {
5658 level = UCOL_PSK_CASE;
5659 }
5660 /* fall through to next level */
5661 case UCOL_PSK_CASE:
5662 if(ucol_getAttribute(coll, UCOL_CASE_LEVEL, status) == UCOL_ON) {
5663 uint32_t caseShift = UCOL_CASE_SHIFT_START;
5664 uint8_t caseByte = UCOL_CASE_BYTE_START;
5665 uint8_t caseBits = 0;
5666
5667 for(;;) {
5668 U_ASSERT(caseShift <= UCOL_CASE_SHIFT_START);
5669 if(i == count) {
5670 goto saveState;
5671 }
5672 // We should save the state only if we
5673 // are sure that we are done with the
5674 // previous iterator state
5675 if(canUpdateState) {
5676 newState = s.iterator->getState(s.iterator);
5677 if(newState != UITER_NO_STATE) {
5678 iterState = newState;
5679 cces = 0;
5680 }
5681 }
5682 CE = ucol_IGetNextCE(coll, &s, status);
5683 cces++;
5684 if(CE==UCOL_NO_MORE_CES) {
5685 // On the case level we might have an unfinished
5686 // case byte. Add one if it's started.
5687 if(caseShift != UCOL_CASE_SHIFT_START) {
5688 dest[i++] = caseByte;
5689 }
5690 cces = 0;
5691 // We have finished processing CEs on this level.
5692 // However, we don't know if we have enough space
5693 // to add a case level terminator.
5694 if(i < count) {
5695 // Add the level separator
5696 terminatePSKLevel(level, maxLevel, i, dest);
5697 // Restart the iteration and move to the
5698 // next level
5699 s.iterator->move(s.iterator, 0, UITER_START);
5700 level = UCOL_PSK_TERTIARY;
5701 } else {
5702 canUpdateState = FALSE;
5703 }
5704 break;
5705 }
5706
5707 if(!isShiftedCE(CE, LVT, &wasShifted)) {
5708 if(!isContinuation(CE) && ((CE & UCOL_PRIMARYMASK) != 0 || strength > UCOL_PRIMARY)) {
5709 // do the case level if we need to do it. We don't want to calculate
5710 // case level for primary ignorables if we have only primary strength and case level
5711 // otherwise we would break well formedness of CEs
5712 CE = (uint8_t)(CE & UCOL_BYTE_SIZE_MASK);
5713 caseBits = (uint8_t)(CE & 0xC0);
5714 // this copies the case level logic from the
5715 // sort key generation code
5716 if(CE != 0) {
5717 if (caseShift == 0) {
5718 dest[i++] = caseByte;
5719 caseShift = UCOL_CASE_SHIFT_START;
5720 caseByte = UCOL_CASE_BYTE_START;
5721 }
5722 if(coll->caseFirst == UCOL_UPPER_FIRST) {
5723 if((caseBits & 0xC0) == 0) {
5724 caseByte |= 1 << (--caseShift);
5725 } else {
5726 caseByte |= 0 << (--caseShift);
5727 /* second bit */
5728 if(caseShift == 0) {
5729 dest[i++] = caseByte;
5730 caseShift = UCOL_CASE_SHIFT_START;
5731 caseByte = UCOL_CASE_BYTE_START;
5732 }
5733 caseByte |= ((caseBits>>6)&1) << (--caseShift);
5734 }
5735 } else {
5736 if((caseBits & 0xC0) == 0) {
5737 caseByte |= 0 << (--caseShift);
5738 } else {
5739 caseByte |= 1 << (--caseShift);
5740 /* second bit */
5741 if(caseShift == 0) {
5742 dest[i++] = caseByte;
5743 caseShift = UCOL_CASE_SHIFT_START;
5744 caseByte = UCOL_CASE_BYTE_START;
5745 }
5746 caseByte |= ((caseBits>>7)&1) << (--caseShift);
5747 }
5748 }
5749 }
5750
5751 }
5752 }
5753 // Not sure this is correct for the case level - revisit
5754 if(uprv_numAvailableExpCEs(s)) {
5755 canUpdateState = FALSE;
5756 } else {
5757 canUpdateState = TRUE;
5758 }
5759 }
5760 } else {
5761 level = UCOL_PSK_TERTIARY;
5762 }
5763 /* fall through to next level */
5764 case UCOL_PSK_TERTIARY:
5765 if(strength >= UCOL_TERTIARY) {
5766 for(;;) {
5767 if(i == count) {
5768 goto saveState;
5769 }
5770 // We should save the state only if we
5771 // are sure that we are done with the
5772 // previous iterator state
5773 if(canUpdateState) {
5774 newState = s.iterator->getState(s.iterator);
5775 if(newState != UITER_NO_STATE) {
5776 iterState = newState;
5777 cces = 0;
5778 }
5779 }
5780 CE = ucol_IGetNextCE(coll, &s, status);
5781 cces++;
5782 if(CE==UCOL_NO_MORE_CES) {
5783 // Add the level separator
5784 terminatePSKLevel(level, maxLevel, i, dest);
5785 byteCountOrFrenchDone = 0;
5786 // Restart the iteration an move to the
5787 // second level
5788 s.iterator->move(s.iterator, 0, UITER_START);
5789 cces = 0;
5790 level = UCOL_PSK_QUATERNARY;
5791 break;
5792 }
5793 if(!isShiftedCE(CE, LVT, &wasShifted)) {
5794 notIsContinuation = !isContinuation(CE);
5795
5796 if(notIsContinuation) {
5797 CE = (uint8_t)(CE & UCOL_BYTE_SIZE_MASK);
5798 CE ^= coll->caseSwitch;
5799 CE &= coll->tertiaryMask;
5800 } else {
5801 CE = (uint8_t)((CE & UCOL_REMOVE_CONTINUATION));
5802 }
5803
5804 if(CE != 0) {
5805 dest[i++]=(uint8_t)CE;
5806 }
5807 }
5808 if(uprv_numAvailableExpCEs(s)) {
5809 canUpdateState = FALSE;
5810 } else {
5811 canUpdateState = TRUE;
5812 }
5813 }
5814 } else {
5815 // if we're not doing tertiary
5816 // skip to the end
5817 level = UCOL_PSK_NULL;
5818 }
5819 /* fall through to next level */
5820 case UCOL_PSK_QUATERNARY:
5821 if(strength >= UCOL_QUATERNARY) {
5822 for(;;) {
5823 if(i == count) {
5824 goto saveState;
5825 }
5826 // We should save the state only if we
5827 // are sure that we are done with the
5828 // previous iterator state
5829 if(canUpdateState) {
5830 newState = s.iterator->getState(s.iterator);
5831 if(newState != UITER_NO_STATE) {
5832 iterState = newState;
5833 cces = 0;
5834 }
5835 }
5836 CE = ucol_IGetNextCE(coll, &s, status);
5837 cces++;
5838 if(CE==UCOL_NO_MORE_CES) {
5839 // Add the level separator
5840 terminatePSKLevel(level, maxLevel, i, dest);
5841 //dest[i++] = UCOL_LEVELTERMINATOR;
5842 byteCountOrFrenchDone = 0;
5843 // Restart the iteration an move to the
5844 // second level
5845 s.iterator->move(s.iterator, 0, UITER_START);
5846 cces = 0;
5847 level = UCOL_PSK_QUIN;
5848 break;
5849 }
5850 if(CE==0)
5851 continue;
5852 if(isShiftedCE(CE, LVT, &wasShifted)) {
5853 CE >>= 16; /* get primary */
5854 if(CE != 0) {
5855 if(byteCountOrFrenchDone == 0) {
5856 dest[i++]=(uint8_t)(CE >> 8);
5857 } else {
5858 byteCountOrFrenchDone = 0;
5859 }
5860 if((CE &=0xff)!=0) {
5861 if(i==count) {
5862 /* overflow */
5863 byteCountOrFrenchDone = 1;
5864 goto saveState;
5865 }
5866 dest[i++]=(uint8_t)CE;
5867 }
5868 }
5869 } else {
5870 notIsContinuation = !isContinuation(CE);
5871 if(notIsContinuation) {
5872 if(s.flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we need to note it
5873 dest[i++] = UCOL_HIRAGANA_QUAD;
5874 } else {
5875 dest[i++] = 0xFF;
5876 }
5877 }
5878 }
5879 if(uprv_numAvailableExpCEs(s)) {
5880 canUpdateState = FALSE;
5881 } else {
5882 canUpdateState = TRUE;
5883 }
5884 }
5885 } else {
5886 // if we're not doing quaternary
5887 // skip to the end
5888 level = UCOL_PSK_NULL;
5889 }
5890 /* fall through to next level */
5891 case UCOL_PSK_QUIN:
5892 level = UCOL_PSK_IDENTICAL;
5893 /* fall through to next level */
5894 case UCOL_PSK_IDENTICAL:
5895 if(strength >= UCOL_IDENTICAL) {
5896 UChar32 first, second;
5897 int32_t bocsuBytesWritten = 0;
5898 // We always need to do identical on
5899 // the NFD form of the string.
5900 if(normIter == NULL) {
5901 // we arrived from the level below and
5902 // normalization was not turned on.
5903 // therefore, we need to make a fresh NFD iterator
5904 normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status);
5905 s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status);
5906 } else if(!doingIdenticalFromStart) {
5907 // there is an iterator, but we did some other levels.
5908 // therefore, we have a FCD iterator - need to make
5909 // a NFD one.
5910 // normIter being at the beginning does not guarantee
5911 // that the underlying iterator is at the beginning
5912 iter->move(iter, 0, UITER_START);
5913 s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status);
5914 }
5915 // At this point we have a NFD iterator that is positioned
5916 // in the right place
5917 if(U_FAILURE(*status)) {
5918 UTRACE_EXIT_STATUS(*status);
5919 return 0;
5920 }
5921 first = uiter_previous32(s.iterator);
5922 // maybe we're at the start of the string
5923 if(first == U_SENTINEL) {
5924 first = 0;
5925 } else {
5926 uiter_next32(s.iterator);
5927 }
5928
5929 j = 0;
5930 for(;;) {
5931 if(i == count) {
5932 if(j+1 < bocsuBytesWritten) {
5933 bocsuBytesUsed = j+1;
5934 }
5935 goto saveState;
5936 }
5937
5938 // On identical level, we will always save
5939 // the state if we reach this point, since
5940 // we don't depend on getNextCE for content
5941 // all the content is in our buffer and we
5942 // already either stored the full buffer OR
5943 // otherwise we won't arrive here.
5944 newState = s.iterator->getState(s.iterator);
5945 if(newState != UITER_NO_STATE) {
5946 iterState = newState;
5947 cces = 0;
5948 }
5949
5950 uint8_t buff[4];
5951 second = uiter_next32(s.iterator);
5952 cces++;
5953
5954 // end condition for identical level
5955 if(second == U_SENTINEL) {
5956 terminatePSKLevel(level, maxLevel, i, dest);
5957 level = UCOL_PSK_NULL;
5958 break;
5959 }
5960 bocsuBytesWritten = u_writeIdenticalLevelRunTwoChars(first, second, buff);
5961 first = second;
5962
5963 j = 0;
5964 if(bocsuBytesUsed != 0) {
5965 while(bocsuBytesUsed-->0) {
5966 j++;
5967 }
5968 }
5969
5970 while(i < count && j < bocsuBytesWritten) {
5971 dest[i++] = buff[j++];
5972 }
5973 }
5974
5975 } else {
5976 level = UCOL_PSK_NULL;
5977 }
5978 /* fall through to next level */
5979 case UCOL_PSK_NULL:
5980 j = i;
5981 while(j<count) {
5982 dest[j++]=0;
5983 }
5984 break;
5985 default:
5986 *status = U_INTERNAL_PROGRAM_ERROR;
5987 UTRACE_EXIT_STATUS(*status);
5988 return 0;
5989 }
5990
5991 saveState:
5992 // Now we need to return stuff. First we want to see whether we have
5993 // done everything for the current state of iterator.
5994 if(byteCountOrFrenchDone
5995 || canUpdateState == FALSE
5996 || (newState = s.iterator->getState(s.iterator)) == UITER_NO_STATE)
5997 {
5998 // Any of above mean that the previous transaction
5999 // wasn't finished and that we should store the
6000 // previous iterator state.
6001 state[0] = iterState;
6002 } else {
6003 // The transaction is complete. We will continue in the next iteration.
6004 state[0] = s.iterator->getState(s.iterator);
6005 cces = 0;
6006 }
6007 // Store the number of bocsu bytes written.
6008 if((bocsuBytesUsed & UCOL_PSK_BOCSU_BYTES_MASK) != bocsuBytesUsed) {
6009 *status = U_INDEX_OUTOFBOUNDS_ERROR;
6010 }
6011 state[1] = (bocsuBytesUsed & UCOL_PSK_BOCSU_BYTES_MASK) << UCOL_PSK_BOCSU_BYTES_SHIFT;
6012
6013 // Next we put in the level of comparison
6014 state[1] |= ((level & UCOL_PSK_LEVEL_MASK) << UCOL_PSK_LEVEL_SHIFT);
6015
6016 // If we are doing French, we need to store whether we have just finished the French level
6017 if(level == UCOL_PSK_SECONDARY && doingFrench) {
6018 state[1] |= (((int32_t)(state[0] == 0) & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK) << UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT);
6019 } else {
6020 state[1] |= ((byteCountOrFrenchDone & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK) << UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT);
6021 }
6022
6023 // Was the latest CE shifted
6024 if(wasShifted) {
6025 state[1] |= 1 << UCOL_PSK_WAS_SHIFTED_SHIFT;
6026 }
6027 // Check for cces overflow
6028 if((cces & UCOL_PSK_CONSUMED_CES_MASK) != cces) {
6029 *status = U_INDEX_OUTOFBOUNDS_ERROR;
6030 }
6031 // Store cces
6032 state[1] |= ((cces & UCOL_PSK_CONSUMED_CES_MASK) << UCOL_PSK_CONSUMED_CES_SHIFT);
6033
6034 // Check for French overflow
6035 if((usedFrench & UCOL_PSK_USED_FRENCH_MASK) != usedFrench) {
6036 *status = U_INDEX_OUTOFBOUNDS_ERROR;
6037 }
6038 // Store number of bytes written in the French secondary continuation sequence
6039 state[1] |= ((usedFrench & UCOL_PSK_USED_FRENCH_MASK) << UCOL_PSK_USED_FRENCH_SHIFT);
6040
6041
6042 // If we have used normalizing iterator, get rid of it
6043 if(normIter != NULL) {
6044 unorm_closeIter(normIter);
6045 }
6046
6047 /* To avoid memory leak, free the offset buffer if necessary. */
6048 ucol_freeOffsetBuffer(&s);
6049
6050 // Return number of meaningful sortkey bytes.
6051 UTRACE_DATA4(UTRACE_VERBOSE, "dest = %vb, state=%d %d",
6052 dest,i, state[0], state[1]);
6053 UTRACE_EXIT_VALUE(i);
6054 return i;
6055 }
6056
6057 /**
6058 * Produce a bound for a given sortkey and a number of levels.
6059 */
6060 U_CAPI int32_t U_EXPORT2
ucol_getBound(const uint8_t * source,int32_t sourceLength,UColBoundMode boundType,uint32_t noOfLevels,uint8_t * result,int32_t resultLength,UErrorCode * status)6061 ucol_getBound(const uint8_t *source,
6062 int32_t sourceLength,
6063 UColBoundMode boundType,
6064 uint32_t noOfLevels,
6065 uint8_t *result,
6066 int32_t resultLength,
6067 UErrorCode *status)
6068 {
6069 // consistency checks
6070 if(status == NULL || U_FAILURE(*status)) {
6071 return 0;
6072 }
6073 if(source == NULL) {
6074 *status = U_ILLEGAL_ARGUMENT_ERROR;
6075 return 0;
6076 }
6077
6078 int32_t sourceIndex = 0;
6079 // Scan the string until we skip enough of the key OR reach the end of the key
6080 do {
6081 sourceIndex++;
6082 if(source[sourceIndex] == UCOL_LEVELTERMINATOR) {
6083 noOfLevels--;
6084 }
6085 } while (noOfLevels > 0
6086 && (source[sourceIndex] != 0 || sourceIndex < sourceLength));
6087
6088 if((source[sourceIndex] == 0 || sourceIndex == sourceLength)
6089 && noOfLevels > 0) {
6090 *status = U_SORT_KEY_TOO_SHORT_WARNING;
6091 }
6092
6093
6094 // READ ME: this code assumes that the values for boundType
6095 // enum will not changes. They are set so that the enum value
6096 // corresponds to the number of extra bytes each bound type
6097 // needs.
6098 if(result != NULL && resultLength >= sourceIndex+boundType) {
6099 uprv_memcpy(result, source, sourceIndex);
6100 switch(boundType) {
6101 // Lower bound just gets terminated. No extra bytes
6102 case UCOL_BOUND_LOWER: // = 0
6103 break;
6104 // Upper bound needs one extra byte
6105 case UCOL_BOUND_UPPER: // = 1
6106 result[sourceIndex++] = 2;
6107 break;
6108 // Upper long bound needs two extra bytes
6109 case UCOL_BOUND_UPPER_LONG: // = 2
6110 result[sourceIndex++] = 0xFF;
6111 result[sourceIndex++] = 0xFF;
6112 break;
6113 default:
6114 *status = U_ILLEGAL_ARGUMENT_ERROR;
6115 return 0;
6116 }
6117 result[sourceIndex++] = 0;
6118
6119 return sourceIndex;
6120 } else {
6121 return sourceIndex+boundType+1;
6122 }
6123 }
6124
6125 /****************************************************************************/
6126 /* Following are the functions that deal with the properties of a collator */
6127 /* there are new APIs and some compatibility APIs */
6128 /****************************************************************************/
6129
6130 static inline void
ucol_addLatinOneEntry(UCollator * coll,UChar ch,uint32_t CE,int32_t * primShift,int32_t * secShift,int32_t * terShift)6131 ucol_addLatinOneEntry(UCollator *coll, UChar ch, uint32_t CE,
6132 int32_t *primShift, int32_t *secShift, int32_t *terShift)
6133 {
6134 uint8_t primary1 = 0, primary2 = 0, secondary = 0, tertiary = 0;
6135 UBool reverseSecondary = FALSE;
6136 UBool continuation = isContinuation(CE);
6137 if(!continuation) {
6138 tertiary = (uint8_t)((CE & coll->tertiaryMask));
6139 tertiary ^= coll->caseSwitch;
6140 reverseSecondary = TRUE;
6141 } else {
6142 tertiary = (uint8_t)((CE & UCOL_REMOVE_CONTINUATION));
6143 tertiary &= UCOL_REMOVE_CASE;
6144 reverseSecondary = FALSE;
6145 }
6146
6147 secondary = (uint8_t)((CE >>= 8) & UCOL_BYTE_SIZE_MASK);
6148 primary2 = (uint8_t)((CE >>= 8) & UCOL_BYTE_SIZE_MASK);
6149 primary1 = (uint8_t)(CE >> 8);
6150
6151 if(primary1 != 0) {
6152 if (coll->leadBytePermutationTable != NULL && !continuation) {
6153 primary1 = coll->leadBytePermutationTable[primary1];
6154 }
6155
6156 coll->latinOneCEs[ch] |= (primary1 << *primShift);
6157 *primShift -= 8;
6158 }
6159 if(primary2 != 0) {
6160 if(*primShift < 0) {
6161 coll->latinOneCEs[ch] = UCOL_BAIL_OUT_CE;
6162 coll->latinOneCEs[coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
6163 coll->latinOneCEs[2*coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
6164 return;
6165 }
6166 coll->latinOneCEs[ch] |= (primary2 << *primShift);
6167 *primShift -= 8;
6168 }
6169 if(secondary != 0) {
6170 if(reverseSecondary && coll->frenchCollation == UCOL_ON) { // reverse secondary
6171 coll->latinOneCEs[coll->latinOneTableLen+ch] >>= 8; // make space for secondary
6172 coll->latinOneCEs[coll->latinOneTableLen+ch] |= (secondary << 24);
6173 } else { // normal case
6174 coll->latinOneCEs[coll->latinOneTableLen+ch] |= (secondary << *secShift);
6175 }
6176 *secShift -= 8;
6177 }
6178 if(tertiary != 0) {
6179 coll->latinOneCEs[2*coll->latinOneTableLen+ch] |= (tertiary << *terShift);
6180 *terShift -= 8;
6181 }
6182 }
6183
6184 static inline UBool
ucol_resizeLatinOneTable(UCollator * coll,int32_t size,UErrorCode * status)6185 ucol_resizeLatinOneTable(UCollator *coll, int32_t size, UErrorCode *status) {
6186 uint32_t *newTable = (uint32_t *)uprv_malloc(size*sizeof(uint32_t)*3);
6187 if(newTable == NULL) {
6188 *status = U_MEMORY_ALLOCATION_ERROR;
6189 coll->latinOneFailed = TRUE;
6190 return FALSE;
6191 }
6192 int32_t sizeToCopy = ((size<coll->latinOneTableLen)?size:coll->latinOneTableLen)*sizeof(uint32_t);
6193 uprv_memset(newTable, 0, size*sizeof(uint32_t)*3);
6194 uprv_memcpy(newTable, coll->latinOneCEs, sizeToCopy);
6195 uprv_memcpy(newTable+size, coll->latinOneCEs+coll->latinOneTableLen, sizeToCopy);
6196 uprv_memcpy(newTable+2*size, coll->latinOneCEs+2*coll->latinOneTableLen, sizeToCopy);
6197 coll->latinOneTableLen = size;
6198 uprv_free(coll->latinOneCEs);
6199 coll->latinOneCEs = newTable;
6200 return TRUE;
6201 }
6202
6203 static UBool
ucol_setUpLatinOne(UCollator * coll,UErrorCode * status)6204 ucol_setUpLatinOne(UCollator *coll, UErrorCode *status) {
6205 UBool result = TRUE;
6206 if(coll->latinOneCEs == NULL) {
6207 coll->latinOneCEs = (uint32_t *)uprv_malloc(sizeof(uint32_t)*UCOL_LATINONETABLELEN*3);
6208 if(coll->latinOneCEs == NULL) {
6209 *status = U_MEMORY_ALLOCATION_ERROR;
6210 return FALSE;
6211 }
6212 coll->latinOneTableLen = UCOL_LATINONETABLELEN;
6213 }
6214 UChar ch = 0;
6215 UCollationElements *it = ucol_openElements(coll, &ch, 1, status);
6216 // Check for null pointer
6217 if (U_FAILURE(*status)) {
6218 ucol_closeElements(it);
6219 return FALSE;
6220 }
6221 uprv_memset(coll->latinOneCEs, 0, sizeof(uint32_t)*coll->latinOneTableLen*3);
6222
6223 int32_t primShift = 24, secShift = 24, terShift = 24;
6224 uint32_t CE = 0;
6225 int32_t contractionOffset = UCOL_ENDOFLATINONERANGE+1;
6226
6227 // TODO: make safe if you get more than you wanted...
6228 for(ch = 0; ch <= UCOL_ENDOFLATINONERANGE; ch++) {
6229 primShift = 24; secShift = 24; terShift = 24;
6230 if(ch < 0x100) {
6231 CE = coll->latinOneMapping[ch];
6232 } else {
6233 CE = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
6234 if(CE == UCOL_NOT_FOUND && coll->UCA) {
6235 CE = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);
6236 }
6237 }
6238 if(CE < UCOL_NOT_FOUND) {
6239 ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift);
6240 } else {
6241 switch (getCETag(CE)) {
6242 case EXPANSION_TAG:
6243 case DIGIT_TAG:
6244 ucol_setText(it, &ch, 1, status);
6245 while((int32_t)(CE = ucol_next(it, status)) != UCOL_NULLORDER) {
6246 if(primShift < 0 || secShift < 0 || terShift < 0) {
6247 coll->latinOneCEs[ch] = UCOL_BAIL_OUT_CE;
6248 coll->latinOneCEs[coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
6249 coll->latinOneCEs[2*coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
6250 break;
6251 }
6252 ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift);
6253 }
6254 break;
6255 case CONTRACTION_TAG:
6256 // here is the trick
6257 // F2 is contraction. We do something very similar to contractions
6258 // but have two indices, one in the real contraction table and the
6259 // other to where we stuffed things. This hopes that we don't have
6260 // many contractions (this should work for latin-1 tables).
6261 {
6262 if((CE & 0x00FFF000) != 0) {
6263 *status = U_UNSUPPORTED_ERROR;
6264 goto cleanup_after_failure;
6265 }
6266
6267 const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE);
6268
6269 CE |= (contractionOffset & 0xFFF) << 12; // insert the offset in latin-1 table
6270
6271 coll->latinOneCEs[ch] = CE;
6272 coll->latinOneCEs[coll->latinOneTableLen+ch] = CE;
6273 coll->latinOneCEs[2*coll->latinOneTableLen+ch] = CE;
6274
6275 // We're going to jump into contraction table, pick the elements
6276 // and use them
6277 do {
6278 CE = *(coll->contractionCEs +
6279 (UCharOffset - coll->contractionIndex));
6280 if(CE > UCOL_NOT_FOUND && getCETag(CE) == EXPANSION_TAG) {
6281 uint32_t size;
6282 uint32_t i; /* general counter */
6283 uint32_t *CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */
6284 size = getExpansionCount(CE);
6285 //CE = *CEOffset++;
6286 if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */
6287 for(i = 0; i<size; i++) {
6288 if(primShift < 0 || secShift < 0 || terShift < 0) {
6289 coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6290 coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6291 coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6292 break;
6293 }
6294 ucol_addLatinOneEntry(coll, (UChar)contractionOffset, *CEOffset++, &primShift, &secShift, &terShift);
6295 }
6296 } else { /* else, we do */
6297 while(*CEOffset != 0) {
6298 if(primShift < 0 || secShift < 0 || terShift < 0) {
6299 coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6300 coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6301 coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6302 break;
6303 }
6304 ucol_addLatinOneEntry(coll, (UChar)contractionOffset, *CEOffset++, &primShift, &secShift, &terShift);
6305 }
6306 }
6307 contractionOffset++;
6308 } else if(CE < UCOL_NOT_FOUND) {
6309 ucol_addLatinOneEntry(coll, (UChar)contractionOffset++, CE, &primShift, &secShift, &terShift);
6310 } else {
6311 coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6312 coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6313 coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6314 contractionOffset++;
6315 }
6316 UCharOffset++;
6317 primShift = 24; secShift = 24; terShift = 24;
6318 if(contractionOffset == coll->latinOneTableLen) { // we need to reallocate
6319 if(!ucol_resizeLatinOneTable(coll, 2*coll->latinOneTableLen, status)) {
6320 goto cleanup_after_failure;
6321 }
6322 }
6323 } while(*UCharOffset != 0xFFFF);
6324 }
6325 break;;
6326 case SPEC_PROC_TAG:
6327 {
6328 // 0xB7 is a precontext character defined in UCA5.1, a special
6329 // handle is implemeted in order to save LatinOne table for
6330 // most locales.
6331 if (ch==0xb7) {
6332 ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift);
6333 }
6334 else {
6335 goto cleanup_after_failure;
6336 }
6337 }
6338 break;
6339 default:
6340 goto cleanup_after_failure;
6341 }
6342 }
6343 }
6344 // compact table
6345 if(contractionOffset < coll->latinOneTableLen) {
6346 if(!ucol_resizeLatinOneTable(coll, contractionOffset, status)) {
6347 goto cleanup_after_failure;
6348 }
6349 }
6350 ucol_closeElements(it);
6351 return result;
6352
6353 cleanup_after_failure:
6354 // status should already be set before arriving here.
6355 coll->latinOneFailed = TRUE;
6356 ucol_closeElements(it);
6357 return FALSE;
6358 }
6359
ucol_updateInternalState(UCollator * coll,UErrorCode * status)6360 void ucol_updateInternalState(UCollator *coll, UErrorCode *status) {
6361 if(U_SUCCESS(*status)) {
6362 if(coll->caseFirst == UCOL_UPPER_FIRST) {
6363 coll->caseSwitch = UCOL_CASE_SWITCH;
6364 } else {
6365 coll->caseSwitch = UCOL_NO_CASE_SWITCH;
6366 }
6367
6368 if(coll->caseLevel == UCOL_ON || coll->caseFirst == UCOL_OFF) {
6369 coll->tertiaryMask = UCOL_REMOVE_CASE;
6370 coll->tertiaryCommon = UCOL_COMMON3_NORMAL;
6371 coll->tertiaryAddition = (int8_t)UCOL_FLAG_BIT_MASK_CASE_SW_OFF; /* Should be 0x80 */
6372 coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_OFF;
6373 coll->tertiaryBottom = UCOL_COMMON_BOT3;
6374 } else {
6375 coll->tertiaryMask = UCOL_KEEP_CASE;
6376 coll->tertiaryAddition = UCOL_FLAG_BIT_MASK_CASE_SW_ON;
6377 if(coll->caseFirst == UCOL_UPPER_FIRST) {
6378 coll->tertiaryCommon = UCOL_COMMON3_UPPERFIRST;
6379 coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_UPPER;
6380 coll->tertiaryBottom = UCOL_COMMON_BOTTOM3_CASE_SW_UPPER;
6381 } else {
6382 coll->tertiaryCommon = UCOL_COMMON3_NORMAL;
6383 coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_LOWER;
6384 coll->tertiaryBottom = UCOL_COMMON_BOTTOM3_CASE_SW_LOWER;
6385 }
6386 }
6387
6388 /* Set the compression values */
6389 uint8_t tertiaryTotal = (uint8_t)(coll->tertiaryTop - coll->tertiaryBottom - 1);
6390 coll->tertiaryTopCount = (uint8_t)(UCOL_PROPORTION3*tertiaryTotal); /* we multilply double with int, but need only int */
6391 coll->tertiaryBottomCount = (uint8_t)(tertiaryTotal - coll->tertiaryTopCount);
6392
6393 if(coll->caseLevel == UCOL_OFF && coll->strength == UCOL_TERTIARY
6394 && coll->frenchCollation == UCOL_OFF && coll->alternateHandling == UCOL_NON_IGNORABLE)
6395 {
6396 coll->sortKeyGen = ucol_calcSortKeySimpleTertiary;
6397 } else {
6398 coll->sortKeyGen = ucol_calcSortKey;
6399 }
6400 if(coll->caseLevel == UCOL_OFF && coll->strength <= UCOL_TERTIARY && coll->numericCollation == UCOL_OFF
6401 && coll->alternateHandling == UCOL_NON_IGNORABLE && !coll->latinOneFailed)
6402 {
6403 if(coll->latinOneCEs == NULL || coll->latinOneRegenTable) {
6404 if(ucol_setUpLatinOne(coll, status)) { // if we succeed in building latin1 table, we'll use it
6405 //fprintf(stderr, "F");
6406 coll->latinOneUse = TRUE;
6407 } else {
6408 coll->latinOneUse = FALSE;
6409 }
6410 if(*status == U_UNSUPPORTED_ERROR) {
6411 *status = U_ZERO_ERROR;
6412 }
6413 } else { // latin1Table exists and it doesn't need to be regenerated, just use it
6414 coll->latinOneUse = TRUE;
6415 }
6416 } else {
6417 coll->latinOneUse = FALSE;
6418 }
6419 }
6420 }
6421
6422 U_CAPI uint32_t U_EXPORT2
ucol_setVariableTop(UCollator * coll,const UChar * varTop,int32_t len,UErrorCode * status)6423 ucol_setVariableTop(UCollator *coll, const UChar *varTop, int32_t len, UErrorCode *status) {
6424 if(U_FAILURE(*status) || coll == NULL) {
6425 return 0;
6426 }
6427 if(len == -1) {
6428 len = u_strlen(varTop);
6429 }
6430 if(len == 0) {
6431 *status = U_ILLEGAL_ARGUMENT_ERROR;
6432 return 0;
6433 }
6434
6435 if(coll->delegate!=NULL) {
6436 return ((Collator*)coll->delegate)->setVariableTop(varTop, len, *status);
6437 }
6438
6439
6440 collIterate s;
6441 IInit_collIterate(coll, varTop, len, &s, status);
6442 if(U_FAILURE(*status)) {
6443 return 0;
6444 }
6445
6446 uint32_t CE = ucol_IGetNextCE(coll, &s, status);
6447
6448 /* here we check if we have consumed all characters */
6449 /* you can put in either one character or a contraction */
6450 /* you shouldn't put more... */
6451 if(s.pos != s.endp || CE == UCOL_NO_MORE_CES) {
6452 *status = U_CE_NOT_FOUND_ERROR;
6453 return 0;
6454 }
6455
6456 uint32_t nextCE = ucol_IGetNextCE(coll, &s, status);
6457
6458 if(isContinuation(nextCE) && (nextCE & UCOL_PRIMARYMASK) != 0) {
6459 *status = U_PRIMARY_TOO_LONG_ERROR;
6460 return 0;
6461 }
6462 if(coll->variableTopValue != (CE & UCOL_PRIMARYMASK)>>16) {
6463 coll->variableTopValueisDefault = FALSE;
6464 coll->variableTopValue = (CE & UCOL_PRIMARYMASK)>>16;
6465 }
6466
6467 /* To avoid memory leak, free the offset buffer if necessary. */
6468 ucol_freeOffsetBuffer(&s);
6469
6470 return CE & UCOL_PRIMARYMASK;
6471 }
6472
ucol_getVariableTop(const UCollator * coll,UErrorCode * status)6473 U_CAPI uint32_t U_EXPORT2 ucol_getVariableTop(const UCollator *coll, UErrorCode *status) {
6474 if(U_FAILURE(*status) || coll == NULL) {
6475 return 0;
6476 }
6477 if(coll->delegate!=NULL) {
6478 return ((const Collator*)coll->delegate)->getVariableTop(*status);
6479 }
6480 return coll->variableTopValue<<16;
6481 }
6482
6483 U_CAPI void U_EXPORT2
ucol_restoreVariableTop(UCollator * coll,const uint32_t varTop,UErrorCode * status)6484 ucol_restoreVariableTop(UCollator *coll, const uint32_t varTop, UErrorCode *status) {
6485 if(U_FAILURE(*status) || coll == NULL) {
6486 return;
6487 }
6488
6489 if(coll->variableTopValue != (varTop & UCOL_PRIMARYMASK)>>16) {
6490 coll->variableTopValueisDefault = FALSE;
6491 coll->variableTopValue = (varTop & UCOL_PRIMARYMASK)>>16;
6492 }
6493 }
6494 /* Attribute setter API */
6495 U_CAPI void U_EXPORT2
ucol_setAttribute(UCollator * coll,UColAttribute attr,UColAttributeValue value,UErrorCode * status)6496 ucol_setAttribute(UCollator *coll, UColAttribute attr, UColAttributeValue value, UErrorCode *status) {
6497 if(U_FAILURE(*status) || coll == NULL) {
6498 return;
6499 }
6500
6501 if(coll->delegate != NULL) {
6502 ((Collator*)coll->delegate)->setAttribute(attr,value,*status);
6503 return;
6504 }
6505
6506 UColAttributeValue oldFrench = coll->frenchCollation;
6507 UColAttributeValue oldCaseFirst = coll->caseFirst;
6508 switch(attr) {
6509 case UCOL_NUMERIC_COLLATION: /* sort substrings of digits as numbers */
6510 if(value == UCOL_ON) {
6511 coll->numericCollation = UCOL_ON;
6512 coll->numericCollationisDefault = FALSE;
6513 } else if (value == UCOL_OFF) {
6514 coll->numericCollation = UCOL_OFF;
6515 coll->numericCollationisDefault = FALSE;
6516 } else if (value == UCOL_DEFAULT) {
6517 coll->numericCollationisDefault = TRUE;
6518 coll->numericCollation = (UColAttributeValue)coll->options->numericCollation;
6519 } else {
6520 *status = U_ILLEGAL_ARGUMENT_ERROR;
6521 }
6522 break;
6523 case UCOL_HIRAGANA_QUATERNARY_MODE: /* special quaternary values for Hiragana */
6524 if(value == UCOL_ON || value == UCOL_OFF || value == UCOL_DEFAULT) {
6525 // This attribute is an implementation detail of the CLDR Japanese tailoring.
6526 // The implementation might change to use a different mechanism
6527 // to achieve the same Japanese sort order.
6528 // Since ICU 50, this attribute is not settable any more via API functions.
6529 } else {
6530 *status = U_ILLEGAL_ARGUMENT_ERROR;
6531 }
6532 break;
6533 case UCOL_FRENCH_COLLATION: /* attribute for direction of secondary weights*/
6534 if(value == UCOL_ON) {
6535 coll->frenchCollation = UCOL_ON;
6536 coll->frenchCollationisDefault = FALSE;
6537 } else if (value == UCOL_OFF) {
6538 coll->frenchCollation = UCOL_OFF;
6539 coll->frenchCollationisDefault = FALSE;
6540 } else if (value == UCOL_DEFAULT) {
6541 coll->frenchCollationisDefault = TRUE;
6542 coll->frenchCollation = (UColAttributeValue)coll->options->frenchCollation;
6543 } else {
6544 *status = U_ILLEGAL_ARGUMENT_ERROR ;
6545 }
6546 break;
6547 case UCOL_ALTERNATE_HANDLING: /* attribute for handling variable elements*/
6548 if(value == UCOL_SHIFTED) {
6549 coll->alternateHandling = UCOL_SHIFTED;
6550 coll->alternateHandlingisDefault = FALSE;
6551 } else if (value == UCOL_NON_IGNORABLE) {
6552 coll->alternateHandling = UCOL_NON_IGNORABLE;
6553 coll->alternateHandlingisDefault = FALSE;
6554 } else if (value == UCOL_DEFAULT) {
6555 coll->alternateHandlingisDefault = TRUE;
6556 coll->alternateHandling = (UColAttributeValue)coll->options->alternateHandling ;
6557 } else {
6558 *status = U_ILLEGAL_ARGUMENT_ERROR ;
6559 }
6560 break;
6561 case UCOL_CASE_FIRST: /* who goes first, lower case or uppercase */
6562 if(value == UCOL_LOWER_FIRST) {
6563 coll->caseFirst = UCOL_LOWER_FIRST;
6564 coll->caseFirstisDefault = FALSE;
6565 } else if (value == UCOL_UPPER_FIRST) {
6566 coll->caseFirst = UCOL_UPPER_FIRST;
6567 coll->caseFirstisDefault = FALSE;
6568 } else if (value == UCOL_OFF) {
6569 coll->caseFirst = UCOL_OFF;
6570 coll->caseFirstisDefault = FALSE;
6571 } else if (value == UCOL_DEFAULT) {
6572 coll->caseFirst = (UColAttributeValue)coll->options->caseFirst;
6573 coll->caseFirstisDefault = TRUE;
6574 } else {
6575 *status = U_ILLEGAL_ARGUMENT_ERROR ;
6576 }
6577 break;
6578 case UCOL_CASE_LEVEL: /* do we have an extra case level */
6579 if(value == UCOL_ON) {
6580 coll->caseLevel = UCOL_ON;
6581 coll->caseLevelisDefault = FALSE;
6582 } else if (value == UCOL_OFF) {
6583 coll->caseLevel = UCOL_OFF;
6584 coll->caseLevelisDefault = FALSE;
6585 } else if (value == UCOL_DEFAULT) {
6586 coll->caseLevel = (UColAttributeValue)coll->options->caseLevel;
6587 coll->caseLevelisDefault = TRUE;
6588 } else {
6589 *status = U_ILLEGAL_ARGUMENT_ERROR ;
6590 }
6591 break;
6592 case UCOL_NORMALIZATION_MODE: /* attribute for normalization */
6593 if(value == UCOL_ON) {
6594 coll->normalizationMode = UCOL_ON;
6595 coll->normalizationModeisDefault = FALSE;
6596 initializeFCD(status);
6597 } else if (value == UCOL_OFF) {
6598 coll->normalizationMode = UCOL_OFF;
6599 coll->normalizationModeisDefault = FALSE;
6600 } else if (value == UCOL_DEFAULT) {
6601 coll->normalizationModeisDefault = TRUE;
6602 coll->normalizationMode = (UColAttributeValue)coll->options->normalizationMode;
6603 if(coll->normalizationMode == UCOL_ON) {
6604 initializeFCD(status);
6605 }
6606 } else {
6607 *status = U_ILLEGAL_ARGUMENT_ERROR ;
6608 }
6609 break;
6610 case UCOL_STRENGTH: /* attribute for strength */
6611 if (value == UCOL_DEFAULT) {
6612 coll->strengthisDefault = TRUE;
6613 coll->strength = (UColAttributeValue)coll->options->strength;
6614 } else if (value <= UCOL_IDENTICAL) {
6615 coll->strengthisDefault = FALSE;
6616 coll->strength = value;
6617 } else {
6618 *status = U_ILLEGAL_ARGUMENT_ERROR ;
6619 }
6620 break;
6621 case UCOL_ATTRIBUTE_COUNT:
6622 default:
6623 *status = U_ILLEGAL_ARGUMENT_ERROR;
6624 break;
6625 }
6626 if(oldFrench != coll->frenchCollation || oldCaseFirst != coll->caseFirst) {
6627 coll->latinOneRegenTable = TRUE;
6628 } else {
6629 coll->latinOneRegenTable = FALSE;
6630 }
6631 ucol_updateInternalState(coll, status);
6632 }
6633
6634 U_CAPI UColAttributeValue U_EXPORT2
ucol_getAttribute(const UCollator * coll,UColAttribute attr,UErrorCode * status)6635 ucol_getAttribute(const UCollator *coll, UColAttribute attr, UErrorCode *status) {
6636 if(U_FAILURE(*status) || coll == NULL) {
6637 return UCOL_DEFAULT;
6638 }
6639
6640 if(coll->delegate != NULL) {
6641 return ((Collator*)coll->delegate)->getAttribute(attr,*status);
6642 }
6643
6644 switch(attr) {
6645 case UCOL_NUMERIC_COLLATION:
6646 return coll->numericCollation;
6647 case UCOL_HIRAGANA_QUATERNARY_MODE:
6648 return coll->hiraganaQ;
6649 case UCOL_FRENCH_COLLATION: /* attribute for direction of secondary weights*/
6650 return coll->frenchCollation;
6651 case UCOL_ALTERNATE_HANDLING: /* attribute for handling variable elements*/
6652 return coll->alternateHandling;
6653 case UCOL_CASE_FIRST: /* who goes first, lower case or uppercase */
6654 return coll->caseFirst;
6655 case UCOL_CASE_LEVEL: /* do we have an extra case level */
6656 return coll->caseLevel;
6657 case UCOL_NORMALIZATION_MODE: /* attribute for normalization */
6658 return coll->normalizationMode;
6659 case UCOL_STRENGTH: /* attribute for strength */
6660 return coll->strength;
6661 case UCOL_ATTRIBUTE_COUNT:
6662 default:
6663 *status = U_ILLEGAL_ARGUMENT_ERROR;
6664 break;
6665 }
6666 return UCOL_DEFAULT;
6667 }
6668
6669 U_CAPI void U_EXPORT2
ucol_setStrength(UCollator * coll,UCollationStrength strength)6670 ucol_setStrength( UCollator *coll,
6671 UCollationStrength strength)
6672 {
6673 UErrorCode status = U_ZERO_ERROR;
6674 ucol_setAttribute(coll, UCOL_STRENGTH, strength, &status);
6675 }
6676
6677 U_CAPI UCollationStrength U_EXPORT2
ucol_getStrength(const UCollator * coll)6678 ucol_getStrength(const UCollator *coll)
6679 {
6680 UErrorCode status = U_ZERO_ERROR;
6681 return ucol_getAttribute(coll, UCOL_STRENGTH, &status);
6682 }
6683
6684 U_CAPI int32_t U_EXPORT2
ucol_getReorderCodes(const UCollator * coll,int32_t * dest,int32_t destCapacity,UErrorCode * status)6685 ucol_getReorderCodes(const UCollator *coll,
6686 int32_t *dest,
6687 int32_t destCapacity,
6688 UErrorCode *status) {
6689 if (U_FAILURE(*status)) {
6690 return 0;
6691 }
6692
6693 if(coll->delegate!=NULL) {
6694 return ((const Collator*)coll->delegate)->getReorderCodes(dest, destCapacity, *status);
6695 }
6696
6697 if (destCapacity < 0 || (destCapacity > 0 && dest == NULL)) {
6698 *status = U_ILLEGAL_ARGUMENT_ERROR;
6699 return 0;
6700 }
6701
6702 #ifdef UCOL_DEBUG
6703 printf("coll->reorderCodesLength = %d\n", coll->reorderCodesLength);
6704 printf("coll->defaultReorderCodesLength = %d\n", coll->defaultReorderCodesLength);
6705 #endif
6706
6707 if (coll->reorderCodesLength > destCapacity) {
6708 *status = U_BUFFER_OVERFLOW_ERROR;
6709 return coll->reorderCodesLength;
6710 }
6711 for (int32_t i = 0; i < coll->reorderCodesLength; i++) {
6712 dest[i] = coll->reorderCodes[i];
6713 }
6714 return coll->reorderCodesLength;
6715 }
6716
6717 U_CAPI void U_EXPORT2
ucol_setReorderCodes(UCollator * coll,const int32_t * reorderCodes,int32_t reorderCodesLength,UErrorCode * status)6718 ucol_setReorderCodes(UCollator* coll,
6719 const int32_t* reorderCodes,
6720 int32_t reorderCodesLength,
6721 UErrorCode *status) {
6722 if (U_FAILURE(*status)) {
6723 return;
6724 }
6725
6726 if (reorderCodesLength < 0 || (reorderCodesLength > 0 && reorderCodes == NULL)) {
6727 *status = U_ILLEGAL_ARGUMENT_ERROR;
6728 return;
6729 }
6730
6731 if(coll->delegate!=NULL) {
6732 ((Collator*)coll->delegate)->setReorderCodes(reorderCodes, reorderCodesLength, *status);
6733 return;
6734 }
6735
6736 if (coll->reorderCodes != NULL && coll->freeReorderCodesOnClose == TRUE) {
6737 uprv_free(coll->reorderCodes);
6738 }
6739 coll->reorderCodes = NULL;
6740 coll->freeReorderCodesOnClose = FALSE;
6741 coll->reorderCodesLength = 0;
6742 if (reorderCodesLength == 0) {
6743 if (coll->leadBytePermutationTable != NULL && coll->freeLeadBytePermutationTableOnClose == TRUE) {
6744 uprv_free(coll->leadBytePermutationTable);
6745 }
6746 coll->leadBytePermutationTable = NULL;
6747 coll->freeLeadBytePermutationTableOnClose = FALSE;
6748 return;
6749 }
6750 coll->reorderCodes = (int32_t*) uprv_malloc(reorderCodesLength * sizeof(int32_t));
6751 if (coll->reorderCodes == NULL) {
6752 *status = U_MEMORY_ALLOCATION_ERROR;
6753 return;
6754 }
6755 coll->freeReorderCodesOnClose = TRUE;
6756 for (int32_t i = 0; i < reorderCodesLength; i++) {
6757 coll->reorderCodes[i] = reorderCodes[i];
6758 }
6759 coll->reorderCodesLength = reorderCodesLength;
6760 ucol_buildPermutationTable(coll, status);
6761 }
6762
6763 U_CAPI int32_t U_EXPORT2
ucol_getEquivalentReorderCodes(int32_t reorderCode,int32_t * dest,int32_t destCapacity,UErrorCode * pErrorCode)6764 ucol_getEquivalentReorderCodes(int32_t reorderCode,
6765 int32_t* dest,
6766 int32_t destCapacity,
6767 UErrorCode *pErrorCode) {
6768 bool equivalentCodesSet[USCRIPT_CODE_LIMIT];
6769 uint16_t leadBytes[256];
6770 int leadBytesCount;
6771 int leadByteIndex;
6772 int16_t reorderCodesForLeadByte[USCRIPT_CODE_LIMIT];
6773 int reorderCodesForLeadByteCount;
6774 int reorderCodeIndex;
6775
6776 int32_t equivalentCodesCount = 0;
6777 int setIndex;
6778
6779 if (U_FAILURE(*pErrorCode)) {
6780 return 0;
6781 }
6782
6783 if (destCapacity < 0 || (destCapacity > 0 && dest == NULL)) {
6784 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
6785 return 0;
6786 }
6787
6788 uprv_memset(equivalentCodesSet, 0, USCRIPT_CODE_LIMIT * sizeof(bool));
6789
6790 const UCollator* uca = ucol_initUCA(pErrorCode);
6791 if (U_FAILURE(*pErrorCode)) {
6792 return 0;
6793 }
6794 leadBytesCount = ucol_getLeadBytesForReorderCode(uca, reorderCode, leadBytes, 256);
6795 for (leadByteIndex = 0; leadByteIndex < leadBytesCount; leadByteIndex++) {
6796 reorderCodesForLeadByteCount = ucol_getReorderCodesForLeadByte(
6797 uca, leadBytes[leadByteIndex], reorderCodesForLeadByte, USCRIPT_CODE_LIMIT);
6798 for (reorderCodeIndex = 0; reorderCodeIndex < reorderCodesForLeadByteCount; reorderCodeIndex++) {
6799 equivalentCodesSet[reorderCodesForLeadByte[reorderCodeIndex]] = true;
6800 }
6801 }
6802
6803 for (setIndex = 0; setIndex < USCRIPT_CODE_LIMIT; setIndex++) {
6804 if (equivalentCodesSet[setIndex] == true) {
6805 equivalentCodesCount++;
6806 }
6807 }
6808
6809 if (destCapacity == 0) {
6810 return equivalentCodesCount;
6811 }
6812
6813 equivalentCodesCount = 0;
6814 for (setIndex = 0; setIndex < USCRIPT_CODE_LIMIT; setIndex++) {
6815 if (equivalentCodesSet[setIndex] == true) {
6816 dest[equivalentCodesCount++] = setIndex;
6817 if (equivalentCodesCount >= destCapacity) {
6818 break;
6819 }
6820 }
6821 }
6822 return equivalentCodesCount;
6823 }
6824
6825
6826 /****************************************************************************/
6827 /* Following are misc functions */
6828 /* there are new APIs and some compatibility APIs */
6829 /****************************************************************************/
6830
6831 U_CAPI void U_EXPORT2
ucol_getVersion(const UCollator * coll,UVersionInfo versionInfo)6832 ucol_getVersion(const UCollator* coll,
6833 UVersionInfo versionInfo)
6834 {
6835 if(coll->delegate!=NULL) {
6836 ((const Collator*)coll->delegate)->getVersion(versionInfo);
6837 return;
6838 }
6839 /* RunTime version */
6840 uint8_t rtVersion = UCOL_RUNTIME_VERSION;
6841 /* Builder version*/
6842 uint8_t bdVersion = coll->image->version[0];
6843
6844 /* Charset Version. Need to get the version from cnv files
6845 * makeconv should populate cnv files with version and
6846 * an api has to be provided in ucnv.h to obtain this version
6847 */
6848 uint8_t csVersion = 0;
6849
6850 /* combine the version info */
6851 uint16_t cmbVersion = (uint16_t)((rtVersion<<11) | (bdVersion<<6) | (csVersion));
6852
6853 /* Tailoring rules */
6854 versionInfo[0] = (uint8_t)(cmbVersion>>8);
6855 versionInfo[1] = (uint8_t)cmbVersion;
6856 versionInfo[2] = coll->image->version[1];
6857 if(coll->UCA) {
6858 /* Include the minor number when getting the UCA version. (major & 1f) << 3 | (minor & 7) */
6859 versionInfo[3] = (coll->UCA->image->UCAVersion[0] & 0x1f) << 3 | (coll->UCA->image->UCAVersion[1] & 0x07);
6860 } else {
6861 versionInfo[3] = 0;
6862 }
6863 }
6864
6865
6866 /* This internal API checks whether a character is tailored or not */
6867 U_CAPI UBool U_EXPORT2
ucol_isTailored(const UCollator * coll,const UChar u,UErrorCode * status)6868 ucol_isTailored(const UCollator *coll, const UChar u, UErrorCode *status) {
6869 if(U_FAILURE(*status) || coll == NULL || coll == coll->UCA) {
6870 return FALSE;
6871 }
6872
6873 uint32_t CE = UCOL_NOT_FOUND;
6874 const UChar *ContractionStart = NULL;
6875 if(u < 0x100) { /* latin-1 */
6876 CE = coll->latinOneMapping[u];
6877 if(coll->UCA && CE == coll->UCA->latinOneMapping[u]) {
6878 return FALSE;
6879 }
6880 } else { /* regular */
6881 CE = UTRIE_GET32_FROM_LEAD(&coll->mapping, u);
6882 }
6883
6884 if(isContraction(CE)) {
6885 ContractionStart = (UChar *)coll->image+getContractOffset(CE);
6886 CE = *(coll->contractionCEs + (ContractionStart- coll->contractionIndex));
6887 }
6888
6889 return (UBool)(CE != UCOL_NOT_FOUND);
6890 }
6891
6892
6893 /****************************************************************************/
6894 /* Following are the string compare functions */
6895 /* */
6896 /****************************************************************************/
6897
6898
6899 /* ucol_checkIdent internal function. Does byte level string compare. */
6900 /* Used by strcoll if strength == identical and strings */
6901 /* are otherwise equal. */
6902 /* */
6903 /* Comparison must be done on NFD normalized strings. */
6904 /* FCD is not good enough. */
6905
6906 static
ucol_checkIdent(collIterate * sColl,collIterate * tColl,UBool normalize,UErrorCode * status)6907 UCollationResult ucol_checkIdent(collIterate *sColl, collIterate *tColl, UBool normalize, UErrorCode *status)
6908 {
6909 // When we arrive here, we can have normal strings or UCharIterators. Currently they are both
6910 // of same type, but that doesn't really mean that it will stay that way.
6911 int32_t comparison;
6912
6913 if (sColl->flags & UCOL_USE_ITERATOR) {
6914 // The division for the array length may truncate the array size to
6915 // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
6916 // for all platforms anyway.
6917 UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
6918 UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
6919 UNormIterator *sNIt = NULL, *tNIt = NULL;
6920 sNIt = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), status);
6921 tNIt = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), status);
6922 sColl->iterator->move(sColl->iterator, 0, UITER_START);
6923 tColl->iterator->move(tColl->iterator, 0, UITER_START);
6924 UCharIterator *sIt = unorm_setIter(sNIt, sColl->iterator, UNORM_NFD, status);
6925 UCharIterator *tIt = unorm_setIter(tNIt, tColl->iterator, UNORM_NFD, status);
6926 comparison = u_strCompareIter(sIt, tIt, TRUE);
6927 unorm_closeIter(sNIt);
6928 unorm_closeIter(tNIt);
6929 } else {
6930 int32_t sLen = (sColl->flags & UCOL_ITER_HASLEN) ? (int32_t)(sColl->endp - sColl->string) : -1;
6931 const UChar *sBuf = sColl->string;
6932 int32_t tLen = (tColl->flags & UCOL_ITER_HASLEN) ? (int32_t)(tColl->endp - tColl->string) : -1;
6933 const UChar *tBuf = tColl->string;
6934
6935 if (normalize) {
6936 *status = U_ZERO_ERROR;
6937 // Note: We could use Normalizer::compare() or similar, but for short strings
6938 // which may not be in FCD it might be faster to just NFD them.
6939 // Note: spanQuickCheckYes() + normalizeSecondAndAppend() rather than
6940 // NFD'ing immediately might be faster for long strings,
6941 // but string comparison is usually done on relatively short strings.
6942 sColl->nfd->normalize(UnicodeString((sColl->flags & UCOL_ITER_HASLEN) == 0, sBuf, sLen),
6943 sColl->writableBuffer,
6944 *status);
6945 tColl->nfd->normalize(UnicodeString((tColl->flags & UCOL_ITER_HASLEN) == 0, tBuf, tLen),
6946 tColl->writableBuffer,
6947 *status);
6948 if(U_FAILURE(*status)) {
6949 return UCOL_LESS;
6950 }
6951 comparison = sColl->writableBuffer.compareCodePointOrder(tColl->writableBuffer);
6952 } else {
6953 comparison = u_strCompare(sBuf, sLen, tBuf, tLen, TRUE);
6954 }
6955 }
6956
6957 if (comparison < 0) {
6958 return UCOL_LESS;
6959 } else if (comparison == 0) {
6960 return UCOL_EQUAL;
6961 } else /* comparison > 0 */ {
6962 return UCOL_GREATER;
6963 }
6964 }
6965
/* CEBuf - A struct and some inline functions to handle the saving */
/* of CEs in a buffer within ucol_strcoll */

/* Number of CEs that fit in the array embedded in the struct. */
#define UCOL_CEBUF_SIZE 512
typedef struct ucol_CEBuf {
    uint32_t *buf;                          /* start of the current storage: localArray, or a heap block after expansion */
    uint32_t *endp;                         /* one past the last usable slot of buf */
    uint32_t *pos;                          /* next slot to be written */
    uint32_t localArray[UCOL_CEBUF_SIZE];   /* initial storage, used until the buffer is expanded */
} ucol_CEBuf;
6976
6977
6978 static
UCOL_INIT_CEBUF(ucol_CEBuf * b)6979 inline void UCOL_INIT_CEBUF(ucol_CEBuf *b) {
6980 (b)->buf = (b)->pos = (b)->localArray;
6981 (b)->endp = (b)->buf + UCOL_CEBUF_SIZE;
6982 }
6983
6984 static
ucol_CEBuf_Expand(ucol_CEBuf * b,collIterate * ci,UErrorCode * status)6985 void ucol_CEBuf_Expand(ucol_CEBuf *b, collIterate *ci, UErrorCode *status) {
6986 uint32_t oldSize;
6987 uint32_t newSize;
6988 uint32_t *newBuf;
6989
6990 ci->flags |= UCOL_ITER_ALLOCATED;
6991 oldSize = (uint32_t)(b->pos - b->buf);
6992 newSize = oldSize * 2;
6993 newBuf = (uint32_t *)uprv_malloc(newSize * sizeof(uint32_t));
6994 if(newBuf == NULL) {
6995 *status = U_MEMORY_ALLOCATION_ERROR;
6996 }
6997 else {
6998 uprv_memcpy(newBuf, b->buf, oldSize * sizeof(uint32_t));
6999 if (b->buf != b->localArray) {
7000 uprv_free(b->buf);
7001 }
7002 b->buf = newBuf;
7003 b->endp = b->buf + newSize;
7004 b->pos = b->buf + oldSize;
7005 }
7006 }
7007
7008 static
UCOL_CEBUF_PUT(ucol_CEBuf * b,uint32_t ce,collIterate * ci,UErrorCode * status)7009 inline void UCOL_CEBUF_PUT(ucol_CEBuf *b, uint32_t ce, collIterate *ci, UErrorCode *status) {
7010 if (b->pos == b->endp) {
7011 ucol_CEBuf_Expand(b, ci, status);
7012 }
7013 if (U_SUCCESS(*status)) {
7014 *(b)->pos++ = ce;
7015 }
7016 }
7017
7018 /* This is a trick string compare function that goes in and uses sortkeys to compare */
7019 /* It is used when compare gets in trouble and needs to bail out */
ucol_compareUsingSortKeys(collIterate * sColl,collIterate * tColl,UErrorCode * status)7020 static UCollationResult ucol_compareUsingSortKeys(collIterate *sColl,
7021 collIterate *tColl,
7022 UErrorCode *status)
7023 {
7024 uint8_t sourceKey[UCOL_MAX_BUFFER], targetKey[UCOL_MAX_BUFFER];
7025 uint8_t *sourceKeyP = sourceKey;
7026 uint8_t *targetKeyP = targetKey;
7027 int32_t sourceKeyLen = UCOL_MAX_BUFFER, targetKeyLen = UCOL_MAX_BUFFER;
7028 const UCollator *coll = sColl->coll;
7029 const UChar *source = NULL;
7030 const UChar *target = NULL;
7031 int32_t result = UCOL_EQUAL;
7032 UnicodeString sourceString, targetString;
7033 int32_t sourceLength;
7034 int32_t targetLength;
7035
7036 if(sColl->flags & UCOL_USE_ITERATOR) {
7037 sColl->iterator->move(sColl->iterator, 0, UITER_START);
7038 tColl->iterator->move(tColl->iterator, 0, UITER_START);
7039 UChar32 c;
7040 while((c=sColl->iterator->next(sColl->iterator))>=0) {
7041 sourceString.append((UChar)c);
7042 }
7043 while((c=tColl->iterator->next(tColl->iterator))>=0) {
7044 targetString.append((UChar)c);
7045 }
7046 source = sourceString.getBuffer();
7047 sourceLength = sourceString.length();
7048 target = targetString.getBuffer();
7049 targetLength = targetString.length();
7050 } else { // no iterators
7051 sourceLength = (sColl->flags&UCOL_ITER_HASLEN)?(int32_t)(sColl->endp-sColl->string):-1;
7052 targetLength = (tColl->flags&UCOL_ITER_HASLEN)?(int32_t)(tColl->endp-tColl->string):-1;
7053 source = sColl->string;
7054 target = tColl->string;
7055 }
7056
7057
7058
7059 sourceKeyLen = ucol_getSortKey(coll, source, sourceLength, sourceKeyP, sourceKeyLen);
7060 if(sourceKeyLen > UCOL_MAX_BUFFER) {
7061 sourceKeyP = (uint8_t*)uprv_malloc(sourceKeyLen*sizeof(uint8_t));
7062 if(sourceKeyP == NULL) {
7063 *status = U_MEMORY_ALLOCATION_ERROR;
7064 goto cleanup_and_do_compare;
7065 }
7066 sourceKeyLen = ucol_getSortKey(coll, source, sourceLength, sourceKeyP, sourceKeyLen);
7067 }
7068
7069 targetKeyLen = ucol_getSortKey(coll, target, targetLength, targetKeyP, targetKeyLen);
7070 if(targetKeyLen > UCOL_MAX_BUFFER) {
7071 targetKeyP = (uint8_t*)uprv_malloc(targetKeyLen*sizeof(uint8_t));
7072 if(targetKeyP == NULL) {
7073 *status = U_MEMORY_ALLOCATION_ERROR;
7074 goto cleanup_and_do_compare;
7075 }
7076 targetKeyLen = ucol_getSortKey(coll, target, targetLength, targetKeyP, targetKeyLen);
7077 }
7078
7079 result = uprv_strcmp((const char*)sourceKeyP, (const char*)targetKeyP);
7080
7081 cleanup_and_do_compare:
7082 if(sourceKeyP != NULL && sourceKeyP != sourceKey) {
7083 uprv_free(sourceKeyP);
7084 }
7085
7086 if(targetKeyP != NULL && targetKeyP != targetKey) {
7087 uprv_free(targetKeyP);
7088 }
7089
7090 if(result<0) {
7091 return UCOL_LESS;
7092 } else if(result>0) {
7093 return UCOL_GREATER;
7094 } else {
7095 return UCOL_EQUAL;
7096 }
7097 }
7098
7099
7100 static UCollationResult
ucol_strcollRegular(collIterate * sColl,collIterate * tColl,UErrorCode * status)7101 ucol_strcollRegular(collIterate *sColl, collIterate *tColl, UErrorCode *status)
7102 {
7103 U_ALIGN_CODE(16);
7104
7105 const UCollator *coll = sColl->coll;
7106
7107
7108 // setting up the collator parameters
7109 UColAttributeValue strength = coll->strength;
7110 UBool initialCheckSecTer = (strength >= UCOL_SECONDARY);
7111
7112 UBool checkSecTer = initialCheckSecTer;
7113 UBool checkTertiary = (strength >= UCOL_TERTIARY);
7114 UBool checkQuad = (strength >= UCOL_QUATERNARY);
7115 UBool checkIdent = (strength == UCOL_IDENTICAL);
7116 UBool checkCase = (coll->caseLevel == UCOL_ON);
7117 UBool isFrenchSec = (coll->frenchCollation == UCOL_ON) && checkSecTer;
7118 UBool shifted = (coll->alternateHandling == UCOL_SHIFTED);
7119 UBool qShifted = shifted && checkQuad;
7120 UBool doHiragana = (coll->hiraganaQ == UCOL_ON) && checkQuad;
7121
7122 if(doHiragana && shifted) {
7123 return (ucol_compareUsingSortKeys(sColl, tColl, status));
7124 }
7125 uint8_t caseSwitch = coll->caseSwitch;
7126 uint8_t tertiaryMask = coll->tertiaryMask;
7127
7128 // This is the lowest primary value that will not be ignored if shifted
7129 uint32_t LVT = (shifted)?(coll->variableTopValue<<16):0;
7130
7131 UCollationResult result = UCOL_EQUAL;
7132 UCollationResult hirResult = UCOL_EQUAL;
7133
7134 // Preparing the CE buffers. They will be filled during the primary phase
7135 ucol_CEBuf sCEs;
7136 ucol_CEBuf tCEs;
7137 UCOL_INIT_CEBUF(&sCEs);
7138 UCOL_INIT_CEBUF(&tCEs);
7139
7140 uint32_t secS = 0, secT = 0;
7141 uint32_t sOrder=0, tOrder=0;
7142
7143 // Non shifted primary processing is quite simple
7144 if(!shifted) {
7145 for(;;) {
7146 // We fetch CEs until we hit a non ignorable primary or end.
7147 uint32_t sPrimary;
7148 do {
7149 // We get the next CE
7150 sOrder = ucol_IGetNextCE(coll, sColl, status);
7151 // Stuff it in the buffer
7152 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
7153 // And keep just the primary part.
7154 sPrimary = sOrder & UCOL_PRIMARYMASK;
7155 } while(sPrimary == 0);
7156
7157 // see the comments on the above block
7158 uint32_t tPrimary;
7159 do {
7160 tOrder = ucol_IGetNextCE(coll, tColl, status);
7161 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
7162 tPrimary = tOrder & UCOL_PRIMARYMASK;
7163 } while(tPrimary == 0);
7164
7165 // if both primaries are the same
7166 if(sPrimary == tPrimary) {
7167 // and there are no more CEs, we advance to the next level
7168 if(sPrimary == UCOL_NO_MORE_CES_PRIMARY) {
7169 break;
7170 }
7171 if(doHiragana && hirResult == UCOL_EQUAL) {
7172 if((sColl->flags & UCOL_WAS_HIRAGANA) != (tColl->flags & UCOL_WAS_HIRAGANA)) {
7173 hirResult = ((sColl->flags & UCOL_WAS_HIRAGANA) > (tColl->flags & UCOL_WAS_HIRAGANA))
7174 ? UCOL_LESS:UCOL_GREATER;
7175 }
7176 }
7177 } else {
7178 // only need to check one for continuation
7179 // if one is then the other must be or the preceding CE would be a prefix of the other
7180 if (coll->leadBytePermutationTable != NULL && !isContinuation(sOrder)) {
7181 sPrimary = (coll->leadBytePermutationTable[sPrimary>>24] << 24) | (sPrimary & 0x00FFFFFF);
7182 tPrimary = (coll->leadBytePermutationTable[tPrimary>>24] << 24) | (tPrimary & 0x00FFFFFF);
7183 }
7184 // if two primaries are different, we are done
7185 result = (sPrimary < tPrimary) ? UCOL_LESS: UCOL_GREATER;
7186 goto commonReturn;
7187 }
7188 } // no primary difference... do the rest from the buffers
7189 } else { // shifted - do a slightly more complicated processing :)
7190 for(;;) {
7191 UBool sInShifted = FALSE;
7192 UBool tInShifted = FALSE;
7193 // This version of code can be refactored. However, it seems easier to understand this way.
7194 // Source loop. Same as the target loop.
7195 for(;;) {
7196 sOrder = ucol_IGetNextCE(coll, sColl, status);
7197 if(sOrder == UCOL_NO_MORE_CES) {
7198 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
7199 break;
7200 } else if(sOrder == 0 || (sInShifted && (sOrder & UCOL_PRIMARYMASK) == 0)) {
7201 /* UCA amendment - ignore ignorables that follow shifted code points */
7202 continue;
7203 } else if(isContinuation(sOrder)) {
7204 if((sOrder & UCOL_PRIMARYMASK) > 0) { /* There is primary value */
7205 if(sInShifted) {
7206 sOrder = (sOrder & UCOL_PRIMARYMASK) | 0xC0; /* preserve interesting continuation */
7207 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
7208 continue;
7209 } else {
7210 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
7211 break;
7212 }
7213 } else { /* Just lower level values */
7214 if(sInShifted) {
7215 continue;
7216 } else {
7217 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
7218 continue;
7219 }
7220 }
7221 } else { /* regular */
7222 if(coll->leadBytePermutationTable != NULL){
7223 sOrder = (coll->leadBytePermutationTable[sOrder>>24] << 24) | (sOrder & 0x00FFFFFF);
7224 }
7225 if((sOrder & UCOL_PRIMARYMASK) > LVT) {
7226 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
7227 break;
7228 } else {
7229 if((sOrder & UCOL_PRIMARYMASK) > 0) {
7230 sInShifted = TRUE;
7231 sOrder &= UCOL_PRIMARYMASK;
7232 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
7233 continue;
7234 } else {
7235 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
7236 sInShifted = FALSE;
7237 continue;
7238 }
7239 }
7240 }
7241 }
7242 sOrder &= UCOL_PRIMARYMASK;
7243 sInShifted = FALSE;
7244
7245 for(;;) {
7246 tOrder = ucol_IGetNextCE(coll, tColl, status);
7247 if(tOrder == UCOL_NO_MORE_CES) {
7248 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
7249 break;
7250 } else if(tOrder == 0 || (tInShifted && (tOrder & UCOL_PRIMARYMASK) == 0)) {
7251 /* UCA amendment - ignore ignorables that follow shifted code points */
7252 continue;
7253 } else if(isContinuation(tOrder)) {
7254 if((tOrder & UCOL_PRIMARYMASK) > 0) { /* There is primary value */
7255 if(tInShifted) {
7256 tOrder = (tOrder & UCOL_PRIMARYMASK) | 0xC0; /* preserve interesting continuation */
7257 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
7258 continue;
7259 } else {
7260 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
7261 break;
7262 }
7263 } else { /* Just lower level values */
7264 if(tInShifted) {
7265 continue;
7266 } else {
7267 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
7268 continue;
7269 }
7270 }
7271 } else { /* regular */
7272 if(coll->leadBytePermutationTable != NULL){
7273 tOrder = (coll->leadBytePermutationTable[tOrder>>24] << 24) | (tOrder & 0x00FFFFFF);
7274 }
7275 if((tOrder & UCOL_PRIMARYMASK) > LVT) {
7276 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
7277 break;
7278 } else {
7279 if((tOrder & UCOL_PRIMARYMASK) > 0) {
7280 tInShifted = TRUE;
7281 tOrder &= UCOL_PRIMARYMASK;
7282 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
7283 continue;
7284 } else {
7285 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
7286 tInShifted = FALSE;
7287 continue;
7288 }
7289 }
7290 }
7291 }
7292 tOrder &= UCOL_PRIMARYMASK;
7293 tInShifted = FALSE;
7294
7295 if(sOrder == tOrder) {
7296 /*
7297 if(doHiragana && hirResult == UCOL_EQUAL) {
7298 if((sColl.flags & UCOL_WAS_HIRAGANA) != (tColl.flags & UCOL_WAS_HIRAGANA)) {
7299 hirResult = ((sColl.flags & UCOL_WAS_HIRAGANA) > (tColl.flags & UCOL_WAS_HIRAGANA))
7300 ? UCOL_LESS:UCOL_GREATER;
7301 }
7302 }
7303 */
7304 if(sOrder == UCOL_NO_MORE_CES_PRIMARY) {
7305 break;
7306 } else {
7307 sOrder = 0;
7308 tOrder = 0;
7309 continue;
7310 }
7311 } else {
7312 result = (sOrder < tOrder) ? UCOL_LESS : UCOL_GREATER;
7313 goto commonReturn;
7314 }
7315 } /* no primary difference... do the rest from the buffers */
7316 }
7317
7318 /* now, we're gonna reexamine collected CEs */
7319 uint32_t *sCE;
7320 uint32_t *tCE;
7321
7322 /* This is the secondary level of comparison */
7323 if(checkSecTer) {
7324 if(!isFrenchSec) { /* normal */
7325 sCE = sCEs.buf;
7326 tCE = tCEs.buf;
7327 for(;;) {
7328 while (secS == 0) {
7329 secS = *(sCE++) & UCOL_SECONDARYMASK;
7330 }
7331
7332 while(secT == 0) {
7333 secT = *(tCE++) & UCOL_SECONDARYMASK;
7334 }
7335
7336 if(secS == secT) {
7337 if(secS == UCOL_NO_MORE_CES_SECONDARY) {
7338 break;
7339 } else {
7340 secS = 0; secT = 0;
7341 continue;
7342 }
7343 } else {
7344 result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
7345 goto commonReturn;
7346 }
7347 }
7348 } else { /* do the French */
7349 uint32_t *sCESave = NULL;
7350 uint32_t *tCESave = NULL;
7351 sCE = sCEs.pos-2; /* this could also be sCEs-- if needs to be optimized */
7352 tCE = tCEs.pos-2;
7353 for(;;) {
7354 while (secS == 0 && sCE >= sCEs.buf) {
7355 if(sCESave == NULL) {
7356 secS = *(sCE--);
7357 if(isContinuation(secS)) {
7358 while(isContinuation(secS = *(sCE--)))
7359 ;
7360 /* after this, secS has the start of continuation, and sCEs points before that */
7361 sCESave = sCE; /* we save it, so that we know where to come back AND that we need to go forward */
7362 sCE+=2; /* need to point to the first continuation CP */
7363 /* However, now you can just continue doing stuff */
7364 }
7365 } else {
7366 secS = *(sCE++);
7367 if(!isContinuation(secS)) { /* This means we have finished with this cont */
7368 sCE = sCESave; /* reset the pointer to before continuation */
7369 sCESave = NULL;
7370 secS = 0; /* Fetch a fresh CE before the continuation sequence. */
7371 continue;
7372 }
7373 }
7374 secS &= UCOL_SECONDARYMASK; /* remove the continuation bit */
7375 }
7376
7377 while(secT == 0 && tCE >= tCEs.buf) {
7378 if(tCESave == NULL) {
7379 secT = *(tCE--);
7380 if(isContinuation(secT)) {
7381 while(isContinuation(secT = *(tCE--)))
7382 ;
7383 /* after this, secS has the start of continuation, and sCEs points before that */
7384 tCESave = tCE; /* we save it, so that we know where to come back AND that we need to go forward */
7385 tCE+=2; /* need to point to the first continuation CP */
7386 /* However, now you can just continue doing stuff */
7387 }
7388 } else {
7389 secT = *(tCE++);
7390 if(!isContinuation(secT)) { /* This means we have finished with this cont */
7391 tCE = tCESave; /* reset the pointer to before continuation */
7392 tCESave = NULL;
7393 secT = 0; /* Fetch a fresh CE before the continuation sequence. */
7394 continue;
7395 }
7396 }
7397 secT &= UCOL_SECONDARYMASK; /* remove the continuation bit */
7398 }
7399
7400 if(secS == secT) {
7401 if(secS == UCOL_NO_MORE_CES_SECONDARY || (sCE < sCEs.buf && tCE < tCEs.buf)) {
7402 break;
7403 } else {
7404 secS = 0; secT = 0;
7405 continue;
7406 }
7407 } else {
7408 result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
7409 goto commonReturn;
7410 }
7411 }
7412 }
7413 }
7414
7415 /* doing the case bit */
7416 if(checkCase) {
7417 sCE = sCEs.buf;
7418 tCE = tCEs.buf;
7419 for(;;) {
7420 while((secS & UCOL_REMOVE_CASE) == 0) {
7421 if(!isContinuation(*sCE++)) {
7422 secS =*(sCE-1);
7423 if(((secS & UCOL_PRIMARYMASK) != 0) || strength > UCOL_PRIMARY) {
7424 // primary ignorables should not be considered on the case level when the strength is primary
7425 // otherwise, the CEs stop being well-formed
7426 secS &= UCOL_TERT_CASE_MASK;
7427 secS ^= caseSwitch;
7428 } else {
7429 secS = 0;
7430 }
7431 } else {
7432 secS = 0;
7433 }
7434 }
7435
7436 while((secT & UCOL_REMOVE_CASE) == 0) {
7437 if(!isContinuation(*tCE++)) {
7438 secT = *(tCE-1);
7439 if(((secT & UCOL_PRIMARYMASK) != 0) || strength > UCOL_PRIMARY) {
7440 // primary ignorables should not be considered on the case level when the strength is primary
7441 // otherwise, the CEs stop being well-formed
7442 secT &= UCOL_TERT_CASE_MASK;
7443 secT ^= caseSwitch;
7444 } else {
7445 secT = 0;
7446 }
7447 } else {
7448 secT = 0;
7449 }
7450 }
7451
7452 if((secS & UCOL_CASE_BIT_MASK) < (secT & UCOL_CASE_BIT_MASK)) {
7453 result = UCOL_LESS;
7454 goto commonReturn;
7455 } else if((secS & UCOL_CASE_BIT_MASK) > (secT & UCOL_CASE_BIT_MASK)) {
7456 result = UCOL_GREATER;
7457 goto commonReturn;
7458 }
7459
7460 if((secS & UCOL_REMOVE_CASE) == UCOL_NO_MORE_CES_TERTIARY || (secT & UCOL_REMOVE_CASE) == UCOL_NO_MORE_CES_TERTIARY ) {
7461 break;
7462 } else {
7463 secS = 0;
7464 secT = 0;
7465 }
7466 }
7467 }
7468
7469 /* Tertiary level */
7470 if(checkTertiary) {
7471 secS = 0;
7472 secT = 0;
7473 sCE = sCEs.buf;
7474 tCE = tCEs.buf;
7475 for(;;) {
7476 while((secS & UCOL_REMOVE_CASE) == 0) {
7477 sOrder = *sCE++;
7478 secS = sOrder & tertiaryMask;
7479 if(!isContinuation(sOrder)) {
7480 secS ^= caseSwitch;
7481 } else {
7482 secS &= UCOL_REMOVE_CASE;
7483 }
7484 }
7485
7486 while((secT & UCOL_REMOVE_CASE) == 0) {
7487 tOrder = *tCE++;
7488 secT = tOrder & tertiaryMask;
7489 if(!isContinuation(tOrder)) {
7490 secT ^= caseSwitch;
7491 } else {
7492 secT &= UCOL_REMOVE_CASE;
7493 }
7494 }
7495
7496 if(secS == secT) {
7497 if((secS & UCOL_REMOVE_CASE) == 1) {
7498 break;
7499 } else {
7500 secS = 0; secT = 0;
7501 continue;
7502 }
7503 } else {
7504 result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
7505 goto commonReturn;
7506 }
7507 }
7508 }
7509
7510
7511 if(qShifted /*checkQuad*/) {
7512 UBool sInShifted = TRUE;
7513 UBool tInShifted = TRUE;
7514 secS = 0;
7515 secT = 0;
7516 sCE = sCEs.buf;
7517 tCE = tCEs.buf;
7518 for(;;) {
7519 while((secS == 0 && secS != UCOL_NO_MORE_CES) || (isContinuation(secS) && !sInShifted)) {
7520 secS = *(sCE++);
7521 if(isContinuation(secS)) {
7522 if(!sInShifted) {
7523 continue;
7524 }
7525 } else if(secS > LVT || (secS & UCOL_PRIMARYMASK) == 0) { /* non continuation */
7526 secS = UCOL_PRIMARYMASK;
7527 sInShifted = FALSE;
7528 } else {
7529 sInShifted = TRUE;
7530 }
7531 }
7532 secS &= UCOL_PRIMARYMASK;
7533
7534
7535 while((secT == 0 && secT != UCOL_NO_MORE_CES) || (isContinuation(secT) && !tInShifted)) {
7536 secT = *(tCE++);
7537 if(isContinuation(secT)) {
7538 if(!tInShifted) {
7539 continue;
7540 }
7541 } else if(secT > LVT || (secT & UCOL_PRIMARYMASK) == 0) {
7542 secT = UCOL_PRIMARYMASK;
7543 tInShifted = FALSE;
7544 } else {
7545 tInShifted = TRUE;
7546 }
7547 }
7548 secT &= UCOL_PRIMARYMASK;
7549
7550 if(secS == secT) {
7551 if(secS == UCOL_NO_MORE_CES_PRIMARY) {
7552 break;
7553 } else {
7554 secS = 0; secT = 0;
7555 continue;
7556 }
7557 } else {
7558 result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
7559 goto commonReturn;
7560 }
7561 }
7562 } else if(doHiragana && hirResult != UCOL_EQUAL) {
7563 // If we're fine on quaternaries, we might be different
7564 // on Hiragana. This, however, might fail us in shifted.
7565 result = hirResult;
7566 goto commonReturn;
7567 }
7568
7569 /* For IDENTICAL comparisons, we use a bitwise character comparison */
7570 /* as a tiebreaker if all else is equal. */
7571 /* Getting here should be quite rare - strings are not identical - */
7572 /* that is checked first, but compared == through all other checks. */
7573 if(checkIdent)
7574 {
7575 //result = ucol_checkIdent(&sColl, &tColl, coll->normalizationMode == UCOL_ON);
7576 result = ucol_checkIdent(sColl, tColl, TRUE, status);
7577 }
7578
7579 commonReturn:
7580 if ((sColl->flags | tColl->flags) & UCOL_ITER_ALLOCATED) {
7581 if (sCEs.buf != sCEs.localArray ) {
7582 uprv_free(sCEs.buf);
7583 }
7584 if (tCEs.buf != tCEs.localArray ) {
7585 uprv_free(tCEs.buf);
7586 }
7587 }
7588
7589 return result;
7590 }
7591
7592 static UCollationResult
ucol_strcollRegular(const UCollator * coll,const UChar * source,int32_t sourceLength,const UChar * target,int32_t targetLength,UErrorCode * status)7593 ucol_strcollRegular(const UCollator *coll,
7594 const UChar *source, int32_t sourceLength,
7595 const UChar *target, int32_t targetLength,
7596 UErrorCode *status) {
7597 collIterate sColl, tColl;
7598 // Preparing the context objects for iterating over strings
7599 IInit_collIterate(coll, source, sourceLength, &sColl, status);
7600 IInit_collIterate(coll, target, targetLength, &tColl, status);
7601 if(U_FAILURE(*status)) {
7602 return UCOL_LESS;
7603 }
7604 return ucol_strcollRegular(&sColl, &tColl, status);
7605 }
7606
7607 static inline uint32_t
ucol_getLatinOneContraction(const UCollator * coll,int32_t strength,uint32_t CE,const UChar * s,int32_t * index,int32_t len)7608 ucol_getLatinOneContraction(const UCollator *coll, int32_t strength,
7609 uint32_t CE, const UChar *s, int32_t *index, int32_t len)
7610 {
7611 const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE&0xFFF);
7612 int32_t latinOneOffset = (CE & 0x00FFF000) >> 12;
7613 int32_t offset = 1;
7614 UChar schar = 0, tchar = 0;
7615
7616 for(;;) {
7617 if(len == -1) {
7618 if(s[*index] == 0) { // end of string
7619 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
7620 } else {
7621 schar = s[*index];
7622 }
7623 } else {
7624 if(*index == len) {
7625 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
7626 } else {
7627 schar = s[*index];
7628 }
7629 }
7630
7631 while(schar > (tchar = *(UCharOffset+offset))) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
7632 offset++;
7633 }
7634
7635 if (schar == tchar) {
7636 (*index)++;
7637 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset+offset]);
7638 }
7639 else
7640 {
7641 if(schar & 0xFF00 /*> UCOL_ENDOFLATIN1RANGE*/) {
7642 return UCOL_BAIL_OUT_CE;
7643 }
7644 // skip completely ignorables
7645 uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, schar);
7646 if(isZeroCE == 0) { // we have to ignore completely ignorables
7647 (*index)++;
7648 continue;
7649 }
7650
7651 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
7652 }
7653 }
7654 }
7655
7656
/**
 * This is a fast strcoll, geared towards text in Latin-1.
 * It supports contractions of size two, French secondaries
 * and case switching. You can use it with strengths primary
 * to tertiary. It does not support shifted and case level.
 * It relies on the table built by setupLatin1Table. If it
 * doesn't understand something, it will go to the regular
 * strcoll.
 *
 * The comparison runs up to three passes (primary, secondary, tertiary),
 * each reading CEs from its own latinOneTableLen-sized section of
 * coll->latinOneCEs. Within a pass, CEs are compared one byte at a time,
 * most significant byte first.
 *
 * @param coll    collator providing strength, frenchCollation and the latinOne tables
 * @param source  source string (UTF-16)
 * @param sLen    source length, or -1 if source is NUL-terminated
 * @param target  target string (UTF-16)
 * @param tLen    target length, or -1 if target is NUL-terminated
 * @param status  not examined here; only forwarded to ucol_strcollRegular()
 * @return UCOL_LESS, UCOL_EQUAL or UCOL_GREATER, or whatever
 *         ucol_strcollRegular() returns when the fast path gives up
 */
static UCollationResult
ucol_strcollUseLatin1( const UCollator    *coll,
              const UChar        *source,
              int32_t            sLen,
              const UChar        *target,
              int32_t            tLen,
              UErrorCode *status)
{
    U_ALIGN_CODE(16);
    int32_t strength = coll->strength;

    int32_t sIndex = 0, tIndex = 0;
    UChar sChar = 0, tChar = 0;
    // Current CE (or what is left of it after shifting out compared bytes);
    // 0 means "fetch the next CE".
    uint32_t sOrder=0, tOrder=0;

    UBool endOfSource = FALSE;

    // Start of the table section for the level currently being compared.
    uint32_t *elements = coll->latinOneCEs;

    UBool haveContractions = FALSE; // if we have contractions in our string
                                    // we cannot do French secondary

    // Do the primary level
    for(;;) {
        while(sOrder==0) { // this loop skips primary ignorables
            // sOrder=getNextlatinOneCE(source);
            if(sLen==-1) {   // handling zero terminated strings
                sChar=source[sIndex++];
                if(sChar==0) {
                    // Note: sIndex has already stepped past the terminator here.
                    endOfSource = TRUE;
                    break;
                }
            } else {        // handling strings with known length
                if(sIndex==sLen) {
                    endOfSource = TRUE;
                    break;
                }
                sChar=source[sIndex++];
            }
            if(sChar&0xFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32)
                //fprintf(stderr, "R");
                return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
            }
            sOrder = elements[sChar];
            if(sOrder >= UCOL_NOT_FOUND) { // if we got a special
                // specials can basically be either contractions or bail-out signs. If we get anything
                // else, we'll bail out anyway
                if(getCETag(sOrder) == CONTRACTION_TAG) {
                    sOrder = ucol_getLatinOneContraction(coll, UCOL_PRIMARY, sOrder, source, &sIndex, sLen);
                    haveContractions = TRUE; // if there are contractions, we cannot do French secondary
                    // However, if there are contractions in the table, but we always use just one char,
                    // we might be able to do French. This should be checked out.
                }
                // Still a special after contraction handling: let the regular code deal with it.
                if(sOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) {
                    //fprintf(stderr, "S");
                    return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
                }
            }
        }

        while(tOrder==0) {  // this loop skips primary ignorables
            // tOrder=getNextlatinOneCE(target);
            if(tLen==-1) {    // handling zero terminated strings
                tChar=target[tIndex++];
                if(tChar==0) {
                    if(endOfSource) { // this is different than source loop,
                        // as we already know that source loop is done here,
                        // so we can either finish the primary loop if both
                        // strings are done or announce the result if only
                        // target is done. Same below.
                        goto endOfPrimLoop;
                    } else {
                        return UCOL_GREATER;
                    }
                }
            } else {          // handling strings with known length
                if(tIndex==tLen) {
                    if(endOfSource) {
                        goto endOfPrimLoop;
                    } else {
                        return UCOL_GREATER;
                    }
                }
                tChar=target[tIndex++];
            }
            if(tChar&0xFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32)
                //fprintf(stderr, "R");
                return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
            }
            tOrder = elements[tChar];
            if(tOrder >= UCOL_NOT_FOUND) {
                // Handling specials, see the comments for source
                if(getCETag(tOrder) == CONTRACTION_TAG) {
                    tOrder = ucol_getLatinOneContraction(coll, UCOL_PRIMARY, tOrder, target, &tIndex, tLen);
                    haveContractions = TRUE;
                }
                if(tOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) {
                    //fprintf(stderr, "S");
                    return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
                }
            }
        }
        if(endOfSource) { // source is finished, but target is not, say the result.
            return UCOL_LESS;
        }

        if(sOrder == tOrder) { // if we have same CEs, we continue the loop
            sOrder = 0; tOrder = 0;
            continue;
        } else {
            // compare current top bytes
            if(((sOrder^tOrder)&0xFF000000)!=0) {
                // top bytes differ, return difference
                if(sOrder < tOrder) {
                    return UCOL_LESS;
                } else if(sOrder > tOrder) {
                    return UCOL_GREATER;
                }
                // instead of return (int32_t)(sOrder>>24)-(int32_t)(tOrder>>24);
                // since we must return enum value
            }

            // top bytes match, continue with following bytes
            // (a side whose remaining bytes are all zero becomes 0 and fetches its next CE)
            sOrder<<=8;
            tOrder<<=8;
        }
    }

endOfPrimLoop:
    // after primary loop, we definitely know the sizes of strings,
    // so we set it and use simpler loop for secondaries and tertiaries
    // NOTE(review): for NUL-terminated input the indexes were incremented when
    // the terminator was read, so these lengths include the terminator; the later
    // passes therefore also look up elements[0] -- presumably a 0 (ignorable)
    // entry; confirm against the table setup.
    sLen = sIndex; tLen = tIndex;
    if(strength >= UCOL_SECONDARY) {
        // adjust the table beginning
        elements += coll->latinOneTableLen;
        endOfSource = FALSE;

        if(coll->frenchCollation == UCOL_OFF) { // non French
            // This loop is a simplified copy of primary loop
            // at this point we know that whole strings are latin-1, so we don't
            // check for that. We also know that we only have contractions as
            // specials.
            sIndex = 0; tIndex = 0;
            for(;;) {
                while(sOrder==0) {
                    if(sIndex==sLen) {
                        endOfSource = TRUE;
                        break;
                    }
                    sChar=source[sIndex++];
                    sOrder = elements[sChar];
                    if(sOrder > UCOL_NOT_FOUND) {
                        sOrder = ucol_getLatinOneContraction(coll, UCOL_SECONDARY, sOrder, source, &sIndex, sLen);
                    }
                }

                while(tOrder==0) {
                    if(tIndex==tLen) {
                        if(endOfSource) {
                            // both strings exhausted: no secondary difference
                            goto endOfSecLoop;
                        } else {
                            return UCOL_GREATER;
                        }
                    }
                    tChar=target[tIndex++];
                    tOrder = elements[tChar];
                    if(tOrder > UCOL_NOT_FOUND) {
                        tOrder = ucol_getLatinOneContraction(coll, UCOL_SECONDARY, tOrder, target, &tIndex, tLen);
                    }
                }
                if(endOfSource) {
                    return UCOL_LESS;
                }

                if(sOrder == tOrder) {
                    sOrder = 0; tOrder = 0;
                    continue;
                } else {
                    // see primary loop for comments on this
                    if(((sOrder^tOrder)&0xFF000000)!=0) {
                        if(sOrder < tOrder) {
                            return UCOL_LESS;
                        } else if(sOrder > tOrder) {
                            return UCOL_GREATER;
                        }
                    }
                    sOrder<<=8;
                    tOrder<<=8;
                }
            }
        } else { // French
            if(haveContractions) { // if we have contractions, we have to bail out
                // since we don't really know how to handle them here
                // (sLen/tLen are the scanned lengths set after the primary loop)
                return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
            }
            // For French, we go backwards
            sIndex = sLen; tIndex = tLen;
            for(;;) {
                while(sOrder==0) {
                    if(sIndex==0) {
                        endOfSource = TRUE;
                        break;
                    }
                    sChar=source[--sIndex];
                    sOrder = elements[sChar];
                    // don't even look for contractions
                }

                while(tOrder==0) {
                    if(tIndex==0) {
                        if(endOfSource) {
                            goto endOfSecLoop;
                        } else {
                            return UCOL_GREATER;
                        }
                    }
                    tChar=target[--tIndex];
                    tOrder = elements[tChar];
                    // don't even look for contractions
                }
                if(endOfSource) {
                    return UCOL_LESS;
                }

                if(sOrder == tOrder) {
                    sOrder = 0; tOrder = 0;
                    continue;
                } else {
                    // see the primary loop for comments
                    if(((sOrder^tOrder)&0xFF000000)!=0) {
                        if(sOrder < tOrder) {
                            return UCOL_LESS;
                        } else if(sOrder > tOrder) {
                            return UCOL_GREATER;
                        }
                    }
                    sOrder<<=8;
                    tOrder<<=8;
                }
            }
        }
    }

endOfSecLoop:
    if(strength >= UCOL_TERTIARY) {
        // tertiary loop is the same as secondary (except no French)
        // advance once more to the tertiary section of the table
        elements += coll->latinOneTableLen;
        sIndex = 0; tIndex = 0;
        endOfSource = FALSE;
        for(;;) {
            while(sOrder==0) {
                if(sIndex==sLen) {
                    endOfSource = TRUE;
                    break;
                }
                sChar=source[sIndex++];
                sOrder = elements[sChar];
                if(sOrder > UCOL_NOT_FOUND) {
                    sOrder = ucol_getLatinOneContraction(coll, UCOL_TERTIARY, sOrder, source, &sIndex, sLen);
                }
            }
            while(tOrder==0) {
                if(tIndex==tLen) {
                    if(endOfSource) {
                        return UCOL_EQUAL; // if both strings are at the end, they are equal
                    } else {
                        return UCOL_GREATER;
                    }
                }
                tChar=target[tIndex++];
                tOrder = elements[tChar];
                if(tOrder > UCOL_NOT_FOUND) {
                    tOrder = ucol_getLatinOneContraction(coll, UCOL_TERTIARY, tOrder, target, &tIndex, tLen);
                }
            }
            if(endOfSource) {
                return UCOL_LESS;
            }
            if(sOrder == tOrder) {
                sOrder = 0; tOrder = 0;
                continue;
            } else {
                // same byte-by-byte comparison as in the primary loop
                if(((sOrder^tOrder)&0xff000000)!=0) {
                    if(sOrder < tOrder) {
                        return UCOL_LESS;
                    } else if(sOrder > tOrder) {
                        return UCOL_GREATER;
                    }
                }
                sOrder<<=8;
                tOrder<<=8;
            }
        }
    }
    // No difference found on any level that was compared.
    return UCOL_EQUAL;
}
7962
7963 /*
7964 Note: ucol_strcollUTF8 supports null terminated input. Calculating length of
7965 null terminated input string takes extra amount of CPU cycles.
7966 */
7967 static UCollationResult
ucol_strcollRegularUTF8(const UCollator * coll,const char * source,int32_t sourceLength,const char * target,int32_t targetLength,UErrorCode * status)7968 ucol_strcollRegularUTF8(
7969 const UCollator *coll,
7970 const char *source,
7971 int32_t sourceLength,
7972 const char *target,
7973 int32_t targetLength,
7974 UErrorCode *status)
7975 {
7976 UCharIterator src;
7977 UCharIterator tgt;
7978
7979 uiter_setUTF8(&src, source, sourceLength);
7980 uiter_setUTF8(&tgt, target, targetLength);
7981
7982 // Preparing the context objects for iterating over strings
7983 collIterate sColl, tColl;
7984 IInit_collIterate(coll, NULL, -1, &sColl, status);
7985 IInit_collIterate(coll, NULL, -1, &tColl, status);
7986 if(U_FAILURE(*status)) {
7987 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status)
7988 return UCOL_EQUAL;
7989 }
7990 // The division for the array length may truncate the array size to
7991 // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
7992 // for all platforms anyway.
7993 UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
7994 UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
7995 UNormIterator *sNormIter = NULL, *tNormIter = NULL;
7996
7997 sColl.iterator = &src;
7998 sColl.flags |= UCOL_USE_ITERATOR;
7999 tColl.flags |= UCOL_USE_ITERATOR;
8000 tColl.iterator = &tgt;
8001
8002 if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON) {
8003 sNormIter = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), status);
8004 sColl.iterator = unorm_setIter(sNormIter, &src, UNORM_FCD, status);
8005 sColl.flags &= ~UCOL_ITER_NORM;
8006
8007 tNormIter = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), status);
8008 tColl.iterator = unorm_setIter(tNormIter, &tgt, UNORM_FCD, status);
8009 tColl.flags &= ~UCOL_ITER_NORM;
8010 }
8011
8012 return ucol_strcollRegular(&sColl, &tColl, status);
8013 }
8014
8015 static inline uint32_t
ucol_getLatinOneContractionUTF8(const UCollator * coll,int32_t strength,uint32_t CE,const char * s,int32_t * index,int32_t len)8016 ucol_getLatinOneContractionUTF8(const UCollator *coll, int32_t strength,
8017 uint32_t CE, const char *s, int32_t *index, int32_t len)
8018 {
8019 const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE&0xFFF);
8020 int32_t latinOneOffset = (CE & 0x00FFF000) >> 12;
8021 int32_t offset = 1;
8022 UChar32 schar = 0, tchar = 0;
8023
8024 for(;;) {
8025 if (*index == len) {
8026 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
8027 }
8028 U8_GET_OR_FFFD((const uint8_t*)s, 0, *index, len, schar);
8029 if (len < 0 && schar == 0) {
8030 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
8031 }
8032
8033 while(schar > (tchar = *(UCharOffset+offset))) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
8034 offset++;
8035 }
8036
8037 if (schar == tchar) {
8038 U8_FWD_1(s, *index, len);
8039 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset+offset]);
8040 }
8041 else
8042 {
8043 if(schar & 0xFF00 /*> UCOL_ENDOFLATIN1RANGE*/) {
8044 return UCOL_BAIL_OUT_CE;
8045 }
8046 // skip completely ignorables
8047 uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, schar);
8048 if(isZeroCE == 0) { // we have to ignore completely ignorables
8049 U8_FWD_1(s, *index, len);
8050 continue;
8051 }
8052
8053 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
8054 }
8055 }
8056 }
8057
/**
 * UTF-8 counterpart of the Latin-1 fast strcoll.
 *
 * Compares source and target using the precomputed coll->latinOneCEs table,
 * in up to three passes (primary, secondary, tertiary), each pass reading
 * from its own latinOneTableLen-sized section of the table. Within a pass,
 * CEs are compared one byte at a time, most significant byte first.
 * Whenever the primary pass meets a code point above 0xFF or a special CE it
 * cannot resolve, the whole comparison is handed to ucol_strcollRegularUTF8().
 *
 * @param coll    collator providing strength, frenchCollation and the latinOne tables
 * @param source  source string (UTF-8)
 * @param sLen    source length in bytes, or negative if source is NUL-terminated
 * @param target  target string (UTF-8)
 * @param tLen    target length in bytes, or negative if target is NUL-terminated
 * @param status  not examined here; only forwarded to ucol_strcollRegularUTF8()
 * @return UCOL_LESS, UCOL_EQUAL or UCOL_GREATER, or whatever
 *         ucol_strcollRegularUTF8() returns when the fast path gives up
 */
static inline UCollationResult
ucol_strcollUseLatin1UTF8(
                const UCollator *coll,
                const char      *source,
                int32_t         sLen,
                const char      *target,
                int32_t         tLen,
                UErrorCode      *status)
{
    U_ALIGN_CODE(16);
    int32_t strength = coll->strength;

    int32_t sIndex = 0, tIndex = 0;
    UChar32 sChar = 0, tChar = 0;
    // Current CE (or what is left of it after shifting out compared bytes);
    // 0 means "fetch the next CE".
    uint32_t sOrder=0, tOrder=0;

    UBool endOfSource = FALSE;

    // Start of the table section for the level currently being compared.
    uint32_t *elements = coll->latinOneCEs;

    UBool haveContractions = FALSE; // if we have contractions in our string
                                    // we cannot do French secondary

    // Do the primary level
    for(;;) {
        while(sOrder==0) { // this loop skips primary ignorables
            // sOrder=getNextlatinOneCE(source);
            if (sIndex == sLen) {
                endOfSource = TRUE;
                break;
            }
            U8_NEXT_OR_FFFD(source, sIndex, sLen ,sChar);
            if (sLen < 0 && sChar == 0) {
                // NUL terminator of a string passed without a length:
                // remember the length now that it is known (sIndex is already past the NUL).
                endOfSource = TRUE;
                sLen = sIndex;
                break;
            }
            if(sChar&0xFFFFFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32)
                //fprintf(stderr, "R");
                return ucol_strcollRegularUTF8(coll, source, sLen, target, tLen, status);
            }
            sOrder = elements[sChar];
            if(sOrder >= UCOL_NOT_FOUND) { // if we got a special
                // specials can basically be either contractions or bail-out signs. If we get anything
                // else, we'll bail out anyway
                if(getCETag(sOrder) == CONTRACTION_TAG) {
                    sOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_PRIMARY, sOrder, source, &sIndex, sLen);
                    haveContractions = TRUE; // if there are contractions, we cannot do French secondary
                    // However, if there are contractions in the table, but we always use just one char,
                    // we might be able to do French. This should be checked out.
                }
                // Still a special after contraction handling: let the regular code deal with it.
                if(sOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) {
                    //fprintf(stderr, "S");
                    return ucol_strcollRegularUTF8(coll, source, sLen, target, tLen, status);
                }
            }
        }

        while(tOrder==0) {  // this loop skips primary ignorables
            // tOrder=getNextlatinOneCE(target);
            if (tIndex == tLen) {
                // Unlike the source loop, we already know whether the source is
                // done, so we can finish the primary level or announce the result.
                if(endOfSource) {
                    goto endOfPrimLoopU8;
                } else {
                    return UCOL_GREATER;
                }
            }
            U8_NEXT_OR_FFFD(target, tIndex, tLen, tChar);
            if (tLen < 0 && tChar == 0) {
                // NUL terminator of a target passed without a length
                if(endOfSource) {
                    tLen = tIndex;
                    goto endOfPrimLoopU8;
                } else {
                    return UCOL_GREATER;
                }
            }
            if(tChar&0xFFFFFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32)
                //fprintf(stderr, "R");
                return ucol_strcollRegularUTF8(coll, source, sLen, target, tLen, status);
            }
            tOrder = elements[tChar];
            if(tOrder >= UCOL_NOT_FOUND) {
                // Handling specials, see the comments for source
                if(getCETag(tOrder) == CONTRACTION_TAG) {
                    tOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_PRIMARY, tOrder, target, &tIndex, tLen);
                    haveContractions = TRUE;
                }
                if(tOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) {
                    //fprintf(stderr, "S");
                    return ucol_strcollRegularUTF8(coll, source, sLen, target, tLen, status);
                }
            }
        }
        if(endOfSource) { // source is finished, but target is not, say the result.
            return UCOL_LESS;
        }

        if(sOrder == tOrder) { // if we have same CEs, we continue the loop
            sOrder = 0; tOrder = 0;
            continue;
        } else {
            // compare current top bytes
            if(((sOrder^tOrder)&0xFF000000)!=0) {
                // top bytes differ, return difference
                if(sOrder < tOrder) {
                    return UCOL_LESS;
                } else if(sOrder > tOrder) {
                    return UCOL_GREATER;
                }
                // instead of return (int32_t)(sOrder>>24)-(int32_t)(tOrder>>24);
                // since we must return enum value
            }

            // top bytes match, continue with following bytes
            // (a side whose remaining bytes are all zero becomes 0 and fetches its next CE)
            sOrder<<=8;
            tOrder<<=8;
        }
    }

endOfPrimLoopU8:
    // after primary loop, we definitely know the sizes of strings,
    // so we set it and use simpler loop for secondaries and tertiaries
    // NOTE(review): for NUL-terminated input the indexes have already moved past
    // the terminator, so these byte lengths include it; the later passes
    // therefore also look up elements[0] -- presumably a 0 (ignorable) entry;
    // confirm against the table setup.
    sLen = sIndex; tLen = tIndex;
    if(strength >= UCOL_SECONDARY) {
        // adjust the table beginning
        elements += coll->latinOneTableLen;
        endOfSource = FALSE;

        if(coll->frenchCollation == UCOL_OFF) { // non French
            // This loop is a simplified copy of primary loop
            // at this point we know that whole strings are latin-1, so we don't
            // check for that. We also know that we only have contractions as
            // specials.
            sIndex = 0; tIndex = 0;
            for(;;) {
                while(sOrder==0) {
                    if(sIndex==sLen) {
                        endOfSource = TRUE;
                        break;
                    }
                    U_ASSERT(sLen >= 0);
                    U8_NEXT_OR_FFFD(source, sIndex, sLen, sChar);
                    U_ASSERT(sChar >= 0 && sChar <= 0xFF);
                    sOrder = elements[sChar];
                    if(sOrder > UCOL_NOT_FOUND) {
                        sOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_SECONDARY, sOrder, source, &sIndex, sLen);
                    }
                }

                while(tOrder==0) {
                    if(tIndex==tLen) {
                        if(endOfSource) {
                            // both strings exhausted: no secondary difference
                            goto endOfSecLoopU8;
                        } else {
                            return UCOL_GREATER;
                        }
                    }
                    U_ASSERT(tLen >= 0);
                    U8_NEXT_OR_FFFD(target, tIndex, tLen, tChar);
                    U_ASSERT(tChar >= 0 && tChar <= 0xFF);
                    tOrder = elements[tChar];
                    if(tOrder > UCOL_NOT_FOUND) {
                        tOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_SECONDARY, tOrder, target, &tIndex, tLen);
                    }
                }
                if(endOfSource) {
                    return UCOL_LESS;
                }

                if(sOrder == tOrder) {
                    sOrder = 0; tOrder = 0;
                    continue;
                } else {
                    // see primary loop for comments on this
                    if(((sOrder^tOrder)&0xFF000000)!=0) {
                        if(sOrder < tOrder) {
                            return UCOL_LESS;
                        } else if(sOrder > tOrder) {
                            return UCOL_GREATER;
                        }
                    }
                    sOrder<<=8;
                    tOrder<<=8;
                }
            }
        } else { // French
            if(haveContractions) { // if we have contractions, we have to bail out
                // since we don't really know how to handle them here
                // (sLen/tLen are the scanned byte lengths set after the primary loop)
                return ucol_strcollRegularUTF8(coll, source, sLen, target, tLen, status);
            }
            // For French, we go backwards
            sIndex = sLen; tIndex = tLen;
            for(;;) {
                while(sOrder==0) {
                    if(sIndex==0) {
                        endOfSource = TRUE;
                        break;
                    }
                    U8_PREV_OR_FFFD(source, 0, sIndex, sChar);
                    U_ASSERT(sChar >= 0 && sChar <= 0xFF);
                    sOrder = elements[sChar];
                    // don't even look for contractions
                }

                while(tOrder==0) {
                    if(tIndex==0) {
                        if(endOfSource) {
                            goto endOfSecLoopU8;
                        } else {
                            return UCOL_GREATER;
                        }
                    }
                    U8_PREV_OR_FFFD(target, 0, tIndex, tChar);
                    U_ASSERT(tChar >= 0 && tChar <= 0xFF);
                    tOrder = elements[tChar];
                    // don't even look for contractions
                }
                if(endOfSource) {
                    return UCOL_LESS;
                }

                if(sOrder == tOrder) {
                    sOrder = 0; tOrder = 0;
                    continue;
                } else {
                    // see the primary loop for comments
                    if(((sOrder^tOrder)&0xFF000000)!=0) {
                        if(sOrder < tOrder) {
                            return UCOL_LESS;
                        } else if(sOrder > tOrder) {
                            return UCOL_GREATER;
                        }
                    }
                    sOrder<<=8;
                    tOrder<<=8;
                }
            }
        }
    }

endOfSecLoopU8:
    if(strength >= UCOL_TERTIARY) {
        // tertiary loop is the same as secondary (except no French)
        // advance once more to the tertiary section of the table
        elements += coll->latinOneTableLen;
        sIndex = 0; tIndex = 0;
        endOfSource = FALSE;
        for(;;) {
            while(sOrder==0) {
                if(sIndex==sLen) {
                    endOfSource = TRUE;
                    break;
                }
                U_ASSERT(sLen >= 0);
                U8_NEXT_OR_FFFD(source, sIndex, sLen, sChar);
                U_ASSERT(sChar >= 0 && sChar <= 0xFF);
                sOrder = elements[sChar];
                if(sOrder > UCOL_NOT_FOUND) {
                    sOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_TERTIARY, sOrder, source, &sIndex, sLen);
                }
            }
            while(tOrder==0) {
                if(tIndex==tLen) {
                    if(endOfSource) {
                        return UCOL_EQUAL; // if both strings are at the end, they are equal
                    } else {
                        return UCOL_GREATER;
                    }
                }
                U_ASSERT(tLen >= 0);
                U8_NEXT_OR_FFFD(target, tIndex, tLen, tChar);
                U_ASSERT(tChar >= 0 && tChar <= 0xFF);
                tOrder = elements[tChar];
                if(tOrder > UCOL_NOT_FOUND) {
                    tOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_TERTIARY, tOrder, target, &tIndex, tLen);
                }
            }
            if(endOfSource) {
                return UCOL_LESS;
            }
            if(sOrder == tOrder) {
                sOrder = 0; tOrder = 0;
                continue;
            } else {
                // same byte-by-byte comparison as in the primary loop
                if(((sOrder^tOrder)&0xff000000)!=0) {
                    if(sOrder < tOrder) {
                        return UCOL_LESS;
                    } else if(sOrder > tOrder) {
                        return UCOL_GREATER;
                    }
                }
                sOrder<<=8;
                tOrder<<=8;
            }
        }
    }
    // No difference found on any level that was compared.
    return UCOL_EQUAL;
}
8355
8356 U_CAPI UCollationResult U_EXPORT2
ucol_strcollIter(const UCollator * coll,UCharIterator * sIter,UCharIterator * tIter,UErrorCode * status)8357 ucol_strcollIter( const UCollator *coll,
8358 UCharIterator *sIter,
8359 UCharIterator *tIter,
8360 UErrorCode *status)
8361 {
8362 if(!status || U_FAILURE(*status)) {
8363 return UCOL_EQUAL;
8364 }
8365
8366 UTRACE_ENTRY(UTRACE_UCOL_STRCOLLITER);
8367 UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, sIter=%p, tIter=%p", coll, sIter, tIter);
8368
8369 if (sIter == tIter) {
8370 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status)
8371 return UCOL_EQUAL;
8372 }
8373 if(sIter == NULL || tIter == NULL || coll == NULL) {
8374 *status = U_ILLEGAL_ARGUMENT_ERROR;
8375 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status)
8376 return UCOL_EQUAL;
8377 }
8378
8379 UCollationResult result = UCOL_EQUAL;
8380
8381 // Preparing the context objects for iterating over strings
8382 collIterate sColl, tColl;
8383 IInit_collIterate(coll, NULL, -1, &sColl, status);
8384 IInit_collIterate(coll, NULL, -1, &tColl, status);
8385 if(U_FAILURE(*status)) {
8386 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status)
8387 return UCOL_EQUAL;
8388 }
8389 // The division for the array length may truncate the array size to
8390 // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
8391 // for all platforms anyway.
8392 UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
8393 UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
8394 UNormIterator *sNormIter = NULL, *tNormIter = NULL;
8395
8396 sColl.iterator = sIter;
8397 sColl.flags |= UCOL_USE_ITERATOR;
8398 tColl.flags |= UCOL_USE_ITERATOR;
8399 tColl.iterator = tIter;
8400
8401 if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON) {
8402 sNormIter = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), status);
8403 sColl.iterator = unorm_setIter(sNormIter, sIter, UNORM_FCD, status);
8404 sColl.flags &= ~UCOL_ITER_NORM;
8405
8406 tNormIter = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), status);
8407 tColl.iterator = unorm_setIter(tNormIter, tIter, UNORM_FCD, status);
8408 tColl.flags &= ~UCOL_ITER_NORM;
8409 }
8410
8411 UChar32 sChar = U_SENTINEL, tChar = U_SENTINEL;
8412
8413 while((sChar = sColl.iterator->next(sColl.iterator)) ==
8414 (tChar = tColl.iterator->next(tColl.iterator))) {
8415 if(sChar == U_SENTINEL) {
8416 result = UCOL_EQUAL;
8417 goto end_compare;
8418 }
8419 }
8420
8421 if(sChar == U_SENTINEL) {
8422 tChar = tColl.iterator->previous(tColl.iterator);
8423 }
8424
8425 if(tChar == U_SENTINEL) {
8426 sChar = sColl.iterator->previous(sColl.iterator);
8427 }
8428
8429 sChar = sColl.iterator->previous(sColl.iterator);
8430 tChar = tColl.iterator->previous(tColl.iterator);
8431
8432 if (ucol_unsafeCP((UChar)sChar, coll) || ucol_unsafeCP((UChar)tChar, coll))
8433 {
8434 // We are stopped in the middle of a contraction.
8435 // Scan backwards through the == part of the string looking for the start of the contraction.
8436 // It doesn't matter which string we scan, since they are the same in this region.
8437 do
8438 {
8439 sChar = sColl.iterator->previous(sColl.iterator);
8440 tChar = tColl.iterator->previous(tColl.iterator);
8441 }
8442 while (sChar != U_SENTINEL && ucol_unsafeCP((UChar)sChar, coll));
8443 }
8444
8445
8446 if(U_SUCCESS(*status)) {
8447 result = ucol_strcollRegular(&sColl, &tColl, status);
8448 }
8449
8450 end_compare:
8451 if(sNormIter || tNormIter) {
8452 unorm_closeIter(sNormIter);
8453 unorm_closeIter(tNormIter);
8454 }
8455
8456 UTRACE_EXIT_VALUE_STATUS(result, *status)
8457 return result;
8458 }
8459
8460
8461 /* */
8462 /* ucol_strcoll Main public API string comparison function */
8463 /* */
8464 U_CAPI UCollationResult U_EXPORT2
ucol_strcoll(const UCollator * coll,const UChar * source,int32_t sourceLength,const UChar * target,int32_t targetLength)8465 ucol_strcoll( const UCollator *coll,
8466 const UChar *source,
8467 int32_t sourceLength,
8468 const UChar *target,
8469 int32_t targetLength)
8470 {
8471 U_ALIGN_CODE(16);
8472
8473 UTRACE_ENTRY(UTRACE_UCOL_STRCOLL);
8474 if (UTRACE_LEVEL(UTRACE_VERBOSE)) {
8475 UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source=%p, target=%p", coll, source, target);
8476 UTRACE_DATA2(UTRACE_VERBOSE, "source string = %vh ", source, sourceLength);
8477 UTRACE_DATA2(UTRACE_VERBOSE, "target string = %vh ", target, targetLength);
8478 }
8479
8480 if((source == NULL && sourceLength != 0) || (target == NULL && targetLength != 0)) {
8481 // do not crash, but return. Should have
8482 // status argument to return error.
8483 UTRACE_EXIT_VALUE(UCOL_EQUAL);
8484 return UCOL_EQUAL;
8485 }
8486
8487 /* Quick check if source and target are same strings. */
8488 /* They should either both be NULL terminated or the explicit length should be set on both. */
8489 if (source==target && sourceLength==targetLength) {
8490 UTRACE_EXIT_VALUE(UCOL_EQUAL);
8491 return UCOL_EQUAL;
8492 }
8493
8494 if(coll->delegate != NULL) {
8495 UErrorCode status = U_ZERO_ERROR;
8496 return ((const Collator*)coll->delegate)->compare(source,sourceLength,target,targetLength, status);
8497 }
8498
8499 /* Scan the strings. Find: */
8500 /* The length of any leading portion that is equal */
8501 /* Whether they are exactly equal. (in which case we just return) */
8502 const UChar *pSrc = source;
8503 const UChar *pTarg = target;
8504 int32_t equalLength;
8505
8506 if (sourceLength == -1 && targetLength == -1) {
8507 // Both strings are null terminated.
8508 // Scan through any leading equal portion.
8509 while (*pSrc == *pTarg && *pSrc != 0) {
8510 pSrc++;
8511 pTarg++;
8512 }
8513 if (*pSrc == 0 && *pTarg == 0) {
8514 UTRACE_EXIT_VALUE(UCOL_EQUAL);
8515 return UCOL_EQUAL;
8516 }
8517 equalLength = (int32_t)(pSrc - source);
8518 }
8519 else
8520 {
8521 // One or both strings has an explicit length.
8522 const UChar *pSrcEnd = source + sourceLength;
8523 const UChar *pTargEnd = target + targetLength;
8524
8525 // Scan while the strings are bitwise ==, or until one is exhausted.
8526 for (;;) {
8527 if (pSrc == pSrcEnd || pTarg == pTargEnd) {
8528 break;
8529 }
8530 if ((*pSrc == 0 && sourceLength == -1) || (*pTarg == 0 && targetLength == -1)) {
8531 break;
8532 }
8533 if (*pSrc != *pTarg) {
8534 break;
8535 }
8536 pSrc++;
8537 pTarg++;
8538 }
8539 equalLength = (int32_t)(pSrc - source);
8540
8541 // If we made it all the way through both strings, we are done. They are ==
8542 if ((pSrc ==pSrcEnd || (pSrcEnd <pSrc && *pSrc==0)) && /* At end of src string, however it was specified. */
8543 (pTarg==pTargEnd || (pTargEnd<pTarg && *pTarg==0))) /* and also at end of dest string */
8544 {
8545 UTRACE_EXIT_VALUE(UCOL_EQUAL);
8546 return UCOL_EQUAL;
8547 }
8548 }
8549 if (equalLength > 0) {
8550 /* There is an identical portion at the beginning of the two strings. */
8551 /* If the identical portion ends within a contraction or a comibining */
8552 /* character sequence, back up to the start of that sequence. */
8553
8554 // These values should already be set by the code above.
8555 //pSrc = source + equalLength; /* point to the first differing chars */
8556 //pTarg = target + equalLength;
8557 if ((pSrc != source+sourceLength && ucol_unsafeCP(*pSrc, coll)) ||
8558 (pTarg != target+targetLength && ucol_unsafeCP(*pTarg, coll)))
8559 {
8560 // We are stopped in the middle of a contraction.
8561 // Scan backwards through the == part of the string looking for the start of the contraction.
8562 // It doesn't matter which string we scan, since they are the same in this region.
8563 do
8564 {
8565 equalLength--;
8566 pSrc--;
8567 }
8568 while (equalLength>0 && ucol_unsafeCP(*pSrc, coll));
8569 }
8570
8571 source += equalLength;
8572 target += equalLength;
8573 if (sourceLength > 0) {
8574 sourceLength -= equalLength;
8575 }
8576 if (targetLength > 0) {
8577 targetLength -= equalLength;
8578 }
8579 }
8580
8581 UErrorCode status = U_ZERO_ERROR;
8582 UCollationResult returnVal;
8583 if(!coll->latinOneUse || (sourceLength > 0 && *source&0xff00) || (targetLength > 0 && *target&0xff00)) {
8584 returnVal = ucol_strcollRegular(coll, source, sourceLength, target, targetLength, &status);
8585 } else {
8586 returnVal = ucol_strcollUseLatin1(coll, source, sourceLength, target, targetLength, &status);
8587 }
8588 UTRACE_EXIT_VALUE(returnVal);
8589 return returnVal;
8590 }
8591
8592 U_CAPI UCollationResult U_EXPORT2
ucol_strcollUTF8(const UCollator * coll,const char * source,int32_t sourceLength,const char * target,int32_t targetLength,UErrorCode * status)8593 ucol_strcollUTF8(
8594 const UCollator *coll,
8595 const char *source,
8596 int32_t sourceLength,
8597 const char *target,
8598 int32_t targetLength,
8599 UErrorCode *status)
8600 {
8601 U_ALIGN_CODE(16);
8602
8603 UTRACE_ENTRY(UTRACE_UCOL_STRCOLLUTF8);
8604 if (UTRACE_LEVEL(UTRACE_VERBOSE)) {
8605 UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source=%p, target=%p", coll, source, target);
8606 UTRACE_DATA2(UTRACE_VERBOSE, "source string = %vb ", source, sourceLength);
8607 UTRACE_DATA2(UTRACE_VERBOSE, "target string = %vb ", target, targetLength);
8608 }
8609
8610 if (U_FAILURE(*status)) {
8611 /* do nothing */
8612 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status);
8613 return UCOL_EQUAL;
8614 }
8615
8616 if((source == NULL && sourceLength != 0) || (target == NULL && targetLength != 0)) {
8617 *status = U_ILLEGAL_ARGUMENT_ERROR;
8618 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status);
8619 return UCOL_EQUAL;
8620 }
8621
8622 /* Quick check if source and target are same strings. */
8623 /* They should either both be NULL terminated or the explicit length should be set on both. */
8624 if (source==target && sourceLength==targetLength) {
8625 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status);
8626 return UCOL_EQUAL;
8627 }
8628
8629 if(coll->delegate != NULL) {
8630 return ((const Collator*)coll->delegate)->compareUTF8(
8631 StringPiece(source, (sourceLength < 0) ? uprv_strlen(source) : sourceLength),
8632 StringPiece(target, (targetLength < 0) ? uprv_strlen(target) : targetLength),
8633 *status);
8634 }
8635
8636 /* Scan the strings. Find: */
8637 /* The length of any leading portion that is equal */
8638 /* Whether they are exactly equal. (in which case we just return) */
8639 const char *pSrc = source;
8640 const char *pTarg = target;
8641 UBool bSrcLimit = FALSE;
8642 UBool bTargLimit = FALSE;
8643
8644 if (sourceLength == -1 && targetLength == -1) {
8645 // Both strings are null terminated.
8646 // Scan through any leading equal portion.
8647 while (*pSrc == *pTarg && *pSrc != 0) {
8648 pSrc++;
8649 pTarg++;
8650 }
8651 if (*pSrc == 0 && *pTarg == 0) {
8652 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status);
8653 return UCOL_EQUAL;
8654 }
8655 bSrcLimit = (*pSrc == 0);
8656 bTargLimit = (*pTarg == 0);
8657 }
8658 else
8659 {
8660 // One or both strings has an explicit length.
8661 const char *pSrcEnd = source + sourceLength;
8662 const char *pTargEnd = target + targetLength;
8663
8664 // Scan while the strings are bitwise ==, or until one is exhausted.
8665 for (;;) {
8666 if (pSrc == pSrcEnd || pTarg == pTargEnd) {
8667 break;
8668 }
8669 if ((*pSrc == 0 && sourceLength == -1) || (*pTarg == 0 && targetLength == -1)) {
8670 break;
8671 }
8672 if (*pSrc != *pTarg) {
8673 break;
8674 }
8675 pSrc++;
8676 pTarg++;
8677 }
8678 bSrcLimit = (pSrc ==pSrcEnd || (pSrcEnd <pSrc && *pSrc==0));
8679 bTargLimit = (pTarg==pTargEnd || (pTargEnd<pTarg && *pTarg==0));
8680
8681 // If we made it all the way through both strings, we are done. They are ==
8682 if (bSrcLimit && /* At end of src string, however it was specified. */
8683 bTargLimit) /* and also at end of dest string */
8684 {
8685 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status);
8686 return UCOL_EQUAL;
8687 }
8688 }
8689
8690 U_ASSERT(!(bSrcLimit && bTargLimit));
8691
8692 int32_t equalLength = pSrc - source;
8693 UBool bSawNonLatin1 = FALSE;
8694
8695 if (equalLength > 0) {
8696 // Align position to the start of UTF-8 code point.
8697 if (bTargLimit) {
8698 U8_SET_CP_START((const uint8_t*)source, 0, equalLength);
8699 } else {
8700 U8_SET_CP_START((const uint8_t*)target, 0, equalLength);
8701 }
8702 pSrc = source + equalLength;
8703 pTarg = target + equalLength;
8704 }
8705
8706 if (equalLength > 0) {
8707 /* There is an identical portion at the beginning of the two strings. */
8708 /* If the identical portion ends within a contraction or a comibining */
8709 /* character sequence, back up to the start of that sequence. */
8710 UBool bUnsafeCP = FALSE;
8711 UChar32 uc32 = -1;
8712
8713 if (!bSrcLimit) {
8714 U8_GET_OR_FFFD((const uint8_t*)source, 0, equalLength, sourceLength, uc32);
8715 if (uc32 >= 0x10000 || ucol_unsafeCP((UChar)uc32, coll)) {
8716 bUnsafeCP = TRUE;
8717 }
8718 bSawNonLatin1 |= (uc32 > 0xff);
8719 }
8720 if (!bTargLimit) {
8721 U8_GET_OR_FFFD((const uint8_t*)target, 0, equalLength, targetLength, uc32);
8722 if (uc32 >= 0x10000 || ucol_unsafeCP((UChar)uc32, coll)) {
8723 bUnsafeCP = TRUE;
8724 }
8725 bSawNonLatin1 |= (uc32 > 0xff);
8726 }
8727
8728 if (bUnsafeCP) {
8729 while (equalLength > 0) {
8730 // We are stopped in the middle of a contraction.
8731 // Scan backwards through the == part of the string looking for the start of the contraction.
8732 // It doesn't matter which string we scan, since they are the same in this region.
8733 U8_PREV_OR_FFFD((uint8_t*)source, 0, equalLength, uc32);
8734 bSawNonLatin1 |= (uc32 > 0xff);
8735 if (uc32 < 0x10000 && !ucol_unsafeCP((UChar)uc32, coll)) {
8736 break;
8737 }
8738 }
8739 }
8740 source += equalLength;
8741 target += equalLength;
8742 if (sourceLength > 0) {
8743 sourceLength -= equalLength;
8744 }
8745 if (targetLength > 0) {
8746 targetLength -= equalLength;
8747 }
8748 } else {
8749 // Lead byte of Latin 1 character is 0x00 - 0xC3
8750 bSawNonLatin1 = (source && (sourceLength != 0) && (uint8_t)*source > 0xc3);
8751 bSawNonLatin1 |= (target && (targetLength != 0) && (uint8_t)*target > 0xc3);
8752 }
8753
8754 UCollationResult returnVal;
8755
8756 if(!coll->latinOneUse || bSawNonLatin1) {
8757 returnVal = ucol_strcollRegularUTF8(coll, source, sourceLength, target, targetLength, status);
8758 } else {
8759 returnVal = ucol_strcollUseLatin1UTF8(coll, source, sourceLength, target, targetLength, status);
8760 }
8761 UTRACE_EXIT_VALUE_STATUS(returnVal, *status);
8762 return returnVal;
8763 }
8764
8765
8766 /* convenience function for comparing strings */
8767 U_CAPI UBool U_EXPORT2
ucol_greater(const UCollator * coll,const UChar * source,int32_t sourceLength,const UChar * target,int32_t targetLength)8768 ucol_greater( const UCollator *coll,
8769 const UChar *source,
8770 int32_t sourceLength,
8771 const UChar *target,
8772 int32_t targetLength)
8773 {
8774 return (ucol_strcoll(coll, source, sourceLength, target, targetLength)
8775 == UCOL_GREATER);
8776 }
8777
8778 /* convenience function for comparing strings */
8779 U_CAPI UBool U_EXPORT2
ucol_greaterOrEqual(const UCollator * coll,const UChar * source,int32_t sourceLength,const UChar * target,int32_t targetLength)8780 ucol_greaterOrEqual( const UCollator *coll,
8781 const UChar *source,
8782 int32_t sourceLength,
8783 const UChar *target,
8784 int32_t targetLength)
8785 {
8786 return (ucol_strcoll(coll, source, sourceLength, target, targetLength)
8787 != UCOL_LESS);
8788 }
8789
8790 /* convenience function for comparing strings */
8791 U_CAPI UBool U_EXPORT2
ucol_equal(const UCollator * coll,const UChar * source,int32_t sourceLength,const UChar * target,int32_t targetLength)8792 ucol_equal( const UCollator *coll,
8793 const UChar *source,
8794 int32_t sourceLength,
8795 const UChar *target,
8796 int32_t targetLength)
8797 {
8798 return (ucol_strcoll(coll, source, sourceLength, target, targetLength)
8799 == UCOL_EQUAL);
8800 }
8801
8802 U_CAPI void U_EXPORT2
ucol_getUCAVersion(const UCollator * coll,UVersionInfo info)8803 ucol_getUCAVersion(const UCollator* coll, UVersionInfo info) {
8804 if(coll && coll->UCA) {
8805 uprv_memcpy(info, coll->UCA->image->UCAVersion, sizeof(UVersionInfo));
8806 }
8807 }
8808
8809 #endif /* #if !UCONFIG_NO_COLLATION */
8810