1 /*
2 *******************************************************************************
3 * Copyright (C) 1996-2012, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 *******************************************************************************
6 * file name: ucol.cpp
7 * encoding: US-ASCII
8 * tab size: 8 (not used)
9 * indentation:4
10 *
11 * Modification history
12 * Date Name Comments
13 * 1996-1999 various members of ICU team maintained C API for collation framework
14 * 02/16/2001 synwee Added internal method getPrevSpecialCE
15 * 03/01/2001 synwee Added maxexpansion functionality.
16 * 03/16/2001 weiv Collation framework is rewritten in C and made UCA compliant
17 */
18
19 #include "unicode/utypes.h"
20
21 #if !UCONFIG_NO_COLLATION
22
23 #include "unicode/bytestream.h"
24 #include "unicode/coleitr.h"
25 #include "unicode/unorm.h"
26 #include "unicode/udata.h"
27 #include "unicode/ustring.h"
28 #include "unicode/utf8.h"
29
30 #include "ucol_imp.h"
31 #include "bocsu.h"
32
33 #include "normalizer2impl.h"
34 #include "unorm_it.h"
35 #include "umutex.h"
36 #include "cmemory.h"
37 #include "ucln_in.h"
38 #include "cstring.h"
39 #include "utracimp.h"
40 #include "putilimp.h"
41 #include "uassert.h"
42 #include "unicode/coll.h"
43
44 #ifdef UCOL_DEBUG
45 #include <stdio.h>
46 #endif
47
48 U_NAMESPACE_USE
49
50 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
51
52 #define LAST_BYTE_MASK_ 0xFF
53 #define SECOND_LAST_BYTE_SHIFT_ 8
54
55 #define ZERO_CC_LIMIT_ 0xC0
56
57 // These are static pointers to the NFC/NFD implementation instance.
58 // Each of them is always the same between calls to u_cleanup
59 // and therefore writing to it is not synchronized.
60 // They are cleaned in ucol_cleanup
61 static const Normalizer2 *g_nfd = NULL;
62 static const Normalizer2Impl *g_nfcImpl = NULL;
63
// These are values from UCA required for
// implicit generation and suppressing sort key compression.
// They should regularly be in the UCA, but if one
// is running without UCA, it could be a problem.
68 static const int32_t maxRegularPrimary = 0x7A;
69 static const int32_t minImplicitPrimary = 0xE0;
70 static const int32_t maxImplicitPrimary = 0xE4;
71
72 U_CDECL_BEGIN
73 static UBool U_CALLCONV
ucol_cleanup(void)74 ucol_cleanup(void)
75 {
76 g_nfd = NULL;
77 g_nfcImpl = NULL;
78 return TRUE;
79 }
80
81 static int32_t U_CALLCONV
_getFoldingOffset(uint32_t data)82 _getFoldingOffset(uint32_t data) {
83 return (int32_t)(data&0xFFFFFF);
84 }
85
86 U_CDECL_END
87
88 static inline
initializeNFD(UErrorCode * status)89 UBool initializeNFD(UErrorCode *status) {
90 if (g_nfd != NULL) {
91 return TRUE;
92 } else {
93 // The result is constant, until the library is reloaded.
94 g_nfd = Normalizer2Factory::getNFDInstance(*status);
95 ucln_i18n_registerCleanup(UCLN_I18N_UCOL, ucol_cleanup);
96 return U_SUCCESS(*status);
97 }
98 }
99
100 // init FCD data
101 static inline
initializeFCD(UErrorCode * status)102 UBool initializeFCD(UErrorCode *status) {
103 if (g_nfcImpl != NULL) {
104 return TRUE;
105 } else {
106 // The result is constant, until the library is reloaded.
107 g_nfcImpl = Normalizer2Factory::getNFCImpl(*status);
108 // Note: Alternatively, we could also store this pointer in each collIterate struct,
109 // same as Normalizer2Factory::getImpl(collIterate->nfd).
110 ucln_i18n_registerCleanup(UCLN_I18N_UCOL, ucol_cleanup);
111 return U_SUCCESS(*status);
112 }
113 }
114
115 static
IInit_collIterate(const UCollator * collator,const UChar * sourceString,int32_t sourceLen,collIterate * s,UErrorCode * status)116 inline void IInit_collIterate(const UCollator *collator, const UChar *sourceString,
117 int32_t sourceLen, collIterate *s,
118 UErrorCode *status)
119 {
120 (s)->string = (s)->pos = sourceString;
121 (s)->origFlags = 0;
122 (s)->flags = 0;
123 if (sourceLen >= 0) {
124 s->flags |= UCOL_ITER_HASLEN;
125 (s)->endp = (UChar *)sourceString+sourceLen;
126 }
127 else {
128 /* change to enable easier checking for end of string for fcdpositon */
129 (s)->endp = NULL;
130 }
131 (s)->extendCEs = NULL;
132 (s)->extendCEsSize = 0;
133 (s)->CEpos = (s)->toReturn = (s)->CEs;
134 (s)->offsetBuffer = NULL;
135 (s)->offsetBufferSize = 0;
136 (s)->offsetReturn = (s)->offsetStore = NULL;
137 (s)->offsetRepeatCount = (s)->offsetRepeatValue = 0;
138 (s)->coll = (collator);
139 if (initializeNFD(status)) {
140 (s)->nfd = g_nfd;
141 } else {
142 return;
143 }
144 (s)->fcdPosition = 0;
145 if(collator->normalizationMode == UCOL_ON) {
146 (s)->flags |= UCOL_ITER_NORM;
147 }
148 if(collator->hiraganaQ == UCOL_ON && collator->strength >= UCOL_QUATERNARY) {
149 (s)->flags |= UCOL_HIRAGANA_Q;
150 }
151 (s)->iterator = NULL;
152 //(s)->iteratorIndex = 0;
153 }
154
155 U_CAPI void U_EXPORT2
uprv_init_collIterate(const UCollator * collator,const UChar * sourceString,int32_t sourceLen,collIterate * s,UErrorCode * status)156 uprv_init_collIterate(const UCollator *collator, const UChar *sourceString,
157 int32_t sourceLen, collIterate *s,
158 UErrorCode *status) {
159 /* Out-of-line version for use from other files. */
160 IInit_collIterate(collator, sourceString, sourceLen, s, status);
161 }
162
163 U_CAPI collIterate * U_EXPORT2
uprv_new_collIterate(UErrorCode * status)164 uprv_new_collIterate(UErrorCode *status) {
165 if(U_FAILURE(*status)) {
166 return NULL;
167 }
168 collIterate *s = new collIterate;
169 if(s == NULL) {
170 *status = U_MEMORY_ALLOCATION_ERROR;
171 return NULL;
172 }
173 return s;
174 }
175
176 U_CAPI void U_EXPORT2
uprv_delete_collIterate(collIterate * s)177 uprv_delete_collIterate(collIterate *s) {
178 delete s;
179 }
180
181 U_CAPI UBool U_EXPORT2
uprv_collIterateAtEnd(collIterate * s)182 uprv_collIterateAtEnd(collIterate *s) {
183 return s == NULL || s->pos == s->endp;
184 }
185
186 /**
187 * Backup the state of the collIterate struct data
188 * @param data collIterate to backup
189 * @param backup storage
190 */
191 static
backupState(const collIterate * data,collIterateState * backup)192 inline void backupState(const collIterate *data, collIterateState *backup)
193 {
194 backup->fcdPosition = data->fcdPosition;
195 backup->flags = data->flags;
196 backup->origFlags = data->origFlags;
197 backup->pos = data->pos;
198 backup->bufferaddress = data->writableBuffer.getBuffer();
199 backup->buffersize = data->writableBuffer.length();
200 backup->iteratorMove = 0;
201 backup->iteratorIndex = 0;
202 if(data->iterator != NULL) {
203 //backup->iteratorIndex = data->iterator->getIndex(data->iterator, UITER_CURRENT);
204 backup->iteratorIndex = data->iterator->getState(data->iterator);
205 // no we try to fixup if we're using a normalizing iterator and we get UITER_NO_STATE
206 if(backup->iteratorIndex == UITER_NO_STATE) {
207 while((backup->iteratorIndex = data->iterator->getState(data->iterator)) == UITER_NO_STATE) {
208 backup->iteratorMove++;
209 data->iterator->move(data->iterator, -1, UITER_CURRENT);
210 }
211 data->iterator->move(data->iterator, backup->iteratorMove, UITER_CURRENT);
212 }
213 }
214 }
215
216 /**
217 * Loads the state into the collIterate struct data
218 * @param data collIterate to backup
219 * @param backup storage
220 * @param forwards boolean to indicate if forwards iteration is used,
221 * false indicates backwards iteration
222 */
223 static
loadState(collIterate * data,const collIterateState * backup,UBool forwards)224 inline void loadState(collIterate *data, const collIterateState *backup,
225 UBool forwards)
226 {
227 UErrorCode status = U_ZERO_ERROR;
228 data->flags = backup->flags;
229 data->origFlags = backup->origFlags;
230 if(data->iterator != NULL) {
231 //data->iterator->move(data->iterator, backup->iteratorIndex, UITER_ZERO);
232 data->iterator->setState(data->iterator, backup->iteratorIndex, &status);
233 if(backup->iteratorMove != 0) {
234 data->iterator->move(data->iterator, backup->iteratorMove, UITER_CURRENT);
235 }
236 }
237 data->pos = backup->pos;
238
239 if ((data->flags & UCOL_ITER_INNORMBUF) &&
240 data->writableBuffer.getBuffer() != backup->bufferaddress) {
241 /*
242 this is when a new buffer has been reallocated and we'll have to
243 calculate the new position.
244 note the new buffer has to contain the contents of the old buffer.
245 */
246 if (forwards) {
247 data->pos = data->writableBuffer.getTerminatedBuffer() +
248 (data->pos - backup->bufferaddress);
249 }
250 else {
251 /* backwards direction */
252 int32_t temp = backup->buffersize -
253 (int32_t)(data->pos - backup->bufferaddress);
254 data->pos = data->writableBuffer.getTerminatedBuffer() + (data->writableBuffer.length() - temp);
255 }
256 }
257 if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
258 /*
259 this is alittle tricky.
260 if we are initially not in the normalization buffer, even if we
261 normalize in the later stage, the data in the buffer will be
262 ignored, since we skip back up to the data string.
263 however if we are already in the normalization buffer, any
264 further normalization will pull data into the normalization
265 buffer and modify the fcdPosition.
266 since we are keeping the data in the buffer for use, the
267 fcdPosition can not be reverted back.
268 arrgghh....
269 */
270 data->fcdPosition = backup->fcdPosition;
271 }
272 }
273
274 static UBool
reallocCEs(collIterate * data,int32_t newCapacity)275 reallocCEs(collIterate *data, int32_t newCapacity) {
276 uint32_t *oldCEs = data->extendCEs;
277 if(oldCEs == NULL) {
278 oldCEs = data->CEs;
279 }
280 int32_t length = data->CEpos - oldCEs;
281 uint32_t *newCEs = (uint32_t *)uprv_malloc(newCapacity * 4);
282 if(newCEs == NULL) {
283 return FALSE;
284 }
285 uprv_memcpy(newCEs, oldCEs, length * 4);
286 uprv_free(data->extendCEs);
287 data->extendCEs = newCEs;
288 data->extendCEsSize = newCapacity;
289 data->CEpos = newCEs + length;
290 return TRUE;
291 }
292
293 static UBool
increaseCEsCapacity(collIterate * data)294 increaseCEsCapacity(collIterate *data) {
295 int32_t oldCapacity;
296 if(data->extendCEs != NULL) {
297 oldCapacity = data->extendCEsSize;
298 } else {
299 oldCapacity = LENGTHOF(data->CEs);
300 }
301 return reallocCEs(data, 2 * oldCapacity);
302 }
303
304 static UBool
ensureCEsCapacity(collIterate * data,int32_t minCapacity)305 ensureCEsCapacity(collIterate *data, int32_t minCapacity) {
306 int32_t oldCapacity;
307 if(data->extendCEs != NULL) {
308 oldCapacity = data->extendCEsSize;
309 } else {
310 oldCapacity = LENGTHOF(data->CEs);
311 }
312 if(minCapacity <= oldCapacity) {
313 return TRUE;
314 }
315 oldCapacity *= 2;
316 return reallocCEs(data, minCapacity > oldCapacity ? minCapacity : oldCapacity);
317 }
318
appendOffset(int32_t offset,UErrorCode & errorCode)319 void collIterate::appendOffset(int32_t offset, UErrorCode &errorCode) {
320 if(U_FAILURE(errorCode)) {
321 return;
322 }
323 int32_t length = offsetStore == NULL ? 0 : (int32_t)(offsetStore - offsetBuffer);
324 U_ASSERT(length >= offsetBufferSize || offsetStore != NULL);
325 if(length >= offsetBufferSize) {
326 int32_t newCapacity = 2 * offsetBufferSize + UCOL_EXPAND_CE_BUFFER_SIZE;
327 int32_t *newBuffer = static_cast<int32_t *>(uprv_malloc(newCapacity * 4));
328 if(newBuffer == NULL) {
329 errorCode = U_MEMORY_ALLOCATION_ERROR;
330 return;
331 }
332 if(length > 0) {
333 uprv_memcpy(newBuffer, offsetBuffer, length * 4);
334 }
335 uprv_free(offsetBuffer);
336 offsetBuffer = newBuffer;
337 offsetStore = offsetBuffer + length;
338 offsetBufferSize = newCapacity;
339 }
340 *offsetStore++ = offset;
341 }
342
343 /*
344 * collIter_eos()
345 * Checks for a collIterate being positioned at the end of
346 * its source string.
347 *
348 */
349 static
collIter_eos(collIterate * s)350 inline UBool collIter_eos(collIterate *s) {
351 if(s->flags & UCOL_USE_ITERATOR) {
352 return !(s->iterator->hasNext(s->iterator));
353 }
354 if ((s->flags & UCOL_ITER_HASLEN) == 0 && *s->pos != 0) {
355 // Null terminated string, but not at null, so not at end.
356 // Whether in main or normalization buffer doesn't matter.
357 return FALSE;
358 }
359
360 // String with length. Can't be in normalization buffer, which is always
361 // null termintated.
362 if (s->flags & UCOL_ITER_HASLEN) {
363 return (s->pos == s->endp);
364 }
365
366 // We are at a null termination, could be either normalization buffer or main string.
367 if ((s->flags & UCOL_ITER_INNORMBUF) == 0) {
368 // At null at end of main string.
369 return TRUE;
370 }
371
372 // At null at end of normalization buffer. Need to check whether there there are
373 // any characters left in the main buffer.
374 if(s->origFlags & UCOL_USE_ITERATOR) {
375 return !(s->iterator->hasNext(s->iterator));
376 } else if ((s->origFlags & UCOL_ITER_HASLEN) == 0) {
377 // Null terminated main string. fcdPosition is the 'return' position into main buf.
378 return (*s->fcdPosition == 0);
379 }
380 else {
381 // Main string with an end pointer.
382 return s->fcdPosition == s->endp;
383 }
384 }
385
386 /*
387 * collIter_bos()
388 * Checks for a collIterate being positioned at the start of
389 * its source string.
390 *
391 */
392 static
collIter_bos(collIterate * source)393 inline UBool collIter_bos(collIterate *source) {
394 // if we're going backwards, we need to know whether there is more in the
395 // iterator, even if we are in the side buffer
396 if(source->flags & UCOL_USE_ITERATOR || source->origFlags & UCOL_USE_ITERATOR) {
397 return !source->iterator->hasPrevious(source->iterator);
398 }
399 if (source->pos <= source->string ||
400 ((source->flags & UCOL_ITER_INNORMBUF) &&
401 *(source->pos - 1) == 0 && source->fcdPosition == NULL)) {
402 return TRUE;
403 }
404 return FALSE;
405 }
406
407 /*static
408 inline UBool collIter_SimpleBos(collIterate *source) {
409 // if we're going backwards, we need to know whether there is more in the
410 // iterator, even if we are in the side buffer
411 if(source->flags & UCOL_USE_ITERATOR || source->origFlags & UCOL_USE_ITERATOR) {
412 return !source->iterator->hasPrevious(source->iterator);
413 }
414 if (source->pos == source->string) {
415 return TRUE;
416 }
417 return FALSE;
418 }*/
419 //return (data->pos == data->string) ||
420
421
422 /****************************************************************************/
423 /* Following are the open/close functions */
424 /* */
425 /****************************************************************************/
426
427 static UCollator*
ucol_initFromBinary(const uint8_t * bin,int32_t length,const UCollator * base,UCollator * fillIn,UErrorCode * status)428 ucol_initFromBinary(const uint8_t *bin, int32_t length,
429 const UCollator *base,
430 UCollator *fillIn,
431 UErrorCode *status)
432 {
433 UCollator *result = fillIn;
434 if(U_FAILURE(*status)) {
435 return NULL;
436 }
437 /*
438 if(base == NULL) {
439 // we don't support null base yet
440 *status = U_ILLEGAL_ARGUMENT_ERROR;
441 return NULL;
442 }
443 */
444 // We need these and we could be running without UCA
445 uprv_uca_initImplicitConstants(status);
446 UCATableHeader *colData = (UCATableHeader *)bin;
447 // do we want version check here? We're trying to figure out whether collators are compatible
448 if((base && (uprv_memcmp(colData->UCAVersion, base->image->UCAVersion, sizeof(UVersionInfo)) != 0 ||
449 uprv_memcmp(colData->UCDVersion, base->image->UCDVersion, sizeof(UVersionInfo)) != 0)) ||
450 colData->version[0] != UCOL_BUILDER_VERSION)
451 {
452 *status = U_COLLATOR_VERSION_MISMATCH;
453 return NULL;
454 }
455 else {
456 if((uint32_t)length > (paddedsize(sizeof(UCATableHeader)) + paddedsize(sizeof(UColOptionSet)))) {
457 result = ucol_initCollator((const UCATableHeader *)bin, result, base, status);
458 if(U_FAILURE(*status)){
459 return NULL;
460 }
461 result->hasRealData = TRUE;
462 }
463 else {
464 if(base) {
465 result = ucol_initCollator(base->image, result, base, status);
466 ucol_setOptionsFromHeader(result, (UColOptionSet *)(bin+((const UCATableHeader *)bin)->options), status);
467 if(U_FAILURE(*status)){
468 return NULL;
469 }
470 result->hasRealData = FALSE;
471 }
472 else {
473 *status = U_USELESS_COLLATOR_ERROR;
474 return NULL;
475 }
476 }
477 result->freeImageOnClose = FALSE;
478 }
479 result->actualLocale = NULL;
480 result->validLocale = NULL;
481 result->requestedLocale = NULL;
482 result->rules = NULL;
483 result->rulesLength = 0;
484 result->freeRulesOnClose = FALSE;
485 result->ucaRules = NULL;
486 return result;
487 }
488
489 U_CAPI UCollator* U_EXPORT2
ucol_openBinary(const uint8_t * bin,int32_t length,const UCollator * base,UErrorCode * status)490 ucol_openBinary(const uint8_t *bin, int32_t length,
491 const UCollator *base,
492 UErrorCode *status)
493 {
494 return ucol_initFromBinary(bin, length, base, NULL, status);
495 }
496
497 U_CAPI int32_t U_EXPORT2
ucol_cloneBinary(const UCollator * coll,uint8_t * buffer,int32_t capacity,UErrorCode * status)498 ucol_cloneBinary(const UCollator *coll,
499 uint8_t *buffer, int32_t capacity,
500 UErrorCode *status)
501 {
502 int32_t length = 0;
503 if(U_FAILURE(*status)) {
504 return length;
505 }
506 if(capacity < 0) {
507 *status = U_ILLEGAL_ARGUMENT_ERROR;
508 return length;
509 }
510 if(coll->hasRealData == TRUE) {
511 length = coll->image->size;
512 if(length <= capacity) {
513 uprv_memcpy(buffer, coll->image, length);
514 } else {
515 *status = U_BUFFER_OVERFLOW_ERROR;
516 }
517 } else {
518 length = (int32_t)(paddedsize(sizeof(UCATableHeader))+paddedsize(sizeof(UColOptionSet)));
519 if(length <= capacity) {
520 /* build the UCATableHeader with minimal entries */
521 /* do not copy the header from the UCA file because its values are wrong! */
522 /* uprv_memcpy(result, UCA->image, sizeof(UCATableHeader)); */
523
524 /* reset everything */
525 uprv_memset(buffer, 0, length);
526
527 /* set the tailoring-specific values */
528 UCATableHeader *myData = (UCATableHeader *)buffer;
529 myData->size = length;
530
531 /* offset for the options, the only part of the data that is present after the header */
532 myData->options = sizeof(UCATableHeader);
533
534 /* need to always set the expansion value for an upper bound of the options */
535 myData->expansion = myData->options + sizeof(UColOptionSet);
536
537 myData->magic = UCOL_HEADER_MAGIC;
538 myData->isBigEndian = U_IS_BIG_ENDIAN;
539 myData->charSetFamily = U_CHARSET_FAMILY;
540
541 /* copy UCA's version; genrb will override all but the builder version with tailoring data */
542 uprv_memcpy(myData->version, coll->image->version, sizeof(UVersionInfo));
543
544 uprv_memcpy(myData->UCAVersion, coll->image->UCAVersion, sizeof(UVersionInfo));
545 uprv_memcpy(myData->UCDVersion, coll->image->UCDVersion, sizeof(UVersionInfo));
546 uprv_memcpy(myData->formatVersion, coll->image->formatVersion, sizeof(UVersionInfo));
547 myData->jamoSpecial = coll->image->jamoSpecial;
548
549 /* copy the collator options */
550 uprv_memcpy(buffer+paddedsize(sizeof(UCATableHeader)), coll->options, sizeof(UColOptionSet));
551 } else {
552 *status = U_BUFFER_OVERFLOW_ERROR;
553 }
554 }
555 return length;
556 }
557
558 U_CAPI UCollator* U_EXPORT2
ucol_safeClone(const UCollator * coll,void * stackBuffer,int32_t * pBufferSize,UErrorCode * status)559 ucol_safeClone(const UCollator *coll, void *stackBuffer, int32_t * pBufferSize, UErrorCode *status)
560 {
561 UCollator * localCollator;
562 int32_t bufferSizeNeeded = (int32_t)sizeof(UCollator);
563 char *stackBufferChars = (char *)stackBuffer;
564 int32_t imageSize = 0;
565 int32_t rulesSize = 0;
566 int32_t rulesPadding = 0;
567 int32_t defaultReorderCodesSize = 0;
568 int32_t reorderCodesSize = 0;
569 uint8_t *image;
570 UChar *rules;
571 int32_t* defaultReorderCodes;
572 int32_t* reorderCodes;
573 uint8_t* leadBytePermutationTable;
574 UBool colAllocated = FALSE;
575 UBool imageAllocated = FALSE;
576
577 if (status == NULL || U_FAILURE(*status)){
578 return 0;
579 }
580 if ((stackBuffer && !pBufferSize) || !coll){
581 *status = U_ILLEGAL_ARGUMENT_ERROR;
582 return 0;
583 }
584
585 if (coll->rules && coll->freeRulesOnClose) {
586 rulesSize = (int32_t)(coll->rulesLength + 1)*sizeof(UChar);
587 rulesPadding = (int32_t)(bufferSizeNeeded % sizeof(UChar));
588 bufferSizeNeeded += rulesSize + rulesPadding;
589 }
590 // no padding for alignment needed from here since the next two are 4 byte quantities
591 if (coll->defaultReorderCodes) {
592 defaultReorderCodesSize = coll->defaultReorderCodesLength * sizeof(int32_t);
593 bufferSizeNeeded += defaultReorderCodesSize;
594 }
595 if (coll->reorderCodes) {
596 reorderCodesSize = coll->reorderCodesLength * sizeof(int32_t);
597 bufferSizeNeeded += reorderCodesSize;
598 }
599 if (coll->leadBytePermutationTable) {
600 bufferSizeNeeded += 256 * sizeof(uint8_t);
601 }
602
603 if (stackBuffer && *pBufferSize <= 0) { /* 'preflighting' request - set needed size into *pBufferSize */
604 *pBufferSize = bufferSizeNeeded;
605 return 0;
606 }
607
608 /* Pointers on 64-bit platforms need to be aligned
609 * on a 64-bit boundry in memory.
610 */
611 if (U_ALIGNMENT_OFFSET(stackBuffer) != 0) {
612 int32_t offsetUp = (int32_t)U_ALIGNMENT_OFFSET_UP(stackBufferChars);
613 if (*pBufferSize > offsetUp) {
614 *pBufferSize -= offsetUp;
615 stackBufferChars += offsetUp;
616 }
617 else {
618 /* prevent using the stack buffer but keep the size > 0 so that we do not just preflight */
619 *pBufferSize = 1;
620 }
621 }
622 stackBuffer = (void *)stackBufferChars;
623
624 if (stackBuffer == NULL || *pBufferSize < bufferSizeNeeded) {
625 /* allocate one here...*/
626 stackBufferChars = (char *)uprv_malloc(bufferSizeNeeded);
627 // Null pointer check.
628 if (stackBufferChars == NULL) {
629 *status = U_MEMORY_ALLOCATION_ERROR;
630 return NULL;
631 }
632 colAllocated = TRUE;
633 if (U_SUCCESS(*status)) {
634 *status = U_SAFECLONE_ALLOCATED_WARNING;
635 }
636 }
637 localCollator = (UCollator *)stackBufferChars;
638 rules = (UChar *)(stackBufferChars + sizeof(UCollator) + rulesPadding);
639 defaultReorderCodes = (int32_t*)((uint8_t*)rules + rulesSize);
640 reorderCodes = (int32_t*)((uint8_t*)defaultReorderCodes + defaultReorderCodesSize);
641 leadBytePermutationTable = (uint8_t*)reorderCodes + reorderCodesSize;
642
643 {
644 UErrorCode tempStatus = U_ZERO_ERROR;
645 imageSize = ucol_cloneBinary(coll, NULL, 0, &tempStatus);
646 }
647 if (coll->freeImageOnClose) {
648 image = (uint8_t *)uprv_malloc(imageSize);
649 // Null pointer check
650 if (image == NULL) {
651 *status = U_MEMORY_ALLOCATION_ERROR;
652 return NULL;
653 }
654 ucol_cloneBinary(coll, image, imageSize, status);
655 imageAllocated = TRUE;
656 }
657 else {
658 image = (uint8_t *)coll->image;
659 }
660 localCollator = ucol_initFromBinary(image, imageSize, coll->UCA, localCollator, status);
661 if (U_FAILURE(*status)) {
662 return NULL;
663 }
664
665 if (coll->rules) {
666 if (coll->freeRulesOnClose) {
667 localCollator->rules = u_strcpy(rules, coll->rules);
668 //bufferEnd += rulesSize;
669 }
670 else {
671 localCollator->rules = coll->rules;
672 }
673 localCollator->freeRulesOnClose = FALSE;
674 localCollator->rulesLength = coll->rulesLength;
675 }
676
677 // collator reordering
678 if (coll->defaultReorderCodes) {
679 localCollator->defaultReorderCodes =
680 (int32_t*) uprv_memcpy(defaultReorderCodes, coll->defaultReorderCodes, coll->defaultReorderCodesLength * sizeof(int32_t));
681 localCollator->defaultReorderCodesLength = coll->defaultReorderCodesLength;
682 localCollator->freeDefaultReorderCodesOnClose = FALSE;
683 }
684 if (coll->reorderCodes) {
685 localCollator->reorderCodes =
686 (int32_t*)uprv_memcpy(reorderCodes, coll->reorderCodes, coll->reorderCodesLength * sizeof(int32_t));
687 localCollator->reorderCodesLength = coll->reorderCodesLength;
688 localCollator->freeReorderCodesOnClose = FALSE;
689 }
690 if (coll->leadBytePermutationTable) {
691 localCollator->leadBytePermutationTable =
692 (uint8_t*) uprv_memcpy(leadBytePermutationTable, coll->leadBytePermutationTable, 256);
693 localCollator->freeLeadBytePermutationTableOnClose = FALSE;
694 }
695
696 int32_t i;
697 for(i = 0; i < UCOL_ATTRIBUTE_COUNT; i++) {
698 ucol_setAttribute(localCollator, (UColAttribute)i, ucol_getAttribute(coll, (UColAttribute)i, status), status);
699 }
700 // zero copies of pointers
701 localCollator->actualLocale = NULL;
702 localCollator->validLocale = NULL;
703 localCollator->requestedLocale = NULL;
704 localCollator->ucaRules = coll->ucaRules; // There should only be one copy here.
705 localCollator->freeOnClose = colAllocated;
706 localCollator->freeImageOnClose = imageAllocated;
707 return localCollator;
708 }
709
710 U_CAPI void U_EXPORT2
ucol_close(UCollator * coll)711 ucol_close(UCollator *coll)
712 {
713 UTRACE_ENTRY_OC(UTRACE_UCOL_CLOSE);
714 UTRACE_DATA1(UTRACE_INFO, "coll = %p", coll);
715 if(coll != NULL) {
716 // these are always owned by each UCollator struct,
717 // so we always free them
718 if(coll->validLocale != NULL) {
719 uprv_free(coll->validLocale);
720 }
721 if(coll->actualLocale != NULL) {
722 uprv_free(coll->actualLocale);
723 }
724 if(coll->requestedLocale != NULL) {
725 uprv_free(coll->requestedLocale);
726 }
727 if(coll->latinOneCEs != NULL) {
728 uprv_free(coll->latinOneCEs);
729 }
730 if(coll->options != NULL && coll->freeOptionsOnClose) {
731 uprv_free(coll->options);
732 }
733 if(coll->rules != NULL && coll->freeRulesOnClose) {
734 uprv_free((UChar *)coll->rules);
735 }
736 if(coll->image != NULL && coll->freeImageOnClose) {
737 uprv_free((UCATableHeader *)coll->image);
738 }
739
740 if(coll->leadBytePermutationTable != NULL && coll->freeLeadBytePermutationTableOnClose == TRUE) {
741 uprv_free(coll->leadBytePermutationTable);
742 }
743 if(coll->defaultReorderCodes != NULL && coll->freeDefaultReorderCodesOnClose == TRUE) {
744 uprv_free(coll->defaultReorderCodes);
745 }
746 if(coll->reorderCodes != NULL && coll->freeReorderCodesOnClose == TRUE) {
747 uprv_free(coll->reorderCodes);
748 }
749
750 if(coll->delegate != NULL) {
751 delete (Collator*)coll->delegate;
752 }
753
754 /* Here, it would be advisable to close: */
755 /* - UData for UCA (unless we stuff it in the root resb */
756 /* Again, do we need additional housekeeping... HMMM! */
757 UTRACE_DATA1(UTRACE_INFO, "coll->freeOnClose: %d", coll->freeOnClose);
758 if(coll->freeOnClose){
759 /* for safeClone, if freeOnClose is FALSE,
760 don't free the other instance data */
761 uprv_free(coll);
762 }
763 }
764 UTRACE_EXIT();
765 }
766
767 /* This one is currently used by genrb & tests. After constructing from rules (tailoring),*/
768 /* you should be able to get the binary chunk to write out... Doesn't look very full now */
769 U_CFUNC uint8_t* U_EXPORT2
ucol_cloneRuleData(const UCollator * coll,int32_t * length,UErrorCode * status)770 ucol_cloneRuleData(const UCollator *coll, int32_t *length, UErrorCode *status)
771 {
772 uint8_t *result = NULL;
773 if(U_FAILURE(*status)) {
774 return NULL;
775 }
776 if(coll->hasRealData == TRUE) {
777 *length = coll->image->size;
778 result = (uint8_t *)uprv_malloc(*length);
779 /* test for NULL */
780 if (result == NULL) {
781 *status = U_MEMORY_ALLOCATION_ERROR;
782 return NULL;
783 }
784 uprv_memcpy(result, coll->image, *length);
785 } else {
786 *length = (int32_t)(paddedsize(sizeof(UCATableHeader))+paddedsize(sizeof(UColOptionSet)));
787 result = (uint8_t *)uprv_malloc(*length);
788 /* test for NULL */
789 if (result == NULL) {
790 *status = U_MEMORY_ALLOCATION_ERROR;
791 return NULL;
792 }
793
794 /* build the UCATableHeader with minimal entries */
795 /* do not copy the header from the UCA file because its values are wrong! */
796 /* uprv_memcpy(result, UCA->image, sizeof(UCATableHeader)); */
797
798 /* reset everything */
799 uprv_memset(result, 0, *length);
800
801 /* set the tailoring-specific values */
802 UCATableHeader *myData = (UCATableHeader *)result;
803 myData->size = *length;
804
805 /* offset for the options, the only part of the data that is present after the header */
806 myData->options = sizeof(UCATableHeader);
807
808 /* need to always set the expansion value for an upper bound of the options */
809 myData->expansion = myData->options + sizeof(UColOptionSet);
810
811 myData->magic = UCOL_HEADER_MAGIC;
812 myData->isBigEndian = U_IS_BIG_ENDIAN;
813 myData->charSetFamily = U_CHARSET_FAMILY;
814
815 /* copy UCA's version; genrb will override all but the builder version with tailoring data */
816 uprv_memcpy(myData->version, coll->image->version, sizeof(UVersionInfo));
817
818 uprv_memcpy(myData->UCAVersion, coll->image->UCAVersion, sizeof(UVersionInfo));
819 uprv_memcpy(myData->UCDVersion, coll->image->UCDVersion, sizeof(UVersionInfo));
820 uprv_memcpy(myData->formatVersion, coll->image->formatVersion, sizeof(UVersionInfo));
821 myData->jamoSpecial = coll->image->jamoSpecial;
822
823 /* copy the collator options */
824 uprv_memcpy(result+paddedsize(sizeof(UCATableHeader)), coll->options, sizeof(UColOptionSet));
825 }
826 return result;
827 }
828
ucol_setOptionsFromHeader(UCollator * result,UColOptionSet * opts,UErrorCode * status)829 void ucol_setOptionsFromHeader(UCollator* result, UColOptionSet * opts, UErrorCode *status) {
830 if(U_FAILURE(*status)) {
831 return;
832 }
833 result->caseFirst = (UColAttributeValue)opts->caseFirst;
834 result->caseLevel = (UColAttributeValue)opts->caseLevel;
835 result->frenchCollation = (UColAttributeValue)opts->frenchCollation;
836 result->normalizationMode = (UColAttributeValue)opts->normalizationMode;
837 if(result->normalizationMode == UCOL_ON && !initializeFCD(status)) {
838 return;
839 }
840 result->strength = (UColAttributeValue)opts->strength;
841 result->variableTopValue = opts->variableTopValue;
842 result->alternateHandling = (UColAttributeValue)opts->alternateHandling;
843 result->hiraganaQ = (UColAttributeValue)opts->hiraganaQ;
844 result->numericCollation = (UColAttributeValue)opts->numericCollation;
845 result->caseFirstisDefault = TRUE;
846 result->caseLevelisDefault = TRUE;
847 result->frenchCollationisDefault = TRUE;
848 result->normalizationModeisDefault = TRUE;
849 result->strengthisDefault = TRUE;
850 result->variableTopValueisDefault = TRUE;
851 result->alternateHandlingisDefault = TRUE;
852 result->hiraganaQisDefault = TRUE;
853 result->numericCollationisDefault = TRUE;
854
855 ucol_updateInternalState(result, status);
856
857 result->options = opts;
858 }
859
860
861 /**
862 * Approximate determination if a character is at a contraction end.
863 * Guaranteed to be TRUE if a character is at the end of a contraction,
864 * otherwise it is not deterministic.
865 * @param c character to be determined
866 * @param coll collator
867 */
868 static
ucol_contractionEndCP(UChar c,const UCollator * coll)869 inline UBool ucol_contractionEndCP(UChar c, const UCollator *coll) {
870 if (c < coll->minContrEndCP) {
871 return FALSE;
872 }
873
874 int32_t hash = c;
875 uint8_t htbyte;
876 if (hash >= UCOL_UNSAFECP_TABLE_SIZE*8) {
877 if (U16_IS_TRAIL(c)) {
878 return TRUE;
879 }
880 hash = (hash & UCOL_UNSAFECP_TABLE_MASK) + 256;
881 }
882 htbyte = coll->contrEndCP[hash>>3];
883 return (((htbyte >> (hash & 7)) & 1) == 1);
884 }
885
886
887
888 /*
889 * i_getCombiningClass()
890 * A fast, at least partly inline version of u_getCombiningClass()
891 * This is a candidate for further optimization. Used heavily
892 * in contraction processing.
893 */
894 static
i_getCombiningClass(UChar32 c,const UCollator * coll)895 inline uint8_t i_getCombiningClass(UChar32 c, const UCollator *coll) {
896 uint8_t sCC = 0;
897 if ((c >= 0x300 && ucol_unsafeCP(c, coll)) || c > 0xFFFF) {
898 sCC = u_getCombiningClass(c);
899 }
900 return sCC;
901 }
902
/**
 * Initializes a UCollator so that it reads its data from a binary image.
 *
 * @param image  collation data header; the tables are located via offsets stored in it
 * @param fillIn collator to initialize, or NULL to have one allocated with uprv_malloc()
 * @param UCA    value stored into the collator's UCA field
 * @param status in/out error code; nothing is done if it already indicates failure
 * @return the initialized collator. NULL is returned if status was already a
 *         failure, if image is NULL, if allocation fails, or if the trie cannot
 *         be unserialized for a collator allocated here. If unserializing fails
 *         for a caller-supplied fillIn, fillIn is returned partially initialized.
 */
UCollator* ucol_initCollator(const UCATableHeader *image, UCollator *fillIn, const UCollator *UCA, UErrorCode *status) {
    if(U_FAILURE(*status) || image == NULL) {
        return NULL;
    }

    UCollator *result = fillIn;
    if(result != NULL) {
        result->freeOnClose = FALSE;
    } else {
        result = (UCollator *)uprv_malloc(sizeof(UCollator));
        if(result == NULL) {
            *status = U_MEMORY_ALLOCATION_ERROR;
            return NULL;
        }
        result->freeOnClose = TRUE;
    }

    result->delegate = NULL;
    result->image = image;

    // All table positions in the header are byte offsets from the start of the image.
    uint8_t *imageBytes = (uint8_t*)result->image;

    result->mapping.getFoldingOffset = _getFoldingOffset;
    const uint8_t *mapping = imageBytes + image->mappingPosition;
    utrie_unserialize(&result->mapping, mapping, image->endExpansionCE - image->mappingPosition, status);
    if(U_FAILURE(*status)) {
        // Only release memory that was allocated in this function.
        if(result->freeOnClose == TRUE) {
            uprv_free(result);
            return NULL;
        }
        return result;
    }

    result->latinOneMapping = UTRIE_GET32_LATIN1(&result->mapping);
    result->contractionCEs = (uint32_t*)(imageBytes + image->contractionCEs);
    result->contractionIndex = (UChar*)(imageBytes + image->contractionIndex);
    result->expansion = (uint32_t*)(imageBytes + image->expansion);

    // Nothing is owned yet: rules and reordering data start out empty.
    result->rules = NULL;
    result->rulesLength = 0;
    result->freeRulesOnClose = FALSE;
    result->defaultReorderCodes = NULL;
    result->defaultReorderCodesLength = 0;
    result->freeDefaultReorderCodesOnClose = FALSE;
    result->reorderCodes = NULL;
    result->reorderCodesLength = 0;
    result->freeReorderCodesOnClose = FALSE;
    result->leadBytePermutationTable = NULL;
    result->freeLeadBytePermutationTableOnClose = FALSE;

    /* copy the version info from the header into the collator */
    result->dataVersion[0] = image->version[0]; /* UCA Builder version*/
    result->dataVersion[1] = image->version[1]; /* UCA Tailoring rules version*/
    result->dataVersion[2] = 0;
    result->dataVersion[3] = 0;

    UChar cp;

    // Find the smallest unsafe char below 0x300. The minimum must be zeroed
    // before the scan because ucol_unsafeCP() is handed this same collator.
    result->unsafeCP = imageBytes + image->unsafeCP;
    result->minUnsafeCP = 0;
    cp = 0;
    while (cp < 0x300 && !ucol_unsafeCP(cp, result)) {
        cp++;
    }
    result->minUnsafeCP = cp;

    // Same scan for the smallest contraction-ending char; ucol_contractionEndCP()
    // reads minContrEndCP, so it is zeroed first as well.
    result->contrEndCP = imageBytes + image->contrEndCP;
    result->minContrEndCP = 0;
    cp = 0;
    while (cp < 0x300 && !ucol_contractionEndCP(cp, result)) {
        cp++;
    }
    result->minContrEndCP = cp;

    /* max expansion tables */
    result->endExpansionCE = (uint32_t*)(imageBytes + image->endExpansionCE);
    result->lastEndExpansionCE = result->endExpansionCE + image->endExpansionCECount - 1;
    result->expansionCESize = imageBytes + image->expansionCESize;

    result->latinOneCEs = NULL;
    result->latinOneRegenTable = FALSE;
    result->latinOneFailed = FALSE;
    result->UCA = UCA;

    /* Normally these will be set correctly later. This is the default if you use UCA or the default. */
    result->ucaRules = NULL;
    result->actualLocale = NULL;
    result->validLocale = NULL;
    result->requestedLocale = NULL;
    result->hasRealData = FALSE; // real data lives in .dat file...
    result->freeImageOnClose = FALSE;

    /* set attributes from the option set stored in the image */
    UColOptionSet *imageOptions = (UColOptionSet*)(imageBytes + image->options);
    ucol_setOptionsFromHeader(result, imageOptions, status);
    result->freeOptionsOnClose = FALSE;

    return result;
}
1005
1006 /* new Mark's code */
1007
1008 /**
1009 * For generation of Implicit CEs
1010 * @author Davis
1011 *
1012 * Cleaned up so that changes can be made more easily.
1013 * Old values:
1014 # First Implicit: E26A792D
1015 # Last Implicit: E3DC70C0
1016 # First CJK: E0030300
1017 # Last CJK: E0A9DD00
1018 # First CJK_A: E0A9DF00
1019 # Last CJK_A: E0DE3100
1020 */
1021 /* Following is a port of Mark's code for new treatment of implicits.
1022 * It is positioned here, since ucol_initUCA need to initialize the
1023 * variables below according to the data in the fractional UCA.
1024 */
1025
1026 /**
1027 * Function used to:
1028 * a) collapse the 2 different Han ranges from UCA into one (in the right order), and
 *   b) bump any non-CJK characters by 0x110000 (NON_CJK_OFFSET).
1030 * The relevant blocks are:
1031 * A: 4E00..9FFF; CJK Unified Ideographs
1032 * F900..FAFF; CJK Compatibility Ideographs
1033 * B: 3400..4DBF; CJK Unified Ideographs Extension A
1034 * 20000..XX; CJK Unified Ideographs Extension B (and others later on)
1035 * As long as
1036 * no new B characters are allocated between 4E00 and FAFF, and
1037 * no new A characters are outside of this range,
1038 * (very high probability) this simple code will work.
1039 * The reordered blocks are:
1040 * Block1 is CJK
1041 * Block2 is CJK_COMPAT_USED
1042 * Block3 is CJK_A
1043 * (all contiguous)
1044 * Any other CJK gets its normal code point
 * Any non-CJK gets +0x110000 (NON_CJK_OFFSET)
1046 * When we reorder Block1, we make sure that it is at the very start,
1047 * so that it will use a 3-byte form.
 * Warning: we only pick up the compatibility characters that are
 * NOT decomposed, so that block is smaller!
1050 */
1051
// CONSTANTS
static const UChar32
    // Added by swapCJK() to every code point outside its CJK ranges;
    // subtracted again by uprv_uca_getCodePointFromRaw().
    NON_CJK_OFFSET = 0x110000,
    // Upper bound for raw values: used to size the 4-byte form in
    // initImplicitConstants() and as the final range check in
    // uprv_uca_getRawFromImplicit().
    UCOL_MAX_INPUT = 0x220001; // == 2 * NON_CJK_OFFSET + 1
1056
/**
 * Layout parameters for implicit weights.
 * All start at zero and are computed by initImplicitConstants().
 */
static int32_t final3Multiplier = 0;   // step between used final bytes, 3-byte form (gap3 + 1)
static int32_t final4Multiplier = 0;   // step between used final bytes, 4-byte form (gap4 + 1)
static int32_t final3Count = 0;        // number of final-byte values used in the 3-byte form
static int32_t final4Count = 0;        // number of final-byte values used in the 4-byte form
static int32_t medialCount = 0;        // number of values a middle byte can take (maxTrail - minTrail + 1)
static int32_t min3Primary = 0;        // first lead byte of the 3-byte form
static int32_t min4Primary = 0;        // first lead byte of the 4-byte form
static int32_t max4Primary = 0;        // last lead byte of the 4-byte form
static int32_t minTrail = 0;           // lowest trailing byte value
static int32_t maxTrail = 0;           // highest trailing byte value
static int32_t max3Trail = 0;          // highest final byte actually used in the 3-byte form
static int32_t max4Trail = 0;          // highest final byte actually used in the 4-byte form
static int32_t min4Boundary = 0;       // first raw value that is encoded with the 4-byte form
1074
// Code point ranges that swapCJK() and uprv_uca_getCodePointFromRaw() treat
// as CJK. Each *_LIMIT is exclusive (last code point of the range + 1).
static const UChar32
    // 4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
    // 9FCC;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;  (Unicode 6.1)
    CJK_BASE = 0x4E00,
    CJK_LIMIT = 0x9FCC+1,
    // Unified CJK ideographs in the compatibility ideographs block.
    CJK_COMPAT_USED_BASE = 0xFA0E,
    CJK_COMPAT_USED_LIMIT = 0xFA2F+1,
    // 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
    // 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
    CJK_A_BASE = 0x3400,
    CJK_A_LIMIT = 0x4DB5+1,
    // 20000;<CJK Ideograph Extension B, First>;Lo;0;L;;;;;N;;;;;
    // 2A6D6;<CJK Ideograph Extension B, Last>;Lo;0;L;;;;;N;;;;;
    CJK_B_BASE = 0x20000,
    CJK_B_LIMIT = 0x2A6D6+1,
    // 2A700;<CJK Ideograph Extension C, First>;Lo;0;L;;;;;N;;;;;
    // 2B734;<CJK Ideograph Extension C, Last>;Lo;0;L;;;;;N;;;;;
    CJK_C_BASE = 0x2A700,
    CJK_C_LIMIT = 0x2B734+1,
    // 2B740;<CJK Ideograph Extension D, First>;Lo;0;L;;;;;N;;;;;
    // 2B81D;<CJK Ideograph Extension D, Last>;Lo;0;L;;;;;N;;;;;
    CJK_D_BASE = 0x2B740,
    CJK_D_LIMIT = 0x2B81D+1;
    // when adding to this list, look for all occurrences (in project)
    // of CJK_C_BASE and CJK_C_LIMIT, etc. to check for code that needs changing!!!!
1101
swapCJK(UChar32 i)1102 static UChar32 swapCJK(UChar32 i) {
1103 if (i < CJK_A_BASE) {
1104 // non-CJK
1105 } else if (i < CJK_A_LIMIT) {
1106 // Extension A has lower code points than the original Unihan+compat
1107 // but sorts higher.
1108 return i - CJK_A_BASE
1109 + (CJK_LIMIT - CJK_BASE)
1110 + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE);
1111 } else if (i < CJK_BASE) {
1112 // non-CJK
1113 } else if (i < CJK_LIMIT) {
1114 return i - CJK_BASE;
1115 } else if (i < CJK_COMPAT_USED_BASE) {
1116 // non-CJK
1117 } else if (i < CJK_COMPAT_USED_LIMIT) {
1118 return i - CJK_COMPAT_USED_BASE
1119 + (CJK_LIMIT - CJK_BASE);
1120 } else if (i < CJK_B_BASE) {
1121 // non-CJK
1122 } else if (i < CJK_B_LIMIT) {
1123 return i; // non-BMP-CJK
1124 } else if (i < CJK_C_BASE) {
1125 // non-CJK
1126 } else if (i < CJK_C_LIMIT) {
1127 return i; // non-BMP-CJK
1128 } else if (i < CJK_D_BASE) {
1129 // non-CJK
1130 } else if (i < CJK_D_LIMIT) {
1131 return i; // non-BMP-CJK
1132 }
1133 return i + NON_CJK_OFFSET; // non-CJK
1134 }
1135
1136 U_CAPI UChar32 U_EXPORT2
uprv_uca_getRawFromCodePoint(UChar32 i)1137 uprv_uca_getRawFromCodePoint(UChar32 i) {
1138 return swapCJK(i)+1;
1139 }
1140
1141 U_CAPI UChar32 U_EXPORT2
uprv_uca_getCodePointFromRaw(UChar32 i)1142 uprv_uca_getCodePointFromRaw(UChar32 i) {
1143 i--;
1144 UChar32 result = 0;
1145 if(i >= NON_CJK_OFFSET) {
1146 result = i - NON_CJK_OFFSET;
1147 } else if(i >= CJK_B_BASE) {
1148 result = i;
1149 } else if(i < CJK_A_LIMIT + (CJK_LIMIT - CJK_BASE) + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE)) { // rest of CJKs, compacted
1150 if(i < CJK_LIMIT - CJK_BASE) {
1151 result = i + CJK_BASE;
1152 } else if(i < (CJK_LIMIT - CJK_BASE) + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE)) {
1153 result = i + CJK_COMPAT_USED_BASE - (CJK_LIMIT - CJK_BASE);
1154 } else {
1155 result = i + CJK_A_BASE - (CJK_LIMIT - CJK_BASE) - (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE);
1156 }
1157 } else {
1158 result = -1;
1159 }
1160 return result;
1161 }
1162
1163 // GET IMPLICIT PRIMARY WEIGHTS
1164 // Return value is left justified primary key
1165 U_CAPI uint32_t U_EXPORT2
uprv_uca_getImplicitFromRaw(UChar32 cp)1166 uprv_uca_getImplicitFromRaw(UChar32 cp) {
1167 /*
1168 if (cp < 0 || cp > UCOL_MAX_INPUT) {
1169 throw new IllegalArgumentException("Code point out of range " + Utility.hex(cp));
1170 }
1171 */
1172 int32_t last0 = cp - min4Boundary;
1173 if (last0 < 0) {
1174 int32_t last1 = cp / final3Count;
1175 last0 = cp % final3Count;
1176
1177 int32_t last2 = last1 / medialCount;
1178 last1 %= medialCount;
1179
1180 last0 = minTrail + last0*final3Multiplier; // spread out, leaving gap at start
1181 last1 = minTrail + last1; // offset
1182 last2 = min3Primary + last2; // offset
1183 /*
1184 if (last2 >= min4Primary) {
1185 throw new IllegalArgumentException("4-byte out of range: " + Utility.hex(cp) + ", " + Utility.hex(last2));
1186 }
1187 */
1188 return (last2 << 24) + (last1 << 16) + (last0 << 8);
1189 } else {
1190 int32_t last1 = last0 / final4Count;
1191 last0 %= final4Count;
1192
1193 int32_t last2 = last1 / medialCount;
1194 last1 %= medialCount;
1195
1196 int32_t last3 = last2 / medialCount;
1197 last2 %= medialCount;
1198
1199 last0 = minTrail + last0*final4Multiplier; // spread out, leaving gap at start
1200 last1 = minTrail + last1; // offset
1201 last2 = minTrail + last2; // offset
1202 last3 = min4Primary + last3; // offset
1203 /*
1204 if (last3 > max4Primary) {
1205 throw new IllegalArgumentException("4-byte out of range: " + Utility.hex(cp) + ", " + Utility.hex(last3));
1206 }
1207 */
1208 return (last3 << 24) + (last2 << 16) + (last1 << 8) + last0;
1209 }
1210 }
1211
1212 static uint32_t U_EXPORT2
uprv_uca_getImplicitPrimary(UChar32 cp)1213 uprv_uca_getImplicitPrimary(UChar32 cp) {
1214 //fprintf(stdout, "Incoming: %04x\n", cp);
1215 //if (DEBUG) System.out.println("Incoming: " + Utility.hex(cp));
1216
1217 cp = swapCJK(cp);
1218 cp++;
1219 // we now have a range of numbers from 0 to 21FFFF.
1220
1221 //if (DEBUG) System.out.println("CJK swapped: " + Utility.hex(cp));
1222 //fprintf(stdout, "CJK swapped: %04x\n", cp);
1223
1224 return uprv_uca_getImplicitFromRaw(cp);
1225 }
1226
1227 /**
1228 * Converts implicit CE into raw integer ("code point")
1229 * @param implicit
1230 * @return -1 if illegal format
1231 */
1232 U_CAPI UChar32 U_EXPORT2
uprv_uca_getRawFromImplicit(uint32_t implicit)1233 uprv_uca_getRawFromImplicit(uint32_t implicit) {
1234 UChar32 result;
1235 UChar32 b3 = implicit & 0xFF;
1236 UChar32 b2 = (implicit >> 8) & 0xFF;
1237 UChar32 b1 = (implicit >> 16) & 0xFF;
1238 UChar32 b0 = (implicit >> 24) & 0xFF;
1239
1240 // simple parameter checks
1241 if (b0 < min3Primary || b0 > max4Primary
1242 || b1 < minTrail || b1 > maxTrail)
1243 return -1;
1244 // normal offsets
1245 b1 -= minTrail;
1246
1247 // take care of the final values, and compose
1248 if (b0 < min4Primary) {
1249 if (b2 < minTrail || b2 > max3Trail || b3 != 0)
1250 return -1;
1251 b2 -= minTrail;
1252 UChar32 remainder = b2 % final3Multiplier;
1253 if (remainder != 0)
1254 return -1;
1255 b0 -= min3Primary;
1256 b2 /= final3Multiplier;
1257 result = ((b0 * medialCount) + b1) * final3Count + b2;
1258 } else {
1259 if (b2 < minTrail || b2 > maxTrail
1260 || b3 < minTrail || b3 > max4Trail)
1261 return -1;
1262 b2 -= minTrail;
1263 b3 -= minTrail;
1264 UChar32 remainder = b3 % final4Multiplier;
1265 if (remainder != 0)
1266 return -1;
1267 b3 /= final4Multiplier;
1268 b0 -= min4Primary;
1269 result = (((b0 * medialCount) + b1) * medialCount + b2) * final4Count + b3 + min4Boundary;
1270 }
1271 // final check
1272 if (result < 0 || result > UCOL_MAX_INPUT)
1273 return -1;
1274 return result;
1275 }
1276
1277
/* Computes 1 + (a-1)/b, which is a/b rounded up when a and b are positive. */
static inline int32_t divideAndRoundUp(int a, int b) {
    const int wholeStepsBelow = (a - 1) / b;
    return wholeStepsBelow + 1;
}
1281
1282 /* this function is either called from initUCA or from genUCA before
1283 * doing canonical closure for the UCA.
1284 */
1285
1286 /**
1287 * Set up to generate implicits.
1288 * Maintenance Note: this function may end up being called more than once, due
1289 * to threading races during initialization. Make sure that
1290 * none of the Constants is ever transiently assigned an
1291 * incorrect value.
1292 * @param minPrimary
1293 * @param maxPrimary
1294 * @param minTrail final byte
1295 * @param maxTrail final byte
1296 * @param gap3 the gap we leave for tailoring for 3-byte forms
1297 * @param gap4 the gap we leave for tailoring for 4-byte forms
1298 */
initImplicitConstants(int minPrimary,int maxPrimary,int minTrailIn,int maxTrailIn,int gap3,int primaries3count,UErrorCode * status)1299 static void initImplicitConstants(int minPrimary, int maxPrimary,
1300 int minTrailIn, int maxTrailIn,
1301 int gap3, int primaries3count,
1302 UErrorCode *status) {
1303 // some simple parameter checks
1304 if ((minPrimary < 0 || minPrimary >= maxPrimary || maxPrimary > 0xFF)
1305 || (minTrailIn < 0 || minTrailIn >= maxTrailIn || maxTrailIn > 0xFF)
1306 || (primaries3count < 1))
1307 {
1308 *status = U_ILLEGAL_ARGUMENT_ERROR;
1309 return;
1310 };
1311
1312 minTrail = minTrailIn;
1313 maxTrail = maxTrailIn;
1314
1315 min3Primary = minPrimary;
1316 max4Primary = maxPrimary;
1317 // compute constants for use later.
1318 // number of values we can use in trailing bytes
1319 // leave room for empty values between AND above, e.g. if gap = 2
1320 // range 3..7 => +3 -4 -5 -6 -7: so 1 value
1321 // range 3..8 => +3 -4 -5 +6 -7 -8: so 2 values
1322 // range 3..9 => +3 -4 -5 +6 -7 -8 -9: so 2 values
1323 final3Multiplier = gap3 + 1;
1324 final3Count = (maxTrail - minTrail + 1) / final3Multiplier;
1325 max3Trail = minTrail + (final3Count - 1) * final3Multiplier;
1326
1327 // medials can use full range
1328 medialCount = (maxTrail - minTrail + 1);
1329 // find out how many values fit in each form
1330 int32_t threeByteCount = medialCount * final3Count;
1331 // now determine where the 3/4 boundary is.
1332 // we use 3 bytes below the boundary, and 4 above
1333 int32_t primariesAvailable = maxPrimary - minPrimary + 1;
1334 int32_t primaries4count = primariesAvailable - primaries3count;
1335
1336
1337 int32_t min3ByteCoverage = primaries3count * threeByteCount;
1338 min4Primary = minPrimary + primaries3count;
1339 min4Boundary = min3ByteCoverage;
1340 // Now expand out the multiplier for the 4 bytes, and redo.
1341
1342 int32_t totalNeeded = UCOL_MAX_INPUT - min4Boundary;
1343 int32_t neededPerPrimaryByte = divideAndRoundUp(totalNeeded, primaries4count);
1344 int32_t neededPerFinalByte = divideAndRoundUp(neededPerPrimaryByte, medialCount * medialCount);
1345 int32_t gap4 = (maxTrail - minTrail - 1) / neededPerFinalByte;
1346 if (gap4 < 1) {
1347 *status = U_ILLEGAL_ARGUMENT_ERROR;
1348 return;
1349 }
1350 final4Multiplier = gap4 + 1;
1351 final4Count = neededPerFinalByte;
1352 max4Trail = minTrail + (final4Count - 1) * final4Multiplier;
1353 }
1354
1355 /**
1356 * Supply parameters for generating implicit CEs
1357 */
1358 U_CAPI void U_EXPORT2
uprv_uca_initImplicitConstants(UErrorCode * status)1359 uprv_uca_initImplicitConstants(UErrorCode *status) {
1360 // 13 is the largest 4-byte gap we can use without getting 2 four-byte forms.
1361 //initImplicitConstants(minPrimary, maxPrimary, 0x04, 0xFE, 1, 1, status);
1362 initImplicitConstants(minImplicitPrimary, maxImplicitPrimary, 0x04, 0xFE, 1, 1, status);
1363 }
1364
1365
1366 /* collIterNormalize Incremental Normalization happens here. */
1367 /* pick up the range of chars identifed by FCD, */
1368 /* normalize it into the collIterate's writable buffer, */
1369 /* switch the collIterate's state to use the writable buffer. */
1370 /* */
1371 static
collIterNormalize(collIterate * collationSource)1372 void collIterNormalize(collIterate *collationSource)
1373 {
1374 UErrorCode status = U_ZERO_ERROR;
1375 const UChar *srcP = collationSource->pos - 1; /* Start of chars to normalize */
1376 const UChar *endP = collationSource->fcdPosition; /* End of region to normalize+1 */
1377
1378 collationSource->nfd->normalize(UnicodeString(FALSE, srcP, (int32_t)(endP - srcP)),
1379 collationSource->writableBuffer,
1380 status);
1381 if (U_FAILURE(status)) {
1382 #ifdef UCOL_DEBUG
1383 fprintf(stderr, "collIterNormalize(), NFD failed, status = %s\n", u_errorName(status));
1384 #endif
1385 return;
1386 }
1387
1388 collationSource->pos = collationSource->writableBuffer.getTerminatedBuffer();
1389 collationSource->origFlags = collationSource->flags;
1390 collationSource->flags |= UCOL_ITER_INNORMBUF;
1391 collationSource->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR);
1392 }
1393
1394
1395 // This function takes the iterator and extracts normalized stuff up to the next boundary
1396 // It is similar in the end results to the collIterNormalize, but for the cases when we
1397 // use an iterator
1398 /*static
1399 inline void normalizeIterator(collIterate *collationSource) {
1400 UErrorCode status = U_ZERO_ERROR;
1401 UBool wasNormalized = FALSE;
1402 //int32_t iterIndex = collationSource->iterator->getIndex(collationSource->iterator, UITER_CURRENT);
1403 uint32_t iterIndex = collationSource->iterator->getState(collationSource->iterator);
1404 int32_t normLen = unorm_next(collationSource->iterator, collationSource->writableBuffer,
1405 (int32_t)collationSource->writableBufSize, UNORM_FCD, 0, TRUE, &wasNormalized, &status);
1406 if(status == U_BUFFER_OVERFLOW_ERROR || normLen == (int32_t)collationSource->writableBufSize) {
1407 // reallocate and terminate
1408 if(!u_growBufferFromStatic(collationSource->stackWritableBuffer,
1409 &collationSource->writableBuffer,
1410 (int32_t *)&collationSource->writableBufSize, normLen + 1,
1411 0)
1412 ) {
1413 #ifdef UCOL_DEBUG
1414 fprintf(stderr, "normalizeIterator(), out of memory\n");
1415 #endif
1416 return;
1417 }
1418 status = U_ZERO_ERROR;
1419 //collationSource->iterator->move(collationSource->iterator, iterIndex, UITER_ZERO);
1420 collationSource->iterator->setState(collationSource->iterator, iterIndex, &status);
1421 normLen = unorm_next(collationSource->iterator, collationSource->writableBuffer,
1422 (int32_t)collationSource->writableBufSize, UNORM_FCD, 0, TRUE, &wasNormalized, &status);
1423 }
1424 // Terminate the buffer - we already checked that it is big enough
1425 collationSource->writableBuffer[normLen] = 0;
1426 if(collationSource->writableBuffer != collationSource->stackWritableBuffer) {
1427 collationSource->flags |= UCOL_ITER_ALLOCATED;
1428 }
1429 collationSource->pos = collationSource->writableBuffer;
1430 collationSource->origFlags = collationSource->flags;
1431 collationSource->flags |= UCOL_ITER_INNORMBUF;
1432 collationSource->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR);
1433 }*/
1434
1435
1436 /* Incremental FCD check and normalize */
1437 /* Called from getNextCE when normalization state is suspect. */
1438 /* When entering, the state is known to be this: */
1439 /* o We are working in the main buffer of the collIterate, not the side */
1440 /* writable buffer. When in the side buffer, normalization mode is always off, */
1441 /* so we won't get here. */
1442 /* o The leading combining class from the current character is 0 or */
1443 /* the trailing combining class of the previous char was zero. */
1444 /* True because the previous call to this function will have always exited */
1445 /* that way, and we get called for every char where cc might be non-zero. */
1446 static
collIterFCD(collIterate * collationSource)1447 inline UBool collIterFCD(collIterate *collationSource) {
1448 const UChar *srcP, *endP;
1449 uint8_t leadingCC;
1450 uint8_t prevTrailingCC = 0;
1451 uint16_t fcd;
1452 UBool needNormalize = FALSE;
1453
1454 srcP = collationSource->pos-1;
1455
1456 if (collationSource->flags & UCOL_ITER_HASLEN) {
1457 endP = collationSource->endp;
1458 } else {
1459 endP = NULL;
1460 }
1461
1462 // Get the trailing combining class of the current character. If it's zero, we are OK.
1463 fcd = g_nfcImpl->nextFCD16(srcP, endP);
1464 if (fcd != 0) {
1465 prevTrailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);
1466
1467 if (prevTrailingCC != 0) {
1468 // The current char has a non-zero trailing CC. Scan forward until we find
1469 // a char with a leading cc of zero.
1470 while (endP == NULL || srcP != endP)
1471 {
1472 const UChar *savedSrcP = srcP;
1473
1474 fcd = g_nfcImpl->nextFCD16(srcP, endP);
1475 leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_);
1476 if (leadingCC == 0) {
1477 srcP = savedSrcP; // Hit char that is not part of combining sequence.
1478 // back up over it. (Could be surrogate pair!)
1479 break;
1480 }
1481
1482 if (leadingCC < prevTrailingCC) {
1483 needNormalize = TRUE;
1484 }
1485
1486 prevTrailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);
1487 }
1488 }
1489 }
1490
1491 collationSource->fcdPosition = (UChar *)srcP;
1492
1493 return needNormalize;
1494 }
1495
1496 /****************************************************************************/
1497 /* Following are the CE retrieval functions */
1498 /* */
1499 /****************************************************************************/
1500
/* Forward declarations: defined later in this file, needed by the CE retrieval functions below. */
static uint32_t getImplicit(UChar32 cp, collIterate *collationSource);
static uint32_t getPrevImplicit(UChar32 cp, collIterate *collationSource);
1503
/* there should be a macro version of this function in the header file */
/* This is the first function that tries to fetch a collation element  */
/* If it's not successful or it encounters a more difficult situation  */
/* some more sophisticated and slower functions are invoked            */
/*                                                                      */
/* Order of work:                                                       */
/*   1. return a CE still buffered from an earlier expansion, if any;   */
/*   2. fetch the next code unit from the string, the side buffer or    */
/*      the character iterator, running the incremental FCD check and   */
/*      normalization when UCOL_ITER_NORM is set;                       */
/*   3. look the code unit up in the collator's tables, then in         */
/*      coll->UCA, resolving special CEs via ucol_prv_getSpecialCE();   */
/*   4. fall back to getImplicit() if nothing was found.                */
/* Returns UCOL_NO_MORE_CES at the end of the input.                    */
static
inline uint32_t ucol_IGetNextCE(const UCollator *coll, collIterate *collationSource, UErrorCode *status) {
    uint32_t order = 0;
    if (collationSource->CEpos > collationSource->toReturn) {       /* Are there any CEs from previous expansions? */
        order = *(collationSource->toReturn++);                         /* if so, return them */
        if(collationSource->CEpos == collationSource->toReturn) {
            /* Buffer drained: rewind both pointers to the start of the CE buffer in use. */
            collationSource->CEpos = collationSource->toReturn = collationSource->extendCEs ? collationSource->extendCEs : collationSource->CEs;
        }
        return order;
    }

    UChar ch = 0;
    collationSource->offsetReturn = NULL;

    /* The outer loop repeats while the CE found is UCOL_IGNORABLE and ch is in the Hangul range. */
    do {
        for (;;)                           /* Loop handles case when incremental normalize switches   */
        {                                  /*   to or from the side buffer / original string, and we  */
            /*   need to start again to get the next character.        */

            if ((collationSource->flags & (UCOL_ITER_HASLEN | UCOL_ITER_INNORMBUF | UCOL_ITER_NORM | UCOL_HIRAGANA_Q | UCOL_USE_ITERATOR)) == 0)
            {
                // The source string is null terminated and we're not working from the side buffer,
                //   and we're not normalizing.  This is the fast path.
                //   (We can be in the side buffer for Thai pre-vowel reordering even when not normalizing.)
                ch = *collationSource->pos++;
                if (ch != 0) {
                    break;
                }
                else {
                    return UCOL_NO_MORE_CES;
                }
            }

            if (collationSource->flags & UCOL_ITER_HASLEN) {
                // Normal path for strings when length is specified.
                //   (We can't be in side buffer because it is always null terminated.)
                if (collationSource->pos >= collationSource->endp) {
                    // Ran off of the end of the main source string.  We're done.
                    return UCOL_NO_MORE_CES;
                }
                ch = *collationSource->pos++;
            }
            else if(collationSource->flags & UCOL_USE_ITERATOR) {
                // Input comes from a character iterator; U_SENTINEL marks its end.
                UChar32 iterCh = collationSource->iterator->next(collationSource->iterator);
                if(iterCh == U_SENTINEL) {
                    return UCOL_NO_MORE_CES;
                }
                ch = (UChar)iterCh;
            }
            else
            {
                // Null terminated string.
                ch = *collationSource->pos++;
                if (ch == 0) {
                    // Ran off end of buffer.
                    if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) {
                        // Ran off end of main string. backing up one character.
                        collationSource->pos--;
                        return UCOL_NO_MORE_CES;
                    }
                    else
                    {
                        // Hit null in the normalize side buffer.
                        // Usually this means the end of the normalized data,
                        // except for one odd case: a null followed by combining chars,
                        //   which is the case if we are at the start of the buffer.
                        if (collationSource->pos == collationSource->writableBuffer.getBuffer()+1) {
                            break;
                        }

                        //  Null marked end of side buffer.
                        //   Revert to the main string and
                        //   loop back to top to try again to get a character.
                        collationSource->pos   = collationSource->fcdPosition;
                        collationSource->flags = collationSource->origFlags;
                        continue;
                    }
                }
            }

            if(collationSource->flags&UCOL_HIRAGANA_Q) {
                /* Codepoints \u3099-\u309C are both Hiragana and Katakana. Set the flag
                 * based on whether the previous codepoint was Hiragana or Katakana.
                 */
                if(((ch>=0x3040 && ch<=0x3096) || (ch >= 0x309d && ch <= 0x309f)) ||
                        ((collationSource->flags & UCOL_WAS_HIRAGANA) && (ch >= 0x3099 && ch <= 0x309C))) {
                    collationSource->flags |= UCOL_WAS_HIRAGANA;
                } else {
                    collationSource->flags &= ~UCOL_WAS_HIRAGANA;
                }
            }

            // We've got a character.  See if there's any fcd and/or normalization stuff to do.
            //    Note that UCOL_ITER_NORM flag is always zero when we are in the side buffer.
            if ((collationSource->flags & UCOL_ITER_NORM) == 0) {
                break;
            }

            if (collationSource->fcdPosition >= collationSource->pos) {
                // An earlier FCD check has already covered the current character.
                // We can go ahead and process this char.
                break;
            }

            if (ch < ZERO_CC_LIMIT_ ) {
                // Fast fcd safe path.  Trailing combining class == 0.  This char is OK.
                break;
            }

            if (ch < NFC_ZERO_CC_BLOCK_LIMIT_) {
                // We need to peek at the next character in order to tell if we are FCD
                if ((collationSource->flags & UCOL_ITER_HASLEN) && collationSource->pos >= collationSource->endp) {
                    // We are at the last char of source string.
                    //  It is always OK for FCD check.
                    break;
                }

                // Not at last char of source string (or we'll check against terminating null).  Do the FCD fast test
                if (*collationSource->pos < NFC_ZERO_CC_BLOCK_LIMIT_) {
                    break;
                }
            }


            // Need a more complete FCD check and possible normalization.
            if (collIterFCD(collationSource)) {
                collIterNormalize(collationSource);
            }
            if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) {
                //  No normalization was needed.  Go ahead and process the char we already had.
                break;
            }

            // Some normalization happened.  Next loop iteration will pick up a char
            //   from the normalization buffer.

        }   // end for (;;)


        if (ch <= 0xFF) {
            /*  For latin-1 characters we never need to fall back to the UCA table        */
            /*    because all of the UCA data is replicated in the latinOneMapping array  */
            order = coll->latinOneMapping[ch];
            if (order > UCOL_NOT_FOUND) {
                /* Values above UCOL_NOT_FOUND are special CEs that need further processing. */
                order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, status);
            }
        }
        else
        {
            // Always use UCA for Han, Hangul
            // (Han extension A is before main Han block)
            // **** Han compatibility chars ?? ****
            if ((collationSource->flags & UCOL_FORCE_HAN_IMPLICIT) != 0 &&
                (ch >= UCOL_FIRST_HAN_A && ch <= UCOL_LAST_HANGUL)) {
                if (ch > UCOL_LAST_HAN && ch < UCOL_FIRST_HANGUL) {
                    // between the two target ranges; do normal lookup
                    // **** this range is YI, Modifier tone letters, ****
                    // **** Latin-D, Syloti Nagari, Phagas-pa.       ****
                    // **** Latin-D might be tailored, so we need to ****
                    // **** do the normal lookup for these guys.     ****
                    order = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
                } else {
                    // in one of the target ranges; use UCA
                    order = UCOL_NOT_FOUND;
                }
            } else {
                order = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
            }

            if(order > UCOL_NOT_FOUND) {                                       /* if a CE is special                */
                order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, status);    /* and try to get the special CE     */
            }

            if(order == UCOL_NOT_FOUND && coll->UCA) {   /* We couldn't find a good CE in the tailoring */
                /* if we got here, the codepoint MUST be over 0xFF - so we look directly in the trie */
                order = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);

                if(order > UCOL_NOT_FOUND) { /* UCA also gives us a special CE */
                    order = ucol_prv_getSpecialCE(coll->UCA, ch, order, collationSource, status);
                }
            }
        }
    } while ( order == UCOL_IGNORABLE && ch >= UCOL_FIRST_HANGUL && ch <= UCOL_LAST_HANGUL );

    if(order == UCOL_NOT_FOUND) {
        /* Neither the tailoring nor the UCA had a CE: generate an implicit one. */
        order = getImplicit(ch, collationSource);
    }
    return order; /* return the CE */
}
1697
1698 /* ucol_getNextCE, out-of-line version for use from other files. */
1699 U_CAPI uint32_t U_EXPORT2
ucol_getNextCE(const UCollator * coll,collIterate * collationSource,UErrorCode * status)1700 ucol_getNextCE(const UCollator *coll, collIterate *collationSource, UErrorCode *status) {
1701 return ucol_IGetNextCE(coll, collationSource, status);
1702 }
1703
1704
1705 /**
1706 * Incremental previous normalization happens here. Pick up the range of chars
1707 * identifed by FCD, normalize it into the collIterate's writable buffer,
1708 * switch the collIterate's state to use the writable buffer.
1709 * @param data collation iterator data
1710 */
1711 static
collPrevIterNormalize(collIterate * data)1712 void collPrevIterNormalize(collIterate *data)
1713 {
1714 UErrorCode status = U_ZERO_ERROR;
1715 const UChar *pEnd = data->pos; /* End normalize + 1 */
1716 const UChar *pStart;
1717
1718 /* Start normalize */
1719 if (data->fcdPosition == NULL) {
1720 pStart = data->string;
1721 }
1722 else {
1723 pStart = data->fcdPosition + 1;
1724 }
1725
1726 int32_t normLen =
1727 data->nfd->normalize(UnicodeString(FALSE, pStart, (int32_t)((pEnd - pStart) + 1)),
1728 data->writableBuffer,
1729 status).
1730 length();
1731 if(U_FAILURE(status)) {
1732 return;
1733 }
1734 /*
1735 this puts the null termination infront of the normalized string instead
1736 of the end
1737 */
1738 data->writableBuffer.insert(0, (UChar)0);
1739
1740 /*
1741 * The usual case at this point is that we've got a base
1742 * character followed by marks that were normalized. If
1743 * fcdPosition is NULL, that means that we backed up to
1744 * the beginning of the string and there's no base character.
1745 *
1746 * Forward processing will usually normalize when it sees
1747 * the first mark, so that mark will get it's natural offset
1748 * and the rest will get the offset of the character following
1749 * the marks. The base character will also get its natural offset.
1750 *
1751 * We write the offset of the base character, if there is one,
1752 * followed by the offset of the first mark and then the offsets
1753 * of the rest of the marks.
1754 */
1755 int32_t firstMarkOffset = 0;
1756 int32_t trailOffset = (int32_t)(data->pos - data->string + 1);
1757 int32_t trailCount = normLen - 1;
1758
1759 if (data->fcdPosition != NULL) {
1760 int32_t baseOffset = (int32_t)(data->fcdPosition - data->string);
1761 UChar baseChar = *data->fcdPosition;
1762
1763 firstMarkOffset = baseOffset + 1;
1764
1765 /*
1766 * If the base character is the start of a contraction, forward processing
1767 * will normalize the marks while checking for the contraction, which means
1768 * that the offset of the first mark will the same as the other marks.
1769 *
1770 * **** THIS IS PROBABLY NOT A COMPLETE TEST ****
1771 */
1772 if (baseChar >= 0x100) {
1773 uint32_t baseOrder = UTRIE_GET32_FROM_LEAD(&data->coll->mapping, baseChar);
1774
1775 if (baseOrder == UCOL_NOT_FOUND && data->coll->UCA) {
1776 baseOrder = UTRIE_GET32_FROM_LEAD(&data->coll->UCA->mapping, baseChar);
1777 }
1778
1779 if (baseOrder > UCOL_NOT_FOUND && getCETag(baseOrder) == CONTRACTION_TAG) {
1780 firstMarkOffset = trailOffset;
1781 }
1782 }
1783
1784 data->appendOffset(baseOffset, status);
1785 }
1786
1787 data->appendOffset(firstMarkOffset, status);
1788
1789 for (int32_t i = 0; i < trailCount; i += 1) {
1790 data->appendOffset(trailOffset, status);
1791 }
1792
1793 data->offsetRepeatValue = trailOffset;
1794
1795 data->offsetReturn = data->offsetStore - 1;
1796 if (data->offsetReturn == data->offsetBuffer) {
1797 data->offsetStore = data->offsetBuffer;
1798 }
1799
1800 data->pos = data->writableBuffer.getTerminatedBuffer() + 1 + normLen;
1801 data->origFlags = data->flags;
1802 data->flags |= UCOL_ITER_INNORMBUF;
1803 data->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
1804 }
1805
1806
1807 /**
1808 * Incremental FCD check for previous iteration and normalize. Called from
1809 * getPrevCE when normalization state is suspect.
1810 * When entering, the state is known to be this:
1811 * o We are working in the main buffer of the collIterate, not the side
1812 * writable buffer. When in the side buffer, normalization mode is always
1813 * off, so we won't get here.
1814 * o The leading combining class from the current character is 0 or the
1815 * trailing combining class of the previous char was zero.
1816 * True because the previous call to this function will have always exited
1817 * that way, and we get called for every char where cc might be non-zero.
1818 * @param data collation iterate struct
1819 * @return normalization status, TRUE for normalization to be done, FALSE
1820 * otherwise
1821 */
1822 static
collPrevIterFCD(collIterate * data)1823 inline UBool collPrevIterFCD(collIterate *data)
1824 {
1825 const UChar *src, *start;
1826 uint8_t leadingCC;
1827 uint8_t trailingCC = 0;
1828 uint16_t fcd;
1829 UBool result = FALSE;
1830
1831 start = data->string;
1832 src = data->pos + 1;
1833
1834 /* Get the trailing combining class of the current character. */
1835 fcd = g_nfcImpl->previousFCD16(start, src);
1836
1837 leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_);
1838
1839 if (leadingCC != 0) {
1840 /*
1841 The current char has a non-zero leading combining class.
1842 Scan backward until we find a char with a trailing cc of zero.
1843 */
1844 for (;;)
1845 {
1846 if (start == src) {
1847 data->fcdPosition = NULL;
1848 return result;
1849 }
1850
1851 fcd = g_nfcImpl->previousFCD16(start, src);
1852
1853 trailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);
1854
1855 if (trailingCC == 0) {
1856 break;
1857 }
1858
1859 if (leadingCC < trailingCC) {
1860 result = TRUE;
1861 }
1862
1863 leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_);
1864 }
1865 }
1866
1867 data->fcdPosition = (UChar *)src;
1868
1869 return result;
1870 }
1871
1872 /** gets a code unit from the string at a given offset
1873 * Handles both normal and iterative cases.
1874 * No error checking - caller beware!
1875 */
1876 static inline
peekCodeUnit(collIterate * source,int32_t offset)1877 UChar peekCodeUnit(collIterate *source, int32_t offset) {
1878 if(source->pos != NULL) {
1879 return *(source->pos + offset);
1880 } else if(source->iterator != NULL) {
1881 UChar32 c;
1882 if(offset != 0) {
1883 source->iterator->move(source->iterator, offset, UITER_CURRENT);
1884 c = source->iterator->next(source->iterator);
1885 source->iterator->move(source->iterator, -offset-1, UITER_CURRENT);
1886 } else {
1887 c = source->iterator->current(source->iterator);
1888 }
1889 return c >= 0 ? (UChar)c : 0xfffd; // If the caller works properly, we should never see c<0.
1890 } else {
1891 return 0xfffd;
1892 }
1893 }
1894
1895 // Code point version. Treats the offset as a _code point_ delta.
1896 // We cannot use U16_FWD_1_UNSAFE and similar because we might not have well-formed UTF-16.
1897 // We cannot use U16_FWD_1 and similar because we do not know the start and limit of the buffer.
1898 static inline
peekCodePoint(collIterate * source,int32_t offset)1899 UChar32 peekCodePoint(collIterate *source, int32_t offset) {
1900 UChar32 c;
1901 if(source->pos != NULL) {
1902 const UChar *p = source->pos;
1903 if(offset >= 0) {
1904 // Skip forward over (offset-1) code points.
1905 while(--offset >= 0) {
1906 if(U16_IS_LEAD(*p++) && U16_IS_TRAIL(*p)) {
1907 ++p;
1908 }
1909 }
1910 // Read the code point there.
1911 c = *p++;
1912 UChar trail;
1913 if(U16_IS_LEAD(c) && U16_IS_TRAIL(trail = *p)) {
1914 c = U16_GET_SUPPLEMENTARY(c, trail);
1915 }
1916 } else /* offset<0 */ {
1917 // Skip backward over (offset-1) code points.
1918 while(++offset < 0) {
1919 if(U16_IS_TRAIL(*--p) && U16_IS_LEAD(*(p - 1))) {
1920 --p;
1921 }
1922 }
1923 // Read the code point before that.
1924 c = *--p;
1925 UChar lead;
1926 if(U16_IS_TRAIL(c) && U16_IS_LEAD(lead = *(p - 1))) {
1927 c = U16_GET_SUPPLEMENTARY(lead, c);
1928 }
1929 }
1930 } else if(source->iterator != NULL) {
1931 if(offset >= 0) {
1932 // Skip forward over (offset-1) code points.
1933 int32_t fwd = offset;
1934 while(fwd-- > 0) {
1935 uiter_next32(source->iterator);
1936 }
1937 // Read the code point there.
1938 c = uiter_current32(source->iterator);
1939 // Return to the starting point, skipping backward over (offset-1) code points.
1940 while(offset-- > 0) {
1941 uiter_previous32(source->iterator);
1942 }
1943 } else /* offset<0 */ {
1944 // Read backward, reading offset code points, remember only the last-read one.
1945 int32_t back = offset;
1946 do {
1947 c = uiter_previous32(source->iterator);
1948 } while(++back < 0);
1949 // Return to the starting position, skipping forward over offset code points.
1950 do {
1951 uiter_next32(source->iterator);
1952 } while(++offset < 0);
1953 }
1954 } else {
1955 c = U_SENTINEL;
1956 }
1957 return c;
1958 }
1959
1960 /**
1961 * Determines if we are at the start of the data string in the backwards
1962 * collation iterator
1963 * @param data collation iterator
1964 * @return TRUE if we are at the start
1965 */
1966 static
isAtStartPrevIterate(collIterate * data)1967 inline UBool isAtStartPrevIterate(collIterate *data) {
1968 if(data->pos == NULL && data->iterator != NULL) {
1969 return !data->iterator->hasPrevious(data->iterator);
1970 }
1971 //return (collIter_bos(data)) ||
1972 return (data->pos == data->string) ||
1973 ((data->flags & UCOL_ITER_INNORMBUF) && (data->pos != NULL) &&
1974 *(data->pos - 1) == 0 && data->fcdPosition == NULL);
1975 }
1976
1977 static
goBackOne(collIterate * data)1978 inline void goBackOne(collIterate *data) {
1979 # if 0
1980 // somehow, it looks like we need to keep iterator synced up
1981 // at all times, as above.
1982 if(data->pos) {
1983 data->pos--;
1984 }
1985 if(data->iterator) {
1986 data->iterator->previous(data->iterator);
1987 }
1988 #endif
1989 if(data->iterator && (data->flags & UCOL_USE_ITERATOR)) {
1990 data->iterator->previous(data->iterator);
1991 }
1992 if(data->pos) {
1993 data->pos --;
1994 }
1995 }
1996
/**
 * Inline function that gets the previous collation element (CE) when
 * iterating backwards.
 * It first checks the expansion buffer. If that buffer still holds CEs that
 * have not been returned, the one before the return pointer is handed out and
 * the return pointer is decremented.
 * Otherwise the previous code unit is fetched from the string, the character
 * iterator or the normalization side buffer (running an incremental FCD check
 * and normalization when needed) and mapped to a CE. Special CEs and
 * contractions are resolved through ucol_prv_getSpecialPrevCE, unmapped
 * characters through getPrevImplicit.
 * @param coll collator data
 * @param data collation iterator struct
 * @param status error status, passed on to ucol_prv_getSpecialPrevCE
 * @return the previous CE, or UCOL_NO_MORE_CES when the start of the text has
 *         been reached
 */
static
inline uint32_t ucol_IGetPrevCE(const UCollator *coll, collIterate *data,
                               UErrorCode *status)
{
    uint32_t result = (uint32_t)UCOL_NULLORDER;

    /* Step the offset bookkeeping back by one entry (or one repeat). */
    if (data->offsetReturn != NULL) {
        if (data->offsetRepeatCount > 0) {
            data->offsetRepeatCount -= 1;
        } else {
            if (data->offsetReturn == data->offsetBuffer) {
                /* All stored offsets have been consumed; reset the store. */
                data->offsetReturn = NULL;
                data->offsetStore  = data->offsetBuffer;
            } else {
                data->offsetReturn -= 1;
            }
        }
    }

    /* Pending CEs in the expansion buffer (regular or extended) come first. */
    if ((data->extendCEs && data->toReturn > data->extendCEs) ||
            (!data->extendCEs && data->toReturn > data->CEs))
    {
        data->toReturn -= 1;
        result = *(data->toReturn);
        if (data->CEs == data->toReturn || data->extendCEs == data->toReturn) {
            /* Buffer drained: make the write position agree with the read position. */
            data->CEpos = data->toReturn;
        }
    }
    else {
        UChar ch = 0;

        /* Repeats while the CE is ignorable and ch lies in the Hangul range. */
        do {
            /*
            Loop handles case when incremental normalize switches to or from the
            side buffer / original string, and we need to start again to get the
            next character.
            */
            for (;;) {
                if (data->flags & UCOL_ITER_HASLEN) {
                    /*
                    Normal path for strings when length is specified.
                    Not in side buffer because it is always null terminated.
                    */
                    if (data->pos <= data->string) {
                        /* End of the main source string */
                        return UCOL_NO_MORE_CES;
                    }
                    data->pos --;
                    ch = *data->pos;
                }
                // we are using an iterator to go back. Pray for us!
                else if (data->flags & UCOL_USE_ITERATOR) {
                    UChar32 iterCh = data->iterator->previous(data->iterator);
                    if(iterCh == U_SENTINEL) {
                        return UCOL_NO_MORE_CES;
                    } else {
                        ch = (UChar)iterCh;
                    }
                }
                else {
                    data->pos --;
                    ch = *data->pos;
                    /* we are in the side buffer. */
                    if (ch == 0) {
                        /*
                        At the start of the normalize side buffer.
                        Go back to string.
                        Because pointer points to the last accessed character,
                        hence we have to increment it by one here.
                        */
                        data->flags = data->origFlags;
                        data->offsetRepeatValue = 0;

                        if (data->fcdPosition == NULL) {
                            data->pos = data->string;
                            return UCOL_NO_MORE_CES;
                        }
                        else {
                            data->pos   = data->fcdPosition + 1;
                        }

                        /* Fetch the next character from the original string. */
                        continue;
                    }
                }

                /* Track whether the character just read is in U+3040..U+309F. */
                if(data->flags&UCOL_HIRAGANA_Q) {
                    if(ch>=0x3040 && ch<=0x309f) {
                        data->flags |= UCOL_WAS_HIRAGANA;
                    } else {
                        data->flags &= ~UCOL_WAS_HIRAGANA;
                    }
                }

                /*
                * got a character to determine if there's fcd and/or normalization
                * stuff to do.
                * if the current character is not fcd.
                * if current character is at the start of the string
                * Trailing combining class == 0.
                * Note if pos is in the writablebuffer, norm is always 0
                */
                if (ch < ZERO_CC_LIMIT_ ||
                  // this should propel us out of the loop in the iterator case
                    (data->flags & UCOL_ITER_NORM) == 0 ||
                    (data->fcdPosition != NULL && data->fcdPosition <= data->pos)
                    || data->string == data->pos) {
                    break;
                }

                if (ch < NFC_ZERO_CC_BLOCK_LIMIT_) {
                    /* if next character is FCD */
                    if (data->pos == data->string) {
                        /* First char of string is always OK for FCD check */
                        break;
                    }

                    /* Not first char of string, do the FCD fast test */
                    if (*(data->pos - 1) < NFC_ZERO_CC_BLOCK_LIMIT_) {
                        break;
                    }
                }

                /* Need a more complete FCD check and possible normalization. */
                if (collPrevIterFCD(data)) {
                    collPrevIterNormalize(data);
                }

                if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
                    /*  No normalization. Go ahead and process the char. */
                    break;
                }

                /*
                Some normalization happened.
                Next loop picks up a char from the normalization buffer.
                */
            }

            /* attempt to handle contractions, after removal of the backwards
            contraction
            */
            if (ucol_contractionEndCP(ch, coll) && !isAtStartPrevIterate(data)) {
                result = ucol_prv_getSpecialPrevCE(coll, ch, UCOL_CONTRACTION, data, status);
            } else {
                if (ch <= 0xFF) {
                    /* Code units up to 0xFF are looked up in the latinOneMapping table. */
                    result = coll->latinOneMapping[ch];
                }
                else {
                    // Always use UCA for [3400..9FFF], [AC00..D7AF]
                    // **** [FA0E..FA2F] ?? ****
                    if ((data->flags & UCOL_FORCE_HAN_IMPLICIT) != 0 &&
                        (ch >= 0x3400 && ch <= 0xD7AF)) {
                        if (ch > 0x9FFF && ch < 0xAC00) {
                            // between the two target ranges; do normal lookup
                            // **** this range is YI, Modifier tone letters, ****
                            // **** Latin-D, Syloti Nagari, Phagas-pa.       ****
                            // **** Latin-D might be tailored, so we need to ****
                            // **** do the normal lookup for these guys.     ****
                            result = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
                        } else {
                            result = UCOL_NOT_FOUND;
                        }
                    } else {
                        result = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
                    }
                }
                /* Values above UCOL_NOT_FOUND are special CEs needing extra processing. */
                if (result > UCOL_NOT_FOUND) {
                    result = ucol_prv_getSpecialPrevCE(coll, ch, result, data, status);
                }
                if (result == UCOL_NOT_FOUND) { // Not found in master list
                    if (!isAtStartPrevIterate(data) &&
                        ucol_contractionEndCP(ch, data->coll))
                    {
                        result = UCOL_CONTRACTION;
                    } else {
                        if(coll->UCA) {
                            /* Fall back to the UCA mapping when one is attached. */
                            result = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);
                        }
                    }

                    if (result > UCOL_NOT_FOUND) {
                        if(coll->UCA) {
                            result = ucol_prv_getSpecialPrevCE(coll->UCA, ch, result, data, status);
                        }
                    }
                }
            }
        } while ( result == UCOL_IGNORABLE && ch >= UCOL_FIRST_HANGUL && ch <= UCOL_LAST_HANGUL );

        /* Still unmapped after all lookups: generate an implicit CE. */
        if(result == UCOL_NOT_FOUND) {
            result = getPrevImplicit(ch, data);
        }
    }

    return result;
}
2204
2205
2206 /* ucol_getPrevCE, out-of-line version for use from other files. */
2207 U_CFUNC uint32_t U_EXPORT2
ucol_getPrevCE(const UCollator * coll,collIterate * data,UErrorCode * status)2208 ucol_getPrevCE(const UCollator *coll, collIterate *data,
2209 UErrorCode *status) {
2210 return ucol_IGetPrevCE(coll, data, status);
2211 }
2212
2213
2214 /* this should be connected to special Jamo handling */
2215 U_CFUNC uint32_t U_EXPORT2
ucol_getFirstCE(const UCollator * coll,UChar u,UErrorCode * status)2216 ucol_getFirstCE(const UCollator *coll, UChar u, UErrorCode *status) {
2217 collIterate colIt;
2218 IInit_collIterate(coll, &u, 1, &colIt, status);
2219 if(U_FAILURE(*status)) {
2220 return 0;
2221 }
2222 return ucol_IGetNextCE(coll, &colIt, status);
2223 }
2224
2225 /**
2226 * Inserts the argument character into the end of the buffer pushing back the
2227 * null terminator.
2228 * @param data collIterate struct data
2229 * @param ch character to be appended
2230 * @return the position of the new addition
2231 */
2232 static
insertBufferEnd(collIterate * data,UChar ch)2233 inline const UChar * insertBufferEnd(collIterate *data, UChar ch)
2234 {
2235 int32_t oldLength = data->writableBuffer.length();
2236 return data->writableBuffer.append(ch).getTerminatedBuffer() + oldLength;
2237 }
2238
2239 /**
2240 * Inserts the argument string into the end of the buffer pushing back the
2241 * null terminator.
2242 * @param data collIterate struct data
2243 * @param string to be appended
2244 * @param length of the string to be appended
2245 * @return the position of the new addition
2246 */
2247 static
insertBufferEnd(collIterate * data,const UChar * str,int32_t length)2248 inline const UChar * insertBufferEnd(collIterate *data, const UChar *str, int32_t length)
2249 {
2250 int32_t oldLength = data->writableBuffer.length();
2251 return data->writableBuffer.append(str, length).getTerminatedBuffer() + oldLength;
2252 }
2253
2254 /**
2255 * Special normalization function for contraction in the forwards iterator.
2256 * This normalization sequence will place the current character at source->pos
2257 * and its following normalized sequence into the buffer.
2258 * The fcd position, pos will be changed.
2259 * pos will now point to positions in the buffer.
2260 * Flags will be changed accordingly.
2261 * @param data collation iterator data
2262 */
2263 static
normalizeNextContraction(collIterate * data)2264 inline void normalizeNextContraction(collIterate *data)
2265 {
2266 int32_t strsize;
2267 UErrorCode status = U_ZERO_ERROR;
2268 /* because the pointer points to the next character */
2269 const UChar *pStart = data->pos - 1;
2270 const UChar *pEnd;
2271
2272 if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
2273 data->writableBuffer.setTo(*(pStart - 1));
2274 strsize = 1;
2275 }
2276 else {
2277 strsize = data->writableBuffer.length();
2278 }
2279
2280 pEnd = data->fcdPosition;
2281
2282 data->writableBuffer.append(
2283 data->nfd->normalize(UnicodeString(FALSE, pStart, (int32_t)(pEnd - pStart)), status));
2284 if(U_FAILURE(status)) {
2285 return;
2286 }
2287
2288 data->pos = data->writableBuffer.getTerminatedBuffer() + strsize;
2289 data->origFlags = data->flags;
2290 data->flags |= UCOL_ITER_INNORMBUF;
2291 data->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
2292 }
2293
2294 /**
2295 * Contraction character management function that returns the next character
2296 * for the forwards iterator.
2297 * Does nothing if the next character is in buffer and not the first character
2298 * in it.
2299 * Else it checks next character in data string to see if it is normalizable.
2300 * If it is not, the character is simply copied into the buffer, else
2301 * the whole normalized substring is copied into the buffer, including the
2302 * current character.
2303 * @param data collation element iterator data
2304 * @return next character
2305 */
2306 static
getNextNormalizedChar(collIterate * data)2307 inline UChar getNextNormalizedChar(collIterate *data)
2308 {
2309 UChar nextch;
2310 UChar ch;
2311 // Here we need to add the iterator code. One problem is the way
2312 // end of string is handled. If we just return next char, it could
2313 // be the sentinel. Most of the cases already check for this, but we
2314 // need to be sure.
2315 if ((data->flags & (UCOL_ITER_NORM | UCOL_ITER_INNORMBUF)) == 0 ) {
2316 /* if no normalization and not in buffer. */
2317 if(data->flags & UCOL_USE_ITERATOR) {
2318 return (UChar)data->iterator->next(data->iterator);
2319 } else {
2320 return *(data->pos ++);
2321 }
2322 }
2323
2324 //if (data->flags & UCOL_ITER_NORM && data->flags & UCOL_USE_ITERATOR) {
2325 //normalizeIterator(data);
2326 //}
2327
2328 UBool innormbuf = (UBool)(data->flags & UCOL_ITER_INNORMBUF);
2329 if ((innormbuf && *data->pos != 0) ||
2330 (data->fcdPosition != NULL && !innormbuf &&
2331 data->pos < data->fcdPosition)) {
2332 /*
2333 if next character is in normalized buffer, no further normalization
2334 is required
2335 */
2336 return *(data->pos ++);
2337 }
2338
2339 if (data->flags & UCOL_ITER_HASLEN) {
2340 /* in data string */
2341 if (data->pos + 1 == data->endp) {
2342 return *(data->pos ++);
2343 }
2344 }
2345 else {
2346 if (innormbuf) {
2347 // inside the normalization buffer, but at the end
2348 // (since we encountered zero). This means, in the
2349 // case we're using char iterator, that we need to
2350 // do another round of normalization.
2351 //if(data->origFlags & UCOL_USE_ITERATOR) {
2352 // we need to restore original flags,
2353 // otherwise, we'll lose them
2354 //data->flags = data->origFlags;
2355 //normalizeIterator(data);
2356 //return *(data->pos++);
2357 //} else {
2358 /*
2359 in writable buffer, at this point fcdPosition can not be
2360 pointing to the end of the data string. see contracting tag.
2361 */
2362 if(data->fcdPosition) {
2363 if (*(data->fcdPosition + 1) == 0 ||
2364 data->fcdPosition + 1 == data->endp) {
2365 /* at the end of the string, dump it into the normalizer */
2366 data->pos = insertBufferEnd(data, *(data->fcdPosition)) + 1;
2367 // Check if data->pos received a null pointer
2368 if (data->pos == NULL) {
2369 return (UChar)-1; // Return to indicate error.
2370 }
2371 return *(data->fcdPosition ++);
2372 }
2373 data->pos = data->fcdPosition;
2374 } else if(data->origFlags & UCOL_USE_ITERATOR) {
2375 // if we are here, we're using a normalizing iterator.
2376 // we should just continue further.
2377 data->flags = data->origFlags;
2378 data->pos = NULL;
2379 return (UChar)data->iterator->next(data->iterator);
2380 }
2381 //}
2382 }
2383 else {
2384 if (*(data->pos + 1) == 0) {
2385 return *(data->pos ++);
2386 }
2387 }
2388 }
2389
2390 ch = *data->pos ++;
2391 nextch = *data->pos;
2392
2393 /*
2394 * if the current character is not fcd.
2395 * Trailing combining class == 0.
2396 */
2397 if ((data->fcdPosition == NULL || data->fcdPosition < data->pos) &&
2398 (nextch >= NFC_ZERO_CC_BLOCK_LIMIT_ ||
2399 ch >= NFC_ZERO_CC_BLOCK_LIMIT_)) {
2400 /*
2401 Need a more complete FCD check and possible normalization.
2402 normalize substring will be appended to buffer
2403 */
2404 if (collIterFCD(data)) {
2405 normalizeNextContraction(data);
2406 return *(data->pos ++);
2407 }
2408 else if (innormbuf) {
2409 /* fcdposition shifted even when there's no normalization, if we
2410 don't input the rest into this, we'll get the wrong position when
2411 we reach the end of the writableBuffer */
2412 int32_t length = (int32_t)(data->fcdPosition - data->pos + 1);
2413 data->pos = insertBufferEnd(data, data->pos - 1, length);
2414 // Check if data->pos received a null pointer
2415 if (data->pos == NULL) {
2416 return (UChar)-1; // Return to indicate error.
2417 }
2418 return *(data->pos ++);
2419 }
2420 }
2421
2422 if (innormbuf) {
2423 /*
2424 no normalization is to be done hence only one character will be
2425 appended to the buffer.
2426 */
2427 data->pos = insertBufferEnd(data, ch) + 1;
2428 // Check if data->pos received a null pointer
2429 if (data->pos == NULL) {
2430 return (UChar)-1; // Return to indicate error.
2431 }
2432 }
2433
2434 /* points back to the pos in string */
2435 return ch;
2436 }
2437
2438
2439
2440 /**
2441 * Function to copy the buffer into writableBuffer and sets the fcd position to
2442 * the correct position
2443 * @param source data string source
2444 * @param buffer character buffer
2445 */
2446 static
setDiscontiguosAttribute(collIterate * source,const UnicodeString & buffer)2447 inline void setDiscontiguosAttribute(collIterate *source, const UnicodeString &buffer)
2448 {
2449 /* okay confusing part here. to ensure that the skipped characters are
2450 considered later, we need to place it in the appropriate position in the
2451 normalization buffer and reassign the pos pointer. simple case if pos
2452 reside in string, simply copy to normalization buffer and
2453 fcdposition = pos, pos = start of normalization buffer. if pos in
2454 normalization buffer, we'll insert the copy infront of pos and point pos
2455 to the start of the normalization buffer. why am i doing these copies?
2456 well, so that the whole chunk of codes in the getNextCE, ucol_prv_getSpecialCE does
2457 not require any changes, which be really painful. */
2458 if (source->flags & UCOL_ITER_INNORMBUF) {
2459 int32_t replaceLength = source->pos - source->writableBuffer.getBuffer();
2460 source->writableBuffer.replace(0, replaceLength, buffer);
2461 }
2462 else {
2463 source->fcdPosition = source->pos;
2464 source->origFlags = source->flags;
2465 source->flags |= UCOL_ITER_INNORMBUF;
2466 source->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR);
2467 source->writableBuffer = buffer;
2468 }
2469
2470 source->pos = source->writableBuffer.getTerminatedBuffer();
2471 }
2472
/**
 * Function to get the discontiguous collation element within the source.
 * Starting from the contraction table entry at constart, the following
 * characters are read through getNextNormalizedChar and matched against the
 * contraction table; characters that do not take part in the match are
 * collected in a side buffer. When a match is found the collected characters
 * are handed to setDiscontiguosAttribute so that they are processed later;
 * when no match is found the iterator state saved on entry is restored.
 * Note this function will set the position to the appropriate places.
 * @param coll current collator used
 * @param source data string source
 * @param constart index to the start character in the contraction table
 * @return discontiguous collation element offset
 */
static
uint32_t getDiscontiguous(const UCollator *coll, collIterate *source,
                                const UChar *constart)
{
    /* source->pos currently points to the second combining character after
       the start character */
    const UChar *temppos      = source->pos;
    UnicodeString buffer;     /* collects the characters that were skipped */
    const UChar   *tempconstart = constart;
    uint8_t  tempflags    = source->flags;
    UBool    multicontraction = FALSE;
    collIterateState discState;

    /* Save the iterator state so that a failed match can be undone. */
    backupState(source, &discState);

    /* Seed the side buffer with the code point before the current position. */
    buffer.setTo(peekCodePoint(source, -1));
    for (;;) {
        UChar    *UCharOffset;
        UChar     schar,
                  tchar;
        uint32_t  result;

        if (((source->flags & UCOL_ITER_HASLEN) && source->pos >= source->endp)
            || (peekCodeUnit(source, 0) == 0  &&
            //|| (*source->pos == 0  &&
                ((source->flags & UCOL_ITER_INNORMBUF) == 0 ||
                 source->fcdPosition == NULL ||
                 source->fcdPosition == source->endp ||
                 *(source->fcdPosition) == 0 ||
                 u_getCombiningClass(*(source->fcdPosition)) == 0)) ||
                 /* end of string in null terminated string or stopped by a
                 null character, note fcd does not always point to a base
                 character after the discontiguos change */
                 u_getCombiningClass(peekCodePoint(source, 0)) == 0) {
                 //u_getCombiningClass(*(source->pos)) == 0) {
            //constart = (UChar *)coll->image + getContractOffset(CE);
            if (multicontraction) {
                /* A shorter contraction matched earlier: return to just after
                   it and return its CE. */
                source->pos    = temppos - 1;
                setDiscontiguosAttribute(source, buffer);
                return *(coll->contractionCEs +
                                    (tempconstart - coll->contractionIndex));
            }
            constart = tempconstart;
            break;
        }

        UCharOffset = (UChar *)(tempconstart + 1); /* skip the backward offset*/
        schar = getNextNormalizedChar(source);

        /* Advance through the table entries while they are smaller than schar. */
        while (schar > (tchar = *UCharOffset)) {
            UCharOffset++;
        }

        if (schar != tchar) {
            /* not the correct codepoint. we stuff the current codepoint into
            the discontiguos buffer and try the next character */
            buffer.append(schar);
            continue;
        }
        else {
            /* Same combining class as the code point two positions back:
               treat the character as skipped rather than matched. */
            if (u_getCombiningClass(schar) ==
                u_getCombiningClass(peekCodePoint(source, -2))) {
                buffer.append(schar);
                continue;
            }
            result = *(coll->contractionCEs +
                                      (UCharOffset - coll->contractionIndex));
        }

        if (result == UCOL_NOT_FOUND) {
            break;
        } else if (isContraction(result)) {
            /* this is a multi-contraction*/
            tempconstart = (UChar *)coll->image + getContractOffset(result);
            if (*(coll->contractionCEs + (constart - coll->contractionIndex))
                != UCOL_NOT_FOUND) {
                multicontraction = TRUE;
                temppos       = source->pos + 1;
            }
        } else {
            /* Complete match: keep the skipped characters for later processing. */
            setDiscontiguosAttribute(source, buffer);
            return result;
        }
    }

    /* no problems simply reverting just like that,
    if we are in string before getting into this function, points back to
    string hence no problem.
    if we are in normalization buffer before getting into this function,
    since we'll never use another normalization within this function, we
    know that fcdposition points to a base character. the normalization buffer
    never change, hence this revert works. */
    loadState(source, &discState, TRUE);
    goBackOne(source);

    //source->pos   = temppos - 1;
    source->flags = tempflags;
    return *(coll->contractionCEs + (constart - coll->contractionIndex));
}
2580
2581 /* now uses Mark's getImplicitPrimary code */
2582 static
getImplicit(UChar32 cp,collIterate * collationSource)2583 inline uint32_t getImplicit(UChar32 cp, collIterate *collationSource) {
2584 uint32_t r = uprv_uca_getImplicitPrimary(cp);
2585 *(collationSource->CEpos++) = ((r & 0x0000FFFF)<<16) | 0x000000C0;
2586 collationSource->offsetRepeatCount += 1;
2587 return (r & UCOL_PRIMARYMASK) | 0x00000505; // This was 'order'
2588 }
2589
2590 /**
2591 * Inserts the argument character into the front of the buffer replacing the
2592 * front null terminator.
2593 * @param data collation element iterator data
2594 * @param ch character to be appended
2595 */
2596 static
insertBufferFront(collIterate * data,UChar ch)2597 inline void insertBufferFront(collIterate *data, UChar ch)
2598 {
2599 data->pos = data->writableBuffer.setCharAt(0, ch).insert(0, (UChar)0).getTerminatedBuffer() + 2;
2600 }
2601
2602 /**
2603 * Special normalization function for contraction in the previous iterator.
2604 * This normalization sequence will place the current character at source->pos
2605 * and its following normalized sequence into the buffer.
2606 * The fcd position, pos will be changed.
2607 * pos will now point to positions in the buffer.
2608 * Flags will be changed accordingly.
2609 * @param data collation iterator data
2610 */
2611 static
normalizePrevContraction(collIterate * data,UErrorCode * status)2612 inline void normalizePrevContraction(collIterate *data, UErrorCode *status)
2613 {
2614 const UChar *pEnd = data->pos + 1; /* End normalize + 1 */
2615 const UChar *pStart;
2616
2617 UnicodeString endOfBuffer;
2618 if (data->flags & UCOL_ITER_HASLEN) {
2619 /*
2620 normalization buffer not used yet, we'll pull down the next
2621 character into the end of the buffer
2622 */
2623 endOfBuffer.setTo(*pEnd);
2624 }
2625 else {
2626 endOfBuffer.setTo(data->writableBuffer, 1); // after the leading NUL
2627 }
2628
2629 if (data->fcdPosition == NULL) {
2630 pStart = data->string;
2631 }
2632 else {
2633 pStart = data->fcdPosition + 1;
2634 }
2635 int32_t normLen =
2636 data->nfd->normalize(UnicodeString(FALSE, pStart, (int32_t)(pEnd - pStart)),
2637 data->writableBuffer,
2638 *status).
2639 length();
2640 if(U_FAILURE(*status)) {
2641 return;
2642 }
2643 /*
2644 this puts the null termination infront of the normalized string instead
2645 of the end
2646 */
2647 data->pos =
2648 data->writableBuffer.insert(0, (UChar)0).append(endOfBuffer).getTerminatedBuffer() +
2649 1 + normLen;
2650 data->origFlags = data->flags;
2651 data->flags |= UCOL_ITER_INNORMBUF;
2652 data->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
2653 }
2654
/**
 * Contraction character management function that returns the previous character
 * for the backwards iterator.
 * Does nothing special if normalization is off, or if the previous character
 * is in the normalization buffer and is not the NUL at its front.
 * Else it checks previous character in data string to see if it is
 * normalizable.
 * If it is not, the character is simply copied into the buffer (when already
 * in the buffer), else the whole normalized substring is copied into the
 * buffer, including the current character.
 * Unlike the forwards variant, data->pos is not moved on the simple paths;
 * the character before pos is only read.
 * @param data collation element iterator data
 * @param status error status, passed on to normalizePrevContraction
 * @return previous character
 */
static
inline UChar getPrevNormalizedChar(collIterate *data, UErrorCode *status)
{
    UChar  prevch;
    UChar  ch;
    const UChar *start;
    UBool  innormbuf = (UBool)(data->flags & UCOL_ITER_INNORMBUF);
    if ((data->flags & (UCOL_ITER_NORM | UCOL_ITER_INNORMBUF)) == 0 ||
        (innormbuf && *(data->pos - 1) != 0)) {
        /*
        if no normalization.
        if previous character is in normalized buffer, no further normalization
        is required
        */
        if(data->flags & UCOL_USE_ITERATOR) {
            /* Step back one and read forward again: a net zero move. */
            data->iterator->move(data->iterator, -1, UITER_CURRENT);
            return (UChar)data->iterator->next(data->iterator);
        } else {
            return *(data->pos - 1);
        }
    }

    start = data->pos;
    if ((data->fcdPosition==NULL)||(data->flags & UCOL_ITER_HASLEN)) {
        /* in data string */
        if ((start - 1) == data->string) {
            /* The previous character is the first one of the string. */
            return *(start - 1);
        }
        start --;
        ch     = *start;
        prevch = *(start - 1);
    }
    else {
        /*
        in writable buffer, at this point fcdPosition can not be NULL.
        see contracting tag.
        */
        if (data->fcdPosition == data->string) {
            /* at the start of the string, just dump it into the normalizer */
            insertBufferFront(data, *(data->fcdPosition));
            data->fcdPosition = NULL;
            return *(data->pos - 1);
        }
        start  = data->fcdPosition;
        ch     = *start;
        prevch = *(start - 1);
    }
    /*
    * if the current character is not fcd.
    * Trailing combining class == 0.
    */
    if (data->fcdPosition > start &&
       (ch >= NFC_ZERO_CC_BLOCK_LIMIT_ || prevch >= NFC_ZERO_CC_BLOCK_LIMIT_))
    {
        /*
        Need a more complete FCD check and possible normalization.
        normalize substring will be appended to buffer
        */
        const UChar *backuppos = data->pos;
        /* collPrevIterFCD works from data->pos, so point it at start for the check. */
        data->pos = start;
        if (collPrevIterFCD(data)) {
            normalizePrevContraction(data, status);
            return *(data->pos - 1);
        }
        /* No normalization needed: restore pos and adjust the FCD position. */
        data->pos = backuppos;
        data->fcdPosition ++;
    }

    if (innormbuf) {
        /*
        no normalization is to be done hence only one character will be
        appended to the buffer.
        */
        insertBufferFront(data, ch);
        data->fcdPosition --;
    }

    return ch;
}
2747
2748 /* This function handles the special CEs like contractions, expansions, surrogates, Thai */
2749 /* It is called by getNextCE */
2750
2751 /* The following should be even */
2752 #define UCOL_MAX_DIGITS_FOR_NUMBER 254
2753
ucol_prv_getSpecialCE(const UCollator * coll,UChar ch,uint32_t CE,collIterate * source,UErrorCode * status)2754 uint32_t ucol_prv_getSpecialCE(const UCollator *coll, UChar ch, uint32_t CE, collIterate *source, UErrorCode *status) {
2755 collIterateState entryState;
2756 backupState(source, &entryState);
2757 UChar32 cp = ch;
2758
2759 for (;;) {
2760 // This loop will repeat only in the case of contractions, and only when a contraction
2761 // is found and the first CE resulting from that contraction is itself a special
2762 // (an expansion, for example.) All other special CE types are fully handled the
2763 // first time through, and the loop exits.
2764
2765 const uint32_t *CEOffset = NULL;
2766 switch(getCETag(CE)) {
2767 case NOT_FOUND_TAG:
2768 /* This one is not found, and we'll let somebody else bother about it... no more games */
2769 return CE;
2770 case SPEC_PROC_TAG:
2771 {
2772 // Special processing is getting a CE that is preceded by a certain prefix
2773 // Currently this is only needed for optimizing Japanese length and iteration marks.
2774 // When we encouter a special processing tag, we go backwards and try to see if
2775 // we have a match.
2776 // Contraction tables are used - so the whole process is not unlike contraction.
2777 // prefix data is stored backwards in the table.
2778 const UChar *UCharOffset;
2779 UChar schar, tchar;
2780 collIterateState prefixState;
2781 backupState(source, &prefixState);
2782 loadState(source, &entryState, TRUE);
2783 goBackOne(source); // We want to look at the point where we entered - actually one
2784 // before that...
2785
2786 for(;;) {
2787 // This loop will run once per source string character, for as long as we
2788 // are matching a potential contraction sequence
2789
2790 // First we position ourselves at the begining of contraction sequence
2791 const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE);
2792 if (collIter_bos(source)) {
2793 CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
2794 break;
2795 }
2796 schar = getPrevNormalizedChar(source, status);
2797 goBackOne(source);
2798
2799 while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
2800 UCharOffset++;
2801 }
2802
2803 if (schar == tchar) {
2804 // Found the source string char in the table.
2805 // Pick up the corresponding CE from the table.
2806 CE = *(coll->contractionCEs +
2807 (UCharOffset - coll->contractionIndex));
2808 }
2809 else
2810 {
2811 // Source string char was not in the table.
2812 // We have not found the prefix.
2813 CE = *(coll->contractionCEs +
2814 (ContractionStart - coll->contractionIndex));
2815 }
2816
2817 if(!isPrefix(CE)) {
2818 // The source string char was in the contraction table, and the corresponding
2819 // CE is not a prefix CE. We found the prefix, break
2820 // out of loop, this CE will end up being returned. This is the normal
2821 // way out of prefix handling when the source actually contained
2822 // the prefix.
2823 break;
2824 }
2825 }
2826 if(CE != UCOL_NOT_FOUND) { // we found something and we can merilly continue
2827 loadState(source, &prefixState, TRUE);
2828 if(source->origFlags & UCOL_USE_ITERATOR) {
2829 source->flags = source->origFlags;
2830 }
2831 } else { // prefix search was a failure, we have to backup all the way to the start
2832 loadState(source, &entryState, TRUE);
2833 }
2834 break;
2835 }
2836 case CONTRACTION_TAG:
2837 {
2838 /* This should handle contractions */
2839 collIterateState state;
2840 backupState(source, &state);
2841 uint32_t firstCE = *(coll->contractionCEs + ((UChar *)coll->image+getContractOffset(CE) - coll->contractionIndex)); //UCOL_NOT_FOUND;
2842 const UChar *UCharOffset;
2843 UChar schar, tchar;
2844
2845 for (;;) {
2846 /* This loop will run once per source string character, for as long as we */
2847 /* are matching a potential contraction sequence */
2848
2849 /* First we position ourselves at the begining of contraction sequence */
2850 const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE);
2851
2852 if (collIter_eos(source)) {
2853 // Ran off the end of the source string.
2854 CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
2855 // So we'll pick whatever we have at the point...
2856 if (CE == UCOL_NOT_FOUND) {
2857 // back up the source over all the chars we scanned going into this contraction.
2858 CE = firstCE;
2859 loadState(source, &state, TRUE);
2860 if(source->origFlags & UCOL_USE_ITERATOR) {
2861 source->flags = source->origFlags;
2862 }
2863 }
2864 break;
2865 }
2866
2867 uint8_t maxCC = (uint8_t)(*(UCharOffset)&0xFF); /*get the discontiguos stuff */ /* skip the backward offset, see above */
2868 uint8_t allSame = (uint8_t)(*(UCharOffset++)>>8);
2869
2870 schar = getNextNormalizedChar(source);
2871 while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
2872 UCharOffset++;
2873 }
2874
2875 if (schar == tchar) {
2876 // Found the source string char in the contraction table.
2877 // Pick up the corresponding CE from the table.
2878 CE = *(coll->contractionCEs +
2879 (UCharOffset - coll->contractionIndex));
2880 }
2881 else
2882 {
2883 // Source string char was not in contraction table.
2884 // Unless we have a discontiguous contraction, we have finished
2885 // with this contraction.
2886 // in order to do the proper detection, we
2887 // need to see if we're dealing with a supplementary
2888 /* We test whether the next two char are surrogate pairs.
2889 * This test is done if the iterator is not NULL.
2890 * If there is no surrogate pair, the iterator
2891 * goes back one if needed. */
2892 UChar32 miss = schar;
2893 if (source->iterator) {
2894 UChar32 surrNextChar; /* the next char in the iteration to test */
2895 int32_t prevPos; /* holds the previous position before move forward of the source iterator */
2896 if(U16_IS_LEAD(schar) && source->iterator->hasNext(source->iterator)) {
2897 prevPos = source->iterator->index;
2898 surrNextChar = getNextNormalizedChar(source);
2899 if (U16_IS_TRAIL(surrNextChar)) {
2900 miss = U16_GET_SUPPLEMENTARY(schar, surrNextChar);
2901 } else if (prevPos < source->iterator->index){
2902 goBackOne(source);
2903 }
2904 }
2905 } else if (U16_IS_LEAD(schar)) {
2906 miss = U16_GET_SUPPLEMENTARY(schar, getNextNormalizedChar(source));
2907 }
2908
2909 uint8_t sCC;
2910 if (miss < 0x300 ||
2911 maxCC == 0 ||
2912 (sCC = i_getCombiningClass(miss, coll)) == 0 ||
2913 sCC>maxCC ||
2914 (allSame != 0 && sCC == maxCC) ||
2915 collIter_eos(source))
2916 {
2917 // Contraction can not be discontiguous.
2918 goBackOne(source); // back up the source string by one,
2919 // because the character we just looked at was
2920 // not part of the contraction. */
2921 if(U_IS_SUPPLEMENTARY(miss)) {
2922 goBackOne(source);
2923 }
2924 CE = *(coll->contractionCEs +
2925 (ContractionStart - coll->contractionIndex));
2926 } else {
2927 //
2928 // Contraction is possibly discontiguous.
2929 // Scan more of source string looking for a match
2930 //
2931 UChar tempchar;
2932 /* find the next character if schar is not a base character
2933 and we are not yet at the end of the string */
2934 tempchar = getNextNormalizedChar(source);
2935 // probably need another supplementary thingie here
2936 goBackOne(source);
2937 if (i_getCombiningClass(tempchar, coll) == 0) {
2938 goBackOne(source);
2939 if(U_IS_SUPPLEMENTARY(miss)) {
2940 goBackOne(source);
2941 }
2942 /* Spit out the last char of the string, wasn't tasty enough */
2943 CE = *(coll->contractionCEs +
2944 (ContractionStart - coll->contractionIndex));
2945 } else {
2946 CE = getDiscontiguous(coll, source, ContractionStart);
2947 }
2948 }
2949 } // else after if(schar == tchar)
2950
2951 if(CE == UCOL_NOT_FOUND) {
2952 /* The Source string did not match the contraction that we were checking. */
2953 /* Back up the source position to undo the effects of having partially */
2954 /* scanned through what ultimately proved to not be a contraction. */
2955 loadState(source, &state, TRUE);
2956 CE = firstCE;
2957 break;
2958 }
2959
2960 if(!isContraction(CE)) {
2961 // The source string char was in the contraction table, and the corresponding
2962 // CE is not a contraction CE. We completed the contraction, break
2963 // out of loop, this CE will end up being returned. This is the normal
2964 // way out of contraction handling when the source actually contained
2965 // the contraction.
2966 break;
2967 }
2968
2969
2970 // The source string char was in the contraction table, and the corresponding
2971 // CE is IS a contraction CE. We will continue looping to check the source
2972 // string for the remaining chars in the contraction.
2973 uint32_t tempCE = *(coll->contractionCEs + (ContractionStart - coll->contractionIndex));
2974 if(tempCE != UCOL_NOT_FOUND) {
2975 // We have scanned a a section of source string for which there is a
2976 // CE from the contraction table. Remember the CE and scan position, so
2977 // that we can return to this point if further scanning fails to
2978 // match a longer contraction sequence.
2979 firstCE = tempCE;
2980
2981 goBackOne(source);
2982 backupState(source, &state);
2983 getNextNormalizedChar(source);
2984
2985 // Another way to do this is:
2986 //collIterateState tempState;
2987 //backupState(source, &tempState);
2988 //goBackOne(source);
2989 //backupState(source, &state);
2990 //loadState(source, &tempState, TRUE);
2991
2992 // The problem is that for incomplete contractions we have to remember the previous
2993 // position. Before, the only thing I needed to do was state.pos--;
2994 // After iterator introduction and especially after introduction of normalizing
2995 // iterators, it became much more difficult to decrease the saved state.
2996 // I'm not yet sure which of the two methods above is faster.
2997 }
2998 } // for(;;)
2999 break;
3000 } // case CONTRACTION_TAG:
3001 case LONG_PRIMARY_TAG:
3002 {
3003 *(source->CEpos++) = ((CE & 0xFF)<<24)|UCOL_CONTINUATION_MARKER;
3004 CE = ((CE & 0xFFFF00) << 8) | (UCOL_BYTE_COMMON << 8) | UCOL_BYTE_COMMON;
3005 source->offsetRepeatCount += 1;
3006 return CE;
3007 }
3008 case EXPANSION_TAG:
3009 {
3010 /* This should handle expansion. */
3011 /* NOTE: we can encounter both continuations and expansions in an expansion! */
3012 /* I have to decide where continuations are going to be dealt with */
3013 uint32_t size;
3014 uint32_t i; /* general counter */
3015
3016 CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */
3017 size = getExpansionCount(CE);
3018 CE = *CEOffset++;
3019 //source->offsetRepeatCount = -1;
3020
3021 if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */
3022 for(i = 1; i<size; i++) {
3023 *(source->CEpos++) = *CEOffset++;
3024 source->offsetRepeatCount += 1;
3025 }
3026 } else { /* else, we do */
3027 while(*CEOffset != 0) {
3028 *(source->CEpos++) = *CEOffset++;
3029 source->offsetRepeatCount += 1;
3030 }
3031 }
3032
3033 return CE;
3034 }
3035 case DIGIT_TAG:
3036 {
3037 /*
3038 We do a check to see if we want to collate digits as numbers; if so we generate
3039 a custom collation key. Otherwise we pull out the value stored in the expansion table.
3040 */
3041 //uint32_t size;
3042 uint32_t i; /* general counter */
3043
3044 if (source->coll->numericCollation == UCOL_ON){
3045 collIterateState digitState = {0,0,0,0,0,0,0,0,0};
3046 UChar32 char32 = 0;
3047 int32_t digVal = 0;
3048
3049 uint32_t digIndx = 0;
3050 uint32_t endIndex = 0;
3051 uint32_t trailingZeroIndex = 0;
3052
3053 uint8_t collateVal = 0;
3054
3055 UBool nonZeroValReached = FALSE;
3056
3057 uint8_t numTempBuf[UCOL_MAX_DIGITS_FOR_NUMBER/2 + 3]; // I just need a temporary place to store my generated CEs.
3058 /*
3059 We parse the source string until we hit a char that's NOT a digit.
3060 Use this u_charDigitValue. This might be slow because we have to
3061 handle surrogates...
3062 */
3063 /*
3064 if (U16_IS_LEAD(ch)){
3065 if (!collIter_eos(source)) {
3066 backupState(source, &digitState);
3067 UChar trail = getNextNormalizedChar(source);
3068 if(U16_IS_TRAIL(trail)) {
3069 char32 = U16_GET_SUPPLEMENTARY(ch, trail);
3070 } else {
3071 loadState(source, &digitState, TRUE);
3072 char32 = ch;
3073 }
3074 } else {
3075 char32 = ch;
3076 }
3077 } else {
3078 char32 = ch;
3079 }
3080 digVal = u_charDigitValue(char32);
3081 */
3082 digVal = u_charDigitValue(cp); // if we have arrived here, we have
3083 // already processed possible supplementaries that trigered the digit tag -
3084 // all supplementaries are marked in the UCA.
3085 /*
3086 We pad a zero in front of the first element anyways. This takes
3087 care of the (probably) most common case where people are sorting things followed
3088 by a single digit
3089 */
3090 digIndx++;
3091 for(;;){
3092 // Make sure we have enough space. No longer needed;
3093 // at this point digIndx now has a max value of UCOL_MAX_DIGITS_FOR_NUMBER
3094 // (it has been pre-incremented) so we just ensure that numTempBuf is big enough
3095 // (UCOL_MAX_DIGITS_FOR_NUMBER/2 + 3).
3096
3097 // Skipping over leading zeroes.
3098 if (digVal != 0) {
3099 nonZeroValReached = TRUE;
3100 }
3101 if (nonZeroValReached) {
3102 /*
3103 We parse the digit string into base 100 numbers (this fits into a byte).
3104 We only add to the buffer in twos, thus if we are parsing an odd character,
3105 that serves as the 'tens' digit while the if we are parsing an even one, that
3106 is the 'ones' digit. We dumped the parsed base 100 value (collateVal) into
3107 a buffer. We multiply each collateVal by 2 (to give us room) and add 5 (to avoid
3108 overlapping magic CE byte values). The last byte we subtract 1 to ensure it is less
3109 than all the other bytes.
3110 */
3111
3112 if (digIndx % 2 == 1){
3113 collateVal += (uint8_t)digVal;
3114
3115 // We don't enter the low-order-digit case unless we've already seen
3116 // the high order, or for the first digit, which is always non-zero.
3117 if (collateVal != 0)
3118 trailingZeroIndex = 0;
3119
3120 numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6;
3121 collateVal = 0;
3122 }
3123 else{
3124 // We drop the collation value into the buffer so if we need to do
3125 // a "front patch" we don't have to check to see if we're hitting the
3126 // last element.
3127 collateVal = (uint8_t)(digVal * 10);
3128
3129 // Check for trailing zeroes.
3130 if (collateVal == 0)
3131 {
3132 if (!trailingZeroIndex)
3133 trailingZeroIndex = (digIndx/2) + 2;
3134 }
3135 else
3136 trailingZeroIndex = 0;
3137
3138 numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6;
3139 }
3140 digIndx++;
3141 }
3142
3143 // Get next character.
3144 if (!collIter_eos(source)){
3145 ch = getNextNormalizedChar(source);
3146 if (U16_IS_LEAD(ch)){
3147 if (!collIter_eos(source)) {
3148 backupState(source, &digitState);
3149 UChar trail = getNextNormalizedChar(source);
3150 if(U16_IS_TRAIL(trail)) {
3151 char32 = U16_GET_SUPPLEMENTARY(ch, trail);
3152 } else {
3153 loadState(source, &digitState, TRUE);
3154 char32 = ch;
3155 }
3156 }
3157 } else {
3158 char32 = ch;
3159 }
3160
3161 if ((digVal = u_charDigitValue(char32)) == -1 || digIndx > UCOL_MAX_DIGITS_FOR_NUMBER){
3162 // Resetting position to point to the next unprocessed char. We
3163 // overshot it when doing our test/set for numbers.
3164 if (char32 > 0xFFFF) { // For surrogates.
3165 loadState(source, &digitState, TRUE);
3166 //goBackOne(source);
3167 }
3168 goBackOne(source);
3169 break;
3170 }
3171 } else {
3172 break;
3173 }
3174 }
3175
3176 if (nonZeroValReached == FALSE){
3177 digIndx = 2;
3178 numTempBuf[2] = 6;
3179 }
3180
3181 endIndex = trailingZeroIndex ? trailingZeroIndex : ((digIndx/2) + 2) ;
3182 if (digIndx % 2 != 0){
3183 /*
3184 We missed a value. Since digIndx isn't even, stuck too many values into the buffer (this is what
3185 we get for padding the first byte with a zero). "Front-patch" now by pushing all nybbles forward.
3186 Doing it this way ensures that at least 50% of the time (statistically speaking) we'll only be doing a
3187 single pass and optimizes for strings with single digits. I'm just assuming that's the more common case.
3188 */
3189
3190 for(i = 2; i < endIndex; i++){
3191 numTempBuf[i] = (((((numTempBuf[i] - 6)/2) % 10) * 10) +
3192 (((numTempBuf[i+1])-6)/2) / 10) * 2 + 6;
3193 }
3194 --digIndx;
3195 }
3196
3197 // Subtract one off of the last byte.
3198 numTempBuf[endIndex-1] -= 1;
3199
3200 /*
3201 We want to skip over the first two slots in the buffer. The first slot
3202 is reserved for the header byte UCOL_CODAN_PLACEHOLDER. The second slot is for the
3203 sign/exponent byte: 0x80 + (decimalPos/2) & 7f.
3204 */
3205 numTempBuf[0] = UCOL_CODAN_PLACEHOLDER;
3206 numTempBuf[1] = (uint8_t)(0x80 + ((digIndx/2) & 0x7F));
3207
3208 // Now transfer the collation key to our collIterate struct.
3209 // The total size for our collation key is endIndx bumped up to the next largest even value divided by two.
3210 //size = ((endIndex+1) & ~1)/2;
3211 CE = (((numTempBuf[0] << 8) | numTempBuf[1]) << UCOL_PRIMARYORDERSHIFT) | //Primary weight
3212 (UCOL_BYTE_COMMON << UCOL_SECONDARYORDERSHIFT) | // Secondary weight
3213 UCOL_BYTE_COMMON; // Tertiary weight.
3214 i = 2; // Reset the index into the buffer.
3215 while(i < endIndex)
3216 {
3217 uint32_t primWeight = numTempBuf[i++] << 8;
3218 if ( i < endIndex)
3219 primWeight |= numTempBuf[i++];
3220 *(source->CEpos++) = (primWeight << UCOL_PRIMARYORDERSHIFT) | UCOL_CONTINUATION_MARKER;
3221 }
3222
3223 } else {
3224 // no numeric mode, we'll just switch to whatever we stashed and continue
3225 CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */
3226 CE = *CEOffset++;
3227 break;
3228 }
3229 return CE;
3230 }
3231 /* various implicits optimization */
3232 case IMPLICIT_TAG: /* everything that is not defined otherwise */
3233 /* UCA is filled with these. Tailorings are NOT_FOUND */
3234 return getImplicit(cp, source);
3235 case CJK_IMPLICIT_TAG: /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D*/
3236 // TODO: remove CJK_IMPLICIT_TAG completely - handled by the getImplicit
3237 return getImplicit(cp, source);
3238 case HANGUL_SYLLABLE_TAG: /* AC00-D7AF*/
3239 {
3240 static const uint32_t
3241 SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7;
3242 //const uint32_t LCount = 19;
3243 static const uint32_t VCount = 21;
3244 static const uint32_t TCount = 28;
3245 //const uint32_t NCount = VCount * TCount; // 588
3246 //const uint32_t SCount = LCount * NCount; // 11172
3247 uint32_t L = ch - SBase;
3248
3249 // divide into pieces
3250
3251 uint32_t T = L % TCount; // we do it in this order since some compilers can do % and / in one operation
3252 L /= TCount;
3253 uint32_t V = L % VCount;
3254 L /= VCount;
3255
3256 // offset them
3257
3258 L += LBase;
3259 V += VBase;
3260 T += TBase;
3261
3262 // return the first CE, but first put the rest into the expansion buffer
3263 if (!source->coll->image->jamoSpecial) { // FAST PATH
3264
3265 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, V);
3266 if (T != TBase) {
3267 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, T);
3268 }
3269
3270 return UTRIE_GET32_FROM_LEAD(&coll->mapping, L);
3271
3272 } else { // Jamo is Special
3273 // Since Hanguls pass the FCD check, it is
3274 // guaranteed that we won't be in
3275 // the normalization buffer if something like this happens
3276
3277 // However, if we are using a uchar iterator and normalization
3278 // is ON, the Hangul that lead us here is going to be in that
3279 // normalization buffer. Here we want to restore the uchar
3280 // iterator state and pull out of the normalization buffer
3281 if(source->iterator != NULL && source->flags & UCOL_ITER_INNORMBUF) {
3282 source->flags = source->origFlags; // restore the iterator
3283 source->pos = NULL;
3284 }
3285
3286 // Move Jamos into normalization buffer
3287 UChar *buffer = source->writableBuffer.getBuffer(4);
3288 int32_t bufferLength;
3289 buffer[0] = (UChar)L;
3290 buffer[1] = (UChar)V;
3291 if (T != TBase) {
3292 buffer[2] = (UChar)T;
3293 bufferLength = 3;
3294 } else {
3295 bufferLength = 2;
3296 }
3297 source->writableBuffer.releaseBuffer(bufferLength);
3298
3299 // Indicate where to continue in main input string after exhausting the writableBuffer
3300 source->fcdPosition = source->pos;
3301
3302 source->pos = source->writableBuffer.getTerminatedBuffer();
3303 source->origFlags = source->flags;
3304 source->flags |= UCOL_ITER_INNORMBUF;
3305 source->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
3306
3307 return(UCOL_IGNORABLE);
3308 }
3309 }
3310 case SURROGATE_TAG:
3311 /* we encountered a leading surrogate. We shall get the CE by using the following code unit */
3312 /* two things can happen here: next code point can be a trailing surrogate - we will use it */
3313 /* to retrieve the CE, or it is not a trailing surrogate (or the string is done). In that case */
3314 /* we treat it like an unassigned code point. */
3315 {
3316 UChar trail;
3317 collIterateState state;
3318 backupState(source, &state);
3319 if (collIter_eos(source) || !(U16_IS_TRAIL((trail = getNextNormalizedChar(source))))) {
3320 // we chould have stepped one char forward and it might have turned that it
3321 // was not a trail surrogate. In that case, we have to backup.
3322 loadState(source, &state, TRUE);
3323 return UCOL_NOT_FOUND;
3324 } else {
3325 /* TODO: CE contain the data from the previous CE + the mask. It should at least be unmasked */
3326 CE = UTRIE_GET32_FROM_OFFSET_TRAIL(&coll->mapping, CE&0xFFFFFF, trail);
3327 if(CE == UCOL_NOT_FOUND) { // there are tailored surrogates in this block, but not this one.
3328 // We need to backup
3329 loadState(source, &state, TRUE);
3330 return CE;
3331 }
3332 // calculate the supplementary code point value, if surrogate was not tailored
3333 cp = ((((uint32_t)ch)<<10UL)+(trail)-(((uint32_t)0xd800<<10UL)+0xdc00-0x10000));
3334 }
3335 }
3336 break;
3337 case LEAD_SURROGATE_TAG: /* D800-DBFF*/
3338 UChar nextChar;
3339 if( source->flags & UCOL_USE_ITERATOR) {
3340 if(U_IS_TRAIL(nextChar = (UChar)source->iterator->current(source->iterator))) {
3341 cp = U16_GET_SUPPLEMENTARY(ch, nextChar);
3342 source->iterator->next(source->iterator);
3343 return getImplicit(cp, source);
3344 }
3345 } else if((((source->flags & UCOL_ITER_HASLEN) == 0 ) || (source->pos<source->endp)) &&
3346 U_IS_TRAIL((nextChar=*source->pos))) {
3347 cp = U16_GET_SUPPLEMENTARY(ch, nextChar);
3348 source->pos++;
3349 return getImplicit(cp, source);
3350 }
3351 return UCOL_NOT_FOUND;
3352 case TRAIL_SURROGATE_TAG: /* DC00-DFFF*/
3353 return UCOL_NOT_FOUND; /* broken surrogate sequence */
3354 case CHARSET_TAG:
3355 /* not yet implemented */
3356 /* probably after 1.8 */
3357 return UCOL_NOT_FOUND;
3358 default:
3359 *status = U_INTERNAL_PROGRAM_ERROR;
3360 CE=0;
3361 break;
3362 }
3363 if (CE <= UCOL_NOT_FOUND) break;
3364 }
3365 return CE;
3366 }
3367
3368
3369 /* now uses Mark's getImplicitPrimary code */
3370 static
getPrevImplicit(UChar32 cp,collIterate * collationSource)3371 inline uint32_t getPrevImplicit(UChar32 cp, collIterate *collationSource) {
3372 uint32_t r = uprv_uca_getImplicitPrimary(cp);
3373
3374 *(collationSource->CEpos++) = (r & UCOL_PRIMARYMASK) | 0x00000505;
3375 collationSource->toReturn = collationSource->CEpos;
3376
3377 // **** doesn't work if using iterator ****
3378 if (collationSource->flags & UCOL_ITER_INNORMBUF) {
3379 collationSource->offsetRepeatCount = 1;
3380 } else {
3381 int32_t firstOffset = (int32_t)(collationSource->pos - collationSource->string);
3382
3383 UErrorCode errorCode = U_ZERO_ERROR;
3384 collationSource->appendOffset(firstOffset, errorCode);
3385 collationSource->appendOffset(firstOffset + 1, errorCode);
3386
3387 collationSource->offsetReturn = collationSource->offsetStore - 1;
3388 *(collationSource->offsetBuffer) = firstOffset;
3389 if (collationSource->offsetReturn == collationSource->offsetBuffer) {
3390 collationSource->offsetStore = collationSource->offsetBuffer;
3391 }
3392 }
3393
3394 return ((r & 0x0000FFFF)<<16) | 0x000000C0;
3395 }
3396
3397 /**
3398 * This function handles the special CEs like contractions, expansions,
3399 * surrogates, Thai.
3400 * It is called by both getPrevCE
3401 */
ucol_prv_getSpecialPrevCE(const UCollator * coll,UChar ch,uint32_t CE,collIterate * source,UErrorCode * status)3402 uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
3403 collIterate *source,
3404 UErrorCode *status)
3405 {
3406 const uint32_t *CEOffset = NULL;
3407 UChar *UCharOffset = NULL;
3408 UChar schar;
3409 const UChar *constart = NULL;
3410 uint32_t size;
3411 UChar buffer[UCOL_MAX_BUFFER];
3412 uint32_t *endCEBuffer;
3413 UChar *strbuffer;
3414 int32_t noChars = 0;
3415 int32_t CECount = 0;
3416
3417 for(;;)
3418 {
3419 /* the only ces that loops are thai and contractions */
3420 switch (getCETag(CE))
3421 {
3422 case NOT_FOUND_TAG: /* this tag always returns */
3423 return CE;
3424
3425 case SPEC_PROC_TAG:
3426 {
3427 // Special processing is getting a CE that is preceded by a certain prefix
3428 // Currently this is only needed for optimizing Japanese length and iteration marks.
3429 // When we encouter a special processing tag, we go backwards and try to see if
3430 // we have a match.
3431 // Contraction tables are used - so the whole process is not unlike contraction.
3432 // prefix data is stored backwards in the table.
3433 const UChar *UCharOffset;
3434 UChar schar, tchar;
3435 collIterateState prefixState;
3436 backupState(source, &prefixState);
3437 for(;;) {
3438 // This loop will run once per source string character, for as long as we
3439 // are matching a potential contraction sequence
3440
3441 // First we position ourselves at the begining of contraction sequence
3442 const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE);
3443
3444 if (collIter_bos(source)) {
3445 CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
3446 break;
3447 }
3448 schar = getPrevNormalizedChar(source, status);
3449 goBackOne(source);
3450
3451 while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
3452 UCharOffset++;
3453 }
3454
3455 if (schar == tchar) {
3456 // Found the source string char in the table.
3457 // Pick up the corresponding CE from the table.
3458 CE = *(coll->contractionCEs +
3459 (UCharOffset - coll->contractionIndex));
3460 }
3461 else
3462 {
3463 // if there is a completely ignorable code point in the middle of
3464 // a prefix, we need to act as if it's not there
3465 // assumption: 'real' noncharacters (*fffe, *ffff, fdd0-fdef are set to zero)
3466 // lone surrogates cannot be set to zero as it would break other processing
3467 uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, schar);
3468 // it's easy for BMP code points
3469 if(isZeroCE == 0) {
3470 continue;
3471 } else if(U16_IS_SURROGATE(schar)) {
3472 // for supplementary code points, we have to check the next one
3473 // situations where we are going to ignore
3474 // 1. beginning of the string: schar is a lone surrogate
3475 // 2. schar is a lone surrogate
3476 // 3. schar is a trail surrogate in a valid surrogate sequence
3477 // that is explicitly set to zero.
3478 if (!collIter_bos(source)) {
3479 UChar lead;
3480 if(!U16_IS_SURROGATE_LEAD(schar) && U16_IS_LEAD(lead = getPrevNormalizedChar(source, status))) {
3481 isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, lead);
3482 if(isSpecial(isZeroCE) && getCETag(isZeroCE) == SURROGATE_TAG) {
3483 uint32_t finalCE = UTRIE_GET32_FROM_OFFSET_TRAIL(&coll->mapping, isZeroCE&0xFFFFFF, schar);
3484 if(finalCE == 0) {
3485 // this is a real, assigned completely ignorable code point
3486 goBackOne(source);
3487 continue;
3488 }
3489 }
3490 } else {
3491 // lone surrogate, treat like unassigned
3492 return UCOL_NOT_FOUND;
3493 }
3494 } else {
3495 // lone surrogate at the beggining, treat like unassigned
3496 return UCOL_NOT_FOUND;
3497 }
3498 }
3499 // Source string char was not in the table.
3500 // We have not found the prefix.
3501 CE = *(coll->contractionCEs +
3502 (ContractionStart - coll->contractionIndex));
3503 }
3504
3505 if(!isPrefix(CE)) {
3506 // The source string char was in the contraction table, and the corresponding
3507 // CE is not a prefix CE. We found the prefix, break
3508 // out of loop, this CE will end up being returned. This is the normal
3509 // way out of prefix handling when the source actually contained
3510 // the prefix.
3511 break;
3512 }
3513 }
3514 loadState(source, &prefixState, TRUE);
3515 break;
3516 }
3517
3518 case CONTRACTION_TAG: {
3519 /* to ensure that the backwards and forwards iteration matches, we
3520 take the current region of most possible match and pass it through
3521 the forward iteration. this will ensure that the obstinate problem of
3522 overlapping contractions will not occur.
3523 */
3524 schar = peekCodeUnit(source, 0);
3525 constart = (UChar *)coll->image + getContractOffset(CE);
3526 if (isAtStartPrevIterate(source)
3527 /* commented away contraction end checks after adding the checks
3528 in getPrevCE */) {
3529 /* start of string or this is not the end of any contraction */
3530 CE = *(coll->contractionCEs +
3531 (constart - coll->contractionIndex));
3532 break;
3533 }
3534 strbuffer = buffer;
3535 UCharOffset = strbuffer + (UCOL_MAX_BUFFER - 1);
3536 *(UCharOffset --) = 0;
3537 noChars = 0;
3538 // have to swap thai characters
3539 while (ucol_unsafeCP(schar, coll)) {
3540 *(UCharOffset) = schar;
3541 noChars++;
3542 UCharOffset --;
3543 schar = getPrevNormalizedChar(source, status);
3544 goBackOne(source);
3545 // TODO: when we exhaust the contraction buffer,
3546 // it needs to get reallocated. The problem is
3547 // that the size depends on the string which is
3548 // not iterated over. However, since we're travelling
3549 // backwards, we already had to set the iterator at
3550 // the end - so we might as well know where we are?
3551 if (UCharOffset + 1 == buffer) {
3552 /* we have exhausted the buffer */
3553 int32_t newsize = 0;
3554 if(source->pos) { // actually dealing with a position
3555 newsize = (int32_t)(source->pos - source->string + 1);
3556 } else { // iterator
3557 newsize = 4 * UCOL_MAX_BUFFER;
3558 }
3559 strbuffer = (UChar *)uprv_malloc(sizeof(UChar) *
3560 (newsize + UCOL_MAX_BUFFER));
3561 /* test for NULL */
3562 if (strbuffer == NULL) {
3563 *status = U_MEMORY_ALLOCATION_ERROR;
3564 return UCOL_NO_MORE_CES;
3565 }
3566 UCharOffset = strbuffer + newsize;
3567 uprv_memcpy(UCharOffset, buffer,
3568 UCOL_MAX_BUFFER * sizeof(UChar));
3569 UCharOffset --;
3570 }
3571 if ((source->pos && (source->pos == source->string ||
3572 ((source->flags & UCOL_ITER_INNORMBUF) &&
3573 *(source->pos - 1) == 0 && source->fcdPosition == NULL)))
3574 || (source->iterator && !source->iterator->hasPrevious(source->iterator))) {
3575 break;
3576 }
3577 }
3578 /* adds the initial base character to the string */
3579 *(UCharOffset) = schar;
3580 noChars++;
3581
3582 int32_t offsetBias;
3583
3584 // **** doesn't work if using iterator ****
3585 if (source->flags & UCOL_ITER_INNORMBUF) {
3586 offsetBias = -1;
3587 } else {
3588 offsetBias = (int32_t)(source->pos - source->string);
3589 }
3590
3591 /* a new collIterate is used to simplify things, since using the current
3592 collIterate will mean that the forward and backwards iteration will
3593 share and change the same buffers. we don't want to get into that. */
3594 collIterate temp;
3595 int32_t rawOffset;
3596
3597 IInit_collIterate(coll, UCharOffset, noChars, &temp, status);
3598 if(U_FAILURE(*status)) {
3599 return (uint32_t)UCOL_NULLORDER;
3600 }
3601 temp.flags &= ~UCOL_ITER_NORM;
3602 temp.flags |= source->flags & UCOL_FORCE_HAN_IMPLICIT;
3603
3604 rawOffset = (int32_t)(temp.pos - temp.string); // should always be zero?
3605 CE = ucol_IGetNextCE(coll, &temp, status);
3606
3607 if (source->extendCEs) {
3608 endCEBuffer = source->extendCEs + source->extendCEsSize;
3609 CECount = (int32_t)((source->CEpos - source->extendCEs)/sizeof(uint32_t));
3610 } else {
3611 endCEBuffer = source->CEs + UCOL_EXPAND_CE_BUFFER_SIZE;
3612 CECount = (int32_t)((source->CEpos - source->CEs)/sizeof(uint32_t));
3613 }
3614
3615 while (CE != UCOL_NO_MORE_CES) {
3616 *(source->CEpos ++) = CE;
3617
3618 if (offsetBias >= 0) {
3619 source->appendOffset(rawOffset + offsetBias, *status);
3620 }
3621
3622 CECount++;
3623 if (source->CEpos == endCEBuffer) {
3624 /* ran out of CE space, reallocate to new buffer.
3625 If reallocation fails, reset pointers and bail out,
3626 there's no guarantee of the right character position after
3627 this bail*/
3628 if (!increaseCEsCapacity(source)) {
3629 *status = U_MEMORY_ALLOCATION_ERROR;
3630 break;
3631 }
3632
3633 endCEBuffer = source->extendCEs + source->extendCEsSize;
3634 }
3635
3636 if ((temp.flags & UCOL_ITER_INNORMBUF) != 0) {
3637 rawOffset = (int32_t)(temp.fcdPosition - temp.string);
3638 } else {
3639 rawOffset = (int32_t)(temp.pos - temp.string);
3640 }
3641
3642 CE = ucol_IGetNextCE(coll, &temp, status);
3643 }
3644
3645 if (strbuffer != buffer) {
3646 uprv_free(strbuffer);
3647 }
3648 if (U_FAILURE(*status)) {
3649 return (uint32_t)UCOL_NULLORDER;
3650 }
3651
3652 if (source->offsetRepeatValue != 0) {
3653 if (CECount > noChars) {
3654 source->offsetRepeatCount += temp.offsetRepeatCount;
3655 } else {
3656 // **** does this really skip the right offsets? ****
3657 source->offsetReturn -= (noChars - CECount);
3658 }
3659 }
3660
3661 if (offsetBias >= 0) {
3662 source->offsetReturn = source->offsetStore - 1;
3663 if (source->offsetReturn == source->offsetBuffer) {
3664 source->offsetStore = source->offsetBuffer;
3665 }
3666 }
3667
3668 source->toReturn = source->CEpos - 1;
3669 if (source->toReturn == source->CEs) {
3670 source->CEpos = source->CEs;
3671 }
3672
3673 return *(source->toReturn);
3674 }
3675 case LONG_PRIMARY_TAG:
3676 {
3677 *(source->CEpos++) = ((CE & 0xFFFF00) << 8) | (UCOL_BYTE_COMMON << 8) | UCOL_BYTE_COMMON;
3678 *(source->CEpos++) = ((CE & 0xFF)<<24)|UCOL_CONTINUATION_MARKER;
3679 source->toReturn = source->CEpos - 1;
3680
3681 if (source->flags & UCOL_ITER_INNORMBUF) {
3682 source->offsetRepeatCount = 1;
3683 } else {
3684 int32_t firstOffset = (int32_t)(source->pos - source->string);
3685
3686 source->appendOffset(firstOffset, *status);
3687 source->appendOffset(firstOffset + 1, *status);
3688
3689 source->offsetReturn = source->offsetStore - 1;
3690 *(source->offsetBuffer) = firstOffset;
3691 if (source->offsetReturn == source->offsetBuffer) {
3692 source->offsetStore = source->offsetBuffer;
3693 }
3694 }
3695
3696
3697 return *(source->toReturn);
3698 }
3699
3700 case EXPANSION_TAG: /* this tag always returns */
3701 {
3702 /*
3703 This should handle expansion.
3704 NOTE: we can encounter both continuations and expansions in an expansion!
3705 I have to decide where continuations are going to be dealt with
3706 */
3707 int32_t firstOffset = (int32_t)(source->pos - source->string);
3708
3709 // **** doesn't work if using iterator ****
3710 if (source->offsetReturn != NULL) {
3711 if (! (source->flags & UCOL_ITER_INNORMBUF) && source->offsetReturn == source->offsetBuffer) {
3712 source->offsetStore = source->offsetBuffer;
3713 }else {
3714 firstOffset = -1;
3715 }
3716 }
3717
3718 /* find the offset to expansion table */
3719 CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE);
3720 size = getExpansionCount(CE);
3721 if (size != 0) {
3722 /*
3723 if there are less than 16 elements in expansion, we don't terminate
3724 */
3725 uint32_t count;
3726
3727 for (count = 0; count < size; count++) {
3728 *(source->CEpos ++) = *CEOffset++;
3729
3730 if (firstOffset >= 0) {
3731 source->appendOffset(firstOffset + 1, *status);
3732 }
3733 }
3734 } else {
3735 /* else, we do */
3736 while (*CEOffset != 0) {
3737 *(source->CEpos ++) = *CEOffset ++;
3738
3739 if (firstOffset >= 0) {
3740 source->appendOffset(firstOffset + 1, *status);
3741 }
3742 }
3743 }
3744
3745 if (firstOffset >= 0) {
3746 source->offsetReturn = source->offsetStore - 1;
3747 *(source->offsetBuffer) = firstOffset;
3748 if (source->offsetReturn == source->offsetBuffer) {
3749 source->offsetStore = source->offsetBuffer;
3750 }
3751 } else {
3752 source->offsetRepeatCount += size - 1;
3753 }
3754
3755 source->toReturn = source->CEpos - 1;
3756 // in case of one element expansion, we
3757 // want to immediately return CEpos
3758 if(source->toReturn == source->CEs) {
3759 source->CEpos = source->CEs;
3760 }
3761
3762 return *(source->toReturn);
3763 }
3764
3765 case DIGIT_TAG:
3766 {
3767 /*
3768 We do a check to see if we want to collate digits as numbers; if so we generate
3769 a custom collation key. Otherwise we pull out the value stored in the expansion table.
3770 */
3771 uint32_t i; /* general counter */
3772
3773 if (source->coll->numericCollation == UCOL_ON){
3774 uint32_t digIndx = 0;
3775 uint32_t endIndex = 0;
3776 uint32_t leadingZeroIndex = 0;
3777 uint32_t trailingZeroCount = 0;
3778
3779 uint8_t collateVal = 0;
3780
3781 UBool nonZeroValReached = FALSE;
3782
3783 uint8_t numTempBuf[UCOL_MAX_DIGITS_FOR_NUMBER/2 + 2]; // I just need a temporary place to store my generated CEs.
3784 /*
3785 We parse the source string until we hit a char that's NOT a digit.
3786 Use this u_charDigitValue. This might be slow because we have to
3787 handle surrogates...
3788 */
3789 /*
3790 We need to break up the digit string into collection elements of UCOL_MAX_DIGITS_FOR_NUMBER or less,
3791 with any chunks smaller than that being on the right end of the digit string - i.e. the first collation
3792 element we process when going backward. To determine how long that chunk might be, we may need to make
3793 two passes through the loop that collects digits - one to see how long the string is (and how much is
3794 leading zeros) to determine the length of that right-hand chunk, and a second (if the whole string has
3795 more than UCOL_MAX_DIGITS_FOR_NUMBER non-leading-zero digits) to actually process that collation
3796 element chunk after resetting the state to the initialState at the right side of the digit string.
3797 */
3798 uint32_t ceLimit = 0;
3799 UChar initial_ch = ch;
3800 collIterateState initialState = {0,0,0,0,0,0,0,0,0};
3801 backupState(source, &initialState);
3802
3803 for(;;) {
3804 collIterateState state = {0,0,0,0,0,0,0,0,0};
3805 UChar32 char32 = 0;
3806 int32_t digVal = 0;
3807
3808 if (U16_IS_TRAIL (ch)) {
3809 if (!collIter_bos(source)){
3810 UChar lead = getPrevNormalizedChar(source, status);
3811 if(U16_IS_LEAD(lead)) {
3812 char32 = U16_GET_SUPPLEMENTARY(lead,ch);
3813 goBackOne(source);
3814 } else {
3815 char32 = ch;
3816 }
3817 } else {
3818 char32 = ch;
3819 }
3820 } else {
3821 char32 = ch;
3822 }
3823 digVal = u_charDigitValue(char32);
3824
3825 for(;;) {
3826 // Make sure we have enough space. No longer needed;
3827 // at this point the largest value of digIndx when we need to save data in numTempBuf
3828 // is UCOL_MAX_DIGITS_FOR_NUMBER-1 (digIndx is post-incremented) so we just ensure
3829 // that numTempBuf is big enough (UCOL_MAX_DIGITS_FOR_NUMBER/2 + 2).
3830
3831 // Skip over trailing zeroes, and keep a count of them.
3832 if (digVal != 0)
3833 nonZeroValReached = TRUE;
3834
3835 if (nonZeroValReached) {
3836 /*
3837 We parse the digit string into base 100 numbers (this fits into a byte).
3838 We only add to the buffer in twos, thus if we are parsing an odd character,
3839 that serves as the 'tens' digit while the if we are parsing an even one, that
3840 is the 'ones' digit. We dumped the parsed base 100 value (collateVal) into
3841 a buffer. We multiply each collateVal by 2 (to give us room) and add 5 (to avoid
3842 overlapping magic CE byte values). The last byte we subtract 1 to ensure it is less
3843 than all the other bytes.
3844
3845 Since we're doing in this reverse we want to put the first digit encountered into the
3846 ones place and the second digit encountered into the tens place.
3847 */
3848
3849 if ((digIndx + trailingZeroCount) % 2 == 1) {
3850 // High-order digit case (tens place)
3851 collateVal += (uint8_t)(digVal * 10);
3852
3853 // We cannot set leadingZeroIndex unless it has been set for the
3854 // low-order digit. Therefore, all we can do for the high-order
3855 // digit is turn it off, never on.
3856 // The only time we will have a high digit without a low is for
3857 // the very first non-zero digit, so no zero check is necessary.
3858 if (collateVal != 0)
3859 leadingZeroIndex = 0;
3860
3861 // The first pass through, digIndx may exceed the limit, but in that case
3862 // we no longer care about numTempBuf contents since they will be discarded
3863 if ( digIndx < UCOL_MAX_DIGITS_FOR_NUMBER ) {
3864 numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6;
3865 }
3866 collateVal = 0;
3867 } else {
3868 // Low-order digit case (ones place)
3869 collateVal = (uint8_t)digVal;
3870
3871 // Check for leading zeroes.
3872 if (collateVal == 0) {
3873 if (!leadingZeroIndex)
3874 leadingZeroIndex = (digIndx/2) + 2;
3875 } else
3876 leadingZeroIndex = 0;
3877
3878 // No need to write to buffer; the case of a last odd digit
3879 // is handled below.
3880 }
3881 ++digIndx;
3882 } else
3883 ++trailingZeroCount;
3884
3885 if (!collIter_bos(source)) {
3886 ch = getPrevNormalizedChar(source, status);
3887 //goBackOne(source);
3888 if (U16_IS_TRAIL(ch)) {
3889 backupState(source, &state);
3890 if (!collIter_bos(source)) {
3891 goBackOne(source);
3892 UChar lead = getPrevNormalizedChar(source, status);
3893
3894 if(U16_IS_LEAD(lead)) {
3895 char32 = U16_GET_SUPPLEMENTARY(lead,ch);
3896 } else {
3897 loadState(source, &state, FALSE);
3898 char32 = ch;
3899 }
3900 }
3901 } else
3902 char32 = ch;
3903
3904 if ((digVal = u_charDigitValue(char32)) == -1 || (ceLimit > 0 && (digIndx + trailingZeroCount) >= ceLimit)) {
3905 if (char32 > 0xFFFF) {// For surrogates.
3906 loadState(source, &state, FALSE);
3907 }
3908 // Don't need to "reverse" the goBackOne call,
3909 // as this points to the next position to process..
3910 //if (char32 > 0xFFFF) // For surrogates.
3911 //getNextNormalizedChar(source);
3912 break;
3913 }
3914
3915 goBackOne(source);
3916 }else
3917 break;
3918 }
3919
3920 if (digIndx + trailingZeroCount <= UCOL_MAX_DIGITS_FOR_NUMBER) {
3921 // our collation element is not too big, go ahead and finish with it
3922 break;
3923 }
3924 // our digit string is too long for a collation element;
3925 // set the limit for it, reset the state and begin again
3926 ceLimit = (digIndx + trailingZeroCount) % UCOL_MAX_DIGITS_FOR_NUMBER;
3927 if ( ceLimit == 0 ) {
3928 ceLimit = UCOL_MAX_DIGITS_FOR_NUMBER;
3929 }
3930 ch = initial_ch;
3931 loadState(source, &initialState, FALSE);
3932 digIndx = endIndex = leadingZeroIndex = trailingZeroCount = 0;
3933 collateVal = 0;
3934 nonZeroValReached = FALSE;
3935 }
3936
3937 if (! nonZeroValReached) {
3938 digIndx = 2;
3939 trailingZeroCount = 0;
3940 numTempBuf[2] = 6;
3941 }
3942
3943 if ((digIndx + trailingZeroCount) % 2 != 0) {
3944 numTempBuf[((digIndx)/2) + 2] = collateVal*2 + 6;
3945 digIndx += 1; // The implicit leading zero
3946 }
3947 if (trailingZeroCount % 2 != 0) {
3948 // We had to consume one trailing zero for the low digit
3949 // of the least significant byte
3950 digIndx += 1; // The trailing zero not in the exponent
3951 trailingZeroCount -= 1;
3952 }
3953
3954 endIndex = leadingZeroIndex ? leadingZeroIndex : ((digIndx/2) + 2) ;
3955
3956 // Subtract one off of the last byte. Really the first byte here, but it's reversed...
3957 numTempBuf[2] -= 1;
3958
3959 /*
3960 We want to skip over the first two slots in the buffer. The first slot
3961 is reserved for the header byte UCOL_CODAN_PLACEHOLDER. The second slot is for the
3962 sign/exponent byte: 0x80 + (decimalPos/2) & 7f.
3963 The exponent must be adjusted by the number of leading zeroes, and the number of
3964 trailing zeroes.
3965 */
3966 numTempBuf[0] = UCOL_CODAN_PLACEHOLDER;
3967 uint32_t exponent = (digIndx+trailingZeroCount)/2;
3968 if (leadingZeroIndex)
3969 exponent -= ((digIndx/2) + 2 - leadingZeroIndex);
3970 numTempBuf[1] = (uint8_t)(0x80 + (exponent & 0x7F));
3971
3972 // Now transfer the collation key to our collIterate struct.
3973 // The total size for our collation key is half of endIndex, rounded up.
3974 int32_t size = (endIndex+1)/2;
3975 if(!ensureCEsCapacity(source, size)) {
3976 return (uint32_t)UCOL_NULLORDER;
3977 }
3978 *(source->CEpos++) = (((numTempBuf[0] << 8) | numTempBuf[1]) << UCOL_PRIMARYORDERSHIFT) | //Primary weight
3979 (UCOL_BYTE_COMMON << UCOL_SECONDARYORDERSHIFT) | // Secondary weight
3980 UCOL_BYTE_COMMON; // Tertiary weight.
3981 i = endIndex - 1; // Reset the index into the buffer.
3982 while(i >= 2) {
3983 uint32_t primWeight = numTempBuf[i--] << 8;
3984 if ( i >= 2)
3985 primWeight |= numTempBuf[i--];
3986 *(source->CEpos++) = (primWeight << UCOL_PRIMARYORDERSHIFT) | UCOL_CONTINUATION_MARKER;
3987 }
3988
3989 source->toReturn = source->CEpos -1;
3990 return *(source->toReturn);
3991 } else {
3992 CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE);
3993 CE = *(CEOffset++);
3994 break;
3995 }
3996 }
3997
3998 case HANGUL_SYLLABLE_TAG: /* AC00-D7AF*/
3999 {
4000 static const uint32_t
4001 SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7;
4002 //const uint32_t LCount = 19;
4003 static const uint32_t VCount = 21;
4004 static const uint32_t TCount = 28;
4005 //const uint32_t NCount = VCount * TCount; /* 588 */
4006 //const uint32_t SCount = LCount * NCount; /* 11172 */
4007
4008 uint32_t L = ch - SBase;
4009 /*
4010 divide into pieces.
4011 we do it in this order since some compilers can do % and / in one
4012 operation
4013 */
4014 uint32_t T = L % TCount;
4015 L /= TCount;
4016 uint32_t V = L % VCount;
4017 L /= VCount;
4018
4019 /* offset them */
4020 L += LBase;
4021 V += VBase;
4022 T += TBase;
4023
4024 int32_t firstOffset = (int32_t)(source->pos - source->string);
4025 source->appendOffset(firstOffset, *status);
4026
4027 /*
4028 * return the first CE, but first put the rest into the expansion buffer
4029 */
4030 if (!source->coll->image->jamoSpecial) {
4031 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, L);
4032 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, V);
4033 source->appendOffset(firstOffset + 1, *status);
4034
4035 if (T != TBase) {
4036 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, T);
4037 source->appendOffset(firstOffset + 1, *status);
4038 }
4039
4040 source->toReturn = source->CEpos - 1;
4041
4042 source->offsetReturn = source->offsetStore - 1;
4043 if (source->offsetReturn == source->offsetBuffer) {
4044 source->offsetStore = source->offsetBuffer;
4045 }
4046
4047 return *(source->toReturn);
4048 } else {
4049 // Since Hanguls pass the FCD check, it is
4050 // guaranteed that we won't be in
4051 // the normalization buffer if something like this happens
4052
4053 // Move Jamos into normalization buffer
4054 UChar *tempbuffer = source->writableBuffer.getBuffer(5);
4055 int32_t tempbufferLength, jamoOffset;
4056 tempbuffer[0] = 0;
4057 tempbuffer[1] = (UChar)L;
4058 tempbuffer[2] = (UChar)V;
4059 if (T != TBase) {
4060 tempbuffer[3] = (UChar)T;
4061 tempbufferLength = 4;
4062 } else {
4063 tempbufferLength = 3;
4064 }
4065 source->writableBuffer.releaseBuffer(tempbufferLength);
4066
4067 // Indicate where to continue in main input string after exhausting the writableBuffer
4068 if (source->pos == source->string) {
4069 jamoOffset = 0;
4070 source->fcdPosition = NULL;
4071 } else {
4072 jamoOffset = source->pos - source->string;
4073 source->fcdPosition = source->pos-1;
4074 }
4075
4076 // Append offsets for the additional chars
4077 // (not the 0, and not the L whose offsets match the original Hangul)
4078 int32_t jamoRemaining = tempbufferLength - 2;
4079 jamoOffset++; // appended offsets should match end of original Hangul
4080 while (jamoRemaining-- > 0) {
4081 source->appendOffset(jamoOffset, *status);
4082 }
4083
4084 source->offsetRepeatValue = jamoOffset;
4085
4086 source->offsetReturn = source->offsetStore - 1;
4087 if (source->offsetReturn == source->offsetBuffer) {
4088 source->offsetStore = source->offsetBuffer;
4089 }
4090
4091 source->pos = source->writableBuffer.getTerminatedBuffer() + tempbufferLength;
4092 source->origFlags = source->flags;
4093 source->flags |= UCOL_ITER_INNORMBUF;
4094 source->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
4095
4096 return(UCOL_IGNORABLE);
4097 }
4098 }
4099
4100 case IMPLICIT_TAG: /* everything that is not defined otherwise */
4101 return getPrevImplicit(ch, source);
4102
4103 // TODO: Remove CJK implicits as they are handled by the getImplicitPrimary function
4104 case CJK_IMPLICIT_TAG: /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D*/
4105 return getPrevImplicit(ch, source);
4106
4107 case SURROGATE_TAG: /* This is a surrogate pair */
4108 /* essentially an engaged lead surrogate. */
4109 /* if you have encountered it here, it means that a */
4110 /* broken sequence was encountered and this is an error */
4111 return UCOL_NOT_FOUND;
4112
4113 case LEAD_SURROGATE_TAG: /* D800-DBFF*/
4114 return UCOL_NOT_FOUND; /* broken surrogate sequence */
4115
4116 case TRAIL_SURROGATE_TAG: /* DC00-DFFF*/
4117 {
4118 UChar32 cp = 0;
4119 UChar prevChar;
4120 const UChar *prev;
4121 if (isAtStartPrevIterate(source)) {
4122 /* we are at the start of the string, wrong place to be at */
4123 return UCOL_NOT_FOUND;
4124 }
4125 if (source->pos != source->writableBuffer.getBuffer()) {
4126 prev = source->pos - 1;
4127 } else {
4128 prev = source->fcdPosition;
4129 }
4130 prevChar = *prev;
4131
4132 /* Handles Han and Supplementary characters here.*/
4133 if (U16_IS_LEAD(prevChar)) {
4134 cp = ((((uint32_t)prevChar)<<10UL)+(ch)-(((uint32_t)0xd800<<10UL)+0xdc00-0x10000));
4135 source->pos = prev;
4136 } else {
4137 return UCOL_NOT_FOUND; /* like unassigned */
4138 }
4139
4140 return getPrevImplicit(cp, source);
4141 }
4142
4143 /* UCA is filled with these. Tailorings are NOT_FOUND */
4144 /* not yet implemented */
4145 case CHARSET_TAG: /* this tag always returns */
4146 /* probably after 1.8 */
4147 return UCOL_NOT_FOUND;
4148
4149 default: /* this tag always returns */
4150 *status = U_INTERNAL_PROGRAM_ERROR;
4151 CE=0;
4152 break;
4153 }
4154
4155 if (CE <= UCOL_NOT_FOUND) {
4156 break;
4157 }
4158 }
4159
4160 return CE;
4161 }
4162
4163 /* This should really be a macro */
4164 /* This function is used to reverse parts of a buffer. We need this operation when doing continuation */
4165 /* secondaries in French */
4166 /*
4167 void uprv_ucol_reverse_buffer(uint8_t *start, uint8_t *end) {
4168 uint8_t temp;
4169 while(start<end) {
4170 temp = *start;
4171 *start++ = *end;
4172 *end-- = temp;
4173 }
4174 }
4175 */
4176
/*
 * Reverses, in place, the elements between two pointers (both inclusive).
 * TYPE is the element type used for the swap temporary.
 * Note: the start and end arguments are evaluated several times and are
 * themselves modified (start is incremented, end is decremented), so they
 * must be modifiable pointer variables.
 */
#define uprv_ucol_reverse_buffer(TYPE, start, end) { \
    while((start)<(end)) { \
        TYPE tempA = *(start); \
        *(start)++ = *(end); \
        *(end)-- = tempA; \
    } \
}
4185
4186 /****************************************************************************/
4187 /* Following are the sortkey generation functions */
4188 /* */
4189 /****************************************************************************/
4190
4191 /**
4192 * Merge two sort keys.
4193 * This is useful, for example, to combine sort keys from first and last names
4194 * to sort such pairs.
4195 * Merged sort keys consider on each collation level the first part first entirely,
4196 * then the second one.
4197 * It is possible to merge multiple sort keys by consecutively merging
4198 * another one with the intermediate result.
4199 *
4200 * The length of the merge result is the sum of the lengths of the input sort keys
4201 * minus 1.
4202 *
4203 * @param src1 the first sort key
4204 * @param src1Length the length of the first sort key, including the zero byte at the end;
4205 * can be -1 if the function is to find the length
4206 * @param src2 the second sort key
4207 * @param src2Length the length of the second sort key, including the zero byte at the end;
4208 * can be -1 if the function is to find the length
4209 * @param dest the buffer where the merged sort key is written,
4210 * can be NULL if destCapacity==0
4211 * @param destCapacity the number of bytes in the dest buffer
4212 * @return the length of the merged sort key, src1Length+src2Length-1;
4213 * can be larger than destCapacity, or 0 if an error occurs (only for illegal arguments),
4214 * in which cases the contents of dest is undefined
4215 *
4216 * @draft
4217 */
4218 U_CAPI int32_t U_EXPORT2
ucol_mergeSortkeys(const uint8_t * src1,int32_t src1Length,const uint8_t * src2,int32_t src2Length,uint8_t * dest,int32_t destCapacity)4219 ucol_mergeSortkeys(const uint8_t *src1, int32_t src1Length,
4220 const uint8_t *src2, int32_t src2Length,
4221 uint8_t *dest, int32_t destCapacity) {
4222 int32_t destLength;
4223 uint8_t b;
4224
4225 /* check arguments */
4226 if( src1==NULL || src1Length<-2 || src1Length==0 || (src1Length>0 && src1[src1Length-1]!=0) ||
4227 src2==NULL || src2Length<-2 || src2Length==0 || (src2Length>0 && src2[src2Length-1]!=0) ||
4228 destCapacity<0 || (destCapacity>0 && dest==NULL)
4229 ) {
4230 /* error, attempt to write a zero byte and return 0 */
4231 if(dest!=NULL && destCapacity>0) {
4232 *dest=0;
4233 }
4234 return 0;
4235 }
4236
4237 /* check lengths and capacity */
4238 if(src1Length<0) {
4239 src1Length=(int32_t)uprv_strlen((const char *)src1)+1;
4240 }
4241 if(src2Length<0) {
4242 src2Length=(int32_t)uprv_strlen((const char *)src2)+1;
4243 }
4244
4245 destLength=src1Length+src2Length-1;
4246 if(destLength>destCapacity) {
4247 /* the merged sort key does not fit into the destination */
4248 return destLength;
4249 }
4250
4251 /* merge the sort keys with the same number of levels */
4252 while(*src1!=0 && *src2!=0) { /* while both have another level */
4253 /* copy level from src1 not including 00 or 01 */
4254 while((b=*src1)>=2) {
4255 ++src1;
4256 *dest++=b;
4257 }
4258
4259 /* add a 02 merge separator */
4260 *dest++=2;
4261
4262 /* copy level from src2 not including 00 or 01 */
4263 while((b=*src2)>=2) {
4264 ++src2;
4265 *dest++=b;
4266 }
4267
4268 /* if both sort keys have another level, then add a 01 level separator and continue */
4269 if(*src1==1 && *src2==1) {
4270 ++src1;
4271 ++src2;
4272 *dest++=1;
4273 }
4274 }
4275
4276 /*
4277 * here, at least one sort key is finished now, but the other one
4278 * might have some contents left from containing more levels;
4279 * that contents is just appended to the result
4280 */
4281 if(*src1!=0) {
4282 /* src1 is not finished, therefore *src2==0, and src1 is appended */
4283 src2=src1;
4284 }
4285 /* append src2, "the other, unfinished sort key" */
4286 uprv_strcpy((char *)dest, (const char *)src2);
4287
4288 /* trust that neither sort key contained illegally embedded zero bytes */
4289 return destLength;
4290 }
4291
4292 U_NAMESPACE_BEGIN
4293
4294 class SortKeyByteSink : public ByteSink {
4295 public:
SortKeyByteSink(char * dest,int32_t destCapacity)4296 SortKeyByteSink(char *dest, int32_t destCapacity)
4297 : buffer_(dest), capacity_(destCapacity),
4298 appended_(0) {
4299 if (buffer_ == NULL) {
4300 capacity_ = 0;
4301 } else if(capacity_ < 0) {
4302 buffer_ = NULL;
4303 capacity_ = 0;
4304 }
4305 }
4306 virtual ~SortKeyByteSink();
4307
4308 virtual void Append(const char *bytes, int32_t n);
Append(uint32_t b)4309 void Append(uint32_t b) {
4310 if (appended_ < capacity_ || Resize(1, appended_)) {
4311 buffer_[appended_] = (char)b;
4312 }
4313 ++appended_;
4314 }
Append(uint32_t b1,uint32_t b2)4315 void Append(uint32_t b1, uint32_t b2) {
4316 int32_t a2 = appended_ + 2;
4317 if (a2 <= capacity_ || Resize(2, appended_)) {
4318 buffer_[appended_] = (char)b1;
4319 buffer_[appended_ + 1] = (char)b2;
4320 } else if(appended_ < capacity_) {
4321 buffer_[appended_] = (char)b1;
4322 }
4323 appended_ = a2;
4324 }
4325 virtual char *GetAppendBuffer(int32_t min_capacity,
4326 int32_t desired_capacity_hint,
4327 char *scratch, int32_t scratch_capacity,
4328 int32_t *result_capacity);
NumberOfBytesAppended() const4329 int32_t NumberOfBytesAppended() const { return appended_; }
4330 /** @return FALSE if memory allocation failed */
IsOk() const4331 UBool IsOk() const { return buffer_ != NULL; }
4332
4333 protected:
4334 virtual void AppendBeyondCapacity(const char *bytes, int32_t n, int32_t length) = 0;
4335 virtual UBool Resize(int32_t appendCapacity, int32_t length) = 0;
4336
SetNotOk()4337 void SetNotOk() {
4338 buffer_ = NULL;
4339 capacity_ = 0;
4340 }
4341
4342 char *buffer_;
4343 int32_t capacity_;
4344 int32_t appended_;
4345
4346 private:
4347 SortKeyByteSink(const SortKeyByteSink &); // copy constructor not implemented
4348 SortKeyByteSink &operator=(const SortKeyByteSink &); // assignment operator not implemented
4349 };
4350
~SortKeyByteSink()4351 SortKeyByteSink::~SortKeyByteSink() {}
4352
4353 void
Append(const char * bytes,int32_t n)4354 SortKeyByteSink::Append(const char *bytes, int32_t n) {
4355 if (n <= 0 || bytes == NULL) {
4356 return;
4357 }
4358 int32_t length = appended_;
4359 appended_ += n;
4360 if ((buffer_ + length) == bytes) {
4361 return; // the caller used GetAppendBuffer() and wrote the bytes already
4362 }
4363 int32_t available = capacity_ - length;
4364 if (n <= available) {
4365 uprv_memcpy(buffer_ + length, bytes, n);
4366 } else {
4367 AppendBeyondCapacity(bytes, n, length);
4368 }
4369 }
4370
4371 char *
GetAppendBuffer(int32_t min_capacity,int32_t desired_capacity_hint,char * scratch,int32_t scratch_capacity,int32_t * result_capacity)4372 SortKeyByteSink::GetAppendBuffer(int32_t min_capacity,
4373 int32_t desired_capacity_hint,
4374 char *scratch,
4375 int32_t scratch_capacity,
4376 int32_t *result_capacity) {
4377 if (min_capacity < 1 || scratch_capacity < min_capacity) {
4378 *result_capacity = 0;
4379 return NULL;
4380 }
4381 int32_t available = capacity_ - appended_;
4382 if (available >= min_capacity) {
4383 *result_capacity = available;
4384 return buffer_ + appended_;
4385 } else if (Resize(desired_capacity_hint, appended_)) {
4386 *result_capacity = capacity_ - appended_;
4387 return buffer_ + appended_;
4388 } else {
4389 *result_capacity = scratch_capacity;
4390 return scratch;
4391 }
4392 }
4393
4394 class FixedSortKeyByteSink : public SortKeyByteSink {
4395 public:
FixedSortKeyByteSink(char * dest,int32_t destCapacity)4396 FixedSortKeyByteSink(char *dest, int32_t destCapacity)
4397 : SortKeyByteSink(dest, destCapacity) {}
4398 virtual ~FixedSortKeyByteSink();
4399
4400 private:
4401 virtual void AppendBeyondCapacity(const char *bytes, int32_t n, int32_t length);
4402 virtual UBool Resize(int32_t appendCapacity, int32_t length);
4403 };
4404
~FixedSortKeyByteSink()4405 FixedSortKeyByteSink::~FixedSortKeyByteSink() {}
4406
4407 void
AppendBeyondCapacity(const char * bytes,int32_t,int32_t length)4408 FixedSortKeyByteSink::AppendBeyondCapacity(const char *bytes, int32_t /*n*/, int32_t length) {
4409 // buffer_ != NULL && bytes != NULL && n > 0 && appended_ > capacity_
4410 // Fill the buffer completely.
4411 int32_t available = capacity_ - length;
4412 if (available > 0) {
4413 uprv_memcpy(buffer_ + length, bytes, available);
4414 }
4415 }
4416
4417 UBool
Resize(int32_t,int32_t)4418 FixedSortKeyByteSink::Resize(int32_t /*appendCapacity*/, int32_t /*length*/) {
4419 return FALSE;
4420 }
4421
4422 class CollationKeyByteSink : public SortKeyByteSink {
4423 public:
CollationKeyByteSink(CollationKey & key)4424 CollationKeyByteSink(CollationKey &key)
4425 : SortKeyByteSink(reinterpret_cast<char *>(key.getBytes()), key.getCapacity()),
4426 key_(key) {}
4427 virtual ~CollationKeyByteSink();
4428
4429 private:
4430 virtual void AppendBeyondCapacity(const char *bytes, int32_t n, int32_t length);
4431 virtual UBool Resize(int32_t appendCapacity, int32_t length);
4432
4433 CollationKey &key_;
4434 };
4435
~CollationKeyByteSink()4436 CollationKeyByteSink::~CollationKeyByteSink() {}
4437
4438 void
AppendBeyondCapacity(const char * bytes,int32_t n,int32_t length)4439 CollationKeyByteSink::AppendBeyondCapacity(const char *bytes, int32_t n, int32_t length) {
4440 // buffer_ != NULL && bytes != NULL && n > 0 && appended_ > capacity_
4441 if (Resize(n, length)) {
4442 uprv_memcpy(buffer_ + length, bytes, n);
4443 }
4444 }
4445
4446 UBool
Resize(int32_t appendCapacity,int32_t length)4447 CollationKeyByteSink::Resize(int32_t appendCapacity, int32_t length) {
4448 if (buffer_ == NULL) {
4449 return FALSE; // allocation failed before already
4450 }
4451 int32_t newCapacity = 2 * capacity_;
4452 int32_t altCapacity = length + 2 * appendCapacity;
4453 if (newCapacity < altCapacity) {
4454 newCapacity = altCapacity;
4455 }
4456 if (newCapacity < 200) {
4457 newCapacity = 200;
4458 }
4459 uint8_t *newBuffer = key_.reallocate(newCapacity, length);
4460 if (newBuffer == NULL) {
4461 SetNotOk();
4462 return FALSE;
4463 }
4464 buffer_ = reinterpret_cast<char *>(newBuffer);
4465 capacity_ = newCapacity;
4466 return TRUE;
4467 }
4468
4469 /**
4470 * uint8_t byte buffer, similar to CharString but simpler.
4471 */
4472 class SortKeyLevel : public UMemory {
4473 public:
SortKeyLevel()4474 SortKeyLevel() : len(0), ok(TRUE) {}
~SortKeyLevel()4475 ~SortKeyLevel() {}
4476
4477 /** @return FALSE if memory allocation failed */
isOk() const4478 UBool isOk() const { return ok; }
isEmpty() const4479 UBool isEmpty() const { return len == 0; }
length() const4480 int32_t length() const { return len; }
data() const4481 const uint8_t *data() const { return buffer.getAlias(); }
operator [](int32_t index) const4482 uint8_t operator[](int32_t index) const { return buffer[index]; }
4483
4484 void appendByte(uint32_t b);
4485
appendTo(ByteSink & sink) const4486 void appendTo(ByteSink &sink) const {
4487 sink.Append(reinterpret_cast<const char *>(buffer.getAlias()), len);
4488 }
4489
lastByte()4490 uint8_t &lastByte() {
4491 U_ASSERT(len > 0);
4492 return buffer[len - 1];
4493 }
4494
getLastFewBytes(int32_t n)4495 uint8_t *getLastFewBytes(int32_t n) {
4496 if (ok && len >= n) {
4497 return buffer.getAlias() + len - n;
4498 } else {
4499 return NULL;
4500 }
4501 }
4502
4503 private:
4504 MaybeStackArray<uint8_t, 40> buffer;
4505 int32_t len;
4506 UBool ok;
4507
4508 UBool ensureCapacity(int32_t appendCapacity);
4509
4510 SortKeyLevel(const SortKeyLevel &other); // forbid copying of this class
4511 SortKeyLevel &operator=(const SortKeyLevel &other); // forbid copying of this class
4512 };
4513
appendByte(uint32_t b)4514 void SortKeyLevel::appendByte(uint32_t b) {
4515 if(len < buffer.getCapacity() || ensureCapacity(1)) {
4516 buffer[len++] = (uint8_t)b;
4517 }
4518 }
4519
ensureCapacity(int32_t appendCapacity)4520 UBool SortKeyLevel::ensureCapacity(int32_t appendCapacity) {
4521 if(!ok) {
4522 return FALSE;
4523 }
4524 int32_t newCapacity = 2 * buffer.getCapacity();
4525 int32_t altCapacity = len + 2 * appendCapacity;
4526 if (newCapacity < altCapacity) {
4527 newCapacity = altCapacity;
4528 }
4529 if (newCapacity < 200) {
4530 newCapacity = 200;
4531 }
4532 if(buffer.resize(newCapacity, len)==NULL) {
4533 return ok = FALSE;
4534 }
4535 return TRUE;
4536 }
4537
4538 U_NAMESPACE_END
4539
4540 /* sortkey API */
4541 U_CAPI int32_t U_EXPORT2
ucol_getSortKey(const UCollator * coll,const UChar * source,int32_t sourceLength,uint8_t * result,int32_t resultLength)4542 ucol_getSortKey(const UCollator *coll,
4543 const UChar *source,
4544 int32_t sourceLength,
4545 uint8_t *result,
4546 int32_t resultLength)
4547 {
4548 UTRACE_ENTRY(UTRACE_UCOL_GET_SORTKEY);
4549 if (UTRACE_LEVEL(UTRACE_VERBOSE)) {
4550 UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source string = %vh ", coll, source,
4551 ((sourceLength==-1 && source!=NULL) ? u_strlen(source) : sourceLength));
4552 }
4553
4554 if(coll->delegate != NULL) {
4555 return ((const Collator*)coll->delegate)->getSortKey(source, sourceLength, result, resultLength);
4556 }
4557
4558 UErrorCode status = U_ZERO_ERROR;
4559 int32_t keySize = 0;
4560
4561 if(source != NULL) {
4562 // source == NULL is actually an error situation, but we would need to
4563 // have an error code to return it. Until we introduce a new
4564 // API, it stays like this
4565
4566 /* this uses the function pointer that is set in updateinternalstate */
4567 /* currently, there are two funcs: */
4568 /*ucol_calcSortKey(...);*/
4569 /*ucol_calcSortKeySimpleTertiary(...);*/
4570
4571 uint8_t noDest[1] = { 0 };
4572 if(result == NULL) {
4573 // Distinguish pure preflighting from an allocation error.
4574 result = noDest;
4575 resultLength = 0;
4576 }
4577 FixedSortKeyByteSink sink(reinterpret_cast<char *>(result), resultLength);
4578 coll->sortKeyGen(coll, source, sourceLength, sink, &status);
4579 if(U_SUCCESS(status)) {
4580 keySize = sink.NumberOfBytesAppended();
4581 }
4582 }
4583 UTRACE_DATA2(UTRACE_VERBOSE, "Sort Key = %vb", result, keySize);
4584 UTRACE_EXIT_STATUS(status);
4585 return keySize;
4586 }
4587
4588 U_CFUNC int32_t
ucol_getCollationKey(const UCollator * coll,const UChar * source,int32_t sourceLength,CollationKey & key,UErrorCode & errorCode)4589 ucol_getCollationKey(const UCollator *coll,
4590 const UChar *source, int32_t sourceLength,
4591 CollationKey &key,
4592 UErrorCode &errorCode) {
4593 CollationKeyByteSink sink(key);
4594 coll->sortKeyGen(coll, source, sourceLength, sink, &errorCode);
4595 return sink.NumberOfBytesAppended();
4596 }
4597
4598 // Is this primary weight compressible?
4599 // Returns false for multi-lead-byte scripts (digits, Latin, Han, implicit).
4600 // TODO: This should use per-lead-byte flags from FractionalUCA.txt.
4601 static inline UBool
isCompressible(const UCollator *,uint8_t primary1)4602 isCompressible(const UCollator * /*coll*/, uint8_t primary1) {
4603 return UCOL_BYTE_FIRST_NON_LATIN_PRIMARY <= primary1 && primary1 <= maxRegularPrimary;
4604 }
4605
4606 static
doCaseShift(SortKeyLevel & cases,uint32_t & caseShift)4607 inline void doCaseShift(SortKeyLevel &cases, uint32_t &caseShift) {
4608 if (caseShift == 0) {
4609 cases.appendByte(UCOL_CASE_BYTE_START);
4610 caseShift = UCOL_CASE_SHIFT_START;
4611 }
4612 }
4613
4614 // Packs the secondary buffer when processing French locale.
4615 static void
packFrench(const uint8_t * secondaries,int32_t secsize,SortKeyByteSink & result)4616 packFrench(const uint8_t *secondaries, int32_t secsize, SortKeyByteSink &result) {
4617 secondaries += secsize; // We read the secondary-level bytes back to front.
4618 uint8_t secondary;
4619 int32_t count2 = 0;
4620 int32_t i = 0;
4621 // we use i here since the key size already accounts for terminators, so we'll discard the increment
4622 for(i = 0; i<secsize; i++) {
4623 secondary = *(secondaries-i-1);
4624 /* This is compression code. */
4625 if (secondary == UCOL_COMMON2) {
4626 ++count2;
4627 } else {
4628 if (count2 > 0) {
4629 if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
4630 while (count2 > UCOL_TOP_COUNT2) {
4631 result.Append(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2);
4632 count2 -= (uint32_t)UCOL_TOP_COUNT2;
4633 }
4634 result.Append(UCOL_COMMON_TOP2 - (count2-1));
4635 } else {
4636 while (count2 > UCOL_BOT_COUNT2) {
4637 result.Append(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
4638 count2 -= (uint32_t)UCOL_BOT_COUNT2;
4639 }
4640 result.Append(UCOL_COMMON_BOT2 + (count2-1));
4641 }
4642 count2 = 0;
4643 }
4644 result.Append(secondary);
4645 }
4646 }
4647 if (count2 > 0) {
4648 while (count2 > UCOL_BOT_COUNT2) {
4649 result.Append(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
4650 count2 -= (uint32_t)UCOL_BOT_COUNT2;
4651 }
4652 result.Append(UCOL_COMMON_BOT2 + (count2-1));
4653 }
4654 }
4655
/* NOTE(review): not referenced in the visible part of this file; possibly a leftover - confirm before removing. */
#define DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY 0
4657
/* This is the sortkey work horse function */
/**
 * Generates a sort key for source and appends it to result.
 *
 * Primary bytes are appended to result while the collation elements are read.
 * Secondary, case, tertiary and quaternary bytes are collected in separate
 * SortKeyLevel buffers and appended afterwards, each preceded by
 * UCOL_LEVELTERMINATOR. For identical strength an identical-level run is
 * written last. The key ends with a 0 byte.
 *
 * @param coll          the collator supplying strength, attributes and tables
 * @param source        the source string
 * @param sourceLength  length of source, or -1 to use u_strlen(source)
 * @param result        sink receiving the sort key bytes
 * @param status        in/out error code; nothing is done if it already indicates
 *                      a failure. Set to U_MEMORY_ALLOCATION_ERROR when a level
 *                      buffer or the sink reports that it is not ok.
 */
U_CFUNC void U_CALLCONV
ucol_calcSortKey(const UCollator *coll,
        const UChar *source,
        int32_t sourceLength,
        SortKeyByteSink &result,
        UErrorCode *status)
{
    if(U_FAILURE(*status)) {
        return;
    }

    // Primaries go directly into the output sink; the other levels are buffered.
    SortKeyByteSink &primaries = result;
    SortKeyLevel secondaries;
    SortKeyLevel tertiaries;
    SortKeyLevel cases;
    SortKeyLevel quads;

    UnicodeString normSource;

    int32_t len = (sourceLength == -1 ? u_strlen(source) : sourceLength);

    UColAttributeValue strength = coll->strength;

    // compareXxx is 0 when the level is generated and 0xFF when it is not:
    // the "weight > compareXxx" tests below can never be true for 0xFF.
    uint8_t compareSec = (uint8_t)((strength >= UCOL_SECONDARY)?0:0xFF);
    uint8_t compareTer = (uint8_t)((strength >= UCOL_TERTIARY)?0:0xFF);
    uint8_t compareQuad = (uint8_t)((strength >= UCOL_QUATERNARY)?0:0xFF);
    UBool compareIdent = (strength == UCOL_IDENTICAL);
    UBool doCase = (coll->caseLevel == UCOL_ON);
    UBool isFrenchSec = (coll->frenchCollation == UCOL_ON) && (compareSec == 0);
    UBool shifted = (coll->alternateHandling == UCOL_SHIFTED);
    //UBool qShifted = shifted && (compareQuad == 0);
    UBool doHiragana = (coll->hiraganaQ == UCOL_ON) && (compareQuad == 0);

    uint32_t variableTopValue = coll->variableTopValue;
    // TODO: UCOL_COMMON_BOT4 should be a function of qShifted. If we have no
    // qShifted, we don't need to set UCOL_COMMON_BOT4 so high.
    // Note: despite the upper-case names these three are local variables.
    uint8_t UCOL_COMMON_BOT4 = (uint8_t)((coll->variableTopValue>>8)+1);
    uint8_t UCOL_HIRAGANA_QUAD = 0;
    if(doHiragana) {
        UCOL_HIRAGANA_QUAD=UCOL_COMMON_BOT4++;
        /* allocate one more space for hiragana, value for hiragana */
    }
    uint8_t UCOL_BOT_COUNT4 = (uint8_t)(0xFF - UCOL_COMMON_BOT4);

    /* support for special features like caselevel and funky secondaries */
    int32_t lastSecondaryLength = 0;
    uint32_t caseShift = 0;

    /* If we need to normalize, we'll do it all at once at the beginning! */
    // NFD for identical strength, FCD when normalization is on, none otherwise.
    const Normalizer2 *norm2;
    if(compareIdent) {
        norm2 = Normalizer2Factory::getNFDInstance(*status);
    } else if(coll->normalizationMode != UCOL_OFF) {
        norm2 = Normalizer2Factory::getFCDInstance(*status);
    } else {
        norm2 = NULL;
    }
    if(norm2 != NULL) {
        normSource.setTo(FALSE, source, len);
        int32_t qcYesLength = norm2->spanQuickCheckYes(normSource, *status);
        if(qcYesLength != len) {
            // Only the part after the quick-check-yes prefix is normalized;
            // source and len are then switched to the normalized copy.
            UnicodeString unnormalized = normSource.tempSubString(qcYesLength);
            normSource.truncate(qcYesLength);
            norm2->normalizeSecondAndAppend(normSource, unnormalized, *status);
            source = normSource.getBuffer();
            len = normSource.length();
        }
    }
    collIterate s;
    IInit_collIterate(coll, source, len, &s, status);
    if(U_FAILURE(*status)) {
        return;
    }
    s.flags &= ~UCOL_ITER_NORM; // source passed the FCD test or else was normalized.

    uint32_t order = 0;

    uint8_t primary1 = 0;
    uint8_t primary2 = 0;
    uint8_t secondary = 0;
    uint8_t tertiary = 0;
    uint8_t caseSwitch = coll->caseSwitch;
    uint8_t tertiaryMask = coll->tertiaryMask;
    int8_t tertiaryAddition = coll->tertiaryAddition;
    uint8_t tertiaryTop = coll->tertiaryTop;
    uint8_t tertiaryBottom = coll->tertiaryBottom;
    uint8_t tertiaryCommon = coll->tertiaryCommon;
    uint8_t caseBits = 0;

    UBool wasShifted = FALSE;
    UBool notIsContinuation = FALSE;

    // Pending run lengths of "common" weights on levels 2, 3 and 4.
    uint32_t count2 = 0, count3 = 0, count4 = 0;
    // Lead byte of the current primary compression run; 0 when no run is open.
    uint8_t leadPrimary = 0;

    for(;;) {
        order = ucol_IGetNextCE(coll, &s, status);
        if(order == UCOL_NO_MORE_CES) {
            break;
        }

        // Completely ignorable CE: contributes nothing.
        if(order == 0) {
            continue;
        }

        notIsContinuation = !isContinuation(order);

        if(notIsContinuation) {
            tertiary = (uint8_t)(order & UCOL_BYTE_SIZE_MASK);
        } else {
            tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION));
        }

        // order is shifted in place: after these statements it holds the
        // upper 16 bits of the CE (the two primary bytes).
        secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
        primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
        primary1 = (uint8_t)(order >> 8);

        // The unpermuted lead byte is kept for the compressibility test.
        uint8_t originalPrimary1 = primary1;
        if(notIsContinuation && coll->leadBytePermutationTable != NULL) {
            primary1 = coll->leadBytePermutationTable[primary1];
        }

        if((shifted && ((notIsContinuation && order <= variableTopValue && primary1 > 0)
            || (!notIsContinuation && wasShifted)))
            || (wasShifted && primary1 == 0)) /* amendment to the UCA says that primary ignorables */
        {
            /* and other ignorables should be removed if following a shifted code point */
            if(primary1 == 0) { /* if we were shifted and we got an ignorable code point */
                /* we should just completely ignore it */
                continue;
            }
            if(compareQuad == 0) {
                // Flush the pending run of common quaternaries first.
                if(count4 > 0) {
                    while (count4 > UCOL_BOT_COUNT4) {
                        quads.appendByte(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4);
                        count4 -= UCOL_BOT_COUNT4;
                    }
                    quads.appendByte(UCOL_COMMON_BOT4 + (count4-1));
                    count4 = 0;
                }
                /* We are dealing with a variable and we're treating them as shifted */
                /* This is a shifted ignorable */
                if(primary1 != 0) { /* we need to check this since we could be in continuation */
                    quads.appendByte(primary1);
                }
                if(primary2 != 0) {
                    quads.appendByte(primary2);
                }
            }
            wasShifted = TRUE;
        } else {
            wasShifted = FALSE;
            /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */
            /* Usually, we'll have non-zero primary1 & primary2, except in cases of a-z and friends, when primary2 will */
            /* be zero with non zero primary1. */
            /* regular and simple sortkey calc */
            if(primary1 != UCOL_IGNORABLE) {
                if(notIsContinuation) {
                    if(leadPrimary == primary1) {
                        // Same lead byte as the open compression run: only the second byte is written.
                        primaries.Append(primary2);
                    } else {
                        // Close the open run with a marker that depends on whether the new lead byte is higher or lower.
                        if(leadPrimary != 0) {
                            primaries.Append((primary1 > leadPrimary) ? UCOL_BYTE_UNSHIFTED_MAX : UCOL_BYTE_UNSHIFTED_MIN);
                        }
                        if(primary2 == UCOL_IGNORABLE) {
                            /* one byter, not compressed */
                            primaries.Append(primary1);
                            leadPrimary = 0;
                        } else if(isCompressible(coll, originalPrimary1)) {
                            /* compress */
                            primaries.Append(leadPrimary = primary1, primary2);
                        } else {
                            leadPrimary = 0;
                            primaries.Append(primary1, primary2);
                        }
                    }
                } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */
                    if(primary2 == UCOL_IGNORABLE) {
                        primaries.Append(primary1);
                    } else {
                        primaries.Append(primary1, primary2);
                    }
                }
            }

            if(secondary > compareSec) {
                if(!isFrenchSec) {
                    /* This is compression code. */
                    if (secondary == UCOL_COMMON2 && notIsContinuation) {
                        ++count2;
                    } else {
                        if (count2 > 0) {
                            if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
                                while (count2 > UCOL_TOP_COUNT2) {
                                    secondaries.appendByte(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2);
                                    count2 -= (uint32_t)UCOL_TOP_COUNT2;
                                }
                                secondaries.appendByte(UCOL_COMMON_TOP2 - (count2-1));
                            } else {
                                while (count2 > UCOL_BOT_COUNT2) {
                                    secondaries.appendByte(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
                                    count2 -= (uint32_t)UCOL_BOT_COUNT2;
                                }
                                secondaries.appendByte(UCOL_COMMON_BOT2 + (count2-1));
                            }
                            count2 = 0;
                        }
                        secondaries.appendByte(secondary);
                    }
                } else {
                    /* Do the special handling for French secondaries */
                    /* We need to get continuation elements and do intermediate restore */
                    /* abc1c2c3de with french secondaries need to be edc1c2c3ba NOT edc3c2c1ba */
                    // French secondaries are stored uncompressed here; packFrench() compresses them at the end.
                    if(notIsContinuation) {
                        if (lastSecondaryLength > 1) {
                            uint8_t *frenchStartPtr = secondaries.getLastFewBytes(lastSecondaryLength);
                            if (frenchStartPtr != NULL) {
                                /* reverse secondaries from frenchStartPtr up to frenchEndPtr */
                                uint8_t *frenchEndPtr = frenchStartPtr + lastSecondaryLength - 1;
                                uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr);
                            }
                        }
                        lastSecondaryLength = 1;
                    } else {
                        ++lastSecondaryLength;
                    }
                    secondaries.appendByte(secondary);
                }
            }

            if(doCase && (primary1 > 0 || strength >= UCOL_SECONDARY)) {
                // do the case level if we need to do it. We don't want to calculate
                // case level for primary ignorables if we have only primary strength and case level
                // otherwise we would break well formedness of CEs
                // Case bits are packed into the last byte of cases; caseShift is the
                // next free bit position and doCaseShift() starts a new byte when it reaches 0.
                doCaseShift(cases, caseShift);
                if(notIsContinuation) {
                    caseBits = (uint8_t)(tertiary & 0xC0);

                    if(tertiary != 0) {
                        if(coll->caseFirst == UCOL_UPPER_FIRST) {
                            if((caseBits & 0xC0) == 0) {
                                cases.lastByte() |= 1 << (--caseShift);
                            } else {
                                cases.lastByte() |= 0 << (--caseShift);
                                /* second bit */
                                doCaseShift(cases, caseShift);
                                cases.lastByte() |= ((caseBits>>6)&1) << (--caseShift);
                            }
                        } else {
                            if((caseBits & 0xC0) == 0) {
                                cases.lastByte() |= 0 << (--caseShift);
                            } else {
                                cases.lastByte() |= 1 << (--caseShift);
                                /* second bit */
                                doCaseShift(cases, caseShift);
                                cases.lastByte() |= ((caseBits>>7)&1) << (--caseShift);
                            }
                        }
                    }
                }
            } else {
                if(notIsContinuation) {
                    tertiary ^= caseSwitch;
                }
            }

            tertiary &= tertiaryMask;
            if(tertiary > compareTer) {
                /* This is compression code. */
                /* sequence size check is included in the if clause */
                if (tertiary == tertiaryCommon && notIsContinuation) {
                    ++count3;
                } else {
                    if(tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL) {
                        tertiary += tertiaryAddition;
                    } else if(tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST) {
                        tertiary -= tertiaryAddition;
                    }
                    if (count3 > 0) {
                        if ((tertiary > tertiaryCommon)) {
                            while (count3 > coll->tertiaryTopCount) {
                                tertiaries.appendByte(tertiaryTop - coll->tertiaryTopCount);
                                count3 -= (uint32_t)coll->tertiaryTopCount;
                            }
                            tertiaries.appendByte(tertiaryTop - (count3-1));
                        } else {
                            while (count3 > coll->tertiaryBottomCount) {
                                tertiaries.appendByte(tertiaryBottom + coll->tertiaryBottomCount);
                                count3 -= (uint32_t)coll->tertiaryBottomCount;
                            }
                            tertiaries.appendByte(tertiaryBottom + (count3-1));
                        }
                        count3 = 0;
                    }
                    tertiaries.appendByte(tertiary);
                }
            }

            if(/*qShifted*/(compareQuad==0) && notIsContinuation) {
                if(s.flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we need to note it
                    if(count4>0) { // Close this part
                        while (count4 > UCOL_BOT_COUNT4) {
                            quads.appendByte(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4);
                            count4 -= UCOL_BOT_COUNT4;
                        }
                        quads.appendByte(UCOL_COMMON_BOT4 + (count4-1));
                        count4 = 0;
                    }
                    quads.appendByte(UCOL_HIRAGANA_QUAD); // Add the Hiragana
                } else { // This wasn't Hiragana, so we can continue adding stuff
                    count4++;
                }
            }
        }
    }

    /* Here, we are generally done with processing */
    /* bailing out would not be too productive */

    // ok collects the allocation state of the level buffers and of the sink.
    UBool ok = TRUE;
    if(U_SUCCESS(*status)) {
        /* we have done all the CE's, now let's put them together to form a key */
        if(compareSec == 0) {
            // Flush the trailing run of common secondaries.
            if (count2 > 0) {
                while (count2 > UCOL_BOT_COUNT2) {
                    secondaries.appendByte(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
                    count2 -= (uint32_t)UCOL_BOT_COUNT2;
                }
                secondaries.appendByte(UCOL_COMMON_BOT2 + (count2-1));
            }
            result.Append(UCOL_LEVELTERMINATOR);
            if(!secondaries.isOk()) {
                ok = FALSE;
            } else if(!isFrenchSec) {
                secondaries.appendTo(result);
            } else {
                // If there are any unresolved continuation secondaries,
                // reverse them here so that we can reverse the whole secondary thing.
                if (lastSecondaryLength > 1) {
                    uint8_t *frenchStartPtr = secondaries.getLastFewBytes(lastSecondaryLength);
                    if (frenchStartPtr != NULL) {
                        /* reverse secondaries from frenchStartPtr up to frenchEndPtr */
                        uint8_t *frenchEndPtr = frenchStartPtr + lastSecondaryLength - 1;
                        uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr);
                    }
                }
                packFrench(secondaries.data(), secondaries.length(), result);
            }
        }

        if(doCase) {
            ok &= cases.isOk();
            result.Append(UCOL_LEVELTERMINATOR);
            cases.appendTo(result);
        }

        if(compareTer == 0) {
            // Flush the trailing run of common tertiaries.
            if (count3 > 0) {
                if (coll->tertiaryCommon != UCOL_COMMON_BOT3) {
                    while (count3 >= coll->tertiaryTopCount) {
                        tertiaries.appendByte(tertiaryTop - coll->tertiaryTopCount);
                        count3 -= (uint32_t)coll->tertiaryTopCount;
                    }
                    tertiaries.appendByte(tertiaryTop - count3);
                } else {
                    while (count3 > coll->tertiaryBottomCount) {
                        tertiaries.appendByte(tertiaryBottom + coll->tertiaryBottomCount);
                        count3 -= (uint32_t)coll->tertiaryBottomCount;
                    }
                    tertiaries.appendByte(tertiaryBottom + (count3-1));
                }
            }
            ok &= tertiaries.isOk();
            result.Append(UCOL_LEVELTERMINATOR);
            tertiaries.appendTo(result);

            // The quaternary and identical levels are only written inside the tertiary branch.
            if(compareQuad == 0/*qShifted == TRUE*/) {
                if(count4 > 0) {
                    while (count4 > UCOL_BOT_COUNT4) {
                        quads.appendByte(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4);
                        count4 -= UCOL_BOT_COUNT4;
                    }
                    quads.appendByte(UCOL_COMMON_BOT4 + (count4-1));
                }
                ok &= quads.isOk();
                result.Append(UCOL_LEVELTERMINATOR);
                quads.appendTo(result);
            }

            if(compareIdent) {
                result.Append(UCOL_LEVELTERMINATOR);
                u_writeIdenticalLevelRun(s.string, len, result);
            }
        }
        // Terminating 0 byte of the sort key.
        result.Append(0);
    }

    /* To avoid memory leak, free the offset buffer if necessary. */
    ucol_freeOffsetBuffer(&s);

    ok &= result.IsOk();
    if(!ok && U_SUCCESS(*status)) { *status = U_MEMORY_ALLOCATION_ERROR; }
}
5061
5062
5063 U_CFUNC void U_CALLCONV
ucol_calcSortKeySimpleTertiary(const UCollator * coll,const UChar * source,int32_t sourceLength,SortKeyByteSink & result,UErrorCode * status)5064 ucol_calcSortKeySimpleTertiary(const UCollator *coll,
5065 const UChar *source,
5066 int32_t sourceLength,
5067 SortKeyByteSink &result,
5068 UErrorCode *status)
5069 {
5070 U_ALIGN_CODE(16);
5071
5072 if(U_FAILURE(*status)) {
5073 return;
5074 }
5075
5076 SortKeyByteSink &primaries = result;
5077 SortKeyLevel secondaries;
5078 SortKeyLevel tertiaries;
5079
5080 UnicodeString normSource;
5081
5082 int32_t len = sourceLength;
5083
5084 /* If we need to normalize, we'll do it all at once at the beginning! */
5085 if(coll->normalizationMode != UCOL_OFF) {
5086 normSource.setTo(len < 0, source, len);
5087 const Normalizer2 *norm2 = Normalizer2Factory::getFCDInstance(*status);
5088 int32_t qcYesLength = norm2->spanQuickCheckYes(normSource, *status);
5089 if(qcYesLength != normSource.length()) {
5090 UnicodeString unnormalized = normSource.tempSubString(qcYesLength);
5091 normSource.truncate(qcYesLength);
5092 norm2->normalizeSecondAndAppend(normSource, unnormalized, *status);
5093 source = normSource.getBuffer();
5094 len = normSource.length();
5095 }
5096 }
5097 collIterate s;
5098 IInit_collIterate(coll, (UChar *)source, len, &s, status);
5099 if(U_FAILURE(*status)) {
5100 return;
5101 }
5102 s.flags &= ~UCOL_ITER_NORM; // source passed the FCD test or else was normalized.
5103
5104 uint32_t order = 0;
5105
5106 uint8_t primary1 = 0;
5107 uint8_t primary2 = 0;
5108 uint8_t secondary = 0;
5109 uint8_t tertiary = 0;
5110 uint8_t caseSwitch = coll->caseSwitch;
5111 uint8_t tertiaryMask = coll->tertiaryMask;
5112 int8_t tertiaryAddition = coll->tertiaryAddition;
5113 uint8_t tertiaryTop = coll->tertiaryTop;
5114 uint8_t tertiaryBottom = coll->tertiaryBottom;
5115 uint8_t tertiaryCommon = coll->tertiaryCommon;
5116
5117 UBool notIsContinuation = FALSE;
5118
5119 uint32_t count2 = 0, count3 = 0;
5120 uint8_t leadPrimary = 0;
5121
5122 for(;;) {
5123 order = ucol_IGetNextCE(coll, &s, status);
5124
5125 if(order == 0) {
5126 continue;
5127 }
5128
5129 if(order == UCOL_NO_MORE_CES) {
5130 break;
5131 }
5132
5133 notIsContinuation = !isContinuation(order);
5134
5135 if(notIsContinuation) {
5136 tertiary = (uint8_t)((order & tertiaryMask));
5137 } else {
5138 tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION));
5139 }
5140
5141 secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
5142 primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
5143 primary1 = (uint8_t)(order >> 8);
5144
5145 uint8_t originalPrimary1 = primary1;
5146 if (coll->leadBytePermutationTable != NULL && notIsContinuation) {
5147 primary1 = coll->leadBytePermutationTable[primary1];
5148 }
5149
5150 /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */
5151 /* Usually, we'll have non-zero primary1 & primary2, except in cases of a-z and friends, when primary2 will */
5152 /* be zero with non zero primary1. primary3 is different than 0 only for long primaries - see above. */
5153 /* regular and simple sortkey calc */
5154 if(primary1 != UCOL_IGNORABLE) {
5155 if(notIsContinuation) {
5156 if(leadPrimary == primary1) {
5157 primaries.Append(primary2);
5158 } else {
5159 if(leadPrimary != 0) {
5160 primaries.Append((primary1 > leadPrimary) ? UCOL_BYTE_UNSHIFTED_MAX : UCOL_BYTE_UNSHIFTED_MIN);
5161 }
5162 if(primary2 == UCOL_IGNORABLE) {
5163 /* one byter, not compressed */
5164 primaries.Append(primary1);
5165 leadPrimary = 0;
5166 } else if(isCompressible(coll, originalPrimary1)) {
5167 /* compress */
5168 primaries.Append(leadPrimary = primary1, primary2);
5169 } else {
5170 leadPrimary = 0;
5171 primaries.Append(primary1, primary2);
5172 }
5173 }
5174 } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */
5175 if(primary2 == UCOL_IGNORABLE) {
5176 primaries.Append(primary1);
5177 } else {
5178 primaries.Append(primary1, primary2);
5179 }
5180 }
5181 }
5182
5183 if(secondary > 0) { /* I think that != 0 test should be != IGNORABLE */
5184 /* This is compression code. */
5185 if (secondary == UCOL_COMMON2 && notIsContinuation) {
5186 ++count2;
5187 } else {
5188 if (count2 > 0) {
5189 if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
5190 while (count2 > UCOL_TOP_COUNT2) {
5191 secondaries.appendByte(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2);
5192 count2 -= (uint32_t)UCOL_TOP_COUNT2;
5193 }
5194 secondaries.appendByte(UCOL_COMMON_TOP2 - (count2-1));
5195 } else {
5196 while (count2 > UCOL_BOT_COUNT2) {
5197 secondaries.appendByte(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
5198 count2 -= (uint32_t)UCOL_BOT_COUNT2;
5199 }
5200 secondaries.appendByte(UCOL_COMMON_BOT2 + (count2-1));
5201 }
5202 count2 = 0;
5203 }
5204 secondaries.appendByte(secondary);
5205 }
5206 }
5207
5208 if(notIsContinuation) {
5209 tertiary ^= caseSwitch;
5210 }
5211
5212 if(tertiary > 0) {
5213 /* This is compression code. */
5214 /* sequence size check is included in the if clause */
5215 if (tertiary == tertiaryCommon && notIsContinuation) {
5216 ++count3;
5217 } else {
5218 if(tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL) {
5219 tertiary += tertiaryAddition;
5220 } else if (tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST) {
5221 tertiary -= tertiaryAddition;
5222 }
5223 if (count3 > 0) {
5224 if ((tertiary > tertiaryCommon)) {
5225 while (count3 > coll->tertiaryTopCount) {
5226 tertiaries.appendByte(tertiaryTop - coll->tertiaryTopCount);
5227 count3 -= (uint32_t)coll->tertiaryTopCount;
5228 }
5229 tertiaries.appendByte(tertiaryTop - (count3-1));
5230 } else {
5231 while (count3 > coll->tertiaryBottomCount) {
5232 tertiaries.appendByte(tertiaryBottom + coll->tertiaryBottomCount);
5233 count3 -= (uint32_t)coll->tertiaryBottomCount;
5234 }
5235 tertiaries.appendByte(tertiaryBottom + (count3-1));
5236 }
5237 count3 = 0;
5238 }
5239 tertiaries.appendByte(tertiary);
5240 }
5241 }
5242 }
5243
5244 UBool ok = TRUE;
5245 if(U_SUCCESS(*status)) {
5246 /* we have done all the CE's, now let's put them together to form a key */
5247 if (count2 > 0) {
5248 while (count2 > UCOL_BOT_COUNT2) {
5249 secondaries.appendByte(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
5250 count2 -= (uint32_t)UCOL_BOT_COUNT2;
5251 }
5252 secondaries.appendByte(UCOL_COMMON_BOT2 + (count2-1));
5253 }
5254 ok &= secondaries.isOk();
5255 result.Append(UCOL_LEVELTERMINATOR);
5256 secondaries.appendTo(result);
5257
5258 if (count3 > 0) {
5259 if (coll->tertiaryCommon != UCOL_COMMON3_NORMAL) {
5260 while (count3 >= coll->tertiaryTopCount) {
5261 tertiaries.appendByte(tertiaryTop - coll->tertiaryTopCount);
5262 count3 -= (uint32_t)coll->tertiaryTopCount;
5263 }
5264 tertiaries.appendByte(tertiaryTop - count3);
5265 } else {
5266 while (count3 > coll->tertiaryBottomCount) {
5267 tertiaries.appendByte(tertiaryBottom + coll->tertiaryBottomCount);
5268 count3 -= (uint32_t)coll->tertiaryBottomCount;
5269 }
5270 tertiaries.appendByte(tertiaryBottom + (count3-1));
5271 }
5272 }
5273 ok &= tertiaries.isOk();
5274 result.Append(UCOL_LEVELTERMINATOR);
5275 tertiaries.appendTo(result);
5276
5277 result.Append(0);
5278 }
5279
5280 /* To avoid memory leak, free the offset buffer if necessary. */
5281 ucol_freeOffsetBuffer(&s);
5282
5283 ok &= result.IsOk();
5284 if(!ok && U_SUCCESS(*status)) { *status = U_MEMORY_ALLOCATION_ERROR; }
5285 }
5286
5287 static inline
isShiftedCE(uint32_t CE,uint32_t LVT,UBool * wasShifted)5288 UBool isShiftedCE(uint32_t CE, uint32_t LVT, UBool *wasShifted) {
5289 UBool notIsContinuation = !isContinuation(CE);
5290 uint8_t primary1 = (uint8_t)((CE >> 24) & 0xFF);
5291 if((LVT && ((notIsContinuation && (CE & 0xFFFF0000)<= LVT && primary1 > 0)
5292 || (!notIsContinuation && *wasShifted)))
5293 || (*wasShifted && primary1 == 0)) /* amendment to the UCA says that primary ignorables */
5294 {
5295 // The stuff below should probably be in the sortkey code... maybe not...
5296 if(primary1 != 0) { /* if we were shifted and we got an ignorable code point */
5297 /* we should just completely ignore it */
5298 *wasShifted = TRUE;
5299 //continue;
5300 }
5301 //*wasShifted = TRUE;
5302 return TRUE;
5303 } else {
5304 *wasShifted = FALSE;
5305 return FALSE;
5306 }
5307 }
5308 static inline
terminatePSKLevel(int32_t level,int32_t maxLevel,int32_t & i,uint8_t * dest)5309 void terminatePSKLevel(int32_t level, int32_t maxLevel, int32_t &i, uint8_t *dest) {
5310 if(level < maxLevel) {
5311 dest[i++] = UCOL_LEVELTERMINATOR;
5312 } else {
5313 dest[i++] = 0;
5314 }
5315 }
5316
/**
 * Enumeration of level identifiers for partial sort key generation.
 * Every value below UCOL_PSK_LIMIT fits into the three-bit level field
 * (UCOL_PSK_LEVEL_MASK = 7) of the state word.
 */
enum {
    UCOL_PSK_PRIMARY = 0,
    UCOL_PSK_SECONDARY = 1,
    UCOL_PSK_CASE = 2,
    UCOL_PSK_TERTIARY = 3,
    UCOL_PSK_QUATERNARY = 4,
    UCOL_PSK_QUIN = 5,      /** This is an extra level, not used - but we have three bits to blow */
    UCOL_PSK_IDENTICAL = 6,
    UCOL_PSK_NULL = 7,      /** level for the end of sort key. Will just produce zeros */
    UCOL_PSK_LIMIT          /** number of level identifiers; not a level itself */
};
5329
/** collation state enum. *_SHIFT value is how much to shift right
 *  to get the state piece to the right. *_MASK value should be
 *  ANDed with the shifted state. This data is stored in state[1]
 *  field.
 */
enum {
    UCOL_PSK_LEVEL_SHIFT = 0,      /** level identifier. stores an enum value from above */
    UCOL_PSK_LEVEL_MASK = 7,       /** three bits */
    UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT = 3, /** number of bytes of primary or quaternary already written */
    UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK = 1,
    /** can be only 0 or 1, since we get up to two bytes from primary or quaternary
     *  This field is also used to denote that the French secondary level is finished
     */
    UCOL_PSK_WAS_SHIFTED_SHIFT = 4,/** was the last value shifted */
    UCOL_PSK_WAS_SHIFTED_MASK = 1, /** can be 0 or 1 (Boolean) */
    UCOL_PSK_USED_FRENCH_SHIFT = 5,/** how many French bytes have we already written */
    UCOL_PSK_USED_FRENCH_MASK = 3, /** up to 4 bytes. See comment just below */
    /** When we do French we need to reverse secondary values. However, continuations
     *  need to stay the same. So if you had abc1c2c3de, you need to have edc1c2c3ba
     *  NOTE(review): a mask of 3 can only store the values 0..3, which does not
     *  obviously match "up to 4 bytes" above - confirm the intended range.
     */
    UCOL_PSK_BOCSU_BYTES_SHIFT = 7,     /** two-bit field starting at bit 7 */
    UCOL_PSK_BOCSU_BYTES_MASK = 3,
    UCOL_PSK_CONSUMED_CES_SHIFT = 9,    /** field starting at bit 9 */
    UCOL_PSK_CONSUMED_CES_MASK = 0x7FFFF /** 19 bits wide */
};
5355
// Macro calculating the number of expansion CEs available: the distance
// between the CEpos and toReturn members of s.
// The expansion is fully parenthesized so that it can be used safely inside
// larger expressions (e.g. 2 * uprv_numAvailableExpCEs(s)).
#define uprv_numAvailableExpCEs(s) ((s).CEpos - (s).toReturn)
5358
5359
5360 /** main sortkey part procedure. On the first call,
5361 * you should pass in a collator, an iterator, empty state
5362 * state[0] == state[1] == 0, a buffer to hold results
5363 * number of bytes you need and an error code pointer.
5364 * Make sure your buffer is big enough to hold the wanted
5365 * number of sortkey bytes. I don't check.
5366 * The only meaningful status you can get back is
5367 * U_BUFFER_OVERFLOW_ERROR, which basically means that you
5368 * have been dealt a raw deal and that you probably won't
5369 * be able to use partial sortkey generation for this
5370 * particular combination of string and collator. This
5371 * is highly unlikely, but you should still check the error code.
5372 * Any other status means that you're not in a sane situation
5373 * anymore. After the first call, preserve state values and
5374 * use them on subsequent calls to obtain more bytes of a sortkey.
5375 * Use until the number of bytes written is smaller than the requested
5376 * number of bytes. Generated sortkey is not compatible with the
5377 * one generated by ucol_getSortKey, as we don't do any compression.
5378 * However, levels are still terminated by a 1 (one) and the sortkey
5379 * is terminated by a 0 (zero). Identical level is the same as in the
5380 * regular sortkey - internal bocu-1 implementation is used.
5381 * For curious, although you cannot do much about this, here is
5382 * the structure of state words.
5383 * state[0] - iterator state. Depends on the iterator implementation,
5384 * but allows the iterator to continue where it stopped in
5385 * the last iteration.
5386 * state[1] - collation processing state. Here is the distribution
5387 * of the bits:
5388 * 0, 1, 2 - level of the sortkey - primary, secondary, case, tertiary
5389 * quaternary, quin (we don't use this one), identical and
5390 * null (producing only zeroes - first one to terminate the
5391 * sortkey and subsequent to fill the buffer).
5392 * 3 - byte count. Number of bytes written on the primary level.
5393 * 4 - was shifted. Whether the previous iteration finished in the
5394 * shifted state.
5395 * 5, 6 - French continuation bytes written. See the comment in the enum
5396 * 7,8 - Bocsu bytes used. Number of bytes from a bocu sequence on
5397 * the identical level.
5398 * 9..31 - CEs consumed. Number of getCE or next32 operations performed
5399 * since the last successful update of the iterator state.
5400 */
5401 U_CAPI int32_t U_EXPORT2
ucol_nextSortKeyPart(const UCollator * coll,UCharIterator * iter,uint32_t state[2],uint8_t * dest,int32_t count,UErrorCode * status)5402 ucol_nextSortKeyPart(const UCollator *coll,
5403 UCharIterator *iter,
5404 uint32_t state[2],
5405 uint8_t *dest, int32_t count,
5406 UErrorCode *status)
5407 {
5408 /* error checking */
5409 if(status==NULL || U_FAILURE(*status)) {
5410 return 0;
5411 }
5412 UTRACE_ENTRY(UTRACE_UCOL_NEXTSORTKEYPART);
5413 if( coll==NULL || iter==NULL ||
5414 state==NULL ||
5415 count<0 || (count>0 && dest==NULL)
5416 ) {
5417 *status=U_ILLEGAL_ARGUMENT_ERROR;
5418 UTRACE_EXIT_STATUS(status);
5419 return 0;
5420 }
5421
5422 UTRACE_DATA6(UTRACE_VERBOSE, "coll=%p, iter=%p, state=%d %d, dest=%p, count=%d",
5423 coll, iter, state[0], state[1], dest, count);
5424
5425 if(count==0) {
5426 /* nothing to do */
5427 UTRACE_EXIT_VALUE(0);
5428 return 0;
5429 }
5430 /** Setting up situation according to the state we got from the previous iteration */
5431 // The state of the iterator from the previous invocation
5432 uint32_t iterState = state[0];
5433 // Has the last iteration ended in the shifted state
5434 UBool wasShifted = ((state[1] >> UCOL_PSK_WAS_SHIFTED_SHIFT) & UCOL_PSK_WAS_SHIFTED_MASK)?TRUE:FALSE;
5435 // What is the current level of the sortkey?
5436 int32_t level= (state[1] >> UCOL_PSK_LEVEL_SHIFT) & UCOL_PSK_LEVEL_MASK;
5437 // Have we written only one byte from a two byte primary in the previous iteration?
5438 // Also on secondary level - have we finished with the French secondary?
5439 int32_t byteCountOrFrenchDone = (state[1] >> UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT) & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK;
5440 // number of bytes in the continuation buffer for French
5441 int32_t usedFrench = (state[1] >> UCOL_PSK_USED_FRENCH_SHIFT) & UCOL_PSK_USED_FRENCH_MASK;
5442 // Number of bytes already written from a bocsu sequence. Since
5443 // the longes bocsu sequence is 4 long, this can be up to 3.
5444 int32_t bocsuBytesUsed = (state[1] >> UCOL_PSK_BOCSU_BYTES_SHIFT) & UCOL_PSK_BOCSU_BYTES_MASK;
5445 // Number of elements that need to be consumed in this iteration because
5446 // the iterator returned UITER_NO_STATE at the end of the last iteration,
5447 // so we had to save the last valid state.
5448 int32_t cces = (state[1] >> UCOL_PSK_CONSUMED_CES_SHIFT) & UCOL_PSK_CONSUMED_CES_MASK;
5449
5450 /** values that depend on the collator attributes */
5451 // strength of the collator.
5452 int32_t strength = ucol_getAttribute(coll, UCOL_STRENGTH, status);
5453 // maximal level of the partial sortkey. Need to take whether case level is done
5454 int32_t maxLevel = 0;
5455 if(strength < UCOL_TERTIARY) {
5456 if(ucol_getAttribute(coll, UCOL_CASE_LEVEL, status) == UCOL_ON) {
5457 maxLevel = UCOL_PSK_CASE;
5458 } else {
5459 maxLevel = strength;
5460 }
5461 } else {
5462 if(strength == UCOL_TERTIARY) {
5463 maxLevel = UCOL_PSK_TERTIARY;
5464 } else if(strength == UCOL_QUATERNARY) {
5465 maxLevel = UCOL_PSK_QUATERNARY;
5466 } else { // identical
5467 maxLevel = UCOL_IDENTICAL;
5468 }
5469 }
5470 // value for the quaternary level if Hiragana is encountered. Used for JIS X 4061 collation
5471 uint8_t UCOL_HIRAGANA_QUAD =
5472 (ucol_getAttribute(coll, UCOL_HIRAGANA_QUATERNARY_MODE, status) == UCOL_ON)?0xFE:0xFF;
5473 // Boundary value that decides whether a CE is shifted or not
5474 uint32_t LVT = (coll->alternateHandling == UCOL_SHIFTED)?(coll->variableTopValue<<16):0;
5475 // Are we doing French collation?
5476 UBool doingFrench = (ucol_getAttribute(coll, UCOL_FRENCH_COLLATION, status) == UCOL_ON);
5477
5478 /** initializing the collation state */
5479 UBool notIsContinuation = FALSE;
5480 uint32_t CE = UCOL_NO_MORE_CES;
5481
5482 collIterate s;
5483 IInit_collIterate(coll, NULL, -1, &s, status);
5484 if(U_FAILURE(*status)) {
5485 UTRACE_EXIT_STATUS(*status);
5486 return 0;
5487 }
5488 s.iterator = iter;
5489 s.flags |= UCOL_USE_ITERATOR;
5490 // This variable tells us whether we have produced some other levels in this iteration
5491 // before we moved to the identical level. In that case, we need to switch the
5492 // type of the iterator.
5493 UBool doingIdenticalFromStart = FALSE;
5494 // Normalizing iterator
5495 // The division for the array length may truncate the array size to
5496 // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
5497 // for all platforms anyway.
5498 UAlignedMemory stackNormIter[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
5499 UNormIterator *normIter = NULL;
5500 // If the normalization is turned on for the collator and we are below identical level
5501 // we will use a FCD normalizing iterator
5502 if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON && level < UCOL_PSK_IDENTICAL) {
5503 normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status);
5504 s.iterator = unorm_setIter(normIter, iter, UNORM_FCD, status);
5505 s.flags &= ~UCOL_ITER_NORM;
5506 if(U_FAILURE(*status)) {
5507 UTRACE_EXIT_STATUS(*status);
5508 return 0;
5509 }
5510 } else if(level == UCOL_PSK_IDENTICAL) {
5511 // for identical level, we need a NFD iterator. We need to instantiate it here, since we
5512 // will be updating the state - and this cannot be done on an ordinary iterator.
5513 normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status);
5514 s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status);
5515 s.flags &= ~UCOL_ITER_NORM;
5516 if(U_FAILURE(*status)) {
5517 UTRACE_EXIT_STATUS(*status);
5518 return 0;
5519 }
5520 doingIdenticalFromStart = TRUE;
5521 }
5522
5523 // This is the tentative new state of the iterator. The problem
5524 // is that the iterator might return an undefined state, in
5525 // which case we should save the last valid state and increase
5526 // the iterator skip value.
5527 uint32_t newState = 0;
5528
5529 // First, we set the iterator to the last valid position
5530 // from the last iteration. This was saved in state[0].
5531 if(iterState == 0) {
5532 /* initial state */
5533 if(level == UCOL_PSK_SECONDARY && doingFrench && !byteCountOrFrenchDone) {
5534 s.iterator->move(s.iterator, 0, UITER_LIMIT);
5535 } else {
5536 s.iterator->move(s.iterator, 0, UITER_START);
5537 }
5538 } else {
5539 /* reset to previous state */
5540 s.iterator->setState(s.iterator, iterState, status);
5541 if(U_FAILURE(*status)) {
5542 UTRACE_EXIT_STATUS(*status);
5543 return 0;
5544 }
5545 }
5546
5547
5548
5549 // This variable tells us whether we can attempt to update the state
5550 // of iterator. Situations where we don't want to update iterator state
5551 // are the existence of expansion CEs that are not yet processed, and
5552 // finishing the case level without enough space in the buffer to insert
5553 // a level terminator.
5554 UBool canUpdateState = TRUE;
5555
5556 // Consume all the CEs that were consumed at the end of the previous
5557 // iteration without updating the iterator state. On identical level,
5558 // consume the code points.
5559 int32_t counter = cces;
5560 if(level < UCOL_PSK_IDENTICAL) {
5561 while(counter-->0) {
5562 // If we're doing French and we are on the secondary level,
5563 // we go backwards.
5564 if(level == UCOL_PSK_SECONDARY && doingFrench) {
5565 CE = ucol_IGetPrevCE(coll, &s, status);
5566 } else {
5567 CE = ucol_IGetNextCE(coll, &s, status);
5568 }
5569 if(CE==UCOL_NO_MORE_CES) {
5570 /* should not happen */
5571 *status=U_INTERNAL_PROGRAM_ERROR;
5572 UTRACE_EXIT_STATUS(*status);
5573 return 0;
5574 }
5575 if(uprv_numAvailableExpCEs(s)) {
5576 canUpdateState = FALSE;
5577 }
5578 }
5579 } else {
5580 while(counter-->0) {
5581 uiter_next32(s.iterator);
5582 }
5583 }
5584
5585 // French secondary needs to know whether the iterator state of zero came from previous level OR
5586 // from a new invocation...
5587 UBool wasDoingPrimary = FALSE;
5588 // destination buffer byte counter. When this guy
5589 // gets to count, we're done with the iteration
5590 int32_t i = 0;
5591 // used to count the zero bytes written after we
5592 // have finished with the sort key
5593 int32_t j = 0;
5594
5595
5596 // Hm.... I think we're ready to plunge in. Basic story is as following:
5597 // we have a fall through case based on level. This is used for initial
5598 // positioning on iteration start. Every level processor contains a
5599 // for(;;) which will be broken when we exhaust all the CEs. Other
5600 // way to exit is a goto saveState, which happens when we have filled
5601 // out our buffer.
5602 switch(level) {
5603 case UCOL_PSK_PRIMARY:
5604 wasDoingPrimary = TRUE;
5605 for(;;) {
5606 if(i==count) {
5607 goto saveState;
5608 }
5609 // We should save the state only if we
5610 // are sure that we are done with the
5611 // previous iterator state
5612 if(canUpdateState && byteCountOrFrenchDone == 0) {
5613 newState = s.iterator->getState(s.iterator);
5614 if(newState != UITER_NO_STATE) {
5615 iterState = newState;
5616 cces = 0;
5617 }
5618 }
5619 CE = ucol_IGetNextCE(coll, &s, status);
5620 cces++;
5621 if(CE==UCOL_NO_MORE_CES) {
5622 // Add the level separator
5623 terminatePSKLevel(level, maxLevel, i, dest);
5624 byteCountOrFrenchDone=0;
5625 // Restart the iteration an move to the
5626 // second level
5627 s.iterator->move(s.iterator, 0, UITER_START);
5628 cces = 0;
5629 level = UCOL_PSK_SECONDARY;
5630 break;
5631 }
5632 if(!isContinuation(CE)){
5633 if(coll->leadBytePermutationTable != NULL){
5634 CE = (coll->leadBytePermutationTable[CE>>24] << 24) | (CE & 0x00FFFFFF);
5635 }
5636 }
5637 if(!isShiftedCE(CE, LVT, &wasShifted)) {
5638 CE >>= UCOL_PRIMARYORDERSHIFT; /* get primary */
5639 if(CE != 0) {
5640 if(byteCountOrFrenchDone == 0) {
5641 // get the second byte of primary
5642 dest[i++]=(uint8_t)(CE >> 8);
5643 } else {
5644 byteCountOrFrenchDone = 0;
5645 }
5646 if((CE &=0xff)!=0) {
5647 if(i==count) {
5648 /* overflow */
5649 byteCountOrFrenchDone = 1;
5650 cces--;
5651 goto saveState;
5652 }
5653 dest[i++]=(uint8_t)CE;
5654 }
5655 }
5656 }
5657 if(uprv_numAvailableExpCEs(s)) {
5658 canUpdateState = FALSE;
5659 } else {
5660 canUpdateState = TRUE;
5661 }
5662 }
5663 /* fall through to next level */
5664 case UCOL_PSK_SECONDARY:
5665 if(strength >= UCOL_SECONDARY) {
5666 if(!doingFrench) {
5667 for(;;) {
5668 if(i == count) {
5669 goto saveState;
5670 }
5671 // We should save the state only if we
5672 // are sure that we are done with the
5673 // previous iterator state
5674 if(canUpdateState) {
5675 newState = s.iterator->getState(s.iterator);
5676 if(newState != UITER_NO_STATE) {
5677 iterState = newState;
5678 cces = 0;
5679 }
5680 }
5681 CE = ucol_IGetNextCE(coll, &s, status);
5682 cces++;
5683 if(CE==UCOL_NO_MORE_CES) {
5684 // Add the level separator
5685 terminatePSKLevel(level, maxLevel, i, dest);
5686 byteCountOrFrenchDone = 0;
5687 // Restart the iteration an move to the
5688 // second level
5689 s.iterator->move(s.iterator, 0, UITER_START);
5690 cces = 0;
5691 level = UCOL_PSK_CASE;
5692 break;
5693 }
5694 if(!isShiftedCE(CE, LVT, &wasShifted)) {
5695 CE >>= 8; /* get secondary */
5696 if(CE != 0) {
5697 dest[i++]=(uint8_t)CE;
5698 }
5699 }
5700 if(uprv_numAvailableExpCEs(s)) {
5701 canUpdateState = FALSE;
5702 } else {
5703 canUpdateState = TRUE;
5704 }
5705 }
5706 } else { // French secondary processing
5707 uint8_t frenchBuff[UCOL_MAX_BUFFER];
5708 int32_t frenchIndex = 0;
5709 // Here we are going backwards.
5710 // If the iterator is at the beggining, it should be
5711 // moved to end.
5712 if(wasDoingPrimary) {
5713 s.iterator->move(s.iterator, 0, UITER_LIMIT);
5714 cces = 0;
5715 }
5716 for(;;) {
5717 if(i == count) {
5718 goto saveState;
5719 }
5720 if(canUpdateState) {
5721 newState = s.iterator->getState(s.iterator);
5722 if(newState != UITER_NO_STATE) {
5723 iterState = newState;
5724 cces = 0;
5725 }
5726 }
5727 CE = ucol_IGetPrevCE(coll, &s, status);
5728 cces++;
5729 if(CE==UCOL_NO_MORE_CES) {
5730 // Add the level separator
5731 terminatePSKLevel(level, maxLevel, i, dest);
5732 byteCountOrFrenchDone = 0;
5733 // Restart the iteration an move to the next level
5734 s.iterator->move(s.iterator, 0, UITER_START);
5735 level = UCOL_PSK_CASE;
5736 break;
5737 }
5738 if(isContinuation(CE)) { // if it's a continuation, we want to save it and
5739 // reverse when we get a first non-continuation CE.
5740 CE >>= 8;
5741 frenchBuff[frenchIndex++] = (uint8_t)CE;
5742 } else if(!isShiftedCE(CE, LVT, &wasShifted)) {
5743 CE >>= 8; /* get secondary */
5744 if(!frenchIndex) {
5745 if(CE != 0) {
5746 dest[i++]=(uint8_t)CE;
5747 }
5748 } else {
5749 frenchBuff[frenchIndex++] = (uint8_t)CE;
5750 frenchIndex -= usedFrench;
5751 usedFrench = 0;
5752 while(i < count && frenchIndex) {
5753 dest[i++] = frenchBuff[--frenchIndex];
5754 usedFrench++;
5755 }
5756 }
5757 }
5758 if(uprv_numAvailableExpCEs(s)) {
5759 canUpdateState = FALSE;
5760 } else {
5761 canUpdateState = TRUE;
5762 }
5763 }
5764 }
5765 } else {
5766 level = UCOL_PSK_CASE;
5767 }
5768 /* fall through to next level */
5769 case UCOL_PSK_CASE:
5770 if(ucol_getAttribute(coll, UCOL_CASE_LEVEL, status) == UCOL_ON) {
5771 uint32_t caseShift = UCOL_CASE_SHIFT_START;
5772 uint8_t caseByte = UCOL_CASE_BYTE_START;
5773 uint8_t caseBits = 0;
5774
5775 for(;;) {
5776 U_ASSERT(caseShift <= UCOL_CASE_SHIFT_START);
5777 if(i == count) {
5778 goto saveState;
5779 }
5780 // We should save the state only if we
5781 // are sure that we are done with the
5782 // previous iterator state
5783 if(canUpdateState) {
5784 newState = s.iterator->getState(s.iterator);
5785 if(newState != UITER_NO_STATE) {
5786 iterState = newState;
5787 cces = 0;
5788 }
5789 }
5790 CE = ucol_IGetNextCE(coll, &s, status);
5791 cces++;
5792 if(CE==UCOL_NO_MORE_CES) {
5793 // On the case level we might have an unfinished
5794 // case byte. Add one if it's started.
5795 if(caseShift != UCOL_CASE_SHIFT_START) {
5796 dest[i++] = caseByte;
5797 }
5798 cces = 0;
5799 // We have finished processing CEs on this level.
5800 // However, we don't know if we have enough space
5801 // to add a case level terminator.
5802 if(i < count) {
5803 // Add the level separator
5804 terminatePSKLevel(level, maxLevel, i, dest);
5805 // Restart the iteration and move to the
5806 // next level
5807 s.iterator->move(s.iterator, 0, UITER_START);
5808 level = UCOL_PSK_TERTIARY;
5809 } else {
5810 canUpdateState = FALSE;
5811 }
5812 break;
5813 }
5814
5815 if(!isShiftedCE(CE, LVT, &wasShifted)) {
5816 if(!isContinuation(CE) && ((CE & UCOL_PRIMARYMASK) != 0 || strength > UCOL_PRIMARY)) {
5817 // do the case level if we need to do it. We don't want to calculate
5818 // case level for primary ignorables if we have only primary strength and case level
5819 // otherwise we would break well formedness of CEs
5820 CE = (uint8_t)(CE & UCOL_BYTE_SIZE_MASK);
5821 caseBits = (uint8_t)(CE & 0xC0);
5822 // this copies the case level logic from the
5823 // sort key generation code
5824 if(CE != 0) {
5825 if (caseShift == 0) {
5826 dest[i++] = caseByte;
5827 caseShift = UCOL_CASE_SHIFT_START;
5828 caseByte = UCOL_CASE_BYTE_START;
5829 }
5830 if(coll->caseFirst == UCOL_UPPER_FIRST) {
5831 if((caseBits & 0xC0) == 0) {
5832 caseByte |= 1 << (--caseShift);
5833 } else {
5834 caseByte |= 0 << (--caseShift);
5835 /* second bit */
5836 if(caseShift == 0) {
5837 dest[i++] = caseByte;
5838 caseShift = UCOL_CASE_SHIFT_START;
5839 caseByte = UCOL_CASE_BYTE_START;
5840 }
5841 caseByte |= ((caseBits>>6)&1) << (--caseShift);
5842 }
5843 } else {
5844 if((caseBits & 0xC0) == 0) {
5845 caseByte |= 0 << (--caseShift);
5846 } else {
5847 caseByte |= 1 << (--caseShift);
5848 /* second bit */
5849 if(caseShift == 0) {
5850 dest[i++] = caseByte;
5851 caseShift = UCOL_CASE_SHIFT_START;
5852 caseByte = UCOL_CASE_BYTE_START;
5853 }
5854 caseByte |= ((caseBits>>7)&1) << (--caseShift);
5855 }
5856 }
5857 }
5858
5859 }
5860 }
5861 // Not sure this is correct for the case level - revisit
5862 if(uprv_numAvailableExpCEs(s)) {
5863 canUpdateState = FALSE;
5864 } else {
5865 canUpdateState = TRUE;
5866 }
5867 }
5868 } else {
5869 level = UCOL_PSK_TERTIARY;
5870 }
5871 /* fall through to next level */
5872 case UCOL_PSK_TERTIARY:
5873 if(strength >= UCOL_TERTIARY) {
5874 for(;;) {
5875 if(i == count) {
5876 goto saveState;
5877 }
5878 // We should save the state only if we
5879 // are sure that we are done with the
5880 // previous iterator state
5881 if(canUpdateState) {
5882 newState = s.iterator->getState(s.iterator);
5883 if(newState != UITER_NO_STATE) {
5884 iterState = newState;
5885 cces = 0;
5886 }
5887 }
5888 CE = ucol_IGetNextCE(coll, &s, status);
5889 cces++;
5890 if(CE==UCOL_NO_MORE_CES) {
5891 // Add the level separator
5892 terminatePSKLevel(level, maxLevel, i, dest);
5893 byteCountOrFrenchDone = 0;
5894 // Restart the iteration an move to the
5895 // second level
5896 s.iterator->move(s.iterator, 0, UITER_START);
5897 cces = 0;
5898 level = UCOL_PSK_QUATERNARY;
5899 break;
5900 }
5901 if(!isShiftedCE(CE, LVT, &wasShifted)) {
5902 notIsContinuation = !isContinuation(CE);
5903
5904 if(notIsContinuation) {
5905 CE = (uint8_t)(CE & UCOL_BYTE_SIZE_MASK);
5906 CE ^= coll->caseSwitch;
5907 CE &= coll->tertiaryMask;
5908 } else {
5909 CE = (uint8_t)((CE & UCOL_REMOVE_CONTINUATION));
5910 }
5911
5912 if(CE != 0) {
5913 dest[i++]=(uint8_t)CE;
5914 }
5915 }
5916 if(uprv_numAvailableExpCEs(s)) {
5917 canUpdateState = FALSE;
5918 } else {
5919 canUpdateState = TRUE;
5920 }
5921 }
5922 } else {
5923 // if we're not doing tertiary
5924 // skip to the end
5925 level = UCOL_PSK_NULL;
5926 }
5927 /* fall through to next level */
5928 case UCOL_PSK_QUATERNARY:
5929 if(strength >= UCOL_QUATERNARY) {
5930 for(;;) {
5931 if(i == count) {
5932 goto saveState;
5933 }
5934 // We should save the state only if we
5935 // are sure that we are done with the
5936 // previous iterator state
5937 if(canUpdateState) {
5938 newState = s.iterator->getState(s.iterator);
5939 if(newState != UITER_NO_STATE) {
5940 iterState = newState;
5941 cces = 0;
5942 }
5943 }
5944 CE = ucol_IGetNextCE(coll, &s, status);
5945 cces++;
5946 if(CE==UCOL_NO_MORE_CES) {
5947 // Add the level separator
5948 terminatePSKLevel(level, maxLevel, i, dest);
5949 //dest[i++] = UCOL_LEVELTERMINATOR;
5950 byteCountOrFrenchDone = 0;
5951 // Restart the iteration an move to the
5952 // second level
5953 s.iterator->move(s.iterator, 0, UITER_START);
5954 cces = 0;
5955 level = UCOL_PSK_QUIN;
5956 break;
5957 }
5958 if(CE==0)
5959 continue;
5960 if(isShiftedCE(CE, LVT, &wasShifted)) {
5961 CE >>= 16; /* get primary */
5962 if(CE != 0) {
5963 if(byteCountOrFrenchDone == 0) {
5964 dest[i++]=(uint8_t)(CE >> 8);
5965 } else {
5966 byteCountOrFrenchDone = 0;
5967 }
5968 if((CE &=0xff)!=0) {
5969 if(i==count) {
5970 /* overflow */
5971 byteCountOrFrenchDone = 1;
5972 goto saveState;
5973 }
5974 dest[i++]=(uint8_t)CE;
5975 }
5976 }
5977 } else {
5978 notIsContinuation = !isContinuation(CE);
5979 if(notIsContinuation) {
5980 if(s.flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we need to note it
5981 dest[i++] = UCOL_HIRAGANA_QUAD;
5982 } else {
5983 dest[i++] = 0xFF;
5984 }
5985 }
5986 }
5987 if(uprv_numAvailableExpCEs(s)) {
5988 canUpdateState = FALSE;
5989 } else {
5990 canUpdateState = TRUE;
5991 }
5992 }
5993 } else {
5994 // if we're not doing quaternary
5995 // skip to the end
5996 level = UCOL_PSK_NULL;
5997 }
5998 /* fall through to next level */
5999 case UCOL_PSK_QUIN:
6000 level = UCOL_PSK_IDENTICAL;
6001 /* fall through to next level */
6002 case UCOL_PSK_IDENTICAL:
6003 if(strength >= UCOL_IDENTICAL) {
6004 UChar32 first, second;
6005 int32_t bocsuBytesWritten = 0;
6006 // We always need to do identical on
6007 // the NFD form of the string.
6008 if(normIter == NULL) {
6009 // we arrived from the level below and
6010 // normalization was not turned on.
6011 // therefore, we need to make a fresh NFD iterator
6012 normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status);
6013 s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status);
6014 } else if(!doingIdenticalFromStart) {
6015 // there is an iterator, but we did some other levels.
6016 // therefore, we have a FCD iterator - need to make
6017 // a NFD one.
6018 // normIter being at the beginning does not guarantee
6019 // that the underlying iterator is at the beginning
6020 iter->move(iter, 0, UITER_START);
6021 s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status);
6022 }
6023 // At this point we have a NFD iterator that is positioned
6024 // in the right place
6025 if(U_FAILURE(*status)) {
6026 UTRACE_EXIT_STATUS(*status);
6027 return 0;
6028 }
6029 first = uiter_previous32(s.iterator);
6030 // maybe we're at the start of the string
6031 if(first == U_SENTINEL) {
6032 first = 0;
6033 } else {
6034 uiter_next32(s.iterator);
6035 }
6036
6037 j = 0;
6038 for(;;) {
6039 if(i == count) {
6040 if(j+1 < bocsuBytesWritten) {
6041 bocsuBytesUsed = j+1;
6042 }
6043 goto saveState;
6044 }
6045
6046 // On identical level, we will always save
6047 // the state if we reach this point, since
6048 // we don't depend on getNextCE for content
6049 // all the content is in our buffer and we
6050 // already either stored the full buffer OR
6051 // otherwise we won't arrive here.
6052 newState = s.iterator->getState(s.iterator);
6053 if(newState != UITER_NO_STATE) {
6054 iterState = newState;
6055 cces = 0;
6056 }
6057
6058 uint8_t buff[4];
6059 second = uiter_next32(s.iterator);
6060 cces++;
6061
6062 // end condition for identical level
6063 if(second == U_SENTINEL) {
6064 terminatePSKLevel(level, maxLevel, i, dest);
6065 level = UCOL_PSK_NULL;
6066 break;
6067 }
6068 bocsuBytesWritten = u_writeIdenticalLevelRunTwoChars(first, second, buff);
6069 first = second;
6070
6071 j = 0;
6072 if(bocsuBytesUsed != 0) {
6073 while(bocsuBytesUsed-->0) {
6074 j++;
6075 }
6076 }
6077
6078 while(i < count && j < bocsuBytesWritten) {
6079 dest[i++] = buff[j++];
6080 }
6081 }
6082
6083 } else {
6084 level = UCOL_PSK_NULL;
6085 }
6086 /* fall through to next level */
6087 case UCOL_PSK_NULL:
6088 j = i;
6089 while(j<count) {
6090 dest[j++]=0;
6091 }
6092 break;
6093 default:
6094 *status = U_INTERNAL_PROGRAM_ERROR;
6095 UTRACE_EXIT_STATUS(*status);
6096 return 0;
6097 }
6098
6099 saveState:
6100 // Now we need to return stuff. First we want to see whether we have
6101 // done everything for the current state of iterator.
6102 if(byteCountOrFrenchDone
6103 || canUpdateState == FALSE
6104 || (newState = s.iterator->getState(s.iterator)) == UITER_NO_STATE)
6105 {
6106 // Any of above mean that the previous transaction
6107 // wasn't finished and that we should store the
6108 // previous iterator state.
6109 state[0] = iterState;
6110 } else {
6111 // The transaction is complete. We will continue in the next iteration.
6112 state[0] = s.iterator->getState(s.iterator);
6113 cces = 0;
6114 }
6115 // Store the number of bocsu bytes written.
6116 if((bocsuBytesUsed & UCOL_PSK_BOCSU_BYTES_MASK) != bocsuBytesUsed) {
6117 *status = U_INDEX_OUTOFBOUNDS_ERROR;
6118 }
6119 state[1] = (bocsuBytesUsed & UCOL_PSK_BOCSU_BYTES_MASK) << UCOL_PSK_BOCSU_BYTES_SHIFT;
6120
6121 // Next we put in the level of comparison
6122 state[1] |= ((level & UCOL_PSK_LEVEL_MASK) << UCOL_PSK_LEVEL_SHIFT);
6123
6124 // If we are doing French, we need to store whether we have just finished the French level
6125 if(level == UCOL_PSK_SECONDARY && doingFrench) {
6126 state[1] |= (((int32_t)(state[0] == 0) & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK) << UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT);
6127 } else {
6128 state[1] |= ((byteCountOrFrenchDone & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK) << UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT);
6129 }
6130
6131 // Was the latest CE shifted
6132 if(wasShifted) {
6133 state[1] |= 1 << UCOL_PSK_WAS_SHIFTED_SHIFT;
6134 }
6135 // Check for cces overflow
6136 if((cces & UCOL_PSK_CONSUMED_CES_MASK) != cces) {
6137 *status = U_INDEX_OUTOFBOUNDS_ERROR;
6138 }
6139 // Store cces
6140 state[1] |= ((cces & UCOL_PSK_CONSUMED_CES_MASK) << UCOL_PSK_CONSUMED_CES_SHIFT);
6141
6142 // Check for French overflow
6143 if((usedFrench & UCOL_PSK_USED_FRENCH_MASK) != usedFrench) {
6144 *status = U_INDEX_OUTOFBOUNDS_ERROR;
6145 }
6146 // Store number of bytes written in the French secondary continuation sequence
6147 state[1] |= ((usedFrench & UCOL_PSK_USED_FRENCH_MASK) << UCOL_PSK_USED_FRENCH_SHIFT);
6148
6149
6150 // If we have used normalizing iterator, get rid of it
6151 if(normIter != NULL) {
6152 unorm_closeIter(normIter);
6153 }
6154
6155 /* To avoid memory leak, free the offset buffer if necessary. */
6156 ucol_freeOffsetBuffer(&s);
6157
6158 // Return number of meaningful sortkey bytes.
6159 UTRACE_DATA4(UTRACE_VERBOSE, "dest = %vb, state=%d %d",
6160 dest,i, state[0], state[1]);
6161 UTRACE_EXIT_VALUE(i);
6162 return i;
6163 }
6164
6165 /**
6166 * Produce a bound for a given sortkey and a number of levels.
6167 */
6168 U_CAPI int32_t U_EXPORT2
ucol_getBound(const uint8_t * source,int32_t sourceLength,UColBoundMode boundType,uint32_t noOfLevels,uint8_t * result,int32_t resultLength,UErrorCode * status)6169 ucol_getBound(const uint8_t *source,
6170 int32_t sourceLength,
6171 UColBoundMode boundType,
6172 uint32_t noOfLevels,
6173 uint8_t *result,
6174 int32_t resultLength,
6175 UErrorCode *status)
6176 {
6177 // consistency checks
6178 if(status == NULL || U_FAILURE(*status)) {
6179 return 0;
6180 }
6181 if(source == NULL) {
6182 *status = U_ILLEGAL_ARGUMENT_ERROR;
6183 return 0;
6184 }
6185
6186 int32_t sourceIndex = 0;
6187 // Scan the string until we skip enough of the key OR reach the end of the key
6188 do {
6189 sourceIndex++;
6190 if(source[sourceIndex] == UCOL_LEVELTERMINATOR) {
6191 noOfLevels--;
6192 }
6193 } while (noOfLevels > 0
6194 && (source[sourceIndex] != 0 || sourceIndex < sourceLength));
6195
6196 if((source[sourceIndex] == 0 || sourceIndex == sourceLength)
6197 && noOfLevels > 0) {
6198 *status = U_SORT_KEY_TOO_SHORT_WARNING;
6199 }
6200
6201
6202 // READ ME: this code assumes that the values for boundType
6203 // enum will not changes. They are set so that the enum value
6204 // corresponds to the number of extra bytes each bound type
6205 // needs.
6206 if(result != NULL && resultLength >= sourceIndex+boundType) {
6207 uprv_memcpy(result, source, sourceIndex);
6208 switch(boundType) {
6209 // Lower bound just gets terminated. No extra bytes
6210 case UCOL_BOUND_LOWER: // = 0
6211 break;
6212 // Upper bound needs one extra byte
6213 case UCOL_BOUND_UPPER: // = 1
6214 result[sourceIndex++] = 2;
6215 break;
6216 // Upper long bound needs two extra bytes
6217 case UCOL_BOUND_UPPER_LONG: // = 2
6218 result[sourceIndex++] = 0xFF;
6219 result[sourceIndex++] = 0xFF;
6220 break;
6221 default:
6222 *status = U_ILLEGAL_ARGUMENT_ERROR;
6223 return 0;
6224 }
6225 result[sourceIndex++] = 0;
6226
6227 return sourceIndex;
6228 } else {
6229 return sourceIndex+boundType+1;
6230 }
6231 }
6232
6233 /****************************************************************************/
6234 /* Following are the functions that deal with the properties of a collator */
6235 /* there are new APIs and some compatibility APIs */
6236 /****************************************************************************/
6237
6238 static inline void
ucol_addLatinOneEntry(UCollator * coll,UChar ch,uint32_t CE,int32_t * primShift,int32_t * secShift,int32_t * terShift)6239 ucol_addLatinOneEntry(UCollator *coll, UChar ch, uint32_t CE,
6240 int32_t *primShift, int32_t *secShift, int32_t *terShift)
6241 {
6242 uint8_t primary1 = 0, primary2 = 0, secondary = 0, tertiary = 0;
6243 UBool reverseSecondary = FALSE;
6244 UBool continuation = isContinuation(CE);
6245 if(!continuation) {
6246 tertiary = (uint8_t)((CE & coll->tertiaryMask));
6247 tertiary ^= coll->caseSwitch;
6248 reverseSecondary = TRUE;
6249 } else {
6250 tertiary = (uint8_t)((CE & UCOL_REMOVE_CONTINUATION));
6251 tertiary &= UCOL_REMOVE_CASE;
6252 reverseSecondary = FALSE;
6253 }
6254
6255 secondary = (uint8_t)((CE >>= 8) & UCOL_BYTE_SIZE_MASK);
6256 primary2 = (uint8_t)((CE >>= 8) & UCOL_BYTE_SIZE_MASK);
6257 primary1 = (uint8_t)(CE >> 8);
6258
6259 if(primary1 != 0) {
6260 if (coll->leadBytePermutationTable != NULL && !continuation) {
6261 primary1 = coll->leadBytePermutationTable[primary1];
6262 }
6263
6264 coll->latinOneCEs[ch] |= (primary1 << *primShift);
6265 *primShift -= 8;
6266 }
6267 if(primary2 != 0) {
6268 if(*primShift < 0) {
6269 coll->latinOneCEs[ch] = UCOL_BAIL_OUT_CE;
6270 coll->latinOneCEs[coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
6271 coll->latinOneCEs[2*coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
6272 return;
6273 }
6274 coll->latinOneCEs[ch] |= (primary2 << *primShift);
6275 *primShift -= 8;
6276 }
6277 if(secondary != 0) {
6278 if(reverseSecondary && coll->frenchCollation == UCOL_ON) { // reverse secondary
6279 coll->latinOneCEs[coll->latinOneTableLen+ch] >>= 8; // make space for secondary
6280 coll->latinOneCEs[coll->latinOneTableLen+ch] |= (secondary << 24);
6281 } else { // normal case
6282 coll->latinOneCEs[coll->latinOneTableLen+ch] |= (secondary << *secShift);
6283 }
6284 *secShift -= 8;
6285 }
6286 if(tertiary != 0) {
6287 coll->latinOneCEs[2*coll->latinOneTableLen+ch] |= (tertiary << *terShift);
6288 *terShift -= 8;
6289 }
6290 }
6291
6292 static inline UBool
ucol_resizeLatinOneTable(UCollator * coll,int32_t size,UErrorCode * status)6293 ucol_resizeLatinOneTable(UCollator *coll, int32_t size, UErrorCode *status) {
6294 uint32_t *newTable = (uint32_t *)uprv_malloc(size*sizeof(uint32_t)*3);
6295 if(newTable == NULL) {
6296 *status = U_MEMORY_ALLOCATION_ERROR;
6297 coll->latinOneFailed = TRUE;
6298 return FALSE;
6299 }
6300 int32_t sizeToCopy = ((size<coll->latinOneTableLen)?size:coll->latinOneTableLen)*sizeof(uint32_t);
6301 uprv_memset(newTable, 0, size*sizeof(uint32_t)*3);
6302 uprv_memcpy(newTable, coll->latinOneCEs, sizeToCopy);
6303 uprv_memcpy(newTable+size, coll->latinOneCEs+coll->latinOneTableLen, sizeToCopy);
6304 uprv_memcpy(newTable+2*size, coll->latinOneCEs+2*coll->latinOneTableLen, sizeToCopy);
6305 coll->latinOneTableLen = size;
6306 uprv_free(coll->latinOneCEs);
6307 coll->latinOneCEs = newTable;
6308 return TRUE;
6309 }
6310
/**
 * (Re)builds the Latin-1 fast-path table coll->latinOneCEs for the code units
 * 0..UCOL_ENDOFLATINONERANGE.
 *
 * The table consists of three consecutive slices of coll->latinOneTableLen
 * entries each; the entry for a code unit ch lives at ch, latinOneTableLen+ch
 * and 2*latinOneTableLen+ch (presumably one slice per primary/secondary/tertiary
 * level, matching the three shift counters handed to ucol_addLatinOneEntry).
 * Slots above UCOL_ENDOFLATINONERANGE hold the results of contractions that
 * start with one of the covered code units.
 *
 * @param coll   Collator whose table is built. The table is allocated here on
 *               first use and may be resized.
 * @param status Set to U_MEMORY_ALLOCATION_ERROR if the initial allocation fails
 *               and to U_UNSUPPORTED_ERROR if a contraction CE already has bits
 *               set in the 0x00FFF000 field. Other failure paths leave it as is.
 * @return TRUE if the table was built, FALSE otherwise. On the failure paths
 *         that go through cleanup_after_failure, coll->latinOneFailed is set.
 */
static UBool
ucol_setUpLatinOne(UCollator *coll, UErrorCode *status) {
    UBool result = TRUE;
    // First use: allocate room for three slices of UCOL_LATINONETABLELEN entries.
    if(coll->latinOneCEs == NULL) {
        coll->latinOneCEs = (uint32_t *)uprv_malloc(sizeof(uint32_t)*UCOL_LATINONETABLELEN*3);
        if(coll->latinOneCEs == NULL) {
            *status = U_MEMORY_ALLOCATION_ERROR;
            return FALSE;
        }
        coll->latinOneTableLen = UCOL_LATINONETABLELEN;
    }
    UChar ch = 0;
    // Element iterator used below to expand EXPANSION_TAG / DIGIT_TAG entries.
    UCollationElements *it = ucol_openElements(coll, &ch, 1, status);
    // Give up if the status is a failure (on entry or from opening the iterator).
    // Note that latinOneFailed is not set on this path.
    if (U_FAILURE(*status)) {
        ucol_closeElements(it);
        return FALSE;
    }
    // Start from an all-zero table (all three slices).
    uprv_memset(coll->latinOneCEs, 0, sizeof(uint32_t)*coll->latinOneTableLen*3);

    // Shift state passed to (and updated by) ucol_addLatinOneEntry; a negative
    // value is treated below as "no room left for this entry".
    int32_t primShift = 24, secShift = 24, terShift = 24;
    uint32_t CE = 0;
    // Next free slot for contraction results, just past the directly mapped range.
    int32_t contractionOffset = UCOL_ENDOFLATINONERANGE+1;

    // TODO: make safe if you get more than you wanted...
    for(ch = 0; ch <= UCOL_ENDOFLATINONERANGE; ch++) {
        primShift = 24; secShift = 24; terShift = 24;
        // Fetch the raw CE: direct array below 0x100, trie lookup (with UCA
        // fallback when the tailoring has no entry) above.
        if(ch < 0x100) {
            CE = coll->latinOneMapping[ch];
        } else {
            CE = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
            if(CE == UCOL_NOT_FOUND && coll->UCA) {
                CE = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);
            }
        }
        if(CE < UCOL_NOT_FOUND) {
            // Values below UCOL_NOT_FOUND are stored directly.
            ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift);
        } else {
            // Otherwise the CE carries a tag that selects special handling.
            switch (getCETag(CE)) {
            case EXPANSION_TAG:
            case DIGIT_TAG:
                // Let the element iterator produce the CEs for this code unit
                // and pack them one after another into the same table entry.
                ucol_setText(it, &ch, 1, status);
                while((int32_t)(CE = ucol_next(it, status)) != UCOL_NULLORDER) {
                    if(primShift < 0 || secShift < 0 || terShift < 0) {
                        // Out of room: mark the entry in all three slices so
                        // that users of the table bail out for this code unit.
                        coll->latinOneCEs[ch] = UCOL_BAIL_OUT_CE;
                        coll->latinOneCEs[coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
                        coll->latinOneCEs[2*coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
                        break;
                    }
                    ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift);
                }
                break;
            case CONTRACTION_TAG:
                // here is the trick
                // F2 is contraction. We do something very similar to contractions
                // but have two indices, one in the real contraction table and the
                // other to where we stuffed things. This hopes that we don't have
                // many contractions (this should work for latin-1 tables).
                {
                    // The 0x00FFF000 field is about to receive our own offset;
                    // if it is already in use, this CE cannot be represented.
                    if((CE & 0x00FFF000) != 0) {
                        *status = U_UNSUPPORTED_ERROR;
                        goto cleanup_after_failure;
                    }

                    const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE);

                    CE |= (contractionOffset & 0xFFF) << 12; // insert the offset in latin-1 table

                    // The same (augmented) contraction CE goes into all three slices.
                    coll->latinOneCEs[ch] = CE;
                    coll->latinOneCEs[coll->latinOneTableLen+ch] = CE;
                    coll->latinOneCEs[2*coll->latinOneTableLen+ch] = CE;

                    // We're going to jump into contraction table, pick the elements
                    // and use them
                    do {
                        // CE that corresponds to the current contraction table position.
                        CE = *(coll->contractionCEs +
                            (UCharOffset - coll->contractionIndex));
                        if(CE > UCOL_NOT_FOUND && getCETag(CE) == EXPANSION_TAG) {
                            uint32_t size;
                            uint32_t i;    /* general counter */
                            uint32_t *CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */
                            size = getExpansionCount(CE);
                            //CE = *CEOffset++;
                            if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */
                                for(i = 0; i<size; i++) {
                                    if(primShift < 0 || secShift < 0 || terShift < 0) {
                                        // Out of room: mark this contraction slot as bail-out.
                                        coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
                                        coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
                                        coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
                                        break;
                                    }
                                    ucol_addLatinOneEntry(coll, (UChar)contractionOffset, *CEOffset++, &primShift, &secShift, &terShift);
                                }
                            } else { /* else, we do */
                                // Zero-terminated expansion list.
                                while(*CEOffset != 0) {
                                    if(primShift < 0 || secShift < 0 || terShift < 0) {
                                        // Out of room: mark this contraction slot as bail-out.
                                        coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
                                        coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
                                        coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
                                        break;
                                    }
                                    ucol_addLatinOneEntry(coll, (UChar)contractionOffset, *CEOffset++, &primShift, &secShift, &terShift);
                                }
                            }
                            contractionOffset++;
                        } else if(CE < UCOL_NOT_FOUND) {
                            // Plain CE: store it directly in the next contraction slot.
                            ucol_addLatinOneEntry(coll, (UChar)contractionOffset++, CE, &primShift, &secShift, &terShift);
                        } else {
                            // Any other special CE inside a contraction is not
                            // handled by the fast path: mark the slot as bail-out.
                            coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
                            coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
                            coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
                            contractionOffset++;
                        }
                        UCharOffset++;
                        // Each contraction slot starts with fresh shift state.
                        primShift = 24; secShift = 24; terShift = 24;
                        if(contractionOffset == coll->latinOneTableLen) { // we need to reallocate
                            if(!ucol_resizeLatinOneTable(coll, 2*coll->latinOneTableLen, status)) {
                                goto cleanup_after_failure;
                            }
                        }
                    } while(*UCharOffset != 0xFFFF); // 0xFFFF ends this contraction's code unit list
                }
                break;;
            case SPEC_PROC_TAG:
                {
                    // 0xB7 is a precontext character defined in UCA5.1, a special
                    // handling is implemented in order to save the LatinOne table
                    // for most locales.
                    if (ch==0xb7) {
                        ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift);
                    }
                    else {
                        // Any other SPEC_PROC_TAG entry disables the fast path.
                        // *status is not modified here.
                        goto cleanup_after_failure;
                    }
                }
                break;
            default:
                // Unsupported tag: disable the fast path. *status is not modified here.
                goto cleanup_after_failure;
            }
        }
    }
    // compact table
    if(contractionOffset < coll->latinOneTableLen) {
        if(!ucol_resizeLatinOneTable(coll, contractionOffset, status)) {
            goto cleanup_after_failure;
        }
    }
    ucol_closeElements(it);
    return result;

cleanup_after_failure:
    // NOTE: *status is set explicitly only on the U_UNSUPPORTED_ERROR path above;
    // the SPEC_PROC_TAG and default cases arrive here without touching it, so
    // callers must use the FALSE return value (and latinOneFailed), not *status,
    // to detect that the table could not be built.
    coll->latinOneFailed = TRUE;
    ucol_closeElements(it);
    return FALSE;
}
6467
ucol_updateInternalState(UCollator * coll,UErrorCode * status)6468 void ucol_updateInternalState(UCollator *coll, UErrorCode *status) {
6469 if(U_SUCCESS(*status)) {
6470 if(coll->caseFirst == UCOL_UPPER_FIRST) {
6471 coll->caseSwitch = UCOL_CASE_SWITCH;
6472 } else {
6473 coll->caseSwitch = UCOL_NO_CASE_SWITCH;
6474 }
6475
6476 if(coll->caseLevel == UCOL_ON || coll->caseFirst == UCOL_OFF) {
6477 coll->tertiaryMask = UCOL_REMOVE_CASE;
6478 coll->tertiaryCommon = UCOL_COMMON3_NORMAL;
6479 coll->tertiaryAddition = (int8_t)UCOL_FLAG_BIT_MASK_CASE_SW_OFF; /* Should be 0x80 */
6480 coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_OFF;
6481 coll->tertiaryBottom = UCOL_COMMON_BOT3;
6482 } else {
6483 coll->tertiaryMask = UCOL_KEEP_CASE;
6484 coll->tertiaryAddition = UCOL_FLAG_BIT_MASK_CASE_SW_ON;
6485 if(coll->caseFirst == UCOL_UPPER_FIRST) {
6486 coll->tertiaryCommon = UCOL_COMMON3_UPPERFIRST;
6487 coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_UPPER;
6488 coll->tertiaryBottom = UCOL_COMMON_BOTTOM3_CASE_SW_UPPER;
6489 } else {
6490 coll->tertiaryCommon = UCOL_COMMON3_NORMAL;
6491 coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_LOWER;
6492 coll->tertiaryBottom = UCOL_COMMON_BOTTOM3_CASE_SW_LOWER;
6493 }
6494 }
6495
6496 /* Set the compression values */
6497 uint8_t tertiaryTotal = (uint8_t)(coll->tertiaryTop - coll->tertiaryBottom - 1);
6498 coll->tertiaryTopCount = (uint8_t)(UCOL_PROPORTION3*tertiaryTotal); /* we multilply double with int, but need only int */
6499 coll->tertiaryBottomCount = (uint8_t)(tertiaryTotal - coll->tertiaryTopCount);
6500
6501 if(coll->caseLevel == UCOL_OFF && coll->strength == UCOL_TERTIARY
6502 && coll->frenchCollation == UCOL_OFF && coll->alternateHandling == UCOL_NON_IGNORABLE)
6503 {
6504 coll->sortKeyGen = ucol_calcSortKeySimpleTertiary;
6505 } else {
6506 coll->sortKeyGen = ucol_calcSortKey;
6507 }
6508 if(coll->caseLevel == UCOL_OFF && coll->strength <= UCOL_TERTIARY && coll->numericCollation == UCOL_OFF
6509 && coll->alternateHandling == UCOL_NON_IGNORABLE && !coll->latinOneFailed)
6510 {
6511 if(coll->latinOneCEs == NULL || coll->latinOneRegenTable) {
6512 if(ucol_setUpLatinOne(coll, status)) { // if we succeed in building latin1 table, we'll use it
6513 //fprintf(stderr, "F");
6514 coll->latinOneUse = TRUE;
6515 } else {
6516 coll->latinOneUse = FALSE;
6517 }
6518 if(*status == U_UNSUPPORTED_ERROR) {
6519 *status = U_ZERO_ERROR;
6520 }
6521 } else { // latin1Table exists and it doesn't need to be regenerated, just use it
6522 coll->latinOneUse = TRUE;
6523 }
6524 } else {
6525 coll->latinOneUse = FALSE;
6526 }
6527 }
6528 }
6529
6530 U_CAPI uint32_t U_EXPORT2
ucol_setVariableTop(UCollator * coll,const UChar * varTop,int32_t len,UErrorCode * status)6531 ucol_setVariableTop(UCollator *coll, const UChar *varTop, int32_t len, UErrorCode *status) {
6532 if(U_FAILURE(*status) || coll == NULL) {
6533 return 0;
6534 }
6535 if(len == -1) {
6536 len = u_strlen(varTop);
6537 }
6538 if(len == 0) {
6539 *status = U_ILLEGAL_ARGUMENT_ERROR;
6540 return 0;
6541 }
6542
6543 if(coll->delegate!=NULL) {
6544 return ((Collator*)coll->delegate)->setVariableTop(varTop, len, *status);
6545 }
6546
6547
6548 collIterate s;
6549 IInit_collIterate(coll, varTop, len, &s, status);
6550 if(U_FAILURE(*status)) {
6551 return 0;
6552 }
6553
6554 uint32_t CE = ucol_IGetNextCE(coll, &s, status);
6555
6556 /* here we check if we have consumed all characters */
6557 /* you can put in either one character or a contraction */
6558 /* you shouldn't put more... */
6559 if(s.pos != s.endp || CE == UCOL_NO_MORE_CES) {
6560 *status = U_CE_NOT_FOUND_ERROR;
6561 return 0;
6562 }
6563
6564 uint32_t nextCE = ucol_IGetNextCE(coll, &s, status);
6565
6566 if(isContinuation(nextCE) && (nextCE & UCOL_PRIMARYMASK) != 0) {
6567 *status = U_PRIMARY_TOO_LONG_ERROR;
6568 return 0;
6569 }
6570 if(coll->variableTopValue != (CE & UCOL_PRIMARYMASK)>>16) {
6571 coll->variableTopValueisDefault = FALSE;
6572 coll->variableTopValue = (CE & UCOL_PRIMARYMASK)>>16;
6573 }
6574
6575 /* To avoid memory leak, free the offset buffer if necessary. */
6576 ucol_freeOffsetBuffer(&s);
6577
6578 return CE & UCOL_PRIMARYMASK;
6579 }
6580
ucol_getVariableTop(const UCollator * coll,UErrorCode * status)6581 U_CAPI uint32_t U_EXPORT2 ucol_getVariableTop(const UCollator *coll, UErrorCode *status) {
6582 if(U_FAILURE(*status) || coll == NULL) {
6583 return 0;
6584 }
6585 if(coll->delegate!=NULL) {
6586 return ((const Collator*)coll->delegate)->getVariableTop(*status);
6587 }
6588 return coll->variableTopValue<<16;
6589 }
6590
6591 U_CAPI void U_EXPORT2
ucol_restoreVariableTop(UCollator * coll,const uint32_t varTop,UErrorCode * status)6592 ucol_restoreVariableTop(UCollator *coll, const uint32_t varTop, UErrorCode *status) {
6593 if(U_FAILURE(*status) || coll == NULL) {
6594 return;
6595 }
6596
6597 if(coll->variableTopValue != (varTop & UCOL_PRIMARYMASK)>>16) {
6598 coll->variableTopValueisDefault = FALSE;
6599 coll->variableTopValue = (varTop & UCOL_PRIMARYMASK)>>16;
6600 }
6601 }
6602 /* Attribute setter API */
6603 U_CAPI void U_EXPORT2
ucol_setAttribute(UCollator * coll,UColAttribute attr,UColAttributeValue value,UErrorCode * status)6604 ucol_setAttribute(UCollator *coll, UColAttribute attr, UColAttributeValue value, UErrorCode *status) {
6605 if(U_FAILURE(*status) || coll == NULL) {
6606 return;
6607 }
6608
6609 if(coll->delegate != NULL) {
6610 ((Collator*)coll->delegate)->setAttribute(attr,value,*status);
6611 return;
6612 }
6613
6614 UColAttributeValue oldFrench = coll->frenchCollation;
6615 UColAttributeValue oldCaseFirst = coll->caseFirst;
6616 switch(attr) {
6617 case UCOL_NUMERIC_COLLATION: /* sort substrings of digits as numbers */
6618 if(value == UCOL_ON) {
6619 coll->numericCollation = UCOL_ON;
6620 coll->numericCollationisDefault = FALSE;
6621 } else if (value == UCOL_OFF) {
6622 coll->numericCollation = UCOL_OFF;
6623 coll->numericCollationisDefault = FALSE;
6624 } else if (value == UCOL_DEFAULT) {
6625 coll->numericCollationisDefault = TRUE;
6626 coll->numericCollation = (UColAttributeValue)coll->options->numericCollation;
6627 } else {
6628 *status = U_ILLEGAL_ARGUMENT_ERROR;
6629 }
6630 break;
6631 case UCOL_HIRAGANA_QUATERNARY_MODE: /* special quaternary values for Hiragana */
6632 if(value == UCOL_ON || value == UCOL_OFF || value == UCOL_DEFAULT) {
6633 // This attribute is an implementation detail of the CLDR Japanese tailoring.
6634 // The implementation might change to use a different mechanism
6635 // to achieve the same Japanese sort order.
6636 // Since ICU 50, this attribute is not settable any more via API functions.
6637 } else {
6638 *status = U_ILLEGAL_ARGUMENT_ERROR;
6639 }
6640 break;
6641 case UCOL_FRENCH_COLLATION: /* attribute for direction of secondary weights*/
6642 if(value == UCOL_ON) {
6643 coll->frenchCollation = UCOL_ON;
6644 coll->frenchCollationisDefault = FALSE;
6645 } else if (value == UCOL_OFF) {
6646 coll->frenchCollation = UCOL_OFF;
6647 coll->frenchCollationisDefault = FALSE;
6648 } else if (value == UCOL_DEFAULT) {
6649 coll->frenchCollationisDefault = TRUE;
6650 coll->frenchCollation = (UColAttributeValue)coll->options->frenchCollation;
6651 } else {
6652 *status = U_ILLEGAL_ARGUMENT_ERROR ;
6653 }
6654 break;
6655 case UCOL_ALTERNATE_HANDLING: /* attribute for handling variable elements*/
6656 if(value == UCOL_SHIFTED) {
6657 coll->alternateHandling = UCOL_SHIFTED;
6658 coll->alternateHandlingisDefault = FALSE;
6659 } else if (value == UCOL_NON_IGNORABLE) {
6660 coll->alternateHandling = UCOL_NON_IGNORABLE;
6661 coll->alternateHandlingisDefault = FALSE;
6662 } else if (value == UCOL_DEFAULT) {
6663 coll->alternateHandlingisDefault = TRUE;
6664 coll->alternateHandling = (UColAttributeValue)coll->options->alternateHandling ;
6665 } else {
6666 *status = U_ILLEGAL_ARGUMENT_ERROR ;
6667 }
6668 break;
6669 case UCOL_CASE_FIRST: /* who goes first, lower case or uppercase */
6670 if(value == UCOL_LOWER_FIRST) {
6671 coll->caseFirst = UCOL_LOWER_FIRST;
6672 coll->caseFirstisDefault = FALSE;
6673 } else if (value == UCOL_UPPER_FIRST) {
6674 coll->caseFirst = UCOL_UPPER_FIRST;
6675 coll->caseFirstisDefault = FALSE;
6676 } else if (value == UCOL_OFF) {
6677 coll->caseFirst = UCOL_OFF;
6678 coll->caseFirstisDefault = FALSE;
6679 } else if (value == UCOL_DEFAULT) {
6680 coll->caseFirst = (UColAttributeValue)coll->options->caseFirst;
6681 coll->caseFirstisDefault = TRUE;
6682 } else {
6683 *status = U_ILLEGAL_ARGUMENT_ERROR ;
6684 }
6685 break;
6686 case UCOL_CASE_LEVEL: /* do we have an extra case level */
6687 if(value == UCOL_ON) {
6688 coll->caseLevel = UCOL_ON;
6689 coll->caseLevelisDefault = FALSE;
6690 } else if (value == UCOL_OFF) {
6691 coll->caseLevel = UCOL_OFF;
6692 coll->caseLevelisDefault = FALSE;
6693 } else if (value == UCOL_DEFAULT) {
6694 coll->caseLevel = (UColAttributeValue)coll->options->caseLevel;
6695 coll->caseLevelisDefault = TRUE;
6696 } else {
6697 *status = U_ILLEGAL_ARGUMENT_ERROR ;
6698 }
6699 break;
6700 case UCOL_NORMALIZATION_MODE: /* attribute for normalization */
6701 if(value == UCOL_ON) {
6702 coll->normalizationMode = UCOL_ON;
6703 coll->normalizationModeisDefault = FALSE;
6704 initializeFCD(status);
6705 } else if (value == UCOL_OFF) {
6706 coll->normalizationMode = UCOL_OFF;
6707 coll->normalizationModeisDefault = FALSE;
6708 } else if (value == UCOL_DEFAULT) {
6709 coll->normalizationModeisDefault = TRUE;
6710 coll->normalizationMode = (UColAttributeValue)coll->options->normalizationMode;
6711 if(coll->normalizationMode == UCOL_ON) {
6712 initializeFCD(status);
6713 }
6714 } else {
6715 *status = U_ILLEGAL_ARGUMENT_ERROR ;
6716 }
6717 break;
6718 case UCOL_STRENGTH: /* attribute for strength */
6719 if (value == UCOL_DEFAULT) {
6720 coll->strengthisDefault = TRUE;
6721 coll->strength = (UColAttributeValue)coll->options->strength;
6722 } else if (value <= UCOL_IDENTICAL) {
6723 coll->strengthisDefault = FALSE;
6724 coll->strength = value;
6725 } else {
6726 *status = U_ILLEGAL_ARGUMENT_ERROR ;
6727 }
6728 break;
6729 case UCOL_ATTRIBUTE_COUNT:
6730 default:
6731 *status = U_ILLEGAL_ARGUMENT_ERROR;
6732 break;
6733 }
6734 if(oldFrench != coll->frenchCollation || oldCaseFirst != coll->caseFirst) {
6735 coll->latinOneRegenTable = TRUE;
6736 } else {
6737 coll->latinOneRegenTable = FALSE;
6738 }
6739 ucol_updateInternalState(coll, status);
6740 }
6741
6742 U_CAPI UColAttributeValue U_EXPORT2
ucol_getAttribute(const UCollator * coll,UColAttribute attr,UErrorCode * status)6743 ucol_getAttribute(const UCollator *coll, UColAttribute attr, UErrorCode *status) {
6744 if(U_FAILURE(*status) || coll == NULL) {
6745 return UCOL_DEFAULT;
6746 }
6747
6748 if(coll->delegate != NULL) {
6749 return ((Collator*)coll->delegate)->getAttribute(attr,*status);
6750 }
6751
6752 switch(attr) {
6753 case UCOL_NUMERIC_COLLATION:
6754 return coll->numericCollation;
6755 case UCOL_HIRAGANA_QUATERNARY_MODE:
6756 return coll->hiraganaQ;
6757 case UCOL_FRENCH_COLLATION: /* attribute for direction of secondary weights*/
6758 return coll->frenchCollation;
6759 case UCOL_ALTERNATE_HANDLING: /* attribute for handling variable elements*/
6760 return coll->alternateHandling;
6761 case UCOL_CASE_FIRST: /* who goes first, lower case or uppercase */
6762 return coll->caseFirst;
6763 case UCOL_CASE_LEVEL: /* do we have an extra case level */
6764 return coll->caseLevel;
6765 case UCOL_NORMALIZATION_MODE: /* attribute for normalization */
6766 return coll->normalizationMode;
6767 case UCOL_STRENGTH: /* attribute for strength */
6768 return coll->strength;
6769 case UCOL_ATTRIBUTE_COUNT:
6770 default:
6771 *status = U_ILLEGAL_ARGUMENT_ERROR;
6772 break;
6773 }
6774 return UCOL_DEFAULT;
6775 }
6776
6777 U_CAPI void U_EXPORT2
ucol_setStrength(UCollator * coll,UCollationStrength strength)6778 ucol_setStrength( UCollator *coll,
6779 UCollationStrength strength)
6780 {
6781 UErrorCode status = U_ZERO_ERROR;
6782 ucol_setAttribute(coll, UCOL_STRENGTH, strength, &status);
6783 }
6784
6785 U_CAPI UCollationStrength U_EXPORT2
ucol_getStrength(const UCollator * coll)6786 ucol_getStrength(const UCollator *coll)
6787 {
6788 UErrorCode status = U_ZERO_ERROR;
6789 return ucol_getAttribute(coll, UCOL_STRENGTH, &status);
6790 }
6791
6792 U_CAPI int32_t U_EXPORT2
ucol_getReorderCodes(const UCollator * coll,int32_t * dest,int32_t destCapacity,UErrorCode * status)6793 ucol_getReorderCodes(const UCollator *coll,
6794 int32_t *dest,
6795 int32_t destCapacity,
6796 UErrorCode *status) {
6797 if (U_FAILURE(*status)) {
6798 return 0;
6799 }
6800
6801 if(coll->delegate!=NULL) {
6802 return ((const Collator*)coll->delegate)->getReorderCodes(dest, destCapacity, *status);
6803 }
6804
6805 if (destCapacity < 0 || (destCapacity > 0 && dest == NULL)) {
6806 *status = U_ILLEGAL_ARGUMENT_ERROR;
6807 return 0;
6808 }
6809
6810 #ifdef UCOL_DEBUG
6811 printf("coll->reorderCodesLength = %d\n", coll->reorderCodesLength);
6812 printf("coll->defaultReorderCodesLength = %d\n", coll->defaultReorderCodesLength);
6813 #endif
6814
6815 if (coll->reorderCodesLength > destCapacity) {
6816 *status = U_BUFFER_OVERFLOW_ERROR;
6817 return coll->reorderCodesLength;
6818 }
6819 for (int32_t i = 0; i < coll->reorderCodesLength; i++) {
6820 dest[i] = coll->reorderCodes[i];
6821 }
6822 return coll->reorderCodesLength;
6823 }
6824
6825 U_CAPI void U_EXPORT2
ucol_setReorderCodes(UCollator * coll,const int32_t * reorderCodes,int32_t reorderCodesLength,UErrorCode * status)6826 ucol_setReorderCodes(UCollator* coll,
6827 const int32_t* reorderCodes,
6828 int32_t reorderCodesLength,
6829 UErrorCode *status) {
6830 if (U_FAILURE(*status)) {
6831 return;
6832 }
6833
6834 if (reorderCodesLength < 0 || (reorderCodesLength > 0 && reorderCodes == NULL)) {
6835 *status = U_ILLEGAL_ARGUMENT_ERROR;
6836 return;
6837 }
6838
6839 if(coll->delegate!=NULL) {
6840 ((Collator*)coll->delegate)->setReorderCodes(reorderCodes, reorderCodesLength, *status);
6841 return;
6842 }
6843
6844 if (coll->reorderCodes != NULL && coll->freeReorderCodesOnClose == TRUE) {
6845 uprv_free(coll->reorderCodes);
6846 }
6847 coll->reorderCodes = NULL;
6848 coll->reorderCodesLength = 0;
6849 if (reorderCodesLength == 0) {
6850 if (coll->leadBytePermutationTable != NULL && coll->freeLeadBytePermutationTableOnClose == TRUE) {
6851 uprv_free(coll->leadBytePermutationTable);
6852 }
6853 coll->leadBytePermutationTable = NULL;
6854 return;
6855 }
6856 coll->reorderCodes = (int32_t*) uprv_malloc(reorderCodesLength * sizeof(int32_t));
6857 if (coll->reorderCodes == NULL) {
6858 *status = U_MEMORY_ALLOCATION_ERROR;
6859 return;
6860 }
6861 coll->freeReorderCodesOnClose = TRUE;
6862 for (int32_t i = 0; i < reorderCodesLength; i++) {
6863 coll->reorderCodes[i] = reorderCodes[i];
6864 }
6865 coll->reorderCodesLength = reorderCodesLength;
6866 ucol_buildPermutationTable(coll, status);
6867 }
6868
6869 U_CAPI int32_t U_EXPORT2
ucol_getEquivalentReorderCodes(int32_t reorderCode,int32_t * dest,int32_t destCapacity,UErrorCode * pErrorCode)6870 ucol_getEquivalentReorderCodes(int32_t reorderCode,
6871 int32_t* dest,
6872 int32_t destCapacity,
6873 UErrorCode *pErrorCode) {
6874 bool equivalentCodesSet[USCRIPT_CODE_LIMIT];
6875 uint16_t leadBytes[256];
6876 int leadBytesCount;
6877 int leadByteIndex;
6878 int16_t reorderCodesForLeadByte[USCRIPT_CODE_LIMIT];
6879 int reorderCodesForLeadByteCount;
6880 int reorderCodeIndex;
6881
6882 int32_t equivalentCodesCount = 0;
6883 int setIndex;
6884
6885 if (U_FAILURE(*pErrorCode)) {
6886 return 0;
6887 }
6888
6889 if (destCapacity < 0 || (destCapacity > 0 && dest == NULL)) {
6890 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
6891 return 0;
6892 }
6893
6894 uprv_memset(equivalentCodesSet, 0, USCRIPT_CODE_LIMIT * sizeof(bool));
6895
6896 const UCollator* uca = ucol_initUCA(pErrorCode);
6897 if (U_FAILURE(*pErrorCode)) {
6898 return 0;
6899 }
6900 leadBytesCount = ucol_getLeadBytesForReorderCode(uca, reorderCode, leadBytes, 256);
6901 for (leadByteIndex = 0; leadByteIndex < leadBytesCount; leadByteIndex++) {
6902 reorderCodesForLeadByteCount = ucol_getReorderCodesForLeadByte(
6903 uca, leadBytes[leadByteIndex], reorderCodesForLeadByte, USCRIPT_CODE_LIMIT);
6904 for (reorderCodeIndex = 0; reorderCodeIndex < reorderCodesForLeadByteCount; reorderCodeIndex++) {
6905 equivalentCodesSet[reorderCodesForLeadByte[reorderCodeIndex]] = true;
6906 }
6907 }
6908
6909 for (setIndex = 0; setIndex < USCRIPT_CODE_LIMIT; setIndex++) {
6910 if (equivalentCodesSet[setIndex] == true) {
6911 equivalentCodesCount++;
6912 }
6913 }
6914
6915 if (destCapacity == 0) {
6916 return equivalentCodesCount;
6917 }
6918
6919 equivalentCodesCount = 0;
6920 for (setIndex = 0; setIndex < USCRIPT_CODE_LIMIT; setIndex++) {
6921 if (equivalentCodesSet[setIndex] == true) {
6922 dest[equivalentCodesCount++] = setIndex;
6923 if (equivalentCodesCount >= destCapacity) {
6924 break;
6925 }
6926 }
6927 }
6928 return equivalentCodesCount;
6929 }
6930
6931
6932 /****************************************************************************/
6933 /* Following are misc functions */
6934 /* there are new APIs and some compatibility APIs */
6935 /****************************************************************************/
6936
6937 U_CAPI void U_EXPORT2
ucol_getVersion(const UCollator * coll,UVersionInfo versionInfo)6938 ucol_getVersion(const UCollator* coll,
6939 UVersionInfo versionInfo)
6940 {
6941 if(coll->delegate!=NULL) {
6942 ((const Collator*)coll->delegate)->getVersion(versionInfo);
6943 return;
6944 }
6945 /* RunTime version */
6946 uint8_t rtVersion = UCOL_RUNTIME_VERSION;
6947 /* Builder version*/
6948 uint8_t bdVersion = coll->image->version[0];
6949
6950 /* Charset Version. Need to get the version from cnv files
6951 * makeconv should populate cnv files with version and
6952 * an api has to be provided in ucnv.h to obtain this version
6953 */
6954 uint8_t csVersion = 0;
6955
6956 /* combine the version info */
6957 uint16_t cmbVersion = (uint16_t)((rtVersion<<11) | (bdVersion<<6) | (csVersion));
6958
6959 /* Tailoring rules */
6960 versionInfo[0] = (uint8_t)(cmbVersion>>8);
6961 versionInfo[1] = (uint8_t)cmbVersion;
6962 versionInfo[2] = coll->image->version[1];
6963 if(coll->UCA) {
6964 /* Include the minor number when getting the UCA version. (major & 1f) << 3 | (minor & 7) */
6965 versionInfo[3] = (coll->UCA->image->UCAVersion[0] & 0x1f) << 3 | (coll->UCA->image->UCAVersion[1] & 0x07);
6966 } else {
6967 versionInfo[3] = 0;
6968 }
6969 }
6970
6971
6972 /* This internal API checks whether a character is tailored or not */
6973 U_CAPI UBool U_EXPORT2
ucol_isTailored(const UCollator * coll,const UChar u,UErrorCode * status)6974 ucol_isTailored(const UCollator *coll, const UChar u, UErrorCode *status) {
6975 if(U_FAILURE(*status) || coll == NULL || coll == coll->UCA) {
6976 return FALSE;
6977 }
6978
6979 uint32_t CE = UCOL_NOT_FOUND;
6980 const UChar *ContractionStart = NULL;
6981 if(u < 0x100) { /* latin-1 */
6982 CE = coll->latinOneMapping[u];
6983 if(coll->UCA && CE == coll->UCA->latinOneMapping[u]) {
6984 return FALSE;
6985 }
6986 } else { /* regular */
6987 CE = UTRIE_GET32_FROM_LEAD(&coll->mapping, u);
6988 }
6989
6990 if(isContraction(CE)) {
6991 ContractionStart = (UChar *)coll->image+getContractOffset(CE);
6992 CE = *(coll->contractionCEs + (ContractionStart- coll->contractionIndex));
6993 }
6994
6995 return (UBool)(CE != UCOL_NOT_FOUND);
6996 }
6997
6998
6999 /****************************************************************************/
7000 /* Following are the string compare functions */
7001 /* */
7002 /****************************************************************************/
7003
7004
7005 /* ucol_checkIdent internal function. Does byte level string compare. */
7006 /* Used by strcoll if strength == identical and strings */
7007 /* are otherwise equal. */
7008 /* */
7009 /* Comparison must be done on NFD normalized strings. */
7010 /* FCD is not good enough. */
7011
7012 static
ucol_checkIdent(collIterate * sColl,collIterate * tColl,UBool normalize,UErrorCode * status)7013 UCollationResult ucol_checkIdent(collIterate *sColl, collIterate *tColl, UBool normalize, UErrorCode *status)
7014 {
7015 // When we arrive here, we can have normal strings or UCharIterators. Currently they are both
7016 // of same type, but that doesn't really mean that it will stay that way.
7017 int32_t comparison;
7018
7019 if (sColl->flags & UCOL_USE_ITERATOR) {
7020 // The division for the array length may truncate the array size to
7021 // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
7022 // for all platforms anyway.
7023 UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
7024 UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
7025 UNormIterator *sNIt = NULL, *tNIt = NULL;
7026 sNIt = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), status);
7027 tNIt = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), status);
7028 sColl->iterator->move(sColl->iterator, 0, UITER_START);
7029 tColl->iterator->move(tColl->iterator, 0, UITER_START);
7030 UCharIterator *sIt = unorm_setIter(sNIt, sColl->iterator, UNORM_NFD, status);
7031 UCharIterator *tIt = unorm_setIter(tNIt, tColl->iterator, UNORM_NFD, status);
7032 comparison = u_strCompareIter(sIt, tIt, TRUE);
7033 unorm_closeIter(sNIt);
7034 unorm_closeIter(tNIt);
7035 } else {
7036 int32_t sLen = (sColl->flags & UCOL_ITER_HASLEN) ? (int32_t)(sColl->endp - sColl->string) : -1;
7037 const UChar *sBuf = sColl->string;
7038 int32_t tLen = (tColl->flags & UCOL_ITER_HASLEN) ? (int32_t)(tColl->endp - tColl->string) : -1;
7039 const UChar *tBuf = tColl->string;
7040
7041 if (normalize) {
7042 *status = U_ZERO_ERROR;
7043 // Note: We could use Normalizer::compare() or similar, but for short strings
7044 // which may not be in FCD it might be faster to just NFD them.
7045 // Note: spanQuickCheckYes() + normalizeSecondAndAppend() rather than
7046 // NFD'ing immediately might be faster for long strings,
7047 // but string comparison is usually done on relatively short strings.
7048 sColl->nfd->normalize(UnicodeString((sColl->flags & UCOL_ITER_HASLEN) == 0, sBuf, sLen),
7049 sColl->writableBuffer,
7050 *status);
7051 tColl->nfd->normalize(UnicodeString((tColl->flags & UCOL_ITER_HASLEN) == 0, tBuf, tLen),
7052 tColl->writableBuffer,
7053 *status);
7054 if(U_FAILURE(*status)) {
7055 return UCOL_LESS;
7056 }
7057 comparison = sColl->writableBuffer.compareCodePointOrder(tColl->writableBuffer);
7058 } else {
7059 comparison = u_strCompare(sBuf, sLen, tBuf, tLen, TRUE);
7060 }
7061 }
7062
7063 if (comparison < 0) {
7064 return UCOL_LESS;
7065 } else if (comparison == 0) {
7066 return UCOL_EQUAL;
7067 } else /* comparison > 0 */ {
7068 return UCOL_GREATER;
7069 }
7070 }
7071
/* CEBuf - A struct and some inline functions to handle the saving          */
/*         of CEs in a buffer within ucol_strcoll                           */
/*                                                                          */
/* The buffer starts out in the embedded localArray and is replaced by a    */
/* heap block of twice the size by ucol_CEBuf_Expand whenever it is full.   */

/* Number of CEs that fit into the embedded array before the heap is used. */
#define UCOL_CEBUF_SIZE 512
typedef struct ucol_CEBuf {
    uint32_t    *buf;    /* start of the current storage: localArray or a heap block */
    uint32_t    *endp;   /* one past the last usable element of the current storage  */
    uint32_t    *pos;    /* position where the next CE will be written               */
    uint32_t     localArray[UCOL_CEBUF_SIZE]; /* initial storage embedded in the struct */
} ucol_CEBuf;
7082
7083
7084 static
UCOL_INIT_CEBUF(ucol_CEBuf * b)7085 inline void UCOL_INIT_CEBUF(ucol_CEBuf *b) {
7086 (b)->buf = (b)->pos = (b)->localArray;
7087 (b)->endp = (b)->buf + UCOL_CEBUF_SIZE;
7088 }
7089
7090 static
ucol_CEBuf_Expand(ucol_CEBuf * b,collIterate * ci,UErrorCode * status)7091 void ucol_CEBuf_Expand(ucol_CEBuf *b, collIterate *ci, UErrorCode *status) {
7092 uint32_t oldSize;
7093 uint32_t newSize;
7094 uint32_t *newBuf;
7095
7096 ci->flags |= UCOL_ITER_ALLOCATED;
7097 oldSize = (uint32_t)(b->pos - b->buf);
7098 newSize = oldSize * 2;
7099 newBuf = (uint32_t *)uprv_malloc(newSize * sizeof(uint32_t));
7100 if(newBuf == NULL) {
7101 *status = U_MEMORY_ALLOCATION_ERROR;
7102 }
7103 else {
7104 uprv_memcpy(newBuf, b->buf, oldSize * sizeof(uint32_t));
7105 if (b->buf != b->localArray) {
7106 uprv_free(b->buf);
7107 }
7108 b->buf = newBuf;
7109 b->endp = b->buf + newSize;
7110 b->pos = b->buf + oldSize;
7111 }
7112 }
7113
7114 static
UCOL_CEBUF_PUT(ucol_CEBuf * b,uint32_t ce,collIterate * ci,UErrorCode * status)7115 inline void UCOL_CEBUF_PUT(ucol_CEBuf *b, uint32_t ce, collIterate *ci, UErrorCode *status) {
7116 if (b->pos == b->endp) {
7117 ucol_CEBuf_Expand(b, ci, status);
7118 }
7119 if (U_SUCCESS(*status)) {
7120 *(b)->pos++ = ce;
7121 }
7122 }
7123
7124 /* This is a trick string compare function that goes in and uses sortkeys to compare */
7125 /* It is used when compare gets in trouble and needs to bail out */
ucol_compareUsingSortKeys(collIterate * sColl,collIterate * tColl,UErrorCode * status)7126 static UCollationResult ucol_compareUsingSortKeys(collIterate *sColl,
7127 collIterate *tColl,
7128 UErrorCode *status)
7129 {
7130 uint8_t sourceKey[UCOL_MAX_BUFFER], targetKey[UCOL_MAX_BUFFER];
7131 uint8_t *sourceKeyP = sourceKey;
7132 uint8_t *targetKeyP = targetKey;
7133 int32_t sourceKeyLen = UCOL_MAX_BUFFER, targetKeyLen = UCOL_MAX_BUFFER;
7134 const UCollator *coll = sColl->coll;
7135 const UChar *source = NULL;
7136 const UChar *target = NULL;
7137 int32_t result = UCOL_EQUAL;
7138 UnicodeString sourceString, targetString;
7139 int32_t sourceLength;
7140 int32_t targetLength;
7141
7142 if(sColl->flags & UCOL_USE_ITERATOR) {
7143 sColl->iterator->move(sColl->iterator, 0, UITER_START);
7144 tColl->iterator->move(tColl->iterator, 0, UITER_START);
7145 UChar32 c;
7146 while((c=sColl->iterator->next(sColl->iterator))>=0) {
7147 sourceString.append((UChar)c);
7148 }
7149 while((c=tColl->iterator->next(tColl->iterator))>=0) {
7150 targetString.append((UChar)c);
7151 }
7152 source = sourceString.getBuffer();
7153 sourceLength = sourceString.length();
7154 target = targetString.getBuffer();
7155 targetLength = targetString.length();
7156 } else { // no iterators
7157 sourceLength = (sColl->flags&UCOL_ITER_HASLEN)?(int32_t)(sColl->endp-sColl->string):-1;
7158 targetLength = (tColl->flags&UCOL_ITER_HASLEN)?(int32_t)(tColl->endp-tColl->string):-1;
7159 source = sColl->string;
7160 target = tColl->string;
7161 }
7162
7163
7164
7165 sourceKeyLen = ucol_getSortKey(coll, source, sourceLength, sourceKeyP, sourceKeyLen);
7166 if(sourceKeyLen > UCOL_MAX_BUFFER) {
7167 sourceKeyP = (uint8_t*)uprv_malloc(sourceKeyLen*sizeof(uint8_t));
7168 if(sourceKeyP == NULL) {
7169 *status = U_MEMORY_ALLOCATION_ERROR;
7170 goto cleanup_and_do_compare;
7171 }
7172 sourceKeyLen = ucol_getSortKey(coll, source, sourceLength, sourceKeyP, sourceKeyLen);
7173 }
7174
7175 targetKeyLen = ucol_getSortKey(coll, target, targetLength, targetKeyP, targetKeyLen);
7176 if(targetKeyLen > UCOL_MAX_BUFFER) {
7177 targetKeyP = (uint8_t*)uprv_malloc(targetKeyLen*sizeof(uint8_t));
7178 if(targetKeyP == NULL) {
7179 *status = U_MEMORY_ALLOCATION_ERROR;
7180 goto cleanup_and_do_compare;
7181 }
7182 targetKeyLen = ucol_getSortKey(coll, target, targetLength, targetKeyP, targetKeyLen);
7183 }
7184
7185 result = uprv_strcmp((const char*)sourceKeyP, (const char*)targetKeyP);
7186
7187 cleanup_and_do_compare:
7188 if(sourceKeyP != NULL && sourceKeyP != sourceKey) {
7189 uprv_free(sourceKeyP);
7190 }
7191
7192 if(targetKeyP != NULL && targetKeyP != targetKey) {
7193 uprv_free(targetKeyP);
7194 }
7195
7196 if(result<0) {
7197 return UCOL_LESS;
7198 } else if(result>0) {
7199 return UCOL_GREATER;
7200 } else {
7201 return UCOL_EQUAL;
7202 }
7203 }
7204
7205
/**
 * Compares the texts behind two prepared collation iterators, one collation
 * level at a time, using the collator stored in sColl->coll.
 *
 * Primary weights are compared while the CEs are fetched with
 * ucol_IGetNextCE(); every fetched CE is also stored in a per-string CE
 * buffer. If the primaries are equal, the buffers are re-scanned for the
 * secondary level (backwards when French secondary ordering is on), the case
 * level, the tertiary level and the shifted quaternary level, as enabled by
 * the collator attributes read at the top of the function. For
 * UCOL_IDENTICAL strength the final tie-breaker is ucol_checkIdent().
 *
 * When Hiragana quaternary handling and shifted alternate handling are both
 * active, the whole comparison is delegated to ucol_compareUsingSortKeys().
 *
 * @param sColl  iterator over the source text; also supplies the collator
 * @param tColl  iterator over the target text
 * @param status ICU error code, passed through to the CE fetching helpers
 * @return UCOL_LESS, UCOL_EQUAL or UCOL_GREATER
 */
static UCollationResult
ucol_strcollRegular(collIterate *sColl, collIterate *tColl, UErrorCode *status)
{
    U_ALIGN_CODE(16);

    const UCollator *coll = sColl->coll;


    // setting up the collator parameters
    UColAttributeValue strength = coll->strength;
    UBool initialCheckSecTer = (strength >= UCOL_SECONDARY);

    UBool checkSecTer = initialCheckSecTer;
    UBool checkTertiary = (strength >= UCOL_TERTIARY);
    UBool checkQuad = (strength >= UCOL_QUATERNARY);
    UBool checkIdent = (strength == UCOL_IDENTICAL);
    UBool checkCase = (coll->caseLevel == UCOL_ON);
    UBool isFrenchSec = (coll->frenchCollation == UCOL_ON) && checkSecTer;
    UBool shifted = (coll->alternateHandling == UCOL_SHIFTED);
    UBool qShifted = shifted && checkQuad;
    UBool doHiragana = (coll->hiraganaQ == UCOL_ON) && checkQuad;

    // Hiragana quaternary combined with shifted handling is not done
    // incrementally here; the comparison is delegated to the sort key path.
    if(doHiragana && shifted) {
        return (ucol_compareUsingSortKeys(sColl, tColl, status));
    }
    uint8_t caseSwitch = coll->caseSwitch;
    uint8_t tertiaryMask = coll->tertiaryMask;

    // This is the lowest primary value that will not be ignored if shifted
    uint32_t LVT = (shifted)?(coll->variableTopValue<<16):0;

    UCollationResult result = UCOL_EQUAL;
    UCollationResult hirResult = UCOL_EQUAL;

    // Preparing the CE buffers. They will be filled during the primary phase
    ucol_CEBuf sCEs;
    ucol_CEBuf tCEs;
    UCOL_INIT_CEBUF(&sCEs);
    UCOL_INIT_CEBUF(&tCEs);

    // secS/secT hold the current weight of the level being compared;
    // sOrder/tOrder hold the current CE (then its primary) during the primary phase.
    uint32_t secS = 0, secT = 0;
    uint32_t sOrder=0, tOrder=0;

    // Non shifted primary processing is quite simple
    if(!shifted) {
        for(;;) {

            // We fetch CEs until we hit a non ignorable primary or end.
            do {
                // We get the next CE
                sOrder = ucol_IGetNextCE(coll, sColl, status);
                // Stuff it in the buffer
                UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
                // And keep just the primary part.
                sOrder &= UCOL_PRIMARYMASK;
            } while(sOrder == 0);

            // see the comments on the above block
            do {
                tOrder = ucol_IGetNextCE(coll, tColl, status);
                UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
                tOrder &= UCOL_PRIMARYMASK;
            } while(tOrder == 0);

            // if both primaries are the same
            if(sOrder == tOrder) {
                // and there are no more CEs, we advance to the next level
                if(sOrder == UCOL_NO_MORE_CES_PRIMARY) {
                    break;
                }
                // Remember the first position where exactly one side carries the
                // UCOL_WAS_HIRAGANA flag; it is used after the other levels tie.
                if(doHiragana && hirResult == UCOL_EQUAL) {
                    if((sColl->flags & UCOL_WAS_HIRAGANA) != (tColl->flags & UCOL_WAS_HIRAGANA)) {
                        hirResult = ((sColl->flags & UCOL_WAS_HIRAGANA) > (tColl->flags & UCOL_WAS_HIRAGANA))
                            ? UCOL_LESS:UCOL_GREATER;
                    }
                }
            } else {
                // only need to check one for continuation
                // if one is then the other must be or the preceding CE would be a prefix of the other
                if (coll->leadBytePermutationTable != NULL && !isContinuation(sOrder)) {
                    sOrder = (coll->leadBytePermutationTable[sOrder>>24] << 24) | (sOrder & 0x00FFFFFF);
                    tOrder = (coll->leadBytePermutationTable[tOrder>>24] << 24) | (tOrder & 0x00FFFFFF);
                }
                // if two primaries are different, we are done
                result = (sOrder < tOrder) ? UCOL_LESS: UCOL_GREATER;
                goto commonReturn;
            }
        } // no primary difference... do the rest from the buffers
    } else { // shifted - do a slightly more complicated processing :)
        for(;;) {
            UBool sInShifted = FALSE;
            UBool tInShifted = FALSE;
            // This version of code can be refactored. However, it seems easier to understand this way.
            // Source loop. Same as the target loop.
            for(;;) {
                sOrder = ucol_IGetNextCE(coll, sColl, status);
                if(sOrder == UCOL_NO_MORE_CES) {
                    UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
                    break;
                } else if(sOrder == 0 || (sInShifted && (sOrder & UCOL_PRIMARYMASK) == 0)) {
                    /* UCA amendment - ignore ignorables that follow shifted code points */
                    continue;
                } else if(isContinuation(sOrder)) {
                    if((sOrder & UCOL_PRIMARYMASK) > 0) { /* There is primary value */
                        if(sInShifted) {
                            sOrder = (sOrder & UCOL_PRIMARYMASK) | 0xC0; /* preserve interesting continuation */
                            UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
                            continue;
                        } else {
                            UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
                            break;
                        }
                    } else { /* Just lower level values */
                        if(sInShifted) {
                            continue;
                        } else {
                            UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
                            continue;
                        }
                    }
                } else { /* regular */
                    if(coll->leadBytePermutationTable != NULL){
                        sOrder = (coll->leadBytePermutationTable[sOrder>>24] << 24) | (sOrder & 0x00FFFFFF);
                    }
                    if((sOrder & UCOL_PRIMARYMASK) > LVT) {
                        // primary above the variable top: a regular, comparable CE
                        UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
                        break;
                    } else {
                        if((sOrder & UCOL_PRIMARYMASK) > 0) {
                            // shifted CE: only its primary is stored in the buffer
                            sInShifted = TRUE;
                            sOrder &= UCOL_PRIMARYMASK;
                            UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
                            continue;
                        } else {
                            UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
                            sInShifted = FALSE;
                            continue;
                        }
                    }
                }
            }
            sOrder &= UCOL_PRIMARYMASK;
            sInShifted = FALSE;

            // Target loop; mirrors the source loop above.
            for(;;) {
                tOrder = ucol_IGetNextCE(coll, tColl, status);
                if(tOrder == UCOL_NO_MORE_CES) {
                    UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
                    break;
                } else if(tOrder == 0 || (tInShifted && (tOrder & UCOL_PRIMARYMASK) == 0)) {
                    /* UCA amendment - ignore ignorables that follow shifted code points */
                    continue;
                } else if(isContinuation(tOrder)) {
                    if((tOrder & UCOL_PRIMARYMASK) > 0) { /* There is primary value */
                        if(tInShifted) {
                            tOrder = (tOrder & UCOL_PRIMARYMASK) | 0xC0; /* preserve interesting continuation */
                            UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
                            continue;
                        } else {
                            UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
                            break;
                        }
                    } else { /* Just lower level values */
                        if(tInShifted) {
                            continue;
                        } else {
                            UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
                            continue;
                        }
                    }
                } else { /* regular */
                    if(coll->leadBytePermutationTable != NULL){
                        tOrder = (coll->leadBytePermutationTable[tOrder>>24] << 24) | (tOrder & 0x00FFFFFF);
                    }
                    if((tOrder & UCOL_PRIMARYMASK) > LVT) {
                        UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
                        break;
                    } else {
                        if((tOrder & UCOL_PRIMARYMASK) > 0) {
                            tInShifted = TRUE;
                            tOrder &= UCOL_PRIMARYMASK;
                            UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
                            continue;
                        } else {
                            UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
                            tInShifted = FALSE;
                            continue;
                        }
                    }
                }
            }
            tOrder &= UCOL_PRIMARYMASK;
            tInShifted = FALSE;

            if(sOrder == tOrder) {
                /*
                if(doHiragana && hirResult == UCOL_EQUAL) {
                if((sColl.flags & UCOL_WAS_HIRAGANA) != (tColl.flags & UCOL_WAS_HIRAGANA)) {
                hirResult = ((sColl.flags & UCOL_WAS_HIRAGANA) > (tColl.flags & UCOL_WAS_HIRAGANA))
                ? UCOL_LESS:UCOL_GREATER;
                }
                }
                */
                if(sOrder == UCOL_NO_MORE_CES_PRIMARY) {
                    break;
                } else {
                    sOrder = 0;
                    tOrder = 0;
                    continue;
                }
            } else {
                result = (sOrder < tOrder) ? UCOL_LESS : UCOL_GREATER;
                goto commonReturn;
            }
        } /* no primary difference... do the rest from the buffers */
    }

    /* now, we're gonna reexamine collected CEs */
    uint32_t *sCE;
    uint32_t *tCE;

    /* This is the secondary level of comparison */
    if(checkSecTer) {
        if(!isFrenchSec) { /* normal */
            sCE = sCEs.buf;
            tCE = tCEs.buf;
            for(;;) {
                // skip CEs whose secondary weight is zero
                while (secS == 0) {
                    secS = *(sCE++) & UCOL_SECONDARYMASK;
                }

                while(secT == 0) {
                    secT = *(tCE++) & UCOL_SECONDARYMASK;
                }

                if(secS == secT) {
                    if(secS == UCOL_NO_MORE_CES_SECONDARY) {
                        break;
                    } else {
                        secS = 0; secT = 0;
                        continue;
                    }
                } else {
                    result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
                    goto commonReturn;
                }
            }
        } else { /* do the French */
            // The buffers are walked backwards, except that a run of continuation
            // CEs is replayed forwards; sCESave/tCESave remember where to resume
            // the backward walk after such a run.
            uint32_t *sCESave = NULL;
            uint32_t *tCESave = NULL;
            sCE = sCEs.pos-2; /* this could also be sCEs-- if needs to be optimized */
            tCE = tCEs.pos-2;
            for(;;) {
                while (secS == 0 && sCE >= sCEs.buf) {
                    if(sCESave == NULL) {
                        secS = *(sCE--);
                        if(isContinuation(secS)) {
                            while(isContinuation(secS = *(sCE--)))
                                ;
                            /* after this, secS has the start of continuation, and sCE points before that */
                            sCESave = sCE; /* we save it, so that we know where to come back AND that we need to go forward */
                            sCE+=2; /* need to point to the first continuation CP */
                            /* However, now you can just continue doing stuff */
                        }
                    } else {
                        secS = *(sCE++);
                        if(!isContinuation(secS)) { /* This means we have finished with this cont */
                            sCE = sCESave; /* reset the pointer to before continuation */
                            sCESave = NULL;
                            secS = 0; /* Fetch a fresh CE before the continuation sequence. */
                            continue;
                        }
                    }
                    secS &= UCOL_SECONDARYMASK; /* remove the continuation bit */
                }

                while(secT == 0 && tCE >= tCEs.buf) {
                    if(tCESave == NULL) {
                        secT = *(tCE--);
                        if(isContinuation(secT)) {
                            while(isContinuation(secT = *(tCE--)))
                                ;
                            /* after this, secT has the start of continuation, and tCE points before that */
                            tCESave = tCE; /* we save it, so that we know where to come back AND that we need to go forward */
                            tCE+=2; /* need to point to the first continuation CP */
                            /* However, now you can just continue doing stuff */
                        }
                    } else {
                        secT = *(tCE++);
                        if(!isContinuation(secT)) { /* This means we have finished with this cont */
                            tCE = tCESave; /* reset the pointer to before continuation */
                            tCESave = NULL;
                            secT = 0; /* Fetch a fresh CE before the continuation sequence. */
                            continue;
                        }
                    }
                    secT &= UCOL_SECONDARYMASK; /* remove the continuation bit */
                }

                if(secS == secT) {
                    // done when the end marker is reached or both buffers are exhausted
                    if(secS == UCOL_NO_MORE_CES_SECONDARY || (sCE < sCEs.buf && tCE < tCEs.buf)) {
                        break;
                    } else {
                        secS = 0; secT = 0;
                        continue;
                    }
                } else {
                    result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
                    goto commonReturn;
                }
            }
        }
    }

    /* doing the case bit */
    if(checkCase) {
        sCE = sCEs.buf;
        tCE = tCEs.buf;
        for(;;) {
            // advance to the next non-continuation CE that contributes to the case level
            while((secS & UCOL_REMOVE_CASE) == 0) {
                if(!isContinuation(*sCE++)) {
                    secS =*(sCE-1);
                    if(((secS & UCOL_PRIMARYMASK) != 0) || strength > UCOL_PRIMARY) {
                        // primary ignorables should not be considered on the case level when the strength is primary
                        // otherwise, the CEs stop being well-formed
                        secS &= UCOL_TERT_CASE_MASK;
                        secS ^= caseSwitch;
                    } else {
                        secS = 0;
                    }
                } else {
                    secS = 0;
                }
            }

            while((secT & UCOL_REMOVE_CASE) == 0) {
                if(!isContinuation(*tCE++)) {
                    secT = *(tCE-1);
                    if(((secT & UCOL_PRIMARYMASK) != 0) || strength > UCOL_PRIMARY) {
                        // primary ignorables should not be considered on the case level when the strength is primary
                        // otherwise, the CEs stop being well-formed
                        secT &= UCOL_TERT_CASE_MASK;
                        secT ^= caseSwitch;
                    } else {
                        secT = 0;
                    }
                } else {
                    secT = 0;
                }
            }

            if((secS & UCOL_CASE_BIT_MASK) < (secT & UCOL_CASE_BIT_MASK)) {
                result = UCOL_LESS;
                goto commonReturn;
            } else if((secS & UCOL_CASE_BIT_MASK) > (secT & UCOL_CASE_BIT_MASK)) {
                result = UCOL_GREATER;
                goto commonReturn;
            }

            if((secS & UCOL_REMOVE_CASE) == UCOL_NO_MORE_CES_TERTIARY || (secT & UCOL_REMOVE_CASE) == UCOL_NO_MORE_CES_TERTIARY ) {
                break;
            } else {
                secS = 0;
                secT = 0;
            }
        }
    }

    /* Tertiary level */
    if(checkTertiary) {
        secS = 0;
        secT = 0;
        sCE = sCEs.buf;
        tCE = tCEs.buf;
        for(;;) {
            // the case switch is applied to non-continuation CEs only;
            // continuation CEs have their case bits removed instead
            while((secS & UCOL_REMOVE_CASE) == 0) {
                secS = *(sCE++) & tertiaryMask;
                if(!isContinuation(secS)) {
                    secS ^= caseSwitch;
                } else {
                    secS &= UCOL_REMOVE_CASE;
                }
            }

            while((secT & UCOL_REMOVE_CASE) == 0) {
                secT = *(tCE++) & tertiaryMask;
                if(!isContinuation(secT)) {
                    secT ^= caseSwitch;
                } else {
                    secT &= UCOL_REMOVE_CASE;
                }
            }

            if(secS == secT) {
                if((secS & UCOL_REMOVE_CASE) == 1) {
                    break;
                } else {
                    secS = 0; secT = 0;
                    continue;
                }
            } else {
                result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
                goto commonReturn;
            }
        }
    }


    if(qShifted /*checkQuad*/) {
        // Quaternary level for shifted handling: every CE that is not shifted
        // (above LVT, or with a zero primary) is replaced by UCOL_PRIMARYMASK,
        // while shifted CEs keep their own primary for the comparison.
        UBool sInShifted = TRUE;
        UBool tInShifted = TRUE;
        secS = 0;
        secT = 0;
        sCE = sCEs.buf;
        tCE = tCEs.buf;
        for(;;) {
            while((secS == 0 && secS != UCOL_NO_MORE_CES) || (isContinuation(secS) && !sInShifted)) {
                secS = *(sCE++);
                if(isContinuation(secS)) {
                    if(!sInShifted) {
                        continue;
                    }
                } else if(secS > LVT || (secS & UCOL_PRIMARYMASK) == 0) { /* non continuation */
                    secS = UCOL_PRIMARYMASK;
                    sInShifted = FALSE;
                } else {
                    sInShifted = TRUE;
                }
            }
            secS &= UCOL_PRIMARYMASK;


            while((secT == 0 && secT != UCOL_NO_MORE_CES) || (isContinuation(secT) && !tInShifted)) {
                secT = *(tCE++);
                if(isContinuation(secT)) {
                    if(!tInShifted) {
                        continue;
                    }
                } else if(secT > LVT || (secT & UCOL_PRIMARYMASK) == 0) {
                    secT = UCOL_PRIMARYMASK;
                    tInShifted = FALSE;
                } else {
                    tInShifted = TRUE;
                }
            }
            secT &= UCOL_PRIMARYMASK;

            if(secS == secT) {
                if(secS == UCOL_NO_MORE_CES_PRIMARY) {
                    break;
                } else {
                    secS = 0; secT = 0;
                    continue;
                }
            } else {
                result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
                goto commonReturn;
            }
        }
    } else if(doHiragana && hirResult != UCOL_EQUAL) {
        // If we're fine on quaternaries, we might be different
        // on Hiragana. This, however, might fail us in shifted.
        result = hirResult;
        goto commonReturn;
    }

    /*  For IDENTICAL comparisons, we use a bitwise character comparison */
    /*  as a tiebreaker if all else is equal.                                */
    /*  Getting here  should be quite rare - strings are not identical -     */
    /*     that is checked first, but compared == through all other checks.  */
    if(checkIdent)
    {
        //result = ucol_checkIdent(&sColl, &tColl, coll->normalizationMode == UCOL_ON);
        result = ucol_checkIdent(sColl, tColl, TRUE, status);
    }

commonReturn:
    // The CE buffers are released only when one of the iterators carries the
    // UCOL_ITER_ALLOCATED flag and the buffer is not the embedded local array.
    if ((sColl->flags | tColl->flags) & UCOL_ITER_ALLOCATED) {
        if (sCEs.buf != sCEs.localArray ) {
            uprv_free(sCEs.buf);
        }
        if (tCEs.buf != tCEs.localArray ) {
            uprv_free(tCEs.buf);
        }
    }

    return result;
}
7694
7695 static UCollationResult
ucol_strcollRegular(const UCollator * coll,const UChar * source,int32_t sourceLength,const UChar * target,int32_t targetLength,UErrorCode * status)7696 ucol_strcollRegular(const UCollator *coll,
7697 const UChar *source, int32_t sourceLength,
7698 const UChar *target, int32_t targetLength,
7699 UErrorCode *status) {
7700 collIterate sColl, tColl;
7701 // Preparing the context objects for iterating over strings
7702 IInit_collIterate(coll, source, sourceLength, &sColl, status);
7703 IInit_collIterate(coll, target, targetLength, &tColl, status);
7704 if(U_FAILURE(*status)) {
7705 return UCOL_LESS;
7706 }
7707 return ucol_strcollRegular(&sColl, &tColl, status);
7708 }
7709
7710 static inline uint32_t
ucol_getLatinOneContraction(const UCollator * coll,int32_t strength,uint32_t CE,const UChar * s,int32_t * index,int32_t len)7711 ucol_getLatinOneContraction(const UCollator *coll, int32_t strength,
7712 uint32_t CE, const UChar *s, int32_t *index, int32_t len)
7713 {
7714 const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE&0xFFF);
7715 int32_t latinOneOffset = (CE & 0x00FFF000) >> 12;
7716 int32_t offset = 1;
7717 UChar schar = 0, tchar = 0;
7718
7719 for(;;) {
7720 if(len == -1) {
7721 if(s[*index] == 0) { // end of string
7722 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
7723 } else {
7724 schar = s[*index];
7725 }
7726 } else {
7727 if(*index == len) {
7728 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
7729 } else {
7730 schar = s[*index];
7731 }
7732 }
7733
7734 while(schar > (tchar = *(UCharOffset+offset))) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
7735 offset++;
7736 }
7737
7738 if (schar == tchar) {
7739 (*index)++;
7740 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset+offset]);
7741 }
7742 else
7743 {
7744 if(schar & 0xFF00 /*> UCOL_ENDOFLATIN1RANGE*/) {
7745 return UCOL_BAIL_OUT_CE;
7746 }
7747 // skip completely ignorables
7748 uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, schar);
7749 if(isZeroCE == 0) { // we have to ignore completely ignorables
7750 (*index)++;
7751 continue;
7752 }
7753
7754 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
7755 }
7756 }
7757 }
7758
7759
/**
 * This is a fast strcoll, geared towards text in Latin-1.
 * It supports contractions of size two, French secondaries
 * and case switching. You can use it with strengths primary
 * to tertiary. It does not support shifted and case level.
 * It relies on the table built by setupLatin1Table. If it
 * doesn't understand something, it will go to the regular
 * strcoll.
 *
 * The comparison runs up to three passes (primary, secondary, tertiary) over
 * the strings. Each pass reads packed weights from the corresponding section
 * of coll->latinOneCEs and compares them one top byte at a time.
 *
 * @param coll   collator providing strength, frenchCollation and latinOneCEs
 * @param source source text
 * @param sLen   length of source, or -1 if it is NUL-terminated
 * @param target target text
 * @param tLen   length of target, or -1 if it is NUL-terminated
 * @param status ICU error code, only passed on to ucol_strcollRegular
 * @return UCOL_LESS, UCOL_EQUAL or UCOL_GREATER
 */
static UCollationResult
ucol_strcollUseLatin1( const UCollator *coll,
                       const UChar *source,
                       int32_t sLen,
                       const UChar *target,
                       int32_t tLen,
                       UErrorCode *status)
{
    U_ALIGN_CODE(16);
    int32_t strength = coll->strength;

    int32_t sIndex = 0, tIndex = 0;
    UChar sChar = 0, tChar = 0;
    uint32_t sOrder=0, tOrder=0;

    UBool endOfSource = FALSE;

    // start of the primary section of the table; advanced by latinOneTableLen per level
    uint32_t *elements = coll->latinOneCEs;

    UBool haveContractions = FALSE; // if we have contractions in our string
    // we cannot do French secondary

    // Do the primary level
    for(;;) {
        while(sOrder==0) { // this loop skips primary ignorables
            // sOrder=getNextlatinOneCE(source);
            if(sLen==-1) { // handling zero terminated strings
                sChar=source[sIndex++];
                if(sChar==0) {
                    endOfSource = TRUE;
                    break;
                }
            } else { // handling strings with known length
                if(sIndex==sLen) {
                    endOfSource = TRUE;
                    break;
                }
                sChar=source[sIndex++];
            }
            if(sChar&0xFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32)
                //fprintf(stderr, "R");
                return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
            }
            sOrder = elements[sChar];
            if(sOrder >= UCOL_NOT_FOUND) { // if we got a special
                // specials can basically be either contractions or bail-out signs. If we get anything
                // else, we'll bail out anyway
                if(getCETag(sOrder) == CONTRACTION_TAG) {
                    sOrder = ucol_getLatinOneContraction(coll, UCOL_PRIMARY, sOrder, source, &sIndex, sLen);
                    haveContractions = TRUE; // if there are contractions, we cannot do French secondary
                    // However, if there are contractions in the table, but we always use just one char,
                    // we might be able to do French. This should be checked out.
                }
                if(sOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) {
                    //fprintf(stderr, "S");
                    return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
                }
            }
        }

        while(tOrder==0) { // this loop skips primary ignorables
            // tOrder=getNextlatinOneCE(target);
            if(tLen==-1) { // handling zero terminated strings
                tChar=target[tIndex++];
                if(tChar==0) {
                    if(endOfSource) { // this is different than source loop,
                        // as we already know that source loop is done here,
                        // so we can either finish the primary loop if both
                        // strings are done or announce the result if only
                        // target is done. Same below.
                        goto endOfPrimLoop;
                    } else {
                        return UCOL_GREATER;
                    }
                }
            } else { // handling strings with known length
                if(tIndex==tLen) {
                    if(endOfSource) {
                        goto endOfPrimLoop;
                    } else {
                        return UCOL_GREATER;
                    }
                }
                tChar=target[tIndex++];
            }
            if(tChar&0xFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32)
                //fprintf(stderr, "R");
                return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
            }
            tOrder = elements[tChar];
            if(tOrder >= UCOL_NOT_FOUND) {
                // Handling specials, see the comments for source
                if(getCETag(tOrder) == CONTRACTION_TAG) {
                    tOrder = ucol_getLatinOneContraction(coll, UCOL_PRIMARY, tOrder, target, &tIndex, tLen);
                    haveContractions = TRUE;
                }
                if(tOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) {
                    //fprintf(stderr, "S");
                    return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
                }
            }
        }
        if(endOfSource) { // source is finished, but target is not, say the result.
            return UCOL_LESS;
        }

        if(sOrder == tOrder) { // if we have same CEs, we continue the loop
            sOrder = 0; tOrder = 0;
            continue;
        } else {
            // compare current top bytes
            if(((sOrder^tOrder)&0xFF000000)!=0) {
                // top bytes differ, return difference
                if(sOrder < tOrder) {
                    return UCOL_LESS;
                } else if(sOrder > tOrder) {
                    return UCOL_GREATER;
                }
                // instead of return (int32_t)(sOrder>>24)-(int32_t)(tOrder>>24);
                // since we must return enum value
            }

            // top bytes match, continue with following bytes
            // (a side whose remaining bytes become zero fetches its next CE above)
            sOrder<<=8;
            tOrder<<=8;
        }
    }

endOfPrimLoop:
    // after primary loop, we definitely know the sizes of strings,
    // so we set it and use simpler loop for secondaries and tertiaries
    // NOTE(review): for NUL-terminated input the index was already advanced past
    // the terminator when the end was detected, so these lengths appear to
    // include the NUL; presumably harmless if U+0000 maps to a zero entry in
    // the table - confirm.
    sLen = sIndex; tLen = tIndex;
    if(strength >= UCOL_SECONDARY) {
        // adjust the table beginning
        elements += coll->latinOneTableLen;
        endOfSource = FALSE;

        if(coll->frenchCollation == UCOL_OFF) { // non French
            // This loop is a simplified copy of primary loop
            // at this point we know that whole strings are latin-1, so we don't
            // check for that. We also know that we only have contractions as
            // specials.
            sIndex = 0; tIndex = 0;
            for(;;) {
                while(sOrder==0) {
                    if(sIndex==sLen) {
                        endOfSource = TRUE;
                        break;
                    }
                    sChar=source[sIndex++];
                    sOrder = elements[sChar];
                    if(sOrder > UCOL_NOT_FOUND) {
                        sOrder = ucol_getLatinOneContraction(coll, UCOL_SECONDARY, sOrder, source, &sIndex, sLen);
                    }
                }

                while(tOrder==0) {
                    if(tIndex==tLen) {
                        if(endOfSource) {
                            goto endOfSecLoop;
                        } else {
                            return UCOL_GREATER;
                        }
                    }
                    tChar=target[tIndex++];
                    tOrder = elements[tChar];
                    if(tOrder > UCOL_NOT_FOUND) {
                        tOrder = ucol_getLatinOneContraction(coll, UCOL_SECONDARY, tOrder, target, &tIndex, tLen);
                    }
                }
                if(endOfSource) {
                    return UCOL_LESS;
                }

                if(sOrder == tOrder) {
                    sOrder = 0; tOrder = 0;
                    continue;
                } else {
                    // see primary loop for comments on this
                    if(((sOrder^tOrder)&0xFF000000)!=0) {
                        if(sOrder < tOrder) {
                            return UCOL_LESS;
                        } else if(sOrder > tOrder) {
                            return UCOL_GREATER;
                        }
                    }
                    sOrder<<=8;
                    tOrder<<=8;
                }
            }
        } else { // French
            if(haveContractions) { // if we have contractions, we have to bail out
                // since we don't really know how to handle them here
                return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
            }
            // For French, we go backwards
            sIndex = sLen; tIndex = tLen;
            for(;;) {
                while(sOrder==0) {
                    if(sIndex==0) {
                        endOfSource = TRUE;
                        break;
                    }
                    sChar=source[--sIndex];
                    sOrder = elements[sChar];
                    // don't even look for contractions
                }

                while(tOrder==0) {
                    if(tIndex==0) {
                        if(endOfSource) {
                            goto endOfSecLoop;
                        } else {
                            return UCOL_GREATER;
                        }
                    }
                    tChar=target[--tIndex];
                    tOrder = elements[tChar];
                    // don't even look for contractions
                }
                if(endOfSource) {
                    return UCOL_LESS;
                }

                if(sOrder == tOrder) {
                    sOrder = 0; tOrder = 0;
                    continue;
                } else {
                    // see the primary loop for comments
                    if(((sOrder^tOrder)&0xFF000000)!=0) {
                        if(sOrder < tOrder) {
                            return UCOL_LESS;
                        } else if(sOrder > tOrder) {
                            return UCOL_GREATER;
                        }
                    }
                    sOrder<<=8;
                    tOrder<<=8;
                }
            }
        }
    }

endOfSecLoop:
    if(strength >= UCOL_TERTIARY) {
        // tertiary loop is the same as secondary (except no French)
        elements += coll->latinOneTableLen;
        sIndex = 0; tIndex = 0;
        endOfSource = FALSE;
        for(;;) {
            while(sOrder==0) {
                if(sIndex==sLen) {
                    endOfSource = TRUE;
                    break;
                }
                sChar=source[sIndex++];
                sOrder = elements[sChar];
                if(sOrder > UCOL_NOT_FOUND) {
                    sOrder = ucol_getLatinOneContraction(coll, UCOL_TERTIARY, sOrder, source, &sIndex, sLen);
                }
            }
            while(tOrder==0) {
                if(tIndex==tLen) {
                    if(endOfSource) {
                        return UCOL_EQUAL; // if both strings are at the end, they are equal
                    } else {
                        return UCOL_GREATER;
                    }
                }
                tChar=target[tIndex++];
                tOrder = elements[tChar];
                if(tOrder > UCOL_NOT_FOUND) {
                    tOrder = ucol_getLatinOneContraction(coll, UCOL_TERTIARY, tOrder, target, &tIndex, tLen);
                }
            }
            if(endOfSource) {
                return UCOL_LESS;
            }
            if(sOrder == tOrder) {
                sOrder = 0; tOrder = 0;
                continue;
            } else {
                if(((sOrder^tOrder)&0xff000000)!=0) {
                    if(sOrder < tOrder) {
                        return UCOL_LESS;
                    } else if(sOrder > tOrder) {
                        return UCOL_GREATER;
                    }
                }
                sOrder<<=8;
                tOrder<<=8;
            }
        }
    }
    // reached when the strength is below tertiary and all compared levels were equal
    return UCOL_EQUAL;
}
8065
/*
  Slightly modified version of the U8_NEXT macro defined in utf8.h. U8_NEXT requires
  the length of the UTF-8 string. This version assumes that the UTF-8 string is
  NUL-terminated and does not require the length as input.

  Note: ucol_strcollUTF8 supports NUL-terminated input. Calculating the length of
  a NUL-terminated input string takes an extra amount of CPU cycles.

  The tables and constants below support utf8_nextCharSafeBodyNullTerm().
*/

/* Smallest code point that may legally be encoded with 0, 1, 2 or 3 trail bytes;
   indexed by the trail byte count to reject non-shortest forms. */
static const UChar32
utf8_minLegal[4]={ 0, 0x80, 0x800, 0x10000 };

/* Error values returned for ill-formed sequences when the caller asks for
   non-negative error values (strict>=0). */
#define UTF8_ERROR_VALUE_1 0x15
#define UTF8_ERROR_VALUE_2 0x9f
#define UTF_ERROR_VALUE 0xffff

/* Error values indexed by a trail byte count (the number of trail bytes consumed,
   or the expected number for non-characters in strict mode). */
static const UChar32
utf8_errorValue[6]={
    UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_2, UTF_ERROR_VALUE, 0x10ffff,
    0x3ffffff, 0x7fffffff
};
8086
8087 static
utf8_nextCharSafeBodyNullTerm(const uint8_t * s,int32_t * pi,UChar32 c,UBool strict)8088 UChar32 utf8_nextCharSafeBodyNullTerm(const uint8_t *s, int32_t *pi, UChar32 c, UBool strict) {
8089 int32_t i=*pi;
8090 uint8_t count=U8_COUNT_TRAIL_BYTES(c);
8091 U_ASSERT(count <= 5); /* U8_COUNT_TRAIL_BYTES returns value 0...5 */
8092
8093 if (c) {
8094 uint8_t trail, illegal=0;
8095
8096 U8_MASK_LEAD_BYTE((c), count);
8097 /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
8098 switch(count) {
8099 /* each branch falls through to the next one */
8100 case 5:
8101 case 4:
8102 /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
8103 illegal=1;
8104 break;
8105 case 3:
8106 trail=s[(i)];
8107 if (trail==0) {
8108 illegal=1;
8109 break;
8110 }
8111 (c)=((c)<<6)|(trail&0x3f);
8112 if(c<0x110) {
8113 illegal|=(trail&0xc0)^0x80;
8114 } else {
8115 /* code point>0x10ffff, outside Unicode */
8116 illegal=1;
8117 break;
8118 }
8119 ++(i);
8120 case 2:
8121 trail=s[(i)];
8122 if (trail==0) {
8123 illegal=1;
8124 break;
8125 }
8126 (c)=((c)<<6)|(trail&0x3f);
8127 illegal|=(trail&0xc0)^0x80;
8128 ++(i);
8129 case 1:
8130 trail=s[(i)];
8131 if (trail==0) {
8132 illegal=1;
8133 break;
8134 }
8135 (c)=((c)<<6)|(trail&0x3f);
8136 illegal|=(trail&0xc0)^0x80;
8137 ++(i);
8138 break;
8139 case 0:
8140 if(strict>=0) {
8141 return UTF8_ERROR_VALUE_1;
8142 } else {
8143 return U_SENTINEL;
8144 }
8145 /* no default branch to optimize switch() - all values are covered */
8146 }
8147
8148 /*
8149 * All the error handling should return a value
8150 * that needs count bytes so that UTF8_GET_CHAR_SAFE() works right.
8151 *
8152 * Starting with Unicode 3.0.1, non-shortest forms are illegal.
8153 * Starting with Unicode 3.2, surrogate code points must not be
8154 * encoded in UTF-8, and there are no irregular sequences any more.
8155 *
8156 * U8_ macros (new in ICU 2.4) return negative values for error conditions.
8157 */
8158
8159 /* correct sequence - all trail bytes have (b7..b6)==(10)? */
8160 /* illegal is also set if count>=4 */
8161 if(illegal || (c)<utf8_minLegal[count] || (U_IS_SURROGATE(c) && strict!=-2)) {
8162 /* error handling */
8163 uint8_t errorCount=count;
8164 /* don't go beyond this sequence */
8165 i=*pi;
8166 while(count>0 && U8_IS_TRAIL(s[i])) {
8167 ++(i);
8168 --count;
8169 }
8170 if(strict>=0) {
8171 c=utf8_errorValue[errorCount-count];
8172 } else {
8173 c=U_SENTINEL;
8174 }
8175 } else if((strict)>0 && U_IS_UNICODE_NONCHAR(c)) {
8176 /* strict: forbid non-characters like U+fffe */
8177 c=utf8_errorValue[count];
8178 }
8179 }
8180 *pi=i;
8181 return c;
8182 }
8183
/*
 * Reads one code point from the NUL-terminated UTF-8 string s at index i.
 *
 *   s  UTF-8 bytes, NUL-terminated
 *   i  index variable; advanced past the bytes that were read. It is NOT
 *      advanced when the byte at i is the terminating NUL.
 *   c  receives the code point, 0 at the terminator, or a negative value
 *      (U_SENTINEL) for an ill-formed sequence
 *
 * Two- and three-byte sequences are decoded inline. The "!= 0" tests on the
 * trail bytes additionally send sequences whose trail byte is exactly 0x80
 * through the U8_IS_LEAD branch, i.e. to utf8_nextCharSafeBodyNullTerm().
 * A terminating NUL in a trail position already fails the "<=0x3f" test.
 *
 * Every use of a macro argument is parenthesized, so s may be any pointer
 * expression.
 */
#define U8_NEXT_NULLTERM(s, i, c) { \
    (c)=(uint8_t)(s)[(i)]; \
    if((c)>=0x80) { \
        uint8_t __t1, __t2; \
        if( /* handle U+1000..U+CFFF inline */ \
            (0xe0<(c) && (c)<=0xec) && \
            (__t1=(uint8_t)((s)[(i)+1]-0x80))<=0x3f && __t1 != 0 && \
            (__t2=(uint8_t)((s)[(i)+2]-0x80))<= 0x3f && __t2 != 0 \
        ) { \
            /* no need for (c&0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ \
            (c)=(UChar)(((c)<<12)|(__t1<<6)|__t2); \
            (i)+=3; \
        } else if( /* handle U+0080..U+07FF inline */ \
            ((c)<0xe0 && (c)>=0xc2) && \
            (__t1=(uint8_t)((s)[(i)+1]-0x80))<=0x3f && __t1 != 0 \
        ) { \
            (c)=(UChar)((((c)&0x1f)<<6)|__t1); \
            (i)+=2; \
        } else if(U8_IS_LEAD(c)) { \
            /* function call for "complicated" and error cases */ \
            ++(i); \
            (c)=utf8_nextCharSafeBodyNullTerm((const uint8_t *)(s), &(i), (c), -1); \
        } else { \
            (c)=U_SENTINEL; \
            ++(i); \
        } \
    } else { \
        if ((c)) { \
            ++(i); \
        } \
    } \
}
8216
/*
 * Random-access read of the code point containing index i in the
 * NUL-terminated UTF-8 string s, without modifying i.
 *
 * Works on a local copy of i: the copy is first adjusted with
 * U8_SET_CP_START(s, start, ...) and the code point is then read with
 * U8_NEXT_NULLTERM, so c receives the same kinds of values as with that macro.
 */
#define U8_GET_NULLTERM(s, start, i, c) { \
    int32_t _u8_get_index=(int32_t)(i); \
    U8_SET_CP_START(s, start, _u8_get_index); \
    U8_NEXT_NULLTERM(s, _u8_get_index, c); \
}
8222
8223
8224 static UCollationResult
ucol_strcollRegularUTF8(const UCollator * coll,const char * source,int32_t sourceLength,const char * target,int32_t targetLength,UErrorCode * status)8225 ucol_strcollRegularUTF8(
8226 const UCollator *coll,
8227 const char *source,
8228 int32_t sourceLength,
8229 const char *target,
8230 int32_t targetLength,
8231 UErrorCode *status)
8232 {
8233 UCharIterator src;
8234 UCharIterator tgt;
8235
8236 uiter_setUTF8(&src, source, sourceLength);
8237 uiter_setUTF8(&tgt, target, targetLength);
8238
8239 // Preparing the context objects for iterating over strings
8240 collIterate sColl, tColl;
8241 IInit_collIterate(coll, NULL, -1, &sColl, status);
8242 IInit_collIterate(coll, NULL, -1, &tColl, status);
8243 if(U_FAILURE(*status)) {
8244 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status)
8245 return UCOL_EQUAL;
8246 }
8247 // The division for the array length may truncate the array size to
8248 // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
8249 // for all platforms anyway.
8250 UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
8251 UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
8252 UNormIterator *sNormIter = NULL, *tNormIter = NULL;
8253
8254 sColl.iterator = &src;
8255 sColl.flags |= UCOL_USE_ITERATOR;
8256 tColl.flags |= UCOL_USE_ITERATOR;
8257 tColl.iterator = &tgt;
8258
8259 if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON) {
8260 sNormIter = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), status);
8261 sColl.iterator = unorm_setIter(sNormIter, &src, UNORM_FCD, status);
8262 sColl.flags &= ~UCOL_ITER_NORM;
8263
8264 tNormIter = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), status);
8265 tColl.iterator = unorm_setIter(tNormIter, &tgt, UNORM_FCD, status);
8266 tColl.flags &= ~UCOL_ITER_NORM;
8267 }
8268
8269 return ucol_strcollRegular(&sColl, &tColl, status);
8270 }
8271
8272 static inline uint32_t
ucol_getLatinOneContractionUTF8(const UCollator * coll,int32_t strength,uint32_t CE,const char * s,int32_t * index,int32_t len)8273 ucol_getLatinOneContractionUTF8(const UCollator *coll, int32_t strength,
8274 uint32_t CE, const char *s, int32_t *index, int32_t len)
8275 {
8276 const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE&0xFFF);
8277 int32_t latinOneOffset = (CE & 0x00FFF000) >> 12;
8278 int32_t offset = 1;
8279 UChar32 schar = 0, tchar = 0;
8280
8281 for(;;) {
8282 if (len == -1) {
8283 U8_GET_NULLTERM((const uint8_t*)s, 0, *index, schar);
8284 if (schar == 0) {
8285 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
8286 }
8287 } else {
8288 if (*index == len) {
8289 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
8290 }
8291 U8_GET((const uint8_t*)s, 0, *index, len, schar);
8292 }
8293 if (schar == -1) {
8294 schar = 0xfffd;
8295 }
8296
8297 while(schar > (tchar = *(UCharOffset+offset))) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
8298 offset++;
8299 }
8300
8301 if (schar == tchar) {
8302 U8_FWD_1(s, *index, len);
8303 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset+offset]);
8304 }
8305 else
8306 {
8307 if(schar & 0xFF00 /*> UCOL_ENDOFLATIN1RANGE*/) {
8308 return UCOL_BAIL_OUT_CE;
8309 }
8310 // skip completely ignorables
8311 uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, schar);
8312 if(isZeroCE == 0) { // we have to ignore completely ignorables
8313 U8_FWD_1(s, *index, len);
8314 continue;
8315 }
8316
8317 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
8318 }
8319 }
8320 }
8321
8322 static inline UCollationResult
ucol_strcollUseLatin1UTF8(const UCollator * coll,const char * source,int32_t sLen,const char * target,int32_t tLen,UErrorCode * status)8323 ucol_strcollUseLatin1UTF8(
8324 const UCollator *coll,
8325 const char *source,
8326 int32_t sLen,
8327 const char *target,
8328 int32_t tLen,
8329 UErrorCode *status)
8330 {
8331 U_ALIGN_CODE(16);
8332 int32_t strength = coll->strength;
8333
8334 int32_t sIndex = 0, tIndex = 0;
8335 UChar32 sChar = 0, tChar = 0;
8336 uint32_t sOrder=0, tOrder=0;
8337
8338 UBool endOfSource = FALSE;
8339
8340 uint32_t *elements = coll->latinOneCEs;
8341
8342 UBool haveContractions = FALSE; // if we have contractions in our string
8343 // we cannot do French secondary
8344
8345 // Do the primary level
8346 for(;;) {
8347 while(sOrder==0) { // this loop skips primary ignorables
8348 // sOrder=getNextlatinOneCE(source);
8349 if (sLen==-1) {
8350 U8_NEXT_NULLTERM(source, sIndex, sChar);
8351 if (sChar == 0) {
8352 endOfSource = TRUE;
8353 sLen = sIndex;
8354 break;
8355 }
8356 } else {
8357 if (sIndex == sLen) {
8358 endOfSource = TRUE;
8359 break;
8360 }
8361 U8_NEXT(source, sIndex, sLen ,sChar);
8362 }
8363 if (sChar == -1) {
8364 sChar = 0xfffd; // fallback for the bad code
8365 }
8366 if(sChar&0xFFFFFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32)
8367 //fprintf(stderr, "R");
8368 return ucol_strcollRegularUTF8(coll, source, sLen, target, tLen, status);
8369 }
8370 sOrder = elements[sChar];
8371 if(sOrder >= UCOL_NOT_FOUND) { // if we got a special
8372 // specials can basically be either contractions or bail-out signs. If we get anything
8373 // else, we'll bail out anywasy
8374 if(getCETag(sOrder) == CONTRACTION_TAG) {
8375 sOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_PRIMARY, sOrder, source, &sIndex, sLen);
8376 haveContractions = TRUE; // if there are contractions, we cannot do French secondary
8377 // However, if there are contractions in the table, but we always use just one char,
8378 // we might be able to do French. This should be checked out.
8379 }
8380 if(sOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) {
8381 //fprintf(stderr, "S");
8382 return ucol_strcollRegularUTF8(coll, source, sLen, target, tLen, status);
8383 }
8384 }
8385 }
8386
8387 while(tOrder==0) { // this loop skips primary ignorables
8388 // tOrder=getNextlatinOneCE(target);
8389 if (tLen == -1) {
8390 U8_NEXT_NULLTERM(target, tIndex, tChar);
8391 if (tChar == 0) {
8392 if(endOfSource) {
8393 tLen = tIndex;
8394 goto endOfPrimLoopU8;
8395 } else {
8396 return UCOL_GREATER;
8397 }
8398 }
8399 } else {
8400 if (tIndex == tLen) {
8401 if(endOfSource) {
8402 goto endOfPrimLoopU8;
8403 } else {
8404 return UCOL_GREATER;
8405 }
8406 }
8407 U8_NEXT(target, tIndex, tLen, tChar);
8408 }
8409 if (tChar == -1) {
8410 tChar = 0xfffd;
8411 }
8412 if(tChar&0xFFFFFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32)
8413 //fprintf(stderr, "R");
8414 return ucol_strcollRegularUTF8(coll, source, sLen, target, tLen, status);
8415 }
8416 tOrder = elements[tChar];
8417 if(tOrder >= UCOL_NOT_FOUND) {
8418 // Handling specials, see the comments for source
8419 if(getCETag(tOrder) == CONTRACTION_TAG) {
8420 tOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_PRIMARY, tOrder, target, &tIndex, tLen);
8421 haveContractions = TRUE;
8422 }
8423 if(tOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) {
8424 //fprintf(stderr, "S");
8425 return ucol_strcollRegularUTF8(coll, source, sLen, target, tLen, status);
8426 }
8427 }
8428 }
8429 if(endOfSource) { // source is finished, but target is not, say the result.
8430 return UCOL_LESS;
8431 }
8432
8433 if(sOrder == tOrder) { // if we have same CEs, we continue the loop
8434 sOrder = 0; tOrder = 0;
8435 continue;
8436 } else {
8437 // compare current top bytes
8438 if(((sOrder^tOrder)&0xFF000000)!=0) {
8439 // top bytes differ, return difference
8440 if(sOrder < tOrder) {
8441 return UCOL_LESS;
8442 } else if(sOrder > tOrder) {
8443 return UCOL_GREATER;
8444 }
8445 // instead of return (int32_t)(sOrder>>24)-(int32_t)(tOrder>>24);
8446 // since we must return enum value
8447 }
8448
8449 // top bytes match, continue with following bytes
8450 sOrder<<=8;
8451 tOrder<<=8;
8452 }
8453 }
8454
8455 endOfPrimLoopU8:
8456 // after primary loop, we definitely know the sizes of strings,
8457 // so we set it and use simpler loop for secondaries and tertiaries
8458 sLen = sIndex; tLen = tIndex;
8459 if(strength >= UCOL_SECONDARY) {
8460 // adjust the table beggining
8461 elements += coll->latinOneTableLen;
8462 endOfSource = FALSE;
8463
8464 if(coll->frenchCollation == UCOL_OFF) { // non French
8465 // This loop is a simplified copy of primary loop
8466 // at this point we know that whole strings are latin-1, so we don't
8467 // check for that. We also know that we only have contractions as
8468 // specials.
8469 sIndex = 0; tIndex = 0;
8470 for(;;) {
8471 while(sOrder==0) {
8472 if(sIndex==sLen) {
8473 endOfSource = TRUE;
8474 break;
8475 }
8476 U_ASSERT(sLen >= 0);
8477 U8_NEXT(source, sIndex, sLen, sChar);
8478 U_ASSERT(sChar >= 0 && sChar <= 0xFF);
8479 sOrder = elements[sChar];
8480 if(sOrder > UCOL_NOT_FOUND) {
8481 sOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_SECONDARY, sOrder, source, &sIndex, sLen);
8482 }
8483 }
8484
8485 while(tOrder==0) {
8486 if(tIndex==tLen) {
8487 if(endOfSource) {
8488 goto endOfSecLoopU8;
8489 } else {
8490 return UCOL_GREATER;
8491 }
8492 }
8493 U_ASSERT(tLen >= 0);
8494 U8_NEXT(target, tIndex, tLen, tChar);
8495 U_ASSERT(tChar >= 0 && tChar <= 0xFF);
8496 tOrder = elements[tChar];
8497 if(tOrder > UCOL_NOT_FOUND) {
8498 tOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_SECONDARY, tOrder, target, &tIndex, tLen);
8499 }
8500 }
8501 if(endOfSource) {
8502 return UCOL_LESS;
8503 }
8504
8505 if(sOrder == tOrder) {
8506 sOrder = 0; tOrder = 0;
8507 continue;
8508 } else {
8509 // see primary loop for comments on this
8510 if(((sOrder^tOrder)&0xFF000000)!=0) {
8511 if(sOrder < tOrder) {
8512 return UCOL_LESS;
8513 } else if(sOrder > tOrder) {
8514 return UCOL_GREATER;
8515 }
8516 }
8517 sOrder<<=8;
8518 tOrder<<=8;
8519 }
8520 }
8521 } else { // French
8522 if(haveContractions) { // if we have contractions, we have to bail out
8523 // since we don't really know how to handle them here
8524 return ucol_strcollRegularUTF8(coll, source, sLen, target, tLen, status);
8525 }
8526 // For French, we go backwards
8527 sIndex = sLen; tIndex = tLen;
8528 for(;;) {
8529 while(sOrder==0) {
8530 if(sIndex==0) {
8531 endOfSource = TRUE;
8532 break;
8533 }
8534 U8_PREV(source, 0, sIndex, sChar);
8535 U_ASSERT(sChar >= 0 && sChar <= 0xFF);
8536 sOrder = elements[sChar];
8537 // don't even look for contractions
8538 }
8539
8540 while(tOrder==0) {
8541 if(tIndex==0) {
8542 if(endOfSource) {
8543 goto endOfSecLoopU8;
8544 } else {
8545 return UCOL_GREATER;
8546 }
8547 }
8548 U8_PREV(target, 0, tIndex, tChar);
8549 U_ASSERT(tChar >= 0 && tChar <= 0xFF);
8550 tOrder = elements[tChar];
8551 // don't even look for contractions
8552 }
8553 if(endOfSource) {
8554 return UCOL_LESS;
8555 }
8556
8557 if(sOrder == tOrder) {
8558 sOrder = 0; tOrder = 0;
8559 continue;
8560 } else {
8561 // see the primary loop for comments
8562 if(((sOrder^tOrder)&0xFF000000)!=0) {
8563 if(sOrder < tOrder) {
8564 return UCOL_LESS;
8565 } else if(sOrder > tOrder) {
8566 return UCOL_GREATER;
8567 }
8568 }
8569 sOrder<<=8;
8570 tOrder<<=8;
8571 }
8572 }
8573 }
8574 }
8575
8576 endOfSecLoopU8:
8577 if(strength >= UCOL_TERTIARY) {
8578 // tertiary loop is the same as secondary (except no French)
8579 elements += coll->latinOneTableLen;
8580 sIndex = 0; tIndex = 0;
8581 endOfSource = FALSE;
8582 for(;;) {
8583 while(sOrder==0) {
8584 if(sIndex==sLen) {
8585 endOfSource = TRUE;
8586 break;
8587 }
8588 U_ASSERT(sLen >= 0);
8589 U8_NEXT(source, sIndex, sLen, sChar);
8590 U_ASSERT(sChar >= 0 && sChar <= 0xFF);
8591 sOrder = elements[sChar];
8592 if(sOrder > UCOL_NOT_FOUND) {
8593 sOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_TERTIARY, sOrder, source, &sIndex, sLen);
8594 }
8595 }
8596 while(tOrder==0) {
8597 if(tIndex==tLen) {
8598 if(endOfSource) {
8599 return UCOL_EQUAL; // if both strings are at the end, they are equal
8600 } else {
8601 return UCOL_GREATER;
8602 }
8603 }
8604 U_ASSERT(tLen >= 0);
8605 U8_NEXT(target, tIndex, tLen, tChar);
8606 U_ASSERT(tChar >= 0 && tChar <= 0xFF);
8607 tOrder = elements[tChar];
8608 if(tOrder > UCOL_NOT_FOUND) {
8609 tOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_TERTIARY, tOrder, target, &tIndex, tLen);
8610 }
8611 }
8612 if(endOfSource) {
8613 return UCOL_LESS;
8614 }
8615 if(sOrder == tOrder) {
8616 sOrder = 0; tOrder = 0;
8617 continue;
8618 } else {
8619 if(((sOrder^tOrder)&0xff000000)!=0) {
8620 if(sOrder < tOrder) {
8621 return UCOL_LESS;
8622 } else if(sOrder > tOrder) {
8623 return UCOL_GREATER;
8624 }
8625 }
8626 sOrder<<=8;
8627 tOrder<<=8;
8628 }
8629 }
8630 }
8631 return UCOL_EQUAL;
8632 }
8633
8634 U_CAPI UCollationResult U_EXPORT2
ucol_strcollIter(const UCollator * coll,UCharIterator * sIter,UCharIterator * tIter,UErrorCode * status)8635 ucol_strcollIter( const UCollator *coll,
8636 UCharIterator *sIter,
8637 UCharIterator *tIter,
8638 UErrorCode *status)
8639 {
8640 if(!status || U_FAILURE(*status)) {
8641 return UCOL_EQUAL;
8642 }
8643
8644 UTRACE_ENTRY(UTRACE_UCOL_STRCOLLITER);
8645 UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, sIter=%p, tIter=%p", coll, sIter, tIter);
8646
8647 if (sIter == tIter) {
8648 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status)
8649 return UCOL_EQUAL;
8650 }
8651 if(sIter == NULL || tIter == NULL || coll == NULL) {
8652 *status = U_ILLEGAL_ARGUMENT_ERROR;
8653 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status)
8654 return UCOL_EQUAL;
8655 }
8656
8657 UCollationResult result = UCOL_EQUAL;
8658
8659 // Preparing the context objects for iterating over strings
8660 collIterate sColl, tColl;
8661 IInit_collIterate(coll, NULL, -1, &sColl, status);
8662 IInit_collIterate(coll, NULL, -1, &tColl, status);
8663 if(U_FAILURE(*status)) {
8664 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status)
8665 return UCOL_EQUAL;
8666 }
8667 // The division for the array length may truncate the array size to
8668 // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
8669 // for all platforms anyway.
8670 UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
8671 UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
8672 UNormIterator *sNormIter = NULL, *tNormIter = NULL;
8673
8674 sColl.iterator = sIter;
8675 sColl.flags |= UCOL_USE_ITERATOR;
8676 tColl.flags |= UCOL_USE_ITERATOR;
8677 tColl.iterator = tIter;
8678
8679 if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON) {
8680 sNormIter = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), status);
8681 sColl.iterator = unorm_setIter(sNormIter, sIter, UNORM_FCD, status);
8682 sColl.flags &= ~UCOL_ITER_NORM;
8683
8684 tNormIter = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), status);
8685 tColl.iterator = unorm_setIter(tNormIter, tIter, UNORM_FCD, status);
8686 tColl.flags &= ~UCOL_ITER_NORM;
8687 }
8688
8689 UChar32 sChar = U_SENTINEL, tChar = U_SENTINEL;
8690
8691 while((sChar = sColl.iterator->next(sColl.iterator)) ==
8692 (tChar = tColl.iterator->next(tColl.iterator))) {
8693 if(sChar == U_SENTINEL) {
8694 result = UCOL_EQUAL;
8695 goto end_compare;
8696 }
8697 }
8698
8699 if(sChar == U_SENTINEL) {
8700 tChar = tColl.iterator->previous(tColl.iterator);
8701 }
8702
8703 if(tChar == U_SENTINEL) {
8704 sChar = sColl.iterator->previous(sColl.iterator);
8705 }
8706
8707 sChar = sColl.iterator->previous(sColl.iterator);
8708 tChar = tColl.iterator->previous(tColl.iterator);
8709
8710 if (ucol_unsafeCP((UChar)sChar, coll) || ucol_unsafeCP((UChar)tChar, coll))
8711 {
8712 // We are stopped in the middle of a contraction.
8713 // Scan backwards through the == part of the string looking for the start of the contraction.
8714 // It doesn't matter which string we scan, since they are the same in this region.
8715 do
8716 {
8717 sChar = sColl.iterator->previous(sColl.iterator);
8718 tChar = tColl.iterator->previous(tColl.iterator);
8719 }
8720 while (sChar != U_SENTINEL && ucol_unsafeCP((UChar)sChar, coll));
8721 }
8722
8723
8724 if(U_SUCCESS(*status)) {
8725 result = ucol_strcollRegular(&sColl, &tColl, status);
8726 }
8727
8728 end_compare:
8729 if(sNormIter || tNormIter) {
8730 unorm_closeIter(sNormIter);
8731 unorm_closeIter(tNormIter);
8732 }
8733
8734 UTRACE_EXIT_VALUE_STATUS(result, *status)
8735 return result;
8736 }
8737
8738
8739 /* */
8740 /* ucol_strcoll Main public API string comparison function */
8741 /* */
8742 U_CAPI UCollationResult U_EXPORT2
ucol_strcoll(const UCollator * coll,const UChar * source,int32_t sourceLength,const UChar * target,int32_t targetLength)8743 ucol_strcoll( const UCollator *coll,
8744 const UChar *source,
8745 int32_t sourceLength,
8746 const UChar *target,
8747 int32_t targetLength)
8748 {
8749 U_ALIGN_CODE(16);
8750
8751 UTRACE_ENTRY(UTRACE_UCOL_STRCOLL);
8752 if (UTRACE_LEVEL(UTRACE_VERBOSE)) {
8753 UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source=%p, target=%p", coll, source, target);
8754 UTRACE_DATA2(UTRACE_VERBOSE, "source string = %vh ", source, sourceLength);
8755 UTRACE_DATA2(UTRACE_VERBOSE, "target string = %vh ", target, targetLength);
8756 }
8757
8758 if(source == NULL || target == NULL) {
8759 // do not crash, but return. Should have
8760 // status argument to return error.
8761 UTRACE_EXIT_VALUE(UCOL_EQUAL);
8762 return UCOL_EQUAL;
8763 }
8764
8765 /* Quick check if source and target are same strings. */
8766 /* They should either both be NULL terminated or the explicit length should be set on both. */
8767 if (source==target && sourceLength==targetLength) {
8768 UTRACE_EXIT_VALUE(UCOL_EQUAL);
8769 return UCOL_EQUAL;
8770 }
8771
8772 if(coll->delegate != NULL) {
8773 UErrorCode status = U_ZERO_ERROR;
8774 return ((const Collator*)coll->delegate)->compare(source,sourceLength,target,targetLength, status);
8775 }
8776
8777 /* Scan the strings. Find: */
8778 /* The length of any leading portion that is equal */
8779 /* Whether they are exactly equal. (in which case we just return) */
8780 const UChar *pSrc = source;
8781 const UChar *pTarg = target;
8782 int32_t equalLength;
8783
8784 if (sourceLength == -1 && targetLength == -1) {
8785 // Both strings are null terminated.
8786 // Scan through any leading equal portion.
8787 while (*pSrc == *pTarg && *pSrc != 0) {
8788 pSrc++;
8789 pTarg++;
8790 }
8791 if (*pSrc == 0 && *pTarg == 0) {
8792 UTRACE_EXIT_VALUE(UCOL_EQUAL);
8793 return UCOL_EQUAL;
8794 }
8795 equalLength = (int32_t)(pSrc - source);
8796 }
8797 else
8798 {
8799 // One or both strings has an explicit length.
8800 const UChar *pSrcEnd = source + sourceLength;
8801 const UChar *pTargEnd = target + targetLength;
8802
8803 // Scan while the strings are bitwise ==, or until one is exhausted.
8804 for (;;) {
8805 if (pSrc == pSrcEnd || pTarg == pTargEnd) {
8806 break;
8807 }
8808 if ((*pSrc == 0 && sourceLength == -1) || (*pTarg == 0 && targetLength == -1)) {
8809 break;
8810 }
8811 if (*pSrc != *pTarg) {
8812 break;
8813 }
8814 pSrc++;
8815 pTarg++;
8816 }
8817 equalLength = (int32_t)(pSrc - source);
8818
8819 // If we made it all the way through both strings, we are done. They are ==
8820 if ((pSrc ==pSrcEnd || (pSrcEnd <pSrc && *pSrc==0)) && /* At end of src string, however it was specified. */
8821 (pTarg==pTargEnd || (pTargEnd<pTarg && *pTarg==0))) /* and also at end of dest string */
8822 {
8823 UTRACE_EXIT_VALUE(UCOL_EQUAL);
8824 return UCOL_EQUAL;
8825 }
8826 }
8827 if (equalLength > 0) {
8828 /* There is an identical portion at the beginning of the two strings. */
8829 /* If the identical portion ends within a contraction or a comibining */
8830 /* character sequence, back up to the start of that sequence. */
8831
8832 // These values should already be set by the code above.
8833 //pSrc = source + equalLength; /* point to the first differing chars */
8834 //pTarg = target + equalLength;
8835 if ((pSrc != source+sourceLength && ucol_unsafeCP(*pSrc, coll)) ||
8836 (pTarg != target+targetLength && ucol_unsafeCP(*pTarg, coll)))
8837 {
8838 // We are stopped in the middle of a contraction.
8839 // Scan backwards through the == part of the string looking for the start of the contraction.
8840 // It doesn't matter which string we scan, since they are the same in this region.
8841 do
8842 {
8843 equalLength--;
8844 pSrc--;
8845 }
8846 while (equalLength>0 && ucol_unsafeCP(*pSrc, coll));
8847 }
8848
8849 source += equalLength;
8850 target += equalLength;
8851 if (sourceLength > 0) {
8852 sourceLength -= equalLength;
8853 }
8854 if (targetLength > 0) {
8855 targetLength -= equalLength;
8856 }
8857 }
8858
8859 UErrorCode status = U_ZERO_ERROR;
8860 UCollationResult returnVal;
8861 if(!coll->latinOneUse || (sourceLength > 0 && *source&0xff00) || (targetLength > 0 && *target&0xff00)) {
8862 returnVal = ucol_strcollRegular(coll, source, sourceLength, target, targetLength, &status);
8863 } else {
8864 returnVal = ucol_strcollUseLatin1(coll, source, sourceLength, target, targetLength, &status);
8865 }
8866 UTRACE_EXIT_VALUE(returnVal);
8867 return returnVal;
8868 }
8869
8870 U_CAPI UCollationResult U_EXPORT2
ucol_strcollUTF8(const UCollator * coll,const char * source,int32_t sourceLength,const char * target,int32_t targetLength,UErrorCode * status)8871 ucol_strcollUTF8(
8872 const UCollator *coll,
8873 const char *source,
8874 int32_t sourceLength,
8875 const char *target,
8876 int32_t targetLength,
8877 UErrorCode *status)
8878 {
8879 U_ALIGN_CODE(16);
8880
8881 UTRACE_ENTRY(UTRACE_UCOL_STRCOLLUTF8);
8882 if (UTRACE_LEVEL(UTRACE_VERBOSE)) {
8883 UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source=%p, target=%p", coll, source, target);
8884 UTRACE_DATA2(UTRACE_VERBOSE, "source string = %vb ", source, sourceLength);
8885 UTRACE_DATA2(UTRACE_VERBOSE, "target string = %vb ", target, targetLength);
8886 }
8887
8888 if (U_FAILURE(*status)) {
8889 /* do nothing */
8890 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status);
8891 return UCOL_EQUAL;
8892 }
8893
8894 if(source == NULL || target == NULL) {
8895 *status = U_ILLEGAL_ARGUMENT_ERROR;
8896 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status);
8897 return UCOL_EQUAL;
8898 }
8899
8900 /* Quick check if source and target are same strings. */
8901 /* They should either both be NULL terminated or the explicit length should be set on both. */
8902 if (source==target && sourceLength==targetLength) {
8903 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status);
8904 return UCOL_EQUAL;
8905 }
8906
8907 if(coll->delegate != NULL) {
8908 return ((const Collator*)coll->delegate)->compareUTF8(
8909 StringPiece(source, (sourceLength < 0) ? uprv_strlen(source) : sourceLength),
8910 StringPiece(target, (targetLength < 0) ? uprv_strlen(target) : targetLength),
8911 *status);
8912 }
8913
8914 /* Scan the strings. Find: */
8915 /* The length of any leading portion that is equal */
8916 /* Whether they are exactly equal. (in which case we just return) */
8917 const char *pSrc = source;
8918 const char *pTarg = target;
8919 UBool bSrcLimit = FALSE;
8920 UBool bTargLimit = FALSE;
8921
8922 if (sourceLength == -1 && targetLength == -1) {
8923 // Both strings are null terminated.
8924 // Scan through any leading equal portion.
8925 while (*pSrc == *pTarg && *pSrc != 0) {
8926 pSrc++;
8927 pTarg++;
8928 }
8929 if (*pSrc == 0 && *pTarg == 0) {
8930 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status);
8931 return UCOL_EQUAL;
8932 }
8933 bSrcLimit = (*pSrc == 0);
8934 bTargLimit = (*pTarg == 0);
8935 }
8936 else
8937 {
8938 // One or both strings has an explicit length.
8939 const char *pSrcEnd = source + sourceLength;
8940 const char *pTargEnd = target + targetLength;
8941
8942 // Scan while the strings are bitwise ==, or until one is exhausted.
8943 for (;;) {
8944 if (pSrc == pSrcEnd || pTarg == pTargEnd) {
8945 break;
8946 }
8947 if ((*pSrc == 0 && sourceLength == -1) || (*pTarg == 0 && targetLength == -1)) {
8948 break;
8949 }
8950 if (*pSrc != *pTarg) {
8951 break;
8952 }
8953 pSrc++;
8954 pTarg++;
8955 }
8956 bSrcLimit = (pSrc ==pSrcEnd || (pSrcEnd <pSrc && *pSrc==0));
8957 bTargLimit = (pTarg==pTargEnd || (pTargEnd<pTarg && *pTarg==0));
8958
8959 // If we made it all the way through both strings, we are done. They are ==
8960 if (bSrcLimit && /* At end of src string, however it was specified. */
8961 bTargLimit) /* and also at end of dest string */
8962 {
8963 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status);
8964 return UCOL_EQUAL;
8965 }
8966 }
8967
8968 U_ASSERT(!(bSrcLimit && bTargLimit));
8969
8970 int32_t equalLength = pSrc - source;
8971 UBool bSawNonLatin1 = FALSE;
8972
8973 if (equalLength > 0) {
8974 // Align position to the start of UTF-8 code point.
8975 if (bTargLimit) {
8976 U8_SET_CP_START((const uint8_t*)source, 0, equalLength);
8977 } else {
8978 U8_SET_CP_START((const uint8_t*)target, 0, equalLength);
8979 }
8980 pSrc = source + equalLength;
8981 pTarg = target + equalLength;
8982 }
8983
8984 if (equalLength > 0) {
8985 /* There is an identical portion at the beginning of the two strings. */
8986 /* If the identical portion ends within a contraction or a comibining */
8987 /* character sequence, back up to the start of that sequence. */
8988 UBool bUnsafeCP = FALSE;
8989 UChar32 uc32 = -1;
8990
8991 if (!bSrcLimit) {
8992 if (sourceLength >= 0) {
8993 U8_GET((uint8_t*)source, 0, equalLength, sourceLength, uc32);
8994 } else {
8995 U8_GET_NULLTERM((uint8_t*)source, 0, equalLength, uc32);
8996 }
8997 if (uc32 == -1) {
8998 uc32 = 0xfffd;
8999 bSawNonLatin1 |= TRUE;
9000 } else {
9001 if (uc32 >= 0x10000 || ucol_unsafeCP((UChar)uc32, coll)) {
9002 bUnsafeCP = TRUE;
9003 }
9004 bSawNonLatin1 |= (uc32 > 0xff);
9005 }
9006 }
9007 if (!bTargLimit) {
9008 if (targetLength >= 0) {
9009 U8_GET((uint8_t*)target, 0, equalLength, targetLength, uc32);
9010 } else {
9011 U8_GET_NULLTERM((uint8_t*)target, 0, equalLength, uc32);
9012 }
9013 if (uc32 == -1) {
9014 uc32 = 0xfffd;
9015 bSawNonLatin1 |= TRUE;
9016 } else {
9017 if (uc32 >= 0x10000 || ucol_unsafeCP((UChar)uc32, coll)) {
9018 bUnsafeCP = TRUE;
9019 }
9020 bSawNonLatin1 |= (uc32 > 0xff);
9021 }
9022 }
9023
9024 if (bUnsafeCP) {
9025 while (equalLength > 0) {
9026 // We are stopped in the middle of a contraction.
9027 // Scan backwards through the == part of the string looking for the start of the contraction.
9028 // It doesn't matter which string we scan, since they are the same in this region.
9029 U8_PREV((uint8_t*)source, 0, equalLength, uc32);
9030 bSawNonLatin1 |= (uc32 > 0xff);
9031 if (uc32 < 0x10000 && !ucol_unsafeCP((UChar)uc32, coll)) {
9032 break;
9033 }
9034 }
9035 }
9036 source += equalLength;
9037 target += equalLength;
9038 if (sourceLength > 0) {
9039 sourceLength -= equalLength;
9040 }
9041 if (targetLength > 0) {
9042 targetLength -= equalLength;
9043 }
9044 } else {
9045 // Lead byte of Latin 1 character is 0x00 - 0xC3
9046 bSawNonLatin1 = (source && (sourceLength != 0) && (uint8_t)*source > 0xc3);
9047 bSawNonLatin1 |= (target && (targetLength != 0) && (uint8_t)*target > 0xc3);
9048 }
9049
9050 UCollationResult returnVal;
9051
9052 if(!coll->latinOneUse || bSawNonLatin1) {
9053 returnVal = ucol_strcollRegularUTF8(coll, source, sourceLength, target, targetLength, status);
9054 } else {
9055 returnVal = ucol_strcollUseLatin1UTF8(coll, source, sourceLength, target, targetLength, status);
9056 }
9057 UTRACE_EXIT_VALUE_STATUS(returnVal, *status);
9058 return returnVal;
9059 }
9060
9061
9062 /* convenience function for comparing strings */
9063 U_CAPI UBool U_EXPORT2
ucol_greater(const UCollator * coll,const UChar * source,int32_t sourceLength,const UChar * target,int32_t targetLength)9064 ucol_greater( const UCollator *coll,
9065 const UChar *source,
9066 int32_t sourceLength,
9067 const UChar *target,
9068 int32_t targetLength)
9069 {
9070 return (ucol_strcoll(coll, source, sourceLength, target, targetLength)
9071 == UCOL_GREATER);
9072 }
9073
9074 /* convenience function for comparing strings */
9075 U_CAPI UBool U_EXPORT2
ucol_greaterOrEqual(const UCollator * coll,const UChar * source,int32_t sourceLength,const UChar * target,int32_t targetLength)9076 ucol_greaterOrEqual( const UCollator *coll,
9077 const UChar *source,
9078 int32_t sourceLength,
9079 const UChar *target,
9080 int32_t targetLength)
9081 {
9082 return (ucol_strcoll(coll, source, sourceLength, target, targetLength)
9083 != UCOL_LESS);
9084 }
9085
9086 /* convenience function for comparing strings */
9087 U_CAPI UBool U_EXPORT2
ucol_equal(const UCollator * coll,const UChar * source,int32_t sourceLength,const UChar * target,int32_t targetLength)9088 ucol_equal( const UCollator *coll,
9089 const UChar *source,
9090 int32_t sourceLength,
9091 const UChar *target,
9092 int32_t targetLength)
9093 {
9094 return (ucol_strcoll(coll, source, sourceLength, target, targetLength)
9095 == UCOL_EQUAL);
9096 }
9097
9098 U_CAPI void U_EXPORT2
ucol_getUCAVersion(const UCollator * coll,UVersionInfo info)9099 ucol_getUCAVersion(const UCollator* coll, UVersionInfo info) {
9100 if(coll && coll->UCA) {
9101 uprv_memcpy(info, coll->UCA->image->UCAVersion, sizeof(UVersionInfo));
9102 }
9103 }
9104
9105 #endif /* #if !UCONFIG_NO_COLLATION */
9106