• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 *******************************************************************************
3 *   Copyright (C) 1996-2011, International Business Machines
4 *   Corporation and others.  All Rights Reserved.
5 *******************************************************************************
6 *   file name:  ucol.cpp
7 *   encoding:   US-ASCII
8 *   tab size:   8 (not used)
9 *   indentation:4
10 *
11 * Modification history
12 * Date        Name      Comments
13 * 1996-1999   various members of ICU team maintained C API for collation framework
14 * 02/16/2001  synwee    Added internal method getPrevSpecialCE
15 * 03/01/2001  synwee    Added maxexpansion functionality.
16 * 03/16/2001  weiv      Collation framework is rewritten in C and made UCA compliant
17 */
18 
19 #include "unicode/utypes.h"
20 
21 #if !UCONFIG_NO_COLLATION
22 
23 #include "unicode/coleitr.h"
24 #include "unicode/unorm.h"
25 #include "unicode/udata.h"
26 #include "unicode/ustring.h"
27 
28 #include "ucol_imp.h"
29 #include "bocsu.h"
30 
31 #include "normalizer2impl.h"
32 #include "unorm_it.h"
33 #include "umutex.h"
34 #include "cmemory.h"
35 #include "ucln_in.h"
36 #include "cstring.h"
37 #include "utracimp.h"
38 #include "putilimp.h"
39 #include "uassert.h"
40 
41 #ifdef UCOL_DEBUG
42 #include <stdio.h>
43 #endif
44 
45 U_NAMESPACE_USE
46 
47 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
48 
49 #define LAST_BYTE_MASK_           0xFF
50 #define SECOND_LAST_BYTE_SHIFT_   8
51 
52 #define ZERO_CC_LIMIT_            0xC0
53 
54 // this is static pointer to the normalizer fcdTrieIndex
55 // it is always the same between calls to u_cleanup
56 // and therefore writing to it is not synchronized.
57 // It is cleaned in ucol_cleanup
58 static const uint16_t *fcdTrieIndex=NULL;
59 // Code points at fcdHighStart and above have a zero FCD value.
60 static UChar32 fcdHighStart = 0;
61 
62 // These are values from UCA required for
63 // implicit generation and suppressing sort key compression
64 // they should regularly be in the UCA, but if one
65 // is running without UCA, it could be a problem
66 static const int32_t maxRegularPrimary  = 0x7A;
67 static const int32_t minImplicitPrimary = 0xE0;
68 static const int32_t maxImplicitPrimary = 0xE4;
69 
70 U_CDECL_BEGIN
71 static UBool U_CALLCONV
ucol_cleanup(void)72 ucol_cleanup(void)
73 {
74     fcdTrieIndex = NULL;
75     return TRUE;
76 }
77 
78 static int32_t U_CALLCONV
_getFoldingOffset(uint32_t data)79 _getFoldingOffset(uint32_t data) {
80     return (int32_t)(data&0xFFFFFF);
81 }
82 
83 U_CDECL_END
84 
85 // init FCD data
86 static inline
initializeFCD(UErrorCode * status)87 UBool initializeFCD(UErrorCode *status) {
88     if (fcdTrieIndex != NULL) {
89         return TRUE;
90     } else {
91         // The result is constant, until the library is reloaded.
92         fcdTrieIndex = unorm_getFCDTrieIndex(fcdHighStart, status);
93         ucln_i18n_registerCleanup(UCLN_I18N_UCOL, ucol_cleanup);
94         return U_SUCCESS(*status);
95     }
96 }
97 
98 static
IInit_collIterate(const UCollator * collator,const UChar * sourceString,int32_t sourceLen,collIterate * s,UErrorCode * status)99 inline void IInit_collIterate(const UCollator *collator, const UChar *sourceString,
100                               int32_t sourceLen, collIterate *s,
101                               UErrorCode *status)
102 {
103     (s)->string = (s)->pos = sourceString;
104     (s)->origFlags = 0;
105     (s)->flags = 0;
106     if (sourceLen >= 0) {
107         s->flags |= UCOL_ITER_HASLEN;
108         (s)->endp = (UChar *)sourceString+sourceLen;
109     }
110     else {
111         /* change to enable easier checking for end of string for fcdpositon */
112         (s)->endp = NULL;
113     }
114     (s)->extendCEs = NULL;
115     (s)->extendCEsSize = 0;
116     (s)->CEpos = (s)->toReturn = (s)->CEs;
117     (s)->offsetBuffer = NULL;
118     (s)->offsetBufferSize = 0;
119     (s)->offsetReturn = (s)->offsetStore = NULL;
120     (s)->offsetRepeatCount = (s)->offsetRepeatValue = 0;
121     (s)->coll = (collator);
122     (s)->nfd = Normalizer2Factory::getNFDInstance(*status);
123     (s)->fcdPosition = 0;
124     if(collator->normalizationMode == UCOL_ON) {
125         (s)->flags |= UCOL_ITER_NORM;
126     }
127     if(collator->hiraganaQ == UCOL_ON && collator->strength >= UCOL_QUATERNARY) {
128         (s)->flags |= UCOL_HIRAGANA_Q;
129     }
130     (s)->iterator = NULL;
131     //(s)->iteratorIndex = 0;
132 }
133 
134 U_CAPI void  U_EXPORT2
uprv_init_collIterate(const UCollator * collator,const UChar * sourceString,int32_t sourceLen,collIterate * s,UErrorCode * status)135 uprv_init_collIterate(const UCollator *collator, const UChar *sourceString,
136                              int32_t sourceLen, collIterate *s,
137                              UErrorCode *status) {
138     /* Out-of-line version for use from other files. */
139     IInit_collIterate(collator, sourceString, sourceLen, s, status);
140 }
141 
142 U_CAPI collIterate * U_EXPORT2
uprv_new_collIterate(UErrorCode * status)143 uprv_new_collIterate(UErrorCode *status) {
144     if(U_FAILURE(*status)) {
145         return NULL;
146     }
147     collIterate *s = new collIterate;
148     if(s == NULL) {
149         *status = U_MEMORY_ALLOCATION_ERROR;
150         return NULL;
151     }
152     return s;
153 }
154 
155 U_CAPI void U_EXPORT2
uprv_delete_collIterate(collIterate * s)156 uprv_delete_collIterate(collIterate *s) {
157     delete s;
158 }
159 
160 U_CAPI UBool U_EXPORT2
uprv_collIterateAtEnd(collIterate * s)161 uprv_collIterateAtEnd(collIterate *s) {
162     return s == NULL || s->pos == s->endp;
163 }
164 
165 /**
166 * Backup the state of the collIterate struct data
167 * @param data collIterate to backup
168 * @param backup storage
169 */
170 static
backupState(const collIterate * data,collIterateState * backup)171 inline void backupState(const collIterate *data, collIterateState *backup)
172 {
173     backup->fcdPosition = data->fcdPosition;
174     backup->flags       = data->flags;
175     backup->origFlags   = data->origFlags;
176     backup->pos         = data->pos;
177     backup->bufferaddress = data->writableBuffer.getBuffer();
178     backup->buffersize    = data->writableBuffer.length();
179     backup->iteratorMove = 0;
180     backup->iteratorIndex = 0;
181     if(data->iterator != NULL) {
182         //backup->iteratorIndex = data->iterator->getIndex(data->iterator, UITER_CURRENT);
183         backup->iteratorIndex = data->iterator->getState(data->iterator);
184         // no we try to fixup if we're using a normalizing iterator and we get UITER_NO_STATE
185         if(backup->iteratorIndex == UITER_NO_STATE) {
186             while((backup->iteratorIndex = data->iterator->getState(data->iterator)) == UITER_NO_STATE) {
187                 backup->iteratorMove++;
188                 data->iterator->move(data->iterator, -1, UITER_CURRENT);
189             }
190             data->iterator->move(data->iterator, backup->iteratorMove, UITER_CURRENT);
191         }
192     }
193 }
194 
195 /**
196 * Loads the state into the collIterate struct data
197 * @param data collIterate to backup
198 * @param backup storage
199 * @param forwards boolean to indicate if forwards iteration is used,
200 *        false indicates backwards iteration
201 */
202 static
loadState(collIterate * data,const collIterateState * backup,UBool forwards)203 inline void loadState(collIterate *data, const collIterateState *backup,
204                       UBool        forwards)
205 {
206     UErrorCode status = U_ZERO_ERROR;
207     data->flags       = backup->flags;
208     data->origFlags   = backup->origFlags;
209     if(data->iterator != NULL) {
210         //data->iterator->move(data->iterator, backup->iteratorIndex, UITER_ZERO);
211         data->iterator->setState(data->iterator, backup->iteratorIndex, &status);
212         if(backup->iteratorMove != 0) {
213             data->iterator->move(data->iterator, backup->iteratorMove, UITER_CURRENT);
214         }
215     }
216     data->pos         = backup->pos;
217 
218     if ((data->flags & UCOL_ITER_INNORMBUF) &&
219         data->writableBuffer.getBuffer() != backup->bufferaddress) {
220         /*
221         this is when a new buffer has been reallocated and we'll have to
222         calculate the new position.
223         note the new buffer has to contain the contents of the old buffer.
224         */
225         if (forwards) {
226             data->pos = data->writableBuffer.getTerminatedBuffer() +
227                                          (data->pos - backup->bufferaddress);
228         }
229         else {
230             /* backwards direction */
231             int32_t temp = backup->buffersize -
232                                   (int32_t)(data->pos - backup->bufferaddress);
233             data->pos = data->writableBuffer.getTerminatedBuffer() + (data->writableBuffer.length() - temp);
234         }
235     }
236     if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
237         /*
238         this is alittle tricky.
239         if we are initially not in the normalization buffer, even if we
240         normalize in the later stage, the data in the buffer will be
241         ignored, since we skip back up to the data string.
242         however if we are already in the normalization buffer, any
243         further normalization will pull data into the normalization
244         buffer and modify the fcdPosition.
245         since we are keeping the data in the buffer for use, the
246         fcdPosition can not be reverted back.
247         arrgghh....
248         */
249         data->fcdPosition = backup->fcdPosition;
250     }
251 }
252 
253 static UBool
reallocCEs(collIterate * data,int32_t newCapacity)254 reallocCEs(collIterate *data, int32_t newCapacity) {
255     uint32_t *oldCEs = data->extendCEs;
256     if(oldCEs == NULL) {
257         oldCEs = data->CEs;
258     }
259     int32_t length = data->CEpos - oldCEs;
260     uint32_t *newCEs = (uint32_t *)uprv_malloc(newCapacity * 4);
261     if(newCEs == NULL) {
262         return FALSE;
263     }
264     uprv_memcpy(newCEs, oldCEs, length * 4);
265     uprv_free(data->extendCEs);
266     data->extendCEs = newCEs;
267     data->extendCEsSize = newCapacity;
268     data->CEpos = newCEs + length;
269     return TRUE;
270 }
271 
272 static UBool
increaseCEsCapacity(collIterate * data)273 increaseCEsCapacity(collIterate *data) {
274     int32_t oldCapacity;
275     if(data->extendCEs != NULL) {
276         oldCapacity = data->extendCEsSize;
277     } else {
278         oldCapacity = LENGTHOF(data->CEs);
279     }
280     return reallocCEs(data, 2 * oldCapacity);
281 }
282 
283 static UBool
ensureCEsCapacity(collIterate * data,int32_t minCapacity)284 ensureCEsCapacity(collIterate *data, int32_t minCapacity) {
285     int32_t oldCapacity;
286     if(data->extendCEs != NULL) {
287         oldCapacity = data->extendCEsSize;
288     } else {
289         oldCapacity = LENGTHOF(data->CEs);
290     }
291     if(minCapacity <= oldCapacity) {
292         return TRUE;
293     }
294     oldCapacity *= 2;
295     return reallocCEs(data, minCapacity > oldCapacity ? minCapacity : oldCapacity);
296 }
297 
appendOffset(int32_t offset,UErrorCode & errorCode)298 void collIterate::appendOffset(int32_t offset, UErrorCode &errorCode) {
299     if(U_FAILURE(errorCode)) {
300         return;
301     }
302     int32_t length = offsetStore == NULL ? 0 : (int32_t)(offsetStore - offsetBuffer);
303     if(length >= offsetBufferSize) {
304         int32_t newCapacity = 2 * offsetBufferSize + UCOL_EXPAND_CE_BUFFER_SIZE;
305         int32_t *newBuffer = reinterpret_cast<int32_t *>(uprv_malloc(newCapacity * 4));
306         if(newBuffer == NULL) {
307             errorCode = U_MEMORY_ALLOCATION_ERROR;
308             return;
309         }
310         if(length > 0) {
311             uprv_memcpy(newBuffer, offsetBuffer, length * 4);
312         }
313         uprv_free(offsetBuffer);
314         offsetBuffer = newBuffer;
315         offsetStore = offsetBuffer + length;
316         offsetBufferSize = newCapacity;
317     }
318     *offsetStore++ = offset;
319 }
320 
321 /*
322 * collIter_eos()
323 *     Checks for a collIterate being positioned at the end of
324 *     its source string.
325 *
326 */
327 static
collIter_eos(collIterate * s)328 inline UBool collIter_eos(collIterate *s) {
329     if(s->flags & UCOL_USE_ITERATOR) {
330       return !(s->iterator->hasNext(s->iterator));
331     }
332     if ((s->flags & UCOL_ITER_HASLEN) == 0 && *s->pos != 0) {
333         // Null terminated string, but not at null, so not at end.
334         //   Whether in main or normalization buffer doesn't matter.
335         return FALSE;
336     }
337 
338     // String with length.  Can't be in normalization buffer, which is always
339     //  null termintated.
340     if (s->flags & UCOL_ITER_HASLEN) {
341         return (s->pos == s->endp);
342     }
343 
344     // We are at a null termination, could be either normalization buffer or main string.
345     if ((s->flags & UCOL_ITER_INNORMBUF) == 0) {
346         // At null at end of main string.
347         return TRUE;
348     }
349 
350     // At null at end of normalization buffer.  Need to check whether there there are
351     //   any characters left in the main buffer.
352     if(s->origFlags & UCOL_USE_ITERATOR) {
353       return !(s->iterator->hasNext(s->iterator));
354     } else if ((s->origFlags & UCOL_ITER_HASLEN) == 0) {
355         // Null terminated main string.  fcdPosition is the 'return' position into main buf.
356         return (*s->fcdPosition == 0);
357     }
358     else {
359         // Main string with an end pointer.
360         return s->fcdPosition == s->endp;
361     }
362 }
363 
364 /*
365 * collIter_bos()
366 *     Checks for a collIterate being positioned at the start of
367 *     its source string.
368 *
369 */
370 static
collIter_bos(collIterate * source)371 inline UBool collIter_bos(collIterate *source) {
372   // if we're going backwards, we need to know whether there is more in the
373   // iterator, even if we are in the side buffer
374   if(source->flags & UCOL_USE_ITERATOR || source->origFlags & UCOL_USE_ITERATOR) {
375     return !source->iterator->hasPrevious(source->iterator);
376   }
377   if (source->pos <= source->string ||
378       ((source->flags & UCOL_ITER_INNORMBUF) &&
379       *(source->pos - 1) == 0 && source->fcdPosition == NULL)) {
380     return TRUE;
381   }
382   return FALSE;
383 }
384 
385 /*static
386 inline UBool collIter_SimpleBos(collIterate *source) {
387   // if we're going backwards, we need to know whether there is more in the
388   // iterator, even if we are in the side buffer
389   if(source->flags & UCOL_USE_ITERATOR || source->origFlags & UCOL_USE_ITERATOR) {
390     return !source->iterator->hasPrevious(source->iterator);
391   }
392   if (source->pos == source->string) {
393     return TRUE;
394   }
395   return FALSE;
396 }*/
397     //return (data->pos == data->string) ||
398 
399 
400 /****************************************************************************/
401 /* Following are the open/close functions                                   */
402 /*                                                                          */
403 /****************************************************************************/
404 
405 static UCollator*
ucol_initFromBinary(const uint8_t * bin,int32_t length,const UCollator * base,UCollator * fillIn,UErrorCode * status)406 ucol_initFromBinary(const uint8_t *bin, int32_t length,
407                 const UCollator *base,
408                 UCollator *fillIn,
409                 UErrorCode *status)
410 {
411     UCollator *result = fillIn;
412     if(U_FAILURE(*status)) {
413         return NULL;
414     }
415     /*
416     if(base == NULL) {
417         // we don't support null base yet
418         *status = U_ILLEGAL_ARGUMENT_ERROR;
419         return NULL;
420     }
421     */
422     // We need these and we could be running without UCA
423     uprv_uca_initImplicitConstants(status);
424     UCATableHeader *colData = (UCATableHeader *)bin;
425     // do we want version check here? We're trying to figure out whether collators are compatible
426     if((base && (uprv_memcmp(colData->UCAVersion, base->image->UCAVersion, sizeof(UVersionInfo)) != 0 ||
427         uprv_memcmp(colData->UCDVersion, base->image->UCDVersion, sizeof(UVersionInfo)) != 0)) ||
428         colData->version[0] != UCOL_BUILDER_VERSION)
429     {
430         *status = U_COLLATOR_VERSION_MISMATCH;
431         return NULL;
432     }
433     else {
434         if((uint32_t)length > (paddedsize(sizeof(UCATableHeader)) + paddedsize(sizeof(UColOptionSet)))) {
435             result = ucol_initCollator((const UCATableHeader *)bin, result, base, status);
436             if(U_FAILURE(*status)){
437                 return NULL;
438             }
439             result->hasRealData = TRUE;
440         }
441         else {
442             if(base) {
443                 result = ucol_initCollator(base->image, result, base, status);
444                 ucol_setOptionsFromHeader(result, (UColOptionSet *)(bin+((const UCATableHeader *)bin)->options), status);
445                 if(U_FAILURE(*status)){
446                     return NULL;
447                 }
448                 result->hasRealData = FALSE;
449             }
450             else {
451                 *status = U_USELESS_COLLATOR_ERROR;
452                 return NULL;
453             }
454         }
455         result->freeImageOnClose = FALSE;
456     }
457     result->actualLocale = NULL;
458     result->validLocale = NULL;
459     result->requestedLocale = NULL;
460     result->rules = NULL;
461     result->rulesLength = 0;
462     result->freeRulesOnClose = FALSE;
463     result->ucaRules = NULL;
464     return result;
465 }
466 
467 U_CAPI UCollator* U_EXPORT2
ucol_openBinary(const uint8_t * bin,int32_t length,const UCollator * base,UErrorCode * status)468 ucol_openBinary(const uint8_t *bin, int32_t length,
469                 const UCollator *base,
470                 UErrorCode *status)
471 {
472     return ucol_initFromBinary(bin, length, base, NULL, status);
473 }
474 
475 U_CAPI int32_t U_EXPORT2
ucol_cloneBinary(const UCollator * coll,uint8_t * buffer,int32_t capacity,UErrorCode * status)476 ucol_cloneBinary(const UCollator *coll,
477                  uint8_t *buffer, int32_t capacity,
478                  UErrorCode *status)
479 {
480     int32_t length = 0;
481     if(U_FAILURE(*status)) {
482         return length;
483     }
484     if(capacity < 0) {
485         *status = U_ILLEGAL_ARGUMENT_ERROR;
486         return length;
487     }
488     if(coll->hasRealData == TRUE) {
489         length = coll->image->size;
490         if(length <= capacity) {
491             uprv_memcpy(buffer, coll->image, length);
492         } else {
493             *status = U_BUFFER_OVERFLOW_ERROR;
494         }
495     } else {
496         length = (int32_t)(paddedsize(sizeof(UCATableHeader))+paddedsize(sizeof(UColOptionSet)));
497         if(length <= capacity) {
498             /* build the UCATableHeader with minimal entries */
499             /* do not copy the header from the UCA file because its values are wrong! */
500             /* uprv_memcpy(result, UCA->image, sizeof(UCATableHeader)); */
501 
502             /* reset everything */
503             uprv_memset(buffer, 0, length);
504 
505             /* set the tailoring-specific values */
506             UCATableHeader *myData = (UCATableHeader *)buffer;
507             myData->size = length;
508 
509             /* offset for the options, the only part of the data that is present after the header */
510             myData->options = sizeof(UCATableHeader);
511 
512             /* need to always set the expansion value for an upper bound of the options */
513             myData->expansion = myData->options + sizeof(UColOptionSet);
514 
515             myData->magic = UCOL_HEADER_MAGIC;
516             myData->isBigEndian = U_IS_BIG_ENDIAN;
517             myData->charSetFamily = U_CHARSET_FAMILY;
518 
519             /* copy UCA's version; genrb will override all but the builder version with tailoring data */
520             uprv_memcpy(myData->version, coll->image->version, sizeof(UVersionInfo));
521 
522             uprv_memcpy(myData->UCAVersion, coll->image->UCAVersion, sizeof(UVersionInfo));
523             uprv_memcpy(myData->UCDVersion, coll->image->UCDVersion, sizeof(UVersionInfo));
524             uprv_memcpy(myData->formatVersion, coll->image->formatVersion, sizeof(UVersionInfo));
525             myData->jamoSpecial = coll->image->jamoSpecial;
526 
527             /* copy the collator options */
528             uprv_memcpy(buffer+paddedsize(sizeof(UCATableHeader)), coll->options, sizeof(UColOptionSet));
529         } else {
530             *status = U_BUFFER_OVERFLOW_ERROR;
531         }
532     }
533     return length;
534 }
535 
536 U_CAPI UCollator* U_EXPORT2
ucol_safeClone(const UCollator * coll,void * stackBuffer,int32_t * pBufferSize,UErrorCode * status)537 ucol_safeClone(const UCollator *coll, void *stackBuffer, int32_t * pBufferSize, UErrorCode *status)
538 {
539     UCollator * localCollator;
540     int32_t bufferSizeNeeded = (int32_t)sizeof(UCollator);
541     char *stackBufferChars = (char *)stackBuffer;
542     int32_t imageSize = 0;
543     int32_t rulesSize = 0;
544     int32_t rulesPadding = 0;
545     uint8_t *image;
546     UChar *rules;
547     UBool colAllocated = FALSE;
548     UBool imageAllocated = FALSE;
549 
550     if (status == NULL || U_FAILURE(*status)){
551         return 0;
552     }
553     if ((stackBuffer && !pBufferSize) || !coll){
554        *status = U_ILLEGAL_ARGUMENT_ERROR;
555         return 0;
556     }
557     if (coll->rules && coll->freeRulesOnClose) {
558         rulesSize = (int32_t)(coll->rulesLength + 1)*sizeof(UChar);
559         rulesPadding = (int32_t)(bufferSizeNeeded % sizeof(UChar));
560         bufferSizeNeeded += rulesSize + rulesPadding;
561     }
562 
563     if (stackBuffer && *pBufferSize <= 0){ /* 'preflighting' request - set needed size into *pBufferSize */
564         *pBufferSize =  bufferSizeNeeded;
565         return 0;
566     }
567 
568     /* Pointers on 64-bit platforms need to be aligned
569      * on a 64-bit boundry in memory.
570      */
571     if (U_ALIGNMENT_OFFSET(stackBuffer) != 0) {
572         int32_t offsetUp = (int32_t)U_ALIGNMENT_OFFSET_UP(stackBufferChars);
573         if (*pBufferSize > offsetUp) {
574             *pBufferSize -= offsetUp;
575             stackBufferChars += offsetUp;
576         }
577         else {
578             /* prevent using the stack buffer but keep the size > 0 so that we do not just preflight */
579             *pBufferSize = 1;
580         }
581     }
582     stackBuffer = (void *)stackBufferChars;
583 
584     if (stackBuffer == NULL || *pBufferSize < bufferSizeNeeded) {
585         /* allocate one here...*/
586         stackBufferChars = (char *)uprv_malloc(bufferSizeNeeded);
587         // Null pointer check.
588         if (stackBufferChars == NULL) {
589             *status = U_MEMORY_ALLOCATION_ERROR;
590             return NULL;
591         }
592         colAllocated = TRUE;
593         if (U_SUCCESS(*status)) {
594             *status = U_SAFECLONE_ALLOCATED_WARNING;
595         }
596     }
597     localCollator = (UCollator *)stackBufferChars;
598     rules = (UChar *)(stackBufferChars + sizeof(UCollator) + rulesPadding);
599     {
600         UErrorCode tempStatus = U_ZERO_ERROR;
601         imageSize = ucol_cloneBinary(coll, NULL, 0, &tempStatus);
602     }
603     if (coll->freeImageOnClose) {
604         image = (uint8_t *)uprv_malloc(imageSize);
605         // Null pointer check
606         if (image == NULL) {
607             *status = U_MEMORY_ALLOCATION_ERROR;
608             return NULL;
609         }
610         ucol_cloneBinary(coll, image, imageSize, status);
611         imageAllocated = TRUE;
612     }
613     else {
614         image = (uint8_t *)coll->image;
615     }
616     localCollator = ucol_initFromBinary(image, imageSize, coll->UCA, localCollator, status);
617     if (U_FAILURE(*status)) {
618         return NULL;
619     }
620 
621     if (coll->rules) {
622         if (coll->freeRulesOnClose) {
623             localCollator->rules = u_strcpy(rules, coll->rules);
624             //bufferEnd += rulesSize;
625         }
626         else {
627             localCollator->rules = coll->rules;
628         }
629         localCollator->freeRulesOnClose = FALSE;
630         localCollator->rulesLength = coll->rulesLength;
631     }
632 
633     int32_t i;
634     for(i = 0; i < UCOL_ATTRIBUTE_COUNT; i++) {
635         ucol_setAttribute(localCollator, (UColAttribute)i, ucol_getAttribute(coll, (UColAttribute)i, status), status);
636     }
637     // zero copies of pointers
638     localCollator->actualLocale = NULL;
639     localCollator->validLocale = NULL;
640     localCollator->requestedLocale = NULL;
641     localCollator->ucaRules = coll->ucaRules; // There should only be one copy here.
642     localCollator->freeOnClose = colAllocated;
643     localCollator->freeImageOnClose = imageAllocated;
644     return localCollator;
645 }
646 
647 U_CAPI void U_EXPORT2
ucol_close(UCollator * coll)648 ucol_close(UCollator *coll)
649 {
650     UTRACE_ENTRY_OC(UTRACE_UCOL_CLOSE);
651     UTRACE_DATA1(UTRACE_INFO, "coll = %p", coll);
652     if(coll != NULL) {
653         // these are always owned by each UCollator struct,
654         // so we always free them
655         if(coll->validLocale != NULL) {
656             uprv_free(coll->validLocale);
657         }
658         if(coll->actualLocale != NULL) {
659             uprv_free(coll->actualLocale);
660         }
661         if(coll->requestedLocale != NULL) {
662             uprv_free(coll->requestedLocale);
663         }
664         if(coll->latinOneCEs != NULL) {
665             uprv_free(coll->latinOneCEs);
666         }
667         if(coll->options != NULL && coll->freeOptionsOnClose) {
668             uprv_free(coll->options);
669         }
670         if(coll->rules != NULL && coll->freeRulesOnClose) {
671             uprv_free((UChar *)coll->rules);
672         }
673         if(coll->image != NULL && coll->freeImageOnClose) {
674             uprv_free((UCATableHeader *)coll->image);
675         }
676         if(coll->leadBytePermutationTable != NULL) {
677             uprv_free(coll->leadBytePermutationTable);
678         }
679         if(coll->reorderCodes != NULL) {
680             uprv_free(coll->reorderCodes);
681         }
682 
683         /* Here, it would be advisable to close: */
684         /* - UData for UCA (unless we stuff it in the root resb */
685         /* Again, do we need additional housekeeping... HMMM! */
686         UTRACE_DATA1(UTRACE_INFO, "coll->freeOnClose: %d", coll->freeOnClose);
687         if(coll->freeOnClose){
688             /* for safeClone, if freeOnClose is FALSE,
689             don't free the other instance data */
690             uprv_free(coll);
691         }
692     }
693     UTRACE_EXIT();
694 }
695 
696 /* This one is currently used by genrb & tests. After constructing from rules (tailoring),*/
697 /* you should be able to get the binary chunk to write out...  Doesn't look very full now */
698 U_CFUNC uint8_t* U_EXPORT2
ucol_cloneRuleData(const UCollator * coll,int32_t * length,UErrorCode * status)699 ucol_cloneRuleData(const UCollator *coll, int32_t *length, UErrorCode *status)
700 {
701     uint8_t *result = NULL;
702     if(U_FAILURE(*status)) {
703         return NULL;
704     }
705     if(coll->hasRealData == TRUE) {
706         *length = coll->image->size;
707         result = (uint8_t *)uprv_malloc(*length);
708         /* test for NULL */
709         if (result == NULL) {
710             *status = U_MEMORY_ALLOCATION_ERROR;
711             return NULL;
712         }
713         uprv_memcpy(result, coll->image, *length);
714     } else {
715         *length = (int32_t)(paddedsize(sizeof(UCATableHeader))+paddedsize(sizeof(UColOptionSet)));
716         result = (uint8_t *)uprv_malloc(*length);
717         /* test for NULL */
718         if (result == NULL) {
719             *status = U_MEMORY_ALLOCATION_ERROR;
720             return NULL;
721         }
722 
723         /* build the UCATableHeader with minimal entries */
724         /* do not copy the header from the UCA file because its values are wrong! */
725         /* uprv_memcpy(result, UCA->image, sizeof(UCATableHeader)); */
726 
727         /* reset everything */
728         uprv_memset(result, 0, *length);
729 
730         /* set the tailoring-specific values */
731         UCATableHeader *myData = (UCATableHeader *)result;
732         myData->size = *length;
733 
734         /* offset for the options, the only part of the data that is present after the header */
735         myData->options = sizeof(UCATableHeader);
736 
737         /* need to always set the expansion value for an upper bound of the options */
738         myData->expansion = myData->options + sizeof(UColOptionSet);
739 
740         myData->magic = UCOL_HEADER_MAGIC;
741         myData->isBigEndian = U_IS_BIG_ENDIAN;
742         myData->charSetFamily = U_CHARSET_FAMILY;
743 
744         /* copy UCA's version; genrb will override all but the builder version with tailoring data */
745         uprv_memcpy(myData->version, coll->image->version, sizeof(UVersionInfo));
746 
747         uprv_memcpy(myData->UCAVersion, coll->image->UCAVersion, sizeof(UVersionInfo));
748         uprv_memcpy(myData->UCDVersion, coll->image->UCDVersion, sizeof(UVersionInfo));
749         uprv_memcpy(myData->formatVersion, coll->image->formatVersion, sizeof(UVersionInfo));
750         myData->jamoSpecial = coll->image->jamoSpecial;
751 
752         /* copy the collator options */
753         uprv_memcpy(result+paddedsize(sizeof(UCATableHeader)), coll->options, sizeof(UColOptionSet));
754     }
755     return result;
756 }
757 
ucol_setOptionsFromHeader(UCollator * result,UColOptionSet * opts,UErrorCode * status)758 void ucol_setOptionsFromHeader(UCollator* result, UColOptionSet * opts, UErrorCode *status) {
759     if(U_FAILURE(*status)) {
760         return;
761     }
762     result->caseFirst = (UColAttributeValue)opts->caseFirst;
763     result->caseLevel = (UColAttributeValue)opts->caseLevel;
764     result->frenchCollation = (UColAttributeValue)opts->frenchCollation;
765     result->normalizationMode = (UColAttributeValue)opts->normalizationMode;
766     if(result->normalizationMode == UCOL_ON && !initializeFCD(status)) {
767         return;
768     }
769     result->strength = (UColAttributeValue)opts->strength;
770     result->variableTopValue = opts->variableTopValue;
771     result->alternateHandling = (UColAttributeValue)opts->alternateHandling;
772     result->hiraganaQ = (UColAttributeValue)opts->hiraganaQ;
773     result->numericCollation = (UColAttributeValue)opts->numericCollation;
774     result->caseFirstisDefault = TRUE;
775     result->caseLevelisDefault = TRUE;
776     result->frenchCollationisDefault = TRUE;
777     result->normalizationModeisDefault = TRUE;
778     result->strengthisDefault = TRUE;
779     result->variableTopValueisDefault = TRUE;
780     result->alternateHandlingisDefault = TRUE;
781     result->hiraganaQisDefault = TRUE;
782     result->numericCollationisDefault = TRUE;
783 
784     ucol_updateInternalState(result, status);
785 
786     result->options = opts;
787 }
788 
789 
790 /**
791 * Approximate determination if a character is at a contraction end.
792 * Guaranteed to be TRUE if a character is at the end of a contraction,
793 * otherwise it is not deterministic.
794 * @param c character to be determined
795 * @param coll collator
796 */
797 static
ucol_contractionEndCP(UChar c,const UCollator * coll)798 inline UBool ucol_contractionEndCP(UChar c, const UCollator *coll) {
799     if (c < coll->minContrEndCP) {
800         return FALSE;
801     }
802 
803     int32_t  hash = c;
804     uint8_t  htbyte;
805     if (hash >= UCOL_UNSAFECP_TABLE_SIZE*8) {
806         if (U16_IS_TRAIL(c)) {
807             return TRUE;
808         }
809         hash = (hash & UCOL_UNSAFECP_TABLE_MASK) + 256;
810     }
811     htbyte = coll->contrEndCP[hash>>3];
812     return (((htbyte >> (hash & 7)) & 1) == 1);
813 }
814 
815 
816 
817 /*
818 *   i_getCombiningClass()
819 *        A fast, at least partly inline version of u_getCombiningClass()
820 *        This is a candidate for further optimization.  Used heavily
821 *        in contraction processing.
822 */
823 static
i_getCombiningClass(UChar32 c,const UCollator * coll)824 inline uint8_t i_getCombiningClass(UChar32 c, const UCollator *coll) {
825     uint8_t sCC = 0;
826     if ((c >= 0x300 && ucol_unsafeCP(c, coll)) || c > 0xFFFF) {
827         sCC = u_getCombiningClass(c);
828     }
829     return sCC;
830 }
831 
ucol_initCollator(const UCATableHeader * image,UCollator * fillIn,const UCollator * UCA,UErrorCode * status)832 UCollator* ucol_initCollator(const UCATableHeader *image, UCollator *fillIn, const UCollator *UCA, UErrorCode *status) {
833     UChar c;
834     UCollator *result = fillIn;
835     if(U_FAILURE(*status) || image == NULL) {
836         return NULL;
837     }
838 
839     if(result == NULL) {
840         result = (UCollator *)uprv_malloc(sizeof(UCollator));
841         if(result == NULL) {
842             *status = U_MEMORY_ALLOCATION_ERROR;
843             return result;
844         }
845         result->freeOnClose = TRUE;
846     } else {
847         result->freeOnClose = FALSE;
848     }
849 
850     result->image = image;
851     result->mapping.getFoldingOffset = _getFoldingOffset;
852     const uint8_t *mapping = (uint8_t*)result->image+result->image->mappingPosition;
853     utrie_unserialize(&result->mapping, mapping, result->image->endExpansionCE - result->image->mappingPosition, status);
854     if(U_FAILURE(*status)) {
855         if(result->freeOnClose == TRUE) {
856             uprv_free(result);
857             result = NULL;
858         }
859         return result;
860     }
861 
862     result->latinOneMapping = UTRIE_GET32_LATIN1(&result->mapping);
863     result->contractionCEs = (uint32_t*)((uint8_t*)result->image+result->image->contractionCEs);
864     result->contractionIndex = (UChar*)((uint8_t*)result->image+result->image->contractionIndex);
865     result->expansion = (uint32_t*)((uint8_t*)result->image+result->image->expansion);
866     result->rules = NULL;
867     result->rulesLength = 0;
868     result->freeRulesOnClose = FALSE;
869     result->reorderCodes = NULL;
870     result->reorderCodesLength = 0;
871     result->leadBytePermutationTable = NULL;
872 
873     /* get the version info from UCATableHeader and populate the Collator struct*/
874     result->dataVersion[0] = result->image->version[0]; /* UCA Builder version*/
875     result->dataVersion[1] = result->image->version[1]; /* UCA Tailoring rules version*/
876     result->dataVersion[2] = 0;
877     result->dataVersion[3] = 0;
878 
879     result->unsafeCP = (uint8_t *)result->image + result->image->unsafeCP;
880     result->minUnsafeCP = 0;
881     for (c=0; c<0x300; c++) {  // Find the smallest unsafe char.
882         if (ucol_unsafeCP(c, result)) break;
883     }
884     result->minUnsafeCP = c;
885 
886     result->contrEndCP = (uint8_t *)result->image + result->image->contrEndCP;
887     result->minContrEndCP = 0;
888     for (c=0; c<0x300; c++) {  // Find the Contraction-ending char.
889         if (ucol_contractionEndCP(c, result)) break;
890     }
891     result->minContrEndCP = c;
892 
893     /* max expansion tables */
894     result->endExpansionCE = (uint32_t*)((uint8_t*)result->image +
895                                          result->image->endExpansionCE);
896     result->lastEndExpansionCE = result->endExpansionCE +
897                                  result->image->endExpansionCECount - 1;
898     result->expansionCESize = (uint8_t*)result->image +
899                                                result->image->expansionCESize;
900 
901 
902     //result->errorCode = *status;
903 
904     result->latinOneCEs = NULL;
905 
906     result->latinOneRegenTable = FALSE;
907     result->latinOneFailed = FALSE;
908     result->UCA = UCA;
909 
910     /* Normally these will be set correctly later. This is the default if you use UCA or the default. */
911     result->ucaRules = NULL;
912     result->actualLocale = NULL;
913     result->validLocale = NULL;
914     result->requestedLocale = NULL;
915     result->hasRealData = FALSE; // real data lives in .dat file...
916     result->freeImageOnClose = FALSE;
917 
918     /* set attributes */
919     ucol_setOptionsFromHeader(
920         result,
921         (UColOptionSet*)((uint8_t*)result->image+result->image->options),
922         status);
923     result->freeOptionsOnClose = FALSE;
924 
925     return result;
926 }
927 
928 /* new Mark's code */
929 
930 /**
931  * For generation of Implicit CEs
932  * @author Davis
933  *
934  * Cleaned up so that changes can be made more easily.
935  * Old values:
936 # First Implicit: E26A792D
937 # Last Implicit: E3DC70C0
938 # First CJK: E0030300
939 # Last CJK: E0A9DD00
940 # First CJK_A: E0A9DF00
941 # Last CJK_A: E0DE3100
942  */
943 /* Following is a port of Mark's code for new treatment of implicits.
944  * It is positioned here, since ucol_initUCA need to initialize the
945  * variables below according to the data in the fractional UCA.
946  */
947 
948 /**
949  * Function used to:
950  * a) collapse the 2 different Han ranges from UCA into one (in the right order), and
951  * b) bump any non-CJK characters by 10FFFF.
952  * The relevant blocks are:
953  * A:    4E00..9FFF; CJK Unified Ideographs
954  *       F900..FAFF; CJK Compatibility Ideographs
955  * B:    3400..4DBF; CJK Unified Ideographs Extension A
956  *       20000..XX;  CJK Unified Ideographs Extension B (and others later on)
957  * As long as
958  *   no new B characters are allocated between 4E00 and FAFF, and
959  *   no new A characters are outside of this range,
960  * (very high probability) this simple code will work.
961  * The reordered blocks are:
962  * Block1 is CJK
963  * Block2 is CJK_COMPAT_USED
964  * Block3 is CJK_A
965  * (all contiguous)
966  * Any other CJK gets its normal code point
967  * Any non-CJK gets +10FFFF
968  * When we reorder Block1, we make sure that it is at the very start,
969  * so that it will use a 3-byte form.
970  * Warning: we only pick up the compatibility characters that are
971  * NOT decomposed, so that block is smaller!
972  */
973 
974 // CONSTANTS
975 static const UChar32
976     NON_CJK_OFFSET = 0x110000,
977     UCOL_MAX_INPUT = 0x220001; // 2 * Unicode range + 2
978 
/**
 * Parameters of the implicit-weight encoding.
 * All of them start out as 0 and are precomputed by initImplicitConstants().
 */
static int32_t final3Multiplier = 0;
static int32_t final4Multiplier = 0;
static int32_t final3Count = 0;
static int32_t final4Count = 0;
static int32_t medialCount = 0;
static int32_t min3Primary = 0;
static int32_t min4Primary = 0;
static int32_t max4Primary = 0;
static int32_t minTrail = 0;
static int32_t maxTrail = 0;
static int32_t max3Trail = 0;
static int32_t max4Trail = 0;
static int32_t min4Boundary = 0;
996 
997 static const UChar32
998     // 4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
999     // 9FCB;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
1000     CJK_BASE = 0x4E00,
1001     CJK_LIMIT = 0x9FCB+1,
1002     // Unified CJK ideographs in the compatibility ideographs block.
1003     CJK_COMPAT_USED_BASE = 0xFA0E,
1004     CJK_COMPAT_USED_LIMIT = 0xFA2F+1,
1005     // 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
1006     // 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
1007     CJK_A_BASE = 0x3400,
1008     CJK_A_LIMIT = 0x4DB5+1,
1009     // 20000;<CJK Ideograph Extension B, First>;Lo;0;L;;;;;N;;;;;
1010     // 2A6D6;<CJK Ideograph Extension B, Last>;Lo;0;L;;;;;N;;;;;
1011     CJK_B_BASE = 0x20000,
1012     CJK_B_LIMIT = 0x2A6D6+1,
1013     // 2A700;<CJK Ideograph Extension C, First>;Lo;0;L;;;;;N;;;;;
1014     // 2B734;<CJK Ideograph Extension C, Last>;Lo;0;L;;;;;N;;;;;
1015     CJK_C_BASE = 0x2A700,
1016     CJK_C_LIMIT = 0x2B734+1,
1017     // 2B740;<CJK Ideograph Extension D, First>;Lo;0;L;;;;;N;;;;;
1018     // 2B81D;<CJK Ideograph Extension D, Last>;Lo;0;L;;;;;N;;;;;
1019     CJK_D_BASE = 0x2B740,
1020     CJK_D_LIMIT = 0x2B81D+1;
1021     // when adding to this list, look for all occurrences (in project)
1022     // of CJK_C_BASE and CJK_C_LIMIT, etc. to check for code that needs changing!!!!
1023 
swapCJK(UChar32 i)1024 static UChar32 swapCJK(UChar32 i) {
1025     if (i < CJK_A_BASE) {
1026         // non-CJK
1027     } else if (i < CJK_A_LIMIT) {
1028         // Extension A has lower code points than the original Unihan+compat
1029         // but sorts higher.
1030         return i - CJK_A_BASE
1031                 + (CJK_LIMIT - CJK_BASE)
1032                 + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE);
1033     } else if (i < CJK_BASE) {
1034         // non-CJK
1035     } else if (i < CJK_LIMIT) {
1036         return i - CJK_BASE;
1037     } else if (i < CJK_COMPAT_USED_BASE) {
1038         // non-CJK
1039     } else if (i < CJK_COMPAT_USED_LIMIT) {
1040         return i - CJK_COMPAT_USED_BASE
1041                 + (CJK_LIMIT - CJK_BASE);
1042     } else if (i < CJK_B_BASE) {
1043         // non-CJK
1044     } else if (i < CJK_B_LIMIT) {
1045         return i; // non-BMP-CJK
1046     } else if (i < CJK_C_BASE) {
1047         // non-CJK
1048     } else if (i < CJK_C_LIMIT) {
1049         return i; // non-BMP-CJK
1050     } else if (i < CJK_D_BASE) {
1051         // non-CJK
1052     } else if (i < CJK_D_LIMIT) {
1053         return i; // non-BMP-CJK
1054     }
1055     return i + NON_CJK_OFFSET; // non-CJK
1056 }
1057 
1058 U_CAPI UChar32 U_EXPORT2
uprv_uca_getRawFromCodePoint(UChar32 i)1059 uprv_uca_getRawFromCodePoint(UChar32 i) {
1060     return swapCJK(i)+1;
1061 }
1062 
1063 U_CAPI UChar32 U_EXPORT2
uprv_uca_getCodePointFromRaw(UChar32 i)1064 uprv_uca_getCodePointFromRaw(UChar32 i) {
1065     i--;
1066     UChar32 result = 0;
1067     if(i >= NON_CJK_OFFSET) {
1068         result = i - NON_CJK_OFFSET;
1069     } else if(i >= CJK_B_BASE) {
1070         result = i;
1071     } else if(i < CJK_A_LIMIT + (CJK_LIMIT - CJK_BASE) + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE)) { // rest of CJKs, compacted
1072         if(i < CJK_LIMIT - CJK_BASE) {
1073             result = i + CJK_BASE;
1074         } else if(i < (CJK_LIMIT - CJK_BASE) + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE)) {
1075             result = i + CJK_COMPAT_USED_BASE - (CJK_LIMIT - CJK_BASE);
1076         } else {
1077             result = i + CJK_A_BASE - (CJK_LIMIT - CJK_BASE) - (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE);
1078         }
1079     } else {
1080         result = -1;
1081     }
1082     return result;
1083 }
1084 
1085 // GET IMPLICIT PRIMARY WEIGHTS
1086 // Return value is left justified primary key
1087 U_CAPI uint32_t U_EXPORT2
uprv_uca_getImplicitFromRaw(UChar32 cp)1088 uprv_uca_getImplicitFromRaw(UChar32 cp) {
1089     /*
1090     if (cp < 0 || cp > UCOL_MAX_INPUT) {
1091         throw new IllegalArgumentException("Code point out of range " + Utility.hex(cp));
1092     }
1093     */
1094     int32_t last0 = cp - min4Boundary;
1095     if (last0 < 0) {
1096         int32_t last1 = cp / final3Count;
1097         last0 = cp % final3Count;
1098 
1099         int32_t last2 = last1 / medialCount;
1100         last1 %= medialCount;
1101 
1102         last0 = minTrail + last0*final3Multiplier; // spread out, leaving gap at start
1103         last1 = minTrail + last1; // offset
1104         last2 = min3Primary + last2; // offset
1105         /*
1106         if (last2 >= min4Primary) {
1107             throw new IllegalArgumentException("4-byte out of range: " + Utility.hex(cp) + ", " + Utility.hex(last2));
1108         }
1109         */
1110         return (last2 << 24) + (last1 << 16) + (last0 << 8);
1111     } else {
1112         int32_t last1 = last0 / final4Count;
1113         last0 %= final4Count;
1114 
1115         int32_t last2 = last1 / medialCount;
1116         last1 %= medialCount;
1117 
1118         int32_t last3 = last2 / medialCount;
1119         last2 %= medialCount;
1120 
1121         last0 = minTrail + last0*final4Multiplier; // spread out, leaving gap at start
1122         last1 = minTrail + last1; // offset
1123         last2 = minTrail + last2; // offset
1124         last3 = min4Primary + last3; // offset
1125         /*
1126         if (last3 > max4Primary) {
1127             throw new IllegalArgumentException("4-byte out of range: " + Utility.hex(cp) + ", " + Utility.hex(last3));
1128         }
1129         */
1130         return (last3 << 24) + (last2 << 16) + (last1 << 8) + last0;
1131     }
1132 }
1133 
1134 static uint32_t U_EXPORT2
uprv_uca_getImplicitPrimary(UChar32 cp)1135 uprv_uca_getImplicitPrimary(UChar32 cp) {
1136    //fprintf(stdout, "Incoming: %04x\n", cp);
1137     //if (DEBUG) System.out.println("Incoming: " + Utility.hex(cp));
1138 
1139     cp = swapCJK(cp);
1140     cp++;
1141     // we now have a range of numbers from 0 to 21FFFF.
1142 
1143     //if (DEBUG) System.out.println("CJK swapped: " + Utility.hex(cp));
1144     //fprintf(stdout, "CJK swapped: %04x\n", cp);
1145 
1146     return uprv_uca_getImplicitFromRaw(cp);
1147 }
1148 
1149 /**
1150  * Converts implicit CE into raw integer ("code point")
1151  * @param implicit
1152  * @return -1 if illegal format
1153  */
1154 U_CAPI UChar32 U_EXPORT2
uprv_uca_getRawFromImplicit(uint32_t implicit)1155 uprv_uca_getRawFromImplicit(uint32_t implicit) {
1156     UChar32 result;
1157     UChar32 b3 = implicit & 0xFF;
1158     UChar32 b2 = (implicit >> 8) & 0xFF;
1159     UChar32 b1 = (implicit >> 16) & 0xFF;
1160     UChar32 b0 = (implicit >> 24) & 0xFF;
1161 
1162     // simple parameter checks
1163     if (b0 < min3Primary || b0 > max4Primary
1164         || b1 < minTrail || b1 > maxTrail)
1165         return -1;
1166     // normal offsets
1167     b1 -= minTrail;
1168 
1169     // take care of the final values, and compose
1170     if (b0 < min4Primary) {
1171         if (b2 < minTrail || b2 > max3Trail || b3 != 0)
1172             return -1;
1173         b2 -= minTrail;
1174         UChar32 remainder = b2 % final3Multiplier;
1175         if (remainder != 0)
1176             return -1;
1177         b0 -= min3Primary;
1178         b2 /= final3Multiplier;
1179         result = ((b0 * medialCount) + b1) * final3Count + b2;
1180     } else {
1181         if (b2 < minTrail || b2 > maxTrail
1182             || b3 < minTrail || b3 > max4Trail)
1183             return -1;
1184         b2 -= minTrail;
1185         b3 -= minTrail;
1186         UChar32 remainder = b3 % final4Multiplier;
1187         if (remainder != 0)
1188             return -1;
1189         b3 /= final4Multiplier;
1190         b0 -= min4Primary;
1191         result = (((b0 * medialCount) + b1) * medialCount + b2) * final4Count + b3 + min4Boundary;
1192     }
1193     // final check
1194     if (result < 0 || result > UCOL_MAX_INPUT)
1195         return -1;
1196     return result;
1197 }
1198 
1199 
/* Computes 1 + (a-1)/b, i.e. a/b rounded up for positive a and b. */
static inline int32_t divideAndRoundUp(int a, int b) {
    const int wholeSteps = (a - 1) / b;
    return wholeSteps + 1;
}
1203 
1204 /* this function is either called from initUCA or from genUCA before
1205  * doing canonical closure for the UCA.
1206  */
1207 
1208 /**
1209  * Set up to generate implicits.
1210  * Maintenance Note:  this function may end up being called more than once, due
1211  *                    to threading races during initialization.  Make sure that
1212  *                    none of the Constants is ever transiently assigned an
1213  *                    incorrect value.
1214  * @param minPrimary
1215  * @param maxPrimary
1216  * @param minTrail final byte
1217  * @param maxTrail final byte
1218  * @param gap3 the gap we leave for tailoring for 3-byte forms
1219  * @param gap4 the gap we leave for tailoring for 4-byte forms
1220  */
initImplicitConstants(int minPrimary,int maxPrimary,int minTrailIn,int maxTrailIn,int gap3,int primaries3count,UErrorCode * status)1221 static void initImplicitConstants(int minPrimary, int maxPrimary,
1222                                     int minTrailIn, int maxTrailIn,
1223                                     int gap3, int primaries3count,
1224                                     UErrorCode *status) {
1225     // some simple parameter checks
1226     if ((minPrimary < 0 || minPrimary >= maxPrimary || maxPrimary > 0xFF)
1227         || (minTrailIn < 0 || minTrailIn >= maxTrailIn || maxTrailIn > 0xFF)
1228         || (primaries3count < 1))
1229     {
1230         *status = U_ILLEGAL_ARGUMENT_ERROR;
1231         return;
1232     };
1233 
1234     minTrail = minTrailIn;
1235     maxTrail = maxTrailIn;
1236 
1237     min3Primary = minPrimary;
1238     max4Primary = maxPrimary;
1239     // compute constants for use later.
1240     // number of values we can use in trailing bytes
1241     // leave room for empty values between AND above, e.g. if gap = 2
1242     // range 3..7 => +3 -4 -5 -6 -7: so 1 value
1243     // range 3..8 => +3 -4 -5 +6 -7 -8: so 2 values
1244     // range 3..9 => +3 -4 -5 +6 -7 -8 -9: so 2 values
1245     final3Multiplier = gap3 + 1;
1246     final3Count = (maxTrail - minTrail + 1) / final3Multiplier;
1247     max3Trail = minTrail + (final3Count - 1) * final3Multiplier;
1248 
1249     // medials can use full range
1250     medialCount = (maxTrail - minTrail + 1);
1251     // find out how many values fit in each form
1252     int32_t threeByteCount = medialCount * final3Count;
1253     // now determine where the 3/4 boundary is.
1254     // we use 3 bytes below the boundary, and 4 above
1255     int32_t primariesAvailable = maxPrimary - minPrimary + 1;
1256     int32_t primaries4count = primariesAvailable - primaries3count;
1257 
1258 
1259     int32_t min3ByteCoverage = primaries3count * threeByteCount;
1260     min4Primary = minPrimary + primaries3count;
1261     min4Boundary = min3ByteCoverage;
1262     // Now expand out the multiplier for the 4 bytes, and redo.
1263 
1264     int32_t totalNeeded = UCOL_MAX_INPUT - min4Boundary;
1265     int32_t neededPerPrimaryByte = divideAndRoundUp(totalNeeded, primaries4count);
1266     int32_t neededPerFinalByte = divideAndRoundUp(neededPerPrimaryByte, medialCount * medialCount);
1267     int32_t gap4 = (maxTrail - minTrail - 1) / neededPerFinalByte;
1268     if (gap4 < 1) {
1269         *status = U_ILLEGAL_ARGUMENT_ERROR;
1270         return;
1271     }
1272     final4Multiplier = gap4 + 1;
1273     final4Count = neededPerFinalByte;
1274     max4Trail = minTrail + (final4Count - 1) * final4Multiplier;
1275 }
1276 
1277     /**
1278      * Supply parameters for generating implicit CEs
1279      */
1280 U_CAPI void U_EXPORT2
uprv_uca_initImplicitConstants(UErrorCode * status)1281 uprv_uca_initImplicitConstants(UErrorCode *status) {
1282     // 13 is the largest 4-byte gap we can use without getting 2 four-byte forms.
1283     //initImplicitConstants(minPrimary, maxPrimary, 0x04, 0xFE, 1, 1, status);
1284     initImplicitConstants(minImplicitPrimary, maxImplicitPrimary, 0x04, 0xFE, 1, 1, status);
1285 }
1286 
1287 
1288 /*    collIterNormalize     Incremental Normalization happens here.                       */
1289 /*                          pick up the range of chars identified by FCD,                 */
1290 /*                          normalize it into the collIterate's writable buffer,          */
1291 /*                          switch the collIterate's state to use the writable buffer.    */
1292 /*                                                                                        */
1293 static
collIterNormalize(collIterate * collationSource)1294 void collIterNormalize(collIterate *collationSource)
1295 {
1296     UErrorCode  status = U_ZERO_ERROR;
1297     const UChar *srcP = collationSource->pos - 1;      /*  Start of chars to normalize    */
1298     const UChar *endP = collationSource->fcdPosition;  /* End of region to normalize+1    */
1299 
1300     collationSource->nfd->normalize(UnicodeString(FALSE, srcP, (int32_t)(endP - srcP)),
1301                                     collationSource->writableBuffer,
1302                                     status);
1303     if (U_FAILURE(status)) {
1304 #ifdef UCOL_DEBUG
1305         fprintf(stderr, "collIterNormalize(), NFD failed, status = %s\n", u_errorName(status));
1306 #endif
1307         return;
1308     }
1309 
1310     collationSource->pos        = collationSource->writableBuffer.getTerminatedBuffer();
1311     collationSource->origFlags  = collationSource->flags;
1312     collationSource->flags     |= UCOL_ITER_INNORMBUF;
1313     collationSource->flags     &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR);
1314 }
1315 
1316 
1317 // This function takes the iterator and extracts normalized stuff up to the next boundary
1318 // It is similar in the end results to the collIterNormalize, but for the cases when we
1319 // use an iterator
1320 /*static
1321 inline void normalizeIterator(collIterate *collationSource) {
1322   UErrorCode status = U_ZERO_ERROR;
1323   UBool wasNormalized = FALSE;
1324   //int32_t iterIndex = collationSource->iterator->getIndex(collationSource->iterator, UITER_CURRENT);
1325   uint32_t iterIndex = collationSource->iterator->getState(collationSource->iterator);
1326   int32_t normLen = unorm_next(collationSource->iterator, collationSource->writableBuffer,
1327     (int32_t)collationSource->writableBufSize, UNORM_FCD, 0, TRUE, &wasNormalized, &status);
1328   if(status == U_BUFFER_OVERFLOW_ERROR || normLen == (int32_t)collationSource->writableBufSize) {
1329     // reallocate and terminate
1330     if(!u_growBufferFromStatic(collationSource->stackWritableBuffer,
1331                                &collationSource->writableBuffer,
1332                                (int32_t *)&collationSource->writableBufSize, normLen + 1,
1333                                0)
1334     ) {
1335     #ifdef UCOL_DEBUG
1336         fprintf(stderr, "normalizeIterator(), out of memory\n");
1337     #endif
1338         return;
1339     }
1340     status = U_ZERO_ERROR;
1341     //collationSource->iterator->move(collationSource->iterator, iterIndex, UITER_ZERO);
1342     collationSource->iterator->setState(collationSource->iterator, iterIndex, &status);
1343     normLen = unorm_next(collationSource->iterator, collationSource->writableBuffer,
1344     (int32_t)collationSource->writableBufSize, UNORM_FCD, 0, TRUE, &wasNormalized, &status);
1345   }
1346   // Terminate the buffer - we already checked that it is big enough
1347   collationSource->writableBuffer[normLen] = 0;
1348   if(collationSource->writableBuffer != collationSource->stackWritableBuffer) {
1349       collationSource->flags |= UCOL_ITER_ALLOCATED;
1350   }
1351   collationSource->pos        = collationSource->writableBuffer;
1352   collationSource->origFlags  = collationSource->flags;
1353   collationSource->flags     |= UCOL_ITER_INNORMBUF;
1354   collationSource->flags     &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR);
1355 }*/
1356 
1357 
1358 /* Incremental FCD check and normalize                                                    */
1359 /*   Called from getNextCE when normalization state is suspect.                           */
1360 /*   When entering, the state is known to be this:                                        */
1361 /*      o   We are working in the main buffer of the collIterate, not the side            */
1362 /*          writable buffer.  When in the side buffer, normalization mode is always off,  */
1363 /*          so we won't get here.                                                         */
1364 /*      o   The leading combining class from the current character is 0 or                */
1365 /*          the trailing combining class of the previous char was zero.                   */
1366 /*          True because the previous call to this function will have always exited       */
1367 /*          that way, and we get called for every char where cc might be non-zero.        */
1368 static
collIterFCD(collIterate * collationSource)1369 inline UBool collIterFCD(collIterate *collationSource) {
1370     const UChar *srcP, *endP;
1371     uint8_t     leadingCC;
1372     uint8_t     prevTrailingCC = 0;
1373     uint16_t    fcd;
1374     UBool       needNormalize = FALSE;
1375 
1376     srcP = collationSource->pos-1;
1377 
1378     if (collationSource->flags & UCOL_ITER_HASLEN) {
1379         endP = collationSource->endp;
1380     } else {
1381         endP = NULL;
1382     }
1383 
1384     // Get the trailing combining class of the current character.  If it's zero,
1385     //   we are OK.
1386     /* trie access */
1387     fcd = unorm_nextFCD16(fcdTrieIndex, fcdHighStart, srcP, endP);
1388     if (fcd != 0) {
1389         prevTrailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);
1390 
1391         if (prevTrailingCC != 0) {
1392             // The current char has a non-zero trailing CC.  Scan forward until we find
1393             //   a char with a leading cc of zero.
1394             while (endP == NULL || srcP != endP)
1395             {
1396                 const UChar *savedSrcP = srcP;
1397 
1398                 /* trie access */
1399                 fcd = unorm_nextFCD16(fcdTrieIndex, fcdHighStart, srcP, endP);
1400                 leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_);
1401                 if (leadingCC == 0) {
1402                     srcP = savedSrcP;      // Hit char that is not part of combining sequence.
1403                                            //   back up over it.  (Could be surrogate pair!)
1404                     break;
1405                 }
1406 
1407                 if (leadingCC < prevTrailingCC) {
1408                     needNormalize = TRUE;
1409                 }
1410 
1411                 prevTrailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);
1412             }
1413         }
1414     }
1415 
1416     collationSource->fcdPosition = (UChar *)srcP;
1417 
1418     return needNormalize;
1419 }
1420 
1421 /****************************************************************************/
1422 /* Following are the CE retrieval functions                                 */
1423 /*                                                                          */
1424 /****************************************************************************/
1425 
1426 static uint32_t getImplicit(UChar32 cp, collIterate *collationSource);
1427 static uint32_t getPrevImplicit(UChar32 cp, collIterate *collationSource);
1428 
1429 /* there should be a macro version of this function in the header file */
1430 /* This is the first function that tries to fetch a collation element  */
1431 /* If it's not successful or it encounters a more difficult situation  */
1432 /* some more sophisticated and slower functions are invoked            */
1433 static
ucol_IGetNextCE(const UCollator * coll,collIterate * collationSource,UErrorCode * status)1434 inline uint32_t ucol_IGetNextCE(const UCollator *coll, collIterate *collationSource, UErrorCode *status) {
1435     uint32_t order = 0;
1436     if (collationSource->CEpos > collationSource->toReturn) {       /* Are there any CEs from previous expansions? */
1437         order = *(collationSource->toReturn++);                         /* if so, return them */
1438         if(collationSource->CEpos == collationSource->toReturn) {
1439             collationSource->CEpos = collationSource->toReturn = collationSource->extendCEs ? collationSource->extendCEs : collationSource->CEs;
1440         }
1441         return order;
1442     }
1443 
1444     UChar ch = 0;
1445     collationSource->offsetReturn = NULL;
1446 
1447     do {
1448         for (;;)                           /* Loop handles case when incremental normalize switches   */
1449         {                                  /*   to or from the side buffer / original string, and we  */
1450             /*   need to start again to get the next character.        */
1451 
1452             if ((collationSource->flags & (UCOL_ITER_HASLEN | UCOL_ITER_INNORMBUF | UCOL_ITER_NORM | UCOL_HIRAGANA_Q | UCOL_USE_ITERATOR)) == 0)
1453             {
1454                 // The source string is null terminated and we're not working from the side buffer,
1455                 //   and we're not normalizing.  This is the fast path.
1456                 //   (We can be in the side buffer for Thai pre-vowel reordering even when not normalizing.)
1457                 ch = *collationSource->pos++;
1458                 if (ch != 0) {
1459                     break;
1460                 }
1461                 else {
1462                     return UCOL_NO_MORE_CES;
1463                 }
1464             }
1465 
1466             if (collationSource->flags & UCOL_ITER_HASLEN) {
1467                 // Normal path for strings when length is specified.
1468                 //   (We can't be in side buffer because it is always null terminated.)
1469                 if (collationSource->pos >= collationSource->endp) {
1470                     // Ran off of the end of the main source string.  We're done.
1471                     return UCOL_NO_MORE_CES;
1472                 }
1473                 ch = *collationSource->pos++;
1474             }
1475             else if(collationSource->flags & UCOL_USE_ITERATOR) {
1476                 UChar32 iterCh = collationSource->iterator->next(collationSource->iterator);
1477                 if(iterCh == U_SENTINEL) {
1478                     return UCOL_NO_MORE_CES;
1479                 }
1480                 ch = (UChar)iterCh;
1481             }
1482             else
1483             {
1484                 // Null terminated string.
1485                 ch = *collationSource->pos++;
1486                 if (ch == 0) {
1487                     // Ran off end of buffer.
1488                     if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) {
1489                         // Ran off end of main string. backing up one character.
1490                         collationSource->pos--;
1491                         return UCOL_NO_MORE_CES;
1492                     }
1493                     else
1494                     {
1495                         // Hit null in the normalize side buffer.
1496                         // Usually this means the end of the normalized data,
1497                         // except for one odd case: a null followed by combining chars,
1498                         //   which is the case if we are at the start of the buffer.
1499                         if (collationSource->pos == collationSource->writableBuffer.getBuffer()+1) {
1500                             break;
1501                         }
1502 
1503                         //  Null marked end of side buffer.
1504                         //   Revert to the main string and
1505                         //   loop back to top to try again to get a character.
1506                         collationSource->pos   = collationSource->fcdPosition;
1507                         collationSource->flags = collationSource->origFlags;
1508                         continue;
1509                     }
1510                 }
1511             }
1512 
1513             if(collationSource->flags&UCOL_HIRAGANA_Q) {
1514                 /* Codepoints \u3099-\u309C are both Hiragana and Katakana. Set the flag
1515                  * based on whether the previous codepoint was Hiragana or Katakana.
1516                  */
1517                 if(((ch>=0x3040 && ch<=0x3096) || (ch >= 0x309d && ch <= 0x309f)) ||
1518                         ((collationSource->flags & UCOL_WAS_HIRAGANA) && (ch >= 0x3099 && ch <= 0x309C))) {
1519                     collationSource->flags |= UCOL_WAS_HIRAGANA;
1520                 } else {
1521                     collationSource->flags &= ~UCOL_WAS_HIRAGANA;
1522                 }
1523             }
1524 
1525             // We've got a character.  See if there's any fcd and/or normalization stuff to do.
1526             //    Note that UCOL_ITER_NORM flag is always zero when we are in the side buffer.
1527             if ((collationSource->flags & UCOL_ITER_NORM) == 0) {
1528                 break;
1529             }
1530 
1531             if (collationSource->fcdPosition >= collationSource->pos) {
1532                 // An earlier FCD check has already covered the current character.
1533                 // We can go ahead and process this char.
1534                 break;
1535             }
1536 
1537             if (ch < ZERO_CC_LIMIT_ ) {
1538                 // Fast fcd safe path.  Trailing combining class == 0.  This char is OK.
1539                 break;
1540             }
1541 
1542             if (ch < NFC_ZERO_CC_BLOCK_LIMIT_) {
1543                 // We need to peek at the next character in order to tell if we are FCD
1544                 if ((collationSource->flags & UCOL_ITER_HASLEN) && collationSource->pos >= collationSource->endp) {
1545                     // We are at the last char of source string.
1546                     //  It is always OK for FCD check.
1547                     break;
1548                 }
1549 
1550                 // Not at last char of source string (or we'll check against terminating null).  Do the FCD fast test
1551                 if (*collationSource->pos < NFC_ZERO_CC_BLOCK_LIMIT_) {
1552                     break;
1553                 }
1554             }
1555 
1556 
1557             // Need a more complete FCD check and possible normalization.
1558             if (collIterFCD(collationSource)) {
1559                 collIterNormalize(collationSource);
1560             }
1561             if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) {
1562                 //  No normalization was needed.  Go ahead and process the char we already had.
1563                 break;
1564             }
1565 
1566             // Some normalization happened.  Next loop iteration will pick up a char
1567             //   from the normalization buffer.
1568 
1569         }   // end for (;;)
1570 
1571 
1572         if (ch <= 0xFF) {
1573             /*  For latin-1 characters we never need to fall back to the UCA table        */
1574             /*    because all of the UCA data is replicated in the latinOneMapping array  */
1575             order = coll->latinOneMapping[ch];
1576             if (order > UCOL_NOT_FOUND) {
1577                 order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, status);
1578             }
1579         }
1580         else
1581         {
1582             // Always use UCA for Han, Hangul
1583             // (Han extension A is before main Han block)
1584             // **** Han compatibility chars ?? ****
1585             if ((collationSource->flags & UCOL_FORCE_HAN_IMPLICIT) != 0 &&
1586                 (ch >= UCOL_FIRST_HAN_A && ch <= UCOL_LAST_HANGUL)) {
1587                 if (ch > UCOL_LAST_HAN && ch < UCOL_FIRST_HANGUL) {
1588                     // between the two target ranges; do normal lookup
1589                     // **** this range is YI, Modifier tone letters, ****
1590                     // **** Latin-D, Syloti Nagari, Phagas-pa.       ****
1591                     // **** Latin-D might be tailored, so we need to ****
1592                     // **** do the normal lookup for these guys.     ****
1593                     order = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
1594                 } else {
1595                     // in one of the target ranges; use UCA
1596                     order = UCOL_NOT_FOUND;
1597                 }
1598             } else {
1599                 order = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
1600             }
1601 
1602             if(order > UCOL_NOT_FOUND) {                                       /* if a CE is special                */
1603                 order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, status);    /* and try to get the special CE     */
1604             }
1605 
1606             if(order == UCOL_NOT_FOUND && coll->UCA) {   /* We couldn't find a good CE in the tailoring */
1607                 /* if we got here, the codepoint MUST be over 0xFF - so we look directly in the trie */
1608                 order = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);
1609 
1610                 if(order > UCOL_NOT_FOUND) { /* UCA also gives us a special CE */
1611                     order = ucol_prv_getSpecialCE(coll->UCA, ch, order, collationSource, status);
1612                 }
1613             }
1614         }
1615     } while ( order == UCOL_IGNORABLE && ch >= UCOL_FIRST_HANGUL && ch <= UCOL_LAST_HANGUL );
1616 
1617     if(order == UCOL_NOT_FOUND) {
1618         order = getImplicit(ch, collationSource);
1619     }
1620     return order; /* return the CE */
1621 }
1622 
1623 /* ucol_getNextCE, out-of-line version for use from other files.   */
1624 U_CAPI uint32_t  U_EXPORT2
ucol_getNextCE(const UCollator * coll,collIterate * collationSource,UErrorCode * status)1625 ucol_getNextCE(const UCollator *coll, collIterate *collationSource, UErrorCode *status) {
1626     return ucol_IGetNextCE(coll, collationSource, status);
1627 }
1628 
1629 
1630 /**
1631 * Incremental previous normalization happens here. Pick up the range of chars
1632 * identifed by FCD, normalize it into the collIterate's writable buffer,
1633 * switch the collIterate's state to use the writable buffer.
1634 * @param data collation iterator data
1635 */
1636 static
collPrevIterNormalize(collIterate * data)1637 void collPrevIterNormalize(collIterate *data)
1638 {
1639     UErrorCode status  = U_ZERO_ERROR;
1640     const UChar *pEnd   = data->pos;  /* End normalize + 1 */
1641     const UChar *pStart;
1642 
1643     /* Start normalize */
1644     if (data->fcdPosition == NULL) {
1645         pStart = data->string;
1646     }
1647     else {
1648         pStart = data->fcdPosition + 1;
1649     }
1650 
1651     int32_t normLen =
1652         data->nfd->normalize(UnicodeString(FALSE, pStart, (int32_t)((pEnd - pStart) + 1)),
1653                              data->writableBuffer,
1654                              status).
1655         length();
1656     if(U_FAILURE(status)) {
1657         return;
1658     }
1659     /*
1660     this puts the null termination infront of the normalized string instead
1661     of the end
1662     */
1663     data->writableBuffer.insert(0, (UChar)0);
1664 
1665     /*
1666      * The usual case at this point is that we've got a base
1667      * character followed by marks that were normalized. If
1668      * fcdPosition is NULL, that means that we backed up to
1669      * the beginning of the string and there's no base character.
1670      *
1671      * Forward processing will usually normalize when it sees
1672      * the first mark, so that mark will get it's natural offset
1673      * and the rest will get the offset of the character following
1674      * the marks. The base character will also get its natural offset.
1675      *
1676      * We write the offset of the base character, if there is one,
1677      * followed by the offset of the first mark and then the offsets
1678      * of the rest of the marks.
1679      */
1680     int32_t firstMarkOffset = 0;
1681     int32_t trailOffset     = (int32_t)(data->pos - data->string + 1);
1682     int32_t trailCount      = normLen - 1;
1683 
1684     if (data->fcdPosition != NULL) {
1685         int32_t baseOffset = (int32_t)(data->fcdPosition - data->string);
1686         UChar   baseChar   = *data->fcdPosition;
1687 
1688         firstMarkOffset = baseOffset + 1;
1689 
1690         /*
1691          * If the base character is the start of a contraction, forward processing
1692          * will normalize the marks while checking for the contraction, which means
1693          * that the offset of the first mark will the same as the other marks.
1694          *
1695          * **** THIS IS PROBABLY NOT A COMPLETE TEST ****
1696          */
1697         if (baseChar >= 0x100) {
1698             uint32_t baseOrder = UTRIE_GET32_FROM_LEAD(&data->coll->mapping, baseChar);
1699 
1700             if (baseOrder == UCOL_NOT_FOUND && data->coll->UCA) {
1701                 baseOrder = UTRIE_GET32_FROM_LEAD(&data->coll->UCA->mapping, baseChar);
1702             }
1703 
1704             if (baseOrder > UCOL_NOT_FOUND && getCETag(baseOrder) == CONTRACTION_TAG) {
1705                 firstMarkOffset = trailOffset;
1706             }
1707         }
1708 
1709         data->appendOffset(baseOffset, status);
1710     }
1711 
1712     data->appendOffset(firstMarkOffset, status);
1713 
1714     for (int32_t i = 0; i < trailCount; i += 1) {
1715         data->appendOffset(trailOffset, status);
1716     }
1717 
1718     data->offsetRepeatValue = trailOffset;
1719 
1720     data->offsetReturn = data->offsetStore - 1;
1721     if (data->offsetReturn == data->offsetBuffer) {
1722         data->offsetStore = data->offsetBuffer;
1723     }
1724 
1725     data->pos        = data->writableBuffer.getTerminatedBuffer() + 1 + normLen;
1726     data->origFlags  = data->flags;
1727     data->flags     |= UCOL_ITER_INNORMBUF;
1728     data->flags     &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
1729 }
1730 
1731 
1732 /**
1733 * Incremental FCD check for previous iteration and normalize. Called from
1734 * getPrevCE when normalization state is suspect.
1735 * When entering, the state is known to be this:
1736 * o  We are working in the main buffer of the collIterate, not the side
1737 *    writable buffer. When in the side buffer, normalization mode is always
1738 *    off, so we won't get here.
1739 * o  The leading combining class from the current character is 0 or the
1740 *    trailing combining class of the previous char was zero.
1741 *    True because the previous call to this function will have always exited
1742 *    that way, and we get called for every char where cc might be non-zero.
1743 * @param data collation iterate struct
1744 * @return normalization status, TRUE for normalization to be done, FALSE
1745 *         otherwise
1746 */
1747 static
collPrevIterFCD(collIterate * data)1748 inline UBool collPrevIterFCD(collIterate *data)
1749 {
1750     const UChar *src, *start;
1751     uint8_t     leadingCC;
1752     uint8_t     trailingCC = 0;
1753     uint16_t    fcd;
1754     UBool       result = FALSE;
1755 
1756     start = data->string;
1757     src = data->pos + 1;
1758 
1759     /* Get the trailing combining class of the current character. */
1760     fcd = unorm_prevFCD16(fcdTrieIndex, fcdHighStart, start, src);
1761 
1762     leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_);
1763 
1764     if (leadingCC != 0) {
1765         /*
1766         The current char has a non-zero leading combining class.
1767         Scan backward until we find a char with a trailing cc of zero.
1768         */
1769         for (;;)
1770         {
1771             if (start == src) {
1772                 data->fcdPosition = NULL;
1773                 return result;
1774             }
1775 
1776             fcd = unorm_prevFCD16(fcdTrieIndex, fcdHighStart, start, src);
1777 
1778             trailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);
1779 
1780             if (trailingCC == 0) {
1781                 break;
1782             }
1783 
1784             if (leadingCC < trailingCC) {
1785                 result = TRUE;
1786             }
1787 
1788             leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_);
1789         }
1790     }
1791 
1792     data->fcdPosition = (UChar *)src;
1793 
1794     return result;
1795 }
1796 
1797 /** gets a code unit from the string at a given offset
1798  *  Handles both normal and iterative cases.
1799  *  No error checking - caller beware!
1800  */
1801 static inline
peekCodeUnit(collIterate * source,int32_t offset)1802 UChar peekCodeUnit(collIterate *source, int32_t offset) {
1803     if(source->pos != NULL) {
1804         return *(source->pos + offset);
1805     } else if(source->iterator != NULL) {
1806         UChar32 c;
1807         if(offset != 0) {
1808             source->iterator->move(source->iterator, offset, UITER_CURRENT);
1809             c = source->iterator->next(source->iterator);
1810             source->iterator->move(source->iterator, -offset-1, UITER_CURRENT);
1811         } else {
1812             c = source->iterator->current(source->iterator);
1813         }
1814         return c >= 0 ? (UChar)c : 0xfffd;  // If the caller works properly, we should never see c<0.
1815     } else {
1816         return 0xfffd;
1817     }
1818 }
1819 
1820 // Code point version. Treats the offset as a _code point_ delta.
1821 // We cannot use U16_FWD_1_UNSAFE and similar because we might not have well-formed UTF-16.
1822 // We cannot use U16_FWD_1 and similar because we do not know the start and limit of the buffer.
1823 static inline
peekCodePoint(collIterate * source,int32_t offset)1824 UChar32 peekCodePoint(collIterate *source, int32_t offset) {
1825     UChar32 c;
1826     if(source->pos != NULL) {
1827         const UChar *p = source->pos;
1828         if(offset >= 0) {
1829             // Skip forward over (offset-1) code points.
1830             while(--offset >= 0) {
1831                 if(U16_IS_LEAD(*p++) && U16_IS_TRAIL(*p)) {
1832                     ++p;
1833                 }
1834             }
1835             // Read the code point there.
1836             c = *p++;
1837             UChar trail;
1838             if(U16_IS_LEAD(c) && U16_IS_TRAIL(trail = *p)) {
1839                 c = U16_GET_SUPPLEMENTARY(c, trail);
1840             }
1841         } else /* offset<0 */ {
1842             // Skip backward over (offset-1) code points.
1843             while(++offset < 0) {
1844                 if(U16_IS_TRAIL(*--p) && U16_IS_LEAD(*(p - 1))) {
1845                     --p;
1846                 }
1847             }
1848             // Read the code point before that.
1849             c = *--p;
1850             UChar lead;
1851             if(U16_IS_TRAIL(c) && U16_IS_LEAD(lead = *(p - 1))) {
1852                 c = U16_GET_SUPPLEMENTARY(lead, c);
1853             }
1854         }
1855     } else if(source->iterator != NULL) {
1856         if(offset >= 0) {
1857             // Skip forward over (offset-1) code points.
1858             int32_t fwd = offset;
1859             while(fwd-- > 0) {
1860                 uiter_next32(source->iterator);
1861             }
1862             // Read the code point there.
1863             c = uiter_current32(source->iterator);
1864             // Return to the starting point, skipping backward over (offset-1) code points.
1865             while(offset-- > 0) {
1866                 uiter_previous32(source->iterator);
1867             }
1868         } else /* offset<0 */ {
1869             // Read backward, reading offset code points, remember only the last-read one.
1870             int32_t back = offset;
1871             do {
1872                 c = uiter_previous32(source->iterator);
1873             } while(++back < 0);
1874             // Return to the starting position, skipping forward over offset code points.
1875             do {
1876                 uiter_next32(source->iterator);
1877             } while(++offset < 0);
1878         }
1879     } else {
1880         c = U_SENTINEL;
1881     }
1882     return c;
1883 }
1884 
1885 /**
1886 * Determines if we are at the start of the data string in the backwards
1887 * collation iterator
1888 * @param data collation iterator
1889 * @return TRUE if we are at the start
1890 */
1891 static
isAtStartPrevIterate(collIterate * data)1892 inline UBool isAtStartPrevIterate(collIterate *data) {
1893     if(data->pos == NULL && data->iterator != NULL) {
1894         return !data->iterator->hasPrevious(data->iterator);
1895     }
1896     //return (collIter_bos(data)) ||
1897     return (data->pos == data->string) ||
1898               ((data->flags & UCOL_ITER_INNORMBUF) &&
1899               *(data->pos - 1) == 0 && data->fcdPosition == NULL);
1900 }
1901 
1902 static
goBackOne(collIterate * data)1903 inline void goBackOne(collIterate *data) {
1904 # if 0
1905     // somehow, it looks like we need to keep iterator synced up
1906     // at all times, as above.
1907     if(data->pos) {
1908         data->pos--;
1909     }
1910     if(data->iterator) {
1911         data->iterator->previous(data->iterator);
1912     }
1913 #endif
1914     if(data->iterator && (data->flags & UCOL_USE_ITERATOR)) {
1915         data->iterator->previous(data->iterator);
1916     }
1917     if(data->pos) {
1918         data->pos --;
1919     }
1920 }
1921 
1922 /**
1923 * Inline function that gets a simple CE.
1924 * So what it does is that it will first check the expansion buffer. If the
1925 * expansion buffer is not empty, ie the end pointer to the expansion buffer
1926 * is different from the string pointer, we return the collation element at the
1927 * return pointer and decrement it.
1928 * For more complicated CEs it resorts to getComplicatedCE.
1929 * @param coll collator data
1930 * @param data collation iterator struct
1931 * @param status error status
1932 */
1933 static
ucol_IGetPrevCE(const UCollator * coll,collIterate * data,UErrorCode * status)1934 inline uint32_t ucol_IGetPrevCE(const UCollator *coll, collIterate *data,
1935                                UErrorCode *status)
1936 {
1937     uint32_t result = (uint32_t)UCOL_NULLORDER;
1938 
1939     if (data->offsetReturn != NULL) {
1940         if (data->offsetRepeatCount > 0) {
1941                 data->offsetRepeatCount -= 1;
1942         } else {
1943             if (data->offsetReturn == data->offsetBuffer) {
1944                 data->offsetReturn = NULL;
1945                 data->offsetStore  = data->offsetBuffer;
1946             } else {
1947                 data->offsetReturn -= 1;
1948             }
1949         }
1950     }
1951 
1952     if ((data->extendCEs && data->toReturn > data->extendCEs) ||
1953             (!data->extendCEs && data->toReturn > data->CEs))
1954     {
1955         data->toReturn -= 1;
1956         result = *(data->toReturn);
1957         if (data->CEs == data->toReturn || data->extendCEs == data->toReturn) {
1958             data->CEpos = data->toReturn;
1959         }
1960     }
1961     else {
1962         UChar ch = 0;
1963 
1964         do {
1965             /*
1966             Loop handles case when incremental normalize switches to or from the
1967             side buffer / original string, and we need to start again to get the
1968             next character.
1969             */
1970             for (;;) {
1971                 if (data->flags & UCOL_ITER_HASLEN) {
1972                     /*
1973                     Normal path for strings when length is specified.
1974                     Not in side buffer because it is always null terminated.
1975                     */
1976                     if (data->pos <= data->string) {
1977                         /* End of the main source string */
1978                         return UCOL_NO_MORE_CES;
1979                     }
1980                     data->pos --;
1981                     ch = *data->pos;
1982                 }
1983                 // we are using an iterator to go back. Pray for us!
1984                 else if (data->flags & UCOL_USE_ITERATOR) {
1985                   UChar32 iterCh = data->iterator->previous(data->iterator);
1986                   if(iterCh == U_SENTINEL) {
1987                     return UCOL_NO_MORE_CES;
1988                   } else {
1989                     ch = (UChar)iterCh;
1990                   }
1991                 }
1992                 else {
1993                     data->pos --;
1994                     ch = *data->pos;
1995                     /* we are in the side buffer. */
1996                     if (ch == 0) {
1997                         /*
1998                         At the start of the normalize side buffer.
1999                         Go back to string.
2000                         Because pointer points to the last accessed character,
2001                         hence we have to increment it by one here.
2002                         */
2003                         data->flags = data->origFlags;
2004                         data->offsetRepeatValue = 0;
2005 
2006                          if (data->fcdPosition == NULL) {
2007                             data->pos = data->string;
2008                             return UCOL_NO_MORE_CES;
2009                         }
2010                         else {
2011                             data->pos   = data->fcdPosition + 1;
2012                         }
2013 
2014                        continue;
2015                     }
2016                 }
2017 
2018                 if(data->flags&UCOL_HIRAGANA_Q) {
2019                   if(ch>=0x3040 && ch<=0x309f) {
2020                     data->flags |= UCOL_WAS_HIRAGANA;
2021                   } else {
2022                     data->flags &= ~UCOL_WAS_HIRAGANA;
2023                   }
2024                 }
2025 
2026                 /*
2027                 * got a character to determine if there's fcd and/or normalization
2028                 * stuff to do.
2029                 * if the current character is not fcd.
2030                 * if current character is at the start of the string
2031                 * Trailing combining class == 0.
2032                 * Note if pos is in the writablebuffer, norm is always 0
2033                 */
2034                 if (ch < ZERO_CC_LIMIT_ ||
2035                   // this should propel us out of the loop in the iterator case
2036                     (data->flags & UCOL_ITER_NORM) == 0 ||
2037                     (data->fcdPosition != NULL && data->fcdPosition <= data->pos)
2038                     || data->string == data->pos) {
2039                     break;
2040                 }
2041 
2042                 if (ch < NFC_ZERO_CC_BLOCK_LIMIT_) {
2043                     /* if next character is FCD */
2044                     if (data->pos == data->string) {
2045                         /* First char of string is always OK for FCD check */
2046                         break;
2047                     }
2048 
2049                     /* Not first char of string, do the FCD fast test */
2050                     if (*(data->pos - 1) < NFC_ZERO_CC_BLOCK_LIMIT_) {
2051                         break;
2052                     }
2053                 }
2054 
2055                 /* Need a more complete FCD check and possible normalization. */
2056                 if (collPrevIterFCD(data)) {
2057                     collPrevIterNormalize(data);
2058                 }
2059 
2060                 if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
2061                     /*  No normalization. Go ahead and process the char. */
2062                     break;
2063                 }
2064 
2065                 /*
2066                 Some normalization happened.
2067                 Next loop picks up a char from the normalization buffer.
2068                 */
2069             }
2070 
2071             /* attempt to handle contractions, after removal of the backwards
2072             contraction
2073             */
2074             if (ucol_contractionEndCP(ch, coll) && !isAtStartPrevIterate(data)) {
2075                 result = ucol_prv_getSpecialPrevCE(coll, ch, UCOL_CONTRACTION, data, status);
2076             } else {
2077                 if (ch <= 0xFF) {
2078                     result = coll->latinOneMapping[ch];
2079                 }
2080                 else {
2081                     // Always use UCA for [3400..9FFF], [AC00..D7AF]
2082                     // **** [FA0E..FA2F] ?? ****
2083                     if ((data->flags & UCOL_FORCE_HAN_IMPLICIT) != 0 &&
2084                         (ch >= 0x3400 && ch <= 0xD7AF)) {
2085                         if (ch > 0x9FFF && ch < 0xAC00) {
2086                             // between the two target ranges; do normal lookup
2087                             // **** this range is YI, Modifier tone letters, ****
2088                             // **** Latin-D, Syloti Nagari, Phagas-pa.       ****
2089                             // **** Latin-D might be tailored, so we need to ****
2090                             // **** do the normal lookup for these guys.     ****
2091                              result = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
2092                         } else {
2093                             result = UCOL_NOT_FOUND;
2094                         }
2095                     } else {
2096                         result = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
2097                     }
2098                 }
2099                 if (result > UCOL_NOT_FOUND) {
2100                     result = ucol_prv_getSpecialPrevCE(coll, ch, result, data, status);
2101                 }
2102                 if (result == UCOL_NOT_FOUND) { // Not found in master list
2103                     if (!isAtStartPrevIterate(data) &&
2104                         ucol_contractionEndCP(ch, data->coll))
2105                     {
2106                         result = UCOL_CONTRACTION;
2107                     } else {
2108                         if(coll->UCA) {
2109                             result = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);
2110                         }
2111                     }
2112 
2113                     if (result > UCOL_NOT_FOUND) {
2114                         if(coll->UCA) {
2115                             result = ucol_prv_getSpecialPrevCE(coll->UCA, ch, result, data, status);
2116                         }
2117                     }
2118                 }
2119             }
2120         } while ( result == UCOL_IGNORABLE && ch >= UCOL_FIRST_HANGUL && ch <= UCOL_LAST_HANGUL );
2121 
2122         if(result == UCOL_NOT_FOUND) {
2123             result = getPrevImplicit(ch, data);
2124         }
2125     }
2126 
2127     return result;
2128 }
2129 
2130 
2131 /*   ucol_getPrevCE, out-of-line version for use from other files.  */
2132 U_CFUNC uint32_t  U_EXPORT2
ucol_getPrevCE(const UCollator * coll,collIterate * data,UErrorCode * status)2133 ucol_getPrevCE(const UCollator *coll, collIterate *data,
2134                         UErrorCode *status) {
2135     return ucol_IGetPrevCE(coll, data, status);
2136 }
2137 
2138 
2139 /* this should be connected to special Jamo handling */
2140 U_CFUNC uint32_t  U_EXPORT2
ucol_getFirstCE(const UCollator * coll,UChar u,UErrorCode * status)2141 ucol_getFirstCE(const UCollator *coll, UChar u, UErrorCode *status) {
2142     collIterate colIt;
2143     IInit_collIterate(coll, &u, 1, &colIt, status);
2144     if(U_FAILURE(*status)) {
2145         return 0;
2146     }
2147     return ucol_IGetNextCE(coll, &colIt, status);
2148 }
2149 
2150 /**
2151 * Inserts the argument character into the end of the buffer pushing back the
2152 * null terminator.
2153 * @param data collIterate struct data
2154 * @param ch character to be appended
2155 * @return the position of the new addition
2156 */
2157 static
insertBufferEnd(collIterate * data,UChar ch)2158 inline const UChar * insertBufferEnd(collIterate *data, UChar ch)
2159 {
2160     int32_t oldLength = data->writableBuffer.length();
2161     return data->writableBuffer.append(ch).getTerminatedBuffer() + oldLength;
2162 }
2163 
2164 /**
2165 * Inserts the argument string into the end of the buffer pushing back the
2166 * null terminator.
2167 * @param data collIterate struct data
2168 * @param string to be appended
2169 * @param length of the string to be appended
2170 * @return the position of the new addition
2171 */
2172 static
insertBufferEnd(collIterate * data,const UChar * str,int32_t length)2173 inline const UChar * insertBufferEnd(collIterate *data, const UChar *str, int32_t length)
2174 {
2175     int32_t oldLength = data->writableBuffer.length();
2176     return data->writableBuffer.append(str, length).getTerminatedBuffer() + oldLength;
2177 }
2178 
2179 /**
2180 * Special normalization function for contraction in the forwards iterator.
2181 * This normalization sequence will place the current character at source->pos
2182 * and its following normalized sequence into the buffer.
2183 * The fcd position, pos will be changed.
2184 * pos will now point to positions in the buffer.
2185 * Flags will be changed accordingly.
2186 * @param data collation iterator data
2187 */
2188 static
normalizeNextContraction(collIterate * data)2189 inline void normalizeNextContraction(collIterate *data)
2190 {
2191     int32_t     strsize;
2192     UErrorCode  status     = U_ZERO_ERROR;
2193     /* because the pointer points to the next character */
2194     const UChar *pStart    = data->pos - 1;
2195     const UChar *pEnd;
2196 
2197     if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
2198         data->writableBuffer.setTo(*(pStart - 1));
2199         strsize               = 1;
2200     }
2201     else {
2202         strsize = data->writableBuffer.length();
2203     }
2204 
2205     pEnd = data->fcdPosition;
2206 
2207     data->writableBuffer.append(
2208         data->nfd->normalize(UnicodeString(FALSE, pStart, (int32_t)(pEnd - pStart)), status));
2209     if(U_FAILURE(status)) {
2210         return;
2211     }
2212 
2213     data->pos        = data->writableBuffer.getTerminatedBuffer() + strsize;
2214     data->origFlags  = data->flags;
2215     data->flags     |= UCOL_ITER_INNORMBUF;
2216     data->flags     &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
2217 }
2218 
2219 /**
2220 * Contraction character management function that returns the next character
2221 * for the forwards iterator.
2222 * Does nothing if the next character is in buffer and not the first character
2223 * in it.
2224 * Else it checks next character in data string to see if it is normalizable.
2225 * If it is not, the character is simply copied into the buffer, else
2226 * the whole normalized substring is copied into the buffer, including the
2227 * current character.
2228 * @param data collation element iterator data
2229 * @return next character
2230 */
2231 static
getNextNormalizedChar(collIterate * data)2232 inline UChar getNextNormalizedChar(collIterate *data)
2233 {
2234     UChar  nextch;
2235     UChar  ch;
2236     // Here we need to add the iterator code. One problem is the way
2237     // end of string is handled. If we just return next char, it could
2238     // be the sentinel. Most of the cases already check for this, but we
2239     // need to be sure.
2240     if ((data->flags & (UCOL_ITER_NORM | UCOL_ITER_INNORMBUF)) == 0 ) {
2241          /* if no normalization and not in buffer. */
2242       if(data->flags & UCOL_USE_ITERATOR) {
2243          return (UChar)data->iterator->next(data->iterator);
2244       } else {
2245          return *(data->pos ++);
2246       }
2247     }
2248 
2249     //if (data->flags & UCOL_ITER_NORM && data->flags & UCOL_USE_ITERATOR) {
2250       //normalizeIterator(data);
2251     //}
2252 
2253     UBool  innormbuf = (UBool)(data->flags & UCOL_ITER_INNORMBUF);
2254     if ((innormbuf && *data->pos != 0) ||
2255         (data->fcdPosition != NULL && !innormbuf &&
2256         data->pos < data->fcdPosition)) {
2257         /*
2258         if next character is in normalized buffer, no further normalization
2259         is required
2260         */
2261         return *(data->pos ++);
2262     }
2263 
2264     if (data->flags & UCOL_ITER_HASLEN) {
2265         /* in data string */
2266         if (data->pos + 1 == data->endp) {
2267             return *(data->pos ++);
2268         }
2269     }
2270     else {
2271         if (innormbuf) {
2272           // inside the normalization buffer, but at the end
2273           // (since we encountered zero). This means, in the
2274           // case we're using char iterator, that we need to
2275           // do another round of normalization.
2276           //if(data->origFlags & UCOL_USE_ITERATOR) {
2277             // we need to restore original flags,
2278             // otherwise, we'll lose them
2279             //data->flags = data->origFlags;
2280             //normalizeIterator(data);
2281             //return *(data->pos++);
2282           //} else {
2283             /*
2284             in writable buffer, at this point fcdPosition can not be
2285             pointing to the end of the data string. see contracting tag.
2286             */
2287           if(data->fcdPosition) {
2288             if (*(data->fcdPosition + 1) == 0 ||
2289                 data->fcdPosition + 1 == data->endp) {
2290                 /* at the end of the string, dump it into the normalizer */
2291                 data->pos = insertBufferEnd(data, *(data->fcdPosition)) + 1;
2292                 // Check if data->pos received a null pointer
2293                 if (data->pos == NULL) {
2294                     return (UChar)-1; // Return to indicate error.
2295                 }
2296                 return *(data->fcdPosition ++);
2297             }
2298             data->pos = data->fcdPosition;
2299           } else if(data->origFlags & UCOL_USE_ITERATOR) {
2300             // if we are here, we're using a normalizing iterator.
2301             // we should just continue further.
2302             data->flags = data->origFlags;
2303             data->pos = NULL;
2304             return (UChar)data->iterator->next(data->iterator);
2305           }
2306           //}
2307         }
2308         else {
2309             if (*(data->pos + 1) == 0) {
2310                 return *(data->pos ++);
2311             }
2312         }
2313     }
2314 
2315     ch = *data->pos ++;
2316     nextch = *data->pos;
2317 
2318     /*
2319     * if the current character is not fcd.
2320     * Trailing combining class == 0.
2321     */
2322     if ((data->fcdPosition == NULL || data->fcdPosition < data->pos) &&
2323         (nextch >= NFC_ZERO_CC_BLOCK_LIMIT_ ||
2324          ch >= NFC_ZERO_CC_BLOCK_LIMIT_)) {
2325             /*
2326             Need a more complete FCD check and possible normalization.
2327             normalize substring will be appended to buffer
2328             */
2329         if (collIterFCD(data)) {
2330             normalizeNextContraction(data);
2331             return *(data->pos ++);
2332         }
2333         else if (innormbuf) {
2334             /* fcdposition shifted even when there's no normalization, if we
2335             don't input the rest into this, we'll get the wrong position when
2336             we reach the end of the writableBuffer */
2337             int32_t length = (int32_t)(data->fcdPosition - data->pos + 1);
2338             data->pos = insertBufferEnd(data, data->pos - 1, length);
2339             // Check if data->pos received a null pointer
2340             if (data->pos == NULL) {
2341                 return (UChar)-1; // Return to indicate error.
2342             }
2343             return *(data->pos ++);
2344         }
2345     }
2346 
2347     if (innormbuf) {
2348         /*
2349         no normalization is to be done hence only one character will be
2350         appended to the buffer.
2351         */
2352         data->pos = insertBufferEnd(data, ch) + 1;
2353         // Check if data->pos received a null pointer
2354         if (data->pos == NULL) {
2355             return (UChar)-1; // Return to indicate error.
2356         }
2357     }
2358 
2359     /* points back to the pos in string */
2360     return ch;
2361 }
2362 
2363 
2364 
2365 /**
2366 * Function to copy the buffer into writableBuffer and sets the fcd position to
2367 * the correct position
2368 * @param source data string source
2369 * @param buffer character buffer
2370 */
2371 static
setDiscontiguosAttribute(collIterate * source,const UnicodeString & buffer)2372 inline void setDiscontiguosAttribute(collIterate *source, const UnicodeString &buffer)
2373 {
2374     /* okay confusing part here. to ensure that the skipped characters are
2375     considered later, we need to place it in the appropriate position in the
2376     normalization buffer and reassign the pos pointer. simple case if pos
2377     reside in string, simply copy to normalization buffer and
2378     fcdposition = pos, pos = start of normalization buffer. if pos in
2379     normalization buffer, we'll insert the copy infront of pos and point pos
2380     to the start of the normalization buffer. why am i doing these copies?
2381     well, so that the whole chunk of codes in the getNextCE, ucol_prv_getSpecialCE does
2382     not require any changes, which be really painful. */
2383     if (source->flags & UCOL_ITER_INNORMBUF) {
2384         int32_t replaceLength = source->pos - source->writableBuffer.getBuffer();
2385         source->writableBuffer.replace(0, replaceLength, buffer);
2386     }
2387     else {
2388         source->fcdPosition  = source->pos;
2389         source->origFlags    = source->flags;
2390         source->flags       |= UCOL_ITER_INNORMBUF;
2391         source->flags       &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR);
2392         source->writableBuffer = buffer;
2393     }
2394 
2395     source->pos = source->writableBuffer.getTerminatedBuffer();
2396 }
2397 
2398 /**
2399 * Function to get the discontiguos collation element within the source.
2400 * Note this function will set the position to the appropriate places.
2401 * @param coll current collator used
2402 * @param source data string source
2403 * @param constart index to the start character in the contraction table
2404 * @return discontiguos collation element offset
2405 */
2406 static
getDiscontiguous(const UCollator * coll,collIterate * source,const UChar * constart)2407 uint32_t getDiscontiguous(const UCollator *coll, collIterate *source,
2408                                 const UChar *constart)
2409 {
2410     /* source->pos currently points to the second combining character after
2411        the start character */
2412           const UChar *temppos      = source->pos;
2413           UnicodeString buffer;
2414     const UChar   *tempconstart = constart;
2415           uint8_t  tempflags    = source->flags;
2416           UBool    multicontraction = FALSE;
2417           collIterateState discState;
2418 
2419           backupState(source, &discState);
2420 
2421     buffer.setTo(peekCodePoint(source, -1));
2422     for (;;) {
2423         UChar    *UCharOffset;
2424         UChar     schar,
2425                   tchar;
2426         uint32_t  result;
2427 
2428         if (((source->flags & UCOL_ITER_HASLEN) && source->pos >= source->endp)
2429             || (peekCodeUnit(source, 0) == 0  &&
2430             //|| (*source->pos == 0  &&
2431                 ((source->flags & UCOL_ITER_INNORMBUF) == 0 ||
2432                  source->fcdPosition == NULL ||
2433                  source->fcdPosition == source->endp ||
2434                  *(source->fcdPosition) == 0 ||
2435                  u_getCombiningClass(*(source->fcdPosition)) == 0)) ||
2436                  /* end of string in null terminated string or stopped by a
2437                  null character, note fcd does not always point to a base
2438                  character after the discontiguos change */
2439                  u_getCombiningClass(peekCodePoint(source, 0)) == 0) {
2440                  //u_getCombiningClass(*(source->pos)) == 0) {
2441             //constart = (UChar *)coll->image + getContractOffset(CE);
2442             if (multicontraction) {
2443                 source->pos    = temppos - 1;
2444                 setDiscontiguosAttribute(source, buffer);
2445                 return *(coll->contractionCEs +
2446                                     (tempconstart - coll->contractionIndex));
2447             }
2448             constart = tempconstart;
2449             break;
2450         }
2451 
2452         UCharOffset = (UChar *)(tempconstart + 1); /* skip the backward offset*/
2453         schar = getNextNormalizedChar(source);
2454 
2455         while (schar > (tchar = *UCharOffset)) {
2456             UCharOffset++;
2457         }
2458 
2459         if (schar != tchar) {
2460             /* not the correct codepoint. we stuff the current codepoint into
2461             the discontiguos buffer and try the next character */
2462             buffer.append(schar);
2463             continue;
2464         }
2465         else {
2466             if (u_getCombiningClass(schar) ==
2467                 u_getCombiningClass(peekCodePoint(source, -2))) {
2468                 buffer.append(schar);
2469                 continue;
2470             }
2471             result = *(coll->contractionCEs +
2472                                       (UCharOffset - coll->contractionIndex));
2473         }
2474 
2475         if (result == UCOL_NOT_FOUND) {
2476           break;
2477         } else if (isContraction(result)) {
2478             /* this is a multi-contraction*/
2479             tempconstart = (UChar *)coll->image + getContractOffset(result);
2480             if (*(coll->contractionCEs + (constart - coll->contractionIndex))
2481                 != UCOL_NOT_FOUND) {
2482                 multicontraction = TRUE;
2483                 temppos       = source->pos + 1;
2484             }
2485         } else {
2486             setDiscontiguosAttribute(source, buffer);
2487             return result;
2488         }
2489     }
2490 
2491     /* no problems simply reverting just like that,
2492     if we are in string before getting into this function, points back to
2493     string hence no problem.
2494     if we are in normalization buffer before getting into this function,
2495     since we'll never use another normalization within this function, we
2496     know that fcdposition points to a base character. the normalization buffer
2497     never change, hence this revert works. */
2498     loadState(source, &discState, TRUE);
2499     goBackOne(source);
2500 
2501     //source->pos   = temppos - 1;
2502     source->flags = tempflags;
2503     return *(coll->contractionCEs + (constart - coll->contractionIndex));
2504 }
2505 
2506 /* now uses Mark's getImplicitPrimary code */
2507 static
getImplicit(UChar32 cp,collIterate * collationSource)2508 inline uint32_t getImplicit(UChar32 cp, collIterate *collationSource) {
2509     uint32_t r = uprv_uca_getImplicitPrimary(cp);
2510     *(collationSource->CEpos++) = ((r & 0x0000FFFF)<<16) | 0x000000C0;
2511     collationSource->offsetRepeatCount += 1;
2512     return (r & UCOL_PRIMARYMASK) | 0x00000505; // This was 'order'
2513 }
2514 
2515 /**
2516 * Inserts the argument character into the front of the buffer replacing the
2517 * front null terminator.
2518 * @param data collation element iterator data
2519 * @param ch character to be appended
2520 */
2521 static
insertBufferFront(collIterate * data,UChar ch)2522 inline void insertBufferFront(collIterate *data, UChar ch)
2523 {
2524     data->pos = data->writableBuffer.setCharAt(0, ch).insert(0, (UChar)0).getTerminatedBuffer() + 2;
2525 }
2526 
2527 /**
2528 * Special normalization function for contraction in the previous iterator.
2529 * This normalization sequence will place the current character at source->pos
2530 * and its following normalized sequence into the buffer.
2531 * The fcd position, pos will be changed.
2532 * pos will now point to positions in the buffer.
2533 * Flags will be changed accordingly.
2534 * @param data collation iterator data
2535 */
2536 static
normalizePrevContraction(collIterate * data,UErrorCode * status)2537 inline void normalizePrevContraction(collIterate *data, UErrorCode *status)
2538 {
2539     const UChar *pEnd = data->pos + 1;         /* End normalize + 1 */
2540     const UChar *pStart;
2541 
2542     UnicodeString endOfBuffer;
2543     if (data->flags & UCOL_ITER_HASLEN) {
2544         /*
2545         normalization buffer not used yet, we'll pull down the next
2546         character into the end of the buffer
2547         */
2548         endOfBuffer.setTo(*pEnd);
2549     }
2550     else {
2551         endOfBuffer.setTo(data->writableBuffer, 1);  // after the leading NUL
2552     }
2553 
2554     if (data->fcdPosition == NULL) {
2555         pStart = data->string;
2556     }
2557     else {
2558         pStart = data->fcdPosition + 1;
2559     }
2560     int32_t normLen =
2561         data->nfd->normalize(UnicodeString(FALSE, pStart, (int32_t)(pEnd - pStart)),
2562                              data->writableBuffer,
2563                              *status).
2564         length();
2565     if(U_FAILURE(*status)) {
2566         return;
2567     }
2568     /*
2569     this puts the null termination infront of the normalized string instead
2570     of the end
2571     */
2572     data->pos =
2573         data->writableBuffer.insert(0, (UChar)0).append(endOfBuffer).getTerminatedBuffer() +
2574         1 + normLen;
2575     data->origFlags  = data->flags;
2576     data->flags     |= UCOL_ITER_INNORMBUF;
2577     data->flags     &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
2578 }
2579 
2580 /**
2581 * Contraction character management function that returns the previous character
2582 * for the backwards iterator.
2583 * Does nothing if the previous character is in buffer and not the first
2584 * character in it.
2585 * Else it checks previous character in data string to see if it is
2586 * normalizable.
2587 * If it is not, the character is simply copied into the buffer, else
2588 * the whole normalized substring is copied into the buffer, including the
2589 * current character.
2590 * @param data collation element iterator data
2591 * @return previous character
2592 */
2593 static
getPrevNormalizedChar(collIterate * data,UErrorCode * status)2594 inline UChar getPrevNormalizedChar(collIterate *data, UErrorCode *status)
2595 {
2596     UChar  prevch;
2597     UChar  ch;
2598     const UChar *start;
2599     UBool  innormbuf = (UBool)(data->flags & UCOL_ITER_INNORMBUF);
2600     if ((data->flags & (UCOL_ITER_NORM | UCOL_ITER_INNORMBUF)) == 0 ||
2601         (innormbuf && *(data->pos - 1) != 0)) {
2602         /*
2603         if no normalization.
2604         if previous character is in normalized buffer, no further normalization
2605         is required
2606         */
2607       if(data->flags & UCOL_USE_ITERATOR) {
2608         data->iterator->move(data->iterator, -1, UITER_CURRENT);
2609         return (UChar)data->iterator->next(data->iterator);
2610       } else {
2611         return *(data->pos - 1);
2612       }
2613     }
2614 
2615     start = data->pos;
2616     if ((data->fcdPosition==NULL)||(data->flags & UCOL_ITER_HASLEN)) {
2617         /* in data string */
2618         if ((start - 1) == data->string) {
2619             return *(start - 1);
2620         }
2621         start --;
2622         ch     = *start;
2623         prevch = *(start - 1);
2624     }
2625     else {
2626         /*
2627         in writable buffer, at this point fcdPosition can not be NULL.
2628         see contracting tag.
2629         */
2630         if (data->fcdPosition == data->string) {
2631             /* at the start of the string, just dump it into the normalizer */
2632             insertBufferFront(data, *(data->fcdPosition));
2633             data->fcdPosition = NULL;
2634             return *(data->pos - 1);
2635         }
2636         start  = data->fcdPosition;
2637         ch     = *start;
2638         prevch = *(start - 1);
2639     }
2640     /*
2641     * if the current character is not fcd.
2642     * Trailing combining class == 0.
2643     */
2644     if (data->fcdPosition > start &&
2645        (ch >= NFC_ZERO_CC_BLOCK_LIMIT_ || prevch >= NFC_ZERO_CC_BLOCK_LIMIT_))
2646     {
2647         /*
2648         Need a more complete FCD check and possible normalization.
2649         normalize substring will be appended to buffer
2650         */
2651         const UChar *backuppos = data->pos;
2652         data->pos = start;
2653         if (collPrevIterFCD(data)) {
2654             normalizePrevContraction(data, status);
2655             return *(data->pos - 1);
2656         }
2657         data->pos = backuppos;
2658         data->fcdPosition ++;
2659     }
2660 
2661     if (innormbuf) {
2662     /*
2663     no normalization is to be done hence only one character will be
2664     appended to the buffer.
2665     */
2666         insertBufferFront(data, ch);
2667         data->fcdPosition --;
2668     }
2669 
2670     return ch;
2671 }
2672 
2673 /* This function handles the special CEs like contractions, expansions, surrogates, Thai */
2674 /* It is called by getNextCE */
2675 
2676 /* The following should be even */
2677 #define UCOL_MAX_DIGITS_FOR_NUMBER 254
2678 
ucol_prv_getSpecialCE(const UCollator * coll,UChar ch,uint32_t CE,collIterate * source,UErrorCode * status)2679 uint32_t ucol_prv_getSpecialCE(const UCollator *coll, UChar ch, uint32_t CE, collIterate *source, UErrorCode *status) {
2680     collIterateState entryState;
2681     backupState(source, &entryState);
2682     UChar32 cp = ch;
2683 
2684     for (;;) {
2685         // This loop will repeat only in the case of contractions, and only when a contraction
2686         //   is found and the first CE resulting from that contraction is itself a special
2687         //   (an expansion, for example.)  All other special CE types are fully handled the
2688         //   first time through, and the loop exits.
2689 
2690         const uint32_t *CEOffset = NULL;
2691         switch(getCETag(CE)) {
2692         case NOT_FOUND_TAG:
2693             /* This one is not found, and we'll let somebody else bother about it... no more games */
2694             return CE;
2695         case SPEC_PROC_TAG:
2696             {
2697                 // Special processing is getting a CE that is preceded by a certain prefix
2698                 // Currently this is only needed for optimizing Japanese length and iteration marks.
2699                 // When we encouter a special processing tag, we go backwards and try to see if
2700                 // we have a match.
2701                 // Contraction tables are used - so the whole process is not unlike contraction.
2702                 // prefix data is stored backwards in the table.
2703                 const UChar *UCharOffset;
2704                 UChar schar, tchar;
2705                 collIterateState prefixState;
2706                 backupState(source, &prefixState);
2707                 loadState(source, &entryState, TRUE);
2708                 goBackOne(source); // We want to look at the point where we entered - actually one
2709                 // before that...
2710 
2711                 for(;;) {
2712                     // This loop will run once per source string character, for as long as we
2713                     //  are matching a potential contraction sequence
2714 
2715                     // First we position ourselves at the begining of contraction sequence
2716                     const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE);
2717                     if (collIter_bos(source)) {
2718                         CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
2719                         break;
2720                     }
2721                     schar = getPrevNormalizedChar(source, status);
2722                     goBackOne(source);
2723 
2724                     while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
2725                         UCharOffset++;
2726                     }
2727 
2728                     if (schar == tchar) {
2729                         // Found the source string char in the table.
2730                         //  Pick up the corresponding CE from the table.
2731                         CE = *(coll->contractionCEs +
2732                             (UCharOffset - coll->contractionIndex));
2733                     }
2734                     else
2735                     {
2736                         // Source string char was not in the table.
2737                         //   We have not found the prefix.
2738                         CE = *(coll->contractionCEs +
2739                             (ContractionStart - coll->contractionIndex));
2740                     }
2741 
2742                     if(!isPrefix(CE)) {
2743                         // The source string char was in the contraction table, and the corresponding
2744                         //   CE is not a prefix CE.  We found the prefix, break
2745                         //   out of loop, this CE will end up being returned.  This is the normal
2746                         //   way out of prefix handling when the source actually contained
2747                         //   the prefix.
2748                         break;
2749                     }
2750                 }
2751                 if(CE != UCOL_NOT_FOUND) { // we found something and we can merilly continue
2752                     loadState(source, &prefixState, TRUE);
2753                     if(source->origFlags & UCOL_USE_ITERATOR) {
2754                         source->flags = source->origFlags;
2755                     }
2756                 } else { // prefix search was a failure, we have to backup all the way to the start
2757                     loadState(source, &entryState, TRUE);
2758                 }
2759                 break;
2760             }
2761         case CONTRACTION_TAG:
2762             {
2763                 /* This should handle contractions */
2764                 collIterateState state;
2765                 backupState(source, &state);
2766                 uint32_t firstCE = *(coll->contractionCEs + ((UChar *)coll->image+getContractOffset(CE) - coll->contractionIndex)); //UCOL_NOT_FOUND;
2767                 const UChar *UCharOffset;
2768                 UChar schar, tchar;
2769 
2770                 for (;;) {
2771                     /* This loop will run once per source string character, for as long as we     */
2772                     /*  are matching a potential contraction sequence                  */
2773 
2774                     /* First we position ourselves at the begining of contraction sequence */
2775                     const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE);
2776 
2777                     if (collIter_eos(source)) {
2778                         // Ran off the end of the source string.
2779                         CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
2780                         // So we'll pick whatever we have at the point...
2781                         if (CE == UCOL_NOT_FOUND) {
2782                             // back up the source over all the chars we scanned going into this contraction.
2783                             CE = firstCE;
2784                             loadState(source, &state, TRUE);
2785                             if(source->origFlags & UCOL_USE_ITERATOR) {
2786                                 source->flags = source->origFlags;
2787                             }
2788                         }
2789                         break;
2790                     }
2791 
2792                     uint8_t maxCC = (uint8_t)(*(UCharOffset)&0xFF); /*get the discontiguos stuff */ /* skip the backward offset, see above */
2793                     uint8_t allSame = (uint8_t)(*(UCharOffset++)>>8);
2794 
2795                     schar = getNextNormalizedChar(source);
2796                     while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
2797                         UCharOffset++;
2798                     }
2799 
2800                     if (schar == tchar) {
2801                         // Found the source string char in the contraction table.
2802                         //  Pick up the corresponding CE from the table.
2803                         CE = *(coll->contractionCEs +
2804                             (UCharOffset - coll->contractionIndex));
2805                     }
2806                     else
2807                     {
2808                         // Source string char was not in contraction table.
2809                         //   Unless we have a discontiguous contraction, we have finished
2810                         //   with this contraction.
2811                         // in order to do the proper detection, we
2812                         // need to see if we're dealing with a supplementary
2813                         /* We test whether the next two char are surrogate pairs.
2814                         * This test is done if the iterator is not NULL.
2815                         * If there is no surrogate pair, the iterator
2816                         * goes back one if needed. */
2817                         UChar32 miss = schar;
2818                         if (source->iterator) {
2819                             UChar32 surrNextChar; /* the next char in the iteration to test */
2820                             int32_t prevPos; /* holds the previous position before move forward of the source iterator */
2821                             if(U16_IS_LEAD(schar) && source->iterator->hasNext(source->iterator)) {
2822                                 prevPos = source->iterator->index;
2823                                 surrNextChar = getNextNormalizedChar(source);
2824                                 if (U16_IS_TRAIL(surrNextChar)) {
2825                                     miss = U16_GET_SUPPLEMENTARY(schar, surrNextChar);
2826                                 } else if (prevPos < source->iterator->index){
2827                                     goBackOne(source);
2828                                 }
2829                             }
2830                         } else if (U16_IS_LEAD(schar)) {
2831                             miss = U16_GET_SUPPLEMENTARY(schar, getNextNormalizedChar(source));
2832                         }
2833 
2834                         uint8_t sCC;
2835                         if (miss < 0x300 ||
2836                             maxCC == 0 ||
2837                             (sCC = i_getCombiningClass(miss, coll)) == 0 ||
2838                             sCC>maxCC ||
2839                             (allSame != 0 && sCC == maxCC) ||
2840                             collIter_eos(source))
2841                         {
2842                             //  Contraction can not be discontiguous.
2843                             goBackOne(source);  // back up the source string by one,
2844                             //  because  the character we just looked at was
2845                             //  not part of the contraction.   */
2846                             if(U_IS_SUPPLEMENTARY(miss)) {
2847                                 goBackOne(source);
2848                             }
2849                             CE = *(coll->contractionCEs +
2850                                 (ContractionStart - coll->contractionIndex));
2851                         } else {
2852                             //
2853                             // Contraction is possibly discontiguous.
2854                             //   Scan more of source string looking for a match
2855                             //
2856                             UChar tempchar;
2857                             /* find the next character if schar is not a base character
2858                             and we are not yet at the end of the string */
2859                             tempchar = getNextNormalizedChar(source);
2860                             // probably need another supplementary thingie here
2861                             goBackOne(source);
2862                             if (i_getCombiningClass(tempchar, coll) == 0) {
2863                                 goBackOne(source);
2864                                 if(U_IS_SUPPLEMENTARY(miss)) {
2865                                     goBackOne(source);
2866                                 }
2867                                 /* Spit out the last char of the string, wasn't tasty enough */
2868                                 CE = *(coll->contractionCEs +
2869                                     (ContractionStart - coll->contractionIndex));
2870                             } else {
2871                                 CE = getDiscontiguous(coll, source, ContractionStart);
2872                             }
2873                         }
2874                     } // else after if(schar == tchar)
2875 
2876                     if(CE == UCOL_NOT_FOUND) {
2877                         /* The Source string did not match the contraction that we were checking.  */
2878                         /*  Back up the source position to undo the effects of having partially    */
2879                         /*   scanned through what ultimately proved to not be a contraction.       */
2880                         loadState(source, &state, TRUE);
2881                         CE = firstCE;
2882                         break;
2883                     }
2884 
2885                     if(!isContraction(CE)) {
2886                         // The source string char was in the contraction table, and the corresponding
2887                         //   CE is not a contraction CE.  We completed the contraction, break
2888                         //   out of loop, this CE will end up being returned.  This is the normal
2889                         //   way out of contraction handling when the source actually contained
2890                         //   the contraction.
2891                         break;
2892                     }
2893 
2894 
2895                     // The source string char was in the contraction table, and the corresponding
2896                     //   CE is IS  a contraction CE.  We will continue looping to check the source
2897                     //   string for the remaining chars in the contraction.
2898                     uint32_t tempCE = *(coll->contractionCEs + (ContractionStart - coll->contractionIndex));
2899                     if(tempCE != UCOL_NOT_FOUND) {
2900                         // We have scanned a a section of source string for which there is a
2901                         //  CE from the contraction table.  Remember the CE and scan position, so
2902                         //  that we can return to this point if further scanning fails to
2903                         //  match a longer contraction sequence.
2904                         firstCE = tempCE;
2905 
2906                         goBackOne(source);
2907                         backupState(source, &state);
2908                         getNextNormalizedChar(source);
2909 
2910                         // Another way to do this is:
2911                         //collIterateState tempState;
2912                         //backupState(source, &tempState);
2913                         //goBackOne(source);
2914                         //backupState(source, &state);
2915                         //loadState(source, &tempState, TRUE);
2916 
2917                         // The problem is that for incomplete contractions we have to remember the previous
2918                         // position. Before, the only thing I needed to do was state.pos--;
2919                         // After iterator introduction and especially after introduction of normalizing
2920                         // iterators, it became much more difficult to decrease the saved state.
2921                         // I'm not yet sure which of the two methods above is faster.
2922                     }
2923                 } // for(;;)
2924                 break;
2925             } // case CONTRACTION_TAG:
2926         case LONG_PRIMARY_TAG:
2927             {
2928                 *(source->CEpos++) = ((CE & 0xFF)<<24)|UCOL_CONTINUATION_MARKER;
2929                 CE = ((CE & 0xFFFF00) << 8) | (UCOL_BYTE_COMMON << 8) | UCOL_BYTE_COMMON;
2930                 source->offsetRepeatCount += 1;
2931                 return CE;
2932             }
2933         case EXPANSION_TAG:
2934             {
2935                 /* This should handle expansion. */
2936                 /* NOTE: we can encounter both continuations and expansions in an expansion! */
2937                 /* I have to decide where continuations are going to be dealt with */
2938                 uint32_t size;
2939                 uint32_t i;    /* general counter */
2940 
2941                 CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */
2942                 size = getExpansionCount(CE);
2943                 CE = *CEOffset++;
2944               //source->offsetRepeatCount = -1;
2945 
2946                 if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */
2947                     for(i = 1; i<size; i++) {
2948                         *(source->CEpos++) = *CEOffset++;
2949                         source->offsetRepeatCount += 1;
2950                     }
2951                 } else { /* else, we do */
2952                     while(*CEOffset != 0) {
2953                         *(source->CEpos++) = *CEOffset++;
2954                         source->offsetRepeatCount += 1;
2955                     }
2956                 }
2957 
2958                 return CE;
2959             }
2960         case DIGIT_TAG:
2961             {
2962                 /*
2963                 We do a check to see if we want to collate digits as numbers; if so we generate
2964                 a custom collation key. Otherwise we pull out the value stored in the expansion table.
2965                 */
2966                 //uint32_t size;
2967                 uint32_t i;    /* general counter */
2968 
2969                 if (source->coll->numericCollation == UCOL_ON){
2970                     collIterateState digitState = {0,0,0,0,0,0,0,0,0};
2971                     UChar32 char32 = 0;
2972                     int32_t digVal = 0;
2973 
2974                     uint32_t digIndx = 0;
2975                     uint32_t endIndex = 0;
2976                     uint32_t trailingZeroIndex = 0;
2977 
2978                     uint8_t collateVal = 0;
2979 
2980                     UBool nonZeroValReached = FALSE;
2981 
2982                     uint8_t numTempBuf[UCOL_MAX_DIGITS_FOR_NUMBER/2 + 3]; // I just need a temporary place to store my generated CEs.
2983                     /*
2984                          We parse the source string until we hit a char that's NOT a digit.
2985                         Use this u_charDigitValue. This might be slow because we have to
2986                         handle surrogates...
2987                     */
2988             /*
2989                     if (U16_IS_LEAD(ch)){
2990                       if (!collIter_eos(source)) {
2991                         backupState(source, &digitState);
2992                         UChar trail = getNextNormalizedChar(source);
2993                         if(U16_IS_TRAIL(trail)) {
2994                           char32 = U16_GET_SUPPLEMENTARY(ch, trail);
2995                         } else {
2996                           loadState(source, &digitState, TRUE);
2997                           char32 = ch;
2998                         }
2999                       } else {
3000                         char32 = ch;
3001                       }
3002                     } else {
3003                       char32 = ch;
3004                     }
3005                     digVal = u_charDigitValue(char32);
3006             */
3007                     digVal = u_charDigitValue(cp); // if we have arrived here, we have
3008                     // already processed possible supplementaries that trigered the digit tag -
3009                     // all supplementaries are marked in the UCA.
3010                     /*
3011                         We  pad a zero in front of the first element anyways. This takes
3012                         care of the (probably) most common case where people are sorting things followed
3013                         by a single digit
3014                     */
3015                     digIndx++;
3016                     for(;;){
3017                         // Make sure we have enough space. No longer needed;
3018                         // at this point digIndx now has a max value of UCOL_MAX_DIGITS_FOR_NUMBER
3019                         // (it has been pre-incremented) so we just ensure that numTempBuf is big enough
3020                         // (UCOL_MAX_DIGITS_FOR_NUMBER/2 + 3).
3021 
3022                         // Skipping over leading zeroes.
3023                         if (digVal != 0) {
3024                             nonZeroValReached = TRUE;
3025                         }
3026                         if (nonZeroValReached) {
3027                             /*
3028                             We parse the digit string into base 100 numbers (this fits into a byte).
3029                             We only add to the buffer in twos, thus if we are parsing an odd character,
3030                             that serves as the 'tens' digit while the if we are parsing an even one, that
3031                             is the 'ones' digit. We dumped the parsed base 100 value (collateVal) into
3032                             a buffer. We multiply each collateVal by 2 (to give us room) and add 5 (to avoid
3033                             overlapping magic CE byte values). The last byte we subtract 1 to ensure it is less
3034                             than all the other bytes.
3035                             */
3036 
3037                             if (digIndx % 2 == 1){
3038                                 collateVal += (uint8_t)digVal;
3039 
3040                                 // We don't enter the low-order-digit case unless we've already seen
3041                                 // the high order, or for the first digit, which is always non-zero.
3042                                 if (collateVal != 0)
3043                                     trailingZeroIndex = 0;
3044 
3045                                 numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6;
3046                                 collateVal = 0;
3047                             }
3048                             else{
3049                                 // We drop the collation value into the buffer so if we need to do
3050                                 // a "front patch" we don't have to check to see if we're hitting the
3051                                 // last element.
3052                                 collateVal = (uint8_t)(digVal * 10);
3053 
3054                                 // Check for trailing zeroes.
3055                                 if (collateVal == 0)
3056                                 {
3057                                     if (!trailingZeroIndex)
3058                                         trailingZeroIndex = (digIndx/2) + 2;
3059                                 }
3060                                 else
3061                                     trailingZeroIndex = 0;
3062 
3063                                 numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6;
3064                             }
3065                             digIndx++;
3066                         }
3067 
3068                         // Get next character.
3069                         if (!collIter_eos(source)){
3070                             ch = getNextNormalizedChar(source);
3071                             if (U16_IS_LEAD(ch)){
3072                                 if (!collIter_eos(source)) {
3073                                     backupState(source, &digitState);
3074                                     UChar trail = getNextNormalizedChar(source);
3075                                     if(U16_IS_TRAIL(trail)) {
3076                                         char32 = U16_GET_SUPPLEMENTARY(ch, trail);
3077                                     } else {
3078                                         loadState(source, &digitState, TRUE);
3079                                         char32 = ch;
3080                                     }
3081                                 }
3082                             } else {
3083                                 char32 = ch;
3084                             }
3085 
3086                             if ((digVal = u_charDigitValue(char32)) == -1 || digIndx > UCOL_MAX_DIGITS_FOR_NUMBER){
3087                                 // Resetting position to point to the next unprocessed char. We
3088                                 // overshot it when doing our test/set for numbers.
3089                                 if (char32 > 0xFFFF) { // For surrogates.
3090                                     loadState(source, &digitState, TRUE);
3091                                     //goBackOne(source);
3092                                 }
3093                                 goBackOne(source);
3094                                 break;
3095                             }
3096                         } else {
3097                             break;
3098                         }
3099                     }
3100 
3101                     if (nonZeroValReached == FALSE){
3102                         digIndx = 2;
3103                         numTempBuf[2] = 6;
3104                     }
3105 
3106                     endIndex = trailingZeroIndex ? trailingZeroIndex : ((digIndx/2) + 2) ;
3107                     if (digIndx % 2 != 0){
3108                         /*
3109                         We missed a value. Since digIndx isn't even, stuck too many values into the buffer (this is what
3110                         we get for padding the first byte with a zero). "Front-patch" now by pushing all nybbles forward.
3111                         Doing it this way ensures that at least 50% of the time (statistically speaking) we'll only be doing a
3112                         single pass and optimizes for strings with single digits. I'm just assuming that's the more common case.
3113                         */
3114 
3115                         for(i = 2; i < endIndex; i++){
3116                             numTempBuf[i] =     (((((numTempBuf[i] - 6)/2) % 10) * 10) +
3117                                 (((numTempBuf[i+1])-6)/2) / 10) * 2 + 6;
3118                         }
3119                         --digIndx;
3120                     }
3121 
3122                     // Subtract one off of the last byte.
3123                     numTempBuf[endIndex-1] -= 1;
3124 
3125                     /*
3126                     We want to skip over the first two slots in the buffer. The first slot
3127                     is reserved for the header byte UCOL_CODAN_PLACEHOLDER. The second slot is for the
3128                     sign/exponent byte: 0x80 + (decimalPos/2) & 7f.
3129                     */
3130                     numTempBuf[0] = UCOL_CODAN_PLACEHOLDER;
3131                     numTempBuf[1] = (uint8_t)(0x80 + ((digIndx/2) & 0x7F));
3132 
3133                     // Now transfer the collation key to our collIterate struct.
3134                     // The total size for our collation key is endIndx bumped up to the next largest even value divided by two.
3135                     //size = ((endIndex+1) & ~1)/2;
3136                     CE = (((numTempBuf[0] << 8) | numTempBuf[1]) << UCOL_PRIMARYORDERSHIFT) | //Primary weight
3137                         (UCOL_BYTE_COMMON << UCOL_SECONDARYORDERSHIFT) | // Secondary weight
3138                         UCOL_BYTE_COMMON; // Tertiary weight.
3139                     i = 2; // Reset the index into the buffer.
3140                     while(i < endIndex)
3141                     {
3142                         uint32_t primWeight = numTempBuf[i++] << 8;
3143                         if ( i < endIndex)
3144                             primWeight |= numTempBuf[i++];
3145                         *(source->CEpos++) = (primWeight << UCOL_PRIMARYORDERSHIFT) | UCOL_CONTINUATION_MARKER;
3146                     }
3147 
3148                 } else {
3149                     // no numeric mode, we'll just switch to whatever we stashed and continue
3150                     CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */
3151                     CE = *CEOffset++;
3152                     break;
3153                 }
3154                 return CE;
3155             }
3156             /* various implicits optimization */
3157         case IMPLICIT_TAG:        /* everything that is not defined otherwise */
3158             /* UCA is filled with these. Tailorings are NOT_FOUND */
3159             return getImplicit(cp, source);
3160         case CJK_IMPLICIT_TAG:    /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D*/
3161             // TODO: remove CJK_IMPLICIT_TAG completely - handled by the getImplicit
3162             return getImplicit(cp, source);
3163         case HANGUL_SYLLABLE_TAG: /* AC00-D7AF*/
3164             {
3165                 static const uint32_t
3166                     SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7;
3167                 //const uint32_t LCount = 19;
3168                 static const uint32_t VCount = 21;
3169                 static const uint32_t TCount = 28;
3170                 //const uint32_t NCount = VCount * TCount;   // 588
3171                 //const uint32_t SCount = LCount * NCount;   // 11172
3172                 uint32_t L = ch - SBase;
3173 
3174                 // divide into pieces
3175 
3176                 uint32_t T = L % TCount; // we do it in this order since some compilers can do % and / in one operation
3177                 L /= TCount;
3178                 uint32_t V = L % VCount;
3179                 L /= VCount;
3180 
3181                 // offset them
3182 
3183                 L += LBase;
3184                 V += VBase;
3185                 T += TBase;
3186 
3187                 // return the first CE, but first put the rest into the expansion buffer
3188                 if (!source->coll->image->jamoSpecial) { // FAST PATH
3189 
3190                     *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, V);
3191                     if (T != TBase) {
3192                         *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, T);
3193                     }
3194 
3195                     return UTRIE_GET32_FROM_LEAD(&coll->mapping, L);
3196 
3197                 } else { // Jamo is Special
3198                     // Since Hanguls pass the FCD check, it is
3199                     // guaranteed that we won't be in
3200                     // the normalization buffer if something like this happens
3201 
3202                     // However, if we are using a uchar iterator and normalization
3203                     // is ON, the Hangul that lead us here is going to be in that
3204                     // normalization buffer. Here we want to restore the uchar
3205                     // iterator state and pull out of the normalization buffer
3206                     if(source->iterator != NULL && source->flags & UCOL_ITER_INNORMBUF) {
3207                         source->flags = source->origFlags; // restore the iterator
3208                         source->pos = NULL;
3209                     }
3210 
3211                     // Move Jamos into normalization buffer
3212                     UChar *buffer = source->writableBuffer.getBuffer(4);
3213                     int32_t bufferLength;
3214                     buffer[0] = (UChar)L;
3215                     buffer[1] = (UChar)V;
3216                     if (T != TBase) {
3217                         buffer[2] = (UChar)T;
3218                         bufferLength = 3;
3219                     } else {
3220                         bufferLength = 2;
3221                     }
3222                     source->writableBuffer.releaseBuffer(bufferLength);
3223 
3224                     // Indicate where to continue in main input string after exhausting the writableBuffer
3225                     source->fcdPosition       = source->pos;
3226 
3227                     source->pos   = source->writableBuffer.getTerminatedBuffer();
3228                     source->origFlags   = source->flags;
3229                     source->flags       |= UCOL_ITER_INNORMBUF;
3230                     source->flags       &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
3231 
3232                     return(UCOL_IGNORABLE);
3233                 }
3234             }
3235         case SURROGATE_TAG:
3236             /* we encountered a leading surrogate. We shall get the CE by using the following code unit */
3237             /* two things can happen here: next code point can be a trailing surrogate - we will use it */
3238             /* to retrieve the CE, or it is not a trailing surrogate (or the string is done). In that case */
3239             /* we treat it like an unassigned code point. */
3240             {
3241                 UChar trail;
3242                 collIterateState state;
3243                 backupState(source, &state);
3244                 if (collIter_eos(source) || !(U16_IS_TRAIL((trail = getNextNormalizedChar(source))))) {
3245                     // we chould have stepped one char forward and it might have turned that it
3246                     // was not a trail surrogate. In that case, we have to backup.
3247                     loadState(source, &state, TRUE);
3248                     return UCOL_NOT_FOUND;
3249                 } else {
3250                     /* TODO: CE contain the data from the previous CE + the mask. It should at least be unmasked */
3251                     CE = UTRIE_GET32_FROM_OFFSET_TRAIL(&coll->mapping, CE&0xFFFFFF, trail);
3252                     if(CE == UCOL_NOT_FOUND) { // there are tailored surrogates in this block, but not this one.
3253                         // We need to backup
3254                         loadState(source, &state, TRUE);
3255                         return CE;
3256                     }
3257                     // calculate the supplementary code point value, if surrogate was not tailored
3258                     cp = ((((uint32_t)ch)<<10UL)+(trail)-(((uint32_t)0xd800<<10UL)+0xdc00-0x10000));
3259                 }
3260             }
3261             break;
3262         case LEAD_SURROGATE_TAG:  /* D800-DBFF*/
3263             UChar nextChar;
3264             if( source->flags & UCOL_USE_ITERATOR) {
3265                 if(U_IS_TRAIL(nextChar = (UChar)source->iterator->current(source->iterator))) {
3266                     cp = U16_GET_SUPPLEMENTARY(ch, nextChar);
3267                     source->iterator->next(source->iterator);
3268                     return getImplicit(cp, source);
3269                 }
3270             } else if((((source->flags & UCOL_ITER_HASLEN) == 0 ) || (source->pos<source->endp)) &&
3271                       U_IS_TRAIL((nextChar=*source->pos))) {
3272                 cp = U16_GET_SUPPLEMENTARY(ch, nextChar);
3273                 source->pos++;
3274                 return getImplicit(cp, source);
3275             }
3276             return UCOL_NOT_FOUND;
3277         case TRAIL_SURROGATE_TAG: /* DC00-DFFF*/
3278             return UCOL_NOT_FOUND; /* broken surrogate sequence */
3279         case CHARSET_TAG:
3280             /* not yet implemented */
3281             /* probably after 1.8 */
3282             return UCOL_NOT_FOUND;
3283         default:
3284             *status = U_INTERNAL_PROGRAM_ERROR;
3285             CE=0;
3286             break;
3287     }
3288     if (CE <= UCOL_NOT_FOUND) break;
3289   }
3290   return CE;
3291 }
3292 
3293 
3294 /* now uses Mark's getImplicitPrimary code */
3295 static
getPrevImplicit(UChar32 cp,collIterate * collationSource)3296 inline uint32_t getPrevImplicit(UChar32 cp, collIterate *collationSource) {
3297     uint32_t r = uprv_uca_getImplicitPrimary(cp);
3298 
3299     *(collationSource->CEpos++) = (r & UCOL_PRIMARYMASK) | 0x00000505;
3300     collationSource->toReturn = collationSource->CEpos;
3301 
3302     // **** doesn't work if using iterator ****
3303     if (collationSource->flags & UCOL_ITER_INNORMBUF) {
3304         collationSource->offsetRepeatCount = 1;
3305     } else {
3306         int32_t firstOffset = (int32_t)(collationSource->pos - collationSource->string);
3307 
3308         UErrorCode errorCode = U_ZERO_ERROR;
3309         collationSource->appendOffset(firstOffset, errorCode);
3310         collationSource->appendOffset(firstOffset + 1, errorCode);
3311 
3312         collationSource->offsetReturn = collationSource->offsetStore - 1;
3313         *(collationSource->offsetBuffer) = firstOffset;
3314         if (collationSource->offsetReturn == collationSource->offsetBuffer) {
3315             collationSource->offsetStore = collationSource->offsetBuffer;
3316         }
3317     }
3318 
3319     return ((r & 0x0000FFFF)<<16) | 0x000000C0;
3320 }
3321 
3322 /**
3323  * This function handles the special CEs like contractions, expansions,
3324  * surrogates, Thai.
3325  * It is called by both getPrevCE
3326  */
ucol_prv_getSpecialPrevCE(const UCollator * coll,UChar ch,uint32_t CE,collIterate * source,UErrorCode * status)3327 uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
3328                           collIterate *source,
3329                           UErrorCode *status)
3330 {
3331     const uint32_t *CEOffset    = NULL;
3332           UChar    *UCharOffset = NULL;
3333           UChar    schar;
3334     const UChar    *constart    = NULL;
3335           uint32_t size;
3336           UChar    buffer[UCOL_MAX_BUFFER];
3337           uint32_t *endCEBuffer;
3338           UChar   *strbuffer;
3339           int32_t noChars = 0;
3340           int32_t CECount = 0;
3341 
3342     for(;;)
3343     {
3344         /* the only ces that loops are thai and contractions */
3345         switch (getCETag(CE))
3346         {
3347         case NOT_FOUND_TAG:  /* this tag always returns */
3348             return CE;
3349 
3350         case SPEC_PROC_TAG:
3351             {
3352                 // Special processing is getting a CE that is preceded by a certain prefix
3353                 // Currently this is only needed for optimizing Japanese length and iteration marks.
3354                 // When we encouter a special processing tag, we go backwards and try to see if
3355                 // we have a match.
3356                 // Contraction tables are used - so the whole process is not unlike contraction.
3357                 // prefix data is stored backwards in the table.
3358                 const UChar *UCharOffset;
3359                 UChar schar, tchar;
3360                 collIterateState prefixState;
3361                 backupState(source, &prefixState);
3362                 for(;;) {
3363                     // This loop will run once per source string character, for as long as we
3364                     //  are matching a potential contraction sequence
3365 
3366                     // First we position ourselves at the begining of contraction sequence
3367                     const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE);
3368 
3369                     if (collIter_bos(source)) {
3370                         CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
3371                         break;
3372                     }
3373                     schar = getPrevNormalizedChar(source, status);
3374                     goBackOne(source);
3375 
3376                     while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
3377                         UCharOffset++;
3378                     }
3379 
3380                     if (schar == tchar) {
3381                         // Found the source string char in the table.
3382                         //  Pick up the corresponding CE from the table.
3383                         CE = *(coll->contractionCEs +
3384                             (UCharOffset - coll->contractionIndex));
3385                     }
3386                     else
3387                     {
3388                         // if there is a completely ignorable code point in the middle of
3389                         // a prefix, we need to act as if it's not there
3390                         // assumption: 'real' noncharacters (*fffe, *ffff, fdd0-fdef are set to zero)
3391                         // lone surrogates cannot be set to zero as it would break other processing
3392                         uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, schar);
3393                         // it's easy for BMP code points
3394                         if(isZeroCE == 0) {
3395                             continue;
3396                         } else if(U16_IS_SURROGATE(schar)) {
3397                             // for supplementary code points, we have to check the next one
3398                             // situations where we are going to ignore
3399                             // 1. beginning of the string: schar is a lone surrogate
3400                             // 2. schar is a lone surrogate
3401                             // 3. schar is a trail surrogate in a valid surrogate sequence
3402                             //    that is explicitly set to zero.
3403                             if (!collIter_bos(source)) {
3404                                 UChar lead;
3405                                 if(!U16_IS_SURROGATE_LEAD(schar) && U16_IS_LEAD(lead = getPrevNormalizedChar(source, status))) {
3406                                     isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, lead);
3407                                     if(isSpecial(isZeroCE) && getCETag(isZeroCE) == SURROGATE_TAG) {
3408                                         uint32_t finalCE = UTRIE_GET32_FROM_OFFSET_TRAIL(&coll->mapping, isZeroCE&0xFFFFFF, schar);
3409                                         if(finalCE == 0) {
3410                                             // this is a real, assigned completely ignorable code point
3411                                             goBackOne(source);
3412                                             continue;
3413                                         }
3414                                     }
3415                                 } else {
3416                                     // lone surrogate, treat like unassigned
3417                                     return UCOL_NOT_FOUND;
3418                                 }
3419                             } else {
3420                                 // lone surrogate at the beggining, treat like unassigned
3421                                 return UCOL_NOT_FOUND;
3422                             }
3423                         }
3424                         // Source string char was not in the table.
3425                         //   We have not found the prefix.
3426                         CE = *(coll->contractionCEs +
3427                             (ContractionStart - coll->contractionIndex));
3428                     }
3429 
3430                     if(!isPrefix(CE)) {
3431                         // The source string char was in the contraction table, and the corresponding
3432                         //   CE is not a prefix CE.  We found the prefix, break
3433                         //   out of loop, this CE will end up being returned.  This is the normal
3434                         //   way out of prefix handling when the source actually contained
3435                         //   the prefix.
3436                         break;
3437                     }
3438                 }
3439                 loadState(source, &prefixState, TRUE);
3440                 break;
3441             }
3442 
3443         case CONTRACTION_TAG: {
3444             /* to ensure that the backwards and forwards iteration matches, we
3445             take the current region of most possible match and pass it through
3446             the forward iteration. this will ensure that the obstinate problem of
3447             overlapping contractions will not occur.
3448             */
3449             schar = peekCodeUnit(source, 0);
3450             constart = (UChar *)coll->image + getContractOffset(CE);
3451             if (isAtStartPrevIterate(source)
3452                 /* commented away contraction end checks after adding the checks
3453                 in getPrevCE  */) {
3454                     /* start of string or this is not the end of any contraction */
3455                     CE = *(coll->contractionCEs +
3456                         (constart - coll->contractionIndex));
3457                     break;
3458             }
3459             strbuffer = buffer;
3460             UCharOffset = strbuffer + (UCOL_MAX_BUFFER - 1);
3461             *(UCharOffset --) = 0;
3462             noChars = 0;
3463             // have to swap thai characters
3464             while (ucol_unsafeCP(schar, coll)) {
3465                 *(UCharOffset) = schar;
3466                 noChars++;
3467                 UCharOffset --;
3468                 schar = getPrevNormalizedChar(source, status);
3469                 goBackOne(source);
3470                 // TODO: when we exhaust the contraction buffer,
3471                 // it needs to get reallocated. The problem is
3472                 // that the size depends on the string which is
3473                 // not iterated over. However, since we're travelling
3474                 // backwards, we already had to set the iterator at
3475                 // the end - so we might as well know where we are?
3476                 if (UCharOffset + 1 == buffer) {
3477                     /* we have exhausted the buffer */
3478                     int32_t newsize = 0;
3479                     if(source->pos) { // actually dealing with a position
3480                         newsize = (int32_t)(source->pos - source->string + 1);
3481                     } else { // iterator
3482                         newsize = 4 * UCOL_MAX_BUFFER;
3483                     }
3484                     strbuffer = (UChar *)uprv_malloc(sizeof(UChar) *
3485                         (newsize + UCOL_MAX_BUFFER));
3486                     /* test for NULL */
3487                     if (strbuffer == NULL) {
3488                         *status = U_MEMORY_ALLOCATION_ERROR;
3489                         return UCOL_NO_MORE_CES;
3490                     }
3491                     UCharOffset = strbuffer + newsize;
3492                     uprv_memcpy(UCharOffset, buffer,
3493                         UCOL_MAX_BUFFER * sizeof(UChar));
3494                     UCharOffset --;
3495                 }
3496                 if ((source->pos && (source->pos == source->string ||
3497                     ((source->flags & UCOL_ITER_INNORMBUF) &&
3498                     *(source->pos - 1) == 0 && source->fcdPosition == NULL)))
3499                     || (source->iterator && !source->iterator->hasPrevious(source->iterator))) {
3500                         break;
3501                 }
3502             }
3503             /* adds the initial base character to the string */
3504             *(UCharOffset) = schar;
3505             noChars++;
3506 
3507             int32_t offsetBias;
3508 
3509             // **** doesn't work if using iterator ****
3510             if (source->flags & UCOL_ITER_INNORMBUF) {
3511                 offsetBias = -1;
3512             } else {
3513                 offsetBias = (int32_t)(source->pos - source->string);
3514             }
3515 
3516             /* a new collIterate is used to simplify things, since using the current
3517             collIterate will mean that the forward and backwards iteration will
3518             share and change the same buffers. we don't want to get into that. */
3519             collIterate temp;
3520             int32_t rawOffset;
3521 
3522             IInit_collIterate(coll, UCharOffset, noChars, &temp, status);
3523             if(U_FAILURE(*status)) {
3524                 return UCOL_NULLORDER;
3525             }
3526             temp.flags &= ~UCOL_ITER_NORM;
3527             temp.flags |= source->flags & UCOL_FORCE_HAN_IMPLICIT;
3528 
3529             rawOffset = (int32_t)(temp.pos - temp.string); // should always be zero?
3530             CE = ucol_IGetNextCE(coll, &temp, status);
3531 
3532             if (source->extendCEs) {
3533                 endCEBuffer = source->extendCEs + source->extendCEsSize;
3534                 CECount = (int32_t)((source->CEpos - source->extendCEs)/sizeof(uint32_t));
3535             } else {
3536                 endCEBuffer = source->CEs + UCOL_EXPAND_CE_BUFFER_SIZE;
3537                 CECount = (int32_t)((source->CEpos - source->CEs)/sizeof(uint32_t));
3538             }
3539 
3540             while (CE != UCOL_NO_MORE_CES) {
3541                 *(source->CEpos ++) = CE;
3542 
3543                 if (offsetBias >= 0) {
3544                     source->appendOffset(rawOffset + offsetBias, *status);
3545                 }
3546 
3547                 CECount++;
3548                 if (source->CEpos == endCEBuffer) {
3549                     /* ran out of CE space, reallocate to new buffer.
3550                     If reallocation fails, reset pointers and bail out,
3551                     there's no guarantee of the right character position after
3552                     this bail*/
3553                     if (!increaseCEsCapacity(source)) {
3554                         *status = U_MEMORY_ALLOCATION_ERROR;
3555                         break;
3556                     }
3557 
3558                     endCEBuffer = source->extendCEs + source->extendCEsSize;
3559                 }
3560 
3561                 if ((temp.flags & UCOL_ITER_INNORMBUF) != 0) {
3562                     rawOffset = (int32_t)(temp.fcdPosition - temp.string);
3563                 } else {
3564                     rawOffset = (int32_t)(temp.pos - temp.string);
3565                 }
3566 
3567                 CE = ucol_IGetNextCE(coll, &temp, status);
3568             }
3569 
3570             if (strbuffer != buffer) {
3571                 uprv_free(strbuffer);
3572             }
3573             if (U_FAILURE(*status)) {
3574                 return (uint32_t)UCOL_NULLORDER;
3575             }
3576 
3577             if (source->offsetRepeatValue != 0) {
3578                 if (CECount > noChars) {
3579                     source->offsetRepeatCount += temp.offsetRepeatCount;
3580                 } else {
3581                     // **** does this really skip the right offsets? ****
3582                     source->offsetReturn -= (noChars - CECount);
3583                 }
3584             }
3585 
3586             if (offsetBias >= 0) {
3587                 source->offsetReturn = source->offsetStore - 1;
3588                 if (source->offsetReturn == source->offsetBuffer) {
3589                     source->offsetStore = source->offsetBuffer;
3590                 }
3591             }
3592 
3593             source->toReturn = source->CEpos - 1;
3594             if (source->toReturn == source->CEs) {
3595                 source->CEpos = source->CEs;
3596             }
3597 
3598             return *(source->toReturn);
3599         }
3600         case LONG_PRIMARY_TAG:
3601             {
3602                 *(source->CEpos++) = ((CE & 0xFFFF00) << 8) | (UCOL_BYTE_COMMON << 8) | UCOL_BYTE_COMMON;
3603                 *(source->CEpos++) = ((CE & 0xFF)<<24)|UCOL_CONTINUATION_MARKER;
3604                 source->toReturn = source->CEpos - 1;
3605 
3606                 if (source->flags & UCOL_ITER_INNORMBUF) {
3607                     source->offsetRepeatCount = 1;
3608                 } else {
3609                     int32_t firstOffset = (int32_t)(source->pos - source->string);
3610 
3611                     source->appendOffset(firstOffset, *status);
3612                     source->appendOffset(firstOffset + 1, *status);
3613 
3614                     source->offsetReturn = source->offsetStore - 1;
3615                     *(source->offsetBuffer) = firstOffset;
3616                     if (source->offsetReturn == source->offsetBuffer) {
3617                         source->offsetStore = source->offsetBuffer;
3618                     }
3619                 }
3620 
3621 
3622                 return *(source->toReturn);
3623             }
3624 
3625         case EXPANSION_TAG: /* this tag always returns */
3626             {
3627             /*
3628             This should handle expansion.
3629             NOTE: we can encounter both continuations and expansions in an expansion!
3630             I have to decide where continuations are going to be dealt with
3631             */
3632             int32_t firstOffset = (int32_t)(source->pos - source->string);
3633 
3634             // **** doesn't work if using iterator ****
3635             if (source->offsetReturn != NULL) {
3636                 if (! (source->flags & UCOL_ITER_INNORMBUF) && source->offsetReturn == source->offsetBuffer) {
3637                     source->offsetStore = source->offsetBuffer;
3638                 }else {
3639                   firstOffset = -1;
3640                 }
3641             }
3642 
3643             /* find the offset to expansion table */
3644             CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE);
3645             size     = getExpansionCount(CE);
3646             if (size != 0) {
3647                 /*
3648                 if there are less than 16 elements in expansion, we don't terminate
3649                 */
3650                 uint32_t count;
3651 
3652                 for (count = 0; count < size; count++) {
3653                     *(source->CEpos ++) = *CEOffset++;
3654 
3655                     if (firstOffset >= 0) {
3656                         source->appendOffset(firstOffset + 1, *status);
3657                     }
3658                 }
3659             } else {
3660                 /* else, we do */
3661                 while (*CEOffset != 0) {
3662                     *(source->CEpos ++) = *CEOffset ++;
3663 
3664                     if (firstOffset >= 0) {
3665                         source->appendOffset(firstOffset + 1, *status);
3666                     }
3667                 }
3668             }
3669 
3670             if (firstOffset >= 0) {
3671                 source->offsetReturn = source->offsetStore - 1;
3672                 *(source->offsetBuffer) = firstOffset;
3673                 if (source->offsetReturn == source->offsetBuffer) {
3674                     source->offsetStore = source->offsetBuffer;
3675                 }
3676             } else {
3677                 source->offsetRepeatCount += size - 1;
3678             }
3679 
3680             source->toReturn = source->CEpos - 1;
3681             // in case of one element expansion, we
3682             // want to immediately return CEpos
3683             if(source->toReturn == source->CEs) {
3684                 source->CEpos = source->CEs;
3685             }
3686 
3687             return *(source->toReturn);
3688             }
3689 
3690         case DIGIT_TAG:
3691             {
3692                 /*
3693                 We do a check to see if we want to collate digits as numbers; if so we generate
3694                 a custom collation key. Otherwise we pull out the value stored in the expansion table.
3695                 */
3696                 uint32_t i;    /* general counter */
3697 
3698                 if (source->coll->numericCollation == UCOL_ON){
3699                     uint32_t digIndx = 0;
3700                     uint32_t endIndex = 0;
3701                     uint32_t leadingZeroIndex = 0;
3702                     uint32_t trailingZeroCount = 0;
3703 
3704                     uint8_t collateVal = 0;
3705 
3706                     UBool nonZeroValReached = FALSE;
3707 
3708                     uint8_t numTempBuf[UCOL_MAX_DIGITS_FOR_NUMBER/2 + 2]; // I just need a temporary place to store my generated CEs.
3709                     /*
3710                     We parse the source string until we hit a char that's NOT a digit.
3711                     Use this u_charDigitValue. This might be slow because we have to
3712                     handle surrogates...
3713                     */
3714                     /*
3715                     We need to break up the digit string into collection elements of UCOL_MAX_DIGITS_FOR_NUMBER or less,
3716                     with any chunks smaller than that being on the right end of the digit string - i.e. the first collation
3717                     element we process when going backward. To determine how long that chunk might be, we may need to make
3718                     two passes through the loop that collects digits - one to see how long the string is (and how much is
3719                     leading zeros) to determine the length of that right-hand chunk, and a second (if the whole string has
3720                     more than UCOL_MAX_DIGITS_FOR_NUMBER non-leading-zero digits) to actually process that collation
3721                     element chunk after resetting the state to the initialState at the right side of the digit string.
3722                     */
3723                     uint32_t ceLimit = 0;
3724                     UChar initial_ch = ch;
3725                     collIterateState initialState = {0,0,0,0,0,0,0,0,0};
3726                     backupState(source, &initialState);
3727 
3728                     for(;;) {
3729                         collIterateState state = {0,0,0,0,0,0,0,0,0};
3730                         UChar32 char32 = 0;
3731                         int32_t digVal = 0;
3732 
3733                         if (U16_IS_TRAIL (ch)) {
3734                             if (!collIter_bos(source)){
3735                                 UChar lead = getPrevNormalizedChar(source, status);
3736                                 if(U16_IS_LEAD(lead)) {
3737                                     char32 = U16_GET_SUPPLEMENTARY(lead,ch);
3738                                     goBackOne(source);
3739                                 } else {
3740                                     char32 = ch;
3741                                 }
3742                             } else {
3743                                 char32 = ch;
3744                             }
3745                         } else {
3746                             char32 = ch;
3747                         }
3748                         digVal = u_charDigitValue(char32);
3749 
3750                         for(;;) {
3751                             // Make sure we have enough space. No longer needed;
3752                             // at this point the largest value of digIndx when we need to save data in numTempBuf
3753                             // is UCOL_MAX_DIGITS_FOR_NUMBER-1 (digIndx is post-incremented) so we just ensure
3754                             // that numTempBuf is big enough (UCOL_MAX_DIGITS_FOR_NUMBER/2 + 2).
3755 
3756                             // Skip over trailing zeroes, and keep a count of them.
3757                             if (digVal != 0)
3758                                 nonZeroValReached = TRUE;
3759 
3760                             if (nonZeroValReached) {
3761                                 /*
3762                                 We parse the digit string into base 100 numbers (this fits into a byte).
3763                                 We only add to the buffer in twos, thus if we are parsing an odd character,
3764                                 that serves as the 'tens' digit while the if we are parsing an even one, that
3765                                 is the 'ones' digit. We dumped the parsed base 100 value (collateVal) into
3766                                 a buffer. We multiply each collateVal by 2 (to give us room) and add 5 (to avoid
3767                                 overlapping magic CE byte values). The last byte we subtract 1 to ensure it is less
3768                                 than all the other bytes.
3769 
3770                                 Since we're doing in this reverse we want to put the first digit encountered into the
3771                                 ones place and the second digit encountered into the tens place.
3772                                 */
3773 
3774                                 if ((digIndx + trailingZeroCount) % 2 == 1) {
3775                                     // High-order digit case (tens place)
3776                                     collateVal += (uint8_t)(digVal * 10);
3777 
3778                                     // We cannot set leadingZeroIndex unless it has been set for the
3779                                     // low-order digit. Therefore, all we can do for the high-order
3780                                     // digit is turn it off, never on.
3781                                     // The only time we will have a high digit without a low is for
3782                                     // the very first non-zero digit, so no zero check is necessary.
3783                                     if (collateVal != 0)
3784                                         leadingZeroIndex = 0;
3785 
3786                                     // The first pass through, digIndx may exceed the limit, but in that case
3787                                     // we no longer care about numTempBuf contents since they will be discarded
3788                                     if ( digIndx < UCOL_MAX_DIGITS_FOR_NUMBER ) {
3789                                         numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6;
3790                                     }
3791                                     collateVal = 0;
3792                                 } else {
3793                                     // Low-order digit case (ones place)
3794                                     collateVal = (uint8_t)digVal;
3795 
3796                                     // Check for leading zeroes.
3797                                     if (collateVal == 0) {
3798                                         if (!leadingZeroIndex)
3799                                             leadingZeroIndex = (digIndx/2) + 2;
3800                                     } else
3801                                         leadingZeroIndex = 0;
3802 
3803                                     // No need to write to buffer; the case of a last odd digit
3804                                     // is handled below.
3805                                 }
3806                                 ++digIndx;
3807                             } else
3808                                 ++trailingZeroCount;
3809 
3810                             if (!collIter_bos(source)) {
3811                                 ch = getPrevNormalizedChar(source, status);
3812                                 //goBackOne(source);
3813                                 if (U16_IS_TRAIL(ch)) {
3814                                     backupState(source, &state);
3815                                     if (!collIter_bos(source)) {
3816                                         goBackOne(source);
3817                                         UChar lead = getPrevNormalizedChar(source, status);
3818 
3819                                         if(U16_IS_LEAD(lead)) {
3820                                             char32 = U16_GET_SUPPLEMENTARY(lead,ch);
3821                                         } else {
3822                                             loadState(source, &state, FALSE);
3823                                             char32 = ch;
3824                                         }
3825                                     }
3826                                 } else
3827                                     char32 = ch;
3828 
3829                                 if ((digVal = u_charDigitValue(char32)) == -1 || (ceLimit > 0 && (digIndx + trailingZeroCount) >= ceLimit)) {
3830                                     if (char32 > 0xFFFF) {// For surrogates.
3831                                         loadState(source, &state, FALSE);
3832                                     }
3833                                     // Don't need to "reverse" the goBackOne call,
3834                                     // as this points to the next position to process..
3835                                     //if (char32 > 0xFFFF) // For surrogates.
3836                                     //getNextNormalizedChar(source);
3837                                     break;
3838                                 }
3839 
3840                                 goBackOne(source);
3841                             }else
3842                                 break;
3843                         }
3844 
3845                         if (digIndx + trailingZeroCount <= UCOL_MAX_DIGITS_FOR_NUMBER) {
3846                             // our collation element is not too big, go ahead and finish with it
3847                             break;
3848                         }
3849                         // our digit string is too long for a collation element;
3850                         // set the limit for it, reset the state and begin again
3851                         ceLimit = (digIndx + trailingZeroCount) % UCOL_MAX_DIGITS_FOR_NUMBER;
3852                         if ( ceLimit == 0 ) {
3853                             ceLimit = UCOL_MAX_DIGITS_FOR_NUMBER;
3854                         }
3855                         ch = initial_ch;
3856                         loadState(source, &initialState, FALSE);
3857                         digIndx = endIndex = leadingZeroIndex = trailingZeroCount = 0;
3858                         collateVal = 0;
3859                         nonZeroValReached = FALSE;
3860                     }
3861 
3862                     if (! nonZeroValReached) {
3863                         digIndx = 2;
3864                         trailingZeroCount = 0;
3865                         numTempBuf[2] = 6;
3866                     }
3867 
3868                     if ((digIndx + trailingZeroCount) % 2 != 0) {
3869                         numTempBuf[((digIndx)/2) + 2] = collateVal*2 + 6;
3870                         digIndx += 1;       // The implicit leading zero
3871                     }
3872                     if (trailingZeroCount % 2 != 0) {
3873                         // We had to consume one trailing zero for the low digit
3874                         // of the least significant byte
3875                         digIndx += 1;       // The trailing zero not in the exponent
3876                         trailingZeroCount -= 1;
3877                     }
3878 
3879                     endIndex = leadingZeroIndex ? leadingZeroIndex : ((digIndx/2) + 2) ;
3880 
3881                     // Subtract one off of the last byte. Really the first byte here, but it's reversed...
3882                     numTempBuf[2] -= 1;
3883 
3884                     /*
3885                     We want to skip over the first two slots in the buffer. The first slot
3886                     is reserved for the header byte UCOL_CODAN_PLACEHOLDER. The second slot is for the
3887                     sign/exponent byte: 0x80 + (decimalPos/2) & 7f.
3888                     The exponent must be adjusted by the number of leading zeroes, and the number of
3889                     trailing zeroes.
3890                     */
3891                     numTempBuf[0] = UCOL_CODAN_PLACEHOLDER;
3892                     uint32_t exponent = (digIndx+trailingZeroCount)/2;
3893                     if (leadingZeroIndex)
3894                         exponent -= ((digIndx/2) + 2 - leadingZeroIndex);
3895                     numTempBuf[1] = (uint8_t)(0x80 + (exponent & 0x7F));
3896 
3897                     // Now transfer the collation key to our collIterate struct.
3898                     // The total size for our collation key is half of endIndex, rounded up.
3899                     int32_t size = (endIndex+1)/2;
3900                     if(!ensureCEsCapacity(source, size)) {
3901                         return UCOL_NULLORDER;
3902                     }
3903                     *(source->CEpos++) = (((numTempBuf[0] << 8) | numTempBuf[1]) << UCOL_PRIMARYORDERSHIFT) | //Primary weight
3904                         (UCOL_BYTE_COMMON << UCOL_SECONDARYORDERSHIFT) | // Secondary weight
3905                         UCOL_BYTE_COMMON; // Tertiary weight.
3906                     i = endIndex - 1; // Reset the index into the buffer.
3907                     while(i >= 2) {
3908                         uint32_t primWeight = numTempBuf[i--] << 8;
3909                         if ( i >= 2)
3910                             primWeight |= numTempBuf[i--];
3911                         *(source->CEpos++) = (primWeight << UCOL_PRIMARYORDERSHIFT) | UCOL_CONTINUATION_MARKER;
3912                     }
3913 
3914                     source->toReturn = source->CEpos -1;
3915                     return *(source->toReturn);
3916                 } else {
3917                     CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE);
3918                     CE = *(CEOffset++);
3919                     break;
3920                 }
3921             }
3922 
3923         case HANGUL_SYLLABLE_TAG: /* AC00-D7AF*/
3924             {
3925                 static const uint32_t
3926                     SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7;
3927                 //const uint32_t LCount = 19;
3928                 static const uint32_t VCount = 21;
3929                 static const uint32_t TCount = 28;
3930                 //const uint32_t NCount = VCount * TCount;   /* 588 */
3931                 //const uint32_t SCount = LCount * NCount;   /* 11172 */
3932 
3933                 uint32_t L = ch - SBase;
3934                 /*
3935                 divide into pieces.
3936                 we do it in this order since some compilers can do % and / in one
3937                 operation
3938                 */
3939                 uint32_t T = L % TCount;
3940                 L /= TCount;
3941                 uint32_t V = L % VCount;
3942                 L /= VCount;
3943 
3944                 /* offset them */
3945                 L += LBase;
3946                 V += VBase;
3947                 T += TBase;
3948 
3949                 int32_t firstOffset = (int32_t)(source->pos - source->string);
3950                 source->appendOffset(firstOffset, *status);
3951 
3952                 /*
3953                  * return the first CE, but first put the rest into the expansion buffer
3954                  */
3955                 if (!source->coll->image->jamoSpecial) {
3956                     *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, L);
3957                     *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, V);
3958                     source->appendOffset(firstOffset + 1, *status);
3959 
3960                     if (T != TBase) {
3961                         *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, T);
3962                         source->appendOffset(firstOffset + 1, *status);
3963                     }
3964 
3965                     source->toReturn = source->CEpos - 1;
3966 
3967                     source->offsetReturn = source->offsetStore - 1;
3968                     if (source->offsetReturn == source->offsetBuffer) {
3969                         source->offsetStore = source->offsetBuffer;
3970                     }
3971 
3972                     return *(source->toReturn);
3973                 } else {
3974                     // Since Hanguls pass the FCD check, it is
3975                     // guaranteed that we won't be in
3976                     // the normalization buffer if something like this happens
3977 
3978                     // Move Jamos into normalization buffer
3979                     UChar *tempbuffer = source->writableBuffer.getBuffer(5);
3980                     int32_t tempbufferLength, jamoOffset;
3981                     tempbuffer[0] = 0;
3982                     tempbuffer[1] = (UChar)L;
3983                     tempbuffer[2] = (UChar)V;
3984                     if (T != TBase) {
3985                         tempbuffer[3] = (UChar)T;
3986                         tempbufferLength = 4;
3987                     } else {
3988                         tempbufferLength = 3;
3989                     }
3990                     source->writableBuffer.releaseBuffer(tempbufferLength);
3991 
3992                     // Indicate where to continue in main input string after exhausting the writableBuffer
3993                     if (source->pos  == source->string) {
3994                         jamoOffset = 0;
3995                         source->fcdPosition = NULL;
3996                     } else {
3997                         jamoOffset = source->pos - source->string;
3998                         source->fcdPosition       = source->pos-1;
3999                     }
4000 
4001 					// Append offsets for the additional chars
4002 					// (not the 0, and not the L whose offsets match the original Hangul)
4003                     int32_t jamoRemaining = tempbufferLength - 2;
4004                     jamoOffset++; // appended offsets should match end of original Hangul
4005                     while (jamoRemaining-- > 0) {
4006                         source->appendOffset(jamoOffset, *status);
4007                     }
4008 
4009                     source->offsetRepeatValue = jamoOffset;
4010 
4011                     source->offsetReturn = source->offsetStore - 1;
4012                     if (source->offsetReturn == source->offsetBuffer) {
4013                         source->offsetStore = source->offsetBuffer;
4014                     }
4015 
4016                     source->pos               = source->writableBuffer.getTerminatedBuffer() + tempbufferLength;
4017                     source->origFlags         = source->flags;
4018                     source->flags            |= UCOL_ITER_INNORMBUF;
4019                     source->flags            &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
4020 
4021                     return(UCOL_IGNORABLE);
4022                 }
4023             }
4024 
4025         case IMPLICIT_TAG:        /* everything that is not defined otherwise */
4026             return getPrevImplicit(ch, source);
4027 
4028             // TODO: Remove CJK implicits as they are handled by the getImplicitPrimary function
4029         case CJK_IMPLICIT_TAG:    /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D*/
4030             return getPrevImplicit(ch, source);
4031 
4032         case SURROGATE_TAG:  /* This is a surrogate pair */
4033             /* essentially an engaged lead surrogate. */
4034             /* if you have encountered it here, it means that a */
4035             /* broken sequence was encountered and this is an error */
4036             return UCOL_NOT_FOUND;
4037 
4038         case LEAD_SURROGATE_TAG:  /* D800-DBFF*/
4039             return UCOL_NOT_FOUND; /* broken surrogate sequence */
4040 
4041         case TRAIL_SURROGATE_TAG: /* DC00-DFFF*/
4042             {
4043                 UChar32 cp = 0;
4044                 UChar  prevChar;
4045                 const UChar *prev;
4046                 if (isAtStartPrevIterate(source)) {
4047                     /* we are at the start of the string, wrong place to be at */
4048                     return UCOL_NOT_FOUND;
4049                 }
4050                 if (source->pos != source->writableBuffer.getBuffer()) {
4051                     prev     = source->pos - 1;
4052                 } else {
4053                     prev     = source->fcdPosition;
4054                 }
4055                 prevChar = *prev;
4056 
4057                 /* Handles Han and Supplementary characters here.*/
4058                 if (U16_IS_LEAD(prevChar)) {
4059                     cp = ((((uint32_t)prevChar)<<10UL)+(ch)-(((uint32_t)0xd800<<10UL)+0xdc00-0x10000));
4060                     source->pos = prev;
4061                 } else {
4062                     return UCOL_NOT_FOUND; /* like unassigned */
4063                 }
4064 
4065                 return getPrevImplicit(cp, source);
4066             }
4067 
4068             /* UCA is filled with these. Tailorings are NOT_FOUND */
4069             /* not yet implemented */
4070         case CHARSET_TAG:  /* this tag always returns */
4071             /* probably after 1.8 */
4072             return UCOL_NOT_FOUND;
4073 
4074         default:           /* this tag always returns */
4075             *status = U_INTERNAL_PROGRAM_ERROR;
4076             CE=0;
4077             break;
4078         }
4079 
4080         if (CE <= UCOL_NOT_FOUND) {
4081             break;
4082         }
4083     }
4084 
4085     return CE;
4086 }
4087 
4088 /* This should really be a macro        */
4089 /* However, it is used only when stack buffers are not sufficiently big, and then we're messed up performance wise */
4090 /* anyway */
4091 static
reallocateBuffer(uint8_t ** secondaries,uint8_t * secStart,uint8_t * second,uint32_t * secSize,uint32_t newSize,UErrorCode * status)4092 uint8_t *reallocateBuffer(uint8_t **secondaries, uint8_t *secStart, uint8_t *second, uint32_t *secSize, uint32_t newSize, UErrorCode *status) {
4093 #ifdef UCOL_DEBUG
4094     fprintf(stderr, ".");
4095 #endif
4096     uint8_t *newStart = NULL;
4097     uint32_t offset = (uint32_t)(*secondaries-secStart);
4098 
4099     if(secStart==second) {
4100         newStart=(uint8_t*)uprv_malloc(newSize);
4101         if(newStart==NULL) {
4102             *status = U_MEMORY_ALLOCATION_ERROR;
4103             return NULL;
4104         }
4105         uprv_memcpy(newStart, secStart, *secondaries-secStart);
4106     } else {
4107         newStart=(uint8_t*)uprv_realloc(secStart, newSize);
4108         if(newStart==NULL) {
4109             *status = U_MEMORY_ALLOCATION_ERROR;
4110             /* Since we're reallocating, return original reference so we don't loose it. */
4111             return secStart;
4112         }
4113     }
4114     *secondaries=newStart+offset;
4115     *secSize=newSize;
4116     return newStart;
4117 }
4118 
4119 
4120 /* This should really be a macro                                                                      */
4121 /* This function is used to reverse parts of a buffer. We need this operation when doing continuation */
4122 /* secondaries in French                                                                              */
4123 /*
4124 void uprv_ucol_reverse_buffer(uint8_t *start, uint8_t *end) {
4125   uint8_t temp;
4126   while(start<end) {
4127     temp = *start;
4128     *start++ = *end;
4129     *end-- = temp;
4130   }
4131 }
4132 */
4133 
/*
 * Reverses, in place, the elements of type TYPE between the pointers
 * start and end (both inclusive).
 * start and end must be modifiable pointer lvalues: the macro advances
 * start and retreats end while swapping, and leaves them where the loop
 * stopped (start >= end). Each argument is evaluated several times.
 */
#define uprv_ucol_reverse_buffer(TYPE, start, end) { \
    for(; (start)<(end); ++(start), --(end)) { \
        TYPE tempA = *(start); \
        *(start) = *(end); \
        *(end) = tempA; \
    } \
}
4142 
4143 /****************************************************************************/
4144 /* Following are the sortkey generation functions                           */
4145 /*                                                                          */
4146 /****************************************************************************/
4147 
4148 /**
4149  * Merge two sort keys.
4150  * This is useful, for example, to combine sort keys from first and last names
4151  * to sort such pairs.
4152  * Merged sort keys consider on each collation level the first part first entirely,
4153  * then the second one.
4154  * It is possible to merge multiple sort keys by consecutively merging
4155  * another one with the intermediate result.
4156  *
4157  * The length of the merge result is the sum of the lengths of the input sort keys
4158  * minus 1.
4159  *
4160  * @param src1 the first sort key
4161  * @param src1Length the length of the first sort key, including the zero byte at the end;
4162  *        can be -1 if the function is to find the length
4163  * @param src2 the second sort key
4164  * @param src2Length the length of the second sort key, including the zero byte at the end;
4165  *        can be -1 if the function is to find the length
4166  * @param dest the buffer where the merged sort key is written,
4167  *        can be NULL if destCapacity==0
4168  * @param destCapacity the number of bytes in the dest buffer
4169  * @return the length of the merged sort key, src1Length+src2Length-1;
4170  *         can be larger than destCapacity, or 0 if an error occurs (only for illegal arguments),
4171  *         in which cases the contents of dest is undefined
4172  *
4173  * @draft
4174  */
4175 U_CAPI int32_t U_EXPORT2
ucol_mergeSortkeys(const uint8_t * src1,int32_t src1Length,const uint8_t * src2,int32_t src2Length,uint8_t * dest,int32_t destCapacity)4176 ucol_mergeSortkeys(const uint8_t *src1, int32_t src1Length,
4177                    const uint8_t *src2, int32_t src2Length,
4178                    uint8_t *dest, int32_t destCapacity) {
4179     int32_t destLength;
4180     uint8_t b;
4181 
4182     /* check arguments */
4183     if( src1==NULL || src1Length<-2 || src1Length==0 || (src1Length>0 && src1[src1Length-1]!=0) ||
4184         src2==NULL || src2Length<-2 || src2Length==0 || (src2Length>0 && src2[src2Length-1]!=0) ||
4185         destCapacity<0 || (destCapacity>0 && dest==NULL)
4186     ) {
4187         /* error, attempt to write a zero byte and return 0 */
4188         if(dest!=NULL && destCapacity>0) {
4189             *dest=0;
4190         }
4191         return 0;
4192     }
4193 
4194     /* check lengths and capacity */
4195     if(src1Length<0) {
4196         src1Length=(int32_t)uprv_strlen((const char *)src1)+1;
4197     }
4198     if(src2Length<0) {
4199         src2Length=(int32_t)uprv_strlen((const char *)src2)+1;
4200     }
4201 
4202     destLength=src1Length+src2Length-1;
4203     if(destLength>destCapacity) {
4204         /* the merged sort key does not fit into the destination */
4205         return destLength;
4206     }
4207 
4208     /* merge the sort keys with the same number of levels */
4209     while(*src1!=0 && *src2!=0) { /* while both have another level */
4210         /* copy level from src1 not including 00 or 01 */
4211         while((b=*src1)>=2) {
4212             ++src1;
4213             *dest++=b;
4214         }
4215 
4216         /* add a 02 merge separator */
4217         *dest++=2;
4218 
4219         /* copy level from src2 not including 00 or 01 */
4220         while((b=*src2)>=2) {
4221             ++src2;
4222             *dest++=b;
4223         }
4224 
4225         /* if both sort keys have another level, then add a 01 level separator and continue */
4226         if(*src1==1 && *src2==1) {
4227             ++src1;
4228             ++src2;
4229             *dest++=1;
4230         }
4231     }
4232 
4233     /*
4234      * here, at least one sort key is finished now, but the other one
4235      * might have some contents left from containing more levels;
4236      * that contents is just appended to the result
4237      */
4238     if(*src1!=0) {
4239         /* src1 is not finished, therefore *src2==0, and src1 is appended */
4240         src2=src1;
4241     }
4242     /* append src2, "the other, unfinished sort key" */
4243     uprv_strcpy((char *)dest, (const char *)src2);
4244 
4245     /* trust that neither sort key contained illegally embedded zero bytes */
4246     return destLength;
4247 }
4248 
4249 /* sortkey API */
4250 U_CAPI int32_t U_EXPORT2
ucol_getSortKey(const UCollator * coll,const UChar * source,int32_t sourceLength,uint8_t * result,int32_t resultLength)4251 ucol_getSortKey(const    UCollator    *coll,
4252         const    UChar        *source,
4253         int32_t        sourceLength,
4254         uint8_t        *result,
4255         int32_t        resultLength)
4256 {
4257     UTRACE_ENTRY(UTRACE_UCOL_GET_SORTKEY);
4258     if (UTRACE_LEVEL(UTRACE_VERBOSE)) {
4259         UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source string = %vh ", coll, source,
4260             ((sourceLength==-1 && source!=NULL) ? u_strlen(source) : sourceLength));
4261     }
4262 
4263     UErrorCode status = U_ZERO_ERROR;
4264     int32_t keySize   = 0;
4265 
4266     if(source != NULL) {
4267         // source == NULL is actually an error situation, but we would need to
4268         // have an error code to return it. Until we introduce a new
4269         // API, it stays like this
4270 
4271         /* this uses the function pointer that is set in updateinternalstate */
4272         /* currently, there are two funcs: */
4273         /*ucol_calcSortKey(...);*/
4274         /*ucol_calcSortKeySimpleTertiary(...);*/
4275 
4276         keySize = coll->sortKeyGen(coll, source, sourceLength, &result, resultLength, FALSE, &status);
4277         //if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR && result && resultLength > 0) {
4278             // That's not good. Something unusual happened.
4279             // We don't know how much we initialized before we failed.
4280             // NULL terminate for safety.
4281             // We have no way say that we have generated a partial sort key.
4282             //result[0] = 0;
4283             //keySize = 0;
4284         //}
4285     }
4286     UTRACE_DATA2(UTRACE_VERBOSE, "Sort Key = %vb", result, keySize);
4287     UTRACE_EXIT_STATUS(status);
4288     return keySize;
4289 }
4290 
4291 /* this function is called by the C++ API for sortkey generation */
4292 U_CFUNC int32_t
ucol_getSortKeyWithAllocation(const UCollator * coll,const UChar * source,int32_t sourceLength,uint8_t ** pResult,UErrorCode * pErrorCode)4293 ucol_getSortKeyWithAllocation(const UCollator *coll,
4294                               const UChar *source, int32_t sourceLength,
4295                               uint8_t **pResult,
4296                               UErrorCode *pErrorCode) {
4297     *pResult = 0;
4298     return coll->sortKeyGen(coll, source, sourceLength, pResult, 0, TRUE, pErrorCode);
4299 }
4300 
4301 #define UCOL_FSEC_BUF_SIZE 256
4302 
4303 // Is this primary weight compressible?
4304 // Returns false for multi-lead-byte scripts (digits, Latin, Han, implicit).
4305 // TODO: This should use per-lead-byte flags from FractionalUCA.txt.
4306 static inline UBool
isCompressible(const UCollator *,uint8_t primary1)4307 isCompressible(const UCollator * /*coll*/, uint8_t primary1) {
4308     return UCOL_BYTE_FIRST_NON_LATIN_PRIMARY <= primary1 && primary1 <= maxRegularPrimary;
4309 }
4310 
4311 /* This function tries to get the size of a sortkey. It will be invoked if the size of resulting buffer is 0  */
4312 /* or if we run out of space while making a sortkey and want to return ASAP                                   */
ucol_getSortKeySize(const UCollator * coll,collIterate * s,int32_t currentSize,UColAttributeValue strength,int32_t len)4313 int32_t ucol_getSortKeySize(const UCollator *coll, collIterate *s, int32_t currentSize, UColAttributeValue strength, int32_t len) {
4314     UErrorCode status = U_ZERO_ERROR;
4315     //const UCAConstants *UCAconsts = (UCAConstants *)((uint8_t *)coll->UCA->image + coll->image->UCAConsts);
4316     uint8_t compareSec   = (uint8_t)((strength >= UCOL_SECONDARY)?0:0xFF);
4317     uint8_t compareTer   = (uint8_t)((strength >= UCOL_TERTIARY)?0:0xFF);
4318     uint8_t compareQuad  = (uint8_t)((strength >= UCOL_QUATERNARY)?0:0xFF);
4319     UBool  compareIdent = (strength == UCOL_IDENTICAL);
4320     UBool  doCase = (coll->caseLevel == UCOL_ON);
4321     UBool  shifted = (coll->alternateHandling == UCOL_SHIFTED);
4322     //UBool  qShifted = shifted  && (compareQuad == 0);
4323     UBool  doHiragana = (coll->hiraganaQ == UCOL_ON) && (compareQuad == 0);
4324     UBool  isFrenchSec = (coll->frenchCollation == UCOL_ON) && (compareSec == 0);
4325     uint8_t fSecsBuff[UCOL_FSEC_BUF_SIZE];
4326     uint8_t *fSecs = fSecsBuff;
4327     uint32_t fSecsLen = 0, fSecsMaxLen = UCOL_FSEC_BUF_SIZE;
4328     uint8_t *frenchStartPtr = NULL, *frenchEndPtr = NULL;
4329 
4330     uint32_t variableTopValue = coll->variableTopValue;
4331     uint8_t UCOL_COMMON_BOT4 = (uint8_t)((coll->variableTopValue>>8)+1);
4332     if(doHiragana) {
4333         UCOL_COMMON_BOT4++;
4334         /* allocate one more space for hiragana */
4335     }
4336     uint8_t UCOL_BOT_COUNT4 = (uint8_t)(0xFF - UCOL_COMMON_BOT4);
4337 
4338     uint32_t order = UCOL_NO_MORE_CES;
4339     uint8_t primary1 = 0;
4340     uint8_t primary2 = 0;
4341     uint8_t secondary = 0;
4342     uint8_t tertiary = 0;
4343     int32_t caseShift = 0;
4344     uint32_t c2 = 0, c3 = 0, c4 = 0; /* variables for compression */
4345 
4346     uint8_t caseSwitch = coll->caseSwitch;
4347     uint8_t tertiaryMask = coll->tertiaryMask;
4348     uint8_t tertiaryCommon = coll->tertiaryCommon;
4349 
4350     UBool wasShifted = FALSE;
4351     UBool notIsContinuation = FALSE;
4352     uint8_t leadPrimary = 0;
4353 
4354 
4355     for(;;) {
4356         order = ucol_IGetNextCE(coll, s, &status);
4357         if(order == UCOL_NO_MORE_CES) {
4358             break;
4359         }
4360 
4361         if(order == 0) {
4362             continue;
4363         }
4364 
4365         notIsContinuation = !isContinuation(order);
4366 
4367 
4368         if(notIsContinuation) {
4369             tertiary = (uint8_t)((order & UCOL_BYTE_SIZE_MASK));
4370         } else {
4371             tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION));
4372         }
4373         secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
4374         primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
4375         primary1 = (uint8_t)(order >> 8);
4376 
4377         /* no need to permute since the actual code values don't matter
4378         if (coll->leadBytePermutationTable != NULL && notIsContinuation) {
4379             primary1 = coll->leadBytePermutationTable[primary1];
4380         }
4381         */
4382 
4383         if((shifted && ((notIsContinuation && order <= variableTopValue && primary1 > 0)
4384                       || (!notIsContinuation && wasShifted)))
4385             || (wasShifted && primary1 == 0)) { /* amendment to the UCA says that primary ignorables */
4386                 /* and other ignorables should be removed if following a shifted code point */
4387                 if(primary1 == 0) { /* if we were shifted and we got an ignorable code point */
4388                     /* we should just completely ignore it */
4389                     continue;
4390                 }
4391                 if(compareQuad == 0) {
4392                     if(c4 > 0) {
4393                         currentSize += (c2/UCOL_BOT_COUNT4)+1;
4394                         c4 = 0;
4395                     }
4396                     currentSize++;
4397                     if(primary2 != 0) {
4398                         currentSize++;
4399                     }
4400                 }
4401                 wasShifted = TRUE;
4402         } else {
4403             wasShifted = FALSE;
4404             /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */
4405             /* Usually, we'll have non-zero primary1 & primary2, except in cases of a-z and friends, when primary2 will   */
4406             /* calculate sortkey size */
4407             if(primary1 != UCOL_IGNORABLE) {
4408                 if(notIsContinuation) {
4409                     if(leadPrimary == primary1) {
4410                         currentSize++;
4411                     } else {
4412                         if(leadPrimary != 0) {
4413                             currentSize++;
4414                         }
4415                         if(primary2 == UCOL_IGNORABLE) {
4416                             /* one byter, not compressed */
4417                             currentSize++;
4418                             leadPrimary = 0;
4419                         } else if(isCompressible(coll, primary1)) {
4420                             /* compress */
4421                             leadPrimary = primary1;
4422                             currentSize+=2;
4423                         } else {
4424                             leadPrimary = 0;
4425                             currentSize+=2;
4426                         }
4427                     }
4428                 } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */
4429                     currentSize++;
4430                     if(primary2 != UCOL_IGNORABLE) {
4431                         currentSize++;
4432                     }
4433                 }
4434             }
4435 
4436             if(secondary > compareSec) { /* I think that != 0 test should be != IGNORABLE */
4437                 if(!isFrenchSec){
4438                     if (secondary == UCOL_COMMON2 && notIsContinuation) {
4439                         c2++;
4440                     } else {
4441                         if(c2 > 0) {
4442                             if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
4443                                 currentSize += (c2/(uint32_t)UCOL_TOP_COUNT2)+1;
4444                             } else {
4445                                 currentSize += (c2/(uint32_t)UCOL_BOT_COUNT2)+1;
4446                             }
4447                             c2 = 0;
4448                         }
4449                         currentSize++;
4450                     }
4451                 } else {
4452                     fSecs[fSecsLen++] = secondary;
4453                     if(fSecsLen == fSecsMaxLen) {
4454                         uint8_t *fSecsTemp;
4455                         if(fSecs == fSecsBuff) {
4456                             fSecsTemp = (uint8_t *)uprv_malloc(2*fSecsLen);
4457                         } else {
4458                             fSecsTemp = (uint8_t *)uprv_realloc(fSecs, 2*fSecsLen);
4459                         }
4460                         if(fSecsTemp == NULL) {
4461                             status = U_MEMORY_ALLOCATION_ERROR;
4462                             return 0;
4463                         }
4464                         fSecs = fSecsTemp;
4465                         fSecsMaxLen *= 2;
4466                     }
4467                     if(notIsContinuation) {
4468                         if (frenchStartPtr != NULL) {
4469                             /* reverse secondaries from frenchStartPtr up to frenchEndPtr */
4470                             uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr);
4471                             frenchStartPtr = NULL;
4472                         }
4473                     } else {
4474                         if (frenchStartPtr == NULL) {
4475                             frenchStartPtr = fSecs+fSecsLen-2;
4476                         }
4477                         frenchEndPtr = fSecs+fSecsLen-1;
4478                     }
4479                 }
4480             }
4481 
4482             if(doCase && (primary1 > 0 || strength >= UCOL_SECONDARY)) {
4483                 // do the case level if we need to do it. We don't want to calculate
4484                 // case level for primary ignorables if we have only primary strength and case level
4485                 // otherwise we would break well formedness of CEs
4486                 if (caseShift  == 0) {
4487                     currentSize++;
4488                     caseShift = UCOL_CASE_SHIFT_START;
4489                 }
4490                 if((tertiary&0x3F) > 0 && notIsContinuation) {
4491                     caseShift--;
4492                     if((tertiary &0xC0) != 0) {
4493                         if (caseShift  == 0) {
4494                             currentSize++;
4495                             caseShift = UCOL_CASE_SHIFT_START;
4496                         }
4497                         caseShift--;
4498                     }
4499                 }
4500             } else {
4501                 if(notIsContinuation) {
4502                     tertiary ^= caseSwitch;
4503                 }
4504             }
4505 
4506             tertiary &= tertiaryMask;
4507             if(tertiary > compareTer) { /* I think that != 0 test should be != IGNORABLE */
4508                 if (tertiary == tertiaryCommon && notIsContinuation) {
4509                     c3++;
4510                 } else {
4511                     if(c3 > 0) {
4512                         if((tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL)
4513                             || (tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST)) {
4514                                 currentSize += (c3/(uint32_t)coll->tertiaryTopCount)+1;
4515                         } else {
4516                             currentSize += (c3/(uint32_t)coll->tertiaryBottomCount)+1;
4517                         }
4518                         c3 = 0;
4519                     }
4520                     currentSize++;
4521                 }
4522             }
4523 
4524             if(/*qShifted*/(compareQuad==0)  && notIsContinuation) {
4525                 if(s->flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we need to note it
4526                     if(c4>0) { // Close this part
4527                         currentSize += (c4/UCOL_BOT_COUNT4)+1;
4528                         c4 = 0;
4529                     }
4530                     currentSize++; // Add the Hiragana
4531                 } else { // This wasn't Hiragana, so we can continue adding stuff
4532                     c4++;
4533                 }
4534             }
4535         }
4536     }
4537 
4538     if(!isFrenchSec){
4539         if(c2 > 0) {
4540             currentSize += (c2/(uint32_t)UCOL_BOT_COUNT2)+((c2%(uint32_t)UCOL_BOT_COUNT2 != 0)?1:0);
4541         }
4542     } else {
4543         uint32_t i = 0;
4544         if(frenchStartPtr != NULL) {
4545             uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr);
4546         }
4547         for(i = 0; i<fSecsLen; i++) {
4548             secondary = *(fSecs+fSecsLen-i-1);
4549             /* This is compression code. */
4550             if (secondary == UCOL_COMMON2) {
4551                 ++c2;
4552             } else {
4553                 if(c2 > 0) {
4554                     if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
4555                         currentSize += (c2/(uint32_t)UCOL_TOP_COUNT2)+((c2%(uint32_t)UCOL_TOP_COUNT2 != 0)?1:0);
4556                     } else {
4557                         currentSize += (c2/(uint32_t)UCOL_BOT_COUNT2)+((c2%(uint32_t)UCOL_BOT_COUNT2 != 0)?1:0);
4558                     }
4559                     c2 = 0;
4560                 }
4561                 currentSize++;
4562             }
4563         }
4564         if(c2 > 0) {
4565             currentSize += (c2/(uint32_t)UCOL_BOT_COUNT2)+((c2%(uint32_t)UCOL_BOT_COUNT2 != 0)?1:0);
4566         }
4567         if(fSecs != fSecsBuff) {
4568             uprv_free(fSecs);
4569         }
4570     }
4571 
4572     if(c3 > 0) {
4573         currentSize += (c3/(uint32_t)coll->tertiaryBottomCount) + ((c3%(uint32_t)coll->tertiaryBottomCount != 0)?1:0);
4574     }
4575 
4576     if(c4 > 0  && compareQuad == 0) {
4577         currentSize += (c4/(uint32_t)UCOL_BOT_COUNT4)+((c4%(uint32_t)UCOL_BOT_COUNT4 != 0)?1:0);
4578     }
4579 
4580     if(compareIdent) {
4581         currentSize += u_lengthOfIdenticalLevelRun(s->string, len);
4582     }
4583     return currentSize;
4584 }
4585 
4586 static
doCaseShift(uint8_t ** cases,uint32_t & caseShift)4587 inline void doCaseShift(uint8_t **cases, uint32_t &caseShift) {
4588     if (caseShift  == 0) {
4589         *(*cases)++ = UCOL_CASE_BYTE_START;
4590         caseShift = UCOL_CASE_SHIFT_START;
4591     }
4592 }
4593 
// Appends value at primaries only while primaries is still below limit,
// advancing primaries when it does so. The counter size is incremented in
// every case, so that it reflects how many values we wanted to add, even if
// they did not all fit.
static
inline void addWithIncrement(uint8_t *&primaries, uint8_t *limit, uint32_t &size, const uint8_t value) {
    ++size;
    if(primaries >= limit) {
        // no room left: count only
        return;
    }
    *primaries = value;
    ++primaries;
}
4603 
4604 // Packs the secondary buffer when processing French locale. Adds the terminator.
4605 static
packFrench(uint8_t * primaries,uint8_t * primEnd,uint8_t * secondaries,uint32_t * secsize,uint8_t * frenchStartPtr,uint8_t * frenchEndPtr)4606 inline uint8_t *packFrench(uint8_t *primaries, uint8_t *primEnd, uint8_t *secondaries, uint32_t *secsize, uint8_t *frenchStartPtr, uint8_t *frenchEndPtr) {
4607     uint8_t secondary;
4608     int32_t count2 = 0;
4609     uint32_t i = 0, size = 0;
4610     // we use i here since the key size already accounts for terminators, so we'll discard the increment
4611     addWithIncrement(primaries, primEnd, i, UCOL_LEVELTERMINATOR);
4612     /* If there are any unresolved continuation secondaries, reverse them here so that we can reverse the whole secondary thing */
4613     if(frenchStartPtr != NULL) {
4614         uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr);
4615     }
4616     for(i = 0; i<*secsize; i++) {
4617         secondary = *(secondaries-i-1);
4618         /* This is compression code. */
4619         if (secondary == UCOL_COMMON2) {
4620             ++count2;
4621         } else {
4622             if (count2 > 0) {
4623                 if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
4624                     while (count2 > UCOL_TOP_COUNT2) {
4625                         addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2));
4626                         count2 -= (uint32_t)UCOL_TOP_COUNT2;
4627                     }
4628                     addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_TOP2 - (count2-1)));
4629                 } else {
4630                     while (count2 > UCOL_BOT_COUNT2) {
4631                         addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2));
4632                         count2 -= (uint32_t)UCOL_BOT_COUNT2;
4633                     }
4634                     addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_BOT2 + (count2-1)));
4635                 }
4636                 count2 = 0;
4637             }
4638             addWithIncrement(primaries, primEnd, size, secondary);
4639         }
4640     }
4641     if (count2 > 0) {
4642         while (count2 > UCOL_BOT_COUNT2) {
4643             addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2));
4644             count2 -= (uint32_t)UCOL_BOT_COUNT2;
4645         }
4646         addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_BOT2 + (count2-1)));
4647     }
4648     *secsize = size;
4649     return primaries;
4650 }
4651 
4652 #define DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY 0
4653 
4654 /* This is the sortkey work horse function */
4655 U_CFUNC int32_t U_CALLCONV
ucol_calcSortKey(const UCollator * coll,const UChar * source,int32_t sourceLength,uint8_t ** result,uint32_t resultLength,UBool allocateSKBuffer,UErrorCode * status)4656 ucol_calcSortKey(const    UCollator    *coll,
4657         const    UChar        *source,
4658         int32_t        sourceLength,
4659         uint8_t        **result,
4660         uint32_t        resultLength,
4661         UBool allocateSKBuffer,
4662         UErrorCode *status)
4663 {
4664     //const UCAConstants *UCAconsts = (UCAConstants *)((uint8_t *)coll->UCA->image + coll->image->UCAConsts);
4665 
4666     uint32_t i = 0; /* general purpose counter */
4667 
4668     /* Stack allocated buffers for buffers we use */
4669     uint8_t prim[UCOL_PRIMARY_MAX_BUFFER], second[UCOL_SECONDARY_MAX_BUFFER], tert[UCOL_TERTIARY_MAX_BUFFER], caseB[UCOL_CASE_MAX_BUFFER], quad[UCOL_QUAD_MAX_BUFFER];
4670 
4671     uint8_t *primaries = *result, *secondaries = second, *tertiaries = tert, *cases = caseB, *quads = quad;
4672 
4673     if(U_FAILURE(*status)) {
4674         return 0;
4675     }
4676 
4677     if(primaries == NULL && allocateSKBuffer == TRUE) {
4678         primaries = *result = prim;
4679         resultLength = UCOL_PRIMARY_MAX_BUFFER;
4680     }
4681 
4682     uint32_t secSize = UCOL_SECONDARY_MAX_BUFFER, terSize = UCOL_TERTIARY_MAX_BUFFER,
4683       caseSize = UCOL_CASE_MAX_BUFFER, quadSize = UCOL_QUAD_MAX_BUFFER;
4684 
4685     uint32_t sortKeySize = 1; /* it is always \0 terminated */
4686 
4687     UnicodeString normSource;
4688 
4689     int32_t len = (sourceLength == -1 ? u_strlen(source) : sourceLength);
4690 
4691     UColAttributeValue strength = coll->strength;
4692 
4693     uint8_t compareSec   = (uint8_t)((strength >= UCOL_SECONDARY)?0:0xFF);
4694     uint8_t compareTer   = (uint8_t)((strength >= UCOL_TERTIARY)?0:0xFF);
4695     uint8_t compareQuad  = (uint8_t)((strength >= UCOL_QUATERNARY)?0:0xFF);
4696     UBool  compareIdent = (strength == UCOL_IDENTICAL);
4697     UBool  doCase = (coll->caseLevel == UCOL_ON);
4698     UBool  isFrenchSec = (coll->frenchCollation == UCOL_ON) && (compareSec == 0);
4699     UBool  shifted = (coll->alternateHandling == UCOL_SHIFTED);
4700     //UBool  qShifted = shifted && (compareQuad == 0);
4701     UBool  doHiragana = (coll->hiraganaQ == UCOL_ON) && (compareQuad == 0);
4702 
4703     uint32_t variableTopValue = coll->variableTopValue;
4704     // TODO: UCOL_COMMON_BOT4 should be a function of qShifted. If we have no
4705     // qShifted, we don't need to set UCOL_COMMON_BOT4 so high.
4706     uint8_t UCOL_COMMON_BOT4 = (uint8_t)((coll->variableTopValue>>8)+1);
4707     uint8_t UCOL_HIRAGANA_QUAD = 0;
4708     if(doHiragana) {
4709         UCOL_HIRAGANA_QUAD=UCOL_COMMON_BOT4++;
4710         /* allocate one more space for hiragana, value for hiragana */
4711     }
4712     uint8_t UCOL_BOT_COUNT4 = (uint8_t)(0xFF - UCOL_COMMON_BOT4);
4713 
4714     /* support for special features like caselevel and funky secondaries */
4715     uint8_t *frenchStartPtr = NULL;
4716     uint8_t *frenchEndPtr = NULL;
4717     uint32_t caseShift = 0;
4718 
4719     sortKeySize += ((compareSec?0:1) + (compareTer?0:1) + (doCase?1:0) + /*(qShifted?1:0)*/(compareQuad?0:1) + (compareIdent?1:0));
4720 
4721     /* If we need to normalize, we'll do it all at once at the beginning! */
4722     const Normalizer2 *norm2;
4723     if(compareIdent) {
4724         norm2 = Normalizer2Factory::getNFDInstance(*status);
4725     } else if(coll->normalizationMode != UCOL_OFF) {
4726         norm2 = Normalizer2Factory::getFCDInstance(*status);
4727     } else {
4728         norm2 = NULL;
4729     }
4730     if(norm2 != NULL) {
4731         normSource.setTo(FALSE, source, len);
4732         int32_t qcYesLength = norm2->spanQuickCheckYes(normSource, *status);
4733         if(qcYesLength != len) {
4734             UnicodeString unnormalized = normSource.tempSubString(qcYesLength);
4735             normSource.truncate(qcYesLength);
4736             norm2->normalizeSecondAndAppend(normSource, unnormalized, *status);
4737             source = normSource.getBuffer();
4738             len = normSource.length();
4739         }
4740     }
4741     collIterate s;
4742     IInit_collIterate(coll, source, len, &s, status);
4743     if(U_FAILURE(*status)) {
4744         return 0;
4745     }
4746     s.flags &= ~UCOL_ITER_NORM;  // source passed the FCD test or else was normalized.
4747 
4748     if(resultLength == 0 || primaries == NULL) {
4749         return ucol_getSortKeySize(coll, &s, sortKeySize, strength, len);
4750     }
4751     uint8_t *primarySafeEnd = primaries + resultLength - 1;
4752     if(strength > UCOL_PRIMARY) {
4753         primarySafeEnd--;
4754     }
4755 
4756     uint32_t minBufferSize = UCOL_MAX_BUFFER;
4757 
4758     uint8_t *primStart = primaries;
4759     uint8_t *secStart = secondaries;
4760     uint8_t *terStart = tertiaries;
4761     uint8_t *caseStart = cases;
4762     uint8_t *quadStart = quads;
4763 
4764     uint32_t order = 0;
4765 
4766     uint8_t primary1 = 0;
4767     uint8_t primary2 = 0;
4768     uint8_t secondary = 0;
4769     uint8_t tertiary = 0;
4770     uint8_t caseSwitch = coll->caseSwitch;
4771     uint8_t tertiaryMask = coll->tertiaryMask;
4772     int8_t tertiaryAddition = coll->tertiaryAddition;
4773     uint8_t tertiaryTop = coll->tertiaryTop;
4774     uint8_t tertiaryBottom = coll->tertiaryBottom;
4775     uint8_t tertiaryCommon = coll->tertiaryCommon;
4776     uint8_t caseBits = 0;
4777 
4778     UBool finished = FALSE;
4779     UBool wasShifted = FALSE;
4780     UBool notIsContinuation = FALSE;
4781 
4782     uint32_t prevBuffSize = 0;
4783 
4784     uint32_t count2 = 0, count3 = 0, count4 = 0;
4785     uint8_t leadPrimary = 0;
4786 
4787     for(;;) {
4788         for(i=prevBuffSize; i<minBufferSize; ++i) {
4789 
4790             order = ucol_IGetNextCE(coll, &s, status);
4791             if(order == UCOL_NO_MORE_CES) {
4792                 finished = TRUE;
4793                 break;
4794             }
4795 
4796             if(order == 0) {
4797                 continue;
4798             }
4799 
4800             notIsContinuation = !isContinuation(order);
4801 
4802             if(notIsContinuation) {
4803                 tertiary = (uint8_t)(order & UCOL_BYTE_SIZE_MASK);
4804             } else {
4805                 tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION));
4806             }
4807 
4808             secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
4809             primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
4810             primary1 = (uint8_t)(order >> 8);
4811 
4812             uint8_t originalPrimary1 = primary1;
4813             if(notIsContinuation && coll->leadBytePermutationTable != NULL) {
4814                 primary1 = coll->leadBytePermutationTable[primary1];
4815             }
4816 
4817             if((shifted && ((notIsContinuation && order <= variableTopValue && primary1 > 0)
4818                            || (!notIsContinuation && wasShifted)))
4819                 || (wasShifted && primary1 == 0)) /* amendment to the UCA says that primary ignorables */
4820             {
4821                 /* and other ignorables should be removed if following a shifted code point */
4822                 if(primary1 == 0) { /* if we were shifted and we got an ignorable code point */
4823                     /* we should just completely ignore it */
4824                     continue;
4825                 }
4826                 if(compareQuad == 0) {
4827                     if(count4 > 0) {
4828                         while (count4 > UCOL_BOT_COUNT4) {
4829                             *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4);
4830                             count4 -= UCOL_BOT_COUNT4;
4831                         }
4832                         *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + (count4-1));
4833                         count4 = 0;
4834                     }
4835                     /* We are dealing with a variable and we're treating them as shifted */
4836                     /* This is a shifted ignorable */
4837                     if(primary1 != 0) { /* we need to check this since we could be in continuation */
4838                         *quads++ = primary1;
4839                     }
4840                     if(primary2 != 0) {
4841                         *quads++ = primary2;
4842                     }
4843                 }
4844                 wasShifted = TRUE;
4845             } else {
4846                 wasShifted = FALSE;
4847                 /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */
4848                 /* Usually, we'll have non-zero primary1 & primary2, except in cases of a-z and friends, when primary2 will be zero with non zero primary1. */
4849                 /* regular and simple sortkey calc */
4850                 if(primary1 != UCOL_IGNORABLE) {
4851                     if(notIsContinuation) {
4852                         if(leadPrimary == primary1) {
4853                             *primaries++ = primary2;
4854                         } else {
4855                             if(leadPrimary != 0) {
4856                                 *primaries++ = (uint8_t)((primary1 > leadPrimary) ? UCOL_BYTE_UNSHIFTED_MAX : UCOL_BYTE_UNSHIFTED_MIN);
4857                             }
4858                             if(primary2 == UCOL_IGNORABLE) {
4859                                 /* one byter, not compressed */
4860                                 *primaries++ = primary1;
4861                                 leadPrimary = 0;
4862                             } else if(isCompressible(coll, originalPrimary1)) {
4863                                 /* compress */
4864                                 *primaries++ = leadPrimary = primary1;
4865                                 if(primaries <= primarySafeEnd) {
4866                                     *primaries++ = primary2;
4867                                 }
4868                             } else {
4869                                 leadPrimary = 0;
4870                                 *primaries++ = primary1;
4871                                 if(primaries <= primarySafeEnd) {
4872                                     *primaries++ = primary2;
4873                                 }
4874                             }
4875                         }
4876                     } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */
4877                         *primaries++ = primary1;
4878                         if((primary2 != UCOL_IGNORABLE) && (primaries <= primarySafeEnd)) {
4879                                 *primaries++ = primary2; /* second part */
4880                         }
4881                     }
4882                 }
4883 
4884                 if(secondary > compareSec) {
4885                     if(!isFrenchSec) {
4886                         /* This is compression code. */
4887                         if (secondary == UCOL_COMMON2 && notIsContinuation) {
4888                             ++count2;
4889                         } else {
4890                             if (count2 > 0) {
4891                                 if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
4892                                     while (count2 > UCOL_TOP_COUNT2) {
4893                                         *secondaries++ = (uint8_t)(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2);
4894                                         count2 -= (uint32_t)UCOL_TOP_COUNT2;
4895                                     }
4896                                     *secondaries++ = (uint8_t)(UCOL_COMMON_TOP2 - (count2-1));
4897                                 } else {
4898                                     while (count2 > UCOL_BOT_COUNT2) {
4899                                         *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
4900                                         count2 -= (uint32_t)UCOL_BOT_COUNT2;
4901                                     }
4902                                     *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + (count2-1));
4903                                 }
4904                                 count2 = 0;
4905                             }
4906                             *secondaries++ = secondary;
4907                         }
4908                     } else {
4909                         *secondaries++ = secondary;
4910                         /* Do the special handling for French secondaries */
4911                         /* We need to get continuation elements and do intermediate restore */
4912                         /* abc1c2c3de with french secondaries need to be edc1c2c3ba NOT edc3c2c1ba */
4913                         if(notIsContinuation) {
4914                             if (frenchStartPtr != NULL) {
4915                                 /* reverse secondaries from frenchStartPtr up to frenchEndPtr */
4916                                 uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr);
4917                                 frenchStartPtr = NULL;
4918                             }
4919                         } else {
4920                             if (frenchStartPtr == NULL) {
4921                                 frenchStartPtr = secondaries - 2;
4922                             }
4923                             frenchEndPtr = secondaries-1;
4924                         }
4925                     }
4926                 }
4927 
4928                 if(doCase && (primary1 > 0 || strength >= UCOL_SECONDARY)) {
4929                     // do the case level if we need to do it. We don't want to calculate
4930                     // case level for primary ignorables if we have only primary strength and case level
4931                     // otherwise we would break well formedness of CEs
4932                     doCaseShift(&cases, caseShift);
4933                     if(notIsContinuation) {
4934                         caseBits = (uint8_t)(tertiary & 0xC0);
4935 
4936                         if(tertiary != 0) {
4937                             if(coll->caseFirst == UCOL_UPPER_FIRST) {
4938                                 if((caseBits & 0xC0) == 0) {
4939                                     *(cases-1) |= 1 << (--caseShift);
4940                                 } else {
4941                                     *(cases-1) |= 0 << (--caseShift);
4942                                     /* second bit */
4943                                     doCaseShift(&cases, caseShift);
4944                                     *(cases-1) |= ((caseBits>>6)&1) << (--caseShift);
4945                                 }
4946                             } else {
4947                                 if((caseBits & 0xC0) == 0) {
4948                                     *(cases-1) |= 0 << (--caseShift);
4949                                 } else {
4950                                     *(cases-1) |= 1 << (--caseShift);
4951                                     /* second bit */
4952                                     doCaseShift(&cases, caseShift);
4953                                     *(cases-1) |= ((caseBits>>7)&1) << (--caseShift);
4954                                 }
4955                             }
4956                         }
4957 
4958                     }
4959                 } else {
4960                     if(notIsContinuation) {
4961                         tertiary ^= caseSwitch;
4962                     }
4963                 }
4964 
4965                 tertiary &= tertiaryMask;
4966                 if(tertiary > compareTer) {
4967                     /* This is compression code. */
4968                     /* sequence size check is included in the if clause */
4969                     if (tertiary == tertiaryCommon && notIsContinuation) {
4970                         ++count3;
4971                     } else {
4972                         if(tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL) {
4973                             tertiary += tertiaryAddition;
4974                         } else if(tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST) {
4975                             tertiary -= tertiaryAddition;
4976                         }
4977                         if (count3 > 0) {
4978                             if ((tertiary > tertiaryCommon)) {
4979                                 while (count3 > coll->tertiaryTopCount) {
4980                                     *tertiaries++ = (uint8_t)(tertiaryTop - coll->tertiaryTopCount);
4981                                     count3 -= (uint32_t)coll->tertiaryTopCount;
4982                                 }
4983                                 *tertiaries++ = (uint8_t)(tertiaryTop - (count3-1));
4984                             } else {
4985                                 while (count3 > coll->tertiaryBottomCount) {
4986                                     *tertiaries++ = (uint8_t)(tertiaryBottom + coll->tertiaryBottomCount);
4987                                     count3 -= (uint32_t)coll->tertiaryBottomCount;
4988                                 }
4989                                 *tertiaries++ = (uint8_t)(tertiaryBottom + (count3-1));
4990                             }
4991                             count3 = 0;
4992                         }
4993                         *tertiaries++ = tertiary;
4994                     }
4995                 }
4996 
4997                 if(/*qShifted*/(compareQuad==0)  && notIsContinuation) {
4998                     if(s.flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we need to note it
4999                         if(count4>0) { // Close this part
5000                             while (count4 > UCOL_BOT_COUNT4) {
5001                                 *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4);
5002                                 count4 -= UCOL_BOT_COUNT4;
5003                             }
5004                             *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + (count4-1));
5005                             count4 = 0;
5006                         }
5007                         *quads++ = UCOL_HIRAGANA_QUAD; // Add the Hiragana
5008                     } else { // This wasn't Hiragana, so we can continue adding stuff
5009                         count4++;
5010                     }
5011                 }
5012             }
5013 
5014             if(primaries > primarySafeEnd) { /* We have stepped over the primary buffer */
5015                 if(allocateSKBuffer == FALSE) { /* need to save our butts if we cannot reallocate */
5016                     IInit_collIterate(coll, (UChar *)source, len, &s, status);
5017                     if(U_FAILURE(*status)) {
5018                         sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
5019                         finished = TRUE;
5020                         break;
5021                     }
5022                     s.flags &= ~UCOL_ITER_NORM;
5023                     sortKeySize = ucol_getSortKeySize(coll, &s, sortKeySize, strength, len);
5024                     *status = U_BUFFER_OVERFLOW_ERROR;
5025                     finished = TRUE;
5026                     break;
5027                 } else { /* It's much nicer if we can actually reallocate */
5028                     int32_t sks = sortKeySize+(int32_t)((primaries - primStart)+(secondaries - secStart)+(tertiaries - terStart)+(cases-caseStart)+(quads-quadStart));
5029                     primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sks, status);
5030                     if(U_SUCCESS(*status)) {
5031                         *result = primStart;
5032                         primarySafeEnd = primStart + resultLength - 1;
5033                         if(strength > UCOL_PRIMARY) {
5034                             primarySafeEnd--;
5035                         }
5036                     } else {
5037                         /* We ran out of memory!? We can't recover. */
5038                         sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
5039                         finished = TRUE;
5040                         break;
5041                     }
5042                 }
5043             }
5044         }
5045         if(finished) {
5046             break;
5047         } else {
5048             prevBuffSize = minBufferSize;
5049 
5050             uint32_t frenchStartOffset = 0, frenchEndOffset = 0;
5051             if (frenchStartPtr != NULL) {
5052                 frenchStartOffset = (uint32_t)(frenchStartPtr - secStart);
5053                 frenchEndOffset = (uint32_t)(frenchEndPtr - secStart);
5054             }
5055             secStart = reallocateBuffer(&secondaries, secStart, second, &secSize, 2*secSize, status);
5056             terStart = reallocateBuffer(&tertiaries, terStart, tert, &terSize, 2*terSize, status);
5057             caseStart = reallocateBuffer(&cases, caseStart, caseB, &caseSize, 2*caseSize, status);
5058             quadStart = reallocateBuffer(&quads, quadStart, quad, &quadSize, 2*quadSize, status);
5059             if(U_FAILURE(*status)) {
5060                 /* We ran out of memory!? We can't recover. */
5061                 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
5062                 break;
5063             }
5064             if (frenchStartPtr != NULL) {
5065                 frenchStartPtr = secStart + frenchStartOffset;
5066                 frenchEndPtr = secStart + frenchEndOffset;
5067             }
5068             minBufferSize *= 2;
5069         }
5070     }
5071 
5072     /* Here, we are generally done with processing */
5073     /* bailing out would not be too productive */
5074 
5075     if(U_SUCCESS(*status)) {
5076         sortKeySize += (uint32_t)(primaries - primStart);
5077         /* we have done all the CE's, now let's put them together to form a key */
5078         if(compareSec == 0) {
5079             if (count2 > 0) {
5080                 while (count2 > UCOL_BOT_COUNT2) {
5081                     *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
5082                     count2 -= (uint32_t)UCOL_BOT_COUNT2;
5083                 }
5084                 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + (count2-1));
5085             }
5086             uint32_t secsize = (uint32_t)(secondaries-secStart);
5087             if(!isFrenchSec) { // Regular situation, we know the length of secondaries
5088                 sortKeySize += secsize;
5089                 if(sortKeySize <= resultLength) {
5090                     *(primaries++) = UCOL_LEVELTERMINATOR;
5091                     uprv_memcpy(primaries, secStart, secsize);
5092                     primaries += secsize;
5093                 } else {
5094                     if(allocateSKBuffer == TRUE) { /* need to save our butts if we cannot reallocate */
5095                         primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
5096                         if(U_SUCCESS(*status)) {
5097                             *result = primStart;
5098                             *(primaries++) = UCOL_LEVELTERMINATOR;
5099                             uprv_memcpy(primaries, secStart, secsize);
5100                             primaries += secsize;
5101                         }
5102                         else {
5103                             /* We ran out of memory!? We can't recover. */
5104                             sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
5105                             goto cleanup;
5106                         }
5107                     } else {
5108                         *status = U_BUFFER_OVERFLOW_ERROR;
5109                     }
5110                 }
5111             } else { // French secondary is on. We will need to pack French. packFrench will add the level terminator
5112                 uint8_t *newPrim = packFrench(primaries, primStart+resultLength, secondaries, &secsize, frenchStartPtr, frenchEndPtr);
5113                 sortKeySize += secsize;
5114                 if(sortKeySize <= resultLength) { // if we managed to pack fine
5115                     primaries = newPrim; // update the primary pointer
5116                 } else { // overflow, need to reallocate and redo
5117                     if(allocateSKBuffer == TRUE) { /* need to save our butts if we cannot reallocate */
5118                         primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
5119                         if(U_SUCCESS(*status)) {
5120                             primaries = packFrench(primaries, primStart+resultLength, secondaries, &secsize, frenchStartPtr, frenchEndPtr);
5121                         }
5122                         else {
5123                             /* We ran out of memory!? We can't recover. */
5124                             sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
5125                             goto cleanup;
5126                         }
5127                     } else {
5128                         *status = U_BUFFER_OVERFLOW_ERROR;
5129                     }
5130                 }
5131             }
5132         }
5133 
5134         if(doCase) {
5135             uint32_t casesize = (uint32_t)(cases - caseStart);
5136             sortKeySize += casesize;
5137             if(sortKeySize <= resultLength) {
5138                 *(primaries++) = UCOL_LEVELTERMINATOR;
5139                 uprv_memcpy(primaries, caseStart, casesize);
5140                 primaries += casesize;
5141             } else {
5142                 if(allocateSKBuffer == TRUE) {
5143                     primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
5144                     if(U_SUCCESS(*status)) {
5145                         *result = primStart;
5146                         *(primaries++) = UCOL_LEVELTERMINATOR;
5147                         uprv_memcpy(primaries, caseStart, casesize);
5148                     }
5149                     else {
5150                         /* We ran out of memory!? We can't recover. */
5151                         sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
5152                         goto cleanup;
5153                     }
5154                 } else {
5155                     *status = U_BUFFER_OVERFLOW_ERROR;
5156                 }
5157             }
5158         }
5159 
5160         if(compareTer == 0) {
5161             if (count3 > 0) {
5162                 if (coll->tertiaryCommon != UCOL_COMMON_BOT3) {
5163                     while (count3 >= coll->tertiaryTopCount) {
5164                         *tertiaries++ = (uint8_t)(tertiaryTop - coll->tertiaryTopCount);
5165                         count3 -= (uint32_t)coll->tertiaryTopCount;
5166                     }
5167                     *tertiaries++ = (uint8_t)(tertiaryTop - count3);
5168                 } else {
5169                     while (count3 > coll->tertiaryBottomCount) {
5170                         *tertiaries++ = (uint8_t)(tertiaryBottom + coll->tertiaryBottomCount);
5171                         count3 -= (uint32_t)coll->tertiaryBottomCount;
5172                     }
5173                     *tertiaries++ = (uint8_t)(tertiaryBottom + (count3-1));
5174                 }
5175             }
5176             uint32_t tersize = (uint32_t)(tertiaries - terStart);
5177             sortKeySize += tersize;
5178             if(sortKeySize <= resultLength) {
5179                 *(primaries++) = UCOL_LEVELTERMINATOR;
5180                 uprv_memcpy(primaries, terStart, tersize);
5181                 primaries += tersize;
5182             } else {
5183                 if(allocateSKBuffer == TRUE) {
5184                     primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
5185                     if(U_SUCCESS(*status)) {
5186                         *result = primStart;
5187                         *(primaries++) = UCOL_LEVELTERMINATOR;
5188                         uprv_memcpy(primaries, terStart, tersize);
5189                     }
5190                     else {
5191                         /* We ran out of memory!? We can't recover. */
5192                         sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
5193                         goto cleanup;
5194                     }
5195                 } else {
5196                     *status = U_BUFFER_OVERFLOW_ERROR;
5197                 }
5198             }
5199 
5200             if(compareQuad == 0/*qShifted == TRUE*/) {
5201                 if(count4 > 0) {
5202                     while (count4 > UCOL_BOT_COUNT4) {
5203                         *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4);
5204                         count4 -= UCOL_BOT_COUNT4;
5205                     }
5206                     *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + (count4-1));
5207                 }
5208                 uint32_t quadsize = (uint32_t)(quads - quadStart);
5209                 sortKeySize += quadsize;
5210                 if(sortKeySize <= resultLength) {
5211                     *(primaries++) = UCOL_LEVELTERMINATOR;
5212                     uprv_memcpy(primaries, quadStart, quadsize);
5213                     primaries += quadsize;
5214                 } else {
5215                     if(allocateSKBuffer == TRUE) {
5216                         primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
5217                         if(U_SUCCESS(*status)) {
5218                             *result = primStart;
5219                             *(primaries++) = UCOL_LEVELTERMINATOR;
5220                             uprv_memcpy(primaries, quadStart, quadsize);
5221                         }
5222                         else {
5223                             /* We ran out of memory!? We can't recover. */
5224                             sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
5225                             goto cleanup;
5226                         }
5227                     } else {
5228                         *status = U_BUFFER_OVERFLOW_ERROR;
5229                     }
5230                 }
5231             }
5232 
5233             if(compareIdent) {
5234                 sortKeySize += u_lengthOfIdenticalLevelRun(s.string, len);
5235                 if(sortKeySize <= resultLength) {
5236                     *(primaries++) = UCOL_LEVELTERMINATOR;
5237                     primaries += u_writeIdenticalLevelRun(s.string, len, primaries);
5238                 } else {
5239                     if(allocateSKBuffer == TRUE) {
5240                         primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, sortKeySize, status);
5241                         if(U_SUCCESS(*status)) {
5242                             *result = primStart;
5243                             *(primaries++) = UCOL_LEVELTERMINATOR;
5244                             u_writeIdenticalLevelRun(s.string, len, primaries);
5245                         }
5246                         else {
5247                             /* We ran out of memory!? We can't recover. */
5248                             sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
5249                             goto cleanup;
5250                         }
5251                     } else {
5252                         *status = U_BUFFER_OVERFLOW_ERROR;
5253                     }
5254                 }
5255             }
5256         }
5257         *(primaries++) = '\0';
5258     }
5259 
5260     if(allocateSKBuffer == TRUE) {
5261         *result = (uint8_t*)uprv_malloc(sortKeySize);
5262         /* test for NULL */
5263         if (*result == NULL) {
5264             *status = U_MEMORY_ALLOCATION_ERROR;
5265             goto cleanup;
5266         }
5267         uprv_memcpy(*result, primStart, sortKeySize);
5268         if(primStart != prim) {
5269             uprv_free(primStart);
5270         }
5271     }
5272 
5273 cleanup:
5274     if (allocateSKBuffer == FALSE && resultLength > 0 && U_FAILURE(*status) && *status != U_BUFFER_OVERFLOW_ERROR) {
5275         /* NULL terminate for safety */
5276         **result = 0;
5277     }
5278     if(terStart != tert) {
5279         uprv_free(terStart);
5280         uprv_free(secStart);
5281         uprv_free(caseStart);
5282         uprv_free(quadStart);
5283     }
5284 
5285     /* To avoid memory leak, free the offset buffer if necessary. */
5286     ucol_freeOffsetBuffer(&s);
5287 
5288     return sortKeySize;
5289 }
5290 
5291 
5292 U_CFUNC int32_t U_CALLCONV
ucol_calcSortKeySimpleTertiary(const UCollator * coll,const UChar * source,int32_t sourceLength,uint8_t ** result,uint32_t resultLength,UBool allocateSKBuffer,UErrorCode * status)5293 ucol_calcSortKeySimpleTertiary(const    UCollator    *coll,
5294         const    UChar        *source,
5295         int32_t        sourceLength,
5296         uint8_t        **result,
5297         uint32_t        resultLength,
5298         UBool allocateSKBuffer,
5299         UErrorCode *status)
5300 {
5301     U_ALIGN_CODE(16);
5302 
5303     //const UCAConstants *UCAconsts = (UCAConstants *)((uint8_t *)coll->UCA->image + coll->image->UCAConsts);
5304     uint32_t i = 0; /* general purpose counter */
5305 
5306     /* Stack allocated buffers for buffers we use */
5307     uint8_t prim[UCOL_PRIMARY_MAX_BUFFER], second[UCOL_SECONDARY_MAX_BUFFER], tert[UCOL_TERTIARY_MAX_BUFFER];
5308 
5309     uint8_t *primaries = *result, *secondaries = second, *tertiaries = tert;
5310 
5311     if(U_FAILURE(*status)) {
5312         return 0;
5313     }
5314 
5315     if(primaries == NULL && allocateSKBuffer == TRUE) {
5316         primaries = *result = prim;
5317         resultLength = UCOL_PRIMARY_MAX_BUFFER;
5318     }
5319 
5320     uint32_t secSize = UCOL_SECONDARY_MAX_BUFFER, terSize = UCOL_TERTIARY_MAX_BUFFER;
5321 
5322     uint32_t sortKeySize = 3; /* it is always \0 terminated plus separators for secondary and tertiary */
5323 
5324     UnicodeString normSource;
5325 
5326     int32_t len =  sourceLength;
5327 
5328     /* If we need to normalize, we'll do it all at once at the beginning! */
5329     if(coll->normalizationMode != UCOL_OFF) {
5330         normSource.setTo(len < 0, source, len);
5331         const Normalizer2 *norm2 = Normalizer2Factory::getFCDInstance(*status);
5332         int32_t qcYesLength = norm2->spanQuickCheckYes(normSource, *status);
5333         if(qcYesLength != normSource.length()) {
5334             UnicodeString unnormalized = normSource.tempSubString(qcYesLength);
5335             normSource.truncate(qcYesLength);
5336             norm2->normalizeSecondAndAppend(normSource, unnormalized, *status);
5337             source = normSource.getBuffer();
5338             len = normSource.length();
5339         }
5340     }
5341     collIterate s;
5342     IInit_collIterate(coll, (UChar *)source, len, &s, status);
5343     if(U_FAILURE(*status)) {
5344         return 0;
5345     }
5346     s.flags &= ~UCOL_ITER_NORM;  // source passed the FCD test or else was normalized.
5347 
5348     if(resultLength == 0 || primaries == NULL) {
5349         return ucol_getSortKeySize(coll, &s, sortKeySize, coll->strength, len);
5350     }
5351 
5352     uint8_t *primarySafeEnd = primaries + resultLength - 2;
5353 
5354     uint32_t minBufferSize = UCOL_MAX_BUFFER;
5355 
5356     uint8_t *primStart = primaries;
5357     uint8_t *secStart = secondaries;
5358     uint8_t *terStart = tertiaries;
5359 
5360     uint32_t order = 0;
5361 
5362     uint8_t primary1 = 0;
5363     uint8_t primary2 = 0;
5364     uint8_t secondary = 0;
5365     uint8_t tertiary = 0;
5366     uint8_t caseSwitch = coll->caseSwitch;
5367     uint8_t tertiaryMask = coll->tertiaryMask;
5368     int8_t tertiaryAddition = coll->tertiaryAddition;
5369     uint8_t tertiaryTop = coll->tertiaryTop;
5370     uint8_t tertiaryBottom = coll->tertiaryBottom;
5371     uint8_t tertiaryCommon = coll->tertiaryCommon;
5372 
5373     uint32_t prevBuffSize = 0;
5374 
5375     UBool finished = FALSE;
5376     UBool notIsContinuation = FALSE;
5377 
5378     uint32_t count2 = 0, count3 = 0;
5379     uint8_t leadPrimary = 0;
5380 
5381     for(;;) {
5382         for(i=prevBuffSize; i<minBufferSize; ++i) {
5383 
5384             order = ucol_IGetNextCE(coll, &s, status);
5385 
5386             if(order == 0) {
5387                 continue;
5388             }
5389 
5390             if(order == UCOL_NO_MORE_CES) {
5391                 finished = TRUE;
5392                 break;
5393             }
5394 
5395             notIsContinuation = !isContinuation(order);
5396 
5397             if(notIsContinuation) {
5398                 tertiary = (uint8_t)((order & tertiaryMask));
5399             } else {
5400                 tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION));
5401             }
5402 
5403             secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
5404             primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
5405             primary1 = (uint8_t)(order >> 8);
5406 
5407             uint8_t originalPrimary1 = primary1;
5408             if (coll->leadBytePermutationTable != NULL && notIsContinuation) {
5409                 primary1 = coll->leadBytePermutationTable[primary1];
5410             }
5411 
5412             /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */
5413             /* Usually, we'll have non-zero primary1 & primary2, except in cases of a-z and friends, when primary2 will   */
5414             /* be zero with non zero primary1. primary3 is different than 0 only for long primaries - see above.               */
5415             /* regular and simple sortkey calc */
5416             if(primary1 != UCOL_IGNORABLE) {
5417                 if(notIsContinuation) {
5418                     if(leadPrimary == primary1) {
5419                         *primaries++ = primary2;
5420                     } else {
5421                         if(leadPrimary != 0) {
5422                             *primaries++ = (uint8_t)((primary1 > leadPrimary) ? UCOL_BYTE_UNSHIFTED_MAX : UCOL_BYTE_UNSHIFTED_MIN);
5423                         }
5424                         if(primary2 == UCOL_IGNORABLE) {
5425                             /* one byter, not compressed */
5426                             *primaries++ = primary1;
5427                             leadPrimary = 0;
5428                         } else if(isCompressible(coll, originalPrimary1)) {
5429                             /* compress */
5430                             *primaries++ = leadPrimary = primary1;
5431                             *primaries++ = primary2;
5432                         } else {
5433                             leadPrimary = 0;
5434                             *primaries++ = primary1;
5435                             *primaries++ = primary2;
5436                         }
5437                     }
5438                 } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */
5439                     *primaries++ = primary1;
5440                     if(primary2 != UCOL_IGNORABLE) {
5441                         *primaries++ = primary2; /* second part */
5442                     }
5443                 }
5444             }
5445 
5446             if(secondary > 0) { /* I think that != 0 test should be != IGNORABLE */
5447                 /* This is compression code. */
5448                 if (secondary == UCOL_COMMON2 && notIsContinuation) {
5449                     ++count2;
5450                 } else {
5451                     if (count2 > 0) {
5452                         if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
5453                             while (count2 > UCOL_TOP_COUNT2) {
5454                                 *secondaries++ = (uint8_t)(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2);
5455                                 count2 -= (uint32_t)UCOL_TOP_COUNT2;
5456                             }
5457                             *secondaries++ = (uint8_t)(UCOL_COMMON_TOP2 - (count2-1));
5458                         } else {
5459                             while (count2 > UCOL_BOT_COUNT2) {
5460                                 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
5461                                 count2 -= (uint32_t)UCOL_BOT_COUNT2;
5462                             }
5463                             *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + (count2-1));
5464                         }
5465                         count2 = 0;
5466                     }
5467                     *secondaries++ = secondary;
5468                 }
5469             }
5470 
5471             if(notIsContinuation) {
5472                 tertiary ^= caseSwitch;
5473             }
5474 
5475             if(tertiary > 0) {
5476                 /* This is compression code. */
5477                 /* sequence size check is included in the if clause */
5478                 if (tertiary == tertiaryCommon && notIsContinuation) {
5479                     ++count3;
5480                 } else {
5481                     if(tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL) {
5482                         tertiary += tertiaryAddition;
5483                     } else if (tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST) {
5484                         tertiary -= tertiaryAddition;
5485                     }
5486                     if (count3 > 0) {
5487                         if ((tertiary > tertiaryCommon)) {
5488                             while (count3 > coll->tertiaryTopCount) {
5489                                 *tertiaries++ = (uint8_t)(tertiaryTop - coll->tertiaryTopCount);
5490                                 count3 -= (uint32_t)coll->tertiaryTopCount;
5491                             }
5492                             *tertiaries++ = (uint8_t)(tertiaryTop - (count3-1));
5493                         } else {
5494                             while (count3 > coll->tertiaryBottomCount) {
5495                                 *tertiaries++ = (uint8_t)(tertiaryBottom + coll->tertiaryBottomCount);
5496                                 count3 -= (uint32_t)coll->tertiaryBottomCount;
5497                             }
5498                             *tertiaries++ = (uint8_t)(tertiaryBottom + (count3-1));
5499                         }
5500                         count3 = 0;
5501                     }
5502                     *tertiaries++ = tertiary;
5503                 }
5504             }
5505 
5506             if(primaries > primarySafeEnd) { /* We have stepped over the primary buffer */
5507                 if(allocateSKBuffer == FALSE) { /* need to save our butts if we cannot reallocate */
5508                     IInit_collIterate(coll, (UChar *)source, len, &s, status);
5509                     if(U_FAILURE(*status)) {
5510                         sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
5511                         finished = TRUE;
5512                         break;
5513                     }
5514                     s.flags &= ~UCOL_ITER_NORM;
5515                     sortKeySize = ucol_getSortKeySize(coll, &s, sortKeySize, coll->strength, len);
5516                     *status = U_BUFFER_OVERFLOW_ERROR;
5517                     finished = TRUE;
5518                     break;
5519                 } else { /* It's much nicer if we can actually reallocate */
5520                     int32_t sks = sortKeySize+(int32_t)((primaries - primStart)+(secondaries - secStart)+(tertiaries - terStart));
5521                     primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sks, status);
5522                     if(U_SUCCESS(*status)) {
5523                         *result = primStart;
5524                         primarySafeEnd = primStart + resultLength - 2;
5525                     } else {
5526                         /* We ran out of memory!? We can't recover. */
5527                         sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
5528                         finished = TRUE;
5529                         break;
5530                     }
5531                 }
5532             }
5533         }
5534         if(finished) {
5535             break;
5536         } else {
5537             prevBuffSize = minBufferSize;
5538             secStart = reallocateBuffer(&secondaries, secStart, second, &secSize, 2*secSize, status);
5539             terStart = reallocateBuffer(&tertiaries, terStart, tert, &terSize, 2*terSize, status);
5540             minBufferSize *= 2;
5541             if(U_FAILURE(*status)) { // if we cannot reallocate buffers, we can at least give the sortkey size
5542                 /* We ran out of memory!? We can't recover. */
5543                 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
5544                 break;
5545             }
5546         }
5547     }
5548 
5549     if(U_SUCCESS(*status)) {
5550         sortKeySize += (uint32_t)(primaries - primStart);
5551         /* we have done all the CE's, now let's put them together to form a key */
5552         if (count2 > 0) {
5553             while (count2 > UCOL_BOT_COUNT2) {
5554                 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
5555                 count2 -= (uint32_t)UCOL_BOT_COUNT2;
5556             }
5557             *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + (count2-1));
5558         }
5559         uint32_t secsize = (uint32_t)(secondaries-secStart);
5560         sortKeySize += secsize;
5561         if(sortKeySize <= resultLength) {
5562             *(primaries++) = UCOL_LEVELTERMINATOR;
5563             uprv_memcpy(primaries, secStart, secsize);
5564             primaries += secsize;
5565         } else {
5566             if(allocateSKBuffer == TRUE) {
5567                 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
5568                 if(U_SUCCESS(*status)) {
5569                     *(primaries++) = UCOL_LEVELTERMINATOR;
5570                     *result = primStart;
5571                     uprv_memcpy(primaries, secStart, secsize);
5572                 }
5573                 else {
5574                     /* We ran out of memory!? We can't recover. */
5575                     sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
5576                     goto cleanup;
5577                 }
5578             } else {
5579                 *status = U_BUFFER_OVERFLOW_ERROR;
5580             }
5581         }
5582 
5583         if (count3 > 0) {
5584             if (coll->tertiaryCommon != UCOL_COMMON3_NORMAL) {
5585                 while (count3 >= coll->tertiaryTopCount) {
5586                     *tertiaries++ = (uint8_t)(tertiaryTop - coll->tertiaryTopCount);
5587                     count3 -= (uint32_t)coll->tertiaryTopCount;
5588                 }
5589                 *tertiaries++ = (uint8_t)(tertiaryTop - count3);
5590             } else {
5591                 while (count3 > coll->tertiaryBottomCount) {
5592                     *tertiaries++ = (uint8_t)(tertiaryBottom + coll->tertiaryBottomCount);
5593                     count3 -= (uint32_t)coll->tertiaryBottomCount;
5594                 }
5595                 *tertiaries++ = (uint8_t)(tertiaryBottom + (count3-1));
5596             }
5597         }
5598         uint32_t tersize = (uint32_t)(tertiaries - terStart);
5599         sortKeySize += tersize;
5600         if(sortKeySize <= resultLength) {
5601             *(primaries++) = UCOL_LEVELTERMINATOR;
5602             uprv_memcpy(primaries, terStart, tersize);
5603             primaries += tersize;
5604         } else {
5605             if(allocateSKBuffer == TRUE) {
5606                 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
5607                 if(U_SUCCESS(*status)) {
5608                     *result = primStart;
5609                     *(primaries++) = UCOL_LEVELTERMINATOR;
5610                     uprv_memcpy(primaries, terStart, tersize);
5611                 }
5612                 else {
5613                     /* We ran out of memory!? We can't recover. */
5614                     sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
5615                     goto cleanup;
5616                 }
5617             } else {
5618                 *status = U_BUFFER_OVERFLOW_ERROR;
5619             }
5620         }
5621 
5622         *(primaries++) = '\0';
5623     }
5624 
5625     if(allocateSKBuffer == TRUE) {
5626         *result = (uint8_t*)uprv_malloc(sortKeySize);
5627         /* test for NULL */
5628         if (*result == NULL) {
5629             *status = U_MEMORY_ALLOCATION_ERROR;
5630             goto cleanup;
5631         }
5632         uprv_memcpy(*result, primStart, sortKeySize);
5633         if(primStart != prim) {
5634             uprv_free(primStart);
5635         }
5636     }
5637 
5638 cleanup:
5639     if (allocateSKBuffer == FALSE && resultLength > 0 && U_FAILURE(*status) && *status != U_BUFFER_OVERFLOW_ERROR) {
5640         /* NULL terminate for safety */
5641         **result = 0;
5642     }
5643     if(terStart != tert) {
5644         uprv_free(terStart);
5645         uprv_free(secStart);
5646     }
5647 
5648     /* To avoid memory leak, free the offset buffer if necessary. */
5649     ucol_freeOffsetBuffer(&s);
5650 
5651     return sortKeySize;
5652 }
5653 
5654 static inline
isShiftedCE(uint32_t CE,uint32_t LVT,UBool * wasShifted)5655 UBool isShiftedCE(uint32_t CE, uint32_t LVT, UBool *wasShifted) {
5656     UBool notIsContinuation = !isContinuation(CE);
5657     uint8_t primary1 = (uint8_t)((CE >> 24) & 0xFF);
5658     if((LVT && ((notIsContinuation && (CE & 0xFFFF0000)<= LVT && primary1 > 0)
5659                || (!notIsContinuation && *wasShifted)))
5660         || (*wasShifted && primary1 == 0)) /* amendment to the UCA says that primary ignorables */
5661     {
5662         // The stuff below should probably be in the sortkey code... maybe not...
5663         if(primary1 != 0) { /* if we were shifted and we got an ignorable code point */
5664             /* we should just completely ignore it */
5665             *wasShifted = TRUE;
5666             //continue;
5667         }
5668         //*wasShifted = TRUE;
5669         return TRUE;
5670     } else {
5671         *wasShifted = FALSE;
5672         return FALSE;
5673     }
5674 }
5675 static inline
terminatePSKLevel(int32_t level,int32_t maxLevel,int32_t & i,uint8_t * dest)5676 void terminatePSKLevel(int32_t level, int32_t maxLevel, int32_t &i, uint8_t *dest) {
5677     if(level < maxLevel) {
5678         dest[i++] = UCOL_LEVELTERMINATOR;
5679     } else {
5680         dest[i++] = 0;
5681     }
5682 }
5683 
/**
 * Level identifiers for partial sort key generation.
 * The values 0..7 fit into the three bits selected by UCOL_PSK_LEVEL_MASK.
 */
enum {
    UCOL_PSK_PRIMARY    = 0,
    UCOL_PSK_SECONDARY  = 1,
    UCOL_PSK_CASE       = 2,
    UCOL_PSK_TERTIARY   = 3,
    UCOL_PSK_QUATERNARY = 4,
    UCOL_PSK_QUIN       = 5,  /** extra level, not used - but there are three bits to spend */
    UCOL_PSK_IDENTICAL  = 6,
    UCOL_PSK_NULL       = 7,  /** level for the end of the sort key; only zeros are produced */
    UCOL_PSK_LIMIT
};
5696 
/**
 * Layout of the collation processing state kept in state[1].
 * To read a field, shift the state word right by the field's *_SHIFT value
 * and AND the result with the matching *_MASK.
 */
enum {
    /** level identifier; holds one of the UCOL_PSK_* level values */
    UCOL_PSK_LEVEL_SHIFT = 0,
    UCOL_PSK_LEVEL_MASK  = 0x7,   /** three bits */

    /**
     * Number of bytes of a primary or quaternary already written. One bit is
     * enough (0 or 1), because at most two bytes come from a primary or
     * quaternary. The same bit also records that the French secondary level
     * is finished.
     */
    UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT = 3,
    UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK  = 0x1,

    /** was the last value shifted - Boolean, 0 or 1 */
    UCOL_PSK_WAS_SHIFTED_SHIFT = 4,
    UCOL_PSK_WAS_SHIFTED_MASK  = 0x1,

    /**
     * How many French bytes have already been written (two bits).
     * For French the secondary values are reversed, but continuations have to
     * keep their order: abc1c2c3de has to become edc1c2c3ba.
     */
    UCOL_PSK_USED_FRENCH_SHIFT = 5,
    UCOL_PSK_USED_FRENCH_MASK  = 0x3,

    /** bytes of a bocsu sequence already written on the identical level (two bits) */
    UCOL_PSK_BOCSU_BYTES_SHIFT = 7,
    UCOL_PSK_BOCSU_BYTES_MASK  = 0x3,

    /** number of consumed CEs; the mask keeps 19 bits */
    UCOL_PSK_CONSUMED_CES_SHIFT = 9,
    UCOL_PSK_CONSUMED_CES_MASK  = 0x7FFFF
};
5722 
// Macro calculating the number of expansion CEs available in the iterator
// state `s`, i.e. the distance between its CEpos and toReturn members.
// The replacement list is fully parenthesized so that the difference is
// always evaluated as a single operand, e.g. in !uprv_numAvailableExpCEs(s)
// or when the result is multiplied or compared.
#define uprv_numAvailableExpCEs(s) ((s).CEpos - (s).toReturn)
5725 
5726 
5727 /** main sortkey part procedure. On the first call,
5728  *  you should pass in a collator, an iterator, empty state
5729  *  state[0] == state[1] == 0, a buffer to hold results
5730  *  number of bytes you need and an error code pointer.
5731  *  Make sure your buffer is big enough to hold the wanted
5732  *  number of sortkey bytes. I don't check.
5733  *  The only meaningful status you can get back is
5734  *  U_BUFFER_OVERFLOW_ERROR, which basically means that you
5735  *  have been dealt a raw deal and that you probably won't
5736  *  be able to use partial sortkey generation for this
5737  *  particular combination of string and collator. This
5738  *  is highly unlikely, but you should still check the error code.
5739  *  Any other status means that you're not in a sane situation
5740  *  anymore. After the first call, preserve state values and
5741  *  use them on subsequent calls to obtain more bytes of a sortkey.
5742  *  Use until the number of bytes written is smaller than the requested
5743  *  number of bytes. Generated sortkey is not compatible with the
5744  *  one generated by ucol_getSortKey, as we don't do any compression.
5745  *  However, levels are still terminated by a 1 (one) and the sortkey
5746  *  is terminated by a 0 (zero). Identical level is the same as in the
5747  *  regular sortkey - internal bocu-1 implementation is used.
5748  *  For the curious, although you cannot do much about this, here is
5749  *  the structure of state words.
5750  *  state[0] - iterator state. Depends on the iterator implementation,
5751  *             but allows the iterator to continue where it stopped in
5752  *             the last iteration.
5753  *  state[1] - collation processing state. Here is the distribution
5754  *             of the bits:
5755  *   0, 1, 2 - level of the sortkey - primary, secondary, case, tertiary
5756  *             quaternary, quin (we don't use this one), identical and
5757  *             null (producing only zeroes - first one to terminate the
5758  *             sortkey and subsequent to fill the buffer).
5759  *   3       - byte count. Number of bytes written on the primary level.
5760  *   4       - was shifted. Whether the previous iteration finished in the
5761  *             shifted state.
5762  *   5, 6    - French continuation bytes written. See the comment in the enum
5763  *   7,8     - Bocsu bytes used. Number of bytes from a bocu sequence on
5764  *             the identical level.
5765  *   9..31   - CEs consumed. Number of getCE or next32 operations performed
5766  *             since the last successful update of the iterator state.
5767  */
5768 U_CAPI int32_t U_EXPORT2
ucol_nextSortKeyPart(const UCollator * coll,UCharIterator * iter,uint32_t state[2],uint8_t * dest,int32_t count,UErrorCode * status)5769 ucol_nextSortKeyPart(const UCollator *coll,
5770                      UCharIterator *iter,
5771                      uint32_t state[2],
5772                      uint8_t *dest, int32_t count,
5773                      UErrorCode *status)
5774 {
5775     /* error checking */
5776     if(status==NULL || U_FAILURE(*status)) {
5777         return 0;
5778     }
5779     UTRACE_ENTRY(UTRACE_UCOL_NEXTSORTKEYPART);
5780     if( coll==NULL || iter==NULL ||
5781         state==NULL ||
5782         count<0 || (count>0 && dest==NULL)
5783     ) {
5784         *status=U_ILLEGAL_ARGUMENT_ERROR;
5785         UTRACE_EXIT_STATUS(status);
5786         return 0;
5787     }
5788 
5789     UTRACE_DATA6(UTRACE_VERBOSE, "coll=%p, iter=%p, state=%d %d, dest=%p, count=%d",
5790                   coll, iter, state[0], state[1], dest, count);
5791 
5792     if(count==0) {
5793         /* nothing to do */
5794         UTRACE_EXIT_VALUE(0);
5795         return 0;
5796     }
5797     /** Setting up situation according to the state we got from the previous iteration */
5798     // The state of the iterator from the previous invocation
5799     uint32_t iterState = state[0];
5800     // Has the last iteration ended in the shifted state
5801     UBool wasShifted = ((state[1] >> UCOL_PSK_WAS_SHIFTED_SHIFT) & UCOL_PSK_WAS_SHIFTED_MASK)?TRUE:FALSE;
5802     // What is the current level of the sortkey?
5803     int32_t level= (state[1] >> UCOL_PSK_LEVEL_SHIFT) & UCOL_PSK_LEVEL_MASK;
5804     // Have we written only one byte from a two byte primary in the previous iteration?
5805     // Also on secondary level - have we finished with the French secondary?
5806     int32_t byteCountOrFrenchDone = (state[1] >> UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT) & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK;
5807     // number of bytes in the continuation buffer for French
5808     int32_t usedFrench = (state[1] >> UCOL_PSK_USED_FRENCH_SHIFT) & UCOL_PSK_USED_FRENCH_MASK;
5809     // Number of bytes already written from a bocsu sequence. Since
5810     // the longes bocsu sequence is 4 long, this can be up to 3.
5811     int32_t bocsuBytesUsed = (state[1] >> UCOL_PSK_BOCSU_BYTES_SHIFT) & UCOL_PSK_BOCSU_BYTES_MASK;
5812     // Number of elements that need to be consumed in this iteration because
5813     // the iterator returned UITER_NO_STATE at the end of the last iteration,
5814     // so we had to save the last valid state.
5815     int32_t cces = (state[1] >> UCOL_PSK_CONSUMED_CES_SHIFT) & UCOL_PSK_CONSUMED_CES_MASK;
5816 
5817     /** values that depend on the collator attributes */
5818     // strength of the collator.
5819     int32_t strength = ucol_getAttribute(coll, UCOL_STRENGTH, status);
5820     // maximal level of the partial sortkey. Need to take whether case level is done
5821     int32_t maxLevel = 0;
5822     if(strength < UCOL_TERTIARY) {
5823         if(ucol_getAttribute(coll, UCOL_CASE_LEVEL, status) == UCOL_ON) {
5824             maxLevel = UCOL_PSK_CASE;
5825         } else {
5826             maxLevel = strength;
5827         }
5828     } else {
5829         if(strength == UCOL_TERTIARY) {
5830             maxLevel = UCOL_PSK_TERTIARY;
5831         } else if(strength == UCOL_QUATERNARY) {
5832             maxLevel = UCOL_PSK_QUATERNARY;
5833         } else { // identical
5834             maxLevel = UCOL_IDENTICAL;
5835         }
5836     }
5837     // value for the quaternary level if Hiragana is encountered. Used for JIS X 4061 collation
5838     uint8_t UCOL_HIRAGANA_QUAD =
5839       (ucol_getAttribute(coll, UCOL_HIRAGANA_QUATERNARY_MODE, status) == UCOL_ON)?0xFE:0xFF;
5840     // Boundary value that decides whether a CE is shifted or not
5841     uint32_t LVT = (coll->alternateHandling == UCOL_SHIFTED)?(coll->variableTopValue<<16):0;
5842     // Are we doing French collation?
5843     UBool doingFrench = (ucol_getAttribute(coll, UCOL_FRENCH_COLLATION, status) == UCOL_ON);
5844 
5845     /** initializing the collation state */
5846     UBool notIsContinuation = FALSE;
5847     uint32_t CE = UCOL_NO_MORE_CES;
5848 
5849     collIterate s;
5850     IInit_collIterate(coll, NULL, -1, &s, status);
5851     if(U_FAILURE(*status)) {
5852         UTRACE_EXIT_STATUS(*status);
5853         return 0;
5854     }
5855     s.iterator = iter;
5856     s.flags |= UCOL_USE_ITERATOR;
5857     // This variable tells us whether we have produced some other levels in this iteration
5858     // before we moved to the identical level. In that case, we need to switch the
5859     // type of the iterator.
5860     UBool doingIdenticalFromStart = FALSE;
5861     // Normalizing iterator
5862     // The division for the array length may truncate the array size to
5863     // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
5864     // for all platforms anyway.
5865     UAlignedMemory stackNormIter[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
5866     UNormIterator *normIter = NULL;
5867     // If the normalization is turned on for the collator and we are below identical level
5868     // we will use a FCD normalizing iterator
5869     if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON && level < UCOL_PSK_IDENTICAL) {
5870         normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status);
5871         s.iterator = unorm_setIter(normIter, iter, UNORM_FCD, status);
5872         s.flags &= ~UCOL_ITER_NORM;
5873         if(U_FAILURE(*status)) {
5874             UTRACE_EXIT_STATUS(*status);
5875             return 0;
5876         }
5877     } else if(level == UCOL_PSK_IDENTICAL) {
5878         // for identical level, we need a NFD iterator. We need to instantiate it here, since we
5879         // will be updating the state - and this cannot be done on an ordinary iterator.
5880         normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status);
5881         s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status);
5882         s.flags &= ~UCOL_ITER_NORM;
5883         if(U_FAILURE(*status)) {
5884             UTRACE_EXIT_STATUS(*status);
5885             return 0;
5886         }
5887         doingIdenticalFromStart = TRUE;
5888     }
5889 
5890     // This is the tentative new state of the iterator. The problem
5891     // is that the iterator might return an undefined state, in
5892     // which case we should save the last valid state and increase
5893     // the iterator skip value.
5894     uint32_t newState = 0;
5895 
5896     // First, we set the iterator to the last valid position
5897     // from the last iteration. This was saved in state[0].
5898     if(iterState == 0) {
5899         /* initial state */
5900         if(level == UCOL_PSK_SECONDARY && doingFrench && !byteCountOrFrenchDone) {
5901             s.iterator->move(s.iterator, 0, UITER_LIMIT);
5902         } else {
5903             s.iterator->move(s.iterator, 0, UITER_START);
5904         }
5905     } else {
5906         /* reset to previous state */
5907         s.iterator->setState(s.iterator, iterState, status);
5908         if(U_FAILURE(*status)) {
5909             UTRACE_EXIT_STATUS(*status);
5910             return 0;
5911         }
5912     }
5913 
5914 
5915 
5916     // This variable tells us whether we can attempt to update the state
5917     // of iterator. Situations where we don't want to update iterator state
5918     // are the existence of expansion CEs that are not yet processed, and
5919     // finishing the case level without enough space in the buffer to insert
5920     // a level terminator.
5921     UBool canUpdateState = TRUE;
5922 
5923     // Consume all the CEs that were consumed at the end of the previous
5924     // iteration without updating the iterator state. On identical level,
5925     // consume the code points.
5926     int32_t counter = cces;
5927     if(level < UCOL_PSK_IDENTICAL) {
5928         while(counter-->0) {
5929             // If we're doing French and we are on the secondary level,
5930             // we go backwards.
5931             if(level == UCOL_PSK_SECONDARY && doingFrench) {
5932                 CE = ucol_IGetPrevCE(coll, &s, status);
5933             } else {
5934                 CE = ucol_IGetNextCE(coll, &s, status);
5935             }
5936             if(CE==UCOL_NO_MORE_CES) {
5937                 /* should not happen */
5938                 *status=U_INTERNAL_PROGRAM_ERROR;
5939                 UTRACE_EXIT_STATUS(*status);
5940                 return 0;
5941             }
5942             if(uprv_numAvailableExpCEs(s)) {
5943                 canUpdateState = FALSE;
5944             }
5945         }
5946     } else {
5947         while(counter-->0) {
5948             uiter_next32(s.iterator);
5949         }
5950     }
5951 
5952     // French secondary needs to know whether the iterator state of zero came from previous level OR
5953     // from a new invocation...
5954     UBool wasDoingPrimary = FALSE;
5955     // destination buffer byte counter. When this guy
5956     // gets to count, we're done with the iteration
5957     int32_t i = 0;
5958     // used to count the zero bytes written after we
5959     // have finished with the sort key
5960     int32_t j = 0;
5961 
5962 
5963     // Hm.... I think we're ready to plunge in. Basic story is as following:
5964     // we have a fall through case based on level. This is used for initial
5965     // positioning on iteration start. Every level processor contains a
5966     // for(;;) which will be broken when we exhaust all the CEs. Other
5967     // way to exit is a goto saveState, which happens when we have filled
5968     // out our buffer.
5969     switch(level) {
5970     case UCOL_PSK_PRIMARY:
5971         wasDoingPrimary = TRUE;
5972         for(;;) {
5973             if(i==count) {
5974                 goto saveState;
5975             }
5976             // We should save the state only if we
5977             // are sure that we are done with the
5978             // previous iterator state
5979             if(canUpdateState && byteCountOrFrenchDone == 0) {
5980                 newState = s.iterator->getState(s.iterator);
5981                 if(newState != UITER_NO_STATE) {
5982                     iterState = newState;
5983                     cces = 0;
5984                 }
5985             }
5986             CE = ucol_IGetNextCE(coll, &s, status);
5987             cces++;
5988             if(CE==UCOL_NO_MORE_CES) {
5989                 // Add the level separator
5990                 terminatePSKLevel(level, maxLevel, i, dest);
5991                 byteCountOrFrenchDone=0;
5992                 // Restart the iteration an move to the
5993                 // second level
5994                 s.iterator->move(s.iterator, 0, UITER_START);
5995                 cces = 0;
5996                 level = UCOL_PSK_SECONDARY;
5997                 break;
5998             }
5999             if(!isContinuation(CE)){
6000                 if(coll->leadBytePermutationTable != NULL){
6001                     CE = (coll->leadBytePermutationTable[CE>>24] << 24) | (CE & 0x00FFFFFF);
6002                 }
6003             }
6004             if(!isShiftedCE(CE, LVT, &wasShifted)) {
6005                 CE >>= UCOL_PRIMARYORDERSHIFT; /* get primary */
6006                 if(CE != 0) {
6007                     if(byteCountOrFrenchDone == 0) {
6008                         // get the second byte of primary
6009                         dest[i++]=(uint8_t)(CE >> 8);
6010                     } else {
6011                         byteCountOrFrenchDone = 0;
6012                     }
6013                     if((CE &=0xff)!=0) {
6014                         if(i==count) {
6015                             /* overflow */
6016                             byteCountOrFrenchDone = 1;
6017                             cces--;
6018                             goto saveState;
6019                         }
6020                         dest[i++]=(uint8_t)CE;
6021                     }
6022                 }
6023             }
6024             if(uprv_numAvailableExpCEs(s)) {
6025                 canUpdateState = FALSE;
6026             } else {
6027                 canUpdateState = TRUE;
6028             }
6029         }
6030         /* fall through to next level */
6031     case UCOL_PSK_SECONDARY:
6032         if(strength >= UCOL_SECONDARY) {
6033             if(!doingFrench) {
6034                 for(;;) {
6035                     if(i == count) {
6036                         goto saveState;
6037                     }
6038                     // We should save the state only if we
6039                     // are sure that we are done with the
6040                     // previous iterator state
6041                     if(canUpdateState) {
6042                         newState = s.iterator->getState(s.iterator);
6043                         if(newState != UITER_NO_STATE) {
6044                             iterState = newState;
6045                             cces = 0;
6046                         }
6047                     }
6048                     CE = ucol_IGetNextCE(coll, &s, status);
6049                     cces++;
6050                     if(CE==UCOL_NO_MORE_CES) {
6051                         // Add the level separator
6052                         terminatePSKLevel(level, maxLevel, i, dest);
6053                         byteCountOrFrenchDone = 0;
6054                         // Restart the iteration an move to the
6055                         // second level
6056                         s.iterator->move(s.iterator, 0, UITER_START);
6057                         cces = 0;
6058                         level = UCOL_PSK_CASE;
6059                         break;
6060                     }
6061                     if(!isShiftedCE(CE, LVT, &wasShifted)) {
6062                         CE >>= 8; /* get secondary */
6063                         if(CE != 0) {
6064                             dest[i++]=(uint8_t)CE;
6065                         }
6066                     }
6067                     if(uprv_numAvailableExpCEs(s)) {
6068                         canUpdateState = FALSE;
6069                     } else {
6070                         canUpdateState = TRUE;
6071                     }
6072                 }
6073             } else { // French secondary processing
6074                 uint8_t frenchBuff[UCOL_MAX_BUFFER];
6075                 int32_t frenchIndex = 0;
6076                 // Here we are going backwards.
6077                 // If the iterator is at the beggining, it should be
6078                 // moved to end.
6079                 if(wasDoingPrimary) {
6080                     s.iterator->move(s.iterator, 0, UITER_LIMIT);
6081                     cces = 0;
6082                 }
6083                 for(;;) {
6084                     if(i == count) {
6085                         goto saveState;
6086                     }
6087                     if(canUpdateState) {
6088                         newState = s.iterator->getState(s.iterator);
6089                         if(newState != UITER_NO_STATE) {
6090                             iterState = newState;
6091                             cces = 0;
6092                         }
6093                     }
6094                     CE = ucol_IGetPrevCE(coll, &s, status);
6095                     cces++;
6096                     if(CE==UCOL_NO_MORE_CES) {
6097                         // Add the level separator
6098                         terminatePSKLevel(level, maxLevel, i, dest);
6099                         byteCountOrFrenchDone = 0;
6100                         // Restart the iteration an move to the next level
6101                         s.iterator->move(s.iterator, 0, UITER_START);
6102                         level = UCOL_PSK_CASE;
6103                         break;
6104                     }
6105                     if(isContinuation(CE)) { // if it's a continuation, we want to save it and
6106                         // reverse when we get a first non-continuation CE.
6107                         CE >>= 8;
6108                         frenchBuff[frenchIndex++] = (uint8_t)CE;
6109                     } else if(!isShiftedCE(CE, LVT, &wasShifted)) {
6110                         CE >>= 8; /* get secondary */
6111                         if(!frenchIndex) {
6112                             if(CE != 0) {
6113                                 dest[i++]=(uint8_t)CE;
6114                             }
6115                         } else {
6116                             frenchBuff[frenchIndex++] = (uint8_t)CE;
6117                             frenchIndex -= usedFrench;
6118                             usedFrench = 0;
6119                             while(i < count && frenchIndex) {
6120                                 dest[i++] = frenchBuff[--frenchIndex];
6121                                 usedFrench++;
6122                             }
6123                         }
6124                     }
6125                     if(uprv_numAvailableExpCEs(s)) {
6126                         canUpdateState = FALSE;
6127                     } else {
6128                         canUpdateState = TRUE;
6129                     }
6130                 }
6131             }
6132         } else {
6133             level = UCOL_PSK_CASE;
6134         }
6135         /* fall through to next level */
6136     case UCOL_PSK_CASE:
6137         if(ucol_getAttribute(coll, UCOL_CASE_LEVEL, status) == UCOL_ON) {
6138             uint32_t caseShift = UCOL_CASE_SHIFT_START;
6139             uint8_t caseByte = UCOL_CASE_BYTE_START;
6140             uint8_t caseBits = 0;
6141 
6142             for(;;) {
6143                 U_ASSERT(caseShift <= UCOL_CASE_SHIFT_START);
6144                 if(i == count) {
6145                     goto saveState;
6146                 }
6147                 // We should save the state only if we
6148                 // are sure that we are done with the
6149                 // previous iterator state
6150                 if(canUpdateState) {
6151                     newState = s.iterator->getState(s.iterator);
6152                     if(newState != UITER_NO_STATE) {
6153                         iterState = newState;
6154                         cces = 0;
6155                     }
6156                 }
6157                 CE = ucol_IGetNextCE(coll, &s, status);
6158                 cces++;
6159                 if(CE==UCOL_NO_MORE_CES) {
6160                     // On the case level we might have an unfinished
6161                     // case byte. Add one if it's started.
6162                     if(caseShift != UCOL_CASE_SHIFT_START) {
6163                         dest[i++] = caseByte;
6164                     }
6165                     cces = 0;
6166                     // We have finished processing CEs on this level.
6167                     // However, we don't know if we have enough space
6168                     // to add a case level terminator.
6169                     if(i < count) {
6170                         // Add the level separator
6171                         terminatePSKLevel(level, maxLevel, i, dest);
6172                         // Restart the iteration and move to the
6173                         // next level
6174                         s.iterator->move(s.iterator, 0, UITER_START);
6175                         level = UCOL_PSK_TERTIARY;
6176                     } else {
6177                         canUpdateState = FALSE;
6178                     }
6179                     break;
6180                 }
6181 
6182                 if(!isShiftedCE(CE, LVT, &wasShifted)) {
6183                     if(!isContinuation(CE) && ((CE & UCOL_PRIMARYMASK) != 0 || strength > UCOL_PRIMARY)) {
6184                         // do the case level if we need to do it. We don't want to calculate
6185                         // case level for primary ignorables if we have only primary strength and case level
6186                         // otherwise we would break well formedness of CEs
6187                         CE = (uint8_t)(CE & UCOL_BYTE_SIZE_MASK);
6188                         caseBits = (uint8_t)(CE & 0xC0);
6189                         // this copies the case level logic from the
6190                         // sort key generation code
6191                         if(CE != 0) {
6192                             if (caseShift == 0) {
6193                                 dest[i++] = caseByte;
6194                                 caseShift = UCOL_CASE_SHIFT_START;
6195                                 caseByte = UCOL_CASE_BYTE_START;
6196                             }
6197                             if(coll->caseFirst == UCOL_UPPER_FIRST) {
6198                                 if((caseBits & 0xC0) == 0) {
6199                                     caseByte |= 1 << (--caseShift);
6200                                 } else {
6201                                     caseByte |= 0 << (--caseShift);
6202                                     /* second bit */
6203                                     if(caseShift == 0) {
6204                                         dest[i++] = caseByte;
6205                                         caseShift = UCOL_CASE_SHIFT_START;
6206                                         caseByte = UCOL_CASE_BYTE_START;
6207                                     }
6208                                     caseByte |= ((caseBits>>6)&1) << (--caseShift);
6209                                 }
6210                             } else {
6211                                 if((caseBits & 0xC0) == 0) {
6212                                     caseByte |= 0 << (--caseShift);
6213                                 } else {
6214                                     caseByte |= 1 << (--caseShift);
6215                                     /* second bit */
6216                                     if(caseShift == 0) {
6217                                         dest[i++] = caseByte;
6218                                         caseShift = UCOL_CASE_SHIFT_START;
6219                                         caseByte = UCOL_CASE_BYTE_START;
6220                                     }
6221                                     caseByte |= ((caseBits>>7)&1) << (--caseShift);
6222                                 }
6223                             }
6224                         }
6225 
6226                     }
6227                 }
6228                 // Not sure this is correct for the case level - revisit
6229                 if(uprv_numAvailableExpCEs(s)) {
6230                     canUpdateState = FALSE;
6231                 } else {
6232                     canUpdateState = TRUE;
6233                 }
6234             }
6235         } else {
6236             level = UCOL_PSK_TERTIARY;
6237         }
6238         /* fall through to next level */
6239     case UCOL_PSK_TERTIARY:
6240         if(strength >= UCOL_TERTIARY) {
6241             for(;;) {
6242                 if(i == count) {
6243                     goto saveState;
6244                 }
6245                 // We should save the state only if we
6246                 // are sure that we are done with the
6247                 // previous iterator state
6248                 if(canUpdateState) {
6249                     newState = s.iterator->getState(s.iterator);
6250                     if(newState != UITER_NO_STATE) {
6251                         iterState = newState;
6252                         cces = 0;
6253                     }
6254                 }
6255                 CE = ucol_IGetNextCE(coll, &s, status);
6256                 cces++;
6257                 if(CE==UCOL_NO_MORE_CES) {
6258                     // Add the level separator
6259                     terminatePSKLevel(level, maxLevel, i, dest);
6260                     byteCountOrFrenchDone = 0;
6261                     // Restart the iteration an move to the
6262                     // second level
6263                     s.iterator->move(s.iterator, 0, UITER_START);
6264                     cces = 0;
6265                     level = UCOL_PSK_QUATERNARY;
6266                     break;
6267                 }
6268                 if(!isShiftedCE(CE, LVT, &wasShifted)) {
6269                     notIsContinuation = !isContinuation(CE);
6270 
6271                     if(notIsContinuation) {
6272                         CE = (uint8_t)(CE & UCOL_BYTE_SIZE_MASK);
6273                         CE ^= coll->caseSwitch;
6274                         CE &= coll->tertiaryMask;
6275                     } else {
6276                         CE = (uint8_t)((CE & UCOL_REMOVE_CONTINUATION));
6277                     }
6278 
6279                     if(CE != 0) {
6280                         dest[i++]=(uint8_t)CE;
6281                     }
6282                 }
6283                 if(uprv_numAvailableExpCEs(s)) {
6284                     canUpdateState = FALSE;
6285                 } else {
6286                     canUpdateState = TRUE;
6287                 }
6288             }
6289         } else {
6290             // if we're not doing tertiary
6291             // skip to the end
6292             level = UCOL_PSK_NULL;
6293         }
6294         /* fall through to next level */
6295     case UCOL_PSK_QUATERNARY:
6296         if(strength >= UCOL_QUATERNARY) {
6297             for(;;) {
6298                 if(i == count) {
6299                     goto saveState;
6300                 }
6301                 // We should save the state only if we
6302                 // are sure that we are done with the
6303                 // previous iterator state
6304                 if(canUpdateState) {
6305                     newState = s.iterator->getState(s.iterator);
6306                     if(newState != UITER_NO_STATE) {
6307                         iterState = newState;
6308                         cces = 0;
6309                     }
6310                 }
6311                 CE = ucol_IGetNextCE(coll, &s, status);
6312                 cces++;
6313                 if(CE==UCOL_NO_MORE_CES) {
6314                     // Add the level separator
6315                     terminatePSKLevel(level, maxLevel, i, dest);
6316                     //dest[i++] = UCOL_LEVELTERMINATOR;
6317                     byteCountOrFrenchDone = 0;
6318                     // Restart the iteration an move to the
6319                     // second level
6320                     s.iterator->move(s.iterator, 0, UITER_START);
6321                     cces = 0;
6322                     level = UCOL_PSK_QUIN;
6323                     break;
6324                 }
6325                 if(CE==0)
6326                     continue;
6327                 if(isShiftedCE(CE, LVT, &wasShifted)) {
6328                     CE >>= 16; /* get primary */
6329                     if(CE != 0) {
6330                         if(byteCountOrFrenchDone == 0) {
6331                             dest[i++]=(uint8_t)(CE >> 8);
6332                         } else {
6333                             byteCountOrFrenchDone = 0;
6334                         }
6335                         if((CE &=0xff)!=0) {
6336                             if(i==count) {
6337                                 /* overflow */
6338                                 byteCountOrFrenchDone = 1;
6339                                 goto saveState;
6340                             }
6341                             dest[i++]=(uint8_t)CE;
6342                         }
6343                     }
6344                 } else {
6345                     notIsContinuation = !isContinuation(CE);
6346                     if(notIsContinuation) {
6347                         if(s.flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we need to note it
6348                             dest[i++] = UCOL_HIRAGANA_QUAD;
6349                         } else {
6350                             dest[i++] = 0xFF;
6351                         }
6352                     }
6353                 }
6354                 if(uprv_numAvailableExpCEs(s)) {
6355                     canUpdateState = FALSE;
6356                 } else {
6357                     canUpdateState = TRUE;
6358                 }
6359             }
6360         } else {
6361             // if we're not doing quaternary
6362             // skip to the end
6363             level = UCOL_PSK_NULL;
6364         }
6365         /* fall through to next level */
6366     case UCOL_PSK_QUIN:
6367         level = UCOL_PSK_IDENTICAL;
6368         /* fall through to next level */
6369     case UCOL_PSK_IDENTICAL:
6370         if(strength >= UCOL_IDENTICAL) {
6371             UChar32 first, second;
6372             int32_t bocsuBytesWritten = 0;
6373             // We always need to do identical on
6374             // the NFD form of the string.
6375             if(normIter == NULL) {
6376                 // we arrived from the level below and
6377                 // normalization was not turned on.
6378                 // therefore, we need to make a fresh NFD iterator
6379                 normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status);
6380                 s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status);
6381             } else if(!doingIdenticalFromStart) {
6382                 // there is an iterator, but we did some other levels.
6383                 // therefore, we have a FCD iterator - need to make
6384                 // a NFD one.
6385                 // normIter being at the beginning does not guarantee
6386                 // that the underlying iterator is at the beginning
6387                 iter->move(iter, 0, UITER_START);
6388                 s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status);
6389             }
6390             // At this point we have a NFD iterator that is positioned
6391             // in the right place
6392             if(U_FAILURE(*status)) {
6393                 UTRACE_EXIT_STATUS(*status);
6394                 return 0;
6395             }
6396             first = uiter_previous32(s.iterator);
6397             // maybe we're at the start of the string
6398             if(first == U_SENTINEL) {
6399                 first = 0;
6400             } else {
6401                 uiter_next32(s.iterator);
6402             }
6403 
6404             j = 0;
6405             for(;;) {
6406                 if(i == count) {
6407                     if(j+1 < bocsuBytesWritten) {
6408                         bocsuBytesUsed = j+1;
6409                     }
6410                     goto saveState;
6411                 }
6412 
6413                 // On identical level, we will always save
6414                 // the state if we reach this point, since
6415                 // we don't depend on getNextCE for content
6416                 // all the content is in our buffer and we
6417                 // already either stored the full buffer OR
6418                 // otherwise we won't arrive here.
6419                 newState = s.iterator->getState(s.iterator);
6420                 if(newState != UITER_NO_STATE) {
6421                     iterState = newState;
6422                     cces = 0;
6423                 }
6424 
6425                 uint8_t buff[4];
6426                 second = uiter_next32(s.iterator);
6427                 cces++;
6428 
6429                 // end condition for identical level
6430                 if(second == U_SENTINEL) {
6431                     terminatePSKLevel(level, maxLevel, i, dest);
6432                     level = UCOL_PSK_NULL;
6433                     break;
6434                 }
6435                 bocsuBytesWritten = u_writeIdenticalLevelRunTwoChars(first, second, buff);
6436                 first = second;
6437 
6438                 j = 0;
6439                 if(bocsuBytesUsed != 0) {
6440                     while(bocsuBytesUsed-->0) {
6441                         j++;
6442                     }
6443                 }
6444 
6445                 while(i < count && j < bocsuBytesWritten) {
6446                     dest[i++] = buff[j++];
6447                 }
6448             }
6449 
6450         } else {
6451             level = UCOL_PSK_NULL;
6452         }
6453         /* fall through to next level */
6454     case UCOL_PSK_NULL:
6455         j = i;
6456         while(j<count) {
6457             dest[j++]=0;
6458         }
6459         break;
6460     default:
6461         *status = U_INTERNAL_PROGRAM_ERROR;
6462         UTRACE_EXIT_STATUS(*status);
6463         return 0;
6464     }
6465 
6466 saveState:
6467     // Now we need to return stuff. First we want to see whether we have
6468     // done everything for the current state of iterator.
6469     if(byteCountOrFrenchDone
6470         || canUpdateState == FALSE
6471         || (newState = s.iterator->getState(s.iterator)) == UITER_NO_STATE)
6472     {
6473         // Any of above mean that the previous transaction
6474         // wasn't finished and that we should store the
6475         // previous iterator state.
6476         state[0] = iterState;
6477     } else {
6478         // The transaction is complete. We will continue in the next iteration.
6479         state[0] = s.iterator->getState(s.iterator);
6480         cces = 0;
6481     }
6482     // Store the number of bocsu bytes written.
6483     if((bocsuBytesUsed & UCOL_PSK_BOCSU_BYTES_MASK) != bocsuBytesUsed) {
6484         *status = U_INDEX_OUTOFBOUNDS_ERROR;
6485     }
6486     state[1] = (bocsuBytesUsed & UCOL_PSK_BOCSU_BYTES_MASK) << UCOL_PSK_BOCSU_BYTES_SHIFT;
6487 
6488     // Next we put in the level of comparison
6489     state[1] |= ((level & UCOL_PSK_LEVEL_MASK) << UCOL_PSK_LEVEL_SHIFT);
6490 
6491     // If we are doing French, we need to store whether we have just finished the French level
6492     if(level == UCOL_PSK_SECONDARY && doingFrench) {
6493         state[1] |= (((state[0] == 0) & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK) << UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT);
6494     } else {
6495         state[1] |= ((byteCountOrFrenchDone & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK) << UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT);
6496     }
6497 
6498     // Was the latest CE shifted
6499     if(wasShifted) {
6500         state[1] |= 1 << UCOL_PSK_WAS_SHIFTED_SHIFT;
6501     }
6502     // Check for cces overflow
6503     if((cces & UCOL_PSK_CONSUMED_CES_MASK) != cces) {
6504         *status = U_INDEX_OUTOFBOUNDS_ERROR;
6505     }
6506     // Store cces
6507     state[1] |= ((cces & UCOL_PSK_CONSUMED_CES_MASK) << UCOL_PSK_CONSUMED_CES_SHIFT);
6508 
6509     // Check for French overflow
6510     if((usedFrench & UCOL_PSK_USED_FRENCH_MASK) != usedFrench) {
6511         *status = U_INDEX_OUTOFBOUNDS_ERROR;
6512     }
6513     // Store number of bytes written in the French secondary continuation sequence
6514     state[1] |= ((usedFrench & UCOL_PSK_USED_FRENCH_MASK) << UCOL_PSK_USED_FRENCH_SHIFT);
6515 
6516 
6517     // If we have used normalizing iterator, get rid of it
6518     if(normIter != NULL) {
6519         unorm_closeIter(normIter);
6520     }
6521 
6522     /* To avoid memory leak, free the offset buffer if necessary. */
6523     ucol_freeOffsetBuffer(&s);
6524 
6525     // Return number of meaningful sortkey bytes.
6526     UTRACE_DATA4(UTRACE_VERBOSE, "dest = %vb, state=%d %d",
6527                   dest,i, state[0], state[1]);
6528     UTRACE_EXIT_VALUE(i);
6529     return i;
6530 }
6531 
6532 /**
6533  * Produce a bound for a given sortkey and a number of levels.
6534  */
6535 U_CAPI int32_t U_EXPORT2
ucol_getBound(const uint8_t * source,int32_t sourceLength,UColBoundMode boundType,uint32_t noOfLevels,uint8_t * result,int32_t resultLength,UErrorCode * status)6536 ucol_getBound(const uint8_t       *source,
6537         int32_t             sourceLength,
6538         UColBoundMode       boundType,
6539         uint32_t            noOfLevels,
6540         uint8_t             *result,
6541         int32_t             resultLength,
6542         UErrorCode          *status)
6543 {
6544     // consistency checks
6545     if(status == NULL || U_FAILURE(*status)) {
6546         return 0;
6547     }
6548     if(source == NULL) {
6549         *status = U_ILLEGAL_ARGUMENT_ERROR;
6550         return 0;
6551     }
6552 
6553     int32_t sourceIndex = 0;
6554     // Scan the string until we skip enough of the key OR reach the end of the key
6555     do {
6556         sourceIndex++;
6557         if(source[sourceIndex] == UCOL_LEVELTERMINATOR) {
6558             noOfLevels--;
6559         }
6560     } while (noOfLevels > 0
6561         && (source[sourceIndex] != 0 || sourceIndex < sourceLength));
6562 
6563     if((source[sourceIndex] == 0 || sourceIndex == sourceLength)
6564         && noOfLevels > 0) {
6565             *status = U_SORT_KEY_TOO_SHORT_WARNING;
6566     }
6567 
6568 
6569     // READ ME: this code assumes that the values for boundType
6570     // enum will not changes. They are set so that the enum value
6571     // corresponds to the number of extra bytes each bound type
6572     // needs.
6573     if(result != NULL && resultLength >= sourceIndex+boundType) {
6574         uprv_memcpy(result, source, sourceIndex);
6575         switch(boundType) {
6576             // Lower bound just gets terminated. No extra bytes
6577         case UCOL_BOUND_LOWER: // = 0
6578             break;
6579             // Upper bound needs one extra byte
6580         case UCOL_BOUND_UPPER: // = 1
6581             result[sourceIndex++] = 2;
6582             break;
6583             // Upper long bound needs two extra bytes
6584         case UCOL_BOUND_UPPER_LONG: // = 2
6585             result[sourceIndex++] = 0xFF;
6586             result[sourceIndex++] = 0xFF;
6587             break;
6588         default:
6589             *status = U_ILLEGAL_ARGUMENT_ERROR;
6590             return 0;
6591         }
6592         result[sourceIndex++] = 0;
6593 
6594         return sourceIndex;
6595     } else {
6596         return sourceIndex+boundType+1;
6597     }
6598 }
6599 
6600 /****************************************************************************/
6601 /* Following are the functions that deal with the properties of a collator  */
6602 /* there are new APIs and some compatibility APIs                           */
6603 /****************************************************************************/
6604 
6605 static inline void
ucol_addLatinOneEntry(UCollator * coll,UChar ch,uint32_t CE,int32_t * primShift,int32_t * secShift,int32_t * terShift)6606 ucol_addLatinOneEntry(UCollator *coll, UChar ch, uint32_t CE,
6607                     int32_t *primShift, int32_t *secShift, int32_t *terShift)
6608 {
6609     uint8_t primary1 = 0, primary2 = 0, secondary = 0, tertiary = 0;
6610     UBool reverseSecondary = FALSE;
6611     UBool continuation = isContinuation(CE);
6612     if(!continuation) {
6613         tertiary = (uint8_t)((CE & coll->tertiaryMask));
6614         tertiary ^= coll->caseSwitch;
6615         reverseSecondary = TRUE;
6616     } else {
6617         tertiary = (uint8_t)((CE & UCOL_REMOVE_CONTINUATION));
6618         tertiary &= UCOL_REMOVE_CASE;
6619         reverseSecondary = FALSE;
6620     }
6621 
6622     secondary = (uint8_t)((CE >>= 8) & UCOL_BYTE_SIZE_MASK);
6623     primary2 = (uint8_t)((CE >>= 8) & UCOL_BYTE_SIZE_MASK);
6624     primary1 = (uint8_t)(CE >> 8);
6625 
6626     if(primary1 != 0) {
6627         if (coll->leadBytePermutationTable != NULL && !continuation) {
6628             primary1 = coll->leadBytePermutationTable[primary1];
6629         }
6630 
6631         coll->latinOneCEs[ch] |= (primary1 << *primShift);
6632         *primShift -= 8;
6633     }
6634     if(primary2 != 0) {
6635         if(*primShift < 0) {
6636             coll->latinOneCEs[ch] = UCOL_BAIL_OUT_CE;
6637             coll->latinOneCEs[coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
6638             coll->latinOneCEs[2*coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
6639             return;
6640         }
6641         coll->latinOneCEs[ch] |= (primary2 << *primShift);
6642         *primShift -= 8;
6643     }
6644     if(secondary != 0) {
6645         if(reverseSecondary && coll->frenchCollation == UCOL_ON) { // reverse secondary
6646             coll->latinOneCEs[coll->latinOneTableLen+ch] >>= 8; // make space for secondary
6647             coll->latinOneCEs[coll->latinOneTableLen+ch] |= (secondary << 24);
6648         } else { // normal case
6649             coll->latinOneCEs[coll->latinOneTableLen+ch] |= (secondary << *secShift);
6650         }
6651         *secShift -= 8;
6652     }
6653     if(tertiary != 0) {
6654         coll->latinOneCEs[2*coll->latinOneTableLen+ch] |= (tertiary << *terShift);
6655         *terShift -= 8;
6656     }
6657 }
6658 
6659 static inline UBool
ucol_resizeLatinOneTable(UCollator * coll,int32_t size,UErrorCode * status)6660 ucol_resizeLatinOneTable(UCollator *coll, int32_t size, UErrorCode *status) {
6661     uint32_t *newTable = (uint32_t *)uprv_malloc(size*sizeof(uint32_t)*3);
6662     if(newTable == NULL) {
6663       *status = U_MEMORY_ALLOCATION_ERROR;
6664       coll->latinOneFailed = TRUE;
6665       return FALSE;
6666     }
6667     int32_t sizeToCopy = ((size<coll->latinOneTableLen)?size:coll->latinOneTableLen)*sizeof(uint32_t);
6668     uprv_memset(newTable, 0, size*sizeof(uint32_t)*3);
6669     uprv_memcpy(newTable, coll->latinOneCEs, sizeToCopy);
6670     uprv_memcpy(newTable+size, coll->latinOneCEs+coll->latinOneTableLen, sizeToCopy);
6671     uprv_memcpy(newTable+2*size, coll->latinOneCEs+2*coll->latinOneTableLen, sizeToCopy);
6672     coll->latinOneTableLen = size;
6673     uprv_free(coll->latinOneCEs);
6674     coll->latinOneCEs = newTable;
6675     return TRUE;
6676 }
6677 
6678 static UBool
ucol_setUpLatinOne(UCollator * coll,UErrorCode * status)6679 ucol_setUpLatinOne(UCollator *coll, UErrorCode *status) {
6680     UBool result = TRUE;
6681     if(coll->latinOneCEs == NULL) {
6682         coll->latinOneCEs = (uint32_t *)uprv_malloc(sizeof(uint32_t)*UCOL_LATINONETABLELEN*3);
6683         if(coll->latinOneCEs == NULL) {
6684             *status = U_MEMORY_ALLOCATION_ERROR;
6685             return FALSE;
6686         }
6687         coll->latinOneTableLen = UCOL_LATINONETABLELEN;
6688     }
6689     UChar ch = 0;
6690     UCollationElements *it = ucol_openElements(coll, &ch, 1, status);
6691     // Check for null pointer
6692     if (U_FAILURE(*status)) {
6693         return FALSE;
6694     }
6695     uprv_memset(coll->latinOneCEs, 0, sizeof(uint32_t)*coll->latinOneTableLen*3);
6696 
6697     int32_t primShift = 24, secShift = 24, terShift = 24;
6698     uint32_t CE = 0;
6699     int32_t contractionOffset = UCOL_ENDOFLATINONERANGE+1;
6700 
6701     // TODO: make safe if you get more than you wanted...
6702     for(ch = 0; ch <= UCOL_ENDOFLATINONERANGE; ch++) {
6703         primShift = 24; secShift = 24; terShift = 24;
6704         if(ch < 0x100) {
6705             CE = coll->latinOneMapping[ch];
6706         } else {
6707             CE = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
6708             if(CE == UCOL_NOT_FOUND && coll->UCA) {
6709                 CE = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);
6710             }
6711         }
6712         if(CE < UCOL_NOT_FOUND) {
6713             ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift);
6714         } else {
6715             switch (getCETag(CE)) {
6716             case EXPANSION_TAG:
6717             case DIGIT_TAG:
6718                 ucol_setText(it, &ch, 1, status);
6719                 while((int32_t)(CE = ucol_next(it, status)) != UCOL_NULLORDER) {
6720                     if(primShift < 0 || secShift < 0 || terShift < 0) {
6721                         coll->latinOneCEs[ch] = UCOL_BAIL_OUT_CE;
6722                         coll->latinOneCEs[coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
6723                         coll->latinOneCEs[2*coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
6724                         break;
6725                     }
6726                     ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift);
6727                 }
6728                 break;
6729             case CONTRACTION_TAG:
6730                 // here is the trick
6731                 // F2 is contraction. We do something very similar to contractions
6732                 // but have two indices, one in the real contraction table and the
6733                 // other to where we stuffed things. This hopes that we don't have
6734                 // many contractions (this should work for latin-1 tables).
6735                 {
6736                     if((CE & 0x00FFF000) != 0) {
6737                         *status = U_UNSUPPORTED_ERROR;
6738                         goto cleanup_after_failure;
6739                     }
6740 
6741                     const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE);
6742 
6743                     CE |= (contractionOffset & 0xFFF) << 12; // insert the offset in latin-1 table
6744 
6745                     coll->latinOneCEs[ch] = CE;
6746                     coll->latinOneCEs[coll->latinOneTableLen+ch] = CE;
6747                     coll->latinOneCEs[2*coll->latinOneTableLen+ch] = CE;
6748 
6749                     // We're going to jump into contraction table, pick the elements
6750                     // and use them
6751                     do {
6752                         CE = *(coll->contractionCEs +
6753                             (UCharOffset - coll->contractionIndex));
6754                         if(CE > UCOL_NOT_FOUND && getCETag(CE) == EXPANSION_TAG) {
6755                             uint32_t size;
6756                             uint32_t i;    /* general counter */
6757                             uint32_t *CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */
6758                             size = getExpansionCount(CE);
6759                             //CE = *CEOffset++;
6760                             if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */
6761                                 for(i = 0; i<size; i++) {
6762                                     if(primShift < 0 || secShift < 0 || terShift < 0) {
6763                                         coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6764                                         coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6765                                         coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6766                                         break;
6767                                     }
6768                                     ucol_addLatinOneEntry(coll, (UChar)contractionOffset, *CEOffset++, &primShift, &secShift, &terShift);
6769                                 }
6770                             } else { /* else, we do */
6771                                 while(*CEOffset != 0) {
6772                                     if(primShift < 0 || secShift < 0 || terShift < 0) {
6773                                         coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6774                                         coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6775                                         coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6776                                         break;
6777                                     }
6778                                     ucol_addLatinOneEntry(coll, (UChar)contractionOffset, *CEOffset++, &primShift, &secShift, &terShift);
6779                                 }
6780                             }
6781                             contractionOffset++;
6782                         } else if(CE < UCOL_NOT_FOUND) {
6783                             ucol_addLatinOneEntry(coll, (UChar)contractionOffset++, CE, &primShift, &secShift, &terShift);
6784                         } else {
6785                             coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6786                             coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6787                             coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6788                             contractionOffset++;
6789                         }
6790                         UCharOffset++;
6791                         primShift = 24; secShift = 24; terShift = 24;
6792                         if(contractionOffset == coll->latinOneTableLen) { // we need to reallocate
6793                             if(!ucol_resizeLatinOneTable(coll, 2*coll->latinOneTableLen, status)) {
6794                                 goto cleanup_after_failure;
6795                             }
6796                         }
6797                     } while(*UCharOffset != 0xFFFF);
6798                 }
6799                 break;;
6800             case SPEC_PROC_TAG:
6801                 {
6802                     // 0xB7 is a precontext character defined in UCA5.1, a special
6803                     // handle is implemeted in order to save LatinOne table for
6804                     // most locales.
6805                     if (ch==0xb7) {
6806                         ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift);
6807                     }
6808                     else {
6809                         goto cleanup_after_failure;
6810                     }
6811                 }
6812                 break;
6813             default:
6814                 goto cleanup_after_failure;
6815             }
6816         }
6817     }
6818     // compact table
6819     if(contractionOffset < coll->latinOneTableLen) {
6820         if(!ucol_resizeLatinOneTable(coll, contractionOffset, status)) {
6821             goto cleanup_after_failure;
6822         }
6823     }
6824     ucol_closeElements(it);
6825     return result;
6826 
6827 cleanup_after_failure:
6828     // status should already be set before arriving here.
6829     coll->latinOneFailed = TRUE;
6830     ucol_closeElements(it);
6831     return FALSE;
6832 }
6833 
ucol_updateInternalState(UCollator * coll,UErrorCode * status)6834 void ucol_updateInternalState(UCollator *coll, UErrorCode *status) {
6835     if(U_SUCCESS(*status)) {
6836         if(coll->caseFirst == UCOL_UPPER_FIRST) {
6837             coll->caseSwitch = UCOL_CASE_SWITCH;
6838         } else {
6839             coll->caseSwitch = UCOL_NO_CASE_SWITCH;
6840         }
6841 
6842         if(coll->caseLevel == UCOL_ON || coll->caseFirst == UCOL_OFF) {
6843             coll->tertiaryMask = UCOL_REMOVE_CASE;
6844             coll->tertiaryCommon = UCOL_COMMON3_NORMAL;
6845             coll->tertiaryAddition = (int8_t)UCOL_FLAG_BIT_MASK_CASE_SW_OFF; /* Should be 0x80 */
6846             coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_OFF;
6847             coll->tertiaryBottom = UCOL_COMMON_BOT3;
6848         } else {
6849             coll->tertiaryMask = UCOL_KEEP_CASE;
6850             coll->tertiaryAddition = UCOL_FLAG_BIT_MASK_CASE_SW_ON;
6851             if(coll->caseFirst == UCOL_UPPER_FIRST) {
6852                 coll->tertiaryCommon = UCOL_COMMON3_UPPERFIRST;
6853                 coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_UPPER;
6854                 coll->tertiaryBottom = UCOL_COMMON_BOTTOM3_CASE_SW_UPPER;
6855             } else {
6856                 coll->tertiaryCommon = UCOL_COMMON3_NORMAL;
6857                 coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_LOWER;
6858                 coll->tertiaryBottom = UCOL_COMMON_BOTTOM3_CASE_SW_LOWER;
6859             }
6860         }
6861 
6862         /* Set the compression values */
6863         uint8_t tertiaryTotal = (uint8_t)(coll->tertiaryTop - UCOL_COMMON_BOT3-1);
6864         coll->tertiaryTopCount = (uint8_t)(UCOL_PROPORTION3*tertiaryTotal); /* we multilply double with int, but need only int */
6865         coll->tertiaryBottomCount = (uint8_t)(tertiaryTotal - coll->tertiaryTopCount);
6866 
6867         if(coll->caseLevel == UCOL_OFF && coll->strength == UCOL_TERTIARY
6868             && coll->frenchCollation == UCOL_OFF && coll->alternateHandling == UCOL_NON_IGNORABLE)
6869         {
6870             coll->sortKeyGen = ucol_calcSortKeySimpleTertiary;
6871         } else {
6872             coll->sortKeyGen = ucol_calcSortKey;
6873         }
6874         if(coll->caseLevel == UCOL_OFF && coll->strength <= UCOL_TERTIARY && coll->numericCollation == UCOL_OFF
6875             && coll->alternateHandling == UCOL_NON_IGNORABLE && !coll->latinOneFailed)
6876         {
6877             if(coll->latinOneCEs == NULL || coll->latinOneRegenTable) {
6878                 if(ucol_setUpLatinOne(coll, status)) { // if we succeed in building latin1 table, we'll use it
6879                     //fprintf(stderr, "F");
6880                     coll->latinOneUse = TRUE;
6881                 } else {
6882                     coll->latinOneUse = FALSE;
6883                 }
6884                 if(*status == U_UNSUPPORTED_ERROR) {
6885                     *status = U_ZERO_ERROR;
6886                 }
6887             } else { // latin1Table exists and it doesn't need to be regenerated, just use it
6888                 coll->latinOneUse = TRUE;
6889             }
6890         } else {
6891             coll->latinOneUse = FALSE;
6892         }
6893     }
6894 }
6895 
6896 U_CAPI uint32_t  U_EXPORT2
ucol_setVariableTop(UCollator * coll,const UChar * varTop,int32_t len,UErrorCode * status)6897 ucol_setVariableTop(UCollator *coll, const UChar *varTop, int32_t len, UErrorCode *status) {
6898     if(U_FAILURE(*status) || coll == NULL) {
6899         return 0;
6900     }
6901     if(len == -1) {
6902         len = u_strlen(varTop);
6903     }
6904     if(len == 0) {
6905         *status = U_ILLEGAL_ARGUMENT_ERROR;
6906         return 0;
6907     }
6908 
6909     collIterate s;
6910     IInit_collIterate(coll, varTop, len, &s, status);
6911     if(U_FAILURE(*status)) {
6912         return 0;
6913     }
6914 
6915     uint32_t CE = ucol_IGetNextCE(coll, &s, status);
6916 
6917     /* here we check if we have consumed all characters */
6918     /* you can put in either one character or a contraction */
6919     /* you shouldn't put more... */
6920     if(s.pos != s.endp || CE == UCOL_NO_MORE_CES) {
6921         *status = U_CE_NOT_FOUND_ERROR;
6922         return 0;
6923     }
6924 
6925     uint32_t nextCE = ucol_IGetNextCE(coll, &s, status);
6926 
6927     if(isContinuation(nextCE) && (nextCE & UCOL_PRIMARYMASK) != 0) {
6928         *status = U_PRIMARY_TOO_LONG_ERROR;
6929         return 0;
6930     }
6931     if(coll->variableTopValue != (CE & UCOL_PRIMARYMASK)>>16) {
6932         coll->variableTopValueisDefault = FALSE;
6933         coll->variableTopValue = (CE & UCOL_PRIMARYMASK)>>16;
6934     }
6935 
6936     /* To avoid memory leak, free the offset buffer if necessary. */
6937     ucol_freeOffsetBuffer(&s);
6938 
6939     return CE & UCOL_PRIMARYMASK;
6940 }
6941 
ucol_getVariableTop(const UCollator * coll,UErrorCode * status)6942 U_CAPI uint32_t U_EXPORT2 ucol_getVariableTop(const UCollator *coll, UErrorCode *status) {
6943     if(U_FAILURE(*status) || coll == NULL) {
6944         return 0;
6945     }
6946     return coll->variableTopValue<<16;
6947 }
6948 
6949 U_CAPI void  U_EXPORT2
ucol_restoreVariableTop(UCollator * coll,const uint32_t varTop,UErrorCode * status)6950 ucol_restoreVariableTop(UCollator *coll, const uint32_t varTop, UErrorCode *status) {
6951     if(U_FAILURE(*status) || coll == NULL) {
6952         return;
6953     }
6954 
6955     if(coll->variableTopValue != (varTop & UCOL_PRIMARYMASK)>>16) {
6956         coll->variableTopValueisDefault = FALSE;
6957         coll->variableTopValue = (varTop & UCOL_PRIMARYMASK)>>16;
6958     }
6959 }
6960 /* Attribute setter API */
6961 U_CAPI void  U_EXPORT2
ucol_setAttribute(UCollator * coll,UColAttribute attr,UColAttributeValue value,UErrorCode * status)6962 ucol_setAttribute(UCollator *coll, UColAttribute attr, UColAttributeValue value, UErrorCode *status) {
6963     if(U_FAILURE(*status) || coll == NULL) {
6964       return;
6965     }
6966     UColAttributeValue oldFrench = coll->frenchCollation;
6967     UColAttributeValue oldCaseFirst = coll->caseFirst;
6968     switch(attr) {
6969     case UCOL_NUMERIC_COLLATION: /* sort substrings of digits as numbers */
6970         if(value == UCOL_ON) {
6971             coll->numericCollation = UCOL_ON;
6972             coll->numericCollationisDefault = FALSE;
6973         } else if (value == UCOL_OFF) {
6974             coll->numericCollation = UCOL_OFF;
6975             coll->numericCollationisDefault = FALSE;
6976         } else if (value == UCOL_DEFAULT) {
6977             coll->numericCollationisDefault = TRUE;
6978             coll->numericCollation = (UColAttributeValue)coll->options->numericCollation;
6979         } else {
6980             *status = U_ILLEGAL_ARGUMENT_ERROR;
6981         }
6982         break;
6983     case UCOL_HIRAGANA_QUATERNARY_MODE: /* special quaternary values for Hiragana */
6984         if(value == UCOL_ON) {
6985             coll->hiraganaQ = UCOL_ON;
6986             coll->hiraganaQisDefault = FALSE;
6987         } else if (value == UCOL_OFF) {
6988             coll->hiraganaQ = UCOL_OFF;
6989             coll->hiraganaQisDefault = FALSE;
6990         } else if (value == UCOL_DEFAULT) {
6991             coll->hiraganaQisDefault = TRUE;
6992             coll->hiraganaQ = (UColAttributeValue)coll->options->hiraganaQ;
6993         } else {
6994             *status = U_ILLEGAL_ARGUMENT_ERROR;
6995         }
6996         break;
6997     case UCOL_FRENCH_COLLATION: /* attribute for direction of secondary weights*/
6998         if(value == UCOL_ON) {
6999             coll->frenchCollation = UCOL_ON;
7000             coll->frenchCollationisDefault = FALSE;
7001         } else if (value == UCOL_OFF) {
7002             coll->frenchCollation = UCOL_OFF;
7003             coll->frenchCollationisDefault = FALSE;
7004         } else if (value == UCOL_DEFAULT) {
7005             coll->frenchCollationisDefault = TRUE;
7006             coll->frenchCollation = (UColAttributeValue)coll->options->frenchCollation;
7007         } else {
7008             *status = U_ILLEGAL_ARGUMENT_ERROR  ;
7009         }
7010         break;
7011     case UCOL_ALTERNATE_HANDLING: /* attribute for handling variable elements*/
7012         if(value == UCOL_SHIFTED) {
7013             coll->alternateHandling = UCOL_SHIFTED;
7014             coll->alternateHandlingisDefault = FALSE;
7015         } else if (value == UCOL_NON_IGNORABLE) {
7016             coll->alternateHandling = UCOL_NON_IGNORABLE;
7017             coll->alternateHandlingisDefault = FALSE;
7018         } else if (value == UCOL_DEFAULT) {
7019             coll->alternateHandlingisDefault = TRUE;
7020             coll->alternateHandling = (UColAttributeValue)coll->options->alternateHandling ;
7021         } else {
7022             *status = U_ILLEGAL_ARGUMENT_ERROR  ;
7023         }
7024         break;
7025     case UCOL_CASE_FIRST: /* who goes first, lower case or uppercase */
7026         if(value == UCOL_LOWER_FIRST) {
7027             coll->caseFirst = UCOL_LOWER_FIRST;
7028             coll->caseFirstisDefault = FALSE;
7029         } else if (value == UCOL_UPPER_FIRST) {
7030             coll->caseFirst = UCOL_UPPER_FIRST;
7031             coll->caseFirstisDefault = FALSE;
7032         } else if (value == UCOL_OFF) {
7033             coll->caseFirst = UCOL_OFF;
7034             coll->caseFirstisDefault = FALSE;
7035         } else if (value == UCOL_DEFAULT) {
7036             coll->caseFirst = (UColAttributeValue)coll->options->caseFirst;
7037             coll->caseFirstisDefault = TRUE;
7038         } else {
7039             *status = U_ILLEGAL_ARGUMENT_ERROR  ;
7040         }
7041         break;
7042     case UCOL_CASE_LEVEL: /* do we have an extra case level */
7043         if(value == UCOL_ON) {
7044             coll->caseLevel = UCOL_ON;
7045             coll->caseLevelisDefault = FALSE;
7046         } else if (value == UCOL_OFF) {
7047             coll->caseLevel = UCOL_OFF;
7048             coll->caseLevelisDefault = FALSE;
7049         } else if (value == UCOL_DEFAULT) {
7050             coll->caseLevel = (UColAttributeValue)coll->options->caseLevel;
7051             coll->caseLevelisDefault = TRUE;
7052         } else {
7053             *status = U_ILLEGAL_ARGUMENT_ERROR  ;
7054         }
7055         break;
7056     case UCOL_NORMALIZATION_MODE: /* attribute for normalization */
7057         if(value == UCOL_ON) {
7058             coll->normalizationMode = UCOL_ON;
7059             coll->normalizationModeisDefault = FALSE;
7060             initializeFCD(status);
7061         } else if (value == UCOL_OFF) {
7062             coll->normalizationMode = UCOL_OFF;
7063             coll->normalizationModeisDefault = FALSE;
7064         } else if (value == UCOL_DEFAULT) {
7065             coll->normalizationModeisDefault = TRUE;
7066             coll->normalizationMode = (UColAttributeValue)coll->options->normalizationMode;
7067             if(coll->normalizationMode == UCOL_ON) {
7068                 initializeFCD(status);
7069             }
7070         } else {
7071             *status = U_ILLEGAL_ARGUMENT_ERROR  ;
7072         }
7073         break;
7074     case UCOL_STRENGTH:         /* attribute for strength */
7075         if (value == UCOL_DEFAULT) {
7076             coll->strengthisDefault = TRUE;
7077             coll->strength = (UColAttributeValue)coll->options->strength;
7078         } else if (value <= UCOL_IDENTICAL) {
7079             coll->strengthisDefault = FALSE;
7080             coll->strength = value;
7081         } else {
7082             *status = U_ILLEGAL_ARGUMENT_ERROR  ;
7083         }
7084         break;
7085     case UCOL_ATTRIBUTE_COUNT:
7086     default:
7087         *status = U_ILLEGAL_ARGUMENT_ERROR;
7088         break;
7089     }
7090     if(oldFrench != coll->frenchCollation || oldCaseFirst != coll->caseFirst) {
7091         coll->latinOneRegenTable = TRUE;
7092     } else {
7093         coll->latinOneRegenTable = FALSE;
7094     }
7095     ucol_updateInternalState(coll, status);
7096 }
7097 
7098 U_CAPI UColAttributeValue  U_EXPORT2
ucol_getAttribute(const UCollator * coll,UColAttribute attr,UErrorCode * status)7099 ucol_getAttribute(const UCollator *coll, UColAttribute attr, UErrorCode *status) {
7100     if(U_FAILURE(*status) || coll == NULL) {
7101       return UCOL_DEFAULT;
7102     }
7103     switch(attr) {
7104     case UCOL_NUMERIC_COLLATION:
7105       return coll->numericCollation;
7106     case UCOL_HIRAGANA_QUATERNARY_MODE:
7107       return coll->hiraganaQ;
7108     case UCOL_FRENCH_COLLATION: /* attribute for direction of secondary weights*/
7109         return coll->frenchCollation;
7110     case UCOL_ALTERNATE_HANDLING: /* attribute for handling variable elements*/
7111         return coll->alternateHandling;
7112     case UCOL_CASE_FIRST: /* who goes first, lower case or uppercase */
7113         return coll->caseFirst;
7114     case UCOL_CASE_LEVEL: /* do we have an extra case level */
7115         return coll->caseLevel;
7116     case UCOL_NORMALIZATION_MODE: /* attribute for normalization */
7117         return coll->normalizationMode;
7118     case UCOL_STRENGTH:         /* attribute for strength */
7119         return coll->strength;
7120     case UCOL_ATTRIBUTE_COUNT:
7121     default:
7122         *status = U_ILLEGAL_ARGUMENT_ERROR;
7123         break;
7124     }
7125     return UCOL_DEFAULT;
7126 }
7127 
7128 U_CAPI void U_EXPORT2
ucol_setStrength(UCollator * coll,UCollationStrength strength)7129 ucol_setStrength(    UCollator                *coll,
7130             UCollationStrength        strength)
7131 {
7132     UErrorCode status = U_ZERO_ERROR;
7133     ucol_setAttribute(coll, UCOL_STRENGTH, strength, &status);
7134 }
7135 
7136 U_CAPI UCollationStrength U_EXPORT2
ucol_getStrength(const UCollator * coll)7137 ucol_getStrength(const UCollator *coll)
7138 {
7139     UErrorCode status = U_ZERO_ERROR;
7140     return ucol_getAttribute(coll, UCOL_STRENGTH, &status);
7141 }
7142 
7143 U_INTERNAL int32_t U_EXPORT2
ucol_getReorderCodes(const UCollator * coll,int32_t * dest,int32_t destCapacity,UErrorCode * pErrorCode)7144 ucol_getReorderCodes(const UCollator *coll,
7145                     int32_t *dest,
7146                     int32_t destCapacity,
7147                     UErrorCode *pErrorCode) {
7148     if (U_FAILURE(*pErrorCode)) {
7149         return 0;
7150     }
7151 
7152     if (destCapacity < 0 || (destCapacity > 0 && dest == NULL)) {
7153         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
7154         return 0;
7155     }
7156 
7157     if (coll->reorderCodesLength > destCapacity) {
7158         *pErrorCode = U_BUFFER_OVERFLOW_ERROR;
7159         return coll->reorderCodesLength;
7160     }
7161     for (int32_t i = 0; i < coll->reorderCodesLength; i++) {
7162         dest[i] = coll->reorderCodes[i];
7163     }
7164     return coll->reorderCodesLength;
7165 }
7166 
7167 U_INTERNAL void U_EXPORT2
ucol_setReorderCodes(UCollator * coll,const int32_t * reorderCodes,int32_t reorderCodesLength,UErrorCode * pErrorCode)7168 ucol_setReorderCodes(UCollator *coll,
7169                     const int32_t *reorderCodes,
7170                     int32_t reorderCodesLength,
7171                     UErrorCode *pErrorCode) {
7172     if (U_FAILURE(*pErrorCode)) {
7173         return;
7174     }
7175 
7176     if (reorderCodesLength < 0 || (reorderCodesLength > 0 && reorderCodes == NULL)) {
7177         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
7178         return;
7179     }
7180 
7181     uprv_free(coll->reorderCodes);
7182     coll->reorderCodes = NULL;
7183     coll->reorderCodesLength = 0;
7184     if (reorderCodesLength == 0) {
7185         uprv_free(coll->leadBytePermutationTable);
7186         coll->leadBytePermutationTable = NULL;
7187         return;
7188     }
7189     coll->reorderCodes = (int32_t*) uprv_malloc(reorderCodesLength * sizeof(int32_t));
7190     if (coll->reorderCodes == NULL) {
7191         *pErrorCode = U_MEMORY_ALLOCATION_ERROR;
7192         return;
7193     }
7194     for (int32_t i = 0; i < reorderCodesLength; i++) {
7195         coll->reorderCodes[i] = reorderCodes[i];
7196     }
7197     coll->reorderCodesLength = reorderCodesLength;
7198     ucol_buildPermutationTable(coll, pErrorCode);
7199     if (U_FAILURE(*pErrorCode)) {
7200         uprv_free(coll->reorderCodes);
7201         coll->reorderCodes = NULL;
7202         coll->reorderCodesLength = 0;
7203     }
7204 }
7205 
7206 
7207 /****************************************************************************/
7208 /* Following are misc functions                                             */
7209 /* there are new APIs and some compatibility APIs                           */
7210 /****************************************************************************/
7211 
7212 U_CAPI void U_EXPORT2
ucol_getVersion(const UCollator * coll,UVersionInfo versionInfo)7213 ucol_getVersion(const UCollator* coll,
7214                 UVersionInfo versionInfo)
7215 {
7216     /* RunTime version  */
7217     uint8_t rtVersion = UCOL_RUNTIME_VERSION;
7218     /* Builder version*/
7219     uint8_t bdVersion = coll->image->version[0];
7220 
7221     /* Charset Version. Need to get the version from cnv files
7222      * makeconv should populate cnv files with version and
7223      * an api has to be provided in ucnv.h to obtain this version
7224      */
7225     uint8_t csVersion = 0;
7226 
7227     /* combine the version info */
7228     uint16_t cmbVersion = (uint16_t)((rtVersion<<11) | (bdVersion<<6) | (csVersion));
7229 
7230     /* Tailoring rules */
7231     versionInfo[0] = (uint8_t)(cmbVersion>>8);
7232     versionInfo[1] = (uint8_t)cmbVersion;
7233     versionInfo[2] = coll->image->version[1];
7234     if(coll->UCA) {
7235         /* Include the minor number when getting the UCA version. (major & 1f) << 3 | (minor & 7) */
7236         versionInfo[3] = (coll->UCA->image->UCAVersion[0] & 0x1f) << 3 | (coll->UCA->image->UCAVersion[1] & 0x07);
7237     } else {
7238         versionInfo[3] = 0;
7239     }
7240 }
7241 
7242 
7243 /* This internal API checks whether a character is tailored or not */
7244 U_CAPI UBool  U_EXPORT2
ucol_isTailored(const UCollator * coll,const UChar u,UErrorCode * status)7245 ucol_isTailored(const UCollator *coll, const UChar u, UErrorCode *status) {
7246     if(U_FAILURE(*status) || coll == NULL || coll == coll->UCA) {
7247         return FALSE;
7248     }
7249 
7250     uint32_t CE = UCOL_NOT_FOUND;
7251     const UChar *ContractionStart = NULL;
7252     if(u < 0x100) { /* latin-1 */
7253         CE = coll->latinOneMapping[u];
7254         if(coll->UCA && CE == coll->UCA->latinOneMapping[u]) {
7255             return FALSE;
7256         }
7257     } else { /* regular */
7258         CE = UTRIE_GET32_FROM_LEAD(&coll->mapping, u);
7259     }
7260 
7261     if(isContraction(CE)) {
7262         ContractionStart = (UChar *)coll->image+getContractOffset(CE);
7263         CE = *(coll->contractionCEs + (ContractionStart- coll->contractionIndex));
7264     }
7265 
7266     return (UBool)(CE != UCOL_NOT_FOUND);
7267 }
7268 
7269 
7270 /****************************************************************************/
7271 /* Following are the string compare functions                               */
7272 /*                                                                          */
7273 /****************************************************************************/
7274 
7275 
7276 /*  ucol_checkIdent    internal function.  Does byte level string compare.   */
7277 /*                     Used by strcoll if strength == identical and strings  */
7278 /*                     are otherwise equal.                                  */
7279 /*                                                                           */
7280 /*                     Comparison must be done on NFD normalized strings.    */
7281 /*                     FCD is not good enough.                               */
7282 
7283 static
ucol_checkIdent(collIterate * sColl,collIterate * tColl,UBool normalize,UErrorCode * status)7284 UCollationResult    ucol_checkIdent(collIterate *sColl, collIterate *tColl, UBool normalize, UErrorCode *status)
7285 {
7286     // When we arrive here, we can have normal strings or UCharIterators. Currently they are both
7287     // of same type, but that doesn't really mean that it will stay that way.
7288     int32_t            comparison;
7289 
7290     if (sColl->flags & UCOL_USE_ITERATOR) {
7291         // The division for the array length may truncate the array size to
7292         // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
7293         // for all platforms anyway.
7294         UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
7295         UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
7296         UNormIterator *sNIt = NULL, *tNIt = NULL;
7297         sNIt = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), status);
7298         tNIt = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), status);
7299         sColl->iterator->move(sColl->iterator, 0, UITER_START);
7300         tColl->iterator->move(tColl->iterator, 0, UITER_START);
7301         UCharIterator *sIt = unorm_setIter(sNIt, sColl->iterator, UNORM_NFD, status);
7302         UCharIterator *tIt = unorm_setIter(tNIt, tColl->iterator, UNORM_NFD, status);
7303         comparison = u_strCompareIter(sIt, tIt, TRUE);
7304         unorm_closeIter(sNIt);
7305         unorm_closeIter(tNIt);
7306     } else {
7307         int32_t sLen      = (sColl->flags & UCOL_ITER_HASLEN) ? (int32_t)(sColl->endp - sColl->string) : -1;
7308         const UChar *sBuf = sColl->string;
7309         int32_t tLen      = (tColl->flags & UCOL_ITER_HASLEN) ? (int32_t)(tColl->endp - tColl->string) : -1;
7310         const UChar *tBuf = tColl->string;
7311 
7312         if (normalize) {
7313             *status = U_ZERO_ERROR;
7314             // Note: We could use Normalizer::compare() or similar, but for short strings
7315             // which may not be in FCD it might be faster to just NFD them.
7316             // Note: spanQuickCheckYes() + normalizeSecondAndAppend() rather than
7317             // NFD'ing immediately might be faster for long strings,
7318             // but string comparison is usually done on relatively short strings.
7319             sColl->nfd->normalize(UnicodeString((sColl->flags & UCOL_ITER_HASLEN) == 0, sBuf, sLen),
7320                                   sColl->writableBuffer,
7321                                   *status);
7322             tColl->nfd->normalize(UnicodeString((tColl->flags & UCOL_ITER_HASLEN) == 0, tBuf, tLen),
7323                                   tColl->writableBuffer,
7324                                   *status);
7325             if(U_FAILURE(*status)) {
7326                 return UCOL_LESS;
7327             }
7328             comparison = sColl->writableBuffer.compareCodePointOrder(tColl->writableBuffer);
7329         } else {
7330             comparison = u_strCompare(sBuf, sLen, tBuf, tLen, TRUE);
7331         }
7332     }
7333 
7334     if (comparison < 0) {
7335         return UCOL_LESS;
7336     } else if (comparison == 0) {
7337         return UCOL_EQUAL;
7338     } else /* comparison > 0 */ {
7339         return UCOL_GREATER;
7340     }
7341 }
7342 
/*  CEBuf - a growable buffer of collation elements, together with the  */
/*          inline helpers below that save CEs within ucol_strcoll.     */

/* Number of CEs that fit into the embedded array. */
#define UCOL_CEBUF_SIZE 512

typedef struct ucol_CEBuf {
    uint32_t    *buf;                           /* start of the storage currently in use         */
    uint32_t    *endp;                          /* one past the last slot of that storage        */
    uint32_t    *pos;                           /* next slot to be written                       */
    uint32_t     localArray[UCOL_CEBUF_SIZE];   /* embedded storage that buf points to initially */
} ucol_CEBuf;
7353 
7354 
7355 static
UCOL_INIT_CEBUF(ucol_CEBuf * b)7356 inline void UCOL_INIT_CEBUF(ucol_CEBuf *b) {
7357     (b)->buf = (b)->pos = (b)->localArray;
7358     (b)->endp = (b)->buf + UCOL_CEBUF_SIZE;
7359 }
7360 
7361 static
ucol_CEBuf_Expand(ucol_CEBuf * b,collIterate * ci,UErrorCode * status)7362 void ucol_CEBuf_Expand(ucol_CEBuf *b, collIterate *ci, UErrorCode *status) {
7363     uint32_t  oldSize;
7364     uint32_t  newSize;
7365     uint32_t  *newBuf;
7366 
7367     ci->flags |= UCOL_ITER_ALLOCATED;
7368     oldSize = (uint32_t)(b->pos - b->buf);
7369     newSize = oldSize * 2;
7370     newBuf = (uint32_t *)uprv_malloc(newSize * sizeof(uint32_t));
7371     if(newBuf == NULL) {
7372         *status = U_MEMORY_ALLOCATION_ERROR;
7373     }
7374     else {
7375         uprv_memcpy(newBuf, b->buf, oldSize * sizeof(uint32_t));
7376         if (b->buf != b->localArray) {
7377             uprv_free(b->buf);
7378         }
7379         b->buf = newBuf;
7380         b->endp = b->buf + newSize;
7381         b->pos  = b->buf + oldSize;
7382     }
7383 }
7384 
7385 static
UCOL_CEBUF_PUT(ucol_CEBuf * b,uint32_t ce,collIterate * ci,UErrorCode * status)7386 inline void UCOL_CEBUF_PUT(ucol_CEBuf *b, uint32_t ce, collIterate *ci, UErrorCode *status) {
7387     if (b->pos == b->endp) {
7388         ucol_CEBuf_Expand(b, ci, status);
7389     }
7390     if (U_SUCCESS(*status)) {
7391         *(b)->pos++ = ce;
7392     }
7393 }
7394 
7395 /* This is a trick string compare function that goes in and uses sortkeys to compare */
7396 /* It is used when compare gets in trouble and needs to bail out                     */
ucol_compareUsingSortKeys(collIterate * sColl,collIterate * tColl,UErrorCode * status)7397 static UCollationResult ucol_compareUsingSortKeys(collIterate *sColl,
7398                                                   collIterate *tColl,
7399                                                   UErrorCode *status)
7400 {
7401     uint8_t sourceKey[UCOL_MAX_BUFFER], targetKey[UCOL_MAX_BUFFER];
7402     uint8_t *sourceKeyP = sourceKey;
7403     uint8_t *targetKeyP = targetKey;
7404     int32_t sourceKeyLen = UCOL_MAX_BUFFER, targetKeyLen = UCOL_MAX_BUFFER;
7405     const UCollator *coll = sColl->coll;
7406     const UChar *source = NULL;
7407     const UChar *target = NULL;
7408     int32_t result = UCOL_EQUAL;
7409     UnicodeString sourceString, targetString;
7410     int32_t sourceLength;
7411     int32_t targetLength;
7412 
7413     if(sColl->flags & UCOL_USE_ITERATOR) {
7414         sColl->iterator->move(sColl->iterator, 0, UITER_START);
7415         tColl->iterator->move(tColl->iterator, 0, UITER_START);
7416         UChar32 c;
7417         while((c=sColl->iterator->next(sColl->iterator))>=0) {
7418             sourceString.append((UChar)c);
7419         }
7420         while((c=tColl->iterator->next(tColl->iterator))>=0) {
7421             targetString.append((UChar)c);
7422         }
7423         source = sourceString.getBuffer();
7424         sourceLength = sourceString.length();
7425         target = targetString.getBuffer();
7426         targetLength = targetString.length();
7427     } else { // no iterators
7428         sourceLength = (sColl->flags&UCOL_ITER_HASLEN)?(int32_t)(sColl->endp-sColl->string):-1;
7429         targetLength = (tColl->flags&UCOL_ITER_HASLEN)?(int32_t)(tColl->endp-tColl->string):-1;
7430         source = sColl->string;
7431         target = tColl->string;
7432     }
7433 
7434 
7435 
7436     sourceKeyLen = ucol_getSortKey(coll, source, sourceLength, sourceKeyP, sourceKeyLen);
7437     if(sourceKeyLen > UCOL_MAX_BUFFER) {
7438         sourceKeyP = (uint8_t*)uprv_malloc(sourceKeyLen*sizeof(uint8_t));
7439         if(sourceKeyP == NULL) {
7440             *status = U_MEMORY_ALLOCATION_ERROR;
7441             goto cleanup_and_do_compare;
7442         }
7443         sourceKeyLen = ucol_getSortKey(coll, source, sourceLength, sourceKeyP, sourceKeyLen);
7444     }
7445 
7446     targetKeyLen = ucol_getSortKey(coll, target, targetLength, targetKeyP, targetKeyLen);
7447     if(targetKeyLen > UCOL_MAX_BUFFER) {
7448         targetKeyP = (uint8_t*)uprv_malloc(targetKeyLen*sizeof(uint8_t));
7449         if(targetKeyP == NULL) {
7450             *status = U_MEMORY_ALLOCATION_ERROR;
7451             goto cleanup_and_do_compare;
7452         }
7453         targetKeyLen = ucol_getSortKey(coll, target, targetLength, targetKeyP, targetKeyLen);
7454     }
7455 
7456     result = uprv_strcmp((const char*)sourceKeyP, (const char*)targetKeyP);
7457 
7458 cleanup_and_do_compare:
7459     if(sourceKeyP != NULL && sourceKeyP != sourceKey) {
7460         uprv_free(sourceKeyP);
7461     }
7462 
7463     if(targetKeyP != NULL && targetKeyP != targetKey) {
7464         uprv_free(targetKeyP);
7465     }
7466 
7467     if(result<0) {
7468         return UCOL_LESS;
7469     } else if(result>0) {
7470         return UCOL_GREATER;
7471     } else {
7472         return UCOL_EQUAL;
7473     }
7474 }
7475 
7476 
7477 static UCollationResult
ucol_strcollRegular(collIterate * sColl,collIterate * tColl,UErrorCode * status)7478 ucol_strcollRegular(collIterate *sColl, collIterate *tColl, UErrorCode *status)
7479 {
7480     U_ALIGN_CODE(16);
7481 
7482     const UCollator *coll = sColl->coll;
7483 
7484 
7485     // setting up the collator parameters
7486     UColAttributeValue strength = coll->strength;
7487     UBool initialCheckSecTer = (strength  >= UCOL_SECONDARY);
7488 
7489     UBool checkSecTer = initialCheckSecTer;
7490     UBool checkTertiary = (strength  >= UCOL_TERTIARY);
7491     UBool checkQuad = (strength  >= UCOL_QUATERNARY);
7492     UBool checkIdent = (strength == UCOL_IDENTICAL);
7493     UBool checkCase = (coll->caseLevel == UCOL_ON);
7494     UBool isFrenchSec = (coll->frenchCollation == UCOL_ON) && checkSecTer;
7495     UBool shifted = (coll->alternateHandling == UCOL_SHIFTED);
7496     UBool qShifted = shifted && checkQuad;
7497     UBool doHiragana = (coll->hiraganaQ == UCOL_ON) && checkQuad;
7498 
7499     if(doHiragana && shifted) {
7500         return (ucol_compareUsingSortKeys(sColl, tColl, status));
7501     }
7502     uint8_t caseSwitch = coll->caseSwitch;
7503     uint8_t tertiaryMask = coll->tertiaryMask;
7504 
7505     // This is the lowest primary value that will not be ignored if shifted
7506     uint32_t LVT = (shifted)?(coll->variableTopValue<<16):0;
7507 
7508     UCollationResult result = UCOL_EQUAL;
7509     UCollationResult hirResult = UCOL_EQUAL;
7510 
7511     // Preparing the CE buffers. They will be filled during the primary phase
7512     ucol_CEBuf   sCEs;
7513     ucol_CEBuf   tCEs;
7514     UCOL_INIT_CEBUF(&sCEs);
7515     UCOL_INIT_CEBUF(&tCEs);
7516 
7517     uint32_t secS = 0, secT = 0;
7518     uint32_t sOrder=0, tOrder=0;
7519 
7520     // Non shifted primary processing is quite simple
7521     if(!shifted) {
7522         for(;;) {
7523 
7524             // We fetch CEs until we hit a non ignorable primary or end.
7525             do {
7526                 // We get the next CE
7527                 sOrder = ucol_IGetNextCE(coll, sColl, status);
7528                 // Stuff it in the buffer
7529                 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
7530                 // And keep just the primary part.
7531                 sOrder &= UCOL_PRIMARYMASK;
7532             } while(sOrder == 0);
7533 
7534             // see the comments on the above block
7535             do {
7536                 tOrder = ucol_IGetNextCE(coll, tColl, status);
7537                 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
7538                 tOrder &= UCOL_PRIMARYMASK;
7539             } while(tOrder == 0);
7540 
7541             // if both primaries are the same
7542             if(sOrder == tOrder) {
7543                 // and there are no more CEs, we advance to the next level
7544                 if(sOrder == UCOL_NO_MORE_CES_PRIMARY) {
7545                     break;
7546                 }
7547                 if(doHiragana && hirResult == UCOL_EQUAL) {
7548                     if((sColl->flags & UCOL_WAS_HIRAGANA) != (tColl->flags & UCOL_WAS_HIRAGANA)) {
7549                         hirResult = ((sColl->flags & UCOL_WAS_HIRAGANA) > (tColl->flags & UCOL_WAS_HIRAGANA))
7550                             ? UCOL_LESS:UCOL_GREATER;
7551                     }
7552                 }
7553             } else {
7554                 // only need to check one for continuation
7555                 // if one is then the other must be or the preceding CE would be a prefix of the other
7556                 if (coll->leadBytePermutationTable != NULL && !isContinuation(sOrder)) {
7557                     sOrder = (coll->leadBytePermutationTable[sOrder>>24] << 24) | (sOrder & 0x00FFFFFF);
7558                     tOrder = (coll->leadBytePermutationTable[tOrder>>24] << 24) | (tOrder & 0x00FFFFFF);
7559                 }
7560                 // if two primaries are different, we are done
7561                 result = (sOrder < tOrder) ?  UCOL_LESS: UCOL_GREATER;
7562                 goto commonReturn;
7563             }
7564         } // no primary difference... do the rest from the buffers
7565     } else { // shifted - do a slightly more complicated processing :)
7566         for(;;) {
7567             UBool sInShifted = FALSE;
7568             UBool tInShifted = FALSE;
7569             // This version of code can be refactored. However, it seems easier to understand this way.
7570             // Source loop. Sam as the target loop.
7571             for(;;) {
7572                 sOrder = ucol_IGetNextCE(coll, sColl, status);
7573                 if(sOrder == UCOL_NO_MORE_CES) {
7574                     UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
7575                     break;
7576                 } else if(sOrder == 0 || (sInShifted && (sOrder & UCOL_PRIMARYMASK) == 0)) {
7577                     /* UCA amendment - ignore ignorables that follow shifted code points */
7578                     continue;
7579                 } else if(isContinuation(sOrder)) {
7580                     if((sOrder & UCOL_PRIMARYMASK) > 0) { /* There is primary value */
7581                         if(sInShifted) {
7582                             sOrder = (sOrder & UCOL_PRIMARYMASK) | 0xC0; /* preserve interesting continuation */
7583                             UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
7584                             continue;
7585                         } else {
7586                             UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
7587                             break;
7588                         }
7589                     } else { /* Just lower level values */
7590                         if(sInShifted) {
7591                             continue;
7592                         } else {
7593                             UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
7594                             continue;
7595                         }
7596                     }
7597                 } else { /* regular */
7598                     if(coll->leadBytePermutationTable != NULL){
7599                         sOrder = (coll->leadBytePermutationTable[sOrder>>24] << 24) | (sOrder & 0x00FFFFFF);
7600                     }
7601                     if((sOrder & UCOL_PRIMARYMASK) > LVT) {
7602                         UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
7603                         break;
7604                     } else {
7605                         if((sOrder & UCOL_PRIMARYMASK) > 0) {
7606                             sInShifted = TRUE;
7607                             sOrder &= UCOL_PRIMARYMASK;
7608                             UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
7609                             continue;
7610                         } else {
7611                             UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
7612                             sInShifted = FALSE;
7613                             continue;
7614                         }
7615                     }
7616                 }
7617             }
7618             sOrder &= UCOL_PRIMARYMASK;
7619             sInShifted = FALSE;
7620 
7621             for(;;) {
7622                 tOrder = ucol_IGetNextCE(coll, tColl, status);
7623                 if(tOrder == UCOL_NO_MORE_CES) {
7624                     UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
7625                     break;
7626                 } else if(tOrder == 0 || (tInShifted && (tOrder & UCOL_PRIMARYMASK) == 0)) {
7627                     /* UCA amendment - ignore ignorables that follow shifted code points */
7628                     continue;
7629                 } else if(isContinuation(tOrder)) {
7630                     if((tOrder & UCOL_PRIMARYMASK) > 0) { /* There is primary value */
7631                         if(tInShifted) {
7632                             tOrder = (tOrder & UCOL_PRIMARYMASK) | 0xC0; /* preserve interesting continuation */
7633                             UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
7634                             continue;
7635                         } else {
7636                             UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
7637                             break;
7638                         }
7639                     } else { /* Just lower level values */
7640                         if(tInShifted) {
7641                             continue;
7642                         } else {
7643                             UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
7644                             continue;
7645                         }
7646                     }
7647                 } else { /* regular */
7648                     if(coll->leadBytePermutationTable != NULL){
7649                         tOrder = (coll->leadBytePermutationTable[tOrder>>24] << 24) | (tOrder & 0x00FFFFFF);
7650                     }
7651                     if((tOrder & UCOL_PRIMARYMASK) > LVT) {
7652                         UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
7653                         break;
7654                     } else {
7655                         if((tOrder & UCOL_PRIMARYMASK) > 0) {
7656                             tInShifted = TRUE;
7657                             tOrder &= UCOL_PRIMARYMASK;
7658                             UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
7659                             continue;
7660                         } else {
7661                             UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
7662                             tInShifted = FALSE;
7663                             continue;
7664                         }
7665                     }
7666                 }
7667             }
7668             tOrder &= UCOL_PRIMARYMASK;
7669             tInShifted = FALSE;
7670 
7671             if(sOrder == tOrder) {
7672                 /*
7673                 if(doHiragana && hirResult == UCOL_EQUAL) {
7674                 if((sColl.flags & UCOL_WAS_HIRAGANA) != (tColl.flags & UCOL_WAS_HIRAGANA)) {
7675                 hirResult = ((sColl.flags & UCOL_WAS_HIRAGANA) > (tColl.flags & UCOL_WAS_HIRAGANA))
7676                 ? UCOL_LESS:UCOL_GREATER;
7677                 }
7678                 }
7679                 */
7680                 if(sOrder == UCOL_NO_MORE_CES_PRIMARY) {
7681                     break;
7682                 } else {
7683                     sOrder = 0;
7684                     tOrder = 0;
7685                     continue;
7686                 }
7687             } else {
7688                 result = (sOrder < tOrder) ? UCOL_LESS : UCOL_GREATER;
7689                 goto commonReturn;
7690             }
7691         } /* no primary difference... do the rest from the buffers */
7692     }
7693 
7694     /* now, we're gonna reexamine collected CEs */
7695     uint32_t    *sCE;
7696     uint32_t    *tCE;
7697 
7698     /* This is the secondary level of comparison */
7699     if(checkSecTer) {
7700         if(!isFrenchSec) { /* normal */
7701             sCE = sCEs.buf;
7702             tCE = tCEs.buf;
7703             for(;;) {
7704                 while (secS == 0) {
7705                     secS = *(sCE++) & UCOL_SECONDARYMASK;
7706                 }
7707 
7708                 while(secT == 0) {
7709                     secT = *(tCE++) & UCOL_SECONDARYMASK;
7710                 }
7711 
7712                 if(secS == secT) {
7713                     if(secS == UCOL_NO_MORE_CES_SECONDARY) {
7714                         break;
7715                     } else {
7716                         secS = 0; secT = 0;
7717                         continue;
7718                     }
7719                 } else {
7720                     result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
7721                     goto commonReturn;
7722                 }
7723             }
7724         } else { /* do the French */
7725             uint32_t *sCESave = NULL;
7726             uint32_t *tCESave = NULL;
7727             sCE = sCEs.pos-2; /* this could also be sCEs-- if needs to be optimized */
7728             tCE = tCEs.pos-2;
7729             for(;;) {
7730                 while (secS == 0 && sCE >= sCEs.buf) {
7731                     if(sCESave == NULL) {
7732                         secS = *(sCE--);
7733                         if(isContinuation(secS)) {
7734                             while(isContinuation(secS = *(sCE--)))
7735                                 ;
7736                             /* after this, secS has the start of continuation, and sCEs points before that */
7737                             sCESave = sCE; /* we save it, so that we know where to come back AND that we need to go forward */
7738                             sCE+=2;  /* need to point to the first continuation CP */
7739                             /* However, now you can just continue doing stuff */
7740                         }
7741                     } else {
7742                         secS = *(sCE++);
7743                         if(!isContinuation(secS)) { /* This means we have finished with this cont */
7744                             sCE = sCESave;            /* reset the pointer to before continuation */
7745                             sCESave = NULL;
7746                             secS = 0;  /* Fetch a fresh CE before the continuation sequence. */
7747                             continue;
7748                         }
7749                     }
7750                     secS &= UCOL_SECONDARYMASK; /* remove the continuation bit */
7751                 }
7752 
7753                 while(secT == 0 && tCE >= tCEs.buf) {
7754                     if(tCESave == NULL) {
7755                         secT = *(tCE--);
7756                         if(isContinuation(secT)) {
7757                             while(isContinuation(secT = *(tCE--)))
7758                                 ;
7759                             /* after this, secS has the start of continuation, and sCEs points before that */
7760                             tCESave = tCE; /* we save it, so that we know where to come back AND that we need to go forward */
7761                             tCE+=2;  /* need to point to the first continuation CP */
7762                             /* However, now you can just continue doing stuff */
7763                         }
7764                     } else {
7765                         secT = *(tCE++);
7766                         if(!isContinuation(secT)) { /* This means we have finished with this cont */
7767                             tCE = tCESave;          /* reset the pointer to before continuation */
7768                             tCESave = NULL;
7769                             secT = 0;  /* Fetch a fresh CE before the continuation sequence. */
7770                             continue;
7771                         }
7772                     }
7773                     secT &= UCOL_SECONDARYMASK; /* remove the continuation bit */
7774                 }
7775 
7776                 if(secS == secT) {
7777                     if(secS == UCOL_NO_MORE_CES_SECONDARY || (sCE < sCEs.buf && tCE < tCEs.buf)) {
7778                         break;
7779                     } else {
7780                         secS = 0; secT = 0;
7781                         continue;
7782                     }
7783                 } else {
7784                     result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
7785                     goto commonReturn;
7786                 }
7787             }
7788         }
7789     }
7790 
7791     /* doing the case bit */
7792     if(checkCase) {
7793         sCE = sCEs.buf;
7794         tCE = tCEs.buf;
7795         for(;;) {
7796             while((secS & UCOL_REMOVE_CASE) == 0) {
7797                 if(!isContinuation(*sCE++)) {
7798                     secS =*(sCE-1);
7799                     if(((secS & UCOL_PRIMARYMASK) != 0) || strength > UCOL_PRIMARY) {
7800                         // primary ignorables should not be considered on the case level when the strength is primary
7801                         // otherwise, the CEs stop being well-formed
7802                         secS &= UCOL_TERT_CASE_MASK;
7803                         secS ^= caseSwitch;
7804                     } else {
7805                         secS = 0;
7806                     }
7807                 } else {
7808                     secS = 0;
7809                 }
7810             }
7811 
7812             while((secT & UCOL_REMOVE_CASE) == 0) {
7813                 if(!isContinuation(*tCE++)) {
7814                     secT = *(tCE-1);
7815                     if(((secT & UCOL_PRIMARYMASK) != 0) || strength > UCOL_PRIMARY) {
7816                         // primary ignorables should not be considered on the case level when the strength is primary
7817                         // otherwise, the CEs stop being well-formed
7818                         secT &= UCOL_TERT_CASE_MASK;
7819                         secT ^= caseSwitch;
7820                     } else {
7821                         secT = 0;
7822                     }
7823                 } else {
7824                     secT = 0;
7825                 }
7826             }
7827 
7828             if((secS & UCOL_CASE_BIT_MASK) < (secT & UCOL_CASE_BIT_MASK)) {
7829                 result = UCOL_LESS;
7830                 goto commonReturn;
7831             } else if((secS & UCOL_CASE_BIT_MASK) > (secT & UCOL_CASE_BIT_MASK)) {
7832                 result = UCOL_GREATER;
7833                 goto commonReturn;
7834             }
7835 
7836             if((secS & UCOL_REMOVE_CASE) == UCOL_NO_MORE_CES_TERTIARY || (secT & UCOL_REMOVE_CASE) == UCOL_NO_MORE_CES_TERTIARY ) {
7837                 break;
7838             } else {
7839                 secS = 0;
7840                 secT = 0;
7841             }
7842         }
7843     }
7844 
7845     /* Tertiary level */
7846     if(checkTertiary) {
7847         secS = 0;
7848         secT = 0;
7849         sCE = sCEs.buf;
7850         tCE = tCEs.buf;
7851         for(;;) {
7852             while((secS & UCOL_REMOVE_CASE) == 0) {
7853                 secS = *(sCE++) & tertiaryMask;
7854                 if(!isContinuation(secS)) {
7855                     secS ^= caseSwitch;
7856                 } else {
7857                     secS &= UCOL_REMOVE_CASE;
7858                 }
7859             }
7860 
7861             while((secT & UCOL_REMOVE_CASE)  == 0) {
7862                 secT = *(tCE++) & tertiaryMask;
7863                 if(!isContinuation(secT)) {
7864                     secT ^= caseSwitch;
7865                 } else {
7866                     secT &= UCOL_REMOVE_CASE;
7867                 }
7868             }
7869 
7870             if(secS == secT) {
7871                 if((secS & UCOL_REMOVE_CASE) == 1) {
7872                     break;
7873                 } else {
7874                     secS = 0; secT = 0;
7875                     continue;
7876                 }
7877             } else {
7878                 result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
7879                 goto commonReturn;
7880             }
7881         }
7882     }
7883 
7884 
7885     if(qShifted /*checkQuad*/) {
7886         UBool sInShifted = TRUE;
7887         UBool tInShifted = TRUE;
7888         secS = 0;
7889         secT = 0;
7890         sCE = sCEs.buf;
7891         tCE = tCEs.buf;
7892         for(;;) {
7893             while((secS == 0 && secS != UCOL_NO_MORE_CES) || (isContinuation(secS) && !sInShifted)) {
7894                 secS = *(sCE++);
7895                 if(isContinuation(secS)) {
7896                     if(!sInShifted) {
7897                         continue;
7898                     }
7899                 } else if(secS > LVT || (secS & UCOL_PRIMARYMASK) == 0) { /* non continuation */
7900                     secS = UCOL_PRIMARYMASK;
7901                     sInShifted = FALSE;
7902                 } else {
7903                     sInShifted = TRUE;
7904                 }
7905             }
7906             secS &= UCOL_PRIMARYMASK;
7907 
7908 
7909             while((secT == 0 && secT != UCOL_NO_MORE_CES) || (isContinuation(secT) && !tInShifted)) {
7910                 secT = *(tCE++);
7911                 if(isContinuation(secT)) {
7912                     if(!tInShifted) {
7913                         continue;
7914                     }
7915                 } else if(secT > LVT || (secT & UCOL_PRIMARYMASK) == 0) {
7916                     secT = UCOL_PRIMARYMASK;
7917                     tInShifted = FALSE;
7918                 } else {
7919                     tInShifted = TRUE;
7920                 }
7921             }
7922             secT &= UCOL_PRIMARYMASK;
7923 
7924             if(secS == secT) {
7925                 if(secS == UCOL_NO_MORE_CES_PRIMARY) {
7926                     break;
7927                 } else {
7928                     secS = 0; secT = 0;
7929                     continue;
7930                 }
7931             } else {
7932                 result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
7933                 goto commonReturn;
7934             }
7935         }
7936     } else if(doHiragana && hirResult != UCOL_EQUAL) {
7937         // If we're fine on quaternaries, we might be different
7938         // on Hiragana. This, however, might fail us in shifted.
7939         result = hirResult;
7940         goto commonReturn;
7941     }
7942 
7943     /*  For IDENTICAL comparisons, we use a bitwise character comparison */
7944     /*  as a tiebreaker if all else is equal.                                */
7945     /*  Getting here  should be quite rare - strings are not identical -     */
7946     /*     that is checked first, but compared == through all other checks.  */
7947     if(checkIdent)
7948     {
7949         //result = ucol_checkIdent(&sColl, &tColl, coll->normalizationMode == UCOL_ON);
7950         result = ucol_checkIdent(sColl, tColl, TRUE, status);
7951     }
7952 
7953 commonReturn:
7954     if ((sColl->flags | tColl->flags) & UCOL_ITER_ALLOCATED) {
7955         if (sCEs.buf != sCEs.localArray ) {
7956             uprv_free(sCEs.buf);
7957         }
7958         if (tCEs.buf != tCEs.localArray ) {
7959             uprv_free(tCEs.buf);
7960         }
7961     }
7962 
7963     return result;
7964 }
7965 
7966 static UCollationResult
ucol_strcollRegular(const UCollator * coll,const UChar * source,int32_t sourceLength,const UChar * target,int32_t targetLength,UErrorCode * status)7967 ucol_strcollRegular(const UCollator *coll,
7968                     const UChar *source, int32_t sourceLength,
7969                     const UChar *target, int32_t targetLength,
7970                     UErrorCode *status) {
7971     collIterate sColl, tColl;
7972     // Preparing the context objects for iterating over strings
7973     IInit_collIterate(coll, source, sourceLength, &sColl, status);
7974     IInit_collIterate(coll, target, targetLength, &tColl, status);
7975     if(U_FAILURE(*status)) {
7976         return UCOL_LESS;
7977     }
7978     return ucol_strcollRegular(&sColl, &tColl, status);
7979 }
7980 
7981 static inline uint32_t
ucol_getLatinOneContraction(const UCollator * coll,int32_t strength,uint32_t CE,const UChar * s,int32_t * index,int32_t len)7982 ucol_getLatinOneContraction(const UCollator *coll, int32_t strength,
7983                           uint32_t CE, const UChar *s, int32_t *index, int32_t len)
7984 {
7985     const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE&0xFFF);
7986     int32_t latinOneOffset = (CE & 0x00FFF000) >> 12;
7987     int32_t offset = 1;
7988     UChar schar = 0, tchar = 0;
7989 
7990     for(;;) {
7991         if(len == -1) {
7992             if(s[*index] == 0) { // end of string
7993                 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
7994             } else {
7995                 schar = s[*index];
7996             }
7997         } else {
7998             if(*index == len) {
7999                 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
8000             } else {
8001                 schar = s[*index];
8002             }
8003         }
8004 
8005         while(schar > (tchar = *(UCharOffset+offset))) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
8006             offset++;
8007         }
8008 
8009         if (schar == tchar) {
8010             (*index)++;
8011             return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset+offset]);
8012         }
8013         else
8014         {
8015             if(schar & 0xFF00 /*> UCOL_ENDOFLATIN1RANGE*/) {
8016                 return UCOL_BAIL_OUT_CE;
8017             }
8018             // skip completely ignorables
8019             uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, schar);
8020             if(isZeroCE == 0) { // we have to ignore completely ignorables
8021                 (*index)++;
8022                 continue;
8023             }
8024 
8025             return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
8026         }
8027     }
8028 }
8029 
8030 
8031 /**
8032  * This is a fast strcoll, geared towards text in Latin-1.
8033  * It supports contractions of size two, French secondaries
8034  * and case switching. You can use it with strengths primary
8035  * to tertiary. It does not support shifted and case level.
8036  * It relies on the table built by setupLatin1Table. If it
8037  * doesn't understand something, it will go to the regular
8038  * strcoll.
8039  */
8040 static UCollationResult
ucol_strcollUseLatin1(const UCollator * coll,const UChar * source,int32_t sLen,const UChar * target,int32_t tLen,UErrorCode * status)8041 ucol_strcollUseLatin1( const UCollator    *coll,
8042               const UChar        *source,
8043               int32_t            sLen,
8044               const UChar        *target,
8045               int32_t            tLen,
8046               UErrorCode *status)
8047 {
8048     U_ALIGN_CODE(16);
8049     int32_t strength = coll->strength;
8050 
8051     int32_t sIndex = 0, tIndex = 0;
8052     UChar sChar = 0, tChar = 0;
8053     uint32_t sOrder=0, tOrder=0;
8054 
8055     UBool endOfSource = FALSE;
8056 
8057     uint32_t *elements = coll->latinOneCEs;
8058 
8059     UBool haveContractions = FALSE; // if we have contractions in our string
8060                                     // we cannot do French secondary
8061 
8062     // Do the primary level
8063     for(;;) {
8064         while(sOrder==0) { // this loop skips primary ignorables
8065             // sOrder=getNextlatinOneCE(source);
8066             if(sLen==-1) {   // handling zero terminated strings
8067                 sChar=source[sIndex++];
8068                 if(sChar==0) {
8069                     endOfSource = TRUE;
8070                     break;
8071                 }
8072             } else {        // handling strings with known length
8073                 if(sIndex==sLen) {
8074                     endOfSource = TRUE;
8075                     break;
8076                 }
8077                 sChar=source[sIndex++];
8078             }
8079             if(sChar&0xFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32)
8080                 //fprintf(stderr, "R");
8081                 return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
8082             }
8083             sOrder = elements[sChar];
8084             if(sOrder >= UCOL_NOT_FOUND) { // if we got a special
8085                 // specials can basically be either contractions or bail-out signs. If we get anything
8086                 // else, we'll bail out anywasy
8087                 if(getCETag(sOrder) == CONTRACTION_TAG) {
8088                     sOrder = ucol_getLatinOneContraction(coll, UCOL_PRIMARY, sOrder, source, &sIndex, sLen);
8089                     haveContractions = TRUE; // if there are contractions, we cannot do French secondary
8090                     // However, if there are contractions in the table, but we always use just one char,
8091                     // we might be able to do French. This should be checked out.
8092                 }
8093                 if(sOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) {
8094                     //fprintf(stderr, "S");
8095                     return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
8096                 }
8097             }
8098         }
8099 
8100         while(tOrder==0) {  // this loop skips primary ignorables
8101             // tOrder=getNextlatinOneCE(target);
8102             if(tLen==-1) {    // handling zero terminated strings
8103                 tChar=target[tIndex++];
8104                 if(tChar==0) {
8105                     if(endOfSource) { // this is different than source loop,
8106                         // as we already know that source loop is done here,
8107                         // so we can either finish the primary loop if both
8108                         // strings are done or anounce the result if only
8109                         // target is done. Same below.
8110                         goto endOfPrimLoop;
8111                     } else {
8112                         return UCOL_GREATER;
8113                     }
8114                 }
8115             } else {          // handling strings with known length
8116                 if(tIndex==tLen) {
8117                     if(endOfSource) {
8118                         goto endOfPrimLoop;
8119                     } else {
8120                         return UCOL_GREATER;
8121                     }
8122                 }
8123                 tChar=target[tIndex++];
8124             }
8125             if(tChar&0xFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32)
8126                 //fprintf(stderr, "R");
8127                 return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
8128             }
8129             tOrder = elements[tChar];
8130             if(tOrder >= UCOL_NOT_FOUND) {
8131                 // Handling specials, see the comments for source
8132                 if(getCETag(tOrder) == CONTRACTION_TAG) {
8133                     tOrder = ucol_getLatinOneContraction(coll, UCOL_PRIMARY, tOrder, target, &tIndex, tLen);
8134                     haveContractions = TRUE;
8135                 }
8136                 if(tOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) {
8137                     //fprintf(stderr, "S");
8138                     return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
8139                 }
8140             }
8141         }
8142         if(endOfSource) { // source is finished, but target is not, say the result.
8143             return UCOL_LESS;
8144         }
8145 
8146         if(sOrder == tOrder) { // if we have same CEs, we continue the loop
8147             sOrder = 0; tOrder = 0;
8148             continue;
8149         } else {
8150             // compare current top bytes
8151             if(((sOrder^tOrder)&0xFF000000)!=0) {
8152                 // top bytes differ, return difference
8153                 if(sOrder < tOrder) {
8154                     return UCOL_LESS;
8155                 } else if(sOrder > tOrder) {
8156                     return UCOL_GREATER;
8157                 }
8158                 // instead of return (int32_t)(sOrder>>24)-(int32_t)(tOrder>>24);
8159                 // since we must return enum value
8160             }
8161 
8162             // top bytes match, continue with following bytes
8163             sOrder<<=8;
8164             tOrder<<=8;
8165         }
8166     }
8167 
8168 endOfPrimLoop:
8169     // after primary loop, we definitely know the sizes of strings,
8170     // so we set it and use simpler loop for secondaries and tertiaries
8171     sLen = sIndex; tLen = tIndex;
8172     if(strength >= UCOL_SECONDARY) {
8173         // adjust the table beggining
8174         elements += coll->latinOneTableLen;
8175         endOfSource = FALSE;
8176 
8177         if(coll->frenchCollation == UCOL_OFF) { // non French
8178             // This loop is a simplified copy of primary loop
8179             // at this point we know that whole strings are latin-1, so we don't
8180             // check for that. We also know that we only have contractions as
8181             // specials.
8182             sIndex = 0; tIndex = 0;
8183             for(;;) {
8184                 while(sOrder==0) {
8185                     if(sIndex==sLen) {
8186                         endOfSource = TRUE;
8187                         break;
8188                     }
8189                     sChar=source[sIndex++];
8190                     sOrder = elements[sChar];
8191                     if(sOrder > UCOL_NOT_FOUND) {
8192                         sOrder = ucol_getLatinOneContraction(coll, UCOL_SECONDARY, sOrder, source, &sIndex, sLen);
8193                     }
8194                 }
8195 
8196                 while(tOrder==0) {
8197                     if(tIndex==tLen) {
8198                         if(endOfSource) {
8199                             goto endOfSecLoop;
8200                         } else {
8201                             return UCOL_GREATER;
8202                         }
8203                     }
8204                     tChar=target[tIndex++];
8205                     tOrder = elements[tChar];
8206                     if(tOrder > UCOL_NOT_FOUND) {
8207                         tOrder = ucol_getLatinOneContraction(coll, UCOL_SECONDARY, tOrder, target, &tIndex, tLen);
8208                     }
8209                 }
8210                 if(endOfSource) {
8211                     return UCOL_LESS;
8212                 }
8213 
8214                 if(sOrder == tOrder) {
8215                     sOrder = 0; tOrder = 0;
8216                     continue;
8217                 } else {
8218                     // see primary loop for comments on this
8219                     if(((sOrder^tOrder)&0xFF000000)!=0) {
8220                         if(sOrder < tOrder) {
8221                             return UCOL_LESS;
8222                         } else if(sOrder > tOrder) {
8223                             return UCOL_GREATER;
8224                         }
8225                     }
8226                     sOrder<<=8;
8227                     tOrder<<=8;
8228                 }
8229             }
8230         } else { // French
8231             if(haveContractions) { // if we have contractions, we have to bail out
8232                 // since we don't really know how to handle them here
8233                 return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
8234             }
8235             // For French, we go backwards
8236             sIndex = sLen; tIndex = tLen;
8237             for(;;) {
8238                 while(sOrder==0) {
8239                     if(sIndex==0) {
8240                         endOfSource = TRUE;
8241                         break;
8242                     }
8243                     sChar=source[--sIndex];
8244                     sOrder = elements[sChar];
8245                     // don't even look for contractions
8246                 }
8247 
8248                 while(tOrder==0) {
8249                     if(tIndex==0) {
8250                         if(endOfSource) {
8251                             goto endOfSecLoop;
8252                         } else {
8253                             return UCOL_GREATER;
8254                         }
8255                     }
8256                     tChar=target[--tIndex];
8257                     tOrder = elements[tChar];
8258                     // don't even look for contractions
8259                 }
8260                 if(endOfSource) {
8261                     return UCOL_LESS;
8262                 }
8263 
8264                 if(sOrder == tOrder) {
8265                     sOrder = 0; tOrder = 0;
8266                     continue;
8267                 } else {
8268                     // see the primary loop for comments
8269                     if(((sOrder^tOrder)&0xFF000000)!=0) {
8270                         if(sOrder < tOrder) {
8271                             return UCOL_LESS;
8272                         } else if(sOrder > tOrder) {
8273                             return UCOL_GREATER;
8274                         }
8275                     }
8276                     sOrder<<=8;
8277                     tOrder<<=8;
8278                 }
8279             }
8280         }
8281     }
8282 
8283 endOfSecLoop:
8284     if(strength >= UCOL_TERTIARY) {
8285         // tertiary loop is the same as secondary (except no French)
8286         elements += coll->latinOneTableLen;
8287         sIndex = 0; tIndex = 0;
8288         endOfSource = FALSE;
8289         for(;;) {
8290             while(sOrder==0) {
8291                 if(sIndex==sLen) {
8292                     endOfSource = TRUE;
8293                     break;
8294                 }
8295                 sChar=source[sIndex++];
8296                 sOrder = elements[sChar];
8297                 if(sOrder > UCOL_NOT_FOUND) {
8298                     sOrder = ucol_getLatinOneContraction(coll, UCOL_TERTIARY, sOrder, source, &sIndex, sLen);
8299                 }
8300             }
8301             while(tOrder==0) {
8302                 if(tIndex==tLen) {
8303                     if(endOfSource) {
8304                         return UCOL_EQUAL; // if both strings are at the end, they are equal
8305                     } else {
8306                         return UCOL_GREATER;
8307                     }
8308                 }
8309                 tChar=target[tIndex++];
8310                 tOrder = elements[tChar];
8311                 if(tOrder > UCOL_NOT_FOUND) {
8312                     tOrder = ucol_getLatinOneContraction(coll, UCOL_TERTIARY, tOrder, target, &tIndex, tLen);
8313                 }
8314             }
8315             if(endOfSource) {
8316                 return UCOL_LESS;
8317             }
8318             if(sOrder == tOrder) {
8319                 sOrder = 0; tOrder = 0;
8320                 continue;
8321             } else {
8322                 if(((sOrder^tOrder)&0xff000000)!=0) {
8323                     if(sOrder < tOrder) {
8324                         return UCOL_LESS;
8325                     } else if(sOrder > tOrder) {
8326                         return UCOL_GREATER;
8327                     }
8328                 }
8329                 sOrder<<=8;
8330                 tOrder<<=8;
8331             }
8332         }
8333     }
8334     return UCOL_EQUAL;
8335 }
8336 
8337 
8338 U_CAPI UCollationResult U_EXPORT2
ucol_strcollIter(const UCollator * coll,UCharIterator * sIter,UCharIterator * tIter,UErrorCode * status)8339 ucol_strcollIter( const UCollator    *coll,
8340                  UCharIterator *sIter,
8341                  UCharIterator *tIter,
8342                  UErrorCode         *status)
8343 {
8344     if(!status || U_FAILURE(*status)) {
8345         return UCOL_EQUAL;
8346     }
8347 
8348     UTRACE_ENTRY(UTRACE_UCOL_STRCOLLITER);
8349     UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, sIter=%p, tIter=%p", coll, sIter, tIter);
8350 
8351     if (sIter == tIter) {
8352         UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status)
8353         return UCOL_EQUAL;
8354     }
8355     if(sIter == NULL || tIter == NULL || coll == NULL) {
8356         *status = U_ILLEGAL_ARGUMENT_ERROR;
8357         UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status)
8358         return UCOL_EQUAL;
8359     }
8360 
8361     UCollationResult result = UCOL_EQUAL;
8362 
8363     // Preparing the context objects for iterating over strings
8364     collIterate sColl, tColl;
8365     IInit_collIterate(coll, NULL, -1, &sColl, status);
8366     IInit_collIterate(coll, NULL, -1, &tColl, status);
8367     if(U_FAILURE(*status)) {
8368         UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status)
8369         return UCOL_EQUAL;
8370     }
8371     // The division for the array length may truncate the array size to
8372     // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
8373     // for all platforms anyway.
8374     UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
8375     UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
8376     UNormIterator *sNormIter = NULL, *tNormIter = NULL;
8377 
8378     sColl.iterator = sIter;
8379     sColl.flags |= UCOL_USE_ITERATOR;
8380     tColl.flags |= UCOL_USE_ITERATOR;
8381     tColl.iterator = tIter;
8382 
8383     if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON) {
8384         sNormIter = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), status);
8385         sColl.iterator = unorm_setIter(sNormIter, sIter, UNORM_FCD, status);
8386         sColl.flags &= ~UCOL_ITER_NORM;
8387 
8388         tNormIter = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), status);
8389         tColl.iterator = unorm_setIter(tNormIter, tIter, UNORM_FCD, status);
8390         tColl.flags &= ~UCOL_ITER_NORM;
8391     }
8392 
8393     UChar32 sChar = U_SENTINEL, tChar = U_SENTINEL;
8394 
8395     while((sChar = sColl.iterator->next(sColl.iterator)) ==
8396         (tChar = tColl.iterator->next(tColl.iterator))) {
8397             if(sChar == U_SENTINEL) {
8398                 result = UCOL_EQUAL;
8399                 goto end_compare;
8400             }
8401     }
8402 
8403     if(sChar == U_SENTINEL) {
8404         tChar = tColl.iterator->previous(tColl.iterator);
8405     }
8406 
8407     if(tChar == U_SENTINEL) {
8408         sChar = sColl.iterator->previous(sColl.iterator);
8409     }
8410 
8411     sChar = sColl.iterator->previous(sColl.iterator);
8412     tChar = tColl.iterator->previous(tColl.iterator);
8413 
8414     if (ucol_unsafeCP((UChar)sChar, coll) || ucol_unsafeCP((UChar)tChar, coll))
8415     {
8416         // We are stopped in the middle of a contraction.
8417         // Scan backwards through the == part of the string looking for the start of the contraction.
8418         //   It doesn't matter which string we scan, since they are the same in this region.
8419         do
8420         {
8421             sChar = sColl.iterator->previous(sColl.iterator);
8422             tChar = tColl.iterator->previous(tColl.iterator);
8423         }
8424         while (sChar != U_SENTINEL && ucol_unsafeCP((UChar)sChar, coll));
8425     }
8426 
8427 
8428     if(U_SUCCESS(*status)) {
8429         result = ucol_strcollRegular(&sColl, &tColl, status);
8430     }
8431 
8432 end_compare:
8433     if(sNormIter || tNormIter) {
8434         unorm_closeIter(sNormIter);
8435         unorm_closeIter(tNormIter);
8436     }
8437 
8438     UTRACE_EXIT_VALUE_STATUS(result, *status)
8439     return result;
8440 }
8441 
8442 
8443 /*                                                                      */
8444 /* ucol_strcoll     Main public API string comparison function          */
8445 /*                                                                      */
8446 U_CAPI UCollationResult U_EXPORT2
ucol_strcoll(const UCollator * coll,const UChar * source,int32_t sourceLength,const UChar * target,int32_t targetLength)8447 ucol_strcoll( const UCollator    *coll,
8448               const UChar        *source,
8449               int32_t            sourceLength,
8450               const UChar        *target,
8451               int32_t            targetLength)
8452 {
8453     U_ALIGN_CODE(16);
8454 
8455     UTRACE_ENTRY(UTRACE_UCOL_STRCOLL);
8456     if (UTRACE_LEVEL(UTRACE_VERBOSE)) {
8457         UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source=%p, target=%p", coll, source, target);
8458         UTRACE_DATA2(UTRACE_VERBOSE, "source string = %vh ", source, sourceLength);
8459         UTRACE_DATA2(UTRACE_VERBOSE, "target string = %vh ", target, targetLength);
8460     }
8461 
8462     if(source == NULL || target == NULL) {
8463         // do not crash, but return. Should have
8464         // status argument to return error.
8465         UTRACE_EXIT_VALUE(UCOL_EQUAL);
8466         return UCOL_EQUAL;
8467     }
8468 
8469     /* Quick check if source and target are same strings. */
8470     /* They should either both be NULL terminated or the explicit length should be set on both. */
8471     if (source==target && sourceLength==targetLength) {
8472         UTRACE_EXIT_VALUE(UCOL_EQUAL);
8473         return UCOL_EQUAL;
8474     }
8475 
8476     /* Scan the strings.  Find:                                                             */
8477     /*    The length of any leading portion that is equal                                   */
8478     /*    Whether they are exactly equal.  (in which case we just return)                   */
8479     const UChar    *pSrc    = source;
8480     const UChar    *pTarg   = target;
8481     int32_t        equalLength;
8482 
8483     if (sourceLength == -1 && targetLength == -1) {
8484         // Both strings are null terminated.
8485         //    Scan through any leading equal portion.
8486         while (*pSrc == *pTarg && *pSrc != 0) {
8487             pSrc++;
8488             pTarg++;
8489         }
8490         if (*pSrc == 0 && *pTarg == 0) {
8491             UTRACE_EXIT_VALUE(UCOL_EQUAL);
8492             return UCOL_EQUAL;
8493         }
8494         equalLength = (int32_t)(pSrc - source);
8495     }
8496     else
8497     {
8498         // One or both strings has an explicit length.
8499         const UChar    *pSrcEnd = source + sourceLength;
8500         const UChar    *pTargEnd = target + targetLength;
8501 
8502         // Scan while the strings are bitwise ==, or until one is exhausted.
8503         for (;;) {
8504             if (pSrc == pSrcEnd || pTarg == pTargEnd) {
8505                 break;
8506             }
8507             if ((*pSrc == 0 && sourceLength == -1) || (*pTarg == 0 && targetLength == -1)) {
8508                 break;
8509             }
8510             if (*pSrc != *pTarg) {
8511                 break;
8512             }
8513             pSrc++;
8514             pTarg++;
8515         }
8516         equalLength = (int32_t)(pSrc - source);
8517 
8518         // If we made it all the way through both strings, we are done.  They are ==
8519         if ((pSrc ==pSrcEnd  || (pSrcEnd <pSrc  && *pSrc==0))  &&   /* At end of src string, however it was specified. */
8520             (pTarg==pTargEnd || (pTargEnd<pTarg && *pTarg==0)))     /* and also at end of dest string                  */
8521         {
8522             UTRACE_EXIT_VALUE(UCOL_EQUAL);
8523             return UCOL_EQUAL;
8524         }
8525     }
8526     if (equalLength > 0) {
8527         /* There is an identical portion at the beginning of the two strings.        */
8528         /*   If the identical portion ends within a contraction or a comibining      */
8529         /*   character sequence, back up to the start of that sequence.              */
8530 
8531         // These values should already be set by the code above.
8532         //pSrc  = source + equalLength;        /* point to the first differing chars   */
8533         //pTarg = target + equalLength;
8534         if ((pSrc  != source+sourceLength && ucol_unsafeCP(*pSrc, coll)) ||
8535             (pTarg != target+targetLength && ucol_unsafeCP(*pTarg, coll)))
8536         {
8537             // We are stopped in the middle of a contraction.
8538             // Scan backwards through the == part of the string looking for the start of the contraction.
8539             //   It doesn't matter which string we scan, since they are the same in this region.
8540             do
8541             {
8542                 equalLength--;
8543                 pSrc--;
8544             }
8545             while (equalLength>0 && ucol_unsafeCP(*pSrc, coll));
8546         }
8547 
8548         source += equalLength;
8549         target += equalLength;
8550         if (sourceLength > 0) {
8551             sourceLength -= equalLength;
8552         }
8553         if (targetLength > 0) {
8554             targetLength -= equalLength;
8555         }
8556     }
8557 
8558     UErrorCode status = U_ZERO_ERROR;
8559     UCollationResult returnVal;
8560     if(!coll->latinOneUse || (sourceLength > 0 && *source&0xff00) || (targetLength > 0 && *target&0xff00)) {
8561         returnVal = ucol_strcollRegular(coll, source, sourceLength, target, targetLength, &status);
8562     } else {
8563         returnVal = ucol_strcollUseLatin1(coll, source, sourceLength, target, targetLength, &status);
8564     }
8565     UTRACE_EXIT_VALUE(returnVal);
8566     return returnVal;
8567 }
8568 
8569 /* convenience function for comparing strings */
8570 U_CAPI UBool U_EXPORT2
ucol_greater(const UCollator * coll,const UChar * source,int32_t sourceLength,const UChar * target,int32_t targetLength)8571 ucol_greater(    const    UCollator        *coll,
8572         const    UChar            *source,
8573         int32_t            sourceLength,
8574         const    UChar            *target,
8575         int32_t            targetLength)
8576 {
8577     return (ucol_strcoll(coll, source, sourceLength, target, targetLength)
8578         == UCOL_GREATER);
8579 }
8580 
8581 /* convenience function for comparing strings */
8582 U_CAPI UBool U_EXPORT2
ucol_greaterOrEqual(const UCollator * coll,const UChar * source,int32_t sourceLength,const UChar * target,int32_t targetLength)8583 ucol_greaterOrEqual(    const    UCollator    *coll,
8584             const    UChar        *source,
8585             int32_t        sourceLength,
8586             const    UChar        *target,
8587             int32_t        targetLength)
8588 {
8589     return (ucol_strcoll(coll, source, sourceLength, target, targetLength)
8590         != UCOL_LESS);
8591 }
8592 
8593 /* convenience function for comparing strings */
8594 U_CAPI UBool U_EXPORT2
ucol_equal(const UCollator * coll,const UChar * source,int32_t sourceLength,const UChar * target,int32_t targetLength)8595 ucol_equal(        const    UCollator        *coll,
8596             const    UChar            *source,
8597             int32_t            sourceLength,
8598             const    UChar            *target,
8599             int32_t            targetLength)
8600 {
8601     return (ucol_strcoll(coll, source, sourceLength, target, targetLength)
8602         == UCOL_EQUAL);
8603 }
8604 
8605 U_CAPI void U_EXPORT2
ucol_getUCAVersion(const UCollator * coll,UVersionInfo info)8606 ucol_getUCAVersion(const UCollator* coll, UVersionInfo info) {
8607     if(coll && coll->UCA) {
8608         uprv_memcpy(info, coll->UCA->image->UCAVersion, sizeof(UVersionInfo));
8609     }
8610 }
8611 
8612 #endif /* #if !UCONFIG_NO_COLLATION */
8613