• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 *******************************************************************************
3 *   Copyright (C) 1996-2007, International Business Machines
4 *   Corporation and others.  All Rights Reserved.
5 *******************************************************************************
6 *   file name:  ucol.cpp
7 *   encoding:   US-ASCII
8 *   tab size:   8 (not used)
9 *   indentation:4
10 *
11 * Modification history
12 * Date        Name      Comments
13 * 1996-1999   various members of ICU team maintained C API for collation framework
14 * 02/16/2001  synwee    Added internal method getPrevSpecialCE
15 * 03/01/2001  synwee    Added maxexpansion functionality.
16 * 03/16/2001  weiv      Collation framework is rewritten in C and made UCA compliant
17 */
18 
19 #include "unicode/utypes.h"
20 #include "uassert.h"
21 
22 #if !UCONFIG_NO_COLLATION
23 
24 #include "unicode/coleitr.h"
25 #include "unicode/unorm.h"
26 #include "unicode/udata.h"
27 #include "unicode/ustring.h"
28 
29 #include "ucol_imp.h"
30 #include "ucol_elm.h"
31 #include "bocsu.h"
32 
33 #include "unormimp.h"
34 #include "unorm_it.h"
35 #include "umutex.h"
36 #include "cmemory.h"
37 #include "ucln_in.h"
38 #include "cstring.h"
39 #include "utracimp.h"
40 #include "putilimp.h"
41 
42 #ifdef UCOL_DEBUG
43 #include <stdio.h>
44 #endif
45 
46 U_NAMESPACE_USE
47 
/* Shift and mask constants for trie manipulation (added by synwee). */
#define STAGE_1_SHIFT_            10
#define STAGE_2_SHIFT_            4
#define STAGE_2_MASK_AFTER_SHIFT_ 0x3F
#define STAGE_3_MASK_             0xF
#define LAST_BYTE_MASK_           0xFF
#define SECOND_LAST_BYTE_SHIFT_   8

/* NOTE(review): presumably the limit below which code units have a zero
   combining class -- confirm against the users of this macro. */
#define ZERO_CC_LIMIT_            0xC0
57 
58 // static UCA. There is only one. Collators don't use it.
59 // It is referenced only in ucol_initUCA and ucol_cleanup
60 static UCollator* _staticUCA = NULL;
61 // static pointer to udata memory. Inited in ucol_initUCA
62 // used for cleanup in ucol_cleanup
63 static UDataMemory* UCA_DATA_MEM = NULL;
64 
// Static pointer to the normalizer's fcdTrieIndex. Its value does not change
// between calls to u_cleanup, so writes to it are not synchronized.
// It is cleaned in ucol_cleanup.
static const uint16_t *fcdTrieIndex = NULL;

// Values from the UCA that are required for implicit generation and for
// suppressing sort key compression. They would normally be taken from the
// UCA; these defaults are here because running without the UCA could
// otherwise be a problem.
static int32_t maxRegularPrimary  = 0xA0;
static int32_t minImplicitPrimary = 0xE0;
static int32_t maxImplicitPrimary = 0xE4;
78 
79 U_CDECL_BEGIN
80 static UBool U_CALLCONV
isAcceptableUCA(void *,const char *,const char *,const UDataInfo * pInfo)81 isAcceptableUCA(void * /*context*/,
82              const char * /*type*/, const char * /*name*/,
83              const UDataInfo *pInfo){
84   /* context, type & name are intentionally not used */
85     if( pInfo->size>=20 &&
86         pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
87         pInfo->charsetFamily==U_CHARSET_FAMILY &&
88         pInfo->dataFormat[0]==UCA_DATA_FORMAT_0 &&   /* dataFormat="UCol" */
89         pInfo->dataFormat[1]==UCA_DATA_FORMAT_1 &&
90         pInfo->dataFormat[2]==UCA_DATA_FORMAT_2 &&
91         pInfo->dataFormat[3]==UCA_DATA_FORMAT_3 &&
92         pInfo->formatVersion[0]==UCA_FORMAT_VERSION_0 &&
93         pInfo->formatVersion[1]>=UCA_FORMAT_VERSION_1// &&
94         //pInfo->formatVersion[1]==UCA_FORMAT_VERSION_1 &&
95         //pInfo->formatVersion[2]==UCA_FORMAT_VERSION_2 && // Too harsh
96         //pInfo->formatVersion[3]==UCA_FORMAT_VERSION_3 && // Too harsh
97         ) {
98         UVersionInfo UCDVersion;
99         u_getUnicodeVersion(UCDVersion);
100         return (UBool)(pInfo->dataVersion[0]==UCDVersion[0]
101             && pInfo->dataVersion[1]==UCDVersion[1]);
102             //&& pInfo->dataVersion[2]==ucaDataInfo.dataVersion[2]
103             //&& pInfo->dataVersion[3]==ucaDataInfo.dataVersion[3]);
104     } else {
105         return FALSE;
106     }
107 }
108 
109 
110 static int32_t U_CALLCONV
_getFoldingOffset(uint32_t data)111 _getFoldingOffset(uint32_t data) {
112     return (int32_t)(data&0xFFFFFF);
113 }
114 
115 U_CDECL_END
116 
117 static
IInit_collIterate(const UCollator * collator,const UChar * sourceString,int32_t sourceLen,collIterate * s)118 inline void  IInit_collIterate(const UCollator *collator, const UChar *sourceString,
119                               int32_t sourceLen, collIterate *s) {
120     (s)->string = (s)->pos = (UChar *)(sourceString);
121     (s)->origFlags = 0;
122     (s)->flags = 0;
123     if (sourceLen >= 0) {
124         s->flags |= UCOL_ITER_HASLEN;
125         (s)->endp = (UChar *)sourceString+sourceLen;
126     }
127     else {
128         /* change to enable easier checking for end of string for fcdpositon */
129         (s)->endp = NULL;
130     }
131     (s)->CEpos = (s)->toReturn = (s)->CEs;
132     (s)->writableBuffer = (s)->stackWritableBuffer;
133     (s)->writableBufSize = UCOL_WRITABLE_BUFFER_SIZE;
134     (s)->coll = (collator);
135     (s)->fcdPosition = 0;
136     if(collator->normalizationMode == UCOL_ON) {
137         (s)->flags |= UCOL_ITER_NORM;
138     }
139     if(collator->hiraganaQ == UCOL_ON && collator->strength >= UCOL_QUATERNARY) {
140       (s)->flags |= UCOL_HIRAGANA_Q;
141     }
142     (s)->iterator = NULL;
143     //(s)->iteratorIndex = 0;
144 }
145 
146 U_CAPI void  U_EXPORT2
uprv_init_collIterate(const UCollator * collator,const UChar * sourceString,int32_t sourceLen,collIterate * s)147 uprv_init_collIterate(const UCollator *collator, const UChar *sourceString,
148                              int32_t sourceLen, collIterate *s){
149     /* Out-of-line version for use from other files. */
150     IInit_collIterate(collator, sourceString, sourceLen, s);
151 }
152 
153 
154 /**
155 * Backup the state of the collIterate struct data
156 * @param data collIterate to backup
157 * @param backup storage
158 */
159 static
backupState(const collIterate * data,collIterateState * backup)160 inline void backupState(const collIterate *data, collIterateState *backup)
161 {
162     backup->fcdPosition = data->fcdPosition;
163     backup->flags       = data->flags;
164     backup->origFlags   = data->origFlags;
165     backup->pos         = data->pos;
166     backup->bufferaddress = data->writableBuffer;
167     backup->buffersize    = data->writableBufSize;
168     backup->iteratorMove = 0;
169     backup->iteratorIndex = 0;
170     if(data->iterator != NULL) {
171         //backup->iteratorIndex = data->iterator->getIndex(data->iterator, UITER_CURRENT);
172         backup->iteratorIndex = data->iterator->getState(data->iterator);
173         // no we try to fixup if we're using a normalizing iterator and we get UITER_NO_STATE
174         if(backup->iteratorIndex == UITER_NO_STATE) {
175             while((backup->iteratorIndex = data->iterator->getState(data->iterator)) == UITER_NO_STATE) {
176                 backup->iteratorMove++;
177                 data->iterator->move(data->iterator, -1, UITER_CURRENT);
178             }
179             data->iterator->move(data->iterator, backup->iteratorMove, UITER_CURRENT);
180         }
181     }
182 }
183 
184 /**
185 * Loads the state into the collIterate struct data
186 * @param data collIterate to backup
187 * @param backup storage
188 * @param forwards boolean to indicate if forwards iteration is used,
189 *        false indicates backwards iteration
190 */
191 static
loadState(collIterate * data,const collIterateState * backup,UBool forwards)192 inline void loadState(collIterate *data, const collIterateState *backup,
193                       UBool        forwards)
194 {
195     UErrorCode status = U_ZERO_ERROR;
196     data->flags       = backup->flags;
197     data->origFlags   = backup->origFlags;
198     if(data->iterator != NULL) {
199         //data->iterator->move(data->iterator, backup->iteratorIndex, UITER_ZERO);
200         data->iterator->setState(data->iterator, backup->iteratorIndex, &status);
201         if(backup->iteratorMove != 0) {
202             data->iterator->move(data->iterator, backup->iteratorMove, UITER_CURRENT);
203         }
204     }
205     data->pos         = backup->pos;
206     if ((data->flags & UCOL_ITER_INNORMBUF) &&
207         data->writableBuffer != backup->bufferaddress) {
208         /*
209         this is when a new buffer has been reallocated and we'll have to
210         calculate the new position.
211         note the new buffer has to contain the contents of the old buffer.
212         */
213         if (forwards) {
214             data->pos = data->writableBuffer +
215                                          (data->pos - backup->bufferaddress);
216         }
217         else {
218             /* backwards direction */
219             uint32_t temp = backup->buffersize -
220                                   (data->pos - backup->bufferaddress);
221             data->pos = data->writableBuffer + (data->writableBufSize - temp);
222         }
223     }
224     if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
225         /*
226         this is alittle tricky.
227         if we are initially not in the normalization buffer, even if we
228         normalize in the later stage, the data in the buffer will be
229         ignored, since we skip back up to the data string.
230         however if we are already in the normalization buffer, any
231         further normalization will pull data into the normalization
232         buffer and modify the fcdPosition.
233         since we are keeping the data in the buffer for use, the
234         fcdPosition can not be reverted back.
235         arrgghh....
236         */
237         data->fcdPosition = backup->fcdPosition;
238     }
239 }
240 
241 
242 /*
243 * collIter_eos()
244 *     Checks for a collIterate being positioned at the end of
245 *     its source string.
246 *
247 */
248 static
collIter_eos(collIterate * s)249 inline UBool collIter_eos(collIterate *s) {
250     if(s->flags & UCOL_USE_ITERATOR) {
251       return !(s->iterator->hasNext(s->iterator));
252     }
253     if ((s->flags & UCOL_ITER_HASLEN) == 0 && *s->pos != 0) {
254         // Null terminated string, but not at null, so not at end.
255         //   Whether in main or normalization buffer doesn't matter.
256         return FALSE;
257     }
258 
259     // String with length.  Can't be in normalization buffer, which is always
260     //  null termintated.
261     if (s->flags & UCOL_ITER_HASLEN) {
262         return (s->pos == s->endp);
263     }
264 
265     // We are at a null termination, could be either normalization buffer or main string.
266     if ((s->flags & UCOL_ITER_INNORMBUF) == 0) {
267         // At null at end of main string.
268         return TRUE;
269     }
270 
271     // At null at end of normalization buffer.  Need to check whether there there are
272     //   any characters left in the main buffer.
273     if(s->origFlags & UCOL_USE_ITERATOR) {
274       return !(s->iterator->hasNext(s->iterator));
275     } else if ((s->origFlags & UCOL_ITER_HASLEN) == 0) {
276         // Null terminated main string.  fcdPosition is the 'return' position into main buf.
277         return (*s->fcdPosition == 0);
278     }
279     else {
280         // Main string with an end pointer.
281         return s->fcdPosition == s->endp;
282     }
283 }
284 
285 /*
286 * collIter_bos()
287 *     Checks for a collIterate being positioned at the start of
288 *     its source string.
289 *
290 */
291 static
collIter_bos(collIterate * source)292 inline UBool collIter_bos(collIterate *source) {
293   // if we're going backwards, we need to know whether there is more in the
294   // iterator, even if we are in the side buffer
295   if(source->flags & UCOL_USE_ITERATOR || source->origFlags & UCOL_USE_ITERATOR) {
296     return !source->iterator->hasPrevious(source->iterator);
297   }
298   if (source->pos <= source->string ||
299       ((source->flags & UCOL_ITER_INNORMBUF) &&
300       *(source->pos - 1) == 0 && source->fcdPosition == NULL)) {
301     return TRUE;
302   }
303   return FALSE;
304 }
305 
306 /*static
307 inline UBool collIter_SimpleBos(collIterate *source) {
308   // if we're going backwards, we need to know whether there is more in the
309   // iterator, even if we are in the side buffer
310   if(source->flags & UCOL_USE_ITERATOR || source->origFlags & UCOL_USE_ITERATOR) {
311     return !source->iterator->hasPrevious(source->iterator);
312   }
313   if (source->pos == source->string) {
314     return TRUE;
315   }
316   return FALSE;
317 }*/
318     //return (data->pos == data->string) ||
319 
320 
321 /**
322 * Checks and free writable buffer if it is not the original stack buffer
323 * in collIterate. This function does not reassign the writable buffer.
324 * @param data collIterate struct to determine and free the writable buffer
325 */
326 static
freeHeapWritableBuffer(collIterate * data)327 inline void freeHeapWritableBuffer(collIterate *data)
328 {
329     if (data->writableBuffer != data->stackWritableBuffer) {
330         uprv_free(data->writableBuffer);
331     }
332 }
333 
334 
335 /****************************************************************************/
336 /* Following are the open/close functions                                   */
337 /*                                                                          */
338 /****************************************************************************/
339 
340 static UCollator*
ucol_initFromBinary(const uint8_t * bin,int32_t length,const UCollator * base,UCollator * fillIn,UErrorCode * status)341 ucol_initFromBinary(const uint8_t *bin, int32_t length,
342                 const UCollator *base,
343                 UCollator *fillIn,
344                 UErrorCode *status)
345 {
346     UCollator *result = fillIn;
347     if(U_FAILURE(*status)) {
348         return NULL;
349     }
350     /*
351     if(base == NULL) {
352         // we don't support null base yet
353         *status = U_ILLEGAL_ARGUMENT_ERROR;
354         return NULL;
355     }
356     */
357     // We need these and we could be running without UCA
358     uprv_uca_initImplicitConstants(0, 0, status);
359     UCATableHeader *colData = (UCATableHeader *)bin;
360     // do we want version check here? We're trying to figure out whether collators are compatible
361     if((base && (uprv_memcmp(colData->UCAVersion, base->image->UCAVersion, sizeof(UVersionInfo)) != 0 ||
362         uprv_memcmp(colData->UCDVersion, base->image->UCDVersion, sizeof(UVersionInfo)) != 0)) ||
363         colData->version[0] != UCOL_BUILDER_VERSION)
364     {
365         *status = U_COLLATOR_VERSION_MISMATCH;
366         return NULL;
367     }
368     else {
369         if((uint32_t)length > (paddedsize(sizeof(UCATableHeader)) + paddedsize(sizeof(UColOptionSet)))) {
370             result = ucol_initCollator((const UCATableHeader *)bin, result, base, status);
371             if(U_FAILURE(*status)){
372                 return NULL;
373             }
374             result->hasRealData = TRUE;
375         }
376         else {
377             if(base) {
378                 result = ucol_initCollator(base->image, result, base, status);
379                 ucol_setOptionsFromHeader(result, (UColOptionSet *)(bin+((const UCATableHeader *)bin)->options), status);
380                 if(U_FAILURE(*status)){
381                     return NULL;
382                 }
383                 result->hasRealData = FALSE;
384             }
385             else {
386                 *status = U_USELESS_COLLATOR_ERROR;
387                 return NULL;
388             }
389         }
390         result->freeImageOnClose = FALSE;
391     }
392     result->validLocale = NULL;
393     result->requestedLocale = NULL;
394     result->rules = NULL;
395     result->rulesLength = 0;
396     result->freeRulesOnClose = FALSE;
397     result->rb = NULL;
398     result->elements = NULL;
399     return result;
400 }
401 
402 U_CAPI UCollator* U_EXPORT2
ucol_openBinary(const uint8_t * bin,int32_t length,const UCollator * base,UErrorCode * status)403 ucol_openBinary(const uint8_t *bin, int32_t length,
404                 const UCollator *base,
405                 UErrorCode *status)
406 {
407     return ucol_initFromBinary(bin, length, base, NULL, status);
408 }
409 
410 U_CAPI UCollator* U_EXPORT2
ucol_safeClone(const UCollator * coll,void * stackBuffer,int32_t * pBufferSize,UErrorCode * status)411 ucol_safeClone(const UCollator *coll, void *stackBuffer, int32_t * pBufferSize, UErrorCode *status)
412 {
413     UCollator * localCollator;
414     int32_t bufferSizeNeeded = (int32_t)sizeof(UCollator);
415     char *stackBufferChars = (char *)stackBuffer;
416     int32_t imageSize = 0;
417     int32_t rulesSize = 0;
418     int32_t rulesPadding = 0;
419     uint8_t *image;
420     UChar *rules;
421     UBool colAllocated = FALSE;
422     UBool imageAllocated = FALSE;
423 
424     if (status == NULL || U_FAILURE(*status)){
425         return 0;
426     }
427     if ((stackBuffer && !pBufferSize) || !coll){
428        *status = U_ILLEGAL_ARGUMENT_ERROR;
429         return 0;
430     }
431     if (coll->rules && coll->freeRulesOnClose) {
432         rulesSize = (int32_t)(coll->rulesLength + 1)*sizeof(UChar);
433         rulesPadding = (int32_t)(bufferSizeNeeded % sizeof(UChar));
434         bufferSizeNeeded += rulesSize + rulesPadding;
435     }
436 
437     if (stackBuffer && *pBufferSize <= 0){ /* 'preflighting' request - set needed size into *pBufferSize */
438         *pBufferSize =  bufferSizeNeeded;
439         return 0;
440     }
441 
442     /* Pointers on 64-bit platforms need to be aligned
443      * on a 64-bit boundry in memory.
444      */
445     if (U_ALIGNMENT_OFFSET(stackBuffer) != 0) {
446         int32_t offsetUp = (int32_t)U_ALIGNMENT_OFFSET_UP(stackBufferChars);
447         if (*pBufferSize > offsetUp) {
448             *pBufferSize -= offsetUp;
449             stackBufferChars += offsetUp;
450         }
451         else {
452             /* prevent using the stack buffer but keep the size > 0 so that we do not just preflight */
453             *pBufferSize = 1;
454         }
455     }
456     stackBuffer = (void *)stackBufferChars;
457 
458     if (stackBuffer == NULL || *pBufferSize < bufferSizeNeeded) {
459         /* allocate one here...*/
460         stackBufferChars = (char *)uprv_malloc(bufferSizeNeeded);
461         colAllocated = TRUE;
462         if (U_SUCCESS(*status)) {
463             *status = U_SAFECLONE_ALLOCATED_WARNING;
464         }
465     }
466     localCollator = (UCollator *)stackBufferChars;
467     rules = (UChar *)(stackBufferChars + sizeof(UCollator) + rulesPadding);
468     {
469         UErrorCode tempStatus = U_ZERO_ERROR;
470         imageSize = ucol_cloneBinary(coll, NULL, 0, &tempStatus);
471     }
472     if (coll->freeImageOnClose) {
473         image = (uint8_t *)uprv_malloc(imageSize);
474         ucol_cloneBinary(coll, image, imageSize, status);
475         imageAllocated = TRUE;
476     }
477     else {
478         image = (uint8_t *)coll->image;
479     }
480     localCollator = ucol_initFromBinary(image, imageSize, coll->UCA, localCollator, status);
481     if (U_FAILURE(*status)) {
482         return NULL;
483     }
484 
485     if (coll->rules) {
486         if (coll->freeRulesOnClose) {
487             localCollator->rules = u_strcpy(rules, coll->rules);
488             //bufferEnd += rulesSize;
489         }
490         else {
491             localCollator->rules = coll->rules;
492         }
493         localCollator->freeRulesOnClose = FALSE;
494         localCollator->rulesLength = coll->rulesLength;
495     }
496 
497     int32_t i;
498     for(i = 0; i < UCOL_ATTRIBUTE_COUNT; i++) {
499         ucol_setAttribute(localCollator, (UColAttribute)i, ucol_getAttribute(coll, (UColAttribute)i, status), status);
500     }
501     localCollator->requestedLocale = NULL; // zero copies of pointers
502     localCollator->validLocale = NULL;
503     localCollator->rb = NULL;
504     localCollator->elements = NULL;
505     localCollator->freeOnClose = colAllocated;
506     localCollator->freeImageOnClose = imageAllocated;
507     return localCollator;
508 }
509 
510 U_CAPI void U_EXPORT2
ucol_close(UCollator * coll)511 ucol_close(UCollator *coll)
512 {
513     UTRACE_ENTRY_OC(UTRACE_UCOL_CLOSE);
514     UTRACE_DATA1(UTRACE_INFO, "coll = %p", coll);
515     if(coll != NULL) {
516         // these are always owned by each UCollator struct,
517         // so we always free them
518         if(coll->validLocale != NULL) {
519             uprv_free(coll->validLocale);
520         }
521         if(coll->requestedLocale != NULL) {
522             uprv_free(coll->requestedLocale);
523         }
524         if(coll->resCleaner != NULL) {
525             coll->resCleaner(coll);
526         }
527         if(coll->latinOneCEs != NULL) {
528             uprv_free(coll->latinOneCEs);
529         }
530         if(coll->options != NULL && coll->freeOptionsOnClose) {
531             uprv_free(coll->options);
532         }
533         if(coll->rules != NULL && coll->freeRulesOnClose) {
534             uprv_free((UChar *)coll->rules);
535         }
536         if(coll->image != NULL && coll->freeImageOnClose) {
537             uprv_free((UCATableHeader *)coll->image);
538         }
539 
540         /* Here, it would be advisable to close: */
541         /* - UData for UCA (unless we stuff it in the root resb */
542         /* Again, do we need additional housekeeping... HMMM! */
543         UTRACE_DATA1(UTRACE_INFO, "coll->freeOnClose: %d", coll->freeOnClose);
544         if(coll->freeOnClose){
545             /* for safeClone, if freeOnClose is FALSE,
546             don't free the other instance data */
547             uprv_free(coll);
548         }
549     }
550     UTRACE_EXIT();
551 }
552 
553 /* This one is currently used by genrb & tests. After constructing from rules (tailoring),*/
554 /* you should be able to get the binary chunk to write out...  Doesn't look very full now */
555 U_CFUNC uint8_t* U_EXPORT2
ucol_cloneRuleData(const UCollator * coll,int32_t * length,UErrorCode * status)556 ucol_cloneRuleData(const UCollator *coll, int32_t *length, UErrorCode *status)
557 {
558   uint8_t *result = NULL;
559   if(U_FAILURE(*status)) {
560     return NULL;
561   }
562   if(coll->hasRealData == TRUE) {
563     *length = coll->image->size;
564     result = (uint8_t *)uprv_malloc(*length);
565     /* test for NULL */
566     if (result == NULL) {
567         *status = U_MEMORY_ALLOCATION_ERROR;
568         return NULL;
569     }
570     uprv_memcpy(result, coll->image, *length);
571   } else {
572     *length = (int32_t)(paddedsize(sizeof(UCATableHeader))+paddedsize(sizeof(UColOptionSet)));
573     result = (uint8_t *)uprv_malloc(*length);
574     /* test for NULL */
575     if (result == NULL) {
576         *status = U_MEMORY_ALLOCATION_ERROR;
577         return NULL;
578     }
579 
580     /* build the UCATableHeader with minimal entries */
581     /* do not copy the header from the UCA file because its values are wrong! */
582     /* uprv_memcpy(result, UCA->image, sizeof(UCATableHeader)); */
583 
584     /* reset everything */
585     uprv_memset(result, 0, *length);
586 
587     /* set the tailoring-specific values */
588     UCATableHeader *myData = (UCATableHeader *)result;
589     myData->size = *length;
590 
591     /* offset for the options, the only part of the data that is present after the header */
592     myData->options = sizeof(UCATableHeader);
593 
594     /* need to always set the expansion value for an upper bound of the options */
595     myData->expansion = myData->options + sizeof(UColOptionSet);
596 
597     myData->magic = UCOL_HEADER_MAGIC;
598     myData->isBigEndian = U_IS_BIG_ENDIAN;
599     myData->charSetFamily = U_CHARSET_FAMILY;
600 
601     /* copy UCA's version; genrb will override all but the builder version with tailoring data */
602     uprv_memcpy(myData->version, coll->image->version, sizeof(UVersionInfo));
603 
604     uprv_memcpy(myData->UCAVersion, coll->image->UCAVersion, sizeof(UVersionInfo));
605     uprv_memcpy(myData->UCDVersion, coll->image->UCDVersion, sizeof(UVersionInfo));
606     uprv_memcpy(myData->formatVersion, coll->image->formatVersion, sizeof(UVersionInfo));
607     myData->jamoSpecial = coll->image->jamoSpecial;
608 
609     /* copy the collator options */
610     uprv_memcpy(result+paddedsize(sizeof(UCATableHeader)), coll->options, sizeof(UColOptionSet));
611   }
612   return result;
613 }
614 
ucol_setOptionsFromHeader(UCollator * result,UColOptionSet * opts,UErrorCode * status)615 void ucol_setOptionsFromHeader(UCollator* result, UColOptionSet * opts, UErrorCode *status) {
616   if(U_FAILURE(*status)) {
617     return;
618   }
619     result->caseFirst = (UColAttributeValue)opts->caseFirst;
620     result->caseLevel = (UColAttributeValue)opts->caseLevel;
621     result->frenchCollation = (UColAttributeValue)opts->frenchCollation;
622     result->normalizationMode = (UColAttributeValue)opts->normalizationMode;
623     result->strength = (UColAttributeValue)opts->strength;
624     result->variableTopValue = opts->variableTopValue;
625     result->alternateHandling = (UColAttributeValue)opts->alternateHandling;
626     result->hiraganaQ = (UColAttributeValue)opts->hiraganaQ;
627     result->numericCollation = (UColAttributeValue)opts->numericCollation;
628 
629     result->caseFirstisDefault = TRUE;
630     result->caseLevelisDefault = TRUE;
631     result->frenchCollationisDefault = TRUE;
632     result->normalizationModeisDefault = TRUE;
633     result->strengthisDefault = TRUE;
634     result->variableTopValueisDefault = TRUE;
635     result->hiraganaQisDefault = TRUE;
636     result->numericCollationisDefault = TRUE;
637 
638     ucol_updateInternalState(result, status);
639 
640     result->options = opts;
641 }
642 
643 
644 /**
645 * Approximate determination if a character is at a contraction end.
646 * Guaranteed to be TRUE if a character is at the end of a contraction,
647 * otherwise it is not deterministic.
648 * @param c character to be determined
649 * @param coll collator
650 */
651 static
ucol_contractionEndCP(UChar c,const UCollator * coll)652 inline UBool ucol_contractionEndCP(UChar c, const UCollator *coll) {
653     if (U16_IS_TRAIL(c)) {
654       return TRUE;
655     }
656 
657     if (c < coll->minContrEndCP) {
658         return FALSE;
659     }
660 
661     int32_t  hash = c;
662     uint8_t  htbyte;
663     if (hash >= UCOL_UNSAFECP_TABLE_SIZE*8) {
664         hash = (hash & UCOL_UNSAFECP_TABLE_MASK) + 256;
665     }
666     htbyte = coll->contrEndCP[hash>>3];
667     return (((htbyte >> (hash & 7)) & 1) == 1);
668 }
669 
670 
671 
672 /*
673 *   i_getCombiningClass()
674 *        A fast, at least partly inline version of u_getCombiningClass()
675 *        This is a candidate for further optimization.  Used heavily
676 *        in contraction processing.
677 */
678 static
i_getCombiningClass(UChar32 c,const UCollator * coll)679 inline uint8_t i_getCombiningClass(UChar32 c, const UCollator *coll) {
680     uint8_t sCC = 0;
681     if ((c >= 0x300 && ucol_unsafeCP(c, coll)) || c > 0xFFFF) {
682         sCC = u_getCombiningClass(c);
683     }
684     return sCC;
685 }
686 
ucol_initCollator(const UCATableHeader * image,UCollator * fillIn,const UCollator * UCA,UErrorCode * status)687 UCollator* ucol_initCollator(const UCATableHeader *image, UCollator *fillIn, const UCollator *UCA, UErrorCode *status) {
688     UChar c;
689     UCollator *result = fillIn;
690     if(U_FAILURE(*status) || image == NULL) {
691         return NULL;
692     }
693 
694     if(result == NULL) {
695         result = (UCollator *)uprv_malloc(sizeof(UCollator));
696         if(result == NULL) {
697             *status = U_MEMORY_ALLOCATION_ERROR;
698             return result;
699         }
700         result->freeOnClose = TRUE;
701     } else {
702         result->freeOnClose = FALSE;
703     }
704 
705     result->image = image;
706     result->mapping.getFoldingOffset = _getFoldingOffset;
707     const uint8_t *mapping = (uint8_t*)result->image+result->image->mappingPosition;
708     utrie_unserialize(&result->mapping, mapping, result->image->endExpansionCE - result->image->mappingPosition, status);
709     if(U_FAILURE(*status)) {
710         if(result->freeOnClose == TRUE) {
711             uprv_free(result);
712             result = NULL;
713         }
714         return result;
715     }
716 
717     /*result->latinOneMapping = (uint32_t*)((uint8_t*)result->image+result->image->latinOneMapping);*/
718     result->latinOneMapping = UTRIE_GET32_LATIN1(&result->mapping);
719     result->contractionCEs = (uint32_t*)((uint8_t*)result->image+result->image->contractionCEs);
720     result->contractionIndex = (UChar*)((uint8_t*)result->image+result->image->contractionIndex);
721     result->expansion = (uint32_t*)((uint8_t*)result->image+result->image->expansion);
722 
723     result->options = (UColOptionSet*)((uint8_t*)result->image+result->image->options);
724     result->freeOptionsOnClose = FALSE;
725 
726     /* set attributes */
727     result->caseFirst = (UColAttributeValue)result->options->caseFirst;
728     result->caseLevel = (UColAttributeValue)result->options->caseLevel;
729     result->frenchCollation = (UColAttributeValue)result->options->frenchCollation;
730     result->normalizationMode = (UColAttributeValue)result->options->normalizationMode;
731     result->strength = (UColAttributeValue)result->options->strength;
732     result->variableTopValue = result->options->variableTopValue;
733     result->alternateHandling = (UColAttributeValue)result->options->alternateHandling;
734     result->hiraganaQ = (UColAttributeValue)result->options->hiraganaQ;
735     result->numericCollation = (UColAttributeValue)result->options->numericCollation;
736 
737     result->caseFirstisDefault = TRUE;
738     result->caseLevelisDefault = TRUE;
739     result->frenchCollationisDefault = TRUE;
740     result->normalizationModeisDefault = TRUE;
741     result->strengthisDefault = TRUE;
742     result->variableTopValueisDefault = TRUE;
743     result->alternateHandlingisDefault = TRUE;
744     result->hiraganaQisDefault = TRUE;
745     result->numericCollationisDefault = TRUE;
746 
747     /*result->scriptOrder = NULL;*/
748 
749     result->rules = NULL;
750     result->rulesLength = 0;
751 
752     /* get the version info from UCATableHeader and populate the Collator struct*/
753     result->dataVersion[0] = result->image->version[0]; /* UCA Builder version*/
754     result->dataVersion[1] = result->image->version[1]; /* UCA Tailoring rules version*/
755     result->dataVersion[2] = 0;
756     result->dataVersion[3] = 0;
757 
758     result->unsafeCP = (uint8_t *)result->image + result->image->unsafeCP;
759     result->minUnsafeCP = 0;
760     for (c=0; c<0x300; c++) {  // Find the smallest unsafe char.
761         if (ucol_unsafeCP(c, result)) break;
762     }
763     result->minUnsafeCP = c;
764 
765     result->contrEndCP = (uint8_t *)result->image + result->image->contrEndCP;
766     result->minContrEndCP = 0;
767     for (c=0; c<0x300; c++) {  // Find the Contraction-ending char.
768         if (ucol_contractionEndCP(c, result)) break;
769     }
770     result->minContrEndCP = c;
771 
772     /* max expansion tables */
773     result->endExpansionCE = (uint32_t*)((uint8_t*)result->image +
774                                          result->image->endExpansionCE);
775     result->lastEndExpansionCE = result->endExpansionCE +
776                                  result->image->endExpansionCECount - 1;
777     result->expansionCESize = (uint8_t*)result->image +
778                                                result->image->expansionCESize;
779 
780 
781     //result->errorCode = *status;
782 
783     result->latinOneCEs = NULL;
784 
785     result->latinOneRegenTable = FALSE;
786     result->latinOneFailed = FALSE;
787     result->UCA = UCA;
788     result->resCleaner = NULL;
789 
790     ucol_updateInternalState(result, status);
791 
792     /* Normally these will be set correctly later. This is the default if you use UCA or the default. */
793     result->rb = NULL;
794     result->elements = NULL;
795     result->validLocale = NULL;
796     result->requestedLocale = NULL;
797     result->hasRealData = FALSE; // real data lives in .dat file...
798     result->freeImageOnClose = FALSE;
799 
800     return result;
801 }
802 
803 /* new Mark's code */
804 
805 /**
806  * For generation of Implicit CEs
807  * @author Davis
808  *
809  * Cleaned up so that changes can be made more easily.
810  * Old values:
811 # First Implicit: E26A792D
812 # Last Implicit: E3DC70C0
813 # First CJK: E0030300
814 # Last CJK: E0A9DD00
815 # First CJK_A: E0A9DF00
816 # Last CJK_A: E0DE3100
817  */
/* Following is a port of Mark's code for new treatment of implicits.
 * It is positioned here, since ucol_initUCA needs to initialize the
 * variables below according to the data in the fractional UCA.
 */
822 
823 /**
824     * Function used to:
825     * a) collapse the 2 different Han ranges from UCA into one (in the right order), and
826     * b) bump any non-CJK characters by 10FFFF.
827     * The relevant blocks are:
828     * A:    4E00..9FFF; CJK Unified Ideographs
829     *       F900..FAFF; CJK Compatibility Ideographs
830     * B:    3400..4DBF; CJK Unified Ideographs Extension A
831     *       20000..XX;  CJK Unified Ideographs Extension B (and others later on)
832     * As long as
833     *   no new B characters are allocated between 4E00 and FAFF, and
834     *   no new A characters are outside of this range,
835     * (very high probability) this simple code will work.
836     * The reordered blocks are:
837     * Block1 is CJK
838     * Block2 is CJK_COMPAT_USED
839     * Block3 is CJK_A
840     * (all contiguous)
841     * Any other CJK gets its normal code point
842     * Any non-CJK gets +10FFFF
843     * When we reorder Block1, we make sure that it is at the very start,
844     * so that it will use a 3-byte form.
    * Warning: we only pick up the compatibility characters that are
    * NOT decomposed, so that block is smaller!
847     */
848 
849 // CONSTANTS
850 static const UChar32
851     NON_CJK_OFFSET = 0x110000,
852     UCOL_MAX_INPUT = 0x220001; // 2 * Unicode range + 2
853 
/**
 * Parameters of the implicit-weight byte layout.
 * They all start out as 0 and are assigned by initImplicitConstants().
 */
static int32_t final3Multiplier = 0;   // step between used final bytes, 3-byte form (gap3 + 1)
static int32_t final4Multiplier = 0;   // step between used final bytes, 4-byte form (gap4 + 1)
static int32_t final3Count = 0;        // number of final-byte values used in the 3-byte form
static int32_t final4Count = 0;        // number of final-byte values used in the 4-byte form
static int32_t medialCount = 0;        // number of values a middle byte can take
static int32_t min3Primary = 0;        // first lead byte of the 3-byte form
static int32_t min4Primary = 0;        // first lead byte of the 4-byte form
static int32_t max4Primary = 0;        // last lead byte of the 4-byte form
static int32_t minTrail = 0;           // lowest trail byte value
static int32_t maxTrail = 0;           // highest trail byte value
static int32_t max3Trail = 0;          // highest final byte actually used, 3-byte form
static int32_t max4Trail = 0;          // highest final byte actually used, 4-byte form
static int32_t min4Boundary = 0;       // first raw value that is encoded in the 4-byte form
871 
872 static const UChar32
873     CJK_BASE = 0x4E00,
874     CJK_LIMIT = 0x9FFF+1,
875     CJK_COMPAT_USED_BASE = 0xFA0E,
876     CJK_COMPAT_USED_LIMIT = 0xFA2F+1,
877     CJK_A_BASE = 0x3400,
878     CJK_A_LIMIT = 0x4DBF+1,
879     CJK_B_BASE = 0x20000,
880     CJK_B_LIMIT = 0x2A6DF+1;
881 
swapCJK(UChar32 i)882 static UChar32 swapCJK(UChar32 i) {
883 
884     if (i >= CJK_BASE) {
885         if (i < CJK_LIMIT)              return i - CJK_BASE;
886 
887         if (i < CJK_COMPAT_USED_BASE)   return i + NON_CJK_OFFSET;
888 
889         if (i < CJK_COMPAT_USED_LIMIT)  return i - CJK_COMPAT_USED_BASE
890                                                 + (CJK_LIMIT - CJK_BASE);
891         if (i < CJK_B_BASE)             return i + NON_CJK_OFFSET;
892 
893         if (i < CJK_B_LIMIT)            return i; // non-BMP-CJK
894 
895         return i + NON_CJK_OFFSET;  // non-CJK
896     }
897     if (i < CJK_A_BASE)                 return i + NON_CJK_OFFSET;
898 
899     if (i < CJK_A_LIMIT)                return i - CJK_A_BASE
900                                                 + (CJK_LIMIT - CJK_BASE)
901                                                 + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE);
902     return i + NON_CJK_OFFSET; // non-CJK
903 }
904 
905 U_CAPI UChar32 U_EXPORT2
uprv_uca_getRawFromCodePoint(UChar32 i)906 uprv_uca_getRawFromCodePoint(UChar32 i) {
907     return swapCJK(i)+1;
908 }
909 
910 U_CAPI UChar32 U_EXPORT2
uprv_uca_getCodePointFromRaw(UChar32 i)911 uprv_uca_getCodePointFromRaw(UChar32 i) {
912     i--;
913     UChar32 result = 0;
914     if(i >= NON_CJK_OFFSET) {
915         result = i - NON_CJK_OFFSET;
916     } else if(i >= CJK_B_BASE) {
917         result = i;
918     } else if(i < CJK_A_LIMIT + (CJK_LIMIT - CJK_BASE) + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE)) { // rest of CJKs, compacted
919         if(i < CJK_LIMIT - CJK_BASE) {
920             result = i + CJK_BASE;
921         } else if(i < (CJK_LIMIT - CJK_BASE) + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE)) {
922             result = i + CJK_COMPAT_USED_BASE - (CJK_LIMIT - CJK_BASE);
923         } else {
924             result = i + CJK_A_BASE - (CJK_LIMIT - CJK_BASE) - (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE);
925         }
926     } else {
927         result = -1;
928     }
929     return result;
930 }
931 
932 // GET IMPLICIT PRIMARY WEIGHTS
933 // Return value is left justified primary key
934 U_CAPI uint32_t U_EXPORT2
uprv_uca_getImplicitFromRaw(UChar32 cp)935 uprv_uca_getImplicitFromRaw(UChar32 cp) {
936     /*
937     if (cp < 0 || cp > UCOL_MAX_INPUT) {
938         throw new IllegalArgumentException("Code point out of range " + Utility.hex(cp));
939     }
940     */
941     int32_t last0 = cp - min4Boundary;
942     if (last0 < 0) {
943         int32_t last1 = cp / final3Count;
944         last0 = cp % final3Count;
945 
946         int32_t last2 = last1 / medialCount;
947         last1 %= medialCount;
948 
949         last0 = minTrail + last0*final3Multiplier; // spread out, leaving gap at start
950         last1 = minTrail + last1; // offset
951         last2 = min3Primary + last2; // offset
952         /*
953         if (last2 >= min4Primary) {
954             throw new IllegalArgumentException("4-byte out of range: " + Utility.hex(cp) + ", " + Utility.hex(last2));
955         }
956         */
957         return (last2 << 24) + (last1 << 16) + (last0 << 8);
958     } else {
959         int32_t last1 = last0 / final4Count;
960         last0 %= final4Count;
961 
962         int32_t last2 = last1 / medialCount;
963         last1 %= medialCount;
964 
965         int32_t last3 = last2 / medialCount;
966         last2 %= medialCount;
967 
968         last0 = minTrail + last0*final4Multiplier; // spread out, leaving gap at start
969         last1 = minTrail + last1; // offset
970         last2 = minTrail + last2; // offset
971         last3 = min4Primary + last3; // offset
972         /*
973         if (last3 > max4Primary) {
974             throw new IllegalArgumentException("4-byte out of range: " + Utility.hex(cp) + ", " + Utility.hex(last3));
975         }
976         */
977         return (last3 << 24) + (last2 << 16) + (last1 << 8) + last0;
978     }
979 }
980 
981 static uint32_t U_EXPORT2
uprv_uca_getImplicitPrimary(UChar32 cp)982 uprv_uca_getImplicitPrimary(UChar32 cp) {
983     //if (DEBUG) System.out.println("Incoming: " + Utility.hex(cp));
984 
985     cp = swapCJK(cp);
986     cp++;
987     // we now have a range of numbers from 0 to 21FFFF.
988 
989     //if (DEBUG) System.out.println("CJK swapped: " + Utility.hex(cp));
990 
991     return uprv_uca_getImplicitFromRaw(cp);
992 }
993 
994 /**
995  * Converts implicit CE into raw integer ("code point")
996  * @param implicit
997  * @return -1 if illegal format
998  */
999 U_CAPI UChar32 U_EXPORT2
uprv_uca_getRawFromImplicit(uint32_t implicit)1000 uprv_uca_getRawFromImplicit(uint32_t implicit) {
1001     UChar32 result;
1002     UChar32 b3 = implicit & 0xFF;
1003     implicit >>= 8;
1004     UChar32 b2 = implicit & 0xFF;
1005     implicit >>= 8;
1006     UChar32 b1 = implicit & 0xFF;
1007     implicit >>= 8;
1008     UChar32 b0 = implicit & 0xFF;
1009 
1010     // simple parameter checks
1011     if (b0 < min3Primary || b0 > max4Primary
1012       || b1 < minTrail || b1 > maxTrail) return -1;
1013     // normal offsets
1014     b1 -= minTrail;
1015 
1016     // take care of the final values, and compose
1017     if (b0 < min4Primary) {
1018         if (b2 < minTrail || b2 > max3Trail || b3 != 0) return -1;
1019         b2 -= minTrail;
1020         UChar32 remainder = b2 % final3Multiplier;
1021         if (remainder != 0) return -1;
1022         b0 -= min3Primary;
1023         b2 /= final3Multiplier;
1024         result = ((b0 * medialCount) + b1) * final3Count + b2;
1025     } else {
1026          if (b2 < minTrail || b2 > maxTrail
1027         || b3 < minTrail || b3 > max4Trail) return -1;
1028         b2 -= minTrail;
1029         b3 -= minTrail;
1030         UChar32 remainder = b3 % final4Multiplier;
1031         if (remainder != 0) return -1;
1032         b3 /= final4Multiplier;
1033         b0 -= min4Primary;
1034         result = (((b0 * medialCount) + b1) * medialCount + b2) * final4Count + b3 + min4Boundary;
1035     }
1036     // final check
1037     if (result < 0 || result > UCOL_MAX_INPUT) return -1;
1038     return result;
1039 }
1040 
1041 
/* Computes 1 + (a-1)/b; for positive a and b this is a/b rounded up. */
static inline int32_t divideAndRoundUp(int a, int b) {
    const int wholeSteps = (a - 1) / b;
    return wholeSteps + 1;
}
1045 
1046 /* this function is either called from initUCA or from genUCA before
1047  * doing canonical closure for the UCA.
1048  */
1049 
1050 /**
1051  * Set up to generate implicits.
1052  * @param minPrimary
1053  * @param maxPrimary
1054  * @param minTrail final byte
1055  * @param maxTrail final byte
1056  * @param gap3 the gap we leave for tailoring for 3-byte forms
1057  * @param gap4 the gap we leave for tailoring for 4-byte forms
1058  */
initImplicitConstants(int minPrimary,int maxPrimary,int minTrailIn,int maxTrailIn,int gap3,int primaries3count,UErrorCode * status)1059 static void initImplicitConstants(int minPrimary, int maxPrimary,
1060                                     int minTrailIn, int maxTrailIn,
1061                                     int gap3, int primaries3count,
1062                                     UErrorCode *status) {
1063     // some simple parameter checks
1064     if (minPrimary < 0 || minPrimary >= maxPrimary || maxPrimary > 0xFF) {
1065         *status = U_ILLEGAL_ARGUMENT_ERROR;
1066         return;
1067     };
1068     if (minTrailIn < 0 || minTrailIn >= maxTrailIn || maxTrailIn > 0xFF) {
1069         *status = U_ILLEGAL_ARGUMENT_ERROR;
1070         return;
1071     };
1072     if (primaries3count < 1) {
1073         *status = U_ILLEGAL_ARGUMENT_ERROR;
1074         return;
1075     };
1076 
1077     minTrail = minTrailIn;
1078     maxTrail = maxTrailIn;
1079 
1080     min3Primary = minPrimary;
1081     max4Primary = maxPrimary;
1082     // compute constants for use later.
1083     // number of values we can use in trailing bytes
1084     // leave room for empty values between AND above, e.g. if gap = 2
1085     // range 3..7 => +3 -4 -5 -6 -7: so 1 value
1086     // range 3..8 => +3 -4 -5 +6 -7 -8: so 2 values
1087     // range 3..9 => +3 -4 -5 +6 -7 -8 -9: so 2 values
1088     final3Multiplier = gap3 + 1;
1089     final3Count = (maxTrail - minTrail + 1) / final3Multiplier;
1090     max3Trail = minTrail + (final3Count - 1) * final3Multiplier;
1091 
1092     // medials can use full range
1093     medialCount = (maxTrail - minTrail + 1);
1094     // find out how many values fit in each form
1095     int32_t threeByteCount = medialCount * final3Count;
1096     // now determine where the 3/4 boundary is.
1097     // we use 3 bytes below the boundary, and 4 above
1098     int32_t primariesAvailable = maxPrimary - minPrimary + 1;
1099     int32_t primaries4count = primariesAvailable - primaries3count;
1100 
1101 
1102     int32_t min3ByteCoverage = primaries3count * threeByteCount;
1103     min4Primary = minPrimary + primaries3count;
1104     min4Boundary = min3ByteCoverage;
1105     // Now expand out the multiplier for the 4 bytes, and redo.
1106 
1107     int32_t totalNeeded = UCOL_MAX_INPUT - min4Boundary;
1108     int32_t neededPerPrimaryByte = divideAndRoundUp(totalNeeded, primaries4count);
1109     //if (DEBUG) System.out.println("neededPerPrimaryByte: " + neededPerPrimaryByte);
1110     int32_t neededPerFinalByte = divideAndRoundUp(neededPerPrimaryByte, medialCount * medialCount);
1111     //if (DEBUG) System.out.println("neededPerFinalByte: " + neededPerFinalByte);
1112     int32_t gap4 = (maxTrail - minTrail - 1) / neededPerFinalByte;
1113     //if (DEBUG) System.out.println("expandedGap: " + gap4);
1114     if (gap4 < 1) {
1115         *status = U_ILLEGAL_ARGUMENT_ERROR;
1116         return;
1117     }
1118     final4Multiplier = gap4 + 1;
1119     final4Count = neededPerFinalByte;
1120     max4Trail = minTrail + (final4Count - 1) * final4Multiplier;
1121     /*
1122     if (DEBUG) {
1123         System.out.println("final4Count: " + final4Count);
1124         for (int counter = 0; counter <= final4Count; ++counter) {
1125             int value = minTrail + (1 + counter)*final4Multiplier;
1126             System.out.println(counter + "\t" + value + "\t" + Utility.hex(value));
1127         }
1128     }
1129     */
1130 }
1131 
1132     /**
1133      * Supply parameters for generating implicit CEs
1134      */
1135 U_CAPI void U_EXPORT2
uprv_uca_initImplicitConstants(int32_t,int32_t,UErrorCode * status)1136 uprv_uca_initImplicitConstants(int32_t, int32_t, UErrorCode *status) {
1137     // 13 is the largest 4-byte gap we can use without getting 2 four-byte forms.
1138     //initImplicitConstants(minPrimary, maxPrimary, 0x04, 0xFE, 1, 1, status);
1139   initImplicitConstants(minImplicitPrimary, maxImplicitPrimary, 0x04, 0xFE, 1, 1, status);
1140 }
1141 
1142 U_CDECL_BEGIN
1143 static UBool U_CALLCONV
ucol_cleanup(void)1144 ucol_cleanup(void)
1145 {
1146     if (UCA_DATA_MEM) {
1147         udata_close(UCA_DATA_MEM);
1148         UCA_DATA_MEM = NULL;
1149     }
1150     if (_staticUCA) {
1151         ucol_close(_staticUCA);
1152         _staticUCA = NULL;
1153     }
1154     fcdTrieIndex = NULL;
1155     return TRUE;
1156 }
1157 U_CDECL_END
1158 
1159 /* do not close UCA returned by ucol_initUCA! */
1160 UCollator *
ucol_initUCA(UErrorCode * status)1161 ucol_initUCA(UErrorCode *status) {
1162     if(U_FAILURE(*status)) {
1163         return NULL;
1164     }
1165     UBool needsInit;
1166     UMTX_CHECK(NULL, (_staticUCA == NULL), needsInit);
1167 
1168     if(needsInit) {
1169         UCollator *newUCA = NULL;
1170         UDataMemory *result = udata_openChoice(NULL, UCA_DATA_TYPE, UCA_DATA_NAME, isAcceptableUCA, NULL, status);
1171 
1172         if(U_FAILURE(*status)) {
1173             if (result) {
1174                 udata_close(result);
1175             }
1176             uprv_free(newUCA);
1177         }
1178 
1179         // init FCD data
1180         if (fcdTrieIndex == NULL) {
1181             fcdTrieIndex = unorm_getFCDTrie(status);
1182             ucln_i18n_registerCleanup(UCLN_I18N_UCOL, ucol_cleanup);
1183         }
1184 
1185         if(result != NULL) { /* It looks like sometimes we can fail to find the data file */
1186             newUCA = ucol_initCollator((const UCATableHeader *)udata_getMemory(result), newUCA, newUCA, status);
1187             if(U_SUCCESS(*status)){
1188                 umtx_lock(NULL);
1189                 if(_staticUCA == NULL) {
1190                     _staticUCA = newUCA;
1191                     UCA_DATA_MEM = result;
1192                     result = NULL;
1193                     newUCA = NULL;
1194                 }
1195                 umtx_unlock(NULL);
1196 
1197                 if(newUCA != NULL) {
1198                     udata_close(result);
1199                     uprv_free(newUCA);
1200                 }
1201                 else {
1202                     ucln_i18n_registerCleanup(UCLN_I18N_UCOL, ucol_cleanup);
1203                 }
1204                 // Initalize variables for implicit generation
1205                 const UCAConstants *UCAconsts = (UCAConstants *)((uint8_t *)_staticUCA->image + _staticUCA->image->UCAConsts);
1206                 uprv_uca_initImplicitConstants(UCAconsts->UCA_PRIMARY_IMPLICIT_MIN, UCAconsts->UCA_PRIMARY_IMPLICIT_MAX, status);
1207                 //_staticUCA->mapping.getFoldingOffset = _getFoldingOffset;
1208             }else{
1209                 udata_close(result);
1210                 uprv_free(newUCA);
1211                 _staticUCA= NULL;
1212             }
1213         }
1214     }
1215     return _staticUCA;
1216 }
1217 
1218 
1219 /*    collIterNormalize     Incremental Normalization happens here.                       */
1220 /*                          pick up the range of chars identifed by FCD,                  */
1221 /*                          normalize it into the collIterate's writable buffer,          */
1222 /*                          switch the collIterate's state to use the writable buffer.    */
1223 /*                                                                                        */
1224 static
collIterNormalize(collIterate * collationSource)1225 void collIterNormalize(collIterate *collationSource)
1226 {
1227     UErrorCode  status = U_ZERO_ERROR;
1228 
1229     int32_t    normLen;
1230     UChar      *srcP = collationSource->pos - 1;      /*  Start of chars to normalize    */
1231     UChar      *endP = collationSource->fcdPosition;  /* End of region to normalize+1    */
1232 
1233     normLen = unorm_decompose(collationSource->writableBuffer, (int32_t)collationSource->writableBufSize,
1234                               srcP, (int32_t)(endP - srcP),
1235                               FALSE, 0,
1236                               &status);
1237     if(status == U_BUFFER_OVERFLOW_ERROR || status == U_STRING_NOT_TERMINATED_WARNING) {
1238         // reallocate and terminate
1239         if(!u_growBufferFromStatic(collationSource->stackWritableBuffer,
1240                                    &collationSource->writableBuffer,
1241                                    (int32_t *)&collationSource->writableBufSize, normLen + 1,
1242                                    0)
1243         ) {
1244 #ifdef UCOL_DEBUG
1245             fprintf(stderr, "collIterNormalize(), out of memory\n");
1246 #endif
1247             return;
1248         }
1249         status = U_ZERO_ERROR;
1250         normLen = unorm_decompose(collationSource->writableBuffer, (int32_t)collationSource->writableBufSize,
1251                                   srcP, (int32_t)(endP - srcP),
1252                                   FALSE, 0,
1253                                   &status);
1254     }
1255     if (U_FAILURE(status)) {
1256 #ifdef UCOL_DEBUG
1257         fprintf(stderr, "collIterNormalize(), unorm_decompose() failed, status = %s\n", u_errorName(status));
1258 #endif
1259         return;
1260     }
1261 
1262   if(collationSource->writableBuffer != collationSource->stackWritableBuffer) {
1263       collationSource->flags |= UCOL_ITER_ALLOCATED;
1264   }
1265   collationSource->pos        = collationSource->writableBuffer;
1266   collationSource->origFlags  = collationSource->flags;
1267   collationSource->flags     |= UCOL_ITER_INNORMBUF;
1268   collationSource->flags     &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR);
1269 }
1270 
1271 
1272 // This function takes the iterator and extracts normalized stuff up to the next boundary
1273 // It is similar in the end results to the collIterNormalize, but for the cases when we
1274 // use an iterator
1275 /*static
1276 inline void normalizeIterator(collIterate *collationSource) {
1277   UErrorCode status = U_ZERO_ERROR;
1278   UBool wasNormalized = FALSE;
1279   //int32_t iterIndex = collationSource->iterator->getIndex(collationSource->iterator, UITER_CURRENT);
1280   uint32_t iterIndex = collationSource->iterator->getState(collationSource->iterator);
1281   int32_t normLen = unorm_next(collationSource->iterator, collationSource->writableBuffer,
1282     (int32_t)collationSource->writableBufSize, UNORM_FCD, 0, TRUE, &wasNormalized, &status);
1283   if(status == U_BUFFER_OVERFLOW_ERROR || normLen == (int32_t)collationSource->writableBufSize) {
1284     // reallocate and terminate
1285     if(!u_growBufferFromStatic(collationSource->stackWritableBuffer,
1286                                &collationSource->writableBuffer,
1287                                (int32_t *)&collationSource->writableBufSize, normLen + 1,
1288                                0)
1289     ) {
1290     #ifdef UCOL_DEBUG
1291         fprintf(stderr, "normalizeIterator(), out of memory\n");
1292     #endif
1293         return;
1294     }
1295     status = U_ZERO_ERROR;
1296     //collationSource->iterator->move(collationSource->iterator, iterIndex, UITER_ZERO);
1297     collationSource->iterator->setState(collationSource->iterator, iterIndex, &status);
1298     normLen = unorm_next(collationSource->iterator, collationSource->writableBuffer,
1299     (int32_t)collationSource->writableBufSize, UNORM_FCD, 0, TRUE, &wasNormalized, &status);
1300   }
1301   // Terminate the buffer - we already checked that it is big enough
1302   collationSource->writableBuffer[normLen] = 0;
1303   if(collationSource->writableBuffer != collationSource->stackWritableBuffer) {
1304       collationSource->flags |= UCOL_ITER_ALLOCATED;
1305   }
1306   collationSource->pos        = collationSource->writableBuffer;
1307   collationSource->origFlags  = collationSource->flags;
1308   collationSource->flags     |= UCOL_ITER_INNORMBUF;
1309   collationSource->flags     &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR);
1310 }*/
1311 
1312 
/* Incremental FCD check and normalize                                                    */
/*   Called from getNextCE when normalization state is suspect.                           */
/*   When entering, the state is known to be this:                                        */
/*      o   We are working in the main buffer of the collIterate, not the side            */
/*          writable buffer.  When in the side buffer, normalization mode is always off,  */
/*          so we won't get here.                                                         */
/*      o   The leading combining class from the current character is 0 or                */
/*          the trailing combining class of the previous char was zero.                   */
/*          True because the previous call to this function will have always exited       */
/*          that way, and we get called for every char where cc might be non-zero.        */
/*   The check starts at the character just fetched (pos - 1).                            */
/*   On exit, collationSource->fcdPosition points just past the last character            */
/*   that was examined and found to belong to the combining sequence.                     */
/*   Returns TRUE if a character in that sequence has a leading combining class           */
/*   lower than the trailing combining class of the character before it.                  */
static
inline UBool collIterFCD(collIterate *collationSource) {
    UChar       c, c2;
    const UChar *srcP, *endP;
    uint8_t     leadingCC;
    uint8_t     prevTrailingCC = 0;
    uint16_t    fcd;
    UBool       needNormalize = FALSE;

    srcP = collationSource->pos-1;

    // endP stays NULL when no length was given; in that case the scan is not
    // bounded by a pointer comparison.
    if (collationSource->flags & UCOL_ITER_HASLEN) {
        endP = collationSource->endp;
    } else {
        endP = NULL;
    }

    // Get the trailing combining class of the current character.  If it's zero,
    //   we are OK.
    c = *srcP++;
    /* trie access */
    fcd = unorm_getFCD16(fcdTrieIndex, c);
    if (fcd != 0) {
        if (U16_IS_LEAD(c)) {
            // Lead surrogate: the real value needs the trail surrogate as well.
            // An unpaired lead surrogate is given an fcd value of 0.
            if ((endP == NULL || srcP != endP) && U16_IS_TRAIL(c2=*srcP)) {
                ++srcP;
                fcd = unorm_getFCD16FromSurrogatePair(fcdTrieIndex, fcd, c2);
            } else {
                fcd = 0;
            }
        }

        // The low byte of the fcd value is used as the trailing combining class.
        prevTrailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);

        if (prevTrailingCC != 0) {
            // The current char has a non-zero trailing CC.  Scan forward until we find
            //   a char with a leading cc of zero.
            while (endP == NULL || srcP != endP)
            {
                // Remember where this character starts, so we can back up over it.
                const UChar *savedSrcP = srcP;

                c = *srcP++;
                /* trie access */
                fcd = unorm_getFCD16(fcdTrieIndex, c);
                if (fcd != 0 && U16_IS_LEAD(c)) {
                    if ((endP == NULL || srcP != endP) && U16_IS_TRAIL(c2=*srcP)) {
                        ++srcP;
                        fcd = unorm_getFCD16FromSurrogatePair(fcdTrieIndex, fcd, c2);
                    } else {
                        fcd = 0;
                    }
                }
                // The high byte of the fcd value is used as the leading combining class.
                leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_);
                if (leadingCC == 0) {
                    srcP = savedSrcP;      // Hit char that is not part of combining sequence.
                                           //   back up over it.  (Could be surrogate pair!)
                    break;
                }

                // Combining classes out of order: the text has to be normalized.
                // Keep scanning anyway, to find the end of the sequence.
                if (leadingCC < prevTrailingCC) {
                    needNormalize = TRUE;
                }

                prevTrailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);
            }
        }
    }

    collationSource->fcdPosition = (UChar *)srcP;

    return needNormalize;
}
1395 
1396 /****************************************************************************/
1397 /* Following are the CE retrieval functions                                 */
1398 /*                                                                          */
1399 /****************************************************************************/
1400 
/* Forward declarations of file-local helpers; their definitions are not in this part of the file. */
static uint32_t getImplicit(UChar32 cp, collIterate *collationSource);
static uint32_t getPrevImplicit(UChar32 cp, collIterate *collationSource);
1403 
/* there should be a macro version of this function in the header file */
/* This is the first function that tries to fetch a collation element  */
/* If it's not successful or it encounters a more difficult situation  */
/* some more sophisticated and slower functions are invoked            */
1408 static
ucol_IGetNextCE(const UCollator * coll,collIterate * collationSource,UErrorCode * status)1409 inline uint32_t ucol_IGetNextCE(const UCollator *coll, collIterate *collationSource, UErrorCode *status) {
1410     uint32_t order = 0;
1411     if (collationSource->CEpos > collationSource->toReturn) {       /* Are there any CEs from previous expansions? */
1412       order = *(collationSource->toReturn++);                         /* if so, return them */
1413       if(collationSource->CEpos == collationSource->toReturn) {
1414         collationSource->CEpos = collationSource->toReturn = collationSource->CEs;
1415       }
1416       return order;
1417     }
1418 
1419     UChar ch = 0;
1420 
1421     for (;;)                           /* Loop handles case when incremental normalize switches   */
1422     {                                  /*   to or from the side buffer / original string, and we  */
1423                                        /*   need to start again to get the next character.        */
1424 
1425         if ((collationSource->flags & (UCOL_ITER_HASLEN | UCOL_ITER_INNORMBUF | UCOL_ITER_NORM | UCOL_HIRAGANA_Q | UCOL_USE_ITERATOR)) == 0)
1426         {
1427             // The source string is null terminated and we're not working from the side buffer,
1428             //   and we're not normalizing.  This is the fast path.
1429             //   (We can be in the side buffer for Thai pre-vowel reordering even when not normalizing.)
1430             ch = *collationSource->pos++;
1431             if (ch != 0) {
1432                 break;
1433             }
1434             else {
1435                 return UCOL_NO_MORE_CES;
1436             }
1437         }
1438 
1439         if (collationSource->flags & UCOL_ITER_HASLEN) {
1440             // Normal path for strings when length is specified.
1441             //   (We can't be in side buffer because it is always null terminated.)
1442             if (collationSource->pos >= collationSource->endp) {
1443                 // Ran off of the end of the main source string.  We're done.
1444                 return UCOL_NO_MORE_CES;
1445             }
1446             ch = *collationSource->pos++;
1447         }
1448         else if(collationSource->flags & UCOL_USE_ITERATOR) {
1449             UChar32 iterCh = collationSource->iterator->next(collationSource->iterator);
1450             if(iterCh == U_SENTINEL) {
1451               return UCOL_NO_MORE_CES;
1452             }
1453             ch = (UChar)iterCh;
1454         }
1455         else
1456         {
1457             // Null terminated string.
1458             ch = *collationSource->pos++;
1459             if (ch == 0) {
1460                 // Ran off end of buffer.
1461                 if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) {
1462                     // Ran off end of main string. backing up one character.
1463                     collationSource->pos--;
1464                     return UCOL_NO_MORE_CES;
1465                 }
1466                 else
1467                 {
1468                     // Hit null in the normalize side buffer.
1469                     // Usually this means the end of the normalized data,
1470                     // except for one odd case: a null followed by combining chars,
1471                     //   which is the case if we are at the start of the buffer.
1472                     if (collationSource->pos == collationSource->writableBuffer+1) {
1473                         break;
1474                     }
1475 
1476                     //  Null marked end of side buffer.
1477                     //   Revert to the main string and
1478                     //   loop back to top to try again to get a character.
1479                     collationSource->pos   = collationSource->fcdPosition;
1480                     collationSource->flags = collationSource->origFlags;
1481                     continue;
1482                 }
1483             }
1484         }
1485 
1486         if(collationSource->flags&UCOL_HIRAGANA_Q) {
1487           if((ch>=0x3040 && ch<=0x3094) || ch == 0x309d || ch == 0x309e) {
1488             collationSource->flags |= UCOL_WAS_HIRAGANA;
1489           } else {
1490             collationSource->flags &= ~UCOL_WAS_HIRAGANA;
1491           }
1492         }
1493 
1494         // We've got a character.  See if there's any fcd and/or normalization stuff to do.
1495         //    Note that UCOL_ITER_NORM flag is always zero when we are in the side buffer.
1496         if ((collationSource->flags & UCOL_ITER_NORM) == 0) {
1497             break;
1498         }
1499 
1500         if (collationSource->fcdPosition >= collationSource->pos) {
1501             // An earlier FCD check has already covered the current character.
1502             // We can go ahead and process this char.
1503             break;
1504         }
1505 
1506         if (ch < ZERO_CC_LIMIT_ ) {
1507             // Fast fcd safe path.  Trailing combining class == 0.  This char is OK.
1508             break;
1509         }
1510 
1511         if (ch < NFC_ZERO_CC_BLOCK_LIMIT_) {
1512             // We need to peek at the next character in order to tell if we are FCD
1513             if ((collationSource->flags & UCOL_ITER_HASLEN) && collationSource->pos >= collationSource->endp) {
1514                 // We are at the last char of source string.
1515                 //  It is always OK for FCD check.
1516                 break;
1517             }
1518 
1519             // Not at last char of source string (or we'll check against terminating null).  Do the FCD fast test
1520             if (*collationSource->pos < NFC_ZERO_CC_BLOCK_LIMIT_) {
1521                 break;
1522             }
1523         }
1524 
1525 
1526         // Need a more complete FCD check and possible normalization.
1527         if (collIterFCD(collationSource)) {
1528             collIterNormalize(collationSource);
1529         }
1530         if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) {
1531             //  No normalization was needed.  Go ahead and process the char we already had.
1532             break;
1533         }
1534 
1535         // Some normalization happened.  Next loop iteration will pick up a char
1536         //   from the normalization buffer.
1537 
1538     }   // end for (;;)
1539 
1540 
1541       if (ch <= 0xFF) {
1542           /*  For latin-1 characters we never need to fall back to the UCA table        */
1543           /*    because all of the UCA data is replicated in the latinOneMapping array  */
1544           order = coll->latinOneMapping[ch];
1545           if (order > UCOL_NOT_FOUND) {
1546               order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, status);
1547           }
1548       }
1549       else
1550       {
1551           order = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
1552           if(order > UCOL_NOT_FOUND) {                                       /* if a CE is special                */
1553               order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, status);    /* and try to get the special CE     */
1554           }
1555           if(order == UCOL_NOT_FOUND && coll->UCA) {   /* We couldn't find a good CE in the tailoring */
1556             /* if we got here, the codepoint MUST be over 0xFF - so we look directly in the trie */
1557             order = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);
1558 
1559             if(order > UCOL_NOT_FOUND) { /* UCA also gives us a special CE */
1560               order = ucol_prv_getSpecialCE(coll->UCA, ch, order, collationSource, status);
1561             }
1562           }
1563       }
1564       if(order == UCOL_NOT_FOUND) {
1565         order = getImplicit(ch, collationSource);
1566       }
1567       return order; /* return the CE */
1568 }
1569 
1570 /* ucol_getNextCE, out-of-line version for use from other files.   */
1571 U_CAPI uint32_t  U_EXPORT2
ucol_getNextCE(const UCollator * coll,collIterate * collationSource,UErrorCode * status)1572 ucol_getNextCE(const UCollator *coll, collIterate *collationSource, UErrorCode *status) {
1573     return ucol_IGetNextCE(coll, collationSource, status);
1574 }
1575 
1576 
1577 /**
1578 * Incremental previous normalization happens here. Pick up the range of chars
1579 * identifed by FCD, normalize it into the collIterate's writable buffer,
1580 * switch the collIterate's state to use the writable buffer.
1581 * @param data collation iterator data
1582 */
1583 static
collPrevIterNormalize(collIterate * data)1584 void collPrevIterNormalize(collIterate *data)
1585 {
1586     UErrorCode status  = U_ZERO_ERROR;
1587     UChar      *pEnd   = data->pos;         /* End normalize + 1 */
1588     UChar      *pStart;
1589     uint32_t    normLen;
1590     UChar      *pStartNorm;
1591 
1592     /* Start normalize */
1593     if (data->fcdPosition == NULL) {
1594         pStart = data->string;
1595     }
1596     else {
1597         pStart = data->fcdPosition + 1;
1598     }
1599 
1600     normLen = unorm_normalize(pStart, (pEnd - pStart) + 1, UNORM_NFD, 0,
1601                               data->writableBuffer, 0, &status);
1602 
1603     if (data->writableBufSize <= normLen) {
1604             freeHeapWritableBuffer(data);
1605             data->writableBuffer = (UChar *)uprv_malloc((normLen + 1) *
1606                                                         sizeof(UChar));
1607             if(data->writableBuffer == NULL) { // something is wrong here, return
1608               return;
1609             }
1610             data->flags |= UCOL_ITER_ALLOCATED;
1611             /* to handle the zero termination */
1612             data->writableBufSize = normLen + 1;
1613     }
1614             status = U_ZERO_ERROR;
1615     /*
1616     this puts the null termination infront of the normalized string instead
1617     of the end
1618     */
1619     pStartNorm = data->writableBuffer + (data->writableBufSize - normLen);
1620     *(pStartNorm - 1) = 0;
1621     unorm_normalize(pStart, (pEnd - pStart) + 1, UNORM_NFD, 0, pStartNorm,
1622                     normLen, &status);
1623 
1624     data->pos        = data->writableBuffer + data->writableBufSize;
1625     data->origFlags  = data->flags;
1626     data->flags     |= UCOL_ITER_INNORMBUF;
1627     data->flags     &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
1628 }
1629 
1630 
1631 /**
1632 * Incremental FCD check for previous iteration and normalize. Called from
1633 * getPrevCE when normalization state is suspect.
1634 * When entering, the state is known to be this:
1635 * o  We are working in the main buffer of the collIterate, not the side
1636 *    writable buffer. When in the side buffer, normalization mode is always
1637 *    off, so we won't get here.
1638 * o  The leading combining class from the current character is 0 or the
1639 *    trailing combining class of the previous char was zero.
1640 *    True because the previous call to this function will have always exited
1641 *    that way, and we get called for every char where cc might be non-zero.
1642 * @param data collation iterate struct
1643 * @return normalization status, TRUE for normalization to be done, FALSE
1644 *         otherwise
1645 */
1646 static
collPrevIterFCD(collIterate * data)1647 inline UBool collPrevIterFCD(collIterate *data)
1648 {
1649     const UChar *src, *start;
1650     UChar       c, c2;
1651     uint8_t     leadingCC;
1652     uint8_t     trailingCC = 0;
1653     uint16_t    fcd;
1654     UBool       result = FALSE;
1655 
1656     start = data->string;
1657     src = data->pos + 1;
1658 
1659     /* Get the trailing combining class of the current character. */
1660     c = *--src;
1661     if (!U16_IS_SURROGATE(c)) {
1662         fcd = unorm_getFCD16(fcdTrieIndex, c);
1663     } else if (U16_IS_TRAIL(c) && start < src && U16_IS_LEAD(c2 = *(src - 1))) {
1664         --src;
1665         fcd = unorm_getFCD16(fcdTrieIndex, c2);
1666         if (fcd != 0) {
1667             fcd = unorm_getFCD16FromSurrogatePair(fcdTrieIndex, fcd, c);
1668         }
1669     } else /* unpaired surrogate */ {
1670         fcd = 0;
1671     }
1672 
1673     leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_);
1674 
1675     if (leadingCC != 0) {
1676         /*
1677         The current char has a non-zero leading combining class.
1678         Scan backward until we find a char with a trailing cc of zero.
1679         */
1680         for (;;)
1681         {
1682             if (start == src) {
1683                 data->fcdPosition = NULL;
1684                 return result;
1685             }
1686 
1687             c = *--src;
1688             if (!U16_IS_SURROGATE(c)) {
1689                 fcd = unorm_getFCD16(fcdTrieIndex, c);
1690             } else if (U16_IS_TRAIL(c) && start < src && U16_IS_LEAD(c2 = *(src - 1))) {
1691                 --src;
1692                 fcd = unorm_getFCD16(fcdTrieIndex, c2);
1693                 if (fcd != 0) {
1694                     fcd = unorm_getFCD16FromSurrogatePair(fcdTrieIndex, fcd, c);
1695                 }
1696             } else /* unpaired surrogate */ {
1697                 fcd = 0;
1698             }
1699 
1700             trailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);
1701 
1702             if (trailingCC == 0) {
1703                 break;
1704             }
1705 
1706             if (leadingCC < trailingCC) {
1707                 result = TRUE;
1708             }
1709 
1710             leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_);
1711         }
1712     }
1713 
1714     data->fcdPosition = (UChar *)src;
1715 
1716     return result;
1717 }
1718 
1719 /** gets a character from the string at a given offset
1720  *  Handles both normal and iterative cases.
1721  *  No error checking - caller beware!
1722  */
1723 inline static
peekCharacter(collIterate * source,int32_t offset)1724 UChar peekCharacter(collIterate *source, int32_t offset) {
1725   if(source->pos != NULL) {
1726     return *(source->pos + offset);
1727   } else if(source->iterator != NULL) {
1728     if(offset != 0) {
1729       source->iterator->move(source->iterator, offset, UITER_CURRENT);
1730       UChar toReturn = (UChar)source->iterator->next(source->iterator);
1731       source->iterator->move(source->iterator, -offset-1, UITER_CURRENT);
1732       return toReturn;
1733     } else {
1734       return (UChar)source->iterator->current(source->iterator);
1735     }
1736   } else {
1737     return (UChar)U_SENTINEL;
1738   }
1739 }
1740 
1741 /**
1742 * Determines if we are at the start of the data string in the backwards
1743 * collation iterator
1744 * @param data collation iterator
1745 * @return TRUE if we are at the start
1746 */
1747 static
isAtStartPrevIterate(collIterate * data)1748 inline UBool isAtStartPrevIterate(collIterate *data) {
1749   if(data->pos == NULL && data->iterator != NULL) {
1750     return !data->iterator->hasPrevious(data->iterator);
1751   }
1752   //return (collIter_bos(data)) ||
1753   return (data->pos == data->string) ||
1754             ((data->flags & UCOL_ITER_INNORMBUF) &&
1755             *(data->pos - 1) == 0 && data->fcdPosition == NULL);
1756 }
1757 
1758 static
goBackOne(collIterate * data)1759 inline void goBackOne(collIterate *data) {
1760 # if 0
1761   // somehow, it looks like we need to keep iterator synced up
1762   // at all times, as above.
1763   if(data->pos) {
1764     data->pos--;
1765   }
1766   if(data->iterator) {
1767     data->iterator->previous(data->iterator);
1768   }
1769 #endif
1770   if(data->iterator && (data->flags & UCOL_USE_ITERATOR)) {
1771     data->iterator->previous(data->iterator);
1772   }
1773   if(data->pos) {
1774     data->pos --;
1775   }
1776 }
1777 
1778 /**
1779 * Inline function that gets a simple CE.
1780 * So what it does is that it will first check the expansion buffer. If the
1781 * expansion buffer is not empty, ie the end pointer to the expansion buffer
1782 * is different from the string pointer, we return the collation element at the
1783 * return pointer and decrement it.
1784 * For more complicated CEs it resorts to getComplicatedCE.
1785 * @param coll collator data
1786 * @param data collation iterator struct
1787 * @param status error status
1788 */
1789 static
ucol_IGetPrevCE(const UCollator * coll,collIterate * data,UErrorCode * status)1790 inline uint32_t ucol_IGetPrevCE(const UCollator *coll, collIterate *data,
1791                                UErrorCode *status)
1792 {
1793     uint32_t result = (uint32_t)UCOL_NULLORDER;
1794     if (data->toReturn > data->CEs) {
1795         data->toReturn --;
1796         result = *(data->toReturn);
1797         if (data->CEs == data->toReturn) {
1798             data->CEpos = data->toReturn;
1799         }
1800     }
1801     else {
1802         UChar ch = 0;
1803         /*
1804         Loop handles case when incremental normalize switches to or from the
1805         side buffer / original string, and we need to start again to get the
1806         next character.
1807         */
1808         for (;;) {
1809             if (data->flags & UCOL_ITER_HASLEN) {
1810                 /*
1811                 Normal path for strings when length is specified.
1812                 Not in side buffer because it is always null terminated.
1813                 */
1814                 if (data->pos <= data->string) {
1815                     /* End of the main source string */
1816                     return UCOL_NO_MORE_CES;
1817                 }
1818                 data->pos --;
1819                 ch = *data->pos;
1820             }
1821             // we are using an iterator to go back. Pray for us!
1822             else if (data->flags & UCOL_USE_ITERATOR) {
1823               UChar32 iterCh = data->iterator->previous(data->iterator);
1824               if(iterCh == U_SENTINEL) {
1825                 return UCOL_NO_MORE_CES;
1826               } else {
1827                 ch = (UChar)iterCh;
1828               }
1829             }
1830             else {
1831                 data->pos --;
1832                 ch = *data->pos;
1833                 /* we are in the side buffer. */
1834                 if (ch == 0) {
1835                     /*
1836                     At the start of the normalize side buffer.
1837                     Go back to string.
1838                     Because pointer points to the last accessed character,
1839                     hence we have to increment it by one here.
1840                     */
1841                     if (data->fcdPosition == NULL) {
1842                         data->pos = data->string;
1843                         return UCOL_NO_MORE_CES;
1844                     }
1845                     else {
1846                         data->pos   = data->fcdPosition + 1;
1847                     }
1848                     data->flags = data->origFlags;
1849                     continue;
1850                 }
1851             }
1852 
1853             if(data->flags&UCOL_HIRAGANA_Q) {
1854               if(ch>=0x3040 && ch<=0x309f) {
1855                 data->flags |= UCOL_WAS_HIRAGANA;
1856               } else {
1857                 data->flags &= ~UCOL_WAS_HIRAGANA;
1858               }
1859             }
1860 
1861             /*
1862             * got a character to determine if there's fcd and/or normalization
1863             * stuff to do.
1864             * if the current character is not fcd.
1865             * if current character is at the start of the string
1866             * Trailing combining class == 0.
1867             * Note if pos is in the writablebuffer, norm is always 0
1868             */
1869             if (ch < ZERO_CC_LIMIT_ ||
1870               // this should propel us out of the loop in the iterator case
1871                 (data->flags & UCOL_ITER_NORM) == 0 ||
1872                 (data->fcdPosition != NULL && data->fcdPosition <= data->pos)
1873                 || data->string == data->pos) {
1874                 break;
1875             }
1876 
1877             if (ch < NFC_ZERO_CC_BLOCK_LIMIT_) {
1878                 /* if next character is FCD */
1879                 if (data->pos == data->string) {
1880                     /* First char of string is always OK for FCD check */
1881                     break;
1882                 }
1883 
1884                 /* Not first char of string, do the FCD fast test */
1885                 if (*(data->pos - 1) < NFC_ZERO_CC_BLOCK_LIMIT_) {
1886                     break;
1887                 }
1888             }
1889 
1890             /* Need a more complete FCD check and possible normalization. */
1891             if (collPrevIterFCD(data)) {
1892                 collPrevIterNormalize(data);
1893             }
1894 
1895             if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
1896                 /*  No normalization. Go ahead and process the char. */
1897                 break;
1898             }
1899 
1900             /*
1901             Some normalization happened.
1902             Next loop picks up a char from the normalization buffer.
1903             */
1904         }
1905 
1906         /* attempt to handle contractions, after removal of the backwards
1907         contraction
1908         */
1909         if (ucol_contractionEndCP(ch, coll) && !isAtStartPrevIterate(data)) {
1910           result = ucol_prv_getSpecialPrevCE(coll, ch, UCOL_CONTRACTION, data, status);
1911         } else {
1912           if (ch <= 0xFF) {
1913             result = coll->latinOneMapping[ch];
1914           }
1915           else {
1916             result = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
1917           }
1918           if (result > UCOL_NOT_FOUND) {
1919             result = ucol_prv_getSpecialPrevCE(coll, ch, result, data, status);
1920           }
1921           if (result == UCOL_NOT_FOUND) { // Not found in master list
1922             if (!isAtStartPrevIterate(data) &&
1923               ucol_contractionEndCP(ch, data->coll)) {
1924                 result = UCOL_CONTRACTION;
1925             } else {
1926               if(coll->UCA) {
1927                 result = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);
1928               }
1929             }
1930 
1931             if (result > UCOL_NOT_FOUND) {
1932               if(coll->UCA) {
1933                 result = ucol_prv_getSpecialPrevCE(coll->UCA, ch, result, data, status);
1934               }
1935             }
1936           }
1937         }
1938         if(result == UCOL_NOT_FOUND) {
1939           result = getPrevImplicit(ch, data);
1940         }
1941     }
1942     return result;
1943 }
1944 
1945 
1946 /*   ucol_getPrevCE, out-of-line version for use from other files.  */
1947 U_CFUNC uint32_t  U_EXPORT2
ucol_getPrevCE(const UCollator * coll,collIterate * data,UErrorCode * status)1948 ucol_getPrevCE(const UCollator *coll, collIterate *data,
1949                         UErrorCode *status) {
1950     return ucol_IGetPrevCE(coll, data, status);
1951 }
1952 
1953 
1954 /* this should be connected to special Jamo handling */
1955 U_CFUNC uint32_t  U_EXPORT2
ucol_getFirstCE(const UCollator * coll,UChar u,UErrorCode * status)1956 ucol_getFirstCE(const UCollator *coll, UChar u, UErrorCode *status) {
1957   collIterate colIt;
1958   uint32_t order;
1959   IInit_collIterate(coll, &u, 1, &colIt);
1960   order = ucol_IGetNextCE(coll, &colIt, status);
1961   /*UCOL_GETNEXTCE(order, coll, colIt, status);*/
1962   return order;
1963 }
1964 
1965 /**
1966 * Inserts the argument character into the end of the buffer pushing back the
1967 * null terminator.
1968 * @param data collIterate struct data
1969 * @param pNull pointer to the null termination
1970 * @param ch character to be appended
1971 * @return the position of the new addition
1972 */
1973 static
insertBufferEnd(collIterate * data,UChar * pNull,UChar ch)1974 inline UChar * insertBufferEnd(collIterate *data, UChar *pNull, UChar ch)
1975 {
1976           uint32_t  size    = data->writableBufSize;
1977           UChar    *newbuffer;
1978     const uint32_t  incsize = 5;
1979 
1980     if ((data->writableBuffer + size) > (pNull + 1)) {
1981         *pNull = ch;
1982         *(pNull + 1) = 0;
1983         return pNull;
1984     }
1985 
1986     /*
1987     buffer will always be null terminated at the end.
1988     giving extra space since it is likely that more characters will be added.
1989     */
1990     size += incsize;
1991     newbuffer = (UChar *)uprv_malloc(sizeof(UChar) * size);
1992     if(newbuffer != NULL) { // something wrong, but no status
1993       uprv_memcpy(newbuffer, data->writableBuffer,
1994                   data->writableBufSize * sizeof(UChar));
1995 
1996       freeHeapWritableBuffer(data);
1997       data->writableBufSize = size;
1998       data->writableBuffer  = newbuffer;
1999 
2000       newbuffer        = newbuffer + data->writableBufSize;
2001       *newbuffer       = ch;
2002       *(newbuffer + 1) = 0;
2003     }
2004     return newbuffer;
2005 }
2006 
2007 /**
2008 * Inserts the argument string into the end of the buffer pushing back the
2009 * null terminator.
2010 * @param data collIterate struct data
2011 * @param pNull pointer to the null termination
2012 * @param string to be appended
2013 * @param length of the string to be appended
2014 * @return the position of the new addition
2015 */
2016 static
insertBufferEnd(collIterate * data,UChar * pNull,UChar * str,int32_t length)2017 inline UChar * insertBufferEnd(collIterate *data, UChar *pNull, UChar *str,
2018                                int32_t length)
2019 {
2020     uint32_t  size = pNull - data->writableBuffer;
2021     UChar    *newbuffer;
2022 
2023     if (data->writableBuffer + data->writableBufSize > pNull + length + 1) {
2024         uprv_memcpy(pNull, str, length * sizeof(UChar));
2025         *(pNull + length) = 0;
2026         return pNull;
2027     }
2028 
2029     /*
2030     buffer will always be null terminated at the end.
2031     giving extra space since it is likely that more characters will be added.
2032     */
2033     newbuffer = (UChar *)uprv_malloc(sizeof(UChar) * (size + length + 1));
2034     if(newbuffer != NULL) {
2035       uprv_memcpy(newbuffer, data->writableBuffer, size * sizeof(UChar));
2036       uprv_memcpy(newbuffer + size, str, length * sizeof(UChar));
2037 
2038       freeHeapWritableBuffer(data);
2039       data->writableBufSize = size + length + 1;
2040       data->writableBuffer  = newbuffer;
2041     }
2042 
2043     return newbuffer;
2044 }
2045 
2046 /**
2047 * Special normalization function for contraction in the forwards iterator.
2048 * This normalization sequence will place the current character at source->pos
2049 * and its following normalized sequence into the buffer.
2050 * The fcd position, pos will be changed.
2051 * pos will now point to positions in the buffer.
2052 * Flags will be changed accordingly.
2053 * @param data collation iterator data
2054 */
2055 static
normalizeNextContraction(collIterate * data)2056 inline void normalizeNextContraction(collIterate *data)
2057 {
2058     UChar      *buffer     = data->writableBuffer;
2059     uint32_t    buffersize = data->writableBufSize;
2060     uint32_t    strsize;
2061     UErrorCode  status     = U_ZERO_ERROR;
2062     /* because the pointer points to the next character */
2063     UChar      *pStart     = data->pos - 1;
2064     UChar      *pEnd;
2065     uint32_t    normLen;
2066     UChar      *pStartNorm;
2067 
2068     if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
2069         *data->writableBuffer = *(pStart - 1);
2070         strsize               = 1;
2071     }
2072     else {
2073         strsize = u_strlen(data->writableBuffer);
2074     }
2075 
2076     pEnd = data->fcdPosition;
2077 
2078     normLen = unorm_normalize(pStart, pEnd - pStart, UNORM_NFD, 0, buffer, 0,
2079                               &status);
2080 
2081     if (buffersize <= normLen + strsize) {
2082         uint32_t  size = strsize + normLen + 1;
2083         UChar    *temp = (UChar *)uprv_malloc(size * sizeof(UChar));
2084         if(temp != NULL) {
2085           uprv_memcpy(temp, buffer, sizeof(UChar) * strsize);
2086           freeHeapWritableBuffer(data);
2087           data->writableBuffer = temp;
2088           data->writableBufSize = size;
2089           data->flags |= UCOL_ITER_ALLOCATED;
2090         }
2091     }
2092 
2093     status            = U_ZERO_ERROR;
2094     pStartNorm        = buffer + strsize;
2095     /* null-termination will be added here */
2096     unorm_normalize(pStart, pEnd - pStart, UNORM_NFD, 0, pStartNorm,
2097                     normLen + 1, &status);
2098 
2099     data->pos        = data->writableBuffer + strsize;
2100     data->origFlags  = data->flags;
2101     data->flags     |= UCOL_ITER_INNORMBUF;
2102     data->flags     &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
2103 }
2104 
2105 /**
2106 * Contraction character management function that returns the next character
2107 * for the forwards iterator.
2108 * Does nothing if the next character is in buffer and not the first character
2109 * in it.
2110 * Else it checks next character in data string to see if it is normalizable.
2111 * If it is not, the character is simply copied into the buffer, else
2112 * the whole normalized substring is copied into the buffer, including the
2113 * current character.
2114 * @param data collation element iterator data
2115 * @return next character
2116 */
2117 static
getNextNormalizedChar(collIterate * data)2118 inline UChar getNextNormalizedChar(collIterate *data)
2119 {
2120     UChar  nextch;
2121     UChar  ch;
2122     // Here we need to add the iterator code. One problem is the way
2123     // end of string is handled. If we just return next char, it could
2124     // be the sentinel. Most of the cases already check for this, but we
2125     // need to be sure.
2126     if ((data->flags & (UCOL_ITER_NORM | UCOL_ITER_INNORMBUF)) == 0 ) {
2127          /* if no normalization and not in buffer. */
2128       if(data->flags & UCOL_USE_ITERATOR) {
2129          return (UChar)data->iterator->next(data->iterator);
2130       } else {
2131          return *(data->pos ++);
2132       }
2133     }
2134 
2135     //if (data->flags & UCOL_ITER_NORM && data->flags & UCOL_USE_ITERATOR) {
2136       //normalizeIterator(data);
2137     //}
2138 
2139     UChar  *pEndWritableBuffer = NULL;
2140     UBool  innormbuf = (UBool)(data->flags & UCOL_ITER_INNORMBUF);
2141     if ((innormbuf && *data->pos != 0) ||
2142         (data->fcdPosition != NULL && !innormbuf &&
2143         data->pos < data->fcdPosition)) {
2144         /*
2145         if next character is in normalized buffer, no further normalization
2146         is required
2147         */
2148         return *(data->pos ++);
2149     }
2150 
2151     if (data->flags & UCOL_ITER_HASLEN) {
2152         /* in data string */
2153         if (data->pos + 1 == data->endp) {
2154             return *(data->pos ++);
2155         }
2156     }
2157     else {
2158         if (innormbuf) {
2159           // inside the normalization buffer, but at the end
2160           // (since we encountered zero). This means, in the
2161           // case we're using char iterator, that we need to
2162           // do another round of normalization.
2163           //if(data->origFlags & UCOL_USE_ITERATOR) {
2164             // we need to restore original flags,
2165             // otherwise, we'll lose them
2166             //data->flags = data->origFlags;
2167             //normalizeIterator(data);
2168             //return *(data->pos++);
2169           //} else {
2170             /*
2171             in writable buffer, at this point fcdPosition can not be
2172             pointing to the end of the data string. see contracting tag.
2173             */
2174           if(data->fcdPosition) {
2175             if (*(data->fcdPosition + 1) == 0 ||
2176                 data->fcdPosition + 1 == data->endp) {
2177                 /* at the end of the string, dump it into the normalizer */
2178                 data->pos = insertBufferEnd(data, data->pos,
2179                                             *(data->fcdPosition)) + 1;
2180                 return *(data->fcdPosition ++);
2181             }
2182             pEndWritableBuffer = data->pos;
2183             data->pos = data->fcdPosition;
2184           } else if(data->origFlags & UCOL_USE_ITERATOR) {
2185             // if we are here, we're using a normalizing iterator.
2186             // we should just continue further.
2187             data->flags = data->origFlags;
2188             data->pos = NULL;
2189             return (UChar)data->iterator->next(data->iterator);
2190           }
2191           //}
2192         }
2193         else {
2194             if (*(data->pos + 1) == 0) {
2195                 return *(data->pos ++);
2196             }
2197         }
2198     }
2199 
2200     ch = *data->pos ++;
2201     nextch = *data->pos;
2202 
2203     /*
2204     * if the current character is not fcd.
2205     * Trailing combining class == 0.
2206     */
2207     if ((data->fcdPosition == NULL || data->fcdPosition < data->pos) &&
2208         (nextch >= NFC_ZERO_CC_BLOCK_LIMIT_ ||
2209          ch >= NFC_ZERO_CC_BLOCK_LIMIT_)) {
2210             /*
2211             Need a more complete FCD check and possible normalization.
2212             normalize substring will be appended to buffer
2213             */
2214         if (collIterFCD(data)) {
2215             normalizeNextContraction(data);
2216             return *(data->pos ++);
2217         }
2218         else if (innormbuf) {
2219             /* fcdposition shifted even when there's no normalization, if we
2220             don't input the rest into this, we'll get the wrong position when
2221             we reach the end of the writableBuffer */
2222             int32_t length = data->fcdPosition - data->pos + 1;
2223             data->pos = insertBufferEnd(data, pEndWritableBuffer,
2224                                         data->pos - 1, length);
2225             return *(data->pos ++);
2226         }
2227     }
2228 
2229     if (innormbuf) {
2230         /*
2231         no normalization is to be done hence only one character will be
2232         appended to the buffer.
2233         */
2234         data->pos = insertBufferEnd(data, pEndWritableBuffer, ch) + 1;
2235     }
2236 
2237     /* points back to the pos in string */
2238     return ch;
2239 }
2240 
2241 
2242 
2243 /**
2244 * Function to copy the buffer into writableBuffer and sets the fcd position to
2245 * the correct position
2246 * @param source data string source
2247 * @param buffer character buffer
2248 * @param tempdb current position in buffer that has been used up
2249 */
2250 static
setDiscontiguosAttribute(collIterate * source,UChar * buffer,UChar * tempdb)2251 inline void setDiscontiguosAttribute(collIterate *source, UChar *buffer,
2252                                      UChar *tempdb)
2253 {
2254     /* okay confusing part here. to ensure that the skipped characters are
2255     considered later, we need to place it in the appropriate position in the
2256     normalization buffer and reassign the pos pointer. simple case if pos
2257     reside in string, simply copy to normalization buffer and
2258     fcdposition = pos, pos = start of normalization buffer. if pos in
2259     normalization buffer, we'll insert the copy infront of pos and point pos
2260     to the start of the normalization buffer. why am i doing these copies?
2261     well, so that the whole chunk of codes in the getNextCE, ucol_prv_getSpecialCE does
2262     not require any changes, which be really painful. */
2263     uint32_t length = u_strlen(buffer);;
2264     if (source->flags & UCOL_ITER_INNORMBUF) {
2265         u_strcpy(tempdb, source->pos);
2266     }
2267     else {
2268         source->fcdPosition  = source->pos;
2269         source->origFlags    = source->flags;
2270         source->flags       |= UCOL_ITER_INNORMBUF;
2271         source->flags       &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR);
2272     }
2273 
2274     if (length >= source->writableBufSize) {
2275         freeHeapWritableBuffer(source);
2276         source->writableBuffer =
2277                      (UChar *)uprv_malloc((length + 1) * sizeof(UChar));
2278         if(source->writableBuffer == NULL) {
2279           return;
2280         }
2281         source->writableBufSize = length;
2282     }
2283 
2284     u_strcpy(source->writableBuffer, buffer);
2285     source->pos = source->writableBuffer;
2286 }
2287 
2288 /**
2289 * Function to get the discontiguous collation element within the source.
2290 * Note this function will set the position to the appropriate places.
*
* The function scans forward from the current position for as long as the
* upcoming characters have a non-zero combining class, trying to match each
* one against the (sorted) contraction table entry that starts at constart.
* Characters that do not take part in the contraction are collected in a
* local buffer; on a successful match that buffer is handed to
* setDiscontiguosAttribute() so iteration continues over the skipped
* characters. When nothing matches, the iterator state saved on entry is
* restored and the CE stored for constart itself is returned.
*
2291 * @param coll current collator used
2292 * @param source data string source
2293 * @param constart index to the start character in the contraction table
2294 * @return discontiguous collation element offset
2295 */
2296 static
getDiscontiguous(const UCollator * coll,collIterate * source,const UChar * constart)2297 uint32_t getDiscontiguous(const UCollator *coll, collIterate *source,
2298                                 const UChar *constart)
2299 {
2300     /* source->pos currently points to the second combining character after
2301        the start character */
2302           UChar   *temppos      = source->pos;
    /* NOTE(review): fixed-size scratch buffer; tempdb is advanced below
       without any bounds check against 4*UCOL_MAX_BUFFER - confirm that the
       run of combining characters is bounded by the callers. */
2303           UChar    buffer[4*UCOL_MAX_BUFFER];
2304           UChar   *tempdb       = buffer;
2305     const UChar   *tempconstart = constart;
2306           uint8_t  tempflags    = source->flags;
2307           UBool    multicontraction = FALSE;
2308           UChar   *tempbufferpos = 0;
2309           collIterateState discState;
2310 
    /* remember where we are so that a failed search can be undone */
2311           backupState(source, &discState);
2312 
    /* seed the scratch buffer with the character just before the current
       position */
2313     //*tempdb = *(source->pos - 1);
2314     *tempdb = peekCharacter(source, -1);
2315     tempdb++;
2316     for (;;) {
2317         UChar    *UCharOffset;
2318         UChar     schar,
2319                   tchar;
2320         uint32_t  result;
2321 
        /* Stop scanning when the source is exhausted or when the upcoming
           character has combining class 0. */
2322         if (((source->flags & UCOL_ITER_HASLEN) && source->pos >= source->endp)
2323             || (peekCharacter(source, 0) == 0  &&
2324             //|| (*source->pos == 0  &&
2325                 ((source->flags & UCOL_ITER_INNORMBUF) == 0 ||
2326                  source->fcdPosition == NULL ||
2327                  source->fcdPosition == source->endp ||
2328                  *(source->fcdPosition) == 0 ||
2329                  u_getCombiningClass(*(source->fcdPosition)) == 0)) ||
2330                  /* end of string in null terminated string or stopped by a
2331                  null character, note fcd does not always point to a base
2332                  character after the discontiguous change */
2333                  u_getCombiningClass(peekCharacter(source, 0)) == 0) {
2334                  //u_getCombiningClass(*(source->pos)) == 0) {
2335             //constart = (UChar *)coll->image + getContractOffset(CE);
2336             if (multicontraction) {
                /* A shorter contraction was matched earlier: cut the scratch
                   buffer back to the length recorded at that point, rewind
                   the position and return the CE of the last table reached. */
2337                 *tempbufferpos = 0;
2338                 source->pos    = temppos - 1;
2339                 setDiscontiguosAttribute(source, buffer, tempdb);
2340                 return *(coll->contractionCEs +
2341                                     (tempconstart - coll->contractionIndex));
2342             }
2343             constart = tempconstart;
2344             break;
2345         }
2346 
2347         UCharOffset = (UChar *)(tempconstart + 1); /* skip the backward offset*/
2348         schar = getNextNormalizedChar(source);
2349 
        /* linear search; the loop relies on the table entries being in
           ascending order */
2350         while (schar > (tchar = *UCharOffset)) {
2351             UCharOffset++;
2352         }
2353 
2354         if (schar != tchar) {
2355             /* not the correct codepoint. we stuff the current codepoint into
2356             the discontiguous buffer and try the next character */
2357             *tempdb = schar;
2358             tempdb ++;
2359             continue;
2360         }
2361         else {
            /* The character is in the table, but if it has the same
               combining class as the character two positions back it is
               not accepted as part of the contraction; it is stored as a
               skipped character instead. */
2362             if (u_getCombiningClass(schar) ==
2363                 u_getCombiningClass(peekCharacter(source, -2))) {
2364                 //u_getCombiningClass(*(source->pos - 2))) {
2365                 *tempdb = schar;
2366                 tempdb ++;
2367                 continue;
2368             }
2369             result = *(coll->contractionCEs +
2370                                       (UCharOffset - coll->contractionIndex));
2371         }
        /* terminate the skipped-character string collected so far */
2372         *tempdb = 0;
2373 
2374         if (result == UCOL_NOT_FOUND) {
2375           break;
2376         } else if (isContraction(result)) {
2377             /* this is a multi-contraction*/
2378             tempconstart = (UChar *)coll->image + getContractOffset(result);
            /* NOTE(review): the test below reads the CE stored for the
               original constart, not for the new tempconstart - confirm
               this is intended. When it is set, the current position and
               buffer length are recorded as the fallback result. */
2379             if (*(coll->contractionCEs + (constart - coll->contractionIndex))
2380                 != UCOL_NOT_FOUND) {
2381                 multicontraction = TRUE;
2382                 temppos       = source->pos + 1;
2383                 tempbufferpos = buffer + u_strlen(buffer);
2384             }
2385         } else {
            /* complete match: continue iteration over the skipped characters */
2386             setDiscontiguosAttribute(source, buffer, tempdb);
2387             return result;
2388         }
2389     }
2390 
2391     /* no problems simply reverting just like that,
2392     if we are in string before getting into this function, points back to
2393     string hence no problem.
2394     if we are in normalization buffer before getting into this function,
2395     since we'll never use another normalization within this function, we
2396     know that fcdposition points to a base character. the normalization buffer
2397     never change, hence this revert works. */
2398     loadState(source, &discState, TRUE);
2399     goBackOne(source);
2400 
2401     //source->pos   = temppos - 1;
2402     source->flags = tempflags;
2403     return *(coll->contractionCEs + (constart - coll->contractionIndex));
2404 }
2405 
2405 
2406 static
isNonChar(UChar32 cp)2407 inline UBool isNonChar(UChar32 cp) {
2408   if ((cp & 0xFFFE) == 0xFFFE || (0xFDD0 <= cp && cp <= 0xFDEF) || (0xD800 <= cp && cp <= 0xDFFF)) {
2409     return TRUE;
2410   }
2411   return FALSE;
2412 }
2413 
2414 /* now uses Mark's getImplicitPrimary code */
2415 static
getImplicit(UChar32 cp,collIterate * collationSource)2416 inline uint32_t getImplicit(UChar32 cp, collIterate *collationSource) {
2417   if(isNonChar(cp)) {
2418     return 0;
2419   }
2420   uint32_t r = uprv_uca_getImplicitPrimary(cp);
2421   *(collationSource->CEpos++) = ((r & 0x0000FFFF)<<16) | 0x000000C0;
2422   return (r & UCOL_PRIMARYMASK) | 0x00000505; // This was 'order'
2423 }
2424 
2425 /**
2426 * Inserts the argument character into the front of the buffer replacing the
2427 * front null terminator.
2428 * @param data collation element iterator data
2429 * @param pNull pointer to the null terminator
2430 * @param ch character to be appended
2431 * @return positon of added character
2432 */
2433 static
insertBufferFront(collIterate * data,UChar * pNull,UChar ch)2434 inline UChar * insertBufferFront(collIterate *data, UChar *pNull, UChar ch)
2435 {
2436           uint32_t  size    = data->writableBufSize;
2437           UChar    *end;
2438           UChar    *newbuffer;
2439     const uint32_t  incsize = 5;
2440 
2441     if (pNull > data->writableBuffer + 1) {
2442         *pNull       = ch;
2443         *(pNull - 1) = 0;
2444         return pNull;
2445     }
2446 
2447     /*
2448     buffer will always be null terminated infront.
2449     giving extra space since it is likely that more characters will be added.
2450     */
2451     size += incsize;
2452     newbuffer = (UChar *)uprv_malloc(sizeof(UChar) * size);
2453     if(newbuffer == NULL) {
2454       return NULL;
2455     }
2456     end = newbuffer + incsize;
2457     uprv_memcpy(end, data->writableBuffer,
2458                 data->writableBufSize * sizeof(UChar));
2459     *end       = ch;
2460     *(end - 1) = 0;
2461 
2462     freeHeapWritableBuffer(data);
2463 
2464     data->writableBufSize = size;
2465     data->writableBuffer  = newbuffer;
2466     return end;
2467 }
2468 
2469 /**
2470 * Special normalization function for contraction in the previous iterator.
2471 * This normalization sequence will place the current character at source->pos
2472 * and its following normalized sequence into the buffer.
2473 * The fcd position, pos will be changed.
2474 * pos will now point to positions in the buffer.
2475 * Flags will be changed accordingly.
2476 * @param data collation iterator data
2477 */
2478 static
normalizePrevContraction(collIterate * data,UErrorCode * status)2479 inline void normalizePrevContraction(collIterate *data, UErrorCode *status)
2480 {
2481     UChar      *buffer     = data->writableBuffer;
2482     uint32_t    buffersize = data->writableBufSize;
2483     uint32_t    nulltermsize;
2484     UErrorCode  localstatus = U_ZERO_ERROR;
2485     UChar      *pEnd       = data->pos + 1;         /* End normalize + 1 */
2486     UChar      *pStart;
2487     uint32_t    normLen;
2488     UChar      *pStartNorm;
2489 
2490     if (data->flags & UCOL_ITER_HASLEN) {
2491         /*
2492         normalization buffer not used yet, we'll pull down the next
2493         character into the end of the buffer
2494         */
2495         *(buffer + (buffersize - 1)) = *(data->pos + 1);
2496         nulltermsize                  = buffersize - 1;
2497     }
2498     else {
2499         nulltermsize = buffersize;
2500         UChar *temp = buffer + (nulltermsize - 1);
2501         while (*(temp --) != 0) {
2502             nulltermsize --;
2503         }
2504     }
2505 
2506     /* Start normalize */
2507     if (data->fcdPosition == NULL) {
2508         pStart = data->string;
2509     }
2510     else {
2511         pStart = data->fcdPosition + 1;
2512     }
2513 
2514     normLen = unorm_normalize(pStart, pEnd - pStart, UNORM_NFD, 0, buffer, 0,
2515                               &localstatus);
2516 
2517     if (nulltermsize <= normLen) {
2518         uint32_t  size = buffersize - nulltermsize + normLen + 1;
2519         UChar    *temp = (UChar *)uprv_malloc(size * sizeof(UChar));
2520         if (temp == NULL) {
2521             *status = U_MEMORY_ALLOCATION_ERROR;
2522             return;
2523         }
2524         nulltermsize   = normLen + 1;
2525         uprv_memcpy(temp + normLen, buffer,
2526                     sizeof(UChar) * (buffersize - nulltermsize));
2527         freeHeapWritableBuffer(data);
2528         data->writableBuffer = temp;
2529         data->writableBufSize = size;
2530     }
2531 
2532     /*
2533     this puts the null termination infront of the normalized string instead
2534     of the end
2535     */
2536     pStartNorm   = buffer + (nulltermsize - normLen);
2537     *(pStartNorm - 1) = 0;
2538     unorm_normalize(pStart, pEnd - pStart, UNORM_NFD, 0, pStartNorm, normLen,
2539                     status);
2540 
2541     data->pos        = data->writableBuffer + nulltermsize;
2542     data->origFlags  = data->flags;
2543     data->flags     |= UCOL_ITER_INNORMBUF;
2544     data->flags     &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
2545 }
2546 
2547 /**
2548 * Contraction character management function that returns the previous character
2549 * for the backwards iterator.
2550 * Does nothing if the previous character is in buffer and not the first
2551 * character in it.
2552 * Else it checks previous character in data string to see if it is
2553 * normalizable.
2554 * If it is not, the character is simply copied into the buffer, else
2555 * the whole normalized substring is copied into the buffer, including the
2556 * current character.
2557 * @param data collation element iterator data
* @param status passed on to normalizePrevContraction(); not written directly
*               by this function
2558 * @return previous character
2559 */
2560 static
getPrevNormalizedChar(collIterate * data,UErrorCode * status)2561 inline UChar getPrevNormalizedChar(collIterate *data, UErrorCode *status)
2562 {
2563     UChar  prevch;
2564     UChar  ch;
2565     UChar *start;
2566     UBool  innormbuf = (UBool)(data->flags & UCOL_ITER_INNORMBUF);
2567     UChar *pNull = NULL;
2568     if ((data->flags & (UCOL_ITER_NORM | UCOL_ITER_INNORMBUF)) == 0 ||
2569         (innormbuf && *(data->pos - 1) != 0)) {
2570         /*
2571         if no normalization.
2572         if previous character is in normalized buffer, no further normalization
2573         is required
2574         */
2575       if(data->flags & UCOL_USE_ITERATOR) {
        /* step the character iterator back by one, then read that unit
           with next() */
2576         data->iterator->move(data->iterator, -1, UITER_CURRENT);
2577         return (UChar)data->iterator->next(data->iterator);
2578       } else {
2579         return *(data->pos - 1);
2580       }
2581     }
2582 
2583     start = data->pos;
2584     if (data->flags & UCOL_ITER_HASLEN) {
2585         /* in data string */
2586         if ((start - 1) == data->string) {
            /* previous character is the first one of the string */
2587             return *(start - 1);
2588         }
2589         start --;
2590         ch     = *start;
2591         prevch = *(start - 1);
2592     }
2593     else {
2594         /*
2595         in writable buffer, at this point fcdPosition can not be NULL.
2596         see contracting tag.
2597         */
2598         if (data->fcdPosition == data->string) {
2599             /* at the start of the string, just dump it into the normalizer */
            /* NOTE(review): the return value of insertBufferFront() is
               ignored. That helper returns NULL on allocation failure and
               may switch data->writableBuffer to a new allocation, in which
               case data->pos would still refer to the old buffer - verify. */
2600             insertBufferFront(data, data->pos - 1, *(data->fcdPosition));
2601             data->fcdPosition = NULL;
2602             return *(data->pos - 1);
2603         }
2604         pNull  = data->pos - 1;
2605         start  = data->fcdPosition;
2606         ch     = *start;
2607         prevch = *(start - 1);
2608     }
2609     /*
2610     * if the current character is not fcd.
2611     * Trailing combining class == 0.
2612     */
    /* In the writable-buffer branch above start was set to fcdPosition, so
       the first operand below can only be true after the data-string branch. */
2613     if (data->fcdPosition > start &&
2614        (ch >= NFC_ZERO_CC_BLOCK_LIMIT_ || prevch >= NFC_ZERO_CC_BLOCK_LIMIT_))
2615     {
2616         /*
2617         Need a more complete FCD check and possible normalization.
2618         normalize substring will be appended to buffer
2619         */
2620         UChar *backuppos = data->pos;
2621         data->pos = start;
2622         if (collPrevIterFCD(data)) {
2623             normalizePrevContraction(data, status);
2624             return *(data->pos - 1);
2625         }
        /* FCD check passed: undo the temporary repositioning */
2626         data->pos = backuppos;
2627         data->fcdPosition ++;
2628     }
2629 
2630     if (innormbuf) {
2631     /*
2632     no normalization is to be done hence only one character will be
2633     appended to the buffer.
2634     */
        /* NOTE(review): pNull is only assigned in the writable-buffer
           branch; this presumably relies on UCOL_ITER_INNORMBUF and
           UCOL_ITER_HASLEN never being set together - confirm. The return
           value of insertBufferFront() is ignored here as well. */
2635         insertBufferFront(data, pNull, ch);
2636         data->fcdPosition --;
2637     }
2638 
2639     return ch;
2640 }
2641 
2641 
2642 /* This function handles the special CEs like contractions, expansions, surrogates, Thai */
2643 /* It is called by getNextCE */
2644 
ucol_prv_getSpecialCE(const UCollator * coll,UChar ch,uint32_t CE,collIterate * source,UErrorCode * status)2645 uint32_t ucol_prv_getSpecialCE(const UCollator *coll, UChar ch, uint32_t CE, collIterate *source, UErrorCode *status) {
2646   collIterateState entryState;
2647   backupState(source, &entryState);
2648   UChar32 cp = ch;
2649 
2650   for (;;) {
2651     // This loop will repeat only in the case of contractions, and only when a contraction
2652     //   is found and the first CE resulting from that contraction is itself a special
2653     //   (an expansion, for example.)  All other special CE types are fully handled the
2654     //   first time through, and the loop exits.
2655 
2656     const uint32_t *CEOffset = NULL;
2657     switch(getCETag(CE)) {
2658     case NOT_FOUND_TAG:
2659       /* This one is not found, and we'll let somebody else bother about it... no more games */
2660       return CE;
2661     case SURROGATE_TAG:
2662       /* we encountered a leading surrogate. We shall get the CE by using the following code unit */
2663       /* two things can happen here: next code point can be a trailing surrogate - we will use it */
2664       /* to retrieve the CE, or it is not a trailing surrogate (or the string is done). In that case */
2665       /* we return 0 (completely ignorable - per UCA specification */
2666       {
2667         UChar trail;
2668         collIterateState state;
2669         backupState(source, &state);
2670         if (collIter_eos(source) || !(U16_IS_TRAIL((trail = getNextNormalizedChar(source))))) {
2671           // we chould have stepped one char forward and it might have turned that it
2672           // was not a trail surrogate. In that case, we have to backup.
2673           loadState(source, &state, TRUE);
2674           return 0;
2675         } else {
2676           /* TODO: CE contain the data from the previous CE + the mask. It should at least be unmasked */
2677           CE = UTRIE_GET32_FROM_OFFSET_TRAIL(&coll->mapping, CE&0xFFFFFF, trail);
2678           if(CE == UCOL_NOT_FOUND) { // there are tailored surrogates in this block, but not this one.
2679             // We need to backup
2680             loadState(source, &state, TRUE);
2681             return CE;
2682           }
2683           // calculate the supplementary code point value, if surrogate was not tailored
2684           cp = ((((uint32_t)ch)<<10UL)+(trail)-(((uint32_t)0xd800<<10UL)+0xdc00-0x10000));
2685         }
2686       }
2687       break;
2688     case SPEC_PROC_TAG:
2689       {
2690         // Special processing is getting a CE that is preceded by a certain prefix
2691         // Currently this is only needed for optimizing Japanese length and iteration marks.
2692         // When we encouter a special processing tag, we go backwards and try to see if
2693         // we have a match.
2694         // Contraction tables are used - so the whole process is not unlike contraction.
2695         // prefix data is stored backwards in the table.
2696         const UChar *UCharOffset;
2697         UChar schar, tchar;
2698         collIterateState prefixState;
2699         backupState(source, &prefixState);
2700         loadState(source, &entryState, TRUE);
2701         goBackOne(source); // We want to look at the point where we entered - actually one
2702         // before that...
2703 
2704         for(;;) {
2705         // This loop will run once per source string character, for as long as we
2706         //  are matching a potential contraction sequence
2707 
2708           // First we position ourselves at the begining of contraction sequence
2709           const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE);
2710           if (collIter_bos(source)) {
2711             CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
2712             break;
2713           }
2714           schar = getPrevNormalizedChar(source, status);
2715           goBackOne(source);
2716 
2717           while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
2718             UCharOffset++;
2719           }
2720 
2721           if (schar == tchar) {
2722               // Found the source string char in the table.
2723               //  Pick up the corresponding CE from the table.
2724               CE = *(coll->contractionCEs +
2725                   (UCharOffset - coll->contractionIndex));
2726           }
2727           else
2728           {
2729               // Source string char was not in the table.
2730               //   We have not found the prefix.
2731               CE = *(coll->contractionCEs +
2732                   (ContractionStart - coll->contractionIndex));
2733           }
2734 
2735           if(!isPrefix(CE)) {
2736               // The source string char was in the contraction table, and the corresponding
2737               //   CE is not a prefix CE.  We found the prefix, break
2738               //   out of loop, this CE will end up being returned.  This is the normal
2739               //   way out of prefix handling when the source actually contained
2740               //   the prefix.
2741               break;
2742           }
2743         }
2744         if(CE != UCOL_NOT_FOUND) { // we found something and we can merilly continue
2745           loadState(source, &prefixState, TRUE);
2746           if(source->origFlags & UCOL_USE_ITERATOR) {
2747             source->flags = source->origFlags;
2748           }
2749         } else { // prefix search was a failure, we have to backup all the way to the start
2750           loadState(source, &entryState, TRUE);
2751         }
2752       break;
2753       }
2754     case CONTRACTION_TAG:
2755       {
2756       /* This should handle contractions */
2757       collIterateState state;
2758       backupState(source, &state);
2759       uint32_t firstCE = *(coll->contractionCEs + ((UChar *)coll->image+getContractOffset(CE) - coll->contractionIndex)); //UCOL_NOT_FOUND;
2760       const UChar *UCharOffset;
2761       UChar schar, tchar;
2762 
2763       for (;;) {
2764         /* This loop will run once per source string character, for as long as we     */
2765         /*  are matching a potential contraction sequence                  */
2766 
2767         /* First we position ourselves at the begining of contraction sequence */
2768         const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE);
2769 
2770         if (collIter_eos(source)) {
2771             // Ran off the end of the source string.
2772             CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
2773             // So we'll pick whatever we have at the point...
2774             if (CE == UCOL_NOT_FOUND) {
2775                 // back up the source over all the chars we scanned going into this contraction.
2776                 CE = firstCE;
2777                 loadState(source, &state, TRUE);
2778                 if(source->origFlags & UCOL_USE_ITERATOR) {
2779                     source->flags = source->origFlags;
2780                 }
2781             }
2782             break;
2783         }
2784 
2785         uint8_t maxCC = (uint8_t)(*(UCharOffset)&0xFF); /*get the discontiguos stuff */ /* skip the backward offset, see above */
2786         uint8_t allSame = (uint8_t)(*(UCharOffset++)>>8);
2787 
2788         schar = getNextNormalizedChar(source);
2789         while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
2790           UCharOffset++;
2791         }
2792 
2793         if (schar == tchar) {
2794             // Found the source string char in the contraction table.
2795             //  Pick up the corresponding CE from the table.
2796             CE = *(coll->contractionCEs +
2797                 (UCharOffset - coll->contractionIndex));
2798         }
2799         else
2800         {
2801             // Source string char was not in contraction table.
2802             //   Unless we have a discontiguous contraction, we have finished
2803             //   with this contraction.
2804             UChar32 miss = schar;
2805             if(U16_IS_LEAD(schar)) { // in order to do the proper detection, we
2806               // need to see if we're dealing with a supplementary
2807               miss = U16_GET_SUPPLEMENTARY(schar, getNextNormalizedChar(source));
2808             }
2809 
2810             uint8_t sCC;
2811             if (miss < 0x300 ||
2812                 maxCC == 0 ||
2813                 (sCC = i_getCombiningClass(miss, coll)) == 0 ||
2814                 sCC>maxCC ||
2815                 (allSame != 0 && sCC == maxCC) ||
2816                 collIter_eos(source)) {
2817                     //  Contraction can not be discontiguous.
2818                     goBackOne(source);  // back up the source string by one,
2819                                         //  because  the character we just looked at was
2820                                         //  not part of the contraction.   */
2821                     if(U_IS_SUPPLEMENTARY(miss)) {
2822                       goBackOne(source);
2823                     }
2824                     CE = *(coll->contractionCEs +
2825                         (ContractionStart - coll->contractionIndex));
2826             } else {
2827                 //
2828                 // Contraction is possibly discontiguous.
2829                 //   Scan more of source string looking for a match
2830                 //
2831                 UChar tempchar;
2832                 /* find the next character if schar is not a base character
2833                     and we are not yet at the end of the string */
2834                 tempchar = getNextNormalizedChar(source);
2835                 // probably need another supplementary thingie here
2836                 goBackOne(source);
2837                 if (i_getCombiningClass(tempchar, coll) == 0) {
2838                     goBackOne(source);
2839                     if(U_IS_SUPPLEMENTARY(miss)) {
2840                       goBackOne(source);
2841                     }
2842                     /* Spit out the last char of the string, wasn't tasty enough */
2843                     CE = *(coll->contractionCEs +
2844                         (ContractionStart - coll->contractionIndex));
2845                 } else {
2846                     CE = getDiscontiguous(coll, source, ContractionStart);
2847                 }
2848             }
2849         } // else after if(schar == tchar)
2850 
2851         if(CE == UCOL_NOT_FOUND) {
2852             /* The Source string did not match the contraction that we were checking.  */
2853             /*  Back up the source position to undo the effects of having partially    */
2854             /*   scanned through what ultimately proved to not be a contraction.       */
2855           loadState(source, &state, TRUE);
2856           CE = firstCE;
2857           break;
2858         }
2859 
2860         if(!isContraction(CE)) {
2861             // The source string char was in the contraction table, and the corresponding
2862             //   CE is not a contraction CE.  We completed the contraction, break
2863             //   out of loop, this CE will end up being returned.  This is the normal
2864             //   way out of contraction handling when the source actually contained
2865             //   the contraction.
2866             break;
2867         }
2868 
2869 
2870         // The source string char was in the contraction table, and the corresponding
2871         //   CE is IS  a contraction CE.  We will continue looping to check the source
2872         //   string for the remaining chars in the contraction.
2873         uint32_t tempCE = *(coll->contractionCEs + (ContractionStart - coll->contractionIndex));
2874         if(tempCE != UCOL_NOT_FOUND) {
2875             // We have scanned a a section of source string for which there is a
2876             //  CE from the contraction table.  Remember the CE and scan position, so
2877             //  that we can return to this point if further scanning fails to
2878             //  match a longer contraction sequence.
2879             firstCE = tempCE;
2880 
2881             goBackOne(source);
2882             backupState(source, &state);
2883             getNextNormalizedChar(source);
2884 
2885             // Another way to do this is:
2886             //collIterateState tempState;
2887             //backupState(source, &tempState);
2888             //goBackOne(source);
2889             //backupState(source, &state);
2890             //loadState(source, &tempState, TRUE);
2891 
2892             // The problem is that for incomplete contractions we have to remember the previous
2893             // position. Before, the only thing I needed to do was state.pos--;
2894             // After iterator introduction and especially after introduction of normalizing
2895             // iterators, it became much more difficult to decrease the saved state.
2896             // I'm not yet sure which of the two methods above is faster.
2897         }
2898       } // for(;;)
2899       break;
2900       } // case CONTRACTION_TAG:
2901     case LONG_PRIMARY_TAG:
2902       {
2903         *(source->CEpos++) = ((CE & 0xFF)<<24)|UCOL_CONTINUATION_MARKER;
2904         CE = ((CE & 0xFFFF00) << 8) | (UCOL_BYTE_COMMON << 8) | UCOL_BYTE_COMMON;
2905         return CE;
2906       }
2907     case EXPANSION_TAG:
2908       {
2909       /* This should handle expansion. */
2910       /* NOTE: we can encounter both continuations and expansions in an expansion! */
2911       /* I have to decide where continuations are going to be dealt with */
2912       uint32_t size;
2913       uint32_t i;    /* general counter */
2914       CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */
2915       size = getExpansionCount(CE);
2916       CE = *CEOffset++;
2917       if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */
2918         for(i = 1; i<size; i++) {
2919           *(source->CEpos++) = *CEOffset++;
2920         }
2921       } else { /* else, we do */
2922         while(*CEOffset != 0) {
2923           *(source->CEpos++) = *CEOffset++;
2924         }
2925       }
2926       return CE;
2927       }
2928     case DIGIT_TAG:
2929       {
2930       /*
2931          We do a check to see if we want to collate digits as numbers; if so we generate
2932          a custom collation key. Otherwise we pull out the value stored in the expansion table.
2933       */
2934       //uint32_t size;
2935       uint32_t i;    /* general counter */
2936 
2937       if (source->coll->numericCollation == UCOL_ON){
2938         collIterateState digitState = {0,0,0,0,0,0,0,0};
2939         UChar32 char32 = 0;
2940 
2941         uint32_t digIndx = 0;
2942         uint32_t endIndex = 0;
2943         uint32_t trailingZeroIndex = 0;
2944 
2945         uint32_t primWeight = 0;
2946 
2947         int32_t digVal = 0;
2948         uint8_t collateVal = 0;
2949 
2950         UBool nonZeroValReached = FALSE;
2951 
2952         uint8_t *numTempBuf;
2953         uint8_t stackNumTempBuf[UCOL_MAX_BUFFER]; // I just need a temporary place to store my generated CEs.
2954         uint32_t numTempBufSize = UCOL_MAX_BUFFER;
2955 
2956         numTempBuf = stackNumTempBuf;
2957         /*
2958              We parse the source string until we hit a char that's NOT a digit.
2959             Use this u_charDigitValue. This might be slow because we have to
2960             handle surrogates...
2961         */
2962 /*
2963         if (U16_IS_LEAD(ch)){
2964           if (!collIter_eos(source)) {
2965             backupState(source, &digitState);
2966             UChar trail = getNextNormalizedChar(source);
2967             if(U16_IS_TRAIL(trail)) {
2968               char32 = U16_GET_SUPPLEMENTARY(ch, trail);
2969             } else {
2970               loadState(source, &digitState, TRUE);
2971               char32 = ch;
2972             }
2973           } else {
2974             char32 = ch;
2975           }
2976         } else {
2977           char32 = ch;
2978         }
2979         digVal = u_charDigitValue(char32);
2980 */
2981         digVal = u_charDigitValue(cp); // if we have arrived here, we have
2982         // already processed possible supplementaries that trigered the digit tag -
2983         // all supplementaries are marked in the UCA.
2984         /*
2985             We  pad a zero in front of the first element anyways. This takes
2986             care of the (probably) most common case where people are sorting things followed
2987             by a single digit
2988         */
2989         digIndx++;
2990         for(;;){
2991             // Make sure we have enough space.
2992             if (digIndx >= ((numTempBufSize - 2) * 2) + 1)
2993             {
2994                 numTempBufSize *= 2;
2995                 if (numTempBuf == stackNumTempBuf){
2996                     numTempBuf = (uint8_t *)uprv_malloc(sizeof(uint8_t) * numTempBufSize);
2997                     uprv_memcpy(numTempBuf, stackNumTempBuf, UCOL_MAX_BUFFER);
2998                 } else {
2999                     uprv_realloc(numTempBuf, numTempBufSize);
3000                 }
3001             }
3002 
3003             // Skipping over leading zeroes.
3004             if (digVal != 0) {
3005                 nonZeroValReached = TRUE;
3006             }
3007             if (nonZeroValReached) {
3008                 /*
3009                     We parse the digit string into base 100 numbers (this fits into a byte).
3010                     We only add to the buffer in twos, thus if we are parsing an odd character,
3011                     that serves as the 'tens' digit while the if we are parsing an even one, that
3012                     is the 'ones' digit. We dumped the parsed base 100 value (collateVal) into
3013                     a buffer. We multiply each collateVal by 2 (to give us room) and add 5 (to avoid
3014                     overlapping magic CE byte values). The last byte we subtract 1 to ensure it is less
3015                     than all the other bytes.
3016                  */
3017 
3018                 if (digIndx % 2 == 1){
3019                     collateVal += (uint8_t)digVal;
3020 
3021                     // We don't enter the low-order-digit case unless we've already seen
3022                     // the high order, or for the first digit, which is always non-zero.
3023                     if (collateVal != 0)
3024                         trailingZeroIndex = 0;
3025 
3026                     numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6;
3027                     collateVal = 0;
3028                 }
3029                 else{
3030                     // We drop the collation value into the buffer so if we need to do
3031                     // a "front patch" we don't have to check to see if we're hitting the
3032                     // last element.
3033                     collateVal = (uint8_t)(digVal * 10);
3034 
3035                     // Check for trailing zeroes.
3036                     if (collateVal == 0)
3037                     {
3038                         if (!trailingZeroIndex)
3039                             trailingZeroIndex = (digIndx/2) + 2;
3040                     }
3041                     else
3042                         trailingZeroIndex = 0;
3043 
3044                     numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6;
3045                 }
3046                 digIndx++;
3047             }
3048 
3049             // Get next character.
3050             if (!collIter_eos(source)){
3051                 ch = getNextNormalizedChar(source);
3052                 if (U16_IS_LEAD(ch)){
3053                   if (!collIter_eos(source)) {
3054                     backupState(source, &digitState);
3055                     UChar trail = getNextNormalizedChar(source);
3056                     if(U16_IS_TRAIL(trail)) {
3057                       char32 = U16_GET_SUPPLEMENTARY(ch, trail);
3058                     } else {
3059                       loadState(source, &digitState, TRUE);
3060                       char32 = ch;
3061                     }
3062                   }
3063                 } else {
3064                   char32 = ch;
3065                 }
3066 
3067                 if ((digVal = u_charDigitValue(char32)) == -1){
3068                     // Resetting position to point to the next unprocessed char. We
3069                     // overshot it when doing our test/set for numbers.
3070                   if (char32 > 0xFFFF) { // For surrogates.
3071                     loadState(source, &digitState, TRUE);
3072                     //goBackOne(source);
3073                   }
3074                   goBackOne(source);
3075                   break;
3076                 }
3077             } else {
3078               break;
3079             }
3080         }
3081 
3082         if (nonZeroValReached == FALSE){
3083             digIndx = 2;
3084             numTempBuf[2] = 6;
3085         }
3086 
3087         endIndex = trailingZeroIndex ? trailingZeroIndex : ((digIndx/2) + 2) ;
3088         if (digIndx % 2 != 0){
3089             /*
3090                 We missed a value. Since digIndx isn't even, stuck too many values into the buffer (this is what
3091                 we get for padding the first byte with a zero). "Front-patch" now by pushing all nybbles forward.
3092                 Doing it this way ensures that at least 50% of the time (statistically speaking) we'll only be doing a
3093                 single pass and optimizes for strings with single digits. I'm just assuming that's the more common case.
3094             */
3095 
3096             for(i = 2; i < endIndex; i++){
3097                 numTempBuf[i] =     (((((numTempBuf[i] - 6)/2) % 10) * 10) +
3098                                     (((numTempBuf[i+1])-6)/2) / 10) * 2 + 6;
3099             }
3100             --digIndx;
3101         }
3102 
3103         // Subtract one off of the last byte.
3104         numTempBuf[endIndex-1] -= 1;
3105 
3106         /*
3107             We want to skip over the first two slots in the buffer. The first slot
3108             is reserved for the header byte UCOL_CODAN_PLACEHOLDER. The second slot is for the
3109             sign/exponent byte: 0x80 + (decimalPos/2) & 7f.
3110         */
3111         numTempBuf[0] = UCOL_CODAN_PLACEHOLDER;
3112         numTempBuf[1] = (uint8_t)(0x80 + ((digIndx/2) & 0x7F));
3113 
3114         // Now transfer the collation key to our collIterate struct.
3115         // The total size for our collation key is endIndx bumped up to the next largest even value divided by two.
3116           //size = ((endIndex+1) & ~1)/2;
3117           CE = (((numTempBuf[0] << 8) | numTempBuf[1]) << UCOL_PRIMARYORDERSHIFT) | //Primary weight
3118                 (UCOL_BYTE_COMMON << UCOL_SECONDARYORDERSHIFT) | // Secondary weight
3119                 UCOL_BYTE_COMMON; // Tertiary weight.
3120           i = 2; // Reset the index into the buffer.
3121           while(i < endIndex)
3122           {
3123             primWeight = numTempBuf[i++] << 8;
3124             if ( i < endIndex)
3125                 primWeight |= numTempBuf[i++];
3126             *(source->CEpos++) = (primWeight << UCOL_PRIMARYORDERSHIFT) | UCOL_CONTINUATION_MARKER;
3127           }
3128 
3129           if (numTempBuf != stackNumTempBuf)
3130             uprv_free(numTempBuf);
3131       } else {
3132         // no numeric mode, we'll just switch to whatever we stashed and continue
3133           CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */
3134           CE = *CEOffset++;
3135           break;
3136       }
3137       return CE;
3138       }
3139     /* various implicits optimization */
3140     // TODO: remove CJK_IMPLICIT_TAG completely - handled by the getImplicit
3141     case CJK_IMPLICIT_TAG:    /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D*/
3142       //return getImplicit(cp, source, 0x04000000);
3143       return getImplicit(cp, source);
3144     case IMPLICIT_TAG:        /* everything that is not defined otherwise */
3145       /* UCA is filled with these. Tailorings are NOT_FOUND */
3146       //return getImplicit(cp, source, 0);
3147       return getImplicit(cp, source);
3148     case TRAIL_SURROGATE_TAG: /* DC00-DFFF*/
3149       return 0; /* broken surrogate sequence */
3150     case LEAD_SURROGATE_TAG:  /* D800-DBFF*/
3151       UChar nextChar;
3152       if( source->flags & UCOL_USE_ITERATOR) {
3153         if(U_IS_TRAIL(nextChar = (UChar)source->iterator->current(source->iterator))) {
3154           cp = U16_GET_SUPPLEMENTARY(ch, nextChar);
3155           source->iterator->next(source->iterator);
3156           return getImplicit(cp, source);
3157         }  else {
3158           return 0;
3159         }
3160       } else if((((source->flags & UCOL_ITER_HASLEN) == 0 ) || (source->pos<source->endp)) &&
3161         U_IS_TRAIL((nextChar=*source->pos))) {
3162         cp = U16_GET_SUPPLEMENTARY(ch, nextChar);
3163         source->pos++;
3164         return getImplicit(cp, source);
3165       } else {
3166         return 0; /* completely ignorable */
3167       }
3168     case HANGUL_SYLLABLE_TAG: /* AC00-D7AF*/
3169       {
3170         const uint32_t
3171           SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7;
3172         //const uint32_t LCount = 19;
3173         const uint32_t VCount = 21;
3174         const uint32_t TCount = 28;
3175         //const uint32_t NCount = VCount * TCount;   // 588
3176         //const uint32_t SCount = LCount * NCount;   // 11172
3177         uint32_t L = ch - SBase;
3178 
3179         // divide into pieces
3180 
3181         uint32_t T = L % TCount; // we do it in this order since some compilers can do % and / in one operation
3182         L /= TCount;
3183         uint32_t V = L % VCount;
3184         L /= VCount;
3185 
3186         // offset them
3187 
3188         L += LBase;
3189         V += VBase;
3190         T += TBase;
3191 
3192         // return the first CE, but first put the rest into the expansion buffer
3193         if (!source->coll->image->jamoSpecial) { // FAST PATH
3194 
3195           *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, V);
3196           if (T != TBase) {
3197               *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, T);
3198           }
3199 
3200           return UTRIE_GET32_FROM_LEAD(&coll->mapping, L);
3201 
3202         } else { // Jamo is Special
3203           // Since Hanguls pass the FCD check, it is
3204           // guaranteed that we won't be in
3205           // the normalization buffer if something like this happens
3206           // However, if we are using a uchar iterator and normalization
3207           // is ON, the Hangul that lead us here is going to be in that
3208           // normalization buffer. Here we want to restore the uchar
3209           // iterator state and pull out of the normalization buffer
3210           if(source->iterator != NULL && source->flags & UCOL_ITER_INNORMBUF) {
3211             source->flags = source->origFlags; // restore the iterator
3212             source->pos = NULL;
3213           }
3214           // Move Jamos into normalization buffer
3215           source->writableBuffer[0] = (UChar)L;
3216           source->writableBuffer[1] = (UChar)V;
3217           if (T != TBase) {
3218             source->writableBuffer[2] = (UChar)T;
3219             source->writableBuffer[3] = 0;
3220           } else {
3221             source->writableBuffer[2] = 0;
3222           }
3223 
3224           source->fcdPosition       = source->pos;   // Indicate where to continue in main input string
3225                                                          //   after exhausting the writableBuffer
3226           source->pos   = source->writableBuffer;
3227           source->origFlags         = source->flags;
3228           source->flags            |= UCOL_ITER_INNORMBUF;
3229           source->flags            &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
3230 
3231           return(UCOL_IGNORABLE);
3232         }
3233       }
3234     case CHARSET_TAG:
3235     /* not yet implemented */
3236       /* probably after 1.8 */
3237       return UCOL_NOT_FOUND;
3238     default:
3239       *status = U_INTERNAL_PROGRAM_ERROR;
3240       CE=0;
3241       break;
3242     }
3243     if (CE <= UCOL_NOT_FOUND) break;
3244   }
3245   return CE;
3246 }
3247 
3248 
3249 /* now uses Mark's getImplicitPrimary code */
3250 static
getPrevImplicit(UChar32 cp,collIterate * collationSource)3251 inline uint32_t getPrevImplicit(UChar32 cp, collIterate *collationSource) {
3252   if(isNonChar(cp)) {
3253     return 0;
3254   }
3255 
3256   uint32_t r = uprv_uca_getImplicitPrimary(cp);
3257 
3258   *(collationSource->CEpos++) = (r & UCOL_PRIMARYMASK) | 0x00000505;
3259   collationSource->toReturn = collationSource->CEpos;
3260   return ((r & 0x0000FFFF)<<16) | 0x000000C0;
3261 }
3262 
3263 /**
3264  * This function handles the special CEs like contractions, expansions,
3265  * surrogates, Thai.
3266  * It is called by both getPrevCE
3267  */
ucol_prv_getSpecialPrevCE(const UCollator * coll,UChar ch,uint32_t CE,collIterate * source,UErrorCode * status)3268 uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
3269                           collIterate *source,
3270                           UErrorCode *status)
3271 {
3272   const uint32_t *CEOffset    = NULL;
3273         UChar    *UCharOffset = NULL;
3274         UChar    schar;
3275   const UChar    *constart    = NULL;
3276         uint32_t size;
3277         UChar    buffer[UCOL_MAX_BUFFER];
3278         uint32_t *endCEBuffer;
3279         UChar   *strbuffer;
3280         int32_t noChars = 0;
3281 
3282   for(;;)
3283   {
3284     /* the only ces that loops are thai and contractions */
3285     switch (getCETag(CE))
3286     {
3287     case NOT_FOUND_TAG:  /* this tag always returns */
3288       return CE;
3289     case SURROGATE_TAG:  /* This is a surrogate pair */
3290       /* essentialy an engaged lead surrogate. */
3291       /* if you have encountered it here, it means that a */
3292       /* broken sequence was encountered and this is an error */
3293       return 0;
3294     case SPEC_PROC_TAG:
3295       {
3296         // Special processing is getting a CE that is preceded by a certain prefix
3297         // Currently this is only needed for optimizing Japanese length and iteration marks.
3298         // When we encouter a special processing tag, we go backwards and try to see if
3299         // we have a match.
3300         // Contraction tables are used - so the whole process is not unlike contraction.
3301         // prefix data is stored backwards in the table.
3302         const UChar *UCharOffset;
3303         UChar schar, tchar;
3304         collIterateState prefixState;
3305         backupState(source, &prefixState);
3306         for(;;) {
3307         // This loop will run once per source string character, for as long as we
3308         //  are matching a potential contraction sequence
3309 
3310           // First we position ourselves at the begining of contraction sequence
3311           const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE);
3312 
3313           if (collIter_bos(source)) {
3314             CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
3315             break;
3316           }
3317           schar = getPrevNormalizedChar(source, status);
3318           goBackOne(source);
3319 
3320           while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
3321             UCharOffset++;
3322           }
3323 
3324           if (schar == tchar) {
3325               // Found the source string char in the table.
3326               //  Pick up the corresponding CE from the table.
3327               CE = *(coll->contractionCEs +
3328                   (UCharOffset - coll->contractionIndex));
3329           }
3330           else
3331           {
3332               // if there is a completely ignorable code point in the middle of
3333               // a prefix, we need to act as if it's not there
3334               // assumption: 'real' noncharacters (*fffe, *ffff, fdd0-fdef are set to zero)
3335               // lone surrogates cannot be set to zero as it would break other processing
3336               uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, schar);
3337               // it's easy for BMP code points
3338               if(isZeroCE == 0) {
3339                 continue;
3340               } else if(U16_IS_TRAIL(schar) || U16_IS_LEAD(schar)) {
3341                 // for supplementary code points, we have to check the next one
3342                 // situations where we are going to ignore
3343                 // 1. beginning of the string: schar is a lone surrogate
3344                 // 2. schar is a lone surrogate
3345                 // 3. schar is a trail surrogate in a valid surrogate sequence
3346                 //    that is explicitly set to zero.
3347                 if (!collIter_bos(source)) {
3348                   UChar lead;
3349                   if(U16_IS_LEAD(lead = getPrevNormalizedChar(source, status))) {
3350                     isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, lead);
3351                     if(getCETag(isZeroCE) == SURROGATE_TAG) {
3352                       uint32_t finalCE = UTRIE_GET32_FROM_OFFSET_TRAIL(&coll->mapping, isZeroCE&0xFFFFFF, schar);
3353                       if(finalCE == 0) {
3354                         // this is a real, assigned completely ignorable code point
3355                         goBackOne(source);
3356                         continue;
3357                       }
3358                     }
3359                   } else {
3360                     // lone surrogate, completely ignorable
3361                     continue;
3362                   }
3363                 } else {
3364                   // lone surrogate at the beggining, completely ignorable
3365                   continue;
3366                 }
3367               }
3368               // Source string char was not in the table.
3369               //   We have not found the prefix.
3370               CE = *(coll->contractionCEs +
3371                   (ContractionStart - coll->contractionIndex));
3372           }
3373 
3374           if(!isPrefix(CE)) {
3375               // The source string char was in the contraction table, and the corresponding
3376               //   CE is not a prefix CE.  We found the prefix, break
3377               //   out of loop, this CE will end up being returned.  This is the normal
3378               //   way out of prefix handling when the source actually contained
3379               //   the prefix.
3380               break;
3381           }
3382         }
3383       loadState(source, &prefixState, TRUE);
3384       break;
3385       }
3386 
3387     case CONTRACTION_TAG:
3388         /* to ensure that the backwards and forwards iteration matches, we
3389         take the current region of most possible match and pass it through
3390         the forward iteration. this will ensure that the obstinate problem of
3391         overlapping contractions will not occur.
3392         */
3393         schar = peekCharacter(source, 0);
3394         constart = (UChar *)coll->image + getContractOffset(CE);
3395         if (isAtStartPrevIterate(source)
3396             /* commented away contraction end checks after adding the checks
3397             in getPrevCE  */) {
3398             /* start of string or this is not the end of any contraction */
3399             CE = *(coll->contractionCEs +
3400                      (constart - coll->contractionIndex));
3401             break;
3402         }
3403         strbuffer = buffer;
3404         UCharOffset = strbuffer + (UCOL_MAX_BUFFER - 1);
3405         *(UCharOffset --) = 0;
3406         noChars = 0;
3407         // have to swap thai characters
3408         while (ucol_unsafeCP(schar, coll)) {
3409             *(UCharOffset) = schar;
3410             noChars++;
3411             UCharOffset --;
3412             schar = getPrevNormalizedChar(source, status);
3413             goBackOne(source);
3414             // TODO: when we exhaust the contraction buffer,
3415             // it needs to get reallocated. The problem is
3416             // that the size depends on the string which is
3417             // not iterated over. However, since we're travelling
3418             // backwards, we already had to set the iterator at
3419             // the end - so we might as well know where we are?
3420             if (UCharOffset + 1 == buffer) {
3421                 /* we have exhausted the buffer */
3422               int32_t newsize = 0;
3423               if(source->pos) { // actually dealing with a position
3424                 newsize = source->pos - source->string + 1;
3425               } else { // iterator
3426                 newsize = 4 * UCOL_MAX_BUFFER;
3427               }
3428                 strbuffer = (UChar *)uprv_malloc(sizeof(UChar) *
3429                                              (newsize + UCOL_MAX_BUFFER));
3430                 /* test for NULL */
3431                 if (strbuffer == NULL) {
3432                     *status = U_MEMORY_ALLOCATION_ERROR;
3433                     return UCOL_NO_MORE_CES;
3434                 }
3435                 UCharOffset = strbuffer + newsize;
3436                 uprv_memcpy(UCharOffset, buffer,
3437                                              UCOL_MAX_BUFFER * sizeof(UChar));
3438                 UCharOffset --;
3439             }
3440             if ((source->pos && (source->pos == source->string ||
3441                 ((source->flags & UCOL_ITER_INNORMBUF) &&
3442                 *(source->pos - 1) == 0 && source->fcdPosition == NULL)))
3443                 || (source->iterator && !source->iterator->hasPrevious(source->iterator))) {
3444                 break;
3445             }
3446         }
3447         /* adds the initial base character to the string */
3448         *(UCharOffset) = schar;
3449         noChars++;
3450 
3451         /* a new collIterate is used to simplify things, since using the current
3452         collIterate will mean that the forward and backwards iteration will
3453         share and change the same buffers. we don't want to get into that. */
3454         collIterate temp;
3455         //IInit_collIterate(coll, UCharOffset, -1, &temp);
3456         IInit_collIterate(coll, UCharOffset, noChars, &temp);
3457         temp.flags &= ~UCOL_ITER_NORM;
3458 
3459         CE = ucol_IGetNextCE(coll, &temp, status);
3460         endCEBuffer = source->CEs + UCOL_EXPAND_CE_BUFFER_SIZE;
3461         while (CE != UCOL_NO_MORE_CES) {
3462             *(source->CEpos ++) = CE;
3463             if (source->CEpos == endCEBuffer) {
3464                 /* ran out of CE space, bail.
3465                 there's no guarantee of the right character position after
3466                 this bail*/
3467                 *status = U_BUFFER_OVERFLOW_ERROR;
3468                 source->CEpos = source->CEs;
3469                 freeHeapWritableBuffer(&temp);
3470                 if (strbuffer != buffer) {
3471                     uprv_free(strbuffer);
3472                 }
3473                 return (uint32_t)UCOL_NULLORDER;
3474             }
3475             CE = ucol_IGetNextCE(coll, &temp, status);
3476         }
3477         freeHeapWritableBuffer(&temp);
3478         if (strbuffer != buffer) {
3479             uprv_free(strbuffer);
3480         }
3481         source->toReturn = source->CEpos - 1;
3482         if (source->toReturn == source->CEs) {
3483             source->CEpos = source->CEs;
3484         }
3485         return *(source->toReturn);
3486     case LONG_PRIMARY_TAG:
3487       {
3488         *(source->CEpos++) = ((CE & 0xFFFF00) << 8) | (UCOL_BYTE_COMMON << 8) | UCOL_BYTE_COMMON;
3489         *(source->CEpos++) = ((CE & 0xFF)<<24)|UCOL_CONTINUATION_MARKER;
3490         source->toReturn = source->CEpos - 1;
3491         return *(source->toReturn);
3492       }
3493     case EXPANSION_TAG: /* this tag always returns */
3494       /*
3495       This should handle expansion.
3496       NOTE: we can encounter both continuations and expansions in an expansion!
3497       I have to decide where continuations are going to be dealt with
3498       */
3499       /* find the offset to expansion table */
3500       CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE);
3501       size     = getExpansionCount(CE);
3502       if (size != 0) {
3503         /*
3504         if there are less than 16 elements in expansion, we don't terminate
3505         */
3506         uint32_t count;
3507         for (count = 0; count < size; count++) {
3508           *(source->CEpos ++) = *CEOffset++;
3509         }
3510       }
3511       else {
3512         /* else, we do */
3513         while (*CEOffset != 0) {
3514           *(source->CEpos ++) = *CEOffset ++;
3515         }
3516       }
3517       source->toReturn = source->CEpos - 1;
3518       // in case of one element expansion, we
3519       // want to immediately return CEpos
3520       if(source->toReturn == source->CEs) {
3521         source->CEpos = source->CEs;
3522       }
3523       return *(source->toReturn);
3524      case DIGIT_TAG:
3525       {
3526       /*
3527          We do a check to see if we want to collate digits as numbers; if so we generate
3528          a custom collation key. Otherwise we pull out the value stored in the expansion table.
3529       */
3530       //uint32_t size;
3531       uint32_t i;    /* general counter */
3532 
3533       if (source->coll->numericCollation == UCOL_ON){
3534         collIterateState state = {0,0,0,0,0,0,0,0};
3535         UChar32 char32 = 0;
3536 
3537         uint32_t digIndx = 0;
3538         uint32_t endIndex = 0;
3539         uint32_t leadingZeroIndex = 0;
3540         uint32_t trailingZeroCount = 0;
3541 
3542         uint32_t primWeight = 0;
3543 
3544         int32_t digVal = 0;
3545         uint8_t collateVal = 0;
3546 
3547         UBool nonZeroValReached = FALSE;
3548 
3549         uint8_t *numTempBuf;
3550         uint8_t stackNumTempBuf[UCOL_MAX_BUFFER]; // I just need a temporary place to store my generated CEs.
3551         uint32_t numTempBufSize = UCOL_MAX_BUFFER;
3552 
3553         numTempBuf = stackNumTempBuf;
3554         /*
3555              We parse the source string until we hit a char that's NOT a digit.
3556             Use this u_charDigitValue. This might be slow because we have to
3557             handle surrogates...
3558         */
3559 
3560         if (U16_IS_TRAIL (ch)){
3561             if (!collIter_bos(source)){
3562               UChar lead = getPrevNormalizedChar(source, status);
3563               if(U16_IS_LEAD(lead)) {
3564                 char32 = U16_GET_SUPPLEMENTARY(lead,ch);
3565                 goBackOne(source);
3566               } else {
3567                 char32 = ch;
3568               }
3569             } else {
3570                 char32 = ch;
3571             }
3572         } else {
3573             char32 = ch;
3574         }
3575         digVal = u_charDigitValue(char32);
3576 
3577         for(;;){
3578         // Make sure we have enough space.
3579         if (digIndx >= ((numTempBufSize - 2) * 2) + 1)
3580         {
3581             numTempBufSize *= 2;
3582             if (numTempBuf == stackNumTempBuf){
3583                 numTempBuf = (uint8_t *)uprv_malloc(sizeof(uint8_t) * numTempBufSize);
3584                 uprv_memcpy(numTempBuf, stackNumTempBuf, UCOL_MAX_BUFFER);
3585             }else
3586                 uprv_realloc(numTempBuf, numTempBufSize);
3587         }
3588 
3589             // Skip over trailing zeroes, and keep a count of them.
3590             if (digVal != 0)
3591                     nonZeroValReached = TRUE;
3592             if (nonZeroValReached){
3593                 /*
3594                     We parse the digit string into base 100 numbers (this fits into a byte).
3595                     We only add to the buffer in twos, thus if we are parsing an odd character,
3596                     that serves as the 'tens' digit while the if we are parsing an even one, that
3597                     is the 'ones' digit. We dumped the parsed base 100 value (collateVal) into
3598                     a buffer. We multiply each collateVal by 2 (to give us room) and add 5 (to avoid
3599                     overlapping magic CE byte values). The last byte we subtract 1 to ensure it is less
3600                     than all the other bytes.
3601 
3602                     Since we're doing in this reverse we want to put the first digit encountered into the
3603                     ones place and the second digit encountered into the tens place.
3604                  */
3605 
3606                 if ((digIndx + trailingZeroCount) % 2 == 1){
3607                     // High-order digit case (tens place)
3608                     collateVal += (uint8_t)(digVal * 10);
3609 
3610                     // We cannot set leadingZeroIndex unless it has been set for the
3611                     // low-order digit. Therefore, all we can do for the high-order
3612                     // digit is turn it off, never on.
3613                     // The only time we will have a high digit without a low is for
3614                     // the very first non-zero digit, so no zero check is necessary.
3615                     if (collateVal != 0)
3616                         leadingZeroIndex = 0;
3617 
3618                     numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6;
3619                     collateVal = 0;
3620                 }
3621                 else{
3622                     // Low-order digit case (ones place)
3623                     collateVal = (uint8_t)digVal;
3624 
3625                     // Check for leading zeroes.
3626                     if (collateVal == 0)
3627                     {
3628                         if (!leadingZeroIndex)
3629                             leadingZeroIndex = (digIndx/2) + 2;
3630                     }
3631                     else
3632                         leadingZeroIndex = 0;
3633 
3634                     // No need to write to buffer; the case of a last odd digit
3635                     // is handled below.
3636                 }
3637                 ++digIndx;
3638             }
3639             else
3640                 ++trailingZeroCount;
3641 
3642             if (!collIter_bos(source)){
3643                 ch = getPrevNormalizedChar(source, status);
3644                 //goBackOne(source);
3645                 if (U16_IS_TRAIL(ch)){
3646                     backupState(source, &state);
3647                     if (!collIter_bos(source))
3648                     {
3649                         goBackOne(source);
3650                         UChar lead = getPrevNormalizedChar(source, status);
3651                         if(U16_IS_LEAD(lead)) {
3652                           char32 = U16_GET_SUPPLEMENTARY(lead,ch);
3653                         } else {
3654                           loadState(source, &state, FALSE);
3655                           char32 = ch;
3656                         }
3657                     }
3658                 }
3659                 else
3660                     char32 = ch;
3661 
3662                 if ((digVal = u_charDigitValue(char32)) == -1){
3663                   if (char32 > 0xFFFF) {// For surrogates.
3664                     loadState(source, &state, FALSE);
3665                   }
3666                     // Don't need to "reverse" the goBackOne call,
3667                     // as this points to the next position to process..
3668                     //if (char32 > 0xFFFF) // For surrogates.
3669                         //getNextNormalizedChar(source);
3670                     break;
3671                 }
3672                 goBackOne(source);
3673             }else
3674                 break;
3675         }
3676 
3677         if (nonZeroValReached == FALSE){
3678             digIndx = 2;
3679             trailingZeroCount = 0;
3680             numTempBuf[2] = 6;
3681         }
3682 
3683         if ((digIndx + trailingZeroCount) % 2 != 0){
3684                 numTempBuf[((digIndx)/2) + 2] = collateVal*2 + 6;
3685             digIndx += 1;       // The implicit leading zero
3686             }
3687         if (trailingZeroCount % 2 != 0){
3688             // We had to consume one trailing zero for the low digit
3689             // of the least significant byte
3690             digIndx += 1;       // The trailing zero not in the exponent
3691             trailingZeroCount -= 1;
3692         }
3693 
3694         endIndex = leadingZeroIndex ? leadingZeroIndex : ((digIndx/2) + 2) ;
3695 
3696         // Subtract one off of the last byte. Really the first byte here, but it's reversed...
3697         numTempBuf[2] -= 1;
3698 
3699         /*
3700             We want to skip over the first two slots in the buffer. The first slot
3701             is reserved for the header byte UCOL_CODAN_PLACEHOLDER. The second slot is for the
3702             sign/exponent byte: 0x80 + (decimalPos/2) & 7f.
3703             The exponent must be adjusted by the number of leading zeroes, and the number of
3704             trailing zeroes.
3705         */
3706         numTempBuf[0] = UCOL_CODAN_PLACEHOLDER;
3707         uint32_t exponent = (digIndx+trailingZeroCount)/2;
3708         if (leadingZeroIndex)
3709             exponent -= ((digIndx/2) + 2 - leadingZeroIndex);
3710         numTempBuf[1] = (uint8_t)(0x80 + (exponent & 0x7F));
3711 
3712         // Now transfer the collation key to our collIterate struct.
3713         // The total size for our collation key is endIndx bumped up to the next largest even value divided by two.
3714         //size = ((endIndex+1) & ~1)/2;
3715           *(source->CEpos++) = (((numTempBuf[0] << 8) | numTempBuf[1]) << UCOL_PRIMARYORDERSHIFT) | //Primary weight
3716                 (UCOL_BYTE_COMMON << UCOL_SECONDARYORDERSHIFT) | // Secondary weight
3717                 UCOL_BYTE_COMMON; // Tertiary weight.
3718           i = endIndex - 1; // Reset the index into the buffer.
3719           while(i >= 2)
3720           {
3721             primWeight = numTempBuf[i--] << 8;
3722             if ( i >= 2)
3723                 primWeight |= numTempBuf[i--];
3724             *(source->CEpos++) = (primWeight << UCOL_PRIMARYORDERSHIFT) | UCOL_CONTINUATION_MARKER;
3725           }
3726           if (numTempBuf != stackNumTempBuf)
3727             uprv_free(numTempBuf);
3728 
3729           source->toReturn = source->CEpos -1;
3730           return *(source->toReturn);
3731       }
3732       else {
3733           CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE);
3734           CE = *(CEOffset++);
3735           break;
3736       }
3737       }
3738     case HANGUL_SYLLABLE_TAG: /* AC00-D7AF*/
3739       {
3740         const uint32_t
3741           SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7;
3742         //const uint32_t LCount = 19;
3743         const uint32_t VCount = 21;
3744         const uint32_t TCount = 28;
3745         //const uint32_t NCount = VCount * TCount;   /* 588 */
3746         //const uint32_t SCount = LCount * NCount;   /* 11172 */
3747 
3748         uint32_t L = ch - SBase;
3749         /*
3750         divide into pieces.
3751         we do it in this order since some compilers can do % and / in one
3752         operation
3753         */
3754         uint32_t T = L % TCount;
3755         L /= TCount;
3756         uint32_t V = L % VCount;
3757         L /= VCount;
3758 
3759         /* offset them */
3760         L += LBase;
3761         V += VBase;
3762         T += TBase;
3763 
3764         /*
3765         return the first CE, but first put the rest into the expansion buffer
3766         */
3767         if (!source->coll->image->jamoSpecial)
3768         {
3769           *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, L);
3770           *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, V);
3771           if (T != TBase)
3772             *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, T);
3773 
3774           source->toReturn = source->CEpos - 1;
3775           return *(source->toReturn);
3776         } else {
3777           // Since Hanguls pass the FCD check, it is
3778           // guaranteed that we won't be in
3779           // the normalization buffer if something like this happens
3780           // Move Jamos into normalization buffer
3781           /*
3782           Move the Jamos into the
3783           normalization buffer
3784           */
3785           UChar *tempbuffer = source->writableBuffer +
3786                               (source->writableBufSize - 1);
3787           *(tempbuffer) = 0;
3788           if (T != TBase) {
3789             *(tempbuffer - 1) = (UChar)T;
3790             *(tempbuffer - 2) = (UChar)V;
3791             *(tempbuffer - 3) = (UChar)L;
3792             *(tempbuffer - 4) = 0;
3793           } else {
3794             *(tempbuffer - 1) = (UChar)V;
3795             *(tempbuffer - 2) = (UChar)L;
3796             *(tempbuffer - 3) = 0;
3797           }
3798 
3799           /*
3800           Indicate where to continue in main input string after exhausting
3801           the writableBuffer
3802           */
3803           if (source->pos  == source->string) {
3804             source->fcdPosition = NULL;
3805           } else {
3806             source->fcdPosition       = source->pos-1;
3807           }
3808 
3809           source->pos               = tempbuffer;
3810           source->origFlags         = source->flags;
3811           source->flags            |= UCOL_ITER_INNORMBUF;
3812           source->flags            &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
3813 
3814           return(UCOL_IGNORABLE);
3815         }
3816       }
3817     case LEAD_SURROGATE_TAG:  /* D800-DBFF*/
3818       return 0; /* broken surrogate sequence */
3819     case TRAIL_SURROGATE_TAG: /* DC00-DFFF*/
3820     {
3821       UChar32 cp = 0;
3822       UChar  prevChar;
3823       UChar *prev;
3824       if (isAtStartPrevIterate(source)) {
3825           /* we are at the start of the string, wrong place to be at */
3826           return 0;
3827       }
3828       if (source->pos != source->writableBuffer) {
3829           prev     = source->pos - 1;
3830       } else {
3831           prev     = source->fcdPosition;
3832       }
3833       prevChar = *prev;
3834 
3835       /* Handles Han and Supplementary characters here.*/
3836       if (U16_IS_LEAD(prevChar)) {
3837         cp = ((((uint32_t)prevChar)<<10UL)+(ch)-(((uint32_t)0xd800<<10UL)+0xdc00-0x10000));
3838         source->pos = prev;
3839       } else {
3840         return 0; /* completely ignorable */
3841       }
3842       return getPrevImplicit(cp, source);
3843     }
3844     // TODO: Remove CJK implicits as they are handled by the getImplicitPrimary function
3845     case CJK_IMPLICIT_TAG:    /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D*/
3846       return getPrevImplicit(ch, source);
3847     case IMPLICIT_TAG:        /* everything that is not defined otherwise */
3848       return getPrevImplicit(ch, source);
3849       /* UCA is filled with these. Tailorings are NOT_FOUND */
3850     /* not yet implemented */
3851     case CHARSET_TAG:  /* this tag always returns */
3852       /* probably after 1.8 */
3853       return UCOL_NOT_FOUND;
3854     default:           /* this tag always returns */
3855       *status = U_INTERNAL_PROGRAM_ERROR;
3856       CE=0;
3857       break;
3858     }
3859     if (CE <= UCOL_NOT_FOUND) {
3860       break;
3861     }
3862   }
3863   return CE;
3864 }
3865 
3866 /* This should really be a macro        */
3867 /* However, it is used only when stack buffers are not sufficiently big, and then we're messed up performance wise */
3868 /* anyway */
3869 static
reallocateBuffer(uint8_t ** secondaries,uint8_t * secStart,uint8_t * second,uint32_t * secSize,uint32_t newSize,UErrorCode * status)3870 uint8_t *reallocateBuffer(uint8_t **secondaries, uint8_t *secStart, uint8_t *second, uint32_t *secSize, uint32_t newSize, UErrorCode *status) {
3871 #ifdef UCOL_DEBUG
3872   fprintf(stderr, ".");
3873 #endif
3874   uint8_t *newStart = NULL;
3875   uint32_t offset = *secondaries-secStart;
3876 
3877   if(secStart==second) {
3878     newStart=(uint8_t*)uprv_malloc(newSize);
3879     if(newStart==NULL) {
3880       *status = U_MEMORY_ALLOCATION_ERROR;
3881       return NULL;
3882     }
3883     uprv_memcpy(newStart, secStart, *secondaries-secStart);
3884   } else {
3885     newStart=(uint8_t*)uprv_realloc(secStart, newSize);
3886     if(newStart==NULL) {
3887       *status = U_MEMORY_ALLOCATION_ERROR;
3888       return NULL;
3889     }
3890   }
3891   *secondaries=newStart+offset;
3892   *secSize=newSize;
3893   return newStart;
3894 }
3895 
3896 
3897 /* This should really be a macro                                                                      */
3898 /* This function is used to reverse parts of a buffer. We need this operation when doing continuation */
3899 /* secondaries in French                                                                              */
3900 /*
3901 void uprv_ucol_reverse_buffer(uint8_t *start, uint8_t *end) {
3902   uint8_t temp;
3903   while(start<end) {
3904     temp = *start;
3905     *start++ = *end;
3906     *end-- = temp;
3907   }
3908 }
3909 */
3910 
/*
 * Reverses, in place, the elements from start to end (both inclusive).
 * TYPE is the element type. start and end must be modifiable pointer lvalues:
 * the macro advances start and moves end backwards until they meet or cross,
 * so both arguments are changed by the call.
 * The body is wrapped in do { } while(0) so that an invocation followed by a
 * semicolon forms exactly one statement, which keeps it safe inside an
 * unbraced if/else.
 */
#define uprv_ucol_reverse_buffer(TYPE, start, end) do { \
    TYPE tempA; \
    while((start)<(end)) { \
        tempA = *(start); \
        *(start)++ = *(end); \
        *(end)-- = tempA; \
    } \
} while(0)
3919 
3920 /****************************************************************************/
3921 /* Following are the sortkey generation functions                           */
3922 /*                                                                          */
3923 /****************************************************************************/
3924 
3925 /**
3926  * Merge two sort keys.
3927  * This is useful, for example, to combine sort keys from first and last names
3928  * to sort such pairs.
3929  * Merged sort keys consider on each collation level the first part first entirely,
3930  * then the second one.
3931  * It is possible to merge multiple sort keys by consecutively merging
3932  * another one with the intermediate result.
3933  *
3934  * The length of the merge result is the sum of the lengths of the input sort keys
3935  * minus 1.
3936  *
3937  * @param src1 the first sort key
3938  * @param src1Length the length of the first sort key, including the zero byte at the end;
3939  *        can be -1 if the function is to find the length
3940  * @param src2 the second sort key
3941  * @param src2Length the length of the second sort key, including the zero byte at the end;
3942  *        can be -1 if the function is to find the length
3943  * @param dest the buffer where the merged sort key is written,
3944  *        can be NULL if destCapacity==0
3945  * @param destCapacity the number of bytes in the dest buffer
3946  * @return the length of the merged sort key, src1Length+src2Length-1;
3947  *         can be larger than destCapacity, or 0 if an error occurs (only for illegal arguments),
3948  *         in which cases the contents of dest is undefined
3949  *
3950  * @draft
3951  */
3952 U_CAPI int32_t U_EXPORT2
ucol_mergeSortkeys(const uint8_t * src1,int32_t src1Length,const uint8_t * src2,int32_t src2Length,uint8_t * dest,int32_t destCapacity)3953 ucol_mergeSortkeys(const uint8_t *src1, int32_t src1Length,
3954                    const uint8_t *src2, int32_t src2Length,
3955                    uint8_t *dest, int32_t destCapacity) {
3956     int32_t destLength;
3957     uint8_t b;
3958 
3959     /* check arguments */
3960     if( src1==NULL || src1Length<-2 || src1Length==0 || (src1Length>0 && src1[src1Length-1]!=0) ||
3961         src2==NULL || src2Length<-2 || src2Length==0 || (src2Length>0 && src2[src2Length-1]!=0) ||
3962         destCapacity<0 || (destCapacity>0 && dest==NULL)
3963     ) {
3964         /* error, attempt to write a zero byte and return 0 */
3965         if(dest!=NULL && destCapacity>0) {
3966             *dest=0;
3967         }
3968         return 0;
3969     }
3970 
3971     /* check lengths and capacity */
3972     if(src1Length<0) {
3973         src1Length=(int32_t)uprv_strlen((const char *)src1)+1;
3974     }
3975     if(src2Length<0) {
3976         src2Length=(int32_t)uprv_strlen((const char *)src2)+1;
3977     }
3978 
3979     destLength=src1Length+src2Length-1;
3980     if(destLength>destCapacity) {
3981         /* the merged sort key does not fit into the destination */
3982         return destLength;
3983     }
3984 
3985     /* merge the sort keys with the same number of levels */
3986     while(*src1!=0 && *src2!=0) { /* while both have another level */
3987         /* copy level from src1 not including 00 or 01 */
3988         while((b=*src1)>=2) {
3989             ++src1;
3990             *dest++=b;
3991         }
3992 
3993         /* add a 02 merge separator */
3994         *dest++=2;
3995 
3996         /* copy level from src2 not including 00 or 01 */
3997         while((b=*src2)>=2) {
3998             ++src2;
3999             *dest++=b;
4000         }
4001 
4002         /* if both sort keys have another level, then add a 01 level separator and continue */
4003         if(*src1==1 && *src2==1) {
4004             ++src1;
4005             ++src2;
4006             *dest++=1;
4007         }
4008     }
4009 
4010     /*
4011      * here, at least one sort key is finished now, but the other one
4012      * might have some contents left from containing more levels;
4013      * that contents is just appended to the result
4014      */
4015     if(*src1!=0) {
4016         /* src1 is not finished, therefore *src2==0, and src1 is appended */
4017         src2=src1;
4018     }
4019     /* append src2, "the other, unfinished sort key" */
4020     uprv_strcpy((char *)dest, (const char *)src2);
4021 
4022     /* trust that neither sort key contained illegally embedded zero bytes */
4023     return destLength;
4024 }
4025 
4026 /* sortkey API */
4027 U_CAPI int32_t U_EXPORT2
ucol_getSortKey(const UCollator * coll,const UChar * source,int32_t sourceLength,uint8_t * result,int32_t resultLength)4028 ucol_getSortKey(const    UCollator    *coll,
4029         const    UChar        *source,
4030         int32_t        sourceLength,
4031         uint8_t        *result,
4032         int32_t        resultLength)
4033 {
4034   UTRACE_ENTRY(UTRACE_UCOL_GET_SORTKEY);
4035   if (UTRACE_LEVEL(UTRACE_VERBOSE)) {
4036       UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source string = %vh ", coll, source,
4037           ((sourceLength==-1 && source!=NULL) ? u_strlen(source) : sourceLength));
4038   }
4039 
4040   UErrorCode status = U_ZERO_ERROR;
4041   int32_t keySize   = 0;
4042 
4043   if(source != NULL) {
4044       // source == NULL is actually an error situation, but we would need to
4045       // have an error code to return it. Until we introduce a new
4046       // API, it stays like this
4047 
4048       /* this uses the function pointer that is set in updateinternalstate */
4049       /* currently, there are two funcs: */
4050       /*ucol_calcSortKey(...);*/
4051       /*ucol_calcSortKeySimpleTertiary(...);*/
4052 
4053       keySize = coll->sortKeyGen(coll, source, sourceLength, &result, resultLength, FALSE, &status);
4054       //((UCollator *)coll)->errorCode = status; /*semantically const */
4055   }
4056   UTRACE_DATA2(UTRACE_VERBOSE, "Sort Key = %vb", result, keySize);
4057   UTRACE_EXIT_STATUS(status);
4058   return keySize;
4059 }
4060 
4061 /* this function is called by the C++ API for sortkey generation */
4062 U_CFUNC int32_t
ucol_getSortKeyWithAllocation(const UCollator * coll,const UChar * source,int32_t sourceLength,uint8_t ** pResult,UErrorCode * pErrorCode)4063 ucol_getSortKeyWithAllocation(const UCollator *coll,
4064                               const UChar *source, int32_t sourceLength,
4065                               uint8_t **pResult,
4066                               UErrorCode *pErrorCode) {
4067     *pResult = 0;
4068     return coll->sortKeyGen(coll, source, sourceLength, pResult, 0, TRUE, pErrorCode);
4069 }
4070 
4071 #define UCOL_FSEC_BUF_SIZE 256
4072 
4073 /* This function tries to get the size of a sortkey. It will be invoked if the size of resulting buffer is 0  */
4074 /* or if we run out of space while making a sortkey and want to return ASAP                                   */
ucol_getSortKeySize(const UCollator * coll,collIterate * s,int32_t currentSize,UColAttributeValue strength,int32_t len)4075 int32_t ucol_getSortKeySize(const UCollator *coll, collIterate *s, int32_t currentSize, UColAttributeValue strength, int32_t len) {
4076     UErrorCode status = U_ZERO_ERROR;
4077     //const UCAConstants *UCAconsts = (UCAConstants *)((uint8_t *)coll->UCA->image + coll->image->UCAConsts);
4078     uint8_t compareSec   = (uint8_t)((strength >= UCOL_SECONDARY)?0:0xFF);
4079     uint8_t compareTer   = (uint8_t)((strength >= UCOL_TERTIARY)?0:0xFF);
4080     uint8_t compareQuad  = (uint8_t)((strength >= UCOL_QUATERNARY)?0:0xFF);
4081     UBool  compareIdent = (strength == UCOL_IDENTICAL);
4082     UBool  doCase = (coll->caseLevel == UCOL_ON);
4083     UBool  shifted = (coll->alternateHandling == UCOL_SHIFTED);
4084     //UBool  qShifted = shifted  && (compareQuad == 0);
4085     UBool  doHiragana = (coll->hiraganaQ == UCOL_ON) && (compareQuad == 0);
4086     UBool  isFrenchSec = (coll->frenchCollation == UCOL_ON) && (compareSec == 0);
4087     uint8_t fSecsBuff[UCOL_FSEC_BUF_SIZE];
4088     uint8_t *fSecs = fSecsBuff;
4089     uint32_t fSecsLen = 0, fSecsMaxLen = UCOL_FSEC_BUF_SIZE;
4090     uint8_t *frenchStartPtr = NULL, *frenchEndPtr = NULL;
4091 
4092     uint32_t variableTopValue = coll->variableTopValue;
4093     uint8_t UCOL_COMMON_BOT4 = (uint8_t)((coll->variableTopValue>>8)+1);
4094     if(doHiragana) {
4095       UCOL_COMMON_BOT4++;
4096       /* allocate one more space for hiragana */
4097     }
4098     uint8_t UCOL_BOT_COUNT4 = (uint8_t)(0xFF - UCOL_COMMON_BOT4);
4099 
4100     uint32_t order = UCOL_NO_MORE_CES;
4101     uint8_t primary1 = 0;
4102     uint8_t primary2 = 0;
4103     uint8_t secondary = 0;
4104     uint8_t tertiary = 0;
4105     int32_t caseShift = 0;
4106     uint32_t c2 = 0, c3 = 0, c4 = 0; /* variables for compression */
4107 
4108     uint8_t caseSwitch = coll->caseSwitch;
4109     uint8_t tertiaryMask = coll->tertiaryMask;
4110     uint8_t tertiaryCommon = coll->tertiaryCommon;
4111 
4112     UBool wasShifted = FALSE;
4113     UBool notIsContinuation = FALSE;
4114     uint8_t leadPrimary = 0;
4115 
4116 
4117     for(;;) {
4118           order = ucol_IGetNextCE(coll, s, &status);
4119           if(order == UCOL_NO_MORE_CES) {
4120               break;
4121           }
4122 
4123           if(order == 0) {
4124             continue;
4125           }
4126 
4127           notIsContinuation = !isContinuation(order);
4128 
4129 
4130           if(notIsContinuation) {
4131             tertiary = (uint8_t)((order & UCOL_BYTE_SIZE_MASK));
4132           } else {
4133             tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION));
4134           }
4135           secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
4136           primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
4137           primary1 = (uint8_t)(order >> 8);
4138 
4139 
4140           if(shifted && ((notIsContinuation && order <= variableTopValue && primary1 > 0)
4141             || (!notIsContinuation && wasShifted))
4142             || (wasShifted && primary1 == 0)) { /* amendment to the UCA says that primary ignorables */
4143             /* and other ignorables should be removed if following a shifted code point */
4144             if(primary1 == 0) { /* if we were shifted and we got an ignorable code point */
4145                                 /* we should just completely ignore it */
4146               continue;
4147             }
4148             if(compareQuad == 0) {
4149               if(c4 > 0) {
4150                 currentSize += (c2/UCOL_BOT_COUNT4)+1;
4151                 c4 = 0;
4152               }
4153               currentSize++;
4154               if(primary2 != 0) {
4155                 currentSize++;
4156               }
4157             }
4158             wasShifted = TRUE;
4159           } else {
4160             wasShifted = FALSE;
4161             /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */
4162             /* Usually, we'll have non-zero primary1 & primary2, except in cases of LatinOne and friends, when primary2 will   */
4163             /* calculate sortkey size */
4164             if(primary1 != UCOL_IGNORABLE) {
4165               if(notIsContinuation) {
4166                 if(leadPrimary == primary1) {
4167                   currentSize++;
4168                 } else {
4169                   if(leadPrimary != 0) {
4170                     currentSize++;
4171                   }
4172                   if(primary2 == UCOL_IGNORABLE) {
4173                   /* one byter, not compressed */
4174                       currentSize++;
4175                       leadPrimary = 0;
4176                   } else if(primary1<UCOL_BYTE_FIRST_NON_LATIN_PRIMARY ||
4177                       //(primary1 > (UCOL_RESET_TOP_VALUE>>24) && primary1 < (UCOL_NEXT_TOP_VALUE>>24))) {
4178                       //(primary1 > (*UCAconsts->UCA_LAST_NON_VARIABLE>>24) && primary1 < (*UCAconsts->UCA_FIRST_IMPLICIT>>24))) {
4179                       (primary1 > maxRegularPrimary && primary1 < minImplicitPrimary)) {
4180                   /* not compressible */
4181                       leadPrimary = 0;
4182                       currentSize+=2;
4183                   } else { /* compress */
4184                       leadPrimary = primary1;
4185                       currentSize+=2;
4186                   }
4187                 }
4188               } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */
4189                 currentSize++;
4190                 if(primary2 != UCOL_IGNORABLE) {
4191                   currentSize++;
4192                 }
4193               }
4194             }
4195 
4196             if(secondary > compareSec) { /* I think that != 0 test should be != IGNORABLE */
4197               if(!isFrenchSec){
4198                 if (secondary == UCOL_COMMON2 && notIsContinuation) {
4199                   c2++;
4200                 } else {
4201                   if(c2 > 0) {
4202                     if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
4203                       currentSize += (c2/(uint32_t)UCOL_TOP_COUNT2)+1;
4204                     } else {
4205                       currentSize += (c2/(uint32_t)UCOL_BOT_COUNT2)+1;
4206                     }
4207                     c2 = 0;
4208                   }
4209                   currentSize++;
4210                 }
4211               } else {
4212                 fSecs[fSecsLen++] = secondary;
4213                 if(fSecsLen == fSecsMaxLen) {
4214                   if(fSecs == fSecsBuff) {
4215                     fSecs = (uint8_t *)uprv_malloc(2*fSecsLen);
4216                   } else {
4217                     fSecs = (uint8_t *)uprv_realloc(fSecs, 2*fSecsLen);
4218                   }
4219                   if(fSecs == NULL) {
4220                     status = U_MEMORY_ALLOCATION_ERROR;
4221                     return -1;
4222                   }
4223                   fSecsMaxLen *= 2;
4224                 }
4225                 if(notIsContinuation) {
4226                   if (frenchStartPtr != NULL) {
4227                       /* reverse secondaries from frenchStartPtr up to frenchEndPtr */
4228                     uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr);
4229                     frenchStartPtr = NULL;
4230                   }
4231                 } else {
4232                   if (frenchStartPtr == NULL) {
4233                     frenchStartPtr = fSecs+fSecsLen-2;
4234                   }
4235                   frenchEndPtr = fSecs+fSecsLen-1;
4236                 }
4237               }
4238             }
4239 
4240             if(doCase && (primary1 > 0 || strength >= UCOL_SECONDARY)) {
4241                 // do the case level if we need to do it. We don't want to calculate
4242                 // case level for primary ignorables if we have only primary strength and case level
4243                 // otherwise we would break well formedness of CEs
4244               if (caseShift  == 0) {
4245                 currentSize++;
4246                 caseShift = UCOL_CASE_SHIFT_START;
4247               }
4248               if((tertiary&0x3F) > 0 && notIsContinuation) {
4249                 caseShift--;
4250                 if((tertiary &0xC0) != 0) {
4251                   if (caseShift  == 0) {
4252                     currentSize++;
4253                     caseShift = UCOL_CASE_SHIFT_START;
4254                   }
4255                   caseShift--;
4256                 }
4257               }
4258             } else {
4259               if(notIsContinuation) {
4260                 tertiary ^= caseSwitch;
4261               }
4262             }
4263 
4264             tertiary &= tertiaryMask;
4265             if(tertiary > compareTer) { /* I think that != 0 test should be != IGNORABLE */
4266               if (tertiary == tertiaryCommon && notIsContinuation) {
4267                 c3++;
4268               } else {
4269                 if(c3 > 0) {
4270                   if((tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL)
4271                     || (tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST)) {
4272                     currentSize += (c3/(uint32_t)coll->tertiaryTopCount)+1;
4273                   } else {
4274                     currentSize += (c3/(uint32_t)coll->tertiaryBottomCount)+1;
4275                   }
4276                   c3 = 0;
4277                 }
4278                 currentSize++;
4279               }
4280             }
4281 
4282             if(/*qShifted*/(compareQuad==0)  && notIsContinuation) {
4283               if(s->flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we need to note it
4284                 if(c4>0) { // Close this part
4285                   currentSize += (c4/UCOL_BOT_COUNT4)+1;
4286                   c4 = 0;
4287                 }
4288                 currentSize++; // Add the Hiragana
4289               } else { // This wasn't Hiragana, so we can continue adding stuff
4290                 c4++;
4291               }
4292             }
4293 
4294           }
4295     }
4296 
4297     if(!isFrenchSec){
4298       if(c2 > 0) {
4299         currentSize += (c2/(uint32_t)UCOL_BOT_COUNT2)+((c2%(uint32_t)UCOL_BOT_COUNT2 != 0)?1:0);
4300       }
4301     } else {
4302       uint32_t i = 0;
4303       if(frenchStartPtr != NULL) {
4304         uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr);
4305       }
4306       for(i = 0; i<fSecsLen; i++) {
4307         secondary = *(fSecs+fSecsLen-i-1);
4308         /* This is compression code. */
4309         if (secondary == UCOL_COMMON2) {
4310           ++c2;
4311         } else {
4312           if(c2 > 0) {
4313             if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
4314               currentSize += (c2/(uint32_t)UCOL_TOP_COUNT2)+((c2%(uint32_t)UCOL_TOP_COUNT2 != 0)?1:0);
4315             } else {
4316               currentSize += (c2/(uint32_t)UCOL_BOT_COUNT2)+((c2%(uint32_t)UCOL_BOT_COUNT2 != 0)?1:0);
4317             }
4318             c2 = 0;
4319           }
4320           currentSize++;
4321         }
4322       }
4323       if(c2 > 0) {
4324         currentSize += (c2/(uint32_t)UCOL_BOT_COUNT2)+((c2%(uint32_t)UCOL_BOT_COUNT2 != 0)?1:0);
4325       }
4326       if(fSecs != fSecsBuff) {
4327         uprv_free(fSecs);
4328       }
4329     }
4330 
4331     if(c3 > 0) {
4332       currentSize += (c3/(uint32_t)coll->tertiaryBottomCount) + ((c3%(uint32_t)coll->tertiaryBottomCount != 0)?1:0);
4333     }
4334 
4335     if(c4 > 0  && compareQuad == 0) {
4336       currentSize += (c4/(uint32_t)UCOL_BOT_COUNT4)+((c4%(uint32_t)UCOL_BOT_COUNT4 != 0)?1:0);
4337     }
4338 
4339     if(compareIdent) {
4340       currentSize += u_lengthOfIdenticalLevelRun(s->string, len);
4341     }
4342     return currentSize;
4343 
4344 }
4345 
4346 static
doCaseShift(uint8_t ** cases,uint32_t & caseShift)4347 inline void doCaseShift(uint8_t **cases, uint32_t &caseShift) {
4348   if (caseShift  == 0) {
4349     *(*cases)++ = UCOL_CASE_BYTE_START;
4350     caseShift = UCOL_CASE_SHIFT_START;
4351   }
4352 }
4353 
// Appends value at primaries if there is still room before limit, and always
// counts it in size. The count therefore tells how many values were meant to
// be added, even when some of them did not fit and were dropped.
static
inline void addWithIncrement(uint8_t *&primaries, uint8_t *limit, uint32_t &size, const uint8_t value) {
  ++size;
  if(primaries >= limit) {
    // buffer exhausted: the value is counted but not stored
    return;
  }
  *primaries = value;
  ++primaries;
}
4363 
4364 // Packs the secondary buffer when processing French locale. Adds the terminator.
4365 static
packFrench(uint8_t * primaries,uint8_t * primEnd,uint8_t * secondaries,uint32_t * secsize,uint8_t * frenchStartPtr,uint8_t * frenchEndPtr)4366 inline uint8_t *packFrench(uint8_t *primaries, uint8_t *primEnd, uint8_t *secondaries, uint32_t *secsize, uint8_t *frenchStartPtr, uint8_t *frenchEndPtr) {
4367   uint8_t secondary;
4368   int32_t count2 = 0;
4369   uint32_t i = 0, size = 0;
4370   // we use i here since the key size already accounts for terminators, so we'll discard the increment
4371   addWithIncrement(primaries, primEnd, i, UCOL_LEVELTERMINATOR);
4372   /* If there are any unresolved continuation secondaries, reverse them here so that we can reverse the whole secondary thing */
4373   if(frenchStartPtr != NULL) {
4374     uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr);
4375   }
4376   for(i = 0; i<*secsize; i++) {
4377     secondary = *(secondaries-i-1);
4378     /* This is compression code. */
4379     if (secondary == UCOL_COMMON2) {
4380       ++count2;
4381     } else {
4382       if (count2 > 0) {
4383         if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
4384           while (count2 > UCOL_TOP_COUNT2) {
4385             addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2));
4386             count2 -= (uint32_t)UCOL_TOP_COUNT2;
4387           }
4388           addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_TOP2 - (count2-1)));
4389         } else {
4390           while (count2 > UCOL_BOT_COUNT2) {
4391             addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2));
4392             count2 -= (uint32_t)UCOL_BOT_COUNT2;
4393           }
4394           addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_BOT2 + (count2-1)));
4395         }
4396         count2 = 0;
4397       }
4398       addWithIncrement(primaries, primEnd, size, secondary);
4399     }
4400   }
4401   if (count2 > 0) {
4402     while (count2 > UCOL_BOT_COUNT2) {
4403       addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2));
4404       count2 -= (uint32_t)UCOL_BOT_COUNT2;
4405     }
4406     addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_BOT2 + (count2-1)));
4407   }
4408   *secsize = size;
4409   return primaries;
4410 }
4411 
4412 /* This is the sortkey work horse function */
4413 U_CFUNC int32_t U_CALLCONV
ucol_calcSortKey(const UCollator * coll,const UChar * source,int32_t sourceLength,uint8_t ** result,uint32_t resultLength,UBool allocateSKBuffer,UErrorCode * status)4414 ucol_calcSortKey(const    UCollator    *coll,
4415         const    UChar        *source,
4416         int32_t        sourceLength,
4417         uint8_t        **result,
4418         uint32_t        resultLength,
4419         UBool allocateSKBuffer,
4420         UErrorCode *status)
4421 {
4422     //const UCAConstants *UCAconsts = (UCAConstants *)((uint8_t *)coll->UCA->image + coll->image->UCAConsts);
4423 
4424     uint32_t i = 0; /* general purpose counter */
4425 
4426     /* Stack allocated buffers for buffers we use */
4427     uint8_t prim[UCOL_PRIMARY_MAX_BUFFER], second[UCOL_SECONDARY_MAX_BUFFER], tert[UCOL_TERTIARY_MAX_BUFFER], caseB[UCOL_CASE_MAX_BUFFER], quad[UCOL_QUAD_MAX_BUFFER];
4428 
4429     uint8_t *primaries = *result, *secondaries = second, *tertiaries = tert, *cases = caseB, *quads = quad;
4430 
4431     if(U_FAILURE(*status)) {
4432       return 0;
4433     }
4434 
4435     if(primaries == NULL && allocateSKBuffer == TRUE) {
4436         primaries = *result = prim;
4437         resultLength = UCOL_PRIMARY_MAX_BUFFER;
4438     }
4439 
4440     uint32_t secSize = UCOL_SECONDARY_MAX_BUFFER, terSize = UCOL_TERTIARY_MAX_BUFFER,
4441       caseSize = UCOL_CASE_MAX_BUFFER, quadSize = UCOL_QUAD_MAX_BUFFER;
4442 
4443     uint32_t sortKeySize = 1; /* it is always \0 terminated */
4444 
4445     UChar normBuffer[UCOL_NORMALIZATION_MAX_BUFFER];
4446     UChar *normSource = normBuffer;
4447     int32_t normSourceLen = UCOL_NORMALIZATION_MAX_BUFFER;
4448 
4449     int32_t len = (sourceLength == -1 ? u_strlen(source) : sourceLength);
4450 
4451     UColAttributeValue strength = coll->strength;
4452 
4453     uint8_t compareSec   = (uint8_t)((strength >= UCOL_SECONDARY)?0:0xFF);
4454     uint8_t compareTer   = (uint8_t)((strength >= UCOL_TERTIARY)?0:0xFF);
4455     uint8_t compareQuad  = (uint8_t)((strength >= UCOL_QUATERNARY)?0:0xFF);
4456     UBool  compareIdent = (strength == UCOL_IDENTICAL);
4457     UBool  doCase = (coll->caseLevel == UCOL_ON);
4458     UBool  isFrenchSec = (coll->frenchCollation == UCOL_ON) && (compareSec == 0);
4459     UBool  shifted = (coll->alternateHandling == UCOL_SHIFTED);
4460     //UBool  qShifted = shifted && (compareQuad == 0);
4461     UBool  doHiragana = (coll->hiraganaQ == UCOL_ON) && (compareQuad == 0);
4462     /*const uint8_t *scriptOrder = coll->scriptOrder;*/
4463 
4464     uint32_t variableTopValue = coll->variableTopValue;
4465     // TODO: UCOL_COMMON_BOT4 should be a function of qShifted. If we have no
4466     // qShifted, we don't need to set UCOL_COMMON_BOT4 so high.
4467     uint8_t UCOL_COMMON_BOT4 = (uint8_t)((coll->variableTopValue>>8)+1);
4468     uint8_t UCOL_HIRAGANA_QUAD = 0;
4469     if(doHiragana) {
4470       UCOL_HIRAGANA_QUAD=UCOL_COMMON_BOT4++;
4471       /* allocate one more space for hiragana, value for hiragana */
4472     }
4473     uint8_t UCOL_BOT_COUNT4 = (uint8_t)(0xFF - UCOL_COMMON_BOT4);
4474 
4475     /* support for special features like caselevel and funky secondaries */
4476     uint8_t *frenchStartPtr = NULL;
4477     uint8_t *frenchEndPtr = NULL;
4478     uint32_t caseShift = 0;
4479 
4480     sortKeySize += ((compareSec?0:1) + (compareTer?0:1) + (doCase?1:0) + /*(qShifted?1:0)*/(compareQuad?0:1) + (compareIdent?1:0));
4481 
4482     /* If we need to normalize, we'll do it all at once at the beginning! */
4483     UNormalizationMode normMode;
4484     if(compareIdent) {
4485         normMode = UNORM_NFD;
4486     } else if(coll->normalizationMode != UCOL_OFF) {
4487         normMode = UNORM_FCD;
4488     } else {
4489         normMode = UNORM_NONE;
4490     }
4491 
4492     if(normMode != UNORM_NONE && UNORM_YES != unorm_quickCheck(source, len, normMode, status)) {
4493         len = unorm_internalNormalize(normSource, normSourceLen,
4494                                       source, len,
4495                                       normMode, FALSE,
4496                                       status);
4497         if(*status == U_BUFFER_OVERFLOW_ERROR) {
4498             normSourceLen = len;
4499             normSource = (UChar *)uprv_malloc(len*U_SIZEOF_UCHAR);
4500             if(normSource == NULL) {
4501                 *status = U_MEMORY_ALLOCATION_ERROR;
4502                 return 0;
4503             }
4504             *status = U_ZERO_ERROR;
4505             len = unorm_internalNormalize(normSource, normSourceLen,
4506                                           source, len,
4507                                           normMode, FALSE,
4508                                           status);
4509         }
4510 
4511         if(U_FAILURE(*status)) {
4512             return 0;
4513         }
4514         source = normSource;
4515     }
4516 
4517     collIterate s;
4518     IInit_collIterate(coll, (UChar *)source, len, &s);
4519     if(source == normSource) {
4520         s.flags &= ~UCOL_ITER_NORM;
4521     }
4522 
4523     if(resultLength == 0 || primaries == NULL) {
4524       int32_t keyLen = ucol_getSortKeySize(coll, &s, sortKeySize, strength, len);
4525       if(normSource != normBuffer) {
4526           uprv_free(normSource);
4527       }
4528       return keyLen;
4529     }
4530     uint8_t *primarySafeEnd = primaries + resultLength - 1;
4531     if(strength > UCOL_PRIMARY) {
4532         primarySafeEnd--;
4533     }
4534 
4535     uint32_t minBufferSize = UCOL_MAX_BUFFER;
4536 
4537     uint8_t *primStart = primaries;
4538     uint8_t *secStart = secondaries;
4539     uint8_t *terStart = tertiaries;
4540     uint8_t *caseStart = cases;
4541     uint8_t *quadStart = quads;
4542 
4543     uint32_t order = 0;
4544 
4545     uint8_t primary1 = 0;
4546     uint8_t primary2 = 0;
4547     uint8_t secondary = 0;
4548     uint8_t tertiary = 0;
4549     uint8_t caseSwitch = coll->caseSwitch;
4550     uint8_t tertiaryMask = coll->tertiaryMask;
4551     int8_t tertiaryAddition = (int8_t)coll->tertiaryAddition;
4552     uint8_t tertiaryTop = coll->tertiaryTop;
4553     uint8_t tertiaryBottom = coll->tertiaryBottom;
4554     uint8_t tertiaryCommon = coll->tertiaryCommon;
4555     uint8_t caseBits = 0;
4556 
4557     UBool finished = FALSE;
4558     UBool wasShifted = FALSE;
4559     UBool notIsContinuation = FALSE;
4560 
4561     uint32_t prevBuffSize = 0;
4562 
4563     uint32_t count2 = 0, count3 = 0, count4 = 0;
4564     uint8_t leadPrimary = 0;
4565 
4566     for(;;) {
4567         for(i=prevBuffSize; i<minBufferSize; ++i) {
4568 
4569             order = ucol_IGetNextCE(coll, &s, status);
4570             if(order == UCOL_NO_MORE_CES) {
4571                 finished = TRUE;
4572                 break;
4573             }
4574 
4575             if(order == 0) {
4576               continue;
4577             }
4578 
4579             notIsContinuation = !isContinuation(order);
4580 
4581             if(notIsContinuation) {
4582               tertiary = (uint8_t)(order & UCOL_BYTE_SIZE_MASK);
4583             } else {
4584               tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION));
4585             }
4586 
4587             secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
4588             primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
4589             primary1 = (uint8_t)(order >> 8);
4590 
4591             /*if(notIsContinuation && scriptOrder != NULL) {
4592               primary1 = scriptOrder[primary1];
4593             }*/
4594 
4595             if(shifted && ((notIsContinuation && order <= variableTopValue && primary1 > 0)
4596               || (!notIsContinuation && wasShifted))
4597               || (wasShifted && primary1 == 0)) { /* amendment to the UCA says that primary ignorables */
4598               /* and other ignorables should be removed if following a shifted code point */
4599               if(primary1 == 0) { /* if we were shifted and we got an ignorable code point */
4600                                   /* we should just completely ignore it */
4601                 continue;
4602               }
4603               if(compareQuad == 0) {
4604                 if(count4 > 0) {
4605                   while (count4 > UCOL_BOT_COUNT4) {
4606                     *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4);
4607                     count4 -= UCOL_BOT_COUNT4;
4608                   }
4609                   *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + (count4-1));
4610                   count4 = 0;
4611                 }
4612                 /* We are dealing with a variable and we're treating them as shifted */
4613                 /* This is a shifted ignorable */
4614                 if(primary1 != 0) { /* we need to check this since we could be in continuation */
4615                   *quads++ = primary1;
4616                 }
4617                 if(primary2 != 0) {
4618                   *quads++ = primary2;
4619                 }
4620               }
4621               wasShifted = TRUE;
4622             } else {
4623               wasShifted = FALSE;
4624               /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */
4625               /* Usually, we'll have non-zero primary1 & primary2, except in cases of LatinOne and friends, when primary2 will   */
4626               /* regular and simple sortkey calc */
4627               if(primary1 != UCOL_IGNORABLE) {
4628                 if(notIsContinuation) {
4629                   if(leadPrimary == primary1) {
4630                     *primaries++ = primary2;
4631                   } else {
4632                     if(leadPrimary != 0) {
4633                       *primaries++ = (uint8_t)((primary1 > leadPrimary) ? UCOL_BYTE_UNSHIFTED_MAX : UCOL_BYTE_UNSHIFTED_MIN);
4634                     }
4635                     if(primary2 == UCOL_IGNORABLE) {
4636                     /* one byter, not compressed */
4637                         *primaries++ = primary1;
4638                         leadPrimary = 0;
4639                     } else if(primary1<UCOL_BYTE_FIRST_NON_LATIN_PRIMARY ||
4640                         //(primary1 > (*UCAconsts->UCA_LAST_NON_VARIABLE>>24) && primary1 < (*UCAconsts->UCA_FIRST_IMPLICIT>>24))) {
4641                        (primary1 > maxRegularPrimary && primary1 < minImplicitPrimary)) {
4642                     /* not compressible */
4643                         leadPrimary = 0;
4644                         *primaries++ = primary1;
4645                         *primaries++ = primary2;
4646                     } else { /* compress */
4647                         *primaries++ = leadPrimary = primary1;
4648                         *primaries++ = primary2;
4649                     }
4650                   }
4651                 } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */
4652                   *primaries++ = primary1;
4653                   if(primary2 != UCOL_IGNORABLE) {
4654                     *primaries++ = primary2; /* second part */
4655                   }
4656                 }
4657               }
4658 
4659             if(secondary > compareSec) {
4660               if(!isFrenchSec) {
4661                 /* This is compression code. */
4662                 if (secondary == UCOL_COMMON2 && notIsContinuation) {
4663                   ++count2;
4664                 } else {
4665                   if (count2 > 0) {
4666                     if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
4667                       while (count2 > UCOL_TOP_COUNT2) {
4668                         *secondaries++ = (uint8_t)(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2);
4669                         count2 -= (uint32_t)UCOL_TOP_COUNT2;
4670                       }
4671                       *secondaries++ = (uint8_t)(UCOL_COMMON_TOP2 - (count2-1));
4672                     } else {
4673                       while (count2 > UCOL_BOT_COUNT2) {
4674                         *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
4675                         count2 -= (uint32_t)UCOL_BOT_COUNT2;
4676                       }
4677                       *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + (count2-1));
4678                     }
4679                     count2 = 0;
4680                   }
4681                   *secondaries++ = secondary;
4682                 }
4683               } else {
4684                   *secondaries++ = secondary;
4685                   /* Do the special handling for French secondaries */
4686                   /* We need to get continuation elements and do intermediate restore */
4687                   /* abc1c2c3de with french secondaries need to be edc1c2c3ba NOT edc3c2c1ba */
4688                   if(notIsContinuation) {
4689                     if (frenchStartPtr != NULL) {
4690                         /* reverse secondaries from frenchStartPtr up to frenchEndPtr */
4691                       uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr);
4692                       frenchStartPtr = NULL;
4693                     }
4694                   } else {
4695                     if (frenchStartPtr == NULL) {
4696                       frenchStartPtr = secondaries - 2;
4697                     }
4698                     frenchEndPtr = secondaries-1;
4699                   }
4700                 }
4701               }
4702 
4703               if(doCase && (primary1 > 0 || strength >= UCOL_SECONDARY)) {
4704                 // do the case level if we need to do it. We don't want to calculate
4705                 // case level for primary ignorables if we have only primary strength and case level
4706                 // otherwise we would break well formedness of CEs
4707                 doCaseShift(&cases, caseShift);
4708                 if(notIsContinuation) {
4709                   caseBits = (uint8_t)(tertiary & 0xC0);
4710 
4711                   if(tertiary != 0) {
4712                     if(coll->caseFirst == UCOL_UPPER_FIRST) {
4713                       if((caseBits & 0xC0) == 0) {
4714                         *(cases-1) |= 1 << (--caseShift);
4715                       } else {
4716                         *(cases-1) |= 0 << (--caseShift);
4717                         /* second bit */
4718                         doCaseShift(&cases, caseShift);
4719                         *(cases-1) |= ((caseBits>>6)&1) << (--caseShift);
4720                       }
4721                     } else {
4722                       if((caseBits & 0xC0) == 0) {
4723                         *(cases-1) |= 0 << (--caseShift);
4724                       } else {
4725                         *(cases-1) |= 1 << (--caseShift);
4726                         /* second bit */
4727                         doCaseShift(&cases, caseShift);
4728                         *(cases-1) |= ((caseBits>>7)&1) << (--caseShift);
4729                       }
4730                     }
4731                   }
4732 
4733                 }
4734               } else {
4735                 if(notIsContinuation) {
4736                   tertiary ^= caseSwitch;
4737                 }
4738               }
4739 
4740               tertiary &= tertiaryMask;
4741               if(tertiary > compareTer) {
4742                 /* This is compression code. */
4743                 /* sequence size check is included in the if clause */
4744                 if (tertiary == tertiaryCommon && notIsContinuation) {
4745                   ++count3;
4746                 } else {
4747                   if(tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL) {
4748                     tertiary += tertiaryAddition;
4749                   } else if(tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST) {
4750                     tertiary -= tertiaryAddition;
4751                   }
4752                   if (count3 > 0) {
4753                     if ((tertiary > tertiaryCommon)) {
4754                       while (count3 > coll->tertiaryTopCount) {
4755                         *tertiaries++ = (uint8_t)(tertiaryTop - coll->tertiaryTopCount);
4756                         count3 -= (uint32_t)coll->tertiaryTopCount;
4757                       }
4758                       *tertiaries++ = (uint8_t)(tertiaryTop - (count3-1));
4759                     } else {
4760                       while (count3 > coll->tertiaryBottomCount) {
4761                         *tertiaries++ = (uint8_t)(tertiaryBottom + coll->tertiaryBottomCount);
4762                         count3 -= (uint32_t)coll->tertiaryBottomCount;
4763                       }
4764                       *tertiaries++ = (uint8_t)(tertiaryBottom + (count3-1));
4765                     }
4766                     count3 = 0;
4767                   }
4768                   *tertiaries++ = tertiary;
4769                 }
4770               }
4771 
4772               if(/*qShifted*/(compareQuad==0)  && notIsContinuation) {
4773                 if(s.flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we need to note it
4774                   if(count4>0) { // Close this part
4775                     while (count4 > UCOL_BOT_COUNT4) {
4776                       *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4);
4777                       count4 -= UCOL_BOT_COUNT4;
4778                     }
4779                     *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + (count4-1));
4780                     count4 = 0;
4781                   }
4782                   *quads++ = UCOL_HIRAGANA_QUAD; // Add the Hiragana
4783                 } else { // This wasn't Hiragana, so we can continue adding stuff
4784                   count4++;
4785                 }
4786               }
4787             }
4788 
4789             if(primaries > primarySafeEnd) { /* We have stepped over the primary buffer */
4790               if(allocateSKBuffer == FALSE) { /* need to save our butts if we cannot reallocate */
4791                 IInit_collIterate(coll, (UChar *)source, len, &s);
4792                 if(source == normSource) {
4793                     s.flags &= ~UCOL_ITER_NORM;
4794                 }
4795                 sortKeySize = ucol_getSortKeySize(coll, &s, sortKeySize, strength, len);
4796                 *status = U_BUFFER_OVERFLOW_ERROR;
4797                 finished = TRUE;
4798                 break;
4799               } else { /* It's much nicer if we can actually reallocate */
4800                 int32_t sks = sortKeySize+(primaries - primStart)+(secondaries - secStart)+(tertiaries - terStart)+(cases-caseStart)+(quads-quadStart);
4801                 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sks, status);
4802                 if(U_SUCCESS(*status)) {
4803                   *result = primStart;
4804                   primarySafeEnd = primStart + resultLength - 1;
4805                   if(strength > UCOL_PRIMARY) {
4806                       primarySafeEnd--;
4807                   }
4808                 } else {
4809                   IInit_collIterate(coll, (UChar *)source, len, &s);
4810                   if(source == normSource) {
4811                       s.flags &= ~UCOL_ITER_NORM;
4812                   }
4813                   sortKeySize = ucol_getSortKeySize(coll, &s, sortKeySize, strength, len);
4814                   finished = TRUE;
4815                   break;
4816                 }
4817               }
4818             }
4819         }
4820         if(finished) {
4821             break;
4822         } else {
4823           prevBuffSize = minBufferSize;
4824 
4825           uint32_t frenchStartOffset = 0, frenchEndOffset = 0;
4826           if (frenchStartPtr != NULL) {
4827              frenchStartOffset = frenchStartPtr - secStart;
4828              frenchEndOffset = frenchEndPtr - secStart;
4829           }
4830           secStart = reallocateBuffer(&secondaries, secStart, second, &secSize, 2*secSize, status);
4831           if (frenchStartPtr != NULL) {
4832              frenchStartPtr = secStart + frenchStartOffset;
4833              frenchEndPtr = secStart + frenchEndOffset;
4834           }
4835 
4836           terStart = reallocateBuffer(&tertiaries, terStart, tert, &terSize, 2*terSize, status);
4837           caseStart = reallocateBuffer(&cases, caseStart, caseB, &caseSize, 2*caseSize, status);
4838           quadStart = reallocateBuffer(&quads, quadStart, quad, &quadSize, 2*quadSize, status);
4839           minBufferSize *= 2;
4840           if(U_FAILURE(*status)) { // if we cannot reallocate buffers, we can at least give the sortkey size
4841             IInit_collIterate(coll, (UChar *)source, len, &s);
4842             if(source == normSource) {
4843                 s.flags &= ~UCOL_ITER_NORM;
4844             }
4845             sortKeySize = ucol_getSortKeySize(coll, &s, sortKeySize, strength, len);
4846             break;
4847           }
4848         }
4849     }
4850 
4851     /* Here, we are generally done with processing */
4852     /* bailing out would not be too productive */
4853 
4854     if(U_SUCCESS(*status)) {
4855       sortKeySize += (primaries - primStart);
4856       /* we have done all the CE's, now let's put them together to form a key */
4857       if(compareSec == 0) {
4858         if (count2 > 0) {
4859           while (count2 > UCOL_BOT_COUNT2) {
4860             *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
4861             count2 -= (uint32_t)UCOL_BOT_COUNT2;
4862           }
4863           *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + (count2-1));
4864         }
4865         uint32_t secsize = secondaries-secStart;
4866         if(!isFrenchSec) { // Regular situation, we know the length of secondaries
4867           sortKeySize += secsize;
4868           if(sortKeySize <= resultLength) {
4869             *(primaries++) = UCOL_LEVELTERMINATOR;
4870             uprv_memcpy(primaries, secStart, secsize);
4871             primaries += secsize;
4872           } else {
4873             if(allocateSKBuffer == TRUE) { /* need to save our butts if we cannot reallocate */
4874               primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
4875               if(U_SUCCESS(*status)) {
4876                 *result = primStart;
4877                 *(primaries++) = UCOL_LEVELTERMINATOR;
4878                 uprv_memcpy(primaries, secStart, secsize);
4879                 primaries += secsize;
4880               }
4881             } else {
4882               *status = U_BUFFER_OVERFLOW_ERROR;
4883             }
4884           }
4885         } else { // French secondary is on. We will need to pack French. packFrench will add the level terminator
4886           uint8_t *newPrim = packFrench(primaries, primStart+resultLength, secondaries, &secsize, frenchStartPtr, frenchEndPtr);
4887           sortKeySize += secsize;
4888           if(sortKeySize <= resultLength) { // if we managed to pack fine
4889             primaries = newPrim; // update the primary pointer
4890           } else { // overflow, need to reallocate and redo
4891             if(allocateSKBuffer == TRUE) { /* need to save our butts if we cannot reallocate */
4892               primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
4893               if(U_SUCCESS(*status)) {
4894                 primaries = packFrench(primaries, primStart+resultLength, secondaries, &secsize, frenchStartPtr, frenchEndPtr);
4895               }
4896             } else {
4897               *status = U_BUFFER_OVERFLOW_ERROR;
4898             }
4899           }
4900         }
4901       }
4902 
4903       if(doCase) {
4904         uint32_t casesize = cases - caseStart;
4905         sortKeySize += casesize;
4906         if(sortKeySize <= resultLength) {
4907           *(primaries++) = UCOL_LEVELTERMINATOR;
4908           uprv_memcpy(primaries, caseStart, casesize);
4909           primaries += casesize;
4910         } else {
4911           if(allocateSKBuffer == TRUE) {
4912             primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
4913             if(U_SUCCESS(*status)) {
4914               *result = primStart;
4915               *(primaries++) = UCOL_LEVELTERMINATOR;
4916               uprv_memcpy(primaries, caseStart, casesize);
4917             }
4918           } else {
4919             *status = U_BUFFER_OVERFLOW_ERROR;
4920           }
4921         }
4922       }
4923 
4924       if(compareTer == 0) {
4925         if (count3 > 0) {
4926           if (coll->tertiaryCommon != UCOL_COMMON_BOT3) {
4927             while (count3 >= coll->tertiaryTopCount) {
4928               *tertiaries++ = (uint8_t)(tertiaryTop - coll->tertiaryTopCount);
4929               count3 -= (uint32_t)coll->tertiaryTopCount;
4930             }
4931             *tertiaries++ = (uint8_t)(tertiaryTop - count3);
4932           } else {
4933             while (count3 > coll->tertiaryBottomCount) {
4934               *tertiaries++ = (uint8_t)(tertiaryBottom + coll->tertiaryBottomCount);
4935               count3 -= (uint32_t)coll->tertiaryBottomCount;
4936             }
4937             *tertiaries++ = (uint8_t)(tertiaryBottom + (count3-1));
4938           }
4939         }
4940         uint32_t tersize = tertiaries - terStart;
4941         sortKeySize += tersize;
4942         if(sortKeySize <= resultLength) {
4943           *(primaries++) = UCOL_LEVELTERMINATOR;
4944           uprv_memcpy(primaries, terStart, tersize);
4945           primaries += tersize;
4946         } else {
4947           if(allocateSKBuffer == TRUE) {
4948             primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
4949             if(U_SUCCESS(*status)) {
4950               *result = primStart;
4951               *(primaries++) = UCOL_LEVELTERMINATOR;
4952               uprv_memcpy(primaries, terStart, tersize);
4953             }
4954           } else {
4955             *status = U_BUFFER_OVERFLOW_ERROR;
4956           }
4957         }
4958 
4959         if(compareQuad == 0/*qShifted == TRUE*/) {
4960             if(count4 > 0) {
4961               while (count4 > UCOL_BOT_COUNT4) {
4962                 *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4);
4963                 count4 -= UCOL_BOT_COUNT4;
4964               }
4965               *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + (count4-1));
4966             }
4967             uint32_t quadsize = quads - quadStart;
4968             sortKeySize += quadsize;
4969             if(sortKeySize <= resultLength) {
4970               *(primaries++) = UCOL_LEVELTERMINATOR;
4971               uprv_memcpy(primaries, quadStart, quadsize);
4972               primaries += quadsize;
4973             } else {
4974               if(allocateSKBuffer == TRUE) {
4975                 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
4976                 if(U_SUCCESS(*status)) {
4977                   *result = primStart;
4978                   *(primaries++) = UCOL_LEVELTERMINATOR;
4979                   uprv_memcpy(primaries, quadStart, quadsize);
4980                 }
4981               } else {
4982                 *status = U_BUFFER_OVERFLOW_ERROR;
4983               }
4984             }
4985         }
4986 
4987         if(compareIdent) {
4988           sortKeySize += u_lengthOfIdenticalLevelRun(s.string, len);
4989           if(sortKeySize <= resultLength) {
4990             *(primaries++) = UCOL_LEVELTERMINATOR;
4991             primaries += u_writeIdenticalLevelRun(s.string, len, primaries);
4992           } else {
4993             if(allocateSKBuffer == TRUE) {
4994               primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, sortKeySize, status);
4995               if(U_SUCCESS(*status)) {
4996                 *result = primStart;
4997                 *(primaries++) = UCOL_LEVELTERMINATOR;
4998                 u_writeIdenticalLevelRun(s.string, len, primaries);
4999               }
5000             } else {
5001               *status = U_BUFFER_OVERFLOW_ERROR;
5002             }
5003           }
5004         }
5005       }
5006       *(primaries++) = '\0';
5007     }
5008 
5009     if(terStart != tert) {
5010         uprv_free(terStart);
5011         uprv_free(secStart);
5012         uprv_free(caseStart);
5013         uprv_free(quadStart);
5014     }
5015 
5016     if(normSource != normBuffer) {
5017         uprv_free(normSource);
5018     }
5019 
5020     if(allocateSKBuffer == TRUE) {
5021       *result = (uint8_t*)uprv_malloc(sortKeySize);
5022       /* test for NULL */
5023       if (*result == NULL) {
5024         *status = U_MEMORY_ALLOCATION_ERROR;
5025         return sortKeySize;
5026       }
5027       uprv_memcpy(*result, primStart, sortKeySize);
5028       if(primStart != prim) {
5029         uprv_free(primStart);
5030       }
5031     }
5032 
5033     return sortKeySize;
5034 }
5035 
5036 
5037 U_CFUNC int32_t U_CALLCONV
ucol_calcSortKeySimpleTertiary(const UCollator * coll,const UChar * source,int32_t sourceLength,uint8_t ** result,uint32_t resultLength,UBool allocateSKBuffer,UErrorCode * status)5038 ucol_calcSortKeySimpleTertiary(const    UCollator    *coll,
5039         const    UChar        *source,
5040         int32_t        sourceLength,
5041         uint8_t        **result,
5042         uint32_t        resultLength,
5043         UBool allocateSKBuffer,
5044         UErrorCode *status)
5045 {
5046     U_ALIGN_CODE(16);
5047 
5048     //const UCAConstants *UCAconsts = (UCAConstants *)((uint8_t *)coll->UCA->image + coll->image->UCAConsts);
5049     uint32_t i = 0; /* general purpose counter */
5050 
5051     /* Stack allocated buffers for buffers we use */
5052     uint8_t prim[UCOL_PRIMARY_MAX_BUFFER], second[UCOL_SECONDARY_MAX_BUFFER], tert[UCOL_TERTIARY_MAX_BUFFER];
5053 
5054     uint8_t *primaries = *result, *secondaries = second, *tertiaries = tert;
5055 
5056     if(U_FAILURE(*status)) {
5057       return 0;
5058     }
5059 
5060     if(primaries == NULL && allocateSKBuffer == TRUE) {
5061         primaries = *result = prim;
5062         resultLength = UCOL_PRIMARY_MAX_BUFFER;
5063     }
5064 
5065     uint32_t secSize = UCOL_SECONDARY_MAX_BUFFER, terSize = UCOL_TERTIARY_MAX_BUFFER;
5066 
5067     uint32_t sortKeySize = 3; /* it is always \0 terminated plus separators for secondary and tertiary */
5068 
5069     UChar normBuffer[UCOL_NORMALIZATION_MAX_BUFFER];
5070     UChar *normSource = normBuffer;
5071     int32_t normSourceLen = UCOL_NORMALIZATION_MAX_BUFFER;
5072 
5073     int32_t len =  sourceLength;
5074 
5075     /* If we need to normalize, we'll do it all at once at the beginning! */
5076     if(coll->normalizationMode != UCOL_OFF && UNORM_YES != unorm_quickCheck(source, len, UNORM_FCD, status)) {
5077         len = unorm_internalNormalize(normSource, normSourceLen,
5078                                       source, len,
5079                                       UNORM_FCD, FALSE,
5080                                       status);
5081         if(*status == U_BUFFER_OVERFLOW_ERROR) {
5082             normSourceLen = len;
5083             normSource = (UChar *)uprv_malloc(len*U_SIZEOF_UCHAR);
5084             if(normSource == NULL) {
5085                 *status = U_MEMORY_ALLOCATION_ERROR;
5086                 return 0;
5087             }
5088             *status = U_ZERO_ERROR;
5089             len = unorm_internalNormalize(normSource, normSourceLen,
5090                                           source, len,
5091                                           UNORM_FCD, FALSE,
5092                                           status);
5093         }
5094 
5095         if(U_FAILURE(*status)) {
5096             return 0;
5097         }
5098         source = normSource;
5099     }
5100 
5101     collIterate s;
5102     IInit_collIterate(coll, (UChar *)source, len, &s);
5103     if(source == normSource) {
5104         s.flags &= ~UCOL_ITER_NORM;
5105     }
5106 
5107     if(resultLength == 0 || primaries == NULL) {
5108         int32_t t = ucol_getSortKeySize(coll, &s, sortKeySize, coll->strength, len);
5109         if(normSource != normBuffer) {
5110             uprv_free(normSource);
5111         }
5112         return t;
5113     }
5114 
5115     uint8_t *primarySafeEnd = primaries + resultLength - 2;
5116 
5117     uint32_t minBufferSize = UCOL_MAX_BUFFER;
5118 
5119     uint8_t *primStart = primaries;
5120     uint8_t *secStart = secondaries;
5121     uint8_t *terStart = tertiaries;
5122 
5123     uint32_t order = 0;
5124 
5125     uint8_t primary1 = 0;
5126     uint8_t primary2 = 0;
5127     uint8_t secondary = 0;
5128     uint8_t tertiary = 0;
5129     uint8_t caseSwitch = coll->caseSwitch;
5130     uint8_t tertiaryMask = coll->tertiaryMask;
5131     int8_t tertiaryAddition = (int8_t)coll->tertiaryAddition;
5132     uint8_t tertiaryTop = coll->tertiaryTop;
5133     uint8_t tertiaryBottom = coll->tertiaryBottom;
5134     uint8_t tertiaryCommon = coll->tertiaryCommon;
5135 
5136     uint32_t prevBuffSize = 0;
5137 
5138     UBool finished = FALSE;
5139     UBool notIsContinuation = FALSE;
5140 
5141     uint32_t count2 = 0, count3 = 0;
5142     uint8_t leadPrimary = 0;
5143 
5144     for(;;) {
5145         for(i=prevBuffSize; i<minBufferSize; ++i) {
5146 
5147             order = ucol_IGetNextCE(coll, &s, status);
5148 
5149             if(order == 0) {
5150               continue;
5151             }
5152 
5153             if(order == UCOL_NO_MORE_CES) {
5154                 finished = TRUE;
5155                 break;
5156             }
5157 
5158             notIsContinuation = !isContinuation(order);
5159 
5160             if(notIsContinuation) {
5161               tertiary = (uint8_t)((order & tertiaryMask));
5162             } else {
5163               tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION));
5164             }
5165             secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
5166             primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
5167             primary1 = (uint8_t)(order >> 8);
5168 
5169             /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */
5170             /* Usually, we'll have non-zero primary1 & primary2, except in cases of LatinOne and friends, when primary2 will   */
5171             /* be zero with non zero primary1. primary3 is different than 0 only for long primaries - see above.               */
5172             /* regular and simple sortkey calc */
5173             if(primary1 != UCOL_IGNORABLE) {
5174               if(notIsContinuation) {
5175                 if(leadPrimary == primary1) {
5176                   *primaries++ = primary2;
5177                 } else {
5178                   if(leadPrimary != 0) {
5179                     *primaries++ = (uint8_t)((primary1 > leadPrimary) ? UCOL_BYTE_UNSHIFTED_MAX : UCOL_BYTE_UNSHIFTED_MIN);
5180                   }
5181                   if(primary2 == UCOL_IGNORABLE) {
5182                   /* one byter, not compressed */
5183                       *primaries++ = primary1;
5184                       leadPrimary = 0;
5185                   } else if(primary1<UCOL_BYTE_FIRST_NON_LATIN_PRIMARY ||
5186                       //(primary1 > (UCOL_RESET_TOP_VALUE>>24) && primary1 < (UCOL_NEXT_TOP_VALUE>>24)))
5187                       //(primary1 > (*UCAconsts->UCA_LAST_NON_VARIABLE>>24) && primary1 < (*UCAconsts->UCA_FIRST_IMPLICIT>>24))) {
5188                       (primary1 > maxRegularPrimary && primary1 < minImplicitPrimary)) {
5189                   /* not compressible */
5190                       leadPrimary = 0;
5191                       *primaries++ = primary1;
5192                       *primaries++ = primary2;
5193                   } else { /* compress */
5194                       *primaries++ = leadPrimary = primary1;
5195                       *primaries++ = primary2;
5196                   }
5197                 }
5198               } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */
5199                 *primaries++ = primary1;
5200                 if(primary2 != UCOL_IGNORABLE) {
5201                   *primaries++ = primary2; /* second part */
5202                 }
5203               }
5204             }
5205 
5206             if(secondary > 0) { /* I think that != 0 test should be != IGNORABLE */
5207               /* This is compression code. */
5208               if (secondary == UCOL_COMMON2 && notIsContinuation) {
5209                 ++count2;
5210               } else {
5211                 if (count2 > 0) {
5212                   if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
5213                     while (count2 > UCOL_TOP_COUNT2) {
5214                       *secondaries++ = (uint8_t)(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2);
5215                       count2 -= (uint32_t)UCOL_TOP_COUNT2;
5216                     }
5217                     *secondaries++ = (uint8_t)(UCOL_COMMON_TOP2 - (count2-1));
5218                   } else {
5219                     while (count2 > UCOL_BOT_COUNT2) {
5220                       *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
5221                       count2 -= (uint32_t)UCOL_BOT_COUNT2;
5222                     }
5223                     *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + (count2-1));
5224                   }
5225                   count2 = 0;
5226                 }
5227                 *secondaries++ = secondary;
5228               }
5229             }
5230 
5231             if(notIsContinuation) {
5232               tertiary ^= caseSwitch;
5233             }
5234 
5235               if(tertiary > 0) {
5236               /* This is compression code. */
5237               /* sequence size check is included in the if clause */
5238               if (tertiary == tertiaryCommon && notIsContinuation) {
5239                 ++count3;
5240               } else {
5241                 if(tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL) {
5242                   tertiary += tertiaryAddition;
5243                 } else if (tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST) {
5244                   tertiary -= tertiaryAddition;
5245                 }
5246                 if (count3 > 0) {
5247                   if ((tertiary > tertiaryCommon)) {
5248                     while (count3 > coll->tertiaryTopCount) {
5249                       *tertiaries++ = (uint8_t)(tertiaryTop - coll->tertiaryTopCount);
5250                       count3 -= (uint32_t)coll->tertiaryTopCount;
5251                     }
5252                     *tertiaries++ = (uint8_t)(tertiaryTop - (count3-1));
5253                   } else {
5254                     while (count3 > coll->tertiaryBottomCount) {
5255                       *tertiaries++ = (uint8_t)(tertiaryBottom + coll->tertiaryBottomCount);
5256                       count3 -= (uint32_t)coll->tertiaryBottomCount;
5257                     }
5258                     *tertiaries++ = (uint8_t)(tertiaryBottom + (count3-1));
5259                   }
5260                   count3 = 0;
5261                 }
5262                 *tertiaries++ = tertiary;
5263               }
5264             }
5265 
5266             if(primaries > primarySafeEnd) { /* We have stepped over the primary buffer */
5267               if(allocateSKBuffer == FALSE) { /* need to save our butts if we cannot reallocate */
5268                 IInit_collIterate(coll, (UChar *)source, len, &s);
5269                 if(source == normSource) {
5270                     s.flags &= ~UCOL_ITER_NORM;
5271                 }
5272                 sortKeySize = ucol_getSortKeySize(coll, &s, sortKeySize, coll->strength, len);
5273                 *status = U_BUFFER_OVERFLOW_ERROR;
5274                 finished = TRUE;
5275                 break;
5276               } else { /* It's much nicer if we can actually reallocate */
5277                 int32_t sks = sortKeySize+(primaries - primStart)+(secondaries - secStart)+(tertiaries - terStart);
5278                 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sks, status);
5279                 if(U_SUCCESS(*status)) {
5280                   *result = primStart;
5281                   primarySafeEnd = primStart + resultLength - 2;
5282                 } else {
5283                   IInit_collIterate(coll, (UChar *)source, len, &s);
5284                   if(source == normSource) {
5285                       s.flags &= ~UCOL_ITER_NORM;
5286                   }
5287                   sortKeySize = ucol_getSortKeySize(coll, &s, sortKeySize, coll->strength, len);
5288                   finished = TRUE;
5289                   break;
5290                 }
5291               }
5292             }
5293         }
5294         if(finished) {
5295             break;
5296         } else {
5297           prevBuffSize = minBufferSize;
5298           secStart = reallocateBuffer(&secondaries, secStart, second, &secSize, 2*secSize, status);
5299           terStart = reallocateBuffer(&tertiaries, terStart, tert, &terSize, 2*terSize, status);
5300           minBufferSize *= 2;
5301           if(U_FAILURE(*status)) { // if we cannot reallocate buffers, we can at least give the sortkey size
5302             IInit_collIterate(coll, (UChar *)source, len, &s);
5303             if(source == normSource) {
5304                 s.flags &= ~UCOL_ITER_NORM;
5305             }
5306             sortKeySize = ucol_getSortKeySize(coll, &s, sortKeySize, coll->strength, len);
5307             break;
5308           }
5309         }
5310     }
5311 
5312     if(U_SUCCESS(*status)) {
5313       sortKeySize += (primaries - primStart);
5314       /* we have done all the CE's, now let's put them together to form a key */
5315       if (count2 > 0) {
5316         while (count2 > UCOL_BOT_COUNT2) {
5317           *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
5318           count2 -= (uint32_t)UCOL_BOT_COUNT2;
5319         }
5320         *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + (count2-1));
5321       }
5322       uint32_t secsize = secondaries-secStart;
5323       sortKeySize += secsize;
5324       if(sortKeySize <= resultLength) {
5325         *(primaries++) = UCOL_LEVELTERMINATOR;
5326         uprv_memcpy(primaries, secStart, secsize);
5327         primaries += secsize;
5328       } else {
5329         if(allocateSKBuffer == TRUE) {
5330           primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
5331           if(U_SUCCESS(*status)) {
5332             *(primaries++) = UCOL_LEVELTERMINATOR;
5333             *result = primStart;
5334             uprv_memcpy(primaries, secStart, secsize);
5335           }
5336         } else {
5337           *status = U_BUFFER_OVERFLOW_ERROR;
5338         }
5339       }
5340 
5341       if (count3 > 0) {
5342         if (coll->tertiaryCommon != UCOL_COMMON3_NORMAL) {
5343           while (count3 >= coll->tertiaryTopCount) {
5344             *tertiaries++ = (uint8_t)(tertiaryTop - coll->tertiaryTopCount);
5345             count3 -= (uint32_t)coll->tertiaryTopCount;
5346           }
5347           *tertiaries++ = (uint8_t)(tertiaryTop - count3);
5348         } else {
5349           while (count3 > coll->tertiaryBottomCount) {
5350             *tertiaries++ = (uint8_t)(tertiaryBottom + coll->tertiaryBottomCount);
5351             count3 -= (uint32_t)coll->tertiaryBottomCount;
5352           }
5353           *tertiaries++ = (uint8_t)(tertiaryBottom + (count3-1));
5354         }
5355       }
5356       uint32_t tersize = tertiaries - terStart;
5357       sortKeySize += tersize;
5358       if(sortKeySize <= resultLength) {
5359         *(primaries++) = UCOL_LEVELTERMINATOR;
5360         uprv_memcpy(primaries, terStart, tersize);
5361         primaries += tersize;
5362       } else {
5363         if(allocateSKBuffer == TRUE) {
5364           primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
5365           if(U_SUCCESS(*status)) {
5366             *result = primStart;
5367             *(primaries++) = UCOL_LEVELTERMINATOR;
5368             uprv_memcpy(primaries, terStart, tersize);
5369           }
5370         } else {
5371           *status = U_MEMORY_ALLOCATION_ERROR;
5372         }
5373       }
5374 
5375       *(primaries++) = '\0';
5376     }
5377 
5378     if(terStart != tert) {
5379         uprv_free(terStart);
5380         uprv_free(secStart);
5381     }
5382 
5383     if(normSource != normBuffer) {
5384         uprv_free(normSource);
5385     }
5386 
5387     if(allocateSKBuffer == TRUE) {
5388       *result = (uint8_t*)uprv_malloc(sortKeySize);
5389       /* test for NULL */
5390       if (*result == NULL) {
5391         *status = U_MEMORY_ALLOCATION_ERROR;
5392         return sortKeySize;
5393       }
5394       uprv_memcpy(*result, primStart, sortKeySize);
5395       if(primStart != prim) {
5396         uprv_free(primStart);
5397       }
5398     }
5399 
5400     return sortKeySize;
5401 }
5402 
5403 static inline
isShiftedCE(uint32_t CE,uint32_t LVT,UBool * wasShifted)5404 UBool isShiftedCE(uint32_t CE, uint32_t LVT, UBool *wasShifted) {
5405   UBool notIsContinuation = !isContinuation(CE);
5406   uint8_t primary1 = (uint8_t)((CE >> 24) & 0xFF);
5407   if(LVT && ((notIsContinuation && (CE & 0xFFFF0000)<= LVT && primary1 > 0)
5408     || (!notIsContinuation && *wasShifted))
5409     || (*wasShifted && primary1 == 0)) { /* amendment to the UCA says that primary ignorables */
5410     // The stuff below should probably be in the sortkey code... maybe not...
5411     if(primary1 != 0) { /* if we were shifted and we got an ignorable code point */
5412                         /* we should just completely ignore it */
5413       *wasShifted = TRUE;
5414       //continue;
5415     }
5416     //*wasShifted = TRUE;
5417     return TRUE;
5418   } else {
5419     *wasShifted = FALSE;
5420     return FALSE;
5421   }
5422 }
5423 static inline
terminatePSKLevel(int32_t level,int32_t maxLevel,int32_t & i,uint8_t * dest)5424 void terminatePSKLevel(int32_t level, int32_t maxLevel, int32_t &i, uint8_t *dest) {
5425   if(level < maxLevel) {
5426     dest[i++] = UCOL_LEVELTERMINATOR;
5427   } else {
5428     dest[i++] = 0;
5429   }
5430 }
5431 
/** Level identifiers used during partial sort key generation. */
enum {
    UCOL_PSK_PRIMARY    = 0,
    UCOL_PSK_SECONDARY  = 1,
    UCOL_PSK_CASE       = 2,
    UCOL_PSK_TERTIARY   = 3,
    UCOL_PSK_QUATERNARY = 4,
    /** Extra level that is not used; the three-bit level field leaves room for it. */
    UCOL_PSK_QUIN       = 5,
    UCOL_PSK_IDENTICAL  = 6,
    /** Level for the end of the sort key; it just produces zeros. */
    UCOL_PSK_NULL       = 7,
    UCOL_PSK_LIMIT
};
5444 
/**
 * Layout of the collation processing state stored in state[1].
 * To read a field, shift the state word right by its *_SHIFT value and
 * AND the result with the matching *_MASK value.
 */
enum {
    /** Level identifier: holds one of the UCOL_PSK_* level values (three bits). */
    UCOL_PSK_LEVEL_SHIFT = 0,
    UCOL_PSK_LEVEL_MASK = 7,

    /**
     * Number of bytes of a primary or quaternary already written. It can
     * only be 0 or 1, since up to two bytes come from a primary or
     * quaternary. The same bit also denotes that the French secondary
     * level is finished.
     */
    UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT = 3,
    UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK = 1,

    /** Boolean: was the last value shifted. */
    UCOL_PSK_WAS_SHIFTED_SHIFT = 4,
    UCOL_PSK_WAS_SHIFTED_MASK = 1,

    /**
     * How many French bytes have already been written (two-bit field).
     * French secondaries are emitted in reverse, but continuations must
     * keep their order: abc1c2c3de has to come out as edc1c2c3ba.
     */
    UCOL_PSK_USED_FRENCH_SHIFT = 5,
    UCOL_PSK_USED_FRENCH_MASK = 3,

    /** Number of bytes already written from a bocsu sequence (two-bit field). */
    UCOL_PSK_BOCSU_BYTES_SHIFT = 7,
    UCOL_PSK_BOCSU_BYTES_MASK = 3,

    /** Count of consumed CEs; the mask keeps 19 bits of the shifted word. */
    UCOL_PSK_CONSUMED_CES_SHIFT = 9,
    UCOL_PSK_CONSUMED_CES_MASK = 0x7FFFF
};
5470 
// Macro calculating the number of expansion CEs available: the difference
// between the CEpos and toReturn members of the iterate structure s.
// The whole expansion is parenthesized so that the macro behaves as a single
// operand inside larger expressions (e.g. !uprv_numAvailableExpCEs(s) or
// 2 * uprv_numAvailableExpCEs(s)).
#define uprv_numAvailableExpCEs(s) ((s).CEpos - (s).toReturn)
5473 
5474 
/** main sortkey part procedure. On the first call,
 *  you should pass in a collator, an iterator, an empty state
 *  (state[0] == state[1] == 0), a buffer to hold the results,
 *  the number of bytes you need and an error code pointer.
5479  *  Make sure your buffer is big enough to hold the wanted
5480  *  number of sortkey bytes. I don't check.
5481  *  The only meaningful status you can get back is
5482  *  U_BUFFER_OVERFLOW_ERROR, which basically means that you
5483  *  have been dealt a raw deal and that you probably won't
5484  *  be able to use partial sortkey generation for this
5485  *  particular combination of string and collator. This
5486  *  is highly unlikely, but you should still check the error code.
5487  *  Any other status means that you're not in a sane situation
5488  *  anymore. After the first call, preserve state values and
5489  *  use them on subsequent calls to obtain more bytes of a sortkey.
5490  *  Use until the number of bytes written is smaller than the requested
5491  *  number of bytes. Generated sortkey is not compatible with the
5492  *  one generated by ucol_getSortKey, as we don't do any compression.
5493  *  However, levels are still terminated by a 1 (one) and the sortkey
5494  *  is terminated by a 0 (zero). Identical level is the same as in the
5495  *  regular sortkey - internal bocu-1 implementation is used.
 *  For the curious (although you cannot do much about this), here is
 *  the structure of the state words.
5498  *  state[0] - iterator state. Depends on the iterator implementation,
5499  *             but allows the iterator to continue where it stopped in
5500  *             the last iteration.
5501  *  state[1] - collation processing state. Here is the distribution
5502  *             of the bits:
5503  *   0, 1, 2 - level of the sortkey - primary, secondary, case, tertiary
5504  *             quaternary, quin (we don't use this one), identical and
5505  *             null (producing only zeroes - first one to terminate the
5506  *             sortkey and subsequent to fill the buffer).
5507  *   3       - byte count. Number of bytes written on the primary level.
5508  *   4       - was shifted. Whether the previous iteration finished in the
5509  *             shifted state.
5510  *   5, 6    - French continuation bytes written. See the comment in the enum
5511  *   7,8     - Bocsu bytes used. Number of bytes from a bocu sequence on
5512  *             the identical level.
 *   9..31   - CEs consumed. Number of getCE or next32 operations performed
 *             since the last successful update of the iterator state.
5515  */
5516 U_CAPI int32_t U_EXPORT2
ucol_nextSortKeyPart(const UCollator * coll,UCharIterator * iter,uint32_t state[2],uint8_t * dest,int32_t count,UErrorCode * status)5517 ucol_nextSortKeyPart(const UCollator *coll,
5518                      UCharIterator *iter,
5519                      uint32_t state[2],
5520                      uint8_t *dest, int32_t count,
5521                      UErrorCode *status) {
5522     /* error checking */
5523     if(status==NULL || U_FAILURE(*status)) {
5524         return 0;
5525     }
5526     UTRACE_ENTRY(UTRACE_UCOL_NEXTSORTKEYPART);
5527     if( coll==NULL || iter==NULL ||
5528         state==NULL ||
5529         count<0 || (count>0 && dest==NULL)
5530     ) {
5531         *status=U_ILLEGAL_ARGUMENT_ERROR;
5532         UTRACE_EXIT_STATUS(status);
5533         return 0;
5534     }
5535 
5536     UTRACE_DATA6(UTRACE_VERBOSE, "coll=%p, iter=%p, state=%d %d, dest=%p, count=%d",
5537                   coll, iter, state[0], state[1], dest, count);
5538 
5539     if(count==0) {
5540         /* nothing to do */
5541         UTRACE_EXIT_VALUE(0);
5542         return 0;
5543     }
5544     /** Setting up situation according to the state we got from the previous iteration */
5545     // The state of the iterator from the previous invocation
5546     uint32_t iterState = state[0];
5547     // Has the last iteration ended in the shifted state
5548     UBool wasShifted = ((state[1] >> UCOL_PSK_WAS_SHIFTED_SHIFT) & UCOL_PSK_WAS_SHIFTED_MASK)?TRUE:FALSE;
5549     // What is the current level of the sortkey?
5550     int32_t level= (state[1] >> UCOL_PSK_LEVEL_SHIFT) & UCOL_PSK_LEVEL_MASK;
5551     // Have we written only one byte from a two byte primary in the previous iteration?
5552     // Also on secondary level - have we finished with the French secondary?
5553     int32_t byteCountOrFrenchDone = (state[1] >> UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT) & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK;
5554     // number of bytes in the continuation buffer for French
5555     int32_t usedFrench = (state[1] >> UCOL_PSK_USED_FRENCH_SHIFT) & UCOL_PSK_USED_FRENCH_MASK;
5556     // Number of bytes already written from a bocsu sequence. Since
5557     // the longes bocsu sequence is 4 long, this can be up to 3.
5558     int32_t bocsuBytesUsed = (state[1] >> UCOL_PSK_BOCSU_BYTES_SHIFT) & UCOL_PSK_BOCSU_BYTES_MASK;
5559     // Number of elements that need to be consumed in this iteration because
5560     // the iterator returned UITER_NO_STATE at the end of the last iteration,
5561     // so we had to save the last valid state.
5562     int32_t cces = (state[1] >> UCOL_PSK_CONSUMED_CES_SHIFT) & UCOL_PSK_CONSUMED_CES_MASK;
5563 
5564     /** values that depend on the collator attributes */
5565     // strength of the collator.
5566     int32_t strength = ucol_getAttribute(coll, UCOL_STRENGTH, status);
5567     // maximal level of the partial sortkey. Need to take whether case level is done
5568     int32_t maxLevel = 0;
5569     if(strength < UCOL_TERTIARY) {
5570       if(ucol_getAttribute(coll, UCOL_CASE_LEVEL, status) == UCOL_ON) {
5571         maxLevel = UCOL_PSK_CASE;
5572       } else {
5573         maxLevel = strength;
5574       }
5575     } else {
5576         if(strength == UCOL_TERTIARY) {
5577           maxLevel = UCOL_PSK_TERTIARY;
5578         } else if(strength == UCOL_QUATERNARY) {
5579           maxLevel = UCOL_PSK_QUATERNARY;
5580         } else { // identical
5581           maxLevel = UCOL_IDENTICAL;
5582         }
5583     }
5584     // value for the quaternary level if Hiragana is encountered. Used for JIS X 4061 collation
5585     uint8_t UCOL_HIRAGANA_QUAD =
5586       (ucol_getAttribute(coll, UCOL_HIRAGANA_QUATERNARY_MODE, status) == UCOL_ON)?0xFE:0xFF;
5587     // Boundary value that decides whether a CE is shifted or not
5588     uint32_t LVT = (coll->alternateHandling == UCOL_SHIFTED)?(coll->variableTopValue<<16):0;
5589     // Are we doing French collation?
5590     UBool doingFrench = (ucol_getAttribute(coll, UCOL_FRENCH_COLLATION, status) == UCOL_ON);
5591 
5592     /** initializing the collation state */
5593     UBool notIsContinuation = FALSE;
5594     uint32_t CE = UCOL_NO_MORE_CES;
5595 
5596     collIterate s;
5597     IInit_collIterate(coll, NULL, -1, &s);
5598     s.iterator = iter;
5599     s.flags |= UCOL_USE_ITERATOR;
5600     // This variable tells us whether we have produced some other levels in this iteration
5601     // before we moved to the identical level. In that case, we need to switch the
5602     // type of the iterator.
5603     UBool doingIdenticalFromStart = FALSE;
5604     // Normalizing iterator
5605     // The division for the array length may truncate the array size to
5606     // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
5607     // for all platforms anyway.
5608     UAlignedMemory stackNormIter[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
5609     UNormIterator *normIter = NULL;
5610     // If the normalization is turned on for the collator and we are below identical level
5611     // we will use a FCD normalizing iterator
5612     if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON && level < UCOL_PSK_IDENTICAL) {
5613       normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status);
5614       s.iterator = unorm_setIter(normIter, iter, UNORM_FCD, status);
5615       s.flags &= ~UCOL_ITER_NORM;
5616       if(U_FAILURE(*status)) {
5617         UTRACE_EXIT_STATUS(*status);
5618         return 0;
5619       }
5620     } else if(level == UCOL_PSK_IDENTICAL) {
5621       // for identical level, we need a NFD iterator. We need to instantiate it here, since we
5622       // will be updating the state - and this cannot be done on an ordinary iterator.
5623       normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status);
5624       s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status);
5625       s.flags &= ~UCOL_ITER_NORM;
5626       if(U_FAILURE(*status)) {
5627         UTRACE_EXIT_STATUS(*status);
5628         return 0;
5629       }
5630       doingIdenticalFromStart = TRUE;
5631     }
5632 
5633     // This is the tentative new state of the iterator. The problem
5634     // is that the iterator might return an undefined state, in
5635     // which case we should save the last valid state and increase
5636     // the iterator skip value.
5637     uint32_t newState = 0;
5638 
5639     // First, we set the iterator to the last valid position
5640     // from the last iteration. This was saved in state[0].
5641     if(iterState == 0) {
5642       /* initial state */
5643       if(level == UCOL_PSK_SECONDARY && doingFrench && !byteCountOrFrenchDone) {
5644         s.iterator->move(s.iterator, 0, UITER_LIMIT);
5645       } else {
5646         s.iterator->move(s.iterator, 0, UITER_START);
5647       }
5648     } else {
5649         /* reset to previous state */
5650       s.iterator->setState(s.iterator, iterState, status);
5651       if(U_FAILURE(*status)) {
5652           UTRACE_EXIT_STATUS(*status);
5653           return 0;
5654       }
5655     }
5656 
5657 
5658 
5659     // This variable tells us whether we can attempt to update the state
5660     // of iterator. Situations where we don't want to update iterator state
5661     // are the existence of expansion CEs that are not yet processed, and
5662     // finishing the case level without enough space in the buffer to insert
5663     // a level terminator.
5664     UBool canUpdateState = TRUE;
5665 
5666     // Consume all the CEs that were consumed at the end of the previous
5667     // iteration without updating the iterator state. On identical level,
5668     // consume the code points.
5669     int32_t counter = cces;
5670     if(level < UCOL_PSK_IDENTICAL) {
5671       while(counter-->0) {
5672         // If we're doing French and we are on the secondary level,
5673         // we go backwards.
5674         if(level == UCOL_PSK_SECONDARY && doingFrench) {
5675           CE = ucol_IGetPrevCE(coll, &s, status);
5676         } else {
5677           CE = ucol_IGetNextCE(coll, &s, status);
5678         }
5679         if(CE==UCOL_NO_MORE_CES) {
5680           /* should not happen */
5681           *status=U_INTERNAL_PROGRAM_ERROR;
5682           UTRACE_EXIT_STATUS(*status);
5683           return 0;
5684         }
5685         if(uprv_numAvailableExpCEs(s)) {
5686           canUpdateState = FALSE;
5687         }
5688       }
5689     } else {
5690       while(counter-->0) {
5691         uiter_next32(s.iterator);
5692       }
5693     }
5694 
5695     // French secondary needs to know whether the iterator state of zero came from previous level OR
5696     // from a new invocation...
5697     UBool wasDoingPrimary = FALSE;
5698     // destination buffer byte counter. When this guy
5699     // gets to count, we're done with the iteration
5700     int32_t i = 0;
5701     // used to count the zero bytes written after we
5702     // have finished with the sort key
5703     int32_t j = 0;
5704 
5705 
5706     // Hm.... I think we're ready to plunge in. Basic story is as following:
5707     // we have a fall through case based on level. This is used for initial
5708     // positioning on iteration start. Every level processor contains a
5709     // for(;;) which will be broken when we exhaust all the CEs. Other
5710     // way to exit is a goto saveState, which happens when we have filled
5711     // out our buffer.
5712     switch(level) {
5713     case UCOL_PSK_PRIMARY:
5714       wasDoingPrimary = TRUE;
5715       for(;;) {
5716           if(i==count) {
5717               goto saveState;
5718           }
5719           // We should save the state only if we
5720           // are sure that we are done with the
5721           // previous iterator state
5722           if(canUpdateState && byteCountOrFrenchDone == 0) {
5723             newState = s.iterator->getState(s.iterator);
5724             if(newState != UITER_NO_STATE) {
5725               iterState = newState;
5726               cces = 0;
5727             }
5728           }
5729           CE = ucol_IGetNextCE(coll, &s, status);
5730           cces++;
5731           if(CE==UCOL_NO_MORE_CES) {
5732               // Add the level separator
5733               terminatePSKLevel(level, maxLevel, i, dest);
5734               byteCountOrFrenchDone=0;
5735               // Restart the iteration an move to the
5736               // second level
5737               s.iterator->move(s.iterator, 0, UITER_START);
5738               cces = 0;
5739               level = UCOL_PSK_SECONDARY;
5740               break;
5741           }
5742           if(!isShiftedCE(CE, LVT, &wasShifted)) {
5743             CE >>= UCOL_PRIMARYORDERSHIFT; /* get primary */
5744             if(CE != 0) {
5745               if(byteCountOrFrenchDone == 0) {
5746                 // get the second byte of primary
5747                 dest[i++]=(uint8_t)(CE >> 8);
5748               } else {
5749                 byteCountOrFrenchDone = 0;
5750               }
5751               if((CE &=0xff)!=0) {
5752                   if(i==count) {
5753                       /* overflow */
5754                       byteCountOrFrenchDone = 1;
5755                       cces--;
5756                       goto saveState;
5757                   }
5758                   dest[i++]=(uint8_t)CE;
5759               }
5760             }
5761           }
5762           if(uprv_numAvailableExpCEs(s)) {
5763             canUpdateState = FALSE;
5764           } else {
5765             canUpdateState = TRUE;
5766           }
5767       }
5768       /* fall through to next level */
5769     case UCOL_PSK_SECONDARY:
5770       if(strength >= UCOL_SECONDARY) {
5771         if(!doingFrench) {
5772           for(;;) {
5773             if(i == count) {
5774               goto saveState;
5775             }
5776             // We should save the state only if we
5777             // are sure that we are done with the
5778             // previous iterator state
5779             if(canUpdateState) {
5780               newState = s.iterator->getState(s.iterator);
5781               if(newState != UITER_NO_STATE) {
5782                 iterState = newState;
5783                 cces = 0;
5784               }
5785             }
5786             CE = ucol_IGetNextCE(coll, &s, status);
5787             cces++;
5788             if(CE==UCOL_NO_MORE_CES) {
5789                 // Add the level separator
5790                 terminatePSKLevel(level, maxLevel, i, dest);
5791                 byteCountOrFrenchDone = 0;
5792                 // Restart the iteration an move to the
5793                 // second level
5794                 s.iterator->move(s.iterator, 0, UITER_START);
5795                 cces = 0;
5796                 level = UCOL_PSK_CASE;
5797                 break;
5798             }
5799             if(!isShiftedCE(CE, LVT, &wasShifted)) {
5800               CE >>= 8; /* get secondary */
5801               if(CE != 0) {
5802                 dest[i++]=(uint8_t)CE;
5803               }
5804             }
5805             if(uprv_numAvailableExpCEs(s)) {
5806               canUpdateState = FALSE;
5807             } else {
5808               canUpdateState = TRUE;
5809             }
5810           }
5811         } else { // French secondary processing
5812           uint8_t frenchBuff[UCOL_MAX_BUFFER];
5813           int32_t frenchIndex = 0;
5814           // Here we are going backwards.
5815           // If the iterator is at the beggining, it should be
5816           // moved to end.
5817           if(wasDoingPrimary) {
5818             s.iterator->move(s.iterator, 0, UITER_LIMIT);
5819             cces = 0;
5820           }
5821           for(;;) {
5822             if(i == count) {
5823               goto saveState;
5824             }
5825             if(canUpdateState) {
5826               newState = s.iterator->getState(s.iterator);
5827               if(newState != UITER_NO_STATE) {
5828                 iterState = newState;
5829                 cces = 0;
5830               }
5831             }
5832             CE = ucol_IGetPrevCE(coll, &s, status);
5833             cces++;
5834             if(CE==UCOL_NO_MORE_CES) {
5835                 // Add the level separator
5836                 terminatePSKLevel(level, maxLevel, i, dest);
5837                 byteCountOrFrenchDone = 0;
5838                 // Restart the iteration an move to the next level
5839                 s.iterator->move(s.iterator, 0, UITER_START);
5840                 level = UCOL_PSK_CASE;
5841                 break;
5842             }
5843             if(isContinuation(CE)) { // if it's a continuation, we want to save it and
5844               // reverse when we get a first non-continuation CE.
5845               CE >>= 8;
5846               frenchBuff[frenchIndex++] = (uint8_t)CE;
5847             } else if(!isShiftedCE(CE, LVT, &wasShifted)) {
5848               CE >>= 8; /* get secondary */
5849               if(!frenchIndex) {
5850                 if(CE != 0) {
5851                   dest[i++]=(uint8_t)CE;
5852                 }
5853               } else {
5854                 frenchBuff[frenchIndex++] = (uint8_t)CE;
5855                 frenchIndex -= usedFrench;
5856                 usedFrench = 0;
5857                 while(i < count && frenchIndex) {
5858                   dest[i++] = frenchBuff[--frenchIndex];
5859                   usedFrench++;
5860                 }
5861               }
5862             }
5863             if(uprv_numAvailableExpCEs(s)) {
5864               canUpdateState = FALSE;
5865             } else {
5866               canUpdateState = TRUE;
5867             }
5868           }
5869         }
5870       } else {
5871         level = UCOL_PSK_CASE;
5872       }
5873         /* fall through to next level */
5874     case UCOL_PSK_CASE:
5875       if(ucol_getAttribute(coll, UCOL_CASE_LEVEL, status) == UCOL_ON) {
5876         uint32_t caseShift = UCOL_CASE_SHIFT_START;
5877         uint8_t caseByte = UCOL_CASE_BYTE_START;
5878         uint8_t caseBits = 0;
5879 
5880         for(;;) {
5881           if(i == count) {
5882             goto saveState;
5883           }
5884           // We should save the state only if we
5885           // are sure that we are done with the
5886           // previous iterator state
5887           if(canUpdateState) {
5888             newState = s.iterator->getState(s.iterator);
5889             if(newState != UITER_NO_STATE) {
5890               iterState = newState;
5891               cces = 0;
5892             }
5893           }
5894           CE = ucol_IGetNextCE(coll, &s, status);
5895           cces++;
5896           if(CE==UCOL_NO_MORE_CES) {
5897             // On the case level we might have an unfinished
5898             // case byte. Add one if it's started.
5899             if(caseShift != UCOL_CASE_SHIFT_START) {
5900               dest[i++] = caseByte;
5901             }
5902             cces = 0;
5903             // We have finished processing CEs on this level.
5904             // However, we don't know if we have enough space
5905             // to add a case level terminator.
5906             if(i < count) {
5907               // Add the level separator
5908               terminatePSKLevel(level, maxLevel, i, dest);
5909               // Restart the iteration and move to the
5910               // next level
5911               s.iterator->move(s.iterator, 0, UITER_START);
5912               level = UCOL_PSK_TERTIARY;
5913             } else {
5914               canUpdateState = FALSE;
5915             }
5916             break;
5917           }
5918 
5919           if(!isShiftedCE(CE, LVT, &wasShifted)) {
5920             if(!isContinuation(CE) && ((CE & UCOL_PRIMARYMASK) != 0 || strength > UCOL_PRIMARY)) {
5921                 // do the case level if we need to do it. We don't want to calculate
5922                 // case level for primary ignorables if we have only primary strength and case level
5923                 // otherwise we would break well formedness of CEs
5924               CE = (uint8_t)(CE & UCOL_BYTE_SIZE_MASK);
5925               caseBits = (uint8_t)(CE & 0xC0);
5926               // this copies the case level logic from the
5927               // sort key generation code
5928               if(CE != 0) {
5929                 if(coll->caseFirst == UCOL_UPPER_FIRST) {
5930                   if((caseBits & 0xC0) == 0) {
5931                     caseByte |= 1 << (--caseShift);
5932                   } else {
5933                     caseByte |= 0 << (--caseShift);
5934                     /* second bit */
5935                     if(caseShift == 0) {
5936                       dest[i++] = caseByte;
5937                       caseShift = UCOL_CASE_SHIFT_START;
5938                       caseByte = UCOL_CASE_BYTE_START;
5939                     }
5940                     caseByte |= ((caseBits>>6)&1) << (--caseShift);
5941                   }
5942                 } else {
5943                   if((caseBits & 0xC0) == 0) {
5944                     caseByte |= 0 << (--caseShift);
5945                   } else {
5946                     caseByte |= 1 << (--caseShift);
5947                     /* second bit */
5948                     if(caseShift == 0) {
5949                       dest[i++] = caseByte;
5950                       caseShift = UCOL_CASE_SHIFT_START;
5951                       caseByte = UCOL_CASE_BYTE_START;
5952                     }
5953                     caseByte |= ((caseBits>>7)&1) << (--caseShift);
5954                   }
5955                 }
5956               }
5957 
5958             }
5959           }
5960           // Not sure this is correct for the case level - revisit
5961           if(uprv_numAvailableExpCEs(s)) {
5962             canUpdateState = FALSE;
5963           } else {
5964             canUpdateState = TRUE;
5965           }
5966         }
5967       } else {
5968         level = UCOL_PSK_TERTIARY;
5969       }
5970         /* fall through to next level */
5971     case UCOL_PSK_TERTIARY:
5972       if(strength >= UCOL_TERTIARY) {
5973         for(;;) {
5974           if(i == count) {
5975             goto saveState;
5976           }
5977           // We should save the state only if we
5978           // are sure that we are done with the
5979           // previous iterator state
5980           if(canUpdateState) {
5981             newState = s.iterator->getState(s.iterator);
5982             if(newState != UITER_NO_STATE) {
5983               iterState = newState;
5984               cces = 0;
5985             }
5986           }
5987           CE = ucol_IGetNextCE(coll, &s, status);
5988           cces++;
5989           if(CE==UCOL_NO_MORE_CES) {
5990               // Add the level separator
5991               terminatePSKLevel(level, maxLevel, i, dest);
5992               byteCountOrFrenchDone = 0;
5993               // Restart the iteration an move to the
5994               // second level
5995               s.iterator->move(s.iterator, 0, UITER_START);
5996               cces = 0;
5997               level = UCOL_PSK_QUATERNARY;
5998               break;
5999           }
6000           if(!isShiftedCE(CE, LVT, &wasShifted)) {
6001             notIsContinuation = !isContinuation(CE);
6002 
6003             if(notIsContinuation) {
6004               CE = (uint8_t)(CE & UCOL_BYTE_SIZE_MASK);
6005               CE ^= coll->caseSwitch;
6006               CE &= coll->tertiaryMask;
6007             } else {
6008               CE = (uint8_t)((CE & UCOL_REMOVE_CONTINUATION));
6009             }
6010 
6011             if(CE != 0) {
6012               dest[i++]=(uint8_t)CE;
6013             }
6014           }
6015           if(uprv_numAvailableExpCEs(s)) {
6016             canUpdateState = FALSE;
6017           } else {
6018             canUpdateState = TRUE;
6019           }
6020         }
6021       } else {
6022         // if we're not doing tertiary
6023         // skip to the end
6024         level = UCOL_PSK_NULL;
6025       }
6026         /* fall through to next level */
6027     case UCOL_PSK_QUATERNARY:
6028       if(strength >= UCOL_QUATERNARY) {
6029         for(;;) {
6030           if(i == count) {
6031             goto saveState;
6032           }
6033           // We should save the state only if we
6034           // are sure that we are done with the
6035           // previous iterator state
6036           if(canUpdateState) {
6037             newState = s.iterator->getState(s.iterator);
6038             if(newState != UITER_NO_STATE) {
6039               iterState = newState;
6040               cces = 0;
6041             }
6042           }
6043           CE = ucol_IGetNextCE(coll, &s, status);
6044           cces++;
6045           if(CE==UCOL_NO_MORE_CES) {
6046               // Add the level separator
6047               terminatePSKLevel(level, maxLevel, i, dest);
6048               //dest[i++] = UCOL_LEVELTERMINATOR;
6049               byteCountOrFrenchDone = 0;
6050               // Restart the iteration an move to the
6051               // second level
6052               s.iterator->move(s.iterator, 0, UITER_START);
6053               cces = 0;
6054               level = UCOL_PSK_QUIN;
6055               break;
6056           }
6057 		  if(CE==0)
6058 			  continue;
6059           if(isShiftedCE(CE, LVT, &wasShifted)) {
6060             CE >>= 16; /* get primary */
6061             if(CE != 0) {
6062               if(byteCountOrFrenchDone == 0) {
6063                 dest[i++]=(uint8_t)(CE >> 8);
6064               } else {
6065                 byteCountOrFrenchDone = 0;
6066               }
6067               if((CE &=0xff)!=0) {
6068                   if(i==count) {
6069                       /* overflow */
6070                       byteCountOrFrenchDone = 1;
6071                       goto saveState;
6072                   }
6073                   dest[i++]=(uint8_t)CE;
6074               }
6075             }
6076           } else {
6077             notIsContinuation = !isContinuation(CE);
6078             if(notIsContinuation) {
6079               if(s.flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we need to note it
6080                 dest[i++] = UCOL_HIRAGANA_QUAD;
6081               } else {
6082                 dest[i++] = 0xFF;
6083               }
6084             }
6085           }
6086           if(uprv_numAvailableExpCEs(s)) {
6087             canUpdateState = FALSE;
6088           } else {
6089             canUpdateState = TRUE;
6090           }
6091         }
6092       } else {
6093         // if we're not doing quaternary
6094         // skip to the end
6095         level = UCOL_PSK_NULL;
6096       }
6097         /* fall through to next level */
6098     case UCOL_PSK_QUIN:
6099       level = UCOL_PSK_IDENTICAL;
6100         /* fall through to next level */
6101     case UCOL_PSK_IDENTICAL:
6102       if(strength >= UCOL_IDENTICAL) {
6103         UChar32 first, second;
6104         int32_t bocsuBytesWritten = 0;
6105         // We always need to do identical on
6106         // the NFD form of the string.
6107         if(normIter == NULL) {
6108           // we arrived from the level below and
6109           // normalization was not turned on.
6110           // therefore, we need to make a fresh NFD iterator
6111           normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status);
6112           s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status);
6113         } else if(!doingIdenticalFromStart) {
6114           // there is an iterator, but we did some other levels.
6115           // therefore, we have a FCD iterator - need to make
6116           // a NFD one.
6117           // normIter being at the beginning does not guarantee
6118           // that the underlying iterator is at the beginning
6119           iter->move(iter, 0, UITER_START);
6120           s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status);
6121         }
6122         // At this point we have a NFD iterator that is positioned
6123         // in the right place
6124         if(U_FAILURE(*status)) {
6125           UTRACE_EXIT_STATUS(*status);
6126           return 0;
6127         }
6128         first = uiter_previous32(s.iterator);
6129         // maybe we're at the start of the string
6130         if(first == U_SENTINEL) {
6131           first = 0;
6132         } else {
6133           uiter_next32(s.iterator);
6134         }
6135 
6136         j = 0;
6137         for(;;) {
6138           if(i == count) {
6139             if(j+1 < bocsuBytesWritten) {
6140               bocsuBytesUsed = j+1;
6141             }
6142             goto saveState;
6143           }
6144 
6145           // On identical level, we will always save
6146           // the state if we reach this point, since
6147           // we don't depend on getNextCE for content
6148           // all the content is in our buffer and we
6149           // already either stored the full buffer OR
6150           // otherwise we won't arrive here.
6151           newState = s.iterator->getState(s.iterator);
6152           if(newState != UITER_NO_STATE) {
6153             iterState = newState;
6154             cces = 0;
6155           }
6156 
6157           uint8_t buff[4];
6158           second = uiter_next32(s.iterator);
6159           cces++;
6160 
6161           // end condition for identical level
6162           if(second == U_SENTINEL) {
6163             terminatePSKLevel(level, maxLevel, i, dest);
6164             level = UCOL_PSK_NULL;
6165             break;
6166           }
6167           bocsuBytesWritten = u_writeIdenticalLevelRunTwoChars(first, second, buff);
6168           first = second;
6169 
6170           j = 0;
6171           if(bocsuBytesUsed != 0) {
6172             while(bocsuBytesUsed-->0) {
6173               j++;
6174             }
6175           }
6176 
6177           while(i < count && j < bocsuBytesWritten) {
6178             dest[i++] = buff[j++];
6179           }
6180         }
6181 
6182       } else {
6183         level = UCOL_PSK_NULL;
6184       }
6185         /* fall through to next level */
6186     case UCOL_PSK_NULL:
6187       j = i;
6188       while(j<count) {
6189           dest[j++]=0;
6190       }
6191       break;
6192     default:
6193       *status = U_INTERNAL_PROGRAM_ERROR;
6194       UTRACE_EXIT_STATUS(*status);
6195       return 0;
6196     }
6197 
6198 saveState:
6199     // Now we need to return stuff. First we want to see whether we have
6200     // done everything for the current state of iterator.
6201     if(byteCountOrFrenchDone
6202     || canUpdateState == FALSE
6203     || (newState = s.iterator->getState(s.iterator)) == UITER_NO_STATE) {
6204       // Any of above mean that the previous transaction
6205       // wasn't finished and that we should store the
6206       // previous iterator state.
6207       state[0] = iterState;
6208     } else {
6209       // The transaction is complete. We will continue in the next iteration.
6210         state[0] = s.iterator->getState(s.iterator);
6211         cces = 0;
6212     }
6213     // Store the number of bocsu bytes written.
6214     if((bocsuBytesUsed & UCOL_PSK_BOCSU_BYTES_MASK) != bocsuBytesUsed) {
6215       *status = U_INDEX_OUTOFBOUNDS_ERROR;
6216     }
6217     state[1] = (bocsuBytesUsed & UCOL_PSK_BOCSU_BYTES_MASK) << UCOL_PSK_BOCSU_BYTES_SHIFT;
6218 
6219     // Next we put in the level of comparison
6220     state[1] |= ((level & UCOL_PSK_LEVEL_MASK) << UCOL_PSK_LEVEL_SHIFT);
6221 
6222     // If we are doing French, we need to store whether we have just finished the French level
6223     if(level == UCOL_PSK_SECONDARY && doingFrench) {
6224       state[1] |= (((state[0] == 0) & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK) << UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT);
6225     } else {
6226       state[1] |= ((byteCountOrFrenchDone & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK) << UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT);
6227     }
6228 
6229     // Was the latest CE shifted
6230     if(wasShifted) {
6231       state[1] |= 1 << UCOL_PSK_WAS_SHIFTED_SHIFT;
6232     }
6233     // Check for cces overflow
6234     if((cces & UCOL_PSK_CONSUMED_CES_MASK) != cces) {
6235       *status = U_INDEX_OUTOFBOUNDS_ERROR;
6236     }
6237     // Store cces
6238     state[1] |= ((cces & UCOL_PSK_CONSUMED_CES_MASK) << UCOL_PSK_CONSUMED_CES_SHIFT);
6239 
6240     // Check for French overflow
6241     if((usedFrench & UCOL_PSK_USED_FRENCH_MASK) != usedFrench) {
6242       *status = U_INDEX_OUTOFBOUNDS_ERROR;
6243     }
6244     // Store number of bytes written in the French secondary continuation sequence
6245     state[1] |= ((usedFrench & UCOL_PSK_USED_FRENCH_MASK) << UCOL_PSK_USED_FRENCH_SHIFT);
6246 
6247 
6248     // If we have used normalizing iterator, get rid of it
6249     if(normIter != NULL) {
6250       unorm_closeIter(normIter);
6251     }
6252 
6253     // Return number of meaningful sortkey bytes.
6254     UTRACE_DATA4(UTRACE_VERBOSE, "dest = %vb, state=%d %d",
6255                   dest,i, state[0], state[1]);
6256     UTRACE_EXIT_VALUE(i);
6257     return i;
6258 }
6259 
6260 /**
6261  * Produce a bound for a given sortkey and a number of levels.
6262  */
6263 U_CAPI int32_t U_EXPORT2
ucol_getBound(const uint8_t * source,int32_t sourceLength,UColBoundMode boundType,uint32_t noOfLevels,uint8_t * result,int32_t resultLength,UErrorCode * status)6264 ucol_getBound(const uint8_t       *source,
6265         int32_t             sourceLength,
6266         UColBoundMode       boundType,
6267         uint32_t            noOfLevels,
6268         uint8_t             *result,
6269         int32_t             resultLength,
6270         UErrorCode          *status) {
6271   // consistency checks
6272   if(status == NULL || U_FAILURE(*status)) {
6273     return 0;
6274   }
6275   if(source == NULL) {
6276     *status = U_ILLEGAL_ARGUMENT_ERROR;
6277     return 0;
6278   }
6279 
6280   int32_t sourceIndex = 0;
6281   // Scan the string until we skip enough of the key OR reach the end of the key
6282   do {
6283     sourceIndex++;
6284     if(source[sourceIndex] == UCOL_LEVELTERMINATOR) {
6285       noOfLevels--;
6286     }
6287   } while (noOfLevels > 0
6288     && (source[sourceIndex] != 0 || sourceIndex < sourceLength));
6289 
6290   if((source[sourceIndex] == 0 || sourceIndex == sourceLength)
6291     && noOfLevels > 0) {
6292     *status = U_SORT_KEY_TOO_SHORT_WARNING;
6293   }
6294 
6295 
6296   // READ ME: this code assumes that the values for boundType
6297   // enum will not changes. They are set so that the enum value
6298   // corresponds to the number of extra bytes each bound type
6299   // needs.
6300   if(result != NULL && resultLength >= sourceIndex+boundType) {
6301     uprv_memcpy(result, source, sourceIndex);
6302     switch(boundType) {
6303     // Lower bound just gets terminated. No extra bytes
6304     case UCOL_BOUND_LOWER: // = 0
6305       break;
6306     // Upper bound needs one extra byte
6307     case UCOL_BOUND_UPPER: // = 1
6308       result[sourceIndex++] = 2;
6309       break;
6310     // Upper long bound needs two extra bytes
6311     case UCOL_BOUND_UPPER_LONG: // = 2
6312       result[sourceIndex++] = 0xFF;
6313       result[sourceIndex++] = 0xFF;
6314       break;
6315     default:
6316       *status = U_ILLEGAL_ARGUMENT_ERROR;
6317       return 0;
6318     }
6319     result[sourceIndex++] = 0;
6320 
6321     return sourceIndex;
6322   } else {
6323     return sourceIndex+boundType+1;
6324   }
6325 }
6326 
6327 /****************************************************************************/
6328 /* Following are the functions that deal with the properties of a collator  */
6329 /* there are new APIs and some compatibility APIs                           */
6330 /****************************************************************************/
6331 
6332 static inline void
ucol_addLatinOneEntry(UCollator * coll,UChar ch,uint32_t CE,int32_t * primShift,int32_t * secShift,int32_t * terShift)6333 ucol_addLatinOneEntry(UCollator *coll, UChar ch, uint32_t CE,
6334                     int32_t *primShift, int32_t *secShift, int32_t *terShift) {
6335   uint8_t primary1 = 0, primary2 = 0, secondary = 0, tertiary = 0;
6336   UBool reverseSecondary = FALSE;
6337   if(!isContinuation(CE)) {
6338     tertiary = (uint8_t)((CE & coll->tertiaryMask));
6339     tertiary ^= coll->caseSwitch;
6340     reverseSecondary = TRUE;
6341   } else {
6342     tertiary = (uint8_t)((CE & UCOL_REMOVE_CONTINUATION));
6343     tertiary &= UCOL_REMOVE_CASE;
6344     reverseSecondary = FALSE;
6345   }
6346 
6347   secondary = (uint8_t)((CE >>= 8) & UCOL_BYTE_SIZE_MASK);
6348   primary2 = (uint8_t)((CE >>= 8) & UCOL_BYTE_SIZE_MASK);
6349   primary1 = (uint8_t)(CE >> 8);
6350 
6351   if(primary1 != 0) {
6352     coll->latinOneCEs[ch] |= (primary1 << *primShift);
6353     *primShift -= 8;
6354   }
6355   if(primary2 != 0) {
6356     if(*primShift < 0) {
6357       coll->latinOneCEs[ch] = UCOL_BAIL_OUT_CE;
6358       coll->latinOneCEs[coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
6359       coll->latinOneCEs[2*coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
6360       return;
6361     }
6362     coll->latinOneCEs[ch] |= (primary2 << *primShift);
6363     *primShift -= 8;
6364   }
6365   if(secondary != 0) {
6366     if(reverseSecondary && coll->frenchCollation == UCOL_ON) { // reverse secondary
6367       coll->latinOneCEs[coll->latinOneTableLen+ch] >>= 8; // make space for secondary
6368       coll->latinOneCEs[coll->latinOneTableLen+ch] |= (secondary << 24);
6369     } else { // normal case
6370       coll->latinOneCEs[coll->latinOneTableLen+ch] |= (secondary << *secShift);
6371     }
6372     *secShift -= 8;
6373   }
6374   if(tertiary != 0) {
6375     coll->latinOneCEs[2*coll->latinOneTableLen+ch] |= (tertiary << *terShift);
6376     *terShift -= 8;
6377   }
6378 }
6379 
6380 static inline UBool
ucol_resizeLatinOneTable(UCollator * coll,int32_t size,UErrorCode * status)6381 ucol_resizeLatinOneTable(UCollator *coll, int32_t size, UErrorCode *status) {
6382     uint32_t *newTable = (uint32_t *)uprv_malloc(size*sizeof(uint32_t)*3);
6383     if(newTable == NULL) {
6384       *status = U_MEMORY_ALLOCATION_ERROR;
6385       coll->latinOneFailed = TRUE;
6386       return FALSE;
6387     }
6388     int32_t sizeToCopy = ((size<coll->latinOneTableLen)?size:coll->latinOneTableLen)*sizeof(uint32_t);
6389     uprv_memset(newTable, 0, size*sizeof(uint32_t)*3);
6390     uprv_memcpy(newTable, coll->latinOneCEs, sizeToCopy);
6391     uprv_memcpy(newTable+size, coll->latinOneCEs+coll->latinOneTableLen, sizeToCopy);
6392     uprv_memcpy(newTable+2*size, coll->latinOneCEs+2*coll->latinOneTableLen, sizeToCopy);
6393     coll->latinOneTableLen = size;
6394     uprv_free(coll->latinOneCEs);
6395     coll->latinOneCEs = newTable;
6396     return TRUE;
6397 }
6398 
6399 static UBool
ucol_setUpLatinOne(UCollator * coll,UErrorCode * status)6400 ucol_setUpLatinOne(UCollator *coll, UErrorCode *status) {
6401   UBool result = TRUE;
6402   if(coll->latinOneCEs == NULL) {
6403     coll->latinOneCEs = (uint32_t *)uprv_malloc(sizeof(uint32_t)*UCOL_LATINONETABLELEN*3);
6404     if(coll->latinOneCEs == NULL) {
6405       *status = U_MEMORY_ALLOCATION_ERROR;
6406       return FALSE;
6407     }
6408     coll->latinOneTableLen = UCOL_LATINONETABLELEN;
6409   }
6410   UChar ch = 0;
6411   UCollationElements *it = ucol_openElements(coll, &ch, 1, status);
6412   uprv_memset(coll->latinOneCEs, 0, sizeof(uint32_t)*coll->latinOneTableLen*3);
6413 
6414   int32_t primShift = 24, secShift = 24, terShift = 24;
6415   uint32_t CE = 0;
6416   int32_t contractionOffset = UCOL_ENDOFLATINONERANGE+1;
6417 
6418   // TODO: make safe if you get more than you wanted...
6419   for(ch = 0; ch <= UCOL_ENDOFLATINONERANGE; ch++) {
6420     primShift = 24; secShift = 24; terShift = 24;
6421     if(ch < 0x100) {
6422       CE = coll->latinOneMapping[ch];
6423     } else {
6424       CE = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
6425       if(CE == UCOL_NOT_FOUND && coll->UCA) {
6426         CE = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);
6427       }
6428     }
6429     if(CE < UCOL_NOT_FOUND) {
6430       ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift);
6431     } else {
6432       switch (getCETag(CE)) {
6433       case EXPANSION_TAG:
6434       case DIGIT_TAG:
6435         ucol_setText(it, &ch, 1, status);
6436         while((int32_t)(CE = ucol_next(it, status)) != UCOL_NULLORDER) {
6437           if(primShift < 0 || secShift < 0 || terShift < 0) {
6438             coll->latinOneCEs[ch] = UCOL_BAIL_OUT_CE;
6439             coll->latinOneCEs[coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
6440             coll->latinOneCEs[2*coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
6441             break;
6442           }
6443           ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift);
6444         }
6445         break;
6446       case CONTRACTION_TAG:
6447         // here is the trick
6448         // F2 is contraction. We do something very similar to contractions
6449         // but have two indices, one in the real contraction table and the
6450         // other to where we stuffed things. This hopes that we don't have
6451         // many contractions (this should work for latin-1 tables).
6452         {
6453           if((CE & 0x00FFF000) != 0) {
6454             *status = U_UNSUPPORTED_ERROR;
6455             goto cleanup_after_failure;
6456           }
6457 
6458           const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE);
6459 
6460           CE |= (contractionOffset & 0xFFF) << 12; // insert the offset in latin-1 table
6461 
6462           coll->latinOneCEs[ch] = CE;
6463           coll->latinOneCEs[coll->latinOneTableLen+ch] = CE;
6464           coll->latinOneCEs[2*coll->latinOneTableLen+ch] = CE;
6465 
6466           // We're going to jump into contraction table, pick the elements
6467           // and use them
6468           do {
6469               CE = *(coll->contractionCEs +
6470                   (UCharOffset - coll->contractionIndex));
6471               if(CE > UCOL_NOT_FOUND && getCETag(CE) == EXPANSION_TAG) {
6472                 uint32_t size;
6473                 uint32_t i;    /* general counter */
6474                 uint32_t *CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */
6475                 size = getExpansionCount(CE);
6476                 //CE = *CEOffset++;
6477                 if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */
6478                   for(i = 0; i<size; i++) {
6479                     if(primShift < 0 || secShift < 0 || terShift < 0) {
6480                       coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6481                       coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6482                       coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6483                       break;
6484                     }
6485                     ucol_addLatinOneEntry(coll, (UChar)contractionOffset, *CEOffset++, &primShift, &secShift, &terShift);
6486                   }
6487                 } else { /* else, we do */
6488                   while(*CEOffset != 0) {
6489                     if(primShift < 0 || secShift < 0 || terShift < 0) {
6490                       coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6491                       coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6492                       coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6493                       break;
6494                     }
6495                     ucol_addLatinOneEntry(coll, (UChar)contractionOffset, *CEOffset++, &primShift, &secShift, &terShift);
6496                   }
6497                 }
6498                 contractionOffset++;
6499               } else if(CE < UCOL_NOT_FOUND) {
6500                 ucol_addLatinOneEntry(coll, (UChar)contractionOffset++, CE, &primShift, &secShift, &terShift);
6501               } else {
6502                 coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6503                 coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6504                 coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6505                 contractionOffset++;
6506               }
6507               UCharOffset++;
6508               primShift = 24; secShift = 24; terShift = 24;
6509               if(contractionOffset == coll->latinOneTableLen) { // we need to reallocate
6510                 if(!ucol_resizeLatinOneTable(coll, 2*coll->latinOneTableLen, status)) {
6511                   goto cleanup_after_failure;
6512                 }
6513               }
6514           } while(*UCharOffset != 0xFFFF);
6515         }
6516         break;
6517       default:
6518         goto cleanup_after_failure;
6519       }
6520     }
6521   }
6522   // compact table
6523   if(contractionOffset < coll->latinOneTableLen) {
6524     if(!ucol_resizeLatinOneTable(coll, contractionOffset, status)) {
6525       goto cleanup_after_failure;
6526     }
6527   }
6528   ucol_closeElements(it);
6529   return result;
6530 
6531 cleanup_after_failure:
6532   // status should already be set before arriving here.
6533   coll->latinOneFailed = TRUE;
6534   ucol_closeElements(it);
6535   return FALSE;
6536 }
6537 
ucol_updateInternalState(UCollator * coll,UErrorCode * status)6538 void ucol_updateInternalState(UCollator *coll, UErrorCode *status) {
6539   if(U_SUCCESS(*status)) {
6540     if(coll->caseFirst == UCOL_UPPER_FIRST) {
6541       coll->caseSwitch = UCOL_CASE_SWITCH;
6542     } else {
6543       coll->caseSwitch = UCOL_NO_CASE_SWITCH;
6544     }
6545 
6546     if(coll->caseLevel == UCOL_ON || coll->caseFirst == UCOL_OFF) {
6547       coll->tertiaryMask = UCOL_REMOVE_CASE;
6548       coll->tertiaryCommon = UCOL_COMMON3_NORMAL;
6549       coll->tertiaryAddition = UCOL_FLAG_BIT_MASK_CASE_SW_OFF;
6550       coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_OFF;
6551       coll->tertiaryBottom = UCOL_COMMON_BOT3;
6552     } else {
6553       coll->tertiaryMask = UCOL_KEEP_CASE;
6554       coll->tertiaryAddition = UCOL_FLAG_BIT_MASK_CASE_SW_ON;
6555       if(coll->caseFirst == UCOL_UPPER_FIRST) {
6556         coll->tertiaryCommon = UCOL_COMMON3_UPPERFIRST;
6557         coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_UPPER;
6558         coll->tertiaryBottom = UCOL_COMMON_BOTTOM3_CASE_SW_UPPER;
6559       } else {
6560         coll->tertiaryCommon = UCOL_COMMON3_NORMAL;
6561         coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_LOWER;
6562         coll->tertiaryBottom = UCOL_COMMON_BOTTOM3_CASE_SW_LOWER;
6563       }
6564     }
6565 
6566     /* Set the compression values */
6567     uint8_t tertiaryTotal = (uint8_t)(coll->tertiaryTop - UCOL_COMMON_BOT3-1);
6568     coll->tertiaryTopCount = (uint8_t)(UCOL_PROPORTION3*tertiaryTotal); /* we multilply double with int, but need only int */
6569     coll->tertiaryBottomCount = (uint8_t)(tertiaryTotal - coll->tertiaryTopCount);
6570 
6571     if(coll->caseLevel == UCOL_OFF && coll->strength == UCOL_TERTIARY
6572       && coll->frenchCollation == UCOL_OFF && coll->alternateHandling == UCOL_NON_IGNORABLE) {
6573       coll->sortKeyGen = ucol_calcSortKeySimpleTertiary;
6574     } else {
6575       coll->sortKeyGen = ucol_calcSortKey;
6576     }
6577     if(coll->caseLevel == UCOL_OFF && coll->strength <= UCOL_TERTIARY && coll->numericCollation == UCOL_OFF
6578       && coll->alternateHandling == UCOL_NON_IGNORABLE && !coll->latinOneFailed) {
6579       if(coll->latinOneCEs == NULL || coll->latinOneRegenTable) {
6580         if(ucol_setUpLatinOne(coll, status)) { // if we succeed in building latin1 table, we'll use it
6581           //fprintf(stderr, "F");
6582           coll->latinOneUse = TRUE;
6583         } else {
6584           coll->latinOneUse = FALSE;
6585         }
6586         if(*status == U_UNSUPPORTED_ERROR) {
6587           *status = U_ZERO_ERROR;
6588         }
6589       } else { // latin1Table exists and it doesn't need to be regenerated, just use it
6590         coll->latinOneUse = TRUE;
6591       }
6592     } else {
6593       coll->latinOneUse = FALSE;
6594     }
6595   }
6596 }
6597 
6598 U_CAPI uint32_t  U_EXPORT2
ucol_setVariableTop(UCollator * coll,const UChar * varTop,int32_t len,UErrorCode * status)6599 ucol_setVariableTop(UCollator *coll, const UChar *varTop, int32_t len, UErrorCode *status) {
6600   if(U_FAILURE(*status) || coll == NULL) {
6601     return 0;
6602   }
6603   if(len == -1) {
6604     len = u_strlen(varTop);
6605   }
6606   if(len == 0) {
6607     *status = U_ILLEGAL_ARGUMENT_ERROR;
6608     return 0;
6609   }
6610 
6611   collIterate s;
6612   IInit_collIterate(coll, varTop, len, &s);
6613 
6614   uint32_t CE = ucol_IGetNextCE(coll, &s, status);
6615 
6616   /* here we check if we have consumed all characters */
6617   /* you can put in either one character or a contraction */
6618   /* you shouldn't put more... */
6619   if(s.pos != s.endp || CE == UCOL_NO_MORE_CES) {
6620     *status = U_CE_NOT_FOUND_ERROR;
6621     return 0;
6622   }
6623 
6624   uint32_t nextCE = ucol_IGetNextCE(coll, &s, status);
6625 
6626   if(isContinuation(nextCE) && (nextCE & UCOL_PRIMARYMASK) != 0) {
6627     *status = U_PRIMARY_TOO_LONG_ERROR;
6628     return 0;
6629   }
6630   if(coll->variableTopValue != (CE & UCOL_PRIMARYMASK)>>16) {
6631     coll->variableTopValueisDefault = FALSE;
6632     coll->variableTopValue = (CE & UCOL_PRIMARYMASK)>>16;
6633   }
6634 
6635   return CE & UCOL_PRIMARYMASK;
6636 }
6637 
ucol_getVariableTop(const UCollator * coll,UErrorCode * status)6638 U_CAPI uint32_t U_EXPORT2 ucol_getVariableTop(const UCollator *coll, UErrorCode *status) {
6639   if(U_FAILURE(*status) || coll == NULL) {
6640     return 0;
6641   }
6642   return coll->variableTopValue<<16;
6643 }
6644 
6645 U_CAPI void  U_EXPORT2
ucol_restoreVariableTop(UCollator * coll,const uint32_t varTop,UErrorCode * status)6646 ucol_restoreVariableTop(UCollator *coll, const uint32_t varTop, UErrorCode *status) {
6647   if(U_FAILURE(*status) || coll == NULL) {
6648     return;
6649   }
6650 
6651   if(coll->variableTopValue != (varTop & UCOL_PRIMARYMASK)>>16) {
6652       coll->variableTopValueisDefault = FALSE;
6653       coll->variableTopValue = (varTop & UCOL_PRIMARYMASK)>>16;
6654   }
6655 }
6656 /* Attribute setter API */
6657 U_CAPI void  U_EXPORT2
ucol_setAttribute(UCollator * coll,UColAttribute attr,UColAttributeValue value,UErrorCode * status)6658 ucol_setAttribute(UCollator *coll, UColAttribute attr, UColAttributeValue value, UErrorCode *status) {
6659     if(U_FAILURE(*status) || coll == NULL) {
6660       return;
6661     }
6662     UColAttributeValue oldFrench = coll->frenchCollation;
6663     UColAttributeValue oldCaseFirst = coll->caseFirst;
6664     switch(attr) {
6665     case UCOL_NUMERIC_COLLATION: /* sort substrings of digits as numbers */
6666       if(value == UCOL_ON) {
6667         coll->numericCollation = UCOL_ON;
6668         coll->numericCollationisDefault = FALSE;
6669       } else if (value == UCOL_OFF) {
6670         coll->numericCollation = UCOL_OFF;
6671         coll->numericCollationisDefault = FALSE;
6672       } else if (value == UCOL_DEFAULT) {
6673         coll->numericCollationisDefault = TRUE;
6674         coll->numericCollation = (UColAttributeValue)coll->options->numericCollation;
6675       } else {
6676         *status = U_ILLEGAL_ARGUMENT_ERROR;
6677       }
6678       break;
6679     case UCOL_HIRAGANA_QUATERNARY_MODE: /* special quaternary values for Hiragana */
6680       if(value == UCOL_ON) {
6681         coll->hiraganaQ = UCOL_ON;
6682         coll->hiraganaQisDefault = FALSE;
6683       } else if (value == UCOL_OFF) {
6684         coll->hiraganaQ = UCOL_OFF;
6685         coll->hiraganaQisDefault = FALSE;
6686       } else if (value == UCOL_DEFAULT) {
6687         coll->hiraganaQisDefault = TRUE;
6688         coll->hiraganaQ = (UColAttributeValue)coll->options->hiraganaQ;
6689       } else {
6690         *status = U_ILLEGAL_ARGUMENT_ERROR;
6691       }
6692       break;
6693     case UCOL_FRENCH_COLLATION: /* attribute for direction of secondary weights*/
6694         if(value == UCOL_ON) {
6695             coll->frenchCollation = UCOL_ON;
6696             coll->frenchCollationisDefault = FALSE;
6697         } else if (value == UCOL_OFF) {
6698             coll->frenchCollation = UCOL_OFF;
6699             coll->frenchCollationisDefault = FALSE;
6700         } else if (value == UCOL_DEFAULT) {
6701             coll->frenchCollationisDefault = TRUE;
6702             coll->frenchCollation = (UColAttributeValue)coll->options->frenchCollation;
6703         } else {
6704             *status = U_ILLEGAL_ARGUMENT_ERROR  ;
6705         }
6706         break;
6707     case UCOL_ALTERNATE_HANDLING: /* attribute for handling variable elements*/
6708         if(value == UCOL_SHIFTED) {
6709             coll->alternateHandling = UCOL_SHIFTED;
6710             coll->alternateHandlingisDefault = FALSE;
6711         } else if (value == UCOL_NON_IGNORABLE) {
6712             coll->alternateHandling = UCOL_NON_IGNORABLE;
6713             coll->alternateHandlingisDefault = FALSE;
6714         } else if (value == UCOL_DEFAULT) {
6715             coll->alternateHandlingisDefault = TRUE;
6716             coll->alternateHandling = (UColAttributeValue)coll->options->alternateHandling ;
6717         } else {
6718             *status = U_ILLEGAL_ARGUMENT_ERROR  ;
6719         }
6720         break;
6721     case UCOL_CASE_FIRST: /* who goes first, lower case or uppercase */
6722         if(value == UCOL_LOWER_FIRST) {
6723             coll->caseFirst = UCOL_LOWER_FIRST;
6724             coll->caseFirstisDefault = FALSE;
6725         } else if (value == UCOL_UPPER_FIRST) {
6726             coll->caseFirst = UCOL_UPPER_FIRST;
6727             coll->caseFirstisDefault = FALSE;
6728         } else if (value == UCOL_OFF) {
6729           coll->caseFirst = UCOL_OFF;
6730           coll->caseFirstisDefault = FALSE;
6731         } else if (value == UCOL_DEFAULT) {
6732             coll->caseFirst = (UColAttributeValue)coll->options->caseFirst;
6733             coll->caseFirstisDefault = TRUE;
6734         } else {
6735             *status = U_ILLEGAL_ARGUMENT_ERROR  ;
6736         }
6737         break;
6738     case UCOL_CASE_LEVEL: /* do we have an extra case level */
6739         if(value == UCOL_ON) {
6740             coll->caseLevel = UCOL_ON;
6741             coll->caseLevelisDefault = FALSE;
6742         } else if (value == UCOL_OFF) {
6743             coll->caseLevel = UCOL_OFF;
6744             coll->caseLevelisDefault = FALSE;
6745         } else if (value == UCOL_DEFAULT) {
6746             coll->caseLevel = (UColAttributeValue)coll->options->caseLevel;
6747             coll->caseLevelisDefault = TRUE;
6748         } else {
6749             *status = U_ILLEGAL_ARGUMENT_ERROR  ;
6750         }
6751         break;
6752     case UCOL_NORMALIZATION_MODE: /* attribute for normalization */
6753         if(value == UCOL_ON) {
6754             coll->normalizationMode = UCOL_ON;
6755             coll->normalizationModeisDefault = FALSE;
6756         } else if (value == UCOL_OFF) {
6757             coll->normalizationMode = UCOL_OFF;
6758             coll->normalizationModeisDefault = FALSE;
6759         } else if (value == UCOL_DEFAULT) {
6760             coll->normalizationModeisDefault = TRUE;
6761             coll->normalizationMode = (UColAttributeValue)coll->options->normalizationMode;
6762         } else {
6763             *status = U_ILLEGAL_ARGUMENT_ERROR  ;
6764         }
6765         break;
6766     case UCOL_STRENGTH:         /* attribute for strength */
6767         if (value == UCOL_DEFAULT) {
6768             coll->strengthisDefault = TRUE;
6769             coll->strength = (UColAttributeValue)coll->options->strength;
6770         } else if (value <= UCOL_IDENTICAL) {
6771             coll->strengthisDefault = FALSE;
6772             coll->strength = value;
6773         } else {
6774             *status = U_ILLEGAL_ARGUMENT_ERROR  ;
6775         }
6776         break;
6777     case UCOL_ATTRIBUTE_COUNT:
6778     default:
6779         *status = U_ILLEGAL_ARGUMENT_ERROR;
6780         break;
6781     }
6782     if(oldFrench != coll->frenchCollation || oldCaseFirst != coll->caseFirst) {
6783       coll->latinOneRegenTable = TRUE;
6784     } else {
6785       coll->latinOneRegenTable = FALSE;
6786     }
6787     ucol_updateInternalState(coll, status);
6788 }
6789 
6790 U_CAPI UColAttributeValue  U_EXPORT2
ucol_getAttribute(const UCollator * coll,UColAttribute attr,UErrorCode * status)6791 ucol_getAttribute(const UCollator *coll, UColAttribute attr, UErrorCode *status) {
6792     if(U_FAILURE(*status) || coll == NULL) {
6793       return UCOL_DEFAULT;
6794     }
6795     switch(attr) {
6796     case UCOL_NUMERIC_COLLATION:
6797       return coll->numericCollation;
6798     case UCOL_HIRAGANA_QUATERNARY_MODE:
6799       return coll->hiraganaQ;
6800     case UCOL_FRENCH_COLLATION: /* attribute for direction of secondary weights*/
6801         return coll->frenchCollation;
6802     case UCOL_ALTERNATE_HANDLING: /* attribute for handling variable elements*/
6803         return coll->alternateHandling;
6804     case UCOL_CASE_FIRST: /* who goes first, lower case or uppercase */
6805         return coll->caseFirst;
6806     case UCOL_CASE_LEVEL: /* do we have an extra case level */
6807         return coll->caseLevel;
6808     case UCOL_NORMALIZATION_MODE: /* attribute for normalization */
6809         return coll->normalizationMode;
6810     case UCOL_STRENGTH:         /* attribute for strength */
6811         return coll->strength;
6812     case UCOL_ATTRIBUTE_COUNT:
6813     default:
6814         *status = U_ILLEGAL_ARGUMENT_ERROR;
6815         break;
6816     }
6817     return UCOL_DEFAULT;
6818 }
6819 
6820 U_CAPI void U_EXPORT2
ucol_setStrength(UCollator * coll,UCollationStrength strength)6821 ucol_setStrength(    UCollator                *coll,
6822             UCollationStrength        strength)
6823 {
6824   UErrorCode status = U_ZERO_ERROR;
6825   ucol_setAttribute(coll, UCOL_STRENGTH, strength, &status);
6826 }
6827 
6828 U_CAPI UCollationStrength U_EXPORT2
ucol_getStrength(const UCollator * coll)6829 ucol_getStrength(const UCollator *coll)
6830 {
6831   UErrorCode status = U_ZERO_ERROR;
6832   return ucol_getAttribute(coll, UCOL_STRENGTH, &status);
6833 }
6834 
6835 /****************************************************************************/
6836 /* Following are misc functions                                             */
6837 /* there are new APIs and some compatibility APIs                           */
6838 /****************************************************************************/
6839 
6840 U_CAPI void U_EXPORT2
ucol_getVersion(const UCollator * coll,UVersionInfo versionInfo)6841 ucol_getVersion(const UCollator* coll,
6842                 UVersionInfo versionInfo)
6843 {
6844     /* RunTime version  */
6845     uint8_t rtVersion = UCOL_RUNTIME_VERSION;
6846     /* Builder version*/
6847     uint8_t bdVersion = coll->image->version[0];
6848 
6849     /* Charset Version. Need to get the version from cnv files
6850      * makeconv should populate cnv files with version and
6851      * an api has to be provided in ucnv.h to obtain this version
6852      */
6853     uint8_t csVersion = 0;
6854 
6855     /* combine the version info */
6856     uint16_t cmbVersion = (uint16_t)((rtVersion<<11) | (bdVersion<<6) | (csVersion));
6857 
6858     /* Tailoring rules */
6859     versionInfo[0] = (uint8_t)(cmbVersion>>8);
6860     versionInfo[1] = (uint8_t)cmbVersion;
6861     versionInfo[2] = coll->image->version[1];
6862     if(coll->UCA) {
6863         versionInfo[3] = coll->UCA->image->UCAVersion[0];
6864     } else {
6865         versionInfo[3] = 0;
6866     }
6867 }
6868 
6869 
6870 /* This internal API checks whether a character is tailored or not */
6871 U_CAPI UBool  U_EXPORT2
ucol_isTailored(const UCollator * coll,const UChar u,UErrorCode * status)6872 ucol_isTailored(const UCollator *coll, const UChar u, UErrorCode *status) {
6873     if(U_FAILURE(*status) || coll == NULL || coll == coll->UCA) {
6874         return FALSE;
6875     }
6876 
6877     uint32_t CE = UCOL_NOT_FOUND;
6878     const UChar *ContractionStart = NULL;
6879     if(u < 0x100) { /* latin-1 */
6880         CE = coll->latinOneMapping[u];
6881         if(coll->UCA && CE == coll->UCA->latinOneMapping[u]) {
6882             return FALSE;
6883         }
6884     } else { /* regular */
6885         CE = UTRIE_GET32_FROM_LEAD(&coll->mapping, u);
6886     }
6887 
6888     if(isContraction(CE)) {
6889         ContractionStart = (UChar *)coll->image+getContractOffset(CE);
6890         CE = *(coll->contractionCEs + (ContractionStart- coll->contractionIndex));
6891     }
6892 
6893     return (UBool)(CE != UCOL_NOT_FOUND);
6894 }
6895 
6896 
6897 /****************************************************************************/
6898 /* Following are the string compare functions                               */
6899 /*                                                                          */
6900 /****************************************************************************/
6901 
6902 
6903 /*  ucol_checkIdent    internal function.  Does byte level string compare.   */
6904 /*                     Used by strcoll if strength == identical and strings  */
6905 /*                     are otherwise equal.  Moved out-of-line because this  */
6906 /*                     is a rare case.                                       */
6907 /*                                                                           */
6908 /*                     Comparison must be done on NFD normalized strings.    */
6909 /*                     FCD is not good enough.                               */
6910 /*                                                                           */
6911 /*      TODO:  make an incremental NFD Comparison function, which could      */
6912 /*             be of general use                                             */
6913 
6914 static
ucol_checkIdent(collIterate * sColl,collIterate * tColl,UBool normalize,UErrorCode * status)6915 UCollationResult    ucol_checkIdent(collIterate *sColl, collIterate *tColl, UBool normalize, UErrorCode *status)
6916 {
6917 
6918   // TODO: When we have an UChar iterator, we need to access the whole string. One
6919   // useful modification would be a UChar iterator extract API, since reset next next...
6920   // is not optimal.
6921   // TODO: Handle long strings. Do the same in compareUsingSortKeys.
6922 
6923   // When we arrive here, we can have normal strings or UCharIterators. Currently they are both
6924   // of same type, but that doesn't really mean that it will stay that way.
6925 
6926     // The division for the array length may truncate the array size to
6927     // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
6928     // for all platforms anyway.
6929     UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
6930     UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
6931     //UChar sStackBuf[256], tStackBuf[256];
6932     //int32_t sBufSize = 256, tBufSize = 256;
6933     int32_t            comparison;
6934     int32_t          sLen        = 0;
6935     UChar            *sBuf       = NULL;
6936     int32_t          tLen        = 0;
6937     UChar            *tBuf       = NULL;
6938     UBool freeSBuf = FALSE, freeTBuf = FALSE;
6939 
6940     if (sColl->flags & UCOL_USE_ITERATOR) {
6941       UNormIterator *sNIt = NULL, *tNIt = NULL;
6942       sNIt = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), status);
6943       tNIt = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), status);
6944       sColl->iterator->move(sColl->iterator, 0, UITER_START);
6945       tColl->iterator->move(tColl->iterator, 0, UITER_START);
6946       UCharIterator *sIt = unorm_setIter(sNIt, sColl->iterator, UNORM_NFD, status);
6947       UCharIterator *tIt = unorm_setIter(tNIt, tColl->iterator, UNORM_NFD, status);
6948       comparison = u_strCompareIter(sIt, tIt, TRUE);
6949       unorm_closeIter(sNIt);
6950       unorm_closeIter(tNIt);
6951     } else {
6952       sLen        = (sColl->flags & UCOL_ITER_HASLEN) ? sColl->endp - sColl->string : -1;
6953       sBuf = sColl->string;
6954       tLen        = (tColl->flags & UCOL_ITER_HASLEN) ? tColl->endp - tColl->string : -1;
6955       tBuf = tColl->string;
6956 
6957       if (normalize) {
6958           *status = U_ZERO_ERROR;
6959           if (unorm_quickCheck(sBuf, sLen, UNORM_NFD, status) != UNORM_YES) {
6960               sLen = unorm_decompose(sColl->writableBuffer, (int32_t)sColl->writableBufSize,
6961                                      sBuf, sLen,
6962                                      FALSE, 0,
6963                                      status);
6964               if(*status == U_BUFFER_OVERFLOW_ERROR) {
6965                   if(!u_growBufferFromStatic(sColl->stackWritableBuffer,
6966                                              &sColl->writableBuffer,
6967                                              (int32_t *)&sColl->writableBufSize, sLen,
6968                                              0)
6969                   ) {
6970                       *status = U_MEMORY_ALLOCATION_ERROR;
6971                       return UCOL_LESS; /* TODO set *status = U_MEMORY_ALLOCATION_ERROR; */
6972                   }
6973                   *status = U_ZERO_ERROR;
6974                   sLen = unorm_decompose(sColl->writableBuffer, (int32_t)sColl->writableBufSize,
6975                                          sBuf, sLen,
6976                                          FALSE, 0,
6977                                          status);
6978               }
6979               if(freeSBuf) {
6980                 uprv_free(sBuf);
6981                 freeSBuf = FALSE;
6982               }
6983               sBuf = sColl->writableBuffer;
6984               if (sBuf != sColl->stackWritableBuffer) {
6985                   sColl->flags |= UCOL_ITER_ALLOCATED;
6986               }
6987           }
6988 
6989           *status = U_ZERO_ERROR;
6990           if (unorm_quickCheck(tBuf, tLen, UNORM_NFD, status) != UNORM_YES) {
6991               tLen = unorm_decompose(tColl->writableBuffer, (int32_t)tColl->writableBufSize,
6992                                      tBuf, tLen,
6993                                      FALSE, 0,
6994                                      status);
6995               if(*status == U_BUFFER_OVERFLOW_ERROR) {
6996                   if(!u_growBufferFromStatic(tColl->stackWritableBuffer,
6997                                              &tColl->writableBuffer,
6998                                              (int32_t *)&tColl->writableBufSize, tLen,
6999                                              0)
7000                   ) {
7001                       *status = U_MEMORY_ALLOCATION_ERROR;
7002                       return UCOL_LESS; /* TODO set *status = U_MEMORY_ALLOCATION_ERROR; */
7003                   }
7004                   *status = U_ZERO_ERROR;
7005                   tLen = unorm_decompose(tColl->writableBuffer, (int32_t)tColl->writableBufSize,
7006                                          tBuf, tLen,
7007                                          FALSE, 0,
7008                                          status);
7009               }
7010               if(freeTBuf) {
7011                 uprv_free(tBuf);
7012                 freeTBuf = FALSE;
7013               }
7014               tBuf = tColl->writableBuffer;
7015               if (tBuf != tColl->stackWritableBuffer) {
7016                   tColl->flags |= UCOL_ITER_ALLOCATED;
7017               }
7018           }
7019       }
7020 
7021       if (sLen == -1 && tLen == -1) {
7022           comparison = u_strcmpCodePointOrder(sBuf, tBuf);
7023       } else {
7024           if (sLen == -1) {
7025               sLen = u_strlen(sBuf);
7026           }
7027           if (tLen == -1) {
7028               tLen = u_strlen(tBuf);
7029           }
7030           comparison = u_memcmpCodePointOrder(sBuf, tBuf, uprv_min(sLen, tLen));
7031           if (comparison == 0) {
7032               comparison = sLen - tLen;
7033           }
7034       }
7035     }
7036 
7037     if (comparison < 0) {
7038         return UCOL_LESS;
7039     } else if (comparison == 0) {
7040         return UCOL_EQUAL;
7041     } else /* comparison > 0 */ {
7042         return UCOL_GREATER;
7043     }
7044 }
7045 
/*  CEBuf - A struct and some inline functions to handle the saving    */
/*          of CEs in a buffer within ucol_strcoll                     */

/* number of CEs that fit into the in-struct array */
#define UCOL_CEBUF_SIZE 512

struct ucol_CEBuf {
    uint32_t    *buf;                           /* start of the storage currently in use */
    uint32_t    *endp;                          /* one past the last usable slot of buf */
    uint32_t    *pos;                           /* next slot to be written */
    uint32_t     localArray[UCOL_CEBUF_SIZE];   /* initial storage, part of the struct itself */
};
typedef struct ucol_CEBuf ucol_CEBuf;
7056 
7057 
7058 static
UCOL_INIT_CEBUF(ucol_CEBuf * b)7059 inline void UCOL_INIT_CEBUF(ucol_CEBuf *b) {
7060     (b)->buf = (b)->pos = (b)->localArray;
7061     (b)->endp = (b)->buf + UCOL_CEBUF_SIZE;
7062 }
7063 
7064 static
ucol_CEBuf_Expand(ucol_CEBuf * b,collIterate * ci)7065 void ucol_CEBuf_Expand(ucol_CEBuf *b, collIterate *ci) {
7066     uint32_t  oldSize;
7067     uint32_t  newSize;
7068     uint32_t  *newBuf;
7069 
7070     ci->flags |= UCOL_ITER_ALLOCATED;
7071     oldSize = b->pos - b->buf;
7072     newSize = oldSize * 2;
7073     newBuf = (uint32_t *)uprv_malloc(newSize * sizeof(uint32_t));
7074     if(newBuf != NULL) {
7075       uprv_memcpy(newBuf, b->buf, oldSize * sizeof(uint32_t));
7076       if (b->buf != b->localArray) {
7077           uprv_free(b->buf);
7078       }
7079       b->buf = newBuf;
7080       b->endp = b->buf + newSize;
7081       b->pos  = b->buf + oldSize;
7082     }
7083 }
7084 
7085 static
UCOL_CEBUF_PUT(ucol_CEBuf * b,uint32_t ce,collIterate * ci)7086 inline void UCOL_CEBUF_PUT(ucol_CEBuf *b, uint32_t ce, collIterate *ci) {
7087     if (b->pos == b->endp) {
7088         ucol_CEBuf_Expand(b, ci);
7089 }
7090     *(b)->pos++ = ce;
7091 }
7092 
7093 /* This is a trick string compare function that goes in and uses sortkeys to compare */
7094 /* It is used when compare gets in trouble and needs to bail out                     */
ucol_compareUsingSortKeys(collIterate * sColl,collIterate * tColl,UErrorCode * status)7095 static UCollationResult ucol_compareUsingSortKeys(collIterate *sColl,
7096                                                   collIterate *tColl,
7097                                                   UErrorCode *status)
7098 {
7099     uint8_t sourceKey[UCOL_MAX_BUFFER], targetKey[UCOL_MAX_BUFFER];
7100     uint8_t *sourceKeyP = sourceKey;
7101     uint8_t *targetKeyP = targetKey;
7102     int32_t sourceKeyLen = UCOL_MAX_BUFFER, targetKeyLen = UCOL_MAX_BUFFER;
7103     const UCollator *coll = sColl->coll;
7104     UChar *source = NULL;
7105     UChar *target = NULL;
7106     int32_t result = UCOL_EQUAL;
7107     UChar sStackBuf[256], tStackBuf[256];
7108     int32_t sourceLength = (sColl->flags&UCOL_ITER_HASLEN)?(sColl->endp-sColl->string):-1;
7109     int32_t targetLength = (tColl->flags&UCOL_ITER_HASLEN)?(tColl->endp-tColl->string):-1;
7110 
7111     // TODO: Handle long strings. Do the same in ucol_checkIdent.
7112     if(sColl->flags & UCOL_USE_ITERATOR) {
7113         sColl->iterator->move(sColl->iterator, 0, UITER_START);
7114         tColl->iterator->move(tColl->iterator, 0, UITER_START);
7115         source = sStackBuf;
7116         UChar *sBufp = source;
7117         target = tStackBuf;
7118         UChar *tBufp = target;
7119         while(sColl->iterator->hasNext(sColl->iterator)) {
7120             *sBufp++ = (UChar)sColl->iterator->next(sColl->iterator);
7121         }
7122         while(tColl->iterator->hasNext(tColl->iterator)) {
7123             *tBufp++ = (UChar)tColl->iterator->next(tColl->iterator);
7124         }
7125         sourceLength = sBufp - source;
7126         targetLength = tBufp - target;
7127     } else { // no iterators
7128         sourceLength = (sColl->flags&UCOL_ITER_HASLEN)?(sColl->endp-sColl->string):-1;
7129         targetLength = (tColl->flags&UCOL_ITER_HASLEN)?(tColl->endp-tColl->string):-1;
7130         source = sColl->string;
7131         target = tColl->string;
7132     }
7133 
7134 
7135 
7136     sourceKeyLen = ucol_getSortKey(coll, source, sourceLength, sourceKeyP, sourceKeyLen);
7137     if(sourceKeyLen > UCOL_MAX_BUFFER) {
7138         sourceKeyP = (uint8_t*)uprv_malloc(sourceKeyLen*sizeof(uint8_t));
7139         if(sourceKeyP == NULL) {
7140             *status = U_MEMORY_ALLOCATION_ERROR;
7141             goto cleanup_and_do_compare;
7142         }
7143         sourceKeyLen = ucol_getSortKey(coll, source, sourceLength, sourceKeyP, sourceKeyLen);
7144     }
7145 
7146     targetKeyLen = ucol_getSortKey(coll, target, targetLength, targetKeyP, targetKeyLen);
7147     if(targetKeyLen > UCOL_MAX_BUFFER) {
7148         targetKeyP = (uint8_t*)uprv_malloc(targetKeyLen*sizeof(uint8_t));
7149         if(targetKeyP == NULL) {
7150             *status = U_MEMORY_ALLOCATION_ERROR;
7151             goto cleanup_and_do_compare;
7152         }
7153         targetKeyLen = ucol_getSortKey(coll, target, targetLength, targetKeyP, targetKeyLen);
7154     }
7155 
7156     result = uprv_strcmp((const char*)sourceKeyP, (const char*)targetKeyP);
7157 
7158 cleanup_and_do_compare:
7159     if(sourceKeyP != NULL && sourceKeyP != sourceKey) {
7160         uprv_free(sourceKeyP);
7161     }
7162 
7163     if(targetKeyP != NULL && targetKeyP != targetKey) {
7164         uprv_free(targetKeyP);
7165     }
7166 
7167     if(result<0) {
7168         return UCOL_LESS;
7169     } else if(result>0) {
7170         return UCOL_GREATER;
7171     } else {
7172         return UCOL_EQUAL;
7173     }
7174 }
7175 
7176 
7177 static inline UCollationResult
ucol_strcollRegular(collIterate * sColl,collIterate * tColl,UErrorCode * status)7178 ucol_strcollRegular( collIterate *sColl, collIterate *tColl,
7179 //              const UCollator    *coll,
7180 //              const UChar        *source,
7181 //              int32_t            sourceLength,
7182 //              const UChar        *target,
7183 //              int32_t            targetLength,
7184               UErrorCode *status)
7185 {
7186     U_ALIGN_CODE(16);
7187 
7188     const UCollator *coll = sColl->coll;
7189 
7190 
7191     // setting up the collator parameters
7192     UColAttributeValue strength = coll->strength;
7193     UBool initialCheckSecTer = (strength  >= UCOL_SECONDARY);
7194 
7195     UBool checkSecTer = initialCheckSecTer;
7196     UBool checkTertiary = (strength  >= UCOL_TERTIARY);
7197     UBool checkQuad = (strength  >= UCOL_QUATERNARY);
7198     UBool checkIdent = (strength == UCOL_IDENTICAL);
7199     UBool checkCase = (coll->caseLevel == UCOL_ON);
7200     UBool isFrenchSec = (coll->frenchCollation == UCOL_ON) && checkSecTer;
7201     UBool shifted = (coll->alternateHandling == UCOL_SHIFTED);
7202     UBool qShifted = shifted && checkQuad;
7203     UBool doHiragana = (coll->hiraganaQ == UCOL_ON) && checkQuad;
7204 
7205     if(doHiragana && shifted) {
7206       return (ucol_compareUsingSortKeys(sColl, tColl, status));
7207     }
7208     uint8_t caseSwitch = coll->caseSwitch;
7209     uint8_t tertiaryMask = coll->tertiaryMask;
7210 
7211     // This is the lowest primary value that will not be ignored if shifted
7212     uint32_t LVT = (shifted)?(coll->variableTopValue<<16):0;
7213 
7214     UCollationResult result = UCOL_EQUAL;
7215     UCollationResult hirResult = UCOL_EQUAL;
7216 
7217     // Preparing the CE buffers. They will be filled during the primary phase
7218     ucol_CEBuf   sCEs;
7219     ucol_CEBuf   tCEs;
7220     UCOL_INIT_CEBUF(&sCEs);
7221     UCOL_INIT_CEBUF(&tCEs);
7222 
7223     uint32_t secS = 0, secT = 0;
7224     uint32_t sOrder=0, tOrder=0;
7225 
7226     // Non shifted primary processing is quite simple
7227     if(!shifted) {
7228       for(;;) {
7229 
7230         // We fetch CEs until we hit a non ignorable primary or end.
7231         do {
7232           // We get the next CE
7233           sOrder = ucol_IGetNextCE(coll, sColl, status);
7234           // Stuff it in the buffer
7235           UCOL_CEBUF_PUT(&sCEs, sOrder, sColl);
7236           // And keep just the primary part.
7237           sOrder &= UCOL_PRIMARYMASK;
7238         } while(sOrder == 0);
7239 
7240         // see the comments on the above block
7241         do {
7242           tOrder = ucol_IGetNextCE(coll, tColl, status);
7243           UCOL_CEBUF_PUT(&tCEs, tOrder, tColl);
7244           tOrder &= UCOL_PRIMARYMASK;
7245         } while(tOrder == 0);
7246 
7247         // if both primaries are the same
7248         if(sOrder == tOrder) {
7249             // and there are no more CEs, we advance to the next level
7250             if(sOrder == UCOL_NO_MORE_CES_PRIMARY) {
7251               break;
7252             }
7253             if(doHiragana && hirResult == UCOL_EQUAL) {
7254               if((sColl->flags & UCOL_WAS_HIRAGANA) != (tColl->flags & UCOL_WAS_HIRAGANA)) {
7255                 hirResult = ((sColl->flags & UCOL_WAS_HIRAGANA) > (tColl->flags & UCOL_WAS_HIRAGANA))
7256                   ? UCOL_LESS:UCOL_GREATER;
7257               }
7258             }
7259         } else {
7260             // if two primaries are different, we are done
7261             result = (sOrder < tOrder) ?  UCOL_LESS: UCOL_GREATER;
7262             goto commonReturn;
7263         }
7264       } // no primary difference... do the rest from the buffers
7265     } else { // shifted - do a slightly more complicated processing :)
7266       for(;;) {
7267         UBool sInShifted = FALSE;
7268         UBool tInShifted = FALSE;
7269         // This version of code can be refactored. However, it seems easier to understand this way.
7270         // Source loop. Sam as the target loop.
7271         for(;;) {
7272           sOrder = ucol_IGetNextCE(coll, sColl, status);
7273           if(sOrder == UCOL_NO_MORE_CES) {
7274             UCOL_CEBUF_PUT(&sCEs, sOrder, sColl);
7275             break;
7276           } else if(sOrder == 0
7277             || (sInShifted && (sOrder & UCOL_PRIMARYMASK) == 0)) {
7278             /* UCA amendment - ignore ignorables that follow shifted code points */
7279             continue;
7280           } else if(isContinuation(sOrder)) {
7281             if((sOrder & UCOL_PRIMARYMASK) > 0) { /* There is primary value */
7282               if(sInShifted) {
7283                 sOrder = (sOrder & UCOL_PRIMARYMASK) | 0xC0; /* preserve interesting continuation */
7284                 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl);
7285                 continue;
7286               } else {
7287                 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl);
7288                 break;
7289               }
7290             } else { /* Just lower level values */
7291               if(sInShifted) {
7292                 continue;
7293               } else {
7294                 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl);
7295                 continue;
7296               }
7297             }
7298           } else { /* regular */
7299             if((sOrder & UCOL_PRIMARYMASK) > LVT) {
7300               UCOL_CEBUF_PUT(&sCEs, sOrder, sColl);
7301               break;
7302             } else {
7303               if((sOrder & UCOL_PRIMARYMASK) > 0) {
7304                 sInShifted = TRUE;
7305                 sOrder &= UCOL_PRIMARYMASK;
7306                 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl);
7307                 continue;
7308               } else {
7309                 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl);
7310                 sInShifted = FALSE;
7311                 continue;
7312               }
7313             }
7314           }
7315         }
7316         sOrder &= UCOL_PRIMARYMASK;
7317         sInShifted = FALSE;
7318 
7319         for(;;) {
7320           tOrder = ucol_IGetNextCE(coll, tColl, status);
7321           if(tOrder == UCOL_NO_MORE_CES) {
7322             UCOL_CEBUF_PUT(&tCEs, tOrder, tColl);
7323             break;
7324           } else if(tOrder == 0
7325             || (tInShifted && (tOrder & UCOL_PRIMARYMASK) == 0)) {
7326             /* UCA amendment - ignore ignorables that follow shifted code points */
7327             continue;
7328           } else if(isContinuation(tOrder)) {
7329             if((tOrder & UCOL_PRIMARYMASK) > 0) { /* There is primary value */
7330               if(tInShifted) {
7331                 tOrder = (tOrder & UCOL_PRIMARYMASK) | 0xC0; /* preserve interesting continuation */
7332                 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl);
7333                 continue;
7334               } else {
7335                 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl);
7336                 break;
7337               }
7338             } else { /* Just lower level values */
7339               if(tInShifted) {
7340                 continue;
7341               } else {
7342                 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl);
7343                 continue;
7344               }
7345             }
7346           } else { /* regular */
7347             if((tOrder & UCOL_PRIMARYMASK) > LVT) {
7348               UCOL_CEBUF_PUT(&tCEs, tOrder, tColl);
7349               break;
7350             } else {
7351               if((tOrder & UCOL_PRIMARYMASK) > 0) {
7352                 tInShifted = TRUE;
7353                 tOrder &= UCOL_PRIMARYMASK;
7354                 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl);
7355                 continue;
7356               } else {
7357                 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl);
7358                 tInShifted = FALSE;
7359                 continue;
7360               }
7361             }
7362           }
7363         }
7364         tOrder &= UCOL_PRIMARYMASK;
7365         tInShifted = FALSE;
7366 
7367         if(sOrder == tOrder) {
7368           /*
7369             if(doHiragana && hirResult == UCOL_EQUAL) {
7370               if((sColl.flags & UCOL_WAS_HIRAGANA) != (tColl.flags & UCOL_WAS_HIRAGANA)) {
7371                 hirResult = ((sColl.flags & UCOL_WAS_HIRAGANA) > (tColl.flags & UCOL_WAS_HIRAGANA))
7372                   ? UCOL_LESS:UCOL_GREATER;
7373               }
7374             }
7375           */
7376             if(sOrder == UCOL_NO_MORE_CES_PRIMARY) {
7377               break;
7378             } else {
7379               sOrder = 0; tOrder = 0;
7380               continue;
7381             }
7382         } else {
7383             result = (sOrder < tOrder) ? UCOL_LESS : UCOL_GREATER;
7384             goto commonReturn;
7385         }
7386       } /* no primary difference... do the rest from the buffers */
7387     }
7388 
7389     /* now, we're gonna reexamine collected CEs */
7390     uint32_t    *sCE;
7391     uint32_t    *tCE;
7392 
7393     /* This is the secondary level of comparison */
7394     if(checkSecTer) {
7395       if(!isFrenchSec) { /* normal */
7396         sCE = sCEs.buf;
7397         tCE = tCEs.buf;
7398         for(;;) {
7399           while (secS == 0) {
7400             secS = *(sCE++) & UCOL_SECONDARYMASK;
7401           }
7402 
7403           while(secT == 0) {
7404               secT = *(tCE++) & UCOL_SECONDARYMASK;
7405           }
7406 
7407           if(secS == secT) {
7408             if(secS == UCOL_NO_MORE_CES_SECONDARY) {
7409               break;
7410             } else {
7411               secS = 0; secT = 0;
7412               continue;
7413             }
7414           } else {
7415                result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
7416                goto commonReturn;
7417           }
7418         }
7419       } else { /* do the French */
7420         uint32_t *sCESave = NULL;
7421         uint32_t *tCESave = NULL;
7422         sCE = sCEs.pos-2; /* this could also be sCEs-- if needs to be optimized */
7423         tCE = tCEs.pos-2;
7424         for(;;) {
7425           while (secS == 0 && sCE >= sCEs.buf) {
7426             if(sCESave == 0) {
7427               secS = *(sCE--);
7428               if(isContinuation(secS)) {
7429                 while(isContinuation(secS = *(sCE--)));
7430                 /* after this, secS has the start of continuation, and sCEs points before that */
7431                 sCESave = sCE; /* we save it, so that we know where to come back AND that we need to go forward */
7432                 sCE+=2;  /* need to point to the first continuation CP */
7433                 /* However, now you can just continue doing stuff */
7434               }
7435             } else {
7436               secS = *(sCE++);
7437               if(!isContinuation(secS)) { /* This means we have finished with this cont */
7438                 sCE = sCESave;            /* reset the pointer to before continuation */
7439                 sCESave = 0;
7440                 continue;
7441               }
7442             }
7443             secS &= UCOL_SECONDARYMASK; /* remove the continuation bit */
7444           }
7445 
7446           while(secT == 0 && tCE >= tCEs.buf) {
7447             if(tCESave == 0) {
7448               secT = *(tCE--);
7449               if(isContinuation(secT)) {
7450                 while(isContinuation(secT = *(tCE--)));
7451                 /* after this, secS has the start of continuation, and sCEs points before that */
7452                 tCESave = tCE; /* we save it, so that we know where to come back AND that we need to go forward */
7453                 tCE+=2;  /* need to point to the first continuation CP */
7454                 /* However, now you can just continue doing stuff */
7455               }
7456             } else {
7457               secT = *(tCE++);
7458               if(!isContinuation(secT)) { /* This means we have finished with this cont */
7459                 tCE = tCESave;          /* reset the pointer to before continuation */
7460                 tCESave = 0;
7461                 continue;
7462               }
7463             }
7464             secT &= UCOL_SECONDARYMASK; /* remove the continuation bit */
7465           }
7466 
7467           if(secS == secT) {
7468             if(secS == UCOL_NO_MORE_CES_SECONDARY || (sCE < sCEs.buf && tCE < tCEs.buf)) {
7469               break;
7470             } else {
7471               secS = 0; secT = 0;
7472               continue;
7473             }
7474           } else {
7475               result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
7476               goto commonReturn;
7477           }
7478         }
7479       }
7480     }
7481 
7482     /* doing the case bit */
7483     if(checkCase) {
7484       sCE = sCEs.buf;
7485       tCE = tCEs.buf;
7486       for(;;) {
7487         while((secS & UCOL_REMOVE_CASE) == 0) {
7488           if(!isContinuation(*sCE++)) {
7489             secS =*(sCE-1);
7490             if(((secS & UCOL_PRIMARYMASK) != 0) || strength > UCOL_PRIMARY) {
7491             // primary ignorables should not be considered on the case level when the strength is primary
7492             // otherwise, the CEs stop being well-formed
7493               secS &= UCOL_TERT_CASE_MASK;
7494               secS ^= caseSwitch;
7495             } else {
7496               secS = 0;
7497             }
7498           } else {
7499             secS = 0;
7500           }
7501         }
7502 
7503         while((secT & UCOL_REMOVE_CASE) == 0) {
7504           if(!isContinuation(*tCE++)) {
7505             secT = *(tCE-1);
7506             if(((secT & UCOL_PRIMARYMASK) != 0) || strength > UCOL_PRIMARY) {
7507             // primary ignorables should not be considered on the case level when the strength is primary
7508             // otherwise, the CEs stop being well-formed
7509               secT &= UCOL_TERT_CASE_MASK;
7510               secT ^= caseSwitch;
7511             } else {
7512               secT = 0;
7513             }
7514           } else {
7515             secT = 0;
7516           }
7517         }
7518 
7519         if((secS & UCOL_CASE_BIT_MASK) < (secT & UCOL_CASE_BIT_MASK)) {
7520           result = UCOL_LESS;
7521           goto commonReturn;
7522         } else if((secS & UCOL_CASE_BIT_MASK) > (secT & UCOL_CASE_BIT_MASK)) {
7523           result = UCOL_GREATER;
7524           goto commonReturn;
7525         }
7526 
7527         if((secS & UCOL_REMOVE_CASE) == UCOL_NO_MORE_CES_TERTIARY || (secT & UCOL_REMOVE_CASE) == UCOL_NO_MORE_CES_TERTIARY ) {
7528           break;
7529         } else {
7530           secS = 0;
7531           secT = 0;
7532         }
7533       }
7534     }
7535 
7536     /* Tertiary level */
7537     if(checkTertiary) {
7538       secS = 0;
7539       secT = 0;
7540       sCE = sCEs.buf;
7541       tCE = tCEs.buf;
7542       for(;;) {
7543         while((secS & UCOL_REMOVE_CASE) == 0) {
7544           secS = *(sCE++) & tertiaryMask;
7545           if(!isContinuation(secS)) {
7546             secS ^= caseSwitch;
7547           } else {
7548             secS &= UCOL_REMOVE_CASE;
7549           }
7550         }
7551 
7552         while((secT & UCOL_REMOVE_CASE)  == 0) {
7553           secT = *(tCE++) & tertiaryMask;
7554           if(!isContinuation(secT)) {
7555             secT ^= caseSwitch;
7556           } else {
7557             secT &= UCOL_REMOVE_CASE;
7558           }
7559         }
7560 
7561         if(secS == secT) {
7562           if((secS & UCOL_REMOVE_CASE) == 1) {
7563             break;
7564           } else {
7565             secS = 0; secT = 0;
7566             continue;
7567           }
7568         } else {
7569             result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
7570             goto commonReturn;
7571         }
7572       }
7573     }
7574 
7575 
7576     if(qShifted /*checkQuad*/) {
7577       UBool sInShifted = TRUE;
7578       UBool tInShifted = TRUE;
7579       secS = 0;
7580       secT = 0;
7581       sCE = sCEs.buf;
7582       tCE = tCEs.buf;
7583       for(;;) {
7584         while(secS == 0 && secS != UCOL_NO_MORE_CES || (isContinuation(secS) && !sInShifted)) {
7585           secS = *(sCE++);
7586           if(isContinuation(secS)) {
7587             if(!sInShifted) {
7588               continue;
7589             }
7590           } else if(secS > LVT || (secS & UCOL_PRIMARYMASK) == 0) { /* non continuation */
7591             secS = UCOL_PRIMARYMASK;
7592             sInShifted = FALSE;
7593           } else {
7594             sInShifted = TRUE;
7595           }
7596         }
7597         secS &= UCOL_PRIMARYMASK;
7598 
7599 
7600         while(secT == 0 && secT != UCOL_NO_MORE_CES || (isContinuation(secT) && !tInShifted)) {
7601           secT = *(tCE++);
7602           if(isContinuation(secT)) {
7603             if(!tInShifted) {
7604               continue;
7605             }
7606           } else if(secT > LVT || (secT & UCOL_PRIMARYMASK) == 0) {
7607             secT = UCOL_PRIMARYMASK;
7608             tInShifted = FALSE;
7609           } else {
7610             tInShifted = TRUE;
7611           }
7612         }
7613         secT &= UCOL_PRIMARYMASK;
7614 
7615         if(secS == secT) {
7616           if(secS == UCOL_NO_MORE_CES_PRIMARY) {
7617             break;
7618           } else {
7619             secS = 0; secT = 0;
7620             continue;
7621           }
7622         } else {
7623             result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
7624             goto commonReturn;
7625         }
7626       }
7627     } else if(doHiragana && hirResult != UCOL_EQUAL) {
7628       // If we're fine on quaternaries, we might be different
7629       // on Hiragana. This, however, might fail us in shifted.
7630       result = hirResult;
7631       goto commonReturn;
7632     }
7633 
7634     /*  For IDENTICAL comparisons, we use a bitwise character comparison */
7635     /*  as a tiebreaker if all else is equal.                                */
7636     /*  Getting here  should be quite rare - strings are not identical -     */
7637     /*     that is checked first, but compared == through all other checks.  */
7638     if(checkIdent)
7639     {
7640         //result = ucol_checkIdent(&sColl, &tColl, coll->normalizationMode == UCOL_ON);
7641         result = ucol_checkIdent(sColl, tColl, TRUE, status);
7642     }
7643 
7644 commonReturn:
7645     if ((sColl->flags | tColl->flags) & UCOL_ITER_ALLOCATED) {
7646         freeHeapWritableBuffer(sColl);
7647         freeHeapWritableBuffer(tColl);
7648 
7649         if (sCEs.buf != sCEs.localArray ) {
7650             uprv_free(sCEs.buf);
7651         }
7652         if (tCEs.buf != tCEs.localArray ) {
7653             uprv_free(tCEs.buf);
7654         }
7655     }
7656 
7657     return result;
7658 }
7659 
7660 
7661 static inline uint32_t
ucol_getLatinOneContraction(const UCollator * coll,int32_t strength,uint32_t CE,const UChar * s,int32_t * index,int32_t len)7662 ucol_getLatinOneContraction(const UCollator *coll, int32_t strength,
7663                           uint32_t CE, const UChar *s, int32_t *index, int32_t len) {
7664   const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE&0xFFF);
7665   int32_t latinOneOffset = (CE & 0x00FFF000) >> 12;
7666   int32_t offset = 1;
7667   UChar schar = 0, tchar = 0;
7668 
7669   for(;;) {
7670     if(len == -1) {
7671       if(s[*index] == 0) { // end of string
7672         return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
7673       } else {
7674         schar = s[*index];
7675       }
7676     } else {
7677       if(*index == len) {
7678         return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
7679       } else {
7680         schar = s[*index];
7681       }
7682     }
7683 
7684     while(schar > (tchar = *(UCharOffset+offset))) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
7685       offset++;
7686     }
7687 
7688     if (schar == tchar) {
7689       (*index)++;
7690       return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset+offset]);
7691     }
7692     else
7693     {
7694       if(schar & 0xFF00 /*> UCOL_ENDOFLATIN1RANGE*/) {
7695         return UCOL_BAIL_OUT_CE;
7696       }
7697       // skip completely ignorables
7698       uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, schar);
7699       if(isZeroCE == 0) { // we have to ignore completely ignorables
7700         (*index)++;
7701         continue;
7702       }
7703 
7704       return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
7705     }
7706   }
7707 }
7708 
7709 
7710 /**
7711  * This is a fast strcoll, geared towards text in Latin-1.
7712  * It supports contractions of size two, French secondaries
7713  * and case switching. You can use it with strengths primary
7714  * to tertiary. It does not support shifted and case level.
7715  * It relies on the table built by setupLatin1Table. If it
7716  * doesn't understand something, it will go to the regular
7717  * strcoll.
      *
      * The strings are compared one level at a time (primary, then secondary,
      * then tertiary, as far as coll->strength asks for). Every level has its
      * own table of coll->latinOneTableLen entries inside coll->latinOneCEs,
      * indexed by code unit. A 32-bit table entry is consumed one byte at a
      * time, most significant byte first: when two entries differ only their
      * top bytes are compared, then both are shifted left by 8; a value that
      * has become 0 is replaced by the entry of the next character.
      *
      * @param coll    collator; strength, latinOneCEs, latinOneTableLen and
      *                frenchCollation are read here
      * @param source  source text
      * @param sLen    length of source, or -1 if it is zero-terminated
      * @param target  target text
      * @param tLen    length of target, or -1 if it is zero-terminated
      * @param status  only handed on to ucol_strcollRegular() when the fast
      *                path gives up; not read or written otherwise
      * @return UCOL_LESS, UCOL_GREATER or UCOL_EQUAL, or the result of
      *         ucol_strcollRegular() after a bail-out
7718  */
7719 static inline UCollationResult
ucol_strcollUseLatin1(const UCollator * coll,const UChar * source,int32_t sLen,const UChar * target,int32_t tLen,UErrorCode * status)7720 ucol_strcollUseLatin1( const UCollator    *coll,
7721               const UChar        *source,
7722               int32_t            sLen,
7723               const UChar        *target,
7724               int32_t            tLen,
7725               UErrorCode *status)
7726 {
7727     U_ALIGN_CODE(16);
7728     int32_t strength = coll->strength;
7729 
         // Read positions, current code units and the not yet consumed part of
         // the current table entry for each string.
7730     int32_t sIndex = 0, tIndex = 0;
7731     UChar sChar = 0, tChar = 0;
7732     uint32_t sOrder=0, tOrder=0;
7733 
7734     UBool endOfSource = FALSE;
7735 
         // Table for the level being compared; starts at the primary table and
         // is advanced by coll->latinOneTableLen before each further level.
7736     uint32_t *elements = coll->latinOneCEs;
7737 
7738     UBool haveContractions = FALSE; // if we have contractions in our string
7739                                     // we cannot do French secondary
7740 
7741     // Do the primary level
7742     for(;;) {
7743       while(sOrder==0) { // this loop skips primary ignorables
7744         // sOrder=getNextlatinOneCE(source);
7745         if(sLen==-1) {   // handling zero terminated strings
                 // note: sIndex is advanced even when the terminator is read
7746           sChar=source[sIndex++];
7747           if(sChar==0) {
7748             endOfSource = TRUE;
7749             break;
7750           }
7751         } else {        // handling strings with known length
7752           if(sIndex==sLen) {
7753             endOfSource = TRUE;
7754             break;
7755           }
7756           sChar=source[sIndex++];
7757         }
7758         if(sChar&0xFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32)
7759           //fprintf(stderr, "R");
7760           goto returnRegular;
7761           //return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
7762         }
7763         sOrder = elements[sChar];
7764         if(sOrder >= UCOL_NOT_FOUND) { // if we got a special
7765           // specials can basically be either contractions or bail-out signs. If we get anything
7766           // else, we'll bail out anyway
7767           if(getCETag(sOrder) == CONTRACTION_TAG) {
7768             sOrder = ucol_getLatinOneContraction(coll, UCOL_PRIMARY, sOrder, source, &sIndex, sLen);
7769             haveContractions = TRUE; // if there are contractions, we cannot do French secondary
7770             // However, if there are contractions in the table, but we always use just one char,
7771             // we might be able to do French. This should be checked out.
7772           }
                 // Still a special after contraction handling (or a special that
                 // was not a contraction at all): leave the fast path.
7773           if(sOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) {
7774             //fprintf(stderr, "S");
7775             goto returnRegular;
7776             //return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
7777           }
7778         }
7779       }
7780 
7781       while(tOrder==0) {  // this loop skips primary ignorables
7782         // tOrder=getNextlatinOneCE(target);
7783         if(tLen==-1) {    // handling zero terminated strings
7784           tChar=target[tIndex++];
7785           if(tChar==0) {
7786             if(endOfSource) { // this is different than source loop,
7787               // as we already know that source loop is done here,
7788               // so we can either finish the primary loop if both
7789               // strings are done or announce the result if only
7790               // target is done. Same below.
7791               goto endOfPrimLoop;
7792             } else {
7793               return UCOL_GREATER;
7794             }
7795           }
7796         } else {          // handling strings with known length
7797           if(tIndex==tLen) {
7798             if(endOfSource) {
7799               goto endOfPrimLoop;
7800             } else {
7801               return UCOL_GREATER;
7802             }
7803           }
7804           tChar=target[tIndex++];
7805         }
7806         if(tChar&0xFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32)
7807           //fprintf(stderr, "R");
7808           goto returnRegular;
7809           //return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
7810         }
7811         tOrder = elements[tChar];
7812         if(tOrder >= UCOL_NOT_FOUND) {
7813           // Handling specials, see the comments for source
7814           if(getCETag(tOrder) == CONTRACTION_TAG) {
7815             tOrder = ucol_getLatinOneContraction(coll, UCOL_PRIMARY, tOrder, target, &tIndex, tLen);
7816             haveContractions = TRUE;
7817           }
7818           if(tOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) {
7819             //fprintf(stderr, "S");
7820             goto returnRegular;
7821             //return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
7822           }
7823         }
7824       }
7825       if(endOfSource) { // source is finished, but target is not, say the result.
7826           return UCOL_LESS;
7827       }
7828 
7829       if(sOrder == tOrder) { // if we have same CEs, we continue the loop
7830         sOrder = 0; tOrder = 0;
7831         continue;
7832       } else {
7833         // compare current top bytes
7834         if(((sOrder^tOrder)&0xFF000000)!=0) {
7835           // top bytes differ, return difference
7836           if(sOrder < tOrder) {
7837             return UCOL_LESS;
7838           } else if(sOrder > tOrder) {
7839             return UCOL_GREATER;
7840           }
7841           // instead of return (int32_t)(sOrder>>24)-(int32_t)(tOrder>>24);
7842           // since we must return enum value
7843         }
7844 
7845         // top bytes match, continue with following bytes
             // (a side whose remaining bytes are all zero fetches its next
             // element on the next pass, the other side keeps what is left)
7846         sOrder<<=8;
7847         tOrder<<=8;
7848       }
7849     }
7850 
7851 endOfPrimLoop:
7852     // after primary loop, we definitely know the sizes of strings,
7853     // so we set it and use simpler loop for secondaries and tertiaries
         // NOTE(review): for zero-terminated input the index was already
         // advanced past the terminator when it was read, so the length stored
         // here appears to include the terminator - presumably harmless because
         // the table entry for U+0000 is 0 on every level; confirm.
7854     sLen = sIndex; tLen = tIndex;
7855     if(strength >= UCOL_SECONDARY) {
7856       // adjust the table beginning
7857       elements += coll->latinOneTableLen;
7858       endOfSource = FALSE;
7859 
7860       if(coll->frenchCollation == UCOL_OFF) { // non French
7861         // This loop is a simplified copy of primary loop
7862         // at this point we know that whole strings are latin-1, so we don't
7863         // check for that. We also know that we only have contractions as
7864         // specials.
7865         sIndex = 0; tIndex = 0;
7866         for(;;) {
7867           while(sOrder==0) {
7868             if(sIndex==sLen) {
7869               endOfSource = TRUE;
7870               break;
7871             }
7872             sChar=source[sIndex++];
7873             sOrder = elements[sChar];
                   // NOTE(review): the primary loop tests ">= UCOL_NOT_FOUND",
                   // this one tests "> UCOL_NOT_FOUND" - confirm the difference
                   // is intended.
7874             if(sOrder > UCOL_NOT_FOUND) {
7875               sOrder = ucol_getLatinOneContraction(coll, UCOL_SECONDARY, sOrder, source, &sIndex, sLen);
7876             }
7877           }
7878 
7879           while(tOrder==0) {
7880             if(tIndex==tLen) {
7881               if(endOfSource) {
7882                 goto endOfSecLoop;
7883               } else {
7884                 return UCOL_GREATER;
7885               }
7886             }
7887             tChar=target[tIndex++];
7888             tOrder = elements[tChar];
7889             if(tOrder > UCOL_NOT_FOUND) {
7890               tOrder = ucol_getLatinOneContraction(coll, UCOL_SECONDARY, tOrder, target, &tIndex, tLen);
7891             }
7892           }
7893           if(endOfSource) {
7894               return UCOL_LESS;
7895           }
7896 
7897           if(sOrder == tOrder) {
7898             sOrder = 0; tOrder = 0;
7899             continue;
7900           } else {
7901             // see primary loop for comments on this
7902             if(((sOrder^tOrder)&0xFF000000)!=0) {
7903               if(sOrder < tOrder) {
7904                 return UCOL_LESS;
7905               } else if(sOrder > tOrder) {
7906                 return UCOL_GREATER;
7907               }
7908             }
7909             sOrder<<=8;
7910             tOrder<<=8;
7911           }
7912         }
7913       } else { // French
7914         if(haveContractions) { // if we have contractions, we have to bail out
7915           // since we don't really know how to handle them here
               // (the regular comparison then receives the lengths computed
               // at endOfPrimLoop, not the caller's original values)
7916           goto returnRegular;
7917           //return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
7918         }
7919         // For French, we go backwards
7920         sIndex = sLen; tIndex = tLen;
7921         for(;;) {
7922           while(sOrder==0) {
7923             if(sIndex==0) {
7924               endOfSource = TRUE;
7925               break;
7926             }
7927             sChar=source[--sIndex];
7928             sOrder = elements[sChar];
7929             // don't even look for contractions
7930           }
7931 
7932           while(tOrder==0) {
7933             if(tIndex==0) {
7934               if(endOfSource) {
7935                 goto endOfSecLoop;
7936               } else {
7937                 return UCOL_GREATER;
7938               }
7939             }
7940             tChar=target[--tIndex];
7941             tOrder = elements[tChar];
7942             // don't even look for contractions
7943           }
7944           if(endOfSource) {
7945               return UCOL_LESS;
7946           }
7947 
7948           if(sOrder == tOrder) {
7949             sOrder = 0; tOrder = 0;
7950             continue;
7951           } else {
7952             // see the primary loop for comments
7953             if(((sOrder^tOrder)&0xFF000000)!=0) {
7954               if(sOrder < tOrder) {
7955                 return UCOL_LESS;
7956               } else if(sOrder > tOrder) {
7957                 return UCOL_GREATER;
7958               }
7959             }
7960             sOrder<<=8;
7961             tOrder<<=8;
7962           }
7963         }
7964       }
7965     }
7966 
7967 endOfSecLoop:
7968     if(strength >= UCOL_TERTIARY) {
7969       // tertiary loop is the same as secondary (except no French)
7970       elements += coll->latinOneTableLen;
7971       sIndex = 0; tIndex = 0;
7972       endOfSource = FALSE;
7973       for(;;) {
7974         while(sOrder==0) {
7975           if(sIndex==sLen) {
7976             endOfSource = TRUE;
7977             break;
7978           }
7979           sChar=source[sIndex++];
7980           sOrder = elements[sChar];
7981           if(sOrder > UCOL_NOT_FOUND) {
7982             sOrder = ucol_getLatinOneContraction(coll, UCOL_TERTIARY, sOrder, source, &sIndex, sLen);
7983           }
7984         }
7985         while(tOrder==0) {
7986           if(tIndex==tLen) {
7987             if(endOfSource) {
7988               return UCOL_EQUAL; // if both strings are at the end, they are equal
7989             } else {
7990               return UCOL_GREATER;
7991             }
7992           }
7993           tChar=target[tIndex++];
7994           tOrder = elements[tChar];
7995           if(tOrder > UCOL_NOT_FOUND) {
7996             tOrder = ucol_getLatinOneContraction(coll, UCOL_TERTIARY, tOrder, target, &tIndex, tLen);
7997           }
7998         }
7999         if(endOfSource) {
8000             return UCOL_LESS;
8001         }
8002         if(sOrder == tOrder) {
8003           sOrder = 0; tOrder = 0;
8004           continue;
8005         } else {
               // same byte-wise comparison as in the primary loop
8006           if(((sOrder^tOrder)&0xff000000)!=0) {
8007             if(sOrder < tOrder) {
8008               return UCOL_LESS;
8009             } else if(sOrder > tOrder) {
8010               return UCOL_GREATER;
8011             }
8012           }
8013           sOrder<<=8;
8014           tOrder<<=8;
8015         }
8016       }
8017     }
         // Reached when the strength stops before tertiary and no difference
         // was found on the levels that were compared.
8018     return UCOL_EQUAL;
8019 
8020 returnRegular:
         // Fallback: restart the comparison from the beginning of both strings
         // with the general-purpose implementation.
8021     // Preparing the context objects for iterating over strings
8022     collIterate sColl, tColl;
8023 
8024     IInit_collIterate(coll, source, sLen, &sColl);
8025     IInit_collIterate(coll, target, tLen, &tColl);
8026     return ucol_strcollRegular(&sColl, &tColl, status);
8027 }
8028 
8029 
8030 U_CAPI UCollationResult U_EXPORT2
ucol_strcollIter(const UCollator * coll,UCharIterator * sIter,UCharIterator * tIter,UErrorCode * status)8031 ucol_strcollIter( const UCollator    *coll,
8032                  UCharIterator *sIter,
8033                  UCharIterator *tIter,
8034                  UErrorCode         *status) {
8035   if(!status || U_FAILURE(*status)) {
8036     return UCOL_EQUAL;
8037   }
8038 
8039   UTRACE_ENTRY(UTRACE_UCOL_STRCOLLITER);
8040   UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, sIter=%p, tIter=%p", coll, sIter, tIter);
8041 
8042   if (sIter == tIter) {
8043     UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status)
8044     return UCOL_EQUAL;
8045   }
8046   if(sIter == NULL || tIter == NULL || coll == NULL) {
8047     *status = U_ILLEGAL_ARGUMENT_ERROR;
8048     UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status)
8049     return UCOL_EQUAL;
8050   }
8051 
8052   UCollationResult result = UCOL_EQUAL;
8053 
8054   // Preparing the context objects for iterating over strings
8055   collIterate sColl, tColl;
8056   // The division for the array length may truncate the array size to
8057   // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
8058   // for all platforms anyway.
8059   UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
8060   UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
8061   UNormIterator *sNormIter = NULL, *tNormIter = NULL;
8062 
8063   IInit_collIterate(coll, NULL, -1, &sColl);
8064   sColl.iterator = sIter;
8065   sColl.flags |= UCOL_USE_ITERATOR;
8066   IInit_collIterate(coll, NULL, -1, &tColl);
8067   tColl.flags |= UCOL_USE_ITERATOR;
8068   tColl.iterator = tIter;
8069 
8070   if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON) {
8071     sNormIter = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), status);
8072     sColl.iterator = unorm_setIter(sNormIter, sIter, UNORM_FCD, status);
8073     sColl.flags &= ~UCOL_ITER_NORM;
8074 
8075     tNormIter = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), status);
8076     tColl.iterator = unorm_setIter(tNormIter, tIter, UNORM_FCD, status);
8077     tColl.flags &= ~UCOL_ITER_NORM;
8078   }
8079 
8080   UChar32 sChar = U_SENTINEL, tChar = U_SENTINEL;
8081 
8082   while((sChar = sColl.iterator->next(sColl.iterator)) ==
8083     (tChar = tColl.iterator->next(tColl.iterator))) {
8084     if(sChar == U_SENTINEL) {
8085       result = UCOL_EQUAL;
8086       goto end_compare;
8087     }
8088   }
8089 
8090   if(sChar == U_SENTINEL) {
8091     tChar = tColl.iterator->previous(tColl.iterator);
8092   }
8093 
8094   if(tChar == U_SENTINEL) {
8095     sChar = sColl.iterator->previous(sColl.iterator);
8096   }
8097 
8098   sChar = sColl.iterator->previous(sColl.iterator);
8099   tChar = tColl.iterator->previous(tColl.iterator);
8100 
8101   if (ucol_unsafeCP((UChar)sChar, coll) || ucol_unsafeCP((UChar)tChar, coll))
8102   {
8103       // We are stopped in the middle of a contraction.
8104       // Scan backwards through the == part of the string looking for the start of the contraction.
8105       //   It doesn't matter which string we scan, since they are the same in this region.
8106       do
8107       {
8108         sChar = sColl.iterator->previous(sColl.iterator);
8109         tChar = tColl.iterator->previous(tColl.iterator);
8110       }
8111       while (sChar != U_SENTINEL && ucol_unsafeCP((UChar)sChar, coll));
8112   }
8113 
8114 
8115   if(U_SUCCESS(*status)) {
8116     result = ucol_strcollRegular(&sColl, &tColl, status);
8117   }
8118 
8119 end_compare:
8120   if(sNormIter || tNormIter) {
8121     unorm_closeIter(sNormIter);
8122     unorm_closeIter(tNormIter);
8123   }
8124 
8125   UTRACE_EXIT_VALUE_STATUS(result, *status)
8126   return result;
8127 }
8128 
8129 
8130 
8131 /*                                                                      */
8132 /* ucol_strcoll     Main public API string comparison function          */
8133 /*                                                                      */
8134 U_CAPI UCollationResult U_EXPORT2
ucol_strcoll(const UCollator * coll,const UChar * source,int32_t sourceLength,const UChar * target,int32_t targetLength)8135 ucol_strcoll( const UCollator    *coll,
8136               const UChar        *source,
8137               int32_t            sourceLength,
8138               const UChar        *target,
8139               int32_t            targetLength) {
8140     U_ALIGN_CODE(16);
8141 
8142     UTRACE_ENTRY(UTRACE_UCOL_STRCOLL);
8143     if (UTRACE_LEVEL(UTRACE_VERBOSE)) {
8144       UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source=%p, target=%p", coll, source, target);
8145       UTRACE_DATA2(UTRACE_VERBOSE, "source string = %vh ", source, sourceLength);
8146       UTRACE_DATA2(UTRACE_VERBOSE, "target string = %vh ", target, targetLength);
8147     }
8148 
8149     UErrorCode status = U_ZERO_ERROR;
8150     if(source == NULL || target == NULL) {
8151       // do not crash, but return. Should have
8152       // status argument to return error.
8153       UTRACE_EXIT_VALUE(UTRACE_UCOL_STRCOLL);
8154       return UCOL_EQUAL;
8155     }
8156       collIterate sColl, tColl;
8157 
8158     /* Scan the strings.  Find:                                                             */
8159     /*    The length of any leading portion that is equal                                   */
8160     /*    Whether they are exactly equal.  (in which case we just return)                   */
8161     const UChar    *pSrc    = source;
8162     const UChar    *pTarg   = target;
8163     int32_t        equalLength;
8164 
8165     if (sourceLength == -1 && targetLength == -1) {
8166         // Both strings are null terminated.
8167         //    Check for them being the same string, and scan through
8168         //    any leading equal portion.
8169         if (source==target) {
8170             UTRACE_EXIT_VALUE(UCOL_EQUAL);
8171             return UCOL_EQUAL;
8172         }
8173 
8174         for (;;) {
8175             if ( *pSrc != *pTarg || *pSrc == 0) {
8176                 break;
8177             }
8178             pSrc++;
8179             pTarg++;
8180         }
8181         if (*pSrc == 0 && *pTarg == 0) {
8182             UTRACE_EXIT_VALUE(UCOL_EQUAL);
8183             return UCOL_EQUAL;
8184         }
8185         equalLength = pSrc - source;
8186     }
8187     else
8188     {
8189         // One or both strings has an explicit length.
8190         /* check if source and target are same strings */
8191 
8192         if (source==target  && sourceLength==targetLength) {
8193             UTRACE_EXIT_VALUE(UCOL_EQUAL);
8194             return UCOL_EQUAL;
8195         }
8196         const UChar    *pSrcEnd = source + sourceLength;
8197         const UChar    *pTargEnd = target + targetLength;
8198 
8199 
8200         // Scan while the strings are bitwise ==, or until one is exhausted.
8201             for (;;) {
8202                 if (pSrc == pSrcEnd || pTarg == pTargEnd) {
8203                     break;
8204                 }
8205                 if ((*pSrc == 0 && sourceLength == -1) || (*pTarg == 0 && targetLength == -1)) {
8206                     break;
8207                 }
8208                 if (*pSrc != *pTarg) {
8209                     break;
8210                 }
8211                 pSrc++;
8212                 pTarg++;
8213             }
8214             equalLength = pSrc - source;
8215 
8216             // If we made it all the way through both strings, we are done.  They are ==
8217             if ((pSrc ==pSrcEnd  || (pSrcEnd <pSrc  && *pSrc==0))  &&   /* At end of src string, however it was specified. */
8218                 (pTarg==pTargEnd || (pTargEnd<pTarg && *pTarg==0)))  {  /* and also at end of dest string                  */
8219                 UTRACE_EXIT_VALUE(UCOL_EQUAL);
8220                 return UCOL_EQUAL;
8221             }
8222     }
8223     if (equalLength > 0) {
8224         /* There is an identical portion at the beginning of the two strings.        */
8225         /*   If the identical portion ends within a contraction or a comibining      */
8226         /*   character sequence, back up to the start of that sequence.              */
8227         pSrc  = source + equalLength;        /* point to the first differing chars   */
8228         pTarg = target + equalLength;
8229         if (pSrc  != source+sourceLength && ucol_unsafeCP(*pSrc, coll) ||
8230             pTarg != target+targetLength && ucol_unsafeCP(*pTarg, coll))
8231         {
8232             // We are stopped in the middle of a contraction.
8233             // Scan backwards through the == part of the string looking for the start of the contraction.
8234             //   It doesn't matter which string we scan, since they are the same in this region.
8235             do
8236             {
8237                 equalLength--;
8238                 pSrc--;
8239             }
8240             while (equalLength>0 && ucol_unsafeCP(*pSrc, coll));
8241         }
8242 
8243         source += equalLength;
8244         target += equalLength;
8245         if (sourceLength > 0) {
8246             sourceLength -= equalLength;
8247         }
8248         if (targetLength > 0) {
8249             targetLength -= equalLength;
8250         }
8251     }
8252 
8253     UCollationResult  returnVal;
8254     if(!coll->latinOneUse || (sourceLength > 0 && *source&0xff00) || (targetLength > 0 && *target&0xff00)) {
8255       // Preparing the context objects for iterating over strings
8256       IInit_collIterate(coll, source, sourceLength, &sColl);
8257       IInit_collIterate(coll, target, targetLength, &tColl);
8258       returnVal = ucol_strcollRegular(&sColl, &tColl, &status);
8259     } else {
8260       returnVal = ucol_strcollUseLatin1(coll, source, sourceLength, target, targetLength, &status);
8261     }
8262     UTRACE_EXIT_VALUE(returnVal);
8263     return returnVal;
8264 }
8265 
8266 /* convenience function for comparing strings */
8267 U_CAPI UBool U_EXPORT2
ucol_greater(const UCollator * coll,const UChar * source,int32_t sourceLength,const UChar * target,int32_t targetLength)8268 ucol_greater(    const    UCollator        *coll,
8269         const    UChar            *source,
8270         int32_t            sourceLength,
8271         const    UChar            *target,
8272         int32_t            targetLength)
8273 {
8274   return (ucol_strcoll(coll, source, sourceLength, target, targetLength)
8275       == UCOL_GREATER);
8276 }
8277 
8278 /* convenience function for comparing strings */
8279 U_CAPI UBool U_EXPORT2
ucol_greaterOrEqual(const UCollator * coll,const UChar * source,int32_t sourceLength,const UChar * target,int32_t targetLength)8280 ucol_greaterOrEqual(    const    UCollator    *coll,
8281             const    UChar        *source,
8282             int32_t        sourceLength,
8283             const    UChar        *target,
8284             int32_t        targetLength)
8285 {
8286   return (ucol_strcoll(coll, source, sourceLength, target, targetLength)
8287       != UCOL_LESS);
8288 }
8289 
8290 /* convenience function for comparing strings */
8291 U_CAPI UBool U_EXPORT2
ucol_equal(const UCollator * coll,const UChar * source,int32_t sourceLength,const UChar * target,int32_t targetLength)8292 ucol_equal(        const    UCollator        *coll,
8293             const    UChar            *source,
8294             int32_t            sourceLength,
8295             const    UChar            *target,
8296             int32_t            targetLength)
8297 {
8298   return (ucol_strcoll(coll, source, sourceLength, target, targetLength)
8299       == UCOL_EQUAL);
8300 }
8301 
8302 U_CAPI void U_EXPORT2
ucol_getUCAVersion(const UCollator * coll,UVersionInfo info)8303 ucol_getUCAVersion(const UCollator* coll, UVersionInfo info) {
8304   if(coll && coll->UCA) {
8305     uprv_memcpy(info, coll->UCA->image->UCAVersion, sizeof(UVersionInfo));
8306   }
8307 }
8308 
8309 U_CAPI int32_t U_EXPORT2
ucol_cloneBinary(const UCollator * coll,uint8_t * buffer,int32_t capacity,UErrorCode * status)8310 ucol_cloneBinary(const UCollator *coll,
8311                  uint8_t *buffer, int32_t capacity,
8312                  UErrorCode *status)
8313 {
8314     int32_t length = 0;
8315     if(U_FAILURE(*status)) {
8316         return length;
8317     }
8318     if(capacity < 0) {
8319       *status = U_ILLEGAL_ARGUMENT_ERROR;
8320       return length;
8321     }
8322     if(coll->hasRealData == TRUE) {
8323         length = coll->image->size;
8324         if(length <= capacity) {
8325             uprv_memcpy(buffer, coll->image, length);
8326         } else {
8327             *status = U_BUFFER_OVERFLOW_ERROR;
8328         }
8329     } else {
8330         length = (int32_t)(paddedsize(sizeof(UCATableHeader))+paddedsize(sizeof(UColOptionSet)));
8331         if(length <= capacity) {
8332             /* build the UCATableHeader with minimal entries */
8333             /* do not copy the header from the UCA file because its values are wrong! */
8334             /* uprv_memcpy(result, UCA->image, sizeof(UCATableHeader)); */
8335 
8336             /* reset everything */
8337             uprv_memset(buffer, 0, length);
8338 
8339             /* set the tailoring-specific values */
8340             UCATableHeader *myData = (UCATableHeader *)buffer;
8341             myData->size = length;
8342 
8343             /* offset for the options, the only part of the data that is present after the header */
8344             myData->options = sizeof(UCATableHeader);
8345 
8346             /* need to always set the expansion value for an upper bound of the options */
8347             myData->expansion = myData->options + sizeof(UColOptionSet);
8348 
8349             myData->magic = UCOL_HEADER_MAGIC;
8350             myData->isBigEndian = U_IS_BIG_ENDIAN;
8351             myData->charSetFamily = U_CHARSET_FAMILY;
8352 
8353             /* copy UCA's version; genrb will override all but the builder version with tailoring data */
8354             uprv_memcpy(myData->version, coll->image->version, sizeof(UVersionInfo));
8355 
8356             uprv_memcpy(myData->UCAVersion, coll->image->UCAVersion, sizeof(UVersionInfo));
8357             uprv_memcpy(myData->UCDVersion, coll->image->UCDVersion, sizeof(UVersionInfo));
8358             uprv_memcpy(myData->formatVersion, coll->image->formatVersion, sizeof(UVersionInfo));
8359             myData->jamoSpecial = coll->image->jamoSpecial;
8360 
8361             /* copy the collator options */
8362             uprv_memcpy(buffer+paddedsize(sizeof(UCATableHeader)), coll->options, sizeof(UColOptionSet));
8363         } else {
8364             *status = U_BUFFER_OVERFLOW_ERROR;
8365         }
8366     }
8367     return length;
8368 }
8369 
8370 U_CAPI void U_EXPORT2
ucol_forgetUCA(void)8371 ucol_forgetUCA(void)
8372 {
8373   _staticUCA = NULL;
8374   UCA_DATA_MEM = NULL;
8375 }
8376 
8377 #endif /* #if !UCONFIG_NO_COLLATION */
8378 
8379