• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 **********************************************************************
3 *   Copyright (C) 2001-2009 IBM and others. All rights reserved.
4 **********************************************************************
5 *   Date        Name        Description
6 *  07/02/2001   synwee      Creation.
7 **********************************************************************
8 */
9 
10 #include "unicode/utypes.h"
11 
12 #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION
13 
14 #include "unicode/usearch.h"
15 #include "unicode/ustring.h"
16 #include "unicode/uchar.h"
17 #include "unormimp.h"
18 #include "ucol_imp.h"
19 #include "usrchimp.h"
20 #include "cmemory.h"
21 #include "ucln_in.h"
22 #include "uassert.h"
23 
24 U_NAMESPACE_USE
25 
26 // don't use Boyer-Moore
27 #define BOYER_MOORE 0
28 
29 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
30 
31 // internal definition ---------------------------------------------------
32 
33 #define LAST_BYTE_MASK_          0xFF
34 #define SECOND_LAST_BYTE_SHIFT_  8
35 #define SUPPLEMENTARY_MIN_VALUE_ 0x10000
36 
37 static const uint16_t *fcdTrieIndex = NULL;
38 static UChar32 fcdHighStart = 0;
39 
40 // internal methods -------------------------------------------------
41 
42 /**
43 * Fast collation element iterator setOffset.
44 * This function does not check for bounds.
45 * @param coleiter collation element iterator
46 * @param offset to set
47 */
48 static
setColEIterOffset(UCollationElements * elems,int32_t offset)49 inline void setColEIterOffset(UCollationElements *elems,
50                       int32_t             offset)
51 {
52     collIterate *ci = &(elems->iteratordata_);
53     ci->pos         = ci->string + offset;
54     ci->CEpos       = ci->toReturn = ci->extendCEs ? ci->extendCEs : ci->CEs;
55     if (ci->flags & UCOL_ITER_INNORMBUF) {
56         ci->flags = ci->origFlags;
57     }
58     ci->fcdPosition = NULL;
59 
60 	ci->offsetReturn = NULL;
61     ci->offsetStore  = ci->offsetBuffer;
62 	ci->offsetRepeatCount = ci->offsetRepeatValue = 0;
63 }
64 
65 /**
66 * Getting the mask for collation strength
67 * @param strength collation strength
68 * @return collation element mask
69 */
70 static
getMask(UCollationStrength strength)71 inline uint32_t getMask(UCollationStrength strength)
72 {
73     switch (strength)
74     {
75     case UCOL_PRIMARY:
76         return UCOL_PRIMARYORDERMASK;
77     case UCOL_SECONDARY:
78         return UCOL_SECONDARYORDERMASK | UCOL_PRIMARYORDERMASK;
79     default:
80         return UCOL_TERTIARYORDERMASK | UCOL_SECONDARYORDERMASK |
81                UCOL_PRIMARYORDERMASK;
82     }
83 }
84 
85 /**
86 * This is to squeeze the 21bit ces into a 256 table
87 * @param ce collation element
88 * @return collapsed version of the collation element
89 */
90 static
hash(uint32_t ce)91 inline int hash(uint32_t ce)
92 {
93     // the old value UCOL_PRIMARYORDER(ce) % MAX_TABLE_SIZE_ does not work
94     // well with the new collation where most of the latin 1 characters
95     // are of the value xx000xxx. their hashes will most of the time be 0
96     // to be discussed on the hash algo.
97     return UCOL_PRIMARYORDER(ce) % MAX_TABLE_SIZE_;
98 }
99 
100 U_CDECL_BEGIN
101 static UBool U_CALLCONV
usearch_cleanup(void)102 usearch_cleanup(void) {
103     fcdTrieIndex = NULL;
104     return TRUE;
105 }
106 U_CDECL_END
107 
108 /**
109 * Initializing the fcd tables.
110 * Internal method, status assumed to be a success.
111 * @param status output error if any, caller to check status before calling
112 *               method, status assumed to be success when passed in.
113 */
114 static
initializeFCD(UErrorCode * status)115 inline void initializeFCD(UErrorCode *status)
116 {
117     if (fcdTrieIndex == NULL) {
118         fcdTrieIndex = unorm_getFCDTrieIndex(fcdHighStart, status);
119         ucln_i18n_registerCleanup(UCLN_I18N_USEARCH, usearch_cleanup);
120     }
121 }
122 
123 /**
124 * Gets the fcd value for a character at the argument index.
125 * This method takes into accounts of the supplementary characters.
126 * @param str UTF16 string where character for fcd retrieval resides
127 * @param offset position of the character whose fcd is to be retrieved, to be
128 *               overwritten with the next character position, taking
129 *               surrogate characters into consideration.
130 * @param strlength length of the argument string
131 * @return fcd value
132 */
133 static
getFCD(const UChar * str,int32_t * offset,int32_t strlength)134 uint16_t getFCD(const UChar   *str, int32_t *offset,
135                              int32_t  strlength)
136 {
137     const UChar *temp = str + *offset;
138     uint16_t    result = unorm_nextFCD16(fcdTrieIndex, fcdHighStart, temp, str + strlength);
139     *offset = (int32_t)(temp - str);
140     return result;
141 }
142 
143 /**
144 * Getting the modified collation elements taking into account the collation
145 * attributes
146 * @param strsrch string search data
147 * @param sourcece
148 * @return the modified collation element
149 */
150 static
getCE(const UStringSearch * strsrch,uint32_t sourcece)151 inline int32_t getCE(const UStringSearch *strsrch, uint32_t sourcece)
152 {
153     // note for tertiary we can't use the collator->tertiaryMask, that
154     // is a preprocessed mask that takes into account case options. since
155     // we are only concerned with exact matches, we don't need that.
156     sourcece &= strsrch->ceMask;
157 
158     if (strsrch->toShift) {
159         // alternate handling here, since only the 16 most significant digits
160         // is only used, we can safely do a compare without masking
161         // if the ce is a variable, we mask and get only the primary values
162         // no shifting to quartenary is required since all primary values
163         // less than variabletop will need to be masked off anyway.
164         if (strsrch->variableTop > sourcece) {
165             if (strsrch->strength >= UCOL_QUATERNARY) {
166                 sourcece &= UCOL_PRIMARYORDERMASK;
167             }
168             else {
169                 sourcece = UCOL_IGNORABLE;
170             }
171         }
172     } else if (strsrch->strength >= UCOL_QUATERNARY && sourcece == UCOL_IGNORABLE) {
173         sourcece = 0xFFFF;
174     }
175 
176     return sourcece;
177 }
178 
179 /**
180 * Allocate a memory and returns NULL if it failed.
181 * Internal method, status assumed to be a success.
182 * @param size to allocate
183 * @param status output error if any, caller to check status before calling
184 *               method, status assumed to be success when passed in.
185 * @return newly allocated array, NULL otherwise
186 */
187 static
allocateMemory(uint32_t size,UErrorCode * status)188 inline void * allocateMemory(uint32_t size, UErrorCode *status)
189 {
190     uint32_t *result = (uint32_t *)uprv_malloc(size);
191     if (result == NULL) {
192         *status = U_MEMORY_ALLOCATION_ERROR;
193     }
194     return result;
195 }
196 
197 /**
198 * Adds a uint32_t value to a destination array.
199 * Creates a new array if we run out of space. The caller will have to
200 * manually deallocate the newly allocated array.
201 * Internal method, status assumed to be success, caller has to check status
202 * before calling this method. destination not to be NULL and has at least
203 * size destinationlength.
204 * @param destination target array
205 * @param offset destination offset to add value
206 * @param destinationlength target array size, return value for the new size
207 * @param value to be added
208 * @param increments incremental size expected
209 * @param status output error if any, caller to check status before calling
210 *               method, status assumed to be success when passed in.
211 * @return new destination array, destination if there was no new allocation
212 */
213 static
addTouint32_tArray(int32_t * destination,uint32_t offset,uint32_t * destinationlength,uint32_t value,uint32_t increments,UErrorCode * status)214 inline int32_t * addTouint32_tArray(int32_t    *destination,
215                                     uint32_t    offset,
216                                     uint32_t   *destinationlength,
217                                     uint32_t    value,
218                                     uint32_t    increments,
219                                     UErrorCode *status)
220 {
221     uint32_t newlength = *destinationlength;
222     if (offset + 1 == newlength) {
223         newlength += increments;
224         int32_t *temp = (int32_t *)allocateMemory(
225                                          sizeof(int32_t) * newlength, status);
226         if (U_FAILURE(*status)) {
227             return NULL;
228         }
229         uprv_memcpy(temp, destination, sizeof(int32_t) * offset);
230         *destinationlength = newlength;
231         destination        = temp;
232     }
233     destination[offset] = value;
234     return destination;
235 }
236 
237 /**
238 * Adds a uint64_t value to a destination array.
239 * Creates a new array if we run out of space. The caller will have to
240 * manually deallocate the newly allocated array.
241 * Internal method, status assumed to be success, caller has to check status
242 * before calling this method. destination not to be NULL and has at least
243 * size destinationlength.
244 * @param destination target array
245 * @param offset destination offset to add value
246 * @param destinationlength target array size, return value for the new size
247 * @param value to be added
248 * @param increments incremental size expected
249 * @param status output error if any, caller to check status before calling
250 *               method, status assumed to be success when passed in.
251 * @return new destination array, destination if there was no new allocation
252 */
253 static
addTouint64_tArray(int64_t * destination,uint32_t offset,uint32_t * destinationlength,uint64_t value,uint32_t increments,UErrorCode * status)254 inline int64_t * addTouint64_tArray(int64_t    *destination,
255                                     uint32_t    offset,
256                                     uint32_t   *destinationlength,
257                                     uint64_t    value,
258                                     uint32_t    increments,
259                                     UErrorCode *status)
260 {
261     uint32_t newlength = *destinationlength;
262     if (offset + 1 == newlength) {
263         newlength += increments;
264         int64_t *temp = (int64_t *)allocateMemory(
265                                          sizeof(int64_t) * newlength, status);
266 
267         if (U_FAILURE(*status)) {
268             return NULL;
269         }
270 
271         uprv_memcpy(temp, destination, sizeof(int64_t) * offset);
272         *destinationlength = newlength;
273         destination        = temp;
274     }
275 
276     destination[offset] = value;
277 
278     return destination;
279 }
280 
281 /**
282 * Initializing the ce table for a pattern.
283 * Stores non-ignorable collation keys.
284 * Table size will be estimated by the size of the pattern text. Table
285 * expansion will be perform as we go along. Adding 1 to ensure that the table
286 * size definitely increases.
287 * Internal method, status assumed to be a success.
288 * @param strsrch string search data
289 * @param status output error if any, caller to check status before calling
290 *               method, status assumed to be success when passed in.
291 * @return total number of expansions
292 */
293 static
initializePatternCETable(UStringSearch * strsrch,UErrorCode * status)294 inline uint16_t initializePatternCETable(UStringSearch *strsrch,
295                                          UErrorCode    *status)
296 {
297     UPattern *pattern            = &(strsrch->pattern);
298     uint32_t  cetablesize        = INITIAL_ARRAY_SIZE_;
299     int32_t  *cetable            = pattern->CEBuffer;
300     uint32_t  patternlength      = pattern->textLength;
301     UCollationElements *coleiter = strsrch->utilIter;
302 
303     if (coleiter == NULL) {
304         coleiter = ucol_openElements(strsrch->collator, pattern->text,
305                                      patternlength, status);
306         // status will be checked in ucol_next(..) later and if it is an
307         // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be
308         // returned.
309         strsrch->utilIter = coleiter;
310     }
311     else {
312         uprv_init_collIterate(strsrch->collator, pattern->text,
313                          pattern->textLength,
314                          &coleiter->iteratordata_);
315     }
316 
317     if (pattern->CE != cetable && pattern->CE) {
318         uprv_free(pattern->CE);
319     }
320 
321     uint16_t  offset      = 0;
322     uint16_t  result      = 0;
323     int32_t   ce;
324 
325     while ((ce = ucol_next(coleiter, status)) != UCOL_NULLORDER &&
326            U_SUCCESS(*status)) {
327         uint32_t newce = getCE(strsrch, ce);
328         if (newce) {
329             int32_t *temp = addTouint32_tArray(cetable, offset, &cetablesize,
330                                   newce,
331                                   patternlength - ucol_getOffset(coleiter) + 1,
332                                   status);
333             if (U_FAILURE(*status)) {
334                 return 0;
335             }
336             offset ++;
337             if (cetable != temp && cetable != pattern->CEBuffer) {
338                 uprv_free(cetable);
339             }
340             cetable = temp;
341         }
342         result += (uint16_t)(ucol_getMaxExpansion(coleiter, ce) - 1);
343     }
344 
345     cetable[offset]   = 0;
346     pattern->CE       = cetable;
347     pattern->CELength = offset;
348 
349     return result;
350 }
351 
352 /**
353 * Initializing the pce table for a pattern.
354 * Stores non-ignorable collation keys.
355 * Table size will be estimated by the size of the pattern text. Table
356 * expansion will be perform as we go along. Adding 1 to ensure that the table
357 * size definitely increases.
358 * Internal method, status assumed to be a success.
359 * @param strsrch string search data
360 * @param status output error if any, caller to check status before calling
361 *               method, status assumed to be success when passed in.
362 * @return total number of expansions
363 */
364 static
initializePatternPCETable(UStringSearch * strsrch,UErrorCode * status)365 inline uint16_t initializePatternPCETable(UStringSearch *strsrch,
366                                           UErrorCode    *status)
367 {
368     UPattern *pattern            = &(strsrch->pattern);
369     uint32_t  pcetablesize       = INITIAL_ARRAY_SIZE_;
370     int64_t  *pcetable           = pattern->PCEBuffer;
371     uint32_t  patternlength      = pattern->textLength;
372     UCollationElements *coleiter = strsrch->utilIter;
373 
374     if (coleiter == NULL) {
375         coleiter = ucol_openElements(strsrch->collator, pattern->text,
376                                      patternlength, status);
377         // status will be checked in ucol_next(..) later and if it is an
378         // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be
379         // returned.
380         strsrch->utilIter = coleiter;
381     } else {
382         uprv_init_collIterate(strsrch->collator, pattern->text,
383                               pattern->textLength,
384                               &coleiter->iteratordata_);
385     }
386 
387     if (pattern->PCE != pcetable && pattern->PCE != NULL) {
388         uprv_free(pattern->PCE);
389     }
390 
391     uint16_t  offset = 0;
392     uint16_t  result = 0;
393     int64_t   pce;
394 
395     uprv_init_pce(coleiter);
396 
397     // ** Should processed CEs be signed or unsigned?
398     // ** (the rest of the code in this file seems to play fast-and-loose with
399     // **  whether a CE is signed or unsigned. For example, look at routine above this one.)
400     while ((pce = ucol_nextProcessed(coleiter, NULL, NULL, status)) != UCOL_PROCESSED_NULLORDER &&
401            U_SUCCESS(*status)) {
402         int64_t *temp = addTouint64_tArray(pcetable, offset, &pcetablesize,
403                               pce,
404                               patternlength - ucol_getOffset(coleiter) + 1,
405                               status);
406 
407         if (U_FAILURE(*status)) {
408             return 0;
409         }
410 
411         offset += 1;
412 
413         if (pcetable != temp && pcetable != pattern->PCEBuffer) {
414             uprv_free(pcetable);
415         }
416 
417         pcetable = temp;
418         //result += (uint16_t)(ucol_getMaxExpansion(coleiter, ce) - 1);
419     }
420 
421     pcetable[offset]   = 0;
422     pattern->PCE       = pcetable;
423     pattern->PCELength = offset;
424 
425     return result;
426 }
427 
428 /**
429 * Initializes the pattern struct.
430 * Internal method, status assumed to be success.
431 * @param strsrch UStringSearch data storage
432 * @param status output error if any, caller to check status before calling
433 *               method, status assumed to be success when passed in.
434 * @return expansionsize the total expansion size of the pattern
435 */
436 static
initializePattern(UStringSearch * strsrch,UErrorCode * status)437 inline int16_t initializePattern(UStringSearch *strsrch, UErrorCode *status)
438 {
439           UPattern   *pattern     = &(strsrch->pattern);
440     const UChar      *patterntext = pattern->text;
441           int32_t     length      = pattern->textLength;
442           int32_t index       = 0;
443 
444     // Since the strength is primary, accents are ignored in the pattern.
445     if (strsrch->strength == UCOL_PRIMARY) {
446     	pattern->hasPrefixAccents = 0;
447     	pattern->hasSuffixAccents = 0;
448     } else {
449 	    pattern->hasPrefixAccents = getFCD(patterntext, &index, length) >>
450 	                                                     SECOND_LAST_BYTE_SHIFT_;
451 	    index = length;
452 	    UTF_BACK_1(patterntext, 0, index);
453 	    pattern->hasSuffixAccents = getFCD(patterntext, &index, length) &
454 	                                                             LAST_BYTE_MASK_;
455     }
456 
457     // ** HACK **
458     if (strsrch->pattern.PCE != NULL) {
459         if (strsrch->pattern.PCE != strsrch->pattern.PCEBuffer) {
460             uprv_free(strsrch->pattern.PCE);
461         }
462 
463         strsrch->pattern.PCE = NULL;
464     }
465 
466     // since intializePattern is an internal method status is a success.
467     return initializePatternCETable(strsrch, status);
468 }
469 
470 /**
471 * Initializing shift tables, with the default values.
472 * If a corresponding default value is 0, the shift table is not set.
473 * @param shift table for forwards shift
474 * @param backshift table for backwards shift
475 * @param cetable table containing pattern ce
476 * @param cesize size of the pattern ces
477 * @param expansionsize total size of the expansions
478 * @param defaultforward the default forward value
479 * @param defaultbackward the default backward value
480 */
481 static
setShiftTable(int16_t shift[],int16_t backshift[],int32_t * cetable,int32_t cesize,int16_t expansionsize,int16_t defaultforward,int16_t defaultbackward)482 inline void setShiftTable(int16_t   shift[], int16_t backshift[],
483                           int32_t  *cetable, int32_t cesize,
484                           int16_t   expansionsize,
485                           int16_t   defaultforward,
486                           int16_t   defaultbackward)
487 {
488     // estimate the value to shift. to do that we estimate the smallest
489     // number of characters to give the relevant ces, ie approximately
490     // the number of ces minus their expansion, since expansions can come
491     // from a character.
492     int32_t count;
493     for (count = 0; count < MAX_TABLE_SIZE_; count ++) {
494         shift[count] = defaultforward;
495     }
496     cesize --; // down to the last index
497     for (count = 0; count < cesize; count ++) {
498         // number of ces from right of array to the count
499         int temp = defaultforward - count - 1;
500         shift[hash(cetable[count])] = temp > 1 ? temp : 1;
501     }
502     shift[hash(cetable[cesize])] = 1;
503     // for ignorables we just shift by one. see test examples.
504     shift[hash(0)] = 1;
505 
506     for (count = 0; count < MAX_TABLE_SIZE_; count ++) {
507         backshift[count] = defaultbackward;
508     }
509     for (count = cesize; count > 0; count --) {
510         // the original value count does not seem to work
511         backshift[hash(cetable[count])] = count > expansionsize ?
512                                           (int16_t)(count - expansionsize) : 1;
513     }
514     backshift[hash(cetable[0])] = 1;
515     backshift[hash(0)] = 1;
516 }
517 
518 /**
519 * Building of the pattern collation element list and the boyer moore strsrch
520 * table.
521 * The canonical match will only be performed after the default match fails.
522 * For both cases we need to remember the size of the composed and decomposed
523 * versions of the string. Since the Boyer-Moore shift calculations shifts by
524 * a number of characters in the text and tries to match the pattern from that
525 * offset, the shift value can not be too large in case we miss some
526 * characters. To choose a right shift size, we estimate the NFC form of the
527 * and use its size as a shift guide. The NFC form should be the small
528 * possible representation of the pattern. Anyways, we'll err on the smaller
529 * shift size. Hence the calculation for minlength.
530 * Canonical match will be performed slightly differently. We'll split the
531 * pattern into 3 parts, the prefix accents (PA), the middle string bounded by
532 * the first and last base character (MS), the ending accents (EA). Matches
533 * will be done on MS first, and only when we match MS then some processing
534 * will be required for the prefix and end accents in order to determine if
535 * they match PA and EA. Hence the default shift values
536 * for the canonical match will take the size of either end's accent into
537 * consideration. Forwards search will take the end accents into consideration
538 * for the default shift values and the backwards search will take the prefix
539 * accents into consideration.
540 * If pattern has no non-ignorable ce, we return a illegal argument error.
541 * Internal method, status assumed to be success.
542 * @param strsrch UStringSearch data storage
543 * @param status  for output errors if it occurs, status is assumed to be a
544 *                success when it is passed in.
545 */
546 static
initialize(UStringSearch * strsrch,UErrorCode * status)547 inline void initialize(UStringSearch *strsrch, UErrorCode *status)
548 {
549     int16_t expandlength  = initializePattern(strsrch, status);
550     if (U_SUCCESS(*status) && strsrch->pattern.CELength > 0) {
551         UPattern *pattern = &strsrch->pattern;
552         int32_t   cesize  = pattern->CELength;
553 
554         int16_t minlength = cesize > expandlength
555                             ? (int16_t)cesize - expandlength : 1;
556         pattern->defaultShiftSize    = minlength;
557         setShiftTable(pattern->shift, pattern->backShift, pattern->CE,
558                       cesize, expandlength, minlength, minlength);
559         return;
560     }
561     strsrch->pattern.defaultShiftSize = 0;
562 }
563 
564 #if BOYER_MOORE
565 /**
566 * Check to make sure that the match length is at the end of the character by
567 * using the breakiterator.
568 * @param strsrch string search data
569 * @param start target text start offset
570 * @param end target text end offset
571 */
572 static
checkBreakBoundary(const UStringSearch * strsrch,int32_t *,int32_t * end)573 void checkBreakBoundary(const UStringSearch *strsrch, int32_t * /*start*/,
574                                int32_t *end)
575 {
576 #if !UCONFIG_NO_BREAK_ITERATION
577     UBreakIterator *breakiterator = strsrch->search->internalBreakIter;
578     if (breakiterator) {
579 	    int32_t matchend = *end;
580 	    //int32_t matchstart = *start;
581 
582 	    if (!ubrk_isBoundary(breakiterator, matchend)) {
583 	    	*end = ubrk_following(breakiterator, matchend);
584         }
585 
586 	    /* Check the start of the matched text to make sure it doesn't have any accents
587 	     * before it.  This code may not be necessary and so it is commented out */
588 	    /*if (!ubrk_isBoundary(breakiterator, matchstart) && !ubrk_isBoundary(breakiterator, matchstart-1)) {
589 	    	*start = ubrk_preceding(breakiterator, matchstart);
590 	    }*/
591     }
592 #endif
593 }
594 
595 /**
596 * Determine whether the target text in UStringSearch bounded by the offset
597 * start and end is one or more whole units of text as
598 * determined by the breakiterator in UStringSearch.
599 * @param strsrch string search data
600 * @param start target text start offset
601 * @param end target text end offset
602 */
603 static
isBreakUnit(const UStringSearch * strsrch,int32_t start,int32_t end)604 UBool isBreakUnit(const UStringSearch *strsrch, int32_t start,
605                                int32_t    end)
606 {
607 #if !UCONFIG_NO_BREAK_ITERATION
608     UBreakIterator *breakiterator = strsrch->search->breakIter;
609     //TODO: Add here.
610     if (breakiterator) {
611         int32_t startindex = ubrk_first(breakiterator);
612         int32_t endindex   = ubrk_last(breakiterator);
613 
614         // out-of-range indexes are never boundary positions
615         if (start < startindex || start > endindex ||
616             end < startindex || end > endindex) {
617             return FALSE;
618         }
619         // otherwise, we can use following() on the position before the
620         // specified one and return true of the position we get back is the
621         // one the user specified
622         UBool result = (start == startindex ||
623                 ubrk_following(breakiterator, start - 1) == start) &&
624                (end == endindex ||
625                 ubrk_following(breakiterator, end - 1) == end);
626         if (result) {
627             // iterates the individual ces
628                   UCollationElements *coleiter  = strsrch->utilIter;
629             const UChar              *text      = strsrch->search->text +
630                                                                       start;
631                   UErrorCode          status    = U_ZERO_ERROR;
632             ucol_setText(coleiter, text, end - start, &status);
633             for (int32_t count = 0; count < strsrch->pattern.CELength;
634                  count ++) {
635                 int32_t ce = getCE(strsrch, ucol_next(coleiter, &status));
636                 if (ce == UCOL_IGNORABLE) {
637                     count --;
638                     continue;
639                 }
640                 if (U_FAILURE(status) || ce != strsrch->pattern.CE[count]) {
641                     return FALSE;
642                 }
643             }
644             int32_t nextce = ucol_next(coleiter, &status);
645             while (ucol_getOffset(coleiter) == (end - start)
646                    && getCE(strsrch, nextce) == UCOL_IGNORABLE) {
647                 nextce = ucol_next(coleiter, &status);
648             }
649             if (ucol_getOffset(coleiter) == (end - start)
650                 && nextce != UCOL_NULLORDER) {
651                 // extra collation elements at the end of the match
652                 return FALSE;
653             }
654         }
655         return result;
656     }
657 #endif
658     return TRUE;
659 }
660 
661 /**
662 * Getting the next base character offset if current offset is an accent,
663 * or the current offset if the current character contains a base character.
664 * accents the following base character will be returned
665 * @param text string
666 * @param textoffset current offset
667 * @param textlength length of text string
668 * @return the next base character or the current offset
669 *         if the current character is contains a base character.
670 */
671 static
getNextBaseOffset(const UChar * text,int32_t textoffset,int32_t textlength)672 inline int32_t getNextBaseOffset(const UChar       *text,
673                                            int32_t  textoffset,
674                                            int32_t      textlength)
675 {
676     if (textoffset < textlength) {
677         int32_t temp = textoffset;
678         if (getFCD(text, &temp, textlength) >> SECOND_LAST_BYTE_SHIFT_) {
679             while (temp < textlength) {
680                 int32_t result = temp;
681                 if ((getFCD(text, &temp, textlength) >>
682                      SECOND_LAST_BYTE_SHIFT_) == 0) {
683                     return result;
684                 }
685             }
686             return textlength;
687         }
688     }
689     return textoffset;
690 }
691 
692 /**
693 * Gets the next base character offset depending on the string search pattern
694 * data
695 * @param strsrch string search data
696 * @param textoffset current offset, one offset away from the last character
697 *                   to search for.
698 * @return start index of the next base character or the current offset
699 *         if the current character is contains a base character.
700 */
701 static
getNextUStringSearchBaseOffset(UStringSearch * strsrch,int32_t textoffset)702 inline int32_t getNextUStringSearchBaseOffset(UStringSearch *strsrch,
703                                                   int32_t    textoffset)
704 {
705     int32_t textlength = strsrch->search->textLength;
706     if (strsrch->pattern.hasSuffixAccents &&
707         textoffset < textlength) {
708               int32_t  temp       = textoffset;
709         const UChar       *text       = strsrch->search->text;
710         UTF_BACK_1(text, 0, temp);
711         if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) {
712             return getNextBaseOffset(text, textoffset, textlength);
713         }
714     }
715     return textoffset;
716 }
717 
718 /**
719 * Shifting the collation element iterator position forward to prepare for
720 * a following match. If the last character is a unsafe character, we'll only
721 * shift by 1 to capture contractions, normalization etc.
722 * Internal method, status assumed to be success.
723 * @param text strsrch string search data
724 * @param textoffset start text position to do search
725 * @param ce the text ce which failed the match.
726 * @param patternceindex index of the ce within the pattern ce buffer which
727 *        failed the match
728 * @return final offset
729 */
730 static
shiftForward(UStringSearch * strsrch,int32_t textoffset,int32_t ce,int32_t patternceindex)731 inline int32_t shiftForward(UStringSearch *strsrch,
732                                 int32_t    textoffset,
733                                 int32_t       ce,
734                                 int32_t        patternceindex)
735 {
736     UPattern *pattern = &(strsrch->pattern);
737     if (ce != UCOL_NULLORDER) {
738         int32_t shift = pattern->shift[hash(ce)];
739         // this is to adjust for characters in the middle of the
740         // substring for matching that failed.
741         int32_t adjust = pattern->CELength - patternceindex;
742         if (adjust > 1 && shift >= adjust) {
743             shift -= adjust - 1;
744         }
745         textoffset += shift;
746     }
747     else {
748         textoffset += pattern->defaultShiftSize;
749     }
750 
751     textoffset = getNextUStringSearchBaseOffset(strsrch, textoffset);
752     // check for unsafe characters
753     // * if it is the start or middle of a contraction: to be done after
754     //   a initial match is found
755     // * thai or lao base consonant character: similar to contraction
756     // * high surrogate character: similar to contraction
757     // * next character is a accent: shift to the next base character
758     return textoffset;
759 }
760 #endif // #if BOYER_MOORE
761 
762 /**
763 * sets match not found
764 * @param strsrch string search data
765 */
766 static
setMatchNotFound(UStringSearch * strsrch)767 inline void setMatchNotFound(UStringSearch *strsrch)
768 {
769     // this method resets the match result regardless of the error status.
770     strsrch->search->matchedIndex = USEARCH_DONE;
771     strsrch->search->matchedLength = 0;
772     if (strsrch->search->isForwardSearching) {
773         setColEIterOffset(strsrch->textIter, strsrch->search->textLength);
774     }
775     else {
776         setColEIterOffset(strsrch->textIter, 0);
777     }
778 }
779 
780 #if BOYER_MOORE
781 /**
782 * Gets the offset to the next safe point in text.
783 * ie. not the middle of a contraction, swappable characters or supplementary
784 * characters.
785 * @param collator collation sata
786 * @param text string to work with
787 * @param textoffset offset in string
788 * @param textlength length of text string
789 * @return offset to the next safe character
790 */
791 static
getNextSafeOffset(const UCollator * collator,const UChar * text,int32_t textoffset,int32_t textlength)792 inline int32_t getNextSafeOffset(const UCollator   *collator,
793                                      const UChar       *text,
794                                            int32_t  textoffset,
795                                            int32_t      textlength)
796 {
797     int32_t result = textoffset; // first contraction character
798     while (result != textlength && ucol_unsafeCP(text[result], collator)) {
799         result ++;
800     }
801     return result;
802 }
803 
804 /**
805 * This checks for accents in the potential match started with a .
806 * composite character.
807 * This is really painful... we have to check that composite character do not
808 * have any extra accents. We have to normalize the potential match and find
809 * the immediate decomposed character before the match.
810 * The first composite character would have been taken care of by the fcd
811 * checks in checkForwardExactMatch.
812 * This is the slow path after the fcd of the first character and
813 * the last character has been checked by checkForwardExactMatch and we
814 * determine that the potential match has extra non-ignorable preceding
815 * ces.
816 * E.g. looking for \u0301 acute in \u01FA A ring above and acute,
817 * checkExtraMatchAccent should fail since there is a middle ring in \u01FA
818 * Note here that accents checking are slow and cautioned in the API docs.
819 * Internal method, status assumed to be a success, caller should check status
820 * before calling this method
821 * @param strsrch string search data
822 * @param start index of the potential unfriendly composite character
823 * @param end index of the potential unfriendly composite character
824 * @param status output error status if any.
825 * @return TRUE if there is non-ignorable accents before at the beginning
826 *              of the match, FALSE otherwise.
827 */
828 
829 static
checkExtraMatchAccents(const UStringSearch * strsrch,int32_t start,int32_t end,UErrorCode * status)830 UBool checkExtraMatchAccents(const UStringSearch *strsrch, int32_t start,
831                                    int32_t    end,
832                                    UErrorCode    *status)
833 {
834     UBool result = FALSE;
835     if (strsrch->pattern.hasPrefixAccents) {
836               int32_t  length = end - start;
837               int32_t  offset = 0;
838         const UChar       *text   = strsrch->search->text + start;
839 
840         UTF_FWD_1(text, offset, length);
841         // we are only concerned with the first composite character
842         if (unorm_quickCheck(text, offset, UNORM_NFD, status) == UNORM_NO) {
843             int32_t safeoffset = getNextSafeOffset(strsrch->collator,
844                                                        text, 0, length);
845             if (safeoffset != length) {
846                 safeoffset ++;
847             }
848             UChar   *norm = NULL;
849             UChar    buffer[INITIAL_ARRAY_SIZE_];
850             int32_t  size = unorm_normalize(text, safeoffset, UNORM_NFD, 0,
851                                             buffer, INITIAL_ARRAY_SIZE_,
852                                             status);
853             if (U_FAILURE(*status)) {
854                 return FALSE;
855             }
856             if (size >= INITIAL_ARRAY_SIZE_) {
857                 norm = (UChar *)allocateMemory((size + 1) * sizeof(UChar),
858                                                status);
859                 // if allocation failed, status will be set to
860                 // U_MEMORY_ALLOCATION_ERROR and unorm_normalize internally
861                 // checks for it.
862                 size = unorm_normalize(text, safeoffset, UNORM_NFD, 0, norm,
863                                        size, status);
864                 if (U_FAILURE(*status) && norm != NULL) {
865                     uprv_free(norm);
866                     return FALSE;
867                 }
868             }
869             else {
870                 norm = buffer;
871             }
872 
873             UCollationElements *coleiter  = strsrch->utilIter;
874             ucol_setText(coleiter, norm, size, status);
875             uint32_t            firstce   = strsrch->pattern.CE[0];
876             UBool               ignorable = TRUE;
877             uint32_t            ce        = UCOL_IGNORABLE;
878             while (U_SUCCESS(*status) && ce != firstce && ce != (uint32_t)UCOL_NULLORDER) {
879                 offset = ucol_getOffset(coleiter);
880                 if (ce != firstce && ce != UCOL_IGNORABLE) {
881                     ignorable = FALSE;
882                 }
883                 ce = ucol_next(coleiter, status);
884             }
885             UChar32 codepoint;
886             UTF_PREV_CHAR(norm, 0, offset, codepoint);
887             result = !ignorable && (u_getCombiningClass(codepoint) != 0);
888 
889             if (norm != buffer) {
890                 uprv_free(norm);
891             }
892         }
893     }
894 
895     return result;
896 }
897 
898 /**
899 * Used by exact matches, checks if there are accents before the match.
900 * This is really painful... we have to check that composite characters at
901 * the start of the matches have to not have any extra accents.
902 * We check the FCD of the character first, if it starts with an accent and
903 * the first pattern ce does not match the first ce of the character, we bail.
904 * Otherwise we try normalizing the first composite
905 * character and find the immediate decomposed character before the match to
906 * see if it is an non-ignorable accent.
907 * Now normalizing the first composite character is enough because we ensure
908 * that when the match is passed in here with extra beginning ces, the
909 * first or last ce that match has to occur within the first character.
910 * E.g. looking for \u0301 acute in \u01FA A ring above and acute,
911 * checkExtraMatchAccent should fail since there is a middle ring in \u01FA
912 * Note here that accents checking are slow and cautioned in the API docs.
913 * @param strsrch string search data
914 * @param start offset
915 * @param end offset
916 * @return TRUE if there are accents on either side of the match,
917 *         FALSE otherwise
918 */
919 static
hasAccentsBeforeMatch(const UStringSearch * strsrch,int32_t start,int32_t end)920 UBool hasAccentsBeforeMatch(const UStringSearch *strsrch, int32_t start,
921                                   int32_t    end)
922 {
923     if (strsrch->pattern.hasPrefixAccents) {
924         UCollationElements *coleiter  = strsrch->textIter;
925         UErrorCode          status    = U_ZERO_ERROR;
926         // we have been iterating forwards previously
927         uint32_t            ignorable = TRUE;
928         int32_t             firstce   = strsrch->pattern.CE[0];
929 
930         setColEIterOffset(coleiter, start);
931         int32_t ce  = getCE(strsrch, ucol_next(coleiter, &status));
932         if (U_FAILURE(status)) {
933             return TRUE;
934         }
935         while (ce != firstce) {
936             if (ce != UCOL_IGNORABLE) {
937                 ignorable = FALSE;
938             }
939             ce = getCE(strsrch, ucol_next(coleiter, &status));
940             if (U_FAILURE(status) || ce == UCOL_NULLORDER) {
941                 return TRUE;
942             }
943         }
944         if (!ignorable && inNormBuf(coleiter)) {
945             // within normalization buffer, discontiguous handled here
946             return TRUE;
947         }
948 
949         // within text
950         int32_t temp = start;
951         // original code
952         // accent = (getFCD(strsrch->search->text, &temp,
953         //                  strsrch->search->textLength)
954         //            >> SECOND_LAST_BYTE_SHIFT_);
955         // however this code does not work well with VC7 .net in release mode.
956         // maybe the inlines for getFCD combined with shifting has bugs in
957         // VC7. anyways this is a work around.
958         UBool accent = getFCD(strsrch->search->text, &temp,
959                               strsrch->search->textLength) > 0xFF;
960         if (!accent) {
961             return checkExtraMatchAccents(strsrch, start, end, &status);
962         }
963         if (!ignorable) {
964             return TRUE;
965         }
966         if (start > 0) {
967             temp = start;
968             UTF_BACK_1(strsrch->search->text, 0, temp);
969             if (getFCD(strsrch->search->text, &temp,
970                        strsrch->search->textLength) & LAST_BYTE_MASK_) {
971                 setColEIterOffset(coleiter, start);
972                 ce = ucol_previous(coleiter, &status);
973                 if (U_FAILURE(status) ||
974                     (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE)) {
975                     return TRUE;
976                 }
977             }
978         }
979     }
980 
981     return FALSE;
982 }
983 
984 /**
985 * Used by exact matches, checks if there are accents bounding the match.
986 * Note this is the initial boundary check. If the potential match
987 * starts or ends with composite characters, the accents in those
988 * characters will be determined later.
989 * Not doing backwards iteration here, since discontiguos contraction for
990 * backwards collation element iterator, use up too many characters.
991 * E.g. looking for \u030A ring in \u01FA A ring above and acute,
992 * should fail since there is a acute at the end of \u01FA
993 * Note here that accents checking are slow and cautioned in the API docs.
994 * @param strsrch string search data
995 * @param start offset of match
996 * @param end end offset of the match
997 * @return TRUE if there are accents on either side of the match,
998 *         FALSE otherwise
999 */
1000 static
hasAccentsAfterMatch(const UStringSearch * strsrch,int32_t start,int32_t end)1001 UBool hasAccentsAfterMatch(const UStringSearch *strsrch, int32_t start,
1002                                  int32_t    end)
1003 {
1004     if (strsrch->pattern.hasSuffixAccents) {
1005         const UChar       *text       = strsrch->search->text;
1006               int32_t  temp       = end;
1007               int32_t      textlength = strsrch->search->textLength;
1008         UTF_BACK_1(text, 0, temp);
1009         if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) {
1010             int32_t             firstce  = strsrch->pattern.CE[0];
1011             UCollationElements *coleiter = strsrch->textIter;
1012             UErrorCode          status   = U_ZERO_ERROR;
1013 			int32_t ce;
1014             setColEIterOffset(coleiter, start);
1015             while ((ce = getCE(strsrch, ucol_next(coleiter, &status))) != firstce) {
1016                 if (U_FAILURE(status) || ce == UCOL_NULLORDER) {
1017                     return TRUE;
1018                 }
1019             }
1020             int32_t count = 1;
1021             while (count < strsrch->pattern.CELength) {
1022                 if (getCE(strsrch, ucol_next(coleiter, &status))
1023                     == UCOL_IGNORABLE) {
1024                     // Thai can give an ignorable here.
1025                     count --;
1026                 }
1027                 if (U_FAILURE(status)) {
1028                     return TRUE;
1029                 }
1030                 count ++;
1031             }
1032 
1033 			ce = ucol_next(coleiter, &status);
1034             if (U_FAILURE(status)) {
1035                 return TRUE;
1036             }
1037             if (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE) {
1038             	ce = getCE(strsrch, ce);
1039             }
1040             if (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE) {
1041                 if (ucol_getOffset(coleiter) <= end) {
1042                     return TRUE;
1043                 }
1044                 if (getFCD(text, &end, textlength) >> SECOND_LAST_BYTE_SHIFT_) {
1045                     return TRUE;
1046                 }
1047             }
1048         }
1049     }
1050     return FALSE;
1051 }
1052 #endif // #if BOYER_MOORE
1053 
1054 /**
1055 * Checks if the offset runs out of the text string
1056 * @param offset
1057 * @param textlength of the text string
1058 * @return TRUE if offset is out of bounds, FALSE otherwise
1059 */
1060 static
isOutOfBounds(int32_t textlength,int32_t offset)1061 inline UBool isOutOfBounds(int32_t textlength, int32_t offset)
1062 {
1063     return offset < 0 || offset > textlength;
1064 }
1065 
1066 /**
1067 * Checks for identical match
1068 * @param strsrch string search data
1069 * @param start offset of possible match
1070 * @param end offset of possible match
1071 * @return TRUE if identical match is found
1072 */
1073 static
checkIdentical(const UStringSearch * strsrch,int32_t start,int32_t end)1074 inline UBool checkIdentical(const UStringSearch *strsrch, int32_t start,
1075                                   int32_t    end)
1076 {
1077     UChar t2[32], p2[32];
1078     int32_t length = end - start;
1079     if (strsrch->strength != UCOL_IDENTICAL) {
1080         return TRUE;
1081     }
1082 
1083     UErrorCode status = U_ZERO_ERROR, status2 = U_ZERO_ERROR;
1084     int32_t decomplength = unorm_decompose(t2, LENGTHOF(t2),
1085                                        strsrch->search->text + start, length,
1086                                        FALSE, 0, &status);
1087     // use separate status2 in case of buffer overflow
1088     if (decomplength != unorm_decompose(p2, LENGTHOF(p2),
1089                                         strsrch->pattern.text,
1090                                         strsrch->pattern.textLength,
1091                                         FALSE, 0, &status2)) {
1092         return FALSE; // lengths are different
1093     }
1094 
1095     // compare contents
1096     UChar *text, *pattern;
1097     if(U_SUCCESS(status)) {
1098         text = t2;
1099         pattern = p2;
1100     } else if(status==U_BUFFER_OVERFLOW_ERROR) {
1101         status = U_ZERO_ERROR;
1102         // allocate one buffer for both decompositions
1103         text = (UChar *)uprv_malloc(decomplength * 2 * U_SIZEOF_UCHAR);
1104         // Check for allocation failure.
1105         if (text == NULL) {
1106         	return FALSE;
1107         }
1108         pattern = text + decomplength;
1109         unorm_decompose(text, decomplength, strsrch->search->text + start,
1110                         length, FALSE, 0, &status);
1111         unorm_decompose(pattern, decomplength, strsrch->pattern.text,
1112                         strsrch->pattern.textLength, FALSE, 0, &status);
1113     } else {
1114         // NFD failed, make sure that u_memcmp() does not overrun t2 & p2
1115         // and that we don't uprv_free() an undefined text pointer
1116         text = pattern = t2;
1117         decomplength = 0;
1118     }
1119     UBool result = (UBool)(u_memcmp(pattern, text, decomplength) == 0);
1120     if(text != t2) {
1121         uprv_free(text);
1122     }
1123     // return FALSE if NFD failed
1124     return U_SUCCESS(status) && result;
1125 }
1126 
1127 #if BOYER_MOORE
1128 /**
1129 * Checks to see if the match is repeated
1130 * @param strsrch string search data
1131 * @param start new match start index
1132 * @param end new match end index
1133 * @return TRUE if the the match is repeated, FALSE otherwise
1134 */
1135 static
checkRepeatedMatch(UStringSearch * strsrch,int32_t start,int32_t end)1136 inline UBool checkRepeatedMatch(UStringSearch *strsrch,
1137                                 int32_t    start,
1138                                 int32_t    end)
1139 {
1140     int32_t lastmatchindex = strsrch->search->matchedIndex;
1141     UBool       result;
1142     if (lastmatchindex == USEARCH_DONE) {
1143         return FALSE;
1144     }
1145     if (strsrch->search->isForwardSearching) {
1146         result = start <= lastmatchindex;
1147     }
1148     else {
1149         result = start >= lastmatchindex;
1150     }
1151     if (!result && !strsrch->search->isOverlap) {
1152         if (strsrch->search->isForwardSearching) {
1153             result = start < lastmatchindex + strsrch->search->matchedLength;
1154         }
1155         else {
1156             result = end > lastmatchindex;
1157         }
1158     }
1159     return result;
1160 }
1161 
1162 /**
1163 * Gets the collation element iterator's current offset.
1164 * @param coleiter collation element iterator
1165 * @param forwards flag TRUE if we are moving in th forwards direction
1166 * @return current offset
1167 */
1168 static
getColElemIterOffset(const UCollationElements * coleiter,UBool forwards)1169 inline int32_t getColElemIterOffset(const UCollationElements *coleiter,
1170                                               UBool               forwards)
1171 {
1172     int32_t result = ucol_getOffset(coleiter);
1173     // intricacies of the the backwards collation element iterator
1174     if (FALSE && !forwards && inNormBuf(coleiter) && !isFCDPointerNull(coleiter)) {
1175         result ++;
1176     }
1177     return result;
1178 }
1179 
1180 /**
1181 * Checks match for contraction.
1182 * If the match ends with a partial contraction we fail.
1183 * If the match starts too far off (because of backwards iteration) we try to
1184 * chip off the extra characters depending on whether a breakiterator has
1185 * been used.
1186 * Internal method, error assumed to be success, caller has to check status
1187 * before calling this method.
1188 * @param strsrch string search data
1189 * @param start offset of potential match, to be modified if necessary
1190 * @param end offset of potential match, to be modified if necessary
1191 * @param status output error status if any
1192 * @return TRUE if match passes the contraction test, FALSE otherwise
1193 */
1194 
1195 static
checkNextExactContractionMatch(UStringSearch * strsrch,int32_t * start,int32_t * end,UErrorCode * status)1196 UBool checkNextExactContractionMatch(UStringSearch *strsrch,
1197                                      int32_t   *start,
1198                                      int32_t   *end, UErrorCode  *status)
1199 {
1200           UCollationElements *coleiter   = strsrch->textIter;
1201           int32_t             textlength = strsrch->search->textLength;
1202           int32_t             temp       = *start;
1203     const UCollator          *collator   = strsrch->collator;
1204     const UChar              *text       = strsrch->search->text;
1205     // This part checks if either ends of the match contains potential
1206     // contraction. If so we'll have to iterate through them
1207     // The start contraction needs to be checked since ucol_previous dumps
1208     // all characters till the first safe character into the buffer.
1209     // *start + 1 is used to test for the unsafe characters instead of *start
1210     // because ucol_prev takes all unsafe characters till the first safe
1211     // character ie *start. so by testing *start + 1, we can estimate if
1212     // excess prefix characters has been included in the potential search
1213     // results.
1214     if ((*end < textlength && ucol_unsafeCP(text[*end], collator)) ||
1215         (*start + 1 < textlength
1216          && ucol_unsafeCP(text[*start + 1], collator))) {
1217         int32_t expansion  = getExpansionPrefix(coleiter);
1218         UBool   expandflag = expansion > 0;
1219         setColEIterOffset(coleiter, *start);
1220         while (expansion > 0) {
1221             // getting rid of the redundant ce, caused by setOffset.
1222             // since backward contraction/expansion may have extra ces if we
1223             // are in the normalization buffer, hasAccentsBeforeMatch would
1224             // have taken care of it.
1225             // E.g. the character \u01FA will have an expansion of 3, but if
1226             // we are only looking for acute and ring \u030A and \u0301, we'll
1227             // have to skip the first ce in the expansion buffer.
1228             ucol_next(coleiter, status);
1229             if (U_FAILURE(*status)) {
1230                 return FALSE;
1231             }
1232             if (ucol_getOffset(coleiter) != temp) {
1233                 *start = temp;
1234                 temp  = ucol_getOffset(coleiter);
1235             }
1236             expansion --;
1237         }
1238 
1239         int32_t  *patternce       = strsrch->pattern.CE;
1240         int32_t   patterncelength = strsrch->pattern.CELength;
1241         int32_t   count           = 0;
1242         while (count < patterncelength) {
1243             int32_t ce = getCE(strsrch, ucol_next(coleiter, status));
1244             if (ce == UCOL_IGNORABLE) {
1245                 continue;
1246             }
1247             if (expandflag && count == 0 && ucol_getOffset(coleiter) != temp) {
1248                 *start = temp;
1249                 temp   = ucol_getOffset(coleiter);
1250             }
1251             if (U_FAILURE(*status) || ce != patternce[count]) {
1252                 (*end) ++;
1253                 *end = getNextUStringSearchBaseOffset(strsrch, *end);
1254                 return FALSE;
1255             }
1256             count ++;
1257         }
1258     }
1259     return TRUE;
1260 }
1261 
1262 /**
1263 * Checks and sets the match information if found.
1264 * Checks
1265 * <ul>
1266 * <li> the potential match does not repeat the previous match
1267 * <li> boundaries are correct
1268 * <li> exact matches has no extra accents
1269 * <li> identical matchesb
1270 * <li> potential match does not end in the middle of a contraction
1271 * <\ul>
1272 * Otherwise the offset will be shifted to the next character.
1273 * Internal method, status assumed to be success, caller has to check status
1274 * before calling this method.
1275 * @param strsrch string search data
1276 * @param textoffset offset in the collation element text. the returned value
1277 *        will be the truncated end offset of the match or the new start
1278 *        search offset.
1279 * @param status output error status if any
1280 * @return TRUE if the match is valid, FALSE otherwise
1281 */
1282 static
checkNextExactMatch(UStringSearch * strsrch,int32_t * textoffset,UErrorCode * status)1283 inline UBool checkNextExactMatch(UStringSearch *strsrch,
1284                                  int32_t   *textoffset, UErrorCode *status)
1285 {
1286     UCollationElements *coleiter = strsrch->textIter;
1287     int32_t         start    = getColElemIterOffset(coleiter, FALSE);
1288 
1289     if (!checkNextExactContractionMatch(strsrch, &start, textoffset, status)) {
1290         return FALSE;
1291     }
1292 
1293     // this totally matches, however we need to check if it is repeating
1294     if (!isBreakUnit(strsrch, start, *textoffset) ||
1295         checkRepeatedMatch(strsrch, start, *textoffset) ||
1296         hasAccentsBeforeMatch(strsrch, start, *textoffset) ||
1297         !checkIdentical(strsrch, start, *textoffset) ||
1298         hasAccentsAfterMatch(strsrch, start, *textoffset)) {
1299 
1300         (*textoffset) ++;
1301         *textoffset = getNextUStringSearchBaseOffset(strsrch, *textoffset);
1302         return FALSE;
1303     }
1304 
1305     //Add breakiterator boundary check for primary strength search.
1306     if (!strsrch->search->breakIter && strsrch->strength == UCOL_PRIMARY) {
1307     	checkBreakBoundary(strsrch, &start, textoffset);
1308     }
1309 
1310     // totally match, we will get rid of the ending ignorables.
1311     strsrch->search->matchedIndex  = start;
1312     strsrch->search->matchedLength = *textoffset - start;
1313     return TRUE;
1314 }
1315 
1316 /**
1317 * Getting the previous base character offset, or the current offset if the
1318 * current character is a base character
1319 * @param text string
1320 * @param textoffset one offset after the current character
1321 * @return the offset of the next character after the base character or the first
1322 *         composed character with accents
1323 */
1324 static
getPreviousBaseOffset(const UChar * text,int32_t textoffset)1325 inline int32_t getPreviousBaseOffset(const UChar       *text,
1326                                                int32_t  textoffset)
1327 {
1328     if (textoffset > 0) {
1329         for (;;) {
1330             int32_t result = textoffset;
1331             UTF_BACK_1(text, 0, textoffset);
1332             int32_t temp = textoffset;
1333             uint16_t fcd = getFCD(text, &temp, result);
1334             if ((fcd >> SECOND_LAST_BYTE_SHIFT_) == 0) {
1335                 if (fcd & LAST_BYTE_MASK_) {
1336                     return textoffset;
1337                 }
1338                 return result;
1339             }
1340             if (textoffset == 0) {
1341                 return 0;
1342             }
1343         }
1344     }
1345     return textoffset;
1346 }
1347 
1348 /**
1349 * Getting the indexes of the accents that are not blocked in the argument
1350 * accent array
1351 * @param accents array of accents in nfd terminated by a 0.
1352 * @param accentsindex array of indexes of the accents that are not blocked
1353 */
1354 static
getUnblockedAccentIndex(UChar * accents,int32_t * accentsindex)1355 inline int getUnblockedAccentIndex(UChar *accents, int32_t *accentsindex)
1356 {
1357     int32_t index     = 0;
1358     int32_t     length    = u_strlen(accents);
1359     UChar32     codepoint = 0;
1360     int         cclass    = 0;
1361     int         result    = 0;
1362     int32_t temp;
1363     while (index < length) {
1364         temp = index;
1365         UTF_NEXT_CHAR(accents, index, length, codepoint);
1366         if (u_getCombiningClass(codepoint) != cclass) {
1367             cclass        = u_getCombiningClass(codepoint);
1368             accentsindex[result] = temp;
1369             result ++;
1370         }
1371     }
1372     accentsindex[result] = length;
1373     return result;
1374 }
1375 
1376 /**
1377 * Appends 3 UChar arrays to a destination array.
1378 * Creates a new array if we run out of space. The caller will have to
1379 * manually deallocate the newly allocated array.
1380 * Internal method, status assumed to be success, caller has to check status
1381 * before calling this method. destination not to be NULL and has at least
1382 * size destinationlength.
1383 * @param destination target array
1384 * @param destinationlength target array size, returning the appended length
1385 * @param source1 null-terminated first array
1386 * @param source2 second array
1387 * @param source2length length of seond array
1388 * @param source3 null-terminated third array
1389 * @param status error status if any
1390 * @return new destination array, destination if there was no new allocation
1391 */
1392 static
addToUCharArray(UChar * destination,int32_t * destinationlength,const UChar * source1,const UChar * source2,int32_t source2length,const UChar * source3,UErrorCode * status)1393 inline UChar * addToUCharArray(      UChar      *destination,
1394                                      int32_t    *destinationlength,
1395                                const UChar      *source1,
1396                                const UChar      *source2,
1397                                      int32_t     source2length,
1398                                const UChar      *source3,
1399                                      UErrorCode *status)
1400 {
1401     int32_t source1length = source1 ? u_strlen(source1) : 0;
1402     int32_t source3length = source3 ? u_strlen(source3) : 0;
1403     if (*destinationlength < source1length + source2length + source3length +
1404                                                                            1)
1405     {
1406         destination = (UChar *)allocateMemory(
1407           (source1length + source2length + source3length + 1) * sizeof(UChar),
1408           status);
1409         // if error allocating memory, status will be
1410         // U_MEMORY_ALLOCATION_ERROR
1411         if (U_FAILURE(*status)) {
1412             *destinationlength = 0;
1413             return NULL;
1414         }
1415     }
1416     if (source1length != 0) {
1417         uprv_memcpy(destination, source1, sizeof(UChar) * source1length);
1418     }
1419     if (source2length != 0) {
1420         uprv_memcpy(destination + source1length, source2,
1421                     sizeof(UChar) * source2length);
1422     }
1423     if (source3length != 0) {
1424         uprv_memcpy(destination + source1length + source2length, source3,
1425                     sizeof(UChar) * source3length);
1426     }
1427     *destinationlength = source1length + source2length + source3length;
1428     return destination;
1429 }
1430 
1431 /**
1432 * Running through a collation element iterator to see if the contents matches
1433 * pattern in string search data
1434 * @param strsrch string search data
1435 * @param coleiter collation element iterator
1436 * @return TRUE if a match if found, FALSE otherwise
1437 */
1438 static
checkCollationMatch(const UStringSearch * strsrch,UCollationElements * coleiter)1439 inline UBool checkCollationMatch(const UStringSearch      *strsrch,
1440                                        UCollationElements *coleiter)
1441 {
1442     int         patternceindex = strsrch->pattern.CELength;
1443     int32_t    *patternce      = strsrch->pattern.CE;
1444     UErrorCode  status = U_ZERO_ERROR;
1445     while (patternceindex > 0) {
1446         int32_t ce = getCE(strsrch, ucol_next(coleiter, &status));
1447         if (ce == UCOL_IGNORABLE) {
1448             continue;
1449         }
1450         if (U_FAILURE(status) || ce != *patternce) {
1451             return FALSE;
1452         }
1453         patternce ++;
1454         patternceindex --;
1455     }
1456     return TRUE;
1457 }
1458 
1459 /**
1460 * Rearranges the front accents to try matching.
1461 * Prefix accents in the text will be grouped according to their combining
1462 * class and the groups will be mixed and matched to try find the perfect
1463 * match with the pattern.
1464 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
1465 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
1466 *         "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
1467 *         "\u0301\u0325".
1468 * step 2: check if any of the generated substrings matches the pattern.
1469 * Internal method, status is assumed to be success, caller has to check status
1470 * before calling this method.
1471 * @param strsrch string search match
1472 * @param start first offset of the accents to start searching
1473 * @param end start of the last accent set
1474 * @param status output error status if any
1475 * @return USEARCH_DONE if a match is not found, otherwise return the starting
1476 *         offset of the match. Note this start includes all preceding accents.
1477 */
1478 static
doNextCanonicalPrefixMatch(UStringSearch * strsrch,int32_t start,int32_t end,UErrorCode * status)1479 int32_t doNextCanonicalPrefixMatch(UStringSearch *strsrch,
1480                                        int32_t    start,
1481                                        int32_t    end,
1482                                        UErrorCode    *status)
1483 {
1484     const UChar       *text       = strsrch->search->text;
1485           int32_t      textlength = strsrch->search->textLength;
1486           int32_t  tempstart  = start;
1487 
1488     if ((getFCD(text, &tempstart, textlength) & LAST_BYTE_MASK_) == 0) {
1489         // die... failed at a base character
1490         return USEARCH_DONE;
1491     }
1492 
1493     int32_t offset = getNextBaseOffset(text, tempstart, textlength);
1494     start = getPreviousBaseOffset(text, tempstart);
1495 
1496     UChar       accents[INITIAL_ARRAY_SIZE_];
1497     // normalizing the offensive string
1498     unorm_normalize(text + start, offset - start, UNORM_NFD, 0, accents,
1499                     INITIAL_ARRAY_SIZE_, status);
1500     if (U_FAILURE(*status)) {
1501         return USEARCH_DONE;
1502     }
1503 
1504     int32_t         accentsindex[INITIAL_ARRAY_SIZE_];
1505     int32_t         accentsize = getUnblockedAccentIndex(accents,
1506                                                                  accentsindex);
1507     int32_t         count      = (2 << (accentsize - 1)) - 1;
1508     UChar               buffer[INITIAL_ARRAY_SIZE_];
1509     UCollationElements *coleiter   = strsrch->utilIter;
1510     while (U_SUCCESS(*status) && count > 0) {
1511         UChar *rearrange = strsrch->canonicalPrefixAccents;
1512         // copy the base characters
1513         for (int k = 0; k < accentsindex[0]; k ++) {
1514             *rearrange ++ = accents[k];
1515         }
1516         // forming all possible canonical rearrangement by dropping
1517         // sets of accents
1518         for (int i = 0; i <= accentsize - 1; i ++) {
1519             int32_t mask = 1 << (accentsize - i - 1);
1520             if (count & mask) {
1521                 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
1522                     *rearrange ++ = accents[j];
1523                 }
1524             }
1525         }
1526         *rearrange = 0;
1527         int32_t  matchsize = INITIAL_ARRAY_SIZE_;
1528         UChar   *match     = addToUCharArray(buffer, &matchsize,
1529                                            strsrch->canonicalPrefixAccents,
1530                                            strsrch->search->text + offset,
1531                                            end - offset,
1532                                            strsrch->canonicalSuffixAccents,
1533                                            status);
1534 
1535         // if status is a failure, ucol_setText does nothing.
1536         // run the collator iterator through this match
1537         ucol_setText(coleiter, match, matchsize, status);
1538         if (U_SUCCESS(*status)) {
1539             if (checkCollationMatch(strsrch, coleiter)) {
1540                 if (match != buffer) {
1541                     uprv_free(match);
1542                 }
1543                 return start;
1544             }
1545         }
1546         count --;
1547     }
1548     return USEARCH_DONE;
1549 }
1550 
1551 /**
1552 * Gets the offset to the safe point in text before textoffset.
1553 * ie. not the middle of a contraction, swappable characters or supplementary
1554 * characters.
1555 * @param collator collation sata
1556 * @param text string to work with
1557 * @param textoffset offset in string
1558 * @param textlength length of text string
1559 * @return offset to the previous safe character
1560 */
1561 static
getPreviousSafeOffset(const UCollator * collator,const UChar * text,int32_t textoffset)1562 inline uint32_t getPreviousSafeOffset(const UCollator   *collator,
1563                                       const UChar       *text,
1564                                             int32_t  textoffset)
1565 {
1566     int32_t result = textoffset; // first contraction character
1567     while (result != 0 && ucol_unsafeCP(text[result - 1], collator)) {
1568         result --;
1569     }
1570     if (result != 0) {
1571         // the first contraction character is consider unsafe here
1572         result --;
1573     }
1574     return result;
1575 }
1576 
1577 /**
1578 * Cleaning up after we passed the safe zone
1579 * @param strsrch string search data
1580 * @param safetext safe text array
1581 * @param safebuffer safe text buffer
1582 * @param coleiter collation element iterator for safe text
1583 */
1584 static
cleanUpSafeText(const UStringSearch * strsrch,UChar * safetext,UChar * safebuffer)1585 inline void cleanUpSafeText(const UStringSearch *strsrch, UChar *safetext,
1586                                   UChar         *safebuffer)
1587 {
1588     if (safetext != safebuffer && safetext != strsrch->canonicalSuffixAccents)
1589     {
1590        uprv_free(safetext);
1591     }
1592 }
1593 
1594 /**
1595 * Take the rearranged end accents and tries matching. If match failed at
1596 * a seperate preceding set of accents (seperated from the rearranged on by
1597 * at least a base character) then we rearrange the preceding accents and
1598 * tries matching again.
1599 * We allow skipping of the ends of the accent set if the ces do not match.
1600 * However if the failure is found before the accent set, it fails.
1601 * Internal method, status assumed to be success, caller has to check status
1602 * before calling this method.
1603 * @param strsrch string search data
1604 * @param textoffset of the start of the rearranged accent
1605 * @param status output error status if any
1606 * @return USEARCH_DONE if a match is not found, otherwise return the starting
1607 *         offset of the match. Note this start includes all preceding accents.
1608 */
1609 static
doNextCanonicalSuffixMatch(UStringSearch * strsrch,int32_t textoffset,UErrorCode * status)1610 int32_t doNextCanonicalSuffixMatch(UStringSearch *strsrch,
1611                                        int32_t    textoffset,
1612                                        UErrorCode    *status)
1613 {
1614     const UChar              *text           = strsrch->search->text;
1615     const UCollator          *collator       = strsrch->collator;
1616           int32_t             safelength     = 0;
1617           UChar              *safetext;
1618           int32_t             safetextlength;
1619           UChar               safebuffer[INITIAL_ARRAY_SIZE_];
1620           UCollationElements *coleiter       = strsrch->utilIter;
1621           int32_t         safeoffset     = textoffset;
1622 
1623     if (textoffset != 0 && ucol_unsafeCP(strsrch->canonicalSuffixAccents[0],
1624                                          collator)) {
1625         safeoffset     = getPreviousSafeOffset(collator, text, textoffset);
1626         safelength     = textoffset - safeoffset;
1627         safetextlength = INITIAL_ARRAY_SIZE_;
1628         safetext       = addToUCharArray(safebuffer, &safetextlength, NULL,
1629                                          text + safeoffset, safelength,
1630                                          strsrch->canonicalSuffixAccents,
1631                                          status);
1632     }
1633     else {
1634         safetextlength = u_strlen(strsrch->canonicalSuffixAccents);
1635         safetext       = strsrch->canonicalSuffixAccents;
1636     }
1637 
1638     // if status is a failure, ucol_setText does nothing
1639     ucol_setText(coleiter, safetext, safetextlength, status);
1640     // status checked in loop below
1641 
1642     int32_t  *ce        = strsrch->pattern.CE;
1643     int32_t   celength  = strsrch->pattern.CELength;
1644     int       ceindex   = celength - 1;
1645     UBool     isSafe    = TRUE; // indication flag for position in safe zone
1646 
1647     while (ceindex >= 0) {
1648         int32_t textce = ucol_previous(coleiter, status);
1649         if (U_FAILURE(*status)) {
1650             if (isSafe) {
1651                 cleanUpSafeText(strsrch, safetext, safebuffer);
1652             }
1653             return USEARCH_DONE;
1654         }
1655         if (textce == UCOL_NULLORDER) {
1656             // check if we have passed the safe buffer
1657             if (coleiter == strsrch->textIter) {
1658                 cleanUpSafeText(strsrch, safetext, safebuffer);
1659                 return USEARCH_DONE;
1660             }
1661             cleanUpSafeText(strsrch, safetext, safebuffer);
1662             safetext = safebuffer;
1663             coleiter = strsrch->textIter;
1664             setColEIterOffset(coleiter, safeoffset);
1665             // status checked at the start of the loop
1666             isSafe = FALSE;
1667             continue;
1668         }
1669         textce = getCE(strsrch, textce);
1670         if (textce != UCOL_IGNORABLE && textce != ce[ceindex]) {
1671             // do the beginning stuff
1672             int32_t failedoffset = getColElemIterOffset(coleiter, FALSE);
1673             if (isSafe && failedoffset >= safelength) {
1674                 // alas... no hope. failed at rearranged accent set
1675                 cleanUpSafeText(strsrch, safetext, safebuffer);
1676                 return USEARCH_DONE;
1677             }
1678             else {
1679                 if (isSafe) {
1680                     failedoffset += safeoffset;
1681                     cleanUpSafeText(strsrch, safetext, safebuffer);
1682                 }
1683 
1684                 // try rearranging the front accents
1685                 int32_t result = doNextCanonicalPrefixMatch(strsrch,
1686                                         failedoffset, textoffset, status);
1687                 if (result != USEARCH_DONE) {
1688                     // if status is a failure, ucol_setOffset does nothing
1689                     setColEIterOffset(strsrch->textIter, result);
1690                 }
1691                 if (U_FAILURE(*status)) {
1692                     return USEARCH_DONE;
1693                 }
1694                 return result;
1695             }
1696         }
1697         if (textce == ce[ceindex]) {
1698             ceindex --;
1699         }
1700     }
1701     // set offset here
1702     if (isSafe) {
1703         int32_t result     = getColElemIterOffset(coleiter, FALSE);
1704         // sets the text iterator here with the correct expansion and offset
1705         int32_t    leftoverces = getExpansionPrefix(coleiter);
1706         cleanUpSafeText(strsrch, safetext, safebuffer);
1707         if (result >= safelength) {
1708             result = textoffset;
1709         }
1710         else {
1711             result += safeoffset;
1712         }
1713         setColEIterOffset(strsrch->textIter, result);
1714         strsrch->textIter->iteratordata_.toReturn =
1715                        setExpansionPrefix(strsrch->textIter, leftoverces);
1716         return result;
1717     }
1718 
1719     return ucol_getOffset(coleiter);
1720 }
1721 
1722 /**
1723 * Trying out the substring and sees if it can be a canonical match.
1724 * This will try normalizing the end accents and arranging them into canonical
1725 * equivalents and check their corresponding ces with the pattern ce.
1726 * Suffix accents in the text will be grouped according to their combining
1727 * class and the groups will be mixed and matched to try find the perfect
1728 * match with the pattern.
1729 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
1730 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
1731 *         "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
1732 *         "\u0301\u0325".
1733 * step 2: check if any of the generated substrings matches the pattern.
1734 * Internal method, status assumed to be success, caller has to check status
1735 * before calling this method.
1736 * @param strsrch string search data
1737 * @param textoffset end offset in the collation element text that ends with
1738 *                   the accents to be rearranged
1739 * @param status error status if any
1740 * @return TRUE if the match is valid, FALSE otherwise
1741 */
1742 static
doNextCanonicalMatch(UStringSearch * strsrch,int32_t textoffset,UErrorCode * status)1743 UBool doNextCanonicalMatch(UStringSearch *strsrch,
1744                            int32_t    textoffset,
1745                            UErrorCode    *status)
1746 {
1747     const UChar       *text = strsrch->search->text;
1748           int32_t  temp = textoffset;
1749     UTF_BACK_1(text, 0, temp);
1750     if ((getFCD(text, &temp, textoffset) & LAST_BYTE_MASK_) == 0) {
1751         UCollationElements *coleiter = strsrch->textIter;
1752         int32_t         offset   = getColElemIterOffset(coleiter, FALSE);
1753         if (strsrch->pattern.hasPrefixAccents) {
1754             offset = doNextCanonicalPrefixMatch(strsrch, offset, textoffset,
1755                                                 status);
1756             if (U_SUCCESS(*status) && offset != USEARCH_DONE) {
1757                 setColEIterOffset(coleiter, offset);
1758                 return TRUE;
1759             }
1760         }
1761         return FALSE;
1762     }
1763 
1764     if (!strsrch->pattern.hasSuffixAccents) {
1765         return FALSE;
1766     }
1767 
1768     UChar       accents[INITIAL_ARRAY_SIZE_];
1769     // offset to the last base character in substring to search
1770     int32_t baseoffset = getPreviousBaseOffset(text, textoffset);
1771     // normalizing the offensive string
1772     unorm_normalize(text + baseoffset, textoffset - baseoffset, UNORM_NFD,
1773                                0, accents, INITIAL_ARRAY_SIZE_, status);
1774     // status checked in loop below
1775 
1776     int32_t accentsindex[INITIAL_ARRAY_SIZE_];
1777     int32_t size = getUnblockedAccentIndex(accents, accentsindex);
1778 
1779     // 2 power n - 1 plus the full set of accents
1780     int32_t  count = (2 << (size - 1)) - 1;
1781     while (U_SUCCESS(*status) && count > 0) {
1782         UChar *rearrange = strsrch->canonicalSuffixAccents;
1783         // copy the base characters
1784         for (int k = 0; k < accentsindex[0]; k ++) {
1785             *rearrange ++ = accents[k];
1786         }
1787         // forming all possible canonical rearrangement by dropping
1788         // sets of accents
1789         for (int i = 0; i <= size - 1; i ++) {
1790             int32_t mask = 1 << (size - i - 1);
1791             if (count & mask) {
1792                 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
1793                     *rearrange ++ = accents[j];
1794                 }
1795             }
1796         }
1797         *rearrange = 0;
1798         int32_t offset = doNextCanonicalSuffixMatch(strsrch, baseoffset,
1799                                                         status);
1800         if (offset != USEARCH_DONE) {
1801             return TRUE; // match found
1802         }
1803         count --;
1804     }
1805     return FALSE;
1806 }
1807 
1808 /**
1809 * Gets the previous base character offset depending on the string search
1810 * pattern data
1811 * @param strsrch string search data
1812 * @param textoffset current offset, current character
1813 * @return the offset of the next character after this base character or itself
1814 *         if it is a composed character with accents
1815 */
1816 static
getPreviousUStringSearchBaseOffset(UStringSearch * strsrch,int32_t textoffset)1817 inline int32_t getPreviousUStringSearchBaseOffset(UStringSearch *strsrch,
1818                                                       int32_t textoffset)
1819 {
1820     if (strsrch->pattern.hasPrefixAccents && textoffset > 0) {
1821         const UChar       *text = strsrch->search->text;
1822               int32_t  offset = textoffset;
1823         if (getFCD(text, &offset, strsrch->search->textLength) >>
1824                                                    SECOND_LAST_BYTE_SHIFT_) {
1825             return getPreviousBaseOffset(text, textoffset);
1826         }
1827     }
1828     return textoffset;
1829 }
1830 
1831 /**
1832 * Checks match for contraction.
1833 * If the match ends with a partial contraction we fail.
1834 * If the match starts too far off (because of backwards iteration) we try to
1835 * chip off the extra characters
1836 * Internal method, status assumed to be success, caller has to check status
1837 * before calling this method.
1838 * @param strsrch string search data
1839 * @param start offset of potential match, to be modified if necessary
1840 * @param end offset of potential match, to be modified if necessary
1841 * @param status output error status if any
1842 * @return TRUE if match passes the contraction test, FALSE otherwise
1843 */
1844 static
checkNextCanonicalContractionMatch(UStringSearch * strsrch,int32_t * start,int32_t * end,UErrorCode * status)1845 UBool checkNextCanonicalContractionMatch(UStringSearch *strsrch,
1846                                          int32_t   *start,
1847                                          int32_t   *end,
1848                                          UErrorCode    *status)
1849 {
1850           UCollationElements *coleiter   = strsrch->textIter;
1851           int32_t             textlength = strsrch->search->textLength;
1852           int32_t         temp       = *start;
1853     const UCollator          *collator   = strsrch->collator;
1854     const UChar              *text       = strsrch->search->text;
1855     // This part checks if either ends of the match contains potential
1856     // contraction. If so we'll have to iterate through them
1857     if ((*end < textlength && ucol_unsafeCP(text[*end], collator)) ||
1858         (*start + 1 < textlength
1859          && ucol_unsafeCP(text[*start + 1], collator))) {
1860         int32_t expansion  = getExpansionPrefix(coleiter);
1861         UBool   expandflag = expansion > 0;
1862         setColEIterOffset(coleiter, *start);
1863         while (expansion > 0) {
1864             // getting rid of the redundant ce, caused by setOffset.
1865             // since backward contraction/expansion may have extra ces if we
1866             // are in the normalization buffer, hasAccentsBeforeMatch would
1867             // have taken care of it.
1868             // E.g. the character \u01FA will have an expansion of 3, but if
1869             // we are only looking for acute and ring \u030A and \u0301, we'll
1870             // have to skip the first ce in the expansion buffer.
1871             ucol_next(coleiter, status);
1872             if (U_FAILURE(*status)) {
1873                 return FALSE;
1874             }
1875             if (ucol_getOffset(coleiter) != temp) {
1876                 *start = temp;
1877                 temp  = ucol_getOffset(coleiter);
1878             }
1879             expansion --;
1880         }
1881 
1882         int32_t  *patternce       = strsrch->pattern.CE;
1883         int32_t   patterncelength = strsrch->pattern.CELength;
1884         int32_t   count           = 0;
1885         int32_t   textlength      = strsrch->search->textLength;
1886         while (count < patterncelength) {
1887             int32_t ce = getCE(strsrch, ucol_next(coleiter, status));
1888             // status checked below, note that if status is a failure
1889             // ucol_next returns UCOL_NULLORDER
1890             if (ce == UCOL_IGNORABLE) {
1891                 continue;
1892             }
1893             if (expandflag && count == 0 && ucol_getOffset(coleiter) != temp) {
1894                 *start = temp;
1895                 temp   = ucol_getOffset(coleiter);
1896             }
1897 
1898             if (count == 0 && ce != patternce[0]) {
1899                 // accents may have extra starting ces, this occurs when a
1900                 // pure accent pattern is matched without rearrangement
1901                 // text \u0325\u0300 and looking for \u0300
1902                 int32_t expected = patternce[0];
1903                 if (getFCD(text, start, textlength) & LAST_BYTE_MASK_) {
1904                     ce = getCE(strsrch, ucol_next(coleiter, status));
1905                     while (U_SUCCESS(*status) && ce != expected &&
1906                            ce != UCOL_NULLORDER &&
1907                            ucol_getOffset(coleiter) <= *end) {
1908                         ce = getCE(strsrch, ucol_next(coleiter, status));
1909                     }
1910                 }
1911             }
1912             if (U_FAILURE(*status) || ce != patternce[count]) {
1913                 (*end) ++;
1914                 *end = getNextUStringSearchBaseOffset(strsrch, *end);
1915                 return FALSE;
1916             }
1917             count ++;
1918         }
1919     }
1920     return TRUE;
1921 }
1922 
1923 /**
1924 * Checks and sets the match information if found.
1925 * Checks
1926 * <ul>
1927 * <li> the potential match does not repeat the previous match
1928 * <li> boundaries are correct
1929 * <li> potential match does not end in the middle of a contraction
1930 * <li> identical matches
1931 * <\ul>
1932 * Otherwise the offset will be shifted to the next character.
1933 * Internal method, status assumed to be success, caller has to check the
1934 * status before calling this method.
1935 * @param strsrch string search data
1936 * @param textoffset offset in the collation element text. the returned value
1937 *        will be the truncated end offset of the match or the new start
1938 *        search offset.
1939 * @param status output error status if any
1940 * @return TRUE if the match is valid, FALSE otherwise
1941 */
1942 static
checkNextCanonicalMatch(UStringSearch * strsrch,int32_t * textoffset,UErrorCode * status)1943 inline UBool checkNextCanonicalMatch(UStringSearch *strsrch,
1944                                      int32_t   *textoffset,
1945                                      UErrorCode    *status)
1946 {
1947     // to ensure that the start and ends are not composite characters
1948     UCollationElements *coleiter = strsrch->textIter;
1949     // if we have a canonical accent match
1950     if ((strsrch->pattern.hasSuffixAccents &&
1951         strsrch->canonicalSuffixAccents[0]) ||
1952         (strsrch->pattern.hasPrefixAccents &&
1953         strsrch->canonicalPrefixAccents[0])) {
1954         strsrch->search->matchedIndex  = getPreviousUStringSearchBaseOffset(
1955                                                     strsrch,
1956                                                     ucol_getOffset(coleiter));
1957         strsrch->search->matchedLength = *textoffset -
1958                                                 strsrch->search->matchedIndex;
1959         return TRUE;
1960     }
1961 
1962     int32_t start = getColElemIterOffset(coleiter, FALSE);
1963     if (!checkNextCanonicalContractionMatch(strsrch, &start, textoffset,
1964                                             status) || U_FAILURE(*status)) {
1965         return FALSE;
1966     }
1967 
1968     start = getPreviousUStringSearchBaseOffset(strsrch, start);
1969     // this totally matches, however we need to check if it is repeating
1970     if (checkRepeatedMatch(strsrch, start, *textoffset) ||
1971         !isBreakUnit(strsrch, start, *textoffset) ||
1972         !checkIdentical(strsrch, start, *textoffset)) {
1973         (*textoffset) ++;
1974         *textoffset = getNextBaseOffset(strsrch->search->text, *textoffset,
1975                                         strsrch->search->textLength);
1976         return FALSE;
1977     }
1978 
1979     strsrch->search->matchedIndex  = start;
1980     strsrch->search->matchedLength = *textoffset - start;
1981     return TRUE;
1982 }
1983 
1984 /**
1985 * Shifting the collation element iterator position forward to prepare for
1986 * a preceding match. If the first character is a unsafe character, we'll only
1987 * shift by 1 to capture contractions, normalization etc.
1988 * Internal method, status assumed to be success, caller has to check status
1989 * before calling this method.
1990 * @param text strsrch string search data
1991 * @param textoffset start text position to do search
1992 * @param ce the text ce which failed the match.
1993 * @param patternceindex index of the ce within the pattern ce buffer which
1994 *        failed the match
1995 * @return final offset
1996 */
1997 static
reverseShift(UStringSearch * strsrch,int32_t textoffset,int32_t ce,int32_t patternceindex)1998 inline int32_t reverseShift(UStringSearch *strsrch,
1999                                 int32_t    textoffset,
2000                                 int32_t       ce,
2001                                 int32_t        patternceindex)
2002 {
2003     if (strsrch->search->isOverlap) {
2004         if (textoffset != strsrch->search->textLength) {
2005             textoffset --;
2006         }
2007         else {
2008             textoffset -= strsrch->pattern.defaultShiftSize;
2009         }
2010     }
2011     else {
2012         if (ce != UCOL_NULLORDER) {
2013             int32_t shift = strsrch->pattern.backShift[hash(ce)];
2014 
2015             // this is to adjust for characters in the middle of the substring
2016             // for matching that failed.
2017             int32_t adjust = patternceindex;
2018             if (adjust > 1 && shift > adjust) {
2019                 shift -= adjust - 1;
2020             }
2021             textoffset -= shift;
2022         }
2023         else {
2024             textoffset -= strsrch->pattern.defaultShiftSize;
2025         }
2026     }
2027     textoffset = getPreviousUStringSearchBaseOffset(strsrch, textoffset);
2028     return textoffset;
2029 }
2030 
2031 /**
2032 * Checks match for contraction.
2033 * If the match starts with a partial contraction we fail.
2034 * Internal method, status assumed to be success, caller has to check status
2035 * before calling this method.
2036 * @param strsrch string search data
2037 * @param start offset of potential match, to be modified if necessary
2038 * @param end offset of potential match, to be modified if necessary
2039 * @param status output error status if any
2040 * @return TRUE if match passes the contraction test, FALSE otherwise
2041 */
2042 static
checkPreviousExactContractionMatch(UStringSearch * strsrch,int32_t * start,int32_t * end,UErrorCode * status)2043 UBool checkPreviousExactContractionMatch(UStringSearch *strsrch,
2044                                      int32_t   *start,
2045                                      int32_t   *end, UErrorCode  *status)
2046 {
2047           UCollationElements *coleiter   = strsrch->textIter;
2048           int32_t             textlength = strsrch->search->textLength;
2049           int32_t             temp       = *end;
2050     const UCollator          *collator   = strsrch->collator;
2051     const UChar              *text       = strsrch->search->text;
2052     // This part checks if either if the start of the match contains potential
2053     // contraction. If so we'll have to iterate through them
2054     // Since we used ucol_next while previously looking for the potential
2055     // match, this guarantees that our end will not be a partial contraction,
2056     // or a partial supplementary character.
2057     if (*start < textlength && ucol_unsafeCP(text[*start], collator)) {
2058         int32_t expansion  = getExpansionSuffix(coleiter);
2059         UBool   expandflag = expansion > 0;
2060         setColEIterOffset(coleiter, *end);
2061         while (U_SUCCESS(*status) && expansion > 0) {
2062             // getting rid of the redundant ce
2063             // since forward contraction/expansion may have extra ces
2064             // if we are in the normalization buffer, hasAccentsBeforeMatch
2065             // would have taken care of it.
2066             // E.g. the character \u01FA will have an expansion of 3, but if
2067             // we are only looking for A ring A\u030A, we'll have to skip the
2068             // last ce in the expansion buffer
2069             ucol_previous(coleiter, status);
2070             if (U_FAILURE(*status)) {
2071                 return FALSE;
2072             }
2073             if (ucol_getOffset(coleiter) != temp) {
2074                 *end = temp;
2075                 temp  = ucol_getOffset(coleiter);
2076             }
2077             expansion --;
2078         }
2079 
2080         int32_t  *patternce       = strsrch->pattern.CE;
2081         int32_t   patterncelength = strsrch->pattern.CELength;
2082         int32_t   count           = patterncelength;
2083         while (count > 0) {
2084             int32_t ce = getCE(strsrch, ucol_previous(coleiter, status));
2085             // status checked below, note that if status is a failure
2086             // ucol_previous returns UCOL_NULLORDER
2087             if (ce == UCOL_IGNORABLE) {
2088                 continue;
2089             }
2090             if (expandflag && count == 0 &&
2091                 getColElemIterOffset(coleiter, FALSE) != temp) {
2092                 *end = temp;
2093                 temp  = ucol_getOffset(coleiter);
2094             }
2095             if (U_FAILURE(*status) || ce != patternce[count - 1]) {
2096                 (*start) --;
2097                 *start = getPreviousBaseOffset(text, *start);
2098                 return FALSE;
2099             }
2100             count --;
2101         }
2102     }
2103     return TRUE;
2104 }
2105 
2106 /**
2107 * Checks and sets the match information if found.
2108 * Checks
2109 * <ul>
2110 * <li> the current match does not repeat the last match
2111 * <li> boundaries are correct
2112 * <li> exact matches has no extra accents
2113 * <li> identical matches
2114 * <\ul>
2115 * Otherwise the offset will be shifted to the preceding character.
2116 * Internal method, status assumed to be success, caller has to check status
2117 * before calling this method.
2118 * @param strsrch string search data
2119 * @param collator
2120 * @param coleiter collation element iterator
2121 * @param text string
2122 * @param textoffset offset in the collation element text. the returned value
2123 *        will be the truncated start offset of the match or the new start
2124 *        search offset.
2125 * @param status output error status if any
2126 * @return TRUE if the match is valid, FALSE otherwise
2127 */
2128 static
checkPreviousExactMatch(UStringSearch * strsrch,int32_t * textoffset,UErrorCode * status)2129 inline UBool checkPreviousExactMatch(UStringSearch *strsrch,
2130                                      int32_t   *textoffset,
2131                                      UErrorCode    *status)
2132 {
2133     // to ensure that the start and ends are not composite characters
2134     int32_t end = ucol_getOffset(strsrch->textIter);
2135     if (!checkPreviousExactContractionMatch(strsrch, textoffset, &end, status)
2136         || U_FAILURE(*status)) {
2137             return FALSE;
2138     }
2139 
2140     // this totally matches, however we need to check if it is repeating
2141     // the old match
2142     if (checkRepeatedMatch(strsrch, *textoffset, end) ||
2143         !isBreakUnit(strsrch, *textoffset, end) ||
2144         hasAccentsBeforeMatch(strsrch, *textoffset, end) ||
2145         !checkIdentical(strsrch, *textoffset, end) ||
2146         hasAccentsAfterMatch(strsrch, *textoffset, end)) {
2147         (*textoffset) --;
2148         *textoffset = getPreviousBaseOffset(strsrch->search->text,
2149                                             *textoffset);
2150         return FALSE;
2151     }
2152 
2153     //Add breakiterator boundary check for primary strength search.
2154     if (!strsrch->search->breakIter && strsrch->strength == UCOL_PRIMARY) {
2155     	checkBreakBoundary(strsrch, textoffset, &end);
2156     }
2157 
2158     strsrch->search->matchedIndex = *textoffset;
2159     strsrch->search->matchedLength = end - *textoffset;
2160     return TRUE;
2161 }
2162 
2163 /**
2164 * Rearranges the end accents to try matching.
2165 * Suffix accents in the text will be grouped according to their combining
2166 * class and the groups will be mixed and matched to try find the perfect
2167 * match with the pattern.
2168 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
2169 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
2170 *         "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
2171 *         "\u0301\u0325".
2172 * step 2: check if any of the generated substrings matches the pattern.
2173 * Internal method, status assumed to be success, user has to check status
2174 * before calling this method.
2175 * @param strsrch string search match
2176 * @param start offset of the first base character
2177 * @param end start of the last accent set
2178 * @param status only error status if any
2179 * @return USEARCH_DONE if a match is not found, otherwise return the ending
2180 *         offset of the match. Note this start includes all following accents.
2181 */
2182 static
doPreviousCanonicalSuffixMatch(UStringSearch * strsrch,int32_t start,int32_t end,UErrorCode * status)2183 int32_t doPreviousCanonicalSuffixMatch(UStringSearch *strsrch,
2184                                            int32_t    start,
2185                                            int32_t    end,
2186                                            UErrorCode    *status)
2187 {
2188     const UChar       *text       = strsrch->search->text;
2189           int32_t  tempend    = end;
2190 
2191     UTF_BACK_1(text, 0, tempend);
2192     if (!(getFCD(text, &tempend, strsrch->search->textLength) &
2193                                                            LAST_BYTE_MASK_)) {
2194         // die... failed at a base character
2195         return USEARCH_DONE;
2196     }
2197     end = getNextBaseOffset(text, end, strsrch->search->textLength);
2198 
2199     if (U_SUCCESS(*status)) {
2200         UChar       accents[INITIAL_ARRAY_SIZE_];
2201         int32_t offset = getPreviousBaseOffset(text, end);
2202         // normalizing the offensive string
2203         unorm_normalize(text + offset, end - offset, UNORM_NFD, 0, accents,
2204                         INITIAL_ARRAY_SIZE_, status);
2205 
2206         int32_t         accentsindex[INITIAL_ARRAY_SIZE_];
2207         int32_t         accentsize = getUnblockedAccentIndex(accents,
2208                                                          accentsindex);
2209         int32_t         count      = (2 << (accentsize - 1)) - 1;
2210         UChar               buffer[INITIAL_ARRAY_SIZE_];
2211         UCollationElements *coleiter = strsrch->utilIter;
2212         while (U_SUCCESS(*status) && count > 0) {
2213             UChar *rearrange = strsrch->canonicalSuffixAccents;
2214             // copy the base characters
2215             for (int k = 0; k < accentsindex[0]; k ++) {
2216                 *rearrange ++ = accents[k];
2217             }
2218             // forming all possible canonical rearrangement by dropping
2219             // sets of accents
2220             for (int i = 0; i <= accentsize - 1; i ++) {
2221                 int32_t mask = 1 << (accentsize - i - 1);
2222                 if (count & mask) {
2223                     for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
2224                         *rearrange ++ = accents[j];
2225                     }
2226                 }
2227             }
2228             *rearrange = 0;
2229             int32_t  matchsize = INITIAL_ARRAY_SIZE_;
2230             UChar   *match     = addToUCharArray(buffer, &matchsize,
2231                                            strsrch->canonicalPrefixAccents,
2232                                            strsrch->search->text + start,
2233                                            offset - start,
2234                                            strsrch->canonicalSuffixAccents,
2235                                            status);
2236 
2237             // run the collator iterator through this match
2238             // if status is a failure ucol_setText does nothing
2239             ucol_setText(coleiter, match, matchsize, status);
2240             if (U_SUCCESS(*status)) {
2241                 if (checkCollationMatch(strsrch, coleiter)) {
2242                     if (match != buffer) {
2243                         uprv_free(match);
2244                     }
2245                     return end;
2246                 }
2247             }
2248             count --;
2249         }
2250     }
2251     return USEARCH_DONE;
2252 }
2253 
2254 /**
2255 * Take the rearranged start accents and tries matching. If match failed at
2256 * a seperate following set of accents (seperated from the rearranged on by
2257 * at least a base character) then we rearrange the preceding accents and
2258 * tries matching again.
2259 * We allow skipping of the ends of the accent set if the ces do not match.
2260 * However if the failure is found before the accent set, it fails.
2261 * Internal method, status assumed to be success, caller has to check status
2262 * before calling this method.
2263 * @param strsrch string search data
2264 * @param textoffset of the ends of the rearranged accent
2265 * @param status output error status if any
2266 * @return USEARCH_DONE if a match is not found, otherwise return the ending
2267 *         offset of the match. Note this start includes all following accents.
2268 */
2269 static
doPreviousCanonicalPrefixMatch(UStringSearch * strsrch,int32_t textoffset,UErrorCode * status)2270 int32_t doPreviousCanonicalPrefixMatch(UStringSearch *strsrch,
2271                                            int32_t    textoffset,
2272                                            UErrorCode    *status)
2273 {
2274     const UChar       *text       = strsrch->search->text;
2275     const UCollator   *collator   = strsrch->collator;
2276           int32_t      safelength = 0;
2277           UChar       *safetext;
2278           int32_t      safetextlength;
2279           UChar        safebuffer[INITIAL_ARRAY_SIZE_];
2280           int32_t  safeoffset = textoffset;
2281 
2282     if (textoffset &&
2283         ucol_unsafeCP(strsrch->canonicalPrefixAccents[
2284                                  u_strlen(strsrch->canonicalPrefixAccents) - 1
2285                                          ], collator)) {
2286         safeoffset     = getNextSafeOffset(collator, text, textoffset,
2287                                            strsrch->search->textLength);
2288         safelength     = safeoffset - textoffset;
2289         safetextlength = INITIAL_ARRAY_SIZE_;
2290         safetext       = addToUCharArray(safebuffer, &safetextlength,
2291                                          strsrch->canonicalPrefixAccents,
2292                                          text + textoffset, safelength,
2293                                          NULL, status);
2294     }
2295     else {
2296         safetextlength = u_strlen(strsrch->canonicalPrefixAccents);
2297         safetext       = strsrch->canonicalPrefixAccents;
2298     }
2299 
2300     UCollationElements *coleiter = strsrch->utilIter;
2301      // if status is a failure, ucol_setText does nothing
2302     ucol_setText(coleiter, safetext, safetextlength, status);
2303     // status checked in loop below
2304 
2305     int32_t  *ce           = strsrch->pattern.CE;
2306     int32_t   celength     = strsrch->pattern.CELength;
2307     int       ceindex      = 0;
2308     UBool     isSafe       = TRUE; // safe zone indication flag for position
2309     int32_t   prefixlength = u_strlen(strsrch->canonicalPrefixAccents);
2310 
2311     while (ceindex < celength) {
2312         int32_t textce = ucol_next(coleiter, status);
2313         if (U_FAILURE(*status)) {
2314             if (isSafe) {
2315                 cleanUpSafeText(strsrch, safetext, safebuffer);
2316             }
2317             return USEARCH_DONE;
2318         }
2319         if (textce == UCOL_NULLORDER) {
2320             // check if we have passed the safe buffer
2321             if (coleiter == strsrch->textIter) {
2322                 cleanUpSafeText(strsrch, safetext, safebuffer);
2323                 return USEARCH_DONE;
2324             }
2325             cleanUpSafeText(strsrch, safetext, safebuffer);
2326             safetext = safebuffer;
2327             coleiter = strsrch->textIter;
2328             setColEIterOffset(coleiter, safeoffset);
2329             // status checked at the start of the loop
2330             isSafe = FALSE;
2331             continue;
2332         }
2333         textce = getCE(strsrch, textce);
2334         if (textce != UCOL_IGNORABLE && textce != ce[ceindex]) {
2335             // do the beginning stuff
2336             int32_t failedoffset = ucol_getOffset(coleiter);
2337             if (isSafe && failedoffset <= prefixlength) {
2338                 // alas... no hope. failed at rearranged accent set
2339                 cleanUpSafeText(strsrch, safetext, safebuffer);
2340                 return USEARCH_DONE;
2341             }
2342             else {
2343                 if (isSafe) {
2344                     failedoffset = safeoffset - failedoffset;
2345                     cleanUpSafeText(strsrch, safetext, safebuffer);
2346                 }
2347 
2348                 // try rearranging the end accents
2349                 int32_t result = doPreviousCanonicalSuffixMatch(strsrch,
2350                                         textoffset, failedoffset, status);
2351                 if (result != USEARCH_DONE) {
2352                     // if status is a failure, ucol_setOffset does nothing
2353                     setColEIterOffset(strsrch->textIter, result);
2354                 }
2355                 if (U_FAILURE(*status)) {
2356                     return USEARCH_DONE;
2357                 }
2358                 return result;
2359             }
2360         }
2361         if (textce == ce[ceindex]) {
2362             ceindex ++;
2363         }
2364     }
2365     // set offset here
2366     if (isSafe) {
2367         int32_t result      = ucol_getOffset(coleiter);
2368         // sets the text iterator here with the correct expansion and offset
2369         int32_t     leftoverces = getExpansionSuffix(coleiter);
2370         cleanUpSafeText(strsrch, safetext, safebuffer);
2371         if (result <= prefixlength) {
2372             result = textoffset;
2373         }
2374         else {
2375             result = textoffset + (safeoffset - result);
2376         }
2377         setColEIterOffset(strsrch->textIter, result);
2378         setExpansionSuffix(strsrch->textIter, leftoverces);
2379         return result;
2380     }
2381 
2382     return ucol_getOffset(coleiter);
2383 }
2384 
2385 /**
2386 * Trying out the substring and sees if it can be a canonical match.
2387 * This will try normalizing the starting accents and arranging them into
2388 * canonical equivalents and check their corresponding ces with the pattern ce.
2389 * Prefix accents in the text will be grouped according to their combining
2390 * class and the groups will be mixed and matched to try find the perfect
2391 * match with the pattern.
2392 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
2393 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
2394 *         "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
2395 *         "\u0301\u0325".
2396 * step 2: check if any of the generated substrings matches the pattern.
2397 * Internal method, status assumed to be success, caller has to check status
2398 * before calling this method.
2399 * @param strsrch string search data
2400 * @param textoffset start offset in the collation element text that starts
2401 *                   with the accents to be rearranged
2402 * @param status output error status if any
2403 * @return TRUE if the match is valid, FALSE otherwise
2404 */
2405 static
doPreviousCanonicalMatch(UStringSearch * strsrch,int32_t textoffset,UErrorCode * status)2406 UBool doPreviousCanonicalMatch(UStringSearch *strsrch,
2407                                int32_t    textoffset,
2408                                UErrorCode    *status)
2409 {
2410     const UChar       *text       = strsrch->search->text;
2411           int32_t  temp       = textoffset;
2412           int32_t      textlength = strsrch->search->textLength;
2413     if ((getFCD(text, &temp, textlength) >> SECOND_LAST_BYTE_SHIFT_) == 0) {
2414         UCollationElements *coleiter = strsrch->textIter;
2415         int32_t         offset   = ucol_getOffset(coleiter);
2416         if (strsrch->pattern.hasSuffixAccents) {
2417             offset = doPreviousCanonicalSuffixMatch(strsrch, textoffset,
2418                                                     offset, status);
2419             if (U_SUCCESS(*status) && offset != USEARCH_DONE) {
2420                 setColEIterOffset(coleiter, offset);
2421                 return TRUE;
2422             }
2423         }
2424         return FALSE;
2425     }
2426 
2427     if (!strsrch->pattern.hasPrefixAccents) {
2428         return FALSE;
2429     }
2430 
2431     UChar       accents[INITIAL_ARRAY_SIZE_];
2432     // offset to the last base character in substring to search
2433     int32_t baseoffset = getNextBaseOffset(text, textoffset, textlength);
2434     // normalizing the offensive string
2435     unorm_normalize(text + textoffset, baseoffset - textoffset, UNORM_NFD,
2436                                0, accents, INITIAL_ARRAY_SIZE_, status);
2437     // status checked in loop
2438 
2439     int32_t accentsindex[INITIAL_ARRAY_SIZE_];
2440     int32_t size = getUnblockedAccentIndex(accents, accentsindex);
2441 
2442     // 2 power n - 1 plus the full set of accents
2443     int32_t  count = (2 << (size - 1)) - 1;
2444     while (U_SUCCESS(*status) && count > 0) {
2445         UChar *rearrange = strsrch->canonicalPrefixAccents;
2446         // copy the base characters
2447         for (int k = 0; k < accentsindex[0]; k ++) {
2448             *rearrange ++ = accents[k];
2449         }
2450         // forming all possible canonical rearrangement by dropping
2451         // sets of accents
2452         for (int i = 0; i <= size - 1; i ++) {
2453             int32_t mask = 1 << (size - i - 1);
2454             if (count & mask) {
2455                 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
2456                     *rearrange ++ = accents[j];
2457                 }
2458             }
2459         }
2460         *rearrange = 0;
2461         int32_t offset = doPreviousCanonicalPrefixMatch(strsrch,
2462                                                           baseoffset, status);
2463         if (offset != USEARCH_DONE) {
2464             return TRUE; // match found
2465         }
2466         count --;
2467     }
2468     return FALSE;
2469 }
2470 
2471 /**
2472 * Checks match for contraction.
2473 * If the match starts with a partial contraction we fail.
2474 * Internal method, status assumed to be success, caller has to check status
2475 * before calling this method.
2476 * @param strsrch string search data
2477 * @param start offset of potential match, to be modified if necessary
2478 * @param end offset of potential match, to be modified if necessary
2479 * @param status only error status if any
2480 * @return TRUE if match passes the contraction test, FALSE otherwise
2481 */
2482 static
checkPreviousCanonicalContractionMatch(UStringSearch * strsrch,int32_t * start,int32_t * end,UErrorCode * status)2483 UBool checkPreviousCanonicalContractionMatch(UStringSearch *strsrch,
2484                                      int32_t   *start,
2485                                      int32_t   *end, UErrorCode  *status)
2486 {
2487           UCollationElements *coleiter   = strsrch->textIter;
2488           int32_t             textlength = strsrch->search->textLength;
2489           int32_t         temp       = *end;
2490     const UCollator          *collator   = strsrch->collator;
2491     const UChar              *text       = strsrch->search->text;
2492     // This part checks if either if the start of the match contains potential
2493     // contraction. If so we'll have to iterate through them
2494     // Since we used ucol_next while previously looking for the potential
2495     // match, this guarantees that our end will not be a partial contraction,
2496     // or a partial supplementary character.
2497     if (*start < textlength && ucol_unsafeCP(text[*start], collator)) {
2498         int32_t expansion  = getExpansionSuffix(coleiter);
2499         UBool   expandflag = expansion > 0;
2500         setColEIterOffset(coleiter, *end);
2501         while (expansion > 0) {
2502             // getting rid of the redundant ce
2503             // since forward contraction/expansion may have extra ces
2504             // if we are in the normalization buffer, hasAccentsBeforeMatch
2505             // would have taken care of it.
2506             // E.g. the character \u01FA will have an expansion of 3, but if
2507             // we are only looking for A ring A\u030A, we'll have to skip the
2508             // last ce in the expansion buffer
2509             ucol_previous(coleiter, status);
2510             if (U_FAILURE(*status)) {
2511                 return FALSE;
2512             }
2513             if (ucol_getOffset(coleiter) != temp) {
2514                 *end = temp;
2515                 temp  = ucol_getOffset(coleiter);
2516             }
2517             expansion --;
2518         }
2519 
2520         int32_t  *patternce       = strsrch->pattern.CE;
2521         int32_t   patterncelength = strsrch->pattern.CELength;
2522         int32_t   count           = patterncelength;
2523         while (count > 0) {
2524             int32_t ce = getCE(strsrch, ucol_previous(coleiter, status));
2525             // status checked below, note that if status is a failure
2526             // ucol_previous returns UCOL_NULLORDER
2527             if (ce == UCOL_IGNORABLE) {
2528                 continue;
2529             }
2530             if (expandflag && count == 0 &&
2531                 getColElemIterOffset(coleiter, FALSE) != temp) {
2532                 *end = temp;
2533                 temp  = ucol_getOffset(coleiter);
2534             }
2535             if (count == patterncelength &&
2536                 ce != patternce[patterncelength - 1]) {
2537                 // accents may have extra starting ces, this occurs when a
2538                 // pure accent pattern is matched without rearrangement
2539                 int32_t    expected = patternce[patterncelength - 1];
2540                 UTF_BACK_1(text, 0, *end);
2541                 if (getFCD(text, end, textlength) & LAST_BYTE_MASK_) {
2542                     ce = getCE(strsrch, ucol_previous(coleiter, status));
2543                     while (U_SUCCESS(*status) && ce != expected &&
2544                            ce != UCOL_NULLORDER &&
2545                            ucol_getOffset(coleiter) <= *start) {
2546                         ce = getCE(strsrch, ucol_previous(coleiter, status));
2547                     }
2548                 }
2549             }
2550             if (U_FAILURE(*status) || ce != patternce[count - 1]) {
2551                 (*start) --;
2552                 *start = getPreviousBaseOffset(text, *start);
2553                 return FALSE;
2554             }
2555             count --;
2556         }
2557     }
2558     return TRUE;
2559 }
2560 
2561 /**
2562 * Checks and sets the match information if found.
2563 * Checks
2564 * <ul>
2565 * <li> the potential match does not repeat the previous match
2566 * <li> boundaries are correct
2567 * <li> potential match does not end in the middle of a contraction
2568 * <li> identical matches
2569 * <\ul>
2570 * Otherwise the offset will be shifted to the next character.
2571 * Internal method, status assumed to be success, caller has to check status
2572 * before calling this method.
2573 * @param strsrch string search data
2574 * @param textoffset offset in the collation element text. the returned value
2575 *        will be the truncated start offset of the match or the new start
2576 *        search offset.
2577 * @param status only error status if any
2578 * @return TRUE if the match is valid, FALSE otherwise
2579 */
2580 static
checkPreviousCanonicalMatch(UStringSearch * strsrch,int32_t * textoffset,UErrorCode * status)2581 inline UBool checkPreviousCanonicalMatch(UStringSearch *strsrch,
2582                                          int32_t   *textoffset,
2583                                          UErrorCode    *status)
2584 {
2585     // to ensure that the start and ends are not composite characters
2586     UCollationElements *coleiter = strsrch->textIter;
2587     // if we have a canonical accent match
2588     if ((strsrch->pattern.hasSuffixAccents &&
2589         strsrch->canonicalSuffixAccents[0]) ||
2590         (strsrch->pattern.hasPrefixAccents &&
2591         strsrch->canonicalPrefixAccents[0])) {
2592         strsrch->search->matchedIndex  = *textoffset;
2593         strsrch->search->matchedLength =
2594             getNextUStringSearchBaseOffset(strsrch,
2595                                       getColElemIterOffset(coleiter, FALSE))
2596             - *textoffset;
2597         return TRUE;
2598     }
2599 
2600     int32_t end = ucol_getOffset(coleiter);
2601     if (!checkPreviousCanonicalContractionMatch(strsrch, textoffset, &end,
2602                                                 status) ||
2603          U_FAILURE(*status)) {
2604         return FALSE;
2605     }
2606 
2607     end = getNextUStringSearchBaseOffset(strsrch, end);
2608     // this totally matches, however we need to check if it is repeating
2609     if (checkRepeatedMatch(strsrch, *textoffset, end) ||
2610         !isBreakUnit(strsrch, *textoffset, end) ||
2611         !checkIdentical(strsrch, *textoffset, end)) {
2612         (*textoffset) --;
2613         *textoffset = getPreviousBaseOffset(strsrch->search->text,
2614                                             *textoffset);
2615         return FALSE;
2616     }
2617 
2618     strsrch->search->matchedIndex  = *textoffset;
2619     strsrch->search->matchedLength = end - *textoffset;
2620     return TRUE;
2621 }
2622 #endif // #if BOYER_MOORE
2623 
2624 // constructors and destructor -------------------------------------------
2625 
usearch_open(const UChar * pattern,int32_t patternlength,const UChar * text,int32_t textlength,const char * locale,UBreakIterator * breakiter,UErrorCode * status)2626 U_CAPI UStringSearch * U_EXPORT2 usearch_open(const UChar *pattern,
2627                                           int32_t         patternlength,
2628                                     const UChar          *text,
2629                                           int32_t         textlength,
2630                                     const char           *locale,
2631                                           UBreakIterator *breakiter,
2632                                           UErrorCode     *status)
2633 {
2634     if (U_FAILURE(*status)) {
2635         return NULL;
2636     }
2637 #if UCONFIG_NO_BREAK_ITERATION
2638     if (breakiter != NULL) {
2639         *status = U_UNSUPPORTED_ERROR;
2640         return NULL;
2641     }
2642 #endif
2643     if (locale) {
2644         // ucol_open internally checks for status
2645         UCollator     *collator = ucol_open(locale, status);
2646         // pattern, text checks are done in usearch_openFromCollator
2647         UStringSearch *result   = usearch_openFromCollator(pattern,
2648                                               patternlength, text, textlength,
2649                                               collator, breakiter, status);
2650 
2651         if (result == NULL || U_FAILURE(*status)) {
2652             if (collator) {
2653                 ucol_close(collator);
2654             }
2655             return NULL;
2656         }
2657         else {
2658             result->ownCollator = TRUE;
2659         }
2660         return result;
2661     }
2662     *status = U_ILLEGAL_ARGUMENT_ERROR;
2663     return NULL;
2664 }
2665 
usearch_openFromCollator(const UChar * pattern,int32_t patternlength,const UChar * text,int32_t textlength,const UCollator * collator,UBreakIterator * breakiter,UErrorCode * status)2666 U_CAPI UStringSearch * U_EXPORT2 usearch_openFromCollator(
2667                                   const UChar          *pattern,
2668                                         int32_t         patternlength,
2669                                   const UChar          *text,
2670                                         int32_t         textlength,
2671                                   const UCollator      *collator,
2672                                         UBreakIterator *breakiter,
2673                                         UErrorCode     *status)
2674 {
2675     if (U_FAILURE(*status)) {
2676         return NULL;
2677     }
2678 #if UCONFIG_NO_BREAK_ITERATION
2679     if (breakiter != NULL) {
2680         *status = U_UNSUPPORTED_ERROR;
2681         return NULL;
2682     }
2683 #endif
2684     if (pattern == NULL || text == NULL || collator == NULL) {
2685         *status = U_ILLEGAL_ARGUMENT_ERROR;
2686         return NULL;
2687     }
2688 
2689     // string search does not really work when numeric collation is turned on
2690     if(ucol_getAttribute(collator, UCOL_NUMERIC_COLLATION, status) == UCOL_ON) {
2691         *status = U_UNSUPPORTED_ERROR;
2692         return NULL;
2693     }
2694 
2695     if (U_SUCCESS(*status)) {
2696         initializeFCD(status);
2697         if (U_FAILURE(*status)) {
2698             return NULL;
2699         }
2700 
2701         UStringSearch *result;
2702         if (textlength == -1) {
2703             textlength = u_strlen(text);
2704         }
2705         if (patternlength == -1) {
2706             patternlength = u_strlen(pattern);
2707         }
2708         if (textlength <= 0 || patternlength <= 0) {
2709             *status = U_ILLEGAL_ARGUMENT_ERROR;
2710             return NULL;
2711         }
2712 
2713         result = (UStringSearch *)uprv_malloc(sizeof(UStringSearch));
2714         if (result == NULL) {
2715             *status = U_MEMORY_ALLOCATION_ERROR;
2716             return NULL;
2717         }
2718 
2719         result->collator    = collator;
2720         result->strength    = ucol_getStrength(collator);
2721         result->ceMask      = getMask(result->strength);
2722         result->toShift     =
2723              ucol_getAttribute(collator, UCOL_ALTERNATE_HANDLING, status) ==
2724                                                             UCOL_SHIFTED;
2725         result->variableTop = ucol_getVariableTop(collator, status);
2726 
2727         if (U_FAILURE(*status)) {
2728             uprv_free(result);
2729             return NULL;
2730         }
2731 
2732         result->search             = (USearch *)uprv_malloc(sizeof(USearch));
2733         if (result->search == NULL) {
2734             *status = U_MEMORY_ALLOCATION_ERROR;
2735             uprv_free(result);
2736             return NULL;
2737         }
2738 
2739         result->search->text       = text;
2740         result->search->textLength = textlength;
2741 
2742         result->pattern.text       = pattern;
2743         result->pattern.textLength = patternlength;
2744         result->pattern.CE         = NULL;
2745         result->pattern.PCE        = NULL;
2746 
2747         result->search->breakIter  = breakiter;
2748 #if !UCONFIG_NO_BREAK_ITERATION
2749         result->search->internalBreakIter = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(result->collator, ULOC_VALID_LOCALE, status), text, textlength, status);
2750         if (breakiter) {
2751         	ubrk_setText(breakiter, text, textlength, status);
2752         }
2753 #endif
2754 
2755         result->ownCollator           = FALSE;
2756         result->search->matchedLength = 0;
2757         result->search->matchedIndex  = USEARCH_DONE;
2758         result->utilIter              = NULL;
2759         result->textIter              = ucol_openElements(collator, text,
2760                                                           textlength, status);
2761         if (U_FAILURE(*status)) {
2762             usearch_close(result);
2763             return NULL;
2764         }
2765 
2766         result->search->isOverlap          = FALSE;
2767         result->search->isCanonicalMatch   = FALSE;
2768         result->search->isForwardSearching = TRUE;
2769         result->search->reset              = TRUE;
2770 
2771         initialize(result, status);
2772 
2773         if (U_FAILURE(*status)) {
2774             usearch_close(result);
2775             return NULL;
2776         }
2777 
2778         return result;
2779     }
2780     return NULL;
2781 }
2782 
usearch_close(UStringSearch * strsrch)2783 U_CAPI void U_EXPORT2 usearch_close(UStringSearch *strsrch)
2784 {
2785     if (strsrch) {
2786         if (strsrch->pattern.CE != strsrch->pattern.CEBuffer &&
2787             strsrch->pattern.CE) {
2788             uprv_free(strsrch->pattern.CE);
2789         }
2790 
2791         if (strsrch->pattern.PCE != NULL &&
2792             strsrch->pattern.PCE != strsrch->pattern.PCEBuffer) {
2793             uprv_free(strsrch->pattern.PCE);
2794         }
2795 
2796         ucol_closeElements(strsrch->textIter);
2797         ucol_closeElements(strsrch->utilIter);
2798 
2799         if (strsrch->ownCollator && strsrch->collator) {
2800             ucol_close((UCollator *)strsrch->collator);
2801         }
2802 
2803 #if !UCONFIG_NO_BREAK_ITERATION
2804         if (strsrch->search->internalBreakIter) {
2805         	ubrk_close(strsrch->search->internalBreakIter);
2806         }
2807 #endif
2808 
2809         uprv_free(strsrch->search);
2810         uprv_free(strsrch);
2811     }
2812 }
2813 
2814 // set and get methods --------------------------------------------------
2815 
usearch_setOffset(UStringSearch * strsrch,int32_t position,UErrorCode * status)2816 U_CAPI void U_EXPORT2 usearch_setOffset(UStringSearch *strsrch,
2817                                         int32_t    position,
2818                                         UErrorCode    *status)
2819 {
2820     if (U_SUCCESS(*status) && strsrch) {
2821         if (isOutOfBounds(strsrch->search->textLength, position)) {
2822             *status = U_INDEX_OUTOFBOUNDS_ERROR;
2823         }
2824         else {
2825             setColEIterOffset(strsrch->textIter, position);
2826         }
2827         strsrch->search->matchedIndex  = USEARCH_DONE;
2828         strsrch->search->matchedLength = 0;
2829         strsrch->search->reset         = FALSE;
2830     }
2831 }
2832 
usearch_getOffset(const UStringSearch * strsrch)2833 U_CAPI int32_t U_EXPORT2 usearch_getOffset(const UStringSearch *strsrch)
2834 {
2835     if (strsrch) {
2836         int32_t result = ucol_getOffset(strsrch->textIter);
2837         if (isOutOfBounds(strsrch->search->textLength, result)) {
2838             return USEARCH_DONE;
2839         }
2840         return result;
2841     }
2842     return USEARCH_DONE;
2843 }
2844 
usearch_setAttribute(UStringSearch * strsrch,USearchAttribute attribute,USearchAttributeValue value,UErrorCode * status)2845 U_CAPI void U_EXPORT2 usearch_setAttribute(UStringSearch *strsrch,
2846                                  USearchAttribute attribute,
2847                                  USearchAttributeValue value,
2848                                  UErrorCode *status)
2849 {
2850     if (U_SUCCESS(*status) && strsrch) {
2851         switch (attribute)
2852         {
2853         case USEARCH_OVERLAP :
2854             strsrch->search->isOverlap = (value == USEARCH_ON ? TRUE : FALSE);
2855             break;
2856         case USEARCH_CANONICAL_MATCH :
2857             strsrch->search->isCanonicalMatch = (value == USEARCH_ON ? TRUE :
2858                                                                       FALSE);
2859             break;
2860         case USEARCH_ATTRIBUTE_COUNT :
2861         default:
2862             *status = U_ILLEGAL_ARGUMENT_ERROR;
2863         }
2864     }
2865     if (value == USEARCH_ATTRIBUTE_VALUE_COUNT) {
2866         *status = U_ILLEGAL_ARGUMENT_ERROR;
2867     }
2868 }
2869 
usearch_getAttribute(const UStringSearch * strsrch,USearchAttribute attribute)2870 U_CAPI USearchAttributeValue U_EXPORT2 usearch_getAttribute(
2871                                                 const UStringSearch *strsrch,
2872                                                 USearchAttribute attribute)
2873 {
2874     if (strsrch) {
2875         switch (attribute) {
2876         case USEARCH_OVERLAP :
2877             return (strsrch->search->isOverlap == TRUE ? USEARCH_ON :
2878                                                         USEARCH_OFF);
2879         case USEARCH_CANONICAL_MATCH :
2880             return (strsrch->search->isCanonicalMatch == TRUE ? USEARCH_ON :
2881                                                                USEARCH_OFF);
2882         case USEARCH_ATTRIBUTE_COUNT :
2883             return USEARCH_DEFAULT;
2884         }
2885     }
2886     return USEARCH_DEFAULT;
2887 }
2888 
usearch_getMatchedStart(const UStringSearch * strsrch)2889 U_CAPI int32_t U_EXPORT2 usearch_getMatchedStart(
2890                                                 const UStringSearch *strsrch)
2891 {
2892     if (strsrch == NULL) {
2893         return USEARCH_DONE;
2894     }
2895     return strsrch->search->matchedIndex;
2896 }
2897 
2898 
usearch_getMatchedText(const UStringSearch * strsrch,UChar * result,int32_t resultCapacity,UErrorCode * status)2899 U_CAPI int32_t U_EXPORT2 usearch_getMatchedText(const UStringSearch *strsrch,
2900                                             UChar         *result,
2901                                             int32_t        resultCapacity,
2902                                             UErrorCode    *status)
2903 {
2904     if (U_FAILURE(*status)) {
2905         return USEARCH_DONE;
2906     }
2907     if (strsrch == NULL || resultCapacity < 0 || (resultCapacity > 0 &&
2908         result == NULL)) {
2909         *status = U_ILLEGAL_ARGUMENT_ERROR;
2910         return USEARCH_DONE;
2911     }
2912 
2913     int32_t     copylength = strsrch->search->matchedLength;
2914     int32_t copyindex  = strsrch->search->matchedIndex;
2915     if (copyindex == USEARCH_DONE) {
2916         u_terminateUChars(result, resultCapacity, 0, status);
2917         return USEARCH_DONE;
2918     }
2919 
2920     if (resultCapacity < copylength) {
2921         copylength = resultCapacity;
2922     }
2923     if (copylength > 0) {
2924         uprv_memcpy(result, strsrch->search->text + copyindex,
2925                     copylength * sizeof(UChar));
2926     }
2927     return u_terminateUChars(result, resultCapacity,
2928                              strsrch->search->matchedLength, status);
2929 }
2930 
usearch_getMatchedLength(const UStringSearch * strsrch)2931 U_CAPI int32_t U_EXPORT2 usearch_getMatchedLength(
2932                                               const UStringSearch *strsrch)
2933 {
2934     if (strsrch) {
2935         return strsrch->search->matchedLength;
2936     }
2937     return USEARCH_DONE;
2938 }
2939 
2940 #if !UCONFIG_NO_BREAK_ITERATION
2941 
usearch_setBreakIterator(UStringSearch * strsrch,UBreakIterator * breakiter,UErrorCode * status)2942 U_CAPI void U_EXPORT2 usearch_setBreakIterator(UStringSearch  *strsrch,
2943                                                UBreakIterator *breakiter,
2944                                                UErrorCode     *status)
2945 {
2946     if (U_SUCCESS(*status) && strsrch) {
2947     	strsrch->search->breakIter = breakiter;
2948         if (breakiter) {
2949             ubrk_setText(breakiter, strsrch->search->text,
2950                          strsrch->search->textLength, status);
2951         }
2952     }
2953 }
2954 
2955 U_CAPI const UBreakIterator* U_EXPORT2
usearch_getBreakIterator(const UStringSearch * strsrch)2956 usearch_getBreakIterator(const UStringSearch *strsrch)
2957 {
2958     if (strsrch) {
2959         return strsrch->search->breakIter;
2960     }
2961     return NULL;
2962 }
2963 
2964 #endif
2965 
usearch_setText(UStringSearch * strsrch,const UChar * text,int32_t textlength,UErrorCode * status)2966 U_CAPI void U_EXPORT2 usearch_setText(      UStringSearch *strsrch,
2967                                       const UChar         *text,
2968                                             int32_t        textlength,
2969                                             UErrorCode    *status)
2970 {
2971     if (U_SUCCESS(*status)) {
2972         if (strsrch == NULL || text == NULL || textlength < -1 ||
2973             textlength == 0) {
2974             *status = U_ILLEGAL_ARGUMENT_ERROR;
2975         }
2976         else {
2977             if (textlength == -1) {
2978                 textlength = u_strlen(text);
2979             }
2980             strsrch->search->text       = text;
2981             strsrch->search->textLength = textlength;
2982             ucol_setText(strsrch->textIter, text, textlength, status);
2983             strsrch->search->matchedIndex  = USEARCH_DONE;
2984             strsrch->search->matchedLength = 0;
2985             strsrch->search->reset         = TRUE;
2986 #if !UCONFIG_NO_BREAK_ITERATION
2987             if (strsrch->search->breakIter != NULL) {
2988                 ubrk_setText(strsrch->search->breakIter, text,
2989                              textlength, status);
2990             }
2991             ubrk_setText(strsrch->search->internalBreakIter, text, textlength, status);
2992 #endif
2993         }
2994     }
2995 }
2996 
usearch_getText(const UStringSearch * strsrch,int32_t * length)2997 U_CAPI const UChar * U_EXPORT2 usearch_getText(const UStringSearch *strsrch,
2998                                                      int32_t       *length)
2999 {
3000     if (strsrch) {
3001         *length = strsrch->search->textLength;
3002         return strsrch->search->text;
3003     }
3004     return NULL;
3005 }
3006 
usearch_setCollator(UStringSearch * strsrch,const UCollator * collator,UErrorCode * status)3007 U_CAPI void U_EXPORT2 usearch_setCollator(      UStringSearch *strsrch,
3008                                           const UCollator     *collator,
3009                                                 UErrorCode    *status)
3010 {
3011     if (U_SUCCESS(*status)) {
3012         if (collator == NULL) {
3013             *status = U_ILLEGAL_ARGUMENT_ERROR;
3014             return;
3015         }
3016 
3017         if (strsrch) {
3018             if (strsrch->ownCollator && (strsrch->collator != collator)) {
3019                 ucol_close((UCollator *)strsrch->collator);
3020                 strsrch->ownCollator = FALSE;
3021             }
3022             strsrch->collator    = collator;
3023             strsrch->strength    = ucol_getStrength(collator);
3024             strsrch->ceMask      = getMask(strsrch->strength);
3025 #if !UCONFIG_NO_BREAK_ITERATION
3026         	ubrk_close(strsrch->search->internalBreakIter);
3027         	strsrch->search->internalBreakIter = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(collator, ULOC_VALID_LOCALE, status),
3028         											 strsrch->search->text, strsrch->search->textLength, status);
3029 #endif
3030             // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
3031             strsrch->toShift     =
3032                ucol_getAttribute(collator, UCOL_ALTERNATE_HANDLING, status) ==
3033                                                                 UCOL_SHIFTED;
3034             // if status is a failure, ucol_getVariableTop returns 0
3035             strsrch->variableTop = ucol_getVariableTop(collator, status);
3036             if (U_SUCCESS(*status)) {
3037                 initialize(strsrch, status);
3038                 if (U_SUCCESS(*status)) {
3039                     /* free offset buffer to avoid memory leak before initializing. */
3040                     ucol_freeOffsetBuffer(&(strsrch->textIter->iteratordata_));
3041                     uprv_init_collIterate(collator, strsrch->search->text,
3042                                           strsrch->search->textLength,
3043                                           &(strsrch->textIter->iteratordata_));
3044                     strsrch->utilIter->iteratordata_.coll = collator;
3045                 }
3046             }
3047         }
3048 
3049         // **** are these calls needed?
3050         // **** we call uprv_init_pce in initializePatternPCETable
3051         // **** and the CEBuffer constructor...
3052 #if 0
3053         uprv_init_pce(strsrch->textIter);
3054         uprv_init_pce(strsrch->utilIter);
3055 #endif
3056     }
3057 }
3058 
usearch_getCollator(const UStringSearch * strsrch)3059 U_CAPI UCollator * U_EXPORT2 usearch_getCollator(const UStringSearch *strsrch)
3060 {
3061     if (strsrch) {
3062         return (UCollator *)strsrch->collator;
3063     }
3064     return NULL;
3065 }
3066 
usearch_setPattern(UStringSearch * strsrch,const UChar * pattern,int32_t patternlength,UErrorCode * status)3067 U_CAPI void U_EXPORT2 usearch_setPattern(      UStringSearch *strsrch,
3068                                          const UChar         *pattern,
3069                                                int32_t        patternlength,
3070                                                UErrorCode    *status)
3071 {
3072     if (U_SUCCESS(*status)) {
3073         if (strsrch == NULL || pattern == NULL) {
3074             *status = U_ILLEGAL_ARGUMENT_ERROR;
3075         }
3076         else {
3077             if (patternlength == -1) {
3078                 patternlength = u_strlen(pattern);
3079             }
3080             if (patternlength == 0) {
3081                 *status = U_ILLEGAL_ARGUMENT_ERROR;
3082                 return;
3083             }
3084             strsrch->pattern.text       = pattern;
3085             strsrch->pattern.textLength = patternlength;
3086             initialize(strsrch, status);
3087         }
3088     }
3089 }
3090 
3091 U_CAPI const UChar* U_EXPORT2
usearch_getPattern(const UStringSearch * strsrch,int32_t * length)3092 usearch_getPattern(const UStringSearch *strsrch,
3093                    int32_t       *length)
3094 {
3095     if (strsrch) {
3096         *length = strsrch->pattern.textLength;
3097         return strsrch->pattern.text;
3098     }
3099     return NULL;
3100 }
3101 
3102 // miscellanous methods --------------------------------------------------
3103 
usearch_first(UStringSearch * strsrch,UErrorCode * status)3104 U_CAPI int32_t U_EXPORT2 usearch_first(UStringSearch *strsrch,
3105                                            UErrorCode    *status)
3106 {
3107     if (strsrch && U_SUCCESS(*status)) {
3108         strsrch->search->isForwardSearching = TRUE;
3109         usearch_setOffset(strsrch, 0, status);
3110         if (U_SUCCESS(*status)) {
3111             return usearch_next(strsrch, status);
3112         }
3113     }
3114     return USEARCH_DONE;
3115 }
3116 
usearch_following(UStringSearch * strsrch,int32_t position,UErrorCode * status)3117 U_CAPI int32_t U_EXPORT2 usearch_following(UStringSearch *strsrch,
3118                                                int32_t    position,
3119                                                UErrorCode    *status)
3120 {
3121     if (strsrch && U_SUCCESS(*status)) {
3122         strsrch->search->isForwardSearching = TRUE;
3123         // position checked in usearch_setOffset
3124         usearch_setOffset(strsrch, position, status);
3125         if (U_SUCCESS(*status)) {
3126             return usearch_next(strsrch, status);
3127         }
3128     }
3129     return USEARCH_DONE;
3130 }
3131 
usearch_last(UStringSearch * strsrch,UErrorCode * status)3132 U_CAPI int32_t U_EXPORT2 usearch_last(UStringSearch *strsrch,
3133                                           UErrorCode    *status)
3134 {
3135     if (strsrch && U_SUCCESS(*status)) {
3136         strsrch->search->isForwardSearching = FALSE;
3137         usearch_setOffset(strsrch, strsrch->search->textLength, status);
3138         if (U_SUCCESS(*status)) {
3139             return usearch_previous(strsrch, status);
3140         }
3141     }
3142     return USEARCH_DONE;
3143 }
3144 
usearch_preceding(UStringSearch * strsrch,int32_t position,UErrorCode * status)3145 U_CAPI int32_t U_EXPORT2 usearch_preceding(UStringSearch *strsrch,
3146                                                int32_t    position,
3147                                                UErrorCode    *status)
3148 {
3149     if (strsrch && U_SUCCESS(*status)) {
3150         strsrch->search->isForwardSearching = FALSE;
3151         // position checked in usearch_setOffset
3152         usearch_setOffset(strsrch, position, status);
3153         if (U_SUCCESS(*status)) {
3154             return usearch_previous(strsrch, status);
3155         }
3156     }
3157     return USEARCH_DONE;
3158 }
3159 
3160 /**
3161 * If a direction switch is required, we'll count the number of ces till the
3162 * beginning of the collation element iterator and iterate forwards that
3163 * number of times. This is so that we get to the correct point within the
3164 * string to continue the search in. Imagine when we are in the middle of the
3165 * normalization buffer when the change in direction is request. arrrgghh....
3166 * After searching the offset within the collation element iterator will be
3167 * shifted to the start of the match. If a match is not found, the offset would
3168 * have been set to the end of the text string in the collation element
3169 * iterator.
3170 * Okay, here's my take on normalization buffer. The only time when there can
3171 * be 2 matches within the same normalization is when the pattern is consists
3172 * of all accents. But since the offset returned is from the text string, we
3173 * should not confuse the caller by returning the second match within the
3174 * same normalization buffer. If we do, the 2 results will have the same match
3175 * offsets, and that'll be confusing. I'll return the next match that doesn't
3176 * fall within the same normalization buffer. Note this does not affect the
3177 * results of matches spanning the text and the normalization buffer.
3178 * The position to start searching is taken from the collation element
3179 * iterator. Callers of this API would have to set the offset in the collation
3180 * element iterator before using this method.
3181 */
usearch_next(UStringSearch * strsrch,UErrorCode * status)3182 U_CAPI int32_t U_EXPORT2 usearch_next(UStringSearch *strsrch,
3183                                           UErrorCode    *status)
3184 {
3185     if (U_SUCCESS(*status) && strsrch) {
3186         // note offset is either equivalent to the start of the previous match
3187         // or is set by the user
3188         int32_t      offset       = usearch_getOffset(strsrch);
3189         USearch     *search       = strsrch->search;
3190         search->reset             = FALSE;
3191         int32_t      textlength   = search->textLength;
3192         if (search->isForwardSearching) {
3193 #if BOYER_MOORE
3194             if (offset == textlength
3195                 || (!search->isOverlap &&
3196                     (offset + strsrch->pattern.defaultShiftSize > textlength ||
3197                     (search->matchedIndex != USEARCH_DONE &&
3198                      offset + search->matchedLength >= textlength)))) {
3199                 // not enough characters to match
3200                 setMatchNotFound(strsrch);
3201                 return USEARCH_DONE;
3202             }
3203 #else
3204             if (offset == textlength ||
3205                 (! search->isOverlap &&
3206                 (search->matchedIndex != USEARCH_DONE &&
3207                 offset + search->matchedLength > textlength))) {
3208                     // not enough characters to match
3209                     setMatchNotFound(strsrch);
3210                     return USEARCH_DONE;
3211             }
3212 #endif
3213         }
3214         else {
3215             // switching direction.
3216             // if matchedIndex == USEARCH_DONE, it means that either a
3217             // setOffset has been called or that previous ran off the text
3218             // string. the iterator would have been set to offset 0 if a
3219             // match is not found.
3220             search->isForwardSearching = TRUE;
3221             if (search->matchedIndex != USEARCH_DONE) {
3222                 // there's no need to set the collation element iterator
3223                 // the next call to next will set the offset.
3224                 return search->matchedIndex;
3225             }
3226         }
3227 
3228         if (U_SUCCESS(*status)) {
3229             if (strsrch->pattern.CELength == 0) {
3230                 if (search->matchedIndex == USEARCH_DONE) {
3231                     search->matchedIndex = offset;
3232                 }
3233                 else { // moves by codepoints
3234                     UTF_FWD_1(search->text, search->matchedIndex, textlength);
3235                 }
3236 
3237                 search->matchedLength = 0;
3238                 setColEIterOffset(strsrch->textIter, search->matchedIndex);
3239                 // status checked below
3240                 if (search->matchedIndex == textlength) {
3241                     search->matchedIndex = USEARCH_DONE;
3242                 }
3243             }
3244             else {
3245                 if (search->matchedLength > 0) {
3246                     // if matchlength is 0 we are at the start of the iteration
3247                     if (search->isOverlap) {
3248                         ucol_setOffset(strsrch->textIter, offset + 1, status);
3249                     }
3250                     else {
3251                         ucol_setOffset(strsrch->textIter,
3252                                        offset + search->matchedLength, status);
3253                     }
3254                 }
3255                 else {
3256                     // for boundary check purposes. this will ensure that the
3257                     // next match will not preceed the current offset
3258                     // note search->matchedIndex will always be set to something
3259                     // in the code
3260                     search->matchedIndex = offset - 1;
3261                 }
3262 
3263                 if (search->isCanonicalMatch) {
3264                     // can't use exact here since extra accents are allowed.
3265                     usearch_handleNextCanonical(strsrch, status);
3266                 }
3267                 else {
3268                     usearch_handleNextExact(strsrch, status);
3269                 }
3270             }
3271 
3272             if (U_FAILURE(*status)) {
3273                 return USEARCH_DONE;
3274             }
3275 
3276 #if !BOYER_MOORE
3277             if (search->matchedIndex == USEARCH_DONE) {
3278                 ucol_setOffset(strsrch->textIter, search->textLength, status);
3279             } else {
3280                 ucol_setOffset(strsrch->textIter, search->matchedIndex, status);
3281             }
3282 #endif
3283 
3284             return search->matchedIndex;
3285         }
3286     }
3287     return USEARCH_DONE;
3288 }
3289 
usearch_previous(UStringSearch * strsrch,UErrorCode * status)3290 U_CAPI int32_t U_EXPORT2 usearch_previous(UStringSearch *strsrch,
3291                                               UErrorCode *status)
3292 {
3293     if (U_SUCCESS(*status) && strsrch) {
3294         int32_t offset;
3295         USearch *search = strsrch->search;
3296         if (search->reset) {
3297             offset                     = search->textLength;
3298             search->isForwardSearching = FALSE;
3299             search->reset              = FALSE;
3300             setColEIterOffset(strsrch->textIter, offset);
3301         }
3302         else {
3303             offset = usearch_getOffset(strsrch);
3304         }
3305 
3306         int32_t matchedindex = search->matchedIndex;
3307         if (search->isForwardSearching == TRUE) {
3308             // switching direction.
3309             // if matchedIndex == USEARCH_DONE, it means that either a
3310             // setOffset has been called or that next ran off the text
3311             // string. the iterator would have been set to offset textLength if
3312             // a match is not found.
3313             search->isForwardSearching = FALSE;
3314             if (matchedindex != USEARCH_DONE) {
3315                 return matchedindex;
3316             }
3317         }
3318         else {
3319 #if BOYER_MOORE
3320             if (offset == 0 || matchedindex == 0 ||
3321                 (!search->isOverlap &&
3322                     (offset < strsrch->pattern.defaultShiftSize ||
3323                     (matchedindex != USEARCH_DONE &&
3324                     matchedindex < strsrch->pattern.defaultShiftSize)))) {
3325                 // not enough characters to match
3326                 setMatchNotFound(strsrch);
3327                 return USEARCH_DONE;
3328             }
3329 #else
3330             // Could check pattern length, but the
3331             // linear search will do the right thing
3332             if (offset == 0 || matchedindex == 0) {
3333                 setMatchNotFound(strsrch);
3334                 return USEARCH_DONE;
3335             }
3336 #endif
3337         }
3338 
3339         if (U_SUCCESS(*status)) {
3340             if (strsrch->pattern.CELength == 0) {
3341                 search->matchedIndex =
3342                       (matchedindex == USEARCH_DONE ? offset : matchedindex);
3343                 if (search->matchedIndex == 0) {
3344                     setMatchNotFound(strsrch);
3345                     // status checked below
3346                 }
3347                 else { // move by codepoints
3348                     UTF_BACK_1(search->text, 0, search->matchedIndex);
3349                     setColEIterOffset(strsrch->textIter, search->matchedIndex);
3350                     // status checked below
3351                     search->matchedLength = 0;
3352                 }
3353             }
3354             else {
3355 #if !BOYER_MOORE
3356                 if (search->matchedIndex != USEARCH_DONE) {
3357                     if (search->isOverlap) {
3358                         ucol_setOffset(strsrch->textIter, search->matchedIndex + search->matchedLength - 2, status);
3359                     }
3360                 }
3361 #endif
3362 
3363                 if (strsrch->search->isCanonicalMatch) {
3364                     // can't use exact here since extra accents are allowed.
3365                     usearch_handlePreviousCanonical(strsrch, status);
3366                     // status checked below
3367                 }
3368                 else {
3369                     usearch_handlePreviousExact(strsrch, status);
3370                     // status checked below
3371                 }
3372             }
3373 
3374             if (U_FAILURE(*status)) {
3375                 return USEARCH_DONE;
3376             }
3377 
3378             return search->matchedIndex;
3379         }
3380     }
3381     return USEARCH_DONE;
3382 }
3383 
3384 
3385 
usearch_reset(UStringSearch * strsrch)3386 U_CAPI void U_EXPORT2 usearch_reset(UStringSearch *strsrch)
3387 {
3388     /*
3389     reset is setting the attributes that are already in
3390     string search, hence all attributes in the collator should
3391     be retrieved without any problems
3392     */
3393     if (strsrch) {
3394         UErrorCode status            = U_ZERO_ERROR;
3395         UBool      sameCollAttribute = TRUE;
3396         uint32_t   ceMask;
3397         UBool      shift;
3398         uint32_t   varTop;
3399 
3400         // **** hack to deal w/ how processed CEs encode quaternary ****
3401         UCollationStrength newStrength = ucol_getStrength(strsrch->collator);
3402         if ((strsrch->strength < UCOL_QUATERNARY && newStrength >= UCOL_QUATERNARY) ||
3403             (strsrch->strength >= UCOL_QUATERNARY && newStrength < UCOL_QUATERNARY)) {
3404                 sameCollAttribute = FALSE;
3405         }
3406 
3407         strsrch->strength    = ucol_getStrength(strsrch->collator);
3408         ceMask = getMask(strsrch->strength);
3409         if (strsrch->ceMask != ceMask) {
3410             strsrch->ceMask = ceMask;
3411             sameCollAttribute = FALSE;
3412         }
3413 
3414         // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
3415         shift = ucol_getAttribute(strsrch->collator, UCOL_ALTERNATE_HANDLING,
3416                                   &status) == UCOL_SHIFTED;
3417         if (strsrch->toShift != shift) {
3418             strsrch->toShift  = shift;
3419             sameCollAttribute = FALSE;
3420         }
3421 
3422         // if status is a failure, ucol_getVariableTop returns 0
3423         varTop = ucol_getVariableTop(strsrch->collator, &status);
3424         if (strsrch->variableTop != varTop) {
3425             strsrch->variableTop = varTop;
3426             sameCollAttribute    = FALSE;
3427         }
3428         if (!sameCollAttribute) {
3429             initialize(strsrch, &status);
3430         }
3431         /* free offset buffer to avoid memory leak before initializing. */
3432         ucol_freeOffsetBuffer(&(strsrch->textIter->iteratordata_));
3433         uprv_init_collIterate(strsrch->collator, strsrch->search->text,
3434                               strsrch->search->textLength,
3435                               &(strsrch->textIter->iteratordata_));
3436         strsrch->search->matchedLength      = 0;
3437         strsrch->search->matchedIndex       = USEARCH_DONE;
3438         strsrch->search->isOverlap          = FALSE;
3439         strsrch->search->isCanonicalMatch   = FALSE;
3440         strsrch->search->isForwardSearching = TRUE;
3441         strsrch->search->reset              = TRUE;
3442     }
3443 }
3444 
3445 //
3446 //  CEI  Collation Element + source text index.
3447 //       These structs are kept in the circular buffer.
3448 //
3449 struct  CEI {
3450     int64_t ce;
3451     int32_t lowIndex;
3452     int32_t highIndex;
3453 };
3454 
3455 U_NAMESPACE_BEGIN
3456 
3457 
3458 //
3459 //  CEBuffer   A circular buffer of CEs from the text being searched.
3460 //
3461 #define   DEFAULT_CEBUFFER_SIZE 50
3462 struct CEBuffer {
3463     CEI                  defBuf[DEFAULT_CEBUFFER_SIZE];
3464     CEI                 *buf;
3465     int32_t              bufSize;
3466     int32_t              firstIx;
3467     int32_t              limitIx;
3468     UCollationElements  *ceIter;
3469     UStringSearch       *strSearch;
3470 
3471 
3472 
3473                CEBuffer(UStringSearch *ss, UErrorCode *status);
3474                ~CEBuffer();
3475    const CEI   *get(int32_t index);
3476    const CEI   *getPrevious(int32_t index);
3477 };
3478 
3479 
CEBuffer(UStringSearch * ss,UErrorCode * status)3480 CEBuffer::CEBuffer(UStringSearch *ss, UErrorCode *status) {
3481     buf = defBuf;
3482     strSearch = ss;
3483     bufSize = ss->pattern.CELength+10;
3484     ceIter    = ss->textIter;
3485     firstIx = 0;
3486     limitIx = 0;
3487 
3488     uprv_init_pce(ceIter);
3489 
3490     if (bufSize>DEFAULT_CEBUFFER_SIZE) {
3491         buf = (CEI *)uprv_malloc(bufSize * sizeof(CEI));
3492         if (buf == NULL) {
3493             *status = U_MEMORY_ALLOCATION_ERROR;
3494         }
3495     }
3496 }
3497 
3498 // TODO: add a reset or init function so that allocated
3499 //       buffers can be retained & reused.
3500 
~CEBuffer()3501 CEBuffer::~CEBuffer() {
3502     if (buf != defBuf) {
3503         uprv_free(buf);
3504     }
3505 }
3506 
3507 
3508 // Get the CE with the specified index.
3509 //   Index must be in the range
3510 //          n-history_size < index < n+1
3511 //   where n is the largest index to have been fetched by some previous call to this function.
3512 //   The CE value will be UCOL__PROCESSED_NULLORDER at end of input.
3513 //
get(int32_t index)3514 const CEI *CEBuffer::get(int32_t index) {
3515     int i = index % bufSize;
3516 
3517     if (index>=firstIx && index<limitIx) {
3518         // The request was for an entry already in our buffer.
3519         //  Just return it.
3520         return &buf[i];
3521     }
3522 
3523     // Caller is requesting a new, never accessed before, CE.
3524     //   Verify that it is the next one in sequence, which is all
3525     //   that is allowed.
3526     if (index != limitIx) {
3527         U_ASSERT(FALSE);
3528 
3529         return NULL;
3530     }
3531 
3532     // Manage the circular CE buffer indexing
3533     limitIx++;
3534 
3535     if (limitIx - firstIx >= bufSize) {
3536         // The buffer is full, knock out the lowest-indexed entry.
3537         firstIx++;
3538     }
3539 
3540     UErrorCode status = U_ZERO_ERROR;
3541 
3542     buf[i].ce = ucol_nextProcessed(ceIter, &buf[i].lowIndex, &buf[i].highIndex, &status);
3543 
3544     return &buf[i];
3545 }
3546 
3547 // Get the CE with the specified index.
3548 //   Index must be in the range
3549 //          n-history_size < index < n+1
3550 //   where n is the largest index to have been fetched by some previous call to this function.
3551 //   The CE value will be UCOL__PROCESSED_NULLORDER at end of input.
3552 //
getPrevious(int32_t index)3553 const CEI *CEBuffer::getPrevious(int32_t index) {
3554     int i = index % bufSize;
3555 
3556     if (index>=firstIx && index<limitIx) {
3557         // The request was for an entry already in our buffer.
3558         //  Just return it.
3559         return &buf[i];
3560     }
3561 
3562     // Caller is requesting a new, never accessed before, CE.
3563     //   Verify that it is the next one in sequence, which is all
3564     //   that is allowed.
3565     if (index != limitIx) {
3566         U_ASSERT(FALSE);
3567 
3568         return NULL;
3569     }
3570 
3571     // Manage the circular CE buffer indexing
3572     limitIx++;
3573 
3574     if (limitIx - firstIx >= bufSize) {
3575         // The buffer is full, knock out the lowest-indexed entry.
3576         firstIx++;
3577     }
3578 
3579     UErrorCode status = U_ZERO_ERROR;
3580 
3581     buf[i].ce = ucol_previousProcessed(ceIter, &buf[i].lowIndex, &buf[i].highIndex, &status);
3582 
3583     return &buf[i];
3584 }
3585 
3586 U_NAMESPACE_END
3587 
3588 
3589 // #define USEARCH_DEBUG
3590 
3591 #ifdef USEARCH_DEBUG
3592 #include <stdio.h>
3593 #include <stdlib.h>
3594 #endif
3595 
3596 /*
3597  * Find the next break boundary after startIndex. If the UStringSearch object
3598  * has an external break iterator, use that. Otherwise use the internal character
3599  * break iterator.
3600  */
nextBoundaryAfter(UStringSearch * strsrch,int32_t startIndex)3601 static int32_t nextBoundaryAfter(UStringSearch *strsrch, int32_t startIndex) {
3602 #if 0
3603     const UChar *text = strsrch->search->text;
3604     int32_t textLen   = strsrch->search->textLength;
3605 
3606     U_ASSERT(startIndex>=0);
3607     U_ASSERT(startIndex<=textLen);
3608 
3609     if (startIndex >= textLen) {
3610         return startIndex;
3611     }
3612 
3613     UChar32  c;
3614     int32_t  i = startIndex;
3615     U16_NEXT(text, i, textLen, c);
3616 
3617     // If we are on a control character, stop without looking for combining marks.
3618     //    Control characters do not combine.
3619     int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
3620     if (gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR) {
3621         return i;
3622     }
3623 
3624     // The initial character was not a control, and can thus accept trailing
3625     //   combining characters.  Advance over however many of them there are.
3626     int32_t  indexOfLastCharChecked;
3627     for (;;) {
3628         indexOfLastCharChecked = i;
3629         if (i>=textLen) {
3630             break;
3631         }
3632         U16_NEXT(text, i, textLen, c);
3633         gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
3634         if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) {
3635             break;
3636         }
3637     }
3638     return indexOfLastCharChecked;
3639 #elif !UCONFIG_NO_BREAK_ITERATION
3640     UBreakIterator *breakiterator = strsrch->search->breakIter;
3641 
3642     if (breakiterator == NULL) {
3643         breakiterator = strsrch->search->internalBreakIter;
3644     }
3645 
3646     if (breakiterator != NULL) {
3647     	return ubrk_following(breakiterator, startIndex);
3648     }
3649 
3650     return startIndex;
3651 #else
3652     // **** or should we use the original code? ****
3653     return startIndex;
3654 #endif
3655 
3656 }
3657 
3658 /*
3659  * Returns TRUE if index is on a break boundary. If the UStringSearch
3660  * has an external break iterator, test using that, otherwise test
3661  * using the internal character break iterator.
3662  */
isBreakBoundary(UStringSearch * strsrch,int32_t index)3663 static UBool isBreakBoundary(UStringSearch *strsrch, int32_t index) {
3664 #if 0
3665     const UChar *text = strsrch->search->text;
3666     int32_t textLen   = strsrch->search->textLength;
3667 
3668     U_ASSERT(index>=0);
3669     U_ASSERT(index<=textLen);
3670 
3671     if (index>=textLen || index<=0) {
3672         return FALSE;
3673     }
3674 
3675     // If the character at the current index is not a GRAPHEME_EXTEND
3676     //    then we can not be within a combining sequence.
3677     UChar32  c;
3678     U16_GET(text, 0, index, textLen, c);
3679     int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
3680     if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) {
3681         return FALSE;
3682     }
3683 
3684     // We are at a combining mark.  If the preceding character is anything
3685     //   except a CONTROL, CR or LF, we are in a combining sequence.
3686     U16_PREV(text, 0, index, c);
3687     gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
3688     UBool combining =  !(gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR);
3689     return combining;
3690 #elif !UCONFIG_NO_BREAK_ITERATION
3691     UBreakIterator *breakiterator = strsrch->search->breakIter;
3692 
3693     if (breakiterator == NULL) {
3694         breakiterator = strsrch->search->internalBreakIter;
3695     }
3696 
3697     return (breakiterator != NULL && ! ubrk_isBoundary(breakiterator, index));
3698 #else
3699     // **** or use the original code? ****
3700     return FALSE;
3701 #endif
3702 }
3703 
3704 #if 0
3705 static UBool onBreakBoundaries(const UStringSearch *strsrch, int32_t start, int32_t end)
3706 {
3707 #if !UCONFIG_NO_BREAK_ITERATION
3708     UBreakIterator *breakiterator = strsrch->search->breakIter;
3709 
3710     if (breakiterator != NULL) {
3711         int32_t startindex = ubrk_first(breakiterator);
3712         int32_t endindex   = ubrk_last(breakiterator);
3713 
3714         // out-of-range indexes are never boundary positions
3715         if (start < startindex || start > endindex ||
3716             end < startindex || end > endindex) {
3717             return FALSE;
3718         }
3719 
3720         return ubrk_isBoundary(breakiterator, start) &&
3721                ubrk_isBoundary(breakiterator, end);
3722     }
3723 #endif
3724 
3725     return TRUE;
3726 }
3727 #endif
3728 
3729 
usearch_search(UStringSearch * strsrch,int32_t startIdx,int32_t * matchStart,int32_t * matchLimit,UErrorCode * status)3730 U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch  *strsrch,
3731                                        int32_t        startIdx,
3732                                        int32_t        *matchStart,
3733                                        int32_t        *matchLimit,
3734                                        UErrorCode     *status)
3735 {
3736     if (U_FAILURE(*status)) {
3737         return FALSE;
3738     }
3739 
3740     // TODO:  reject search patterns beginning with a combining char.
3741 
3742 #ifdef USEARCH_DEBUG
3743     if (getenv("USEARCH_DEBUG") != NULL) {
3744         printf("Pattern CEs\n");
3745         for (int ii=0; ii<strsrch->pattern.CELength; ii++) {
3746             printf(" %8x", strsrch->pattern.CE[ii]);
3747         }
3748         printf("\n");
3749     }
3750 
3751 #endif
3752     // Input parameter sanity check.
3753     //  TODO:  should input indicies clip to the text length
3754     //         in the same way that UText does.
3755     if(strsrch->pattern.CELength == 0         ||
3756        startIdx < 0                           ||
3757        startIdx > strsrch->search->textLength ||
3758        strsrch->pattern.CE == NULL) {
3759            *status = U_ILLEGAL_ARGUMENT_ERROR;
3760            return FALSE;
3761     }
3762 
3763     if (strsrch->pattern.PCE == NULL) {
3764         initializePatternPCETable(strsrch, status);
3765     }
3766 
3767     ucol_setOffset(strsrch->textIter, startIdx, status);
3768     CEBuffer ceb(strsrch, status);
3769 
3770 
3771     int32_t    targetIx = 0;
3772     const CEI *targetCEI;
3773     int32_t    patIx;
3774     UBool      found;
3775 
3776     int32_t  mStart = -1;
3777     int32_t  mLimit = -1;
3778     int32_t  minLimit;
3779     int32_t  maxLimit;
3780 
3781 
3782 
3783     // Outer loop moves over match starting positions in the
3784     //      target CE space.
3785     for(targetIx=0; ; targetIx++)
3786     {
3787         found = TRUE;
3788         //  Inner loop checks for a match beginning at each
3789         //  position from the outer loop.
3790         for (patIx=0; patIx<strsrch->pattern.PCELength; patIx++) {
3791             int64_t patCE = strsrch->pattern.PCE[patIx];
3792             targetCEI = ceb.get(targetIx+patIx);
3793             //  Compare CE from target string with CE from the pattern.
3794             //    Note that the target CE will be UCOL_NULLORDER if we reach the end of input,
3795             //    which will fail the compare, below.
3796             if (targetCEI->ce != patCE) {
3797                 found = FALSE;
3798                 break;
3799             }
3800         }
3801 
3802         if (!found && targetCEI->ce != UCOL_PROCESSED_NULLORDER) {
3803             // No match at this targetIx.  Try again at the next.
3804             continue;
3805         }
3806 
3807         if (!found) {
3808             // No match at all, we have run off the end of the target text.
3809             break;
3810         }
3811 
3812 
3813         // We have found a match in CE space.
3814         // Now determine the bounds in string index space.
3815         //  There still is a chance of match failure if the CE range not correspond to
3816         //     an acceptable character range.
3817         //
3818         const CEI *firstCEI = ceb.get(targetIx);
3819         const CEI *lastCEI  = ceb.get(targetIx + strsrch->pattern.PCELength - 1);
3820         const CEI *nextCEI  = ceb.get(targetIx + strsrch->pattern.PCELength);
3821 
3822         mStart   = firstCEI->lowIndex;
3823         minLimit = lastCEI->lowIndex;
3824         maxLimit = nextCEI->lowIndex;
3825 
3826         // Look at the CE following the match.  If it is UCOL_NULLORDER the match
3827         //   extended to the end of input, and the match is good.
3828 
3829         // Look at the high and low indices of the CE following the match. If
3830         // they are the same it means one of two things:
3831         //    1. The match extended to the last CE from the target text, which is OK, or
3832         //    2. The last CE that was part of the match is in an expansion that extends
3833         //       to the first CE after the match. In this case, we reject the match.
3834         if (nextCEI->lowIndex == nextCEI->highIndex && nextCEI->ce != UCOL_PROCESSED_NULLORDER) {
3835             found = FALSE;
3836         }
3837 
3838 
3839         // Check for the start of the match being within a combining sequence.
3840         //   This can happen if the pattern itself begins with a combining char, and
3841         //   the match found combining marks in the target text that were attached
3842         //    to something else.
3843         //   This type of match should be rejected for not completely consuming a
3844         //   combining sequence.
3845         if (isBreakBoundary(strsrch, mStart)) {
3846             found = FALSE;
3847         }
3848 
3849         // Check for the start of the match being within an Collation Element Expansion,
3850         //   meaning that the first char of the match is only partially matched.
3851         //   With exapnsions, the first CE will report the index of the source
3852         //   character, and all subsequent (expansions) CEs will report the source index of the
3853         //    _following_ character.
3854         int32_t secondIx = firstCEI->highIndex;
3855         if (mStart == secondIx) {
3856             found = FALSE;
3857         }
3858 
3859         //  Advance the match end position to the first acceptable match boundary.
3860         //    This advances the index over any combining charcters.
3861         mLimit = maxLimit;
3862         if (minLimit < maxLimit) {
3863             int32_t nba = nextBoundaryAfter(strsrch, minLimit);
3864 
3865             if (nba >= lastCEI->highIndex) {
3866                 mLimit = nba;
3867             }
3868         }
3869 
3870     #ifdef USEARCH_DEBUG
3871         if (getenv("USEARCH_DEBUG") != NULL) {
3872             printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit, maxLimit, mLimit);
3873         }
3874     #endif
3875 
3876         // If advancing to the end of a combining sequence in character indexing space
3877         //   advanced us beyond the end of the match in CE space, reject this match.
3878         if (mLimit > maxLimit) {
3879             found = FALSE;
3880         }
3881 
3882         if (isBreakBoundary(strsrch, mLimit)) {
3883             found = FALSE;
3884         }
3885 
3886         if (! checkIdentical(strsrch, mStart, mLimit)) {
3887             found = FALSE;
3888         }
3889 
3890         if (found) {
3891             break;
3892         }
3893     }
3894 
3895     #ifdef USEARCH_DEBUG
3896     if (getenv("USEARCH_DEBUG") != NULL) {
3897         printf("Target CEs [%d .. %d]\n", ceb.firstIx, ceb.limitIx);
3898         int32_t  lastToPrint = ceb.limitIx+2;
3899         for (int ii=ceb.firstIx; ii<lastToPrint; ii++) {
3900             printf("%8x@%d ", ceb.get(ii)->ce, ceb.get(ii)->srcIndex);
3901         }
3902         printf("\n%s\n", found? "match found" : "no match");
3903     }
3904     #endif
3905 
3906     // All Done.  Store back the match bounds to the caller.
3907     //
3908     if (found==FALSE) {
3909         mLimit = -1;
3910         mStart = -1;
3911     }
3912 
3913     if (matchStart != NULL) {
3914         *matchStart= mStart;
3915     }
3916 
3917     if (matchLimit != NULL) {
3918         *matchLimit = mLimit;
3919     }
3920 
3921     return found;
3922 }
3923 
3924 
usearch_searchBackwards(UStringSearch * strsrch,int32_t startIdx,int32_t * matchStart,int32_t * matchLimit,UErrorCode * status)3925 U_CAPI UBool U_EXPORT2 usearch_searchBackwards(UStringSearch  *strsrch,
3926                                                 int32_t        startIdx,
3927                                                 int32_t        *matchStart,
3928                                                 int32_t        *matchLimit,
3929                                                 UErrorCode     *status)
3930 {
3931     if (U_FAILURE(*status)) {
3932         return FALSE;
3933     }
3934 
3935     // TODO:  reject search patterns beginning with a combining char.
3936 
3937 #ifdef USEARCH_DEBUG
3938     if (getenv("USEARCH_DEBUG") != NULL) {
3939         printf("Pattern CEs\n");
3940         for (int ii=0; ii<strsrch->pattern.CELength; ii++) {
3941             printf(" %8x", strsrch->pattern.CE[ii]);
3942         }
3943         printf("\n");
3944     }
3945 
3946 #endif
3947     // Input parameter sanity check.
3948     //  TODO:  should input indicies clip to the text length
3949     //         in the same way that UText does.
3950     if(strsrch->pattern.CELength == 0         ||
3951        startIdx < 0                           ||
3952        startIdx > strsrch->search->textLength ||
3953        strsrch->pattern.CE == NULL) {
3954            *status = U_ILLEGAL_ARGUMENT_ERROR;
3955            return FALSE;
3956     }
3957 
3958     if (strsrch->pattern.PCE == NULL) {
3959         initializePatternPCETable(strsrch, status);
3960     }
3961 
3962     CEBuffer ceb(strsrch, status);
3963     int32_t    targetIx = 0;
3964 
3965     /*
3966      * Pre-load the buffer with the CE's for the grapheme
3967      * after our starting position so that we're sure that
3968      * we can look at the CE following the match when we
3969      * check the match boundaries.
3970      *
3971      * This will also pre-fetch the first CE that we'll
3972      * consider for the match.
3973      */
3974     if (startIdx < strsrch->search->textLength) {
3975         UBreakIterator *bi = strsrch->search->internalBreakIter;
3976         int32_t next = ubrk_following(bi, startIdx);
3977 
3978         ucol_setOffset(strsrch->textIter, next, status);
3979 
3980         for (targetIx = 0; ; targetIx += 1) {
3981             if (ceb.getPrevious(targetIx)->lowIndex < startIdx) {
3982                 break;
3983             }
3984         }
3985     } else {
3986         ucol_setOffset(strsrch->textIter, startIdx, status);
3987     }
3988 
3989 
3990    const CEI  *targetCEI;
3991     int32_t    patIx;
3992     UBool      found;
3993 
3994     int32_t  limitIx = targetIx;
3995     int32_t  mStart = -1;
3996     int32_t  mLimit = -1;
3997     int32_t  minLimit;
3998     int32_t  maxLimit;
3999 
4000 
4001 
4002     // Outer loop moves over match starting positions in the
4003     //      target CE space.
4004     for(targetIx = limitIx; ; targetIx += 1)
4005     {
4006         found = TRUE;
4007         //  Inner loop checks for a match beginning at each
4008         //  position from the outer loop.
4009         for (patIx = strsrch->pattern.PCELength - 1; patIx >= 0; patIx -= 1) {
4010             int64_t patCE = strsrch->pattern.PCE[patIx];
4011 
4012             targetCEI = ceb.getPrevious(targetIx + strsrch->pattern.PCELength - 1 - patIx);
4013             //  Compare CE from target string with CE from the pattern.
4014             //    Note that the target CE will be UCOL_NULLORDER if we reach the end of input,
4015             //    which will fail the compare, below.
4016             if (targetCEI->ce != patCE) {
4017                 found = FALSE;
4018                 break;
4019             }
4020         }
4021 
4022         if (!found && targetCEI->ce != UCOL_PROCESSED_NULLORDER) {
4023             // No match at this targetIx.  Try again at the next.
4024             continue;
4025         }
4026 
4027         if (!found) {
4028             // No match at all, we have run off the end of the target text.
4029             break;
4030         }
4031 
4032 
4033         // We have found a match in CE space.
4034         // Now determine the bounds in string index space.
4035         //  There still is a chance of match failure if the CE range not correspond to
4036         //     an acceptable character range.
4037         //
4038         const CEI *firstCEI = ceb.getPrevious(targetIx + strsrch->pattern.PCELength - 1);
4039         const CEI *lastCEI  = ceb.getPrevious(targetIx);
4040         const CEI *nextCEI  = targetIx > 0? ceb.getPrevious(targetIx - 1) : NULL;
4041 
4042         mStart   = firstCEI->lowIndex;
4043         minLimit = lastCEI->lowIndex;
4044         maxLimit = targetIx > 0? nextCEI->lowIndex : lastCEI->highIndex;
4045 
4046         // Look at the CE following the match.  If it is UCOL_NULLORDER the match
4047         //   extended to the end of input, and the match is good.
4048 
4049         // Look at the high and low indices of the CE following the match. If
4050         // they are the same it means one of two things:
4051         //    1. The match extended to the last CE from the target text, which is OK, or
4052         //    2. The last CE that was part of the match is in an expansion that extends
4053         //       to the first CE after the match. In this case, we reject the match.
4054         if (targetIx >= 1) {
4055             if (nextCEI->lowIndex == nextCEI->highIndex && nextCEI->ce != UCOL_PROCESSED_NULLORDER) {
4056                 found = FALSE;
4057             }
4058         }
4059 
4060 
4061         // Check for the start of the match being within a combining sequence.
4062         //   This can happen if the pattern itself begins with a combining char, and
4063         //   the match found combining marks in the target text that were attached
4064         //    to something else.
4065         //   This type of match should be rejected for not completely consuming a
4066         //   combining sequence.
4067         if (isBreakBoundary(strsrch, mStart)) {
4068             found = FALSE;
4069         }
4070 
4071         // Look at the high index of the first CE in the match. If it's the same as the
4072         // low index, the first CE in the match is in the middle of an expansion.
4073         if (mStart == firstCEI->highIndex) {
4074             found = FALSE;
4075         }
4076 
4077         //  Advance the match end position to the first acceptable match boundary.
4078         //    This advances the index over any combining charcters.
4079         mLimit = maxLimit;
4080         if (/*targetIx > 0 &&*/ minLimit < maxLimit) {
4081             int32_t nba = nextBoundaryAfter(strsrch, minLimit);
4082 
4083             if (nba >= lastCEI->highIndex) {
4084                 mLimit = nba;
4085             }
4086         }
4087 
4088     #ifdef USEARCH_DEBUG
4089         if (getenv("USEARCH_DEBUG") != NULL) {
4090             printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit, maxLimit, mLimit);
4091         }
4092     #endif
4093 
4094         // If advancing to the end of a combining sequence in character indexing space
4095         //   advanced us beyond the end of the match in CE space, reject this match.
4096         if (mLimit > maxLimit) {
4097             found = FALSE;
4098         }
4099 
4100         // Make sure the end of the match is on a break boundary
4101         if (isBreakBoundary(strsrch, mLimit)) {
4102             found = FALSE;
4103         }
4104 
4105         if (! checkIdentical(strsrch, mStart, mLimit)) {
4106             found = FALSE;
4107         }
4108 
4109         if (found) {
4110             break;
4111         }
4112     }
4113 
4114     #ifdef USEARCH_DEBUG
4115     if (getenv("USEARCH_DEBUG") != NULL) {
4116         printf("Target CEs [%d .. %d]\n", ceb.firstIx, ceb.limitIx);
4117         int32_t  lastToPrint = ceb.limitIx+2;
4118         for (int ii=ceb.firstIx; ii<lastToPrint; ii++) {
4119             printf("%8x@%d ", ceb.get(ii)->ce, ceb.get(ii)->srcIndex);
4120         }
4121         printf("\n%s\n", found? "match found" : "no match");
4122     }
4123     #endif
4124 
4125     // All Done.  Store back the match bounds to the caller.
4126     //
4127     if (found==FALSE) {
4128         mLimit = -1;
4129         mStart = -1;
4130     }
4131 
4132     if (matchStart != NULL) {
4133         *matchStart= mStart;
4134     }
4135 
4136     if (matchLimit != NULL) {
4137         *matchLimit = mLimit;
4138     }
4139 
4140     return found;
4141 }
4142 
4143 
4144 
4145 
4146 // internal use methods declared in usrchimp.h -----------------------------
4147 
usearch_handleNextExact(UStringSearch * strsrch,UErrorCode * status)4148 UBool usearch_handleNextExact(UStringSearch *strsrch, UErrorCode *status)
4149 {
4150     if (U_FAILURE(*status)) {
4151         setMatchNotFound(strsrch);
4152         return FALSE;
4153     }
4154 
4155 #if BOYER_MOORE
4156     UCollationElements *coleiter        = strsrch->textIter;
4157     int32_t             textlength      = strsrch->search->textLength;
4158     int32_t            *patternce       = strsrch->pattern.CE;
4159     int32_t             patterncelength = strsrch->pattern.CELength;
4160     int32_t             textoffset      = ucol_getOffset(coleiter);
4161 
4162     // status used in setting coleiter offset, since offset is checked in
4163     // shiftForward before setting the coleiter offset, status never
4164     // a failure
4165     textoffset = shiftForward(strsrch, textoffset, UCOL_NULLORDER,
4166                               patterncelength);
4167     while (textoffset <= textlength)
4168     {
4169         uint32_t    patternceindex = patterncelength - 1;
4170         int32_t     targetce;
4171         UBool       found          = FALSE;
4172         int32_t    lastce          = UCOL_NULLORDER;
4173 
4174         setColEIterOffset(coleiter, textoffset);
4175 
4176         for (;;) {
4177             // finding the last pattern ce match, imagine composite characters
4178             // for example: search for pattern A in text \u00C0
4179             // we'll have to skip \u0300 the grave first before we get to A
4180             targetce = ucol_previous(coleiter, status);
4181             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4182                 found = FALSE;
4183                 break;
4184             }
4185             targetce = getCE(strsrch, targetce);
4186             if (targetce == UCOL_IGNORABLE && inNormBuf(coleiter)) {
4187                 // this is for the text \u0315\u0300 that requires
4188                 // normalization and pattern \u0300, where \u0315 is ignorable
4189                 continue;
4190             }
4191             if (lastce == UCOL_NULLORDER || lastce == UCOL_IGNORABLE) {
4192                 lastce = targetce;
4193             }
4194             if (targetce == patternce[patternceindex]) {
4195                 // the first ce can be a contraction
4196                 found = TRUE;
4197                 break;
4198             }
4199             if (!hasExpansion(coleiter)) {
4200                 found = FALSE;
4201                 break;
4202             }
4203         }
4204 
4205         //targetce = lastce;
4206 
4207         while (found && patternceindex > 0) {
4208         	lastce = targetce;
4209             targetce    = ucol_previous(coleiter, status);
4210             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4211                 found = FALSE;
4212                 break;
4213             }
4214             targetce    = getCE(strsrch, targetce);
4215             if (targetce == UCOL_IGNORABLE) {
4216                 continue;
4217             }
4218 
4219             patternceindex --;
4220             found = found && targetce == patternce[patternceindex];
4221         }
4222 
4223         targetce = lastce;
4224 
4225         if (!found) {
4226             if (U_FAILURE(*status)) {
4227                 break;
4228             }
4229             textoffset = shiftForward(strsrch, textoffset, lastce,
4230                                       patternceindex);
4231             // status checked at loop.
4232             patternceindex = patterncelength;
4233             continue;
4234         }
4235 
4236         if (checkNextExactMatch(strsrch, &textoffset, status)) {
4237             // status checked in ucol_setOffset
4238             setColEIterOffset(coleiter, strsrch->search->matchedIndex);
4239             return TRUE;
4240         }
4241     }
4242     setMatchNotFound(strsrch);
4243     return FALSE;
4244 #else
4245     int32_t textOffset = ucol_getOffset(strsrch->textIter);
4246     int32_t start = -1;
4247     int32_t end = -1;
4248 
4249     if (usearch_search(strsrch, textOffset, &start, &end, status)) {
4250         strsrch->search->matchedIndex  = start;
4251         strsrch->search->matchedLength = end - start;
4252         return TRUE;
4253     } else {
4254         setMatchNotFound(strsrch);
4255         return FALSE;
4256     }
4257 #endif
4258 }
4259 
usearch_handleNextCanonical(UStringSearch * strsrch,UErrorCode * status)4260 UBool usearch_handleNextCanonical(UStringSearch *strsrch, UErrorCode *status)
4261 {
4262     if (U_FAILURE(*status)) {
4263         setMatchNotFound(strsrch);
4264         return FALSE;
4265     }
4266 
4267 #if BOYER_MOORE
4268     UCollationElements *coleiter        = strsrch->textIter;
4269     int32_t             textlength      = strsrch->search->textLength;
4270     int32_t            *patternce       = strsrch->pattern.CE;
4271     int32_t             patterncelength = strsrch->pattern.CELength;
4272     int32_t             textoffset      = ucol_getOffset(coleiter);
4273     UBool               hasPatternAccents =
4274        strsrch->pattern.hasSuffixAccents || strsrch->pattern.hasPrefixAccents;
4275 
4276     textoffset = shiftForward(strsrch, textoffset, UCOL_NULLORDER,
4277                               patterncelength);
4278     strsrch->canonicalPrefixAccents[0] = 0;
4279     strsrch->canonicalSuffixAccents[0] = 0;
4280 
4281     while (textoffset <= textlength)
4282     {
4283         int32_t     patternceindex = patterncelength - 1;
4284         int32_t     targetce;
4285         UBool       found          = FALSE;
4286         int32_t     lastce         = UCOL_NULLORDER;
4287 
4288         setColEIterOffset(coleiter, textoffset);
4289 
4290         for (;;) {
4291             // finding the last pattern ce match, imagine composite characters
4292             // for example: search for pattern A in text \u00C0
4293             // we'll have to skip \u0300 the grave first before we get to A
4294             targetce = ucol_previous(coleiter, status);
4295             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4296                 found = FALSE;
4297                 break;
4298             }
4299             targetce = getCE(strsrch, targetce);
4300             if (lastce == UCOL_NULLORDER || lastce == UCOL_IGNORABLE) {
4301                 lastce = targetce;
4302             }
4303             if (targetce == patternce[patternceindex]) {
4304                 // the first ce can be a contraction
4305                 found = TRUE;
4306                 break;
4307             }
4308             if (!hasExpansion(coleiter)) {
4309                 found = FALSE;
4310                 break;
4311             }
4312         }
4313 
4314         while (found && patternceindex > 0) {
4315             targetce    = ucol_previous(coleiter, status);
4316             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4317                 found = FALSE;
4318                 break;
4319             }
4320             targetce    = getCE(strsrch, targetce);
4321             if (targetce == UCOL_IGNORABLE) {
4322                 continue;
4323             }
4324 
4325             patternceindex --;
4326             found = found && targetce == patternce[patternceindex];
4327         }
4328 
4329         // initializing the rearranged accent array
4330         if (hasPatternAccents && !found) {
4331             strsrch->canonicalPrefixAccents[0] = 0;
4332             strsrch->canonicalSuffixAccents[0] = 0;
4333             if (U_FAILURE(*status)) {
4334                 break;
4335             }
4336             found = doNextCanonicalMatch(strsrch, textoffset, status);
4337         }
4338 
4339         if (!found) {
4340             if (U_FAILURE(*status)) {
4341                 break;
4342             }
4343             textoffset = shiftForward(strsrch, textoffset, lastce,
4344                                       patternceindex);
4345             // status checked at loop
4346             patternceindex = patterncelength;
4347             continue;
4348         }
4349 
4350         if (checkNextCanonicalMatch(strsrch, &textoffset, status)) {
4351             setColEIterOffset(coleiter, strsrch->search->matchedIndex);
4352             return TRUE;
4353         }
4354     }
4355     setMatchNotFound(strsrch);
4356     return FALSE;
4357 #else
4358     int32_t textOffset = ucol_getOffset(strsrch->textIter);
4359     int32_t start = -1;
4360     int32_t end = -1;
4361 
4362     if (usearch_search(strsrch, textOffset, &start, &end, status)) {
4363         strsrch->search->matchedIndex  = start;
4364         strsrch->search->matchedLength = end - start;
4365         return TRUE;
4366     } else {
4367         setMatchNotFound(strsrch);
4368         return FALSE;
4369     }
4370 #endif
4371 }
4372 
usearch_handlePreviousExact(UStringSearch * strsrch,UErrorCode * status)4373 UBool usearch_handlePreviousExact(UStringSearch *strsrch, UErrorCode *status)
4374 {
4375     if (U_FAILURE(*status)) {
4376         setMatchNotFound(strsrch);
4377         return FALSE;
4378     }
4379 
4380 #if BOYER_MOORE
4381     UCollationElements *coleiter        = strsrch->textIter;
4382     int32_t            *patternce       = strsrch->pattern.CE;
4383     int32_t             patterncelength = strsrch->pattern.CELength;
4384     int32_t             textoffset      = ucol_getOffset(coleiter);
4385 
4386     // shifting it check for setting offset
4387     // if setOffset is called previously or there was no previous match, we
4388     // leave the offset as it is.
4389     if (strsrch->search->matchedIndex != USEARCH_DONE) {
4390         textoffset = strsrch->search->matchedIndex;
4391     }
4392 
4393     textoffset = reverseShift(strsrch, textoffset, UCOL_NULLORDER,
4394                               patterncelength);
4395 
4396     while (textoffset >= 0)
4397     {
4398         int32_t     patternceindex = 1;
4399         int32_t     targetce;
4400         UBool       found          = FALSE;
4401         int32_t     firstce        = UCOL_NULLORDER;
4402 
4403         // if status is a failure, ucol_setOffset does nothing
4404         setColEIterOffset(coleiter, textoffset);
4405 
4406         for (;;) {
4407             // finding the first pattern ce match, imagine composite
4408             // characters. for example: search for pattern \u0300 in text
4409             // \u00C0, we'll have to skip A first before we get to
4410             // \u0300 the grave accent
4411             targetce = ucol_next(coleiter, status);
4412             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4413                 found = FALSE;
4414                 break;
4415             }
4416             targetce = getCE(strsrch, targetce);
4417             if (firstce == UCOL_NULLORDER || firstce == UCOL_IGNORABLE) {
4418                 firstce = targetce;
4419             }
4420             if (targetce == UCOL_IGNORABLE && strsrch->strength != UCOL_PRIMARY) {
4421                 continue;
4422             }
4423             if (targetce == patternce[0]) {
4424                 found = TRUE;
4425                 break;
4426             }
4427             if (!hasExpansion(coleiter)) {
4428                 // checking for accents in composite character
4429                 found = FALSE;
4430                 break;
4431             }
4432         }
4433 
4434         //targetce = firstce;
4435 
4436         while (found && (patternceindex < patterncelength)) {
4437         	firstce = targetce;
4438             targetce    = ucol_next(coleiter, status);
4439             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4440                 found = FALSE;
4441                 break;
4442             }
4443             targetce    = getCE(strsrch, targetce);
4444             if (targetce == UCOL_IGNORABLE) {
4445                 continue;
4446             }
4447 
4448             found = found && targetce == patternce[patternceindex];
4449             patternceindex ++;
4450         }
4451 
4452         targetce = firstce;
4453 
4454         if (!found) {
4455             if (U_FAILURE(*status)) {
4456                 break;
4457             }
4458 
4459             textoffset = reverseShift(strsrch, textoffset, targetce,
4460                                       patternceindex);
4461             patternceindex = 0;
4462             continue;
4463         }
4464 
4465         if (checkPreviousExactMatch(strsrch, &textoffset, status)) {
4466             setColEIterOffset(coleiter, textoffset);
4467             return TRUE;
4468         }
4469     }
4470     setMatchNotFound(strsrch);
4471     return FALSE;
4472 #else
4473     int32_t textOffset = ucol_getOffset(strsrch->textIter);
4474     int32_t start = -1;
4475     int32_t end = -1;
4476 
4477     if (usearch_searchBackwards(strsrch, textOffset, &start, &end, status)) {
4478         strsrch->search->matchedIndex = start;
4479         strsrch->search->matchedLength = end - start;
4480         return TRUE;
4481     } else {
4482         setMatchNotFound(strsrch);
4483         return FALSE;
4484     }
4485 #endif
4486 }
4487 
usearch_handlePreviousCanonical(UStringSearch * strsrch,UErrorCode * status)4488 UBool usearch_handlePreviousCanonical(UStringSearch *strsrch,
4489                                       UErrorCode    *status)
4490 {
4491     if (U_FAILURE(*status)) {
4492         setMatchNotFound(strsrch);
4493         return FALSE;
4494     }
4495 
4496 #if BOYER_MOORE
4497     UCollationElements *coleiter        = strsrch->textIter;
4498     int32_t            *patternce       = strsrch->pattern.CE;
4499     int32_t             patterncelength = strsrch->pattern.CELength;
4500     int32_t             textoffset      = ucol_getOffset(coleiter);
4501     UBool               hasPatternAccents =
4502        strsrch->pattern.hasSuffixAccents || strsrch->pattern.hasPrefixAccents;
4503 
4504     // shifting it check for setting offset
4505     // if setOffset is called previously or there was no previous match, we
4506     // leave the offset as it is.
4507     if (strsrch->search->matchedIndex != USEARCH_DONE) {
4508         textoffset = strsrch->search->matchedIndex;
4509     }
4510 
4511     textoffset = reverseShift(strsrch, textoffset, UCOL_NULLORDER,
4512                               patterncelength);
4513     strsrch->canonicalPrefixAccents[0] = 0;
4514     strsrch->canonicalSuffixAccents[0] = 0;
4515 
4516     while (textoffset >= 0)
4517     {
4518         int32_t     patternceindex = 1;
4519         int32_t     targetce;
4520         UBool       found          = FALSE;
4521         int32_t     firstce        = UCOL_NULLORDER;
4522 
4523         setColEIterOffset(coleiter, textoffset);
4524         for (;;) {
4525             // finding the first pattern ce match, imagine composite
4526             // characters. for example: search for pattern \u0300 in text
4527             // \u00C0, we'll have to skip A first before we get to
4528             // \u0300 the grave accent
4529             targetce = ucol_next(coleiter, status);
4530             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4531                 found = FALSE;
4532                 break;
4533             }
4534             targetce = getCE(strsrch, targetce);
4535             if (firstce == UCOL_NULLORDER || firstce == UCOL_IGNORABLE) {
4536                 firstce = targetce;
4537             }
4538 
4539             if (targetce == patternce[0]) {
4540                 // the first ce can be a contraction
4541                 found = TRUE;
4542                 break;
4543             }
4544             if (!hasExpansion(coleiter)) {
4545                 // checking for accents in composite character
4546                 found = FALSE;
4547                 break;
4548             }
4549         }
4550 
4551         targetce = firstce;
4552 
4553         while (found && patternceindex < patterncelength) {
4554             targetce    = ucol_next(coleiter, status);
4555             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4556                 found = FALSE;
4557                 break;
4558             }
4559             targetce = getCE(strsrch, targetce);
4560             if (targetce == UCOL_IGNORABLE) {
4561                 continue;
4562             }
4563 
4564             found = found && targetce == patternce[patternceindex];
4565             patternceindex ++;
4566         }
4567 
4568         // initializing the rearranged accent array
4569         if (hasPatternAccents && !found) {
4570             strsrch->canonicalPrefixAccents[0] = 0;
4571             strsrch->canonicalSuffixAccents[0] = 0;
4572             if (U_FAILURE(*status)) {
4573                 break;
4574             }
4575             found = doPreviousCanonicalMatch(strsrch, textoffset, status);
4576         }
4577 
4578         if (!found) {
4579             if (U_FAILURE(*status)) {
4580                 break;
4581             }
4582             textoffset = reverseShift(strsrch, textoffset, targetce,
4583                                       patternceindex);
4584             patternceindex = 0;
4585             continue;
4586         }
4587 
4588         if (checkPreviousCanonicalMatch(strsrch, &textoffset, status)) {
4589             setColEIterOffset(coleiter, textoffset);
4590             return TRUE;
4591         }
4592     }
4593     setMatchNotFound(strsrch);
4594     return FALSE;
4595 #else
4596     int32_t textOffset = ucol_getOffset(strsrch->textIter);
4597     int32_t start = -1;
4598     int32_t end = -1;
4599 
4600     if (usearch_searchBackwards(strsrch, textOffset, &start, &end, status)) {
4601         strsrch->search->matchedIndex = start;
4602         strsrch->search->matchedLength = end - start;
4603         return TRUE;
4604     } else {
4605         setMatchNotFound(strsrch);
4606         return FALSE;
4607     }
4608 #endif
4609 }
4610 
4611 #endif /* #if !UCONFIG_NO_COLLATION */
4612