• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 **********************************************************************
3 *   Copyright (C) 2001-2015 IBM and others. All rights reserved.
4 **********************************************************************
5 *   Date        Name        Description
6 *  07/02/2001   synwee      Creation.
7 **********************************************************************
8 */
9 
10 #include "unicode/utypes.h"
11 
12 #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION
13 
14 #include "unicode/usearch.h"
15 #include "unicode/ustring.h"
16 #include "unicode/uchar.h"
17 #include "unicode/utf16.h"
18 #include "normalizer2impl.h"
19 #include "usrchimp.h"
20 #include "cmemory.h"
21 #include "ucln_in.h"
22 #include "uassert.h"
23 #include "ustr_imp.h"
24 
25 U_NAMESPACE_USE
26 
27 // don't use Boyer-Moore
28 // (and if we decide to turn this on again there are several new TODOs that will need to be addressed)
29 #define BOYER_MOORE 0
30 
31 // internal definition ---------------------------------------------------
32 
33 #define LAST_BYTE_MASK_          0xFF
34 #define SECOND_LAST_BYTE_SHIFT_  8
35 #define SUPPLEMENTARY_MIN_VALUE_ 0x10000
36 
37 static const Normalizer2Impl *g_nfcImpl = NULL;
38 
39 // internal methods -------------------------------------------------
40 
41 /**
42 * Fast collation element iterator setOffset.
43 * This function does not check for bounds.
44 * @param coleiter collation element iterator
45 * @param offset to set
46 */
47 static
setColEIterOffset(UCollationElements * elems,int32_t offset)48 inline void setColEIterOffset(UCollationElements *elems,
49                       int32_t             offset)
50 {
51     // Note: Not "fast" any more after the 2013 collation rewrite.
52     // We do not want to expose more internals than necessary.
53     UErrorCode status = U_ZERO_ERROR;
54     ucol_setOffset(elems, offset, &status);
55 }
56 
57 /**
58 * Getting the mask for collation strength
59 * @param strength collation strength
60 * @return collation element mask
61 */
62 static
getMask(UCollationStrength strength)63 inline uint32_t getMask(UCollationStrength strength)
64 {
65     switch (strength)
66     {
67     case UCOL_PRIMARY:
68         return UCOL_PRIMARYORDERMASK;
69     case UCOL_SECONDARY:
70         return UCOL_SECONDARYORDERMASK | UCOL_PRIMARYORDERMASK;
71     default:
72         return UCOL_TERTIARYORDERMASK | UCOL_SECONDARYORDERMASK |
73                UCOL_PRIMARYORDERMASK;
74     }
75 }
76 
77 /**
78 * @param ce 32-bit collation element
79 * @return hash code
80 */
81 static
hashFromCE32(uint32_t ce)82 inline int hashFromCE32(uint32_t ce)
83 {
84     int hc = (int)(
85             ((((((ce >> 24) * 37) +
86             (ce >> 16)) * 37) +
87             (ce >> 8)) * 37) +
88             ce);
89     hc %= MAX_TABLE_SIZE_;
90     if (hc < 0) {
91         hc += MAX_TABLE_SIZE_;
92     }
93     return hc;
94 }
95 
96 U_CDECL_BEGIN
97 static UBool U_CALLCONV
usearch_cleanup(void)98 usearch_cleanup(void) {
99     g_nfcImpl = NULL;
100     return TRUE;
101 }
102 U_CDECL_END
103 
104 /**
105 * Initializing the fcd tables.
106 * Internal method, status assumed to be a success.
107 * @param status output error if any, caller to check status before calling
108 *               method, status assumed to be success when passed in.
109 */
110 static
initializeFCD(UErrorCode * status)111 inline void initializeFCD(UErrorCode *status)
112 {
113     if (g_nfcImpl == NULL) {
114         g_nfcImpl = Normalizer2Factory::getNFCImpl(*status);
115         ucln_i18n_registerCleanup(UCLN_I18N_USEARCH, usearch_cleanup);
116     }
117 }
118 
119 /**
120 * Gets the fcd value for a character at the argument index.
121 * This method takes into accounts of the supplementary characters.
122 * @param str UTF16 string where character for fcd retrieval resides
123 * @param offset position of the character whose fcd is to be retrieved, to be
124 *               overwritten with the next character position, taking
125 *               surrogate characters into consideration.
126 * @param strlength length of the argument string
127 * @return fcd value
128 */
129 static
getFCD(const UChar * str,int32_t * offset,int32_t strlength)130 uint16_t getFCD(const UChar   *str, int32_t *offset,
131                              int32_t  strlength)
132 {
133     const UChar *temp = str + *offset;
134     uint16_t    result = g_nfcImpl->nextFCD16(temp, str + strlength);
135     *offset = (int32_t)(temp - str);
136     return result;
137 }
138 
139 /**
140 * Getting the modified collation elements taking into account the collation
141 * attributes
142 * @param strsrch string search data
143 * @param sourcece
144 * @return the modified collation element
145 */
146 static
getCE(const UStringSearch * strsrch,uint32_t sourcece)147 inline int32_t getCE(const UStringSearch *strsrch, uint32_t sourcece)
148 {
149     // note for tertiary we can't use the collator->tertiaryMask, that
150     // is a preprocessed mask that takes into account case options. since
151     // we are only concerned with exact matches, we don't need that.
152     sourcece &= strsrch->ceMask;
153 
154     if (strsrch->toShift) {
155         // alternate handling here, since only the 16 most significant digits
156         // is only used, we can safely do a compare without masking
157         // if the ce is a variable, we mask and get only the primary values
158         // no shifting to quartenary is required since all primary values
159         // less than variabletop will need to be masked off anyway.
160         if (strsrch->variableTop > sourcece) {
161             if (strsrch->strength >= UCOL_QUATERNARY) {
162                 sourcece &= UCOL_PRIMARYORDERMASK;
163             }
164             else {
165                 sourcece = UCOL_IGNORABLE;
166             }
167         }
168     } else if (strsrch->strength >= UCOL_QUATERNARY && sourcece == UCOL_IGNORABLE) {
169         sourcece = 0xFFFF;
170     }
171 
172     return sourcece;
173 }
174 
175 /**
176 * Allocate a memory and returns NULL if it failed.
177 * Internal method, status assumed to be a success.
178 * @param size to allocate
179 * @param status output error if any, caller to check status before calling
180 *               method, status assumed to be success when passed in.
181 * @return newly allocated array, NULL otherwise
182 */
183 static
allocateMemory(uint32_t size,UErrorCode * status)184 inline void * allocateMemory(uint32_t size, UErrorCode *status)
185 {
186     uint32_t *result = (uint32_t *)uprv_malloc(size);
187     if (result == NULL) {
188         *status = U_MEMORY_ALLOCATION_ERROR;
189     }
190     return result;
191 }
192 
193 /**
194 * Adds a uint32_t value to a destination array.
195 * Creates a new array if we run out of space. The caller will have to
196 * manually deallocate the newly allocated array.
197 * Internal method, status assumed to be success, caller has to check status
198 * before calling this method. destination not to be NULL and has at least
199 * size destinationlength.
200 * @param destination target array
201 * @param offset destination offset to add value
202 * @param destinationlength target array size, return value for the new size
203 * @param value to be added
204 * @param increments incremental size expected
205 * @param status output error if any, caller to check status before calling
206 *               method, status assumed to be success when passed in.
207 * @return new destination array, destination if there was no new allocation
208 */
209 static
addTouint32_tArray(int32_t * destination,uint32_t offset,uint32_t * destinationlength,uint32_t value,uint32_t increments,UErrorCode * status)210 inline int32_t * addTouint32_tArray(int32_t    *destination,
211                                     uint32_t    offset,
212                                     uint32_t   *destinationlength,
213                                     uint32_t    value,
214                                     uint32_t    increments,
215                                     UErrorCode *status)
216 {
217     uint32_t newlength = *destinationlength;
218     if (offset + 1 == newlength) {
219         newlength += increments;
220         int32_t *temp = (int32_t *)allocateMemory(
221                                          sizeof(int32_t) * newlength, status);
222         if (U_FAILURE(*status)) {
223             return NULL;
224         }
225         uprv_memcpy(temp, destination, sizeof(int32_t) * offset);
226         *destinationlength = newlength;
227         destination        = temp;
228     }
229     destination[offset] = value;
230     return destination;
231 }
232 
233 /**
234 * Adds a uint64_t value to a destination array.
235 * Creates a new array if we run out of space. The caller will have to
236 * manually deallocate the newly allocated array.
237 * Internal method, status assumed to be success, caller has to check status
238 * before calling this method. destination not to be NULL and has at least
239 * size destinationlength.
240 * @param destination target array
241 * @param offset destination offset to add value
242 * @param destinationlength target array size, return value for the new size
243 * @param value to be added
244 * @param increments incremental size expected
245 * @param status output error if any, caller to check status before calling
246 *               method, status assumed to be success when passed in.
247 * @return new destination array, destination if there was no new allocation
248 */
249 static
addTouint64_tArray(int64_t * destination,uint32_t offset,uint32_t * destinationlength,uint64_t value,uint32_t increments,UErrorCode * status)250 inline int64_t * addTouint64_tArray(int64_t    *destination,
251                                     uint32_t    offset,
252                                     uint32_t   *destinationlength,
253                                     uint64_t    value,
254                                     uint32_t    increments,
255                                     UErrorCode *status)
256 {
257     uint32_t newlength = *destinationlength;
258     if (offset + 1 == newlength) {
259         newlength += increments;
260         int64_t *temp = (int64_t *)allocateMemory(
261                                          sizeof(int64_t) * newlength, status);
262 
263         if (U_FAILURE(*status)) {
264             return NULL;
265         }
266 
267         uprv_memcpy(temp, destination, sizeof(int64_t) * offset);
268         *destinationlength = newlength;
269         destination        = temp;
270     }
271 
272     destination[offset] = value;
273 
274     return destination;
275 }
276 
277 /**
278 * Initializing the ce table for a pattern.
279 * Stores non-ignorable collation keys.
280 * Table size will be estimated by the size of the pattern text. Table
281 * expansion will be perform as we go along. Adding 1 to ensure that the table
282 * size definitely increases.
283 * Internal method, status assumed to be a success.
284 * @param strsrch string search data
285 * @param status output error if any, caller to check status before calling
286 *               method, status assumed to be success when passed in.
287 * @return total number of expansions
288 */
289 static
initializePatternCETable(UStringSearch * strsrch,UErrorCode * status)290 inline uint16_t initializePatternCETable(UStringSearch *strsrch,
291                                          UErrorCode    *status)
292 {
293     UPattern *pattern            = &(strsrch->pattern);
294     uint32_t  cetablesize        = INITIAL_ARRAY_SIZE_;
295     int32_t  *cetable            = pattern->cesBuffer;
296     uint32_t  patternlength      = pattern->textLength;
297     UCollationElements *coleiter = strsrch->utilIter;
298 
299     if (coleiter == NULL) {
300         coleiter = ucol_openElements(strsrch->collator, pattern->text,
301                                      patternlength, status);
302         // status will be checked in ucol_next(..) later and if it is an
303         // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be
304         // returned.
305         strsrch->utilIter = coleiter;
306     }
307     else {
308         ucol_setText(coleiter, pattern->text, pattern->textLength, status);
309     }
310     if(U_FAILURE(*status)) {
311         return 0;
312     }
313 
314     if (pattern->ces != cetable && pattern->ces) {
315         uprv_free(pattern->ces);
316     }
317 
318     uint16_t  offset      = 0;
319     uint16_t  result      = 0;
320     int32_t   ce;
321 
322     while ((ce = ucol_next(coleiter, status)) != UCOL_NULLORDER &&
323            U_SUCCESS(*status)) {
324         uint32_t newce = getCE(strsrch, ce);
325         if (newce) {
326             int32_t *temp = addTouint32_tArray(cetable, offset, &cetablesize,
327                                   newce,
328                                   patternlength - ucol_getOffset(coleiter) + 1,
329                                   status);
330             if (U_FAILURE(*status)) {
331                 return 0;
332             }
333             offset ++;
334             if (cetable != temp && cetable != pattern->cesBuffer) {
335                 uprv_free(cetable);
336             }
337             cetable = temp;
338         }
339         result += (uint16_t)(ucol_getMaxExpansion(coleiter, ce) - 1);
340     }
341 
342     cetable[offset]   = 0;
343     pattern->ces       = cetable;
344     pattern->cesLength = offset;
345 
346     return result;
347 }
348 
349 /**
350 * Initializing the pce table for a pattern.
351 * Stores non-ignorable collation keys.
352 * Table size will be estimated by the size of the pattern text. Table
353 * expansion will be perform as we go along. Adding 1 to ensure that the table
354 * size definitely increases.
355 * Internal method, status assumed to be a success.
356 * @param strsrch string search data
357 * @param status output error if any, caller to check status before calling
358 *               method, status assumed to be success when passed in.
359 * @return total number of expansions
360 */
361 static
initializePatternPCETable(UStringSearch * strsrch,UErrorCode * status)362 inline uint16_t initializePatternPCETable(UStringSearch *strsrch,
363                                           UErrorCode    *status)
364 {
365     UPattern *pattern            = &(strsrch->pattern);
366     uint32_t  pcetablesize       = INITIAL_ARRAY_SIZE_;
367     int64_t  *pcetable           = pattern->pcesBuffer;
368     uint32_t  patternlength      = pattern->textLength;
369     UCollationElements *coleiter = strsrch->utilIter;
370 
371     if (coleiter == NULL) {
372         coleiter = ucol_openElements(strsrch->collator, pattern->text,
373                                      patternlength, status);
374         // status will be checked in ucol_next(..) later and if it is an
375         // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be
376         // returned.
377         strsrch->utilIter = coleiter;
378     } else {
379         ucol_setText(coleiter, pattern->text, pattern->textLength, status);
380     }
381     if(U_FAILURE(*status)) {
382         return 0;
383     }
384 
385     if (pattern->pces != pcetable && pattern->pces != NULL) {
386         uprv_free(pattern->pces);
387     }
388 
389     uint16_t  offset = 0;
390     uint16_t  result = 0;
391     int64_t   pce;
392 
393     icu::UCollationPCE iter(coleiter);
394 
395     // ** Should processed CEs be signed or unsigned?
396     // ** (the rest of the code in this file seems to play fast-and-loose with
397     // **  whether a CE is signed or unsigned. For example, look at routine above this one.)
398     while ((pce = iter.nextProcessed(NULL, NULL, status)) != UCOL_PROCESSED_NULLORDER &&
399            U_SUCCESS(*status)) {
400         int64_t *temp = addTouint64_tArray(pcetable, offset, &pcetablesize,
401                               pce,
402                               patternlength - ucol_getOffset(coleiter) + 1,
403                               status);
404 
405         if (U_FAILURE(*status)) {
406             return 0;
407         }
408 
409         offset += 1;
410 
411         if (pcetable != temp && pcetable != pattern->pcesBuffer) {
412             uprv_free(pcetable);
413         }
414 
415         pcetable = temp;
416         //result += (uint16_t)(ucol_getMaxExpansion(coleiter, ce) - 1);
417     }
418 
419     pcetable[offset]   = 0;
420     pattern->pces       = pcetable;
421     pattern->pcesLength = offset;
422 
423     return result;
424 }
425 
426 /**
427 * Initializes the pattern struct.
428 * Internal method, status assumed to be success.
429 * @param strsrch UStringSearch data storage
430 * @param status output error if any, caller to check status before calling
431 *               method, status assumed to be success when passed in.
432 * @return expansionsize the total expansion size of the pattern
433 */
434 static
initializePattern(UStringSearch * strsrch,UErrorCode * status)435 inline int16_t initializePattern(UStringSearch *strsrch, UErrorCode *status)
436 {
437     if (U_FAILURE(*status)) { return 0; }
438           UPattern   *pattern     = &(strsrch->pattern);
439     const UChar      *patterntext = pattern->text;
440           int32_t     length      = pattern->textLength;
441           int32_t index       = 0;
442 
443     // Since the strength is primary, accents are ignored in the pattern.
444     if (strsrch->strength == UCOL_PRIMARY) {
445         pattern->hasPrefixAccents = 0;
446         pattern->hasSuffixAccents = 0;
447     } else {
448         pattern->hasPrefixAccents = getFCD(patterntext, &index, length) >>
449                                                          SECOND_LAST_BYTE_SHIFT_;
450         index = length;
451         U16_BACK_1(patterntext, 0, index);
452         pattern->hasSuffixAccents = getFCD(patterntext, &index, length) &
453                                                                  LAST_BYTE_MASK_;
454     }
455 
456     // ** HACK **
457     if (strsrch->pattern.pces != NULL) {
458         if (strsrch->pattern.pces != strsrch->pattern.pcesBuffer) {
459             uprv_free(strsrch->pattern.pces);
460         }
461 
462         strsrch->pattern.pces = NULL;
463     }
464 
465     // since intializePattern is an internal method status is a success.
466     return initializePatternCETable(strsrch, status);
467 }
468 
469 /**
470 * Initializing shift tables, with the default values.
471 * If a corresponding default value is 0, the shift table is not set.
472 * @param shift table for forwards shift
473 * @param backshift table for backwards shift
474 * @param cetable table containing pattern ce
475 * @param cesize size of the pattern ces
476 * @param expansionsize total size of the expansions
477 * @param defaultforward the default forward value
478 * @param defaultbackward the default backward value
479 */
480 static
setShiftTable(int16_t shift[],int16_t backshift[],int32_t * cetable,int32_t cesize,int16_t expansionsize,int16_t defaultforward,int16_t defaultbackward)481 inline void setShiftTable(int16_t   shift[], int16_t backshift[],
482                           int32_t  *cetable, int32_t cesize,
483                           int16_t   expansionsize,
484                           int16_t   defaultforward,
485                           int16_t   defaultbackward)
486 {
487     // estimate the value to shift. to do that we estimate the smallest
488     // number of characters to give the relevant ces, ie approximately
489     // the number of ces minus their expansion, since expansions can come
490     // from a character.
491     int32_t count;
492     for (count = 0; count < MAX_TABLE_SIZE_; count ++) {
493         shift[count] = defaultforward;
494     }
495     cesize --; // down to the last index
496     for (count = 0; count < cesize; count ++) {
497         // number of ces from right of array to the count
498         int temp = defaultforward - count - 1;
499         shift[hashFromCE32(cetable[count])] = temp > 1 ? temp : 1;
500     }
501     shift[hashFromCE32(cetable[cesize])] = 1;
502     // for ignorables we just shift by one. see test examples.
503     shift[hashFromCE32(0)] = 1;
504 
505     for (count = 0; count < MAX_TABLE_SIZE_; count ++) {
506         backshift[count] = defaultbackward;
507     }
508     for (count = cesize; count > 0; count --) {
509         // the original value count does not seem to work
510         backshift[hashFromCE32(cetable[count])] = count > expansionsize ?
511                                           (int16_t)(count - expansionsize) : 1;
512     }
513     backshift[hashFromCE32(cetable[0])] = 1;
514     backshift[hashFromCE32(0)] = 1;
515 }
516 
517 /**
518 * Building of the pattern collation element list and the boyer moore strsrch
519 * table.
520 * The canonical match will only be performed after the default match fails.
521 * For both cases we need to remember the size of the composed and decomposed
522 * versions of the string. Since the Boyer-Moore shift calculations shifts by
523 * a number of characters in the text and tries to match the pattern from that
524 * offset, the shift value can not be too large in case we miss some
525 * characters. To choose a right shift size, we estimate the NFC form of the
526 * and use its size as a shift guide. The NFC form should be the small
527 * possible representation of the pattern. Anyways, we'll err on the smaller
528 * shift size. Hence the calculation for minlength.
529 * Canonical match will be performed slightly differently. We'll split the
530 * pattern into 3 parts, the prefix accents (PA), the middle string bounded by
531 * the first and last base character (MS), the ending accents (EA). Matches
532 * will be done on MS first, and only when we match MS then some processing
533 * will be required for the prefix and end accents in order to determine if
534 * they match PA and EA. Hence the default shift values
535 * for the canonical match will take the size of either end's accent into
536 * consideration. Forwards search will take the end accents into consideration
537 * for the default shift values and the backwards search will take the prefix
538 * accents into consideration.
539 * If pattern has no non-ignorable ce, we return a illegal argument error.
540 * Internal method, status assumed to be success.
541 * @param strsrch UStringSearch data storage
542 * @param status  for output errors if it occurs, status is assumed to be a
543 *                success when it is passed in.
544 */
545 static
initialize(UStringSearch * strsrch,UErrorCode * status)546 inline void initialize(UStringSearch *strsrch, UErrorCode *status)
547 {
548     int16_t expandlength  = initializePattern(strsrch, status);
549     if (U_SUCCESS(*status) && strsrch->pattern.cesLength > 0) {
550         UPattern *pattern = &strsrch->pattern;
551         int32_t   cesize  = pattern->cesLength;
552 
553         int16_t minlength = cesize > expandlength
554                             ? (int16_t)cesize - expandlength : 1;
555         pattern->defaultShiftSize    = minlength;
556         setShiftTable(pattern->shift, pattern->backShift, pattern->ces,
557                       cesize, expandlength, minlength, minlength);
558         return;
559     }
560     strsrch->pattern.defaultShiftSize = 0;
561 }
562 
563 #if BOYER_MOORE
564 /**
565 * Check to make sure that the match length is at the end of the character by
566 * using the breakiterator.
567 * @param strsrch string search data
568 * @param start target text start offset
569 * @param end target text end offset
570 */
571 static
checkBreakBoundary(const UStringSearch * strsrch,int32_t *,int32_t * end)572 void checkBreakBoundary(const UStringSearch *strsrch, int32_t * /*start*/,
573                                int32_t *end)
574 {
575 #if !UCONFIG_NO_BREAK_ITERATION
576     UBreakIterator *breakiterator = strsrch->search->internalBreakIter;
577     if (breakiterator) {
578         int32_t matchend = *end;
579         //int32_t matchstart = *start;
580 
581         if (!ubrk_isBoundary(breakiterator, matchend)) {
582             *end = ubrk_following(breakiterator, matchend);
583         }
584 
585         /* Check the start of the matched text to make sure it doesn't have any accents
586          * before it.  This code may not be necessary and so it is commented out */
587         /*if (!ubrk_isBoundary(breakiterator, matchstart) && !ubrk_isBoundary(breakiterator, matchstart-1)) {
588             *start = ubrk_preceding(breakiterator, matchstart);
589         }*/
590     }
591 #endif
592 }
593 
594 /**
595 * Determine whether the target text in UStringSearch bounded by the offset
596 * start and end is one or more whole units of text as
597 * determined by the breakiterator in UStringSearch.
598 * @param strsrch string search data
599 * @param start target text start offset
600 * @param end target text end offset
601 */
602 static
isBreakUnit(const UStringSearch * strsrch,int32_t start,int32_t end)603 UBool isBreakUnit(const UStringSearch *strsrch, int32_t start,
604                                int32_t    end)
605 {
606 #if !UCONFIG_NO_BREAK_ITERATION
607     UBreakIterator *breakiterator = strsrch->search->breakIter;
608     //TODO: Add here.
609     if (breakiterator) {
610         int32_t startindex = ubrk_first(breakiterator);
611         int32_t endindex   = ubrk_last(breakiterator);
612 
613         // out-of-range indexes are never boundary positions
614         if (start < startindex || start > endindex ||
615             end < startindex || end > endindex) {
616             return FALSE;
617         }
618         // otherwise, we can use following() on the position before the
619         // specified one and return true of the position we get back is the
620         // one the user specified
621         UBool result = (start == startindex ||
622                 ubrk_following(breakiterator, start - 1) == start) &&
623                (end == endindex ||
624                 ubrk_following(breakiterator, end - 1) == end);
625         if (result) {
626             // iterates the individual ces
627                   UCollationElements *coleiter  = strsrch->utilIter;
628             const UChar              *text      = strsrch->search->text +
629                                                                       start;
630                   UErrorCode          status    = U_ZERO_ERROR;
631             ucol_setText(coleiter, text, end - start, &status);
632             for (int32_t count = 0; count < strsrch->pattern.cesLength;
633                  count ++) {
634                 int32_t ce = getCE(strsrch, ucol_next(coleiter, &status));
635                 if (ce == UCOL_IGNORABLE) {
636                     count --;
637                     continue;
638                 }
639                 if (U_FAILURE(status) || ce != strsrch->pattern.ces[count]) {
640                     return FALSE;
641                 }
642             }
643             int32_t nextce = ucol_next(coleiter, &status);
644             while (ucol_getOffset(coleiter) == (end - start)
645                    && getCE(strsrch, nextce) == UCOL_IGNORABLE) {
646                 nextce = ucol_next(coleiter, &status);
647             }
648             if (ucol_getOffset(coleiter) == (end - start)
649                 && nextce != UCOL_NULLORDER) {
650                 // extra collation elements at the end of the match
651                 return FALSE;
652             }
653         }
654         return result;
655     }
656 #endif
657     return TRUE;
658 }
659 
660 /**
661 * Getting the next base character offset if current offset is an accent,
662 * or the current offset if the current character contains a base character.
663 * accents the following base character will be returned
664 * @param text string
665 * @param textoffset current offset
666 * @param textlength length of text string
667 * @return the next base character or the current offset
668 *         if the current character is contains a base character.
669 */
670 static
getNextBaseOffset(const UChar * text,int32_t textoffset,int32_t textlength)671 inline int32_t getNextBaseOffset(const UChar       *text,
672                                            int32_t  textoffset,
673                                            int32_t      textlength)
674 {
675     if (textoffset < textlength) {
676         int32_t temp = textoffset;
677         if (getFCD(text, &temp, textlength) >> SECOND_LAST_BYTE_SHIFT_) {
678             while (temp < textlength) {
679                 int32_t result = temp;
680                 if ((getFCD(text, &temp, textlength) >>
681                      SECOND_LAST_BYTE_SHIFT_) == 0) {
682                     return result;
683                 }
684             }
685             return textlength;
686         }
687     }
688     return textoffset;
689 }
690 
691 /**
692 * Gets the next base character offset depending on the string search pattern
693 * data
694 * @param strsrch string search data
695 * @param textoffset current offset, one offset away from the last character
696 *                   to search for.
697 * @return start index of the next base character or the current offset
698 *         if the current character is contains a base character.
699 */
700 static
getNextUStringSearchBaseOffset(UStringSearch * strsrch,int32_t textoffset)701 inline int32_t getNextUStringSearchBaseOffset(UStringSearch *strsrch,
702                                                   int32_t    textoffset)
703 {
704     int32_t textlength = strsrch->search->textLength;
705     if (strsrch->pattern.hasSuffixAccents &&
706         textoffset < textlength) {
707               int32_t  temp       = textoffset;
708         const UChar       *text       = strsrch->search->text;
709         U16_BACK_1(text, 0, temp);
710         if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) {
711             return getNextBaseOffset(text, textoffset, textlength);
712         }
713     }
714     return textoffset;
715 }
716 
717 /**
718 * Shifting the collation element iterator position forward to prepare for
719 * a following match. If the last character is a unsafe character, we'll only
720 * shift by 1 to capture contractions, normalization etc.
721 * Internal method, status assumed to be success.
722 * @param text strsrch string search data
723 * @param textoffset start text position to do search
724 * @param ce the text ce which failed the match.
725 * @param patternceindex index of the ce within the pattern ce buffer which
726 *        failed the match
727 * @return final offset
728 */
729 static
shiftForward(UStringSearch * strsrch,int32_t textoffset,int32_t ce,int32_t patternceindex)730 inline int32_t shiftForward(UStringSearch *strsrch,
731                                 int32_t    textoffset,
732                                 int32_t       ce,
733                                 int32_t        patternceindex)
734 {
735     UPattern *pattern = &(strsrch->pattern);
736     if (ce != UCOL_NULLORDER) {
737         int32_t shift = pattern->shift[hashFromCE32(ce)];
738         // this is to adjust for characters in the middle of the
739         // substring for matching that failed.
740         int32_t adjust = pattern->cesLength - patternceindex;
741         if (adjust > 1 && shift >= adjust) {
742             shift -= adjust - 1;
743         }
744         textoffset += shift;
745     }
746     else {
747         textoffset += pattern->defaultShiftSize;
748     }
749 
750     textoffset = getNextUStringSearchBaseOffset(strsrch, textoffset);
751     // check for unsafe characters
752     // * if it is the start or middle of a contraction: to be done after
753     //   a initial match is found
754     // * thai or lao base consonant character: similar to contraction
755     // * high surrogate character: similar to contraction
756     // * next character is a accent: shift to the next base character
757     return textoffset;
758 }
759 #endif // #if BOYER_MOORE
760 
761 /**
762 * sets match not found
763 * @param strsrch string search data
764 */
765 static
setMatchNotFound(UStringSearch * strsrch)766 inline void setMatchNotFound(UStringSearch *strsrch)
767 {
768     // this method resets the match result regardless of the error status.
769     strsrch->search->matchedIndex = USEARCH_DONE;
770     strsrch->search->matchedLength = 0;
771     if (strsrch->search->isForwardSearching) {
772         setColEIterOffset(strsrch->textIter, strsrch->search->textLength);
773     }
774     else {
775         setColEIterOffset(strsrch->textIter, 0);
776     }
777 }
778 
779 #if BOYER_MOORE
780 /**
781 * Gets the offset to the next safe point in text.
782 * ie. not the middle of a contraction, swappable characters or supplementary
783 * characters.
784 * @param collator collation sata
785 * @param text string to work with
786 * @param textoffset offset in string
787 * @param textlength length of text string
788 * @return offset to the next safe character
789 */
790 static
getNextSafeOffset(const UCollator * collator,const UChar * text,int32_t textoffset,int32_t textlength)791 inline int32_t getNextSafeOffset(const UCollator   *collator,
792                                      const UChar       *text,
793                                            int32_t  textoffset,
794                                            int32_t      textlength)
795 {
796     int32_t result = textoffset; // first contraction character
797     while (result != textlength && ucol_unsafeCP(text[result], collator)) {
798         result ++;
799     }
800     return result;
801 }
802 
803 /**
804 * This checks for accents in the potential match started with a .
805 * composite character.
806 * This is really painful... we have to check that composite character do not
807 * have any extra accents. We have to normalize the potential match and find
808 * the immediate decomposed character before the match.
809 * The first composite character would have been taken care of by the fcd
810 * checks in checkForwardExactMatch.
811 * This is the slow path after the fcd of the first character and
812 * the last character has been checked by checkForwardExactMatch and we
813 * determine that the potential match has extra non-ignorable preceding
814 * ces.
815 * E.g. looking for \u0301 acute in \u01FA A ring above and acute,
816 * checkExtraMatchAccent should fail since there is a middle ring in \u01FA
817 * Note here that accents checking are slow and cautioned in the API docs.
818 * Internal method, status assumed to be a success, caller should check status
819 * before calling this method
820 * @param strsrch string search data
821 * @param start index of the potential unfriendly composite character
822 * @param end index of the potential unfriendly composite character
823 * @param status output error status if any.
824 * @return TRUE if there is non-ignorable accents before at the beginning
825 *              of the match, FALSE otherwise.
826 */
827 
828 static
checkExtraMatchAccents(const UStringSearch * strsrch,int32_t start,int32_t end,UErrorCode * status)829 UBool checkExtraMatchAccents(const UStringSearch *strsrch, int32_t start,
830                                    int32_t    end,
831                                    UErrorCode    *status)
832 {
833     UBool result = FALSE;
834     if (strsrch->pattern.hasPrefixAccents) {
835               int32_t  length = end - start;
836               int32_t  offset = 0;
837         const UChar       *text   = strsrch->search->text + start;
838 
839         U16_FWD_1(text, offset, length);
840         // we are only concerned with the first composite character
841         if (unorm_quickCheck(text, offset, UNORM_NFD, status) == UNORM_NO) {
842             int32_t safeoffset = getNextSafeOffset(strsrch->collator,
843                                                        text, 0, length);
844             if (safeoffset != length) {
845                 safeoffset ++;
846             }
847             UChar   *norm = NULL;
848             UChar    buffer[INITIAL_ARRAY_SIZE_];
849             int32_t  size = unorm_normalize(text, safeoffset, UNORM_NFD, 0,
850                                             buffer, INITIAL_ARRAY_SIZE_,
851                                             status);
852             if (U_FAILURE(*status)) {
853                 return FALSE;
854             }
855             if (size >= INITIAL_ARRAY_SIZE_) {
856                 norm = (UChar *)allocateMemory((size + 1) * sizeof(UChar),
857                                                status);
858                 // if allocation failed, status will be set to
859                 // U_MEMORY_ALLOCATION_ERROR and unorm_normalize internally
860                 // checks for it.
861                 size = unorm_normalize(text, safeoffset, UNORM_NFD, 0, norm,
862                                        size, status);
863                 if (U_FAILURE(*status) && norm != NULL) {
864                     uprv_free(norm);
865                     return FALSE;
866                 }
867             }
868             else {
869                 norm = buffer;
870             }
871 
872             UCollationElements *coleiter  = strsrch->utilIter;
873             ucol_setText(coleiter, norm, size, status);
874             uint32_t            firstce   = strsrch->pattern.ces[0];
875             UBool               ignorable = TRUE;
876             uint32_t            ce        = UCOL_IGNORABLE;
877             while (U_SUCCESS(*status) && ce != firstce && ce != (uint32_t)UCOL_NULLORDER) {
878                 offset = ucol_getOffset(coleiter);
879                 if (ce != firstce && ce != UCOL_IGNORABLE) {
880                     ignorable = FALSE;
881                 }
882                 ce = ucol_next(coleiter, status);
883             }
884             UChar32 codepoint;
885             U16_PREV(norm, 0, offset, codepoint);
886             result = !ignorable && (u_getCombiningClass(codepoint) != 0);
887 
888             if (norm != buffer) {
889                 uprv_free(norm);
890             }
891         }
892     }
893 
894     return result;
895 }
896 
897 /**
898 * Used by exact matches, checks if there are accents before the match.
899 * This is really painful... we have to check that composite characters at
900 * the start of the matches have to not have any extra accents.
901 * We check the FCD of the character first, if it starts with an accent and
902 * the first pattern ce does not match the first ce of the character, we bail.
903 * Otherwise we try normalizing the first composite
904 * character and find the immediate decomposed character before the match to
905 * see if it is an non-ignorable accent.
906 * Now normalizing the first composite character is enough because we ensure
907 * that when the match is passed in here with extra beginning ces, the
908 * first or last ce that match has to occur within the first character.
909 * E.g. looking for \u0301 acute in \u01FA A ring above and acute,
910 * checkExtraMatchAccent should fail since there is a middle ring in \u01FA
911 * Note here that accents checking are slow and cautioned in the API docs.
912 * @param strsrch string search data
913 * @param start offset
914 * @param end offset
915 * @return TRUE if there are accents on either side of the match,
916 *         FALSE otherwise
917 */
918 static
hasAccentsBeforeMatch(const UStringSearch * strsrch,int32_t start,int32_t end)919 UBool hasAccentsBeforeMatch(const UStringSearch *strsrch, int32_t start,
920                                   int32_t    end)
921 {
922     if (strsrch->pattern.hasPrefixAccents) {
923         UCollationElements *coleiter  = strsrch->textIter;
924         UErrorCode          status    = U_ZERO_ERROR;
925         // we have been iterating forwards previously
926         uint32_t            ignorable = TRUE;
927         int32_t             firstce   = strsrch->pattern.ces[0];
928 
929         setColEIterOffset(coleiter, start);
930         int32_t ce  = getCE(strsrch, ucol_next(coleiter, &status));
931         if (U_FAILURE(status)) {
932             return TRUE;
933         }
934         while (ce != firstce) {
935             if (ce != UCOL_IGNORABLE) {
936                 ignorable = FALSE;
937             }
938             ce = getCE(strsrch, ucol_next(coleiter, &status));
939             if (U_FAILURE(status) || ce == UCOL_NULLORDER) {
940                 return TRUE;
941             }
942         }
943         if (!ignorable && inNormBuf(coleiter)) {
944             // within normalization buffer, discontiguous handled here
945             return TRUE;
946         }
947 
948         // within text
949         int32_t temp = start;
950         // original code
951         // accent = (getFCD(strsrch->search->text, &temp,
952         //                  strsrch->search->textLength)
953         //            >> SECOND_LAST_BYTE_SHIFT_);
954         // however this code does not work well with VC7 .net in release mode.
955         // maybe the inlines for getFCD combined with shifting has bugs in
956         // VC7. anyways this is a work around.
957         UBool accent = getFCD(strsrch->search->text, &temp,
958                               strsrch->search->textLength) > 0xFF;
959         if (!accent) {
960             return checkExtraMatchAccents(strsrch, start, end, &status);
961         }
962         if (!ignorable) {
963             return TRUE;
964         }
965         if (start > 0) {
966             temp = start;
967             U16_BACK_1(strsrch->search->text, 0, temp);
968             if (getFCD(strsrch->search->text, &temp,
969                        strsrch->search->textLength) & LAST_BYTE_MASK_) {
970                 setColEIterOffset(coleiter, start);
971                 ce = ucol_previous(coleiter, &status);
972                 if (U_FAILURE(status) ||
973                     (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE)) {
974                     return TRUE;
975                 }
976             }
977         }
978     }
979 
980     return FALSE;
981 }
982 
983 /**
984 * Used by exact matches, checks if there are accents bounding the match.
985 * Note this is the initial boundary check. If the potential match
986 * starts or ends with composite characters, the accents in those
987 * characters will be determined later.
988 * Not doing backwards iteration here, since discontiguos contraction for
989 * backwards collation element iterator, use up too many characters.
990 * E.g. looking for \u030A ring in \u01FA A ring above and acute,
991 * should fail since there is a acute at the end of \u01FA
992 * Note here that accents checking are slow and cautioned in the API docs.
993 * @param strsrch string search data
994 * @param start offset of match
995 * @param end end offset of the match
996 * @return TRUE if there are accents on either side of the match,
997 *         FALSE otherwise
998 */
999 static
hasAccentsAfterMatch(const UStringSearch * strsrch,int32_t start,int32_t end)1000 UBool hasAccentsAfterMatch(const UStringSearch *strsrch, int32_t start,
1001                                  int32_t    end)
1002 {
1003     if (strsrch->pattern.hasSuffixAccents) {
1004         const UChar       *text       = strsrch->search->text;
1005               int32_t  temp       = end;
1006               int32_t      textlength = strsrch->search->textLength;
1007         U16_BACK_1(text, 0, temp);
1008         if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) {
1009             int32_t             firstce  = strsrch->pattern.ces[0];
1010             UCollationElements *coleiter = strsrch->textIter;
1011             UErrorCode          status   = U_ZERO_ERROR;
1012             int32_t ce;
1013             setColEIterOffset(coleiter, start);
1014             while ((ce = getCE(strsrch, ucol_next(coleiter, &status))) != firstce) {
1015                 if (U_FAILURE(status) || ce == UCOL_NULLORDER) {
1016                     return TRUE;
1017                 }
1018             }
1019             int32_t count = 1;
1020             while (count < strsrch->pattern.cesLength) {
1021                 if (getCE(strsrch, ucol_next(coleiter, &status))
1022                     == UCOL_IGNORABLE) {
1023                     // Thai can give an ignorable here.
1024                     count --;
1025                 }
1026                 if (U_FAILURE(status)) {
1027                     return TRUE;
1028                 }
1029                 count ++;
1030             }
1031 
1032             ce = ucol_next(coleiter, &status);
1033             if (U_FAILURE(status)) {
1034                 return TRUE;
1035             }
1036             if (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE) {
1037                 ce = getCE(strsrch, ce);
1038             }
1039             if (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE) {
1040                 if (ucol_getOffset(coleiter) <= end) {
1041                     return TRUE;
1042                 }
1043                 if (getFCD(text, &end, textlength) >> SECOND_LAST_BYTE_SHIFT_) {
1044                     return TRUE;
1045                 }
1046             }
1047         }
1048     }
1049     return FALSE;
1050 }
1051 #endif // #if BOYER_MOORE
1052 
1053 /**
1054 * Checks if the offset runs out of the text string
1055 * @param offset
1056 * @param textlength of the text string
1057 * @return TRUE if offset is out of bounds, FALSE otherwise
1058 */
1059 static
isOutOfBounds(int32_t textlength,int32_t offset)1060 inline UBool isOutOfBounds(int32_t textlength, int32_t offset)
1061 {
1062     return offset < 0 || offset > textlength;
1063 }
1064 
1065 /**
1066 * Checks for identical match
1067 * @param strsrch string search data
1068 * @param start offset of possible match
1069 * @param end offset of possible match
1070 * @return TRUE if identical match is found
1071 */
1072 static
checkIdentical(const UStringSearch * strsrch,int32_t start,int32_t end)1073 inline UBool checkIdentical(const UStringSearch *strsrch, int32_t start,
1074                                   int32_t    end)
1075 {
1076     if (strsrch->strength != UCOL_IDENTICAL) {
1077         return TRUE;
1078     }
1079 
1080     // Note: We could use Normalizer::compare() or similar, but for short strings
1081     // which may not be in FCD it might be faster to just NFD them.
1082     UErrorCode status = U_ZERO_ERROR;
1083     UnicodeString t2, p2;
1084     strsrch->nfd->normalize(
1085         UnicodeString(FALSE, strsrch->search->text + start, end - start), t2, status);
1086     strsrch->nfd->normalize(
1087         UnicodeString(FALSE, strsrch->pattern.text, strsrch->pattern.textLength), p2, status);
1088     // return FALSE if NFD failed
1089     return U_SUCCESS(status) && t2 == p2;
1090 }
1091 
1092 #if BOYER_MOORE
1093 /**
1094 * Checks to see if the match is repeated
1095 * @param strsrch string search data
1096 * @param start new match start index
1097 * @param end new match end index
1098 * @return TRUE if the the match is repeated, FALSE otherwise
1099 */
1100 static
checkRepeatedMatch(UStringSearch * strsrch,int32_t start,int32_t end)1101 inline UBool checkRepeatedMatch(UStringSearch *strsrch,
1102                                 int32_t    start,
1103                                 int32_t    end)
1104 {
1105     int32_t lastmatchindex = strsrch->search->matchedIndex;
1106     UBool       result;
1107     if (lastmatchindex == USEARCH_DONE) {
1108         return FALSE;
1109     }
1110     if (strsrch->search->isForwardSearching) {
1111         result = start <= lastmatchindex;
1112     }
1113     else {
1114         result = start >= lastmatchindex;
1115     }
1116     if (!result && !strsrch->search->isOverlap) {
1117         if (strsrch->search->isForwardSearching) {
1118             result = start < lastmatchindex + strsrch->search->matchedLength;
1119         }
1120         else {
1121             result = end > lastmatchindex;
1122         }
1123     }
1124     return result;
1125 }
1126 
1127 /**
1128 * Gets the collation element iterator's current offset.
1129 * @param coleiter collation element iterator
1130 * @param forwards flag TRUE if we are moving in th forwards direction
1131 * @return current offset
1132 */
1133 static
getColElemIterOffset(const UCollationElements * coleiter,UBool forwards)1134 inline int32_t getColElemIterOffset(const UCollationElements *coleiter,
1135                                               UBool               forwards)
1136 {
1137     int32_t result = ucol_getOffset(coleiter);
1138     // intricacies of the the backwards collation element iterator
1139     if (FALSE && !forwards && inNormBuf(coleiter) && !isFCDPointerNull(coleiter)) {
1140         result ++;
1141     }
1142     return result;
1143 }
1144 
1145 /**
1146 * Checks match for contraction.
1147 * If the match ends with a partial contraction we fail.
1148 * If the match starts too far off (because of backwards iteration) we try to
1149 * chip off the extra characters depending on whether a breakiterator has
1150 * been used.
1151 * Internal method, error assumed to be success, caller has to check status
1152 * before calling this method.
1153 * @param strsrch string search data
1154 * @param start offset of potential match, to be modified if necessary
1155 * @param end offset of potential match, to be modified if necessary
1156 * @param status output error status if any
1157 * @return TRUE if match passes the contraction test, FALSE otherwise
1158 */
1159 
1160 static
checkNextExactContractionMatch(UStringSearch * strsrch,int32_t * start,int32_t * end,UErrorCode * status)1161 UBool checkNextExactContractionMatch(UStringSearch *strsrch,
1162                                      int32_t   *start,
1163                                      int32_t   *end, UErrorCode  *status)
1164 {
1165           UCollationElements *coleiter   = strsrch->textIter;
1166           int32_t             textlength = strsrch->search->textLength;
1167           int32_t             temp       = *start;
1168     const UCollator          *collator   = strsrch->collator;
1169     const UChar              *text       = strsrch->search->text;
1170     // This part checks if either ends of the match contains potential
1171     // contraction. If so we'll have to iterate through them
1172     // The start contraction needs to be checked since ucol_previous dumps
1173     // all characters till the first safe character into the buffer.
1174     // *start + 1 is used to test for the unsafe characters instead of *start
1175     // because ucol_prev takes all unsafe characters till the first safe
1176     // character ie *start. so by testing *start + 1, we can estimate if
1177     // excess prefix characters has been included in the potential search
1178     // results.
1179     if ((*end < textlength && ucol_unsafeCP(text[*end], collator)) ||
1180         (*start + 1 < textlength
1181          && ucol_unsafeCP(text[*start + 1], collator))) {
1182         int32_t expansion  = getExpansionPrefix(coleiter);
1183         UBool   expandflag = expansion > 0;
1184         setColEIterOffset(coleiter, *start);
1185         while (expansion > 0) {
1186             // getting rid of the redundant ce, caused by setOffset.
1187             // since backward contraction/expansion may have extra ces if we
1188             // are in the normalization buffer, hasAccentsBeforeMatch would
1189             // have taken care of it.
1190             // E.g. the character \u01FA will have an expansion of 3, but if
1191             // we are only looking for acute and ring \u030A and \u0301, we'll
1192             // have to skip the first ce in the expansion buffer.
1193             ucol_next(coleiter, status);
1194             if (U_FAILURE(*status)) {
1195                 return FALSE;
1196             }
1197             if (ucol_getOffset(coleiter) != temp) {
1198                 *start = temp;
1199                 temp  = ucol_getOffset(coleiter);
1200             }
1201             expansion --;
1202         }
1203 
1204         int32_t  *patternce       = strsrch->pattern.ces;
1205         int32_t   patterncelength = strsrch->pattern.cesLength;
1206         int32_t   count           = 0;
1207         while (count < patterncelength) {
1208             int32_t ce = getCE(strsrch, ucol_next(coleiter, status));
1209             if (ce == UCOL_IGNORABLE) {
1210                 continue;
1211             }
1212             if (expandflag && count == 0 && ucol_getOffset(coleiter) != temp) {
1213                 *start = temp;
1214                 temp   = ucol_getOffset(coleiter);
1215             }
1216             if (U_FAILURE(*status) || ce != patternce[count]) {
1217                 (*end) ++;
1218                 *end = getNextUStringSearchBaseOffset(strsrch, *end);
1219                 return FALSE;
1220             }
1221             count ++;
1222         }
1223     }
1224     return TRUE;
1225 }
1226 
1227 /**
1228 * Checks and sets the match information if found.
1229 * Checks
1230 * <ul>
1231 * <li> the potential match does not repeat the previous match
1232 * <li> boundaries are correct
1233 * <li> exact matches has no extra accents
1234 * <li> identical matchesb
1235 * <li> potential match does not end in the middle of a contraction
1236 * <\ul>
1237 * Otherwise the offset will be shifted to the next character.
1238 * Internal method, status assumed to be success, caller has to check status
1239 * before calling this method.
1240 * @param strsrch string search data
1241 * @param textoffset offset in the collation element text. the returned value
1242 *        will be the truncated end offset of the match or the new start
1243 *        search offset.
1244 * @param status output error status if any
1245 * @return TRUE if the match is valid, FALSE otherwise
1246 */
1247 static
checkNextExactMatch(UStringSearch * strsrch,int32_t * textoffset,UErrorCode * status)1248 inline UBool checkNextExactMatch(UStringSearch *strsrch,
1249                                  int32_t   *textoffset, UErrorCode *status)
1250 {
1251     UCollationElements *coleiter = strsrch->textIter;
1252     int32_t         start    = getColElemIterOffset(coleiter, FALSE);
1253 
1254     if (!checkNextExactContractionMatch(strsrch, &start, textoffset, status)) {
1255         return FALSE;
1256     }
1257 
1258     // this totally matches, however we need to check if it is repeating
1259     if (!isBreakUnit(strsrch, start, *textoffset) ||
1260         checkRepeatedMatch(strsrch, start, *textoffset) ||
1261         hasAccentsBeforeMatch(strsrch, start, *textoffset) ||
1262         !checkIdentical(strsrch, start, *textoffset) ||
1263         hasAccentsAfterMatch(strsrch, start, *textoffset)) {
1264 
1265         (*textoffset) ++;
1266         *textoffset = getNextUStringSearchBaseOffset(strsrch, *textoffset);
1267         return FALSE;
1268     }
1269 
1270     //Add breakiterator boundary check for primary strength search.
1271     if (!strsrch->search->breakIter && strsrch->strength == UCOL_PRIMARY) {
1272         checkBreakBoundary(strsrch, &start, textoffset);
1273     }
1274 
1275     // totally match, we will get rid of the ending ignorables.
1276     strsrch->search->matchedIndex  = start;
1277     strsrch->search->matchedLength = *textoffset - start;
1278     return TRUE;
1279 }
1280 
1281 /**
1282 * Getting the previous base character offset, or the current offset if the
1283 * current character is a base character
1284 * @param text string
1285 * @param textoffset one offset after the current character
1286 * @return the offset of the next character after the base character or the first
1287 *         composed character with accents
1288 */
1289 static
getPreviousBaseOffset(const UChar * text,int32_t textoffset)1290 inline int32_t getPreviousBaseOffset(const UChar       *text,
1291                                                int32_t  textoffset)
1292 {
1293     if (textoffset > 0) {
1294         for (;;) {
1295             int32_t result = textoffset;
1296             U16_BACK_1(text, 0, textoffset);
1297             int32_t temp = textoffset;
1298             uint16_t fcd = getFCD(text, &temp, result);
1299             if ((fcd >> SECOND_LAST_BYTE_SHIFT_) == 0) {
1300                 if (fcd & LAST_BYTE_MASK_) {
1301                     return textoffset;
1302                 }
1303                 return result;
1304             }
1305             if (textoffset == 0) {
1306                 return 0;
1307             }
1308         }
1309     }
1310     return textoffset;
1311 }
1312 
1313 /**
1314 * Getting the indexes of the accents that are not blocked in the argument
1315 * accent array
1316 * @param accents array of accents in nfd terminated by a 0.
1317 * @param accentsindex array of indexes of the accents that are not blocked
1318 */
1319 static
getUnblockedAccentIndex(UChar * accents,int32_t * accentsindex)1320 inline int getUnblockedAccentIndex(UChar *accents, int32_t *accentsindex)
1321 {
1322     int32_t index     = 0;
1323     int32_t     length    = u_strlen(accents);
1324     UChar32     codepoint = 0;
1325     int         cclass    = 0;
1326     int         result    = 0;
1327     int32_t temp;
1328     while (index < length) {
1329         temp = index;
1330         U16_NEXT(accents, index, length, codepoint);
1331         if (u_getCombiningClass(codepoint) != cclass) {
1332             cclass        = u_getCombiningClass(codepoint);
1333             accentsindex[result] = temp;
1334             result ++;
1335         }
1336     }
1337     accentsindex[result] = length;
1338     return result;
1339 }
1340 
1341 /**
1342 * Appends 3 UChar arrays to a destination array.
1343 * Creates a new array if we run out of space. The caller will have to
1344 * manually deallocate the newly allocated array.
1345 * Internal method, status assumed to be success, caller has to check status
1346 * before calling this method. destination not to be NULL and has at least
1347 * size destinationlength.
1348 * @param destination target array
1349 * @param destinationlength target array size, returning the appended length
1350 * @param source1 null-terminated first array
1351 * @param source2 second array
1352 * @param source2length length of seond array
1353 * @param source3 null-terminated third array
1354 * @param status error status if any
1355 * @return new destination array, destination if there was no new allocation
1356 */
1357 static
addToUCharArray(UChar * destination,int32_t * destinationlength,const UChar * source1,const UChar * source2,int32_t source2length,const UChar * source3,UErrorCode * status)1358 inline UChar * addToUCharArray(      UChar      *destination,
1359                                      int32_t    *destinationlength,
1360                                const UChar      *source1,
1361                                const UChar      *source2,
1362                                      int32_t     source2length,
1363                                const UChar      *source3,
1364                                      UErrorCode *status)
1365 {
1366     int32_t source1length = source1 ? u_strlen(source1) : 0;
1367     int32_t source3length = source3 ? u_strlen(source3) : 0;
1368     if (*destinationlength < source1length + source2length + source3length +
1369                                                                            1)
1370     {
1371         destination = (UChar *)allocateMemory(
1372           (source1length + source2length + source3length + 1) * sizeof(UChar),
1373           status);
1374         // if error allocating memory, status will be
1375         // U_MEMORY_ALLOCATION_ERROR
1376         if (U_FAILURE(*status)) {
1377             *destinationlength = 0;
1378             return NULL;
1379         }
1380     }
1381     if (source1length != 0) {
1382         uprv_memcpy(destination, source1, sizeof(UChar) * source1length);
1383     }
1384     if (source2length != 0) {
1385         uprv_memcpy(destination + source1length, source2,
1386                     sizeof(UChar) * source2length);
1387     }
1388     if (source3length != 0) {
1389         uprv_memcpy(destination + source1length + source2length, source3,
1390                     sizeof(UChar) * source3length);
1391     }
1392     *destinationlength = source1length + source2length + source3length;
1393     return destination;
1394 }
1395 
1396 /**
1397 * Running through a collation element iterator to see if the contents matches
1398 * pattern in string search data
1399 * @param strsrch string search data
1400 * @param coleiter collation element iterator
1401 * @return TRUE if a match if found, FALSE otherwise
1402 */
1403 static
checkCollationMatch(const UStringSearch * strsrch,UCollationElements * coleiter)1404 inline UBool checkCollationMatch(const UStringSearch      *strsrch,
1405                                        UCollationElements *coleiter)
1406 {
1407     int         patternceindex = strsrch->pattern.cesLength;
1408     int32_t    *patternce      = strsrch->pattern.ces;
1409     UErrorCode  status = U_ZERO_ERROR;
1410     while (patternceindex > 0) {
1411         int32_t ce = getCE(strsrch, ucol_next(coleiter, &status));
1412         if (ce == UCOL_IGNORABLE) {
1413             continue;
1414         }
1415         if (U_FAILURE(status) || ce != *patternce) {
1416             return FALSE;
1417         }
1418         patternce ++;
1419         patternceindex --;
1420     }
1421     return TRUE;
1422 }
1423 
1424 /**
1425 * Rearranges the front accents to try matching.
1426 * Prefix accents in the text will be grouped according to their combining
1427 * class and the groups will be mixed and matched to try find the perfect
1428 * match with the pattern.
1429 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
1430 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
1431 *         "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
1432 *         "\u0301\u0325".
1433 * step 2: check if any of the generated substrings matches the pattern.
1434 * Internal method, status is assumed to be success, caller has to check status
1435 * before calling this method.
1436 * @param strsrch string search match
1437 * @param start first offset of the accents to start searching
1438 * @param end start of the last accent set
1439 * @param status output error status if any
1440 * @return USEARCH_DONE if a match is not found, otherwise return the starting
1441 *         offset of the match. Note this start includes all preceding accents.
1442 */
1443 static
doNextCanonicalPrefixMatch(UStringSearch * strsrch,int32_t start,int32_t end,UErrorCode * status)1444 int32_t doNextCanonicalPrefixMatch(UStringSearch *strsrch,
1445                                        int32_t    start,
1446                                        int32_t    end,
1447                                        UErrorCode    *status)
1448 {
1449     const UChar       *text       = strsrch->search->text;
1450           int32_t      textlength = strsrch->search->textLength;
1451           int32_t  tempstart  = start;
1452 
1453     if ((getFCD(text, &tempstart, textlength) & LAST_BYTE_MASK_) == 0) {
1454         // die... failed at a base character
1455         return USEARCH_DONE;
1456     }
1457 
1458     int32_t offset = getNextBaseOffset(text, tempstart, textlength);
1459     start = getPreviousBaseOffset(text, tempstart);
1460 
1461     UChar       accents[INITIAL_ARRAY_SIZE_];
1462     // normalizing the offensive string
1463     unorm_normalize(text + start, offset - start, UNORM_NFD, 0, accents,
1464                     INITIAL_ARRAY_SIZE_, status);
1465     if (U_FAILURE(*status)) {
1466         return USEARCH_DONE;
1467     }
1468 
1469     int32_t         accentsindex[INITIAL_ARRAY_SIZE_];
1470     int32_t         accentsize = getUnblockedAccentIndex(accents,
1471                                                                  accentsindex);
1472     int32_t         count      = (2 << (accentsize - 1)) - 1;
1473     UChar               buffer[INITIAL_ARRAY_SIZE_];
1474     UCollationElements *coleiter   = strsrch->utilIter;
1475     while (U_SUCCESS(*status) && count > 0) {
1476         UChar *rearrange = strsrch->canonicalPrefixAccents;
1477         // copy the base characters
1478         for (int k = 0; k < accentsindex[0]; k ++) {
1479             *rearrange ++ = accents[k];
1480         }
1481         // forming all possible canonical rearrangement by dropping
1482         // sets of accents
1483         for (int i = 0; i <= accentsize - 1; i ++) {
1484             int32_t mask = 1 << (accentsize - i - 1);
1485             if (count & mask) {
1486                 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
1487                     *rearrange ++ = accents[j];
1488                 }
1489             }
1490         }
1491         *rearrange = 0;
1492         int32_t  matchsize = INITIAL_ARRAY_SIZE_;
1493         UChar   *match     = addToUCharArray(buffer, &matchsize,
1494                                            strsrch->canonicalPrefixAccents,
1495                                            strsrch->search->text + offset,
1496                                            end - offset,
1497                                            strsrch->canonicalSuffixAccents,
1498                                            status);
1499 
1500         // if status is a failure, ucol_setText does nothing.
1501         // run the collator iterator through this match
1502         ucol_setText(coleiter, match, matchsize, status);
1503         if (U_SUCCESS(*status)) {
1504             if (checkCollationMatch(strsrch, coleiter)) {
1505                 if (match != buffer) {
1506                     uprv_free(match);
1507                 }
1508                 return start;
1509             }
1510         }
1511         count --;
1512     }
1513     return USEARCH_DONE;
1514 }
1515 
1516 /**
1517 * Gets the offset to the safe point in text before textoffset.
1518 * ie. not the middle of a contraction, swappable characters or supplementary
1519 * characters.
1520 * @param collator collation sata
1521 * @param text string to work with
1522 * @param textoffset offset in string
1523 * @param textlength length of text string
1524 * @return offset to the previous safe character
1525 */
1526 static
getPreviousSafeOffset(const UCollator * collator,const UChar * text,int32_t textoffset)1527 inline uint32_t getPreviousSafeOffset(const UCollator   *collator,
1528                                       const UChar       *text,
1529                                             int32_t  textoffset)
1530 {
1531     int32_t result = textoffset; // first contraction character
1532     while (result != 0 && ucol_unsafeCP(text[result - 1], collator)) {
1533         result --;
1534     }
1535     if (result != 0) {
1536         // the first contraction character is consider unsafe here
1537         result --;
1538     }
1539     return result;
1540 }
1541 
1542 /**
1543 * Cleaning up after we passed the safe zone
1544 * @param strsrch string search data
1545 * @param safetext safe text array
1546 * @param safebuffer safe text buffer
1547 * @param coleiter collation element iterator for safe text
1548 */
1549 static
cleanUpSafeText(const UStringSearch * strsrch,UChar * safetext,UChar * safebuffer)1550 inline void cleanUpSafeText(const UStringSearch *strsrch, UChar *safetext,
1551                                   UChar         *safebuffer)
1552 {
1553     if (safetext != safebuffer && safetext != strsrch->canonicalSuffixAccents)
1554     {
1555        uprv_free(safetext);
1556     }
1557 }
1558 
1559 /**
1560 * Take the rearranged end accents and tries matching. If match failed at
1561 * a seperate preceding set of accents (seperated from the rearranged on by
1562 * at least a base character) then we rearrange the preceding accents and
1563 * tries matching again.
1564 * We allow skipping of the ends of the accent set if the ces do not match.
1565 * However if the failure is found before the accent set, it fails.
1566 * Internal method, status assumed to be success, caller has to check status
1567 * before calling this method.
1568 * @param strsrch string search data
1569 * @param textoffset of the start of the rearranged accent
1570 * @param status output error status if any
1571 * @return USEARCH_DONE if a match is not found, otherwise return the starting
1572 *         offset of the match. Note this start includes all preceding accents.
1573 */
1574 static
doNextCanonicalSuffixMatch(UStringSearch * strsrch,int32_t textoffset,UErrorCode * status)1575 int32_t doNextCanonicalSuffixMatch(UStringSearch *strsrch,
1576                                        int32_t    textoffset,
1577                                        UErrorCode    *status)
1578 {
1579     const UChar              *text           = strsrch->search->text;
1580     const UCollator          *collator       = strsrch->collator;
1581           int32_t             safelength     = 0;
1582           UChar              *safetext;
1583           int32_t             safetextlength;
1584           UChar               safebuffer[INITIAL_ARRAY_SIZE_];
1585           UCollationElements *coleiter       = strsrch->utilIter;
1586           int32_t         safeoffset     = textoffset;
1587 
1588     if (textoffset != 0 && ucol_unsafeCP(strsrch->canonicalSuffixAccents[0],
1589                                          collator)) {
1590         safeoffset     = getPreviousSafeOffset(collator, text, textoffset);
1591         safelength     = textoffset - safeoffset;
1592         safetextlength = INITIAL_ARRAY_SIZE_;
1593         safetext       = addToUCharArray(safebuffer, &safetextlength, NULL,
1594                                          text + safeoffset, safelength,
1595                                          strsrch->canonicalSuffixAccents,
1596                                          status);
1597     }
1598     else {
1599         safetextlength = u_strlen(strsrch->canonicalSuffixAccents);
1600         safetext       = strsrch->canonicalSuffixAccents;
1601     }
1602 
1603     // if status is a failure, ucol_setText does nothing
1604     ucol_setText(coleiter, safetext, safetextlength, status);
1605     // status checked in loop below
1606 
1607     int32_t  *ce        = strsrch->pattern.ces;
1608     int32_t   celength  = strsrch->pattern.cesLength;
1609     int       ceindex   = celength - 1;
1610     UBool     isSafe    = TRUE; // indication flag for position in safe zone
1611 
1612     while (ceindex >= 0) {
1613         int32_t textce = ucol_previous(coleiter, status);
1614         if (U_FAILURE(*status)) {
1615             if (isSafe) {
1616                 cleanUpSafeText(strsrch, safetext, safebuffer);
1617             }
1618             return USEARCH_DONE;
1619         }
1620         if (textce == UCOL_NULLORDER) {
1621             // check if we have passed the safe buffer
1622             if (coleiter == strsrch->textIter) {
1623                 cleanUpSafeText(strsrch, safetext, safebuffer);
1624                 return USEARCH_DONE;
1625             }
1626             cleanUpSafeText(strsrch, safetext, safebuffer);
1627             safetext = safebuffer;
1628             coleiter = strsrch->textIter;
1629             setColEIterOffset(coleiter, safeoffset);
1630             // status checked at the start of the loop
1631             isSafe = FALSE;
1632             continue;
1633         }
1634         textce = getCE(strsrch, textce);
1635         if (textce != UCOL_IGNORABLE && textce != ce[ceindex]) {
1636             // do the beginning stuff
1637             int32_t failedoffset = getColElemIterOffset(coleiter, FALSE);
1638             if (isSafe && failedoffset >= safelength) {
1639                 // alas... no hope. failed at rearranged accent set
1640                 cleanUpSafeText(strsrch, safetext, safebuffer);
1641                 return USEARCH_DONE;
1642             }
1643             else {
1644                 if (isSafe) {
1645                     failedoffset += safeoffset;
1646                     cleanUpSafeText(strsrch, safetext, safebuffer);
1647                 }
1648 
1649                 // try rearranging the front accents
1650                 int32_t result = doNextCanonicalPrefixMatch(strsrch,
1651                                         failedoffset, textoffset, status);
1652                 if (result != USEARCH_DONE) {
1653                     // if status is a failure, ucol_setOffset does nothing
1654                     setColEIterOffset(strsrch->textIter, result);
1655                 }
1656                 if (U_FAILURE(*status)) {
1657                     return USEARCH_DONE;
1658                 }
1659                 return result;
1660             }
1661         }
1662         if (textce == ce[ceindex]) {
1663             ceindex --;
1664         }
1665     }
1666     // set offset here
1667     if (isSafe) {
1668         int32_t result     = getColElemIterOffset(coleiter, FALSE);
1669         // sets the text iterator here with the correct expansion and offset
1670         int32_t    leftoverces = getExpansionPrefix(coleiter);
1671         cleanUpSafeText(strsrch, safetext, safebuffer);
1672         if (result >= safelength) {
1673             result = textoffset;
1674         }
1675         else {
1676             result += safeoffset;
1677         }
1678         setColEIterOffset(strsrch->textIter, result);
1679         strsrch->textIter->iteratordata_.toReturn =
1680                        setExpansionPrefix(strsrch->textIter, leftoverces);
1681         return result;
1682     }
1683 
1684     return ucol_getOffset(coleiter);
1685 }
1686 
1687 /**
1688 * Trying out the substring and sees if it can be a canonical match.
1689 * This will try normalizing the end accents and arranging them into canonical
1690 * equivalents and check their corresponding ces with the pattern ce.
1691 * Suffix accents in the text will be grouped according to their combining
1692 * class and the groups will be mixed and matched to try find the perfect
1693 * match with the pattern.
1694 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
1695 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
1696 *         "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
1697 *         "\u0301\u0325".
1698 * step 2: check if any of the generated substrings matches the pattern.
1699 * Internal method, status assumed to be success, caller has to check status
1700 * before calling this method.
1701 * @param strsrch string search data
1702 * @param textoffset end offset in the collation element text that ends with
1703 *                   the accents to be rearranged
1704 * @param status error status if any
1705 * @return TRUE if the match is valid, FALSE otherwise
1706 */
1707 static
doNextCanonicalMatch(UStringSearch * strsrch,int32_t textoffset,UErrorCode * status)1708 UBool doNextCanonicalMatch(UStringSearch *strsrch,
1709                            int32_t    textoffset,
1710                            UErrorCode    *status)
1711 {
1712     const UChar       *text = strsrch->search->text;
1713           int32_t  temp = textoffset;
1714     U16_BACK_1(text, 0, temp);
1715     if ((getFCD(text, &temp, textoffset) & LAST_BYTE_MASK_) == 0) {
1716         UCollationElements *coleiter = strsrch->textIter;
1717         int32_t         offset   = getColElemIterOffset(coleiter, FALSE);
1718         if (strsrch->pattern.hasPrefixAccents) {
1719             offset = doNextCanonicalPrefixMatch(strsrch, offset, textoffset,
1720                                                 status);
1721             if (U_SUCCESS(*status) && offset != USEARCH_DONE) {
1722                 setColEIterOffset(coleiter, offset);
1723                 return TRUE;
1724             }
1725         }
1726         return FALSE;
1727     }
1728 
1729     if (!strsrch->pattern.hasSuffixAccents) {
1730         return FALSE;
1731     }
1732 
1733     UChar       accents[INITIAL_ARRAY_SIZE_];
1734     // offset to the last base character in substring to search
1735     int32_t baseoffset = getPreviousBaseOffset(text, textoffset);
1736     // normalizing the offensive string
1737     unorm_normalize(text + baseoffset, textoffset - baseoffset, UNORM_NFD,
1738                                0, accents, INITIAL_ARRAY_SIZE_, status);
1739     // status checked in loop below
1740 
1741     int32_t accentsindex[INITIAL_ARRAY_SIZE_];
1742     int32_t size = getUnblockedAccentIndex(accents, accentsindex);
1743 
1744     // 2 power n - 1 plus the full set of accents
1745     int32_t  count = (2 << (size - 1)) - 1;
1746     while (U_SUCCESS(*status) && count > 0) {
1747         UChar *rearrange = strsrch->canonicalSuffixAccents;
1748         // copy the base characters
1749         for (int k = 0; k < accentsindex[0]; k ++) {
1750             *rearrange ++ = accents[k];
1751         }
1752         // forming all possible canonical rearrangement by dropping
1753         // sets of accents
1754         for (int i = 0; i <= size - 1; i ++) {
1755             int32_t mask = 1 << (size - i - 1);
1756             if (count & mask) {
1757                 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
1758                     *rearrange ++ = accents[j];
1759                 }
1760             }
1761         }
1762         *rearrange = 0;
1763         int32_t offset = doNextCanonicalSuffixMatch(strsrch, baseoffset,
1764                                                         status);
1765         if (offset != USEARCH_DONE) {
1766             return TRUE; // match found
1767         }
1768         count --;
1769     }
1770     return FALSE;
1771 }
1772 
1773 /**
1774 * Gets the previous base character offset depending on the string search
1775 * pattern data
1776 * @param strsrch string search data
1777 * @param textoffset current offset, current character
1778 * @return the offset of the next character after this base character or itself
1779 *         if it is a composed character with accents
1780 */
1781 static
getPreviousUStringSearchBaseOffset(UStringSearch * strsrch,int32_t textoffset)1782 inline int32_t getPreviousUStringSearchBaseOffset(UStringSearch *strsrch,
1783                                                       int32_t textoffset)
1784 {
1785     if (strsrch->pattern.hasPrefixAccents && textoffset > 0) {
1786         const UChar       *text = strsrch->search->text;
1787               int32_t  offset = textoffset;
1788         if (getFCD(text, &offset, strsrch->search->textLength) >>
1789                                                    SECOND_LAST_BYTE_SHIFT_) {
1790             return getPreviousBaseOffset(text, textoffset);
1791         }
1792     }
1793     return textoffset;
1794 }
1795 
1796 /**
1797 * Checks match for contraction.
1798 * If the match ends with a partial contraction we fail.
1799 * If the match starts too far off (because of backwards iteration) we try to
1800 * chip off the extra characters
1801 * Internal method, status assumed to be success, caller has to check status
1802 * before calling this method.
1803 * @param strsrch string search data
1804 * @param start offset of potential match, to be modified if necessary
1805 * @param end offset of potential match, to be modified if necessary
1806 * @param status output error status if any
1807 * @return TRUE if match passes the contraction test, FALSE otherwise
1808 */
1809 static
checkNextCanonicalContractionMatch(UStringSearch * strsrch,int32_t * start,int32_t * end,UErrorCode * status)1810 UBool checkNextCanonicalContractionMatch(UStringSearch *strsrch,
1811                                          int32_t   *start,
1812                                          int32_t   *end,
1813                                          UErrorCode    *status)
1814 {
1815           UCollationElements *coleiter   = strsrch->textIter;
1816           int32_t             textlength = strsrch->search->textLength;
1817           int32_t         temp       = *start;
1818     const UCollator          *collator   = strsrch->collator;
1819     const UChar              *text       = strsrch->search->text;
1820     // This part checks if either ends of the match contains potential
1821     // contraction. If so we'll have to iterate through them
1822     if ((*end < textlength && ucol_unsafeCP(text[*end], collator)) ||
1823         (*start + 1 < textlength
1824          && ucol_unsafeCP(text[*start + 1], collator))) {
1825         int32_t expansion  = getExpansionPrefix(coleiter);
1826         UBool   expandflag = expansion > 0;
1827         setColEIterOffset(coleiter, *start);
1828         while (expansion > 0) {
1829             // getting rid of the redundant ce, caused by setOffset.
1830             // since backward contraction/expansion may have extra ces if we
1831             // are in the normalization buffer, hasAccentsBeforeMatch would
1832             // have taken care of it.
1833             // E.g. the character \u01FA will have an expansion of 3, but if
1834             // we are only looking for acute and ring \u030A and \u0301, we'll
1835             // have to skip the first ce in the expansion buffer.
1836             ucol_next(coleiter, status);
1837             if (U_FAILURE(*status)) {
1838                 return FALSE;
1839             }
1840             if (ucol_getOffset(coleiter) != temp) {
1841                 *start = temp;
1842                 temp  = ucol_getOffset(coleiter);
1843             }
1844             expansion --;
1845         }
1846 
1847         int32_t  *patternce       = strsrch->pattern.ces;
1848         int32_t   patterncelength = strsrch->pattern.cesLength;
1849         int32_t   count           = 0;
1850         int32_t   textlength      = strsrch->search->textLength;
1851         while (count < patterncelength) {
1852             int32_t ce = getCE(strsrch, ucol_next(coleiter, status));
1853             // status checked below, note that if status is a failure
1854             // ucol_next returns UCOL_NULLORDER
1855             if (ce == UCOL_IGNORABLE) {
1856                 continue;
1857             }
1858             if (expandflag && count == 0 && ucol_getOffset(coleiter) != temp) {
1859                 *start = temp;
1860                 temp   = ucol_getOffset(coleiter);
1861             }
1862 
1863             if (count == 0 && ce != patternce[0]) {
1864                 // accents may have extra starting ces, this occurs when a
1865                 // pure accent pattern is matched without rearrangement
1866                 // text \u0325\u0300 and looking for \u0300
1867                 int32_t expected = patternce[0];
1868                 if (getFCD(text, start, textlength) & LAST_BYTE_MASK_) {
1869                     ce = getCE(strsrch, ucol_next(coleiter, status));
1870                     while (U_SUCCESS(*status) && ce != expected &&
1871                            ce != UCOL_NULLORDER &&
1872                            ucol_getOffset(coleiter) <= *end) {
1873                         ce = getCE(strsrch, ucol_next(coleiter, status));
1874                     }
1875                 }
1876             }
1877             if (U_FAILURE(*status) || ce != patternce[count]) {
1878                 (*end) ++;
1879                 *end = getNextUStringSearchBaseOffset(strsrch, *end);
1880                 return FALSE;
1881             }
1882             count ++;
1883         }
1884     }
1885     return TRUE;
1886 }
1887 
1888 /**
1889 * Checks and sets the match information if found.
1890 * Checks
1891 * <ul>
1892 * <li> the potential match does not repeat the previous match
1893 * <li> boundaries are correct
1894 * <li> potential match does not end in the middle of a contraction
1895 * <li> identical matches
1896 * <\ul>
1897 * Otherwise the offset will be shifted to the next character.
1898 * Internal method, status assumed to be success, caller has to check the
1899 * status before calling this method.
1900 * @param strsrch string search data
1901 * @param textoffset offset in the collation element text. the returned value
1902 *        will be the truncated end offset of the match or the new start
1903 *        search offset.
1904 * @param status output error status if any
1905 * @return TRUE if the match is valid, FALSE otherwise
1906 */
1907 static
checkNextCanonicalMatch(UStringSearch * strsrch,int32_t * textoffset,UErrorCode * status)1908 inline UBool checkNextCanonicalMatch(UStringSearch *strsrch,
1909                                      int32_t   *textoffset,
1910                                      UErrorCode    *status)
1911 {
1912     // to ensure that the start and ends are not composite characters
1913     UCollationElements *coleiter = strsrch->textIter;
1914     // if we have a canonical accent match
1915     if ((strsrch->pattern.hasSuffixAccents &&
1916         strsrch->canonicalSuffixAccents[0]) ||
1917         (strsrch->pattern.hasPrefixAccents &&
1918         strsrch->canonicalPrefixAccents[0])) {
1919         strsrch->search->matchedIndex  = getPreviousUStringSearchBaseOffset(
1920                                                     strsrch,
1921                                                     ucol_getOffset(coleiter));
1922         strsrch->search->matchedLength = *textoffset -
1923                                                 strsrch->search->matchedIndex;
1924         return TRUE;
1925     }
1926 
1927     int32_t start = getColElemIterOffset(coleiter, FALSE);
1928     if (!checkNextCanonicalContractionMatch(strsrch, &start, textoffset,
1929                                             status) || U_FAILURE(*status)) {
1930         return FALSE;
1931     }
1932 
1933     start = getPreviousUStringSearchBaseOffset(strsrch, start);
1934     // this totally matches, however we need to check if it is repeating
1935     if (checkRepeatedMatch(strsrch, start, *textoffset) ||
1936         !isBreakUnit(strsrch, start, *textoffset) ||
1937         !checkIdentical(strsrch, start, *textoffset)) {
1938         (*textoffset) ++;
1939         *textoffset = getNextBaseOffset(strsrch->search->text, *textoffset,
1940                                         strsrch->search->textLength);
1941         return FALSE;
1942     }
1943 
1944     strsrch->search->matchedIndex  = start;
1945     strsrch->search->matchedLength = *textoffset - start;
1946     return TRUE;
1947 }
1948 
1949 /**
1950 * Shifting the collation element iterator position forward to prepare for
1951 * a preceding match. If the first character is a unsafe character, we'll only
1952 * shift by 1 to capture contractions, normalization etc.
1953 * Internal method, status assumed to be success, caller has to check status
1954 * before calling this method.
1955 * @param text strsrch string search data
1956 * @param textoffset start text position to do search
1957 * @param ce the text ce which failed the match.
1958 * @param patternceindex index of the ce within the pattern ce buffer which
1959 *        failed the match
1960 * @return final offset
1961 */
1962 static
reverseShift(UStringSearch * strsrch,int32_t textoffset,int32_t ce,int32_t patternceindex)1963 inline int32_t reverseShift(UStringSearch *strsrch,
1964                                 int32_t    textoffset,
1965                                 int32_t       ce,
1966                                 int32_t        patternceindex)
1967 {
1968     if (strsrch->search->isOverlap) {
1969         if (textoffset != strsrch->search->textLength) {
1970             textoffset --;
1971         }
1972         else {
1973             textoffset -= strsrch->pattern.defaultShiftSize;
1974         }
1975     }
1976     else {
1977         if (ce != UCOL_NULLORDER) {
1978             int32_t shift = strsrch->pattern.backShift[hashFromCE32(ce)];
1979 
1980             // this is to adjust for characters in the middle of the substring
1981             // for matching that failed.
1982             int32_t adjust = patternceindex;
1983             if (adjust > 1 && shift > adjust) {
1984                 shift -= adjust - 1;
1985             }
1986             textoffset -= shift;
1987         }
1988         else {
1989             textoffset -= strsrch->pattern.defaultShiftSize;
1990         }
1991     }
1992     textoffset = getPreviousUStringSearchBaseOffset(strsrch, textoffset);
1993     return textoffset;
1994 }
1995 
1996 /**
1997 * Checks match for contraction.
1998 * If the match starts with a partial contraction we fail.
1999 * Internal method, status assumed to be success, caller has to check status
2000 * before calling this method.
2001 * @param strsrch string search data
2002 * @param start offset of potential match, to be modified if necessary
2003 * @param end offset of potential match, to be modified if necessary
2004 * @param status output error status if any
2005 * @return TRUE if match passes the contraction test, FALSE otherwise
2006 */
2007 static
checkPreviousExactContractionMatch(UStringSearch * strsrch,int32_t * start,int32_t * end,UErrorCode * status)2008 UBool checkPreviousExactContractionMatch(UStringSearch *strsrch,
2009                                      int32_t   *start,
2010                                      int32_t   *end, UErrorCode  *status)
2011 {
2012           UCollationElements *coleiter   = strsrch->textIter;
2013           int32_t             textlength = strsrch->search->textLength;
2014           int32_t             temp       = *end;
2015     const UCollator          *collator   = strsrch->collator;
2016     const UChar              *text       = strsrch->search->text;
2017     // This part checks if either if the start of the match contains potential
2018     // contraction. If so we'll have to iterate through them
2019     // Since we used ucol_next while previously looking for the potential
2020     // match, this guarantees that our end will not be a partial contraction,
2021     // or a partial supplementary character.
2022     if (*start < textlength && ucol_unsafeCP(text[*start], collator)) {
2023         int32_t expansion  = getExpansionSuffix(coleiter);
2024         UBool   expandflag = expansion > 0;
2025         setColEIterOffset(coleiter, *end);
2026         while (U_SUCCESS(*status) && expansion > 0) {
2027             // getting rid of the redundant ce
2028             // since forward contraction/expansion may have extra ces
2029             // if we are in the normalization buffer, hasAccentsBeforeMatch
2030             // would have taken care of it.
2031             // E.g. the character \u01FA will have an expansion of 3, but if
2032             // we are only looking for A ring A\u030A, we'll have to skip the
2033             // last ce in the expansion buffer
2034             ucol_previous(coleiter, status);
2035             if (U_FAILURE(*status)) {
2036                 return FALSE;
2037             }
2038             if (ucol_getOffset(coleiter) != temp) {
2039                 *end = temp;
2040                 temp  = ucol_getOffset(coleiter);
2041             }
2042             expansion --;
2043         }
2044 
2045         int32_t  *patternce       = strsrch->pattern.ces;
2046         int32_t   patterncelength = strsrch->pattern.cesLength;
2047         int32_t   count           = patterncelength;
2048         while (count > 0) {
2049             int32_t ce = getCE(strsrch, ucol_previous(coleiter, status));
2050             // status checked below, note that if status is a failure
2051             // ucol_previous returns UCOL_NULLORDER
2052             if (ce == UCOL_IGNORABLE) {
2053                 continue;
2054             }
2055             if (expandflag && count == 0 &&
2056                 getColElemIterOffset(coleiter, FALSE) != temp) {
2057                 *end = temp;
2058                 temp  = ucol_getOffset(coleiter);
2059             }
2060             if (U_FAILURE(*status) || ce != patternce[count - 1]) {
2061                 (*start) --;
2062                 *start = getPreviousBaseOffset(text, *start);
2063                 return FALSE;
2064             }
2065             count --;
2066         }
2067     }
2068     return TRUE;
2069 }
2070 
2071 /**
2072 * Checks and sets the match information if found.
2073 * Checks
2074 * <ul>
2075 * <li> the current match does not repeat the last match
2076 * <li> boundaries are correct
2077 * <li> exact matches has no extra accents
2078 * <li> identical matches
2079 * <\ul>
2080 * Otherwise the offset will be shifted to the preceding character.
2081 * Internal method, status assumed to be success, caller has to check status
2082 * before calling this method.
2083 * @param strsrch string search data
2084 * @param collator
2085 * @param coleiter collation element iterator
2086 * @param text string
2087 * @param textoffset offset in the collation element text. the returned value
2088 *        will be the truncated start offset of the match or the new start
2089 *        search offset.
2090 * @param status output error status if any
2091 * @return TRUE if the match is valid, FALSE otherwise
2092 */
2093 static
checkPreviousExactMatch(UStringSearch * strsrch,int32_t * textoffset,UErrorCode * status)2094 inline UBool checkPreviousExactMatch(UStringSearch *strsrch,
2095                                      int32_t   *textoffset,
2096                                      UErrorCode    *status)
2097 {
2098     // to ensure that the start and ends are not composite characters
2099     int32_t end = ucol_getOffset(strsrch->textIter);
2100     if (!checkPreviousExactContractionMatch(strsrch, textoffset, &end, status)
2101         || U_FAILURE(*status)) {
2102             return FALSE;
2103     }
2104 
2105     // this totally matches, however we need to check if it is repeating
2106     // the old match
2107     if (checkRepeatedMatch(strsrch, *textoffset, end) ||
2108         !isBreakUnit(strsrch, *textoffset, end) ||
2109         hasAccentsBeforeMatch(strsrch, *textoffset, end) ||
2110         !checkIdentical(strsrch, *textoffset, end) ||
2111         hasAccentsAfterMatch(strsrch, *textoffset, end)) {
2112         (*textoffset) --;
2113         *textoffset = getPreviousBaseOffset(strsrch->search->text,
2114                                             *textoffset);
2115         return FALSE;
2116     }
2117 
2118     //Add breakiterator boundary check for primary strength search.
2119     if (!strsrch->search->breakIter && strsrch->strength == UCOL_PRIMARY) {
2120         checkBreakBoundary(strsrch, textoffset, &end);
2121     }
2122 
2123     strsrch->search->matchedIndex = *textoffset;
2124     strsrch->search->matchedLength = end - *textoffset;
2125     return TRUE;
2126 }
2127 
2128 /**
2129 * Rearranges the end accents to try matching.
2130 * Suffix accents in the text will be grouped according to their combining
2131 * class and the groups will be mixed and matched to try find the perfect
2132 * match with the pattern.
2133 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
2134 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
2135 *         "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
2136 *         "\u0301\u0325".
2137 * step 2: check if any of the generated substrings matches the pattern.
2138 * Internal method, status assumed to be success, user has to check status
2139 * before calling this method.
2140 * @param strsrch string search match
2141 * @param start offset of the first base character
2142 * @param end start of the last accent set
2143 * @param status only error status if any
2144 * @return USEARCH_DONE if a match is not found, otherwise return the ending
2145 *         offset of the match. Note this start includes all following accents.
2146 */
2147 static
doPreviousCanonicalSuffixMatch(UStringSearch * strsrch,int32_t start,int32_t end,UErrorCode * status)2148 int32_t doPreviousCanonicalSuffixMatch(UStringSearch *strsrch,
2149                                            int32_t    start,
2150                                            int32_t    end,
2151                                            UErrorCode    *status)
2152 {
2153     const UChar       *text       = strsrch->search->text;
2154           int32_t  tempend    = end;
2155 
2156     U16_BACK_1(text, 0, tempend);
2157     if (!(getFCD(text, &tempend, strsrch->search->textLength) &
2158                                                            LAST_BYTE_MASK_)) {
2159         // die... failed at a base character
2160         return USEARCH_DONE;
2161     }
2162     end = getNextBaseOffset(text, end, strsrch->search->textLength);
2163 
2164     if (U_SUCCESS(*status)) {
2165         UChar       accents[INITIAL_ARRAY_SIZE_];
2166         int32_t offset = getPreviousBaseOffset(text, end);
2167         // normalizing the offensive string
2168         unorm_normalize(text + offset, end - offset, UNORM_NFD, 0, accents,
2169                         INITIAL_ARRAY_SIZE_, status);
2170 
2171         int32_t         accentsindex[INITIAL_ARRAY_SIZE_];
2172         int32_t         accentsize = getUnblockedAccentIndex(accents,
2173                                                          accentsindex);
2174         int32_t         count      = (2 << (accentsize - 1)) - 1;
2175         UChar               buffer[INITIAL_ARRAY_SIZE_];
2176         UCollationElements *coleiter = strsrch->utilIter;
2177         while (U_SUCCESS(*status) && count > 0) {
2178             UChar *rearrange = strsrch->canonicalSuffixAccents;
2179             // copy the base characters
2180             for (int k = 0; k < accentsindex[0]; k ++) {
2181                 *rearrange ++ = accents[k];
2182             }
2183             // forming all possible canonical rearrangement by dropping
2184             // sets of accents
2185             for (int i = 0; i <= accentsize - 1; i ++) {
2186                 int32_t mask = 1 << (accentsize - i - 1);
2187                 if (count & mask) {
2188                     for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
2189                         *rearrange ++ = accents[j];
2190                     }
2191                 }
2192             }
2193             *rearrange = 0;
2194             int32_t  matchsize = INITIAL_ARRAY_SIZE_;
2195             UChar   *match     = addToUCharArray(buffer, &matchsize,
2196                                            strsrch->canonicalPrefixAccents,
2197                                            strsrch->search->text + start,
2198                                            offset - start,
2199                                            strsrch->canonicalSuffixAccents,
2200                                            status);
2201 
2202             // run the collator iterator through this match
2203             // if status is a failure ucol_setText does nothing
2204             ucol_setText(coleiter, match, matchsize, status);
2205             if (U_SUCCESS(*status)) {
2206                 if (checkCollationMatch(strsrch, coleiter)) {
2207                     if (match != buffer) {
2208                         uprv_free(match);
2209                     }
2210                     return end;
2211                 }
2212             }
2213             count --;
2214         }
2215     }
2216     return USEARCH_DONE;
2217 }
2218 
2219 /**
2220 * Take the rearranged start accents and tries matching. If match failed at
2221 * a seperate following set of accents (seperated from the rearranged on by
2222 * at least a base character) then we rearrange the preceding accents and
2223 * tries matching again.
2224 * We allow skipping of the ends of the accent set if the ces do not match.
2225 * However if the failure is found before the accent set, it fails.
2226 * Internal method, status assumed to be success, caller has to check status
2227 * before calling this method.
2228 * @param strsrch string search data
2229 * @param textoffset of the ends of the rearranged accent
2230 * @param status output error status if any
2231 * @return USEARCH_DONE if a match is not found, otherwise return the ending
2232 *         offset of the match. Note this start includes all following accents.
2233 */
2234 static
doPreviousCanonicalPrefixMatch(UStringSearch * strsrch,int32_t textoffset,UErrorCode * status)2235 int32_t doPreviousCanonicalPrefixMatch(UStringSearch *strsrch,
2236                                            int32_t    textoffset,
2237                                            UErrorCode    *status)
2238 {
2239     const UChar       *text       = strsrch->search->text;
2240     const UCollator   *collator   = strsrch->collator;
2241           int32_t      safelength = 0;
2242           UChar       *safetext;
2243           int32_t      safetextlength;
2244           UChar        safebuffer[INITIAL_ARRAY_SIZE_];
2245           int32_t  safeoffset = textoffset;
2246 
2247     if (textoffset &&
2248         ucol_unsafeCP(strsrch->canonicalPrefixAccents[
2249                                  u_strlen(strsrch->canonicalPrefixAccents) - 1
2250                                          ], collator)) {
2251         safeoffset     = getNextSafeOffset(collator, text, textoffset,
2252                                            strsrch->search->textLength);
2253         safelength     = safeoffset - textoffset;
2254         safetextlength = INITIAL_ARRAY_SIZE_;
2255         safetext       = addToUCharArray(safebuffer, &safetextlength,
2256                                          strsrch->canonicalPrefixAccents,
2257                                          text + textoffset, safelength,
2258                                          NULL, status);
2259     }
2260     else {
2261         safetextlength = u_strlen(strsrch->canonicalPrefixAccents);
2262         safetext       = strsrch->canonicalPrefixAccents;
2263     }
2264 
2265     UCollationElements *coleiter = strsrch->utilIter;
2266      // if status is a failure, ucol_setText does nothing
2267     ucol_setText(coleiter, safetext, safetextlength, status);
2268     // status checked in loop below
2269 
2270     int32_t  *ce           = strsrch->pattern.ces;
2271     int32_t   celength     = strsrch->pattern.cesLength;
2272     int       ceindex      = 0;
2273     UBool     isSafe       = TRUE; // safe zone indication flag for position
2274     int32_t   prefixlength = u_strlen(strsrch->canonicalPrefixAccents);
2275 
2276     while (ceindex < celength) {
2277         int32_t textce = ucol_next(coleiter, status);
2278         if (U_FAILURE(*status)) {
2279             if (isSafe) {
2280                 cleanUpSafeText(strsrch, safetext, safebuffer);
2281             }
2282             return USEARCH_DONE;
2283         }
2284         if (textce == UCOL_NULLORDER) {
2285             // check if we have passed the safe buffer
2286             if (coleiter == strsrch->textIter) {
2287                 cleanUpSafeText(strsrch, safetext, safebuffer);
2288                 return USEARCH_DONE;
2289             }
2290             cleanUpSafeText(strsrch, safetext, safebuffer);
2291             safetext = safebuffer;
2292             coleiter = strsrch->textIter;
2293             setColEIterOffset(coleiter, safeoffset);
2294             // status checked at the start of the loop
2295             isSafe = FALSE;
2296             continue;
2297         }
2298         textce = getCE(strsrch, textce);
2299         if (textce != UCOL_IGNORABLE && textce != ce[ceindex]) {
2300             // do the beginning stuff
2301             int32_t failedoffset = ucol_getOffset(coleiter);
2302             if (isSafe && failedoffset <= prefixlength) {
2303                 // alas... no hope. failed at rearranged accent set
2304                 cleanUpSafeText(strsrch, safetext, safebuffer);
2305                 return USEARCH_DONE;
2306             }
2307             else {
2308                 if (isSafe) {
2309                     failedoffset = safeoffset - failedoffset;
2310                     cleanUpSafeText(strsrch, safetext, safebuffer);
2311                 }
2312 
2313                 // try rearranging the end accents
2314                 int32_t result = doPreviousCanonicalSuffixMatch(strsrch,
2315                                         textoffset, failedoffset, status);
2316                 if (result != USEARCH_DONE) {
2317                     // if status is a failure, ucol_setOffset does nothing
2318                     setColEIterOffset(strsrch->textIter, result);
2319                 }
2320                 if (U_FAILURE(*status)) {
2321                     return USEARCH_DONE;
2322                 }
2323                 return result;
2324             }
2325         }
2326         if (textce == ce[ceindex]) {
2327             ceindex ++;
2328         }
2329     }
2330     // set offset here
2331     if (isSafe) {
2332         int32_t result      = ucol_getOffset(coleiter);
2333         // sets the text iterator here with the correct expansion and offset
2334         int32_t     leftoverces = getExpansionSuffix(coleiter);
2335         cleanUpSafeText(strsrch, safetext, safebuffer);
2336         if (result <= prefixlength) {
2337             result = textoffset;
2338         }
2339         else {
2340             result = textoffset + (safeoffset - result);
2341         }
2342         setColEIterOffset(strsrch->textIter, result);
2343         setExpansionSuffix(strsrch->textIter, leftoverces);
2344         return result;
2345     }
2346 
2347     return ucol_getOffset(coleiter);
2348 }
2349 
2350 /**
2351 * Trying out the substring and sees if it can be a canonical match.
2352 * This will try normalizing the starting accents and arranging them into
2353 * canonical equivalents and check their corresponding ces with the pattern ce.
2354 * Prefix accents in the text will be grouped according to their combining
2355 * class and the groups will be mixed and matched to try find the perfect
2356 * match with the pattern.
2357 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
2358 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
2359 *         "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
2360 *         "\u0301\u0325".
2361 * step 2: check if any of the generated substrings matches the pattern.
2362 * Internal method, status assumed to be success, caller has to check status
2363 * before calling this method.
2364 * @param strsrch string search data
2365 * @param textoffset start offset in the collation element text that starts
2366 *                   with the accents to be rearranged
2367 * @param status output error status if any
2368 * @return TRUE if the match is valid, FALSE otherwise
2369 */
2370 static
doPreviousCanonicalMatch(UStringSearch * strsrch,int32_t textoffset,UErrorCode * status)2371 UBool doPreviousCanonicalMatch(UStringSearch *strsrch,
2372                                int32_t    textoffset,
2373                                UErrorCode    *status)
2374 {
2375     const UChar       *text       = strsrch->search->text;
2376           int32_t  temp       = textoffset;
2377           int32_t      textlength = strsrch->search->textLength;
2378     if ((getFCD(text, &temp, textlength) >> SECOND_LAST_BYTE_SHIFT_) == 0) {
2379         UCollationElements *coleiter = strsrch->textIter;
2380         int32_t         offset   = ucol_getOffset(coleiter);
2381         if (strsrch->pattern.hasSuffixAccents) {
2382             offset = doPreviousCanonicalSuffixMatch(strsrch, textoffset,
2383                                                     offset, status);
2384             if (U_SUCCESS(*status) && offset != USEARCH_DONE) {
2385                 setColEIterOffset(coleiter, offset);
2386                 return TRUE;
2387             }
2388         }
2389         return FALSE;
2390     }
2391 
2392     if (!strsrch->pattern.hasPrefixAccents) {
2393         return FALSE;
2394     }
2395 
2396     UChar       accents[INITIAL_ARRAY_SIZE_];
2397     // offset to the last base character in substring to search
2398     int32_t baseoffset = getNextBaseOffset(text, textoffset, textlength);
2399     // normalizing the offensive string
2400     unorm_normalize(text + textoffset, baseoffset - textoffset, UNORM_NFD,
2401                                0, accents, INITIAL_ARRAY_SIZE_, status);
2402     // status checked in loop
2403 
2404     int32_t accentsindex[INITIAL_ARRAY_SIZE_];
2405     int32_t size = getUnblockedAccentIndex(accents, accentsindex);
2406 
2407     // 2 power n - 1 plus the full set of accents
2408     int32_t  count = (2 << (size - 1)) - 1;
2409     while (U_SUCCESS(*status) && count > 0) {
2410         UChar *rearrange = strsrch->canonicalPrefixAccents;
2411         // copy the base characters
2412         for (int k = 0; k < accentsindex[0]; k ++) {
2413             *rearrange ++ = accents[k];
2414         }
2415         // forming all possible canonical rearrangement by dropping
2416         // sets of accents
2417         for (int i = 0; i <= size - 1; i ++) {
2418             int32_t mask = 1 << (size - i - 1);
2419             if (count & mask) {
2420                 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
2421                     *rearrange ++ = accents[j];
2422                 }
2423             }
2424         }
2425         *rearrange = 0;
2426         int32_t offset = doPreviousCanonicalPrefixMatch(strsrch,
2427                                                           baseoffset, status);
2428         if (offset != USEARCH_DONE) {
2429             return TRUE; // match found
2430         }
2431         count --;
2432     }
2433     return FALSE;
2434 }
2435 
2436 /**
2437 * Checks match for contraction.
2438 * If the match starts with a partial contraction we fail.
2439 * Internal method, status assumed to be success, caller has to check status
2440 * before calling this method.
2441 * @param strsrch string search data
2442 * @param start offset of potential match, to be modified if necessary
2443 * @param end offset of potential match, to be modified if necessary
2444 * @param status only error status if any
2445 * @return TRUE if match passes the contraction test, FALSE otherwise
2446 */
2447 static
checkPreviousCanonicalContractionMatch(UStringSearch * strsrch,int32_t * start,int32_t * end,UErrorCode * status)2448 UBool checkPreviousCanonicalContractionMatch(UStringSearch *strsrch,
2449                                      int32_t   *start,
2450                                      int32_t   *end, UErrorCode  *status)
2451 {
2452           UCollationElements *coleiter   = strsrch->textIter;
2453           int32_t             textlength = strsrch->search->textLength;
2454           int32_t         temp       = *end;
2455     const UCollator          *collator   = strsrch->collator;
2456     const UChar              *text       = strsrch->search->text;
2457     // This part checks if either if the start of the match contains potential
2458     // contraction. If so we'll have to iterate through them
2459     // Since we used ucol_next while previously looking for the potential
2460     // match, this guarantees that our end will not be a partial contraction,
2461     // or a partial supplementary character.
2462     if (*start < textlength && ucol_unsafeCP(text[*start], collator)) {
2463         int32_t expansion  = getExpansionSuffix(coleiter);
2464         UBool   expandflag = expansion > 0;
2465         setColEIterOffset(coleiter, *end);
2466         while (expansion > 0) {
2467             // getting rid of the redundant ce
2468             // since forward contraction/expansion may have extra ces
2469             // if we are in the normalization buffer, hasAccentsBeforeMatch
2470             // would have taken care of it.
2471             // E.g. the character \u01FA will have an expansion of 3, but if
2472             // we are only looking for A ring A\u030A, we'll have to skip the
2473             // last ce in the expansion buffer
2474             ucol_previous(coleiter, status);
2475             if (U_FAILURE(*status)) {
2476                 return FALSE;
2477             }
2478             if (ucol_getOffset(coleiter) != temp) {
2479                 *end = temp;
2480                 temp  = ucol_getOffset(coleiter);
2481             }
2482             expansion --;
2483         }
2484 
2485         int32_t  *patternce       = strsrch->pattern.ces;
2486         int32_t   patterncelength = strsrch->pattern.cesLength;
2487         int32_t   count           = patterncelength;
2488         while (count > 0) {
2489             int32_t ce = getCE(strsrch, ucol_previous(coleiter, status));
2490             // status checked below, note that if status is a failure
2491             // ucol_previous returns UCOL_NULLORDER
2492             if (ce == UCOL_IGNORABLE) {
2493                 continue;
2494             }
2495             if (expandflag && count == 0 &&
2496                 getColElemIterOffset(coleiter, FALSE) != temp) {
2497                 *end = temp;
2498                 temp  = ucol_getOffset(coleiter);
2499             }
2500             if (count == patterncelength &&
2501                 ce != patternce[patterncelength - 1]) {
2502                 // accents may have extra starting ces, this occurs when a
2503                 // pure accent pattern is matched without rearrangement
2504                 int32_t    expected = patternce[patterncelength - 1];
2505                 U16_BACK_1(text, 0, *end);
2506                 if (getFCD(text, end, textlength) & LAST_BYTE_MASK_) {
2507                     ce = getCE(strsrch, ucol_previous(coleiter, status));
2508                     while (U_SUCCESS(*status) && ce != expected &&
2509                            ce != UCOL_NULLORDER &&
2510                            ucol_getOffset(coleiter) <= *start) {
2511                         ce = getCE(strsrch, ucol_previous(coleiter, status));
2512                     }
2513                 }
2514             }
2515             if (U_FAILURE(*status) || ce != patternce[count - 1]) {
2516                 (*start) --;
2517                 *start = getPreviousBaseOffset(text, *start);
2518                 return FALSE;
2519             }
2520             count --;
2521         }
2522     }
2523     return TRUE;
2524 }
2525 
2526 /**
2527 * Checks and sets the match information if found.
2528 * Checks
2529 * <ul>
2530 * <li> the potential match does not repeat the previous match
2531 * <li> boundaries are correct
2532 * <li> potential match does not end in the middle of a contraction
2533 * <li> identical matches
2534 * <\ul>
2535 * Otherwise the offset will be shifted to the next character.
2536 * Internal method, status assumed to be success, caller has to check status
2537 * before calling this method.
2538 * @param strsrch string search data
2539 * @param textoffset offset in the collation element text. the returned value
2540 *        will be the truncated start offset of the match or the new start
2541 *        search offset.
2542 * @param status only error status if any
2543 * @return TRUE if the match is valid, FALSE otherwise
2544 */
2545 static
checkPreviousCanonicalMatch(UStringSearch * strsrch,int32_t * textoffset,UErrorCode * status)2546 inline UBool checkPreviousCanonicalMatch(UStringSearch *strsrch,
2547                                          int32_t   *textoffset,
2548                                          UErrorCode    *status)
2549 {
2550     // to ensure that the start and ends are not composite characters
2551     UCollationElements *coleiter = strsrch->textIter;
2552     // if we have a canonical accent match
2553     if ((strsrch->pattern.hasSuffixAccents &&
2554         strsrch->canonicalSuffixAccents[0]) ||
2555         (strsrch->pattern.hasPrefixAccents &&
2556         strsrch->canonicalPrefixAccents[0])) {
2557         strsrch->search->matchedIndex  = *textoffset;
2558         strsrch->search->matchedLength =
2559             getNextUStringSearchBaseOffset(strsrch,
2560                                       getColElemIterOffset(coleiter, FALSE))
2561             - *textoffset;
2562         return TRUE;
2563     }
2564 
2565     int32_t end = ucol_getOffset(coleiter);
2566     if (!checkPreviousCanonicalContractionMatch(strsrch, textoffset, &end,
2567                                                 status) ||
2568          U_FAILURE(*status)) {
2569         return FALSE;
2570     }
2571 
2572     end = getNextUStringSearchBaseOffset(strsrch, end);
2573     // this totally matches, however we need to check if it is repeating
2574     if (checkRepeatedMatch(strsrch, *textoffset, end) ||
2575         !isBreakUnit(strsrch, *textoffset, end) ||
2576         !checkIdentical(strsrch, *textoffset, end)) {
2577         (*textoffset) --;
2578         *textoffset = getPreviousBaseOffset(strsrch->search->text,
2579                                             *textoffset);
2580         return FALSE;
2581     }
2582 
2583     strsrch->search->matchedIndex  = *textoffset;
2584     strsrch->search->matchedLength = end - *textoffset;
2585     return TRUE;
2586 }
2587 #endif // #if BOYER_MOORE
2588 
2589 // constructors and destructor -------------------------------------------
2590 
usearch_open(const UChar * pattern,int32_t patternlength,const UChar * text,int32_t textlength,const char * locale,UBreakIterator * breakiter,UErrorCode * status)2591 U_CAPI UStringSearch * U_EXPORT2 usearch_open(const UChar *pattern,
2592                                           int32_t         patternlength,
2593                                     const UChar          *text,
2594                                           int32_t         textlength,
2595                                     const char           *locale,
2596                                           UBreakIterator *breakiter,
2597                                           UErrorCode     *status)
2598 {
2599     if (U_FAILURE(*status)) {
2600         return NULL;
2601     }
2602 #if UCONFIG_NO_BREAK_ITERATION
2603     if (breakiter != NULL) {
2604         *status = U_UNSUPPORTED_ERROR;
2605         return NULL;
2606     }
2607 #endif
2608     if (locale) {
2609         // ucol_open internally checks for status
2610         UCollator     *collator = ucol_open(locale, status);
2611         // pattern, text checks are done in usearch_openFromCollator
2612         UStringSearch *result   = usearch_openFromCollator(pattern,
2613                                               patternlength, text, textlength,
2614                                               collator, breakiter, status);
2615 
2616         if (result == NULL || U_FAILURE(*status)) {
2617             if (collator) {
2618                 ucol_close(collator);
2619             }
2620             return NULL;
2621         }
2622         else {
2623             result->ownCollator = TRUE;
2624         }
2625         return result;
2626     }
2627     *status = U_ILLEGAL_ARGUMENT_ERROR;
2628     return NULL;
2629 }
2630 
usearch_openFromCollator(const UChar * pattern,int32_t patternlength,const UChar * text,int32_t textlength,const UCollator * collator,UBreakIterator * breakiter,UErrorCode * status)2631 U_CAPI UStringSearch * U_EXPORT2 usearch_openFromCollator(
2632                                   const UChar          *pattern,
2633                                         int32_t         patternlength,
2634                                   const UChar          *text,
2635                                         int32_t         textlength,
2636                                   const UCollator      *collator,
2637                                         UBreakIterator *breakiter,
2638                                         UErrorCode     *status)
2639 {
2640     if (U_FAILURE(*status)) {
2641         return NULL;
2642     }
2643 #if UCONFIG_NO_BREAK_ITERATION
2644     if (breakiter != NULL) {
2645         *status = U_UNSUPPORTED_ERROR;
2646         return NULL;
2647     }
2648 #endif
2649     if (pattern == NULL || text == NULL || collator == NULL) {
2650         *status = U_ILLEGAL_ARGUMENT_ERROR;
2651         return NULL;
2652     }
2653 
2654     // string search does not really work when numeric collation is turned on
2655     if(ucol_getAttribute(collator, UCOL_NUMERIC_COLLATION, status) == UCOL_ON) {
2656         *status = U_UNSUPPORTED_ERROR;
2657         return NULL;
2658     }
2659 
2660     if (U_SUCCESS(*status)) {
2661         initializeFCD(status);
2662         if (U_FAILURE(*status)) {
2663             return NULL;
2664         }
2665 
2666         UStringSearch *result;
2667         if (textlength == -1) {
2668             textlength = u_strlen(text);
2669         }
2670         if (patternlength == -1) {
2671             patternlength = u_strlen(pattern);
2672         }
2673         if (textlength <= 0 || patternlength <= 0) {
2674             *status = U_ILLEGAL_ARGUMENT_ERROR;
2675             return NULL;
2676         }
2677 
2678         result = (UStringSearch *)uprv_malloc(sizeof(UStringSearch));
2679         if (result == NULL) {
2680             *status = U_MEMORY_ALLOCATION_ERROR;
2681             return NULL;
2682         }
2683 
2684         result->collator    = collator;
2685         result->strength    = ucol_getStrength(collator);
2686         result->ceMask      = getMask(result->strength);
2687         result->toShift     =
2688              ucol_getAttribute(collator, UCOL_ALTERNATE_HANDLING, status) ==
2689                                                             UCOL_SHIFTED;
2690         result->variableTop = ucol_getVariableTop(collator, status);
2691 
2692         result->nfd         = Normalizer2::getNFDInstance(*status);
2693 
2694         if (U_FAILURE(*status)) {
2695             uprv_free(result);
2696             return NULL;
2697         }
2698 
2699         result->search             = (USearch *)uprv_malloc(sizeof(USearch));
2700         if (result->search == NULL) {
2701             *status = U_MEMORY_ALLOCATION_ERROR;
2702             uprv_free(result);
2703             return NULL;
2704         }
2705 
2706         result->search->text       = text;
2707         result->search->textLength = textlength;
2708 
2709         result->pattern.text       = pattern;
2710         result->pattern.textLength = patternlength;
2711         result->pattern.ces         = NULL;
2712         result->pattern.pces        = NULL;
2713 
2714         result->search->breakIter  = breakiter;
2715 #if !UCONFIG_NO_BREAK_ITERATION
2716         result->search->internalBreakIter = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(result->collator, ULOC_VALID_LOCALE, status), text, textlength, status);
2717         if (breakiter) {
2718             ubrk_setText(breakiter, text, textlength, status);
2719         }
2720 #endif
2721 
2722         result->ownCollator           = FALSE;
2723         result->search->matchedLength = 0;
2724         result->search->matchedIndex  = USEARCH_DONE;
2725         result->utilIter              = NULL;
2726         result->textIter              = ucol_openElements(collator, text,
2727                                                           textlength, status);
2728         result->textProcessedIter     = NULL;
2729         if (U_FAILURE(*status)) {
2730             usearch_close(result);
2731             return NULL;
2732         }
2733 
2734         result->search->isOverlap          = FALSE;
2735         result->search->isCanonicalMatch   = FALSE;
2736         result->search->elementComparisonType = 0;
2737         result->search->isForwardSearching = TRUE;
2738         result->search->reset              = TRUE;
2739 
2740         initialize(result, status);
2741 
2742         if (U_FAILURE(*status)) {
2743             usearch_close(result);
2744             return NULL;
2745         }
2746 
2747         return result;
2748     }
2749     return NULL;
2750 }
2751 
usearch_close(UStringSearch * strsrch)2752 U_CAPI void U_EXPORT2 usearch_close(UStringSearch *strsrch)
2753 {
2754     if (strsrch) {
2755         if (strsrch->pattern.ces != strsrch->pattern.cesBuffer &&
2756             strsrch->pattern.ces) {
2757             uprv_free(strsrch->pattern.ces);
2758         }
2759 
2760         if (strsrch->pattern.pces != NULL &&
2761             strsrch->pattern.pces != strsrch->pattern.pcesBuffer) {
2762             uprv_free(strsrch->pattern.pces);
2763         }
2764 
2765         delete strsrch->textProcessedIter;
2766         ucol_closeElements(strsrch->textIter);
2767         ucol_closeElements(strsrch->utilIter);
2768 
2769         if (strsrch->ownCollator && strsrch->collator) {
2770             ucol_close((UCollator *)strsrch->collator);
2771         }
2772 
2773 #if !UCONFIG_NO_BREAK_ITERATION
2774         if (strsrch->search->internalBreakIter) {
2775             ubrk_close(strsrch->search->internalBreakIter);
2776         }
2777 #endif
2778 
2779         uprv_free(strsrch->search);
2780         uprv_free(strsrch);
2781     }
2782 }
2783 
2784 namespace {
2785 
initTextProcessedIter(UStringSearch * strsrch,UErrorCode * status)2786 UBool initTextProcessedIter(UStringSearch *strsrch, UErrorCode *status) {
2787     if (U_FAILURE(*status)) { return FALSE; }
2788     if (strsrch->textProcessedIter == NULL) {
2789         strsrch->textProcessedIter = new icu::UCollationPCE(strsrch->textIter);
2790         if (strsrch->textProcessedIter == NULL) {
2791             *status = U_MEMORY_ALLOCATION_ERROR;
2792             return FALSE;
2793         }
2794     } else {
2795         strsrch->textProcessedIter->init(strsrch->textIter);
2796     }
2797     return TRUE;
2798 }
2799 
2800 }
2801 
2802 // set and get methods --------------------------------------------------
2803 
usearch_setOffset(UStringSearch * strsrch,int32_t position,UErrorCode * status)2804 U_CAPI void U_EXPORT2 usearch_setOffset(UStringSearch *strsrch,
2805                                         int32_t    position,
2806                                         UErrorCode    *status)
2807 {
2808     if (U_SUCCESS(*status) && strsrch) {
2809         if (isOutOfBounds(strsrch->search->textLength, position)) {
2810             *status = U_INDEX_OUTOFBOUNDS_ERROR;
2811         }
2812         else {
2813             setColEIterOffset(strsrch->textIter, position);
2814         }
2815         strsrch->search->matchedIndex  = USEARCH_DONE;
2816         strsrch->search->matchedLength = 0;
2817         strsrch->search->reset         = FALSE;
2818     }
2819 }
2820 
usearch_getOffset(const UStringSearch * strsrch)2821 U_CAPI int32_t U_EXPORT2 usearch_getOffset(const UStringSearch *strsrch)
2822 {
2823     if (strsrch) {
2824         int32_t result = ucol_getOffset(strsrch->textIter);
2825         if (isOutOfBounds(strsrch->search->textLength, result)) {
2826             return USEARCH_DONE;
2827         }
2828         return result;
2829     }
2830     return USEARCH_DONE;
2831 }
2832 
usearch_setAttribute(UStringSearch * strsrch,USearchAttribute attribute,USearchAttributeValue value,UErrorCode * status)2833 U_CAPI void U_EXPORT2 usearch_setAttribute(UStringSearch *strsrch,
2834                                  USearchAttribute attribute,
2835                                  USearchAttributeValue value,
2836                                  UErrorCode *status)
2837 {
2838     if (U_SUCCESS(*status) && strsrch) {
2839         switch (attribute)
2840         {
2841         case USEARCH_OVERLAP :
2842             strsrch->search->isOverlap = (value == USEARCH_ON ? TRUE : FALSE);
2843             break;
2844         case USEARCH_CANONICAL_MATCH :
2845             strsrch->search->isCanonicalMatch = (value == USEARCH_ON ? TRUE :
2846                                                                       FALSE);
2847             break;
2848         case USEARCH_ELEMENT_COMPARISON :
2849             if (value == USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD || value == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD) {
2850                 strsrch->search->elementComparisonType = (int16_t)value;
2851             } else {
2852                 strsrch->search->elementComparisonType = 0;
2853             }
2854             break;
2855         case USEARCH_ATTRIBUTE_COUNT :
2856         default:
2857             *status = U_ILLEGAL_ARGUMENT_ERROR;
2858         }
2859     }
2860     if (value == USEARCH_ATTRIBUTE_VALUE_COUNT) {
2861         *status = U_ILLEGAL_ARGUMENT_ERROR;
2862     }
2863 }
2864 
usearch_getAttribute(const UStringSearch * strsrch,USearchAttribute attribute)2865 U_CAPI USearchAttributeValue U_EXPORT2 usearch_getAttribute(
2866                                                 const UStringSearch *strsrch,
2867                                                 USearchAttribute attribute)
2868 {
2869     if (strsrch) {
2870         switch (attribute) {
2871         case USEARCH_OVERLAP :
2872             return (strsrch->search->isOverlap == TRUE ? USEARCH_ON :
2873                                                         USEARCH_OFF);
2874         case USEARCH_CANONICAL_MATCH :
2875             return (strsrch->search->isCanonicalMatch == TRUE ? USEARCH_ON :
2876                                                                USEARCH_OFF);
2877         case USEARCH_ELEMENT_COMPARISON :
2878             {
2879                 int16_t value = strsrch->search->elementComparisonType;
2880                 if (value == USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD || value == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD) {
2881                     return (USearchAttributeValue)value;
2882                 } else {
2883                     return USEARCH_STANDARD_ELEMENT_COMPARISON;
2884                 }
2885             }
2886         case USEARCH_ATTRIBUTE_COUNT :
2887             return USEARCH_DEFAULT;
2888         }
2889     }
2890     return USEARCH_DEFAULT;
2891 }
2892 
usearch_getMatchedStart(const UStringSearch * strsrch)2893 U_CAPI int32_t U_EXPORT2 usearch_getMatchedStart(
2894                                                 const UStringSearch *strsrch)
2895 {
2896     if (strsrch == NULL) {
2897         return USEARCH_DONE;
2898     }
2899     return strsrch->search->matchedIndex;
2900 }
2901 
2902 
usearch_getMatchedText(const UStringSearch * strsrch,UChar * result,int32_t resultCapacity,UErrorCode * status)2903 U_CAPI int32_t U_EXPORT2 usearch_getMatchedText(const UStringSearch *strsrch,
2904                                             UChar         *result,
2905                                             int32_t        resultCapacity,
2906                                             UErrorCode    *status)
2907 {
2908     if (U_FAILURE(*status)) {
2909         return USEARCH_DONE;
2910     }
2911     if (strsrch == NULL || resultCapacity < 0 || (resultCapacity > 0 &&
2912         result == NULL)) {
2913         *status = U_ILLEGAL_ARGUMENT_ERROR;
2914         return USEARCH_DONE;
2915     }
2916 
2917     int32_t     copylength = strsrch->search->matchedLength;
2918     int32_t copyindex  = strsrch->search->matchedIndex;
2919     if (copyindex == USEARCH_DONE) {
2920         u_terminateUChars(result, resultCapacity, 0, status);
2921         return USEARCH_DONE;
2922     }
2923 
2924     if (resultCapacity < copylength) {
2925         copylength = resultCapacity;
2926     }
2927     if (copylength > 0) {
2928         uprv_memcpy(result, strsrch->search->text + copyindex,
2929                     copylength * sizeof(UChar));
2930     }
2931     return u_terminateUChars(result, resultCapacity,
2932                              strsrch->search->matchedLength, status);
2933 }
2934 
usearch_getMatchedLength(const UStringSearch * strsrch)2935 U_CAPI int32_t U_EXPORT2 usearch_getMatchedLength(
2936                                               const UStringSearch *strsrch)
2937 {
2938     if (strsrch) {
2939         return strsrch->search->matchedLength;
2940     }
2941     return USEARCH_DONE;
2942 }
2943 
2944 #if !UCONFIG_NO_BREAK_ITERATION
2945 
usearch_setBreakIterator(UStringSearch * strsrch,UBreakIterator * breakiter,UErrorCode * status)2946 U_CAPI void U_EXPORT2 usearch_setBreakIterator(UStringSearch  *strsrch,
2947                                                UBreakIterator *breakiter,
2948                                                UErrorCode     *status)
2949 {
2950     if (U_SUCCESS(*status) && strsrch) {
2951         strsrch->search->breakIter = breakiter;
2952         if (breakiter) {
2953             ubrk_setText(breakiter, strsrch->search->text,
2954                          strsrch->search->textLength, status);
2955         }
2956     }
2957 }
2958 
2959 U_CAPI const UBreakIterator* U_EXPORT2
usearch_getBreakIterator(const UStringSearch * strsrch)2960 usearch_getBreakIterator(const UStringSearch *strsrch)
2961 {
2962     if (strsrch) {
2963         return strsrch->search->breakIter;
2964     }
2965     return NULL;
2966 }
2967 
2968 #endif
2969 
usearch_setText(UStringSearch * strsrch,const UChar * text,int32_t textlength,UErrorCode * status)2970 U_CAPI void U_EXPORT2 usearch_setText(      UStringSearch *strsrch,
2971                                       const UChar         *text,
2972                                             int32_t        textlength,
2973                                             UErrorCode    *status)
2974 {
2975     if (U_SUCCESS(*status)) {
2976         if (strsrch == NULL || text == NULL || textlength < -1 ||
2977             textlength == 0) {
2978             *status = U_ILLEGAL_ARGUMENT_ERROR;
2979         }
2980         else {
2981             if (textlength == -1) {
2982                 textlength = u_strlen(text);
2983             }
2984             strsrch->search->text       = text;
2985             strsrch->search->textLength = textlength;
2986             ucol_setText(strsrch->textIter, text, textlength, status);
2987             strsrch->search->matchedIndex  = USEARCH_DONE;
2988             strsrch->search->matchedLength = 0;
2989             strsrch->search->reset         = TRUE;
2990 #if !UCONFIG_NO_BREAK_ITERATION
2991             if (strsrch->search->breakIter != NULL) {
2992                 ubrk_setText(strsrch->search->breakIter, text,
2993                              textlength, status);
2994             }
2995             ubrk_setText(strsrch->search->internalBreakIter, text, textlength, status);
2996 #endif
2997         }
2998     }
2999 }
3000 
usearch_getText(const UStringSearch * strsrch,int32_t * length)3001 U_CAPI const UChar * U_EXPORT2 usearch_getText(const UStringSearch *strsrch,
3002                                                      int32_t       *length)
3003 {
3004     if (strsrch) {
3005         *length = strsrch->search->textLength;
3006         return strsrch->search->text;
3007     }
3008     return NULL;
3009 }
3010 
usearch_setCollator(UStringSearch * strsrch,const UCollator * collator,UErrorCode * status)3011 U_CAPI void U_EXPORT2 usearch_setCollator(      UStringSearch *strsrch,
3012                                           const UCollator     *collator,
3013                                                 UErrorCode    *status)
3014 {
3015     if (U_SUCCESS(*status)) {
3016         if (collator == NULL) {
3017             *status = U_ILLEGAL_ARGUMENT_ERROR;
3018             return;
3019         }
3020 
3021         if (strsrch) {
3022             delete strsrch->textProcessedIter;
3023             strsrch->textProcessedIter = NULL;
3024             ucol_closeElements(strsrch->textIter);
3025             ucol_closeElements(strsrch->utilIter);
3026             strsrch->textIter = strsrch->utilIter = NULL;
3027             if (strsrch->ownCollator && (strsrch->collator != collator)) {
3028                 ucol_close((UCollator *)strsrch->collator);
3029                 strsrch->ownCollator = FALSE;
3030             }
3031             strsrch->collator    = collator;
3032             strsrch->strength    = ucol_getStrength(collator);
3033             strsrch->ceMask      = getMask(strsrch->strength);
3034 #if !UCONFIG_NO_BREAK_ITERATION
3035             ubrk_close(strsrch->search->internalBreakIter);
3036             strsrch->search->internalBreakIter = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(collator, ULOC_VALID_LOCALE, status),
3037                                                      strsrch->search->text, strsrch->search->textLength, status);
3038 #endif
3039             // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
3040             strsrch->toShift     =
3041                ucol_getAttribute(collator, UCOL_ALTERNATE_HANDLING, status) ==
3042                                                                 UCOL_SHIFTED;
3043             // if status is a failure, ucol_getVariableTop returns 0
3044             strsrch->variableTop = ucol_getVariableTop(collator, status);
3045             strsrch->textIter = ucol_openElements(collator,
3046                                       strsrch->search->text,
3047                                       strsrch->search->textLength,
3048                                       status);
3049             strsrch->utilIter = ucol_openElements(
3050                     collator, strsrch->pattern.text, strsrch->pattern.textLength, status);
3051             // initialize() _after_ setting the iterators for the new collator.
3052             initialize(strsrch, status);
3053         }
3054 
3055         // **** are these calls needed?
3056         // **** we call uprv_init_pce in initializePatternPCETable
3057         // **** and the CEIBuffer constructor...
3058 #if 0
3059         uprv_init_pce(strsrch->textIter);
3060         uprv_init_pce(strsrch->utilIter);
3061 #endif
3062     }
3063 }
3064 
usearch_getCollator(const UStringSearch * strsrch)3065 U_CAPI UCollator * U_EXPORT2 usearch_getCollator(const UStringSearch *strsrch)
3066 {
3067     if (strsrch) {
3068         return (UCollator *)strsrch->collator;
3069     }
3070     return NULL;
3071 }
3072 
usearch_setPattern(UStringSearch * strsrch,const UChar * pattern,int32_t patternlength,UErrorCode * status)3073 U_CAPI void U_EXPORT2 usearch_setPattern(      UStringSearch *strsrch,
3074                                          const UChar         *pattern,
3075                                                int32_t        patternlength,
3076                                                UErrorCode    *status)
3077 {
3078     if (U_SUCCESS(*status)) {
3079         if (strsrch == NULL || pattern == NULL) {
3080             *status = U_ILLEGAL_ARGUMENT_ERROR;
3081         }
3082         else {
3083             if (patternlength == -1) {
3084                 patternlength = u_strlen(pattern);
3085             }
3086             if (patternlength == 0) {
3087                 *status = U_ILLEGAL_ARGUMENT_ERROR;
3088                 return;
3089             }
3090             strsrch->pattern.text       = pattern;
3091             strsrch->pattern.textLength = patternlength;
3092             initialize(strsrch, status);
3093         }
3094     }
3095 }
3096 
3097 U_CAPI const UChar* U_EXPORT2
usearch_getPattern(const UStringSearch * strsrch,int32_t * length)3098 usearch_getPattern(const UStringSearch *strsrch,
3099                    int32_t       *length)
3100 {
3101     if (strsrch) {
3102         *length = strsrch->pattern.textLength;
3103         return strsrch->pattern.text;
3104     }
3105     return NULL;
3106 }
3107 
3108 // miscellanous methods --------------------------------------------------
3109 
usearch_first(UStringSearch * strsrch,UErrorCode * status)3110 U_CAPI int32_t U_EXPORT2 usearch_first(UStringSearch *strsrch,
3111                                            UErrorCode    *status)
3112 {
3113     if (strsrch && U_SUCCESS(*status)) {
3114         strsrch->search->isForwardSearching = TRUE;
3115         usearch_setOffset(strsrch, 0, status);
3116         if (U_SUCCESS(*status)) {
3117             return usearch_next(strsrch, status);
3118         }
3119     }
3120     return USEARCH_DONE;
3121 }
3122 
usearch_following(UStringSearch * strsrch,int32_t position,UErrorCode * status)3123 U_CAPI int32_t U_EXPORT2 usearch_following(UStringSearch *strsrch,
3124                                                int32_t    position,
3125                                                UErrorCode    *status)
3126 {
3127     if (strsrch && U_SUCCESS(*status)) {
3128         strsrch->search->isForwardSearching = TRUE;
3129         // position checked in usearch_setOffset
3130         usearch_setOffset(strsrch, position, status);
3131         if (U_SUCCESS(*status)) {
3132             return usearch_next(strsrch, status);
3133         }
3134     }
3135     return USEARCH_DONE;
3136 }
3137 
usearch_last(UStringSearch * strsrch,UErrorCode * status)3138 U_CAPI int32_t U_EXPORT2 usearch_last(UStringSearch *strsrch,
3139                                           UErrorCode    *status)
3140 {
3141     if (strsrch && U_SUCCESS(*status)) {
3142         strsrch->search->isForwardSearching = FALSE;
3143         usearch_setOffset(strsrch, strsrch->search->textLength, status);
3144         if (U_SUCCESS(*status)) {
3145             return usearch_previous(strsrch, status);
3146         }
3147     }
3148     return USEARCH_DONE;
3149 }
3150 
usearch_preceding(UStringSearch * strsrch,int32_t position,UErrorCode * status)3151 U_CAPI int32_t U_EXPORT2 usearch_preceding(UStringSearch *strsrch,
3152                                                int32_t    position,
3153                                                UErrorCode    *status)
3154 {
3155     if (strsrch && U_SUCCESS(*status)) {
3156         strsrch->search->isForwardSearching = FALSE;
3157         // position checked in usearch_setOffset
3158         usearch_setOffset(strsrch, position, status);
3159         if (U_SUCCESS(*status)) {
3160             return usearch_previous(strsrch, status);
3161         }
3162     }
3163     return USEARCH_DONE;
3164 }
3165 
3166 /**
3167 * If a direction switch is required, we'll count the number of ces till the
3168 * beginning of the collation element iterator and iterate forwards that
3169 * number of times. This is so that we get to the correct point within the
3170 * string to continue the search in. Imagine when we are in the middle of the
3171 * normalization buffer when the change in direction is request. arrrgghh....
3172 * After searching the offset within the collation element iterator will be
3173 * shifted to the start of the match. If a match is not found, the offset would
3174 * have been set to the end of the text string in the collation element
3175 * iterator.
3176 * Okay, here's my take on normalization buffer. The only time when there can
3177 * be 2 matches within the same normalization is when the pattern is consists
3178 * of all accents. But since the offset returned is from the text string, we
3179 * should not confuse the caller by returning the second match within the
3180 * same normalization buffer. If we do, the 2 results will have the same match
3181 * offsets, and that'll be confusing. I'll return the next match that doesn't
3182 * fall within the same normalization buffer. Note this does not affect the
3183 * results of matches spanning the text and the normalization buffer.
3184 * The position to start searching is taken from the collation element
3185 * iterator. Callers of this API would have to set the offset in the collation
3186 * element iterator before using this method.
3187 */
usearch_next(UStringSearch * strsrch,UErrorCode * status)3188 U_CAPI int32_t U_EXPORT2 usearch_next(UStringSearch *strsrch,
3189                                           UErrorCode    *status)
3190 {
3191     if (U_SUCCESS(*status) && strsrch) {
3192         // note offset is either equivalent to the start of the previous match
3193         // or is set by the user
3194         int32_t      offset       = usearch_getOffset(strsrch);
3195         USearch     *search       = strsrch->search;
3196         search->reset             = FALSE;
3197         int32_t      textlength   = search->textLength;
3198         if (search->isForwardSearching) {
3199 #if BOYER_MOORE
3200             if (offset == textlength
3201                 || (!search->isOverlap &&
3202                     (offset + strsrch->pattern.defaultShiftSize > textlength ||
3203                     (search->matchedIndex != USEARCH_DONE &&
3204                      offset + search->matchedLength >= textlength)))) {
3205                 // not enough characters to match
3206                 setMatchNotFound(strsrch);
3207                 return USEARCH_DONE;
3208             }
3209 #else
3210             if (offset == textlength ||
3211                 (! search->isOverlap &&
3212                 (search->matchedIndex != USEARCH_DONE &&
3213                 offset + search->matchedLength > textlength))) {
3214                     // not enough characters to match
3215                     setMatchNotFound(strsrch);
3216                     return USEARCH_DONE;
3217             }
3218 #endif
3219         }
3220         else {
3221             // switching direction.
3222             // if matchedIndex == USEARCH_DONE, it means that either a
3223             // setOffset has been called or that previous ran off the text
3224             // string. the iterator would have been set to offset 0 if a
3225             // match is not found.
3226             search->isForwardSearching = TRUE;
3227             if (search->matchedIndex != USEARCH_DONE) {
3228                 // there's no need to set the collation element iterator
3229                 // the next call to next will set the offset.
3230                 return search->matchedIndex;
3231             }
3232         }
3233 
3234         if (U_SUCCESS(*status)) {
3235             if (strsrch->pattern.cesLength == 0) {
3236                 if (search->matchedIndex == USEARCH_DONE) {
3237                     search->matchedIndex = offset;
3238                 }
3239                 else { // moves by codepoints
3240                     U16_FWD_1(search->text, search->matchedIndex, textlength);
3241                 }
3242 
3243                 search->matchedLength = 0;
3244                 setColEIterOffset(strsrch->textIter, search->matchedIndex);
3245                 // status checked below
3246                 if (search->matchedIndex == textlength) {
3247                     search->matchedIndex = USEARCH_DONE;
3248                 }
3249             }
3250             else {
3251                 if (search->matchedLength > 0) {
3252                     // if matchlength is 0 we are at the start of the iteration
3253                     if (search->isOverlap) {
3254                         ucol_setOffset(strsrch->textIter, offset + 1, status);
3255                     }
3256                     else {
3257                         ucol_setOffset(strsrch->textIter,
3258                                        offset + search->matchedLength, status);
3259                     }
3260                 }
3261                 else {
3262                     // for boundary check purposes. this will ensure that the
3263                     // next match will not preceed the current offset
3264                     // note search->matchedIndex will always be set to something
3265                     // in the code
3266                     search->matchedIndex = offset - 1;
3267                 }
3268 
3269                 if (search->isCanonicalMatch) {
3270                     // can't use exact here since extra accents are allowed.
3271                     usearch_handleNextCanonical(strsrch, status);
3272                 }
3273                 else {
3274                     usearch_handleNextExact(strsrch, status);
3275                 }
3276             }
3277 
3278             if (U_FAILURE(*status)) {
3279                 return USEARCH_DONE;
3280             }
3281 
3282 #if !BOYER_MOORE
3283             if (search->matchedIndex == USEARCH_DONE) {
3284                 ucol_setOffset(strsrch->textIter, search->textLength, status);
3285             } else {
3286                 ucol_setOffset(strsrch->textIter, search->matchedIndex, status);
3287             }
3288 #endif
3289 
3290             return search->matchedIndex;
3291         }
3292     }
3293     return USEARCH_DONE;
3294 }
3295 
usearch_previous(UStringSearch * strsrch,UErrorCode * status)3296 U_CAPI int32_t U_EXPORT2 usearch_previous(UStringSearch *strsrch,
3297                                               UErrorCode *status)
3298 {
3299     if (U_SUCCESS(*status) && strsrch) {
3300         int32_t offset;
3301         USearch *search = strsrch->search;
3302         if (search->reset) {
3303             offset                     = search->textLength;
3304             search->isForwardSearching = FALSE;
3305             search->reset              = FALSE;
3306             setColEIterOffset(strsrch->textIter, offset);
3307         }
3308         else {
3309             offset = usearch_getOffset(strsrch);
3310         }
3311 
3312         int32_t matchedindex = search->matchedIndex;
3313         if (search->isForwardSearching == TRUE) {
3314             // switching direction.
3315             // if matchedIndex == USEARCH_DONE, it means that either a
3316             // setOffset has been called or that next ran off the text
3317             // string. the iterator would have been set to offset textLength if
3318             // a match is not found.
3319             search->isForwardSearching = FALSE;
3320             if (matchedindex != USEARCH_DONE) {
3321                 return matchedindex;
3322             }
3323         }
3324         else {
3325 #if BOYER_MOORE
3326             if (offset == 0 || matchedindex == 0 ||
3327                 (!search->isOverlap &&
3328                     (offset < strsrch->pattern.defaultShiftSize ||
3329                     (matchedindex != USEARCH_DONE &&
3330                     matchedindex < strsrch->pattern.defaultShiftSize)))) {
3331                 // not enough characters to match
3332                 setMatchNotFound(strsrch);
3333                 return USEARCH_DONE;
3334             }
3335 #else
3336             // Could check pattern length, but the
3337             // linear search will do the right thing
3338             if (offset == 0 || matchedindex == 0) {
3339                 setMatchNotFound(strsrch);
3340                 return USEARCH_DONE;
3341             }
3342 #endif
3343         }
3344 
3345         if (U_SUCCESS(*status)) {
3346             if (strsrch->pattern.cesLength == 0) {
3347                 search->matchedIndex =
3348                       (matchedindex == USEARCH_DONE ? offset : matchedindex);
3349                 if (search->matchedIndex == 0) {
3350                     setMatchNotFound(strsrch);
3351                     // status checked below
3352                 }
3353                 else { // move by codepoints
3354                     U16_BACK_1(search->text, 0, search->matchedIndex);
3355                     setColEIterOffset(strsrch->textIter, search->matchedIndex);
3356                     // status checked below
3357                     search->matchedLength = 0;
3358                 }
3359             }
3360             else {
3361                 if (strsrch->search->isCanonicalMatch) {
3362                     // can't use exact here since extra accents are allowed.
3363                     usearch_handlePreviousCanonical(strsrch, status);
3364                     // status checked below
3365                 }
3366                 else {
3367                     usearch_handlePreviousExact(strsrch, status);
3368                     // status checked below
3369                 }
3370             }
3371 
3372             if (U_FAILURE(*status)) {
3373                 return USEARCH_DONE;
3374             }
3375 
3376             return search->matchedIndex;
3377         }
3378     }
3379     return USEARCH_DONE;
3380 }
3381 
3382 
3383 
usearch_reset(UStringSearch * strsrch)3384 U_CAPI void U_EXPORT2 usearch_reset(UStringSearch *strsrch)
3385 {
3386     /*
3387     reset is setting the attributes that are already in
3388     string search, hence all attributes in the collator should
3389     be retrieved without any problems
3390     */
3391     if (strsrch) {
3392         UErrorCode status            = U_ZERO_ERROR;
3393         UBool      sameCollAttribute = TRUE;
3394         uint32_t   ceMask;
3395         UBool      shift;
3396         uint32_t   varTop;
3397 
3398         // **** hack to deal w/ how processed CEs encode quaternary ****
3399         UCollationStrength newStrength = ucol_getStrength(strsrch->collator);
3400         if ((strsrch->strength < UCOL_QUATERNARY && newStrength >= UCOL_QUATERNARY) ||
3401             (strsrch->strength >= UCOL_QUATERNARY && newStrength < UCOL_QUATERNARY)) {
3402                 sameCollAttribute = FALSE;
3403         }
3404 
3405         strsrch->strength    = ucol_getStrength(strsrch->collator);
3406         ceMask = getMask(strsrch->strength);
3407         if (strsrch->ceMask != ceMask) {
3408             strsrch->ceMask = ceMask;
3409             sameCollAttribute = FALSE;
3410         }
3411 
3412         // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
3413         shift = ucol_getAttribute(strsrch->collator, UCOL_ALTERNATE_HANDLING,
3414                                   &status) == UCOL_SHIFTED;
3415         if (strsrch->toShift != shift) {
3416             strsrch->toShift  = shift;
3417             sameCollAttribute = FALSE;
3418         }
3419 
3420         // if status is a failure, ucol_getVariableTop returns 0
3421         varTop = ucol_getVariableTop(strsrch->collator, &status);
3422         if (strsrch->variableTop != varTop) {
3423             strsrch->variableTop = varTop;
3424             sameCollAttribute    = FALSE;
3425         }
3426         if (!sameCollAttribute) {
3427             initialize(strsrch, &status);
3428         }
3429         ucol_setText(strsrch->textIter, strsrch->search->text,
3430                               strsrch->search->textLength,
3431                               &status);
3432         strsrch->search->matchedLength      = 0;
3433         strsrch->search->matchedIndex       = USEARCH_DONE;
3434         strsrch->search->isOverlap          = FALSE;
3435         strsrch->search->isCanonicalMatch   = FALSE;
3436         strsrch->search->elementComparisonType = 0;
3437         strsrch->search->isForwardSearching = TRUE;
3438         strsrch->search->reset              = TRUE;
3439     }
3440 }
3441 
3442 //
3443 //  CEI  Collation Element + source text index.
3444 //       These structs are kept in the circular buffer.
3445 //
3446 struct  CEI {
3447     int64_t ce;
3448     int32_t lowIndex;
3449     int32_t highIndex;
3450 };
3451 
3452 U_NAMESPACE_BEGIN
3453 
3454 namespace {
3455 //
3456 //  CEIBuffer   A circular buffer of CEs-with-index from the text being searched.
3457 //
3458 #define   DEFAULT_CEBUFFER_SIZE 96
3459 #define   CEBUFFER_EXTRA 32
3460 // Some typical max values to make buffer size more reasonable for asymmetric search.
3461 // #8694 is for a better long-term solution to allocation of this buffer.
3462 #define   MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L 8
3463 #define   MAX_TARGET_IGNORABLES_PER_PAT_OTHER 3
3464 #define   MIGHT_BE_JAMO_L(c) ((c >= 0x1100 && c <= 0x115E) || (c >= 0x3131 && c <= 0x314E) || (c >= 0x3165 && c <= 0x3186))
3465 struct CEIBuffer {
3466     CEI                  defBuf[DEFAULT_CEBUFFER_SIZE];
3467     CEI                 *buf;
3468     int32_t              bufSize;
3469     int32_t              firstIx;
3470     int32_t              limitIx;
3471     UCollationElements  *ceIter;
3472     UStringSearch       *strSearch;
3473 
3474 
3475 
3476                CEIBuffer(UStringSearch *ss, UErrorCode *status);
3477                ~CEIBuffer();
3478    const CEI   *get(int32_t index);
3479    const CEI   *getPrevious(int32_t index);
3480 };
3481 
3482 
CEIBuffer(UStringSearch * ss,UErrorCode * status)3483 CEIBuffer::CEIBuffer(UStringSearch *ss, UErrorCode *status) {
3484     buf = defBuf;
3485     strSearch = ss;
3486     bufSize = ss->pattern.pcesLength + CEBUFFER_EXTRA;
3487     if (ss->search->elementComparisonType != 0) {
3488         const UChar * patText = ss->pattern.text;
3489         if (patText) {
3490             const UChar * patTextLimit = patText + ss->pattern.textLength;
3491             while ( patText < patTextLimit ) {
3492                 UChar c = *patText++;
3493                 if (MIGHT_BE_JAMO_L(c)) {
3494                     bufSize += MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L;
3495                 } else {
3496                     // No check for surrogates, we might allocate slightly more buffer than necessary.
3497                     bufSize += MAX_TARGET_IGNORABLES_PER_PAT_OTHER;
3498                 }
3499             }
3500         }
3501     }
3502     ceIter    = ss->textIter;
3503     firstIx = 0;
3504     limitIx = 0;
3505 
3506     if (!initTextProcessedIter(ss, status)) { return; }
3507 
3508     if (bufSize>DEFAULT_CEBUFFER_SIZE) {
3509         buf = (CEI *)uprv_malloc(bufSize * sizeof(CEI));
3510         if (buf == NULL) {
3511             *status = U_MEMORY_ALLOCATION_ERROR;
3512         }
3513     }
3514 }
3515 
3516 // TODO: add a reset or init function so that allocated
3517 //       buffers can be retained & reused.
3518 
~CEIBuffer()3519 CEIBuffer::~CEIBuffer() {
3520     if (buf != defBuf) {
3521         uprv_free(buf);
3522     }
3523 }
3524 
3525 
3526 // Get the CE with the specified index.
3527 //   Index must be in the range
3528 //          n-history_size < index < n+1
3529 //   where n is the largest index to have been fetched by some previous call to this function.
3530 //   The CE value will be UCOL__PROCESSED_NULLORDER at end of input.
3531 //
get(int32_t index)3532 const CEI *CEIBuffer::get(int32_t index) {
3533     int i = index % bufSize;
3534 
3535     if (index>=firstIx && index<limitIx) {
3536         // The request was for an entry already in our buffer.
3537         //  Just return it.
3538         return &buf[i];
3539     }
3540 
3541     // Caller is requesting a new, never accessed before, CE.
3542     //   Verify that it is the next one in sequence, which is all
3543     //   that is allowed.
3544     if (index != limitIx) {
3545         U_ASSERT(FALSE);
3546 
3547         return NULL;
3548     }
3549 
3550     // Manage the circular CE buffer indexing
3551     limitIx++;
3552 
3553     if (limitIx - firstIx >= bufSize) {
3554         // The buffer is full, knock out the lowest-indexed entry.
3555         firstIx++;
3556     }
3557 
3558     UErrorCode status = U_ZERO_ERROR;
3559 
3560     buf[i].ce = strSearch->textProcessedIter->nextProcessed(&buf[i].lowIndex, &buf[i].highIndex, &status);
3561 
3562     return &buf[i];
3563 }
3564 
3565 // Get the CE with the specified index.
3566 //   Index must be in the range
3567 //          n-history_size < index < n+1
3568 //   where n is the largest index to have been fetched by some previous call to this function.
3569 //   The CE value will be UCOL__PROCESSED_NULLORDER at end of input.
3570 //
getPrevious(int32_t index)3571 const CEI *CEIBuffer::getPrevious(int32_t index) {
3572     int i = index % bufSize;
3573 
3574     if (index>=firstIx && index<limitIx) {
3575         // The request was for an entry already in our buffer.
3576         //  Just return it.
3577         return &buf[i];
3578     }
3579 
3580     // Caller is requesting a new, never accessed before, CE.
3581     //   Verify that it is the next one in sequence, which is all
3582     //   that is allowed.
3583     if (index != limitIx) {
3584         U_ASSERT(FALSE);
3585 
3586         return NULL;
3587     }
3588 
3589     // Manage the circular CE buffer indexing
3590     limitIx++;
3591 
3592     if (limitIx - firstIx >= bufSize) {
3593         // The buffer is full, knock out the lowest-indexed entry.
3594         firstIx++;
3595     }
3596 
3597     UErrorCode status = U_ZERO_ERROR;
3598 
3599     buf[i].ce = strSearch->textProcessedIter->previousProcessed(&buf[i].lowIndex, &buf[i].highIndex, &status);
3600 
3601     return &buf[i];
3602 }
3603 
3604 }
3605 
3606 U_NAMESPACE_END
3607 
3608 
3609 // #define USEARCH_DEBUG
3610 
3611 #ifdef USEARCH_DEBUG
3612 #include <stdio.h>
3613 #include <stdlib.h>
3614 #endif
3615 
3616 /*
3617  * Find the next break boundary after startIndex. If the UStringSearch object
3618  * has an external break iterator, use that. Otherwise use the internal character
3619  * break iterator.
3620  */
nextBoundaryAfter(UStringSearch * strsrch,int32_t startIndex)3621 static int32_t nextBoundaryAfter(UStringSearch *strsrch, int32_t startIndex) {
3622 #if 0
3623     const UChar *text = strsrch->search->text;
3624     int32_t textLen   = strsrch->search->textLength;
3625 
3626     U_ASSERT(startIndex>=0);
3627     U_ASSERT(startIndex<=textLen);
3628 
3629     if (startIndex >= textLen) {
3630         return startIndex;
3631     }
3632 
3633     UChar32  c;
3634     int32_t  i = startIndex;
3635     U16_NEXT(text, i, textLen, c);
3636 
3637     // If we are on a control character, stop without looking for combining marks.
3638     //    Control characters do not combine.
3639     int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
3640     if (gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR) {
3641         return i;
3642     }
3643 
3644     // The initial character was not a control, and can thus accept trailing
3645     //   combining characters.  Advance over however many of them there are.
3646     int32_t  indexOfLastCharChecked;
3647     for (;;) {
3648         indexOfLastCharChecked = i;
3649         if (i>=textLen) {
3650             break;
3651         }
3652         U16_NEXT(text, i, textLen, c);
3653         gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
3654         if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) {
3655             break;
3656         }
3657     }
3658     return indexOfLastCharChecked;
3659 #elif !UCONFIG_NO_BREAK_ITERATION
3660     UBreakIterator *breakiterator = strsrch->search->breakIter;
3661 
3662     if (breakiterator == NULL) {
3663         breakiterator = strsrch->search->internalBreakIter;
3664     }
3665 
3666     if (breakiterator != NULL) {
3667         return ubrk_following(breakiterator, startIndex);
3668     }
3669 
3670     return startIndex;
3671 #else
3672     // **** or should we use the original code? ****
3673     return startIndex;
3674 #endif
3675 
3676 }
3677 
3678 /*
3679  * Returns TRUE if index is on a break boundary. If the UStringSearch
3680  * has an external break iterator, test using that, otherwise test
3681  * using the internal character break iterator.
3682  */
isBreakBoundary(UStringSearch * strsrch,int32_t index)3683 static UBool isBreakBoundary(UStringSearch *strsrch, int32_t index) {
3684 #if 0
3685     const UChar *text = strsrch->search->text;
3686     int32_t textLen   = strsrch->search->textLength;
3687 
3688     U_ASSERT(index>=0);
3689     U_ASSERT(index<=textLen);
3690 
3691     if (index>=textLen || index<=0) {
3692         return TRUE;
3693     }
3694 
3695     // If the character at the current index is not a GRAPHEME_EXTEND
3696     //    then we can not be within a combining sequence.
3697     UChar32  c;
3698     U16_GET(text, 0, index, textLen, c);
3699     int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
3700     if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) {
3701         return TRUE;
3702     }
3703 
3704     // We are at a combining mark.  If the preceding character is anything
3705     //   except a CONTROL, CR or LF, we are in a combining sequence.
3706     U16_PREV(text, 0, index, c);
3707     gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
3708     UBool combining =  !(gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR);
3709     return !combining;
3710 #elif !UCONFIG_NO_BREAK_ITERATION
3711     UBreakIterator *breakiterator = strsrch->search->breakIter;
3712 
3713     if (breakiterator == NULL) {
3714         breakiterator = strsrch->search->internalBreakIter;
3715     }
3716 
3717     return (breakiterator != NULL && ubrk_isBoundary(breakiterator, index));
3718 #else
3719     // **** or use the original code? ****
3720     return TRUE;
3721 #endif
3722 }
3723 
3724 #if 0
3725 static UBool onBreakBoundaries(const UStringSearch *strsrch, int32_t start, int32_t end)
3726 {
3727 #if !UCONFIG_NO_BREAK_ITERATION
3728     UBreakIterator *breakiterator = strsrch->search->breakIter;
3729 
3730     if (breakiterator != NULL) {
3731         int32_t startindex = ubrk_first(breakiterator);
3732         int32_t endindex   = ubrk_last(breakiterator);
3733 
3734         // out-of-range indexes are never boundary positions
3735         if (start < startindex || start > endindex ||
3736             end < startindex || end > endindex) {
3737             return FALSE;
3738         }
3739 
3740         return ubrk_isBoundary(breakiterator, start) &&
3741                ubrk_isBoundary(breakiterator, end);
3742     }
3743 #endif
3744 
3745     return TRUE;
3746 }
3747 #endif
3748 
3749 typedef enum {
3750     U_CE_MATCH = -1,
3751     U_CE_NO_MATCH = 0,
3752     U_CE_SKIP_TARG,
3753     U_CE_SKIP_PATN
3754 } UCompareCEsResult;
3755 #define U_CE_LEVEL2_BASE 0x00000005
3756 #define U_CE_LEVEL3_BASE 0x00050000
3757 
compareCE64s(int64_t targCE,int64_t patCE,int16_t compareType)3758 static UCompareCEsResult compareCE64s(int64_t targCE, int64_t patCE, int16_t compareType) {
3759     if (targCE == patCE) {
3760         return U_CE_MATCH;
3761     }
3762     if (compareType == 0) {
3763         return U_CE_NO_MATCH;
3764     }
3765 
3766     int64_t targCEshifted = targCE >> 32;
3767     int64_t patCEshifted = patCE >> 32;
3768     int64_t mask;
3769 
3770     mask = 0xFFFF0000;
3771     int32_t targLev1 = (int32_t)(targCEshifted & mask);
3772     int32_t patLev1 = (int32_t)(patCEshifted & mask);
3773     if ( targLev1 != patLev1 ) {
3774         if ( targLev1 == 0 ) {
3775             return U_CE_SKIP_TARG;
3776         }
3777         if ( patLev1 == 0 && compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD ) {
3778             return U_CE_SKIP_PATN;
3779         }
3780         return U_CE_NO_MATCH;
3781     }
3782 
3783     mask = 0x0000FFFF;
3784     int32_t targLev2 = (int32_t)(targCEshifted & mask);
3785     int32_t patLev2 = (int32_t)(patCEshifted & mask);
3786     if ( targLev2 != patLev2 ) {
3787         if ( targLev2 == 0 ) {
3788             return U_CE_SKIP_TARG;
3789         }
3790         if ( patLev2 == 0 && compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD ) {
3791             return U_CE_SKIP_PATN;
3792         }
3793         return (patLev2 == U_CE_LEVEL2_BASE || (compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD && targLev2 == U_CE_LEVEL2_BASE) )?
3794             U_CE_MATCH: U_CE_NO_MATCH;
3795     }
3796 
3797     mask = 0xFFFF0000;
3798     int32_t targLev3 = (int32_t)(targCE & mask);
3799     int32_t patLev3 = (int32_t)(patCE & mask);
3800     if ( targLev3 != patLev3 ) {
3801         return (patLev3 == U_CE_LEVEL3_BASE || (compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD && targLev3 == U_CE_LEVEL3_BASE) )?
3802             U_CE_MATCH: U_CE_NO_MATCH;
3803    }
3804 
3805     return U_CE_MATCH;
3806 }
3807 
3808 #if BOYER_MOORE
3809 // TODO: #if BOYER_MOORE, need 32-bit version of compareCE64s
3810 #endif
3811 
3812 namespace {
3813 
codePointAt(const USearch & search,int32_t index)3814 UChar32 codePointAt(const USearch &search, int32_t index) {
3815     if (index < search.textLength) {
3816         UChar32 c;
3817         U16_NEXT(search.text, index, search.textLength, c);
3818         return c;
3819     }
3820     return U_SENTINEL;
3821 }
3822 
codePointBefore(const USearch & search,int32_t index)3823 UChar32 codePointBefore(const USearch &search, int32_t index) {
3824     if (0 < index) {
3825         UChar32 c;
3826         U16_PREV(search.text, 0, index, c);
3827         return c;
3828     }
3829     return U_SENTINEL;
3830 }
3831 
3832 }  // namespace
3833 
usearch_search(UStringSearch * strsrch,int32_t startIdx,int32_t * matchStart,int32_t * matchLimit,UErrorCode * status)3834 U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch  *strsrch,
3835                                        int32_t        startIdx,
3836                                        int32_t        *matchStart,
3837                                        int32_t        *matchLimit,
3838                                        UErrorCode     *status)
3839 {
3840     if (U_FAILURE(*status)) {
3841         return FALSE;
3842     }
3843 
3844     // TODO:  reject search patterns beginning with a combining char.
3845 
3846 #ifdef USEARCH_DEBUG
3847     if (getenv("USEARCH_DEBUG") != NULL) {
3848         printf("Pattern CEs\n");
3849         for (int ii=0; ii<strsrch->pattern.cesLength; ii++) {
3850             printf(" %8x", strsrch->pattern.ces[ii]);
3851         }
3852         printf("\n");
3853     }
3854 
3855 #endif
3856     // Input parameter sanity check.
3857     //  TODO:  should input indicies clip to the text length
3858     //         in the same way that UText does.
3859     if(strsrch->pattern.cesLength == 0         ||
3860        startIdx < 0                           ||
3861        startIdx > strsrch->search->textLength ||
3862        strsrch->pattern.ces == NULL) {
3863            *status = U_ILLEGAL_ARGUMENT_ERROR;
3864            return FALSE;
3865     }
3866 
3867     if (strsrch->pattern.pces == NULL) {
3868         initializePatternPCETable(strsrch, status);
3869     }
3870 
3871     ucol_setOffset(strsrch->textIter, startIdx, status);
3872     CEIBuffer ceb(strsrch, status);
3873 
3874 
3875     int32_t    targetIx = 0;
3876     const CEI *targetCEI = NULL;
3877     int32_t    patIx;
3878     UBool      found;
3879 
3880     int32_t  mStart = -1;
3881     int32_t  mLimit = -1;
3882     int32_t  minLimit;
3883     int32_t  maxLimit;
3884 
3885 
3886 
3887     // Outer loop moves over match starting positions in the
3888     //      target CE space.
3889     // Here we see the target as a sequence of collation elements, resulting from the following:
3890     // 1. Target characters were decomposed, and (if appropriate) other compressions and expansions are applied
3891     //    (for example, digraphs such as IJ may be broken into two characters).
3892     // 2. An int64_t CE weight is determined for each resulting unit (high 16 bits are primary strength, next
3893     //    16 bits are secondary, next 16 (the high 16 bits of the low 32-bit half) are tertiary. Any of these
3894     //    fields that are for strengths below that of the collator are set to 0. If this makes the int64_t
3895     //    CE weight 0 (as for a combining diacritic with secondary weight when the collator strentgh is primary),
3896     //    then the CE is deleted, so the following code sees only CEs that are relevant.
3897     // For each CE, the lowIndex and highIndex correspond to where this CE begins and ends in the original text.
3898     // If lowIndex==highIndex, either the CE resulted from an expansion/decomposition of one of the original text
3899     // characters, or the CE marks the limit of the target text (in which case the CE weight is UCOL_PROCESSED_NULLORDER).
3900     //
3901     for(targetIx=0; ; targetIx++)
3902     {
3903         found = TRUE;
3904         //  Inner loop checks for a match beginning at each
3905         //  position from the outer loop.
3906         int32_t targetIxOffset = 0;
3907         int64_t patCE = 0;
3908         // For targetIx > 0, this ceb.get gets a CE that is as far back in the ring buffer
3909         // (compared to the last CE fetched for the previous targetIx value) as we need to go
3910         // for this targetIx value, so if it is non-NULL then other ceb.get calls should be OK.
3911         const CEI *firstCEI = ceb.get(targetIx);
3912         if (firstCEI == NULL) {
3913             *status = U_INTERNAL_PROGRAM_ERROR;
3914             found = FALSE;
3915             break;
3916         }
3917 
3918         for (patIx=0; patIx<strsrch->pattern.pcesLength; patIx++) {
3919             patCE = strsrch->pattern.pces[patIx];
3920             targetCEI = ceb.get(targetIx+patIx+targetIxOffset);
3921             //  Compare CE from target string with CE from the pattern.
3922             //    Note that the target CE will be UCOL_PROCESSED_NULLORDER if we reach the end of input,
3923             //    which will fail the compare, below.
3924             UCompareCEsResult ceMatch = compareCE64s(targetCEI->ce, patCE, strsrch->search->elementComparisonType);
3925             if ( ceMatch == U_CE_NO_MATCH ) {
3926                 found = FALSE;
3927                 break;
3928             } else if ( ceMatch > U_CE_NO_MATCH ) {
3929                 if ( ceMatch == U_CE_SKIP_TARG ) {
3930                     // redo with same patCE, next targCE
3931                     patIx--;
3932                     targetIxOffset++;
3933                 } else { // ceMatch == U_CE_SKIP_PATN
3934                     // redo with same targCE, next patCE
3935                     targetIxOffset--;
3936                 }
3937             }
3938         }
3939         targetIxOffset += strsrch->pattern.pcesLength; // this is now the offset in target CE space to end of the match so far
3940 
3941         if (!found && ((targetCEI == NULL) || (targetCEI->ce != UCOL_PROCESSED_NULLORDER))) {
3942             // No match at this targetIx.  Try again at the next.
3943             continue;
3944         }
3945 
3946         if (!found) {
3947             // No match at all, we have run off the end of the target text.
3948             break;
3949         }
3950 
3951 
3952         // We have found a match in CE space.
3953         // Now determine the bounds in string index space.
3954         //  There still is a chance of match failure if the CE range not correspond to
3955         //     an acceptable character range.
3956         //
3957         const CEI *lastCEI  = ceb.get(targetIx + targetIxOffset - 1);
3958 
3959         mStart   = firstCEI->lowIndex;
3960         minLimit = lastCEI->lowIndex;
3961 
3962         // Look at the CE following the match.  If it is UCOL_NULLORDER the match
3963         //   extended to the end of input, and the match is good.
3964 
3965         // Look at the high and low indices of the CE following the match. If
3966         // they are the same it means one of two things:
3967         //    1. The match extended to the last CE from the target text, which is OK, or
3968         //    2. The last CE that was part of the match is in an expansion that extends
3969         //       to the first CE after the match. In this case, we reject the match.
3970         const CEI *nextCEI = 0;
3971         if (strsrch->search->elementComparisonType == 0) {
3972             nextCEI  = ceb.get(targetIx + targetIxOffset);
3973             maxLimit = nextCEI->lowIndex;
3974             if (nextCEI->lowIndex == nextCEI->highIndex && nextCEI->ce != UCOL_PROCESSED_NULLORDER) {
3975                 found = FALSE;
3976             }
3977         } else {
3978             for ( ; ; ++targetIxOffset ) {
3979                 nextCEI = ceb.get(targetIx + targetIxOffset);
3980                 maxLimit = nextCEI->lowIndex;
3981                 // If we are at the end of the target too, match succeeds
3982                 if (  nextCEI->ce == UCOL_PROCESSED_NULLORDER ) {
3983                     break;
3984                 }
3985                 // As long as the next CE has primary weight of 0,
3986                 // it is part of the last target element matched by the pattern;
3987                 // make sure it can be part of a match with the last patCE
3988                 if ( (((nextCEI->ce) >> 32) & 0xFFFF0000UL) == 0 ) {
3989                     UCompareCEsResult ceMatch = compareCE64s(nextCEI->ce, patCE, strsrch->search->elementComparisonType);
3990                     if ( ceMatch == U_CE_NO_MATCH || ceMatch == U_CE_SKIP_PATN ) {
3991                         found = FALSE;
3992                         break;
3993                     }
3994                 // If lowIndex == highIndex, this target CE is part of an expansion of the last matched
3995                 // target element, but it has non-zero primary weight => match fails
3996                 } else if ( nextCEI->lowIndex == nextCEI->highIndex ) {
3997                     found = false;
3998                     break;
3999                 // Else the target CE is not part of an expansion of the last matched element, match succeeds
4000                 } else {
4001                     break;
4002                 }
4003             }
4004         }
4005 
4006 
4007         // Check for the start of the match being within a combining sequence.
4008         //   This can happen if the pattern itself begins with a combining char, and
4009         //   the match found combining marks in the target text that were attached
4010         //    to something else.
4011         //   This type of match should be rejected for not completely consuming a
4012         //   combining sequence.
4013         if (!isBreakBoundary(strsrch, mStart)) {
4014             found = FALSE;
4015         }
4016 
4017         // Check for the start of the match being within an Collation Element Expansion,
4018         //   meaning that the first char of the match is only partially matched.
4019         //   With exapnsions, the first CE will report the index of the source
4020         //   character, and all subsequent (expansions) CEs will report the source index of the
4021         //    _following_ character.
4022         int32_t secondIx = firstCEI->highIndex;
4023         if (mStart == secondIx) {
4024             found = FALSE;
4025         }
4026 
4027         // Allow matches to end in the middle of a grapheme cluster if the following
4028         // conditions are met; this is needed to make prefix search work properly in
4029         // Indic, see #11750
4030         // * the default breakIter is being used
4031         // * the next collation element after this combining sequence
4032         //   - has non-zero primary weight
4033         //   - corresponds to a separate character following the one at end of the current match
4034         //   (the second of these conditions, and perhaps both, may be redundant given the
4035         //   subsequent check for normalization boundary; however they are likely much faster
4036         //   tests in any case)
4037         // * the match limit is a normalization boundary
4038         UBool allowMidclusterMatch = FALSE;
4039         if (strsrch->search->text != NULL && strsrch->search->textLength > maxLimit) {
4040             allowMidclusterMatch =
4041                     strsrch->search->breakIter == NULL &&
4042                     nextCEI != NULL && (((nextCEI->ce) >> 32) & 0xFFFF0000UL) != 0 &&
4043                     maxLimit >= lastCEI->highIndex && nextCEI->highIndex > maxLimit &&
4044                     (strsrch->nfd->hasBoundaryBefore(codePointAt(*strsrch->search, maxLimit)) ||
4045                         strsrch->nfd->hasBoundaryAfter(codePointBefore(*strsrch->search, maxLimit)));
4046         }
4047         // If those conditions are met, then:
4048         // * do NOT advance the candidate match limit (mLimit) to a break boundary; however
4049         //   the match limit may be backed off to a previous break boundary. This handles
4050         //   cases in which mLimit includes target characters that are ignorable with current
4051         //   settings (such as space) and which extend beyond the pattern match.
4052         // * do NOT require that end of the combining sequence not extend beyond the match in CE space
4053         // * do NOT require that match limit be on a breakIter boundary
4054 
4055         //  Advance the match end position to the first acceptable match boundary.
4056         //    This advances the index over any combining charcters.
4057         mLimit = maxLimit;
4058         if (minLimit < maxLimit) {
4059             // When the last CE's low index is same with its high index, the CE is likely
4060             // a part of expansion. In this case, the index is located just after the
4061             // character corresponding to the CEs compared above. If the index is right
4062             // at the break boundary, move the position to the next boundary will result
4063             // incorrect match length when there are ignorable characters exist between
4064             // the position and the next character produces CE(s). See ticket#8482.
4065             if (minLimit == lastCEI->highIndex && isBreakBoundary(strsrch, minLimit)) {
4066                 mLimit = minLimit;
4067             } else {
4068                 int32_t nba = nextBoundaryAfter(strsrch, minLimit);
4069                 // Note that we can have nba < maxLimit && nba >= minLImit, in which
4070                 // case we want to set mLimit to nba regardless of allowMidclusterMatch
4071                 // (i.e. we back off mLimit to the previous breakIterator boundary).
4072                 if (nba >= lastCEI->highIndex && (!allowMidclusterMatch || nba < maxLimit)) {
4073                     mLimit = nba;
4074                 }
4075             }
4076         }
4077 
4078     #ifdef USEARCH_DEBUG
4079         if (getenv("USEARCH_DEBUG") != NULL) {
4080             printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit, maxLimit, mLimit);
4081         }
4082     #endif
4083 
4084         if (!allowMidclusterMatch) {
4085             // If advancing to the end of a combining sequence in character indexing space
4086             //   advanced us beyond the end of the match in CE space, reject this match.
4087             if (mLimit > maxLimit) {
4088                 found = FALSE;
4089             }
4090 
4091             if (!isBreakBoundary(strsrch, mLimit)) {
4092                 found = FALSE;
4093             }
4094         }
4095 
4096         if (! checkIdentical(strsrch, mStart, mLimit)) {
4097             found = FALSE;
4098         }
4099 
4100         if (found) {
4101             break;
4102         }
4103     }
4104 
4105     #ifdef USEARCH_DEBUG
4106     if (getenv("USEARCH_DEBUG") != NULL) {
4107         printf("Target CEs [%d .. %d]\n", ceb.firstIx, ceb.limitIx);
4108         int32_t  lastToPrint = ceb.limitIx+2;
4109         for (int ii=ceb.firstIx; ii<lastToPrint; ii++) {
4110             printf("%8x@%d ", ceb.get(ii)->ce, ceb.get(ii)->srcIndex);
4111         }
4112         printf("\n%s\n", found? "match found" : "no match");
4113     }
4114     #endif
4115 
4116     // All Done.  Store back the match bounds to the caller.
4117     //
4118     if (found==FALSE) {
4119         mLimit = -1;
4120         mStart = -1;
4121     }
4122 
4123     if (matchStart != NULL) {
4124         *matchStart= mStart;
4125     }
4126 
4127     if (matchLimit != NULL) {
4128         *matchLimit = mLimit;
4129     }
4130 
4131     return found;
4132 }
4133 
usearch_searchBackwards(UStringSearch * strsrch,int32_t startIdx,int32_t * matchStart,int32_t * matchLimit,UErrorCode * status)4134 U_CAPI UBool U_EXPORT2 usearch_searchBackwards(UStringSearch  *strsrch,
4135                                                 int32_t        startIdx,
4136                                                 int32_t        *matchStart,
4137                                                 int32_t        *matchLimit,
4138                                                 UErrorCode     *status)
4139 {
4140     if (U_FAILURE(*status)) {
4141         return FALSE;
4142     }
4143 
4144     // TODO:  reject search patterns beginning with a combining char.
4145 
4146 #ifdef USEARCH_DEBUG
4147     if (getenv("USEARCH_DEBUG") != NULL) {
4148         printf("Pattern CEs\n");
4149         for (int ii=0; ii<strsrch->pattern.cesLength; ii++) {
4150             printf(" %8x", strsrch->pattern.ces[ii]);
4151         }
4152         printf("\n");
4153     }
4154 
4155 #endif
4156     // Input parameter sanity check.
4157     //  TODO:  should input indicies clip to the text length
4158     //         in the same way that UText does.
4159     if(strsrch->pattern.cesLength == 0         ||
4160        startIdx < 0                           ||
4161        startIdx > strsrch->search->textLength ||
4162        strsrch->pattern.ces == NULL) {
4163            *status = U_ILLEGAL_ARGUMENT_ERROR;
4164            return FALSE;
4165     }
4166 
4167     if (strsrch->pattern.pces == NULL) {
4168         initializePatternPCETable(strsrch, status);
4169     }
4170 
4171     CEIBuffer ceb(strsrch, status);
4172     int32_t    targetIx = 0;
4173 
4174     /*
4175      * Pre-load the buffer with the CE's for the grapheme
4176      * after our starting position so that we're sure that
4177      * we can look at the CE following the match when we
4178      * check the match boundaries.
4179      *
4180      * This will also pre-fetch the first CE that we'll
4181      * consider for the match.
4182      */
4183     if (startIdx < strsrch->search->textLength) {
4184         UBreakIterator *bi = strsrch->search->internalBreakIter;
4185         int32_t next = ubrk_following(bi, startIdx);
4186 
4187         ucol_setOffset(strsrch->textIter, next, status);
4188 
4189         for (targetIx = 0; ; targetIx += 1) {
4190             if (ceb.getPrevious(targetIx)->lowIndex < startIdx) {
4191                 break;
4192             }
4193         }
4194     } else {
4195         ucol_setOffset(strsrch->textIter, startIdx, status);
4196     }
4197 
4198 
4199     const CEI *targetCEI = NULL;
4200     int32_t    patIx;
4201     UBool      found;
4202 
4203     int32_t  limitIx = targetIx;
4204     int32_t  mStart = -1;
4205     int32_t  mLimit = -1;
4206     int32_t  minLimit;
4207     int32_t  maxLimit;
4208 
4209 
4210 
4211     // Outer loop moves over match starting positions in the
4212     //      target CE space.
4213     // Here, targetIx values increase toward the beginning of the base text (i.e. we get the text CEs in reverse order).
4214     // But  patIx is 0 at the beginning of the pattern and increases toward the end.
4215     // So this loop performs a comparison starting with the end of pattern, and prcessd toward the beginning of the pattern
4216     // and the beginning of the base text.
4217     for(targetIx = limitIx; ; targetIx += 1)
4218     {
4219         found = TRUE;
4220         // For targetIx > limitIx, this ceb.getPrevious gets a CE that is as far back in the ring buffer
4221         // (compared to the last CE fetched for the previous targetIx value) as we need to go
4222         // for this targetIx value, so if it is non-NULL then other ceb.getPrevious calls should be OK.
4223         const CEI *lastCEI  = ceb.getPrevious(targetIx);
4224         if (lastCEI == NULL) {
4225             *status = U_INTERNAL_PROGRAM_ERROR;
4226             found = FALSE;
4227              break;
4228         }
4229         //  Inner loop checks for a match beginning at each
4230         //  position from the outer loop.
4231         int32_t targetIxOffset = 0;
4232         for (patIx = strsrch->pattern.pcesLength - 1; patIx >= 0; patIx -= 1) {
4233             int64_t patCE = strsrch->pattern.pces[patIx];
4234 
4235             targetCEI = ceb.getPrevious(targetIx + strsrch->pattern.pcesLength - 1 - patIx + targetIxOffset);
4236             //  Compare CE from target string with CE from the pattern.
4237             //    Note that the target CE will be UCOL_NULLORDER if we reach the end of input,
4238             //    which will fail the compare, below.
4239             UCompareCEsResult ceMatch = compareCE64s(targetCEI->ce, patCE, strsrch->search->elementComparisonType);
4240             if ( ceMatch == U_CE_NO_MATCH ) {
4241                 found = FALSE;
4242                 break;
4243             } else if ( ceMatch > U_CE_NO_MATCH ) {
4244                 if ( ceMatch == U_CE_SKIP_TARG ) {
4245                     // redo with same patCE, next targCE
4246                     patIx++;
4247                     targetIxOffset++;
4248                 } else { // ceMatch == U_CE_SKIP_PATN
4249                     // redo with same targCE, next patCE
4250                     targetIxOffset--;
4251                 }
4252             }
4253         }
4254 
4255         if (!found && ((targetCEI == NULL) || (targetCEI->ce != UCOL_PROCESSED_NULLORDER))) {
4256             // No match at this targetIx.  Try again at the next.
4257             continue;
4258         }
4259 
4260         if (!found) {
4261             // No match at all, we have run off the end of the target text.
4262             break;
4263         }
4264 
4265 
4266         // We have found a match in CE space.
4267         // Now determine the bounds in string index space.
4268         //  There still is a chance of match failure if the CE range not correspond to
4269         //     an acceptable character range.
4270         //
4271         const CEI *firstCEI = ceb.getPrevious(targetIx + strsrch->pattern.pcesLength - 1 + targetIxOffset);
4272         mStart   = firstCEI->lowIndex;
4273 
4274         // Check for the start of the match being within a combining sequence.
4275         //   This can happen if the pattern itself begins with a combining char, and
4276         //   the match found combining marks in the target text that were attached
4277         //    to something else.
4278         //   This type of match should be rejected for not completely consuming a
4279         //   combining sequence.
4280         if (!isBreakBoundary(strsrch, mStart)) {
4281             found = FALSE;
4282         }
4283 
4284         // Look at the high index of the first CE in the match. If it's the same as the
4285         // low index, the first CE in the match is in the middle of an expansion.
4286         if (mStart == firstCEI->highIndex) {
4287             found = FALSE;
4288         }
4289 
4290 
4291         minLimit = lastCEI->lowIndex;
4292 
4293         if (targetIx > 0) {
4294             // Look at the CE following the match.  If it is UCOL_NULLORDER the match
4295             //   extended to the end of input, and the match is good.
4296 
4297             // Look at the high and low indices of the CE following the match. If
4298             // they are the same it means one of two things:
4299             //    1. The match extended to the last CE from the target text, which is OK, or
4300             //    2. The last CE that was part of the match is in an expansion that extends
4301             //       to the first CE after the match. In this case, we reject the match.
4302             const CEI *nextCEI  = ceb.getPrevious(targetIx - 1);
4303 
4304             if (nextCEI->lowIndex == nextCEI->highIndex && nextCEI->ce != UCOL_PROCESSED_NULLORDER) {
4305                 found = FALSE;
4306             }
4307 
4308             mLimit = maxLimit = nextCEI->lowIndex;
4309 
4310             // Allow matches to end in the middle of a grapheme cluster if the following
4311             // conditions are met; this is needed to make prefix search work properly in
4312             // Indic, see #11750
4313             // * the default breakIter is being used
4314             // * the next collation element after this combining sequence
4315             //   - has non-zero primary weight
4316             //   - corresponds to a separate character following the one at end of the current match
4317             //   (the second of these conditions, and perhaps both, may be redundant given the
4318             //   subsequent check for normalization boundary; however they are likely much faster
4319             //   tests in any case)
4320             // * the match limit is a normalization boundary
4321             UBool allowMidclusterMatch = FALSE;
4322             if (strsrch->search->text != NULL && strsrch->search->textLength > maxLimit) {
4323                 allowMidclusterMatch =
4324                         strsrch->search->breakIter == NULL &&
4325                         nextCEI != NULL && (((nextCEI->ce) >> 32) & 0xFFFF0000UL) != 0 &&
4326                         maxLimit >= lastCEI->highIndex && nextCEI->highIndex > maxLimit &&
4327                         (strsrch->nfd->hasBoundaryBefore(codePointAt(*strsrch->search, maxLimit)) ||
4328                             strsrch->nfd->hasBoundaryAfter(codePointBefore(*strsrch->search, maxLimit)));
4329             }
4330             // If those conditions are met, then:
4331             // * do NOT advance the candidate match limit (mLimit) to a break boundary; however
4332             //   the match limit may be backed off to a previous break boundary. This handles
4333             //   cases in which mLimit includes target characters that are ignorable with current
4334             //   settings (such as space) and which extend beyond the pattern match.
4335             // * do NOT require that end of the combining sequence not extend beyond the match in CE space
4336             // * do NOT require that match limit be on a breakIter boundary
4337 
4338             //  Advance the match end position to the first acceptable match boundary.
4339             //    This advances the index over any combining characters.
4340             if (minLimit < maxLimit) {
4341                 int32_t nba = nextBoundaryAfter(strsrch, minLimit);
4342                 // Note that we can have nba < maxLimit && nba >= minLImit, in which
4343                 // case we want to set mLimit to nba regardless of allowMidclusterMatch
4344                 // (i.e. we back off mLimit to the previous breakIterator boundary).
4345                 if (nba >= lastCEI->highIndex && (!allowMidclusterMatch || nba < maxLimit)) {
4346                     mLimit = nba;
4347                 }
4348             }
4349 
4350             if (!allowMidclusterMatch) {
4351                 // If advancing to the end of a combining sequence in character indexing space
4352                 //   advanced us beyond the end of the match in CE space, reject this match.
4353                 if (mLimit > maxLimit) {
4354                     found = FALSE;
4355                 }
4356 
4357                 // Make sure the end of the match is on a break boundary
4358                 if (!isBreakBoundary(strsrch, mLimit)) {
4359                     found = FALSE;
4360                 }
4361             }
4362 
4363         } else {
4364             // No non-ignorable CEs after this point.
4365             // The maximum position is detected by boundary after
4366             // the last non-ignorable CE. Combining sequence
4367             // across the start index will be truncated.
4368             int32_t nba = nextBoundaryAfter(strsrch, minLimit);
4369             mLimit = maxLimit = (nba > 0) && (startIdx > nba) ? nba : startIdx;
4370         }
4371 
4372     #ifdef USEARCH_DEBUG
4373         if (getenv("USEARCH_DEBUG") != NULL) {
4374             printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit, maxLimit, mLimit);
4375         }
4376     #endif
4377 
4378 
4379         if (! checkIdentical(strsrch, mStart, mLimit)) {
4380             found = FALSE;
4381         }
4382 
4383         if (found) {
4384             break;
4385         }
4386     }
4387 
4388     #ifdef USEARCH_DEBUG
4389     if (getenv("USEARCH_DEBUG") != NULL) {
4390         printf("Target CEs [%d .. %d]\n", ceb.firstIx, ceb.limitIx);
4391         int32_t  lastToPrint = ceb.limitIx+2;
4392         for (int ii=ceb.firstIx; ii<lastToPrint; ii++) {
4393             printf("%8x@%d ", ceb.get(ii)->ce, ceb.get(ii)->srcIndex);
4394         }
4395         printf("\n%s\n", found? "match found" : "no match");
4396     }
4397     #endif
4398 
4399     // All Done.  Store back the match bounds to the caller.
4400     //
4401     if (found==FALSE) {
4402         mLimit = -1;
4403         mStart = -1;
4404     }
4405 
4406     if (matchStart != NULL) {
4407         *matchStart= mStart;
4408     }
4409 
4410     if (matchLimit != NULL) {
4411         *matchLimit = mLimit;
4412     }
4413 
4414     return found;
4415 }
4416 
4417 // internal use methods declared in usrchimp.h -----------------------------
4418 
usearch_handleNextExact(UStringSearch * strsrch,UErrorCode * status)4419 UBool usearch_handleNextExact(UStringSearch *strsrch, UErrorCode *status)
4420 {
4421     if (U_FAILURE(*status)) {
4422         setMatchNotFound(strsrch);
4423         return FALSE;
4424     }
4425 
4426 #if BOYER_MOORE
4427     UCollationElements *coleiter        = strsrch->textIter;
4428     int32_t             textlength      = strsrch->search->textLength;
4429     int32_t            *patternce       = strsrch->pattern.ces;
4430     int32_t             patterncelength = strsrch->pattern.cesLength;
4431     int32_t             textoffset      = ucol_getOffset(coleiter);
4432 
4433     // status used in setting coleiter offset, since offset is checked in
4434     // shiftForward before setting the coleiter offset, status never
4435     // a failure
4436     textoffset = shiftForward(strsrch, textoffset, UCOL_NULLORDER,
4437                               patterncelength);
4438     while (textoffset <= textlength)
4439     {
4440         uint32_t    patternceindex = patterncelength - 1;
4441         int32_t     targetce;
4442         UBool       found          = FALSE;
4443         int32_t    lastce          = UCOL_NULLORDER;
4444 
4445         setColEIterOffset(coleiter, textoffset);
4446 
4447         for (;;) {
4448             // finding the last pattern ce match, imagine composite characters
4449             // for example: search for pattern A in text \u00C0
4450             // we'll have to skip \u0300 the grave first before we get to A
4451             targetce = ucol_previous(coleiter, status);
4452             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4453                 found = FALSE;
4454                 break;
4455             }
4456             targetce = getCE(strsrch, targetce);
4457             if (targetce == UCOL_IGNORABLE && inNormBuf(coleiter)) {
4458                 // this is for the text \u0315\u0300 that requires
4459                 // normalization and pattern \u0300, where \u0315 is ignorable
4460                 continue;
4461             }
4462             if (lastce == UCOL_NULLORDER || lastce == UCOL_IGNORABLE) {
4463                 lastce = targetce;
4464             }
4465             // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4466             if (targetce == patternce[patternceindex]) {
4467                 // the first ce can be a contraction
4468                 found = TRUE;
4469                 break;
4470             }
4471             if (!hasExpansion(coleiter)) {
4472                 found = FALSE;
4473                 break;
4474             }
4475         }
4476 
4477         //targetce = lastce;
4478 
4479         while (found && patternceindex > 0) {
4480             lastce = targetce;
4481             targetce    = ucol_previous(coleiter, status);
4482             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4483                 found = FALSE;
4484                 break;
4485             }
4486             targetce    = getCE(strsrch, targetce);
4487             if (targetce == UCOL_IGNORABLE) {
4488                 continue;
4489             }
4490 
4491             patternceindex --;
4492             // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4493             found = found && targetce == patternce[patternceindex];
4494         }
4495 
4496         targetce = lastce;
4497 
4498         if (!found) {
4499             if (U_FAILURE(*status)) {
4500                 break;
4501             }
4502             textoffset = shiftForward(strsrch, textoffset, lastce,
4503                                       patternceindex);
4504             // status checked at loop.
4505             patternceindex = patterncelength;
4506             continue;
4507         }
4508 
4509         if (checkNextExactMatch(strsrch, &textoffset, status)) {
4510             // status checked in ucol_setOffset
4511             setColEIterOffset(coleiter, strsrch->search->matchedIndex);
4512             return TRUE;
4513         }
4514     }
4515     setMatchNotFound(strsrch);
4516     return FALSE;
4517 #else
4518     int32_t textOffset = ucol_getOffset(strsrch->textIter);
4519     int32_t start = -1;
4520     int32_t end = -1;
4521 
4522     if (usearch_search(strsrch, textOffset, &start, &end, status)) {
4523         strsrch->search->matchedIndex  = start;
4524         strsrch->search->matchedLength = end - start;
4525         return TRUE;
4526     } else {
4527         setMatchNotFound(strsrch);
4528         return FALSE;
4529     }
4530 #endif
4531 }
4532 
usearch_handleNextCanonical(UStringSearch * strsrch,UErrorCode * status)4533 UBool usearch_handleNextCanonical(UStringSearch *strsrch, UErrorCode *status)
4534 {
4535     if (U_FAILURE(*status)) {
4536         setMatchNotFound(strsrch);
4537         return FALSE;
4538     }
4539 
4540 #if BOYER_MOORE
4541     UCollationElements *coleiter        = strsrch->textIter;
4542     int32_t             textlength      = strsrch->search->textLength;
4543     int32_t            *patternce       = strsrch->pattern.ces;
4544     int32_t             patterncelength = strsrch->pattern.cesLength;
4545     int32_t             textoffset      = ucol_getOffset(coleiter);
4546     UBool               hasPatternAccents =
4547        strsrch->pattern.hasSuffixAccents || strsrch->pattern.hasPrefixAccents;
4548 
4549     textoffset = shiftForward(strsrch, textoffset, UCOL_NULLORDER,
4550                               patterncelength);
4551     strsrch->canonicalPrefixAccents[0] = 0;
4552     strsrch->canonicalSuffixAccents[0] = 0;
4553 
4554     while (textoffset <= textlength)
4555     {
4556         int32_t     patternceindex = patterncelength - 1;
4557         int32_t     targetce;
4558         UBool       found          = FALSE;
4559         int32_t     lastce         = UCOL_NULLORDER;
4560 
4561         setColEIterOffset(coleiter, textoffset);
4562 
4563         for (;;) {
4564             // finding the last pattern ce match, imagine composite characters
4565             // for example: search for pattern A in text \u00C0
4566             // we'll have to skip \u0300 the grave first before we get to A
4567             targetce = ucol_previous(coleiter, status);
4568             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4569                 found = FALSE;
4570                 break;
4571             }
4572             targetce = getCE(strsrch, targetce);
4573             if (lastce == UCOL_NULLORDER || lastce == UCOL_IGNORABLE) {
4574                 lastce = targetce;
4575             }
4576             // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4577             if (targetce == patternce[patternceindex]) {
4578                 // the first ce can be a contraction
4579                 found = TRUE;
4580                 break;
4581             }
4582             if (!hasExpansion(coleiter)) {
4583                 found = FALSE;
4584                 break;
4585             }
4586         }
4587 
4588         while (found && patternceindex > 0) {
4589             targetce    = ucol_previous(coleiter, status);
4590             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4591                 found = FALSE;
4592                 break;
4593             }
4594             targetce    = getCE(strsrch, targetce);
4595             if (targetce == UCOL_IGNORABLE) {
4596                 continue;
4597             }
4598 
4599             patternceindex --;
4600             // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4601             found = found && targetce == patternce[patternceindex];
4602         }
4603 
4604         // initializing the rearranged accent array
4605         if (hasPatternAccents && !found) {
4606             strsrch->canonicalPrefixAccents[0] = 0;
4607             strsrch->canonicalSuffixAccents[0] = 0;
4608             if (U_FAILURE(*status)) {
4609                 break;
4610             }
4611             found = doNextCanonicalMatch(strsrch, textoffset, status);
4612         }
4613 
4614         if (!found) {
4615             if (U_FAILURE(*status)) {
4616                 break;
4617             }
4618             textoffset = shiftForward(strsrch, textoffset, lastce,
4619                                       patternceindex);
4620             // status checked at loop
4621             patternceindex = patterncelength;
4622             continue;
4623         }
4624 
4625         if (checkNextCanonicalMatch(strsrch, &textoffset, status)) {
4626             setColEIterOffset(coleiter, strsrch->search->matchedIndex);
4627             return TRUE;
4628         }
4629     }
4630     setMatchNotFound(strsrch);
4631     return FALSE;
4632 #else
4633     int32_t textOffset = ucol_getOffset(strsrch->textIter);
4634     int32_t start = -1;
4635     int32_t end = -1;
4636 
4637     if (usearch_search(strsrch, textOffset, &start, &end, status)) {
4638         strsrch->search->matchedIndex  = start;
4639         strsrch->search->matchedLength = end - start;
4640         return TRUE;
4641     } else {
4642         setMatchNotFound(strsrch);
4643         return FALSE;
4644     }
4645 #endif
4646 }
4647 
usearch_handlePreviousExact(UStringSearch * strsrch,UErrorCode * status)4648 UBool usearch_handlePreviousExact(UStringSearch *strsrch, UErrorCode *status)
4649 {
4650     if (U_FAILURE(*status)) {
4651         setMatchNotFound(strsrch);
4652         return FALSE;
4653     }
4654 
4655 #if BOYER_MOORE
4656     UCollationElements *coleiter        = strsrch->textIter;
4657     int32_t            *patternce       = strsrch->pattern.ces;
4658     int32_t             patterncelength = strsrch->pattern.cesLength;
4659     int32_t             textoffset      = ucol_getOffset(coleiter);
4660 
4661     // shifting it check for setting offset
4662     // if setOffset is called previously or there was no previous match, we
4663     // leave the offset as it is.
4664     if (strsrch->search->matchedIndex != USEARCH_DONE) {
4665         textoffset = strsrch->search->matchedIndex;
4666     }
4667 
4668     textoffset = reverseShift(strsrch, textoffset, UCOL_NULLORDER,
4669                               patterncelength);
4670 
4671     while (textoffset >= 0)
4672     {
4673         int32_t     patternceindex = 1;
4674         int32_t     targetce;
4675         UBool       found          = FALSE;
4676         int32_t     firstce        = UCOL_NULLORDER;
4677 
4678         // if status is a failure, ucol_setOffset does nothing
4679         setColEIterOffset(coleiter, textoffset);
4680 
4681         for (;;) {
4682             // finding the first pattern ce match, imagine composite
4683             // characters. for example: search for pattern \u0300 in text
4684             // \u00C0, we'll have to skip A first before we get to
4685             // \u0300 the grave accent
4686             targetce = ucol_next(coleiter, status);
4687             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4688                 found = FALSE;
4689                 break;
4690             }
4691             targetce = getCE(strsrch, targetce);
4692             if (firstce == UCOL_NULLORDER || firstce == UCOL_IGNORABLE) {
4693                 firstce = targetce;
4694             }
4695             if (targetce == UCOL_IGNORABLE && strsrch->strength != UCOL_PRIMARY) {
4696                 continue;
4697             }
4698             // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4699             if (targetce == patternce[0]) {
4700                 found = TRUE;
4701                 break;
4702             }
4703             if (!hasExpansion(coleiter)) {
4704                 // checking for accents in composite character
4705                 found = FALSE;
4706                 break;
4707             }
4708         }
4709 
4710         //targetce = firstce;
4711 
4712         while (found && (patternceindex < patterncelength)) {
4713             firstce = targetce;
4714             targetce    = ucol_next(coleiter, status);
4715             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4716                 found = FALSE;
4717                 break;
4718             }
4719             targetce    = getCE(strsrch, targetce);
4720             if (targetce == UCOL_IGNORABLE) {
4721                 continue;
4722             }
4723 
4724             // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4725             found = found && targetce == patternce[patternceindex];
4726             patternceindex ++;
4727         }
4728 
4729         targetce = firstce;
4730 
4731         if (!found) {
4732             if (U_FAILURE(*status)) {
4733                 break;
4734             }
4735 
4736             textoffset = reverseShift(strsrch, textoffset, targetce,
4737                                       patternceindex);
4738             patternceindex = 0;
4739             continue;
4740         }
4741 
4742         if (checkPreviousExactMatch(strsrch, &textoffset, status)) {
4743             setColEIterOffset(coleiter, textoffset);
4744             return TRUE;
4745         }
4746     }
4747     setMatchNotFound(strsrch);
4748     return FALSE;
4749 #else
4750     int32_t textOffset;
4751 
4752     if (strsrch->search->isOverlap) {
4753         if (strsrch->search->matchedIndex != USEARCH_DONE) {
4754             textOffset = strsrch->search->matchedIndex + strsrch->search->matchedLength - 1;
4755         } else {
4756             // move the start position at the end of possible match
4757             initializePatternPCETable(strsrch, status);
4758             if (!initTextProcessedIter(strsrch, status)) {
4759                 setMatchNotFound(strsrch);
4760                 return FALSE;
4761             }
4762             for (int32_t nPCEs = 0; nPCEs < strsrch->pattern.pcesLength - 1; nPCEs++) {
4763                 int64_t pce = strsrch->textProcessedIter->nextProcessed(NULL, NULL, status);
4764                 if (pce == UCOL_PROCESSED_NULLORDER) {
4765                     // at the end of the text
4766                     break;
4767                 }
4768             }
4769             if (U_FAILURE(*status)) {
4770                 setMatchNotFound(strsrch);
4771                 return FALSE;
4772             }
4773             textOffset = ucol_getOffset(strsrch->textIter);
4774         }
4775     } else {
4776         textOffset = ucol_getOffset(strsrch->textIter);
4777     }
4778 
4779     int32_t start = -1;
4780     int32_t end = -1;
4781 
4782     if (usearch_searchBackwards(strsrch, textOffset, &start, &end, status)) {
4783         strsrch->search->matchedIndex = start;
4784         strsrch->search->matchedLength = end - start;
4785         return TRUE;
4786     } else {
4787         setMatchNotFound(strsrch);
4788         return FALSE;
4789     }
4790 #endif
4791 }
4792 
usearch_handlePreviousCanonical(UStringSearch * strsrch,UErrorCode * status)4793 UBool usearch_handlePreviousCanonical(UStringSearch *strsrch,
4794                                       UErrorCode    *status)
4795 {
4796     if (U_FAILURE(*status)) {
4797         setMatchNotFound(strsrch);
4798         return FALSE;
4799     }
4800 
4801 #if BOYER_MOORE
4802     UCollationElements *coleiter        = strsrch->textIter;
4803     int32_t            *patternce       = strsrch->pattern.ces;
4804     int32_t             patterncelength = strsrch->pattern.cesLength;
4805     int32_t             textoffset      = ucol_getOffset(coleiter);
4806     UBool               hasPatternAccents =
4807        strsrch->pattern.hasSuffixAccents || strsrch->pattern.hasPrefixAccents;
4808 
4809     // shifting it check for setting offset
4810     // if setOffset is called previously or there was no previous match, we
4811     // leave the offset as it is.
4812     if (strsrch->search->matchedIndex != USEARCH_DONE) {
4813         textoffset = strsrch->search->matchedIndex;
4814     }
4815 
4816     textoffset = reverseShift(strsrch, textoffset, UCOL_NULLORDER,
4817                               patterncelength);
4818     strsrch->canonicalPrefixAccents[0] = 0;
4819     strsrch->canonicalSuffixAccents[0] = 0;
4820 
4821     while (textoffset >= 0)
4822     {
4823         int32_t     patternceindex = 1;
4824         int32_t     targetce;
4825         UBool       found          = FALSE;
4826         int32_t     firstce        = UCOL_NULLORDER;
4827 
4828         setColEIterOffset(coleiter, textoffset);
4829         for (;;) {
4830             // finding the first pattern ce match, imagine composite
4831             // characters. for example: search for pattern \u0300 in text
4832             // \u00C0, we'll have to skip A first before we get to
4833             // \u0300 the grave accent
4834             targetce = ucol_next(coleiter, status);
4835             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4836                 found = FALSE;
4837                 break;
4838             }
4839             targetce = getCE(strsrch, targetce);
4840             if (firstce == UCOL_NULLORDER || firstce == UCOL_IGNORABLE) {
4841                 firstce = targetce;
4842             }
4843 
4844             // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4845             if (targetce == patternce[0]) {
4846                 // the first ce can be a contraction
4847                 found = TRUE;
4848                 break;
4849             }
4850             if (!hasExpansion(coleiter)) {
4851                 // checking for accents in composite character
4852                 found = FALSE;
4853                 break;
4854             }
4855         }
4856 
4857         targetce = firstce;
4858 
4859         while (found && patternceindex < patterncelength) {
4860             targetce    = ucol_next(coleiter, status);
4861             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4862                 found = FALSE;
4863                 break;
4864             }
4865             targetce = getCE(strsrch, targetce);
4866             if (targetce == UCOL_IGNORABLE) {
4867                 continue;
4868             }
4869 
4870             // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4871             found = found && targetce == patternce[patternceindex];
4872             patternceindex ++;
4873         }
4874 
4875         // initializing the rearranged accent array
4876         if (hasPatternAccents && !found) {
4877             strsrch->canonicalPrefixAccents[0] = 0;
4878             strsrch->canonicalSuffixAccents[0] = 0;
4879             if (U_FAILURE(*status)) {
4880                 break;
4881             }
4882             found = doPreviousCanonicalMatch(strsrch, textoffset, status);
4883         }
4884 
4885         if (!found) {
4886             if (U_FAILURE(*status)) {
4887                 break;
4888             }
4889             textoffset = reverseShift(strsrch, textoffset, targetce,
4890                                       patternceindex);
4891             patternceindex = 0;
4892             continue;
4893         }
4894 
4895         if (checkPreviousCanonicalMatch(strsrch, &textoffset, status)) {
4896             setColEIterOffset(coleiter, textoffset);
4897             return TRUE;
4898         }
4899     }
4900     setMatchNotFound(strsrch);
4901     return FALSE;
4902 #else
4903     int32_t textOffset;
4904 
4905     if (strsrch->search->isOverlap) {
4906         if (strsrch->search->matchedIndex != USEARCH_DONE) {
4907             textOffset = strsrch->search->matchedIndex + strsrch->search->matchedLength - 1;
4908         } else {
4909             // move the start position at the end of possible match
4910             initializePatternPCETable(strsrch, status);
4911             if (!initTextProcessedIter(strsrch, status)) {
4912                 setMatchNotFound(strsrch);
4913                 return FALSE;
4914             }
4915             for (int32_t nPCEs = 0; nPCEs < strsrch->pattern.pcesLength - 1; nPCEs++) {
4916                 int64_t pce = strsrch->textProcessedIter->nextProcessed(NULL, NULL, status);
4917                 if (pce == UCOL_PROCESSED_NULLORDER) {
4918                     // at the end of the text
4919                     break;
4920                 }
4921             }
4922             if (U_FAILURE(*status)) {
4923                 setMatchNotFound(strsrch);
4924                 return FALSE;
4925             }
4926             textOffset = ucol_getOffset(strsrch->textIter);
4927         }
4928     } else {
4929         textOffset = ucol_getOffset(strsrch->textIter);
4930     }
4931 
4932     int32_t start = -1;
4933     int32_t end = -1;
4934 
4935     if (usearch_searchBackwards(strsrch, textOffset, &start, &end, status)) {
4936         strsrch->search->matchedIndex = start;
4937         strsrch->search->matchedLength = end - start;
4938         return TRUE;
4939     } else {
4940         setMatchNotFound(strsrch);
4941         return FALSE;
4942     }
4943 #endif
4944 }
4945 
4946 #endif /* #if !UCONFIG_NO_COLLATION */
4947