• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 *   Copyright (C) 2001-2015 IBM and others. All rights reserved.
6 **********************************************************************
7 *   Date        Name        Description
8 *  07/02/2001   synwee      Creation.
9 **********************************************************************
10 */
11 
12 #include "unicode/utypes.h"
13 
14 #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION
15 
16 #include "unicode/usearch.h"
17 #include "unicode/ustring.h"
18 #include "unicode/uchar.h"
19 #include "unicode/utf16.h"
20 #include "normalizer2impl.h"
21 #include "usrchimp.h"
22 #include "cmemory.h"
23 #include "ucln_in.h"
24 #include "uassert.h"
25 #include "ustr_imp.h"
26 
27 U_NAMESPACE_USE
28 
29 // internal definition ---------------------------------------------------
30 
31 #define LAST_BYTE_MASK_          0xFF
32 #define SECOND_LAST_BYTE_SHIFT_  8
33 #define SUPPLEMENTARY_MIN_VALUE_ 0x10000
34 
35 static const Normalizer2Impl *g_nfcImpl = nullptr;
36 
37 // internal methods -------------------------------------------------
38 
39 /**
40 * Fast collation element iterator setOffset.
41 * This function does not check for bounds.
42 * @param coleiter collation element iterator
43 * @param offset to set
44 */
45 static
setColEIterOffset(UCollationElements * elems,int32_t offset,UErrorCode & status)46 inline void setColEIterOffset(UCollationElements *elems,
47                               int32_t offset,
48                               UErrorCode &status)
49 {
50     // Note: Not "fast" any more after the 2013 collation rewrite.
51     // We do not want to expose more internals than necessary.
52     ucol_setOffset(elems, offset, &status);
53 }
54 
55 /**
56 * Getting the mask for collation strength
57 * @param strength collation strength
58 * @return collation element mask
59 */
60 static
getMask(UCollationStrength strength)61 inline uint32_t getMask(UCollationStrength strength)
62 {
63     switch (strength)
64     {
65     case UCOL_PRIMARY:
66         return UCOL_PRIMARYORDERMASK;
67     case UCOL_SECONDARY:
68         return UCOL_SECONDARYORDERMASK | UCOL_PRIMARYORDERMASK;
69     default:
70         return UCOL_TERTIARYORDERMASK | UCOL_SECONDARYORDERMASK |
71                UCOL_PRIMARYORDERMASK;
72     }
73 }
74 
75 U_CDECL_BEGIN
76 static UBool U_CALLCONV
usearch_cleanup(void)77 usearch_cleanup(void) {
78     g_nfcImpl = nullptr;
79     return TRUE;
80 }
81 U_CDECL_END
82 
83 /**
84 * Initializing the fcd tables.
85 * Internal method, status assumed to be a success.
86 * @param status output error if any, caller to check status before calling
87 *               method, status assumed to be success when passed in.
88 */
89 static
initializeFCD(UErrorCode * status)90 inline void initializeFCD(UErrorCode *status)
91 {
92     if (g_nfcImpl == nullptr) {
93         g_nfcImpl = Normalizer2Factory::getNFCImpl(*status);
94         ucln_i18n_registerCleanup(UCLN_I18N_USEARCH, usearch_cleanup);
95     }
96 }
97 
98 /**
99 * Gets the fcd value for a character at the argument index.
100 * This method takes into accounts of the supplementary characters.
101 * @param str UTF16 string where character for fcd retrieval resides
102 * @param offset position of the character whose fcd is to be retrieved, to be
103 *               overwritten with the next character position, taking
104 *               surrogate characters into consideration.
105 * @param strlength length of the argument string
106 * @return fcd value
107 */
108 static
getFCD(const UChar * str,int32_t * offset,int32_t strlength)109 uint16_t getFCD(const UChar   *str, int32_t *offset,
110                              int32_t  strlength)
111 {
112     const UChar *temp = str + *offset;
113     uint16_t    result = g_nfcImpl->nextFCD16(temp, str + strlength);
114     *offset = (int32_t)(temp - str);
115     return result;
116 }
117 
118 /**
119 * Getting the modified collation elements taking into account the collation
120 * attributes
121 * @param strsrch string search data
122 * @param sourcece
123 * @return the modified collation element
124 */
125 static
getCE(const UStringSearch * strsrch,uint32_t sourcece)126 inline int32_t getCE(const UStringSearch *strsrch, uint32_t sourcece)
127 {
128     // note for tertiary we can't use the collator->tertiaryMask, that
129     // is a preprocessed mask that takes into account case options. since
130     // we are only concerned with exact matches, we don't need that.
131     sourcece &= strsrch->ceMask;
132 
133     if (strsrch->toShift) {
134         // alternate handling here, since only the 16 most significant digits
135         // is only used, we can safely do a compare without masking
136         // if the ce is a variable, we mask and get only the primary values
137         // no shifting to quartenary is required since all primary values
138         // less than variabletop will need to be masked off anyway.
139         if (strsrch->variableTop > sourcece) {
140             if (strsrch->strength >= UCOL_QUATERNARY) {
141                 sourcece &= UCOL_PRIMARYORDERMASK;
142             }
143             else {
144                 sourcece = UCOL_IGNORABLE;
145             }
146         }
147     } else if (strsrch->strength >= UCOL_QUATERNARY && sourcece == UCOL_IGNORABLE) {
148         sourcece = 0xFFFF;
149     }
150 
151     return sourcece;
152 }
153 
154 /**
155 * Allocate a memory and returns nullptr if it failed.
156 * Internal method, status assumed to be a success.
157 * @param size to allocate
158 * @param status output error if any, caller to check status before calling
159 *               method, status assumed to be success when passed in.
160 * @return newly allocated array, nullptr otherwise
161 */
162 static
allocateMemory(uint32_t size,UErrorCode * status)163 inline void * allocateMemory(uint32_t size, UErrorCode *status)
164 {
165     uint32_t *result = (uint32_t *)uprv_malloc(size);
166     if (result == nullptr) {
167         *status = U_MEMORY_ALLOCATION_ERROR;
168     }
169     return result;
170 }
171 
172 /**
173 * Adds a uint32_t value to a destination array.
174 * Creates a new array if we run out of space. The caller will have to
175 * manually deallocate the newly allocated array.
176 * Internal method, status assumed to be success, caller has to check status
177 * before calling this method. destination not to be nullptr and has at least
178 * size destinationlength.
179 * @param destination target array
180 * @param offset destination offset to add value
181 * @param destinationlength target array size, return value for the new size
182 * @param value to be added
183 * @param increments incremental size expected
184 * @param status output error if any, caller to check status before calling
185 *               method, status assumed to be success when passed in.
186 * @return new destination array, destination if there was no new allocation
187 */
188 static
addTouint32_tArray(int32_t * destination,uint32_t offset,uint32_t * destinationlength,uint32_t value,uint32_t increments,UErrorCode * status)189 inline int32_t * addTouint32_tArray(int32_t    *destination,
190                                     uint32_t    offset,
191                                     uint32_t   *destinationlength,
192                                     uint32_t    value,
193                                     uint32_t    increments,
194                                     UErrorCode *status)
195 {
196     uint32_t newlength = *destinationlength;
197     if (offset + 1 == newlength) {
198         newlength += increments;
199         int32_t *temp = (int32_t *)allocateMemory(
200                                          sizeof(int32_t) * newlength, status);
201         if (U_FAILURE(*status)) {
202             return nullptr;
203         }
204         uprv_memcpy(temp, destination, sizeof(int32_t) * (size_t)offset);
205         *destinationlength = newlength;
206         destination        = temp;
207     }
208     destination[offset] = value;
209     return destination;
210 }
211 
212 /**
213 * Adds a uint64_t value to a destination array.
214 * Creates a new array if we run out of space. The caller will have to
215 * manually deallocate the newly allocated array.
216 * Internal method, status assumed to be success, caller has to check status
217 * before calling this method. destination not to be nullptr and has at least
218 * size destinationlength.
219 * @param destination target array
220 * @param offset destination offset to add value
221 * @param destinationlength target array size, return value for the new size
222 * @param value to be added
223 * @param increments incremental size expected
224 * @param status output error if any, caller to check status before calling
225 *               method, status assumed to be success when passed in.
226 * @return new destination array, destination if there was no new allocation
227 */
228 static
addTouint64_tArray(int64_t * destination,uint32_t offset,uint32_t * destinationlength,uint64_t value,uint32_t increments,UErrorCode * status)229 inline int64_t * addTouint64_tArray(int64_t    *destination,
230                                     uint32_t    offset,
231                                     uint32_t   *destinationlength,
232                                     uint64_t    value,
233                                     uint32_t    increments,
234                                     UErrorCode *status)
235 {
236     uint32_t newlength = *destinationlength;
237     if (offset + 1 == newlength) {
238         newlength += increments;
239         int64_t *temp = (int64_t *)allocateMemory(
240                                          sizeof(int64_t) * newlength, status);
241 
242         if (U_FAILURE(*status)) {
243             return nullptr;
244         }
245 
246         uprv_memcpy(temp, destination, sizeof(int64_t) * (size_t)offset);
247         *destinationlength = newlength;
248         destination        = temp;
249     }
250 
251     destination[offset] = value;
252 
253     return destination;
254 }
255 
256 /**
257 * Initializing the ce table for a pattern.
258 * Stores non-ignorable collation keys.
259 * Table size will be estimated by the size of the pattern text. Table
260 * expansion will be perform as we go along. Adding 1 to ensure that the table
261 * size definitely increases.
262 * Internal method, status assumed to be a success.
263 * @param strsrch string search data
264 * @param status output error if any, caller to check status before calling
265 *               method, status assumed to be success when passed in.
266 */
267 static
initializePatternCETable(UStringSearch * strsrch,UErrorCode * status)268 inline void initializePatternCETable(UStringSearch *strsrch, UErrorCode *status)
269 {
270     UPattern *pattern            = &(strsrch->pattern);
271     uint32_t  cetablesize        = INITIAL_ARRAY_SIZE_;
272     int32_t  *cetable            = pattern->cesBuffer;
273     uint32_t  patternlength      = pattern->textLength;
274     UCollationElements *coleiter = strsrch->utilIter;
275 
276     if (coleiter == nullptr) {
277         coleiter = ucol_openElements(strsrch->collator, pattern->text,
278                                      patternlength, status);
279         // status will be checked in ucol_next(..) later and if it is an
280         // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be
281         // returned.
282         strsrch->utilIter = coleiter;
283     }
284     else {
285         ucol_setText(coleiter, pattern->text, pattern->textLength, status);
286     }
287     if(U_FAILURE(*status)) {
288         return;
289     }
290 
291     if (pattern->ces != cetable && pattern->ces) {
292         uprv_free(pattern->ces);
293     }
294 
295     uint32_t  offset      = 0;
296     int32_t   ce;
297 
298     while ((ce = ucol_next(coleiter, status)) != UCOL_NULLORDER &&
299            U_SUCCESS(*status)) {
300         uint32_t newce = getCE(strsrch, ce);
301         if (newce) {
302             int32_t *temp = addTouint32_tArray(cetable, offset, &cetablesize,
303                                   newce,
304                                   patternlength - ucol_getOffset(coleiter) + 1,
305                                   status);
306             if (U_FAILURE(*status)) {
307                 return;
308             }
309             offset ++;
310             if (cetable != temp && cetable != pattern->cesBuffer) {
311                 uprv_free(cetable);
312             }
313             cetable = temp;
314         }
315     }
316 
317     cetable[offset]   = 0;
318     pattern->ces       = cetable;
319     pattern->cesLength = offset;
320 }
321 
322 /**
323 * Initializing the pce table for a pattern.
324 * Stores non-ignorable collation keys.
325 * Table size will be estimated by the size of the pattern text. Table
326 * expansion will be perform as we go along. Adding 1 to ensure that the table
327 * size definitely increases.
328 * Internal method, status assumed to be a success.
329 * @param strsrch string search data
330 * @param status output error if any, caller to check status before calling
331 *               method, status assumed to be success when passed in.
332 */
333 static
initializePatternPCETable(UStringSearch * strsrch,UErrorCode * status)334 inline void initializePatternPCETable(UStringSearch *strsrch,
335                                       UErrorCode    *status)
336 {
337     UPattern *pattern            = &(strsrch->pattern);
338     uint32_t  pcetablesize       = INITIAL_ARRAY_SIZE_;
339     int64_t  *pcetable           = pattern->pcesBuffer;
340     uint32_t  patternlength      = pattern->textLength;
341     UCollationElements *coleiter = strsrch->utilIter;
342 
343     if (coleiter == nullptr) {
344         coleiter = ucol_openElements(strsrch->collator, pattern->text,
345                                      patternlength, status);
346         // status will be checked in nextProcessed(..) later and if it is an error
347         // then UCOL_PROCESSED_NULLORDER is returned by nextProcessed(..), so 0 will be
348         // returned.
349         strsrch->utilIter = coleiter;
350     } else {
351         ucol_setText(coleiter, pattern->text, pattern->textLength, status);
352     }
353     if(U_FAILURE(*status)) {
354         return;
355     }
356 
357     if (pattern->pces != pcetable && pattern->pces != nullptr) {
358         uprv_free(pattern->pces);
359     }
360 
361     uint32_t  offset = 0;
362     int64_t   pce;
363 
364     icu::UCollationPCE iter(coleiter);
365 
366     // ** Should processed CEs be signed or unsigned?
367     // ** (the rest of the code in this file seems to play fast-and-loose with
368     // **  whether a CE is signed or unsigned. For example, look at routine above this one.)
369     while ((pce = iter.nextProcessed(nullptr, nullptr, status)) != UCOL_PROCESSED_NULLORDER &&
370            U_SUCCESS(*status)) {
371         int64_t *temp = addTouint64_tArray(pcetable, offset, &pcetablesize,
372                               pce,
373                               patternlength - ucol_getOffset(coleiter) + 1,
374                               status);
375 
376         if (U_FAILURE(*status)) {
377             return;
378         }
379 
380         offset += 1;
381 
382         if (pcetable != temp && pcetable != pattern->pcesBuffer) {
383             uprv_free(pcetable);
384         }
385 
386         pcetable = temp;
387     }
388 
389     pcetable[offset]   = 0;
390     pattern->pces       = pcetable;
391     pattern->pcesLength = offset;
392 }
393 
394 /**
395 * Initializes the pattern struct.
396 * @param strsrch UStringSearch data storage
397 * @param status output error if any, caller to check status before calling
398 *               method, status assumed to be success when passed in.
399 */
400 static
initializePattern(UStringSearch * strsrch,UErrorCode * status)401 inline void initializePattern(UStringSearch *strsrch, UErrorCode *status)
402 {
403     if (U_FAILURE(*status)) { return; }
404 
405           UPattern   *pattern     = &(strsrch->pattern);
406     const UChar      *patterntext = pattern->text;
407           int32_t     length      = pattern->textLength;
408           int32_t index       = 0;
409 
410     // Since the strength is primary, accents are ignored in the pattern.
411     if (strsrch->strength == UCOL_PRIMARY) {
412         pattern->hasPrefixAccents = 0;
413         pattern->hasSuffixAccents = 0;
414     } else {
415         pattern->hasPrefixAccents = getFCD(patterntext, &index, length) >>
416                                                          SECOND_LAST_BYTE_SHIFT_;
417         index = length;
418         U16_BACK_1(patterntext, 0, index);
419         pattern->hasSuffixAccents = getFCD(patterntext, &index, length) &
420                                                                  LAST_BYTE_MASK_;
421     }
422 
423     // ** HACK **
424     if (strsrch->pattern.pces != nullptr) {
425         if (strsrch->pattern.pces != strsrch->pattern.pcesBuffer) {
426             uprv_free(strsrch->pattern.pces);
427         }
428 
429         strsrch->pattern.pces = nullptr;
430     }
431 
432     initializePatternCETable(strsrch, status);
433 }
434 
435 /**
436 * Initializes the pattern struct and builds the pattern collation element table.
437 * @param strsrch UStringSearch data storage
438 * @param status  for output errors if it occurs, status is assumed to be a
439 *                success when it is passed in.
440 */
441 static
initialize(UStringSearch * strsrch,UErrorCode * status)442 inline void initialize(UStringSearch *strsrch, UErrorCode *status)
443 {
444     initializePattern(strsrch, status);
445 }
446 
447 #if !UCONFIG_NO_BREAK_ITERATION
448 // If the caller provided a character breakiterator we'll return that,
449 // otherwise we lazily create the internal break iterator.
getBreakIterator(UStringSearch * strsrch,UErrorCode & status)450 static UBreakIterator* getBreakIterator(UStringSearch *strsrch, UErrorCode &status)
451 {
452     if (U_FAILURE(status)) {
453         return nullptr;
454     }
455 
456     if (strsrch->search->breakIter != nullptr) {
457         return strsrch->search->breakIter;
458     }
459 
460     if (strsrch->search->internalBreakIter != nullptr) {
461         return strsrch->search->internalBreakIter;
462     }
463 
464     // Need to create the internal break iterator.
465     strsrch->search->internalBreakIter = ubrk_open(UBRK_CHARACTER,
466         ucol_getLocaleByType(strsrch->collator, ULOC_VALID_LOCALE, &status),
467         strsrch->search->text, strsrch->search->textLength, &status);
468 
469     return strsrch->search->internalBreakIter;
470 }
471 #endif
472 
473 /**
474 * Sets the match result to "not found", regardless of the incoming error status.
475 * If an error occurs while setting the result, it is reported back.
476 *
477 * @param strsrch string search data
478 * @param status  for output errors, if they occur.
479 */
480 static
setMatchNotFound(UStringSearch * strsrch,UErrorCode & status)481 inline void setMatchNotFound(UStringSearch *strsrch, UErrorCode &status)
482 {
483     UErrorCode localStatus = U_ZERO_ERROR;
484 
485     strsrch->search->matchedIndex = USEARCH_DONE;
486     strsrch->search->matchedLength = 0;
487     if (strsrch->search->isForwardSearching) {
488         setColEIterOffset(strsrch->textIter, strsrch->search->textLength, localStatus);
489     }
490     else {
491         setColEIterOffset(strsrch->textIter, 0, localStatus);
492     }
493 
494     // If an error occurred while setting the result to not found (ex: OOM),
495     // then we want to report that error back to the caller.
496     if (U_SUCCESS(status) && U_FAILURE(localStatus)) {
497         status = localStatus;
498     }
499 }
500 
501 /**
502 * Checks if the offset runs out of the text string
503 * @param offset
504 * @param textlength of the text string
505 * @return TRUE if offset is out of bounds, FALSE otherwise
506 */
507 static
isOutOfBounds(int32_t textlength,int32_t offset)508 inline UBool isOutOfBounds(int32_t textlength, int32_t offset)
509 {
510     return offset < 0 || offset > textlength;
511 }
512 
513 /**
514 * Checks for identical match
515 * @param strsrch string search data
516 * @param start offset of possible match
517 * @param end offset of possible match
518 * @return TRUE if identical match is found
519 */
520 static
checkIdentical(const UStringSearch * strsrch,int32_t start,int32_t end)521 inline UBool checkIdentical(const UStringSearch *strsrch, int32_t start, int32_t end)
522 {
523     if (strsrch->strength != UCOL_IDENTICAL) {
524         return TRUE;
525     }
526 
527     // Note: We could use Normalizer::compare() or similar, but for short strings
528     // which may not be in FCD it might be faster to just NFD them.
529     UErrorCode status = U_ZERO_ERROR;
530     UnicodeString t2, p2;
531     strsrch->nfd->normalize(
532         UnicodeString(FALSE, strsrch->search->text + start, end - start), t2, status);
533     strsrch->nfd->normalize(
534         UnicodeString(FALSE, strsrch->pattern.text, strsrch->pattern.textLength), p2, status);
535     // return FALSE if NFD failed
536     return U_SUCCESS(status) && t2 == p2;
537 }
538 
539 // constructors and destructor -------------------------------------------
540 
usearch_open(const UChar * pattern,int32_t patternlength,const UChar * text,int32_t textlength,const char * locale,UBreakIterator * breakiter,UErrorCode * status)541 U_CAPI UStringSearch * U_EXPORT2 usearch_open(const UChar *pattern,
542                                           int32_t         patternlength,
543                                     const UChar          *text,
544                                           int32_t         textlength,
545                                     const char           *locale,
546                                           UBreakIterator *breakiter,
547                                           UErrorCode     *status)
548 {
549     if (U_FAILURE(*status)) {
550         return nullptr;
551     }
552 #if UCONFIG_NO_BREAK_ITERATION
553     if (breakiter != nullptr) {
554         *status = U_UNSUPPORTED_ERROR;
555         return nullptr;
556     }
557 #endif
558     if (locale) {
559         // ucol_open internally checks for status
560         UCollator     *collator = ucol_open(locale, status);
561         // pattern, text checks are done in usearch_openFromCollator
562         UStringSearch *result   = usearch_openFromCollator(pattern,
563                                               patternlength, text, textlength,
564                                               collator, breakiter, status);
565 
566         if (result == nullptr || U_FAILURE(*status)) {
567             if (collator) {
568                 ucol_close(collator);
569             }
570             return nullptr;
571         }
572         else {
573             result->ownCollator = TRUE;
574         }
575         return result;
576     }
577     *status = U_ILLEGAL_ARGUMENT_ERROR;
578     return nullptr;
579 }
580 
usearch_openFromCollator(const UChar * pattern,int32_t patternlength,const UChar * text,int32_t textlength,const UCollator * collator,UBreakIterator * breakiter,UErrorCode * status)581 U_CAPI UStringSearch * U_EXPORT2 usearch_openFromCollator(
582                                   const UChar          *pattern,
583                                         int32_t         patternlength,
584                                   const UChar          *text,
585                                         int32_t         textlength,
586                                   const UCollator      *collator,
587                                         UBreakIterator *breakiter,
588                                         UErrorCode     *status)
589 {
590     if (U_FAILURE(*status)) {
591         return nullptr;
592     }
593 #if UCONFIG_NO_BREAK_ITERATION
594     if (breakiter != nullptr) {
595         *status = U_UNSUPPORTED_ERROR;
596         return nullptr;
597     }
598 #endif
599     if (pattern == nullptr || text == nullptr || collator == nullptr) {
600         *status = U_ILLEGAL_ARGUMENT_ERROR;
601         return nullptr;
602     }
603 
604     // string search does not really work when numeric collation is turned on
605     if(ucol_getAttribute(collator, UCOL_NUMERIC_COLLATION, status) == UCOL_ON) {
606         *status = U_UNSUPPORTED_ERROR;
607         return nullptr;
608     }
609 
610     if (U_SUCCESS(*status)) {
611         initializeFCD(status);
612         if (U_FAILURE(*status)) {
613             return nullptr;
614         }
615 
616         UStringSearch *result;
617         if (textlength == -1) {
618             textlength = u_strlen(text);
619         }
620         if (patternlength == -1) {
621             patternlength = u_strlen(pattern);
622         }
623         if (textlength <= 0 || patternlength <= 0) {
624             *status = U_ILLEGAL_ARGUMENT_ERROR;
625             return nullptr;
626         }
627 
628         result = (UStringSearch *)uprv_malloc(sizeof(UStringSearch));
629         if (result == nullptr) {
630             *status = U_MEMORY_ALLOCATION_ERROR;
631             return nullptr;
632         }
633 
634         result->collator    = collator;
635         result->strength    = ucol_getStrength(collator);
636         result->ceMask      = getMask(result->strength);
637         result->toShift     =
638              ucol_getAttribute(collator, UCOL_ALTERNATE_HANDLING, status) ==
639                                                             UCOL_SHIFTED;
640         result->variableTop = ucol_getVariableTop(collator, status);
641 
642         result->nfd         = Normalizer2::getNFDInstance(*status);
643 
644         if (U_FAILURE(*status)) {
645             uprv_free(result);
646             return nullptr;
647         }
648 
649         result->search             = (USearch *)uprv_malloc(sizeof(USearch));
650         if (result->search == nullptr) {
651             *status = U_MEMORY_ALLOCATION_ERROR;
652             uprv_free(result);
653             return nullptr;
654         }
655 
656         result->search->text       = text;
657         result->search->textLength = textlength;
658 
659         result->pattern.text       = pattern;
660         result->pattern.textLength = patternlength;
661         result->pattern.ces         = nullptr;
662         result->pattern.pces        = nullptr;
663 
664         result->search->breakIter  = breakiter;
665 #if !UCONFIG_NO_BREAK_ITERATION
666         result->search->internalBreakIter = nullptr; // Lazily created.
667         if (breakiter) {
668             ubrk_setText(breakiter, text, textlength, status);
669         }
670 #endif
671 
672         result->ownCollator           = FALSE;
673         result->search->matchedLength = 0;
674         result->search->matchedIndex  = USEARCH_DONE;
675         result->utilIter              = nullptr;
676         result->textIter              = ucol_openElements(collator, text,
677                                                           textlength, status);
678         result->textProcessedIter     = nullptr;
679         if (U_FAILURE(*status)) {
680             usearch_close(result);
681             return nullptr;
682         }
683 
684         result->search->isOverlap          = FALSE;
685         result->search->isCanonicalMatch   = FALSE;
686         result->search->elementComparisonType = 0;
687         result->search->isForwardSearching = TRUE;
688         result->search->reset              = TRUE;
689 
690         initialize(result, status);
691 
692         if (U_FAILURE(*status)) {
693             usearch_close(result);
694             return nullptr;
695         }
696 
697         return result;
698     }
699     return nullptr;
700 }
701 
usearch_close(UStringSearch * strsrch)702 U_CAPI void U_EXPORT2 usearch_close(UStringSearch *strsrch)
703 {
704     if (strsrch) {
705         if (strsrch->pattern.ces != strsrch->pattern.cesBuffer &&
706             strsrch->pattern.ces) {
707             uprv_free(strsrch->pattern.ces);
708         }
709 
710         if (strsrch->pattern.pces != nullptr &&
711             strsrch->pattern.pces != strsrch->pattern.pcesBuffer) {
712             uprv_free(strsrch->pattern.pces);
713         }
714 
715         delete strsrch->textProcessedIter;
716         ucol_closeElements(strsrch->textIter);
717         ucol_closeElements(strsrch->utilIter);
718 
719         if (strsrch->ownCollator && strsrch->collator) {
720             ucol_close((UCollator *)strsrch->collator);
721         }
722 
723 #if !UCONFIG_NO_BREAK_ITERATION
724         if (strsrch->search->internalBreakIter != nullptr) {
725             ubrk_close(strsrch->search->internalBreakIter);
726         }
727 #endif
728 
729         uprv_free(strsrch->search);
730         uprv_free(strsrch);
731     }
732 }
733 
734 namespace {
735 
initTextProcessedIter(UStringSearch * strsrch,UErrorCode * status)736 UBool initTextProcessedIter(UStringSearch *strsrch, UErrorCode *status) {
737     if (U_FAILURE(*status)) { return FALSE; }
738     if (strsrch->textProcessedIter == nullptr) {
739         strsrch->textProcessedIter = new icu::UCollationPCE(strsrch->textIter);
740         if (strsrch->textProcessedIter == nullptr) {
741             *status = U_MEMORY_ALLOCATION_ERROR;
742             return FALSE;
743         }
744     } else {
745         strsrch->textProcessedIter->init(strsrch->textIter);
746     }
747     return TRUE;
748 }
749 
750 }
751 
752 // set and get methods --------------------------------------------------
753 
usearch_setOffset(UStringSearch * strsrch,int32_t position,UErrorCode * status)754 U_CAPI void U_EXPORT2 usearch_setOffset(UStringSearch *strsrch,
755                                         int32_t        position,
756                                         UErrorCode    *status)
757 {
758     if (U_SUCCESS(*status) && strsrch) {
759         if (isOutOfBounds(strsrch->search->textLength, position)) {
760             *status = U_INDEX_OUTOFBOUNDS_ERROR;
761         }
762         else {
763             setColEIterOffset(strsrch->textIter, position, *status);
764         }
765         strsrch->search->matchedIndex  = USEARCH_DONE;
766         strsrch->search->matchedLength = 0;
767         strsrch->search->reset         = FALSE;
768     }
769 }
770 
usearch_getOffset(const UStringSearch * strsrch)771 U_CAPI int32_t U_EXPORT2 usearch_getOffset(const UStringSearch *strsrch)
772 {
773     if (strsrch) {
774         int32_t result = ucol_getOffset(strsrch->textIter);
775         if (isOutOfBounds(strsrch->search->textLength, result)) {
776             return USEARCH_DONE;
777         }
778         return result;
779     }
780     return USEARCH_DONE;
781 }
782 
usearch_setAttribute(UStringSearch * strsrch,USearchAttribute attribute,USearchAttributeValue value,UErrorCode * status)783 U_CAPI void U_EXPORT2 usearch_setAttribute(UStringSearch        *strsrch,
784                                            USearchAttribute      attribute,
785                                            USearchAttributeValue value,
786                                            UErrorCode           *status)
787 {
788     if (U_SUCCESS(*status) && strsrch) {
789         switch (attribute)
790         {
791         case USEARCH_OVERLAP :
792             strsrch->search->isOverlap = (value == USEARCH_ON ? TRUE : FALSE);
793             break;
794         case USEARCH_CANONICAL_MATCH :
795             strsrch->search->isCanonicalMatch = (value == USEARCH_ON ? TRUE :
796                                                                       FALSE);
797             break;
798         case USEARCH_ELEMENT_COMPARISON :
799             if (value == USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD || value == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD) {
800                 strsrch->search->elementComparisonType = (int16_t)value;
801             } else {
802                 strsrch->search->elementComparisonType = 0;
803             }
804             break;
805         case USEARCH_ATTRIBUTE_COUNT :
806         default:
807             *status = U_ILLEGAL_ARGUMENT_ERROR;
808         }
809     }
810     if (value == USEARCH_ATTRIBUTE_VALUE_COUNT) {
811         *status = U_ILLEGAL_ARGUMENT_ERROR;
812     }
813 }
814 
usearch_getAttribute(const UStringSearch * strsrch,USearchAttribute attribute)815 U_CAPI USearchAttributeValue U_EXPORT2 usearch_getAttribute(
816                                                 const UStringSearch *strsrch,
817                                                 USearchAttribute attribute)
818 {
819     if (strsrch) {
820         switch (attribute) {
821         case USEARCH_OVERLAP :
822             return (strsrch->search->isOverlap == TRUE ? USEARCH_ON :
823                                                         USEARCH_OFF);
824         case USEARCH_CANONICAL_MATCH :
825             return (strsrch->search->isCanonicalMatch == TRUE ? USEARCH_ON :
826                                                                USEARCH_OFF);
827         case USEARCH_ELEMENT_COMPARISON :
828             {
829                 int16_t value = strsrch->search->elementComparisonType;
830                 if (value == USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD || value == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD) {
831                     return (USearchAttributeValue)value;
832                 } else {
833                     return USEARCH_STANDARD_ELEMENT_COMPARISON;
834                 }
835             }
836         case USEARCH_ATTRIBUTE_COUNT :
837             return USEARCH_DEFAULT;
838         }
839     }
840     return USEARCH_DEFAULT;
841 }
842 
usearch_getMatchedStart(const UStringSearch * strsrch)843 U_CAPI int32_t U_EXPORT2 usearch_getMatchedStart(
844                                                 const UStringSearch *strsrch)
845 {
846     if (strsrch == nullptr) {
847         return USEARCH_DONE;
848     }
849     return strsrch->search->matchedIndex;
850 }
851 
852 
usearch_getMatchedText(const UStringSearch * strsrch,UChar * result,int32_t resultCapacity,UErrorCode * status)853 U_CAPI int32_t U_EXPORT2 usearch_getMatchedText(const UStringSearch *strsrch,
854                                             UChar         *result,
855                                             int32_t        resultCapacity,
856                                             UErrorCode    *status)
857 {
858     if (U_FAILURE(*status)) {
859         return USEARCH_DONE;
860     }
861     if (strsrch == nullptr || resultCapacity < 0 || (resultCapacity > 0 &&
862         result == nullptr)) {
863         *status = U_ILLEGAL_ARGUMENT_ERROR;
864         return USEARCH_DONE;
865     }
866 
867     int32_t     copylength = strsrch->search->matchedLength;
868     int32_t copyindex  = strsrch->search->matchedIndex;
869     if (copyindex == USEARCH_DONE) {
870         u_terminateUChars(result, resultCapacity, 0, status);
871         return USEARCH_DONE;
872     }
873 
874     if (resultCapacity < copylength) {
875         copylength = resultCapacity;
876     }
877     if (copylength > 0) {
878         uprv_memcpy(result, strsrch->search->text + copyindex,
879                     copylength * sizeof(UChar));
880     }
881     return u_terminateUChars(result, resultCapacity,
882                              strsrch->search->matchedLength, status);
883 }
884 
usearch_getMatchedLength(const UStringSearch * strsrch)885 U_CAPI int32_t U_EXPORT2 usearch_getMatchedLength(
886                                               const UStringSearch *strsrch)
887 {
888     if (strsrch) {
889         return strsrch->search->matchedLength;
890     }
891     return USEARCH_DONE;
892 }
893 
894 #if !UCONFIG_NO_BREAK_ITERATION
895 
usearch_setBreakIterator(UStringSearch * strsrch,UBreakIterator * breakiter,UErrorCode * status)896 U_CAPI void U_EXPORT2 usearch_setBreakIterator(UStringSearch  *strsrch,
897                                                UBreakIterator *breakiter,
898                                                UErrorCode     *status)
899 {
900     if (U_SUCCESS(*status) && strsrch) {
901         strsrch->search->breakIter = breakiter;
902         if (breakiter) {
903             ubrk_setText(breakiter, strsrch->search->text,
904                          strsrch->search->textLength, status);
905         }
906     }
907 }
908 
909 U_CAPI const UBreakIterator* U_EXPORT2
usearch_getBreakIterator(const UStringSearch * strsrch)910 usearch_getBreakIterator(const UStringSearch *strsrch)
911 {
912     if (strsrch) {
913         return strsrch->search->breakIter;
914     }
915     return nullptr;
916 }
917 
918 #endif
919 
usearch_setText(UStringSearch * strsrch,const UChar * text,int32_t textlength,UErrorCode * status)920 U_CAPI void U_EXPORT2 usearch_setText(      UStringSearch *strsrch,
921                                       const UChar         *text,
922                                             int32_t        textlength,
923                                             UErrorCode    *status)
924 {
925     if (U_SUCCESS(*status)) {
926         if (strsrch == nullptr || text == nullptr || textlength < -1 ||
927             textlength == 0) {
928             *status = U_ILLEGAL_ARGUMENT_ERROR;
929         }
930         else {
931             if (textlength == -1) {
932                 textlength = u_strlen(text);
933             }
934             strsrch->search->text       = text;
935             strsrch->search->textLength = textlength;
936             ucol_setText(strsrch->textIter, text, textlength, status);
937             strsrch->search->matchedIndex  = USEARCH_DONE;
938             strsrch->search->matchedLength = 0;
939             strsrch->search->reset         = TRUE;
940 #if !UCONFIG_NO_BREAK_ITERATION
941             if (strsrch->search->breakIter != nullptr) {
942                 ubrk_setText(strsrch->search->breakIter, text,
943                              textlength, status);
944             }
945             if (strsrch->search->internalBreakIter != nullptr) {
946                 ubrk_setText(strsrch->search->internalBreakIter, text, textlength, status);
947             }
948 #endif
949         }
950     }
951 }
952 
usearch_getText(const UStringSearch * strsrch,int32_t * length)953 U_CAPI const UChar * U_EXPORT2 usearch_getText(const UStringSearch *strsrch,
954                                                      int32_t       *length)
955 {
956     if (strsrch) {
957         *length = strsrch->search->textLength;
958         return strsrch->search->text;
959     }
960     return nullptr;
961 }
962 
usearch_setCollator(UStringSearch * strsrch,const UCollator * collator,UErrorCode * status)963 U_CAPI void U_EXPORT2 usearch_setCollator(      UStringSearch *strsrch,
964                                           const UCollator     *collator,
965                                                 UErrorCode    *status)
966 {
967     if (U_SUCCESS(*status)) {
968         if (collator == nullptr) {
969             *status = U_ILLEGAL_ARGUMENT_ERROR;
970             return;
971         }
972 
973         if (strsrch) {
974             delete strsrch->textProcessedIter;
975             strsrch->textProcessedIter = nullptr;
976             ucol_closeElements(strsrch->textIter);
977             ucol_closeElements(strsrch->utilIter);
978             strsrch->textIter = strsrch->utilIter = nullptr;
979             if (strsrch->ownCollator && (strsrch->collator != collator)) {
980                 ucol_close((UCollator *)strsrch->collator);
981                 strsrch->ownCollator = FALSE;
982             }
983             strsrch->collator    = collator;
984             strsrch->strength    = ucol_getStrength(collator);
985             strsrch->ceMask      = getMask(strsrch->strength);
986 #if !UCONFIG_NO_BREAK_ITERATION
987             if (strsrch->search->internalBreakIter != nullptr) {
988                 ubrk_close(strsrch->search->internalBreakIter);
989                 strsrch->search->internalBreakIter = nullptr;   // Lazily created.
990             }
991 #endif
992             // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
993             strsrch->toShift     =
994                ucol_getAttribute(collator, UCOL_ALTERNATE_HANDLING, status) ==
995                                                                 UCOL_SHIFTED;
996             // if status is a failure, ucol_getVariableTop returns 0
997             strsrch->variableTop = ucol_getVariableTop(collator, status);
998             strsrch->textIter = ucol_openElements(collator,
999                                       strsrch->search->text,
1000                                       strsrch->search->textLength,
1001                                       status);
1002             strsrch->utilIter = ucol_openElements(
1003                     collator, strsrch->pattern.text, strsrch->pattern.textLength, status);
1004             // initialize() _after_ setting the iterators for the new collator.
1005             initialize(strsrch, status);
1006         }
1007 
1008         // **** are these calls needed?
1009         // **** we call uprv_init_pce in initializePatternPCETable
1010         // **** and the CEIBuffer constructor...
1011 #if 0
1012         uprv_init_pce(strsrch->textIter);
1013         uprv_init_pce(strsrch->utilIter);
1014 #endif
1015     }
1016 }
1017 
usearch_getCollator(const UStringSearch * strsrch)1018 U_CAPI UCollator * U_EXPORT2 usearch_getCollator(const UStringSearch *strsrch)
1019 {
1020     if (strsrch) {
1021         return (UCollator *)strsrch->collator;
1022     }
1023     return nullptr;
1024 }
1025 
usearch_setPattern(UStringSearch * strsrch,const UChar * pattern,int32_t patternlength,UErrorCode * status)1026 U_CAPI void U_EXPORT2 usearch_setPattern(      UStringSearch *strsrch,
1027                                          const UChar         *pattern,
1028                                                int32_t        patternlength,
1029                                                UErrorCode    *status)
1030 {
1031     if (U_SUCCESS(*status)) {
1032         if (strsrch == nullptr || pattern == nullptr) {
1033             *status = U_ILLEGAL_ARGUMENT_ERROR;
1034         }
1035         else {
1036             if (patternlength == -1) {
1037                 patternlength = u_strlen(pattern);
1038             }
1039             if (patternlength == 0) {
1040                 *status = U_ILLEGAL_ARGUMENT_ERROR;
1041                 return;
1042             }
1043             strsrch->pattern.text       = pattern;
1044             strsrch->pattern.textLength = patternlength;
1045             initialize(strsrch, status);
1046         }
1047     }
1048 }
1049 
1050 U_CAPI const UChar* U_EXPORT2
usearch_getPattern(const UStringSearch * strsrch,int32_t * length)1051 usearch_getPattern(const UStringSearch *strsrch,
1052                    int32_t             *length)
1053 {
1054     if (strsrch) {
1055         *length = strsrch->pattern.textLength;
1056         return strsrch->pattern.text;
1057     }
1058     return nullptr;
1059 }
1060 
1061 // miscellaneous methods --------------------------------------------------
1062 
usearch_first(UStringSearch * strsrch,UErrorCode * status)1063 U_CAPI int32_t U_EXPORT2 usearch_first(UStringSearch *strsrch,
1064                                        UErrorCode    *status)
1065 {
1066     if (strsrch && U_SUCCESS(*status)) {
1067         strsrch->search->isForwardSearching = TRUE;
1068         usearch_setOffset(strsrch, 0, status);
1069         if (U_SUCCESS(*status)) {
1070             return usearch_next(strsrch, status);
1071         }
1072     }
1073     return USEARCH_DONE;
1074 }
1075 
usearch_following(UStringSearch * strsrch,int32_t position,UErrorCode * status)1076 U_CAPI int32_t U_EXPORT2 usearch_following(UStringSearch *strsrch,
1077                                            int32_t        position,
1078                                            UErrorCode    *status)
1079 {
1080     if (strsrch && U_SUCCESS(*status)) {
1081         strsrch->search->isForwardSearching = TRUE;
1082         // position checked in usearch_setOffset
1083         usearch_setOffset(strsrch, position, status);
1084         if (U_SUCCESS(*status)) {
1085             return usearch_next(strsrch, status);
1086         }
1087     }
1088     return USEARCH_DONE;
1089 }
1090 
usearch_last(UStringSearch * strsrch,UErrorCode * status)1091 U_CAPI int32_t U_EXPORT2 usearch_last(UStringSearch *strsrch,
1092                                       UErrorCode    *status)
1093 {
1094     if (strsrch && U_SUCCESS(*status)) {
1095         strsrch->search->isForwardSearching = FALSE;
1096         usearch_setOffset(strsrch, strsrch->search->textLength, status);
1097         if (U_SUCCESS(*status)) {
1098             return usearch_previous(strsrch, status);
1099         }
1100     }
1101     return USEARCH_DONE;
1102 }
1103 
usearch_preceding(UStringSearch * strsrch,int32_t position,UErrorCode * status)1104 U_CAPI int32_t U_EXPORT2 usearch_preceding(UStringSearch *strsrch,
1105                                            int32_t        position,
1106                                            UErrorCode    *status)
1107 {
1108     if (strsrch && U_SUCCESS(*status)) {
1109         strsrch->search->isForwardSearching = FALSE;
1110         // position checked in usearch_setOffset
1111         usearch_setOffset(strsrch, position, status);
1112         if (U_SUCCESS(*status)) {
1113             return usearch_previous(strsrch, status);
1114         }
1115     }
1116     return USEARCH_DONE;
1117 }
1118 
1119 /**
1120 * If a direction switch is required, we'll count the number of ces till the
1121 * beginning of the collation element iterator and iterate forwards that
1122 * number of times. This is so that we get to the correct point within the
1123 * string to continue the search in. Imagine when we are in the middle of the
1124 * normalization buffer when the change in direction is request. arrrgghh....
1125 * After searching the offset within the collation element iterator will be
1126 * shifted to the start of the match. If a match is not found, the offset would
1127 * have been set to the end of the text string in the collation element
1128 * iterator.
1129 * Okay, here's my take on normalization buffer. The only time when there can
1130 * be 2 matches within the same normalization is when the pattern is consists
1131 * of all accents. But since the offset returned is from the text string, we
1132 * should not confuse the caller by returning the second match within the
1133 * same normalization buffer. If we do, the 2 results will have the same match
1134 * offsets, and that'll be confusing. I'll return the next match that doesn't
1135 * fall within the same normalization buffer. Note this does not affect the
1136 * results of matches spanning the text and the normalization buffer.
1137 * The position to start searching is taken from the collation element
1138 * iterator. Callers of this API would have to set the offset in the collation
1139 * element iterator before using this method.
1140 */
usearch_next(UStringSearch * strsrch,UErrorCode * status)1141 U_CAPI int32_t U_EXPORT2 usearch_next(UStringSearch *strsrch,
1142                                       UErrorCode    *status)
1143 {
1144     if (U_SUCCESS(*status) && strsrch) {
1145         // note offset is either equivalent to the start of the previous match
1146         // or is set by the user
1147         int32_t      offset       = usearch_getOffset(strsrch);
1148         USearch     *search       = strsrch->search;
1149         search->reset             = FALSE;
1150         int32_t      textlength   = search->textLength;
1151         if (search->isForwardSearching) {
1152             if (offset == textlength ||
1153                 (! search->isOverlap &&
1154                 (search->matchedIndex != USEARCH_DONE &&
1155                 offset + search->matchedLength > textlength))) {
1156                     // not enough characters to match
1157                     setMatchNotFound(strsrch, *status);
1158                     return USEARCH_DONE;
1159             }
1160         }
1161         else {
1162             // switching direction.
1163             // if matchedIndex == USEARCH_DONE, it means that either a
1164             // setOffset has been called or that previous ran off the text
1165             // string. the iterator would have been set to offset 0 if a
1166             // match is not found.
1167             search->isForwardSearching = TRUE;
1168             if (search->matchedIndex != USEARCH_DONE) {
1169                 // there's no need to set the collation element iterator
1170                 // the next call to next will set the offset.
1171                 return search->matchedIndex;
1172             }
1173         }
1174 
1175         if (U_SUCCESS(*status)) {
1176             if (strsrch->pattern.cesLength == 0) {
1177                 if (search->matchedIndex == USEARCH_DONE) {
1178                     search->matchedIndex = offset;
1179                 }
1180                 else { // moves by codepoints
1181                     U16_FWD_1(search->text, search->matchedIndex, textlength);
1182                 }
1183 
1184                 search->matchedLength = 0;
1185                 setColEIterOffset(strsrch->textIter, search->matchedIndex, *status);
1186                 // status checked below
1187                 if (search->matchedIndex == textlength) {
1188                     search->matchedIndex = USEARCH_DONE;
1189                 }
1190             }
1191             else {
1192                 if (search->matchedLength > 0) {
1193                     // if matchlength is 0 we are at the start of the iteration
1194                     if (search->isOverlap) {
1195                         ucol_setOffset(strsrch->textIter, offset + 1, status);
1196                     }
1197                     else {
1198                         ucol_setOffset(strsrch->textIter,
1199                                        offset + search->matchedLength, status);
1200                     }
1201                 }
1202                 else {
1203                     // for boundary check purposes. this will ensure that the
1204                     // next match will not precede the current offset
1205                     // note search->matchedIndex will always be set to something
1206                     // in the code
1207                     search->matchedIndex = offset - 1;
1208                 }
1209 
1210                 if (search->isCanonicalMatch) {
1211                     // can't use exact here since extra accents are allowed.
1212                     usearch_handleNextCanonical(strsrch, status);
1213                 }
1214                 else {
1215                     usearch_handleNextExact(strsrch, status);
1216                 }
1217             }
1218 
1219             if (U_FAILURE(*status)) {
1220                 return USEARCH_DONE;
1221             }
1222 
1223             if (search->matchedIndex == USEARCH_DONE) {
1224                 ucol_setOffset(strsrch->textIter, search->textLength, status);
1225             } else {
1226                 ucol_setOffset(strsrch->textIter, search->matchedIndex, status);
1227             }
1228 
1229             return search->matchedIndex;
1230         }
1231     }
1232     return USEARCH_DONE;
1233 }
1234 
usearch_previous(UStringSearch * strsrch,UErrorCode * status)1235 U_CAPI int32_t U_EXPORT2 usearch_previous(UStringSearch *strsrch,
1236                                           UErrorCode    *status)
1237 {
1238     if (U_SUCCESS(*status) && strsrch) {
1239         int32_t offset;
1240         USearch *search = strsrch->search;
1241         if (search->reset) {
1242             offset                     = search->textLength;
1243             search->isForwardSearching = FALSE;
1244             search->reset              = FALSE;
1245             setColEIterOffset(strsrch->textIter, offset, *status);
1246         }
1247         else {
1248             offset = usearch_getOffset(strsrch);
1249         }
1250 
1251         int32_t matchedindex = search->matchedIndex;
1252         if (search->isForwardSearching == TRUE) {
1253             // switching direction.
1254             // if matchedIndex == USEARCH_DONE, it means that either a
1255             // setOffset has been called or that next ran off the text
1256             // string. the iterator would have been set to offset textLength if
1257             // a match is not found.
1258             search->isForwardSearching = FALSE;
1259             if (matchedindex != USEARCH_DONE) {
1260                 return matchedindex;
1261             }
1262         }
1263         else {
1264 
1265             // Could check pattern length, but the
1266             // linear search will do the right thing
1267             if (offset == 0 || matchedindex == 0) {
1268                 setMatchNotFound(strsrch, *status);
1269                 return USEARCH_DONE;
1270             }
1271         }
1272 
1273         if (U_SUCCESS(*status)) {
1274             if (strsrch->pattern.cesLength == 0) {
1275                 search->matchedIndex =
1276                       (matchedindex == USEARCH_DONE ? offset : matchedindex);
1277                 if (search->matchedIndex == 0) {
1278                     setMatchNotFound(strsrch, *status);
1279                     // status checked below
1280                 }
1281                 else { // move by codepoints
1282                     U16_BACK_1(search->text, 0, search->matchedIndex);
1283                     setColEIterOffset(strsrch->textIter, search->matchedIndex, *status);
1284                     // status checked below
1285                     search->matchedLength = 0;
1286                 }
1287             }
1288             else {
1289                 if (strsrch->search->isCanonicalMatch) {
1290                     // can't use exact here since extra accents are allowed.
1291                     usearch_handlePreviousCanonical(strsrch, status);
1292                     // status checked below
1293                 }
1294                 else {
1295                     usearch_handlePreviousExact(strsrch, status);
1296                     // status checked below
1297                 }
1298             }
1299 
1300             if (U_FAILURE(*status)) {
1301                 return USEARCH_DONE;
1302             }
1303 
1304             return search->matchedIndex;
1305         }
1306     }
1307     return USEARCH_DONE;
1308 }
1309 
1310 
1311 
usearch_reset(UStringSearch * strsrch)1312 U_CAPI void U_EXPORT2 usearch_reset(UStringSearch *strsrch)
1313 {
1314     /*
1315     reset is setting the attributes that are already in
1316     string search, hence all attributes in the collator should
1317     be retrieved without any problems
1318     */
1319     if (strsrch) {
1320         UErrorCode status            = U_ZERO_ERROR;
1321         UBool      sameCollAttribute = TRUE;
1322         uint32_t   ceMask;
1323         UBool      shift;
1324         uint32_t   varTop;
1325 
1326         // **** hack to deal w/ how processed CEs encode quaternary ****
1327         UCollationStrength newStrength = ucol_getStrength(strsrch->collator);
1328         if ((strsrch->strength < UCOL_QUATERNARY && newStrength >= UCOL_QUATERNARY) ||
1329             (strsrch->strength >= UCOL_QUATERNARY && newStrength < UCOL_QUATERNARY)) {
1330                 sameCollAttribute = FALSE;
1331         }
1332 
1333         strsrch->strength    = ucol_getStrength(strsrch->collator);
1334         ceMask = getMask(strsrch->strength);
1335         if (strsrch->ceMask != ceMask) {
1336             strsrch->ceMask = ceMask;
1337             sameCollAttribute = FALSE;
1338         }
1339 
1340         // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
1341         shift = ucol_getAttribute(strsrch->collator, UCOL_ALTERNATE_HANDLING,
1342                                   &status) == UCOL_SHIFTED;
1343         if (strsrch->toShift != shift) {
1344             strsrch->toShift  = shift;
1345             sameCollAttribute = FALSE;
1346         }
1347 
1348         // if status is a failure, ucol_getVariableTop returns 0
1349         varTop = ucol_getVariableTop(strsrch->collator, &status);
1350         if (strsrch->variableTop != varTop) {
1351             strsrch->variableTop = varTop;
1352             sameCollAttribute    = FALSE;
1353         }
1354         if (!sameCollAttribute) {
1355             initialize(strsrch, &status);
1356         }
1357         ucol_setText(strsrch->textIter, strsrch->search->text,
1358                               strsrch->search->textLength,
1359                               &status);
1360         strsrch->search->matchedLength      = 0;
1361         strsrch->search->matchedIndex       = USEARCH_DONE;
1362         strsrch->search->isOverlap          = FALSE;
1363         strsrch->search->isCanonicalMatch   = FALSE;
1364         strsrch->search->elementComparisonType = 0;
1365         strsrch->search->isForwardSearching = TRUE;
1366         strsrch->search->reset              = TRUE;
1367     }
1368 }
1369 
1370 //
1371 //  CEI  Collation Element + source text index.
1372 //       These structs are kept in the circular buffer.
1373 //
1374 struct  CEI {
1375     int64_t ce;
1376     int32_t lowIndex;
1377     int32_t highIndex;
1378 };
1379 
1380 U_NAMESPACE_BEGIN
1381 
1382 namespace {
1383 //
1384 //  CEIBuffer   A circular buffer of CEs-with-index from the text being searched.
1385 //
1386 #define   DEFAULT_CEBUFFER_SIZE 96
1387 #define   CEBUFFER_EXTRA 32
1388 // Some typical max values to make buffer size more reasonable for asymmetric search.
1389 // #8694 is for a better long-term solution to allocation of this buffer.
1390 #define   MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L 8
1391 #define   MAX_TARGET_IGNORABLES_PER_PAT_OTHER 3
1392 #define   MIGHT_BE_JAMO_L(c) ((c >= 0x1100 && c <= 0x115E) || (c >= 0x3131 && c <= 0x314E) || (c >= 0x3165 && c <= 0x3186))
1393 struct CEIBuffer {
1394     CEI                  defBuf[DEFAULT_CEBUFFER_SIZE];
1395     CEI                 *buf;
1396     int32_t              bufSize;
1397     int32_t              firstIx;
1398     int32_t              limitIx;
1399     UCollationElements  *ceIter;
1400     UStringSearch       *strSearch;
1401 
1402 
1403 
1404                CEIBuffer(UStringSearch *ss, UErrorCode *status);
1405                ~CEIBuffer();
1406    const CEI   *get(int32_t index);
1407    const CEI   *getPrevious(int32_t index);
1408 };
1409 
1410 
CEIBuffer(UStringSearch * ss,UErrorCode * status)1411 CEIBuffer::CEIBuffer(UStringSearch *ss, UErrorCode *status) {
1412     buf = defBuf;
1413     strSearch = ss;
1414     bufSize = ss->pattern.pcesLength + CEBUFFER_EXTRA;
1415     if (ss->search->elementComparisonType != 0) {
1416         const UChar * patText = ss->pattern.text;
1417         if (patText) {
1418             const UChar * patTextLimit = patText + ss->pattern.textLength;
1419             while ( patText < patTextLimit ) {
1420                 UChar c = *patText++;
1421                 if (MIGHT_BE_JAMO_L(c)) {
1422                     bufSize += MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L;
1423                 } else {
1424                     // No check for surrogates, we might allocate slightly more buffer than necessary.
1425                     bufSize += MAX_TARGET_IGNORABLES_PER_PAT_OTHER;
1426                 }
1427             }
1428         }
1429     }
1430     ceIter    = ss->textIter;
1431     firstIx = 0;
1432     limitIx = 0;
1433 
1434     if (!initTextProcessedIter(ss, status)) { return; }
1435 
1436     if (bufSize>DEFAULT_CEBUFFER_SIZE) {
1437         buf = (CEI *)uprv_malloc(bufSize * sizeof(CEI));
1438         if (buf == nullptr) {
1439             *status = U_MEMORY_ALLOCATION_ERROR;
1440         }
1441     }
1442 }
1443 
1444 // TODO: add a reset or init function so that allocated
1445 //       buffers can be retained & reused.
1446 
~CEIBuffer()1447 CEIBuffer::~CEIBuffer() {
1448     if (buf != defBuf) {
1449         uprv_free(buf);
1450     }
1451 }
1452 
1453 
1454 // Get the CE with the specified index.
1455 //   Index must be in the range
1456 //          n-history_size < index < n+1
1457 //   where n is the largest index to have been fetched by some previous call to this function.
1458 //   The CE value will be UCOL__PROCESSED_NULLORDER at end of input.
1459 //
get(int32_t index)1460 const CEI *CEIBuffer::get(int32_t index) {
1461     int i = index % bufSize;
1462 
1463     if (index>=firstIx && index<limitIx) {
1464         // The request was for an entry already in our buffer.
1465         //  Just return it.
1466         return &buf[i];
1467     }
1468 
1469     // Caller is requesting a new, never accessed before, CE.
1470     //   Verify that it is the next one in sequence, which is all
1471     //   that is allowed.
1472     if (index != limitIx) {
1473         UPRV_UNREACHABLE_ASSERT;
1474         // TODO: In ICU 64 the above was changed from U_ASSERT to UPRV_UNREACHABLE,
1475         // which unconditionally called abort(). However, there were cases in which it
1476         // was being hit, so it was changed back to U_ASSERT per ICU-20680. In ICU 70,
1477         // we now use the new UPRV_UNREACHABLE_ASSERT to better indicate the situation.
1478         // ICU-20792 tracks the follow-up work/further investigation on this.
1479         return nullptr;
1480     }
1481 
1482     // Manage the circular CE buffer indexing
1483     limitIx++;
1484 
1485     if (limitIx - firstIx >= bufSize) {
1486         // The buffer is full, knock out the lowest-indexed entry.
1487         firstIx++;
1488     }
1489 
1490     UErrorCode status = U_ZERO_ERROR;
1491 
1492     buf[i].ce = strSearch->textProcessedIter->nextProcessed(&buf[i].lowIndex, &buf[i].highIndex, &status);
1493 
1494     return &buf[i];
1495 }
1496 
1497 // Get the CE with the specified index.
1498 //   Index must be in the range
1499 //          n-history_size < index < n+1
1500 //   where n is the largest index to have been fetched by some previous call to this function.
1501 //   The CE value will be UCOL__PROCESSED_NULLORDER at end of input.
1502 //
getPrevious(int32_t index)1503 const CEI *CEIBuffer::getPrevious(int32_t index) {
1504     int i = index % bufSize;
1505 
1506     if (index>=firstIx && index<limitIx) {
1507         // The request was for an entry already in our buffer.
1508         //  Just return it.
1509         return &buf[i];
1510     }
1511 
1512     // Caller is requesting a new, never accessed before, CE.
1513     //   Verify that it is the next one in sequence, which is all
1514     //   that is allowed.
1515     if (index != limitIx) {
1516         UPRV_UNREACHABLE_ASSERT;
1517         // TODO: In ICU 64 the above was changed from U_ASSERT to UPRV_UNREACHABLE,
1518         // which unconditionally called abort(). However, there were cases in which it
1519         // was being hit, so it was changed back to U_ASSERT per ICU-20680. In ICU 70,
1520         // we now use the new UPRV_UNREACHABLE_ASSERT to better indicate the situation.
1521         // ICU-20792 tracks the follow-up work/further investigation on this.
1522         return nullptr;
1523     }
1524 
1525     // Manage the circular CE buffer indexing
1526     limitIx++;
1527 
1528     if (limitIx - firstIx >= bufSize) {
1529         // The buffer is full, knock out the lowest-indexed entry.
1530         firstIx++;
1531     }
1532 
1533     UErrorCode status = U_ZERO_ERROR;
1534 
1535     buf[i].ce = strSearch->textProcessedIter->previousProcessed(&buf[i].lowIndex, &buf[i].highIndex, &status);
1536 
1537     return &buf[i];
1538 }
1539 
1540 }
1541 
1542 U_NAMESPACE_END
1543 
1544 
1545 // #define USEARCH_DEBUG
1546 
1547 #ifdef USEARCH_DEBUG
1548 #include <stdio.h>
1549 #include <stdlib.h>
1550 #endif
1551 
1552 /*
1553  * Find the next break boundary after startIndex. If the UStringSearch object
1554  * has an external break iterator, use that. Otherwise use the internal character
1555  * break iterator.
1556  */
nextBoundaryAfter(UStringSearch * strsrch,int32_t startIndex,UErrorCode & status)1557 static int32_t nextBoundaryAfter(UStringSearch *strsrch, int32_t startIndex, UErrorCode &status) {
1558     if (U_FAILURE(status)) {
1559         return startIndex;
1560     }
1561 #if 0
1562     const UChar *text = strsrch->search->text;
1563     int32_t textLen   = strsrch->search->textLength;
1564 
1565     U_ASSERT(startIndex>=0);
1566     U_ASSERT(startIndex<=textLen);
1567 
1568     if (startIndex >= textLen) {
1569         return startIndex;
1570     }
1571 
1572     UChar32  c;
1573     int32_t  i = startIndex;
1574     U16_NEXT(text, i, textLen, c);
1575 
1576     // If we are on a control character, stop without looking for combining marks.
1577     //    Control characters do not combine.
1578     int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
1579     if (gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR) {
1580         return i;
1581     }
1582 
1583     // The initial character was not a control, and can thus accept trailing
1584     //   combining characters.  Advance over however many of them there are.
1585     int32_t  indexOfLastCharChecked;
1586     for (;;) {
1587         indexOfLastCharChecked = i;
1588         if (i>=textLen) {
1589             break;
1590         }
1591         U16_NEXT(text, i, textLen, c);
1592         gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
1593         if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) {
1594             break;
1595         }
1596     }
1597     return indexOfLastCharChecked;
1598 #elif !UCONFIG_NO_BREAK_ITERATION
1599     UBreakIterator *breakiterator = getBreakIterator(strsrch, status);
1600     if (U_FAILURE(status)) {
1601         return startIndex;
1602     }
1603 
1604     return ubrk_following(breakiterator, startIndex);
1605 #else
1606     // **** or should we use the original code? ****
1607     return startIndex;
1608 #endif
1609 
1610 }
1611 
1612 /*
1613  * Returns TRUE if index is on a break boundary. If the UStringSearch
1614  * has an external break iterator, test using that, otherwise test
1615  * using the internal character break iterator.
1616  */
isBreakBoundary(UStringSearch * strsrch,int32_t index,UErrorCode & status)1617 static UBool isBreakBoundary(UStringSearch *strsrch, int32_t index, UErrorCode &status) {
1618     if (U_FAILURE(status)) {
1619         return TRUE;
1620     }
1621 #if 0
1622     const UChar *text = strsrch->search->text;
1623     int32_t textLen   = strsrch->search->textLength;
1624 
1625     U_ASSERT(index>=0);
1626     U_ASSERT(index<=textLen);
1627 
1628     if (index>=textLen || index<=0) {
1629         return TRUE;
1630     }
1631 
1632     // If the character at the current index is not a GRAPHEME_EXTEND
1633     //    then we can not be within a combining sequence.
1634     UChar32  c;
1635     U16_GET(text, 0, index, textLen, c);
1636     int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
1637     if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) {
1638         return TRUE;
1639     }
1640 
1641     // We are at a combining mark.  If the preceding character is anything
1642     //   except a CONTROL, CR or LF, we are in a combining sequence.
1643     U16_PREV(text, 0, index, c);
1644     gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
1645     UBool combining =  !(gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR);
1646     return !combining;
1647 #elif !UCONFIG_NO_BREAK_ITERATION
1648     UBreakIterator *breakiterator = getBreakIterator(strsrch, status);
1649     if (U_FAILURE(status)) {
1650         return TRUE;
1651     }
1652 
1653     return ubrk_isBoundary(breakiterator, index);
1654 #else
1655     // **** or use the original code? ****
1656     return TRUE;
1657 #endif
1658 }
1659 
1660 #if 0
1661 static UBool onBreakBoundaries(const UStringSearch *strsrch, int32_t start, int32_t end, UErrorCode &status)
1662 {
1663     if (U_FAILURE(status)) {
1664         return TRUE;
1665     }
1666 
1667 #if !UCONFIG_NO_BREAK_ITERATION
1668     UBreakIterator *breakiterator = getBreakIterator(strsrch, status);
1669     if (U_SUCCESS(status)) {
1670         int32_t startindex = ubrk_first(breakiterator);
1671         int32_t endindex   = ubrk_last(breakiterator);
1672 
1673         // out-of-range indexes are never boundary positions
1674         if (start < startindex || start > endindex ||
1675             end < startindex || end > endindex) {
1676             return FALSE;
1677         }
1678 
1679         return ubrk_isBoundary(breakiterator, start) &&
1680                ubrk_isBoundary(breakiterator, end);
1681     }
1682 #endif
1683 
1684     return TRUE;
1685 }
1686 #endif
1687 
1688 typedef enum {
1689     U_CE_MATCH = -1,
1690     U_CE_NO_MATCH = 0,
1691     U_CE_SKIP_TARG,
1692     U_CE_SKIP_PATN
1693 } UCompareCEsResult;
1694 #define U_CE_LEVEL2_BASE 0x00000005
1695 #define U_CE_LEVEL3_BASE 0x00050000
1696 
compareCE64s(int64_t targCE,int64_t patCE,int16_t compareType)1697 static UCompareCEsResult compareCE64s(int64_t targCE, int64_t patCE, int16_t compareType) {
1698     if (targCE == patCE) {
1699         return U_CE_MATCH;
1700     }
1701     if (compareType == 0) {
1702         return U_CE_NO_MATCH;
1703     }
1704 
1705     int64_t targCEshifted = targCE >> 32;
1706     int64_t patCEshifted = patCE >> 32;
1707     int64_t mask;
1708 
1709     mask = 0xFFFF0000;
1710     int32_t targLev1 = (int32_t)(targCEshifted & mask);
1711     int32_t patLev1 = (int32_t)(patCEshifted & mask);
1712     if ( targLev1 != patLev1 ) {
1713         if ( targLev1 == 0 ) {
1714             return U_CE_SKIP_TARG;
1715         }
1716         if ( patLev1 == 0 && compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD ) {
1717             return U_CE_SKIP_PATN;
1718         }
1719         return U_CE_NO_MATCH;
1720     }
1721 
1722     mask = 0x0000FFFF;
1723     int32_t targLev2 = (int32_t)(targCEshifted & mask);
1724     int32_t patLev2 = (int32_t)(patCEshifted & mask);
1725     if ( targLev2 != patLev2 ) {
1726         if ( targLev2 == 0 ) {
1727             return U_CE_SKIP_TARG;
1728         }
1729         if ( patLev2 == 0 && compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD ) {
1730             return U_CE_SKIP_PATN;
1731         }
1732         return (patLev2 == U_CE_LEVEL2_BASE || (compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD && targLev2 == U_CE_LEVEL2_BASE) )?
1733             U_CE_MATCH: U_CE_NO_MATCH;
1734     }
1735 
1736     mask = 0xFFFF0000;
1737     int32_t targLev3 = (int32_t)(targCE & mask);
1738     int32_t patLev3 = (int32_t)(patCE & mask);
1739     if ( targLev3 != patLev3 ) {
1740         return (patLev3 == U_CE_LEVEL3_BASE || (compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD && targLev3 == U_CE_LEVEL3_BASE) )?
1741             U_CE_MATCH: U_CE_NO_MATCH;
1742    }
1743 
1744     return U_CE_MATCH;
1745 }
1746 
1747 namespace {
1748 
codePointAt(const USearch & search,int32_t index)1749 UChar32 codePointAt(const USearch &search, int32_t index) {
1750     if (index < search.textLength) {
1751         UChar32 c;
1752         U16_NEXT(search.text, index, search.textLength, c);
1753         return c;
1754     }
1755     return U_SENTINEL;
1756 }
1757 
codePointBefore(const USearch & search,int32_t index)1758 UChar32 codePointBefore(const USearch &search, int32_t index) {
1759     if (0 < index) {
1760         UChar32 c;
1761         U16_PREV(search.text, 0, index, c);
1762         return c;
1763     }
1764     return U_SENTINEL;
1765 }
1766 
1767 }  // namespace
1768 
usearch_search(UStringSearch * strsrch,int32_t startIdx,int32_t * matchStart,int32_t * matchLimit,UErrorCode * status)1769 U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch  *strsrch,
1770                                        int32_t        startIdx,
1771                                        int32_t        *matchStart,
1772                                        int32_t        *matchLimit,
1773                                        UErrorCode     *status)
1774 {
1775     if (U_FAILURE(*status)) {
1776         return FALSE;
1777     }
1778 
1779     // TODO:  reject search patterns beginning with a combining char.
1780 
1781 #ifdef USEARCH_DEBUG
1782     if (getenv("USEARCH_DEBUG") != nullptr) {
1783         printf("Pattern CEs\n");
1784         for (int ii=0; ii<strsrch->pattern.cesLength; ii++) {
1785             printf(" %8x", strsrch->pattern.ces[ii]);
1786         }
1787         printf("\n");
1788     }
1789 
1790 #endif
1791     // Input parameter sanity check.
1792     //  TODO:  should input indices clip to the text length
1793     //         in the same way that UText does.
1794     if(strsrch->pattern.cesLength == 0         ||
1795        startIdx < 0                           ||
1796        startIdx > strsrch->search->textLength ||
1797        strsrch->pattern.ces == nullptr) {
1798            *status = U_ILLEGAL_ARGUMENT_ERROR;
1799            return FALSE;
1800     }
1801 
1802     if (strsrch->pattern.pces == nullptr) {
1803         initializePatternPCETable(strsrch, status);
1804     }
1805 
1806     ucol_setOffset(strsrch->textIter, startIdx, status);
1807     CEIBuffer ceb(strsrch, status);
1808 
1809     // An out-of-memory (OOM) failure can occur in the initializePatternPCETable function
1810     // or CEIBuffer constructor above, so we need to check the status.
1811     if (U_FAILURE(*status)) {
1812         return FALSE;
1813     }
1814 
1815     int32_t    targetIx = 0;
1816     const CEI *targetCEI = nullptr;
1817     int32_t    patIx;
1818     UBool      found;
1819 
1820     int32_t  mStart = -1;
1821     int32_t  mLimit = -1;
1822     int32_t  minLimit;
1823     int32_t  maxLimit;
1824 
1825 
1826 
1827     // Outer loop moves over match starting positions in the
1828     //      target CE space.
1829     // Here we see the target as a sequence of collation elements, resulting from the following:
1830     // 1. Target characters were decomposed, and (if appropriate) other compressions and expansions are applied
1831     //    (for example, digraphs such as IJ may be broken into two characters).
1832     // 2. An int64_t CE weight is determined for each resulting unit (high 16 bits are primary strength, next
1833     //    16 bits are secondary, next 16 (the high 16 bits of the low 32-bit half) are tertiary. Any of these
1834     //    fields that are for strengths below that of the collator are set to 0. If this makes the int64_t
1835     //    CE weight 0 (as for a combining diacritic with secondary weight when the collator strength is primary),
1836     //    then the CE is deleted, so the following code sees only CEs that are relevant.
1837     // For each CE, the lowIndex and highIndex correspond to where this CE begins and ends in the original text.
1838     // If lowIndex==highIndex, either the CE resulted from an expansion/decomposition of one of the original text
1839     // characters, or the CE marks the limit of the target text (in which case the CE weight is UCOL_PROCESSED_NULLORDER).
1840     //
1841     for(targetIx=0; ; targetIx++)
1842     {
1843         found = TRUE;
1844         //  Inner loop checks for a match beginning at each
1845         //  position from the outer loop.
1846         int32_t targetIxOffset = 0;
1847         int64_t patCE = 0;
1848         // For targetIx > 0, this ceb.get gets a CE that is as far back in the ring buffer
1849         // (compared to the last CE fetched for the previous targetIx value) as we need to go
1850         // for this targetIx value, so if it is non-nullptr then other ceb.get calls should be OK.
1851         const CEI *firstCEI = ceb.get(targetIx);
1852         if (firstCEI == nullptr) {
1853             *status = U_INTERNAL_PROGRAM_ERROR;
1854             found = FALSE;
1855             break;
1856         }
1857 
1858         for (patIx=0; patIx<strsrch->pattern.pcesLength; patIx++) {
1859             patCE = strsrch->pattern.pces[patIx];
1860             targetCEI = ceb.get(targetIx+patIx+targetIxOffset);
1861             //  Compare CE from target string with CE from the pattern.
1862             //    Note that the target CE will be UCOL_PROCESSED_NULLORDER if we reach the end of input,
1863             //    which will fail the compare, below.
1864             UCompareCEsResult ceMatch = compareCE64s(targetCEI->ce, patCE, strsrch->search->elementComparisonType);
1865             if ( ceMatch == U_CE_NO_MATCH ) {
1866                 found = FALSE;
1867                 break;
1868             } else if ( ceMatch > U_CE_NO_MATCH ) {
1869                 if ( ceMatch == U_CE_SKIP_TARG ) {
1870                     // redo with same patCE, next targCE
1871                     patIx--;
1872                     targetIxOffset++;
1873                 } else { // ceMatch == U_CE_SKIP_PATN
1874                     // redo with same targCE, next patCE
1875                     targetIxOffset--;
1876                 }
1877             }
1878         }
1879         targetIxOffset += strsrch->pattern.pcesLength; // this is now the offset in target CE space to end of the match so far
1880 
1881         if (!found && ((targetCEI == nullptr) || (targetCEI->ce != UCOL_PROCESSED_NULLORDER))) {
1882             // No match at this targetIx.  Try again at the next.
1883             continue;
1884         }
1885 
1886         if (!found) {
1887             // No match at all, we have run off the end of the target text.
1888             break;
1889         }
1890 
1891 
1892         // We have found a match in CE space.
1893         // Now determine the bounds in string index space.
1894         //  There still is a chance of match failure if the CE range not correspond to
1895         //     an acceptable character range.
1896         //
1897         const CEI *lastCEI  = ceb.get(targetIx + targetIxOffset - 1);
1898 
1899         mStart   = firstCEI->lowIndex;
1900         minLimit = lastCEI->lowIndex;
1901 
1902         // Look at the CE following the match.  If it is UCOL_NULLORDER the match
1903         //   extended to the end of input, and the match is good.
1904 
1905         // Look at the high and low indices of the CE following the match. If
1906         // they are the same it means one of two things:
1907         //    1. The match extended to the last CE from the target text, which is OK, or
1908         //    2. The last CE that was part of the match is in an expansion that extends
1909         //       to the first CE after the match. In this case, we reject the match.
1910         const CEI *nextCEI = 0;
1911         if (strsrch->search->elementComparisonType == 0) {
1912             nextCEI  = ceb.get(targetIx + targetIxOffset);
1913             maxLimit = nextCEI->lowIndex;
1914             if (nextCEI->lowIndex == nextCEI->highIndex && nextCEI->ce != UCOL_PROCESSED_NULLORDER) {
1915                 found = FALSE;
1916             }
1917         } else {
1918             for ( ; ; ++targetIxOffset ) {
1919                 nextCEI = ceb.get(targetIx + targetIxOffset);
1920                 maxLimit = nextCEI->lowIndex;
1921                 // If we are at the end of the target too, match succeeds
1922                 if (  nextCEI->ce == UCOL_PROCESSED_NULLORDER ) {
1923                     break;
1924                 }
1925                 // As long as the next CE has primary weight of 0,
1926                 // it is part of the last target element matched by the pattern;
1927                 // make sure it can be part of a match with the last patCE
1928                 if ( (((nextCEI->ce) >> 32) & 0xFFFF0000UL) == 0 ) {
1929                     UCompareCEsResult ceMatch = compareCE64s(nextCEI->ce, patCE, strsrch->search->elementComparisonType);
1930                     if ( ceMatch == U_CE_NO_MATCH || ceMatch == U_CE_SKIP_PATN ) {
1931                         found = FALSE;
1932                         break;
1933                     }
1934                 // If lowIndex == highIndex, this target CE is part of an expansion of the last matched
1935                 // target element, but it has non-zero primary weight => match fails
1936                 } else if ( nextCEI->lowIndex == nextCEI->highIndex ) {
1937                     found = false;
1938                     break;
1939                 // Else the target CE is not part of an expansion of the last matched element, match succeeds
1940                 } else {
1941                     break;
1942                 }
1943             }
1944         }
1945 
1946 
1947         // Check for the start of the match being within a combining sequence.
1948         //   This can happen if the pattern itself begins with a combining char, and
1949         //   the match found combining marks in the target text that were attached
1950         //    to something else.
1951         //   This type of match should be rejected for not completely consuming a
1952         //   combining sequence.
1953         if (!isBreakBoundary(strsrch, mStart, *status)) {
1954             found = FALSE;
1955         }
1956         if (U_FAILURE(*status)) {
1957             break;
1958         }
1959 
1960         // Check for the start of the match being within an Collation Element Expansion,
1961         //   meaning that the first char of the match is only partially matched.
1962         //   With expansions, the first CE will report the index of the source
1963         //   character, and all subsequent (expansions) CEs will report the source index of the
1964         //    _following_ character.
1965         int32_t secondIx = firstCEI->highIndex;
1966         if (mStart == secondIx) {
1967             found = FALSE;
1968         }
1969 
1970         // Allow matches to end in the middle of a grapheme cluster if the following
1971         // conditions are met; this is needed to make prefix search work properly in
1972         // Indic, see #11750
1973         // * the default breakIter is being used
1974         // * the next collation element after this combining sequence
1975         //   - has non-zero primary weight
1976         //   - corresponds to a separate character following the one at end of the current match
1977         //   (the second of these conditions, and perhaps both, may be redundant given the
1978         //   subsequent check for normalization boundary; however they are likely much faster
1979         //   tests in any case)
1980         // * the match limit is a normalization boundary
1981         UBool allowMidclusterMatch = FALSE;
1982         if (strsrch->search->text != nullptr && strsrch->search->textLength > maxLimit) {
1983             allowMidclusterMatch =
1984                     strsrch->search->breakIter == nullptr &&
1985                     nextCEI != nullptr && (((nextCEI->ce) >> 32) & 0xFFFF0000UL) != 0 &&
1986                     maxLimit >= lastCEI->highIndex && nextCEI->highIndex > maxLimit &&
1987                     (strsrch->nfd->hasBoundaryBefore(codePointAt(*strsrch->search, maxLimit)) ||
1988                         strsrch->nfd->hasBoundaryAfter(codePointBefore(*strsrch->search, maxLimit)));
1989         }
1990         // If those conditions are met, then:
1991         // * do NOT advance the candidate match limit (mLimit) to a break boundary; however
1992         //   the match limit may be backed off to a previous break boundary. This handles
1993         //   cases in which mLimit includes target characters that are ignorable with current
1994         //   settings (such as space) and which extend beyond the pattern match.
1995         // * do NOT require that end of the combining sequence not extend beyond the match in CE space
1996         // * do NOT require that match limit be on a breakIter boundary
1997 
1998         //  Advance the match end position to the first acceptable match boundary.
1999         //    This advances the index over any combining characters.
2000         mLimit = maxLimit;
2001         if (minLimit < maxLimit) {
2002             // When the last CE's low index is same with its high index, the CE is likely
2003             // a part of expansion. In this case, the index is located just after the
2004             // character corresponding to the CEs compared above. If the index is right
2005             // at the break boundary, move the position to the next boundary will result
2006             // incorrect match length when there are ignorable characters exist between
2007             // the position and the next character produces CE(s). See ticket#8482.
2008             if (minLimit == lastCEI->highIndex && isBreakBoundary(strsrch, minLimit, *status)) {
2009                 mLimit = minLimit;
2010             } else {
2011                 int32_t nba = nextBoundaryAfter(strsrch, minLimit, *status);
2012                 // Note that we can have nba < maxLimit && nba >= minLImit, in which
2013                 // case we want to set mLimit to nba regardless of allowMidclusterMatch
2014                 // (i.e. we back off mLimit to the previous breakIterator boundary).
2015                 if (nba >= lastCEI->highIndex && (!allowMidclusterMatch || nba < maxLimit)) {
2016                     mLimit = nba;
2017                 }
2018             }
2019         }
2020 
2021         if (U_FAILURE(*status)) {
2022             break;
2023         }
2024 
2025     #ifdef USEARCH_DEBUG
2026         if (getenv("USEARCH_DEBUG") != nullptr) {
2027             printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit, maxLimit, mLimit);
2028         }
2029     #endif
2030 
2031         if (!allowMidclusterMatch) {
2032             // If advancing to the end of a combining sequence in character indexing space
2033             //   advanced us beyond the end of the match in CE space, reject this match.
2034             if (mLimit > maxLimit) {
2035                 found = FALSE;
2036             }
2037 
2038             if (!isBreakBoundary(strsrch, mLimit, *status)) {
2039                 found = FALSE;
2040             }
2041             if (U_FAILURE(*status)) {
2042                 break;
2043             }
2044         }
2045 
2046         if (! checkIdentical(strsrch, mStart, mLimit)) {
2047             found = FALSE;
2048         }
2049 
2050         if (found) {
2051             break;
2052         }
2053     }
2054 
2055     #ifdef USEARCH_DEBUG
2056     if (getenv("USEARCH_DEBUG") != nullptr) {
2057         printf("Target CEs [%d .. %d]\n", ceb.firstIx, ceb.limitIx);
2058         int32_t  lastToPrint = ceb.limitIx+2;
2059         for (int ii=ceb.firstIx; ii<lastToPrint; ii++) {
2060             printf("%8x@%d ", ceb.get(ii)->ce, ceb.get(ii)->srcIndex);
2061         }
2062         printf("\n%s\n", found? "match found" : "no match");
2063     }
2064     #endif
2065 
2066     // All Done.  Store back the match bounds to the caller.
2067     //
2068 
2069     if (U_FAILURE(*status)) {
2070         found = FALSE; // No match if a failure occured.
2071     }
2072 
2073     if (found==FALSE) {
2074         mLimit = -1;
2075         mStart = -1;
2076     }
2077 
2078     if (matchStart != nullptr) {
2079         *matchStart= mStart;
2080     }
2081 
2082     if (matchLimit != nullptr) {
2083         *matchLimit = mLimit;
2084     }
2085 
2086     return found;
2087 }
2088 
usearch_searchBackwards(UStringSearch * strsrch,int32_t startIdx,int32_t * matchStart,int32_t * matchLimit,UErrorCode * status)2089 U_CAPI UBool U_EXPORT2 usearch_searchBackwards(UStringSearch  *strsrch,
2090                                                 int32_t        startIdx,
2091                                                 int32_t        *matchStart,
2092                                                 int32_t        *matchLimit,
2093                                                 UErrorCode     *status)
2094 {
2095     if (U_FAILURE(*status)) {
2096         return FALSE;
2097     }
2098 
2099     // TODO:  reject search patterns beginning with a combining char.
2100 
2101 #ifdef USEARCH_DEBUG
2102     if (getenv("USEARCH_DEBUG") != nullptr) {
2103         printf("Pattern CEs\n");
2104         for (int ii=0; ii<strsrch->pattern.cesLength; ii++) {
2105             printf(" %8x", strsrch->pattern.ces[ii]);
2106         }
2107         printf("\n");
2108     }
2109 
2110 #endif
2111     // Input parameter sanity check.
2112     //  TODO:  should input indices clip to the text length
2113     //         in the same way that UText does.
2114     if(strsrch->pattern.cesLength == 0        ||
2115        startIdx < 0                           ||
2116        startIdx > strsrch->search->textLength ||
2117        strsrch->pattern.ces == nullptr) {
2118            *status = U_ILLEGAL_ARGUMENT_ERROR;
2119            return FALSE;
2120     }
2121 
2122     if (strsrch->pattern.pces == nullptr) {
2123         initializePatternPCETable(strsrch, status);
2124     }
2125 
2126     CEIBuffer ceb(strsrch, status);
2127     int32_t    targetIx = 0;
2128 
2129     /*
2130      * Pre-load the buffer with the CE's for the grapheme
2131      * after our starting position so that we're sure that
2132      * we can look at the CE following the match when we
2133      * check the match boundaries.
2134      *
2135      * This will also pre-fetch the first CE that we'll
2136      * consider for the match.
2137      */
2138     if (startIdx < strsrch->search->textLength) {
2139         UBreakIterator *breakiterator = getBreakIterator(strsrch, *status);
2140         if (U_FAILURE(*status)) {
2141             return FALSE;
2142         }
2143         int32_t next = ubrk_following(breakiterator, startIdx);
2144 
2145         ucol_setOffset(strsrch->textIter, next, status);
2146 
2147         for (targetIx = 0; ; targetIx += 1) {
2148             if (ceb.getPrevious(targetIx)->lowIndex < startIdx) {
2149                 break;
2150             }
2151         }
2152     } else {
2153         ucol_setOffset(strsrch->textIter, startIdx, status);
2154     }
2155 
2156     // An out-of-memory (OOM) failure can occur above, so we need to check the status.
2157     if (U_FAILURE(*status)) {
2158         return FALSE;
2159     }
2160 
2161     const CEI *targetCEI = nullptr;
2162     int32_t    patIx;
2163     UBool      found;
2164 
2165     int32_t  limitIx = targetIx;
2166     int32_t  mStart = -1;
2167     int32_t  mLimit = -1;
2168     int32_t  minLimit;
2169     int32_t  maxLimit;
2170 
2171 
2172 
2173     // Outer loop moves over match starting positions in the
2174     //      target CE space.
2175     // Here, targetIx values increase toward the beginning of the base text (i.e. we get the text CEs in reverse order).
2176     // But  patIx is 0 at the beginning of the pattern and increases toward the end.
2177     // So this loop performs a comparison starting with the end of pattern, and prcessd toward the beginning of the pattern
2178     // and the beginning of the base text.
2179     for(targetIx = limitIx; ; targetIx += 1)
2180     {
2181         found = TRUE;
2182         // For targetIx > limitIx, this ceb.getPrevious gets a CE that is as far back in the ring buffer
2183         // (compared to the last CE fetched for the previous targetIx value) as we need to go
2184         // for this targetIx value, so if it is non-nullptr then other ceb.getPrevious calls should be OK.
2185         const CEI *lastCEI  = ceb.getPrevious(targetIx);
2186         if (lastCEI == nullptr) {
2187             *status = U_INTERNAL_PROGRAM_ERROR;
2188             found = FALSE;
2189              break;
2190         }
2191         //  Inner loop checks for a match beginning at each
2192         //  position from the outer loop.
2193         int32_t targetIxOffset = 0;
2194         for (patIx = strsrch->pattern.pcesLength - 1; patIx >= 0; patIx -= 1) {
2195             int64_t patCE = strsrch->pattern.pces[patIx];
2196 
2197             targetCEI = ceb.getPrevious(targetIx + strsrch->pattern.pcesLength - 1 - patIx + targetIxOffset);
2198             //  Compare CE from target string with CE from the pattern.
2199             //    Note that the target CE will be UCOL_NULLORDER if we reach the end of input,
2200             //    which will fail the compare, below.
2201             UCompareCEsResult ceMatch = compareCE64s(targetCEI->ce, patCE, strsrch->search->elementComparisonType);
2202             if ( ceMatch == U_CE_NO_MATCH ) {
2203                 found = FALSE;
2204                 break;
2205             } else if ( ceMatch > U_CE_NO_MATCH ) {
2206                 if ( ceMatch == U_CE_SKIP_TARG ) {
2207                     // redo with same patCE, next targCE
2208                     patIx++;
2209                     targetIxOffset++;
2210                 } else { // ceMatch == U_CE_SKIP_PATN
2211                     // redo with same targCE, next patCE
2212                     targetIxOffset--;
2213                 }
2214             }
2215         }
2216 
2217         if (!found && ((targetCEI == nullptr) || (targetCEI->ce != UCOL_PROCESSED_NULLORDER))) {
2218             // No match at this targetIx.  Try again at the next.
2219             continue;
2220         }
2221 
2222         if (!found) {
2223             // No match at all, we have run off the end of the target text.
2224             break;
2225         }
2226 
2227 
2228         // We have found a match in CE space.
2229         // Now determine the bounds in string index space.
2230         //  There still is a chance of match failure if the CE range not correspond to
2231         //     an acceptable character range.
2232         //
2233         const CEI *firstCEI = ceb.getPrevious(targetIx + strsrch->pattern.pcesLength - 1 + targetIxOffset);
2234         mStart   = firstCEI->lowIndex;
2235 
2236         // Check for the start of the match being within a combining sequence.
2237         //   This can happen if the pattern itself begins with a combining char, and
2238         //   the match found combining marks in the target text that were attached
2239         //    to something else.
2240         //   This type of match should be rejected for not completely consuming a
2241         //   combining sequence.
2242         if (!isBreakBoundary(strsrch, mStart, *status)) {
2243             found = FALSE;
2244         }
2245         if (U_FAILURE(*status)) {
2246             break;
2247         }
2248 
2249         // Look at the high index of the first CE in the match. If it's the same as the
2250         // low index, the first CE in the match is in the middle of an expansion.
2251         if (mStart == firstCEI->highIndex) {
2252             found = FALSE;
2253         }
2254 
2255 
2256         minLimit = lastCEI->lowIndex;
2257 
2258         if (targetIx > 0) {
2259             // Look at the CE following the match.  If it is UCOL_NULLORDER the match
2260             //   extended to the end of input, and the match is good.
2261 
2262             // Look at the high and low indices of the CE following the match. If
2263             // they are the same it means one of two things:
2264             //    1. The match extended to the last CE from the target text, which is OK, or
2265             //    2. The last CE that was part of the match is in an expansion that extends
2266             //       to the first CE after the match. In this case, we reject the match.
2267             const CEI *nextCEI  = ceb.getPrevious(targetIx - 1);
2268 
2269             if (nextCEI->lowIndex == nextCEI->highIndex && nextCEI->ce != UCOL_PROCESSED_NULLORDER) {
2270                 found = FALSE;
2271             }
2272 
2273             mLimit = maxLimit = nextCEI->lowIndex;
2274 
2275             // Allow matches to end in the middle of a grapheme cluster if the following
2276             // conditions are met; this is needed to make prefix search work properly in
2277             // Indic, see #11750
2278             // * the default breakIter is being used
2279             // * the next collation element after this combining sequence
2280             //   - has non-zero primary weight
2281             //   - corresponds to a separate character following the one at end of the current match
2282             //   (the second of these conditions, and perhaps both, may be redundant given the
2283             //   subsequent check for normalization boundary; however they are likely much faster
2284             //   tests in any case)
2285             // * the match limit is a normalization boundary
2286             UBool allowMidclusterMatch = FALSE;
2287             if (strsrch->search->text != nullptr && strsrch->search->textLength > maxLimit) {
2288                 allowMidclusterMatch =
2289                         strsrch->search->breakIter == nullptr &&
2290                         nextCEI != nullptr && (((nextCEI->ce) >> 32) & 0xFFFF0000UL) != 0 &&
2291                         maxLimit >= lastCEI->highIndex && nextCEI->highIndex > maxLimit &&
2292                         (strsrch->nfd->hasBoundaryBefore(codePointAt(*strsrch->search, maxLimit)) ||
2293                             strsrch->nfd->hasBoundaryAfter(codePointBefore(*strsrch->search, maxLimit)));
2294             }
2295             // If those conditions are met, then:
2296             // * do NOT advance the candidate match limit (mLimit) to a break boundary; however
2297             //   the match limit may be backed off to a previous break boundary. This handles
2298             //   cases in which mLimit includes target characters that are ignorable with current
2299             //   settings (such as space) and which extend beyond the pattern match.
2300             // * do NOT require that end of the combining sequence not extend beyond the match in CE space
2301             // * do NOT require that match limit be on a breakIter boundary
2302 
2303             //  Advance the match end position to the first acceptable match boundary.
2304             //    This advances the index over any combining characters.
2305             if (minLimit < maxLimit) {
2306                 int32_t nba = nextBoundaryAfter(strsrch, minLimit, *status);
2307                 // Note that we can have nba < maxLimit && nba >= minLImit, in which
2308                 // case we want to set mLimit to nba regardless of allowMidclusterMatch
2309                 // (i.e. we back off mLimit to the previous breakIterator boundary).
2310                 if (nba >= lastCEI->highIndex && (!allowMidclusterMatch || nba < maxLimit)) {
2311                     mLimit = nba;
2312                 }
2313             }
2314 
2315             if (!allowMidclusterMatch) {
2316                 // If advancing to the end of a combining sequence in character indexing space
2317                 //   advanced us beyond the end of the match in CE space, reject this match.
2318                 if (mLimit > maxLimit) {
2319                     found = FALSE;
2320                 }
2321 
2322                 // Make sure the end of the match is on a break boundary
2323                 if (!isBreakBoundary(strsrch, mLimit, *status)) {
2324                     found = FALSE;
2325                 }
2326                 if (U_FAILURE(*status)) {
2327                     break;
2328                 }
2329             }
2330 
2331         } else {
2332             // No non-ignorable CEs after this point.
2333             // The maximum position is detected by boundary after
2334             // the last non-ignorable CE. Combining sequence
2335             // across the start index will be truncated.
2336             int32_t nba = nextBoundaryAfter(strsrch, minLimit, *status);
2337             mLimit = maxLimit = (nba > 0) && (startIdx > nba) ? nba : startIdx;
2338         }
2339 
2340     #ifdef USEARCH_DEBUG
2341         if (getenv("USEARCH_DEBUG") != nullptr) {
2342             printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit, maxLimit, mLimit);
2343         }
2344     #endif
2345 
2346 
2347         if (! checkIdentical(strsrch, mStart, mLimit)) {
2348             found = FALSE;
2349         }
2350 
2351         if (found) {
2352             break;
2353         }
2354     }
2355 
2356     #ifdef USEARCH_DEBUG
2357     if (getenv("USEARCH_DEBUG") != nullptr) {
2358         printf("Target CEs [%d .. %d]\n", ceb.firstIx, ceb.limitIx);
2359         int32_t  lastToPrint = ceb.limitIx+2;
2360         for (int ii=ceb.firstIx; ii<lastToPrint; ii++) {
2361             printf("%8x@%d ", ceb.get(ii)->ce, ceb.get(ii)->srcIndex);
2362         }
2363         printf("\n%s\n", found? "match found" : "no match");
2364     }
2365     #endif
2366 
2367     // All Done.  Store back the match bounds to the caller.
2368     //
2369 
2370     if (U_FAILURE(*status)) {
2371         found = FALSE; // No match if a failure occured.
2372     }
2373 
2374     if (found==FALSE) {
2375         mLimit = -1;
2376         mStart = -1;
2377     }
2378 
2379     if (matchStart != nullptr) {
2380         *matchStart= mStart;
2381     }
2382 
2383     if (matchLimit != nullptr) {
2384         *matchLimit = mLimit;
2385     }
2386 
2387     return found;
2388 }
2389 
2390 // internal use methods declared in usrchimp.h -----------------------------
2391 
usearch_handleNextExact(UStringSearch * strsrch,UErrorCode * status)2392 UBool usearch_handleNextExact(UStringSearch *strsrch, UErrorCode *status)
2393 {
2394     if (U_FAILURE(*status)) {
2395         setMatchNotFound(strsrch, *status);
2396         return FALSE;
2397     }
2398 
2399     int32_t textOffset = ucol_getOffset(strsrch->textIter);
2400     int32_t start = -1;
2401     int32_t end = -1;
2402 
2403     if (usearch_search(strsrch, textOffset, &start, &end, status)) {
2404         strsrch->search->matchedIndex  = start;
2405         strsrch->search->matchedLength = end - start;
2406         return TRUE;
2407     } else {
2408         setMatchNotFound(strsrch, *status);
2409         return FALSE;
2410     }
2411 }
2412 
usearch_handleNextCanonical(UStringSearch * strsrch,UErrorCode * status)2413 UBool usearch_handleNextCanonical(UStringSearch *strsrch, UErrorCode *status)
2414 {
2415     if (U_FAILURE(*status)) {
2416         setMatchNotFound(strsrch, *status);
2417         return FALSE;
2418     }
2419 
2420     int32_t textOffset = ucol_getOffset(strsrch->textIter);
2421     int32_t start = -1;
2422     int32_t end = -1;
2423 
2424     if (usearch_search(strsrch, textOffset, &start, &end, status)) {
2425         strsrch->search->matchedIndex  = start;
2426         strsrch->search->matchedLength = end - start;
2427         return TRUE;
2428     } else {
2429         setMatchNotFound(strsrch, *status);
2430         return FALSE;
2431     }
2432 }
2433 
usearch_handlePreviousExact(UStringSearch * strsrch,UErrorCode * status)2434 UBool usearch_handlePreviousExact(UStringSearch *strsrch, UErrorCode *status)
2435 {
2436     if (U_FAILURE(*status)) {
2437         setMatchNotFound(strsrch, *status);
2438         return FALSE;
2439     }
2440 
2441     int32_t textOffset;
2442 
2443     if (strsrch->search->isOverlap) {
2444         if (strsrch->search->matchedIndex != USEARCH_DONE) {
2445             textOffset = strsrch->search->matchedIndex + strsrch->search->matchedLength - 1;
2446         } else {
2447             // move the start position at the end of possible match
2448             initializePatternPCETable(strsrch, status);
2449             if (!initTextProcessedIter(strsrch, status)) {
2450                 setMatchNotFound(strsrch, *status);
2451                 return FALSE;
2452             }
2453             for (int32_t nPCEs = 0; nPCEs < strsrch->pattern.pcesLength - 1; nPCEs++) {
2454                 int64_t pce = strsrch->textProcessedIter->nextProcessed(nullptr, nullptr, status);
2455                 if (pce == UCOL_PROCESSED_NULLORDER) {
2456                     // at the end of the text
2457                     break;
2458                 }
2459             }
2460             if (U_FAILURE(*status)) {
2461                 setMatchNotFound(strsrch, *status);
2462                 return FALSE;
2463             }
2464             textOffset = ucol_getOffset(strsrch->textIter);
2465         }
2466     } else {
2467         textOffset = ucol_getOffset(strsrch->textIter);
2468     }
2469 
2470     int32_t start = -1;
2471     int32_t end = -1;
2472 
2473     if (usearch_searchBackwards(strsrch, textOffset, &start, &end, status)) {
2474         strsrch->search->matchedIndex = start;
2475         strsrch->search->matchedLength = end - start;
2476         return TRUE;
2477     } else {
2478         setMatchNotFound(strsrch, *status);
2479         return FALSE;
2480     }
2481 }
2482 
usearch_handlePreviousCanonical(UStringSearch * strsrch,UErrorCode * status)2483 UBool usearch_handlePreviousCanonical(UStringSearch *strsrch,
2484                                       UErrorCode    *status)
2485 {
2486     if (U_FAILURE(*status)) {
2487         setMatchNotFound(strsrch, *status);
2488         return FALSE;
2489     }
2490 
2491     int32_t textOffset;
2492 
2493     if (strsrch->search->isOverlap) {
2494         if (strsrch->search->matchedIndex != USEARCH_DONE) {
2495             textOffset = strsrch->search->matchedIndex + strsrch->search->matchedLength - 1;
2496         } else {
2497             // move the start position at the end of possible match
2498             initializePatternPCETable(strsrch, status);
2499             if (!initTextProcessedIter(strsrch, status)) {
2500                 setMatchNotFound(strsrch, *status);
2501                 return FALSE;
2502             }
2503             for (int32_t nPCEs = 0; nPCEs < strsrch->pattern.pcesLength - 1; nPCEs++) {
2504                 int64_t pce = strsrch->textProcessedIter->nextProcessed(nullptr, nullptr, status);
2505                 if (pce == UCOL_PROCESSED_NULLORDER) {
2506                     // at the end of the text
2507                     break;
2508                 }
2509             }
2510             if (U_FAILURE(*status)) {
2511                 setMatchNotFound(strsrch, *status);
2512                 return FALSE;
2513             }
2514             textOffset = ucol_getOffset(strsrch->textIter);
2515         }
2516     } else {
2517         textOffset = ucol_getOffset(strsrch->textIter);
2518     }
2519 
2520     int32_t start = -1;
2521     int32_t end = -1;
2522 
2523     if (usearch_searchBackwards(strsrch, textOffset, &start, &end, status)) {
2524         strsrch->search->matchedIndex = start;
2525         strsrch->search->matchedLength = end - start;
2526         return TRUE;
2527     } else {
2528         setMatchNotFound(strsrch, *status);
2529         return FALSE;
2530     }
2531 }
2532 
2533 #endif /* #if !UCONFIG_NO_COLLATION */
2534