• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 *   Copyright (C) 2001-2015 IBM and others. All rights reserved.
6 **********************************************************************
7 *   Date        Name        Description
8 *  07/02/2001   synwee      Creation.
9 **********************************************************************
10 */
11 
12 #include "unicode/utypes.h"
13 
14 #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION
15 
16 #include "unicode/usearch.h"
17 #include "unicode/ustring.h"
18 #include "unicode/uchar.h"
19 #include "unicode/utf16.h"
20 #include "normalizer2impl.h"
21 #include "usrchimp.h"
22 #include "cmemory.h"
23 #include "ucln_in.h"
24 #include "uassert.h"
25 #include "ustr_imp.h"
26 
27 U_NAMESPACE_USE
28 
29 // internal definition ---------------------------------------------------
30 
31 #define LAST_BYTE_MASK_          0xFF
32 #define SECOND_LAST_BYTE_SHIFT_  8
33 #define SUPPLEMENTARY_MIN_VALUE_ 0x10000
34 
35 static const Normalizer2Impl *g_nfcImpl = nullptr;
36 
37 // internal methods -------------------------------------------------
38 
39 /**
40 * Fast collation element iterator setOffset.
41 * This function does not check for bounds.
42 * @param coleiter collation element iterator
43 * @param offset to set
44 */
45 static
setColEIterOffset(UCollationElements * elems,int32_t offset,UErrorCode & status)46 inline void setColEIterOffset(UCollationElements *elems,
47                               int32_t offset,
48                               UErrorCode &status)
49 {
50     // Note: Not "fast" any more after the 2013 collation rewrite.
51     // We do not want to expose more internals than necessary.
52     ucol_setOffset(elems, offset, &status);
53 }
54 
55 /**
56 * Getting the mask for collation strength
57 * @param strength collation strength
58 * @return collation element mask
59 */
60 static
getMask(UCollationStrength strength)61 inline uint32_t getMask(UCollationStrength strength)
62 {
63     switch (strength)
64     {
65     case UCOL_PRIMARY:
66         return UCOL_PRIMARYORDERMASK;
67     case UCOL_SECONDARY:
68         return UCOL_SECONDARYORDERMASK | UCOL_PRIMARYORDERMASK;
69     default:
70         return UCOL_TERTIARYORDERMASK | UCOL_SECONDARYORDERMASK |
71                UCOL_PRIMARYORDERMASK;
72     }
73 }
74 
75 U_CDECL_BEGIN
76 static UBool U_CALLCONV
usearch_cleanup()77 usearch_cleanup() {
78     g_nfcImpl = nullptr;
79     return true;
80 }
81 U_CDECL_END
82 
83 /**
84 * Initializing the fcd tables.
85 * Internal method, status assumed to be a success.
86 * @param status output error if any, caller to check status before calling
87 *               method, status assumed to be success when passed in.
88 */
89 static
initializeFCD(UErrorCode * status)90 inline void initializeFCD(UErrorCode *status)
91 {
92     if (g_nfcImpl == nullptr) {
93         g_nfcImpl = Normalizer2Factory::getNFCImpl(*status);
94         ucln_i18n_registerCleanup(UCLN_I18N_USEARCH, usearch_cleanup);
95     }
96 }
97 
98 /**
99 * Gets the fcd value for a character at the argument index.
100 * This method takes into accounts of the supplementary characters.
101 * @param str UTF16 string where character for fcd retrieval resides
102 * @param offset position of the character whose fcd is to be retrieved, to be
103 *               overwritten with the next character position, taking
104 *               surrogate characters into consideration.
105 * @param strlength length of the argument string
106 * @return fcd value
107 */
108 static
getFCD(const char16_t * str,int32_t * offset,int32_t strlength)109 uint16_t getFCD(const char16_t   *str, int32_t *offset,
110                              int32_t  strlength)
111 {
112     const char16_t *temp = str + *offset;
113     uint16_t    result = g_nfcImpl->nextFCD16(temp, str + strlength);
114     *offset = (int32_t)(temp - str);
115     return result;
116 }
117 
118 /**
119 * Getting the modified collation elements taking into account the collation
120 * attributes
121 * @param strsrch string search data
122 * @param sourcece
123 * @return the modified collation element
124 */
125 static
getCE(const UStringSearch * strsrch,uint32_t sourcece)126 inline int32_t getCE(const UStringSearch *strsrch, uint32_t sourcece)
127 {
128     // note for tertiary we can't use the collator->tertiaryMask, that
129     // is a preprocessed mask that takes into account case options. since
130     // we are only concerned with exact matches, we don't need that.
131     sourcece &= strsrch->ceMask;
132 
133     if (strsrch->toShift) {
134         // alternate handling here, since only the 16 most significant digits
135         // is only used, we can safely do a compare without masking
136         // if the ce is a variable, we mask and get only the primary values
137         // no shifting to quartenary is required since all primary values
138         // less than variabletop will need to be masked off anyway.
139         if (strsrch->variableTop > sourcece) {
140             if (strsrch->strength >= UCOL_QUATERNARY) {
141                 sourcece &= UCOL_PRIMARYORDERMASK;
142             }
143             else {
144                 sourcece = UCOL_IGNORABLE;
145             }
146         }
147     } else if (strsrch->strength >= UCOL_QUATERNARY && sourcece == UCOL_IGNORABLE) {
148         sourcece = 0xFFFF;
149     }
150 
151     return sourcece;
152 }
153 
154 /**
155 * Allocate a memory and returns nullptr if it failed.
156 * Internal method, status assumed to be a success.
157 * @param size to allocate
158 * @param status output error if any, caller to check status before calling
159 *               method, status assumed to be success when passed in.
160 * @return newly allocated array, nullptr otherwise
161 */
162 static
allocateMemory(uint32_t size,UErrorCode * status)163 inline void * allocateMemory(uint32_t size, UErrorCode *status)
164 {
165     uint32_t *result = (uint32_t *)uprv_malloc(size);
166     if (result == nullptr) {
167         *status = U_MEMORY_ALLOCATION_ERROR;
168     }
169     return result;
170 }
171 
172 /**
173 * Adds a uint32_t value to a destination array.
174 * Creates a new array if we run out of space. The caller will have to
175 * manually deallocate the newly allocated array.
176 * Internal method, status assumed to be success, caller has to check status
177 * before calling this method. destination not to be nullptr and has at least
178 * size destinationlength.
179 * @param destination target array
180 * @param offset destination offset to add value
181 * @param destinationlength target array size, return value for the new size
182 * @param value to be added
183 * @param increments incremental size expected
184 * @param status output error if any, caller to check status before calling
185 *               method, status assumed to be success when passed in.
186 * @return new destination array, destination if there was no new allocation
187 */
188 static
addTouint32_tArray(int32_t * destination,uint32_t offset,uint32_t * destinationlength,uint32_t value,uint32_t increments,UErrorCode * status)189 inline int32_t * addTouint32_tArray(int32_t    *destination,
190                                     uint32_t    offset,
191                                     uint32_t   *destinationlength,
192                                     uint32_t    value,
193                                     uint32_t    increments,
194                                     UErrorCode *status)
195 {
196     uint32_t newlength = *destinationlength;
197     if (offset + 1 == newlength) {
198         newlength += increments;
199         int32_t *temp = (int32_t *)allocateMemory(
200                                          sizeof(int32_t) * newlength, status);
201         if (U_FAILURE(*status)) {
202             return nullptr;
203         }
204         uprv_memcpy(temp, destination, sizeof(int32_t) * (size_t)offset);
205         *destinationlength = newlength;
206         destination        = temp;
207     }
208     destination[offset] = value;
209     return destination;
210 }
211 
212 /**
213 * Adds a uint64_t value to a destination array.
214 * Creates a new array if we run out of space. The caller will have to
215 * manually deallocate the newly allocated array.
216 * Internal method, status assumed to be success, caller has to check status
217 * before calling this method. destination not to be nullptr and has at least
218 * size destinationlength.
219 * @param destination target array
220 * @param offset destination offset to add value
221 * @param destinationlength target array size, return value for the new size
222 * @param value to be added
223 * @param increments incremental size expected
224 * @param status output error if any, caller to check status before calling
225 *               method, status assumed to be success when passed in.
226 * @return new destination array, destination if there was no new allocation
227 */
228 static
addTouint64_tArray(int64_t * destination,uint32_t offset,uint32_t * destinationlength,uint64_t value,uint32_t increments,UErrorCode * status)229 inline int64_t * addTouint64_tArray(int64_t    *destination,
230                                     uint32_t    offset,
231                                     uint32_t   *destinationlength,
232                                     uint64_t    value,
233                                     uint32_t    increments,
234                                     UErrorCode *status)
235 {
236     uint32_t newlength = *destinationlength;
237     if (offset + 1 == newlength) {
238         newlength += increments;
239         int64_t *temp = (int64_t *)allocateMemory(
240                                          sizeof(int64_t) * newlength, status);
241 
242         if (U_FAILURE(*status)) {
243             return nullptr;
244         }
245 
246         uprv_memcpy(temp, destination, sizeof(int64_t) * (size_t)offset);
247         *destinationlength = newlength;
248         destination        = temp;
249     }
250 
251     destination[offset] = value;
252 
253     return destination;
254 }
255 
256 /**
257 * Initializing the ce table for a pattern.
258 * Stores non-ignorable collation keys.
259 * Table size will be estimated by the size of the pattern text. Table
260 * expansion will be perform as we go along. Adding 1 to ensure that the table
261 * size definitely increases.
262 * Internal method, status assumed to be a success.
263 * @param strsrch string search data
264 * @param status output error if any, caller to check status before calling
265 *               method, status assumed to be success when passed in.
266 */
267 static
initializePatternCETable(UStringSearch * strsrch,UErrorCode * status)268 inline void initializePatternCETable(UStringSearch *strsrch, UErrorCode *status)
269 {
270     UPattern *pattern            = &(strsrch->pattern);
271     uint32_t  cetablesize        = INITIAL_ARRAY_SIZE_;
272     int32_t  *cetable            = pattern->cesBuffer;
273     uint32_t  patternlength      = pattern->textLength;
274     UCollationElements *coleiter = strsrch->utilIter;
275 
276     if (coleiter == nullptr) {
277         coleiter = ucol_openElements(strsrch->collator, pattern->text,
278                                      patternlength, status);
279         // status will be checked in ucol_next(..) later and if it is an
280         // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be
281         // returned.
282         strsrch->utilIter = coleiter;
283     }
284     else {
285         ucol_setText(coleiter, pattern->text, pattern->textLength, status);
286     }
287     if(U_FAILURE(*status)) {
288         return;
289     }
290 
291     if (pattern->ces != cetable && pattern->ces) {
292         uprv_free(pattern->ces);
293     }
294 
295     uint32_t  offset      = 0;
296     int32_t   ce;
297 
298     while ((ce = ucol_next(coleiter, status)) != UCOL_NULLORDER &&
299            U_SUCCESS(*status)) {
300         uint32_t newce = getCE(strsrch, ce);
301         if (newce) {
302             int32_t *temp = addTouint32_tArray(cetable, offset, &cetablesize,
303                                   newce,
304                                   patternlength - ucol_getOffset(coleiter) + 1,
305                                   status);
306             if (U_FAILURE(*status)) {
307                 return;
308             }
309             offset ++;
310             if (cetable != temp && cetable != pattern->cesBuffer) {
311                 uprv_free(cetable);
312             }
313             cetable = temp;
314         }
315     }
316 
317     cetable[offset]   = 0;
318     pattern->ces       = cetable;
319     pattern->cesLength = offset;
320 }
321 
322 /**
323 * Initializing the pce table for a pattern.
324 * Stores non-ignorable collation keys.
325 * Table size will be estimated by the size of the pattern text. Table
326 * expansion will be perform as we go along. Adding 1 to ensure that the table
327 * size definitely increases.
328 * Internal method, status assumed to be a success.
329 * @param strsrch string search data
330 * @param status output error if any, caller to check status before calling
331 *               method, status assumed to be success when passed in.
332 */
333 static
initializePatternPCETable(UStringSearch * strsrch,UErrorCode * status)334 inline void initializePatternPCETable(UStringSearch *strsrch,
335                                       UErrorCode    *status)
336 {
337     UPattern *pattern            = &(strsrch->pattern);
338     uint32_t  pcetablesize       = INITIAL_ARRAY_SIZE_;
339     int64_t  *pcetable           = pattern->pcesBuffer;
340     uint32_t  patternlength      = pattern->textLength;
341     UCollationElements *coleiter = strsrch->utilIter;
342 
343     if (coleiter == nullptr) {
344         coleiter = ucol_openElements(strsrch->collator, pattern->text,
345                                      patternlength, status);
346         // status will be checked in nextProcessed(..) later and if it is an error
347         // then UCOL_PROCESSED_NULLORDER is returned by nextProcessed(..), so 0 will be
348         // returned.
349         strsrch->utilIter = coleiter;
350     } else {
351         ucol_setText(coleiter, pattern->text, pattern->textLength, status);
352     }
353     if(U_FAILURE(*status)) {
354         return;
355     }
356 
357     if (pattern->pces != pcetable && pattern->pces != nullptr) {
358         uprv_free(pattern->pces);
359     }
360 
361     uint32_t  offset = 0;
362     int64_t   pce;
363 
364     icu::UCollationPCE iter(coleiter);
365 
366     // ** Should processed CEs be signed or unsigned?
367     // ** (the rest of the code in this file seems to play fast-and-loose with
368     // **  whether a CE is signed or unsigned. For example, look at routine above this one.)
369     while ((pce = iter.nextProcessed(nullptr, nullptr, status)) != UCOL_PROCESSED_NULLORDER &&
370            U_SUCCESS(*status)) {
371         int64_t *temp = addTouint64_tArray(pcetable, offset, &pcetablesize,
372                               pce,
373                               patternlength - ucol_getOffset(coleiter) + 1,
374                               status);
375 
376         if (U_FAILURE(*status)) {
377             return;
378         }
379 
380         offset += 1;
381 
382         if (pcetable != temp && pcetable != pattern->pcesBuffer) {
383             uprv_free(pcetable);
384         }
385 
386         pcetable = temp;
387     }
388 
389     pcetable[offset]   = 0;
390     pattern->pces       = pcetable;
391     pattern->pcesLength = offset;
392 }
393 
394 /**
395 * Initializes the pattern struct.
396 * @param strsrch UStringSearch data storage
397 * @param status output error if any, caller to check status before calling
398 *               method, status assumed to be success when passed in.
399 */
400 static
initializePattern(UStringSearch * strsrch,UErrorCode * status)401 inline void initializePattern(UStringSearch *strsrch, UErrorCode *status)
402 {
403     if (U_FAILURE(*status)) { return; }
404 
405           UPattern   *pattern     = &(strsrch->pattern);
406     const char16_t   *patterntext = pattern->text;
407           int32_t     length      = pattern->textLength;
408           int32_t index       = 0;
409 
410     // Since the strength is primary, accents are ignored in the pattern.
411     if (strsrch->strength == UCOL_PRIMARY) {
412         pattern->hasPrefixAccents = 0;
413         pattern->hasSuffixAccents = 0;
414     } else {
415         pattern->hasPrefixAccents = getFCD(patterntext, &index, length) >>
416                                                          SECOND_LAST_BYTE_SHIFT_;
417         index = length;
418         U16_BACK_1(patterntext, 0, index);
419         pattern->hasSuffixAccents = getFCD(patterntext, &index, length) &
420                                                                  LAST_BYTE_MASK_;
421     }
422 
423     // ** HACK **
424     if (strsrch->pattern.pces != nullptr) {
425         if (strsrch->pattern.pces != strsrch->pattern.pcesBuffer) {
426             uprv_free(strsrch->pattern.pces);
427         }
428 
429         strsrch->pattern.pces = nullptr;
430     }
431 
432     initializePatternCETable(strsrch, status);
433 }
434 
435 /**
436 * Initializes the pattern struct and builds the pattern collation element table.
437 * @param strsrch UStringSearch data storage
438 * @param status  for output errors if it occurs, status is assumed to be a
439 *                success when it is passed in.
440 */
441 static
initialize(UStringSearch * strsrch,UErrorCode * status)442 inline void initialize(UStringSearch *strsrch, UErrorCode *status)
443 {
444     initializePattern(strsrch, status);
445 }
446 
447 #if !UCONFIG_NO_BREAK_ITERATION
448 // If the caller provided a character breakiterator we'll return that,
449 // otherwise we lazily create the internal break iterator.
getBreakIterator(UStringSearch * strsrch,UErrorCode & status)450 static UBreakIterator* getBreakIterator(UStringSearch *strsrch, UErrorCode &status)
451 {
452     if (U_FAILURE(status)) {
453         return nullptr;
454     }
455 
456     if (strsrch->search->breakIter != nullptr) {
457         return strsrch->search->breakIter;
458     }
459 
460     if (strsrch->search->internalBreakIter != nullptr) {
461         return strsrch->search->internalBreakIter;
462     }
463 
464     // Need to create the internal break iterator.
465     strsrch->search->internalBreakIter = ubrk_open(UBRK_CHARACTER,
466         ucol_getLocaleByType(strsrch->collator, ULOC_VALID_LOCALE, &status),
467         strsrch->search->text, strsrch->search->textLength, &status);
468 
469     return strsrch->search->internalBreakIter;
470 }
471 #endif
472 
473 /**
474 * Sets the match result to "not found", regardless of the incoming error status.
475 * If an error occurs while setting the result, it is reported back.
476 *
477 * @param strsrch string search data
478 * @param status  for output errors, if they occur.
479 */
480 static
setMatchNotFound(UStringSearch * strsrch,UErrorCode & status)481 inline void setMatchNotFound(UStringSearch *strsrch, UErrorCode &status)
482 {
483     UErrorCode localStatus = U_ZERO_ERROR;
484 
485     strsrch->search->matchedIndex = USEARCH_DONE;
486     strsrch->search->matchedLength = 0;
487     if (strsrch->search->isForwardSearching) {
488         setColEIterOffset(strsrch->textIter, strsrch->search->textLength, localStatus);
489     }
490     else {
491         setColEIterOffset(strsrch->textIter, 0, localStatus);
492     }
493 
494     // If an error occurred while setting the result to not found (ex: OOM),
495     // then we want to report that error back to the caller.
496     if (U_SUCCESS(status) && U_FAILURE(localStatus)) {
497         status = localStatus;
498     }
499 }
500 
501 /**
502 * Checks if the offset runs out of the text string
503 * @param offset
504 * @param textlength of the text string
505 * @return true if offset is out of bounds, false otherwise
506 */
507 static
isOutOfBounds(int32_t textlength,int32_t offset)508 inline UBool isOutOfBounds(int32_t textlength, int32_t offset)
509 {
510     return offset < 0 || offset > textlength;
511 }
512 
513 /**
514 * Checks for identical match
515 * @param strsrch string search data
516 * @param start offset of possible match
517 * @param end offset of possible match
518 * @return true if identical match is found
519 */
520 static
checkIdentical(const UStringSearch * strsrch,int32_t start,int32_t end)521 inline UBool checkIdentical(const UStringSearch *strsrch, int32_t start, int32_t end)
522 {
523     if (strsrch->strength != UCOL_IDENTICAL) {
524         return true;
525     }
526 
527     // Note: We could use Normalizer::compare() or similar, but for short strings
528     // which may not be in FCD it might be faster to just NFD them.
529     UErrorCode status = U_ZERO_ERROR;
530     UnicodeString t2, p2;
531     strsrch->nfd->normalize(
532         UnicodeString(false, strsrch->search->text + start, end - start), t2, status);
533     strsrch->nfd->normalize(
534         UnicodeString(false, strsrch->pattern.text, strsrch->pattern.textLength), p2, status);
535     // return false if NFD failed
536     return U_SUCCESS(status) && t2 == p2;
537 }
538 
539 // constructors and destructor -------------------------------------------
540 
usearch_open(const char16_t * pattern,int32_t patternlength,const char16_t * text,int32_t textlength,const char * locale,UBreakIterator * breakiter,UErrorCode * status)541 U_CAPI UStringSearch * U_EXPORT2 usearch_open(const char16_t *pattern,
542                                           int32_t         patternlength,
543                                     const char16_t       *text,
544                                           int32_t         textlength,
545                                     const char           *locale,
546                                           UBreakIterator *breakiter,
547                                           UErrorCode     *status)
548 {
549     if (U_FAILURE(*status)) {
550         return nullptr;
551     }
552 #if UCONFIG_NO_BREAK_ITERATION
553     if (breakiter != nullptr) {
554         *status = U_UNSUPPORTED_ERROR;
555         return nullptr;
556     }
557 #endif
558     if (locale) {
559         // ucol_open internally checks for status
560         UCollator     *collator = ucol_open(locale, status);
561         // pattern, text checks are done in usearch_openFromCollator
562         UStringSearch *result   = usearch_openFromCollator(pattern,
563                                               patternlength, text, textlength,
564                                               collator, breakiter, status);
565 
566         if (result == nullptr || U_FAILURE(*status)) {
567             if (collator) {
568                 ucol_close(collator);
569             }
570             return nullptr;
571         }
572         else {
573             result->ownCollator = true;
574         }
575         return result;
576     }
577     *status = U_ILLEGAL_ARGUMENT_ERROR;
578     return nullptr;
579 }
580 
usearch_openFromCollator(const char16_t * pattern,int32_t patternlength,const char16_t * text,int32_t textlength,const UCollator * collator,UBreakIterator * breakiter,UErrorCode * status)581 U_CAPI UStringSearch * U_EXPORT2 usearch_openFromCollator(
582                                   const char16_t       *pattern,
583                                         int32_t         patternlength,
584                                   const char16_t       *text,
585                                         int32_t         textlength,
586                                   const UCollator      *collator,
587                                         UBreakIterator *breakiter,
588                                         UErrorCode     *status)
589 {
590     if (U_FAILURE(*status)) {
591         return nullptr;
592     }
593 #if UCONFIG_NO_BREAK_ITERATION
594     if (breakiter != nullptr) {
595         *status = U_UNSUPPORTED_ERROR;
596         return nullptr;
597     }
598 #endif
599     if (pattern == nullptr || text == nullptr || collator == nullptr) {
600         *status = U_ILLEGAL_ARGUMENT_ERROR;
601         return nullptr;
602     }
603 
604     // string search does not really work when numeric collation is turned on
605     if(ucol_getAttribute(collator, UCOL_NUMERIC_COLLATION, status) == UCOL_ON) {
606         *status = U_UNSUPPORTED_ERROR;
607         return nullptr;
608     }
609 
610     if (U_SUCCESS(*status)) {
611         initializeFCD(status);
612         if (U_FAILURE(*status)) {
613             return nullptr;
614         }
615 
616         UStringSearch *result;
617         if (textlength == -1) {
618             textlength = u_strlen(text);
619         }
620         if (patternlength == -1) {
621             patternlength = u_strlen(pattern);
622         }
623         if (textlength <= 0 || patternlength <= 0) {
624             *status = U_ILLEGAL_ARGUMENT_ERROR;
625             return nullptr;
626         }
627 
628         result = (UStringSearch *)uprv_malloc(sizeof(UStringSearch));
629         if (result == nullptr) {
630             *status = U_MEMORY_ALLOCATION_ERROR;
631             return nullptr;
632         }
633 
634         result->collator    = collator;
635         result->strength    = ucol_getStrength(collator);
636         result->ceMask      = getMask(result->strength);
637         result->toShift     =
638              ucol_getAttribute(collator, UCOL_ALTERNATE_HANDLING, status) ==
639                                                             UCOL_SHIFTED;
640         result->variableTop = ucol_getVariableTop(collator, status);
641 
642         result->nfd         = Normalizer2::getNFDInstance(*status);
643 
644         if (U_FAILURE(*status)) {
645             uprv_free(result);
646             return nullptr;
647         }
648 
649         result->search             = (USearch *)uprv_malloc(sizeof(USearch));
650         if (result->search == nullptr) {
651             *status = U_MEMORY_ALLOCATION_ERROR;
652             uprv_free(result);
653             return nullptr;
654         }
655 
656         result->search->text       = text;
657         result->search->textLength = textlength;
658 
659         result->pattern.text       = pattern;
660         result->pattern.textLength = patternlength;
661         result->pattern.ces         = nullptr;
662         result->pattern.pces        = nullptr;
663 
664         result->search->breakIter  = breakiter;
665 #if !UCONFIG_NO_BREAK_ITERATION
666         result->search->internalBreakIter = nullptr; // Lazily created.
667         if (breakiter) {
668             ubrk_setText(breakiter, text, textlength, status);
669         }
670 #endif
671 
672         result->ownCollator           = false;
673         result->search->matchedLength = 0;
674         result->search->matchedIndex  = USEARCH_DONE;
675         result->utilIter              = nullptr;
676         result->textIter              = ucol_openElements(collator, text,
677                                                           textlength, status);
678         result->textProcessedIter     = nullptr;
679         if (U_FAILURE(*status)) {
680             usearch_close(result);
681             return nullptr;
682         }
683 
684         result->search->isOverlap          = false;
685         result->search->isCanonicalMatch   = false;
686         result->search->elementComparisonType = 0;
687         result->search->isForwardSearching = true;
688         result->search->reset              = true;
689 
690         initialize(result, status);
691 
692         if (U_FAILURE(*status)) {
693             usearch_close(result);
694             return nullptr;
695         }
696 
697         return result;
698     }
699     return nullptr;
700 }
701 
usearch_close(UStringSearch * strsrch)702 U_CAPI void U_EXPORT2 usearch_close(UStringSearch *strsrch)
703 {
704     if (strsrch) {
705         if (strsrch->pattern.ces != strsrch->pattern.cesBuffer &&
706             strsrch->pattern.ces) {
707             uprv_free(strsrch->pattern.ces);
708         }
709 
710         if (strsrch->pattern.pces != nullptr &&
711             strsrch->pattern.pces != strsrch->pattern.pcesBuffer) {
712             uprv_free(strsrch->pattern.pces);
713         }
714 
715         delete strsrch->textProcessedIter;
716         ucol_closeElements(strsrch->textIter);
717         ucol_closeElements(strsrch->utilIter);
718 
719         if (strsrch->ownCollator && strsrch->collator) {
720             ucol_close((UCollator *)strsrch->collator);
721         }
722 
723 #if !UCONFIG_NO_BREAK_ITERATION
724         if (strsrch->search->internalBreakIter != nullptr) {
725             ubrk_close(strsrch->search->internalBreakIter);
726         }
727 #endif
728 
729         uprv_free(strsrch->search);
730         uprv_free(strsrch);
731     }
732 }
733 
734 namespace {
735 
initTextProcessedIter(UStringSearch * strsrch,UErrorCode * status)736 UBool initTextProcessedIter(UStringSearch *strsrch, UErrorCode *status) {
737     if (U_FAILURE(*status)) { return false; }
738     if (strsrch->textProcessedIter == nullptr) {
739         strsrch->textProcessedIter = new icu::UCollationPCE(strsrch->textIter);
740         if (strsrch->textProcessedIter == nullptr) {
741             *status = U_MEMORY_ALLOCATION_ERROR;
742             return false;
743         }
744     } else {
745         strsrch->textProcessedIter->init(strsrch->textIter);
746     }
747     return true;
748 }
749 
750 }
751 
752 // set and get methods --------------------------------------------------
753 
usearch_setOffset(UStringSearch * strsrch,int32_t position,UErrorCode * status)754 U_CAPI void U_EXPORT2 usearch_setOffset(UStringSearch *strsrch,
755                                         int32_t        position,
756                                         UErrorCode    *status)
757 {
758     if (U_SUCCESS(*status) && strsrch) {
759         if (isOutOfBounds(strsrch->search->textLength, position)) {
760             *status = U_INDEX_OUTOFBOUNDS_ERROR;
761         }
762         else {
763             setColEIterOffset(strsrch->textIter, position, *status);
764         }
765         strsrch->search->matchedIndex  = USEARCH_DONE;
766         strsrch->search->matchedLength = 0;
767         strsrch->search->reset         = false;
768     }
769 }
770 
usearch_getOffset(const UStringSearch * strsrch)771 U_CAPI int32_t U_EXPORT2 usearch_getOffset(const UStringSearch *strsrch)
772 {
773     if (strsrch) {
774         int32_t result = ucol_getOffset(strsrch->textIter);
775         if (isOutOfBounds(strsrch->search->textLength, result)) {
776             return USEARCH_DONE;
777         }
778         return result;
779     }
780     return USEARCH_DONE;
781 }
782 
usearch_setAttribute(UStringSearch * strsrch,USearchAttribute attribute,USearchAttributeValue value,UErrorCode * status)783 U_CAPI void U_EXPORT2 usearch_setAttribute(UStringSearch        *strsrch,
784                                            USearchAttribute      attribute,
785                                            USearchAttributeValue value,
786                                            UErrorCode           *status)
787 {
788     if (U_SUCCESS(*status) && strsrch) {
789         switch (attribute)
790         {
791         case USEARCH_OVERLAP :
792             strsrch->search->isOverlap = (value == USEARCH_ON ? true : false);
793             break;
794         case USEARCH_CANONICAL_MATCH :
795             strsrch->search->isCanonicalMatch = (value == USEARCH_ON ? true :
796                                                                       false);
797             break;
798         case USEARCH_ELEMENT_COMPARISON :
799             if (value == USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD || value == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD) {
800                 strsrch->search->elementComparisonType = (int16_t)value;
801             } else {
802                 strsrch->search->elementComparisonType = 0;
803             }
804             break;
805         case USEARCH_ATTRIBUTE_COUNT :
806         default:
807             *status = U_ILLEGAL_ARGUMENT_ERROR;
808         }
809     }
810     if (value == USEARCH_ATTRIBUTE_VALUE_COUNT) {
811         *status = U_ILLEGAL_ARGUMENT_ERROR;
812     }
813 }
814 
usearch_getAttribute(const UStringSearch * strsrch,USearchAttribute attribute)815 U_CAPI USearchAttributeValue U_EXPORT2 usearch_getAttribute(
816                                                 const UStringSearch *strsrch,
817                                                 USearchAttribute attribute)
818 {
819     if (strsrch) {
820         switch (attribute) {
821         case USEARCH_OVERLAP :
822             return (strsrch->search->isOverlap ? USEARCH_ON : USEARCH_OFF);
823         case USEARCH_CANONICAL_MATCH :
824             return (strsrch->search->isCanonicalMatch ? USEARCH_ON : USEARCH_OFF);
825         case USEARCH_ELEMENT_COMPARISON :
826             {
827                 int16_t value = strsrch->search->elementComparisonType;
828                 if (value == USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD || value == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD) {
829                     return (USearchAttributeValue)value;
830                 } else {
831                     return USEARCH_STANDARD_ELEMENT_COMPARISON;
832                 }
833             }
834         case USEARCH_ATTRIBUTE_COUNT :
835             return USEARCH_DEFAULT;
836         }
837     }
838     return USEARCH_DEFAULT;
839 }
840 
usearch_getMatchedStart(const UStringSearch * strsrch)841 U_CAPI int32_t U_EXPORT2 usearch_getMatchedStart(
842                                                 const UStringSearch *strsrch)
843 {
844     if (strsrch == nullptr) {
845         return USEARCH_DONE;
846     }
847     return strsrch->search->matchedIndex;
848 }
849 
850 
usearch_getMatchedText(const UStringSearch * strsrch,char16_t * result,int32_t resultCapacity,UErrorCode * status)851 U_CAPI int32_t U_EXPORT2 usearch_getMatchedText(const UStringSearch *strsrch,
852                                             char16_t      *result,
853                                             int32_t        resultCapacity,
854                                             UErrorCode    *status)
855 {
856     if (U_FAILURE(*status)) {
857         return USEARCH_DONE;
858     }
859     if (strsrch == nullptr || resultCapacity < 0 || (resultCapacity > 0 &&
860         result == nullptr)) {
861         *status = U_ILLEGAL_ARGUMENT_ERROR;
862         return USEARCH_DONE;
863     }
864 
865     int32_t     copylength = strsrch->search->matchedLength;
866     int32_t copyindex  = strsrch->search->matchedIndex;
867     if (copyindex == USEARCH_DONE) {
868         u_terminateUChars(result, resultCapacity, 0, status);
869         return USEARCH_DONE;
870     }
871 
872     if (resultCapacity < copylength) {
873         copylength = resultCapacity;
874     }
875     if (copylength > 0) {
876         uprv_memcpy(result, strsrch->search->text + copyindex,
877                     copylength * sizeof(char16_t));
878     }
879     return u_terminateUChars(result, resultCapacity,
880                              strsrch->search->matchedLength, status);
881 }
882 
usearch_getMatchedLength(const UStringSearch * strsrch)883 U_CAPI int32_t U_EXPORT2 usearch_getMatchedLength(
884                                               const UStringSearch *strsrch)
885 {
886     if (strsrch) {
887         return strsrch->search->matchedLength;
888     }
889     return USEARCH_DONE;
890 }
891 
892 #if !UCONFIG_NO_BREAK_ITERATION
893 
usearch_setBreakIterator(UStringSearch * strsrch,UBreakIterator * breakiter,UErrorCode * status)894 U_CAPI void U_EXPORT2 usearch_setBreakIterator(UStringSearch  *strsrch,
895                                                UBreakIterator *breakiter,
896                                                UErrorCode     *status)
897 {
898     if (U_SUCCESS(*status) && strsrch) {
899         strsrch->search->breakIter = breakiter;
900         if (breakiter) {
901             ubrk_setText(breakiter, strsrch->search->text,
902                          strsrch->search->textLength, status);
903         }
904     }
905 }
906 
907 U_CAPI const UBreakIterator* U_EXPORT2
usearch_getBreakIterator(const UStringSearch * strsrch)908 usearch_getBreakIterator(const UStringSearch *strsrch)
909 {
910     if (strsrch) {
911         return strsrch->search->breakIter;
912     }
913     return nullptr;
914 }
915 
916 #endif
917 
usearch_setText(UStringSearch * strsrch,const char16_t * text,int32_t textlength,UErrorCode * status)918 U_CAPI void U_EXPORT2 usearch_setText(      UStringSearch *strsrch,
919                                       const char16_t      *text,
920                                             int32_t        textlength,
921                                             UErrorCode    *status)
922 {
923     if (U_SUCCESS(*status)) {
924         if (strsrch == nullptr || text == nullptr || textlength < -1 ||
925             textlength == 0) {
926             *status = U_ILLEGAL_ARGUMENT_ERROR;
927         }
928         else {
929             if (textlength == -1) {
930                 textlength = u_strlen(text);
931             }
932             strsrch->search->text       = text;
933             strsrch->search->textLength = textlength;
934             ucol_setText(strsrch->textIter, text, textlength, status);
935             strsrch->search->matchedIndex  = USEARCH_DONE;
936             strsrch->search->matchedLength = 0;
937             strsrch->search->reset         = true;
938 #if !UCONFIG_NO_BREAK_ITERATION
939             if (strsrch->search->breakIter != nullptr) {
940                 ubrk_setText(strsrch->search->breakIter, text,
941                              textlength, status);
942             }
943             if (strsrch->search->internalBreakIter != nullptr) {
944                 ubrk_setText(strsrch->search->internalBreakIter, text, textlength, status);
945             }
946 #endif
947         }
948     }
949 }
950 
usearch_getText(const UStringSearch * strsrch,int32_t * length)951 U_CAPI const char16_t * U_EXPORT2 usearch_getText(const UStringSearch *strsrch,
952                                                      int32_t       *length)
953 {
954     if (strsrch) {
955         *length = strsrch->search->textLength;
956         return strsrch->search->text;
957     }
958     return nullptr;
959 }
960 
usearch_setCollator(UStringSearch * strsrch,const UCollator * collator,UErrorCode * status)961 U_CAPI void U_EXPORT2 usearch_setCollator(      UStringSearch *strsrch,
962                                           const UCollator     *collator,
963                                                 UErrorCode    *status)
964 {
965     if (U_SUCCESS(*status)) {
966         if (collator == nullptr) {
967             *status = U_ILLEGAL_ARGUMENT_ERROR;
968             return;
969         }
970 
971         if (strsrch) {
972             delete strsrch->textProcessedIter;
973             strsrch->textProcessedIter = nullptr;
974             ucol_closeElements(strsrch->textIter);
975             ucol_closeElements(strsrch->utilIter);
976             strsrch->textIter = strsrch->utilIter = nullptr;
977             if (strsrch->ownCollator && (strsrch->collator != collator)) {
978                 ucol_close((UCollator *)strsrch->collator);
979                 strsrch->ownCollator = false;
980             }
981             strsrch->collator    = collator;
982             strsrch->strength    = ucol_getStrength(collator);
983             strsrch->ceMask      = getMask(strsrch->strength);
984 #if !UCONFIG_NO_BREAK_ITERATION
985             if (strsrch->search->internalBreakIter != nullptr) {
986                 ubrk_close(strsrch->search->internalBreakIter);
987                 strsrch->search->internalBreakIter = nullptr;   // Lazily created.
988             }
989 #endif
990             // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
991             strsrch->toShift     =
992                ucol_getAttribute(collator, UCOL_ALTERNATE_HANDLING, status) ==
993                                                                 UCOL_SHIFTED;
994             // if status is a failure, ucol_getVariableTop returns 0
995             strsrch->variableTop = ucol_getVariableTop(collator, status);
996             strsrch->textIter = ucol_openElements(collator,
997                                       strsrch->search->text,
998                                       strsrch->search->textLength,
999                                       status);
1000             strsrch->utilIter = ucol_openElements(
1001                     collator, strsrch->pattern.text, strsrch->pattern.textLength, status);
1002             // initialize() _after_ setting the iterators for the new collator.
1003             initialize(strsrch, status);
1004         }
1005 
1006         // **** are these calls needed?
1007         // **** we call uprv_init_pce in initializePatternPCETable
1008         // **** and the CEIBuffer constructor...
1009 #if 0
1010         uprv_init_pce(strsrch->textIter);
1011         uprv_init_pce(strsrch->utilIter);
1012 #endif
1013     }
1014 }
1015 
usearch_getCollator(const UStringSearch * strsrch)1016 U_CAPI UCollator * U_EXPORT2 usearch_getCollator(const UStringSearch *strsrch)
1017 {
1018     if (strsrch) {
1019         return (UCollator *)strsrch->collator;
1020     }
1021     return nullptr;
1022 }
1023 
usearch_setPattern(UStringSearch * strsrch,const char16_t * pattern,int32_t patternlength,UErrorCode * status)1024 U_CAPI void U_EXPORT2 usearch_setPattern(      UStringSearch *strsrch,
1025                                          const char16_t      *pattern,
1026                                                int32_t        patternlength,
1027                                                UErrorCode    *status)
1028 {
1029     if (U_SUCCESS(*status)) {
1030         if (strsrch == nullptr || pattern == nullptr) {
1031             *status = U_ILLEGAL_ARGUMENT_ERROR;
1032         }
1033         else {
1034             if (patternlength == -1) {
1035                 patternlength = u_strlen(pattern);
1036             }
1037             if (patternlength == 0) {
1038                 *status = U_ILLEGAL_ARGUMENT_ERROR;
1039                 return;
1040             }
1041             strsrch->pattern.text       = pattern;
1042             strsrch->pattern.textLength = patternlength;
1043             initialize(strsrch, status);
1044         }
1045     }
1046 }
1047 
1048 U_CAPI const char16_t* U_EXPORT2
usearch_getPattern(const UStringSearch * strsrch,int32_t * length)1049 usearch_getPattern(const UStringSearch *strsrch,
1050                    int32_t             *length)
1051 {
1052     if (strsrch) {
1053         *length = strsrch->pattern.textLength;
1054         return strsrch->pattern.text;
1055     }
1056     return nullptr;
1057 }
1058 
1059 // miscellaneous methods --------------------------------------------------
1060 
usearch_first(UStringSearch * strsrch,UErrorCode * status)1061 U_CAPI int32_t U_EXPORT2 usearch_first(UStringSearch *strsrch,
1062                                        UErrorCode    *status)
1063 {
1064     if (strsrch && U_SUCCESS(*status)) {
1065         strsrch->search->isForwardSearching = true;
1066         usearch_setOffset(strsrch, 0, status);
1067         if (U_SUCCESS(*status)) {
1068             return usearch_next(strsrch, status);
1069         }
1070     }
1071     return USEARCH_DONE;
1072 }
1073 
usearch_following(UStringSearch * strsrch,int32_t position,UErrorCode * status)1074 U_CAPI int32_t U_EXPORT2 usearch_following(UStringSearch *strsrch,
1075                                            int32_t        position,
1076                                            UErrorCode    *status)
1077 {
1078     if (strsrch && U_SUCCESS(*status)) {
1079         strsrch->search->isForwardSearching = true;
1080         // position checked in usearch_setOffset
1081         usearch_setOffset(strsrch, position, status);
1082         if (U_SUCCESS(*status)) {
1083             return usearch_next(strsrch, status);
1084         }
1085     }
1086     return USEARCH_DONE;
1087 }
1088 
usearch_last(UStringSearch * strsrch,UErrorCode * status)1089 U_CAPI int32_t U_EXPORT2 usearch_last(UStringSearch *strsrch,
1090                                       UErrorCode    *status)
1091 {
1092     if (strsrch && U_SUCCESS(*status)) {
1093         strsrch->search->isForwardSearching = false;
1094         usearch_setOffset(strsrch, strsrch->search->textLength, status);
1095         if (U_SUCCESS(*status)) {
1096             return usearch_previous(strsrch, status);
1097         }
1098     }
1099     return USEARCH_DONE;
1100 }
1101 
usearch_preceding(UStringSearch * strsrch,int32_t position,UErrorCode * status)1102 U_CAPI int32_t U_EXPORT2 usearch_preceding(UStringSearch *strsrch,
1103                                            int32_t        position,
1104                                            UErrorCode    *status)
1105 {
1106     if (strsrch && U_SUCCESS(*status)) {
1107         strsrch->search->isForwardSearching = false;
1108         // position checked in usearch_setOffset
1109         usearch_setOffset(strsrch, position, status);
1110         if (U_SUCCESS(*status)) {
1111             return usearch_previous(strsrch, status);
1112         }
1113     }
1114     return USEARCH_DONE;
1115 }
1116 
1117 /**
1118 * If a direction switch is required, we'll count the number of ces till the
1119 * beginning of the collation element iterator and iterate forwards that
1120 * number of times. This is so that we get to the correct point within the
1121 * string to continue the search in. Imagine when we are in the middle of the
1122 * normalization buffer when the change in direction is request. arrrgghh....
1123 * After searching the offset within the collation element iterator will be
1124 * shifted to the start of the match. If a match is not found, the offset would
1125 * have been set to the end of the text string in the collation element
1126 * iterator.
1127 * Okay, here's my take on normalization buffer. The only time when there can
1128 * be 2 matches within the same normalization is when the pattern is consists
1129 * of all accents. But since the offset returned is from the text string, we
1130 * should not confuse the caller by returning the second match within the
1131 * same normalization buffer. If we do, the 2 results will have the same match
1132 * offsets, and that'll be confusing. I'll return the next match that doesn't
1133 * fall within the same normalization buffer. Note this does not affect the
1134 * results of matches spanning the text and the normalization buffer.
1135 * The position to start searching is taken from the collation element
1136 * iterator. Callers of this API would have to set the offset in the collation
1137 * element iterator before using this method.
1138 */
usearch_next(UStringSearch * strsrch,UErrorCode * status)1139 U_CAPI int32_t U_EXPORT2 usearch_next(UStringSearch *strsrch,
1140                                       UErrorCode    *status)
1141 {
1142     if (U_SUCCESS(*status) && strsrch) {
1143         // note offset is either equivalent to the start of the previous match
1144         // or is set by the user
1145         int32_t      offset       = usearch_getOffset(strsrch);
1146         USearch     *search       = strsrch->search;
1147         search->reset             = false;
1148         int32_t      textlength   = search->textLength;
1149         if (search->isForwardSearching) {
1150             if (offset == textlength ||
1151                 (! search->isOverlap &&
1152                 (search->matchedIndex != USEARCH_DONE &&
1153                 offset + search->matchedLength > textlength))) {
1154                     // not enough characters to match
1155                     setMatchNotFound(strsrch, *status);
1156                     return USEARCH_DONE;
1157             }
1158         }
1159         else {
1160             // switching direction.
1161             // if matchedIndex == USEARCH_DONE, it means that either a
1162             // setOffset has been called or that previous ran off the text
1163             // string. the iterator would have been set to offset 0 if a
1164             // match is not found.
1165             search->isForwardSearching = true;
1166             if (search->matchedIndex != USEARCH_DONE) {
1167                 // there's no need to set the collation element iterator
1168                 // the next call to next will set the offset.
1169                 return search->matchedIndex;
1170             }
1171         }
1172 
1173         if (U_SUCCESS(*status)) {
1174             if (strsrch->pattern.cesLength == 0) {
1175                 if (search->matchedIndex == USEARCH_DONE) {
1176                     search->matchedIndex = offset;
1177                 }
1178                 else { // moves by codepoints
1179                     U16_FWD_1(search->text, search->matchedIndex, textlength);
1180                 }
1181 
1182                 search->matchedLength = 0;
1183                 setColEIterOffset(strsrch->textIter, search->matchedIndex, *status);
1184                 // status checked below
1185                 if (search->matchedIndex == textlength) {
1186                     search->matchedIndex = USEARCH_DONE;
1187                 }
1188             }
1189             else {
1190                 if (search->matchedLength > 0) {
1191                     // if matchlength is 0 we are at the start of the iteration
1192                     if (search->isOverlap) {
1193                         ucol_setOffset(strsrch->textIter, offset + 1, status);
1194                     }
1195                     else {
1196                         ucol_setOffset(strsrch->textIter,
1197                                        offset + search->matchedLength, status);
1198                     }
1199                 }
1200                 else {
1201                     // for boundary check purposes. this will ensure that the
1202                     // next match will not precede the current offset
1203                     // note search->matchedIndex will always be set to something
1204                     // in the code
1205                     search->matchedIndex = offset - 1;
1206                 }
1207 
1208                 if (search->isCanonicalMatch) {
1209                     // can't use exact here since extra accents are allowed.
1210                     usearch_handleNextCanonical(strsrch, status);
1211                 }
1212                 else {
1213                     usearch_handleNextExact(strsrch, status);
1214                 }
1215             }
1216 
1217             if (U_FAILURE(*status)) {
1218                 return USEARCH_DONE;
1219             }
1220 
1221             if (search->matchedIndex == USEARCH_DONE) {
1222                 ucol_setOffset(strsrch->textIter, search->textLength, status);
1223             } else {
1224                 ucol_setOffset(strsrch->textIter, search->matchedIndex, status);
1225             }
1226 
1227             return search->matchedIndex;
1228         }
1229     }
1230     return USEARCH_DONE;
1231 }
1232 
usearch_previous(UStringSearch * strsrch,UErrorCode * status)1233 U_CAPI int32_t U_EXPORT2 usearch_previous(UStringSearch *strsrch,
1234                                           UErrorCode    *status)
1235 {
1236     if (U_SUCCESS(*status) && strsrch) {
1237         int32_t offset;
1238         USearch *search = strsrch->search;
1239         if (search->reset) {
1240             offset                     = search->textLength;
1241             search->isForwardSearching = false;
1242             search->reset              = false;
1243             setColEIterOffset(strsrch->textIter, offset, *status);
1244         }
1245         else {
1246             offset = usearch_getOffset(strsrch);
1247         }
1248 
1249         int32_t matchedindex = search->matchedIndex;
1250         if (search->isForwardSearching) {
1251             // switching direction.
1252             // if matchedIndex == USEARCH_DONE, it means that either a
1253             // setOffset has been called or that next ran off the text
1254             // string. the iterator would have been set to offset textLength if
1255             // a match is not found.
1256             search->isForwardSearching = false;
1257             if (matchedindex != USEARCH_DONE) {
1258                 return matchedindex;
1259             }
1260         }
1261         else {
1262 
1263             // Could check pattern length, but the
1264             // linear search will do the right thing
1265             if (offset == 0 || matchedindex == 0) {
1266                 setMatchNotFound(strsrch, *status);
1267                 return USEARCH_DONE;
1268             }
1269         }
1270 
1271         if (U_SUCCESS(*status)) {
1272             if (strsrch->pattern.cesLength == 0) {
1273                 search->matchedIndex =
1274                       (matchedindex == USEARCH_DONE ? offset : matchedindex);
1275                 if (search->matchedIndex == 0) {
1276                     setMatchNotFound(strsrch, *status);
1277                     // status checked below
1278                 }
1279                 else { // move by codepoints
1280                     U16_BACK_1(search->text, 0, search->matchedIndex);
1281                     setColEIterOffset(strsrch->textIter, search->matchedIndex, *status);
1282                     // status checked below
1283                     search->matchedLength = 0;
1284                 }
1285             }
1286             else {
1287                 if (strsrch->search->isCanonicalMatch) {
1288                     // can't use exact here since extra accents are allowed.
1289                     usearch_handlePreviousCanonical(strsrch, status);
1290                     // status checked below
1291                 }
1292                 else {
1293                     usearch_handlePreviousExact(strsrch, status);
1294                     // status checked below
1295                 }
1296             }
1297 
1298             if (U_FAILURE(*status)) {
1299                 return USEARCH_DONE;
1300             }
1301 
1302             return search->matchedIndex;
1303         }
1304     }
1305     return USEARCH_DONE;
1306 }
1307 
1308 
1309 
usearch_reset(UStringSearch * strsrch)1310 U_CAPI void U_EXPORT2 usearch_reset(UStringSearch *strsrch)
1311 {
1312     /*
1313     reset is setting the attributes that are already in
1314     string search, hence all attributes in the collator should
1315     be retrieved without any problems
1316     */
1317     if (strsrch) {
1318         UErrorCode status            = U_ZERO_ERROR;
1319         UBool      sameCollAttribute = true;
1320         uint32_t   ceMask;
1321         UBool      shift;
1322         uint32_t   varTop;
1323 
1324         // **** hack to deal w/ how processed CEs encode quaternary ****
1325         UCollationStrength newStrength = ucol_getStrength(strsrch->collator);
1326         if ((strsrch->strength < UCOL_QUATERNARY && newStrength >= UCOL_QUATERNARY) ||
1327             (strsrch->strength >= UCOL_QUATERNARY && newStrength < UCOL_QUATERNARY)) {
1328                 sameCollAttribute = false;
1329         }
1330 
1331         strsrch->strength    = ucol_getStrength(strsrch->collator);
1332         ceMask = getMask(strsrch->strength);
1333         if (strsrch->ceMask != ceMask) {
1334             strsrch->ceMask = ceMask;
1335             sameCollAttribute = false;
1336         }
1337 
1338         // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
1339         shift = ucol_getAttribute(strsrch->collator, UCOL_ALTERNATE_HANDLING,
1340                                   &status) == UCOL_SHIFTED;
1341         if (strsrch->toShift != shift) {
1342             strsrch->toShift  = shift;
1343             sameCollAttribute = false;
1344         }
1345 
1346         // if status is a failure, ucol_getVariableTop returns 0
1347         varTop = ucol_getVariableTop(strsrch->collator, &status);
1348         if (strsrch->variableTop != varTop) {
1349             strsrch->variableTop = varTop;
1350             sameCollAttribute    = false;
1351         }
1352         if (!sameCollAttribute) {
1353             initialize(strsrch, &status);
1354         }
1355         ucol_setText(strsrch->textIter, strsrch->search->text,
1356                               strsrch->search->textLength,
1357                               &status);
1358         strsrch->search->matchedLength      = 0;
1359         strsrch->search->matchedIndex       = USEARCH_DONE;
1360         strsrch->search->isOverlap          = false;
1361         strsrch->search->isCanonicalMatch   = false;
1362         strsrch->search->elementComparisonType = 0;
1363         strsrch->search->isForwardSearching = true;
1364         strsrch->search->reset              = true;
1365     }
1366 }
1367 
1368 //
1369 //  CEI  Collation Element + source text index.
1370 //       These structs are kept in the circular buffer.
1371 //
1372 struct  CEI {
1373     int64_t ce;
1374     int32_t lowIndex;
1375     int32_t highIndex;
1376 };
1377 
1378 U_NAMESPACE_BEGIN
1379 
1380 namespace {
1381 //
1382 //  CEIBuffer   A circular buffer of CEs-with-index from the text being searched.
1383 //
1384 #define   DEFAULT_CEBUFFER_SIZE 96
1385 #define   CEBUFFER_EXTRA 32
1386 // Some typical max values to make buffer size more reasonable for asymmetric search.
1387 // #8694 is for a better long-term solution to allocation of this buffer.
1388 #define   MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L 8
1389 #define   MAX_TARGET_IGNORABLES_PER_PAT_OTHER 3
1390 #define   MIGHT_BE_JAMO_L(c) ((c >= 0x1100 && c <= 0x115E) || (c >= 0x3131 && c <= 0x314E) || (c >= 0x3165 && c <= 0x3186))
1391 struct CEIBuffer {
1392     CEI                  defBuf[DEFAULT_CEBUFFER_SIZE];
1393     CEI                 *buf;
1394     int32_t              bufSize;
1395     int32_t              firstIx;
1396     int32_t              limitIx;
1397     UCollationElements  *ceIter;
1398     UStringSearch       *strSearch;
1399 
1400 
1401 
1402                CEIBuffer(UStringSearch *ss, UErrorCode *status);
1403                ~CEIBuffer();
1404    const CEI   *get(int32_t index);
1405    const CEI   *getPrevious(int32_t index);
1406 };
1407 
1408 
CEIBuffer(UStringSearch * ss,UErrorCode * status)1409 CEIBuffer::CEIBuffer(UStringSearch *ss, UErrorCode *status) {
1410     buf = defBuf;
1411     strSearch = ss;
1412     bufSize = ss->pattern.pcesLength + CEBUFFER_EXTRA;
1413     if (ss->search->elementComparisonType != 0) {
1414         const char16_t * patText = ss->pattern.text;
1415         if (patText) {
1416             const char16_t * patTextLimit = patText + ss->pattern.textLength;
1417             while ( patText < patTextLimit ) {
1418                 char16_t c = *patText++;
1419                 if (MIGHT_BE_JAMO_L(c)) {
1420                     bufSize += MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L;
1421                 } else {
1422                     // No check for surrogates, we might allocate slightly more buffer than necessary.
1423                     bufSize += MAX_TARGET_IGNORABLES_PER_PAT_OTHER;
1424                 }
1425             }
1426         }
1427     }
1428     ceIter    = ss->textIter;
1429     firstIx = 0;
1430     limitIx = 0;
1431 
1432     if (!initTextProcessedIter(ss, status)) { return; }
1433 
1434     if (bufSize>DEFAULT_CEBUFFER_SIZE) {
1435         buf = (CEI *)uprv_malloc(bufSize * sizeof(CEI));
1436         if (buf == nullptr) {
1437             *status = U_MEMORY_ALLOCATION_ERROR;
1438         }
1439     }
1440 }
1441 
1442 // TODO: add a reset or init function so that allocated
1443 //       buffers can be retained & reused.
1444 
~CEIBuffer()1445 CEIBuffer::~CEIBuffer() {
1446     if (buf != defBuf) {
1447         uprv_free(buf);
1448     }
1449 }
1450 
1451 
1452 // Get the CE with the specified index.
1453 //   Index must be in the range
1454 //          n-history_size < index < n+1
1455 //   where n is the largest index to have been fetched by some previous call to this function.
1456 //   The CE value will be UCOL__PROCESSED_NULLORDER at end of input.
1457 //
get(int32_t index)1458 const CEI *CEIBuffer::get(int32_t index) {
1459     int i = index % bufSize;
1460 
1461     if (index>=firstIx && index<limitIx) {
1462         // The request was for an entry already in our buffer.
1463         //  Just return it.
1464         return &buf[i];
1465     }
1466 
1467     // Caller is requesting a new, never accessed before, CE.
1468     //   Verify that it is the next one in sequence, which is all
1469     //   that is allowed.
1470     if (index != limitIx) {
1471         UPRV_UNREACHABLE_ASSERT;
1472         // TODO: In ICU 64 the above was changed from U_ASSERT to UPRV_UNREACHABLE,
1473         // which unconditionally called abort(). However, there were cases in which it
1474         // was being hit, so it was changed back to U_ASSERT per ICU-20680. In ICU 70,
1475         // we now use the new UPRV_UNREACHABLE_ASSERT to better indicate the situation.
1476         // ICU-20792 tracks the follow-up work/further investigation on this.
1477         return nullptr;
1478     }
1479 
1480     // Manage the circular CE buffer indexing
1481     limitIx++;
1482 
1483     if (limitIx - firstIx >= bufSize) {
1484         // The buffer is full, knock out the lowest-indexed entry.
1485         firstIx++;
1486     }
1487 
1488     UErrorCode status = U_ZERO_ERROR;
1489 
1490     buf[i].ce = strSearch->textProcessedIter->nextProcessed(&buf[i].lowIndex, &buf[i].highIndex, &status);
1491 
1492     return &buf[i];
1493 }
1494 
1495 // Get the CE with the specified index.
1496 //   Index must be in the range
1497 //          n-history_size < index < n+1
1498 //   where n is the largest index to have been fetched by some previous call to this function.
1499 //   The CE value will be UCOL__PROCESSED_NULLORDER at end of input.
1500 //
getPrevious(int32_t index)1501 const CEI *CEIBuffer::getPrevious(int32_t index) {
1502     int i = index % bufSize;
1503 
1504     if (index>=firstIx && index<limitIx) {
1505         // The request was for an entry already in our buffer.
1506         //  Just return it.
1507         return &buf[i];
1508     }
1509 
1510     // Caller is requesting a new, never accessed before, CE.
1511     //   Verify that it is the next one in sequence, which is all
1512     //   that is allowed.
1513     if (index != limitIx) {
1514         UPRV_UNREACHABLE_ASSERT;
1515         // TODO: In ICU 64 the above was changed from U_ASSERT to UPRV_UNREACHABLE,
1516         // which unconditionally called abort(). However, there were cases in which it
1517         // was being hit, so it was changed back to U_ASSERT per ICU-20680. In ICU 70,
1518         // we now use the new UPRV_UNREACHABLE_ASSERT to better indicate the situation.
1519         // ICU-20792 tracks the follow-up work/further investigation on this.
1520         return nullptr;
1521     }
1522 
1523     // Manage the circular CE buffer indexing
1524     limitIx++;
1525 
1526     if (limitIx - firstIx >= bufSize) {
1527         // The buffer is full, knock out the lowest-indexed entry.
1528         firstIx++;
1529     }
1530 
1531     UErrorCode status = U_ZERO_ERROR;
1532 
1533     buf[i].ce = strSearch->textProcessedIter->previousProcessed(&buf[i].lowIndex, &buf[i].highIndex, &status);
1534 
1535     return &buf[i];
1536 }
1537 
1538 }
1539 
1540 U_NAMESPACE_END
1541 
1542 
1543 // #define USEARCH_DEBUG
1544 
1545 #ifdef USEARCH_DEBUG
1546 #include <stdio.h>
1547 #include <stdlib.h>
1548 #endif
1549 
1550 /*
1551  * Find the next break boundary after startIndex. If the UStringSearch object
1552  * has an external break iterator, use that. Otherwise use the internal character
1553  * break iterator.
1554  */
nextBoundaryAfter(UStringSearch * strsrch,int32_t startIndex,UErrorCode & status)1555 static int32_t nextBoundaryAfter(UStringSearch *strsrch, int32_t startIndex, UErrorCode &status) {
1556     if (U_FAILURE(status)) {
1557         return startIndex;
1558     }
1559 #if 0
1560     const char16_t *text = strsrch->search->text;
1561     int32_t textLen   = strsrch->search->textLength;
1562 
1563     U_ASSERT(startIndex>=0);
1564     U_ASSERT(startIndex<=textLen);
1565 
1566     if (startIndex >= textLen) {
1567         return startIndex;
1568     }
1569 
1570     UChar32  c;
1571     int32_t  i = startIndex;
1572     U16_NEXT(text, i, textLen, c);
1573 
1574     // If we are on a control character, stop without looking for combining marks.
1575     //    Control characters do not combine.
1576     int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
1577     if (gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR) {
1578         return i;
1579     }
1580 
1581     // The initial character was not a control, and can thus accept trailing
1582     //   combining characters.  Advance over however many of them there are.
1583     int32_t  indexOfLastCharChecked;
1584     for (;;) {
1585         indexOfLastCharChecked = i;
1586         if (i>=textLen) {
1587             break;
1588         }
1589         U16_NEXT(text, i, textLen, c);
1590         gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
1591         if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) {
1592             break;
1593         }
1594     }
1595     return indexOfLastCharChecked;
1596 #elif !UCONFIG_NO_BREAK_ITERATION
1597     UBreakIterator *breakiterator = getBreakIterator(strsrch, status);
1598     if (U_FAILURE(status)) {
1599         return startIndex;
1600     }
1601 
1602     return ubrk_following(breakiterator, startIndex);
1603 #else
1604     // **** or should we use the original code? ****
1605     return startIndex;
1606 #endif
1607 
1608 }
1609 
1610 /*
1611  * Returns true if index is on a break boundary. If the UStringSearch
1612  * has an external break iterator, test using that, otherwise test
1613  * using the internal character break iterator.
1614  */
isBreakBoundary(UStringSearch * strsrch,int32_t index,UErrorCode & status)1615 static UBool isBreakBoundary(UStringSearch *strsrch, int32_t index, UErrorCode &status) {
1616     if (U_FAILURE(status)) {
1617         return true;
1618     }
1619 #if 0
1620     const char16_t *text = strsrch->search->text;
1621     int32_t textLen   = strsrch->search->textLength;
1622 
1623     U_ASSERT(index>=0);
1624     U_ASSERT(index<=textLen);
1625 
1626     if (index>=textLen || index<=0) {
1627         return true;
1628     }
1629 
1630     // If the character at the current index is not a GRAPHEME_EXTEND
1631     //    then we can not be within a combining sequence.
1632     UChar32  c;
1633     U16_GET(text, 0, index, textLen, c);
1634     int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
1635     if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) {
1636         return true;
1637     }
1638 
1639     // We are at a combining mark.  If the preceding character is anything
1640     //   except a CONTROL, CR or LF, we are in a combining sequence.
1641     U16_PREV(text, 0, index, c);
1642     gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
1643     UBool combining =  !(gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR);
1644     return !combining;
1645 #elif !UCONFIG_NO_BREAK_ITERATION
1646     UBreakIterator *breakiterator = getBreakIterator(strsrch, status);
1647     if (U_FAILURE(status)) {
1648         return true;
1649     }
1650 
1651     return ubrk_isBoundary(breakiterator, index);
1652 #else
1653     // **** or use the original code? ****
1654     return true;
1655 #endif
1656 }
1657 
1658 #if 0
1659 static UBool onBreakBoundaries(const UStringSearch *strsrch, int32_t start, int32_t end, UErrorCode &status)
1660 {
1661     if (U_FAILURE(status)) {
1662         return true;
1663     }
1664 
1665 #if !UCONFIG_NO_BREAK_ITERATION
1666     UBreakIterator *breakiterator = getBreakIterator(strsrch, status);
1667     if (U_SUCCESS(status)) {
1668         int32_t startindex = ubrk_first(breakiterator);
1669         int32_t endindex   = ubrk_last(breakiterator);
1670 
1671         // out-of-range indexes are never boundary positions
1672         if (start < startindex || start > endindex ||
1673             end < startindex || end > endindex) {
1674             return false;
1675         }
1676 
1677         return ubrk_isBoundary(breakiterator, start) &&
1678                ubrk_isBoundary(breakiterator, end);
1679     }
1680 #endif
1681 
1682     return true;
1683 }
1684 #endif
1685 
1686 typedef enum {
1687     U_CE_MATCH = -1,
1688     U_CE_NO_MATCH = 0,
1689     U_CE_SKIP_TARG,
1690     U_CE_SKIP_PATN
1691 } UCompareCEsResult;
1692 #define U_CE_LEVEL2_BASE 0x00000005
1693 #define U_CE_LEVEL3_BASE 0x00050000
1694 
compareCE64s(int64_t targCE,int64_t patCE,int16_t compareType)1695 static UCompareCEsResult compareCE64s(int64_t targCE, int64_t patCE, int16_t compareType) {
1696     if (targCE == patCE) {
1697         return U_CE_MATCH;
1698     }
1699     if (compareType == 0) {
1700         return U_CE_NO_MATCH;
1701     }
1702 
1703     int64_t targCEshifted = targCE >> 32;
1704     int64_t patCEshifted = patCE >> 32;
1705     int64_t mask;
1706 
1707     mask = 0xFFFF0000;
1708     int32_t targLev1 = (int32_t)(targCEshifted & mask);
1709     int32_t patLev1 = (int32_t)(patCEshifted & mask);
1710     if ( targLev1 != patLev1 ) {
1711         if ( targLev1 == 0 ) {
1712             return U_CE_SKIP_TARG;
1713         }
1714         if ( patLev1 == 0 && compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD ) {
1715             return U_CE_SKIP_PATN;
1716         }
1717         return U_CE_NO_MATCH;
1718     }
1719 
1720     mask = 0x0000FFFF;
1721     int32_t targLev2 = (int32_t)(targCEshifted & mask);
1722     int32_t patLev2 = (int32_t)(patCEshifted & mask);
1723     if ( targLev2 != patLev2 ) {
1724         if ( targLev2 == 0 ) {
1725             return U_CE_SKIP_TARG;
1726         }
1727         if ( patLev2 == 0 && compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD ) {
1728             return U_CE_SKIP_PATN;
1729         }
1730         return (patLev2 == U_CE_LEVEL2_BASE || (compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD && targLev2 == U_CE_LEVEL2_BASE) )?
1731             U_CE_MATCH: U_CE_NO_MATCH;
1732     }
1733 
1734     mask = 0xFFFF0000;
1735     int32_t targLev3 = (int32_t)(targCE & mask);
1736     int32_t patLev3 = (int32_t)(patCE & mask);
1737     if ( targLev3 != patLev3 ) {
1738         return (patLev3 == U_CE_LEVEL3_BASE || (compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD && targLev3 == U_CE_LEVEL3_BASE) )?
1739             U_CE_MATCH: U_CE_NO_MATCH;
1740    }
1741 
1742     return U_CE_MATCH;
1743 }
1744 
1745 namespace {
1746 
codePointAt(const USearch & search,int32_t index)1747 UChar32 codePointAt(const USearch &search, int32_t index) {
1748     if (index < search.textLength) {
1749         UChar32 c;
1750         U16_NEXT(search.text, index, search.textLength, c);
1751         return c;
1752     }
1753     return U_SENTINEL;
1754 }
1755 
codePointBefore(const USearch & search,int32_t index)1756 UChar32 codePointBefore(const USearch &search, int32_t index) {
1757     if (0 < index) {
1758         UChar32 c;
1759         U16_PREV(search.text, 0, index, c);
1760         return c;
1761     }
1762     return U_SENTINEL;
1763 }
1764 
1765 }  // namespace
1766 
usearch_search(UStringSearch * strsrch,int32_t startIdx,int32_t * matchStart,int32_t * matchLimit,UErrorCode * status)1767 U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch  *strsrch,
1768                                        int32_t        startIdx,
1769                                        int32_t        *matchStart,
1770                                        int32_t        *matchLimit,
1771                                        UErrorCode     *status)
1772 {
1773     if (U_FAILURE(*status)) {
1774         return false;
1775     }
1776 
1777     // TODO:  reject search patterns beginning with a combining char.
1778 
1779 #ifdef USEARCH_DEBUG
1780     if (getenv("USEARCH_DEBUG") != nullptr) {
1781         printf("Pattern CEs\n");
1782         for (int ii=0; ii<strsrch->pattern.cesLength; ii++) {
1783             printf(" %8x", strsrch->pattern.ces[ii]);
1784         }
1785         printf("\n");
1786     }
1787 
1788 #endif
1789     // Input parameter sanity check.
1790     //  TODO:  should input indices clip to the text length
1791     //         in the same way that UText does.
1792     if(strsrch->pattern.cesLength == 0         ||
1793        startIdx < 0                           ||
1794        startIdx > strsrch->search->textLength ||
1795        strsrch->pattern.ces == nullptr) {
1796            *status = U_ILLEGAL_ARGUMENT_ERROR;
1797            return false;
1798     }
1799 
1800     if (strsrch->pattern.pces == nullptr) {
1801         initializePatternPCETable(strsrch, status);
1802     }
1803 
1804     ucol_setOffset(strsrch->textIter, startIdx, status);
1805     CEIBuffer ceb(strsrch, status);
1806 
1807     // An out-of-memory (OOM) failure can occur in the initializePatternPCETable function
1808     // or CEIBuffer constructor above, so we need to check the status.
1809     if (U_FAILURE(*status)) {
1810         return false;
1811     }
1812 
1813     int32_t    targetIx = 0;
1814     const CEI *targetCEI = nullptr;
1815     int32_t    patIx;
1816     UBool      found;
1817 
1818     int32_t  mStart = -1;
1819     int32_t  mLimit = -1;
1820     int32_t  minLimit;
1821     int32_t  maxLimit;
1822 
1823 
1824 
1825     // Outer loop moves over match starting positions in the
1826     //      target CE space.
1827     // Here we see the target as a sequence of collation elements, resulting from the following:
1828     // 1. Target characters were decomposed, and (if appropriate) other compressions and expansions are applied
1829     //    (for example, digraphs such as IJ may be broken into two characters).
1830     // 2. An int64_t CE weight is determined for each resulting unit (high 16 bits are primary strength, next
1831     //    16 bits are secondary, next 16 (the high 16 bits of the low 32-bit half) are tertiary. Any of these
1832     //    fields that are for strengths below that of the collator are set to 0. If this makes the int64_t
1833     //    CE weight 0 (as for a combining diacritic with secondary weight when the collator strength is primary),
1834     //    then the CE is deleted, so the following code sees only CEs that are relevant.
1835     // For each CE, the lowIndex and highIndex correspond to where this CE begins and ends in the original text.
1836     // If lowIndex==highIndex, either the CE resulted from an expansion/decomposition of one of the original text
1837     // characters, or the CE marks the limit of the target text (in which case the CE weight is UCOL_PROCESSED_NULLORDER).
1838     //
1839     for(targetIx=0; ; targetIx++)
1840     {
1841         found = true;
1842         //  Inner loop checks for a match beginning at each
1843         //  position from the outer loop.
1844         int32_t targetIxOffset = 0;
1845         int64_t patCE = 0;
1846         // For targetIx > 0, this ceb.get gets a CE that is as far back in the ring buffer
1847         // (compared to the last CE fetched for the previous targetIx value) as we need to go
1848         // for this targetIx value, so if it is non-nullptr then other ceb.get calls should be OK.
1849         const CEI *firstCEI = ceb.get(targetIx);
1850         if (firstCEI == nullptr) {
1851             *status = U_INTERNAL_PROGRAM_ERROR;
1852             found = false;
1853             break;
1854         }
1855 
1856         for (patIx=0; patIx<strsrch->pattern.pcesLength; patIx++) {
1857             patCE = strsrch->pattern.pces[patIx];
1858             targetCEI = ceb.get(targetIx+patIx+targetIxOffset);
1859             //  Compare CE from target string with CE from the pattern.
1860             //    Note that the target CE will be UCOL_PROCESSED_NULLORDER if we reach the end of input,
1861             //    which will fail the compare, below.
1862             UCompareCEsResult ceMatch = compareCE64s(targetCEI->ce, patCE, strsrch->search->elementComparisonType);
1863             if ( ceMatch == U_CE_NO_MATCH ) {
1864                 found = false;
1865                 break;
1866             } else if ( ceMatch > U_CE_NO_MATCH ) {
1867                 if ( ceMatch == U_CE_SKIP_TARG ) {
1868                     // redo with same patCE, next targCE
1869                     patIx--;
1870                     targetIxOffset++;
1871                 } else { // ceMatch == U_CE_SKIP_PATN
1872                     // redo with same targCE, next patCE
1873                     targetIxOffset--;
1874                 }
1875             }
1876         }
1877         targetIxOffset += strsrch->pattern.pcesLength; // this is now the offset in target CE space to end of the match so far
1878 
1879         if (!found && ((targetCEI == nullptr) || (targetCEI->ce != UCOL_PROCESSED_NULLORDER))) {
1880             // No match at this targetIx.  Try again at the next.
1881             continue;
1882         }
1883 
1884         if (!found) {
1885             // No match at all, we have run off the end of the target text.
1886             break;
1887         }
1888 
1889 
1890         // We have found a match in CE space.
1891         // Now determine the bounds in string index space.
1892         //  There still is a chance of match failure if the CE range not correspond to
1893         //     an acceptable character range.
1894         //
1895         const CEI *lastCEI  = ceb.get(targetIx + targetIxOffset - 1);
1896 
1897         mStart   = firstCEI->lowIndex;
1898         minLimit = lastCEI->lowIndex;
1899 
1900         // Look at the CE following the match.  If it is UCOL_NULLORDER the match
1901         //   extended to the end of input, and the match is good.
1902 
1903         // Look at the high and low indices of the CE following the match. If
1904         // they are the same it means one of two things:
1905         //    1. The match extended to the last CE from the target text, which is OK, or
1906         //    2. The last CE that was part of the match is in an expansion that extends
1907         //       to the first CE after the match. In this case, we reject the match.
1908         const CEI *nextCEI = 0;
1909         if (strsrch->search->elementComparisonType == 0) {
1910             nextCEI  = ceb.get(targetIx + targetIxOffset);
1911             maxLimit = nextCEI->lowIndex;
1912             if (nextCEI->lowIndex == nextCEI->highIndex && nextCEI->ce != UCOL_PROCESSED_NULLORDER) {
1913                 found = false;
1914             }
1915         } else {
1916             for ( ; ; ++targetIxOffset ) {
1917                 nextCEI = ceb.get(targetIx + targetIxOffset);
1918                 maxLimit = nextCEI->lowIndex;
1919                 // If we are at the end of the target too, match succeeds
1920                 if (  nextCEI->ce == UCOL_PROCESSED_NULLORDER ) {
1921                     break;
1922                 }
1923                 // As long as the next CE has primary weight of 0,
1924                 // it is part of the last target element matched by the pattern;
1925                 // make sure it can be part of a match with the last patCE
1926                 if ( (((nextCEI->ce) >> 32) & 0xFFFF0000UL) == 0 ) {
1927                     UCompareCEsResult ceMatch = compareCE64s(nextCEI->ce, patCE, strsrch->search->elementComparisonType);
1928                     if ( ceMatch == U_CE_NO_MATCH || ceMatch == U_CE_SKIP_PATN ) {
1929                         found = false;
1930                         break;
1931                     }
1932                 // If lowIndex == highIndex, this target CE is part of an expansion of the last matched
1933                 // target element, but it has non-zero primary weight => match fails
1934                 } else if ( nextCEI->lowIndex == nextCEI->highIndex ) {
1935                     found = false;
1936                     break;
1937                 // Else the target CE is not part of an expansion of the last matched element, match succeeds
1938                 } else {
1939                     break;
1940                 }
1941             }
1942         }
1943 
1944 
1945         // Check for the start of the match being within a combining sequence.
1946         //   This can happen if the pattern itself begins with a combining char, and
1947         //   the match found combining marks in the target text that were attached
1948         //    to something else.
1949         //   This type of match should be rejected for not completely consuming a
1950         //   combining sequence.
1951         if (!isBreakBoundary(strsrch, mStart, *status)) {
1952             found = false;
1953         }
1954         if (U_FAILURE(*status)) {
1955             break;
1956         }
1957 
1958         // Check for the start of the match being within an Collation Element Expansion,
1959         //   meaning that the first char of the match is only partially matched.
1960         //   With expansions, the first CE will report the index of the source
1961         //   character, and all subsequent (expansions) CEs will report the source index of the
1962         //    _following_ character.
1963         int32_t secondIx = firstCEI->highIndex;
1964         if (mStart == secondIx) {
1965             found = false;
1966         }
1967 
1968         // Allow matches to end in the middle of a grapheme cluster if the following
1969         // conditions are met; this is needed to make prefix search work properly in
1970         // Indic, see #11750
1971         // * the default breakIter is being used
1972         // * the next collation element after this combining sequence
1973         //   - has non-zero primary weight
1974         //   - corresponds to a separate character following the one at end of the current match
1975         //   (the second of these conditions, and perhaps both, may be redundant given the
1976         //   subsequent check for normalization boundary; however they are likely much faster
1977         //   tests in any case)
1978         // * the match limit is a normalization boundary
1979         UBool allowMidclusterMatch = false;
1980         if (strsrch->search->text != nullptr && strsrch->search->textLength > maxLimit) {
1981             allowMidclusterMatch =
1982                     strsrch->search->breakIter == nullptr &&
1983                     nextCEI != nullptr && (((nextCEI->ce) >> 32) & 0xFFFF0000UL) != 0 &&
1984                     maxLimit >= lastCEI->highIndex && nextCEI->highIndex > maxLimit &&
1985                     (strsrch->nfd->hasBoundaryBefore(codePointAt(*strsrch->search, maxLimit)) ||
1986                         strsrch->nfd->hasBoundaryAfter(codePointBefore(*strsrch->search, maxLimit)));
1987         }
1988         // If those conditions are met, then:
1989         // * do NOT advance the candidate match limit (mLimit) to a break boundary; however
1990         //   the match limit may be backed off to a previous break boundary. This handles
1991         //   cases in which mLimit includes target characters that are ignorable with current
1992         //   settings (such as space) and which extend beyond the pattern match.
1993         // * do NOT require that end of the combining sequence not extend beyond the match in CE space
1994         // * do NOT require that match limit be on a breakIter boundary
1995 
1996         //  Advance the match end position to the first acceptable match boundary.
1997         //    This advances the index over any combining characters.
1998         mLimit = maxLimit;
1999         if (minLimit < maxLimit) {
2000             // When the last CE's low index is same with its high index, the CE is likely
2001             // a part of expansion. In this case, the index is located just after the
2002             // character corresponding to the CEs compared above. If the index is right
2003             // at the break boundary, move the position to the next boundary will result
2004             // incorrect match length when there are ignorable characters exist between
2005             // the position and the next character produces CE(s). See ticket#8482.
2006             if (minLimit == lastCEI->highIndex && isBreakBoundary(strsrch, minLimit, *status)) {
2007                 mLimit = minLimit;
2008             } else {
2009                 int32_t nba = nextBoundaryAfter(strsrch, minLimit, *status);
2010                 // Note that we can have nba < maxLimit && nba >= minLImit, in which
2011                 // case we want to set mLimit to nba regardless of allowMidclusterMatch
2012                 // (i.e. we back off mLimit to the previous breakIterator boundary).
2013                 if (nba >= lastCEI->highIndex && (!allowMidclusterMatch || nba < maxLimit)) {
2014                     mLimit = nba;
2015                 }
2016             }
2017         }
2018 
2019         if (U_FAILURE(*status)) {
2020             break;
2021         }
2022 
2023     #ifdef USEARCH_DEBUG
2024         if (getenv("USEARCH_DEBUG") != nullptr) {
2025             printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit, maxLimit, mLimit);
2026         }
2027     #endif
2028 
2029         if (!allowMidclusterMatch) {
2030             // If advancing to the end of a combining sequence in character indexing space
2031             //   advanced us beyond the end of the match in CE space, reject this match.
2032             if (mLimit > maxLimit) {
2033                 found = false;
2034             }
2035 
2036             if (!isBreakBoundary(strsrch, mLimit, *status)) {
2037                 found = false;
2038             }
2039             if (U_FAILURE(*status)) {
2040                 break;
2041             }
2042         }
2043 
2044         if (! checkIdentical(strsrch, mStart, mLimit)) {
2045             found = false;
2046         }
2047 
2048         if (found) {
2049             break;
2050         }
2051     }
2052 
2053     #ifdef USEARCH_DEBUG
2054     if (getenv("USEARCH_DEBUG") != nullptr) {
2055         printf("Target CEs [%d .. %d]\n", ceb.firstIx, ceb.limitIx);
2056         int32_t  lastToPrint = ceb.limitIx+2;
2057         for (int ii=ceb.firstIx; ii<lastToPrint; ii++) {
2058             printf("%8x@%d ", ceb.get(ii)->ce, ceb.get(ii)->srcIndex);
2059         }
2060         printf("\n%s\n", found? "match found" : "no match");
2061     }
2062     #endif
2063 
2064     // All Done.  Store back the match bounds to the caller.
2065     //
2066 
2067     if (U_FAILURE(*status)) {
2068         found = false; // No match if a failure occured.
2069     }
2070 
2071     if (found==false) {
2072         mLimit = -1;
2073         mStart = -1;
2074     }
2075 
2076     if (matchStart != nullptr) {
2077         *matchStart= mStart;
2078     }
2079 
2080     if (matchLimit != nullptr) {
2081         *matchLimit = mLimit;
2082     }
2083 
2084     return found;
2085 }
2086 
usearch_searchBackwards(UStringSearch * strsrch,int32_t startIdx,int32_t * matchStart,int32_t * matchLimit,UErrorCode * status)2087 U_CAPI UBool U_EXPORT2 usearch_searchBackwards(UStringSearch  *strsrch,
2088                                                 int32_t        startIdx,
2089                                                 int32_t        *matchStart,
2090                                                 int32_t        *matchLimit,
2091                                                 UErrorCode     *status)
2092 {
2093     if (U_FAILURE(*status)) {
2094         return false;
2095     }
2096 
2097     // TODO:  reject search patterns beginning with a combining char.
2098 
2099 #ifdef USEARCH_DEBUG
2100     if (getenv("USEARCH_DEBUG") != nullptr) {
2101         printf("Pattern CEs\n");
2102         for (int ii=0; ii<strsrch->pattern.cesLength; ii++) {
2103             printf(" %8x", strsrch->pattern.ces[ii]);
2104         }
2105         printf("\n");
2106     }
2107 
2108 #endif
2109     // Input parameter sanity check.
2110     //  TODO:  should input indices clip to the text length
2111     //         in the same way that UText does.
2112     if(strsrch->pattern.cesLength == 0        ||
2113        startIdx < 0                           ||
2114        startIdx > strsrch->search->textLength ||
2115        strsrch->pattern.ces == nullptr) {
2116            *status = U_ILLEGAL_ARGUMENT_ERROR;
2117            return false;
2118     }
2119 
2120     if (strsrch->pattern.pces == nullptr) {
2121         initializePatternPCETable(strsrch, status);
2122     }
2123 
2124     CEIBuffer ceb(strsrch, status);
2125     int32_t    targetIx = 0;
2126 
2127     /*
2128      * Pre-load the buffer with the CE's for the grapheme
2129      * after our starting position so that we're sure that
2130      * we can look at the CE following the match when we
2131      * check the match boundaries.
2132      *
2133      * This will also pre-fetch the first CE that we'll
2134      * consider for the match.
2135      */
2136     if (startIdx < strsrch->search->textLength) {
2137         UBreakIterator *breakiterator = getBreakIterator(strsrch, *status);
2138         if (U_FAILURE(*status)) {
2139             return false;
2140         }
2141         int32_t next = ubrk_following(breakiterator, startIdx);
2142 
2143         ucol_setOffset(strsrch->textIter, next, status);
2144 
2145         for (targetIx = 0; ; targetIx += 1) {
2146             if (ceb.getPrevious(targetIx)->lowIndex < startIdx) {
2147                 break;
2148             }
2149         }
2150     } else {
2151         ucol_setOffset(strsrch->textIter, startIdx, status);
2152     }
2153 
2154     // An out-of-memory (OOM) failure can occur above, so we need to check the status.
2155     if (U_FAILURE(*status)) {
2156         return false;
2157     }
2158 
2159     const CEI *targetCEI = nullptr;
2160     int32_t    patIx;
2161     UBool      found;
2162 
2163     int32_t  limitIx = targetIx;
2164     int32_t  mStart = -1;
2165     int32_t  mLimit = -1;
2166     int32_t  minLimit;
2167     int32_t  maxLimit;
2168 
2169 
2170 
2171     // Outer loop moves over match starting positions in the
2172     //      target CE space.
2173     // Here, targetIx values increase toward the beginning of the base text (i.e. we get the text CEs in reverse order).
2174     // But  patIx is 0 at the beginning of the pattern and increases toward the end.
2175     // So this loop performs a comparison starting with the end of pattern, and prcessd toward the beginning of the pattern
2176     // and the beginning of the base text.
2177     for(targetIx = limitIx; ; targetIx += 1)
2178     {
2179         found = true;
2180         // For targetIx > limitIx, this ceb.getPrevious gets a CE that is as far back in the ring buffer
2181         // (compared to the last CE fetched for the previous targetIx value) as we need to go
2182         // for this targetIx value, so if it is non-nullptr then other ceb.getPrevious calls should be OK.
2183         const CEI *lastCEI  = ceb.getPrevious(targetIx);
2184         if (lastCEI == nullptr) {
2185             *status = U_INTERNAL_PROGRAM_ERROR;
2186             found = false;
2187              break;
2188         }
2189         //  Inner loop checks for a match beginning at each
2190         //  position from the outer loop.
2191         int32_t targetIxOffset = 0;
2192         for (patIx = strsrch->pattern.pcesLength - 1; patIx >= 0; patIx -= 1) {
2193             int64_t patCE = strsrch->pattern.pces[patIx];
2194 
2195             targetCEI = ceb.getPrevious(targetIx + strsrch->pattern.pcesLength - 1 - patIx + targetIxOffset);
2196             //  Compare CE from target string with CE from the pattern.
2197             //    Note that the target CE will be UCOL_NULLORDER if we reach the end of input,
2198             //    which will fail the compare, below.
2199             UCompareCEsResult ceMatch = compareCE64s(targetCEI->ce, patCE, strsrch->search->elementComparisonType);
2200             if ( ceMatch == U_CE_NO_MATCH ) {
2201                 found = false;
2202                 break;
2203             } else if ( ceMatch > U_CE_NO_MATCH ) {
2204                 if ( ceMatch == U_CE_SKIP_TARG ) {
2205                     // redo with same patCE, next targCE
2206                     patIx++;
2207                     targetIxOffset++;
2208                 } else { // ceMatch == U_CE_SKIP_PATN
2209                     // redo with same targCE, next patCE
2210                     targetIxOffset--;
2211                 }
2212             }
2213         }
2214 
2215         if (!found && ((targetCEI == nullptr) || (targetCEI->ce != UCOL_PROCESSED_NULLORDER))) {
2216             // No match at this targetIx.  Try again at the next.
2217             continue;
2218         }
2219 
2220         if (!found) {
2221             // No match at all, we have run off the end of the target text.
2222             break;
2223         }
2224 
2225 
2226         // We have found a match in CE space.
2227         // Now determine the bounds in string index space.
2228         //  There still is a chance of match failure if the CE range not correspond to
2229         //     an acceptable character range.
2230         //
2231         const CEI *firstCEI = ceb.getPrevious(targetIx + strsrch->pattern.pcesLength - 1 + targetIxOffset);
2232         mStart   = firstCEI->lowIndex;
2233 
2234         // Check for the start of the match being within a combining sequence.
2235         //   This can happen if the pattern itself begins with a combining char, and
2236         //   the match found combining marks in the target text that were attached
2237         //    to something else.
2238         //   This type of match should be rejected for not completely consuming a
2239         //   combining sequence.
2240         if (!isBreakBoundary(strsrch, mStart, *status)) {
2241             found = false;
2242         }
2243         if (U_FAILURE(*status)) {
2244             break;
2245         }
2246 
2247         // Look at the high index of the first CE in the match. If it's the same as the
2248         // low index, the first CE in the match is in the middle of an expansion.
2249         if (mStart == firstCEI->highIndex) {
2250             found = false;
2251         }
2252 
2253 
2254         minLimit = lastCEI->lowIndex;
2255 
2256         if (targetIx > 0) {
2257             // Look at the CE following the match.  If it is UCOL_NULLORDER the match
2258             //   extended to the end of input, and the match is good.
2259 
2260             // Look at the high and low indices of the CE following the match. If
2261             // they are the same it means one of two things:
2262             //    1. The match extended to the last CE from the target text, which is OK, or
2263             //    2. The last CE that was part of the match is in an expansion that extends
2264             //       to the first CE after the match. In this case, we reject the match.
2265             const CEI *nextCEI  = ceb.getPrevious(targetIx - 1);
2266 
2267             if (nextCEI->lowIndex == nextCEI->highIndex && nextCEI->ce != UCOL_PROCESSED_NULLORDER) {
2268                 found = false;
2269             }
2270 
2271             mLimit = maxLimit = nextCEI->lowIndex;
2272 
2273             // Allow matches to end in the middle of a grapheme cluster if the following
2274             // conditions are met; this is needed to make prefix search work properly in
2275             // Indic, see #11750
2276             // * the default breakIter is being used
2277             // * the next collation element after this combining sequence
2278             //   - has non-zero primary weight
2279             //   - corresponds to a separate character following the one at end of the current match
2280             //   (the second of these conditions, and perhaps both, may be redundant given the
2281             //   subsequent check for normalization boundary; however they are likely much faster
2282             //   tests in any case)
2283             // * the match limit is a normalization boundary
2284             UBool allowMidclusterMatch = false;
2285             if (strsrch->search->text != nullptr && strsrch->search->textLength > maxLimit) {
2286                 allowMidclusterMatch =
2287                         strsrch->search->breakIter == nullptr &&
2288                         nextCEI != nullptr && (((nextCEI->ce) >> 32) & 0xFFFF0000UL) != 0 &&
2289                         maxLimit >= lastCEI->highIndex && nextCEI->highIndex > maxLimit &&
2290                         (strsrch->nfd->hasBoundaryBefore(codePointAt(*strsrch->search, maxLimit)) ||
2291                             strsrch->nfd->hasBoundaryAfter(codePointBefore(*strsrch->search, maxLimit)));
2292             }
2293             // If those conditions are met, then:
2294             // * do NOT advance the candidate match limit (mLimit) to a break boundary; however
2295             //   the match limit may be backed off to a previous break boundary. This handles
2296             //   cases in which mLimit includes target characters that are ignorable with current
2297             //   settings (such as space) and which extend beyond the pattern match.
2298             // * do NOT require that end of the combining sequence not extend beyond the match in CE space
2299             // * do NOT require that match limit be on a breakIter boundary
2300 
2301             //  Advance the match end position to the first acceptable match boundary.
2302             //    This advances the index over any combining characters.
2303             if (minLimit < maxLimit) {
2304                 int32_t nba = nextBoundaryAfter(strsrch, minLimit, *status);
2305                 // Note that we can have nba < maxLimit && nba >= minLImit, in which
2306                 // case we want to set mLimit to nba regardless of allowMidclusterMatch
2307                 // (i.e. we back off mLimit to the previous breakIterator boundary).
2308                 if (nba >= lastCEI->highIndex && (!allowMidclusterMatch || nba < maxLimit)) {
2309                     mLimit = nba;
2310                 }
2311             }
2312 
2313             if (!allowMidclusterMatch) {
2314                 // If advancing to the end of a combining sequence in character indexing space
2315                 //   advanced us beyond the end of the match in CE space, reject this match.
2316                 if (mLimit > maxLimit) {
2317                     found = false;
2318                 }
2319 
2320                 // Make sure the end of the match is on a break boundary
2321                 if (!isBreakBoundary(strsrch, mLimit, *status)) {
2322                     found = false;
2323                 }
2324                 if (U_FAILURE(*status)) {
2325                     break;
2326                 }
2327             }
2328 
2329         } else {
2330             // No non-ignorable CEs after this point.
2331             // The maximum position is detected by boundary after
2332             // the last non-ignorable CE. Combining sequence
2333             // across the start index will be truncated.
2334             int32_t nba = nextBoundaryAfter(strsrch, minLimit, *status);
2335             mLimit = maxLimit = (nba > 0) && (startIdx > nba) ? nba : startIdx;
2336         }
2337 
2338     #ifdef USEARCH_DEBUG
2339         if (getenv("USEARCH_DEBUG") != nullptr) {
2340             printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit, maxLimit, mLimit);
2341         }
2342     #endif
2343 
2344 
2345         if (! checkIdentical(strsrch, mStart, mLimit)) {
2346             found = false;
2347         }
2348 
2349         if (found) {
2350             break;
2351         }
2352     }
2353 
2354     #ifdef USEARCH_DEBUG
2355     if (getenv("USEARCH_DEBUG") != nullptr) {
2356         printf("Target CEs [%d .. %d]\n", ceb.firstIx, ceb.limitIx);
2357         int32_t  lastToPrint = ceb.limitIx+2;
2358         for (int ii=ceb.firstIx; ii<lastToPrint; ii++) {
2359             printf("%8x@%d ", ceb.get(ii)->ce, ceb.get(ii)->srcIndex);
2360         }
2361         printf("\n%s\n", found? "match found" : "no match");
2362     }
2363     #endif
2364 
2365     // All Done.  Store back the match bounds to the caller.
2366     //
2367 
2368     if (U_FAILURE(*status)) {
2369         found = false; // No match if a failure occured.
2370     }
2371 
2372     if (found==false) {
2373         mLimit = -1;
2374         mStart = -1;
2375     }
2376 
2377     if (matchStart != nullptr) {
2378         *matchStart= mStart;
2379     }
2380 
2381     if (matchLimit != nullptr) {
2382         *matchLimit = mLimit;
2383     }
2384 
2385     return found;
2386 }
2387 
2388 // internal use methods declared in usrchimp.h -----------------------------
2389 
usearch_handleNextExact(UStringSearch * strsrch,UErrorCode * status)2390 UBool usearch_handleNextExact(UStringSearch *strsrch, UErrorCode *status)
2391 {
2392     if (U_FAILURE(*status)) {
2393         setMatchNotFound(strsrch, *status);
2394         return false;
2395     }
2396 
2397     int32_t textOffset = ucol_getOffset(strsrch->textIter);
2398     int32_t start = -1;
2399     int32_t end = -1;
2400 
2401     if (usearch_search(strsrch, textOffset, &start, &end, status)) {
2402         strsrch->search->matchedIndex  = start;
2403         strsrch->search->matchedLength = end - start;
2404         return true;
2405     } else {
2406         setMatchNotFound(strsrch, *status);
2407         return false;
2408     }
2409 }
2410 
usearch_handleNextCanonical(UStringSearch * strsrch,UErrorCode * status)2411 UBool usearch_handleNextCanonical(UStringSearch *strsrch, UErrorCode *status)
2412 {
2413     if (U_FAILURE(*status)) {
2414         setMatchNotFound(strsrch, *status);
2415         return false;
2416     }
2417 
2418     int32_t textOffset = ucol_getOffset(strsrch->textIter);
2419     int32_t start = -1;
2420     int32_t end = -1;
2421 
2422     if (usearch_search(strsrch, textOffset, &start, &end, status)) {
2423         strsrch->search->matchedIndex  = start;
2424         strsrch->search->matchedLength = end - start;
2425         return true;
2426     } else {
2427         setMatchNotFound(strsrch, *status);
2428         return false;
2429     }
2430 }
2431 
usearch_handlePreviousExact(UStringSearch * strsrch,UErrorCode * status)2432 UBool usearch_handlePreviousExact(UStringSearch *strsrch, UErrorCode *status)
2433 {
2434     if (U_FAILURE(*status)) {
2435         setMatchNotFound(strsrch, *status);
2436         return false;
2437     }
2438 
2439     int32_t textOffset;
2440 
2441     if (strsrch->search->isOverlap) {
2442         if (strsrch->search->matchedIndex != USEARCH_DONE) {
2443             textOffset = strsrch->search->matchedIndex + strsrch->search->matchedLength - 1;
2444         } else {
2445             // move the start position at the end of possible match
2446             initializePatternPCETable(strsrch, status);
2447             if (!initTextProcessedIter(strsrch, status)) {
2448                 setMatchNotFound(strsrch, *status);
2449                 return false;
2450             }
2451             for (int32_t nPCEs = 0; nPCEs < strsrch->pattern.pcesLength - 1; nPCEs++) {
2452                 int64_t pce = strsrch->textProcessedIter->nextProcessed(nullptr, nullptr, status);
2453                 if (pce == UCOL_PROCESSED_NULLORDER) {
2454                     // at the end of the text
2455                     break;
2456                 }
2457             }
2458             if (U_FAILURE(*status)) {
2459                 setMatchNotFound(strsrch, *status);
2460                 return false;
2461             }
2462             textOffset = ucol_getOffset(strsrch->textIter);
2463         }
2464     } else {
2465         textOffset = ucol_getOffset(strsrch->textIter);
2466     }
2467 
2468     int32_t start = -1;
2469     int32_t end = -1;
2470 
2471     if (usearch_searchBackwards(strsrch, textOffset, &start, &end, status)) {
2472         strsrch->search->matchedIndex = start;
2473         strsrch->search->matchedLength = end - start;
2474         return true;
2475     } else {
2476         setMatchNotFound(strsrch, *status);
2477         return false;
2478     }
2479 }
2480 
usearch_handlePreviousCanonical(UStringSearch * strsrch,UErrorCode * status)2481 UBool usearch_handlePreviousCanonical(UStringSearch *strsrch,
2482                                       UErrorCode    *status)
2483 {
2484     if (U_FAILURE(*status)) {
2485         setMatchNotFound(strsrch, *status);
2486         return false;
2487     }
2488 
2489     int32_t textOffset;
2490 
2491     if (strsrch->search->isOverlap) {
2492         if (strsrch->search->matchedIndex != USEARCH_DONE) {
2493             textOffset = strsrch->search->matchedIndex + strsrch->search->matchedLength - 1;
2494         } else {
2495             // move the start position at the end of possible match
2496             initializePatternPCETable(strsrch, status);
2497             if (!initTextProcessedIter(strsrch, status)) {
2498                 setMatchNotFound(strsrch, *status);
2499                 return false;
2500             }
2501             for (int32_t nPCEs = 0; nPCEs < strsrch->pattern.pcesLength - 1; nPCEs++) {
2502                 int64_t pce = strsrch->textProcessedIter->nextProcessed(nullptr, nullptr, status);
2503                 if (pce == UCOL_PROCESSED_NULLORDER) {
2504                     // at the end of the text
2505                     break;
2506                 }
2507             }
2508             if (U_FAILURE(*status)) {
2509                 setMatchNotFound(strsrch, *status);
2510                 return false;
2511             }
2512             textOffset = ucol_getOffset(strsrch->textIter);
2513         }
2514     } else {
2515         textOffset = ucol_getOffset(strsrch->textIter);
2516     }
2517 
2518     int32_t start = -1;
2519     int32_t end = -1;
2520 
2521     if (usearch_searchBackwards(strsrch, textOffset, &start, &end, status)) {
2522         strsrch->search->matchedIndex = start;
2523         strsrch->search->matchedLength = end - start;
2524         return true;
2525     } else {
2526         setMatchNotFound(strsrch, *status);
2527         return false;
2528     }
2529 }
2530 
2531 #endif /* #if !UCONFIG_NO_COLLATION */
2532