1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 * Copyright (C) 2001-2015 IBM and others. All rights reserved.
6 **********************************************************************
7 * Date Name Description
8 * 07/02/2001 synwee Creation.
9 **********************************************************************
10 */
11
12 #include "unicode/utypes.h"
13
14 #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION
15
16 #include "unicode/usearch.h"
17 #include "unicode/ustring.h"
18 #include "unicode/uchar.h"
19 #include "unicode/utf16.h"
20 #include "normalizer2impl.h"
21 #include "usrchimp.h"
22 #include "cmemory.h"
23 #include "ucln_in.h"
24 #include "uassert.h"
25 #include "ustr_imp.h"
26
27 U_NAMESPACE_USE
28
29 // don't use Boyer-Moore
30 // (and if we decide to turn this on again there are several new TODOs that will need to be addressed)
31 #define BOYER_MOORE 0
32
33 // internal definition ---------------------------------------------------
34
35 #define LAST_BYTE_MASK_ 0xFF
36 #define SECOND_LAST_BYTE_SHIFT_ 8
37 #define SUPPLEMENTARY_MIN_VALUE_ 0x10000
38
39 static const Normalizer2Impl *g_nfcImpl = NULL;
40
41 // internal methods -------------------------------------------------
42
43 /**
44 * Fast collation element iterator setOffset.
45 * This function does not check for bounds.
46 * @param coleiter collation element iterator
47 * @param offset to set
48 */
49 static
setColEIterOffset(UCollationElements * elems,int32_t offset)50 inline void setColEIterOffset(UCollationElements *elems,
51 int32_t offset)
52 {
53 // Note: Not "fast" any more after the 2013 collation rewrite.
54 // We do not want to expose more internals than necessary.
55 UErrorCode status = U_ZERO_ERROR;
56 ucol_setOffset(elems, offset, &status);
57 }
58
59 /**
60 * Getting the mask for collation strength
61 * @param strength collation strength
62 * @return collation element mask
63 */
64 static
getMask(UCollationStrength strength)65 inline uint32_t getMask(UCollationStrength strength)
66 {
67 switch (strength)
68 {
69 case UCOL_PRIMARY:
70 return UCOL_PRIMARYORDERMASK;
71 case UCOL_SECONDARY:
72 return UCOL_SECONDARYORDERMASK | UCOL_PRIMARYORDERMASK;
73 default:
74 return UCOL_TERTIARYORDERMASK | UCOL_SECONDARYORDERMASK |
75 UCOL_PRIMARYORDERMASK;
76 }
77 }
78
79 /**
80 * @param ce 32-bit collation element
81 * @return hash code
82 */
83 static
hashFromCE32(uint32_t ce)84 inline int hashFromCE32(uint32_t ce)
85 {
86 int hc = (int)(
87 ((((((ce >> 24) * 37) +
88 (ce >> 16)) * 37) +
89 (ce >> 8)) * 37) +
90 ce);
91 hc %= MAX_TABLE_SIZE_;
92 if (hc < 0) {
93 hc += MAX_TABLE_SIZE_;
94 }
95 return hc;
96 }
97
98 U_CDECL_BEGIN
99 static UBool U_CALLCONV
usearch_cleanup(void)100 usearch_cleanup(void) {
101 g_nfcImpl = NULL;
102 return TRUE;
103 }
104 U_CDECL_END
105
106 /**
107 * Initializing the fcd tables.
108 * Internal method, status assumed to be a success.
109 * @param status output error if any, caller to check status before calling
110 * method, status assumed to be success when passed in.
111 */
112 static
initializeFCD(UErrorCode * status)113 inline void initializeFCD(UErrorCode *status)
114 {
115 if (g_nfcImpl == NULL) {
116 g_nfcImpl = Normalizer2Factory::getNFCImpl(*status);
117 ucln_i18n_registerCleanup(UCLN_I18N_USEARCH, usearch_cleanup);
118 }
119 }
120
121 /**
122 * Gets the fcd value for a character at the argument index.
123 * This method takes into accounts of the supplementary characters.
124 * @param str UTF16 string where character for fcd retrieval resides
125 * @param offset position of the character whose fcd is to be retrieved, to be
126 * overwritten with the next character position, taking
127 * surrogate characters into consideration.
128 * @param strlength length of the argument string
129 * @return fcd value
130 */
131 static
getFCD(const UChar * str,int32_t * offset,int32_t strlength)132 uint16_t getFCD(const UChar *str, int32_t *offset,
133 int32_t strlength)
134 {
135 const UChar *temp = str + *offset;
136 uint16_t result = g_nfcImpl->nextFCD16(temp, str + strlength);
137 *offset = (int32_t)(temp - str);
138 return result;
139 }
140
141 /**
142 * Getting the modified collation elements taking into account the collation
143 * attributes
144 * @param strsrch string search data
145 * @param sourcece
146 * @return the modified collation element
147 */
148 static
getCE(const UStringSearch * strsrch,uint32_t sourcece)149 inline int32_t getCE(const UStringSearch *strsrch, uint32_t sourcece)
150 {
151 // note for tertiary we can't use the collator->tertiaryMask, that
152 // is a preprocessed mask that takes into account case options. since
153 // we are only concerned with exact matches, we don't need that.
154 sourcece &= strsrch->ceMask;
155
156 if (strsrch->toShift) {
157 // alternate handling here, since only the 16 most significant digits
158 // is only used, we can safely do a compare without masking
159 // if the ce is a variable, we mask and get only the primary values
160 // no shifting to quartenary is required since all primary values
161 // less than variabletop will need to be masked off anyway.
162 if (strsrch->variableTop > sourcece) {
163 if (strsrch->strength >= UCOL_QUATERNARY) {
164 sourcece &= UCOL_PRIMARYORDERMASK;
165 }
166 else {
167 sourcece = UCOL_IGNORABLE;
168 }
169 }
170 } else if (strsrch->strength >= UCOL_QUATERNARY && sourcece == UCOL_IGNORABLE) {
171 sourcece = 0xFFFF;
172 }
173
174 return sourcece;
175 }
176
177 /**
178 * Allocate a memory and returns NULL if it failed.
179 * Internal method, status assumed to be a success.
180 * @param size to allocate
181 * @param status output error if any, caller to check status before calling
182 * method, status assumed to be success when passed in.
183 * @return newly allocated array, NULL otherwise
184 */
185 static
allocateMemory(uint32_t size,UErrorCode * status)186 inline void * allocateMemory(uint32_t size, UErrorCode *status)
187 {
188 uint32_t *result = (uint32_t *)uprv_malloc(size);
189 if (result == NULL) {
190 *status = U_MEMORY_ALLOCATION_ERROR;
191 }
192 return result;
193 }
194
195 /**
196 * Adds a uint32_t value to a destination array.
197 * Creates a new array if we run out of space. The caller will have to
198 * manually deallocate the newly allocated array.
199 * Internal method, status assumed to be success, caller has to check status
200 * before calling this method. destination not to be NULL and has at least
201 * size destinationlength.
202 * @param destination target array
203 * @param offset destination offset to add value
204 * @param destinationlength target array size, return value for the new size
205 * @param value to be added
206 * @param increments incremental size expected
207 * @param status output error if any, caller to check status before calling
208 * method, status assumed to be success when passed in.
209 * @return new destination array, destination if there was no new allocation
210 */
211 static
addTouint32_tArray(int32_t * destination,uint32_t offset,uint32_t * destinationlength,uint32_t value,uint32_t increments,UErrorCode * status)212 inline int32_t * addTouint32_tArray(int32_t *destination,
213 uint32_t offset,
214 uint32_t *destinationlength,
215 uint32_t value,
216 uint32_t increments,
217 UErrorCode *status)
218 {
219 uint32_t newlength = *destinationlength;
220 if (offset + 1 == newlength) {
221 newlength += increments;
222 int32_t *temp = (int32_t *)allocateMemory(
223 sizeof(int32_t) * newlength, status);
224 if (U_FAILURE(*status)) {
225 return NULL;
226 }
227 uprv_memcpy(temp, destination, sizeof(int32_t) * (size_t)offset);
228 *destinationlength = newlength;
229 destination = temp;
230 }
231 destination[offset] = value;
232 return destination;
233 }
234
235 /**
236 * Adds a uint64_t value to a destination array.
237 * Creates a new array if we run out of space. The caller will have to
238 * manually deallocate the newly allocated array.
239 * Internal method, status assumed to be success, caller has to check status
240 * before calling this method. destination not to be NULL and has at least
241 * size destinationlength.
242 * @param destination target array
243 * @param offset destination offset to add value
244 * @param destinationlength target array size, return value for the new size
245 * @param value to be added
246 * @param increments incremental size expected
247 * @param status output error if any, caller to check status before calling
248 * method, status assumed to be success when passed in.
249 * @return new destination array, destination if there was no new allocation
250 */
251 static
addTouint64_tArray(int64_t * destination,uint32_t offset,uint32_t * destinationlength,uint64_t value,uint32_t increments,UErrorCode * status)252 inline int64_t * addTouint64_tArray(int64_t *destination,
253 uint32_t offset,
254 uint32_t *destinationlength,
255 uint64_t value,
256 uint32_t increments,
257 UErrorCode *status)
258 {
259 uint32_t newlength = *destinationlength;
260 if (offset + 1 == newlength) {
261 newlength += increments;
262 int64_t *temp = (int64_t *)allocateMemory(
263 sizeof(int64_t) * newlength, status);
264
265 if (U_FAILURE(*status)) {
266 return NULL;
267 }
268
269 uprv_memcpy(temp, destination, sizeof(int64_t) * (size_t)offset);
270 *destinationlength = newlength;
271 destination = temp;
272 }
273
274 destination[offset] = value;
275
276 return destination;
277 }
278
279 /**
280 * Initializing the ce table for a pattern.
281 * Stores non-ignorable collation keys.
282 * Table size will be estimated by the size of the pattern text. Table
283 * expansion will be perform as we go along. Adding 1 to ensure that the table
284 * size definitely increases.
285 * Internal method, status assumed to be a success.
286 * @param strsrch string search data
287 * @param status output error if any, caller to check status before calling
288 * method, status assumed to be success when passed in.
289 * @return total number of expansions
290 */
291 static
initializePatternCETable(UStringSearch * strsrch,UErrorCode * status)292 inline uint16_t initializePatternCETable(UStringSearch *strsrch,
293 UErrorCode *status)
294 {
295 UPattern *pattern = &(strsrch->pattern);
296 uint32_t cetablesize = INITIAL_ARRAY_SIZE_;
297 int32_t *cetable = pattern->cesBuffer;
298 uint32_t patternlength = pattern->textLength;
299 UCollationElements *coleiter = strsrch->utilIter;
300
301 if (coleiter == NULL) {
302 coleiter = ucol_openElements(strsrch->collator, pattern->text,
303 patternlength, status);
304 // status will be checked in ucol_next(..) later and if it is an
305 // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be
306 // returned.
307 strsrch->utilIter = coleiter;
308 }
309 else {
310 ucol_setText(coleiter, pattern->text, pattern->textLength, status);
311 }
312 if(U_FAILURE(*status)) {
313 return 0;
314 }
315
316 if (pattern->ces != cetable && pattern->ces) {
317 uprv_free(pattern->ces);
318 }
319
320 uint16_t offset = 0;
321 uint16_t result = 0;
322 int32_t ce;
323
324 while ((ce = ucol_next(coleiter, status)) != UCOL_NULLORDER &&
325 U_SUCCESS(*status)) {
326 uint32_t newce = getCE(strsrch, ce);
327 if (newce) {
328 int32_t *temp = addTouint32_tArray(cetable, offset, &cetablesize,
329 newce,
330 patternlength - ucol_getOffset(coleiter) + 1,
331 status);
332 if (U_FAILURE(*status)) {
333 return 0;
334 }
335 offset ++;
336 if (cetable != temp && cetable != pattern->cesBuffer) {
337 uprv_free(cetable);
338 }
339 cetable = temp;
340 }
341 result += (uint16_t)(ucol_getMaxExpansion(coleiter, ce) - 1);
342 }
343
344 cetable[offset] = 0;
345 pattern->ces = cetable;
346 pattern->cesLength = offset;
347
348 return result;
349 }
350
351 /**
352 * Initializing the pce table for a pattern.
353 * Stores non-ignorable collation keys.
354 * Table size will be estimated by the size of the pattern text. Table
355 * expansion will be perform as we go along. Adding 1 to ensure that the table
356 * size definitely increases.
357 * Internal method, status assumed to be a success.
358 * @param strsrch string search data
359 * @param status output error if any, caller to check status before calling
360 * method, status assumed to be success when passed in.
361 * @return total number of expansions
362 */
363 static
initializePatternPCETable(UStringSearch * strsrch,UErrorCode * status)364 inline uint16_t initializePatternPCETable(UStringSearch *strsrch,
365 UErrorCode *status)
366 {
367 UPattern *pattern = &(strsrch->pattern);
368 uint32_t pcetablesize = INITIAL_ARRAY_SIZE_;
369 int64_t *pcetable = pattern->pcesBuffer;
370 uint32_t patternlength = pattern->textLength;
371 UCollationElements *coleiter = strsrch->utilIter;
372
373 if (coleiter == NULL) {
374 coleiter = ucol_openElements(strsrch->collator, pattern->text,
375 patternlength, status);
376 // status will be checked in ucol_next(..) later and if it is an
377 // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be
378 // returned.
379 strsrch->utilIter = coleiter;
380 } else {
381 ucol_setText(coleiter, pattern->text, pattern->textLength, status);
382 }
383 if(U_FAILURE(*status)) {
384 return 0;
385 }
386
387 if (pattern->pces != pcetable && pattern->pces != NULL) {
388 uprv_free(pattern->pces);
389 }
390
391 uint16_t offset = 0;
392 uint16_t result = 0;
393 int64_t pce;
394
395 icu::UCollationPCE iter(coleiter);
396
397 // ** Should processed CEs be signed or unsigned?
398 // ** (the rest of the code in this file seems to play fast-and-loose with
399 // ** whether a CE is signed or unsigned. For example, look at routine above this one.)
400 while ((pce = iter.nextProcessed(NULL, NULL, status)) != UCOL_PROCESSED_NULLORDER &&
401 U_SUCCESS(*status)) {
402 int64_t *temp = addTouint64_tArray(pcetable, offset, &pcetablesize,
403 pce,
404 patternlength - ucol_getOffset(coleiter) + 1,
405 status);
406
407 if (U_FAILURE(*status)) {
408 return 0;
409 }
410
411 offset += 1;
412
413 if (pcetable != temp && pcetable != pattern->pcesBuffer) {
414 uprv_free(pcetable);
415 }
416
417 pcetable = temp;
418 //result += (uint16_t)(ucol_getMaxExpansion(coleiter, ce) - 1);
419 }
420
421 pcetable[offset] = 0;
422 pattern->pces = pcetable;
423 pattern->pcesLength = offset;
424
425 return result;
426 }
427
428 /**
429 * Initializes the pattern struct.
430 * Internal method, status assumed to be success.
431 * @param strsrch UStringSearch data storage
432 * @param status output error if any, caller to check status before calling
433 * method, status assumed to be success when passed in.
434 * @return expansionsize the total expansion size of the pattern
435 */
436 static
initializePattern(UStringSearch * strsrch,UErrorCode * status)437 inline int16_t initializePattern(UStringSearch *strsrch, UErrorCode *status)
438 {
439 if (U_FAILURE(*status)) { return 0; }
440 UPattern *pattern = &(strsrch->pattern);
441 const UChar *patterntext = pattern->text;
442 int32_t length = pattern->textLength;
443 int32_t index = 0;
444
445 // Since the strength is primary, accents are ignored in the pattern.
446 if (strsrch->strength == UCOL_PRIMARY) {
447 pattern->hasPrefixAccents = 0;
448 pattern->hasSuffixAccents = 0;
449 } else {
450 pattern->hasPrefixAccents = getFCD(patterntext, &index, length) >>
451 SECOND_LAST_BYTE_SHIFT_;
452 index = length;
453 U16_BACK_1(patterntext, 0, index);
454 pattern->hasSuffixAccents = getFCD(patterntext, &index, length) &
455 LAST_BYTE_MASK_;
456 }
457
458 // ** HACK **
459 if (strsrch->pattern.pces != NULL) {
460 if (strsrch->pattern.pces != strsrch->pattern.pcesBuffer) {
461 uprv_free(strsrch->pattern.pces);
462 }
463
464 strsrch->pattern.pces = NULL;
465 }
466
467 // since intializePattern is an internal method status is a success.
468 return initializePatternCETable(strsrch, status);
469 }
470
471 /**
472 * Initializing shift tables, with the default values.
473 * If a corresponding default value is 0, the shift table is not set.
474 * @param shift table for forwards shift
475 * @param backshift table for backwards shift
476 * @param cetable table containing pattern ce
477 * @param cesize size of the pattern ces
478 * @param expansionsize total size of the expansions
479 * @param defaultforward the default forward value
480 * @param defaultbackward the default backward value
481 */
482 static
setShiftTable(int16_t shift[],int16_t backshift[],int32_t * cetable,int32_t cesize,int16_t expansionsize,int16_t defaultforward,int16_t defaultbackward)483 inline void setShiftTable(int16_t shift[], int16_t backshift[],
484 int32_t *cetable, int32_t cesize,
485 int16_t expansionsize,
486 int16_t defaultforward,
487 int16_t defaultbackward)
488 {
489 // estimate the value to shift. to do that we estimate the smallest
490 // number of characters to give the relevant ces, ie approximately
491 // the number of ces minus their expansion, since expansions can come
492 // from a character.
493 int32_t count;
494 for (count = 0; count < MAX_TABLE_SIZE_; count ++) {
495 shift[count] = defaultforward;
496 }
497 cesize --; // down to the last index
498 for (count = 0; count < cesize; count ++) {
499 // number of ces from right of array to the count
500 int temp = defaultforward - count - 1;
501 shift[hashFromCE32(cetable[count])] = temp > 1 ? temp : 1;
502 }
503 shift[hashFromCE32(cetable[cesize])] = 1;
504 // for ignorables we just shift by one. see test examples.
505 shift[hashFromCE32(0)] = 1;
506
507 for (count = 0; count < MAX_TABLE_SIZE_; count ++) {
508 backshift[count] = defaultbackward;
509 }
510 for (count = cesize; count > 0; count --) {
511 // the original value count does not seem to work
512 backshift[hashFromCE32(cetable[count])] = count > expansionsize ?
513 (int16_t)(count - expansionsize) : 1;
514 }
515 backshift[hashFromCE32(cetable[0])] = 1;
516 backshift[hashFromCE32(0)] = 1;
517 }
518
519 /**
520 * Building of the pattern collation element list and the boyer moore strsrch
521 * table.
522 * The canonical match will only be performed after the default match fails.
523 * For both cases we need to remember the size of the composed and decomposed
524 * versions of the string. Since the Boyer-Moore shift calculations shifts by
525 * a number of characters in the text and tries to match the pattern from that
526 * offset, the shift value can not be too large in case we miss some
527 * characters. To choose a right shift size, we estimate the NFC form of the
528 * and use its size as a shift guide. The NFC form should be the small
529 * possible representation of the pattern. Anyways, we'll err on the smaller
530 * shift size. Hence the calculation for minlength.
531 * Canonical match will be performed slightly differently. We'll split the
532 * pattern into 3 parts, the prefix accents (PA), the middle string bounded by
533 * the first and last base character (MS), the ending accents (EA). Matches
534 * will be done on MS first, and only when we match MS then some processing
535 * will be required for the prefix and end accents in order to determine if
536 * they match PA and EA. Hence the default shift values
537 * for the canonical match will take the size of either end's accent into
538 * consideration. Forwards search will take the end accents into consideration
539 * for the default shift values and the backwards search will take the prefix
540 * accents into consideration.
541 * If pattern has no non-ignorable ce, we return a illegal argument error.
542 * Internal method, status assumed to be success.
543 * @param strsrch UStringSearch data storage
544 * @param status for output errors if it occurs, status is assumed to be a
545 * success when it is passed in.
546 */
547 static
initialize(UStringSearch * strsrch,UErrorCode * status)548 inline void initialize(UStringSearch *strsrch, UErrorCode *status)
549 {
550 int16_t expandlength = initializePattern(strsrch, status);
551 if (U_SUCCESS(*status) && strsrch->pattern.cesLength > 0) {
552 UPattern *pattern = &strsrch->pattern;
553 int32_t cesize = pattern->cesLength;
554
555 int16_t minlength = cesize > expandlength
556 ? (int16_t)cesize - expandlength : 1;
557 pattern->defaultShiftSize = minlength;
558 setShiftTable(pattern->shift, pattern->backShift, pattern->ces,
559 cesize, expandlength, minlength, minlength);
560 return;
561 }
562 strsrch->pattern.defaultShiftSize = 0;
563 }
564
565 #if BOYER_MOORE
566 /**
567 * Check to make sure that the match length is at the end of the character by
568 * using the breakiterator.
569 * @param strsrch string search data
570 * @param start target text start offset
571 * @param end target text end offset
572 */
573 static
checkBreakBoundary(const UStringSearch * strsrch,int32_t *,int32_t * end)574 void checkBreakBoundary(const UStringSearch *strsrch, int32_t * /*start*/,
575 int32_t *end)
576 {
577 #if !UCONFIG_NO_BREAK_ITERATION
578 UBreakIterator *breakiterator = strsrch->search->internalBreakIter;
579 if (breakiterator) {
580 int32_t matchend = *end;
581 //int32_t matchstart = *start;
582
583 if (!ubrk_isBoundary(breakiterator, matchend)) {
584 *end = ubrk_following(breakiterator, matchend);
585 }
586
587 /* Check the start of the matched text to make sure it doesn't have any accents
588 * before it. This code may not be necessary and so it is commented out */
589 /*if (!ubrk_isBoundary(breakiterator, matchstart) && !ubrk_isBoundary(breakiterator, matchstart-1)) {
590 *start = ubrk_preceding(breakiterator, matchstart);
591 }*/
592 }
593 #endif
594 }
595
596 /**
597 * Determine whether the target text in UStringSearch bounded by the offset
598 * start and end is one or more whole units of text as
599 * determined by the breakiterator in UStringSearch.
600 * @param strsrch string search data
601 * @param start target text start offset
602 * @param end target text end offset
603 */
604 static
isBreakUnit(const UStringSearch * strsrch,int32_t start,int32_t end)605 UBool isBreakUnit(const UStringSearch *strsrch, int32_t start,
606 int32_t end)
607 {
608 #if !UCONFIG_NO_BREAK_ITERATION
609 UBreakIterator *breakiterator = strsrch->search->breakIter;
610 //TODO: Add here.
611 if (breakiterator) {
612 int32_t startindex = ubrk_first(breakiterator);
613 int32_t endindex = ubrk_last(breakiterator);
614
615 // out-of-range indexes are never boundary positions
616 if (start < startindex || start > endindex ||
617 end < startindex || end > endindex) {
618 return FALSE;
619 }
620 // otherwise, we can use following() on the position before the
621 // specified one and return true of the position we get back is the
622 // one the user specified
623 UBool result = (start == startindex ||
624 ubrk_following(breakiterator, start - 1) == start) &&
625 (end == endindex ||
626 ubrk_following(breakiterator, end - 1) == end);
627 if (result) {
628 // iterates the individual ces
629 UCollationElements *coleiter = strsrch->utilIter;
630 const UChar *text = strsrch->search->text +
631 start;
632 UErrorCode status = U_ZERO_ERROR;
633 ucol_setText(coleiter, text, end - start, &status);
634 for (int32_t count = 0; count < strsrch->pattern.cesLength;
635 count ++) {
636 int32_t ce = getCE(strsrch, ucol_next(coleiter, &status));
637 if (ce == UCOL_IGNORABLE) {
638 count --;
639 continue;
640 }
641 if (U_FAILURE(status) || ce != strsrch->pattern.ces[count]) {
642 return FALSE;
643 }
644 }
645 int32_t nextce = ucol_next(coleiter, &status);
646 while (ucol_getOffset(coleiter) == (end - start)
647 && getCE(strsrch, nextce) == UCOL_IGNORABLE) {
648 nextce = ucol_next(coleiter, &status);
649 }
650 if (ucol_getOffset(coleiter) == (end - start)
651 && nextce != UCOL_NULLORDER) {
652 // extra collation elements at the end of the match
653 return FALSE;
654 }
655 }
656 return result;
657 }
658 #endif
659 return TRUE;
660 }
661
662 /**
663 * Getting the next base character offset if current offset is an accent,
664 * or the current offset if the current character contains a base character.
665 * accents the following base character will be returned
666 * @param text string
667 * @param textoffset current offset
668 * @param textlength length of text string
669 * @return the next base character or the current offset
670 * if the current character is contains a base character.
671 */
672 static
getNextBaseOffset(const UChar * text,int32_t textoffset,int32_t textlength)673 inline int32_t getNextBaseOffset(const UChar *text,
674 int32_t textoffset,
675 int32_t textlength)
676 {
677 if (textoffset < textlength) {
678 int32_t temp = textoffset;
679 if (getFCD(text, &temp, textlength) >> SECOND_LAST_BYTE_SHIFT_) {
680 while (temp < textlength) {
681 int32_t result = temp;
682 if ((getFCD(text, &temp, textlength) >>
683 SECOND_LAST_BYTE_SHIFT_) == 0) {
684 return result;
685 }
686 }
687 return textlength;
688 }
689 }
690 return textoffset;
691 }
692
693 /**
694 * Gets the next base character offset depending on the string search pattern
695 * data
696 * @param strsrch string search data
697 * @param textoffset current offset, one offset away from the last character
698 * to search for.
699 * @return start index of the next base character or the current offset
700 * if the current character is contains a base character.
701 */
702 static
getNextUStringSearchBaseOffset(UStringSearch * strsrch,int32_t textoffset)703 inline int32_t getNextUStringSearchBaseOffset(UStringSearch *strsrch,
704 int32_t textoffset)
705 {
706 int32_t textlength = strsrch->search->textLength;
707 if (strsrch->pattern.hasSuffixAccents &&
708 textoffset < textlength) {
709 int32_t temp = textoffset;
710 const UChar *text = strsrch->search->text;
711 U16_BACK_1(text, 0, temp);
712 if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) {
713 return getNextBaseOffset(text, textoffset, textlength);
714 }
715 }
716 return textoffset;
717 }
718
719 /**
720 * Shifting the collation element iterator position forward to prepare for
721 * a following match. If the last character is a unsafe character, we'll only
722 * shift by 1 to capture contractions, normalization etc.
723 * Internal method, status assumed to be success.
724 * @param text strsrch string search data
725 * @param textoffset start text position to do search
726 * @param ce the text ce which failed the match.
727 * @param patternceindex index of the ce within the pattern ce buffer which
728 * failed the match
729 * @return final offset
730 */
731 static
shiftForward(UStringSearch * strsrch,int32_t textoffset,int32_t ce,int32_t patternceindex)732 inline int32_t shiftForward(UStringSearch *strsrch,
733 int32_t textoffset,
734 int32_t ce,
735 int32_t patternceindex)
736 {
737 UPattern *pattern = &(strsrch->pattern);
738 if (ce != UCOL_NULLORDER) {
739 int32_t shift = pattern->shift[hashFromCE32(ce)];
740 // this is to adjust for characters in the middle of the
741 // substring for matching that failed.
742 int32_t adjust = pattern->cesLength - patternceindex;
743 if (adjust > 1 && shift >= adjust) {
744 shift -= adjust - 1;
745 }
746 textoffset += shift;
747 }
748 else {
749 textoffset += pattern->defaultShiftSize;
750 }
751
752 textoffset = getNextUStringSearchBaseOffset(strsrch, textoffset);
753 // check for unsafe characters
754 // * if it is the start or middle of a contraction: to be done after
755 // a initial match is found
756 // * thai or lao base consonant character: similar to contraction
757 // * high surrogate character: similar to contraction
758 // * next character is a accent: shift to the next base character
759 return textoffset;
760 }
761 #endif // #if BOYER_MOORE
762
763 /**
764 * sets match not found
765 * @param strsrch string search data
766 */
767 static
setMatchNotFound(UStringSearch * strsrch)768 inline void setMatchNotFound(UStringSearch *strsrch)
769 {
770 // this method resets the match result regardless of the error status.
771 strsrch->search->matchedIndex = USEARCH_DONE;
772 strsrch->search->matchedLength = 0;
773 if (strsrch->search->isForwardSearching) {
774 setColEIterOffset(strsrch->textIter, strsrch->search->textLength);
775 }
776 else {
777 setColEIterOffset(strsrch->textIter, 0);
778 }
779 }
780
781 #if BOYER_MOORE
782 /**
783 * Gets the offset to the next safe point in text.
784 * ie. not the middle of a contraction, swappable characters or supplementary
785 * characters.
786 * @param collator collation sata
787 * @param text string to work with
788 * @param textoffset offset in string
789 * @param textlength length of text string
790 * @return offset to the next safe character
791 */
792 static
getNextSafeOffset(const UCollator * collator,const UChar * text,int32_t textoffset,int32_t textlength)793 inline int32_t getNextSafeOffset(const UCollator *collator,
794 const UChar *text,
795 int32_t textoffset,
796 int32_t textlength)
797 {
798 int32_t result = textoffset; // first contraction character
799 while (result != textlength && ucol_unsafeCP(text[result], collator)) {
800 result ++;
801 }
802 return result;
803 }
804
805 /**
806 * This checks for accents in the potential match started with a .
807 * composite character.
808 * This is really painful... we have to check that composite character do not
809 * have any extra accents. We have to normalize the potential match and find
810 * the immediate decomposed character before the match.
811 * The first composite character would have been taken care of by the fcd
812 * checks in checkForwardExactMatch.
813 * This is the slow path after the fcd of the first character and
814 * the last character has been checked by checkForwardExactMatch and we
815 * determine that the potential match has extra non-ignorable preceding
816 * ces.
817 * E.g. looking for \u0301 acute in \u01FA A ring above and acute,
818 * checkExtraMatchAccent should fail since there is a middle ring in \u01FA
819 * Note here that accents checking are slow and cautioned in the API docs.
820 * Internal method, status assumed to be a success, caller should check status
821 * before calling this method
822 * @param strsrch string search data
823 * @param start index of the potential unfriendly composite character
824 * @param end index of the potential unfriendly composite character
825 * @param status output error status if any.
826 * @return TRUE if there is non-ignorable accents before at the beginning
827 * of the match, FALSE otherwise.
828 */
829
830 static
checkExtraMatchAccents(const UStringSearch * strsrch,int32_t start,int32_t end,UErrorCode * status)831 UBool checkExtraMatchAccents(const UStringSearch *strsrch, int32_t start,
832 int32_t end,
833 UErrorCode *status)
834 {
835 UBool result = FALSE;
836 if (strsrch->pattern.hasPrefixAccents) {
837 int32_t length = end - start;
838 int32_t offset = 0;
839 const UChar *text = strsrch->search->text + start;
840
841 U16_FWD_1(text, offset, length);
842 // we are only concerned with the first composite character
843 if (unorm_quickCheck(text, offset, UNORM_NFD, status) == UNORM_NO) {
844 int32_t safeoffset = getNextSafeOffset(strsrch->collator,
845 text, 0, length);
846 if (safeoffset != length) {
847 safeoffset ++;
848 }
849 UChar *norm = NULL;
850 UChar buffer[INITIAL_ARRAY_SIZE_];
851 int32_t size = unorm_normalize(text, safeoffset, UNORM_NFD, 0,
852 buffer, INITIAL_ARRAY_SIZE_,
853 status);
854 if (U_FAILURE(*status)) {
855 return FALSE;
856 }
857 if (size >= INITIAL_ARRAY_SIZE_) {
858 norm = (UChar *)allocateMemory((size + 1) * sizeof(UChar),
859 status);
860 // if allocation failed, status will be set to
861 // U_MEMORY_ALLOCATION_ERROR and unorm_normalize internally
862 // checks for it.
863 size = unorm_normalize(text, safeoffset, UNORM_NFD, 0, norm,
864 size, status);
865 if (U_FAILURE(*status) && norm != NULL) {
866 uprv_free(norm);
867 return FALSE;
868 }
869 }
870 else {
871 norm = buffer;
872 }
873
874 UCollationElements *coleiter = strsrch->utilIter;
875 ucol_setText(coleiter, norm, size, status);
876 uint32_t firstce = strsrch->pattern.ces[0];
877 UBool ignorable = TRUE;
878 uint32_t ce = UCOL_IGNORABLE;
879 while (U_SUCCESS(*status) && ce != firstce && ce != (uint32_t)UCOL_NULLORDER) {
880 offset = ucol_getOffset(coleiter);
881 if (ce != firstce && ce != UCOL_IGNORABLE) {
882 ignorable = FALSE;
883 }
884 ce = ucol_next(coleiter, status);
885 }
886 UChar32 codepoint;
887 U16_PREV(norm, 0, offset, codepoint);
888 result = !ignorable && (u_getCombiningClass(codepoint) != 0);
889
890 if (norm != buffer) {
891 uprv_free(norm);
892 }
893 }
894 }
895
896 return result;
897 }
898
899 /**
900 * Used by exact matches, checks if there are accents before the match.
901 * This is really painful... we have to check that composite characters at
902 * the start of the matches have to not have any extra accents.
903 * We check the FCD of the character first, if it starts with an accent and
904 * the first pattern ce does not match the first ce of the character, we bail.
905 * Otherwise we try normalizing the first composite
906 * character and find the immediate decomposed character before the match to
907 * see if it is an non-ignorable accent.
908 * Now normalizing the first composite character is enough because we ensure
909 * that when the match is passed in here with extra beginning ces, the
910 * first or last ce that match has to occur within the first character.
911 * E.g. looking for \u0301 acute in \u01FA A ring above and acute,
912 * checkExtraMatchAccent should fail since there is a middle ring in \u01FA
913 * Note here that accents checking are slow and cautioned in the API docs.
914 * @param strsrch string search data
915 * @param start offset
916 * @param end offset
917 * @return TRUE if there are accents on either side of the match,
918 * FALSE otherwise
919 */
920 static
hasAccentsBeforeMatch(const UStringSearch * strsrch,int32_t start,int32_t end)921 UBool hasAccentsBeforeMatch(const UStringSearch *strsrch, int32_t start,
922 int32_t end)
923 {
924 if (strsrch->pattern.hasPrefixAccents) {
925 UCollationElements *coleiter = strsrch->textIter;
926 UErrorCode status = U_ZERO_ERROR;
927 // we have been iterating forwards previously
928 uint32_t ignorable = TRUE;
929 int32_t firstce = strsrch->pattern.ces[0];
930
931 setColEIterOffset(coleiter, start);
932 int32_t ce = getCE(strsrch, ucol_next(coleiter, &status));
933 if (U_FAILURE(status)) {
934 return TRUE;
935 }
936 while (ce != firstce) {
937 if (ce != UCOL_IGNORABLE) {
938 ignorable = FALSE;
939 }
940 ce = getCE(strsrch, ucol_next(coleiter, &status));
941 if (U_FAILURE(status) || ce == UCOL_NULLORDER) {
942 return TRUE;
943 }
944 }
945 if (!ignorable && inNormBuf(coleiter)) {
946 // within normalization buffer, discontiguous handled here
947 return TRUE;
948 }
949
950 // within text
951 int32_t temp = start;
952 // original code
953 // accent = (getFCD(strsrch->search->text, &temp,
954 // strsrch->search->textLength)
955 // >> SECOND_LAST_BYTE_SHIFT_);
956 // however this code does not work well with VC7 .net in release mode.
957 // maybe the inlines for getFCD combined with shifting has bugs in
958 // VC7. anyways this is a work around.
959 UBool accent = getFCD(strsrch->search->text, &temp,
960 strsrch->search->textLength) > 0xFF;
961 if (!accent) {
962 return checkExtraMatchAccents(strsrch, start, end, &status);
963 }
964 if (!ignorable) {
965 return TRUE;
966 }
967 if (start > 0) {
968 temp = start;
969 U16_BACK_1(strsrch->search->text, 0, temp);
970 if (getFCD(strsrch->search->text, &temp,
971 strsrch->search->textLength) & LAST_BYTE_MASK_) {
972 setColEIterOffset(coleiter, start);
973 ce = ucol_previous(coleiter, &status);
974 if (U_FAILURE(status) ||
975 (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE)) {
976 return TRUE;
977 }
978 }
979 }
980 }
981
982 return FALSE;
983 }
984
985 /**
986 * Used by exact matches, checks if there are accents bounding the match.
987 * Note this is the initial boundary check. If the potential match
988 * starts or ends with composite characters, the accents in those
989 * characters will be determined later.
990 * Not doing backwards iteration here, since discontiguos contraction for
991 * backwards collation element iterator, use up too many characters.
992 * E.g. looking for \u030A ring in \u01FA A ring above and acute,
993 * should fail since there is a acute at the end of \u01FA
994 * Note here that accents checking are slow and cautioned in the API docs.
995 * @param strsrch string search data
996 * @param start offset of match
997 * @param end end offset of the match
998 * @return TRUE if there are accents on either side of the match,
999 * FALSE otherwise
1000 */
1001 static
hasAccentsAfterMatch(const UStringSearch * strsrch,int32_t start,int32_t end)1002 UBool hasAccentsAfterMatch(const UStringSearch *strsrch, int32_t start,
1003 int32_t end)
1004 {
1005 if (strsrch->pattern.hasSuffixAccents) {
1006 const UChar *text = strsrch->search->text;
1007 int32_t temp = end;
1008 int32_t textlength = strsrch->search->textLength;
1009 U16_BACK_1(text, 0, temp);
1010 if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) {
1011 int32_t firstce = strsrch->pattern.ces[0];
1012 UCollationElements *coleiter = strsrch->textIter;
1013 UErrorCode status = U_ZERO_ERROR;
1014 int32_t ce;
1015 setColEIterOffset(coleiter, start);
1016 while ((ce = getCE(strsrch, ucol_next(coleiter, &status))) != firstce) {
1017 if (U_FAILURE(status) || ce == UCOL_NULLORDER) {
1018 return TRUE;
1019 }
1020 }
1021 int32_t count = 1;
1022 while (count < strsrch->pattern.cesLength) {
1023 if (getCE(strsrch, ucol_next(coleiter, &status))
1024 == UCOL_IGNORABLE) {
1025 // Thai can give an ignorable here.
1026 count --;
1027 }
1028 if (U_FAILURE(status)) {
1029 return TRUE;
1030 }
1031 count ++;
1032 }
1033
1034 ce = ucol_next(coleiter, &status);
1035 if (U_FAILURE(status)) {
1036 return TRUE;
1037 }
1038 if (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE) {
1039 ce = getCE(strsrch, ce);
1040 }
1041 if (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE) {
1042 if (ucol_getOffset(coleiter) <= end) {
1043 return TRUE;
1044 }
1045 if (getFCD(text, &end, textlength) >> SECOND_LAST_BYTE_SHIFT_) {
1046 return TRUE;
1047 }
1048 }
1049 }
1050 }
1051 return FALSE;
1052 }
1053 #endif // #if BOYER_MOORE
1054
1055 /**
1056 * Checks if the offset runs out of the text string
1057 * @param offset
1058 * @param textlength of the text string
1059 * @return TRUE if offset is out of bounds, FALSE otherwise
1060 */
1061 static
isOutOfBounds(int32_t textlength,int32_t offset)1062 inline UBool isOutOfBounds(int32_t textlength, int32_t offset)
1063 {
1064 return offset < 0 || offset > textlength;
1065 }
1066
1067 /**
1068 * Checks for identical match
1069 * @param strsrch string search data
1070 * @param start offset of possible match
1071 * @param end offset of possible match
1072 * @return TRUE if identical match is found
1073 */
1074 static
checkIdentical(const UStringSearch * strsrch,int32_t start,int32_t end)1075 inline UBool checkIdentical(const UStringSearch *strsrch, int32_t start,
1076 int32_t end)
1077 {
1078 if (strsrch->strength != UCOL_IDENTICAL) {
1079 return TRUE;
1080 }
1081
1082 // Note: We could use Normalizer::compare() or similar, but for short strings
1083 // which may not be in FCD it might be faster to just NFD them.
1084 UErrorCode status = U_ZERO_ERROR;
1085 UnicodeString t2, p2;
1086 strsrch->nfd->normalize(
1087 UnicodeString(FALSE, strsrch->search->text + start, end - start), t2, status);
1088 strsrch->nfd->normalize(
1089 UnicodeString(FALSE, strsrch->pattern.text, strsrch->pattern.textLength), p2, status);
1090 // return FALSE if NFD failed
1091 return U_SUCCESS(status) && t2 == p2;
1092 }
1093
1094 #if BOYER_MOORE
1095 /**
1096 * Checks to see if the match is repeated
1097 * @param strsrch string search data
1098 * @param start new match start index
1099 * @param end new match end index
1100 * @return TRUE if the the match is repeated, FALSE otherwise
1101 */
1102 static
checkRepeatedMatch(UStringSearch * strsrch,int32_t start,int32_t end)1103 inline UBool checkRepeatedMatch(UStringSearch *strsrch,
1104 int32_t start,
1105 int32_t end)
1106 {
1107 int32_t lastmatchindex = strsrch->search->matchedIndex;
1108 UBool result;
1109 if (lastmatchindex == USEARCH_DONE) {
1110 return FALSE;
1111 }
1112 if (strsrch->search->isForwardSearching) {
1113 result = start <= lastmatchindex;
1114 }
1115 else {
1116 result = start >= lastmatchindex;
1117 }
1118 if (!result && !strsrch->search->isOverlap) {
1119 if (strsrch->search->isForwardSearching) {
1120 result = start < lastmatchindex + strsrch->search->matchedLength;
1121 }
1122 else {
1123 result = end > lastmatchindex;
1124 }
1125 }
1126 return result;
1127 }
1128
1129 /**
1130 * Gets the collation element iterator's current offset.
1131 * @param coleiter collation element iterator
1132 * @param forwards flag TRUE if we are moving in th forwards direction
1133 * @return current offset
1134 */
1135 static
getColElemIterOffset(const UCollationElements * coleiter,UBool forwards)1136 inline int32_t getColElemIterOffset(const UCollationElements *coleiter,
1137 UBool forwards)
1138 {
1139 int32_t result = ucol_getOffset(coleiter);
1140 // intricacies of the the backwards collation element iterator
1141 if (FALSE && !forwards && inNormBuf(coleiter) && !isFCDPointerNull(coleiter)) {
1142 result ++;
1143 }
1144 return result;
1145 }
1146
1147 /**
1148 * Checks match for contraction.
1149 * If the match ends with a partial contraction we fail.
1150 * If the match starts too far off (because of backwards iteration) we try to
1151 * chip off the extra characters depending on whether a breakiterator has
1152 * been used.
1153 * Internal method, error assumed to be success, caller has to check status
1154 * before calling this method.
1155 * @param strsrch string search data
1156 * @param start offset of potential match, to be modified if necessary
1157 * @param end offset of potential match, to be modified if necessary
1158 * @param status output error status if any
1159 * @return TRUE if match passes the contraction test, FALSE otherwise
1160 */
1161
1162 static
checkNextExactContractionMatch(UStringSearch * strsrch,int32_t * start,int32_t * end,UErrorCode * status)1163 UBool checkNextExactContractionMatch(UStringSearch *strsrch,
1164 int32_t *start,
1165 int32_t *end, UErrorCode *status)
1166 {
1167 UCollationElements *coleiter = strsrch->textIter;
1168 int32_t textlength = strsrch->search->textLength;
1169 int32_t temp = *start;
1170 const UCollator *collator = strsrch->collator;
1171 const UChar *text = strsrch->search->text;
1172 // This part checks if either ends of the match contains potential
1173 // contraction. If so we'll have to iterate through them
1174 // The start contraction needs to be checked since ucol_previous dumps
1175 // all characters till the first safe character into the buffer.
1176 // *start + 1 is used to test for the unsafe characters instead of *start
1177 // because ucol_prev takes all unsafe characters till the first safe
1178 // character ie *start. so by testing *start + 1, we can estimate if
1179 // excess prefix characters has been included in the potential search
1180 // results.
1181 if ((*end < textlength && ucol_unsafeCP(text[*end], collator)) ||
1182 (*start + 1 < textlength
1183 && ucol_unsafeCP(text[*start + 1], collator))) {
1184 int32_t expansion = getExpansionPrefix(coleiter);
1185 UBool expandflag = expansion > 0;
1186 setColEIterOffset(coleiter, *start);
1187 while (expansion > 0) {
1188 // getting rid of the redundant ce, caused by setOffset.
1189 // since backward contraction/expansion may have extra ces if we
1190 // are in the normalization buffer, hasAccentsBeforeMatch would
1191 // have taken care of it.
1192 // E.g. the character \u01FA will have an expansion of 3, but if
1193 // we are only looking for acute and ring \u030A and \u0301, we'll
1194 // have to skip the first ce in the expansion buffer.
1195 ucol_next(coleiter, status);
1196 if (U_FAILURE(*status)) {
1197 return FALSE;
1198 }
1199 if (ucol_getOffset(coleiter) != temp) {
1200 *start = temp;
1201 temp = ucol_getOffset(coleiter);
1202 }
1203 expansion --;
1204 }
1205
1206 int32_t *patternce = strsrch->pattern.ces;
1207 int32_t patterncelength = strsrch->pattern.cesLength;
1208 int32_t count = 0;
1209 while (count < patterncelength) {
1210 int32_t ce = getCE(strsrch, ucol_next(coleiter, status));
1211 if (ce == UCOL_IGNORABLE) {
1212 continue;
1213 }
1214 if (expandflag && count == 0 && ucol_getOffset(coleiter) != temp) {
1215 *start = temp;
1216 temp = ucol_getOffset(coleiter);
1217 }
1218 if (U_FAILURE(*status) || ce != patternce[count]) {
1219 (*end) ++;
1220 *end = getNextUStringSearchBaseOffset(strsrch, *end);
1221 return FALSE;
1222 }
1223 count ++;
1224 }
1225 }
1226 return TRUE;
1227 }
1228
1229 /**
1230 * Checks and sets the match information if found.
1231 * Checks
1232 * <ul>
1233 * <li> the potential match does not repeat the previous match
1234 * <li> boundaries are correct
1235 * <li> exact matches has no extra accents
1236 * <li> identical matchesb
1237 * <li> potential match does not end in the middle of a contraction
1238 * <\ul>
1239 * Otherwise the offset will be shifted to the next character.
1240 * Internal method, status assumed to be success, caller has to check status
1241 * before calling this method.
1242 * @param strsrch string search data
1243 * @param textoffset offset in the collation element text. the returned value
1244 * will be the truncated end offset of the match or the new start
1245 * search offset.
1246 * @param status output error status if any
1247 * @return TRUE if the match is valid, FALSE otherwise
1248 */
1249 static
checkNextExactMatch(UStringSearch * strsrch,int32_t * textoffset,UErrorCode * status)1250 inline UBool checkNextExactMatch(UStringSearch *strsrch,
1251 int32_t *textoffset, UErrorCode *status)
1252 {
1253 UCollationElements *coleiter = strsrch->textIter;
1254 int32_t start = getColElemIterOffset(coleiter, FALSE);
1255
1256 if (!checkNextExactContractionMatch(strsrch, &start, textoffset, status)) {
1257 return FALSE;
1258 }
1259
1260 // this totally matches, however we need to check if it is repeating
1261 if (!isBreakUnit(strsrch, start, *textoffset) ||
1262 checkRepeatedMatch(strsrch, start, *textoffset) ||
1263 hasAccentsBeforeMatch(strsrch, start, *textoffset) ||
1264 !checkIdentical(strsrch, start, *textoffset) ||
1265 hasAccentsAfterMatch(strsrch, start, *textoffset)) {
1266
1267 (*textoffset) ++;
1268 *textoffset = getNextUStringSearchBaseOffset(strsrch, *textoffset);
1269 return FALSE;
1270 }
1271
1272 //Add breakiterator boundary check for primary strength search.
1273 if (!strsrch->search->breakIter && strsrch->strength == UCOL_PRIMARY) {
1274 checkBreakBoundary(strsrch, &start, textoffset);
1275 }
1276
1277 // totally match, we will get rid of the ending ignorables.
1278 strsrch->search->matchedIndex = start;
1279 strsrch->search->matchedLength = *textoffset - start;
1280 return TRUE;
1281 }
1282
1283 /**
1284 * Getting the previous base character offset, or the current offset if the
1285 * current character is a base character
1286 * @param text string
1287 * @param textoffset one offset after the current character
1288 * @return the offset of the next character after the base character or the first
1289 * composed character with accents
1290 */
1291 static
getPreviousBaseOffset(const UChar * text,int32_t textoffset)1292 inline int32_t getPreviousBaseOffset(const UChar *text,
1293 int32_t textoffset)
1294 {
1295 if (textoffset > 0) {
1296 for (;;) {
1297 int32_t result = textoffset;
1298 U16_BACK_1(text, 0, textoffset);
1299 int32_t temp = textoffset;
1300 uint16_t fcd = getFCD(text, &temp, result);
1301 if ((fcd >> SECOND_LAST_BYTE_SHIFT_) == 0) {
1302 if (fcd & LAST_BYTE_MASK_) {
1303 return textoffset;
1304 }
1305 return result;
1306 }
1307 if (textoffset == 0) {
1308 return 0;
1309 }
1310 }
1311 }
1312 return textoffset;
1313 }
1314
1315 /**
1316 * Getting the indexes of the accents that are not blocked in the argument
1317 * accent array
1318 * @param accents array of accents in nfd terminated by a 0.
1319 * @param accentsindex array of indexes of the accents that are not blocked
1320 */
1321 static
getUnblockedAccentIndex(UChar * accents,int32_t * accentsindex)1322 inline int getUnblockedAccentIndex(UChar *accents, int32_t *accentsindex)
1323 {
1324 int32_t index = 0;
1325 int32_t length = u_strlen(accents);
1326 UChar32 codepoint = 0;
1327 int cclass = 0;
1328 int result = 0;
1329 int32_t temp;
1330 while (index < length) {
1331 temp = index;
1332 U16_NEXT(accents, index, length, codepoint);
1333 if (u_getCombiningClass(codepoint) != cclass) {
1334 cclass = u_getCombiningClass(codepoint);
1335 accentsindex[result] = temp;
1336 result ++;
1337 }
1338 }
1339 accentsindex[result] = length;
1340 return result;
1341 }
1342
1343 /**
1344 * Appends 3 UChar arrays to a destination array.
1345 * Creates a new array if we run out of space. The caller will have to
1346 * manually deallocate the newly allocated array.
1347 * Internal method, status assumed to be success, caller has to check status
1348 * before calling this method. destination not to be NULL and has at least
1349 * size destinationlength.
1350 * @param destination target array
1351 * @param destinationlength target array size, returning the appended length
1352 * @param source1 null-terminated first array
1353 * @param source2 second array
1354 * @param source2length length of seond array
1355 * @param source3 null-terminated third array
1356 * @param status error status if any
1357 * @return new destination array, destination if there was no new allocation
1358 */
1359 static
addToUCharArray(UChar * destination,int32_t * destinationlength,const UChar * source1,const UChar * source2,int32_t source2length,const UChar * source3,UErrorCode * status)1360 inline UChar * addToUCharArray( UChar *destination,
1361 int32_t *destinationlength,
1362 const UChar *source1,
1363 const UChar *source2,
1364 int32_t source2length,
1365 const UChar *source3,
1366 UErrorCode *status)
1367 {
1368 int32_t source1length = source1 ? u_strlen(source1) : 0;
1369 int32_t source3length = source3 ? u_strlen(source3) : 0;
1370 if (*destinationlength < source1length + source2length + source3length +
1371 1)
1372 {
1373 destination = (UChar *)allocateMemory(
1374 (source1length + source2length + source3length + 1) * sizeof(UChar),
1375 status);
1376 // if error allocating memory, status will be
1377 // U_MEMORY_ALLOCATION_ERROR
1378 if (U_FAILURE(*status)) {
1379 *destinationlength = 0;
1380 return NULL;
1381 }
1382 }
1383 if (source1length != 0) {
1384 u_memcpy(destination, source1, source1length);
1385 }
1386 if (source2length != 0) {
1387 uprv_memcpy(destination + source1length, source2,
1388 sizeof(UChar) * source2length);
1389 }
1390 if (source3length != 0) {
1391 uprv_memcpy(destination + source1length + source2length, source3,
1392 sizeof(UChar) * source3length);
1393 }
1394 *destinationlength = source1length + source2length + source3length;
1395 return destination;
1396 }
1397
1398 /**
1399 * Running through a collation element iterator to see if the contents matches
1400 * pattern in string search data
1401 * @param strsrch string search data
1402 * @param coleiter collation element iterator
1403 * @return TRUE if a match if found, FALSE otherwise
1404 */
1405 static
checkCollationMatch(const UStringSearch * strsrch,UCollationElements * coleiter)1406 inline UBool checkCollationMatch(const UStringSearch *strsrch,
1407 UCollationElements *coleiter)
1408 {
1409 int patternceindex = strsrch->pattern.cesLength;
1410 int32_t *patternce = strsrch->pattern.ces;
1411 UErrorCode status = U_ZERO_ERROR;
1412 while (patternceindex > 0) {
1413 int32_t ce = getCE(strsrch, ucol_next(coleiter, &status));
1414 if (ce == UCOL_IGNORABLE) {
1415 continue;
1416 }
1417 if (U_FAILURE(status) || ce != *patternce) {
1418 return FALSE;
1419 }
1420 patternce ++;
1421 patternceindex --;
1422 }
1423 return TRUE;
1424 }
1425
1426 /**
1427 * Rearranges the front accents to try matching.
1428 * Prefix accents in the text will be grouped according to their combining
1429 * class and the groups will be mixed and matched to try find the perfect
1430 * match with the pattern.
1431 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
1432 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
1433 * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
1434 * "\u0301\u0325".
1435 * step 2: check if any of the generated substrings matches the pattern.
1436 * Internal method, status is assumed to be success, caller has to check status
1437 * before calling this method.
1438 * @param strsrch string search match
1439 * @param start first offset of the accents to start searching
1440 * @param end start of the last accent set
1441 * @param status output error status if any
1442 * @return USEARCH_DONE if a match is not found, otherwise return the starting
1443 * offset of the match. Note this start includes all preceding accents.
1444 */
1445 static
doNextCanonicalPrefixMatch(UStringSearch * strsrch,int32_t start,int32_t end,UErrorCode * status)1446 int32_t doNextCanonicalPrefixMatch(UStringSearch *strsrch,
1447 int32_t start,
1448 int32_t end,
1449 UErrorCode *status)
1450 {
1451 const UChar *text = strsrch->search->text;
1452 int32_t textlength = strsrch->search->textLength;
1453 int32_t tempstart = start;
1454
1455 if ((getFCD(text, &tempstart, textlength) & LAST_BYTE_MASK_) == 0) {
1456 // die... failed at a base character
1457 return USEARCH_DONE;
1458 }
1459
1460 int32_t offset = getNextBaseOffset(text, tempstart, textlength);
1461 start = getPreviousBaseOffset(text, tempstart);
1462
1463 UChar accents[INITIAL_ARRAY_SIZE_];
1464 // normalizing the offensive string
1465 unorm_normalize(text + start, offset - start, UNORM_NFD, 0, accents,
1466 INITIAL_ARRAY_SIZE_, status);
1467 if (U_FAILURE(*status)) {
1468 return USEARCH_DONE;
1469 }
1470
1471 int32_t accentsindex[INITIAL_ARRAY_SIZE_];
1472 int32_t accentsize = getUnblockedAccentIndex(accents,
1473 accentsindex);
1474 int32_t count = (2 << (accentsize - 1)) - 1;
1475 UChar buffer[INITIAL_ARRAY_SIZE_];
1476 UCollationElements *coleiter = strsrch->utilIter;
1477 while (U_SUCCESS(*status) && count > 0) {
1478 UChar *rearrange = strsrch->canonicalPrefixAccents;
1479 // copy the base characters
1480 for (int k = 0; k < accentsindex[0]; k ++) {
1481 *rearrange ++ = accents[k];
1482 }
1483 // forming all possible canonical rearrangement by dropping
1484 // sets of accents
1485 for (int i = 0; i <= accentsize - 1; i ++) {
1486 int32_t mask = 1 << (accentsize - i - 1);
1487 if (count & mask) {
1488 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
1489 *rearrange ++ = accents[j];
1490 }
1491 }
1492 }
1493 *rearrange = 0;
1494 int32_t matchsize = INITIAL_ARRAY_SIZE_;
1495 UChar *match = addToUCharArray(buffer, &matchsize,
1496 strsrch->canonicalPrefixAccents,
1497 strsrch->search->text + offset,
1498 end - offset,
1499 strsrch->canonicalSuffixAccents,
1500 status);
1501
1502 // if status is a failure, ucol_setText does nothing.
1503 // run the collator iterator through this match
1504 ucol_setText(coleiter, match, matchsize, status);
1505 if (U_SUCCESS(*status)) {
1506 if (checkCollationMatch(strsrch, coleiter)) {
1507 if (match != buffer) {
1508 uprv_free(match);
1509 }
1510 return start;
1511 }
1512 }
1513 count --;
1514 }
1515 return USEARCH_DONE;
1516 }
1517
1518 /**
1519 * Gets the offset to the safe point in text before textoffset.
1520 * ie. not the middle of a contraction, swappable characters or supplementary
1521 * characters.
1522 * @param collator collation sata
1523 * @param text string to work with
1524 * @param textoffset offset in string
1525 * @param textlength length of text string
1526 * @return offset to the previous safe character
1527 */
1528 static
getPreviousSafeOffset(const UCollator * collator,const UChar * text,int32_t textoffset)1529 inline uint32_t getPreviousSafeOffset(const UCollator *collator,
1530 const UChar *text,
1531 int32_t textoffset)
1532 {
1533 int32_t result = textoffset; // first contraction character
1534 while (result != 0 && ucol_unsafeCP(text[result - 1], collator)) {
1535 result --;
1536 }
1537 if (result != 0) {
1538 // the first contraction character is consider unsafe here
1539 result --;
1540 }
1541 return result;
1542 }
1543
1544 /**
1545 * Cleaning up after we passed the safe zone
1546 * @param strsrch string search data
1547 * @param safetext safe text array
1548 * @param safebuffer safe text buffer
1549 * @param coleiter collation element iterator for safe text
1550 */
1551 static
cleanUpSafeText(const UStringSearch * strsrch,UChar * safetext,UChar * safebuffer)1552 inline void cleanUpSafeText(const UStringSearch *strsrch, UChar *safetext,
1553 UChar *safebuffer)
1554 {
1555 if (safetext != safebuffer && safetext != strsrch->canonicalSuffixAccents)
1556 {
1557 uprv_free(safetext);
1558 }
1559 }
1560
1561 /**
1562 * Take the rearranged end accents and tries matching. If match failed at
1563 * a seperate preceding set of accents (seperated from the rearranged on by
1564 * at least a base character) then we rearrange the preceding accents and
1565 * tries matching again.
1566 * We allow skipping of the ends of the accent set if the ces do not match.
1567 * However if the failure is found before the accent set, it fails.
1568 * Internal method, status assumed to be success, caller has to check status
1569 * before calling this method.
1570 * @param strsrch string search data
1571 * @param textoffset of the start of the rearranged accent
1572 * @param status output error status if any
1573 * @return USEARCH_DONE if a match is not found, otherwise return the starting
1574 * offset of the match. Note this start includes all preceding accents.
1575 */
1576 static
doNextCanonicalSuffixMatch(UStringSearch * strsrch,int32_t textoffset,UErrorCode * status)1577 int32_t doNextCanonicalSuffixMatch(UStringSearch *strsrch,
1578 int32_t textoffset,
1579 UErrorCode *status)
1580 {
1581 const UChar *text = strsrch->search->text;
1582 const UCollator *collator = strsrch->collator;
1583 int32_t safelength = 0;
1584 UChar *safetext;
1585 int32_t safetextlength;
1586 UChar safebuffer[INITIAL_ARRAY_SIZE_];
1587 UCollationElements *coleiter = strsrch->utilIter;
1588 int32_t safeoffset = textoffset;
1589
1590 if (textoffset != 0 && ucol_unsafeCP(strsrch->canonicalSuffixAccents[0],
1591 collator)) {
1592 safeoffset = getPreviousSafeOffset(collator, text, textoffset);
1593 safelength = textoffset - safeoffset;
1594 safetextlength = INITIAL_ARRAY_SIZE_;
1595 safetext = addToUCharArray(safebuffer, &safetextlength, NULL,
1596 text + safeoffset, safelength,
1597 strsrch->canonicalSuffixAccents,
1598 status);
1599 }
1600 else {
1601 safetextlength = u_strlen(strsrch->canonicalSuffixAccents);
1602 safetext = strsrch->canonicalSuffixAccents;
1603 }
1604
1605 // if status is a failure, ucol_setText does nothing
1606 ucol_setText(coleiter, safetext, safetextlength, status);
1607 // status checked in loop below
1608
1609 int32_t *ce = strsrch->pattern.ces;
1610 int32_t celength = strsrch->pattern.cesLength;
1611 int ceindex = celength - 1;
1612 UBool isSafe = TRUE; // indication flag for position in safe zone
1613
1614 while (ceindex >= 0) {
1615 int32_t textce = ucol_previous(coleiter, status);
1616 if (U_FAILURE(*status)) {
1617 if (isSafe) {
1618 cleanUpSafeText(strsrch, safetext, safebuffer);
1619 }
1620 return USEARCH_DONE;
1621 }
1622 if (textce == UCOL_NULLORDER) {
1623 // check if we have passed the safe buffer
1624 if (coleiter == strsrch->textIter) {
1625 cleanUpSafeText(strsrch, safetext, safebuffer);
1626 return USEARCH_DONE;
1627 }
1628 cleanUpSafeText(strsrch, safetext, safebuffer);
1629 safetext = safebuffer;
1630 coleiter = strsrch->textIter;
1631 setColEIterOffset(coleiter, safeoffset);
1632 // status checked at the start of the loop
1633 isSafe = FALSE;
1634 continue;
1635 }
1636 textce = getCE(strsrch, textce);
1637 if (textce != UCOL_IGNORABLE && textce != ce[ceindex]) {
1638 // do the beginning stuff
1639 int32_t failedoffset = getColElemIterOffset(coleiter, FALSE);
1640 if (isSafe && failedoffset >= safelength) {
1641 // alas... no hope. failed at rearranged accent set
1642 cleanUpSafeText(strsrch, safetext, safebuffer);
1643 return USEARCH_DONE;
1644 }
1645 else {
1646 if (isSafe) {
1647 failedoffset += safeoffset;
1648 cleanUpSafeText(strsrch, safetext, safebuffer);
1649 }
1650
1651 // try rearranging the front accents
1652 int32_t result = doNextCanonicalPrefixMatch(strsrch,
1653 failedoffset, textoffset, status);
1654 if (result != USEARCH_DONE) {
1655 // if status is a failure, ucol_setOffset does nothing
1656 setColEIterOffset(strsrch->textIter, result);
1657 }
1658 if (U_FAILURE(*status)) {
1659 return USEARCH_DONE;
1660 }
1661 return result;
1662 }
1663 }
1664 if (textce == ce[ceindex]) {
1665 ceindex --;
1666 }
1667 }
1668 // set offset here
1669 if (isSafe) {
1670 int32_t result = getColElemIterOffset(coleiter, FALSE);
1671 // sets the text iterator here with the correct expansion and offset
1672 int32_t leftoverces = getExpansionPrefix(coleiter);
1673 cleanUpSafeText(strsrch, safetext, safebuffer);
1674 if (result >= safelength) {
1675 result = textoffset;
1676 }
1677 else {
1678 result += safeoffset;
1679 }
1680 setColEIterOffset(strsrch->textIter, result);
1681 strsrch->textIter->iteratordata_.toReturn =
1682 setExpansionPrefix(strsrch->textIter, leftoverces);
1683 return result;
1684 }
1685
1686 return ucol_getOffset(coleiter);
1687 }
1688
1689 /**
1690 * Trying out the substring and sees if it can be a canonical match.
1691 * This will try normalizing the end accents and arranging them into canonical
1692 * equivalents and check their corresponding ces with the pattern ce.
1693 * Suffix accents in the text will be grouped according to their combining
1694 * class and the groups will be mixed and matched to try find the perfect
1695 * match with the pattern.
1696 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
1697 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
1698 * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
1699 * "\u0301\u0325".
1700 * step 2: check if any of the generated substrings matches the pattern.
1701 * Internal method, status assumed to be success, caller has to check status
1702 * before calling this method.
1703 * @param strsrch string search data
1704 * @param textoffset end offset in the collation element text that ends with
1705 * the accents to be rearranged
1706 * @param status error status if any
1707 * @return TRUE if the match is valid, FALSE otherwise
1708 */
1709 static
doNextCanonicalMatch(UStringSearch * strsrch,int32_t textoffset,UErrorCode * status)1710 UBool doNextCanonicalMatch(UStringSearch *strsrch,
1711 int32_t textoffset,
1712 UErrorCode *status)
1713 {
1714 const UChar *text = strsrch->search->text;
1715 int32_t temp = textoffset;
1716 U16_BACK_1(text, 0, temp);
1717 if ((getFCD(text, &temp, textoffset) & LAST_BYTE_MASK_) == 0) {
1718 UCollationElements *coleiter = strsrch->textIter;
1719 int32_t offset = getColElemIterOffset(coleiter, FALSE);
1720 if (strsrch->pattern.hasPrefixAccents) {
1721 offset = doNextCanonicalPrefixMatch(strsrch, offset, textoffset,
1722 status);
1723 if (U_SUCCESS(*status) && offset != USEARCH_DONE) {
1724 setColEIterOffset(coleiter, offset);
1725 return TRUE;
1726 }
1727 }
1728 return FALSE;
1729 }
1730
1731 if (!strsrch->pattern.hasSuffixAccents) {
1732 return FALSE;
1733 }
1734
1735 UChar accents[INITIAL_ARRAY_SIZE_];
1736 // offset to the last base character in substring to search
1737 int32_t baseoffset = getPreviousBaseOffset(text, textoffset);
1738 // normalizing the offensive string
1739 unorm_normalize(text + baseoffset, textoffset - baseoffset, UNORM_NFD,
1740 0, accents, INITIAL_ARRAY_SIZE_, status);
1741 // status checked in loop below
1742
1743 int32_t accentsindex[INITIAL_ARRAY_SIZE_];
1744 int32_t size = getUnblockedAccentIndex(accents, accentsindex);
1745
1746 // 2 power n - 1 plus the full set of accents
1747 int32_t count = (2 << (size - 1)) - 1;
1748 while (U_SUCCESS(*status) && count > 0) {
1749 UChar *rearrange = strsrch->canonicalSuffixAccents;
1750 // copy the base characters
1751 for (int k = 0; k < accentsindex[0]; k ++) {
1752 *rearrange ++ = accents[k];
1753 }
1754 // forming all possible canonical rearrangement by dropping
1755 // sets of accents
1756 for (int i = 0; i <= size - 1; i ++) {
1757 int32_t mask = 1 << (size - i - 1);
1758 if (count & mask) {
1759 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
1760 *rearrange ++ = accents[j];
1761 }
1762 }
1763 }
1764 *rearrange = 0;
1765 int32_t offset = doNextCanonicalSuffixMatch(strsrch, baseoffset,
1766 status);
1767 if (offset != USEARCH_DONE) {
1768 return TRUE; // match found
1769 }
1770 count --;
1771 }
1772 return FALSE;
1773 }
1774
1775 /**
1776 * Gets the previous base character offset depending on the string search
1777 * pattern data
1778 * @param strsrch string search data
1779 * @param textoffset current offset, current character
1780 * @return the offset of the next character after this base character or itself
1781 * if it is a composed character with accents
1782 */
1783 static
getPreviousUStringSearchBaseOffset(UStringSearch * strsrch,int32_t textoffset)1784 inline int32_t getPreviousUStringSearchBaseOffset(UStringSearch *strsrch,
1785 int32_t textoffset)
1786 {
1787 if (strsrch->pattern.hasPrefixAccents && textoffset > 0) {
1788 const UChar *text = strsrch->search->text;
1789 int32_t offset = textoffset;
1790 if (getFCD(text, &offset, strsrch->search->textLength) >>
1791 SECOND_LAST_BYTE_SHIFT_) {
1792 return getPreviousBaseOffset(text, textoffset);
1793 }
1794 }
1795 return textoffset;
1796 }
1797
1798 /**
1799 * Checks match for contraction.
1800 * If the match ends with a partial contraction we fail.
1801 * If the match starts too far off (because of backwards iteration) we try to
1802 * chip off the extra characters
1803 * Internal method, status assumed to be success, caller has to check status
1804 * before calling this method.
1805 * @param strsrch string search data
1806 * @param start offset of potential match, to be modified if necessary
1807 * @param end offset of potential match, to be modified if necessary
1808 * @param status output error status if any
1809 * @return TRUE if match passes the contraction test, FALSE otherwise
1810 */
1811 static
checkNextCanonicalContractionMatch(UStringSearch * strsrch,int32_t * start,int32_t * end,UErrorCode * status)1812 UBool checkNextCanonicalContractionMatch(UStringSearch *strsrch,
1813 int32_t *start,
1814 int32_t *end,
1815 UErrorCode *status)
1816 {
1817 UCollationElements *coleiter = strsrch->textIter;
1818 int32_t textlength = strsrch->search->textLength;
1819 int32_t temp = *start;
1820 const UCollator *collator = strsrch->collator;
1821 const UChar *text = strsrch->search->text;
1822 // This part checks if either ends of the match contains potential
1823 // contraction. If so we'll have to iterate through them
1824 if ((*end < textlength && ucol_unsafeCP(text[*end], collator)) ||
1825 (*start + 1 < textlength
1826 && ucol_unsafeCP(text[*start + 1], collator))) {
1827 int32_t expansion = getExpansionPrefix(coleiter);
1828 UBool expandflag = expansion > 0;
1829 setColEIterOffset(coleiter, *start);
1830 while (expansion > 0) {
1831 // getting rid of the redundant ce, caused by setOffset.
1832 // since backward contraction/expansion may have extra ces if we
1833 // are in the normalization buffer, hasAccentsBeforeMatch would
1834 // have taken care of it.
1835 // E.g. the character \u01FA will have an expansion of 3, but if
1836 // we are only looking for acute and ring \u030A and \u0301, we'll
1837 // have to skip the first ce in the expansion buffer.
1838 ucol_next(coleiter, status);
1839 if (U_FAILURE(*status)) {
1840 return FALSE;
1841 }
1842 if (ucol_getOffset(coleiter) != temp) {
1843 *start = temp;
1844 temp = ucol_getOffset(coleiter);
1845 }
1846 expansion --;
1847 }
1848
1849 int32_t *patternce = strsrch->pattern.ces;
1850 int32_t patterncelength = strsrch->pattern.cesLength;
1851 int32_t count = 0;
1852 int32_t textlength = strsrch->search->textLength;
1853 while (count < patterncelength) {
1854 int32_t ce = getCE(strsrch, ucol_next(coleiter, status));
1855 // status checked below, note that if status is a failure
1856 // ucol_next returns UCOL_NULLORDER
1857 if (ce == UCOL_IGNORABLE) {
1858 continue;
1859 }
1860 if (expandflag && count == 0 && ucol_getOffset(coleiter) != temp) {
1861 *start = temp;
1862 temp = ucol_getOffset(coleiter);
1863 }
1864
1865 if (count == 0 && ce != patternce[0]) {
1866 // accents may have extra starting ces, this occurs when a
1867 // pure accent pattern is matched without rearrangement
1868 // text \u0325\u0300 and looking for \u0300
1869 int32_t expected = patternce[0];
1870 if (getFCD(text, start, textlength) & LAST_BYTE_MASK_) {
1871 ce = getCE(strsrch, ucol_next(coleiter, status));
1872 while (U_SUCCESS(*status) && ce != expected &&
1873 ce != UCOL_NULLORDER &&
1874 ucol_getOffset(coleiter) <= *end) {
1875 ce = getCE(strsrch, ucol_next(coleiter, status));
1876 }
1877 }
1878 }
1879 if (U_FAILURE(*status) || ce != patternce[count]) {
1880 (*end) ++;
1881 *end = getNextUStringSearchBaseOffset(strsrch, *end);
1882 return FALSE;
1883 }
1884 count ++;
1885 }
1886 }
1887 return TRUE;
1888 }
1889
1890 /**
1891 * Checks and sets the match information if found.
1892 * Checks
1893 * <ul>
1894 * <li> the potential match does not repeat the previous match
1895 * <li> boundaries are correct
1896 * <li> potential match does not end in the middle of a contraction
1897 * <li> identical matches
1898 * <\ul>
1899 * Otherwise the offset will be shifted to the next character.
1900 * Internal method, status assumed to be success, caller has to check the
1901 * status before calling this method.
1902 * @param strsrch string search data
1903 * @param textoffset offset in the collation element text. the returned value
1904 * will be the truncated end offset of the match or the new start
1905 * search offset.
1906 * @param status output error status if any
1907 * @return TRUE if the match is valid, FALSE otherwise
1908 */
1909 static
checkNextCanonicalMatch(UStringSearch * strsrch,int32_t * textoffset,UErrorCode * status)1910 inline UBool checkNextCanonicalMatch(UStringSearch *strsrch,
1911 int32_t *textoffset,
1912 UErrorCode *status)
1913 {
1914 // to ensure that the start and ends are not composite characters
1915 UCollationElements *coleiter = strsrch->textIter;
1916 // if we have a canonical accent match
1917 if ((strsrch->pattern.hasSuffixAccents &&
1918 strsrch->canonicalSuffixAccents[0]) ||
1919 (strsrch->pattern.hasPrefixAccents &&
1920 strsrch->canonicalPrefixAccents[0])) {
1921 strsrch->search->matchedIndex = getPreviousUStringSearchBaseOffset(
1922 strsrch,
1923 ucol_getOffset(coleiter));
1924 strsrch->search->matchedLength = *textoffset -
1925 strsrch->search->matchedIndex;
1926 return TRUE;
1927 }
1928
1929 int32_t start = getColElemIterOffset(coleiter, FALSE);
1930 if (!checkNextCanonicalContractionMatch(strsrch, &start, textoffset,
1931 status) || U_FAILURE(*status)) {
1932 return FALSE;
1933 }
1934
1935 start = getPreviousUStringSearchBaseOffset(strsrch, start);
1936 // this totally matches, however we need to check if it is repeating
1937 if (checkRepeatedMatch(strsrch, start, *textoffset) ||
1938 !isBreakUnit(strsrch, start, *textoffset) ||
1939 !checkIdentical(strsrch, start, *textoffset)) {
1940 (*textoffset) ++;
1941 *textoffset = getNextBaseOffset(strsrch->search->text, *textoffset,
1942 strsrch->search->textLength);
1943 return FALSE;
1944 }
1945
1946 strsrch->search->matchedIndex = start;
1947 strsrch->search->matchedLength = *textoffset - start;
1948 return TRUE;
1949 }
1950
1951 /**
1952 * Shifting the collation element iterator position forward to prepare for
1953 * a preceding match. If the first character is a unsafe character, we'll only
1954 * shift by 1 to capture contractions, normalization etc.
1955 * Internal method, status assumed to be success, caller has to check status
1956 * before calling this method.
1957 * @param text strsrch string search data
1958 * @param textoffset start text position to do search
1959 * @param ce the text ce which failed the match.
1960 * @param patternceindex index of the ce within the pattern ce buffer which
1961 * failed the match
1962 * @return final offset
1963 */
1964 static
reverseShift(UStringSearch * strsrch,int32_t textoffset,int32_t ce,int32_t patternceindex)1965 inline int32_t reverseShift(UStringSearch *strsrch,
1966 int32_t textoffset,
1967 int32_t ce,
1968 int32_t patternceindex)
1969 {
1970 if (strsrch->search->isOverlap) {
1971 if (textoffset != strsrch->search->textLength) {
1972 textoffset --;
1973 }
1974 else {
1975 textoffset -= strsrch->pattern.defaultShiftSize;
1976 }
1977 }
1978 else {
1979 if (ce != UCOL_NULLORDER) {
1980 int32_t shift = strsrch->pattern.backShift[hashFromCE32(ce)];
1981
1982 // this is to adjust for characters in the middle of the substring
1983 // for matching that failed.
1984 int32_t adjust = patternceindex;
1985 if (adjust > 1 && shift > adjust) {
1986 shift -= adjust - 1;
1987 }
1988 textoffset -= shift;
1989 }
1990 else {
1991 textoffset -= strsrch->pattern.defaultShiftSize;
1992 }
1993 }
1994 textoffset = getPreviousUStringSearchBaseOffset(strsrch, textoffset);
1995 return textoffset;
1996 }
1997
1998 /**
1999 * Checks match for contraction.
2000 * If the match starts with a partial contraction we fail.
2001 * Internal method, status assumed to be success, caller has to check status
2002 * before calling this method.
2003 * @param strsrch string search data
2004 * @param start offset of potential match, to be modified if necessary
2005 * @param end offset of potential match, to be modified if necessary
2006 * @param status output error status if any
2007 * @return TRUE if match passes the contraction test, FALSE otherwise
2008 */
2009 static
checkPreviousExactContractionMatch(UStringSearch * strsrch,int32_t * start,int32_t * end,UErrorCode * status)2010 UBool checkPreviousExactContractionMatch(UStringSearch *strsrch,
2011 int32_t *start,
2012 int32_t *end, UErrorCode *status)
2013 {
2014 UCollationElements *coleiter = strsrch->textIter;
2015 int32_t textlength = strsrch->search->textLength;
2016 int32_t temp = *end;
2017 const UCollator *collator = strsrch->collator;
2018 const UChar *text = strsrch->search->text;
2019 // This part checks if either if the start of the match contains potential
2020 // contraction. If so we'll have to iterate through them
2021 // Since we used ucol_next while previously looking for the potential
2022 // match, this guarantees that our end will not be a partial contraction,
2023 // or a partial supplementary character.
2024 if (*start < textlength && ucol_unsafeCP(text[*start], collator)) {
2025 int32_t expansion = getExpansionSuffix(coleiter);
2026 UBool expandflag = expansion > 0;
2027 setColEIterOffset(coleiter, *end);
2028 while (U_SUCCESS(*status) && expansion > 0) {
2029 // getting rid of the redundant ce
2030 // since forward contraction/expansion may have extra ces
2031 // if we are in the normalization buffer, hasAccentsBeforeMatch
2032 // would have taken care of it.
2033 // E.g. the character \u01FA will have an expansion of 3, but if
2034 // we are only looking for A ring A\u030A, we'll have to skip the
2035 // last ce in the expansion buffer
2036 ucol_previous(coleiter, status);
2037 if (U_FAILURE(*status)) {
2038 return FALSE;
2039 }
2040 if (ucol_getOffset(coleiter) != temp) {
2041 *end = temp;
2042 temp = ucol_getOffset(coleiter);
2043 }
2044 expansion --;
2045 }
2046
2047 int32_t *patternce = strsrch->pattern.ces;
2048 int32_t patterncelength = strsrch->pattern.cesLength;
2049 int32_t count = patterncelength;
2050 while (count > 0) {
2051 int32_t ce = getCE(strsrch, ucol_previous(coleiter, status));
2052 // status checked below, note that if status is a failure
2053 // ucol_previous returns UCOL_NULLORDER
2054 if (ce == UCOL_IGNORABLE) {
2055 continue;
2056 }
2057 if (expandflag && count == 0 &&
2058 getColElemIterOffset(coleiter, FALSE) != temp) {
2059 *end = temp;
2060 temp = ucol_getOffset(coleiter);
2061 }
2062 if (U_FAILURE(*status) || ce != patternce[count - 1]) {
2063 (*start) --;
2064 *start = getPreviousBaseOffset(text, *start);
2065 return FALSE;
2066 }
2067 count --;
2068 }
2069 }
2070 return TRUE;
2071 }
2072
2073 /**
2074 * Checks and sets the match information if found.
2075 * Checks
2076 * <ul>
2077 * <li> the current match does not repeat the last match
2078 * <li> boundaries are correct
2079 * <li> exact matches has no extra accents
2080 * <li> identical matches
2081 * <\ul>
2082 * Otherwise the offset will be shifted to the preceding character.
2083 * Internal method, status assumed to be success, caller has to check status
2084 * before calling this method.
2085 * @param strsrch string search data
2086 * @param collator
2087 * @param coleiter collation element iterator
2088 * @param text string
2089 * @param textoffset offset in the collation element text. the returned value
2090 * will be the truncated start offset of the match or the new start
2091 * search offset.
2092 * @param status output error status if any
2093 * @return TRUE if the match is valid, FALSE otherwise
2094 */
2095 static
checkPreviousExactMatch(UStringSearch * strsrch,int32_t * textoffset,UErrorCode * status)2096 inline UBool checkPreviousExactMatch(UStringSearch *strsrch,
2097 int32_t *textoffset,
2098 UErrorCode *status)
2099 {
2100 // to ensure that the start and ends are not composite characters
2101 int32_t end = ucol_getOffset(strsrch->textIter);
2102 if (!checkPreviousExactContractionMatch(strsrch, textoffset, &end, status)
2103 || U_FAILURE(*status)) {
2104 return FALSE;
2105 }
2106
2107 // this totally matches, however we need to check if it is repeating
2108 // the old match
2109 if (checkRepeatedMatch(strsrch, *textoffset, end) ||
2110 !isBreakUnit(strsrch, *textoffset, end) ||
2111 hasAccentsBeforeMatch(strsrch, *textoffset, end) ||
2112 !checkIdentical(strsrch, *textoffset, end) ||
2113 hasAccentsAfterMatch(strsrch, *textoffset, end)) {
2114 (*textoffset) --;
2115 *textoffset = getPreviousBaseOffset(strsrch->search->text,
2116 *textoffset);
2117 return FALSE;
2118 }
2119
2120 //Add breakiterator boundary check for primary strength search.
2121 if (!strsrch->search->breakIter && strsrch->strength == UCOL_PRIMARY) {
2122 checkBreakBoundary(strsrch, textoffset, &end);
2123 }
2124
2125 strsrch->search->matchedIndex = *textoffset;
2126 strsrch->search->matchedLength = end - *textoffset;
2127 return TRUE;
2128 }
2129
2130 /**
2131 * Rearranges the end accents to try matching.
2132 * Suffix accents in the text will be grouped according to their combining
2133 * class and the groups will be mixed and matched to try find the perfect
2134 * match with the pattern.
2135 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
2136 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
2137 * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
2138 * "\u0301\u0325".
2139 * step 2: check if any of the generated substrings matches the pattern.
2140 * Internal method, status assumed to be success, user has to check status
2141 * before calling this method.
2142 * @param strsrch string search match
2143 * @param start offset of the first base character
2144 * @param end start of the last accent set
2145 * @param status only error status if any
2146 * @return USEARCH_DONE if a match is not found, otherwise return the ending
2147 * offset of the match. Note this start includes all following accents.
2148 */
2149 static
doPreviousCanonicalSuffixMatch(UStringSearch * strsrch,int32_t start,int32_t end,UErrorCode * status)2150 int32_t doPreviousCanonicalSuffixMatch(UStringSearch *strsrch,
2151 int32_t start,
2152 int32_t end,
2153 UErrorCode *status)
2154 {
2155 const UChar *text = strsrch->search->text;
2156 int32_t tempend = end;
2157
2158 U16_BACK_1(text, 0, tempend);
2159 if (!(getFCD(text, &tempend, strsrch->search->textLength) &
2160 LAST_BYTE_MASK_)) {
2161 // die... failed at a base character
2162 return USEARCH_DONE;
2163 }
2164 end = getNextBaseOffset(text, end, strsrch->search->textLength);
2165
2166 if (U_SUCCESS(*status)) {
2167 UChar accents[INITIAL_ARRAY_SIZE_];
2168 int32_t offset = getPreviousBaseOffset(text, end);
2169 // normalizing the offensive string
2170 unorm_normalize(text + offset, end - offset, UNORM_NFD, 0, accents,
2171 INITIAL_ARRAY_SIZE_, status);
2172
2173 int32_t accentsindex[INITIAL_ARRAY_SIZE_];
2174 int32_t accentsize = getUnblockedAccentIndex(accents,
2175 accentsindex);
2176 int32_t count = (2 << (accentsize - 1)) - 1;
2177 UChar buffer[INITIAL_ARRAY_SIZE_];
2178 UCollationElements *coleiter = strsrch->utilIter;
2179 while (U_SUCCESS(*status) && count > 0) {
2180 UChar *rearrange = strsrch->canonicalSuffixAccents;
2181 // copy the base characters
2182 for (int k = 0; k < accentsindex[0]; k ++) {
2183 *rearrange ++ = accents[k];
2184 }
2185 // forming all possible canonical rearrangement by dropping
2186 // sets of accents
2187 for (int i = 0; i <= accentsize - 1; i ++) {
2188 int32_t mask = 1 << (accentsize - i - 1);
2189 if (count & mask) {
2190 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
2191 *rearrange ++ = accents[j];
2192 }
2193 }
2194 }
2195 *rearrange = 0;
2196 int32_t matchsize = INITIAL_ARRAY_SIZE_;
2197 UChar *match = addToUCharArray(buffer, &matchsize,
2198 strsrch->canonicalPrefixAccents,
2199 strsrch->search->text + start,
2200 offset - start,
2201 strsrch->canonicalSuffixAccents,
2202 status);
2203
2204 // run the collator iterator through this match
2205 // if status is a failure ucol_setText does nothing
2206 ucol_setText(coleiter, match, matchsize, status);
2207 if (U_SUCCESS(*status)) {
2208 if (checkCollationMatch(strsrch, coleiter)) {
2209 if (match != buffer) {
2210 uprv_free(match);
2211 }
2212 return end;
2213 }
2214 }
2215 count --;
2216 }
2217 }
2218 return USEARCH_DONE;
2219 }
2220
2221 /**
2222 * Take the rearranged start accents and tries matching. If match failed at
2223 * a seperate following set of accents (seperated from the rearranged on by
2224 * at least a base character) then we rearrange the preceding accents and
2225 * tries matching again.
2226 * We allow skipping of the ends of the accent set if the ces do not match.
2227 * However if the failure is found before the accent set, it fails.
2228 * Internal method, status assumed to be success, caller has to check status
2229 * before calling this method.
2230 * @param strsrch string search data
2231 * @param textoffset of the ends of the rearranged accent
2232 * @param status output error status if any
2233 * @return USEARCH_DONE if a match is not found, otherwise return the ending
2234 * offset of the match. Note this start includes all following accents.
2235 */
2236 static
doPreviousCanonicalPrefixMatch(UStringSearch * strsrch,int32_t textoffset,UErrorCode * status)2237 int32_t doPreviousCanonicalPrefixMatch(UStringSearch *strsrch,
2238 int32_t textoffset,
2239 UErrorCode *status)
2240 {
2241 const UChar *text = strsrch->search->text;
2242 const UCollator *collator = strsrch->collator;
2243 int32_t safelength = 0;
2244 UChar *safetext;
2245 int32_t safetextlength;
2246 UChar safebuffer[INITIAL_ARRAY_SIZE_];
2247 int32_t safeoffset = textoffset;
2248
2249 if (textoffset &&
2250 ucol_unsafeCP(strsrch->canonicalPrefixAccents[
2251 u_strlen(strsrch->canonicalPrefixAccents) - 1
2252 ], collator)) {
2253 safeoffset = getNextSafeOffset(collator, text, textoffset,
2254 strsrch->search->textLength);
2255 safelength = safeoffset - textoffset;
2256 safetextlength = INITIAL_ARRAY_SIZE_;
2257 safetext = addToUCharArray(safebuffer, &safetextlength,
2258 strsrch->canonicalPrefixAccents,
2259 text + textoffset, safelength,
2260 NULL, status);
2261 }
2262 else {
2263 safetextlength = u_strlen(strsrch->canonicalPrefixAccents);
2264 safetext = strsrch->canonicalPrefixAccents;
2265 }
2266
2267 UCollationElements *coleiter = strsrch->utilIter;
2268 // if status is a failure, ucol_setText does nothing
2269 ucol_setText(coleiter, safetext, safetextlength, status);
2270 // status checked in loop below
2271
2272 int32_t *ce = strsrch->pattern.ces;
2273 int32_t celength = strsrch->pattern.cesLength;
2274 int ceindex = 0;
2275 UBool isSafe = TRUE; // safe zone indication flag for position
2276 int32_t prefixlength = u_strlen(strsrch->canonicalPrefixAccents);
2277
2278 while (ceindex < celength) {
2279 int32_t textce = ucol_next(coleiter, status);
2280 if (U_FAILURE(*status)) {
2281 if (isSafe) {
2282 cleanUpSafeText(strsrch, safetext, safebuffer);
2283 }
2284 return USEARCH_DONE;
2285 }
2286 if (textce == UCOL_NULLORDER) {
2287 // check if we have passed the safe buffer
2288 if (coleiter == strsrch->textIter) {
2289 cleanUpSafeText(strsrch, safetext, safebuffer);
2290 return USEARCH_DONE;
2291 }
2292 cleanUpSafeText(strsrch, safetext, safebuffer);
2293 safetext = safebuffer;
2294 coleiter = strsrch->textIter;
2295 setColEIterOffset(coleiter, safeoffset);
2296 // status checked at the start of the loop
2297 isSafe = FALSE;
2298 continue;
2299 }
2300 textce = getCE(strsrch, textce);
2301 if (textce != UCOL_IGNORABLE && textce != ce[ceindex]) {
2302 // do the beginning stuff
2303 int32_t failedoffset = ucol_getOffset(coleiter);
2304 if (isSafe && failedoffset <= prefixlength) {
2305 // alas... no hope. failed at rearranged accent set
2306 cleanUpSafeText(strsrch, safetext, safebuffer);
2307 return USEARCH_DONE;
2308 }
2309 else {
2310 if (isSafe) {
2311 failedoffset = safeoffset - failedoffset;
2312 cleanUpSafeText(strsrch, safetext, safebuffer);
2313 }
2314
2315 // try rearranging the end accents
2316 int32_t result = doPreviousCanonicalSuffixMatch(strsrch,
2317 textoffset, failedoffset, status);
2318 if (result != USEARCH_DONE) {
2319 // if status is a failure, ucol_setOffset does nothing
2320 setColEIterOffset(strsrch->textIter, result);
2321 }
2322 if (U_FAILURE(*status)) {
2323 return USEARCH_DONE;
2324 }
2325 return result;
2326 }
2327 }
2328 if (textce == ce[ceindex]) {
2329 ceindex ++;
2330 }
2331 }
2332 // set offset here
2333 if (isSafe) {
2334 int32_t result = ucol_getOffset(coleiter);
2335 // sets the text iterator here with the correct expansion and offset
2336 int32_t leftoverces = getExpansionSuffix(coleiter);
2337 cleanUpSafeText(strsrch, safetext, safebuffer);
2338 if (result <= prefixlength) {
2339 result = textoffset;
2340 }
2341 else {
2342 result = textoffset + (safeoffset - result);
2343 }
2344 setColEIterOffset(strsrch->textIter, result);
2345 setExpansionSuffix(strsrch->textIter, leftoverces);
2346 return result;
2347 }
2348
2349 return ucol_getOffset(coleiter);
2350 }
2351
2352 /**
2353 * Trying out the substring and sees if it can be a canonical match.
2354 * This will try normalizing the starting accents and arranging them into
2355 * canonical equivalents and check their corresponding ces with the pattern ce.
2356 * Prefix accents in the text will be grouped according to their combining
2357 * class and the groups will be mixed and matched to try find the perfect
2358 * match with the pattern.
2359 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
2360 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
2361 * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
2362 * "\u0301\u0325".
2363 * step 2: check if any of the generated substrings matches the pattern.
2364 * Internal method, status assumed to be success, caller has to check status
2365 * before calling this method.
2366 * @param strsrch string search data
2367 * @param textoffset start offset in the collation element text that starts
2368 * with the accents to be rearranged
2369 * @param status output error status if any
2370 * @return TRUE if the match is valid, FALSE otherwise
2371 */
2372 static
doPreviousCanonicalMatch(UStringSearch * strsrch,int32_t textoffset,UErrorCode * status)2373 UBool doPreviousCanonicalMatch(UStringSearch *strsrch,
2374 int32_t textoffset,
2375 UErrorCode *status)
2376 {
2377 const UChar *text = strsrch->search->text;
2378 int32_t temp = textoffset;
2379 int32_t textlength = strsrch->search->textLength;
2380 if ((getFCD(text, &temp, textlength) >> SECOND_LAST_BYTE_SHIFT_) == 0) {
2381 UCollationElements *coleiter = strsrch->textIter;
2382 int32_t offset = ucol_getOffset(coleiter);
2383 if (strsrch->pattern.hasSuffixAccents) {
2384 offset = doPreviousCanonicalSuffixMatch(strsrch, textoffset,
2385 offset, status);
2386 if (U_SUCCESS(*status) && offset != USEARCH_DONE) {
2387 setColEIterOffset(coleiter, offset);
2388 return TRUE;
2389 }
2390 }
2391 return FALSE;
2392 }
2393
2394 if (!strsrch->pattern.hasPrefixAccents) {
2395 return FALSE;
2396 }
2397
2398 UChar accents[INITIAL_ARRAY_SIZE_];
2399 // offset to the last base character in substring to search
2400 int32_t baseoffset = getNextBaseOffset(text, textoffset, textlength);
2401 // normalizing the offensive string
2402 unorm_normalize(text + textoffset, baseoffset - textoffset, UNORM_NFD,
2403 0, accents, INITIAL_ARRAY_SIZE_, status);
2404 // status checked in loop
2405
2406 int32_t accentsindex[INITIAL_ARRAY_SIZE_];
2407 int32_t size = getUnblockedAccentIndex(accents, accentsindex);
2408
2409 // 2 power n - 1 plus the full set of accents
2410 int32_t count = (2 << (size - 1)) - 1;
2411 while (U_SUCCESS(*status) && count > 0) {
2412 UChar *rearrange = strsrch->canonicalPrefixAccents;
2413 // copy the base characters
2414 for (int k = 0; k < accentsindex[0]; k ++) {
2415 *rearrange ++ = accents[k];
2416 }
2417 // forming all possible canonical rearrangement by dropping
2418 // sets of accents
2419 for (int i = 0; i <= size - 1; i ++) {
2420 int32_t mask = 1 << (size - i - 1);
2421 if (count & mask) {
2422 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
2423 *rearrange ++ = accents[j];
2424 }
2425 }
2426 }
2427 *rearrange = 0;
2428 int32_t offset = doPreviousCanonicalPrefixMatch(strsrch,
2429 baseoffset, status);
2430 if (offset != USEARCH_DONE) {
2431 return TRUE; // match found
2432 }
2433 count --;
2434 }
2435 return FALSE;
2436 }
2437
2438 /**
2439 * Checks match for contraction.
2440 * If the match starts with a partial contraction we fail.
2441 * Internal method, status assumed to be success, caller has to check status
2442 * before calling this method.
2443 * @param strsrch string search data
2444 * @param start offset of potential match, to be modified if necessary
2445 * @param end offset of potential match, to be modified if necessary
2446 * @param status only error status if any
2447 * @return TRUE if match passes the contraction test, FALSE otherwise
2448 */
2449 static
checkPreviousCanonicalContractionMatch(UStringSearch * strsrch,int32_t * start,int32_t * end,UErrorCode * status)2450 UBool checkPreviousCanonicalContractionMatch(UStringSearch *strsrch,
2451 int32_t *start,
2452 int32_t *end, UErrorCode *status)
2453 {
2454 UCollationElements *coleiter = strsrch->textIter;
2455 int32_t textlength = strsrch->search->textLength;
2456 int32_t temp = *end;
2457 const UCollator *collator = strsrch->collator;
2458 const UChar *text = strsrch->search->text;
2459 // This part checks if either if the start of the match contains potential
2460 // contraction. If so we'll have to iterate through them
2461 // Since we used ucol_next while previously looking for the potential
2462 // match, this guarantees that our end will not be a partial contraction,
2463 // or a partial supplementary character.
2464 if (*start < textlength && ucol_unsafeCP(text[*start], collator)) {
2465 int32_t expansion = getExpansionSuffix(coleiter);
2466 UBool expandflag = expansion > 0;
2467 setColEIterOffset(coleiter, *end);
2468 while (expansion > 0) {
2469 // getting rid of the redundant ce
2470 // since forward contraction/expansion may have extra ces
2471 // if we are in the normalization buffer, hasAccentsBeforeMatch
2472 // would have taken care of it.
2473 // E.g. the character \u01FA will have an expansion of 3, but if
2474 // we are only looking for A ring A\u030A, we'll have to skip the
2475 // last ce in the expansion buffer
2476 ucol_previous(coleiter, status);
2477 if (U_FAILURE(*status)) {
2478 return FALSE;
2479 }
2480 if (ucol_getOffset(coleiter) != temp) {
2481 *end = temp;
2482 temp = ucol_getOffset(coleiter);
2483 }
2484 expansion --;
2485 }
2486
2487 int32_t *patternce = strsrch->pattern.ces;
2488 int32_t patterncelength = strsrch->pattern.cesLength;
2489 int32_t count = patterncelength;
2490 while (count > 0) {
2491 int32_t ce = getCE(strsrch, ucol_previous(coleiter, status));
2492 // status checked below, note that if status is a failure
2493 // ucol_previous returns UCOL_NULLORDER
2494 if (ce == UCOL_IGNORABLE) {
2495 continue;
2496 }
2497 if (expandflag && count == 0 &&
2498 getColElemIterOffset(coleiter, FALSE) != temp) {
2499 *end = temp;
2500 temp = ucol_getOffset(coleiter);
2501 }
2502 if (count == patterncelength &&
2503 ce != patternce[patterncelength - 1]) {
2504 // accents may have extra starting ces, this occurs when a
2505 // pure accent pattern is matched without rearrangement
2506 int32_t expected = patternce[patterncelength - 1];
2507 U16_BACK_1(text, 0, *end);
2508 if (getFCD(text, end, textlength) & LAST_BYTE_MASK_) {
2509 ce = getCE(strsrch, ucol_previous(coleiter, status));
2510 while (U_SUCCESS(*status) && ce != expected &&
2511 ce != UCOL_NULLORDER &&
2512 ucol_getOffset(coleiter) <= *start) {
2513 ce = getCE(strsrch, ucol_previous(coleiter, status));
2514 }
2515 }
2516 }
2517 if (U_FAILURE(*status) || ce != patternce[count - 1]) {
2518 (*start) --;
2519 *start = getPreviousBaseOffset(text, *start);
2520 return FALSE;
2521 }
2522 count --;
2523 }
2524 }
2525 return TRUE;
2526 }
2527
2528 /**
2529 * Checks and sets the match information if found.
2530 * Checks
2531 * <ul>
2532 * <li> the potential match does not repeat the previous match
2533 * <li> boundaries are correct
2534 * <li> potential match does not end in the middle of a contraction
2535 * <li> identical matches
2536 * <\ul>
2537 * Otherwise the offset will be shifted to the next character.
2538 * Internal method, status assumed to be success, caller has to check status
2539 * before calling this method.
2540 * @param strsrch string search data
2541 * @param textoffset offset in the collation element text. the returned value
2542 * will be the truncated start offset of the match or the new start
2543 * search offset.
2544 * @param status only error status if any
2545 * @return TRUE if the match is valid, FALSE otherwise
2546 */
2547 static
checkPreviousCanonicalMatch(UStringSearch * strsrch,int32_t * textoffset,UErrorCode * status)2548 inline UBool checkPreviousCanonicalMatch(UStringSearch *strsrch,
2549 int32_t *textoffset,
2550 UErrorCode *status)
2551 {
2552 // to ensure that the start and ends are not composite characters
2553 UCollationElements *coleiter = strsrch->textIter;
2554 // if we have a canonical accent match
2555 if ((strsrch->pattern.hasSuffixAccents &&
2556 strsrch->canonicalSuffixAccents[0]) ||
2557 (strsrch->pattern.hasPrefixAccents &&
2558 strsrch->canonicalPrefixAccents[0])) {
2559 strsrch->search->matchedIndex = *textoffset;
2560 strsrch->search->matchedLength =
2561 getNextUStringSearchBaseOffset(strsrch,
2562 getColElemIterOffset(coleiter, FALSE))
2563 - *textoffset;
2564 return TRUE;
2565 }
2566
2567 int32_t end = ucol_getOffset(coleiter);
2568 if (!checkPreviousCanonicalContractionMatch(strsrch, textoffset, &end,
2569 status) ||
2570 U_FAILURE(*status)) {
2571 return FALSE;
2572 }
2573
2574 end = getNextUStringSearchBaseOffset(strsrch, end);
2575 // this totally matches, however we need to check if it is repeating
2576 if (checkRepeatedMatch(strsrch, *textoffset, end) ||
2577 !isBreakUnit(strsrch, *textoffset, end) ||
2578 !checkIdentical(strsrch, *textoffset, end)) {
2579 (*textoffset) --;
2580 *textoffset = getPreviousBaseOffset(strsrch->search->text,
2581 *textoffset);
2582 return FALSE;
2583 }
2584
2585 strsrch->search->matchedIndex = *textoffset;
2586 strsrch->search->matchedLength = end - *textoffset;
2587 return TRUE;
2588 }
2589 #endif // #if BOYER_MOORE
2590
2591 // constructors and destructor -------------------------------------------
2592
usearch_open(const UChar * pattern,int32_t patternlength,const UChar * text,int32_t textlength,const char * locale,UBreakIterator * breakiter,UErrorCode * status)2593 U_CAPI UStringSearch * U_EXPORT2 usearch_open(const UChar *pattern,
2594 int32_t patternlength,
2595 const UChar *text,
2596 int32_t textlength,
2597 const char *locale,
2598 UBreakIterator *breakiter,
2599 UErrorCode *status)
2600 {
2601 if (U_FAILURE(*status)) {
2602 return NULL;
2603 }
2604 #if UCONFIG_NO_BREAK_ITERATION
2605 if (breakiter != NULL) {
2606 *status = U_UNSUPPORTED_ERROR;
2607 return NULL;
2608 }
2609 #endif
2610 if (locale) {
2611 // ucol_open internally checks for status
2612 UCollator *collator = ucol_open(locale, status);
2613 // pattern, text checks are done in usearch_openFromCollator
2614 UStringSearch *result = usearch_openFromCollator(pattern,
2615 patternlength, text, textlength,
2616 collator, breakiter, status);
2617
2618 if (result == NULL || U_FAILURE(*status)) {
2619 if (collator) {
2620 ucol_close(collator);
2621 }
2622 return NULL;
2623 }
2624 else {
2625 result->ownCollator = TRUE;
2626 }
2627 return result;
2628 }
2629 *status = U_ILLEGAL_ARGUMENT_ERROR;
2630 return NULL;
2631 }
2632
usearch_openFromCollator(const UChar * pattern,int32_t patternlength,const UChar * text,int32_t textlength,const UCollator * collator,UBreakIterator * breakiter,UErrorCode * status)2633 U_CAPI UStringSearch * U_EXPORT2 usearch_openFromCollator(
2634 const UChar *pattern,
2635 int32_t patternlength,
2636 const UChar *text,
2637 int32_t textlength,
2638 const UCollator *collator,
2639 UBreakIterator *breakiter,
2640 UErrorCode *status)
2641 {
2642 if (U_FAILURE(*status)) {
2643 return NULL;
2644 }
2645 #if UCONFIG_NO_BREAK_ITERATION
2646 if (breakiter != NULL) {
2647 *status = U_UNSUPPORTED_ERROR;
2648 return NULL;
2649 }
2650 #endif
2651 if (pattern == NULL || text == NULL || collator == NULL) {
2652 *status = U_ILLEGAL_ARGUMENT_ERROR;
2653 return NULL;
2654 }
2655
2656 // string search does not really work when numeric collation is turned on
2657 if(ucol_getAttribute(collator, UCOL_NUMERIC_COLLATION, status) == UCOL_ON) {
2658 *status = U_UNSUPPORTED_ERROR;
2659 return NULL;
2660 }
2661
2662 if (U_SUCCESS(*status)) {
2663 initializeFCD(status);
2664 if (U_FAILURE(*status)) {
2665 return NULL;
2666 }
2667
2668 UStringSearch *result;
2669 if (textlength == -1) {
2670 textlength = u_strlen(text);
2671 }
2672 if (patternlength == -1) {
2673 patternlength = u_strlen(pattern);
2674 }
2675 if (textlength <= 0 || patternlength <= 0) {
2676 *status = U_ILLEGAL_ARGUMENT_ERROR;
2677 return NULL;
2678 }
2679
2680 result = (UStringSearch *)uprv_malloc(sizeof(UStringSearch));
2681 if (result == NULL) {
2682 *status = U_MEMORY_ALLOCATION_ERROR;
2683 return NULL;
2684 }
2685
2686 result->collator = collator;
2687 result->strength = ucol_getStrength(collator);
2688 result->ceMask = getMask(result->strength);
2689 result->toShift =
2690 ucol_getAttribute(collator, UCOL_ALTERNATE_HANDLING, status) ==
2691 UCOL_SHIFTED;
2692 result->variableTop = ucol_getVariableTop(collator, status);
2693
2694 result->nfd = Normalizer2::getNFDInstance(*status);
2695
2696 if (U_FAILURE(*status)) {
2697 uprv_free(result);
2698 return NULL;
2699 }
2700
2701 result->search = (USearch *)uprv_malloc(sizeof(USearch));
2702 if (result->search == NULL) {
2703 *status = U_MEMORY_ALLOCATION_ERROR;
2704 uprv_free(result);
2705 return NULL;
2706 }
2707
2708 result->search->text = text;
2709 result->search->textLength = textlength;
2710
2711 result->pattern.text = pattern;
2712 result->pattern.textLength = patternlength;
2713 result->pattern.ces = NULL;
2714 result->pattern.pces = NULL;
2715
2716 result->search->breakIter = breakiter;
2717 #if !UCONFIG_NO_BREAK_ITERATION
2718 result->search->internalBreakIter = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(result->collator, ULOC_VALID_LOCALE, status), text, textlength, status);
2719 if (breakiter) {
2720 ubrk_setText(breakiter, text, textlength, status);
2721 }
2722 #endif
2723
2724 result->ownCollator = FALSE;
2725 result->search->matchedLength = 0;
2726 result->search->matchedIndex = USEARCH_DONE;
2727 result->utilIter = NULL;
2728 result->textIter = ucol_openElements(collator, text,
2729 textlength, status);
2730 result->textProcessedIter = NULL;
2731 if (U_FAILURE(*status)) {
2732 usearch_close(result);
2733 return NULL;
2734 }
2735
2736 result->search->isOverlap = FALSE;
2737 result->search->isCanonicalMatch = FALSE;
2738 result->search->elementComparisonType = 0;
2739 result->search->isForwardSearching = TRUE;
2740 result->search->reset = TRUE;
2741
2742 initialize(result, status);
2743
2744 if (U_FAILURE(*status)) {
2745 usearch_close(result);
2746 return NULL;
2747 }
2748
2749 return result;
2750 }
2751 return NULL;
2752 }
2753
usearch_close(UStringSearch * strsrch)2754 U_CAPI void U_EXPORT2 usearch_close(UStringSearch *strsrch)
2755 {
2756 if (strsrch) {
2757 if (strsrch->pattern.ces != strsrch->pattern.cesBuffer &&
2758 strsrch->pattern.ces) {
2759 uprv_free(strsrch->pattern.ces);
2760 }
2761
2762 if (strsrch->pattern.pces != NULL &&
2763 strsrch->pattern.pces != strsrch->pattern.pcesBuffer) {
2764 uprv_free(strsrch->pattern.pces);
2765 }
2766
2767 delete strsrch->textProcessedIter;
2768 ucol_closeElements(strsrch->textIter);
2769 ucol_closeElements(strsrch->utilIter);
2770
2771 if (strsrch->ownCollator && strsrch->collator) {
2772 ucol_close((UCollator *)strsrch->collator);
2773 }
2774
2775 #if !UCONFIG_NO_BREAK_ITERATION
2776 if (strsrch->search->internalBreakIter) {
2777 ubrk_close(strsrch->search->internalBreakIter);
2778 }
2779 #endif
2780
2781 uprv_free(strsrch->search);
2782 uprv_free(strsrch);
2783 }
2784 }
2785
2786 namespace {
2787
initTextProcessedIter(UStringSearch * strsrch,UErrorCode * status)2788 UBool initTextProcessedIter(UStringSearch *strsrch, UErrorCode *status) {
2789 if (U_FAILURE(*status)) { return FALSE; }
2790 if (strsrch->textProcessedIter == NULL) {
2791 strsrch->textProcessedIter = new icu::UCollationPCE(strsrch->textIter);
2792 if (strsrch->textProcessedIter == NULL) {
2793 *status = U_MEMORY_ALLOCATION_ERROR;
2794 return FALSE;
2795 }
2796 } else {
2797 strsrch->textProcessedIter->init(strsrch->textIter);
2798 }
2799 return TRUE;
2800 }
2801
2802 }
2803
2804 // set and get methods --------------------------------------------------
2805
usearch_setOffset(UStringSearch * strsrch,int32_t position,UErrorCode * status)2806 U_CAPI void U_EXPORT2 usearch_setOffset(UStringSearch *strsrch,
2807 int32_t position,
2808 UErrorCode *status)
2809 {
2810 if (U_SUCCESS(*status) && strsrch) {
2811 if (isOutOfBounds(strsrch->search->textLength, position)) {
2812 *status = U_INDEX_OUTOFBOUNDS_ERROR;
2813 }
2814 else {
2815 setColEIterOffset(strsrch->textIter, position);
2816 }
2817 strsrch->search->matchedIndex = USEARCH_DONE;
2818 strsrch->search->matchedLength = 0;
2819 strsrch->search->reset = FALSE;
2820 }
2821 }
2822
usearch_getOffset(const UStringSearch * strsrch)2823 U_CAPI int32_t U_EXPORT2 usearch_getOffset(const UStringSearch *strsrch)
2824 {
2825 if (strsrch) {
2826 int32_t result = ucol_getOffset(strsrch->textIter);
2827 if (isOutOfBounds(strsrch->search->textLength, result)) {
2828 return USEARCH_DONE;
2829 }
2830 return result;
2831 }
2832 return USEARCH_DONE;
2833 }
2834
usearch_setAttribute(UStringSearch * strsrch,USearchAttribute attribute,USearchAttributeValue value,UErrorCode * status)2835 U_CAPI void U_EXPORT2 usearch_setAttribute(UStringSearch *strsrch,
2836 USearchAttribute attribute,
2837 USearchAttributeValue value,
2838 UErrorCode *status)
2839 {
2840 if (U_SUCCESS(*status) && strsrch) {
2841 switch (attribute)
2842 {
2843 case USEARCH_OVERLAP :
2844 strsrch->search->isOverlap = (value == USEARCH_ON ? TRUE : FALSE);
2845 break;
2846 case USEARCH_CANONICAL_MATCH :
2847 strsrch->search->isCanonicalMatch = (value == USEARCH_ON ? TRUE :
2848 FALSE);
2849 break;
2850 case USEARCH_ELEMENT_COMPARISON :
2851 if (value == USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD || value == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD) {
2852 strsrch->search->elementComparisonType = (int16_t)value;
2853 } else {
2854 strsrch->search->elementComparisonType = 0;
2855 }
2856 break;
2857 case USEARCH_ATTRIBUTE_COUNT :
2858 default:
2859 *status = U_ILLEGAL_ARGUMENT_ERROR;
2860 }
2861 }
2862 if (value == USEARCH_ATTRIBUTE_VALUE_COUNT) {
2863 *status = U_ILLEGAL_ARGUMENT_ERROR;
2864 }
2865 }
2866
usearch_getAttribute(const UStringSearch * strsrch,USearchAttribute attribute)2867 U_CAPI USearchAttributeValue U_EXPORT2 usearch_getAttribute(
2868 const UStringSearch *strsrch,
2869 USearchAttribute attribute)
2870 {
2871 if (strsrch) {
2872 switch (attribute) {
2873 case USEARCH_OVERLAP :
2874 return (strsrch->search->isOverlap == TRUE ? USEARCH_ON :
2875 USEARCH_OFF);
2876 case USEARCH_CANONICAL_MATCH :
2877 return (strsrch->search->isCanonicalMatch == TRUE ? USEARCH_ON :
2878 USEARCH_OFF);
2879 case USEARCH_ELEMENT_COMPARISON :
2880 {
2881 int16_t value = strsrch->search->elementComparisonType;
2882 if (value == USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD || value == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD) {
2883 return (USearchAttributeValue)value;
2884 } else {
2885 return USEARCH_STANDARD_ELEMENT_COMPARISON;
2886 }
2887 }
2888 case USEARCH_ATTRIBUTE_COUNT :
2889 return USEARCH_DEFAULT;
2890 }
2891 }
2892 return USEARCH_DEFAULT;
2893 }
2894
usearch_getMatchedStart(const UStringSearch * strsrch)2895 U_CAPI int32_t U_EXPORT2 usearch_getMatchedStart(
2896 const UStringSearch *strsrch)
2897 {
2898 if (strsrch == NULL) {
2899 return USEARCH_DONE;
2900 }
2901 return strsrch->search->matchedIndex;
2902 }
2903
2904
usearch_getMatchedText(const UStringSearch * strsrch,UChar * result,int32_t resultCapacity,UErrorCode * status)2905 U_CAPI int32_t U_EXPORT2 usearch_getMatchedText(const UStringSearch *strsrch,
2906 UChar *result,
2907 int32_t resultCapacity,
2908 UErrorCode *status)
2909 {
2910 if (U_FAILURE(*status)) {
2911 return USEARCH_DONE;
2912 }
2913 if (strsrch == NULL || resultCapacity < 0 || (resultCapacity > 0 &&
2914 result == NULL)) {
2915 *status = U_ILLEGAL_ARGUMENT_ERROR;
2916 return USEARCH_DONE;
2917 }
2918
2919 int32_t copylength = strsrch->search->matchedLength;
2920 int32_t copyindex = strsrch->search->matchedIndex;
2921 if (copyindex == USEARCH_DONE) {
2922 u_terminateUChars(result, resultCapacity, 0, status);
2923 return USEARCH_DONE;
2924 }
2925
2926 if (resultCapacity < copylength) {
2927 copylength = resultCapacity;
2928 }
2929 if (copylength > 0) {
2930 uprv_memcpy(result, strsrch->search->text + copyindex,
2931 copylength * sizeof(UChar));
2932 }
2933 return u_terminateUChars(result, resultCapacity,
2934 strsrch->search->matchedLength, status);
2935 }
2936
usearch_getMatchedLength(const UStringSearch * strsrch)2937 U_CAPI int32_t U_EXPORT2 usearch_getMatchedLength(
2938 const UStringSearch *strsrch)
2939 {
2940 if (strsrch) {
2941 return strsrch->search->matchedLength;
2942 }
2943 return USEARCH_DONE;
2944 }
2945
2946 #if !UCONFIG_NO_BREAK_ITERATION
2947
usearch_setBreakIterator(UStringSearch * strsrch,UBreakIterator * breakiter,UErrorCode * status)2948 U_CAPI void U_EXPORT2 usearch_setBreakIterator(UStringSearch *strsrch,
2949 UBreakIterator *breakiter,
2950 UErrorCode *status)
2951 {
2952 if (U_SUCCESS(*status) && strsrch) {
2953 strsrch->search->breakIter = breakiter;
2954 if (breakiter) {
2955 ubrk_setText(breakiter, strsrch->search->text,
2956 strsrch->search->textLength, status);
2957 }
2958 }
2959 }
2960
2961 U_CAPI const UBreakIterator* U_EXPORT2
usearch_getBreakIterator(const UStringSearch * strsrch)2962 usearch_getBreakIterator(const UStringSearch *strsrch)
2963 {
2964 if (strsrch) {
2965 return strsrch->search->breakIter;
2966 }
2967 return NULL;
2968 }
2969
2970 #endif
2971
usearch_setText(UStringSearch * strsrch,const UChar * text,int32_t textlength,UErrorCode * status)2972 U_CAPI void U_EXPORT2 usearch_setText( UStringSearch *strsrch,
2973 const UChar *text,
2974 int32_t textlength,
2975 UErrorCode *status)
2976 {
2977 if (U_SUCCESS(*status)) {
2978 if (strsrch == NULL || text == NULL || textlength < -1 ||
2979 textlength == 0) {
2980 *status = U_ILLEGAL_ARGUMENT_ERROR;
2981 }
2982 else {
2983 if (textlength == -1) {
2984 textlength = u_strlen(text);
2985 }
2986 strsrch->search->text = text;
2987 strsrch->search->textLength = textlength;
2988 ucol_setText(strsrch->textIter, text, textlength, status);
2989 strsrch->search->matchedIndex = USEARCH_DONE;
2990 strsrch->search->matchedLength = 0;
2991 strsrch->search->reset = TRUE;
2992 #if !UCONFIG_NO_BREAK_ITERATION
2993 if (strsrch->search->breakIter != NULL) {
2994 ubrk_setText(strsrch->search->breakIter, text,
2995 textlength, status);
2996 }
2997 ubrk_setText(strsrch->search->internalBreakIter, text, textlength, status);
2998 #endif
2999 }
3000 }
3001 }
3002
usearch_getText(const UStringSearch * strsrch,int32_t * length)3003 U_CAPI const UChar * U_EXPORT2 usearch_getText(const UStringSearch *strsrch,
3004 int32_t *length)
3005 {
3006 if (strsrch) {
3007 *length = strsrch->search->textLength;
3008 return strsrch->search->text;
3009 }
3010 return NULL;
3011 }
3012
usearch_setCollator(UStringSearch * strsrch,const UCollator * collator,UErrorCode * status)3013 U_CAPI void U_EXPORT2 usearch_setCollator( UStringSearch *strsrch,
3014 const UCollator *collator,
3015 UErrorCode *status)
3016 {
3017 if (U_SUCCESS(*status)) {
3018 if (collator == NULL) {
3019 *status = U_ILLEGAL_ARGUMENT_ERROR;
3020 return;
3021 }
3022
3023 if (strsrch) {
3024 delete strsrch->textProcessedIter;
3025 strsrch->textProcessedIter = NULL;
3026 ucol_closeElements(strsrch->textIter);
3027 ucol_closeElements(strsrch->utilIter);
3028 strsrch->textIter = strsrch->utilIter = NULL;
3029 if (strsrch->ownCollator && (strsrch->collator != collator)) {
3030 ucol_close((UCollator *)strsrch->collator);
3031 strsrch->ownCollator = FALSE;
3032 }
3033 strsrch->collator = collator;
3034 strsrch->strength = ucol_getStrength(collator);
3035 strsrch->ceMask = getMask(strsrch->strength);
3036 #if !UCONFIG_NO_BREAK_ITERATION
3037 ubrk_close(strsrch->search->internalBreakIter);
3038 strsrch->search->internalBreakIter = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(collator, ULOC_VALID_LOCALE, status),
3039 strsrch->search->text, strsrch->search->textLength, status);
3040 #endif
3041 // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
3042 strsrch->toShift =
3043 ucol_getAttribute(collator, UCOL_ALTERNATE_HANDLING, status) ==
3044 UCOL_SHIFTED;
3045 // if status is a failure, ucol_getVariableTop returns 0
3046 strsrch->variableTop = ucol_getVariableTop(collator, status);
3047 strsrch->textIter = ucol_openElements(collator,
3048 strsrch->search->text,
3049 strsrch->search->textLength,
3050 status);
3051 strsrch->utilIter = ucol_openElements(
3052 collator, strsrch->pattern.text, strsrch->pattern.textLength, status);
3053 // initialize() _after_ setting the iterators for the new collator.
3054 initialize(strsrch, status);
3055 }
3056
3057 // **** are these calls needed?
3058 // **** we call uprv_init_pce in initializePatternPCETable
3059 // **** and the CEIBuffer constructor...
3060 #if 0
3061 uprv_init_pce(strsrch->textIter);
3062 uprv_init_pce(strsrch->utilIter);
3063 #endif
3064 }
3065 }
3066
usearch_getCollator(const UStringSearch * strsrch)3067 U_CAPI UCollator * U_EXPORT2 usearch_getCollator(const UStringSearch *strsrch)
3068 {
3069 if (strsrch) {
3070 return (UCollator *)strsrch->collator;
3071 }
3072 return NULL;
3073 }
3074
usearch_setPattern(UStringSearch * strsrch,const UChar * pattern,int32_t patternlength,UErrorCode * status)3075 U_CAPI void U_EXPORT2 usearch_setPattern( UStringSearch *strsrch,
3076 const UChar *pattern,
3077 int32_t patternlength,
3078 UErrorCode *status)
3079 {
3080 if (U_SUCCESS(*status)) {
3081 if (strsrch == NULL || pattern == NULL) {
3082 *status = U_ILLEGAL_ARGUMENT_ERROR;
3083 }
3084 else {
3085 if (patternlength == -1) {
3086 patternlength = u_strlen(pattern);
3087 }
3088 if (patternlength == 0) {
3089 *status = U_ILLEGAL_ARGUMENT_ERROR;
3090 return;
3091 }
3092 strsrch->pattern.text = pattern;
3093 strsrch->pattern.textLength = patternlength;
3094 initialize(strsrch, status);
3095 }
3096 }
3097 }
3098
3099 U_CAPI const UChar* U_EXPORT2
usearch_getPattern(const UStringSearch * strsrch,int32_t * length)3100 usearch_getPattern(const UStringSearch *strsrch,
3101 int32_t *length)
3102 {
3103 if (strsrch) {
3104 *length = strsrch->pattern.textLength;
3105 return strsrch->pattern.text;
3106 }
3107 return NULL;
3108 }
3109
3110 // miscellanous methods --------------------------------------------------
3111
usearch_first(UStringSearch * strsrch,UErrorCode * status)3112 U_CAPI int32_t U_EXPORT2 usearch_first(UStringSearch *strsrch,
3113 UErrorCode *status)
3114 {
3115 if (strsrch && U_SUCCESS(*status)) {
3116 strsrch->search->isForwardSearching = TRUE;
3117 usearch_setOffset(strsrch, 0, status);
3118 if (U_SUCCESS(*status)) {
3119 return usearch_next(strsrch, status);
3120 }
3121 }
3122 return USEARCH_DONE;
3123 }
3124
usearch_following(UStringSearch * strsrch,int32_t position,UErrorCode * status)3125 U_CAPI int32_t U_EXPORT2 usearch_following(UStringSearch *strsrch,
3126 int32_t position,
3127 UErrorCode *status)
3128 {
3129 if (strsrch && U_SUCCESS(*status)) {
3130 strsrch->search->isForwardSearching = TRUE;
3131 // position checked in usearch_setOffset
3132 usearch_setOffset(strsrch, position, status);
3133 if (U_SUCCESS(*status)) {
3134 return usearch_next(strsrch, status);
3135 }
3136 }
3137 return USEARCH_DONE;
3138 }
3139
usearch_last(UStringSearch * strsrch,UErrorCode * status)3140 U_CAPI int32_t U_EXPORT2 usearch_last(UStringSearch *strsrch,
3141 UErrorCode *status)
3142 {
3143 if (strsrch && U_SUCCESS(*status)) {
3144 strsrch->search->isForwardSearching = FALSE;
3145 usearch_setOffset(strsrch, strsrch->search->textLength, status);
3146 if (U_SUCCESS(*status)) {
3147 return usearch_previous(strsrch, status);
3148 }
3149 }
3150 return USEARCH_DONE;
3151 }
3152
usearch_preceding(UStringSearch * strsrch,int32_t position,UErrorCode * status)3153 U_CAPI int32_t U_EXPORT2 usearch_preceding(UStringSearch *strsrch,
3154 int32_t position,
3155 UErrorCode *status)
3156 {
3157 if (strsrch && U_SUCCESS(*status)) {
3158 strsrch->search->isForwardSearching = FALSE;
3159 // position checked in usearch_setOffset
3160 usearch_setOffset(strsrch, position, status);
3161 if (U_SUCCESS(*status)) {
3162 return usearch_previous(strsrch, status);
3163 }
3164 }
3165 return USEARCH_DONE;
3166 }
3167
3168 /**
3169 * If a direction switch is required, we'll count the number of ces till the
3170 * beginning of the collation element iterator and iterate forwards that
3171 * number of times. This is so that we get to the correct point within the
3172 * string to continue the search in. Imagine when we are in the middle of the
3173 * normalization buffer when the change in direction is request. arrrgghh....
3174 * After searching the offset within the collation element iterator will be
3175 * shifted to the start of the match. If a match is not found, the offset would
3176 * have been set to the end of the text string in the collation element
3177 * iterator.
3178 * Okay, here's my take on normalization buffer. The only time when there can
3179 * be 2 matches within the same normalization is when the pattern is consists
3180 * of all accents. But since the offset returned is from the text string, we
3181 * should not confuse the caller by returning the second match within the
3182 * same normalization buffer. If we do, the 2 results will have the same match
3183 * offsets, and that'll be confusing. I'll return the next match that doesn't
3184 * fall within the same normalization buffer. Note this does not affect the
3185 * results of matches spanning the text and the normalization buffer.
3186 * The position to start searching is taken from the collation element
3187 * iterator. Callers of this API would have to set the offset in the collation
3188 * element iterator before using this method.
3189 */
usearch_next(UStringSearch * strsrch,UErrorCode * status)3190 U_CAPI int32_t U_EXPORT2 usearch_next(UStringSearch *strsrch,
3191 UErrorCode *status)
3192 {
3193 if (U_SUCCESS(*status) && strsrch) {
3194 // note offset is either equivalent to the start of the previous match
3195 // or is set by the user
3196 int32_t offset = usearch_getOffset(strsrch);
3197 USearch *search = strsrch->search;
3198 search->reset = FALSE;
3199 int32_t textlength = search->textLength;
3200 if (search->isForwardSearching) {
3201 #if BOYER_MOORE
3202 if (offset == textlength
3203 || (!search->isOverlap &&
3204 (offset + strsrch->pattern.defaultShiftSize > textlength ||
3205 (search->matchedIndex != USEARCH_DONE &&
3206 offset + search->matchedLength >= textlength)))) {
3207 // not enough characters to match
3208 setMatchNotFound(strsrch);
3209 return USEARCH_DONE;
3210 }
3211 #else
3212 if (offset == textlength ||
3213 (! search->isOverlap &&
3214 (search->matchedIndex != USEARCH_DONE &&
3215 offset + search->matchedLength > textlength))) {
3216 // not enough characters to match
3217 setMatchNotFound(strsrch);
3218 return USEARCH_DONE;
3219 }
3220 #endif
3221 }
3222 else {
3223 // switching direction.
3224 // if matchedIndex == USEARCH_DONE, it means that either a
3225 // setOffset has been called or that previous ran off the text
3226 // string. the iterator would have been set to offset 0 if a
3227 // match is not found.
3228 search->isForwardSearching = TRUE;
3229 if (search->matchedIndex != USEARCH_DONE) {
3230 // there's no need to set the collation element iterator
3231 // the next call to next will set the offset.
3232 return search->matchedIndex;
3233 }
3234 }
3235
3236 if (U_SUCCESS(*status)) {
3237 if (strsrch->pattern.cesLength == 0) {
3238 if (search->matchedIndex == USEARCH_DONE) {
3239 search->matchedIndex = offset;
3240 }
3241 else { // moves by codepoints
3242 U16_FWD_1(search->text, search->matchedIndex, textlength);
3243 }
3244
3245 search->matchedLength = 0;
3246 setColEIterOffset(strsrch->textIter, search->matchedIndex);
3247 // status checked below
3248 if (search->matchedIndex == textlength) {
3249 search->matchedIndex = USEARCH_DONE;
3250 }
3251 }
3252 else {
3253 if (search->matchedLength > 0) {
3254 // if matchlength is 0 we are at the start of the iteration
3255 if (search->isOverlap) {
3256 ucol_setOffset(strsrch->textIter, offset + 1, status);
3257 }
3258 else {
3259 ucol_setOffset(strsrch->textIter,
3260 offset + search->matchedLength, status);
3261 }
3262 }
3263 else {
3264 // for boundary check purposes. this will ensure that the
3265 // next match will not preceed the current offset
3266 // note search->matchedIndex will always be set to something
3267 // in the code
3268 search->matchedIndex = offset - 1;
3269 }
3270
3271 if (search->isCanonicalMatch) {
3272 // can't use exact here since extra accents are allowed.
3273 usearch_handleNextCanonical(strsrch, status);
3274 }
3275 else {
3276 usearch_handleNextExact(strsrch, status);
3277 }
3278 }
3279
3280 if (U_FAILURE(*status)) {
3281 return USEARCH_DONE;
3282 }
3283
3284 #if !BOYER_MOORE
3285 if (search->matchedIndex == USEARCH_DONE) {
3286 ucol_setOffset(strsrch->textIter, search->textLength, status);
3287 } else {
3288 ucol_setOffset(strsrch->textIter, search->matchedIndex, status);
3289 }
3290 #endif
3291
3292 return search->matchedIndex;
3293 }
3294 }
3295 return USEARCH_DONE;
3296 }
3297
usearch_previous(UStringSearch * strsrch,UErrorCode * status)3298 U_CAPI int32_t U_EXPORT2 usearch_previous(UStringSearch *strsrch,
3299 UErrorCode *status)
3300 {
3301 if (U_SUCCESS(*status) && strsrch) {
3302 int32_t offset;
3303 USearch *search = strsrch->search;
3304 if (search->reset) {
3305 offset = search->textLength;
3306 search->isForwardSearching = FALSE;
3307 search->reset = FALSE;
3308 setColEIterOffset(strsrch->textIter, offset);
3309 }
3310 else {
3311 offset = usearch_getOffset(strsrch);
3312 }
3313
3314 int32_t matchedindex = search->matchedIndex;
3315 if (search->isForwardSearching == TRUE) {
3316 // switching direction.
3317 // if matchedIndex == USEARCH_DONE, it means that either a
3318 // setOffset has been called or that next ran off the text
3319 // string. the iterator would have been set to offset textLength if
3320 // a match is not found.
3321 search->isForwardSearching = FALSE;
3322 if (matchedindex != USEARCH_DONE) {
3323 return matchedindex;
3324 }
3325 }
3326 else {
3327 #if BOYER_MOORE
3328 if (offset == 0 || matchedindex == 0 ||
3329 (!search->isOverlap &&
3330 (offset < strsrch->pattern.defaultShiftSize ||
3331 (matchedindex != USEARCH_DONE &&
3332 matchedindex < strsrch->pattern.defaultShiftSize)))) {
3333 // not enough characters to match
3334 setMatchNotFound(strsrch);
3335 return USEARCH_DONE;
3336 }
3337 #else
3338 // Could check pattern length, but the
3339 // linear search will do the right thing
3340 if (offset == 0 || matchedindex == 0) {
3341 setMatchNotFound(strsrch);
3342 return USEARCH_DONE;
3343 }
3344 #endif
3345 }
3346
3347 if (U_SUCCESS(*status)) {
3348 if (strsrch->pattern.cesLength == 0) {
3349 search->matchedIndex =
3350 (matchedindex == USEARCH_DONE ? offset : matchedindex);
3351 if (search->matchedIndex == 0) {
3352 setMatchNotFound(strsrch);
3353 // status checked below
3354 }
3355 else { // move by codepoints
3356 U16_BACK_1(search->text, 0, search->matchedIndex);
3357 setColEIterOffset(strsrch->textIter, search->matchedIndex);
3358 // status checked below
3359 search->matchedLength = 0;
3360 }
3361 }
3362 else {
3363 if (strsrch->search->isCanonicalMatch) {
3364 // can't use exact here since extra accents are allowed.
3365 usearch_handlePreviousCanonical(strsrch, status);
3366 // status checked below
3367 }
3368 else {
3369 usearch_handlePreviousExact(strsrch, status);
3370 // status checked below
3371 }
3372 }
3373
3374 if (U_FAILURE(*status)) {
3375 return USEARCH_DONE;
3376 }
3377
3378 return search->matchedIndex;
3379 }
3380 }
3381 return USEARCH_DONE;
3382 }
3383
3384
3385
usearch_reset(UStringSearch * strsrch)3386 U_CAPI void U_EXPORT2 usearch_reset(UStringSearch *strsrch)
3387 {
3388 /*
3389 reset is setting the attributes that are already in
3390 string search, hence all attributes in the collator should
3391 be retrieved without any problems
3392 */
3393 if (strsrch) {
3394 UErrorCode status = U_ZERO_ERROR;
3395 UBool sameCollAttribute = TRUE;
3396 uint32_t ceMask;
3397 UBool shift;
3398 uint32_t varTop;
3399
3400 // **** hack to deal w/ how processed CEs encode quaternary ****
3401 UCollationStrength newStrength = ucol_getStrength(strsrch->collator);
3402 if ((strsrch->strength < UCOL_QUATERNARY && newStrength >= UCOL_QUATERNARY) ||
3403 (strsrch->strength >= UCOL_QUATERNARY && newStrength < UCOL_QUATERNARY)) {
3404 sameCollAttribute = FALSE;
3405 }
3406
3407 strsrch->strength = ucol_getStrength(strsrch->collator);
3408 ceMask = getMask(strsrch->strength);
3409 if (strsrch->ceMask != ceMask) {
3410 strsrch->ceMask = ceMask;
3411 sameCollAttribute = FALSE;
3412 }
3413
3414 // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
3415 shift = ucol_getAttribute(strsrch->collator, UCOL_ALTERNATE_HANDLING,
3416 &status) == UCOL_SHIFTED;
3417 if (strsrch->toShift != shift) {
3418 strsrch->toShift = shift;
3419 sameCollAttribute = FALSE;
3420 }
3421
3422 // if status is a failure, ucol_getVariableTop returns 0
3423 varTop = ucol_getVariableTop(strsrch->collator, &status);
3424 if (strsrch->variableTop != varTop) {
3425 strsrch->variableTop = varTop;
3426 sameCollAttribute = FALSE;
3427 }
3428 if (!sameCollAttribute) {
3429 initialize(strsrch, &status);
3430 }
3431 ucol_setText(strsrch->textIter, strsrch->search->text,
3432 strsrch->search->textLength,
3433 &status);
3434 strsrch->search->matchedLength = 0;
3435 strsrch->search->matchedIndex = USEARCH_DONE;
3436 strsrch->search->isOverlap = FALSE;
3437 strsrch->search->isCanonicalMatch = FALSE;
3438 strsrch->search->elementComparisonType = 0;
3439 strsrch->search->isForwardSearching = TRUE;
3440 strsrch->search->reset = TRUE;
3441 }
3442 }
3443
3444 //
3445 // CEI Collation Element + source text index.
3446 // These structs are kept in the circular buffer.
3447 //
3448 struct CEI {
3449 int64_t ce;
3450 int32_t lowIndex;
3451 int32_t highIndex;
3452 };
3453
3454 U_NAMESPACE_BEGIN
3455
3456 namespace {
3457 //
3458 // CEIBuffer A circular buffer of CEs-with-index from the text being searched.
3459 //
3460 #define DEFAULT_CEBUFFER_SIZE 96
3461 #define CEBUFFER_EXTRA 32
3462 // Some typical max values to make buffer size more reasonable for asymmetric search.
3463 // #8694 is for a better long-term solution to allocation of this buffer.
3464 #define MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L 8
3465 #define MAX_TARGET_IGNORABLES_PER_PAT_OTHER 3
3466 #define MIGHT_BE_JAMO_L(c) ((c >= 0x1100 && c <= 0x115E) || (c >= 0x3131 && c <= 0x314E) || (c >= 0x3165 && c <= 0x3186))
3467 struct CEIBuffer {
3468 CEI defBuf[DEFAULT_CEBUFFER_SIZE];
3469 CEI *buf;
3470 int32_t bufSize;
3471 int32_t firstIx;
3472 int32_t limitIx;
3473 UCollationElements *ceIter;
3474 UStringSearch *strSearch;
3475
3476
3477
3478 CEIBuffer(UStringSearch *ss, UErrorCode *status);
3479 ~CEIBuffer();
3480 const CEI *get(int32_t index);
3481 const CEI *getPrevious(int32_t index);
3482 };
3483
3484
CEIBuffer(UStringSearch * ss,UErrorCode * status)3485 CEIBuffer::CEIBuffer(UStringSearch *ss, UErrorCode *status) {
3486 buf = defBuf;
3487 strSearch = ss;
3488 bufSize = ss->pattern.pcesLength + CEBUFFER_EXTRA;
3489 if (ss->search->elementComparisonType != 0) {
3490 const UChar * patText = ss->pattern.text;
3491 if (patText) {
3492 const UChar * patTextLimit = patText + ss->pattern.textLength;
3493 while ( patText < patTextLimit ) {
3494 UChar c = *patText++;
3495 if (MIGHT_BE_JAMO_L(c)) {
3496 bufSize += MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L;
3497 } else {
3498 // No check for surrogates, we might allocate slightly more buffer than necessary.
3499 bufSize += MAX_TARGET_IGNORABLES_PER_PAT_OTHER;
3500 }
3501 }
3502 }
3503 }
3504 ceIter = ss->textIter;
3505 firstIx = 0;
3506 limitIx = 0;
3507
3508 if (!initTextProcessedIter(ss, status)) { return; }
3509
3510 if (bufSize>DEFAULT_CEBUFFER_SIZE) {
3511 buf = (CEI *)uprv_malloc(bufSize * sizeof(CEI));
3512 if (buf == NULL) {
3513 *status = U_MEMORY_ALLOCATION_ERROR;
3514 }
3515 }
3516 }
3517
3518 // TODO: add a reset or init function so that allocated
3519 // buffers can be retained & reused.
3520
~CEIBuffer()3521 CEIBuffer::~CEIBuffer() {
3522 if (buf != defBuf) {
3523 uprv_free(buf);
3524 }
3525 }
3526
3527
3528 // Get the CE with the specified index.
3529 // Index must be in the range
3530 // n-history_size < index < n+1
3531 // where n is the largest index to have been fetched by some previous call to this function.
3532 // The CE value will be UCOL__PROCESSED_NULLORDER at end of input.
3533 //
get(int32_t index)3534 const CEI *CEIBuffer::get(int32_t index) {
3535 int i = index % bufSize;
3536
3537 if (index>=firstIx && index<limitIx) {
3538 // The request was for an entry already in our buffer.
3539 // Just return it.
3540 return &buf[i];
3541 }
3542
3543 // Caller is requesting a new, never accessed before, CE.
3544 // Verify that it is the next one in sequence, which is all
3545 // that is allowed.
3546 if (index != limitIx) {
3547 U_ASSERT(FALSE);
3548
3549 return NULL;
3550 }
3551
3552 // Manage the circular CE buffer indexing
3553 limitIx++;
3554
3555 if (limitIx - firstIx >= bufSize) {
3556 // The buffer is full, knock out the lowest-indexed entry.
3557 firstIx++;
3558 }
3559
3560 UErrorCode status = U_ZERO_ERROR;
3561
3562 buf[i].ce = strSearch->textProcessedIter->nextProcessed(&buf[i].lowIndex, &buf[i].highIndex, &status);
3563
3564 return &buf[i];
3565 }
3566
3567 // Get the CE with the specified index.
3568 // Index must be in the range
3569 // n-history_size < index < n+1
3570 // where n is the largest index to have been fetched by some previous call to this function.
3571 // The CE value will be UCOL__PROCESSED_NULLORDER at end of input.
3572 //
getPrevious(int32_t index)3573 const CEI *CEIBuffer::getPrevious(int32_t index) {
3574 int i = index % bufSize;
3575
3576 if (index>=firstIx && index<limitIx) {
3577 // The request was for an entry already in our buffer.
3578 // Just return it.
3579 return &buf[i];
3580 }
3581
3582 // Caller is requesting a new, never accessed before, CE.
3583 // Verify that it is the next one in sequence, which is all
3584 // that is allowed.
3585 if (index != limitIx) {
3586 U_ASSERT(FALSE);
3587
3588 return NULL;
3589 }
3590
3591 // Manage the circular CE buffer indexing
3592 limitIx++;
3593
3594 if (limitIx - firstIx >= bufSize) {
3595 // The buffer is full, knock out the lowest-indexed entry.
3596 firstIx++;
3597 }
3598
3599 UErrorCode status = U_ZERO_ERROR;
3600
3601 buf[i].ce = strSearch->textProcessedIter->previousProcessed(&buf[i].lowIndex, &buf[i].highIndex, &status);
3602
3603 return &buf[i];
3604 }
3605
3606 }
3607
3608 U_NAMESPACE_END
3609
3610
3611 // #define USEARCH_DEBUG
3612
3613 #ifdef USEARCH_DEBUG
3614 #include <stdio.h>
3615 #include <stdlib.h>
3616 #endif
3617
3618 /*
3619 * Find the next break boundary after startIndex. If the UStringSearch object
3620 * has an external break iterator, use that. Otherwise use the internal character
3621 * break iterator.
3622 */
nextBoundaryAfter(UStringSearch * strsrch,int32_t startIndex)3623 static int32_t nextBoundaryAfter(UStringSearch *strsrch, int32_t startIndex) {
3624 #if 0
3625 const UChar *text = strsrch->search->text;
3626 int32_t textLen = strsrch->search->textLength;
3627
3628 U_ASSERT(startIndex>=0);
3629 U_ASSERT(startIndex<=textLen);
3630
3631 if (startIndex >= textLen) {
3632 return startIndex;
3633 }
3634
3635 UChar32 c;
3636 int32_t i = startIndex;
3637 U16_NEXT(text, i, textLen, c);
3638
3639 // If we are on a control character, stop without looking for combining marks.
3640 // Control characters do not combine.
3641 int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
3642 if (gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR) {
3643 return i;
3644 }
3645
3646 // The initial character was not a control, and can thus accept trailing
3647 // combining characters. Advance over however many of them there are.
3648 int32_t indexOfLastCharChecked;
3649 for (;;) {
3650 indexOfLastCharChecked = i;
3651 if (i>=textLen) {
3652 break;
3653 }
3654 U16_NEXT(text, i, textLen, c);
3655 gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
3656 if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) {
3657 break;
3658 }
3659 }
3660 return indexOfLastCharChecked;
3661 #elif !UCONFIG_NO_BREAK_ITERATION
3662 UBreakIterator *breakiterator = strsrch->search->breakIter;
3663
3664 if (breakiterator == NULL) {
3665 breakiterator = strsrch->search->internalBreakIter;
3666 }
3667
3668 if (breakiterator != NULL) {
3669 return ubrk_following(breakiterator, startIndex);
3670 }
3671
3672 return startIndex;
3673 #else
3674 // **** or should we use the original code? ****
3675 return startIndex;
3676 #endif
3677
3678 }
3679
3680 /*
3681 * Returns TRUE if index is on a break boundary. If the UStringSearch
3682 * has an external break iterator, test using that, otherwise test
3683 * using the internal character break iterator.
3684 */
isBreakBoundary(UStringSearch * strsrch,int32_t index)3685 static UBool isBreakBoundary(UStringSearch *strsrch, int32_t index) {
3686 #if 0
3687 const UChar *text = strsrch->search->text;
3688 int32_t textLen = strsrch->search->textLength;
3689
3690 U_ASSERT(index>=0);
3691 U_ASSERT(index<=textLen);
3692
3693 if (index>=textLen || index<=0) {
3694 return TRUE;
3695 }
3696
3697 // If the character at the current index is not a GRAPHEME_EXTEND
3698 // then we can not be within a combining sequence.
3699 UChar32 c;
3700 U16_GET(text, 0, index, textLen, c);
3701 int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
3702 if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) {
3703 return TRUE;
3704 }
3705
3706 // We are at a combining mark. If the preceding character is anything
3707 // except a CONTROL, CR or LF, we are in a combining sequence.
3708 U16_PREV(text, 0, index, c);
3709 gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
3710 UBool combining = !(gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR);
3711 return !combining;
3712 #elif !UCONFIG_NO_BREAK_ITERATION
3713 UBreakIterator *breakiterator = strsrch->search->breakIter;
3714
3715 if (breakiterator == NULL) {
3716 breakiterator = strsrch->search->internalBreakIter;
3717 }
3718
3719 return (breakiterator != NULL && ubrk_isBoundary(breakiterator, index));
3720 #else
3721 // **** or use the original code? ****
3722 return TRUE;
3723 #endif
3724 }
3725
3726 #if 0
3727 static UBool onBreakBoundaries(const UStringSearch *strsrch, int32_t start, int32_t end)
3728 {
3729 #if !UCONFIG_NO_BREAK_ITERATION
3730 UBreakIterator *breakiterator = strsrch->search->breakIter;
3731
3732 if (breakiterator != NULL) {
3733 int32_t startindex = ubrk_first(breakiterator);
3734 int32_t endindex = ubrk_last(breakiterator);
3735
3736 // out-of-range indexes are never boundary positions
3737 if (start < startindex || start > endindex ||
3738 end < startindex || end > endindex) {
3739 return FALSE;
3740 }
3741
3742 return ubrk_isBoundary(breakiterator, start) &&
3743 ubrk_isBoundary(breakiterator, end);
3744 }
3745 #endif
3746
3747 return TRUE;
3748 }
3749 #endif
3750
3751 typedef enum {
3752 U_CE_MATCH = -1,
3753 U_CE_NO_MATCH = 0,
3754 U_CE_SKIP_TARG,
3755 U_CE_SKIP_PATN
3756 } UCompareCEsResult;
3757 #define U_CE_LEVEL2_BASE 0x00000005
3758 #define U_CE_LEVEL3_BASE 0x00050000
3759
compareCE64s(int64_t targCE,int64_t patCE,int16_t compareType)3760 static UCompareCEsResult compareCE64s(int64_t targCE, int64_t patCE, int16_t compareType) {
3761 if (targCE == patCE) {
3762 return U_CE_MATCH;
3763 }
3764 if (compareType == 0) {
3765 return U_CE_NO_MATCH;
3766 }
3767
3768 int64_t targCEshifted = targCE >> 32;
3769 int64_t patCEshifted = patCE >> 32;
3770 int64_t mask;
3771
3772 mask = 0xFFFF0000;
3773 int32_t targLev1 = (int32_t)(targCEshifted & mask);
3774 int32_t patLev1 = (int32_t)(patCEshifted & mask);
3775 if ( targLev1 != patLev1 ) {
3776 if ( targLev1 == 0 ) {
3777 return U_CE_SKIP_TARG;
3778 }
3779 if ( patLev1 == 0 && compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD ) {
3780 return U_CE_SKIP_PATN;
3781 }
3782 return U_CE_NO_MATCH;
3783 }
3784
3785 mask = 0x0000FFFF;
3786 int32_t targLev2 = (int32_t)(targCEshifted & mask);
3787 int32_t patLev2 = (int32_t)(patCEshifted & mask);
3788 if ( targLev2 != patLev2 ) {
3789 if ( targLev2 == 0 ) {
3790 return U_CE_SKIP_TARG;
3791 }
3792 if ( patLev2 == 0 && compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD ) {
3793 return U_CE_SKIP_PATN;
3794 }
3795 return (patLev2 == U_CE_LEVEL2_BASE || (compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD && targLev2 == U_CE_LEVEL2_BASE) )?
3796 U_CE_MATCH: U_CE_NO_MATCH;
3797 }
3798
3799 mask = 0xFFFF0000;
3800 int32_t targLev3 = (int32_t)(targCE & mask);
3801 int32_t patLev3 = (int32_t)(patCE & mask);
3802 if ( targLev3 != patLev3 ) {
3803 return (patLev3 == U_CE_LEVEL3_BASE || (compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD && targLev3 == U_CE_LEVEL3_BASE) )?
3804 U_CE_MATCH: U_CE_NO_MATCH;
3805 }
3806
3807 return U_CE_MATCH;
3808 }
3809
3810 #if BOYER_MOORE
3811 // TODO: #if BOYER_MOORE, need 32-bit version of compareCE64s
3812 #endif
3813
3814 namespace {
3815
codePointAt(const USearch & search,int32_t index)3816 UChar32 codePointAt(const USearch &search, int32_t index) {
3817 if (index < search.textLength) {
3818 UChar32 c;
3819 U16_NEXT(search.text, index, search.textLength, c);
3820 return c;
3821 }
3822 return U_SENTINEL;
3823 }
3824
codePointBefore(const USearch & search,int32_t index)3825 UChar32 codePointBefore(const USearch &search, int32_t index) {
3826 if (0 < index) {
3827 UChar32 c;
3828 U16_PREV(search.text, 0, index, c);
3829 return c;
3830 }
3831 return U_SENTINEL;
3832 }
3833
3834 } // namespace
3835
usearch_search(UStringSearch * strsrch,int32_t startIdx,int32_t * matchStart,int32_t * matchLimit,UErrorCode * status)3836 U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch *strsrch,
3837 int32_t startIdx,
3838 int32_t *matchStart,
3839 int32_t *matchLimit,
3840 UErrorCode *status)
3841 {
3842 if (U_FAILURE(*status)) {
3843 return FALSE;
3844 }
3845
3846 // TODO: reject search patterns beginning with a combining char.
3847
3848 #ifdef USEARCH_DEBUG
3849 if (getenv("USEARCH_DEBUG") != NULL) {
3850 printf("Pattern CEs\n");
3851 for (int ii=0; ii<strsrch->pattern.cesLength; ii++) {
3852 printf(" %8x", strsrch->pattern.ces[ii]);
3853 }
3854 printf("\n");
3855 }
3856
3857 #endif
3858 // Input parameter sanity check.
3859 // TODO: should input indicies clip to the text length
3860 // in the same way that UText does.
3861 if(strsrch->pattern.cesLength == 0 ||
3862 startIdx < 0 ||
3863 startIdx > strsrch->search->textLength ||
3864 strsrch->pattern.ces == NULL) {
3865 *status = U_ILLEGAL_ARGUMENT_ERROR;
3866 return FALSE;
3867 }
3868
3869 if (strsrch->pattern.pces == NULL) {
3870 initializePatternPCETable(strsrch, status);
3871 }
3872
3873 ucol_setOffset(strsrch->textIter, startIdx, status);
3874 CEIBuffer ceb(strsrch, status);
3875
3876
3877 int32_t targetIx = 0;
3878 const CEI *targetCEI = NULL;
3879 int32_t patIx;
3880 UBool found;
3881
3882 int32_t mStart = -1;
3883 int32_t mLimit = -1;
3884 int32_t minLimit;
3885 int32_t maxLimit;
3886
3887
3888
3889 // Outer loop moves over match starting positions in the
3890 // target CE space.
3891 // Here we see the target as a sequence of collation elements, resulting from the following:
3892 // 1. Target characters were decomposed, and (if appropriate) other compressions and expansions are applied
3893 // (for example, digraphs such as IJ may be broken into two characters).
3894 // 2. An int64_t CE weight is determined for each resulting unit (high 16 bits are primary strength, next
3895 // 16 bits are secondary, next 16 (the high 16 bits of the low 32-bit half) are tertiary. Any of these
3896 // fields that are for strengths below that of the collator are set to 0. If this makes the int64_t
3897 // CE weight 0 (as for a combining diacritic with secondary weight when the collator strentgh is primary),
3898 // then the CE is deleted, so the following code sees only CEs that are relevant.
3899 // For each CE, the lowIndex and highIndex correspond to where this CE begins and ends in the original text.
3900 // If lowIndex==highIndex, either the CE resulted from an expansion/decomposition of one of the original text
3901 // characters, or the CE marks the limit of the target text (in which case the CE weight is UCOL_PROCESSED_NULLORDER).
3902 //
3903 for(targetIx=0; ; targetIx++)
3904 {
3905 found = TRUE;
3906 // Inner loop checks for a match beginning at each
3907 // position from the outer loop.
3908 int32_t targetIxOffset = 0;
3909 int64_t patCE = 0;
3910 // For targetIx > 0, this ceb.get gets a CE that is as far back in the ring buffer
3911 // (compared to the last CE fetched for the previous targetIx value) as we need to go
3912 // for this targetIx value, so if it is non-NULL then other ceb.get calls should be OK.
3913 const CEI *firstCEI = ceb.get(targetIx);
3914 if (firstCEI == NULL) {
3915 *status = U_INTERNAL_PROGRAM_ERROR;
3916 found = FALSE;
3917 break;
3918 }
3919
3920 for (patIx=0; patIx<strsrch->pattern.pcesLength; patIx++) {
3921 patCE = strsrch->pattern.pces[patIx];
3922 targetCEI = ceb.get(targetIx+patIx+targetIxOffset);
3923 // Compare CE from target string with CE from the pattern.
3924 // Note that the target CE will be UCOL_PROCESSED_NULLORDER if we reach the end of input,
3925 // which will fail the compare, below.
3926 UCompareCEsResult ceMatch = compareCE64s(targetCEI->ce, patCE, strsrch->search->elementComparisonType);
3927 if ( ceMatch == U_CE_NO_MATCH ) {
3928 found = FALSE;
3929 break;
3930 } else if ( ceMatch > U_CE_NO_MATCH ) {
3931 if ( ceMatch == U_CE_SKIP_TARG ) {
3932 // redo with same patCE, next targCE
3933 patIx--;
3934 targetIxOffset++;
3935 } else { // ceMatch == U_CE_SKIP_PATN
3936 // redo with same targCE, next patCE
3937 targetIxOffset--;
3938 }
3939 }
3940 }
3941 targetIxOffset += strsrch->pattern.pcesLength; // this is now the offset in target CE space to end of the match so far
3942
3943 if (!found && ((targetCEI == NULL) || (targetCEI->ce != UCOL_PROCESSED_NULLORDER))) {
3944 // No match at this targetIx. Try again at the next.
3945 continue;
3946 }
3947
3948 if (!found) {
3949 // No match at all, we have run off the end of the target text.
3950 break;
3951 }
3952
3953
3954 // We have found a match in CE space.
3955 // Now determine the bounds in string index space.
3956 // There still is a chance of match failure if the CE range not correspond to
3957 // an acceptable character range.
3958 //
3959 const CEI *lastCEI = ceb.get(targetIx + targetIxOffset - 1);
3960
3961 mStart = firstCEI->lowIndex;
3962 minLimit = lastCEI->lowIndex;
3963
3964 // Look at the CE following the match. If it is UCOL_NULLORDER the match
3965 // extended to the end of input, and the match is good.
3966
3967 // Look at the high and low indices of the CE following the match. If
3968 // they are the same it means one of two things:
3969 // 1. The match extended to the last CE from the target text, which is OK, or
3970 // 2. The last CE that was part of the match is in an expansion that extends
3971 // to the first CE after the match. In this case, we reject the match.
3972 const CEI *nextCEI = 0;
3973 if (strsrch->search->elementComparisonType == 0) {
3974 nextCEI = ceb.get(targetIx + targetIxOffset);
3975 maxLimit = nextCEI->lowIndex;
3976 if (nextCEI->lowIndex == nextCEI->highIndex && nextCEI->ce != UCOL_PROCESSED_NULLORDER) {
3977 found = FALSE;
3978 }
3979 } else {
3980 for ( ; ; ++targetIxOffset ) {
3981 nextCEI = ceb.get(targetIx + targetIxOffset);
3982 maxLimit = nextCEI->lowIndex;
3983 // If we are at the end of the target too, match succeeds
3984 if ( nextCEI->ce == UCOL_PROCESSED_NULLORDER ) {
3985 break;
3986 }
3987 // As long as the next CE has primary weight of 0,
3988 // it is part of the last target element matched by the pattern;
3989 // make sure it can be part of a match with the last patCE
3990 if ( (((nextCEI->ce) >> 32) & 0xFFFF0000UL) == 0 ) {
3991 UCompareCEsResult ceMatch = compareCE64s(nextCEI->ce, patCE, strsrch->search->elementComparisonType);
3992 if ( ceMatch == U_CE_NO_MATCH || ceMatch == U_CE_SKIP_PATN ) {
3993 found = FALSE;
3994 break;
3995 }
3996 // If lowIndex == highIndex, this target CE is part of an expansion of the last matched
3997 // target element, but it has non-zero primary weight => match fails
3998 } else if ( nextCEI->lowIndex == nextCEI->highIndex ) {
3999 found = false;
4000 break;
4001 // Else the target CE is not part of an expansion of the last matched element, match succeeds
4002 } else {
4003 break;
4004 }
4005 }
4006 }
4007
4008
4009 // Check for the start of the match being within a combining sequence.
4010 // This can happen if the pattern itself begins with a combining char, and
4011 // the match found combining marks in the target text that were attached
4012 // to something else.
4013 // This type of match should be rejected for not completely consuming a
4014 // combining sequence.
4015 if (!isBreakBoundary(strsrch, mStart)) {
4016 found = FALSE;
4017 }
4018
4019 // Check for the start of the match being within an Collation Element Expansion,
4020 // meaning that the first char of the match is only partially matched.
4021 // With exapnsions, the first CE will report the index of the source
4022 // character, and all subsequent (expansions) CEs will report the source index of the
4023 // _following_ character.
4024 int32_t secondIx = firstCEI->highIndex;
4025 if (mStart == secondIx) {
4026 found = FALSE;
4027 }
4028
4029 // Allow matches to end in the middle of a grapheme cluster if the following
4030 // conditions are met; this is needed to make prefix search work properly in
4031 // Indic, see #11750
4032 // * the default breakIter is being used
4033 // * the next collation element after this combining sequence
4034 // - has non-zero primary weight
4035 // - corresponds to a separate character following the one at end of the current match
4036 // (the second of these conditions, and perhaps both, may be redundant given the
4037 // subsequent check for normalization boundary; however they are likely much faster
4038 // tests in any case)
4039 // * the match limit is a normalization boundary
4040 UBool allowMidclusterMatch = FALSE;
4041 if (strsrch->search->text != NULL && strsrch->search->textLength > maxLimit) {
4042 allowMidclusterMatch =
4043 strsrch->search->breakIter == NULL &&
4044 nextCEI != NULL && (((nextCEI->ce) >> 32) & 0xFFFF0000UL) != 0 &&
4045 maxLimit >= lastCEI->highIndex && nextCEI->highIndex > maxLimit &&
4046 (strsrch->nfd->hasBoundaryBefore(codePointAt(*strsrch->search, maxLimit)) ||
4047 strsrch->nfd->hasBoundaryAfter(codePointBefore(*strsrch->search, maxLimit)));
4048 }
4049 // If those conditions are met, then:
4050 // * do NOT advance the candidate match limit (mLimit) to a break boundary; however
4051 // the match limit may be backed off to a previous break boundary. This handles
4052 // cases in which mLimit includes target characters that are ignorable with current
4053 // settings (such as space) and which extend beyond the pattern match.
4054 // * do NOT require that end of the combining sequence not extend beyond the match in CE space
4055 // * do NOT require that match limit be on a breakIter boundary
4056
4057 // Advance the match end position to the first acceptable match boundary.
4058 // This advances the index over any combining charcters.
4059 mLimit = maxLimit;
4060 if (minLimit < maxLimit) {
4061 // When the last CE's low index is same with its high index, the CE is likely
4062 // a part of expansion. In this case, the index is located just after the
4063 // character corresponding to the CEs compared above. If the index is right
4064 // at the break boundary, move the position to the next boundary will result
4065 // incorrect match length when there are ignorable characters exist between
4066 // the position and the next character produces CE(s). See ticket#8482.
4067 if (minLimit == lastCEI->highIndex && isBreakBoundary(strsrch, minLimit)) {
4068 mLimit = minLimit;
4069 } else {
4070 int32_t nba = nextBoundaryAfter(strsrch, minLimit);
4071 // Note that we can have nba < maxLimit && nba >= minLImit, in which
4072 // case we want to set mLimit to nba regardless of allowMidclusterMatch
4073 // (i.e. we back off mLimit to the previous breakIterator boundary).
4074 if (nba >= lastCEI->highIndex && (!allowMidclusterMatch || nba < maxLimit)) {
4075 mLimit = nba;
4076 }
4077 }
4078 }
4079
4080 #ifdef USEARCH_DEBUG
4081 if (getenv("USEARCH_DEBUG") != NULL) {
4082 printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit, maxLimit, mLimit);
4083 }
4084 #endif
4085
4086 if (!allowMidclusterMatch) {
4087 // If advancing to the end of a combining sequence in character indexing space
4088 // advanced us beyond the end of the match in CE space, reject this match.
4089 if (mLimit > maxLimit) {
4090 found = FALSE;
4091 }
4092
4093 if (!isBreakBoundary(strsrch, mLimit)) {
4094 found = FALSE;
4095 }
4096 }
4097
4098 if (! checkIdentical(strsrch, mStart, mLimit)) {
4099 found = FALSE;
4100 }
4101
4102 if (found) {
4103 break;
4104 }
4105 }
4106
4107 #ifdef USEARCH_DEBUG
4108 if (getenv("USEARCH_DEBUG") != NULL) {
4109 printf("Target CEs [%d .. %d]\n", ceb.firstIx, ceb.limitIx);
4110 int32_t lastToPrint = ceb.limitIx+2;
4111 for (int ii=ceb.firstIx; ii<lastToPrint; ii++) {
4112 printf("%8x@%d ", ceb.get(ii)->ce, ceb.get(ii)->srcIndex);
4113 }
4114 printf("\n%s\n", found? "match found" : "no match");
4115 }
4116 #endif
4117
4118 // All Done. Store back the match bounds to the caller.
4119 //
4120 if (found==FALSE) {
4121 mLimit = -1;
4122 mStart = -1;
4123 }
4124
4125 if (matchStart != NULL) {
4126 *matchStart= mStart;
4127 }
4128
4129 if (matchLimit != NULL) {
4130 *matchLimit = mLimit;
4131 }
4132
4133 return found;
4134 }
4135
usearch_searchBackwards(UStringSearch * strsrch,int32_t startIdx,int32_t * matchStart,int32_t * matchLimit,UErrorCode * status)4136 U_CAPI UBool U_EXPORT2 usearch_searchBackwards(UStringSearch *strsrch,
4137 int32_t startIdx,
4138 int32_t *matchStart,
4139 int32_t *matchLimit,
4140 UErrorCode *status)
4141 {
4142 if (U_FAILURE(*status)) {
4143 return FALSE;
4144 }
4145
4146 // TODO: reject search patterns beginning with a combining char.
4147
4148 #ifdef USEARCH_DEBUG
4149 if (getenv("USEARCH_DEBUG") != NULL) {
4150 printf("Pattern CEs\n");
4151 for (int ii=0; ii<strsrch->pattern.cesLength; ii++) {
4152 printf(" %8x", strsrch->pattern.ces[ii]);
4153 }
4154 printf("\n");
4155 }
4156
4157 #endif
4158 // Input parameter sanity check.
4159 // TODO: should input indicies clip to the text length
4160 // in the same way that UText does.
4161 if(strsrch->pattern.cesLength == 0 ||
4162 startIdx < 0 ||
4163 startIdx > strsrch->search->textLength ||
4164 strsrch->pattern.ces == NULL) {
4165 *status = U_ILLEGAL_ARGUMENT_ERROR;
4166 return FALSE;
4167 }
4168
4169 if (strsrch->pattern.pces == NULL) {
4170 initializePatternPCETable(strsrch, status);
4171 }
4172
4173 CEIBuffer ceb(strsrch, status);
4174 int32_t targetIx = 0;
4175
4176 /*
4177 * Pre-load the buffer with the CE's for the grapheme
4178 * after our starting position so that we're sure that
4179 * we can look at the CE following the match when we
4180 * check the match boundaries.
4181 *
4182 * This will also pre-fetch the first CE that we'll
4183 * consider for the match.
4184 */
4185 if (startIdx < strsrch->search->textLength) {
4186 UBreakIterator *bi = strsrch->search->internalBreakIter;
4187 int32_t next = ubrk_following(bi, startIdx);
4188
4189 ucol_setOffset(strsrch->textIter, next, status);
4190
4191 for (targetIx = 0; ; targetIx += 1) {
4192 if (ceb.getPrevious(targetIx)->lowIndex < startIdx) {
4193 break;
4194 }
4195 }
4196 } else {
4197 ucol_setOffset(strsrch->textIter, startIdx, status);
4198 }
4199
4200
4201 const CEI *targetCEI = NULL;
4202 int32_t patIx;
4203 UBool found;
4204
4205 int32_t limitIx = targetIx;
4206 int32_t mStart = -1;
4207 int32_t mLimit = -1;
4208 int32_t minLimit;
4209 int32_t maxLimit;
4210
4211
4212
4213 // Outer loop moves over match starting positions in the
4214 // target CE space.
4215 // Here, targetIx values increase toward the beginning of the base text (i.e. we get the text CEs in reverse order).
4216 // But patIx is 0 at the beginning of the pattern and increases toward the end.
4217 // So this loop performs a comparison starting with the end of pattern, and prcessd toward the beginning of the pattern
4218 // and the beginning of the base text.
4219 for(targetIx = limitIx; ; targetIx += 1)
4220 {
4221 found = TRUE;
4222 // For targetIx > limitIx, this ceb.getPrevious gets a CE that is as far back in the ring buffer
4223 // (compared to the last CE fetched for the previous targetIx value) as we need to go
4224 // for this targetIx value, so if it is non-NULL then other ceb.getPrevious calls should be OK.
4225 const CEI *lastCEI = ceb.getPrevious(targetIx);
4226 if (lastCEI == NULL) {
4227 *status = U_INTERNAL_PROGRAM_ERROR;
4228 found = FALSE;
4229 break;
4230 }
4231 // Inner loop checks for a match beginning at each
4232 // position from the outer loop.
4233 int32_t targetIxOffset = 0;
4234 for (patIx = strsrch->pattern.pcesLength - 1; patIx >= 0; patIx -= 1) {
4235 int64_t patCE = strsrch->pattern.pces[patIx];
4236
4237 targetCEI = ceb.getPrevious(targetIx + strsrch->pattern.pcesLength - 1 - patIx + targetIxOffset);
4238 // Compare CE from target string with CE from the pattern.
4239 // Note that the target CE will be UCOL_NULLORDER if we reach the end of input,
4240 // which will fail the compare, below.
4241 UCompareCEsResult ceMatch = compareCE64s(targetCEI->ce, patCE, strsrch->search->elementComparisonType);
4242 if ( ceMatch == U_CE_NO_MATCH ) {
4243 found = FALSE;
4244 break;
4245 } else if ( ceMatch > U_CE_NO_MATCH ) {
4246 if ( ceMatch == U_CE_SKIP_TARG ) {
4247 // redo with same patCE, next targCE
4248 patIx++;
4249 targetIxOffset++;
4250 } else { // ceMatch == U_CE_SKIP_PATN
4251 // redo with same targCE, next patCE
4252 targetIxOffset--;
4253 }
4254 }
4255 }
4256
4257 if (!found && ((targetCEI == NULL) || (targetCEI->ce != UCOL_PROCESSED_NULLORDER))) {
4258 // No match at this targetIx. Try again at the next.
4259 continue;
4260 }
4261
4262 if (!found) {
4263 // No match at all, we have run off the end of the target text.
4264 break;
4265 }
4266
4267
4268 // We have found a match in CE space.
4269 // Now determine the bounds in string index space.
4270 // There still is a chance of match failure if the CE range not correspond to
4271 // an acceptable character range.
4272 //
4273 const CEI *firstCEI = ceb.getPrevious(targetIx + strsrch->pattern.pcesLength - 1 + targetIxOffset);
4274 mStart = firstCEI->lowIndex;
4275
4276 // Check for the start of the match being within a combining sequence.
4277 // This can happen if the pattern itself begins with a combining char, and
4278 // the match found combining marks in the target text that were attached
4279 // to something else.
4280 // This type of match should be rejected for not completely consuming a
4281 // combining sequence.
4282 if (!isBreakBoundary(strsrch, mStart)) {
4283 found = FALSE;
4284 }
4285
4286 // Look at the high index of the first CE in the match. If it's the same as the
4287 // low index, the first CE in the match is in the middle of an expansion.
4288 if (mStart == firstCEI->highIndex) {
4289 found = FALSE;
4290 }
4291
4292
4293 minLimit = lastCEI->lowIndex;
4294
4295 if (targetIx > 0) {
4296 // Look at the CE following the match. If it is UCOL_NULLORDER the match
4297 // extended to the end of input, and the match is good.
4298
4299 // Look at the high and low indices of the CE following the match. If
4300 // they are the same it means one of two things:
4301 // 1. The match extended to the last CE from the target text, which is OK, or
4302 // 2. The last CE that was part of the match is in an expansion that extends
4303 // to the first CE after the match. In this case, we reject the match.
4304 const CEI *nextCEI = ceb.getPrevious(targetIx - 1);
4305
4306 if (nextCEI->lowIndex == nextCEI->highIndex && nextCEI->ce != UCOL_PROCESSED_NULLORDER) {
4307 found = FALSE;
4308 }
4309
4310 mLimit = maxLimit = nextCEI->lowIndex;
4311
4312 // Allow matches to end in the middle of a grapheme cluster if the following
4313 // conditions are met; this is needed to make prefix search work properly in
4314 // Indic, see #11750
4315 // * the default breakIter is being used
4316 // * the next collation element after this combining sequence
4317 // - has non-zero primary weight
4318 // - corresponds to a separate character following the one at end of the current match
4319 // (the second of these conditions, and perhaps both, may be redundant given the
4320 // subsequent check for normalization boundary; however they are likely much faster
4321 // tests in any case)
4322 // * the match limit is a normalization boundary
4323 UBool allowMidclusterMatch = FALSE;
4324 if (strsrch->search->text != NULL && strsrch->search->textLength > maxLimit) {
4325 allowMidclusterMatch =
4326 strsrch->search->breakIter == NULL &&
4327 nextCEI != NULL && (((nextCEI->ce) >> 32) & 0xFFFF0000UL) != 0 &&
4328 maxLimit >= lastCEI->highIndex && nextCEI->highIndex > maxLimit &&
4329 (strsrch->nfd->hasBoundaryBefore(codePointAt(*strsrch->search, maxLimit)) ||
4330 strsrch->nfd->hasBoundaryAfter(codePointBefore(*strsrch->search, maxLimit)));
4331 }
4332 // If those conditions are met, then:
4333 // * do NOT advance the candidate match limit (mLimit) to a break boundary; however
4334 // the match limit may be backed off to a previous break boundary. This handles
4335 // cases in which mLimit includes target characters that are ignorable with current
4336 // settings (such as space) and which extend beyond the pattern match.
4337 // * do NOT require that end of the combining sequence not extend beyond the match in CE space
4338 // * do NOT require that match limit be on a breakIter boundary
4339
4340 // Advance the match end position to the first acceptable match boundary.
4341 // This advances the index over any combining characters.
4342 if (minLimit < maxLimit) {
4343 int32_t nba = nextBoundaryAfter(strsrch, minLimit);
4344 // Note that we can have nba < maxLimit && nba >= minLImit, in which
4345 // case we want to set mLimit to nba regardless of allowMidclusterMatch
4346 // (i.e. we back off mLimit to the previous breakIterator boundary).
4347 if (nba >= lastCEI->highIndex && (!allowMidclusterMatch || nba < maxLimit)) {
4348 mLimit = nba;
4349 }
4350 }
4351
4352 if (!allowMidclusterMatch) {
4353 // If advancing to the end of a combining sequence in character indexing space
4354 // advanced us beyond the end of the match in CE space, reject this match.
4355 if (mLimit > maxLimit) {
4356 found = FALSE;
4357 }
4358
4359 // Make sure the end of the match is on a break boundary
4360 if (!isBreakBoundary(strsrch, mLimit)) {
4361 found = FALSE;
4362 }
4363 }
4364
4365 } else {
4366 // No non-ignorable CEs after this point.
4367 // The maximum position is detected by boundary after
4368 // the last non-ignorable CE. Combining sequence
4369 // across the start index will be truncated.
4370 int32_t nba = nextBoundaryAfter(strsrch, minLimit);
4371 mLimit = maxLimit = (nba > 0) && (startIdx > nba) ? nba : startIdx;
4372 }
4373
4374 #ifdef USEARCH_DEBUG
4375 if (getenv("USEARCH_DEBUG") != NULL) {
4376 printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit, maxLimit, mLimit);
4377 }
4378 #endif
4379
4380
4381 if (! checkIdentical(strsrch, mStart, mLimit)) {
4382 found = FALSE;
4383 }
4384
4385 if (found) {
4386 break;
4387 }
4388 }
4389
4390 #ifdef USEARCH_DEBUG
4391 if (getenv("USEARCH_DEBUG") != NULL) {
4392 printf("Target CEs [%d .. %d]\n", ceb.firstIx, ceb.limitIx);
4393 int32_t lastToPrint = ceb.limitIx+2;
4394 for (int ii=ceb.firstIx; ii<lastToPrint; ii++) {
4395 printf("%8x@%d ", ceb.get(ii)->ce, ceb.get(ii)->srcIndex);
4396 }
4397 printf("\n%s\n", found? "match found" : "no match");
4398 }
4399 #endif
4400
4401 // All Done. Store back the match bounds to the caller.
4402 //
4403 if (found==FALSE) {
4404 mLimit = -1;
4405 mStart = -1;
4406 }
4407
4408 if (matchStart != NULL) {
4409 *matchStart= mStart;
4410 }
4411
4412 if (matchLimit != NULL) {
4413 *matchLimit = mLimit;
4414 }
4415
4416 return found;
4417 }
4418
4419 // internal use methods declared in usrchimp.h -----------------------------
4420
usearch_handleNextExact(UStringSearch * strsrch,UErrorCode * status)4421 UBool usearch_handleNextExact(UStringSearch *strsrch, UErrorCode *status)
4422 {
4423 if (U_FAILURE(*status)) {
4424 setMatchNotFound(strsrch);
4425 return FALSE;
4426 }
4427
4428 #if BOYER_MOORE
4429 UCollationElements *coleiter = strsrch->textIter;
4430 int32_t textlength = strsrch->search->textLength;
4431 int32_t *patternce = strsrch->pattern.ces;
4432 int32_t patterncelength = strsrch->pattern.cesLength;
4433 int32_t textoffset = ucol_getOffset(coleiter);
4434
4435 // status used in setting coleiter offset, since offset is checked in
4436 // shiftForward before setting the coleiter offset, status never
4437 // a failure
4438 textoffset = shiftForward(strsrch, textoffset, UCOL_NULLORDER,
4439 patterncelength);
4440 while (textoffset <= textlength)
4441 {
4442 uint32_t patternceindex = patterncelength - 1;
4443 int32_t targetce;
4444 UBool found = FALSE;
4445 int32_t lastce = UCOL_NULLORDER;
4446
4447 setColEIterOffset(coleiter, textoffset);
4448
4449 for (;;) {
4450 // finding the last pattern ce match, imagine composite characters
4451 // for example: search for pattern A in text \u00C0
4452 // we'll have to skip \u0300 the grave first before we get to A
4453 targetce = ucol_previous(coleiter, status);
4454 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4455 found = FALSE;
4456 break;
4457 }
4458 targetce = getCE(strsrch, targetce);
4459 if (targetce == UCOL_IGNORABLE && inNormBuf(coleiter)) {
4460 // this is for the text \u0315\u0300 that requires
4461 // normalization and pattern \u0300, where \u0315 is ignorable
4462 continue;
4463 }
4464 if (lastce == UCOL_NULLORDER || lastce == UCOL_IGNORABLE) {
4465 lastce = targetce;
4466 }
4467 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4468 if (targetce == patternce[patternceindex]) {
4469 // the first ce can be a contraction
4470 found = TRUE;
4471 break;
4472 }
4473 if (!hasExpansion(coleiter)) {
4474 found = FALSE;
4475 break;
4476 }
4477 }
4478
4479 //targetce = lastce;
4480
4481 while (found && patternceindex > 0) {
4482 lastce = targetce;
4483 targetce = ucol_previous(coleiter, status);
4484 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4485 found = FALSE;
4486 break;
4487 }
4488 targetce = getCE(strsrch, targetce);
4489 if (targetce == UCOL_IGNORABLE) {
4490 continue;
4491 }
4492
4493 patternceindex --;
4494 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4495 found = found && targetce == patternce[patternceindex];
4496 }
4497
4498 targetce = lastce;
4499
4500 if (!found) {
4501 if (U_FAILURE(*status)) {
4502 break;
4503 }
4504 textoffset = shiftForward(strsrch, textoffset, lastce,
4505 patternceindex);
4506 // status checked at loop.
4507 patternceindex = patterncelength;
4508 continue;
4509 }
4510
4511 if (checkNextExactMatch(strsrch, &textoffset, status)) {
4512 // status checked in ucol_setOffset
4513 setColEIterOffset(coleiter, strsrch->search->matchedIndex);
4514 return TRUE;
4515 }
4516 }
4517 setMatchNotFound(strsrch);
4518 return FALSE;
4519 #else
4520 int32_t textOffset = ucol_getOffset(strsrch->textIter);
4521 int32_t start = -1;
4522 int32_t end = -1;
4523
4524 if (usearch_search(strsrch, textOffset, &start, &end, status)) {
4525 strsrch->search->matchedIndex = start;
4526 strsrch->search->matchedLength = end - start;
4527 return TRUE;
4528 } else {
4529 setMatchNotFound(strsrch);
4530 return FALSE;
4531 }
4532 #endif
4533 }
4534
usearch_handleNextCanonical(UStringSearch * strsrch,UErrorCode * status)4535 UBool usearch_handleNextCanonical(UStringSearch *strsrch, UErrorCode *status)
4536 {
4537 if (U_FAILURE(*status)) {
4538 setMatchNotFound(strsrch);
4539 return FALSE;
4540 }
4541
4542 #if BOYER_MOORE
4543 UCollationElements *coleiter = strsrch->textIter;
4544 int32_t textlength = strsrch->search->textLength;
4545 int32_t *patternce = strsrch->pattern.ces;
4546 int32_t patterncelength = strsrch->pattern.cesLength;
4547 int32_t textoffset = ucol_getOffset(coleiter);
4548 UBool hasPatternAccents =
4549 strsrch->pattern.hasSuffixAccents || strsrch->pattern.hasPrefixAccents;
4550
4551 textoffset = shiftForward(strsrch, textoffset, UCOL_NULLORDER,
4552 patterncelength);
4553 strsrch->canonicalPrefixAccents[0] = 0;
4554 strsrch->canonicalSuffixAccents[0] = 0;
4555
4556 while (textoffset <= textlength)
4557 {
4558 int32_t patternceindex = patterncelength - 1;
4559 int32_t targetce;
4560 UBool found = FALSE;
4561 int32_t lastce = UCOL_NULLORDER;
4562
4563 setColEIterOffset(coleiter, textoffset);
4564
4565 for (;;) {
4566 // finding the last pattern ce match, imagine composite characters
4567 // for example: search for pattern A in text \u00C0
4568 // we'll have to skip \u0300 the grave first before we get to A
4569 targetce = ucol_previous(coleiter, status);
4570 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4571 found = FALSE;
4572 break;
4573 }
4574 targetce = getCE(strsrch, targetce);
4575 if (lastce == UCOL_NULLORDER || lastce == UCOL_IGNORABLE) {
4576 lastce = targetce;
4577 }
4578 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4579 if (targetce == patternce[patternceindex]) {
4580 // the first ce can be a contraction
4581 found = TRUE;
4582 break;
4583 }
4584 if (!hasExpansion(coleiter)) {
4585 found = FALSE;
4586 break;
4587 }
4588 }
4589
4590 while (found && patternceindex > 0) {
4591 targetce = ucol_previous(coleiter, status);
4592 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4593 found = FALSE;
4594 break;
4595 }
4596 targetce = getCE(strsrch, targetce);
4597 if (targetce == UCOL_IGNORABLE) {
4598 continue;
4599 }
4600
4601 patternceindex --;
4602 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4603 found = found && targetce == patternce[patternceindex];
4604 }
4605
4606 // initializing the rearranged accent array
4607 if (hasPatternAccents && !found) {
4608 strsrch->canonicalPrefixAccents[0] = 0;
4609 strsrch->canonicalSuffixAccents[0] = 0;
4610 if (U_FAILURE(*status)) {
4611 break;
4612 }
4613 found = doNextCanonicalMatch(strsrch, textoffset, status);
4614 }
4615
4616 if (!found) {
4617 if (U_FAILURE(*status)) {
4618 break;
4619 }
4620 textoffset = shiftForward(strsrch, textoffset, lastce,
4621 patternceindex);
4622 // status checked at loop
4623 patternceindex = patterncelength;
4624 continue;
4625 }
4626
4627 if (checkNextCanonicalMatch(strsrch, &textoffset, status)) {
4628 setColEIterOffset(coleiter, strsrch->search->matchedIndex);
4629 return TRUE;
4630 }
4631 }
4632 setMatchNotFound(strsrch);
4633 return FALSE;
4634 #else
4635 int32_t textOffset = ucol_getOffset(strsrch->textIter);
4636 int32_t start = -1;
4637 int32_t end = -1;
4638
4639 if (usearch_search(strsrch, textOffset, &start, &end, status)) {
4640 strsrch->search->matchedIndex = start;
4641 strsrch->search->matchedLength = end - start;
4642 return TRUE;
4643 } else {
4644 setMatchNotFound(strsrch);
4645 return FALSE;
4646 }
4647 #endif
4648 }
4649
usearch_handlePreviousExact(UStringSearch * strsrch,UErrorCode * status)4650 UBool usearch_handlePreviousExact(UStringSearch *strsrch, UErrorCode *status)
4651 {
4652 if (U_FAILURE(*status)) {
4653 setMatchNotFound(strsrch);
4654 return FALSE;
4655 }
4656
4657 #if BOYER_MOORE
4658 UCollationElements *coleiter = strsrch->textIter;
4659 int32_t *patternce = strsrch->pattern.ces;
4660 int32_t patterncelength = strsrch->pattern.cesLength;
4661 int32_t textoffset = ucol_getOffset(coleiter);
4662
4663 // shifting it check for setting offset
4664 // if setOffset is called previously or there was no previous match, we
4665 // leave the offset as it is.
4666 if (strsrch->search->matchedIndex != USEARCH_DONE) {
4667 textoffset = strsrch->search->matchedIndex;
4668 }
4669
4670 textoffset = reverseShift(strsrch, textoffset, UCOL_NULLORDER,
4671 patterncelength);
4672
4673 while (textoffset >= 0)
4674 {
4675 int32_t patternceindex = 1;
4676 int32_t targetce;
4677 UBool found = FALSE;
4678 int32_t firstce = UCOL_NULLORDER;
4679
4680 // if status is a failure, ucol_setOffset does nothing
4681 setColEIterOffset(coleiter, textoffset);
4682
4683 for (;;) {
4684 // finding the first pattern ce match, imagine composite
4685 // characters. for example: search for pattern \u0300 in text
4686 // \u00C0, we'll have to skip A first before we get to
4687 // \u0300 the grave accent
4688 targetce = ucol_next(coleiter, status);
4689 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4690 found = FALSE;
4691 break;
4692 }
4693 targetce = getCE(strsrch, targetce);
4694 if (firstce == UCOL_NULLORDER || firstce == UCOL_IGNORABLE) {
4695 firstce = targetce;
4696 }
4697 if (targetce == UCOL_IGNORABLE && strsrch->strength != UCOL_PRIMARY) {
4698 continue;
4699 }
4700 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4701 if (targetce == patternce[0]) {
4702 found = TRUE;
4703 break;
4704 }
4705 if (!hasExpansion(coleiter)) {
4706 // checking for accents in composite character
4707 found = FALSE;
4708 break;
4709 }
4710 }
4711
4712 //targetce = firstce;
4713
4714 while (found && (patternceindex < patterncelength)) {
4715 firstce = targetce;
4716 targetce = ucol_next(coleiter, status);
4717 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4718 found = FALSE;
4719 break;
4720 }
4721 targetce = getCE(strsrch, targetce);
4722 if (targetce == UCOL_IGNORABLE) {
4723 continue;
4724 }
4725
4726 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4727 found = found && targetce == patternce[patternceindex];
4728 patternceindex ++;
4729 }
4730
4731 targetce = firstce;
4732
4733 if (!found) {
4734 if (U_FAILURE(*status)) {
4735 break;
4736 }
4737
4738 textoffset = reverseShift(strsrch, textoffset, targetce,
4739 patternceindex);
4740 patternceindex = 0;
4741 continue;
4742 }
4743
4744 if (checkPreviousExactMatch(strsrch, &textoffset, status)) {
4745 setColEIterOffset(coleiter, textoffset);
4746 return TRUE;
4747 }
4748 }
4749 setMatchNotFound(strsrch);
4750 return FALSE;
4751 #else
4752 int32_t textOffset;
4753
4754 if (strsrch->search->isOverlap) {
4755 if (strsrch->search->matchedIndex != USEARCH_DONE) {
4756 textOffset = strsrch->search->matchedIndex + strsrch->search->matchedLength - 1;
4757 } else {
4758 // move the start position at the end of possible match
4759 initializePatternPCETable(strsrch, status);
4760 if (!initTextProcessedIter(strsrch, status)) {
4761 setMatchNotFound(strsrch);
4762 return FALSE;
4763 }
4764 for (int32_t nPCEs = 0; nPCEs < strsrch->pattern.pcesLength - 1; nPCEs++) {
4765 int64_t pce = strsrch->textProcessedIter->nextProcessed(NULL, NULL, status);
4766 if (pce == UCOL_PROCESSED_NULLORDER) {
4767 // at the end of the text
4768 break;
4769 }
4770 }
4771 if (U_FAILURE(*status)) {
4772 setMatchNotFound(strsrch);
4773 return FALSE;
4774 }
4775 textOffset = ucol_getOffset(strsrch->textIter);
4776 }
4777 } else {
4778 textOffset = ucol_getOffset(strsrch->textIter);
4779 }
4780
4781 int32_t start = -1;
4782 int32_t end = -1;
4783
4784 if (usearch_searchBackwards(strsrch, textOffset, &start, &end, status)) {
4785 strsrch->search->matchedIndex = start;
4786 strsrch->search->matchedLength = end - start;
4787 return TRUE;
4788 } else {
4789 setMatchNotFound(strsrch);
4790 return FALSE;
4791 }
4792 #endif
4793 }
4794
usearch_handlePreviousCanonical(UStringSearch * strsrch,UErrorCode * status)4795 UBool usearch_handlePreviousCanonical(UStringSearch *strsrch,
4796 UErrorCode *status)
4797 {
4798 if (U_FAILURE(*status)) {
4799 setMatchNotFound(strsrch);
4800 return FALSE;
4801 }
4802
4803 #if BOYER_MOORE
4804 UCollationElements *coleiter = strsrch->textIter;
4805 int32_t *patternce = strsrch->pattern.ces;
4806 int32_t patterncelength = strsrch->pattern.cesLength;
4807 int32_t textoffset = ucol_getOffset(coleiter);
4808 UBool hasPatternAccents =
4809 strsrch->pattern.hasSuffixAccents || strsrch->pattern.hasPrefixAccents;
4810
4811 // shifting it check for setting offset
4812 // if setOffset is called previously or there was no previous match, we
4813 // leave the offset as it is.
4814 if (strsrch->search->matchedIndex != USEARCH_DONE) {
4815 textoffset = strsrch->search->matchedIndex;
4816 }
4817
4818 textoffset = reverseShift(strsrch, textoffset, UCOL_NULLORDER,
4819 patterncelength);
4820 strsrch->canonicalPrefixAccents[0] = 0;
4821 strsrch->canonicalSuffixAccents[0] = 0;
4822
4823 while (textoffset >= 0)
4824 {
4825 int32_t patternceindex = 1;
4826 int32_t targetce;
4827 UBool found = FALSE;
4828 int32_t firstce = UCOL_NULLORDER;
4829
4830 setColEIterOffset(coleiter, textoffset);
4831 for (;;) {
4832 // finding the first pattern ce match, imagine composite
4833 // characters. for example: search for pattern \u0300 in text
4834 // \u00C0, we'll have to skip A first before we get to
4835 // \u0300 the grave accent
4836 targetce = ucol_next(coleiter, status);
4837 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4838 found = FALSE;
4839 break;
4840 }
4841 targetce = getCE(strsrch, targetce);
4842 if (firstce == UCOL_NULLORDER || firstce == UCOL_IGNORABLE) {
4843 firstce = targetce;
4844 }
4845
4846 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4847 if (targetce == patternce[0]) {
4848 // the first ce can be a contraction
4849 found = TRUE;
4850 break;
4851 }
4852 if (!hasExpansion(coleiter)) {
4853 // checking for accents in composite character
4854 found = FALSE;
4855 break;
4856 }
4857 }
4858
4859 targetce = firstce;
4860
4861 while (found && patternceindex < patterncelength) {
4862 targetce = ucol_next(coleiter, status);
4863 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4864 found = FALSE;
4865 break;
4866 }
4867 targetce = getCE(strsrch, targetce);
4868 if (targetce == UCOL_IGNORABLE) {
4869 continue;
4870 }
4871
4872 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4873 found = found && targetce == patternce[patternceindex];
4874 patternceindex ++;
4875 }
4876
4877 // initializing the rearranged accent array
4878 if (hasPatternAccents && !found) {
4879 strsrch->canonicalPrefixAccents[0] = 0;
4880 strsrch->canonicalSuffixAccents[0] = 0;
4881 if (U_FAILURE(*status)) {
4882 break;
4883 }
4884 found = doPreviousCanonicalMatch(strsrch, textoffset, status);
4885 }
4886
4887 if (!found) {
4888 if (U_FAILURE(*status)) {
4889 break;
4890 }
4891 textoffset = reverseShift(strsrch, textoffset, targetce,
4892 patternceindex);
4893 patternceindex = 0;
4894 continue;
4895 }
4896
4897 if (checkPreviousCanonicalMatch(strsrch, &textoffset, status)) {
4898 setColEIterOffset(coleiter, textoffset);
4899 return TRUE;
4900 }
4901 }
4902 setMatchNotFound(strsrch);
4903 return FALSE;
4904 #else
4905 int32_t textOffset;
4906
4907 if (strsrch->search->isOverlap) {
4908 if (strsrch->search->matchedIndex != USEARCH_DONE) {
4909 textOffset = strsrch->search->matchedIndex + strsrch->search->matchedLength - 1;
4910 } else {
4911 // move the start position at the end of possible match
4912 initializePatternPCETable(strsrch, status);
4913 if (!initTextProcessedIter(strsrch, status)) {
4914 setMatchNotFound(strsrch);
4915 return FALSE;
4916 }
4917 for (int32_t nPCEs = 0; nPCEs < strsrch->pattern.pcesLength - 1; nPCEs++) {
4918 int64_t pce = strsrch->textProcessedIter->nextProcessed(NULL, NULL, status);
4919 if (pce == UCOL_PROCESSED_NULLORDER) {
4920 // at the end of the text
4921 break;
4922 }
4923 }
4924 if (U_FAILURE(*status)) {
4925 setMatchNotFound(strsrch);
4926 return FALSE;
4927 }
4928 textOffset = ucol_getOffset(strsrch->textIter);
4929 }
4930 } else {
4931 textOffset = ucol_getOffset(strsrch->textIter);
4932 }
4933
4934 int32_t start = -1;
4935 int32_t end = -1;
4936
4937 if (usearch_searchBackwards(strsrch, textOffset, &start, &end, status)) {
4938 strsrch->search->matchedIndex = start;
4939 strsrch->search->matchedLength = end - start;
4940 return TRUE;
4941 } else {
4942 setMatchNotFound(strsrch);
4943 return FALSE;
4944 }
4945 #endif
4946 }
4947
4948 #endif /* #if !UCONFIG_NO_COLLATION */
4949