1 /*
2 **********************************************************************
3 * Copyright (C) 2001-2015 IBM and others. All rights reserved.
4 **********************************************************************
5 * Date Name Description
6 * 07/02/2001 synwee Creation.
7 **********************************************************************
8 */
9
10 #include "unicode/utypes.h"
11
12 #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION
13
14 #include "unicode/usearch.h"
15 #include "unicode/ustring.h"
16 #include "unicode/uchar.h"
17 #include "unicode/utf16.h"
18 #include "normalizer2impl.h"
19 #include "usrchimp.h"
20 #include "cmemory.h"
21 #include "ucln_in.h"
22 #include "uassert.h"
23 #include "ustr_imp.h"
24
25 U_NAMESPACE_USE
26
27 // don't use Boyer-Moore
28 // (and if we decide to turn this on again there are several new TODOs that will need to be addressed)
29 #define BOYER_MOORE 0
30
31 // internal definition ---------------------------------------------------
32
33 #define LAST_BYTE_MASK_ 0xFF
34 #define SECOND_LAST_BYTE_SHIFT_ 8
35 #define SUPPLEMENTARY_MIN_VALUE_ 0x10000
36
37 static const Normalizer2Impl *g_nfcImpl = NULL;
38
39 // internal methods -------------------------------------------------
40
41 /**
42 * Fast collation element iterator setOffset.
43 * This function does not check for bounds.
44 * @param coleiter collation element iterator
45 * @param offset to set
46 */
47 static
setColEIterOffset(UCollationElements * elems,int32_t offset)48 inline void setColEIterOffset(UCollationElements *elems,
49 int32_t offset)
50 {
51 // Note: Not "fast" any more after the 2013 collation rewrite.
52 // We do not want to expose more internals than necessary.
53 UErrorCode status = U_ZERO_ERROR;
54 ucol_setOffset(elems, offset, &status);
55 }
56
57 /**
58 * Getting the mask for collation strength
59 * @param strength collation strength
60 * @return collation element mask
61 */
62 static
getMask(UCollationStrength strength)63 inline uint32_t getMask(UCollationStrength strength)
64 {
65 switch (strength)
66 {
67 case UCOL_PRIMARY:
68 return UCOL_PRIMARYORDERMASK;
69 case UCOL_SECONDARY:
70 return UCOL_SECONDARYORDERMASK | UCOL_PRIMARYORDERMASK;
71 default:
72 return UCOL_TERTIARYORDERMASK | UCOL_SECONDARYORDERMASK |
73 UCOL_PRIMARYORDERMASK;
74 }
75 }
76
77 /**
78 * @param ce 32-bit collation element
79 * @return hash code
80 */
81 static
hashFromCE32(uint32_t ce)82 inline int hashFromCE32(uint32_t ce)
83 {
84 int hc = (int)(
85 ((((((ce >> 24) * 37) +
86 (ce >> 16)) * 37) +
87 (ce >> 8)) * 37) +
88 ce);
89 hc %= MAX_TABLE_SIZE_;
90 if (hc < 0) {
91 hc += MAX_TABLE_SIZE_;
92 }
93 return hc;
94 }
95
96 U_CDECL_BEGIN
97 static UBool U_CALLCONV
usearch_cleanup(void)98 usearch_cleanup(void) {
99 g_nfcImpl = NULL;
100 return TRUE;
101 }
102 U_CDECL_END
103
104 /**
105 * Initializing the fcd tables.
106 * Internal method, status assumed to be a success.
107 * @param status output error if any, caller to check status before calling
108 * method, status assumed to be success when passed in.
109 */
110 static
initializeFCD(UErrorCode * status)111 inline void initializeFCD(UErrorCode *status)
112 {
113 if (g_nfcImpl == NULL) {
114 g_nfcImpl = Normalizer2Factory::getNFCImpl(*status);
115 ucln_i18n_registerCleanup(UCLN_I18N_USEARCH, usearch_cleanup);
116 }
117 }
118
119 /**
120 * Gets the fcd value for a character at the argument index.
121 * This method takes into accounts of the supplementary characters.
122 * @param str UTF16 string where character for fcd retrieval resides
123 * @param offset position of the character whose fcd is to be retrieved, to be
124 * overwritten with the next character position, taking
125 * surrogate characters into consideration.
126 * @param strlength length of the argument string
127 * @return fcd value
128 */
129 static
getFCD(const UChar * str,int32_t * offset,int32_t strlength)130 uint16_t getFCD(const UChar *str, int32_t *offset,
131 int32_t strlength)
132 {
133 const UChar *temp = str + *offset;
134 uint16_t result = g_nfcImpl->nextFCD16(temp, str + strlength);
135 *offset = (int32_t)(temp - str);
136 return result;
137 }
138
139 /**
140 * Getting the modified collation elements taking into account the collation
141 * attributes
142 * @param strsrch string search data
143 * @param sourcece
144 * @return the modified collation element
145 */
146 static
getCE(const UStringSearch * strsrch,uint32_t sourcece)147 inline int32_t getCE(const UStringSearch *strsrch, uint32_t sourcece)
148 {
149 // note for tertiary we can't use the collator->tertiaryMask, that
150 // is a preprocessed mask that takes into account case options. since
151 // we are only concerned with exact matches, we don't need that.
152 sourcece &= strsrch->ceMask;
153
154 if (strsrch->toShift) {
155 // alternate handling here, since only the 16 most significant digits
156 // is only used, we can safely do a compare without masking
157 // if the ce is a variable, we mask and get only the primary values
158 // no shifting to quartenary is required since all primary values
159 // less than variabletop will need to be masked off anyway.
160 if (strsrch->variableTop > sourcece) {
161 if (strsrch->strength >= UCOL_QUATERNARY) {
162 sourcece &= UCOL_PRIMARYORDERMASK;
163 }
164 else {
165 sourcece = UCOL_IGNORABLE;
166 }
167 }
168 } else if (strsrch->strength >= UCOL_QUATERNARY && sourcece == UCOL_IGNORABLE) {
169 sourcece = 0xFFFF;
170 }
171
172 return sourcece;
173 }
174
175 /**
176 * Allocate a memory and returns NULL if it failed.
177 * Internal method, status assumed to be a success.
178 * @param size to allocate
179 * @param status output error if any, caller to check status before calling
180 * method, status assumed to be success when passed in.
181 * @return newly allocated array, NULL otherwise
182 */
183 static
allocateMemory(uint32_t size,UErrorCode * status)184 inline void * allocateMemory(uint32_t size, UErrorCode *status)
185 {
186 uint32_t *result = (uint32_t *)uprv_malloc(size);
187 if (result == NULL) {
188 *status = U_MEMORY_ALLOCATION_ERROR;
189 }
190 return result;
191 }
192
193 /**
194 * Adds a uint32_t value to a destination array.
195 * Creates a new array if we run out of space. The caller will have to
196 * manually deallocate the newly allocated array.
197 * Internal method, status assumed to be success, caller has to check status
198 * before calling this method. destination not to be NULL and has at least
199 * size destinationlength.
200 * @param destination target array
201 * @param offset destination offset to add value
202 * @param destinationlength target array size, return value for the new size
203 * @param value to be added
204 * @param increments incremental size expected
205 * @param status output error if any, caller to check status before calling
206 * method, status assumed to be success when passed in.
207 * @return new destination array, destination if there was no new allocation
208 */
209 static
addTouint32_tArray(int32_t * destination,uint32_t offset,uint32_t * destinationlength,uint32_t value,uint32_t increments,UErrorCode * status)210 inline int32_t * addTouint32_tArray(int32_t *destination,
211 uint32_t offset,
212 uint32_t *destinationlength,
213 uint32_t value,
214 uint32_t increments,
215 UErrorCode *status)
216 {
217 uint32_t newlength = *destinationlength;
218 if (offset + 1 == newlength) {
219 newlength += increments;
220 int32_t *temp = (int32_t *)allocateMemory(
221 sizeof(int32_t) * newlength, status);
222 if (U_FAILURE(*status)) {
223 return NULL;
224 }
225 uprv_memcpy(temp, destination, sizeof(int32_t) * offset);
226 *destinationlength = newlength;
227 destination = temp;
228 }
229 destination[offset] = value;
230 return destination;
231 }
232
233 /**
234 * Adds a uint64_t value to a destination array.
235 * Creates a new array if we run out of space. The caller will have to
236 * manually deallocate the newly allocated array.
237 * Internal method, status assumed to be success, caller has to check status
238 * before calling this method. destination not to be NULL and has at least
239 * size destinationlength.
240 * @param destination target array
241 * @param offset destination offset to add value
242 * @param destinationlength target array size, return value for the new size
243 * @param value to be added
244 * @param increments incremental size expected
245 * @param status output error if any, caller to check status before calling
246 * method, status assumed to be success when passed in.
247 * @return new destination array, destination if there was no new allocation
248 */
249 static
addTouint64_tArray(int64_t * destination,uint32_t offset,uint32_t * destinationlength,uint64_t value,uint32_t increments,UErrorCode * status)250 inline int64_t * addTouint64_tArray(int64_t *destination,
251 uint32_t offset,
252 uint32_t *destinationlength,
253 uint64_t value,
254 uint32_t increments,
255 UErrorCode *status)
256 {
257 uint32_t newlength = *destinationlength;
258 if (offset + 1 == newlength) {
259 newlength += increments;
260 int64_t *temp = (int64_t *)allocateMemory(
261 sizeof(int64_t) * newlength, status);
262
263 if (U_FAILURE(*status)) {
264 return NULL;
265 }
266
267 uprv_memcpy(temp, destination, sizeof(int64_t) * offset);
268 *destinationlength = newlength;
269 destination = temp;
270 }
271
272 destination[offset] = value;
273
274 return destination;
275 }
276
277 /**
278 * Initializing the ce table for a pattern.
279 * Stores non-ignorable collation keys.
280 * Table size will be estimated by the size of the pattern text. Table
281 * expansion will be perform as we go along. Adding 1 to ensure that the table
282 * size definitely increases.
283 * Internal method, status assumed to be a success.
284 * @param strsrch string search data
285 * @param status output error if any, caller to check status before calling
286 * method, status assumed to be success when passed in.
287 * @return total number of expansions
288 */
289 static
initializePatternCETable(UStringSearch * strsrch,UErrorCode * status)290 inline uint16_t initializePatternCETable(UStringSearch *strsrch,
291 UErrorCode *status)
292 {
293 UPattern *pattern = &(strsrch->pattern);
294 uint32_t cetablesize = INITIAL_ARRAY_SIZE_;
295 int32_t *cetable = pattern->cesBuffer;
296 uint32_t patternlength = pattern->textLength;
297 UCollationElements *coleiter = strsrch->utilIter;
298
299 if (coleiter == NULL) {
300 coleiter = ucol_openElements(strsrch->collator, pattern->text,
301 patternlength, status);
302 // status will be checked in ucol_next(..) later and if it is an
303 // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be
304 // returned.
305 strsrch->utilIter = coleiter;
306 }
307 else {
308 ucol_setText(coleiter, pattern->text, pattern->textLength, status);
309 }
310 if(U_FAILURE(*status)) {
311 return 0;
312 }
313
314 if (pattern->ces != cetable && pattern->ces) {
315 uprv_free(pattern->ces);
316 }
317
318 uint16_t offset = 0;
319 uint16_t result = 0;
320 int32_t ce;
321
322 while ((ce = ucol_next(coleiter, status)) != UCOL_NULLORDER &&
323 U_SUCCESS(*status)) {
324 uint32_t newce = getCE(strsrch, ce);
325 if (newce) {
326 int32_t *temp = addTouint32_tArray(cetable, offset, &cetablesize,
327 newce,
328 patternlength - ucol_getOffset(coleiter) + 1,
329 status);
330 if (U_FAILURE(*status)) {
331 return 0;
332 }
333 offset ++;
334 if (cetable != temp && cetable != pattern->cesBuffer) {
335 uprv_free(cetable);
336 }
337 cetable = temp;
338 }
339 result += (uint16_t)(ucol_getMaxExpansion(coleiter, ce) - 1);
340 }
341
342 cetable[offset] = 0;
343 pattern->ces = cetable;
344 pattern->cesLength = offset;
345
346 return result;
347 }
348
349 /**
350 * Initializing the pce table for a pattern.
351 * Stores non-ignorable collation keys.
352 * Table size will be estimated by the size of the pattern text. Table
353 * expansion will be perform as we go along. Adding 1 to ensure that the table
354 * size definitely increases.
355 * Internal method, status assumed to be a success.
356 * @param strsrch string search data
357 * @param status output error if any, caller to check status before calling
358 * method, status assumed to be success when passed in.
359 * @return total number of expansions
360 */
361 static
initializePatternPCETable(UStringSearch * strsrch,UErrorCode * status)362 inline uint16_t initializePatternPCETable(UStringSearch *strsrch,
363 UErrorCode *status)
364 {
365 UPattern *pattern = &(strsrch->pattern);
366 uint32_t pcetablesize = INITIAL_ARRAY_SIZE_;
367 int64_t *pcetable = pattern->pcesBuffer;
368 uint32_t patternlength = pattern->textLength;
369 UCollationElements *coleiter = strsrch->utilIter;
370
371 if (coleiter == NULL) {
372 coleiter = ucol_openElements(strsrch->collator, pattern->text,
373 patternlength, status);
374 // status will be checked in ucol_next(..) later and if it is an
375 // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be
376 // returned.
377 strsrch->utilIter = coleiter;
378 } else {
379 ucol_setText(coleiter, pattern->text, pattern->textLength, status);
380 }
381 if(U_FAILURE(*status)) {
382 return 0;
383 }
384
385 if (pattern->pces != pcetable && pattern->pces != NULL) {
386 uprv_free(pattern->pces);
387 }
388
389 uint16_t offset = 0;
390 uint16_t result = 0;
391 int64_t pce;
392
393 icu::UCollationPCE iter(coleiter);
394
395 // ** Should processed CEs be signed or unsigned?
396 // ** (the rest of the code in this file seems to play fast-and-loose with
397 // ** whether a CE is signed or unsigned. For example, look at routine above this one.)
398 while ((pce = iter.nextProcessed(NULL, NULL, status)) != UCOL_PROCESSED_NULLORDER &&
399 U_SUCCESS(*status)) {
400 int64_t *temp = addTouint64_tArray(pcetable, offset, &pcetablesize,
401 pce,
402 patternlength - ucol_getOffset(coleiter) + 1,
403 status);
404
405 if (U_FAILURE(*status)) {
406 return 0;
407 }
408
409 offset += 1;
410
411 if (pcetable != temp && pcetable != pattern->pcesBuffer) {
412 uprv_free(pcetable);
413 }
414
415 pcetable = temp;
416 //result += (uint16_t)(ucol_getMaxExpansion(coleiter, ce) - 1);
417 }
418
419 pcetable[offset] = 0;
420 pattern->pces = pcetable;
421 pattern->pcesLength = offset;
422
423 return result;
424 }
425
426 /**
427 * Initializes the pattern struct.
428 * Internal method, status assumed to be success.
429 * @param strsrch UStringSearch data storage
430 * @param status output error if any, caller to check status before calling
431 * method, status assumed to be success when passed in.
432 * @return expansionsize the total expansion size of the pattern
433 */
434 static
initializePattern(UStringSearch * strsrch,UErrorCode * status)435 inline int16_t initializePattern(UStringSearch *strsrch, UErrorCode *status)
436 {
437 if (U_FAILURE(*status)) { return 0; }
438 UPattern *pattern = &(strsrch->pattern);
439 const UChar *patterntext = pattern->text;
440 int32_t length = pattern->textLength;
441 int32_t index = 0;
442
443 // Since the strength is primary, accents are ignored in the pattern.
444 if (strsrch->strength == UCOL_PRIMARY) {
445 pattern->hasPrefixAccents = 0;
446 pattern->hasSuffixAccents = 0;
447 } else {
448 pattern->hasPrefixAccents = getFCD(patterntext, &index, length) >>
449 SECOND_LAST_BYTE_SHIFT_;
450 index = length;
451 U16_BACK_1(patterntext, 0, index);
452 pattern->hasSuffixAccents = getFCD(patterntext, &index, length) &
453 LAST_BYTE_MASK_;
454 }
455
456 // ** HACK **
457 if (strsrch->pattern.pces != NULL) {
458 if (strsrch->pattern.pces != strsrch->pattern.pcesBuffer) {
459 uprv_free(strsrch->pattern.pces);
460 }
461
462 strsrch->pattern.pces = NULL;
463 }
464
465 // since intializePattern is an internal method status is a success.
466 return initializePatternCETable(strsrch, status);
467 }
468
469 /**
470 * Initializing shift tables, with the default values.
471 * If a corresponding default value is 0, the shift table is not set.
472 * @param shift table for forwards shift
473 * @param backshift table for backwards shift
474 * @param cetable table containing pattern ce
475 * @param cesize size of the pattern ces
476 * @param expansionsize total size of the expansions
477 * @param defaultforward the default forward value
478 * @param defaultbackward the default backward value
479 */
480 static
setShiftTable(int16_t shift[],int16_t backshift[],int32_t * cetable,int32_t cesize,int16_t expansionsize,int16_t defaultforward,int16_t defaultbackward)481 inline void setShiftTable(int16_t shift[], int16_t backshift[],
482 int32_t *cetable, int32_t cesize,
483 int16_t expansionsize,
484 int16_t defaultforward,
485 int16_t defaultbackward)
486 {
487 // estimate the value to shift. to do that we estimate the smallest
488 // number of characters to give the relevant ces, ie approximately
489 // the number of ces minus their expansion, since expansions can come
490 // from a character.
491 int32_t count;
492 for (count = 0; count < MAX_TABLE_SIZE_; count ++) {
493 shift[count] = defaultforward;
494 }
495 cesize --; // down to the last index
496 for (count = 0; count < cesize; count ++) {
497 // number of ces from right of array to the count
498 int temp = defaultforward - count - 1;
499 shift[hashFromCE32(cetable[count])] = temp > 1 ? temp : 1;
500 }
501 shift[hashFromCE32(cetable[cesize])] = 1;
502 // for ignorables we just shift by one. see test examples.
503 shift[hashFromCE32(0)] = 1;
504
505 for (count = 0; count < MAX_TABLE_SIZE_; count ++) {
506 backshift[count] = defaultbackward;
507 }
508 for (count = cesize; count > 0; count --) {
509 // the original value count does not seem to work
510 backshift[hashFromCE32(cetable[count])] = count > expansionsize ?
511 (int16_t)(count - expansionsize) : 1;
512 }
513 backshift[hashFromCE32(cetable[0])] = 1;
514 backshift[hashFromCE32(0)] = 1;
515 }
516
517 /**
518 * Building of the pattern collation element list and the boyer moore strsrch
519 * table.
520 * The canonical match will only be performed after the default match fails.
521 * For both cases we need to remember the size of the composed and decomposed
522 * versions of the string. Since the Boyer-Moore shift calculations shifts by
523 * a number of characters in the text and tries to match the pattern from that
524 * offset, the shift value can not be too large in case we miss some
525 * characters. To choose a right shift size, we estimate the NFC form of the
526 * and use its size as a shift guide. The NFC form should be the small
527 * possible representation of the pattern. Anyways, we'll err on the smaller
528 * shift size. Hence the calculation for minlength.
529 * Canonical match will be performed slightly differently. We'll split the
530 * pattern into 3 parts, the prefix accents (PA), the middle string bounded by
531 * the first and last base character (MS), the ending accents (EA). Matches
532 * will be done on MS first, and only when we match MS then some processing
533 * will be required for the prefix and end accents in order to determine if
534 * they match PA and EA. Hence the default shift values
535 * for the canonical match will take the size of either end's accent into
536 * consideration. Forwards search will take the end accents into consideration
537 * for the default shift values and the backwards search will take the prefix
538 * accents into consideration.
539 * If pattern has no non-ignorable ce, we return a illegal argument error.
540 * Internal method, status assumed to be success.
541 * @param strsrch UStringSearch data storage
542 * @param status for output errors if it occurs, status is assumed to be a
543 * success when it is passed in.
544 */
545 static
initialize(UStringSearch * strsrch,UErrorCode * status)546 inline void initialize(UStringSearch *strsrch, UErrorCode *status)
547 {
548 int16_t expandlength = initializePattern(strsrch, status);
549 if (U_SUCCESS(*status) && strsrch->pattern.cesLength > 0) {
550 UPattern *pattern = &strsrch->pattern;
551 int32_t cesize = pattern->cesLength;
552
553 int16_t minlength = cesize > expandlength
554 ? (int16_t)cesize - expandlength : 1;
555 pattern->defaultShiftSize = minlength;
556 setShiftTable(pattern->shift, pattern->backShift, pattern->ces,
557 cesize, expandlength, minlength, minlength);
558 return;
559 }
560 strsrch->pattern.defaultShiftSize = 0;
561 }
562
563 #if BOYER_MOORE
564 /**
565 * Check to make sure that the match length is at the end of the character by
566 * using the breakiterator.
567 * @param strsrch string search data
568 * @param start target text start offset
569 * @param end target text end offset
570 */
571 static
checkBreakBoundary(const UStringSearch * strsrch,int32_t *,int32_t * end)572 void checkBreakBoundary(const UStringSearch *strsrch, int32_t * /*start*/,
573 int32_t *end)
574 {
575 #if !UCONFIG_NO_BREAK_ITERATION
576 UBreakIterator *breakiterator = strsrch->search->internalBreakIter;
577 if (breakiterator) {
578 int32_t matchend = *end;
579 //int32_t matchstart = *start;
580
581 if (!ubrk_isBoundary(breakiterator, matchend)) {
582 *end = ubrk_following(breakiterator, matchend);
583 }
584
585 /* Check the start of the matched text to make sure it doesn't have any accents
586 * before it. This code may not be necessary and so it is commented out */
587 /*if (!ubrk_isBoundary(breakiterator, matchstart) && !ubrk_isBoundary(breakiterator, matchstart-1)) {
588 *start = ubrk_preceding(breakiterator, matchstart);
589 }*/
590 }
591 #endif
592 }
593
594 /**
595 * Determine whether the target text in UStringSearch bounded by the offset
596 * start and end is one or more whole units of text as
597 * determined by the breakiterator in UStringSearch.
598 * @param strsrch string search data
599 * @param start target text start offset
600 * @param end target text end offset
601 */
602 static
isBreakUnit(const UStringSearch * strsrch,int32_t start,int32_t end)603 UBool isBreakUnit(const UStringSearch *strsrch, int32_t start,
604 int32_t end)
605 {
606 #if !UCONFIG_NO_BREAK_ITERATION
607 UBreakIterator *breakiterator = strsrch->search->breakIter;
608 //TODO: Add here.
609 if (breakiterator) {
610 int32_t startindex = ubrk_first(breakiterator);
611 int32_t endindex = ubrk_last(breakiterator);
612
613 // out-of-range indexes are never boundary positions
614 if (start < startindex || start > endindex ||
615 end < startindex || end > endindex) {
616 return FALSE;
617 }
618 // otherwise, we can use following() on the position before the
619 // specified one and return true of the position we get back is the
620 // one the user specified
621 UBool result = (start == startindex ||
622 ubrk_following(breakiterator, start - 1) == start) &&
623 (end == endindex ||
624 ubrk_following(breakiterator, end - 1) == end);
625 if (result) {
626 // iterates the individual ces
627 UCollationElements *coleiter = strsrch->utilIter;
628 const UChar *text = strsrch->search->text +
629 start;
630 UErrorCode status = U_ZERO_ERROR;
631 ucol_setText(coleiter, text, end - start, &status);
632 for (int32_t count = 0; count < strsrch->pattern.cesLength;
633 count ++) {
634 int32_t ce = getCE(strsrch, ucol_next(coleiter, &status));
635 if (ce == UCOL_IGNORABLE) {
636 count --;
637 continue;
638 }
639 if (U_FAILURE(status) || ce != strsrch->pattern.ces[count]) {
640 return FALSE;
641 }
642 }
643 int32_t nextce = ucol_next(coleiter, &status);
644 while (ucol_getOffset(coleiter) == (end - start)
645 && getCE(strsrch, nextce) == UCOL_IGNORABLE) {
646 nextce = ucol_next(coleiter, &status);
647 }
648 if (ucol_getOffset(coleiter) == (end - start)
649 && nextce != UCOL_NULLORDER) {
650 // extra collation elements at the end of the match
651 return FALSE;
652 }
653 }
654 return result;
655 }
656 #endif
657 return TRUE;
658 }
659
660 /**
661 * Getting the next base character offset if current offset is an accent,
662 * or the current offset if the current character contains a base character.
663 * accents the following base character will be returned
664 * @param text string
665 * @param textoffset current offset
666 * @param textlength length of text string
667 * @return the next base character or the current offset
668 * if the current character is contains a base character.
669 */
670 static
getNextBaseOffset(const UChar * text,int32_t textoffset,int32_t textlength)671 inline int32_t getNextBaseOffset(const UChar *text,
672 int32_t textoffset,
673 int32_t textlength)
674 {
675 if (textoffset < textlength) {
676 int32_t temp = textoffset;
677 if (getFCD(text, &temp, textlength) >> SECOND_LAST_BYTE_SHIFT_) {
678 while (temp < textlength) {
679 int32_t result = temp;
680 if ((getFCD(text, &temp, textlength) >>
681 SECOND_LAST_BYTE_SHIFT_) == 0) {
682 return result;
683 }
684 }
685 return textlength;
686 }
687 }
688 return textoffset;
689 }
690
691 /**
692 * Gets the next base character offset depending on the string search pattern
693 * data
694 * @param strsrch string search data
695 * @param textoffset current offset, one offset away from the last character
696 * to search for.
697 * @return start index of the next base character or the current offset
698 * if the current character is contains a base character.
699 */
700 static
getNextUStringSearchBaseOffset(UStringSearch * strsrch,int32_t textoffset)701 inline int32_t getNextUStringSearchBaseOffset(UStringSearch *strsrch,
702 int32_t textoffset)
703 {
704 int32_t textlength = strsrch->search->textLength;
705 if (strsrch->pattern.hasSuffixAccents &&
706 textoffset < textlength) {
707 int32_t temp = textoffset;
708 const UChar *text = strsrch->search->text;
709 U16_BACK_1(text, 0, temp);
710 if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) {
711 return getNextBaseOffset(text, textoffset, textlength);
712 }
713 }
714 return textoffset;
715 }
716
717 /**
718 * Shifting the collation element iterator position forward to prepare for
719 * a following match. If the last character is a unsafe character, we'll only
720 * shift by 1 to capture contractions, normalization etc.
721 * Internal method, status assumed to be success.
722 * @param text strsrch string search data
723 * @param textoffset start text position to do search
724 * @param ce the text ce which failed the match.
725 * @param patternceindex index of the ce within the pattern ce buffer which
726 * failed the match
727 * @return final offset
728 */
729 static
shiftForward(UStringSearch * strsrch,int32_t textoffset,int32_t ce,int32_t patternceindex)730 inline int32_t shiftForward(UStringSearch *strsrch,
731 int32_t textoffset,
732 int32_t ce,
733 int32_t patternceindex)
734 {
735 UPattern *pattern = &(strsrch->pattern);
736 if (ce != UCOL_NULLORDER) {
737 int32_t shift = pattern->shift[hashFromCE32(ce)];
738 // this is to adjust for characters in the middle of the
739 // substring for matching that failed.
740 int32_t adjust = pattern->cesLength - patternceindex;
741 if (adjust > 1 && shift >= adjust) {
742 shift -= adjust - 1;
743 }
744 textoffset += shift;
745 }
746 else {
747 textoffset += pattern->defaultShiftSize;
748 }
749
750 textoffset = getNextUStringSearchBaseOffset(strsrch, textoffset);
751 // check for unsafe characters
752 // * if it is the start or middle of a contraction: to be done after
753 // a initial match is found
754 // * thai or lao base consonant character: similar to contraction
755 // * high surrogate character: similar to contraction
756 // * next character is a accent: shift to the next base character
757 return textoffset;
758 }
759 #endif // #if BOYER_MOORE
760
761 /**
762 * sets match not found
763 * @param strsrch string search data
764 */
765 static
setMatchNotFound(UStringSearch * strsrch)766 inline void setMatchNotFound(UStringSearch *strsrch)
767 {
768 // this method resets the match result regardless of the error status.
769 strsrch->search->matchedIndex = USEARCH_DONE;
770 strsrch->search->matchedLength = 0;
771 if (strsrch->search->isForwardSearching) {
772 setColEIterOffset(strsrch->textIter, strsrch->search->textLength);
773 }
774 else {
775 setColEIterOffset(strsrch->textIter, 0);
776 }
777 }
778
779 #if BOYER_MOORE
780 /**
781 * Gets the offset to the next safe point in text.
782 * ie. not the middle of a contraction, swappable characters or supplementary
783 * characters.
784 * @param collator collation sata
785 * @param text string to work with
786 * @param textoffset offset in string
787 * @param textlength length of text string
788 * @return offset to the next safe character
789 */
790 static
getNextSafeOffset(const UCollator * collator,const UChar * text,int32_t textoffset,int32_t textlength)791 inline int32_t getNextSafeOffset(const UCollator *collator,
792 const UChar *text,
793 int32_t textoffset,
794 int32_t textlength)
795 {
796 int32_t result = textoffset; // first contraction character
797 while (result != textlength && ucol_unsafeCP(text[result], collator)) {
798 result ++;
799 }
800 return result;
801 }
802
803 /**
804 * This checks for accents in the potential match started with a .
805 * composite character.
806 * This is really painful... we have to check that composite character do not
807 * have any extra accents. We have to normalize the potential match and find
808 * the immediate decomposed character before the match.
809 * The first composite character would have been taken care of by the fcd
810 * checks in checkForwardExactMatch.
811 * This is the slow path after the fcd of the first character and
812 * the last character has been checked by checkForwardExactMatch and we
813 * determine that the potential match has extra non-ignorable preceding
814 * ces.
815 * E.g. looking for \u0301 acute in \u01FA A ring above and acute,
816 * checkExtraMatchAccent should fail since there is a middle ring in \u01FA
817 * Note here that accents checking are slow and cautioned in the API docs.
818 * Internal method, status assumed to be a success, caller should check status
819 * before calling this method
820 * @param strsrch string search data
821 * @param start index of the potential unfriendly composite character
822 * @param end index of the potential unfriendly composite character
823 * @param status output error status if any.
824 * @return TRUE if there is non-ignorable accents before at the beginning
825 * of the match, FALSE otherwise.
826 */
827
828 static
checkExtraMatchAccents(const UStringSearch * strsrch,int32_t start,int32_t end,UErrorCode * status)829 UBool checkExtraMatchAccents(const UStringSearch *strsrch, int32_t start,
830 int32_t end,
831 UErrorCode *status)
832 {
833 UBool result = FALSE;
834 if (strsrch->pattern.hasPrefixAccents) {
835 int32_t length = end - start;
836 int32_t offset = 0;
837 const UChar *text = strsrch->search->text + start;
838
839 U16_FWD_1(text, offset, length);
840 // we are only concerned with the first composite character
841 if (unorm_quickCheck(text, offset, UNORM_NFD, status) == UNORM_NO) {
842 int32_t safeoffset = getNextSafeOffset(strsrch->collator,
843 text, 0, length);
844 if (safeoffset != length) {
845 safeoffset ++;
846 }
847 UChar *norm = NULL;
848 UChar buffer[INITIAL_ARRAY_SIZE_];
849 int32_t size = unorm_normalize(text, safeoffset, UNORM_NFD, 0,
850 buffer, INITIAL_ARRAY_SIZE_,
851 status);
852 if (U_FAILURE(*status)) {
853 return FALSE;
854 }
855 if (size >= INITIAL_ARRAY_SIZE_) {
856 norm = (UChar *)allocateMemory((size + 1) * sizeof(UChar),
857 status);
858 // if allocation failed, status will be set to
859 // U_MEMORY_ALLOCATION_ERROR and unorm_normalize internally
860 // checks for it.
861 size = unorm_normalize(text, safeoffset, UNORM_NFD, 0, norm,
862 size, status);
863 if (U_FAILURE(*status) && norm != NULL) {
864 uprv_free(norm);
865 return FALSE;
866 }
867 }
868 else {
869 norm = buffer;
870 }
871
872 UCollationElements *coleiter = strsrch->utilIter;
873 ucol_setText(coleiter, norm, size, status);
874 uint32_t firstce = strsrch->pattern.ces[0];
875 UBool ignorable = TRUE;
876 uint32_t ce = UCOL_IGNORABLE;
877 while (U_SUCCESS(*status) && ce != firstce && ce != (uint32_t)UCOL_NULLORDER) {
878 offset = ucol_getOffset(coleiter);
879 if (ce != firstce && ce != UCOL_IGNORABLE) {
880 ignorable = FALSE;
881 }
882 ce = ucol_next(coleiter, status);
883 }
884 UChar32 codepoint;
885 U16_PREV(norm, 0, offset, codepoint);
886 result = !ignorable && (u_getCombiningClass(codepoint) != 0);
887
888 if (norm != buffer) {
889 uprv_free(norm);
890 }
891 }
892 }
893
894 return result;
895 }
896
897 /**
898 * Used by exact matches, checks if there are accents before the match.
899 * This is really painful... we have to check that composite characters at
900 * the start of the matches have to not have any extra accents.
901 * We check the FCD of the character first, if it starts with an accent and
902 * the first pattern ce does not match the first ce of the character, we bail.
903 * Otherwise we try normalizing the first composite
904 * character and find the immediate decomposed character before the match to
905 * see if it is an non-ignorable accent.
906 * Now normalizing the first composite character is enough because we ensure
907 * that when the match is passed in here with extra beginning ces, the
908 * first or last ce that match has to occur within the first character.
909 * E.g. looking for \u0301 acute in \u01FA A ring above and acute,
910 * checkExtraMatchAccent should fail since there is a middle ring in \u01FA
911 * Note here that accents checking are slow and cautioned in the API docs.
912 * @param strsrch string search data
913 * @param start offset
914 * @param end offset
915 * @return TRUE if there are accents on either side of the match,
916 * FALSE otherwise
917 */
918 static
hasAccentsBeforeMatch(const UStringSearch * strsrch,int32_t start,int32_t end)919 UBool hasAccentsBeforeMatch(const UStringSearch *strsrch, int32_t start,
920 int32_t end)
921 {
922 if (strsrch->pattern.hasPrefixAccents) {
923 UCollationElements *coleiter = strsrch->textIter;
924 UErrorCode status = U_ZERO_ERROR;
925 // we have been iterating forwards previously
926 uint32_t ignorable = TRUE;
927 int32_t firstce = strsrch->pattern.ces[0];
928
929 setColEIterOffset(coleiter, start);
930 int32_t ce = getCE(strsrch, ucol_next(coleiter, &status));
931 if (U_FAILURE(status)) {
932 return TRUE;
933 }
934 while (ce != firstce) {
935 if (ce != UCOL_IGNORABLE) {
936 ignorable = FALSE;
937 }
938 ce = getCE(strsrch, ucol_next(coleiter, &status));
939 if (U_FAILURE(status) || ce == UCOL_NULLORDER) {
940 return TRUE;
941 }
942 }
943 if (!ignorable && inNormBuf(coleiter)) {
944 // within normalization buffer, discontiguous handled here
945 return TRUE;
946 }
947
948 // within text
949 int32_t temp = start;
950 // original code
951 // accent = (getFCD(strsrch->search->text, &temp,
952 // strsrch->search->textLength)
953 // >> SECOND_LAST_BYTE_SHIFT_);
954 // however this code does not work well with VC7 .net in release mode.
955 // maybe the inlines for getFCD combined with shifting has bugs in
956 // VC7. anyways this is a work around.
957 UBool accent = getFCD(strsrch->search->text, &temp,
958 strsrch->search->textLength) > 0xFF;
959 if (!accent) {
960 return checkExtraMatchAccents(strsrch, start, end, &status);
961 }
962 if (!ignorable) {
963 return TRUE;
964 }
965 if (start > 0) {
966 temp = start;
967 U16_BACK_1(strsrch->search->text, 0, temp);
968 if (getFCD(strsrch->search->text, &temp,
969 strsrch->search->textLength) & LAST_BYTE_MASK_) {
970 setColEIterOffset(coleiter, start);
971 ce = ucol_previous(coleiter, &status);
972 if (U_FAILURE(status) ||
973 (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE)) {
974 return TRUE;
975 }
976 }
977 }
978 }
979
980 return FALSE;
981 }
982
983 /**
984 * Used by exact matches, checks if there are accents bounding the match.
985 * Note this is the initial boundary check. If the potential match
986 * starts or ends with composite characters, the accents in those
987 * characters will be determined later.
988 * Not doing backwards iteration here, since discontiguos contraction for
989 * backwards collation element iterator, use up too many characters.
990 * E.g. looking for \u030A ring in \u01FA A ring above and acute,
991 * should fail since there is a acute at the end of \u01FA
992 * Note here that accents checking are slow and cautioned in the API docs.
993 * @param strsrch string search data
994 * @param start offset of match
995 * @param end end offset of the match
996 * @return TRUE if there are accents on either side of the match,
997 * FALSE otherwise
998 */
999 static
hasAccentsAfterMatch(const UStringSearch * strsrch,int32_t start,int32_t end)1000 UBool hasAccentsAfterMatch(const UStringSearch *strsrch, int32_t start,
1001 int32_t end)
1002 {
1003 if (strsrch->pattern.hasSuffixAccents) {
1004 const UChar *text = strsrch->search->text;
1005 int32_t temp = end;
1006 int32_t textlength = strsrch->search->textLength;
1007 U16_BACK_1(text, 0, temp);
1008 if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) {
1009 int32_t firstce = strsrch->pattern.ces[0];
1010 UCollationElements *coleiter = strsrch->textIter;
1011 UErrorCode status = U_ZERO_ERROR;
1012 int32_t ce;
1013 setColEIterOffset(coleiter, start);
1014 while ((ce = getCE(strsrch, ucol_next(coleiter, &status))) != firstce) {
1015 if (U_FAILURE(status) || ce == UCOL_NULLORDER) {
1016 return TRUE;
1017 }
1018 }
1019 int32_t count = 1;
1020 while (count < strsrch->pattern.cesLength) {
1021 if (getCE(strsrch, ucol_next(coleiter, &status))
1022 == UCOL_IGNORABLE) {
1023 // Thai can give an ignorable here.
1024 count --;
1025 }
1026 if (U_FAILURE(status)) {
1027 return TRUE;
1028 }
1029 count ++;
1030 }
1031
1032 ce = ucol_next(coleiter, &status);
1033 if (U_FAILURE(status)) {
1034 return TRUE;
1035 }
1036 if (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE) {
1037 ce = getCE(strsrch, ce);
1038 }
1039 if (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE) {
1040 if (ucol_getOffset(coleiter) <= end) {
1041 return TRUE;
1042 }
1043 if (getFCD(text, &end, textlength) >> SECOND_LAST_BYTE_SHIFT_) {
1044 return TRUE;
1045 }
1046 }
1047 }
1048 }
1049 return FALSE;
1050 }
1051 #endif // #if BOYER_MOORE
1052
1053 /**
1054 * Checks if the offset runs out of the text string
1055 * @param offset
1056 * @param textlength of the text string
1057 * @return TRUE if offset is out of bounds, FALSE otherwise
1058 */
1059 static
isOutOfBounds(int32_t textlength,int32_t offset)1060 inline UBool isOutOfBounds(int32_t textlength, int32_t offset)
1061 {
1062 return offset < 0 || offset > textlength;
1063 }
1064
1065 /**
1066 * Checks for identical match
1067 * @param strsrch string search data
1068 * @param start offset of possible match
1069 * @param end offset of possible match
1070 * @return TRUE if identical match is found
1071 */
1072 static
checkIdentical(const UStringSearch * strsrch,int32_t start,int32_t end)1073 inline UBool checkIdentical(const UStringSearch *strsrch, int32_t start,
1074 int32_t end)
1075 {
1076 if (strsrch->strength != UCOL_IDENTICAL) {
1077 return TRUE;
1078 }
1079
1080 // Note: We could use Normalizer::compare() or similar, but for short strings
1081 // which may not be in FCD it might be faster to just NFD them.
1082 UErrorCode status = U_ZERO_ERROR;
1083 UnicodeString t2, p2;
1084 strsrch->nfd->normalize(
1085 UnicodeString(FALSE, strsrch->search->text + start, end - start), t2, status);
1086 strsrch->nfd->normalize(
1087 UnicodeString(FALSE, strsrch->pattern.text, strsrch->pattern.textLength), p2, status);
1088 // return FALSE if NFD failed
1089 return U_SUCCESS(status) && t2 == p2;
1090 }
1091
1092 #if BOYER_MOORE
1093 /**
1094 * Checks to see if the match is repeated
1095 * @param strsrch string search data
1096 * @param start new match start index
1097 * @param end new match end index
1098 * @return TRUE if the the match is repeated, FALSE otherwise
1099 */
1100 static
checkRepeatedMatch(UStringSearch * strsrch,int32_t start,int32_t end)1101 inline UBool checkRepeatedMatch(UStringSearch *strsrch,
1102 int32_t start,
1103 int32_t end)
1104 {
1105 int32_t lastmatchindex = strsrch->search->matchedIndex;
1106 UBool result;
1107 if (lastmatchindex == USEARCH_DONE) {
1108 return FALSE;
1109 }
1110 if (strsrch->search->isForwardSearching) {
1111 result = start <= lastmatchindex;
1112 }
1113 else {
1114 result = start >= lastmatchindex;
1115 }
1116 if (!result && !strsrch->search->isOverlap) {
1117 if (strsrch->search->isForwardSearching) {
1118 result = start < lastmatchindex + strsrch->search->matchedLength;
1119 }
1120 else {
1121 result = end > lastmatchindex;
1122 }
1123 }
1124 return result;
1125 }
1126
1127 /**
1128 * Gets the collation element iterator's current offset.
1129 * @param coleiter collation element iterator
1130 * @param forwards flag TRUE if we are moving in th forwards direction
1131 * @return current offset
1132 */
1133 static
getColElemIterOffset(const UCollationElements * coleiter,UBool forwards)1134 inline int32_t getColElemIterOffset(const UCollationElements *coleiter,
1135 UBool forwards)
1136 {
1137 int32_t result = ucol_getOffset(coleiter);
1138 // intricacies of the the backwards collation element iterator
1139 if (FALSE && !forwards && inNormBuf(coleiter) && !isFCDPointerNull(coleiter)) {
1140 result ++;
1141 }
1142 return result;
1143 }
1144
1145 /**
1146 * Checks match for contraction.
1147 * If the match ends with a partial contraction we fail.
1148 * If the match starts too far off (because of backwards iteration) we try to
1149 * chip off the extra characters depending on whether a breakiterator has
1150 * been used.
1151 * Internal method, error assumed to be success, caller has to check status
1152 * before calling this method.
1153 * @param strsrch string search data
1154 * @param start offset of potential match, to be modified if necessary
1155 * @param end offset of potential match, to be modified if necessary
1156 * @param status output error status if any
1157 * @return TRUE if match passes the contraction test, FALSE otherwise
1158 */
1159
1160 static
checkNextExactContractionMatch(UStringSearch * strsrch,int32_t * start,int32_t * end,UErrorCode * status)1161 UBool checkNextExactContractionMatch(UStringSearch *strsrch,
1162 int32_t *start,
1163 int32_t *end, UErrorCode *status)
1164 {
1165 UCollationElements *coleiter = strsrch->textIter;
1166 int32_t textlength = strsrch->search->textLength;
1167 int32_t temp = *start;
1168 const UCollator *collator = strsrch->collator;
1169 const UChar *text = strsrch->search->text;
1170 // This part checks if either ends of the match contains potential
1171 // contraction. If so we'll have to iterate through them
1172 // The start contraction needs to be checked since ucol_previous dumps
1173 // all characters till the first safe character into the buffer.
1174 // *start + 1 is used to test for the unsafe characters instead of *start
1175 // because ucol_prev takes all unsafe characters till the first safe
1176 // character ie *start. so by testing *start + 1, we can estimate if
1177 // excess prefix characters has been included in the potential search
1178 // results.
1179 if ((*end < textlength && ucol_unsafeCP(text[*end], collator)) ||
1180 (*start + 1 < textlength
1181 && ucol_unsafeCP(text[*start + 1], collator))) {
1182 int32_t expansion = getExpansionPrefix(coleiter);
1183 UBool expandflag = expansion > 0;
1184 setColEIterOffset(coleiter, *start);
1185 while (expansion > 0) {
1186 // getting rid of the redundant ce, caused by setOffset.
1187 // since backward contraction/expansion may have extra ces if we
1188 // are in the normalization buffer, hasAccentsBeforeMatch would
1189 // have taken care of it.
1190 // E.g. the character \u01FA will have an expansion of 3, but if
1191 // we are only looking for acute and ring \u030A and \u0301, we'll
1192 // have to skip the first ce in the expansion buffer.
1193 ucol_next(coleiter, status);
1194 if (U_FAILURE(*status)) {
1195 return FALSE;
1196 }
1197 if (ucol_getOffset(coleiter) != temp) {
1198 *start = temp;
1199 temp = ucol_getOffset(coleiter);
1200 }
1201 expansion --;
1202 }
1203
1204 int32_t *patternce = strsrch->pattern.ces;
1205 int32_t patterncelength = strsrch->pattern.cesLength;
1206 int32_t count = 0;
1207 while (count < patterncelength) {
1208 int32_t ce = getCE(strsrch, ucol_next(coleiter, status));
1209 if (ce == UCOL_IGNORABLE) {
1210 continue;
1211 }
1212 if (expandflag && count == 0 && ucol_getOffset(coleiter) != temp) {
1213 *start = temp;
1214 temp = ucol_getOffset(coleiter);
1215 }
1216 if (U_FAILURE(*status) || ce != patternce[count]) {
1217 (*end) ++;
1218 *end = getNextUStringSearchBaseOffset(strsrch, *end);
1219 return FALSE;
1220 }
1221 count ++;
1222 }
1223 }
1224 return TRUE;
1225 }
1226
1227 /**
1228 * Checks and sets the match information if found.
1229 * Checks
1230 * <ul>
1231 * <li> the potential match does not repeat the previous match
1232 * <li> boundaries are correct
1233 * <li> exact matches has no extra accents
1234 * <li> identical matchesb
1235 * <li> potential match does not end in the middle of a contraction
1236 * <\ul>
1237 * Otherwise the offset will be shifted to the next character.
1238 * Internal method, status assumed to be success, caller has to check status
1239 * before calling this method.
1240 * @param strsrch string search data
1241 * @param textoffset offset in the collation element text. the returned value
1242 * will be the truncated end offset of the match or the new start
1243 * search offset.
1244 * @param status output error status if any
1245 * @return TRUE if the match is valid, FALSE otherwise
1246 */
1247 static
checkNextExactMatch(UStringSearch * strsrch,int32_t * textoffset,UErrorCode * status)1248 inline UBool checkNextExactMatch(UStringSearch *strsrch,
1249 int32_t *textoffset, UErrorCode *status)
1250 {
1251 UCollationElements *coleiter = strsrch->textIter;
1252 int32_t start = getColElemIterOffset(coleiter, FALSE);
1253
1254 if (!checkNextExactContractionMatch(strsrch, &start, textoffset, status)) {
1255 return FALSE;
1256 }
1257
1258 // this totally matches, however we need to check if it is repeating
1259 if (!isBreakUnit(strsrch, start, *textoffset) ||
1260 checkRepeatedMatch(strsrch, start, *textoffset) ||
1261 hasAccentsBeforeMatch(strsrch, start, *textoffset) ||
1262 !checkIdentical(strsrch, start, *textoffset) ||
1263 hasAccentsAfterMatch(strsrch, start, *textoffset)) {
1264
1265 (*textoffset) ++;
1266 *textoffset = getNextUStringSearchBaseOffset(strsrch, *textoffset);
1267 return FALSE;
1268 }
1269
1270 //Add breakiterator boundary check for primary strength search.
1271 if (!strsrch->search->breakIter && strsrch->strength == UCOL_PRIMARY) {
1272 checkBreakBoundary(strsrch, &start, textoffset);
1273 }
1274
1275 // totally match, we will get rid of the ending ignorables.
1276 strsrch->search->matchedIndex = start;
1277 strsrch->search->matchedLength = *textoffset - start;
1278 return TRUE;
1279 }
1280
1281 /**
1282 * Getting the previous base character offset, or the current offset if the
1283 * current character is a base character
1284 * @param text string
1285 * @param textoffset one offset after the current character
1286 * @return the offset of the next character after the base character or the first
1287 * composed character with accents
1288 */
1289 static
getPreviousBaseOffset(const UChar * text,int32_t textoffset)1290 inline int32_t getPreviousBaseOffset(const UChar *text,
1291 int32_t textoffset)
1292 {
1293 if (textoffset > 0) {
1294 for (;;) {
1295 int32_t result = textoffset;
1296 U16_BACK_1(text, 0, textoffset);
1297 int32_t temp = textoffset;
1298 uint16_t fcd = getFCD(text, &temp, result);
1299 if ((fcd >> SECOND_LAST_BYTE_SHIFT_) == 0) {
1300 if (fcd & LAST_BYTE_MASK_) {
1301 return textoffset;
1302 }
1303 return result;
1304 }
1305 if (textoffset == 0) {
1306 return 0;
1307 }
1308 }
1309 }
1310 return textoffset;
1311 }
1312
1313 /**
1314 * Getting the indexes of the accents that are not blocked in the argument
1315 * accent array
1316 * @param accents array of accents in nfd terminated by a 0.
1317 * @param accentsindex array of indexes of the accents that are not blocked
1318 */
1319 static
getUnblockedAccentIndex(UChar * accents,int32_t * accentsindex)1320 inline int getUnblockedAccentIndex(UChar *accents, int32_t *accentsindex)
1321 {
1322 int32_t index = 0;
1323 int32_t length = u_strlen(accents);
1324 UChar32 codepoint = 0;
1325 int cclass = 0;
1326 int result = 0;
1327 int32_t temp;
1328 while (index < length) {
1329 temp = index;
1330 U16_NEXT(accents, index, length, codepoint);
1331 if (u_getCombiningClass(codepoint) != cclass) {
1332 cclass = u_getCombiningClass(codepoint);
1333 accentsindex[result] = temp;
1334 result ++;
1335 }
1336 }
1337 accentsindex[result] = length;
1338 return result;
1339 }
1340
1341 /**
1342 * Appends 3 UChar arrays to a destination array.
1343 * Creates a new array if we run out of space. The caller will have to
1344 * manually deallocate the newly allocated array.
1345 * Internal method, status assumed to be success, caller has to check status
1346 * before calling this method. destination not to be NULL and has at least
1347 * size destinationlength.
1348 * @param destination target array
1349 * @param destinationlength target array size, returning the appended length
1350 * @param source1 null-terminated first array
1351 * @param source2 second array
1352 * @param source2length length of seond array
1353 * @param source3 null-terminated third array
1354 * @param status error status if any
1355 * @return new destination array, destination if there was no new allocation
1356 */
1357 static
addToUCharArray(UChar * destination,int32_t * destinationlength,const UChar * source1,const UChar * source2,int32_t source2length,const UChar * source3,UErrorCode * status)1358 inline UChar * addToUCharArray( UChar *destination,
1359 int32_t *destinationlength,
1360 const UChar *source1,
1361 const UChar *source2,
1362 int32_t source2length,
1363 const UChar *source3,
1364 UErrorCode *status)
1365 {
1366 int32_t source1length = source1 ? u_strlen(source1) : 0;
1367 int32_t source3length = source3 ? u_strlen(source3) : 0;
1368 if (*destinationlength < source1length + source2length + source3length +
1369 1)
1370 {
1371 destination = (UChar *)allocateMemory(
1372 (source1length + source2length + source3length + 1) * sizeof(UChar),
1373 status);
1374 // if error allocating memory, status will be
1375 // U_MEMORY_ALLOCATION_ERROR
1376 if (U_FAILURE(*status)) {
1377 *destinationlength = 0;
1378 return NULL;
1379 }
1380 }
1381 if (source1length != 0) {
1382 uprv_memcpy(destination, source1, sizeof(UChar) * source1length);
1383 }
1384 if (source2length != 0) {
1385 uprv_memcpy(destination + source1length, source2,
1386 sizeof(UChar) * source2length);
1387 }
1388 if (source3length != 0) {
1389 uprv_memcpy(destination + source1length + source2length, source3,
1390 sizeof(UChar) * source3length);
1391 }
1392 *destinationlength = source1length + source2length + source3length;
1393 return destination;
1394 }
1395
1396 /**
1397 * Running through a collation element iterator to see if the contents matches
1398 * pattern in string search data
1399 * @param strsrch string search data
1400 * @param coleiter collation element iterator
1401 * @return TRUE if a match if found, FALSE otherwise
1402 */
1403 static
checkCollationMatch(const UStringSearch * strsrch,UCollationElements * coleiter)1404 inline UBool checkCollationMatch(const UStringSearch *strsrch,
1405 UCollationElements *coleiter)
1406 {
1407 int patternceindex = strsrch->pattern.cesLength;
1408 int32_t *patternce = strsrch->pattern.ces;
1409 UErrorCode status = U_ZERO_ERROR;
1410 while (patternceindex > 0) {
1411 int32_t ce = getCE(strsrch, ucol_next(coleiter, &status));
1412 if (ce == UCOL_IGNORABLE) {
1413 continue;
1414 }
1415 if (U_FAILURE(status) || ce != *patternce) {
1416 return FALSE;
1417 }
1418 patternce ++;
1419 patternceindex --;
1420 }
1421 return TRUE;
1422 }
1423
1424 /**
1425 * Rearranges the front accents to try matching.
1426 * Prefix accents in the text will be grouped according to their combining
1427 * class and the groups will be mixed and matched to try find the perfect
1428 * match with the pattern.
1429 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
1430 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
1431 * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
1432 * "\u0301\u0325".
1433 * step 2: check if any of the generated substrings matches the pattern.
1434 * Internal method, status is assumed to be success, caller has to check status
1435 * before calling this method.
1436 * @param strsrch string search match
1437 * @param start first offset of the accents to start searching
1438 * @param end start of the last accent set
1439 * @param status output error status if any
1440 * @return USEARCH_DONE if a match is not found, otherwise return the starting
1441 * offset of the match. Note this start includes all preceding accents.
1442 */
1443 static
doNextCanonicalPrefixMatch(UStringSearch * strsrch,int32_t start,int32_t end,UErrorCode * status)1444 int32_t doNextCanonicalPrefixMatch(UStringSearch *strsrch,
1445 int32_t start,
1446 int32_t end,
1447 UErrorCode *status)
1448 {
1449 const UChar *text = strsrch->search->text;
1450 int32_t textlength = strsrch->search->textLength;
1451 int32_t tempstart = start;
1452
1453 if ((getFCD(text, &tempstart, textlength) & LAST_BYTE_MASK_) == 0) {
1454 // die... failed at a base character
1455 return USEARCH_DONE;
1456 }
1457
1458 int32_t offset = getNextBaseOffset(text, tempstart, textlength);
1459 start = getPreviousBaseOffset(text, tempstart);
1460
1461 UChar accents[INITIAL_ARRAY_SIZE_];
1462 // normalizing the offensive string
1463 unorm_normalize(text + start, offset - start, UNORM_NFD, 0, accents,
1464 INITIAL_ARRAY_SIZE_, status);
1465 if (U_FAILURE(*status)) {
1466 return USEARCH_DONE;
1467 }
1468
1469 int32_t accentsindex[INITIAL_ARRAY_SIZE_];
1470 int32_t accentsize = getUnblockedAccentIndex(accents,
1471 accentsindex);
1472 int32_t count = (2 << (accentsize - 1)) - 1;
1473 UChar buffer[INITIAL_ARRAY_SIZE_];
1474 UCollationElements *coleiter = strsrch->utilIter;
1475 while (U_SUCCESS(*status) && count > 0) {
1476 UChar *rearrange = strsrch->canonicalPrefixAccents;
1477 // copy the base characters
1478 for (int k = 0; k < accentsindex[0]; k ++) {
1479 *rearrange ++ = accents[k];
1480 }
1481 // forming all possible canonical rearrangement by dropping
1482 // sets of accents
1483 for (int i = 0; i <= accentsize - 1; i ++) {
1484 int32_t mask = 1 << (accentsize - i - 1);
1485 if (count & mask) {
1486 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
1487 *rearrange ++ = accents[j];
1488 }
1489 }
1490 }
1491 *rearrange = 0;
1492 int32_t matchsize = INITIAL_ARRAY_SIZE_;
1493 UChar *match = addToUCharArray(buffer, &matchsize,
1494 strsrch->canonicalPrefixAccents,
1495 strsrch->search->text + offset,
1496 end - offset,
1497 strsrch->canonicalSuffixAccents,
1498 status);
1499
1500 // if status is a failure, ucol_setText does nothing.
1501 // run the collator iterator through this match
1502 ucol_setText(coleiter, match, matchsize, status);
1503 if (U_SUCCESS(*status)) {
1504 if (checkCollationMatch(strsrch, coleiter)) {
1505 if (match != buffer) {
1506 uprv_free(match);
1507 }
1508 return start;
1509 }
1510 }
1511 count --;
1512 }
1513 return USEARCH_DONE;
1514 }
1515
1516 /**
1517 * Gets the offset to the safe point in text before textoffset.
1518 * ie. not the middle of a contraction, swappable characters or supplementary
1519 * characters.
1520 * @param collator collation sata
1521 * @param text string to work with
1522 * @param textoffset offset in string
1523 * @param textlength length of text string
1524 * @return offset to the previous safe character
1525 */
1526 static
getPreviousSafeOffset(const UCollator * collator,const UChar * text,int32_t textoffset)1527 inline uint32_t getPreviousSafeOffset(const UCollator *collator,
1528 const UChar *text,
1529 int32_t textoffset)
1530 {
1531 int32_t result = textoffset; // first contraction character
1532 while (result != 0 && ucol_unsafeCP(text[result - 1], collator)) {
1533 result --;
1534 }
1535 if (result != 0) {
1536 // the first contraction character is consider unsafe here
1537 result --;
1538 }
1539 return result;
1540 }
1541
1542 /**
1543 * Cleaning up after we passed the safe zone
1544 * @param strsrch string search data
1545 * @param safetext safe text array
1546 * @param safebuffer safe text buffer
1547 * @param coleiter collation element iterator for safe text
1548 */
1549 static
cleanUpSafeText(const UStringSearch * strsrch,UChar * safetext,UChar * safebuffer)1550 inline void cleanUpSafeText(const UStringSearch *strsrch, UChar *safetext,
1551 UChar *safebuffer)
1552 {
1553 if (safetext != safebuffer && safetext != strsrch->canonicalSuffixAccents)
1554 {
1555 uprv_free(safetext);
1556 }
1557 }
1558
1559 /**
1560 * Take the rearranged end accents and tries matching. If match failed at
1561 * a seperate preceding set of accents (seperated from the rearranged on by
1562 * at least a base character) then we rearrange the preceding accents and
1563 * tries matching again.
1564 * We allow skipping of the ends of the accent set if the ces do not match.
1565 * However if the failure is found before the accent set, it fails.
1566 * Internal method, status assumed to be success, caller has to check status
1567 * before calling this method.
1568 * @param strsrch string search data
1569 * @param textoffset of the start of the rearranged accent
1570 * @param status output error status if any
1571 * @return USEARCH_DONE if a match is not found, otherwise return the starting
1572 * offset of the match. Note this start includes all preceding accents.
1573 */
1574 static
doNextCanonicalSuffixMatch(UStringSearch * strsrch,int32_t textoffset,UErrorCode * status)1575 int32_t doNextCanonicalSuffixMatch(UStringSearch *strsrch,
1576 int32_t textoffset,
1577 UErrorCode *status)
1578 {
1579 const UChar *text = strsrch->search->text;
1580 const UCollator *collator = strsrch->collator;
1581 int32_t safelength = 0;
1582 UChar *safetext;
1583 int32_t safetextlength;
1584 UChar safebuffer[INITIAL_ARRAY_SIZE_];
1585 UCollationElements *coleiter = strsrch->utilIter;
1586 int32_t safeoffset = textoffset;
1587
1588 if (textoffset != 0 && ucol_unsafeCP(strsrch->canonicalSuffixAccents[0],
1589 collator)) {
1590 safeoffset = getPreviousSafeOffset(collator, text, textoffset);
1591 safelength = textoffset - safeoffset;
1592 safetextlength = INITIAL_ARRAY_SIZE_;
1593 safetext = addToUCharArray(safebuffer, &safetextlength, NULL,
1594 text + safeoffset, safelength,
1595 strsrch->canonicalSuffixAccents,
1596 status);
1597 }
1598 else {
1599 safetextlength = u_strlen(strsrch->canonicalSuffixAccents);
1600 safetext = strsrch->canonicalSuffixAccents;
1601 }
1602
1603 // if status is a failure, ucol_setText does nothing
1604 ucol_setText(coleiter, safetext, safetextlength, status);
1605 // status checked in loop below
1606
1607 int32_t *ce = strsrch->pattern.ces;
1608 int32_t celength = strsrch->pattern.cesLength;
1609 int ceindex = celength - 1;
1610 UBool isSafe = TRUE; // indication flag for position in safe zone
1611
1612 while (ceindex >= 0) {
1613 int32_t textce = ucol_previous(coleiter, status);
1614 if (U_FAILURE(*status)) {
1615 if (isSafe) {
1616 cleanUpSafeText(strsrch, safetext, safebuffer);
1617 }
1618 return USEARCH_DONE;
1619 }
1620 if (textce == UCOL_NULLORDER) {
1621 // check if we have passed the safe buffer
1622 if (coleiter == strsrch->textIter) {
1623 cleanUpSafeText(strsrch, safetext, safebuffer);
1624 return USEARCH_DONE;
1625 }
1626 cleanUpSafeText(strsrch, safetext, safebuffer);
1627 safetext = safebuffer;
1628 coleiter = strsrch->textIter;
1629 setColEIterOffset(coleiter, safeoffset);
1630 // status checked at the start of the loop
1631 isSafe = FALSE;
1632 continue;
1633 }
1634 textce = getCE(strsrch, textce);
1635 if (textce != UCOL_IGNORABLE && textce != ce[ceindex]) {
1636 // do the beginning stuff
1637 int32_t failedoffset = getColElemIterOffset(coleiter, FALSE);
1638 if (isSafe && failedoffset >= safelength) {
1639 // alas... no hope. failed at rearranged accent set
1640 cleanUpSafeText(strsrch, safetext, safebuffer);
1641 return USEARCH_DONE;
1642 }
1643 else {
1644 if (isSafe) {
1645 failedoffset += safeoffset;
1646 cleanUpSafeText(strsrch, safetext, safebuffer);
1647 }
1648
1649 // try rearranging the front accents
1650 int32_t result = doNextCanonicalPrefixMatch(strsrch,
1651 failedoffset, textoffset, status);
1652 if (result != USEARCH_DONE) {
1653 // if status is a failure, ucol_setOffset does nothing
1654 setColEIterOffset(strsrch->textIter, result);
1655 }
1656 if (U_FAILURE(*status)) {
1657 return USEARCH_DONE;
1658 }
1659 return result;
1660 }
1661 }
1662 if (textce == ce[ceindex]) {
1663 ceindex --;
1664 }
1665 }
1666 // set offset here
1667 if (isSafe) {
1668 int32_t result = getColElemIterOffset(coleiter, FALSE);
1669 // sets the text iterator here with the correct expansion and offset
1670 int32_t leftoverces = getExpansionPrefix(coleiter);
1671 cleanUpSafeText(strsrch, safetext, safebuffer);
1672 if (result >= safelength) {
1673 result = textoffset;
1674 }
1675 else {
1676 result += safeoffset;
1677 }
1678 setColEIterOffset(strsrch->textIter, result);
1679 strsrch->textIter->iteratordata_.toReturn =
1680 setExpansionPrefix(strsrch->textIter, leftoverces);
1681 return result;
1682 }
1683
1684 return ucol_getOffset(coleiter);
1685 }
1686
1687 /**
1688 * Trying out the substring and sees if it can be a canonical match.
1689 * This will try normalizing the end accents and arranging them into canonical
1690 * equivalents and check their corresponding ces with the pattern ce.
1691 * Suffix accents in the text will be grouped according to their combining
1692 * class and the groups will be mixed and matched to try find the perfect
1693 * match with the pattern.
1694 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
1695 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
1696 * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
1697 * "\u0301\u0325".
1698 * step 2: check if any of the generated substrings matches the pattern.
1699 * Internal method, status assumed to be success, caller has to check status
1700 * before calling this method.
1701 * @param strsrch string search data
1702 * @param textoffset end offset in the collation element text that ends with
1703 * the accents to be rearranged
1704 * @param status error status if any
1705 * @return TRUE if the match is valid, FALSE otherwise
1706 */
1707 static
doNextCanonicalMatch(UStringSearch * strsrch,int32_t textoffset,UErrorCode * status)1708 UBool doNextCanonicalMatch(UStringSearch *strsrch,
1709 int32_t textoffset,
1710 UErrorCode *status)
1711 {
1712 const UChar *text = strsrch->search->text;
1713 int32_t temp = textoffset;
1714 U16_BACK_1(text, 0, temp);
1715 if ((getFCD(text, &temp, textoffset) & LAST_BYTE_MASK_) == 0) {
1716 UCollationElements *coleiter = strsrch->textIter;
1717 int32_t offset = getColElemIterOffset(coleiter, FALSE);
1718 if (strsrch->pattern.hasPrefixAccents) {
1719 offset = doNextCanonicalPrefixMatch(strsrch, offset, textoffset,
1720 status);
1721 if (U_SUCCESS(*status) && offset != USEARCH_DONE) {
1722 setColEIterOffset(coleiter, offset);
1723 return TRUE;
1724 }
1725 }
1726 return FALSE;
1727 }
1728
1729 if (!strsrch->pattern.hasSuffixAccents) {
1730 return FALSE;
1731 }
1732
1733 UChar accents[INITIAL_ARRAY_SIZE_];
1734 // offset to the last base character in substring to search
1735 int32_t baseoffset = getPreviousBaseOffset(text, textoffset);
1736 // normalizing the offensive string
1737 unorm_normalize(text + baseoffset, textoffset - baseoffset, UNORM_NFD,
1738 0, accents, INITIAL_ARRAY_SIZE_, status);
1739 // status checked in loop below
1740
1741 int32_t accentsindex[INITIAL_ARRAY_SIZE_];
1742 int32_t size = getUnblockedAccentIndex(accents, accentsindex);
1743
1744 // 2 power n - 1 plus the full set of accents
1745 int32_t count = (2 << (size - 1)) - 1;
1746 while (U_SUCCESS(*status) && count > 0) {
1747 UChar *rearrange = strsrch->canonicalSuffixAccents;
1748 // copy the base characters
1749 for (int k = 0; k < accentsindex[0]; k ++) {
1750 *rearrange ++ = accents[k];
1751 }
1752 // forming all possible canonical rearrangement by dropping
1753 // sets of accents
1754 for (int i = 0; i <= size - 1; i ++) {
1755 int32_t mask = 1 << (size - i - 1);
1756 if (count & mask) {
1757 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
1758 *rearrange ++ = accents[j];
1759 }
1760 }
1761 }
1762 *rearrange = 0;
1763 int32_t offset = doNextCanonicalSuffixMatch(strsrch, baseoffset,
1764 status);
1765 if (offset != USEARCH_DONE) {
1766 return TRUE; // match found
1767 }
1768 count --;
1769 }
1770 return FALSE;
1771 }
1772
1773 /**
1774 * Gets the previous base character offset depending on the string search
1775 * pattern data
1776 * @param strsrch string search data
1777 * @param textoffset current offset, current character
1778 * @return the offset of the next character after this base character or itself
1779 * if it is a composed character with accents
1780 */
1781 static
getPreviousUStringSearchBaseOffset(UStringSearch * strsrch,int32_t textoffset)1782 inline int32_t getPreviousUStringSearchBaseOffset(UStringSearch *strsrch,
1783 int32_t textoffset)
1784 {
1785 if (strsrch->pattern.hasPrefixAccents && textoffset > 0) {
1786 const UChar *text = strsrch->search->text;
1787 int32_t offset = textoffset;
1788 if (getFCD(text, &offset, strsrch->search->textLength) >>
1789 SECOND_LAST_BYTE_SHIFT_) {
1790 return getPreviousBaseOffset(text, textoffset);
1791 }
1792 }
1793 return textoffset;
1794 }
1795
1796 /**
1797 * Checks match for contraction.
1798 * If the match ends with a partial contraction we fail.
1799 * If the match starts too far off (because of backwards iteration) we try to
1800 * chip off the extra characters
1801 * Internal method, status assumed to be success, caller has to check status
1802 * before calling this method.
1803 * @param strsrch string search data
1804 * @param start offset of potential match, to be modified if necessary
1805 * @param end offset of potential match, to be modified if necessary
1806 * @param status output error status if any
1807 * @return TRUE if match passes the contraction test, FALSE otherwise
1808 */
1809 static
checkNextCanonicalContractionMatch(UStringSearch * strsrch,int32_t * start,int32_t * end,UErrorCode * status)1810 UBool checkNextCanonicalContractionMatch(UStringSearch *strsrch,
1811 int32_t *start,
1812 int32_t *end,
1813 UErrorCode *status)
1814 {
1815 UCollationElements *coleiter = strsrch->textIter;
1816 int32_t textlength = strsrch->search->textLength;
1817 int32_t temp = *start;
1818 const UCollator *collator = strsrch->collator;
1819 const UChar *text = strsrch->search->text;
1820 // This part checks if either ends of the match contains potential
1821 // contraction. If so we'll have to iterate through them
1822 if ((*end < textlength && ucol_unsafeCP(text[*end], collator)) ||
1823 (*start + 1 < textlength
1824 && ucol_unsafeCP(text[*start + 1], collator))) {
1825 int32_t expansion = getExpansionPrefix(coleiter);
1826 UBool expandflag = expansion > 0;
1827 setColEIterOffset(coleiter, *start);
1828 while (expansion > 0) {
1829 // getting rid of the redundant ce, caused by setOffset.
1830 // since backward contraction/expansion may have extra ces if we
1831 // are in the normalization buffer, hasAccentsBeforeMatch would
1832 // have taken care of it.
1833 // E.g. the character \u01FA will have an expansion of 3, but if
1834 // we are only looking for acute and ring \u030A and \u0301, we'll
1835 // have to skip the first ce in the expansion buffer.
1836 ucol_next(coleiter, status);
1837 if (U_FAILURE(*status)) {
1838 return FALSE;
1839 }
1840 if (ucol_getOffset(coleiter) != temp) {
1841 *start = temp;
1842 temp = ucol_getOffset(coleiter);
1843 }
1844 expansion --;
1845 }
1846
1847 int32_t *patternce = strsrch->pattern.ces;
1848 int32_t patterncelength = strsrch->pattern.cesLength;
1849 int32_t count = 0;
1850 int32_t textlength = strsrch->search->textLength;
1851 while (count < patterncelength) {
1852 int32_t ce = getCE(strsrch, ucol_next(coleiter, status));
1853 // status checked below, note that if status is a failure
1854 // ucol_next returns UCOL_NULLORDER
1855 if (ce == UCOL_IGNORABLE) {
1856 continue;
1857 }
1858 if (expandflag && count == 0 && ucol_getOffset(coleiter) != temp) {
1859 *start = temp;
1860 temp = ucol_getOffset(coleiter);
1861 }
1862
1863 if (count == 0 && ce != patternce[0]) {
1864 // accents may have extra starting ces, this occurs when a
1865 // pure accent pattern is matched without rearrangement
1866 // text \u0325\u0300 and looking for \u0300
1867 int32_t expected = patternce[0];
1868 if (getFCD(text, start, textlength) & LAST_BYTE_MASK_) {
1869 ce = getCE(strsrch, ucol_next(coleiter, status));
1870 while (U_SUCCESS(*status) && ce != expected &&
1871 ce != UCOL_NULLORDER &&
1872 ucol_getOffset(coleiter) <= *end) {
1873 ce = getCE(strsrch, ucol_next(coleiter, status));
1874 }
1875 }
1876 }
1877 if (U_FAILURE(*status) || ce != patternce[count]) {
1878 (*end) ++;
1879 *end = getNextUStringSearchBaseOffset(strsrch, *end);
1880 return FALSE;
1881 }
1882 count ++;
1883 }
1884 }
1885 return TRUE;
1886 }
1887
1888 /**
1889 * Checks and sets the match information if found.
1890 * Checks
1891 * <ul>
1892 * <li> the potential match does not repeat the previous match
1893 * <li> boundaries are correct
1894 * <li> potential match does not end in the middle of a contraction
1895 * <li> identical matches
1896 * <\ul>
1897 * Otherwise the offset will be shifted to the next character.
1898 * Internal method, status assumed to be success, caller has to check the
1899 * status before calling this method.
1900 * @param strsrch string search data
1901 * @param textoffset offset in the collation element text. the returned value
1902 * will be the truncated end offset of the match or the new start
1903 * search offset.
1904 * @param status output error status if any
1905 * @return TRUE if the match is valid, FALSE otherwise
1906 */
1907 static
checkNextCanonicalMatch(UStringSearch * strsrch,int32_t * textoffset,UErrorCode * status)1908 inline UBool checkNextCanonicalMatch(UStringSearch *strsrch,
1909 int32_t *textoffset,
1910 UErrorCode *status)
1911 {
1912 // to ensure that the start and ends are not composite characters
1913 UCollationElements *coleiter = strsrch->textIter;
1914 // if we have a canonical accent match
1915 if ((strsrch->pattern.hasSuffixAccents &&
1916 strsrch->canonicalSuffixAccents[0]) ||
1917 (strsrch->pattern.hasPrefixAccents &&
1918 strsrch->canonicalPrefixAccents[0])) {
1919 strsrch->search->matchedIndex = getPreviousUStringSearchBaseOffset(
1920 strsrch,
1921 ucol_getOffset(coleiter));
1922 strsrch->search->matchedLength = *textoffset -
1923 strsrch->search->matchedIndex;
1924 return TRUE;
1925 }
1926
1927 int32_t start = getColElemIterOffset(coleiter, FALSE);
1928 if (!checkNextCanonicalContractionMatch(strsrch, &start, textoffset,
1929 status) || U_FAILURE(*status)) {
1930 return FALSE;
1931 }
1932
1933 start = getPreviousUStringSearchBaseOffset(strsrch, start);
1934 // this totally matches, however we need to check if it is repeating
1935 if (checkRepeatedMatch(strsrch, start, *textoffset) ||
1936 !isBreakUnit(strsrch, start, *textoffset) ||
1937 !checkIdentical(strsrch, start, *textoffset)) {
1938 (*textoffset) ++;
1939 *textoffset = getNextBaseOffset(strsrch->search->text, *textoffset,
1940 strsrch->search->textLength);
1941 return FALSE;
1942 }
1943
1944 strsrch->search->matchedIndex = start;
1945 strsrch->search->matchedLength = *textoffset - start;
1946 return TRUE;
1947 }
1948
1949 /**
1950 * Shifting the collation element iterator position forward to prepare for
1951 * a preceding match. If the first character is a unsafe character, we'll only
1952 * shift by 1 to capture contractions, normalization etc.
1953 * Internal method, status assumed to be success, caller has to check status
1954 * before calling this method.
1955 * @param text strsrch string search data
1956 * @param textoffset start text position to do search
1957 * @param ce the text ce which failed the match.
1958 * @param patternceindex index of the ce within the pattern ce buffer which
1959 * failed the match
1960 * @return final offset
1961 */
1962 static
reverseShift(UStringSearch * strsrch,int32_t textoffset,int32_t ce,int32_t patternceindex)1963 inline int32_t reverseShift(UStringSearch *strsrch,
1964 int32_t textoffset,
1965 int32_t ce,
1966 int32_t patternceindex)
1967 {
1968 if (strsrch->search->isOverlap) {
1969 if (textoffset != strsrch->search->textLength) {
1970 textoffset --;
1971 }
1972 else {
1973 textoffset -= strsrch->pattern.defaultShiftSize;
1974 }
1975 }
1976 else {
1977 if (ce != UCOL_NULLORDER) {
1978 int32_t shift = strsrch->pattern.backShift[hashFromCE32(ce)];
1979
1980 // this is to adjust for characters in the middle of the substring
1981 // for matching that failed.
1982 int32_t adjust = patternceindex;
1983 if (adjust > 1 && shift > adjust) {
1984 shift -= adjust - 1;
1985 }
1986 textoffset -= shift;
1987 }
1988 else {
1989 textoffset -= strsrch->pattern.defaultShiftSize;
1990 }
1991 }
1992 textoffset = getPreviousUStringSearchBaseOffset(strsrch, textoffset);
1993 return textoffset;
1994 }
1995
1996 /**
1997 * Checks match for contraction.
1998 * If the match starts with a partial contraction we fail.
1999 * Internal method, status assumed to be success, caller has to check status
2000 * before calling this method.
2001 * @param strsrch string search data
2002 * @param start offset of potential match, to be modified if necessary
2003 * @param end offset of potential match, to be modified if necessary
2004 * @param status output error status if any
2005 * @return TRUE if match passes the contraction test, FALSE otherwise
2006 */
2007 static
checkPreviousExactContractionMatch(UStringSearch * strsrch,int32_t * start,int32_t * end,UErrorCode * status)2008 UBool checkPreviousExactContractionMatch(UStringSearch *strsrch,
2009 int32_t *start,
2010 int32_t *end, UErrorCode *status)
2011 {
2012 UCollationElements *coleiter = strsrch->textIter;
2013 int32_t textlength = strsrch->search->textLength;
2014 int32_t temp = *end;
2015 const UCollator *collator = strsrch->collator;
2016 const UChar *text = strsrch->search->text;
2017 // This part checks if either if the start of the match contains potential
2018 // contraction. If so we'll have to iterate through them
2019 // Since we used ucol_next while previously looking for the potential
2020 // match, this guarantees that our end will not be a partial contraction,
2021 // or a partial supplementary character.
2022 if (*start < textlength && ucol_unsafeCP(text[*start], collator)) {
2023 int32_t expansion = getExpansionSuffix(coleiter);
2024 UBool expandflag = expansion > 0;
2025 setColEIterOffset(coleiter, *end);
2026 while (U_SUCCESS(*status) && expansion > 0) {
2027 // getting rid of the redundant ce
2028 // since forward contraction/expansion may have extra ces
2029 // if we are in the normalization buffer, hasAccentsBeforeMatch
2030 // would have taken care of it.
2031 // E.g. the character \u01FA will have an expansion of 3, but if
2032 // we are only looking for A ring A\u030A, we'll have to skip the
2033 // last ce in the expansion buffer
2034 ucol_previous(coleiter, status);
2035 if (U_FAILURE(*status)) {
2036 return FALSE;
2037 }
2038 if (ucol_getOffset(coleiter) != temp) {
2039 *end = temp;
2040 temp = ucol_getOffset(coleiter);
2041 }
2042 expansion --;
2043 }
2044
2045 int32_t *patternce = strsrch->pattern.ces;
2046 int32_t patterncelength = strsrch->pattern.cesLength;
2047 int32_t count = patterncelength;
2048 while (count > 0) {
2049 int32_t ce = getCE(strsrch, ucol_previous(coleiter, status));
2050 // status checked below, note that if status is a failure
2051 // ucol_previous returns UCOL_NULLORDER
2052 if (ce == UCOL_IGNORABLE) {
2053 continue;
2054 }
2055 if (expandflag && count == 0 &&
2056 getColElemIterOffset(coleiter, FALSE) != temp) {
2057 *end = temp;
2058 temp = ucol_getOffset(coleiter);
2059 }
2060 if (U_FAILURE(*status) || ce != patternce[count - 1]) {
2061 (*start) --;
2062 *start = getPreviousBaseOffset(text, *start);
2063 return FALSE;
2064 }
2065 count --;
2066 }
2067 }
2068 return TRUE;
2069 }
2070
2071 /**
2072 * Checks and sets the match information if found.
2073 * Checks
2074 * <ul>
2075 * <li> the current match does not repeat the last match
2076 * <li> boundaries are correct
2077 * <li> exact matches has no extra accents
2078 * <li> identical matches
2079 * <\ul>
2080 * Otherwise the offset will be shifted to the preceding character.
2081 * Internal method, status assumed to be success, caller has to check status
2082 * before calling this method.
2083 * @param strsrch string search data
2084 * @param collator
2085 * @param coleiter collation element iterator
2086 * @param text string
2087 * @param textoffset offset in the collation element text. the returned value
2088 * will be the truncated start offset of the match or the new start
2089 * search offset.
2090 * @param status output error status if any
2091 * @return TRUE if the match is valid, FALSE otherwise
2092 */
2093 static
checkPreviousExactMatch(UStringSearch * strsrch,int32_t * textoffset,UErrorCode * status)2094 inline UBool checkPreviousExactMatch(UStringSearch *strsrch,
2095 int32_t *textoffset,
2096 UErrorCode *status)
2097 {
2098 // to ensure that the start and ends are not composite characters
2099 int32_t end = ucol_getOffset(strsrch->textIter);
2100 if (!checkPreviousExactContractionMatch(strsrch, textoffset, &end, status)
2101 || U_FAILURE(*status)) {
2102 return FALSE;
2103 }
2104
2105 // this totally matches, however we need to check if it is repeating
2106 // the old match
2107 if (checkRepeatedMatch(strsrch, *textoffset, end) ||
2108 !isBreakUnit(strsrch, *textoffset, end) ||
2109 hasAccentsBeforeMatch(strsrch, *textoffset, end) ||
2110 !checkIdentical(strsrch, *textoffset, end) ||
2111 hasAccentsAfterMatch(strsrch, *textoffset, end)) {
2112 (*textoffset) --;
2113 *textoffset = getPreviousBaseOffset(strsrch->search->text,
2114 *textoffset);
2115 return FALSE;
2116 }
2117
2118 //Add breakiterator boundary check for primary strength search.
2119 if (!strsrch->search->breakIter && strsrch->strength == UCOL_PRIMARY) {
2120 checkBreakBoundary(strsrch, textoffset, &end);
2121 }
2122
2123 strsrch->search->matchedIndex = *textoffset;
2124 strsrch->search->matchedLength = end - *textoffset;
2125 return TRUE;
2126 }
2127
2128 /**
2129 * Rearranges the end accents to try matching.
2130 * Suffix accents in the text will be grouped according to their combining
2131 * class and the groups will be mixed and matched to try find the perfect
2132 * match with the pattern.
2133 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
2134 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
2135 * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
2136 * "\u0301\u0325".
2137 * step 2: check if any of the generated substrings matches the pattern.
2138 * Internal method, status assumed to be success, user has to check status
2139 * before calling this method.
2140 * @param strsrch string search match
2141 * @param start offset of the first base character
2142 * @param end start of the last accent set
2143 * @param status only error status if any
2144 * @return USEARCH_DONE if a match is not found, otherwise return the ending
2145 * offset of the match. Note this start includes all following accents.
2146 */
2147 static
doPreviousCanonicalSuffixMatch(UStringSearch * strsrch,int32_t start,int32_t end,UErrorCode * status)2148 int32_t doPreviousCanonicalSuffixMatch(UStringSearch *strsrch,
2149 int32_t start,
2150 int32_t end,
2151 UErrorCode *status)
2152 {
2153 const UChar *text = strsrch->search->text;
2154 int32_t tempend = end;
2155
2156 U16_BACK_1(text, 0, tempend);
2157 if (!(getFCD(text, &tempend, strsrch->search->textLength) &
2158 LAST_BYTE_MASK_)) {
2159 // die... failed at a base character
2160 return USEARCH_DONE;
2161 }
2162 end = getNextBaseOffset(text, end, strsrch->search->textLength);
2163
2164 if (U_SUCCESS(*status)) {
2165 UChar accents[INITIAL_ARRAY_SIZE_];
2166 int32_t offset = getPreviousBaseOffset(text, end);
2167 // normalizing the offensive string
2168 unorm_normalize(text + offset, end - offset, UNORM_NFD, 0, accents,
2169 INITIAL_ARRAY_SIZE_, status);
2170
2171 int32_t accentsindex[INITIAL_ARRAY_SIZE_];
2172 int32_t accentsize = getUnblockedAccentIndex(accents,
2173 accentsindex);
2174 int32_t count = (2 << (accentsize - 1)) - 1;
2175 UChar buffer[INITIAL_ARRAY_SIZE_];
2176 UCollationElements *coleiter = strsrch->utilIter;
2177 while (U_SUCCESS(*status) && count > 0) {
2178 UChar *rearrange = strsrch->canonicalSuffixAccents;
2179 // copy the base characters
2180 for (int k = 0; k < accentsindex[0]; k ++) {
2181 *rearrange ++ = accents[k];
2182 }
2183 // forming all possible canonical rearrangement by dropping
2184 // sets of accents
2185 for (int i = 0; i <= accentsize - 1; i ++) {
2186 int32_t mask = 1 << (accentsize - i - 1);
2187 if (count & mask) {
2188 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
2189 *rearrange ++ = accents[j];
2190 }
2191 }
2192 }
2193 *rearrange = 0;
2194 int32_t matchsize = INITIAL_ARRAY_SIZE_;
2195 UChar *match = addToUCharArray(buffer, &matchsize,
2196 strsrch->canonicalPrefixAccents,
2197 strsrch->search->text + start,
2198 offset - start,
2199 strsrch->canonicalSuffixAccents,
2200 status);
2201
2202 // run the collator iterator through this match
2203 // if status is a failure ucol_setText does nothing
2204 ucol_setText(coleiter, match, matchsize, status);
2205 if (U_SUCCESS(*status)) {
2206 if (checkCollationMatch(strsrch, coleiter)) {
2207 if (match != buffer) {
2208 uprv_free(match);
2209 }
2210 return end;
2211 }
2212 }
2213 count --;
2214 }
2215 }
2216 return USEARCH_DONE;
2217 }
2218
2219 /**
2220 * Take the rearranged start accents and tries matching. If match failed at
2221 * a seperate following set of accents (seperated from the rearranged on by
2222 * at least a base character) then we rearrange the preceding accents and
2223 * tries matching again.
2224 * We allow skipping of the ends of the accent set if the ces do not match.
2225 * However if the failure is found before the accent set, it fails.
2226 * Internal method, status assumed to be success, caller has to check status
2227 * before calling this method.
2228 * @param strsrch string search data
2229 * @param textoffset of the ends of the rearranged accent
2230 * @param status output error status if any
2231 * @return USEARCH_DONE if a match is not found, otherwise return the ending
2232 * offset of the match. Note this start includes all following accents.
2233 */
2234 static
doPreviousCanonicalPrefixMatch(UStringSearch * strsrch,int32_t textoffset,UErrorCode * status)2235 int32_t doPreviousCanonicalPrefixMatch(UStringSearch *strsrch,
2236 int32_t textoffset,
2237 UErrorCode *status)
2238 {
2239 const UChar *text = strsrch->search->text;
2240 const UCollator *collator = strsrch->collator;
2241 int32_t safelength = 0;
2242 UChar *safetext;
2243 int32_t safetextlength;
2244 UChar safebuffer[INITIAL_ARRAY_SIZE_];
2245 int32_t safeoffset = textoffset;
2246
2247 if (textoffset &&
2248 ucol_unsafeCP(strsrch->canonicalPrefixAccents[
2249 u_strlen(strsrch->canonicalPrefixAccents) - 1
2250 ], collator)) {
2251 safeoffset = getNextSafeOffset(collator, text, textoffset,
2252 strsrch->search->textLength);
2253 safelength = safeoffset - textoffset;
2254 safetextlength = INITIAL_ARRAY_SIZE_;
2255 safetext = addToUCharArray(safebuffer, &safetextlength,
2256 strsrch->canonicalPrefixAccents,
2257 text + textoffset, safelength,
2258 NULL, status);
2259 }
2260 else {
2261 safetextlength = u_strlen(strsrch->canonicalPrefixAccents);
2262 safetext = strsrch->canonicalPrefixAccents;
2263 }
2264
2265 UCollationElements *coleiter = strsrch->utilIter;
2266 // if status is a failure, ucol_setText does nothing
2267 ucol_setText(coleiter, safetext, safetextlength, status);
2268 // status checked in loop below
2269
2270 int32_t *ce = strsrch->pattern.ces;
2271 int32_t celength = strsrch->pattern.cesLength;
2272 int ceindex = 0;
2273 UBool isSafe = TRUE; // safe zone indication flag for position
2274 int32_t prefixlength = u_strlen(strsrch->canonicalPrefixAccents);
2275
2276 while (ceindex < celength) {
2277 int32_t textce = ucol_next(coleiter, status);
2278 if (U_FAILURE(*status)) {
2279 if (isSafe) {
2280 cleanUpSafeText(strsrch, safetext, safebuffer);
2281 }
2282 return USEARCH_DONE;
2283 }
2284 if (textce == UCOL_NULLORDER) {
2285 // check if we have passed the safe buffer
2286 if (coleiter == strsrch->textIter) {
2287 cleanUpSafeText(strsrch, safetext, safebuffer);
2288 return USEARCH_DONE;
2289 }
2290 cleanUpSafeText(strsrch, safetext, safebuffer);
2291 safetext = safebuffer;
2292 coleiter = strsrch->textIter;
2293 setColEIterOffset(coleiter, safeoffset);
2294 // status checked at the start of the loop
2295 isSafe = FALSE;
2296 continue;
2297 }
2298 textce = getCE(strsrch, textce);
2299 if (textce != UCOL_IGNORABLE && textce != ce[ceindex]) {
2300 // do the beginning stuff
2301 int32_t failedoffset = ucol_getOffset(coleiter);
2302 if (isSafe && failedoffset <= prefixlength) {
2303 // alas... no hope. failed at rearranged accent set
2304 cleanUpSafeText(strsrch, safetext, safebuffer);
2305 return USEARCH_DONE;
2306 }
2307 else {
2308 if (isSafe) {
2309 failedoffset = safeoffset - failedoffset;
2310 cleanUpSafeText(strsrch, safetext, safebuffer);
2311 }
2312
2313 // try rearranging the end accents
2314 int32_t result = doPreviousCanonicalSuffixMatch(strsrch,
2315 textoffset, failedoffset, status);
2316 if (result != USEARCH_DONE) {
2317 // if status is a failure, ucol_setOffset does nothing
2318 setColEIterOffset(strsrch->textIter, result);
2319 }
2320 if (U_FAILURE(*status)) {
2321 return USEARCH_DONE;
2322 }
2323 return result;
2324 }
2325 }
2326 if (textce == ce[ceindex]) {
2327 ceindex ++;
2328 }
2329 }
2330 // set offset here
2331 if (isSafe) {
2332 int32_t result = ucol_getOffset(coleiter);
2333 // sets the text iterator here with the correct expansion and offset
2334 int32_t leftoverces = getExpansionSuffix(coleiter);
2335 cleanUpSafeText(strsrch, safetext, safebuffer);
2336 if (result <= prefixlength) {
2337 result = textoffset;
2338 }
2339 else {
2340 result = textoffset + (safeoffset - result);
2341 }
2342 setColEIterOffset(strsrch->textIter, result);
2343 setExpansionSuffix(strsrch->textIter, leftoverces);
2344 return result;
2345 }
2346
2347 return ucol_getOffset(coleiter);
2348 }
2349
2350 /**
2351 * Trying out the substring and sees if it can be a canonical match.
2352 * This will try normalizing the starting accents and arranging them into
2353 * canonical equivalents and check their corresponding ces with the pattern ce.
2354 * Prefix accents in the text will be grouped according to their combining
2355 * class and the groups will be mixed and matched to try find the perfect
2356 * match with the pattern.
2357 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
2358 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
2359 * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
2360 * "\u0301\u0325".
2361 * step 2: check if any of the generated substrings matches the pattern.
2362 * Internal method, status assumed to be success, caller has to check status
2363 * before calling this method.
2364 * @param strsrch string search data
2365 * @param textoffset start offset in the collation element text that starts
2366 * with the accents to be rearranged
2367 * @param status output error status if any
2368 * @return TRUE if the match is valid, FALSE otherwise
2369 */
2370 static
doPreviousCanonicalMatch(UStringSearch * strsrch,int32_t textoffset,UErrorCode * status)2371 UBool doPreviousCanonicalMatch(UStringSearch *strsrch,
2372 int32_t textoffset,
2373 UErrorCode *status)
2374 {
2375 const UChar *text = strsrch->search->text;
2376 int32_t temp = textoffset;
2377 int32_t textlength = strsrch->search->textLength;
2378 if ((getFCD(text, &temp, textlength) >> SECOND_LAST_BYTE_SHIFT_) == 0) {
2379 UCollationElements *coleiter = strsrch->textIter;
2380 int32_t offset = ucol_getOffset(coleiter);
2381 if (strsrch->pattern.hasSuffixAccents) {
2382 offset = doPreviousCanonicalSuffixMatch(strsrch, textoffset,
2383 offset, status);
2384 if (U_SUCCESS(*status) && offset != USEARCH_DONE) {
2385 setColEIterOffset(coleiter, offset);
2386 return TRUE;
2387 }
2388 }
2389 return FALSE;
2390 }
2391
2392 if (!strsrch->pattern.hasPrefixAccents) {
2393 return FALSE;
2394 }
2395
2396 UChar accents[INITIAL_ARRAY_SIZE_];
2397 // offset to the last base character in substring to search
2398 int32_t baseoffset = getNextBaseOffset(text, textoffset, textlength);
2399 // normalizing the offensive string
2400 unorm_normalize(text + textoffset, baseoffset - textoffset, UNORM_NFD,
2401 0, accents, INITIAL_ARRAY_SIZE_, status);
2402 // status checked in loop
2403
2404 int32_t accentsindex[INITIAL_ARRAY_SIZE_];
2405 int32_t size = getUnblockedAccentIndex(accents, accentsindex);
2406
2407 // 2 power n - 1 plus the full set of accents
2408 int32_t count = (2 << (size - 1)) - 1;
2409 while (U_SUCCESS(*status) && count > 0) {
2410 UChar *rearrange = strsrch->canonicalPrefixAccents;
2411 // copy the base characters
2412 for (int k = 0; k < accentsindex[0]; k ++) {
2413 *rearrange ++ = accents[k];
2414 }
2415 // forming all possible canonical rearrangement by dropping
2416 // sets of accents
2417 for (int i = 0; i <= size - 1; i ++) {
2418 int32_t mask = 1 << (size - i - 1);
2419 if (count & mask) {
2420 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
2421 *rearrange ++ = accents[j];
2422 }
2423 }
2424 }
2425 *rearrange = 0;
2426 int32_t offset = doPreviousCanonicalPrefixMatch(strsrch,
2427 baseoffset, status);
2428 if (offset != USEARCH_DONE) {
2429 return TRUE; // match found
2430 }
2431 count --;
2432 }
2433 return FALSE;
2434 }
2435
2436 /**
2437 * Checks match for contraction.
2438 * If the match starts with a partial contraction we fail.
2439 * Internal method, status assumed to be success, caller has to check status
2440 * before calling this method.
2441 * @param strsrch string search data
2442 * @param start offset of potential match, to be modified if necessary
2443 * @param end offset of potential match, to be modified if necessary
2444 * @param status only error status if any
2445 * @return TRUE if match passes the contraction test, FALSE otherwise
2446 */
2447 static
checkPreviousCanonicalContractionMatch(UStringSearch * strsrch,int32_t * start,int32_t * end,UErrorCode * status)2448 UBool checkPreviousCanonicalContractionMatch(UStringSearch *strsrch,
2449 int32_t *start,
2450 int32_t *end, UErrorCode *status)
2451 {
2452 UCollationElements *coleiter = strsrch->textIter;
2453 int32_t textlength = strsrch->search->textLength;
2454 int32_t temp = *end;
2455 const UCollator *collator = strsrch->collator;
2456 const UChar *text = strsrch->search->text;
2457 // This part checks if either if the start of the match contains potential
2458 // contraction. If so we'll have to iterate through them
2459 // Since we used ucol_next while previously looking for the potential
2460 // match, this guarantees that our end will not be a partial contraction,
2461 // or a partial supplementary character.
2462 if (*start < textlength && ucol_unsafeCP(text[*start], collator)) {
2463 int32_t expansion = getExpansionSuffix(coleiter);
2464 UBool expandflag = expansion > 0;
2465 setColEIterOffset(coleiter, *end);
2466 while (expansion > 0) {
2467 // getting rid of the redundant ce
2468 // since forward contraction/expansion may have extra ces
2469 // if we are in the normalization buffer, hasAccentsBeforeMatch
2470 // would have taken care of it.
2471 // E.g. the character \u01FA will have an expansion of 3, but if
2472 // we are only looking for A ring A\u030A, we'll have to skip the
2473 // last ce in the expansion buffer
2474 ucol_previous(coleiter, status);
2475 if (U_FAILURE(*status)) {
2476 return FALSE;
2477 }
2478 if (ucol_getOffset(coleiter) != temp) {
2479 *end = temp;
2480 temp = ucol_getOffset(coleiter);
2481 }
2482 expansion --;
2483 }
2484
2485 int32_t *patternce = strsrch->pattern.ces;
2486 int32_t patterncelength = strsrch->pattern.cesLength;
2487 int32_t count = patterncelength;
2488 while (count > 0) {
2489 int32_t ce = getCE(strsrch, ucol_previous(coleiter, status));
2490 // status checked below, note that if status is a failure
2491 // ucol_previous returns UCOL_NULLORDER
2492 if (ce == UCOL_IGNORABLE) {
2493 continue;
2494 }
2495 if (expandflag && count == 0 &&
2496 getColElemIterOffset(coleiter, FALSE) != temp) {
2497 *end = temp;
2498 temp = ucol_getOffset(coleiter);
2499 }
2500 if (count == patterncelength &&
2501 ce != patternce[patterncelength - 1]) {
2502 // accents may have extra starting ces, this occurs when a
2503 // pure accent pattern is matched without rearrangement
2504 int32_t expected = patternce[patterncelength - 1];
2505 U16_BACK_1(text, 0, *end);
2506 if (getFCD(text, end, textlength) & LAST_BYTE_MASK_) {
2507 ce = getCE(strsrch, ucol_previous(coleiter, status));
2508 while (U_SUCCESS(*status) && ce != expected &&
2509 ce != UCOL_NULLORDER &&
2510 ucol_getOffset(coleiter) <= *start) {
2511 ce = getCE(strsrch, ucol_previous(coleiter, status));
2512 }
2513 }
2514 }
2515 if (U_FAILURE(*status) || ce != patternce[count - 1]) {
2516 (*start) --;
2517 *start = getPreviousBaseOffset(text, *start);
2518 return FALSE;
2519 }
2520 count --;
2521 }
2522 }
2523 return TRUE;
2524 }
2525
2526 /**
2527 * Checks and sets the match information if found.
2528 * Checks
2529 * <ul>
2530 * <li> the potential match does not repeat the previous match
2531 * <li> boundaries are correct
2532 * <li> potential match does not end in the middle of a contraction
2533 * <li> identical matches
2534 * <\ul>
2535 * Otherwise the offset will be shifted to the next character.
2536 * Internal method, status assumed to be success, caller has to check status
2537 * before calling this method.
2538 * @param strsrch string search data
2539 * @param textoffset offset in the collation element text. the returned value
2540 * will be the truncated start offset of the match or the new start
2541 * search offset.
2542 * @param status only error status if any
2543 * @return TRUE if the match is valid, FALSE otherwise
2544 */
2545 static
checkPreviousCanonicalMatch(UStringSearch * strsrch,int32_t * textoffset,UErrorCode * status)2546 inline UBool checkPreviousCanonicalMatch(UStringSearch *strsrch,
2547 int32_t *textoffset,
2548 UErrorCode *status)
2549 {
2550 // to ensure that the start and ends are not composite characters
2551 UCollationElements *coleiter = strsrch->textIter;
2552 // if we have a canonical accent match
2553 if ((strsrch->pattern.hasSuffixAccents &&
2554 strsrch->canonicalSuffixAccents[0]) ||
2555 (strsrch->pattern.hasPrefixAccents &&
2556 strsrch->canonicalPrefixAccents[0])) {
2557 strsrch->search->matchedIndex = *textoffset;
2558 strsrch->search->matchedLength =
2559 getNextUStringSearchBaseOffset(strsrch,
2560 getColElemIterOffset(coleiter, FALSE))
2561 - *textoffset;
2562 return TRUE;
2563 }
2564
2565 int32_t end = ucol_getOffset(coleiter);
2566 if (!checkPreviousCanonicalContractionMatch(strsrch, textoffset, &end,
2567 status) ||
2568 U_FAILURE(*status)) {
2569 return FALSE;
2570 }
2571
2572 end = getNextUStringSearchBaseOffset(strsrch, end);
2573 // this totally matches, however we need to check if it is repeating
2574 if (checkRepeatedMatch(strsrch, *textoffset, end) ||
2575 !isBreakUnit(strsrch, *textoffset, end) ||
2576 !checkIdentical(strsrch, *textoffset, end)) {
2577 (*textoffset) --;
2578 *textoffset = getPreviousBaseOffset(strsrch->search->text,
2579 *textoffset);
2580 return FALSE;
2581 }
2582
2583 strsrch->search->matchedIndex = *textoffset;
2584 strsrch->search->matchedLength = end - *textoffset;
2585 return TRUE;
2586 }
2587 #endif // #if BOYER_MOORE
2588
2589 // constructors and destructor -------------------------------------------
2590
usearch_open(const UChar * pattern,int32_t patternlength,const UChar * text,int32_t textlength,const char * locale,UBreakIterator * breakiter,UErrorCode * status)2591 U_CAPI UStringSearch * U_EXPORT2 usearch_open(const UChar *pattern,
2592 int32_t patternlength,
2593 const UChar *text,
2594 int32_t textlength,
2595 const char *locale,
2596 UBreakIterator *breakiter,
2597 UErrorCode *status)
2598 {
2599 if (U_FAILURE(*status)) {
2600 return NULL;
2601 }
2602 #if UCONFIG_NO_BREAK_ITERATION
2603 if (breakiter != NULL) {
2604 *status = U_UNSUPPORTED_ERROR;
2605 return NULL;
2606 }
2607 #endif
2608 if (locale) {
2609 // ucol_open internally checks for status
2610 UCollator *collator = ucol_open(locale, status);
2611 // pattern, text checks are done in usearch_openFromCollator
2612 UStringSearch *result = usearch_openFromCollator(pattern,
2613 patternlength, text, textlength,
2614 collator, breakiter, status);
2615
2616 if (result == NULL || U_FAILURE(*status)) {
2617 if (collator) {
2618 ucol_close(collator);
2619 }
2620 return NULL;
2621 }
2622 else {
2623 result->ownCollator = TRUE;
2624 }
2625 return result;
2626 }
2627 *status = U_ILLEGAL_ARGUMENT_ERROR;
2628 return NULL;
2629 }
2630
usearch_openFromCollator(const UChar * pattern,int32_t patternlength,const UChar * text,int32_t textlength,const UCollator * collator,UBreakIterator * breakiter,UErrorCode * status)2631 U_CAPI UStringSearch * U_EXPORT2 usearch_openFromCollator(
2632 const UChar *pattern,
2633 int32_t patternlength,
2634 const UChar *text,
2635 int32_t textlength,
2636 const UCollator *collator,
2637 UBreakIterator *breakiter,
2638 UErrorCode *status)
2639 {
2640 if (U_FAILURE(*status)) {
2641 return NULL;
2642 }
2643 #if UCONFIG_NO_BREAK_ITERATION
2644 if (breakiter != NULL) {
2645 *status = U_UNSUPPORTED_ERROR;
2646 return NULL;
2647 }
2648 #endif
2649 if (pattern == NULL || text == NULL || collator == NULL) {
2650 *status = U_ILLEGAL_ARGUMENT_ERROR;
2651 return NULL;
2652 }
2653
2654 // string search does not really work when numeric collation is turned on
2655 if(ucol_getAttribute(collator, UCOL_NUMERIC_COLLATION, status) == UCOL_ON) {
2656 *status = U_UNSUPPORTED_ERROR;
2657 return NULL;
2658 }
2659
2660 if (U_SUCCESS(*status)) {
2661 initializeFCD(status);
2662 if (U_FAILURE(*status)) {
2663 return NULL;
2664 }
2665
2666 UStringSearch *result;
2667 if (textlength == -1) {
2668 textlength = u_strlen(text);
2669 }
2670 if (patternlength == -1) {
2671 patternlength = u_strlen(pattern);
2672 }
2673 if (textlength <= 0 || patternlength <= 0) {
2674 *status = U_ILLEGAL_ARGUMENT_ERROR;
2675 return NULL;
2676 }
2677
2678 result = (UStringSearch *)uprv_malloc(sizeof(UStringSearch));
2679 if (result == NULL) {
2680 *status = U_MEMORY_ALLOCATION_ERROR;
2681 return NULL;
2682 }
2683
2684 result->collator = collator;
2685 result->strength = ucol_getStrength(collator);
2686 result->ceMask = getMask(result->strength);
2687 result->toShift =
2688 ucol_getAttribute(collator, UCOL_ALTERNATE_HANDLING, status) ==
2689 UCOL_SHIFTED;
2690 result->variableTop = ucol_getVariableTop(collator, status);
2691
2692 result->nfd = Normalizer2::getNFDInstance(*status);
2693
2694 if (U_FAILURE(*status)) {
2695 uprv_free(result);
2696 return NULL;
2697 }
2698
2699 result->search = (USearch *)uprv_malloc(sizeof(USearch));
2700 if (result->search == NULL) {
2701 *status = U_MEMORY_ALLOCATION_ERROR;
2702 uprv_free(result);
2703 return NULL;
2704 }
2705
2706 result->search->text = text;
2707 result->search->textLength = textlength;
2708
2709 result->pattern.text = pattern;
2710 result->pattern.textLength = patternlength;
2711 result->pattern.ces = NULL;
2712 result->pattern.pces = NULL;
2713
2714 result->search->breakIter = breakiter;
2715 #if !UCONFIG_NO_BREAK_ITERATION
2716 result->search->internalBreakIter = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(result->collator, ULOC_VALID_LOCALE, status), text, textlength, status);
2717 if (breakiter) {
2718 ubrk_setText(breakiter, text, textlength, status);
2719 }
2720 #endif
2721
2722 result->ownCollator = FALSE;
2723 result->search->matchedLength = 0;
2724 result->search->matchedIndex = USEARCH_DONE;
2725 result->utilIter = NULL;
2726 result->textIter = ucol_openElements(collator, text,
2727 textlength, status);
2728 result->textProcessedIter = NULL;
2729 if (U_FAILURE(*status)) {
2730 usearch_close(result);
2731 return NULL;
2732 }
2733
2734 result->search->isOverlap = FALSE;
2735 result->search->isCanonicalMatch = FALSE;
2736 result->search->elementComparisonType = 0;
2737 result->search->isForwardSearching = TRUE;
2738 result->search->reset = TRUE;
2739
2740 initialize(result, status);
2741
2742 if (U_FAILURE(*status)) {
2743 usearch_close(result);
2744 return NULL;
2745 }
2746
2747 return result;
2748 }
2749 return NULL;
2750 }
2751
usearch_close(UStringSearch * strsrch)2752 U_CAPI void U_EXPORT2 usearch_close(UStringSearch *strsrch)
2753 {
2754 if (strsrch) {
2755 if (strsrch->pattern.ces != strsrch->pattern.cesBuffer &&
2756 strsrch->pattern.ces) {
2757 uprv_free(strsrch->pattern.ces);
2758 }
2759
2760 if (strsrch->pattern.pces != NULL &&
2761 strsrch->pattern.pces != strsrch->pattern.pcesBuffer) {
2762 uprv_free(strsrch->pattern.pces);
2763 }
2764
2765 delete strsrch->textProcessedIter;
2766 ucol_closeElements(strsrch->textIter);
2767 ucol_closeElements(strsrch->utilIter);
2768
2769 if (strsrch->ownCollator && strsrch->collator) {
2770 ucol_close((UCollator *)strsrch->collator);
2771 }
2772
2773 #if !UCONFIG_NO_BREAK_ITERATION
2774 if (strsrch->search->internalBreakIter) {
2775 ubrk_close(strsrch->search->internalBreakIter);
2776 }
2777 #endif
2778
2779 uprv_free(strsrch->search);
2780 uprv_free(strsrch);
2781 }
2782 }
2783
2784 namespace {
2785
initTextProcessedIter(UStringSearch * strsrch,UErrorCode * status)2786 UBool initTextProcessedIter(UStringSearch *strsrch, UErrorCode *status) {
2787 if (U_FAILURE(*status)) { return FALSE; }
2788 if (strsrch->textProcessedIter == NULL) {
2789 strsrch->textProcessedIter = new icu::UCollationPCE(strsrch->textIter);
2790 if (strsrch->textProcessedIter == NULL) {
2791 *status = U_MEMORY_ALLOCATION_ERROR;
2792 return FALSE;
2793 }
2794 } else {
2795 strsrch->textProcessedIter->init(strsrch->textIter);
2796 }
2797 return TRUE;
2798 }
2799
2800 }
2801
2802 // set and get methods --------------------------------------------------
2803
usearch_setOffset(UStringSearch * strsrch,int32_t position,UErrorCode * status)2804 U_CAPI void U_EXPORT2 usearch_setOffset(UStringSearch *strsrch,
2805 int32_t position,
2806 UErrorCode *status)
2807 {
2808 if (U_SUCCESS(*status) && strsrch) {
2809 if (isOutOfBounds(strsrch->search->textLength, position)) {
2810 *status = U_INDEX_OUTOFBOUNDS_ERROR;
2811 }
2812 else {
2813 setColEIterOffset(strsrch->textIter, position);
2814 }
2815 strsrch->search->matchedIndex = USEARCH_DONE;
2816 strsrch->search->matchedLength = 0;
2817 strsrch->search->reset = FALSE;
2818 }
2819 }
2820
usearch_getOffset(const UStringSearch * strsrch)2821 U_CAPI int32_t U_EXPORT2 usearch_getOffset(const UStringSearch *strsrch)
2822 {
2823 if (strsrch) {
2824 int32_t result = ucol_getOffset(strsrch->textIter);
2825 if (isOutOfBounds(strsrch->search->textLength, result)) {
2826 return USEARCH_DONE;
2827 }
2828 return result;
2829 }
2830 return USEARCH_DONE;
2831 }
2832
usearch_setAttribute(UStringSearch * strsrch,USearchAttribute attribute,USearchAttributeValue value,UErrorCode * status)2833 U_CAPI void U_EXPORT2 usearch_setAttribute(UStringSearch *strsrch,
2834 USearchAttribute attribute,
2835 USearchAttributeValue value,
2836 UErrorCode *status)
2837 {
2838 if (U_SUCCESS(*status) && strsrch) {
2839 switch (attribute)
2840 {
2841 case USEARCH_OVERLAP :
2842 strsrch->search->isOverlap = (value == USEARCH_ON ? TRUE : FALSE);
2843 break;
2844 case USEARCH_CANONICAL_MATCH :
2845 strsrch->search->isCanonicalMatch = (value == USEARCH_ON ? TRUE :
2846 FALSE);
2847 break;
2848 case USEARCH_ELEMENT_COMPARISON :
2849 if (value == USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD || value == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD) {
2850 strsrch->search->elementComparisonType = (int16_t)value;
2851 } else {
2852 strsrch->search->elementComparisonType = 0;
2853 }
2854 break;
2855 case USEARCH_ATTRIBUTE_COUNT :
2856 default:
2857 *status = U_ILLEGAL_ARGUMENT_ERROR;
2858 }
2859 }
2860 if (value == USEARCH_ATTRIBUTE_VALUE_COUNT) {
2861 *status = U_ILLEGAL_ARGUMENT_ERROR;
2862 }
2863 }
2864
usearch_getAttribute(const UStringSearch * strsrch,USearchAttribute attribute)2865 U_CAPI USearchAttributeValue U_EXPORT2 usearch_getAttribute(
2866 const UStringSearch *strsrch,
2867 USearchAttribute attribute)
2868 {
2869 if (strsrch) {
2870 switch (attribute) {
2871 case USEARCH_OVERLAP :
2872 return (strsrch->search->isOverlap == TRUE ? USEARCH_ON :
2873 USEARCH_OFF);
2874 case USEARCH_CANONICAL_MATCH :
2875 return (strsrch->search->isCanonicalMatch == TRUE ? USEARCH_ON :
2876 USEARCH_OFF);
2877 case USEARCH_ELEMENT_COMPARISON :
2878 {
2879 int16_t value = strsrch->search->elementComparisonType;
2880 if (value == USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD || value == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD) {
2881 return (USearchAttributeValue)value;
2882 } else {
2883 return USEARCH_STANDARD_ELEMENT_COMPARISON;
2884 }
2885 }
2886 case USEARCH_ATTRIBUTE_COUNT :
2887 return USEARCH_DEFAULT;
2888 }
2889 }
2890 return USEARCH_DEFAULT;
2891 }
2892
usearch_getMatchedStart(const UStringSearch * strsrch)2893 U_CAPI int32_t U_EXPORT2 usearch_getMatchedStart(
2894 const UStringSearch *strsrch)
2895 {
2896 if (strsrch == NULL) {
2897 return USEARCH_DONE;
2898 }
2899 return strsrch->search->matchedIndex;
2900 }
2901
2902
usearch_getMatchedText(const UStringSearch * strsrch,UChar * result,int32_t resultCapacity,UErrorCode * status)2903 U_CAPI int32_t U_EXPORT2 usearch_getMatchedText(const UStringSearch *strsrch,
2904 UChar *result,
2905 int32_t resultCapacity,
2906 UErrorCode *status)
2907 {
2908 if (U_FAILURE(*status)) {
2909 return USEARCH_DONE;
2910 }
2911 if (strsrch == NULL || resultCapacity < 0 || (resultCapacity > 0 &&
2912 result == NULL)) {
2913 *status = U_ILLEGAL_ARGUMENT_ERROR;
2914 return USEARCH_DONE;
2915 }
2916
2917 int32_t copylength = strsrch->search->matchedLength;
2918 int32_t copyindex = strsrch->search->matchedIndex;
2919 if (copyindex == USEARCH_DONE) {
2920 u_terminateUChars(result, resultCapacity, 0, status);
2921 return USEARCH_DONE;
2922 }
2923
2924 if (resultCapacity < copylength) {
2925 copylength = resultCapacity;
2926 }
2927 if (copylength > 0) {
2928 uprv_memcpy(result, strsrch->search->text + copyindex,
2929 copylength * sizeof(UChar));
2930 }
2931 return u_terminateUChars(result, resultCapacity,
2932 strsrch->search->matchedLength, status);
2933 }
2934
usearch_getMatchedLength(const UStringSearch * strsrch)2935 U_CAPI int32_t U_EXPORT2 usearch_getMatchedLength(
2936 const UStringSearch *strsrch)
2937 {
2938 if (strsrch) {
2939 return strsrch->search->matchedLength;
2940 }
2941 return USEARCH_DONE;
2942 }
2943
2944 #if !UCONFIG_NO_BREAK_ITERATION
2945
usearch_setBreakIterator(UStringSearch * strsrch,UBreakIterator * breakiter,UErrorCode * status)2946 U_CAPI void U_EXPORT2 usearch_setBreakIterator(UStringSearch *strsrch,
2947 UBreakIterator *breakiter,
2948 UErrorCode *status)
2949 {
2950 if (U_SUCCESS(*status) && strsrch) {
2951 strsrch->search->breakIter = breakiter;
2952 if (breakiter) {
2953 ubrk_setText(breakiter, strsrch->search->text,
2954 strsrch->search->textLength, status);
2955 }
2956 }
2957 }
2958
2959 U_CAPI const UBreakIterator* U_EXPORT2
usearch_getBreakIterator(const UStringSearch * strsrch)2960 usearch_getBreakIterator(const UStringSearch *strsrch)
2961 {
2962 if (strsrch) {
2963 return strsrch->search->breakIter;
2964 }
2965 return NULL;
2966 }
2967
2968 #endif
2969
usearch_setText(UStringSearch * strsrch,const UChar * text,int32_t textlength,UErrorCode * status)2970 U_CAPI void U_EXPORT2 usearch_setText( UStringSearch *strsrch,
2971 const UChar *text,
2972 int32_t textlength,
2973 UErrorCode *status)
2974 {
2975 if (U_SUCCESS(*status)) {
2976 if (strsrch == NULL || text == NULL || textlength < -1 ||
2977 textlength == 0) {
2978 *status = U_ILLEGAL_ARGUMENT_ERROR;
2979 }
2980 else {
2981 if (textlength == -1) {
2982 textlength = u_strlen(text);
2983 }
2984 strsrch->search->text = text;
2985 strsrch->search->textLength = textlength;
2986 ucol_setText(strsrch->textIter, text, textlength, status);
2987 strsrch->search->matchedIndex = USEARCH_DONE;
2988 strsrch->search->matchedLength = 0;
2989 strsrch->search->reset = TRUE;
2990 #if !UCONFIG_NO_BREAK_ITERATION
2991 if (strsrch->search->breakIter != NULL) {
2992 ubrk_setText(strsrch->search->breakIter, text,
2993 textlength, status);
2994 }
2995 ubrk_setText(strsrch->search->internalBreakIter, text, textlength, status);
2996 #endif
2997 }
2998 }
2999 }
3000
usearch_getText(const UStringSearch * strsrch,int32_t * length)3001 U_CAPI const UChar * U_EXPORT2 usearch_getText(const UStringSearch *strsrch,
3002 int32_t *length)
3003 {
3004 if (strsrch) {
3005 *length = strsrch->search->textLength;
3006 return strsrch->search->text;
3007 }
3008 return NULL;
3009 }
3010
usearch_setCollator(UStringSearch * strsrch,const UCollator * collator,UErrorCode * status)3011 U_CAPI void U_EXPORT2 usearch_setCollator( UStringSearch *strsrch,
3012 const UCollator *collator,
3013 UErrorCode *status)
3014 {
3015 if (U_SUCCESS(*status)) {
3016 if (collator == NULL) {
3017 *status = U_ILLEGAL_ARGUMENT_ERROR;
3018 return;
3019 }
3020
3021 if (strsrch) {
3022 delete strsrch->textProcessedIter;
3023 strsrch->textProcessedIter = NULL;
3024 ucol_closeElements(strsrch->textIter);
3025 ucol_closeElements(strsrch->utilIter);
3026 strsrch->textIter = strsrch->utilIter = NULL;
3027 if (strsrch->ownCollator && (strsrch->collator != collator)) {
3028 ucol_close((UCollator *)strsrch->collator);
3029 strsrch->ownCollator = FALSE;
3030 }
3031 strsrch->collator = collator;
3032 strsrch->strength = ucol_getStrength(collator);
3033 strsrch->ceMask = getMask(strsrch->strength);
3034 #if !UCONFIG_NO_BREAK_ITERATION
3035 ubrk_close(strsrch->search->internalBreakIter);
3036 strsrch->search->internalBreakIter = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(collator, ULOC_VALID_LOCALE, status),
3037 strsrch->search->text, strsrch->search->textLength, status);
3038 #endif
3039 // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
3040 strsrch->toShift =
3041 ucol_getAttribute(collator, UCOL_ALTERNATE_HANDLING, status) ==
3042 UCOL_SHIFTED;
3043 // if status is a failure, ucol_getVariableTop returns 0
3044 strsrch->variableTop = ucol_getVariableTop(collator, status);
3045 strsrch->textIter = ucol_openElements(collator,
3046 strsrch->search->text,
3047 strsrch->search->textLength,
3048 status);
3049 strsrch->utilIter = ucol_openElements(
3050 collator, strsrch->pattern.text, strsrch->pattern.textLength, status);
3051 // initialize() _after_ setting the iterators for the new collator.
3052 initialize(strsrch, status);
3053 }
3054
3055 // **** are these calls needed?
3056 // **** we call uprv_init_pce in initializePatternPCETable
3057 // **** and the CEIBuffer constructor...
3058 #if 0
3059 uprv_init_pce(strsrch->textIter);
3060 uprv_init_pce(strsrch->utilIter);
3061 #endif
3062 }
3063 }
3064
usearch_getCollator(const UStringSearch * strsrch)3065 U_CAPI UCollator * U_EXPORT2 usearch_getCollator(const UStringSearch *strsrch)
3066 {
3067 if (strsrch) {
3068 return (UCollator *)strsrch->collator;
3069 }
3070 return NULL;
3071 }
3072
usearch_setPattern(UStringSearch * strsrch,const UChar * pattern,int32_t patternlength,UErrorCode * status)3073 U_CAPI void U_EXPORT2 usearch_setPattern( UStringSearch *strsrch,
3074 const UChar *pattern,
3075 int32_t patternlength,
3076 UErrorCode *status)
3077 {
3078 if (U_SUCCESS(*status)) {
3079 if (strsrch == NULL || pattern == NULL) {
3080 *status = U_ILLEGAL_ARGUMENT_ERROR;
3081 }
3082 else {
3083 if (patternlength == -1) {
3084 patternlength = u_strlen(pattern);
3085 }
3086 if (patternlength == 0) {
3087 *status = U_ILLEGAL_ARGUMENT_ERROR;
3088 return;
3089 }
3090 strsrch->pattern.text = pattern;
3091 strsrch->pattern.textLength = patternlength;
3092 initialize(strsrch, status);
3093 }
3094 }
3095 }
3096
3097 U_CAPI const UChar* U_EXPORT2
usearch_getPattern(const UStringSearch * strsrch,int32_t * length)3098 usearch_getPattern(const UStringSearch *strsrch,
3099 int32_t *length)
3100 {
3101 if (strsrch) {
3102 *length = strsrch->pattern.textLength;
3103 return strsrch->pattern.text;
3104 }
3105 return NULL;
3106 }
3107
3108 // miscellanous methods --------------------------------------------------
3109
usearch_first(UStringSearch * strsrch,UErrorCode * status)3110 U_CAPI int32_t U_EXPORT2 usearch_first(UStringSearch *strsrch,
3111 UErrorCode *status)
3112 {
3113 if (strsrch && U_SUCCESS(*status)) {
3114 strsrch->search->isForwardSearching = TRUE;
3115 usearch_setOffset(strsrch, 0, status);
3116 if (U_SUCCESS(*status)) {
3117 return usearch_next(strsrch, status);
3118 }
3119 }
3120 return USEARCH_DONE;
3121 }
3122
usearch_following(UStringSearch * strsrch,int32_t position,UErrorCode * status)3123 U_CAPI int32_t U_EXPORT2 usearch_following(UStringSearch *strsrch,
3124 int32_t position,
3125 UErrorCode *status)
3126 {
3127 if (strsrch && U_SUCCESS(*status)) {
3128 strsrch->search->isForwardSearching = TRUE;
3129 // position checked in usearch_setOffset
3130 usearch_setOffset(strsrch, position, status);
3131 if (U_SUCCESS(*status)) {
3132 return usearch_next(strsrch, status);
3133 }
3134 }
3135 return USEARCH_DONE;
3136 }
3137
usearch_last(UStringSearch * strsrch,UErrorCode * status)3138 U_CAPI int32_t U_EXPORT2 usearch_last(UStringSearch *strsrch,
3139 UErrorCode *status)
3140 {
3141 if (strsrch && U_SUCCESS(*status)) {
3142 strsrch->search->isForwardSearching = FALSE;
3143 usearch_setOffset(strsrch, strsrch->search->textLength, status);
3144 if (U_SUCCESS(*status)) {
3145 return usearch_previous(strsrch, status);
3146 }
3147 }
3148 return USEARCH_DONE;
3149 }
3150
usearch_preceding(UStringSearch * strsrch,int32_t position,UErrorCode * status)3151 U_CAPI int32_t U_EXPORT2 usearch_preceding(UStringSearch *strsrch,
3152 int32_t position,
3153 UErrorCode *status)
3154 {
3155 if (strsrch && U_SUCCESS(*status)) {
3156 strsrch->search->isForwardSearching = FALSE;
3157 // position checked in usearch_setOffset
3158 usearch_setOffset(strsrch, position, status);
3159 if (U_SUCCESS(*status)) {
3160 return usearch_previous(strsrch, status);
3161 }
3162 }
3163 return USEARCH_DONE;
3164 }
3165
3166 /**
3167 * If a direction switch is required, we'll count the number of ces till the
3168 * beginning of the collation element iterator and iterate forwards that
3169 * number of times. This is so that we get to the correct point within the
3170 * string to continue the search in. Imagine when we are in the middle of the
3171 * normalization buffer when the change in direction is request. arrrgghh....
3172 * After searching the offset within the collation element iterator will be
3173 * shifted to the start of the match. If a match is not found, the offset would
3174 * have been set to the end of the text string in the collation element
3175 * iterator.
3176 * Okay, here's my take on normalization buffer. The only time when there can
3177 * be 2 matches within the same normalization is when the pattern is consists
3178 * of all accents. But since the offset returned is from the text string, we
3179 * should not confuse the caller by returning the second match within the
3180 * same normalization buffer. If we do, the 2 results will have the same match
3181 * offsets, and that'll be confusing. I'll return the next match that doesn't
3182 * fall within the same normalization buffer. Note this does not affect the
3183 * results of matches spanning the text and the normalization buffer.
3184 * The position to start searching is taken from the collation element
3185 * iterator. Callers of this API would have to set the offset in the collation
3186 * element iterator before using this method.
3187 */
usearch_next(UStringSearch * strsrch,UErrorCode * status)3188 U_CAPI int32_t U_EXPORT2 usearch_next(UStringSearch *strsrch,
3189 UErrorCode *status)
3190 {
3191 if (U_SUCCESS(*status) && strsrch) {
3192 // note offset is either equivalent to the start of the previous match
3193 // or is set by the user
3194 int32_t offset = usearch_getOffset(strsrch);
3195 USearch *search = strsrch->search;
3196 search->reset = FALSE;
3197 int32_t textlength = search->textLength;
3198 if (search->isForwardSearching) {
3199 #if BOYER_MOORE
3200 if (offset == textlength
3201 || (!search->isOverlap &&
3202 (offset + strsrch->pattern.defaultShiftSize > textlength ||
3203 (search->matchedIndex != USEARCH_DONE &&
3204 offset + search->matchedLength >= textlength)))) {
3205 // not enough characters to match
3206 setMatchNotFound(strsrch);
3207 return USEARCH_DONE;
3208 }
3209 #else
3210 if (offset == textlength ||
3211 (! search->isOverlap &&
3212 (search->matchedIndex != USEARCH_DONE &&
3213 offset + search->matchedLength > textlength))) {
3214 // not enough characters to match
3215 setMatchNotFound(strsrch);
3216 return USEARCH_DONE;
3217 }
3218 #endif
3219 }
3220 else {
3221 // switching direction.
3222 // if matchedIndex == USEARCH_DONE, it means that either a
3223 // setOffset has been called or that previous ran off the text
3224 // string. the iterator would have been set to offset 0 if a
3225 // match is not found.
3226 search->isForwardSearching = TRUE;
3227 if (search->matchedIndex != USEARCH_DONE) {
3228 // there's no need to set the collation element iterator
3229 // the next call to next will set the offset.
3230 return search->matchedIndex;
3231 }
3232 }
3233
3234 if (U_SUCCESS(*status)) {
3235 if (strsrch->pattern.cesLength == 0) {
3236 if (search->matchedIndex == USEARCH_DONE) {
3237 search->matchedIndex = offset;
3238 }
3239 else { // moves by codepoints
3240 U16_FWD_1(search->text, search->matchedIndex, textlength);
3241 }
3242
3243 search->matchedLength = 0;
3244 setColEIterOffset(strsrch->textIter, search->matchedIndex);
3245 // status checked below
3246 if (search->matchedIndex == textlength) {
3247 search->matchedIndex = USEARCH_DONE;
3248 }
3249 }
3250 else {
3251 if (search->matchedLength > 0) {
3252 // if matchlength is 0 we are at the start of the iteration
3253 if (search->isOverlap) {
3254 ucol_setOffset(strsrch->textIter, offset + 1, status);
3255 }
3256 else {
3257 ucol_setOffset(strsrch->textIter,
3258 offset + search->matchedLength, status);
3259 }
3260 }
3261 else {
3262 // for boundary check purposes. this will ensure that the
3263 // next match will not preceed the current offset
3264 // note search->matchedIndex will always be set to something
3265 // in the code
3266 search->matchedIndex = offset - 1;
3267 }
3268
3269 if (search->isCanonicalMatch) {
3270 // can't use exact here since extra accents are allowed.
3271 usearch_handleNextCanonical(strsrch, status);
3272 }
3273 else {
3274 usearch_handleNextExact(strsrch, status);
3275 }
3276 }
3277
3278 if (U_FAILURE(*status)) {
3279 return USEARCH_DONE;
3280 }
3281
3282 #if !BOYER_MOORE
3283 if (search->matchedIndex == USEARCH_DONE) {
3284 ucol_setOffset(strsrch->textIter, search->textLength, status);
3285 } else {
3286 ucol_setOffset(strsrch->textIter, search->matchedIndex, status);
3287 }
3288 #endif
3289
3290 return search->matchedIndex;
3291 }
3292 }
3293 return USEARCH_DONE;
3294 }
3295
usearch_previous(UStringSearch * strsrch,UErrorCode * status)3296 U_CAPI int32_t U_EXPORT2 usearch_previous(UStringSearch *strsrch,
3297 UErrorCode *status)
3298 {
3299 if (U_SUCCESS(*status) && strsrch) {
3300 int32_t offset;
3301 USearch *search = strsrch->search;
3302 if (search->reset) {
3303 offset = search->textLength;
3304 search->isForwardSearching = FALSE;
3305 search->reset = FALSE;
3306 setColEIterOffset(strsrch->textIter, offset);
3307 }
3308 else {
3309 offset = usearch_getOffset(strsrch);
3310 }
3311
3312 int32_t matchedindex = search->matchedIndex;
3313 if (search->isForwardSearching == TRUE) {
3314 // switching direction.
3315 // if matchedIndex == USEARCH_DONE, it means that either a
3316 // setOffset has been called or that next ran off the text
3317 // string. the iterator would have been set to offset textLength if
3318 // a match is not found.
3319 search->isForwardSearching = FALSE;
3320 if (matchedindex != USEARCH_DONE) {
3321 return matchedindex;
3322 }
3323 }
3324 else {
3325 #if BOYER_MOORE
3326 if (offset == 0 || matchedindex == 0 ||
3327 (!search->isOverlap &&
3328 (offset < strsrch->pattern.defaultShiftSize ||
3329 (matchedindex != USEARCH_DONE &&
3330 matchedindex < strsrch->pattern.defaultShiftSize)))) {
3331 // not enough characters to match
3332 setMatchNotFound(strsrch);
3333 return USEARCH_DONE;
3334 }
3335 #else
3336 // Could check pattern length, but the
3337 // linear search will do the right thing
3338 if (offset == 0 || matchedindex == 0) {
3339 setMatchNotFound(strsrch);
3340 return USEARCH_DONE;
3341 }
3342 #endif
3343 }
3344
3345 if (U_SUCCESS(*status)) {
3346 if (strsrch->pattern.cesLength == 0) {
3347 search->matchedIndex =
3348 (matchedindex == USEARCH_DONE ? offset : matchedindex);
3349 if (search->matchedIndex == 0) {
3350 setMatchNotFound(strsrch);
3351 // status checked below
3352 }
3353 else { // move by codepoints
3354 U16_BACK_1(search->text, 0, search->matchedIndex);
3355 setColEIterOffset(strsrch->textIter, search->matchedIndex);
3356 // status checked below
3357 search->matchedLength = 0;
3358 }
3359 }
3360 else {
3361 if (strsrch->search->isCanonicalMatch) {
3362 // can't use exact here since extra accents are allowed.
3363 usearch_handlePreviousCanonical(strsrch, status);
3364 // status checked below
3365 }
3366 else {
3367 usearch_handlePreviousExact(strsrch, status);
3368 // status checked below
3369 }
3370 }
3371
3372 if (U_FAILURE(*status)) {
3373 return USEARCH_DONE;
3374 }
3375
3376 return search->matchedIndex;
3377 }
3378 }
3379 return USEARCH_DONE;
3380 }
3381
3382
3383
usearch_reset(UStringSearch * strsrch)3384 U_CAPI void U_EXPORT2 usearch_reset(UStringSearch *strsrch)
3385 {
3386 /*
3387 reset is setting the attributes that are already in
3388 string search, hence all attributes in the collator should
3389 be retrieved without any problems
3390 */
3391 if (strsrch) {
3392 UErrorCode status = U_ZERO_ERROR;
3393 UBool sameCollAttribute = TRUE;
3394 uint32_t ceMask;
3395 UBool shift;
3396 uint32_t varTop;
3397
3398 // **** hack to deal w/ how processed CEs encode quaternary ****
3399 UCollationStrength newStrength = ucol_getStrength(strsrch->collator);
3400 if ((strsrch->strength < UCOL_QUATERNARY && newStrength >= UCOL_QUATERNARY) ||
3401 (strsrch->strength >= UCOL_QUATERNARY && newStrength < UCOL_QUATERNARY)) {
3402 sameCollAttribute = FALSE;
3403 }
3404
3405 strsrch->strength = ucol_getStrength(strsrch->collator);
3406 ceMask = getMask(strsrch->strength);
3407 if (strsrch->ceMask != ceMask) {
3408 strsrch->ceMask = ceMask;
3409 sameCollAttribute = FALSE;
3410 }
3411
3412 // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
3413 shift = ucol_getAttribute(strsrch->collator, UCOL_ALTERNATE_HANDLING,
3414 &status) == UCOL_SHIFTED;
3415 if (strsrch->toShift != shift) {
3416 strsrch->toShift = shift;
3417 sameCollAttribute = FALSE;
3418 }
3419
3420 // if status is a failure, ucol_getVariableTop returns 0
3421 varTop = ucol_getVariableTop(strsrch->collator, &status);
3422 if (strsrch->variableTop != varTop) {
3423 strsrch->variableTop = varTop;
3424 sameCollAttribute = FALSE;
3425 }
3426 if (!sameCollAttribute) {
3427 initialize(strsrch, &status);
3428 }
3429 ucol_setText(strsrch->textIter, strsrch->search->text,
3430 strsrch->search->textLength,
3431 &status);
3432 strsrch->search->matchedLength = 0;
3433 strsrch->search->matchedIndex = USEARCH_DONE;
3434 strsrch->search->isOverlap = FALSE;
3435 strsrch->search->isCanonicalMatch = FALSE;
3436 strsrch->search->elementComparisonType = 0;
3437 strsrch->search->isForwardSearching = TRUE;
3438 strsrch->search->reset = TRUE;
3439 }
3440 }
3441
3442 //
3443 // CEI Collation Element + source text index.
3444 // These structs are kept in the circular buffer.
3445 //
3446 struct CEI {
3447 int64_t ce;
3448 int32_t lowIndex;
3449 int32_t highIndex;
3450 };
3451
3452 U_NAMESPACE_BEGIN
3453
3454 namespace {
3455 //
3456 // CEIBuffer A circular buffer of CEs-with-index from the text being searched.
3457 //
3458 #define DEFAULT_CEBUFFER_SIZE 96
3459 #define CEBUFFER_EXTRA 32
3460 // Some typical max values to make buffer size more reasonable for asymmetric search.
3461 // #8694 is for a better long-term solution to allocation of this buffer.
3462 #define MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L 8
3463 #define MAX_TARGET_IGNORABLES_PER_PAT_OTHER 3
3464 #define MIGHT_BE_JAMO_L(c) ((c >= 0x1100 && c <= 0x115E) || (c >= 0x3131 && c <= 0x314E) || (c >= 0x3165 && c <= 0x3186))
3465 struct CEIBuffer {
3466 CEI defBuf[DEFAULT_CEBUFFER_SIZE];
3467 CEI *buf;
3468 int32_t bufSize;
3469 int32_t firstIx;
3470 int32_t limitIx;
3471 UCollationElements *ceIter;
3472 UStringSearch *strSearch;
3473
3474
3475
3476 CEIBuffer(UStringSearch *ss, UErrorCode *status);
3477 ~CEIBuffer();
3478 const CEI *get(int32_t index);
3479 const CEI *getPrevious(int32_t index);
3480 };
3481
3482
CEIBuffer(UStringSearch * ss,UErrorCode * status)3483 CEIBuffer::CEIBuffer(UStringSearch *ss, UErrorCode *status) {
3484 buf = defBuf;
3485 strSearch = ss;
3486 bufSize = ss->pattern.pcesLength + CEBUFFER_EXTRA;
3487 if (ss->search->elementComparisonType != 0) {
3488 const UChar * patText = ss->pattern.text;
3489 if (patText) {
3490 const UChar * patTextLimit = patText + ss->pattern.textLength;
3491 while ( patText < patTextLimit ) {
3492 UChar c = *patText++;
3493 if (MIGHT_BE_JAMO_L(c)) {
3494 bufSize += MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L;
3495 } else {
3496 // No check for surrogates, we might allocate slightly more buffer than necessary.
3497 bufSize += MAX_TARGET_IGNORABLES_PER_PAT_OTHER;
3498 }
3499 }
3500 }
3501 }
3502 ceIter = ss->textIter;
3503 firstIx = 0;
3504 limitIx = 0;
3505
3506 if (!initTextProcessedIter(ss, status)) { return; }
3507
3508 if (bufSize>DEFAULT_CEBUFFER_SIZE) {
3509 buf = (CEI *)uprv_malloc(bufSize * sizeof(CEI));
3510 if (buf == NULL) {
3511 *status = U_MEMORY_ALLOCATION_ERROR;
3512 }
3513 }
3514 }
3515
3516 // TODO: add a reset or init function so that allocated
3517 // buffers can be retained & reused.
3518
~CEIBuffer()3519 CEIBuffer::~CEIBuffer() {
3520 if (buf != defBuf) {
3521 uprv_free(buf);
3522 }
3523 }
3524
3525
3526 // Get the CE with the specified index.
3527 // Index must be in the range
3528 // n-history_size < index < n+1
3529 // where n is the largest index to have been fetched by some previous call to this function.
3530 // The CE value will be UCOL__PROCESSED_NULLORDER at end of input.
3531 //
get(int32_t index)3532 const CEI *CEIBuffer::get(int32_t index) {
3533 int i = index % bufSize;
3534
3535 if (index>=firstIx && index<limitIx) {
3536 // The request was for an entry already in our buffer.
3537 // Just return it.
3538 return &buf[i];
3539 }
3540
3541 // Caller is requesting a new, never accessed before, CE.
3542 // Verify that it is the next one in sequence, which is all
3543 // that is allowed.
3544 if (index != limitIx) {
3545 U_ASSERT(FALSE);
3546
3547 return NULL;
3548 }
3549
3550 // Manage the circular CE buffer indexing
3551 limitIx++;
3552
3553 if (limitIx - firstIx >= bufSize) {
3554 // The buffer is full, knock out the lowest-indexed entry.
3555 firstIx++;
3556 }
3557
3558 UErrorCode status = U_ZERO_ERROR;
3559
3560 buf[i].ce = strSearch->textProcessedIter->nextProcessed(&buf[i].lowIndex, &buf[i].highIndex, &status);
3561
3562 return &buf[i];
3563 }
3564
3565 // Get the CE with the specified index.
3566 // Index must be in the range
3567 // n-history_size < index < n+1
3568 // where n is the largest index to have been fetched by some previous call to this function.
3569 // The CE value will be UCOL__PROCESSED_NULLORDER at end of input.
3570 //
getPrevious(int32_t index)3571 const CEI *CEIBuffer::getPrevious(int32_t index) {
3572 int i = index % bufSize;
3573
3574 if (index>=firstIx && index<limitIx) {
3575 // The request was for an entry already in our buffer.
3576 // Just return it.
3577 return &buf[i];
3578 }
3579
3580 // Caller is requesting a new, never accessed before, CE.
3581 // Verify that it is the next one in sequence, which is all
3582 // that is allowed.
3583 if (index != limitIx) {
3584 U_ASSERT(FALSE);
3585
3586 return NULL;
3587 }
3588
3589 // Manage the circular CE buffer indexing
3590 limitIx++;
3591
3592 if (limitIx - firstIx >= bufSize) {
3593 // The buffer is full, knock out the lowest-indexed entry.
3594 firstIx++;
3595 }
3596
3597 UErrorCode status = U_ZERO_ERROR;
3598
3599 buf[i].ce = strSearch->textProcessedIter->previousProcessed(&buf[i].lowIndex, &buf[i].highIndex, &status);
3600
3601 return &buf[i];
3602 }
3603
3604 }
3605
3606 U_NAMESPACE_END
3607
3608
3609 // #define USEARCH_DEBUG
3610
3611 #ifdef USEARCH_DEBUG
3612 #include <stdio.h>
3613 #include <stdlib.h>
3614 #endif
3615
3616 /*
3617 * Find the next break boundary after startIndex. If the UStringSearch object
3618 * has an external break iterator, use that. Otherwise use the internal character
3619 * break iterator.
3620 */
nextBoundaryAfter(UStringSearch * strsrch,int32_t startIndex)3621 static int32_t nextBoundaryAfter(UStringSearch *strsrch, int32_t startIndex) {
3622 #if 0
3623 const UChar *text = strsrch->search->text;
3624 int32_t textLen = strsrch->search->textLength;
3625
3626 U_ASSERT(startIndex>=0);
3627 U_ASSERT(startIndex<=textLen);
3628
3629 if (startIndex >= textLen) {
3630 return startIndex;
3631 }
3632
3633 UChar32 c;
3634 int32_t i = startIndex;
3635 U16_NEXT(text, i, textLen, c);
3636
3637 // If we are on a control character, stop without looking for combining marks.
3638 // Control characters do not combine.
3639 int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
3640 if (gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR) {
3641 return i;
3642 }
3643
3644 // The initial character was not a control, and can thus accept trailing
3645 // combining characters. Advance over however many of them there are.
3646 int32_t indexOfLastCharChecked;
3647 for (;;) {
3648 indexOfLastCharChecked = i;
3649 if (i>=textLen) {
3650 break;
3651 }
3652 U16_NEXT(text, i, textLen, c);
3653 gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
3654 if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) {
3655 break;
3656 }
3657 }
3658 return indexOfLastCharChecked;
3659 #elif !UCONFIG_NO_BREAK_ITERATION
3660 UBreakIterator *breakiterator = strsrch->search->breakIter;
3661
3662 if (breakiterator == NULL) {
3663 breakiterator = strsrch->search->internalBreakIter;
3664 }
3665
3666 if (breakiterator != NULL) {
3667 return ubrk_following(breakiterator, startIndex);
3668 }
3669
3670 return startIndex;
3671 #else
3672 // **** or should we use the original code? ****
3673 return startIndex;
3674 #endif
3675
3676 }
3677
3678 /*
3679 * Returns TRUE if index is on a break boundary. If the UStringSearch
3680 * has an external break iterator, test using that, otherwise test
3681 * using the internal character break iterator.
3682 */
isBreakBoundary(UStringSearch * strsrch,int32_t index)3683 static UBool isBreakBoundary(UStringSearch *strsrch, int32_t index) {
3684 #if 0
3685 const UChar *text = strsrch->search->text;
3686 int32_t textLen = strsrch->search->textLength;
3687
3688 U_ASSERT(index>=0);
3689 U_ASSERT(index<=textLen);
3690
3691 if (index>=textLen || index<=0) {
3692 return TRUE;
3693 }
3694
3695 // If the character at the current index is not a GRAPHEME_EXTEND
3696 // then we can not be within a combining sequence.
3697 UChar32 c;
3698 U16_GET(text, 0, index, textLen, c);
3699 int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
3700 if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) {
3701 return TRUE;
3702 }
3703
3704 // We are at a combining mark. If the preceding character is anything
3705 // except a CONTROL, CR or LF, we are in a combining sequence.
3706 U16_PREV(text, 0, index, c);
3707 gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
3708 UBool combining = !(gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR);
3709 return !combining;
3710 #elif !UCONFIG_NO_BREAK_ITERATION
3711 UBreakIterator *breakiterator = strsrch->search->breakIter;
3712
3713 if (breakiterator == NULL) {
3714 breakiterator = strsrch->search->internalBreakIter;
3715 }
3716
3717 return (breakiterator != NULL && ubrk_isBoundary(breakiterator, index));
3718 #else
3719 // **** or use the original code? ****
3720 return TRUE;
3721 #endif
3722 }
3723
3724 #if 0
3725 static UBool onBreakBoundaries(const UStringSearch *strsrch, int32_t start, int32_t end)
3726 {
3727 #if !UCONFIG_NO_BREAK_ITERATION
3728 UBreakIterator *breakiterator = strsrch->search->breakIter;
3729
3730 if (breakiterator != NULL) {
3731 int32_t startindex = ubrk_first(breakiterator);
3732 int32_t endindex = ubrk_last(breakiterator);
3733
3734 // out-of-range indexes are never boundary positions
3735 if (start < startindex || start > endindex ||
3736 end < startindex || end > endindex) {
3737 return FALSE;
3738 }
3739
3740 return ubrk_isBoundary(breakiterator, start) &&
3741 ubrk_isBoundary(breakiterator, end);
3742 }
3743 #endif
3744
3745 return TRUE;
3746 }
3747 #endif
3748
3749 typedef enum {
3750 U_CE_MATCH = -1,
3751 U_CE_NO_MATCH = 0,
3752 U_CE_SKIP_TARG,
3753 U_CE_SKIP_PATN
3754 } UCompareCEsResult;
3755 #define U_CE_LEVEL2_BASE 0x00000005
3756 #define U_CE_LEVEL3_BASE 0x00050000
3757
compareCE64s(int64_t targCE,int64_t patCE,int16_t compareType)3758 static UCompareCEsResult compareCE64s(int64_t targCE, int64_t patCE, int16_t compareType) {
3759 if (targCE == patCE) {
3760 return U_CE_MATCH;
3761 }
3762 if (compareType == 0) {
3763 return U_CE_NO_MATCH;
3764 }
3765
3766 int64_t targCEshifted = targCE >> 32;
3767 int64_t patCEshifted = patCE >> 32;
3768 int64_t mask;
3769
3770 mask = 0xFFFF0000;
3771 int32_t targLev1 = (int32_t)(targCEshifted & mask);
3772 int32_t patLev1 = (int32_t)(patCEshifted & mask);
3773 if ( targLev1 != patLev1 ) {
3774 if ( targLev1 == 0 ) {
3775 return U_CE_SKIP_TARG;
3776 }
3777 if ( patLev1 == 0 && compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD ) {
3778 return U_CE_SKIP_PATN;
3779 }
3780 return U_CE_NO_MATCH;
3781 }
3782
3783 mask = 0x0000FFFF;
3784 int32_t targLev2 = (int32_t)(targCEshifted & mask);
3785 int32_t patLev2 = (int32_t)(patCEshifted & mask);
3786 if ( targLev2 != patLev2 ) {
3787 if ( targLev2 == 0 ) {
3788 return U_CE_SKIP_TARG;
3789 }
3790 if ( patLev2 == 0 && compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD ) {
3791 return U_CE_SKIP_PATN;
3792 }
3793 return (patLev2 == U_CE_LEVEL2_BASE || (compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD && targLev2 == U_CE_LEVEL2_BASE) )?
3794 U_CE_MATCH: U_CE_NO_MATCH;
3795 }
3796
3797 mask = 0xFFFF0000;
3798 int32_t targLev3 = (int32_t)(targCE & mask);
3799 int32_t patLev3 = (int32_t)(patCE & mask);
3800 if ( targLev3 != patLev3 ) {
3801 return (patLev3 == U_CE_LEVEL3_BASE || (compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD && targLev3 == U_CE_LEVEL3_BASE) )?
3802 U_CE_MATCH: U_CE_NO_MATCH;
3803 }
3804
3805 return U_CE_MATCH;
3806 }
3807
3808 #if BOYER_MOORE
3809 // TODO: #if BOYER_MOORE, need 32-bit version of compareCE64s
3810 #endif
3811
3812 namespace {
3813
codePointAt(const USearch & search,int32_t index)3814 UChar32 codePointAt(const USearch &search, int32_t index) {
3815 if (index < search.textLength) {
3816 UChar32 c;
3817 U16_NEXT(search.text, index, search.textLength, c);
3818 return c;
3819 }
3820 return U_SENTINEL;
3821 }
3822
codePointBefore(const USearch & search,int32_t index)3823 UChar32 codePointBefore(const USearch &search, int32_t index) {
3824 if (0 < index) {
3825 UChar32 c;
3826 U16_PREV(search.text, 0, index, c);
3827 return c;
3828 }
3829 return U_SENTINEL;
3830 }
3831
3832 } // namespace
3833
usearch_search(UStringSearch * strsrch,int32_t startIdx,int32_t * matchStart,int32_t * matchLimit,UErrorCode * status)3834 U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch *strsrch,
3835 int32_t startIdx,
3836 int32_t *matchStart,
3837 int32_t *matchLimit,
3838 UErrorCode *status)
3839 {
3840 if (U_FAILURE(*status)) {
3841 return FALSE;
3842 }
3843
3844 // TODO: reject search patterns beginning with a combining char.
3845
3846 #ifdef USEARCH_DEBUG
3847 if (getenv("USEARCH_DEBUG") != NULL) {
3848 printf("Pattern CEs\n");
3849 for (int ii=0; ii<strsrch->pattern.cesLength; ii++) {
3850 printf(" %8x", strsrch->pattern.ces[ii]);
3851 }
3852 printf("\n");
3853 }
3854
3855 #endif
3856 // Input parameter sanity check.
3857 // TODO: should input indicies clip to the text length
3858 // in the same way that UText does.
3859 if(strsrch->pattern.cesLength == 0 ||
3860 startIdx < 0 ||
3861 startIdx > strsrch->search->textLength ||
3862 strsrch->pattern.ces == NULL) {
3863 *status = U_ILLEGAL_ARGUMENT_ERROR;
3864 return FALSE;
3865 }
3866
3867 if (strsrch->pattern.pces == NULL) {
3868 initializePatternPCETable(strsrch, status);
3869 }
3870
3871 ucol_setOffset(strsrch->textIter, startIdx, status);
3872 CEIBuffer ceb(strsrch, status);
3873
3874
3875 int32_t targetIx = 0;
3876 const CEI *targetCEI = NULL;
3877 int32_t patIx;
3878 UBool found;
3879
3880 int32_t mStart = -1;
3881 int32_t mLimit = -1;
3882 int32_t minLimit;
3883 int32_t maxLimit;
3884
3885
3886
3887 // Outer loop moves over match starting positions in the
3888 // target CE space.
3889 // Here we see the target as a sequence of collation elements, resulting from the following:
3890 // 1. Target characters were decomposed, and (if appropriate) other compressions and expansions are applied
3891 // (for example, digraphs such as IJ may be broken into two characters).
3892 // 2. An int64_t CE weight is determined for each resulting unit (high 16 bits are primary strength, next
3893 // 16 bits are secondary, next 16 (the high 16 bits of the low 32-bit half) are tertiary. Any of these
3894 // fields that are for strengths below that of the collator are set to 0. If this makes the int64_t
3895 // CE weight 0 (as for a combining diacritic with secondary weight when the collator strentgh is primary),
3896 // then the CE is deleted, so the following code sees only CEs that are relevant.
3897 // For each CE, the lowIndex and highIndex correspond to where this CE begins and ends in the original text.
3898 // If lowIndex==highIndex, either the CE resulted from an expansion/decomposition of one of the original text
3899 // characters, or the CE marks the limit of the target text (in which case the CE weight is UCOL_PROCESSED_NULLORDER).
3900 //
3901 for(targetIx=0; ; targetIx++)
3902 {
3903 found = TRUE;
3904 // Inner loop checks for a match beginning at each
3905 // position from the outer loop.
3906 int32_t targetIxOffset = 0;
3907 int64_t patCE = 0;
3908 // For targetIx > 0, this ceb.get gets a CE that is as far back in the ring buffer
3909 // (compared to the last CE fetched for the previous targetIx value) as we need to go
3910 // for this targetIx value, so if it is non-NULL then other ceb.get calls should be OK.
3911 const CEI *firstCEI = ceb.get(targetIx);
3912 if (firstCEI == NULL) {
3913 *status = U_INTERNAL_PROGRAM_ERROR;
3914 found = FALSE;
3915 break;
3916 }
3917
3918 for (patIx=0; patIx<strsrch->pattern.pcesLength; patIx++) {
3919 patCE = strsrch->pattern.pces[patIx];
3920 targetCEI = ceb.get(targetIx+patIx+targetIxOffset);
3921 // Compare CE from target string with CE from the pattern.
3922 // Note that the target CE will be UCOL_PROCESSED_NULLORDER if we reach the end of input,
3923 // which will fail the compare, below.
3924 UCompareCEsResult ceMatch = compareCE64s(targetCEI->ce, patCE, strsrch->search->elementComparisonType);
3925 if ( ceMatch == U_CE_NO_MATCH ) {
3926 found = FALSE;
3927 break;
3928 } else if ( ceMatch > U_CE_NO_MATCH ) {
3929 if ( ceMatch == U_CE_SKIP_TARG ) {
3930 // redo with same patCE, next targCE
3931 patIx--;
3932 targetIxOffset++;
3933 } else { // ceMatch == U_CE_SKIP_PATN
3934 // redo with same targCE, next patCE
3935 targetIxOffset--;
3936 }
3937 }
3938 }
3939 targetIxOffset += strsrch->pattern.pcesLength; // this is now the offset in target CE space to end of the match so far
3940
3941 if (!found && ((targetCEI == NULL) || (targetCEI->ce != UCOL_PROCESSED_NULLORDER))) {
3942 // No match at this targetIx. Try again at the next.
3943 continue;
3944 }
3945
3946 if (!found) {
3947 // No match at all, we have run off the end of the target text.
3948 break;
3949 }
3950
3951
3952 // We have found a match in CE space.
3953 // Now determine the bounds in string index space.
3954 // There still is a chance of match failure if the CE range not correspond to
3955 // an acceptable character range.
3956 //
3957 const CEI *lastCEI = ceb.get(targetIx + targetIxOffset - 1);
3958
3959 mStart = firstCEI->lowIndex;
3960 minLimit = lastCEI->lowIndex;
3961
3962 // Look at the CE following the match. If it is UCOL_NULLORDER the match
3963 // extended to the end of input, and the match is good.
3964
3965 // Look at the high and low indices of the CE following the match. If
3966 // they are the same it means one of two things:
3967 // 1. The match extended to the last CE from the target text, which is OK, or
3968 // 2. The last CE that was part of the match is in an expansion that extends
3969 // to the first CE after the match. In this case, we reject the match.
3970 const CEI *nextCEI = 0;
3971 if (strsrch->search->elementComparisonType == 0) {
3972 nextCEI = ceb.get(targetIx + targetIxOffset);
3973 maxLimit = nextCEI->lowIndex;
3974 if (nextCEI->lowIndex == nextCEI->highIndex && nextCEI->ce != UCOL_PROCESSED_NULLORDER) {
3975 found = FALSE;
3976 }
3977 } else {
3978 for ( ; ; ++targetIxOffset ) {
3979 nextCEI = ceb.get(targetIx + targetIxOffset);
3980 maxLimit = nextCEI->lowIndex;
3981 // If we are at the end of the target too, match succeeds
3982 if ( nextCEI->ce == UCOL_PROCESSED_NULLORDER ) {
3983 break;
3984 }
3985 // As long as the next CE has primary weight of 0,
3986 // it is part of the last target element matched by the pattern;
3987 // make sure it can be part of a match with the last patCE
3988 if ( (((nextCEI->ce) >> 32) & 0xFFFF0000UL) == 0 ) {
3989 UCompareCEsResult ceMatch = compareCE64s(nextCEI->ce, patCE, strsrch->search->elementComparisonType);
3990 if ( ceMatch == U_CE_NO_MATCH || ceMatch == U_CE_SKIP_PATN ) {
3991 found = FALSE;
3992 break;
3993 }
3994 // If lowIndex == highIndex, this target CE is part of an expansion of the last matched
3995 // target element, but it has non-zero primary weight => match fails
3996 } else if ( nextCEI->lowIndex == nextCEI->highIndex ) {
3997 found = false;
3998 break;
3999 // Else the target CE is not part of an expansion of the last matched element, match succeeds
4000 } else {
4001 break;
4002 }
4003 }
4004 }
4005
4006
4007 // Check for the start of the match being within a combining sequence.
4008 // This can happen if the pattern itself begins with a combining char, and
4009 // the match found combining marks in the target text that were attached
4010 // to something else.
4011 // This type of match should be rejected for not completely consuming a
4012 // combining sequence.
4013 if (!isBreakBoundary(strsrch, mStart)) {
4014 found = FALSE;
4015 }
4016
4017 // Check for the start of the match being within an Collation Element Expansion,
4018 // meaning that the first char of the match is only partially matched.
4019 // With exapnsions, the first CE will report the index of the source
4020 // character, and all subsequent (expansions) CEs will report the source index of the
4021 // _following_ character.
4022 int32_t secondIx = firstCEI->highIndex;
4023 if (mStart == secondIx) {
4024 found = FALSE;
4025 }
4026
4027 // Allow matches to end in the middle of a grapheme cluster if the following
4028 // conditions are met; this is needed to make prefix search work properly in
4029 // Indic, see #11750
4030 // * the default breakIter is being used
4031 // * the next collation element after this combining sequence
4032 // - has non-zero primary weight
4033 // - corresponds to a separate character following the one at end of the current match
4034 // (the second of these conditions, and perhaps both, may be redundant given the
4035 // subsequent check for normalization boundary; however they are likely much faster
4036 // tests in any case)
4037 // * the match limit is a normalization boundary
4038 UBool allowMidclusterMatch = FALSE;
4039 if (strsrch->search->text != NULL && strsrch->search->textLength > maxLimit) {
4040 allowMidclusterMatch =
4041 strsrch->search->breakIter == NULL &&
4042 nextCEI != NULL && (((nextCEI->ce) >> 32) & 0xFFFF0000UL) != 0 &&
4043 maxLimit >= lastCEI->highIndex && nextCEI->highIndex > maxLimit &&
4044 (strsrch->nfd->hasBoundaryBefore(codePointAt(*strsrch->search, maxLimit)) ||
4045 strsrch->nfd->hasBoundaryAfter(codePointBefore(*strsrch->search, maxLimit)));
4046 }
4047 // If those conditions are met, then:
4048 // * do NOT advance the candidate match limit (mLimit) to a break boundary; however
4049 // the match limit may be backed off to a previous break boundary. This handles
4050 // cases in which mLimit includes target characters that are ignorable with current
4051 // settings (such as space) and which extend beyond the pattern match.
4052 // * do NOT require that end of the combining sequence not extend beyond the match in CE space
4053 // * do NOT require that match limit be on a breakIter boundary
4054
4055 // Advance the match end position to the first acceptable match boundary.
4056 // This advances the index over any combining charcters.
4057 mLimit = maxLimit;
4058 if (minLimit < maxLimit) {
4059 // When the last CE's low index is same with its high index, the CE is likely
4060 // a part of expansion. In this case, the index is located just after the
4061 // character corresponding to the CEs compared above. If the index is right
4062 // at the break boundary, move the position to the next boundary will result
4063 // incorrect match length when there are ignorable characters exist between
4064 // the position and the next character produces CE(s). See ticket#8482.
4065 if (minLimit == lastCEI->highIndex && isBreakBoundary(strsrch, minLimit)) {
4066 mLimit = minLimit;
4067 } else {
4068 int32_t nba = nextBoundaryAfter(strsrch, minLimit);
4069 // Note that we can have nba < maxLimit && nba >= minLImit, in which
4070 // case we want to set mLimit to nba regardless of allowMidclusterMatch
4071 // (i.e. we back off mLimit to the previous breakIterator boundary).
4072 if (nba >= lastCEI->highIndex && (!allowMidclusterMatch || nba < maxLimit)) {
4073 mLimit = nba;
4074 }
4075 }
4076 }
4077
4078 #ifdef USEARCH_DEBUG
4079 if (getenv("USEARCH_DEBUG") != NULL) {
4080 printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit, maxLimit, mLimit);
4081 }
4082 #endif
4083
4084 if (!allowMidclusterMatch) {
4085 // If advancing to the end of a combining sequence in character indexing space
4086 // advanced us beyond the end of the match in CE space, reject this match.
4087 if (mLimit > maxLimit) {
4088 found = FALSE;
4089 }
4090
4091 if (!isBreakBoundary(strsrch, mLimit)) {
4092 found = FALSE;
4093 }
4094 }
4095
4096 if (! checkIdentical(strsrch, mStart, mLimit)) {
4097 found = FALSE;
4098 }
4099
4100 if (found) {
4101 break;
4102 }
4103 }
4104
4105 #ifdef USEARCH_DEBUG
4106 if (getenv("USEARCH_DEBUG") != NULL) {
4107 printf("Target CEs [%d .. %d]\n", ceb.firstIx, ceb.limitIx);
4108 int32_t lastToPrint = ceb.limitIx+2;
4109 for (int ii=ceb.firstIx; ii<lastToPrint; ii++) {
4110 printf("%8x@%d ", ceb.get(ii)->ce, ceb.get(ii)->srcIndex);
4111 }
4112 printf("\n%s\n", found? "match found" : "no match");
4113 }
4114 #endif
4115
4116 // All Done. Store back the match bounds to the caller.
4117 //
4118 if (found==FALSE) {
4119 mLimit = -1;
4120 mStart = -1;
4121 }
4122
4123 if (matchStart != NULL) {
4124 *matchStart= mStart;
4125 }
4126
4127 if (matchLimit != NULL) {
4128 *matchLimit = mLimit;
4129 }
4130
4131 return found;
4132 }
4133
usearch_searchBackwards(UStringSearch * strsrch,int32_t startIdx,int32_t * matchStart,int32_t * matchLimit,UErrorCode * status)4134 U_CAPI UBool U_EXPORT2 usearch_searchBackwards(UStringSearch *strsrch,
4135 int32_t startIdx,
4136 int32_t *matchStart,
4137 int32_t *matchLimit,
4138 UErrorCode *status)
4139 {
4140 if (U_FAILURE(*status)) {
4141 return FALSE;
4142 }
4143
4144 // TODO: reject search patterns beginning with a combining char.
4145
4146 #ifdef USEARCH_DEBUG
4147 if (getenv("USEARCH_DEBUG") != NULL) {
4148 printf("Pattern CEs\n");
4149 for (int ii=0; ii<strsrch->pattern.cesLength; ii++) {
4150 printf(" %8x", strsrch->pattern.ces[ii]);
4151 }
4152 printf("\n");
4153 }
4154
4155 #endif
4156 // Input parameter sanity check.
4157 // TODO: should input indicies clip to the text length
4158 // in the same way that UText does.
4159 if(strsrch->pattern.cesLength == 0 ||
4160 startIdx < 0 ||
4161 startIdx > strsrch->search->textLength ||
4162 strsrch->pattern.ces == NULL) {
4163 *status = U_ILLEGAL_ARGUMENT_ERROR;
4164 return FALSE;
4165 }
4166
4167 if (strsrch->pattern.pces == NULL) {
4168 initializePatternPCETable(strsrch, status);
4169 }
4170
4171 CEIBuffer ceb(strsrch, status);
4172 int32_t targetIx = 0;
4173
4174 /*
4175 * Pre-load the buffer with the CE's for the grapheme
4176 * after our starting position so that we're sure that
4177 * we can look at the CE following the match when we
4178 * check the match boundaries.
4179 *
4180 * This will also pre-fetch the first CE that we'll
4181 * consider for the match.
4182 */
4183 if (startIdx < strsrch->search->textLength) {
4184 UBreakIterator *bi = strsrch->search->internalBreakIter;
4185 int32_t next = ubrk_following(bi, startIdx);
4186
4187 ucol_setOffset(strsrch->textIter, next, status);
4188
4189 for (targetIx = 0; ; targetIx += 1) {
4190 if (ceb.getPrevious(targetIx)->lowIndex < startIdx) {
4191 break;
4192 }
4193 }
4194 } else {
4195 ucol_setOffset(strsrch->textIter, startIdx, status);
4196 }
4197
4198
4199 const CEI *targetCEI = NULL;
4200 int32_t patIx;
4201 UBool found;
4202
4203 int32_t limitIx = targetIx;
4204 int32_t mStart = -1;
4205 int32_t mLimit = -1;
4206 int32_t minLimit;
4207 int32_t maxLimit;
4208
4209
4210
4211 // Outer loop moves over match starting positions in the
4212 // target CE space.
4213 // Here, targetIx values increase toward the beginning of the base text (i.e. we get the text CEs in reverse order).
4214 // But patIx is 0 at the beginning of the pattern and increases toward the end.
4215 // So this loop performs a comparison starting with the end of pattern, and prcessd toward the beginning of the pattern
4216 // and the beginning of the base text.
4217 for(targetIx = limitIx; ; targetIx += 1)
4218 {
4219 found = TRUE;
4220 // For targetIx > limitIx, this ceb.getPrevious gets a CE that is as far back in the ring buffer
4221 // (compared to the last CE fetched for the previous targetIx value) as we need to go
4222 // for this targetIx value, so if it is non-NULL then other ceb.getPrevious calls should be OK.
4223 const CEI *lastCEI = ceb.getPrevious(targetIx);
4224 if (lastCEI == NULL) {
4225 *status = U_INTERNAL_PROGRAM_ERROR;
4226 found = FALSE;
4227 break;
4228 }
4229 // Inner loop checks for a match beginning at each
4230 // position from the outer loop.
4231 int32_t targetIxOffset = 0;
4232 for (patIx = strsrch->pattern.pcesLength - 1; patIx >= 0; patIx -= 1) {
4233 int64_t patCE = strsrch->pattern.pces[patIx];
4234
4235 targetCEI = ceb.getPrevious(targetIx + strsrch->pattern.pcesLength - 1 - patIx + targetIxOffset);
4236 // Compare CE from target string with CE from the pattern.
4237 // Note that the target CE will be UCOL_NULLORDER if we reach the end of input,
4238 // which will fail the compare, below.
4239 UCompareCEsResult ceMatch = compareCE64s(targetCEI->ce, patCE, strsrch->search->elementComparisonType);
4240 if ( ceMatch == U_CE_NO_MATCH ) {
4241 found = FALSE;
4242 break;
4243 } else if ( ceMatch > U_CE_NO_MATCH ) {
4244 if ( ceMatch == U_CE_SKIP_TARG ) {
4245 // redo with same patCE, next targCE
4246 patIx++;
4247 targetIxOffset++;
4248 } else { // ceMatch == U_CE_SKIP_PATN
4249 // redo with same targCE, next patCE
4250 targetIxOffset--;
4251 }
4252 }
4253 }
4254
4255 if (!found && ((targetCEI == NULL) || (targetCEI->ce != UCOL_PROCESSED_NULLORDER))) {
4256 // No match at this targetIx. Try again at the next.
4257 continue;
4258 }
4259
4260 if (!found) {
4261 // No match at all, we have run off the end of the target text.
4262 break;
4263 }
4264
4265
4266 // We have found a match in CE space.
4267 // Now determine the bounds in string index space.
4268 // There still is a chance of match failure if the CE range not correspond to
4269 // an acceptable character range.
4270 //
4271 const CEI *firstCEI = ceb.getPrevious(targetIx + strsrch->pattern.pcesLength - 1 + targetIxOffset);
4272 mStart = firstCEI->lowIndex;
4273
4274 // Check for the start of the match being within a combining sequence.
4275 // This can happen if the pattern itself begins with a combining char, and
4276 // the match found combining marks in the target text that were attached
4277 // to something else.
4278 // This type of match should be rejected for not completely consuming a
4279 // combining sequence.
4280 if (!isBreakBoundary(strsrch, mStart)) {
4281 found = FALSE;
4282 }
4283
4284 // Look at the high index of the first CE in the match. If it's the same as the
4285 // low index, the first CE in the match is in the middle of an expansion.
4286 if (mStart == firstCEI->highIndex) {
4287 found = FALSE;
4288 }
4289
4290
4291 minLimit = lastCEI->lowIndex;
4292
4293 if (targetIx > 0) {
4294 // Look at the CE following the match. If it is UCOL_NULLORDER the match
4295 // extended to the end of input, and the match is good.
4296
4297 // Look at the high and low indices of the CE following the match. If
4298 // they are the same it means one of two things:
4299 // 1. The match extended to the last CE from the target text, which is OK, or
4300 // 2. The last CE that was part of the match is in an expansion that extends
4301 // to the first CE after the match. In this case, we reject the match.
4302 const CEI *nextCEI = ceb.getPrevious(targetIx - 1);
4303
4304 if (nextCEI->lowIndex == nextCEI->highIndex && nextCEI->ce != UCOL_PROCESSED_NULLORDER) {
4305 found = FALSE;
4306 }
4307
4308 mLimit = maxLimit = nextCEI->lowIndex;
4309
4310 // Allow matches to end in the middle of a grapheme cluster if the following
4311 // conditions are met; this is needed to make prefix search work properly in
4312 // Indic, see #11750
4313 // * the default breakIter is being used
4314 // * the next collation element after this combining sequence
4315 // - has non-zero primary weight
4316 // - corresponds to a separate character following the one at end of the current match
4317 // (the second of these conditions, and perhaps both, may be redundant given the
4318 // subsequent check for normalization boundary; however they are likely much faster
4319 // tests in any case)
4320 // * the match limit is a normalization boundary
4321 UBool allowMidclusterMatch = FALSE;
4322 if (strsrch->search->text != NULL && strsrch->search->textLength > maxLimit) {
4323 allowMidclusterMatch =
4324 strsrch->search->breakIter == NULL &&
4325 nextCEI != NULL && (((nextCEI->ce) >> 32) & 0xFFFF0000UL) != 0 &&
4326 maxLimit >= lastCEI->highIndex && nextCEI->highIndex > maxLimit &&
4327 (strsrch->nfd->hasBoundaryBefore(codePointAt(*strsrch->search, maxLimit)) ||
4328 strsrch->nfd->hasBoundaryAfter(codePointBefore(*strsrch->search, maxLimit)));
4329 }
4330 // If those conditions are met, then:
4331 // * do NOT advance the candidate match limit (mLimit) to a break boundary; however
4332 // the match limit may be backed off to a previous break boundary. This handles
4333 // cases in which mLimit includes target characters that are ignorable with current
4334 // settings (such as space) and which extend beyond the pattern match.
4335 // * do NOT require that end of the combining sequence not extend beyond the match in CE space
4336 // * do NOT require that match limit be on a breakIter boundary
4337
4338 // Advance the match end position to the first acceptable match boundary.
4339 // This advances the index over any combining characters.
4340 if (minLimit < maxLimit) {
4341 int32_t nba = nextBoundaryAfter(strsrch, minLimit);
4342 // Note that we can have nba < maxLimit && nba >= minLImit, in which
4343 // case we want to set mLimit to nba regardless of allowMidclusterMatch
4344 // (i.e. we back off mLimit to the previous breakIterator boundary).
4345 if (nba >= lastCEI->highIndex && (!allowMidclusterMatch || nba < maxLimit)) {
4346 mLimit = nba;
4347 }
4348 }
4349
4350 if (!allowMidclusterMatch) {
4351 // If advancing to the end of a combining sequence in character indexing space
4352 // advanced us beyond the end of the match in CE space, reject this match.
4353 if (mLimit > maxLimit) {
4354 found = FALSE;
4355 }
4356
4357 // Make sure the end of the match is on a break boundary
4358 if (!isBreakBoundary(strsrch, mLimit)) {
4359 found = FALSE;
4360 }
4361 }
4362
4363 } else {
4364 // No non-ignorable CEs after this point.
4365 // The maximum position is detected by boundary after
4366 // the last non-ignorable CE. Combining sequence
4367 // across the start index will be truncated.
4368 int32_t nba = nextBoundaryAfter(strsrch, minLimit);
4369 mLimit = maxLimit = (nba > 0) && (startIdx > nba) ? nba : startIdx;
4370 }
4371
4372 #ifdef USEARCH_DEBUG
4373 if (getenv("USEARCH_DEBUG") != NULL) {
4374 printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit, maxLimit, mLimit);
4375 }
4376 #endif
4377
4378
4379 if (! checkIdentical(strsrch, mStart, mLimit)) {
4380 found = FALSE;
4381 }
4382
4383 if (found) {
4384 break;
4385 }
4386 }
4387
4388 #ifdef USEARCH_DEBUG
4389 if (getenv("USEARCH_DEBUG") != NULL) {
4390 printf("Target CEs [%d .. %d]\n", ceb.firstIx, ceb.limitIx);
4391 int32_t lastToPrint = ceb.limitIx+2;
4392 for (int ii=ceb.firstIx; ii<lastToPrint; ii++) {
4393 printf("%8x@%d ", ceb.get(ii)->ce, ceb.get(ii)->srcIndex);
4394 }
4395 printf("\n%s\n", found? "match found" : "no match");
4396 }
4397 #endif
4398
4399 // All Done. Store back the match bounds to the caller.
4400 //
4401 if (found==FALSE) {
4402 mLimit = -1;
4403 mStart = -1;
4404 }
4405
4406 if (matchStart != NULL) {
4407 *matchStart= mStart;
4408 }
4409
4410 if (matchLimit != NULL) {
4411 *matchLimit = mLimit;
4412 }
4413
4414 return found;
4415 }
4416
4417 // internal use methods declared in usrchimp.h -----------------------------
4418
usearch_handleNextExact(UStringSearch * strsrch,UErrorCode * status)4419 UBool usearch_handleNextExact(UStringSearch *strsrch, UErrorCode *status)
4420 {
4421 if (U_FAILURE(*status)) {
4422 setMatchNotFound(strsrch);
4423 return FALSE;
4424 }
4425
4426 #if BOYER_MOORE
4427 UCollationElements *coleiter = strsrch->textIter;
4428 int32_t textlength = strsrch->search->textLength;
4429 int32_t *patternce = strsrch->pattern.ces;
4430 int32_t patterncelength = strsrch->pattern.cesLength;
4431 int32_t textoffset = ucol_getOffset(coleiter);
4432
4433 // status used in setting coleiter offset, since offset is checked in
4434 // shiftForward before setting the coleiter offset, status never
4435 // a failure
4436 textoffset = shiftForward(strsrch, textoffset, UCOL_NULLORDER,
4437 patterncelength);
4438 while (textoffset <= textlength)
4439 {
4440 uint32_t patternceindex = patterncelength - 1;
4441 int32_t targetce;
4442 UBool found = FALSE;
4443 int32_t lastce = UCOL_NULLORDER;
4444
4445 setColEIterOffset(coleiter, textoffset);
4446
4447 for (;;) {
4448 // finding the last pattern ce match, imagine composite characters
4449 // for example: search for pattern A in text \u00C0
4450 // we'll have to skip \u0300 the grave first before we get to A
4451 targetce = ucol_previous(coleiter, status);
4452 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4453 found = FALSE;
4454 break;
4455 }
4456 targetce = getCE(strsrch, targetce);
4457 if (targetce == UCOL_IGNORABLE && inNormBuf(coleiter)) {
4458 // this is for the text \u0315\u0300 that requires
4459 // normalization and pattern \u0300, where \u0315 is ignorable
4460 continue;
4461 }
4462 if (lastce == UCOL_NULLORDER || lastce == UCOL_IGNORABLE) {
4463 lastce = targetce;
4464 }
4465 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4466 if (targetce == patternce[patternceindex]) {
4467 // the first ce can be a contraction
4468 found = TRUE;
4469 break;
4470 }
4471 if (!hasExpansion(coleiter)) {
4472 found = FALSE;
4473 break;
4474 }
4475 }
4476
4477 //targetce = lastce;
4478
4479 while (found && patternceindex > 0) {
4480 lastce = targetce;
4481 targetce = ucol_previous(coleiter, status);
4482 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4483 found = FALSE;
4484 break;
4485 }
4486 targetce = getCE(strsrch, targetce);
4487 if (targetce == UCOL_IGNORABLE) {
4488 continue;
4489 }
4490
4491 patternceindex --;
4492 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4493 found = found && targetce == patternce[patternceindex];
4494 }
4495
4496 targetce = lastce;
4497
4498 if (!found) {
4499 if (U_FAILURE(*status)) {
4500 break;
4501 }
4502 textoffset = shiftForward(strsrch, textoffset, lastce,
4503 patternceindex);
4504 // status checked at loop.
4505 patternceindex = patterncelength;
4506 continue;
4507 }
4508
4509 if (checkNextExactMatch(strsrch, &textoffset, status)) {
4510 // status checked in ucol_setOffset
4511 setColEIterOffset(coleiter, strsrch->search->matchedIndex);
4512 return TRUE;
4513 }
4514 }
4515 setMatchNotFound(strsrch);
4516 return FALSE;
4517 #else
4518 int32_t textOffset = ucol_getOffset(strsrch->textIter);
4519 int32_t start = -1;
4520 int32_t end = -1;
4521
4522 if (usearch_search(strsrch, textOffset, &start, &end, status)) {
4523 strsrch->search->matchedIndex = start;
4524 strsrch->search->matchedLength = end - start;
4525 return TRUE;
4526 } else {
4527 setMatchNotFound(strsrch);
4528 return FALSE;
4529 }
4530 #endif
4531 }
4532
usearch_handleNextCanonical(UStringSearch * strsrch,UErrorCode * status)4533 UBool usearch_handleNextCanonical(UStringSearch *strsrch, UErrorCode *status)
4534 {
4535 if (U_FAILURE(*status)) {
4536 setMatchNotFound(strsrch);
4537 return FALSE;
4538 }
4539
4540 #if BOYER_MOORE
4541 UCollationElements *coleiter = strsrch->textIter;
4542 int32_t textlength = strsrch->search->textLength;
4543 int32_t *patternce = strsrch->pattern.ces;
4544 int32_t patterncelength = strsrch->pattern.cesLength;
4545 int32_t textoffset = ucol_getOffset(coleiter);
4546 UBool hasPatternAccents =
4547 strsrch->pattern.hasSuffixAccents || strsrch->pattern.hasPrefixAccents;
4548
4549 textoffset = shiftForward(strsrch, textoffset, UCOL_NULLORDER,
4550 patterncelength);
4551 strsrch->canonicalPrefixAccents[0] = 0;
4552 strsrch->canonicalSuffixAccents[0] = 0;
4553
4554 while (textoffset <= textlength)
4555 {
4556 int32_t patternceindex = patterncelength - 1;
4557 int32_t targetce;
4558 UBool found = FALSE;
4559 int32_t lastce = UCOL_NULLORDER;
4560
4561 setColEIterOffset(coleiter, textoffset);
4562
4563 for (;;) {
4564 // finding the last pattern ce match, imagine composite characters
4565 // for example: search for pattern A in text \u00C0
4566 // we'll have to skip \u0300 the grave first before we get to A
4567 targetce = ucol_previous(coleiter, status);
4568 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4569 found = FALSE;
4570 break;
4571 }
4572 targetce = getCE(strsrch, targetce);
4573 if (lastce == UCOL_NULLORDER || lastce == UCOL_IGNORABLE) {
4574 lastce = targetce;
4575 }
4576 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4577 if (targetce == patternce[patternceindex]) {
4578 // the first ce can be a contraction
4579 found = TRUE;
4580 break;
4581 }
4582 if (!hasExpansion(coleiter)) {
4583 found = FALSE;
4584 break;
4585 }
4586 }
4587
4588 while (found && patternceindex > 0) {
4589 targetce = ucol_previous(coleiter, status);
4590 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4591 found = FALSE;
4592 break;
4593 }
4594 targetce = getCE(strsrch, targetce);
4595 if (targetce == UCOL_IGNORABLE) {
4596 continue;
4597 }
4598
4599 patternceindex --;
4600 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4601 found = found && targetce == patternce[patternceindex];
4602 }
4603
4604 // initializing the rearranged accent array
4605 if (hasPatternAccents && !found) {
4606 strsrch->canonicalPrefixAccents[0] = 0;
4607 strsrch->canonicalSuffixAccents[0] = 0;
4608 if (U_FAILURE(*status)) {
4609 break;
4610 }
4611 found = doNextCanonicalMatch(strsrch, textoffset, status);
4612 }
4613
4614 if (!found) {
4615 if (U_FAILURE(*status)) {
4616 break;
4617 }
4618 textoffset = shiftForward(strsrch, textoffset, lastce,
4619 patternceindex);
4620 // status checked at loop
4621 patternceindex = patterncelength;
4622 continue;
4623 }
4624
4625 if (checkNextCanonicalMatch(strsrch, &textoffset, status)) {
4626 setColEIterOffset(coleiter, strsrch->search->matchedIndex);
4627 return TRUE;
4628 }
4629 }
4630 setMatchNotFound(strsrch);
4631 return FALSE;
4632 #else
4633 int32_t textOffset = ucol_getOffset(strsrch->textIter);
4634 int32_t start = -1;
4635 int32_t end = -1;
4636
4637 if (usearch_search(strsrch, textOffset, &start, &end, status)) {
4638 strsrch->search->matchedIndex = start;
4639 strsrch->search->matchedLength = end - start;
4640 return TRUE;
4641 } else {
4642 setMatchNotFound(strsrch);
4643 return FALSE;
4644 }
4645 #endif
4646 }
4647
usearch_handlePreviousExact(UStringSearch * strsrch,UErrorCode * status)4648 UBool usearch_handlePreviousExact(UStringSearch *strsrch, UErrorCode *status)
4649 {
4650 if (U_FAILURE(*status)) {
4651 setMatchNotFound(strsrch);
4652 return FALSE;
4653 }
4654
4655 #if BOYER_MOORE
4656 UCollationElements *coleiter = strsrch->textIter;
4657 int32_t *patternce = strsrch->pattern.ces;
4658 int32_t patterncelength = strsrch->pattern.cesLength;
4659 int32_t textoffset = ucol_getOffset(coleiter);
4660
4661 // shifting it check for setting offset
4662 // if setOffset is called previously or there was no previous match, we
4663 // leave the offset as it is.
4664 if (strsrch->search->matchedIndex != USEARCH_DONE) {
4665 textoffset = strsrch->search->matchedIndex;
4666 }
4667
4668 textoffset = reverseShift(strsrch, textoffset, UCOL_NULLORDER,
4669 patterncelength);
4670
4671 while (textoffset >= 0)
4672 {
4673 int32_t patternceindex = 1;
4674 int32_t targetce;
4675 UBool found = FALSE;
4676 int32_t firstce = UCOL_NULLORDER;
4677
4678 // if status is a failure, ucol_setOffset does nothing
4679 setColEIterOffset(coleiter, textoffset);
4680
4681 for (;;) {
4682 // finding the first pattern ce match, imagine composite
4683 // characters. for example: search for pattern \u0300 in text
4684 // \u00C0, we'll have to skip A first before we get to
4685 // \u0300 the grave accent
4686 targetce = ucol_next(coleiter, status);
4687 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4688 found = FALSE;
4689 break;
4690 }
4691 targetce = getCE(strsrch, targetce);
4692 if (firstce == UCOL_NULLORDER || firstce == UCOL_IGNORABLE) {
4693 firstce = targetce;
4694 }
4695 if (targetce == UCOL_IGNORABLE && strsrch->strength != UCOL_PRIMARY) {
4696 continue;
4697 }
4698 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4699 if (targetce == patternce[0]) {
4700 found = TRUE;
4701 break;
4702 }
4703 if (!hasExpansion(coleiter)) {
4704 // checking for accents in composite character
4705 found = FALSE;
4706 break;
4707 }
4708 }
4709
4710 //targetce = firstce;
4711
4712 while (found && (patternceindex < patterncelength)) {
4713 firstce = targetce;
4714 targetce = ucol_next(coleiter, status);
4715 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4716 found = FALSE;
4717 break;
4718 }
4719 targetce = getCE(strsrch, targetce);
4720 if (targetce == UCOL_IGNORABLE) {
4721 continue;
4722 }
4723
4724 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4725 found = found && targetce == patternce[patternceindex];
4726 patternceindex ++;
4727 }
4728
4729 targetce = firstce;
4730
4731 if (!found) {
4732 if (U_FAILURE(*status)) {
4733 break;
4734 }
4735
4736 textoffset = reverseShift(strsrch, textoffset, targetce,
4737 patternceindex);
4738 patternceindex = 0;
4739 continue;
4740 }
4741
4742 if (checkPreviousExactMatch(strsrch, &textoffset, status)) {
4743 setColEIterOffset(coleiter, textoffset);
4744 return TRUE;
4745 }
4746 }
4747 setMatchNotFound(strsrch);
4748 return FALSE;
4749 #else
4750 int32_t textOffset;
4751
4752 if (strsrch->search->isOverlap) {
4753 if (strsrch->search->matchedIndex != USEARCH_DONE) {
4754 textOffset = strsrch->search->matchedIndex + strsrch->search->matchedLength - 1;
4755 } else {
4756 // move the start position at the end of possible match
4757 initializePatternPCETable(strsrch, status);
4758 if (!initTextProcessedIter(strsrch, status)) {
4759 setMatchNotFound(strsrch);
4760 return FALSE;
4761 }
4762 for (int32_t nPCEs = 0; nPCEs < strsrch->pattern.pcesLength - 1; nPCEs++) {
4763 int64_t pce = strsrch->textProcessedIter->nextProcessed(NULL, NULL, status);
4764 if (pce == UCOL_PROCESSED_NULLORDER) {
4765 // at the end of the text
4766 break;
4767 }
4768 }
4769 if (U_FAILURE(*status)) {
4770 setMatchNotFound(strsrch);
4771 return FALSE;
4772 }
4773 textOffset = ucol_getOffset(strsrch->textIter);
4774 }
4775 } else {
4776 textOffset = ucol_getOffset(strsrch->textIter);
4777 }
4778
4779 int32_t start = -1;
4780 int32_t end = -1;
4781
4782 if (usearch_searchBackwards(strsrch, textOffset, &start, &end, status)) {
4783 strsrch->search->matchedIndex = start;
4784 strsrch->search->matchedLength = end - start;
4785 return TRUE;
4786 } else {
4787 setMatchNotFound(strsrch);
4788 return FALSE;
4789 }
4790 #endif
4791 }
4792
usearch_handlePreviousCanonical(UStringSearch * strsrch,UErrorCode * status)4793 UBool usearch_handlePreviousCanonical(UStringSearch *strsrch,
4794 UErrorCode *status)
4795 {
4796 if (U_FAILURE(*status)) {
4797 setMatchNotFound(strsrch);
4798 return FALSE;
4799 }
4800
4801 #if BOYER_MOORE
4802 UCollationElements *coleiter = strsrch->textIter;
4803 int32_t *patternce = strsrch->pattern.ces;
4804 int32_t patterncelength = strsrch->pattern.cesLength;
4805 int32_t textoffset = ucol_getOffset(coleiter);
4806 UBool hasPatternAccents =
4807 strsrch->pattern.hasSuffixAccents || strsrch->pattern.hasPrefixAccents;
4808
4809 // shifting it check for setting offset
4810 // if setOffset is called previously or there was no previous match, we
4811 // leave the offset as it is.
4812 if (strsrch->search->matchedIndex != USEARCH_DONE) {
4813 textoffset = strsrch->search->matchedIndex;
4814 }
4815
4816 textoffset = reverseShift(strsrch, textoffset, UCOL_NULLORDER,
4817 patterncelength);
4818 strsrch->canonicalPrefixAccents[0] = 0;
4819 strsrch->canonicalSuffixAccents[0] = 0;
4820
4821 while (textoffset >= 0)
4822 {
4823 int32_t patternceindex = 1;
4824 int32_t targetce;
4825 UBool found = FALSE;
4826 int32_t firstce = UCOL_NULLORDER;
4827
4828 setColEIterOffset(coleiter, textoffset);
4829 for (;;) {
4830 // finding the first pattern ce match, imagine composite
4831 // characters. for example: search for pattern \u0300 in text
4832 // \u00C0, we'll have to skip A first before we get to
4833 // \u0300 the grave accent
4834 targetce = ucol_next(coleiter, status);
4835 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4836 found = FALSE;
4837 break;
4838 }
4839 targetce = getCE(strsrch, targetce);
4840 if (firstce == UCOL_NULLORDER || firstce == UCOL_IGNORABLE) {
4841 firstce = targetce;
4842 }
4843
4844 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4845 if (targetce == patternce[0]) {
4846 // the first ce can be a contraction
4847 found = TRUE;
4848 break;
4849 }
4850 if (!hasExpansion(coleiter)) {
4851 // checking for accents in composite character
4852 found = FALSE;
4853 break;
4854 }
4855 }
4856
4857 targetce = firstce;
4858
4859 while (found && patternceindex < patterncelength) {
4860 targetce = ucol_next(coleiter, status);
4861 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4862 found = FALSE;
4863 break;
4864 }
4865 targetce = getCE(strsrch, targetce);
4866 if (targetce == UCOL_IGNORABLE) {
4867 continue;
4868 }
4869
4870 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4871 found = found && targetce == patternce[patternceindex];
4872 patternceindex ++;
4873 }
4874
4875 // initializing the rearranged accent array
4876 if (hasPatternAccents && !found) {
4877 strsrch->canonicalPrefixAccents[0] = 0;
4878 strsrch->canonicalSuffixAccents[0] = 0;
4879 if (U_FAILURE(*status)) {
4880 break;
4881 }
4882 found = doPreviousCanonicalMatch(strsrch, textoffset, status);
4883 }
4884
4885 if (!found) {
4886 if (U_FAILURE(*status)) {
4887 break;
4888 }
4889 textoffset = reverseShift(strsrch, textoffset, targetce,
4890 patternceindex);
4891 patternceindex = 0;
4892 continue;
4893 }
4894
4895 if (checkPreviousCanonicalMatch(strsrch, &textoffset, status)) {
4896 setColEIterOffset(coleiter, textoffset);
4897 return TRUE;
4898 }
4899 }
4900 setMatchNotFound(strsrch);
4901 return FALSE;
4902 #else
4903 int32_t textOffset;
4904
4905 if (strsrch->search->isOverlap) {
4906 if (strsrch->search->matchedIndex != USEARCH_DONE) {
4907 textOffset = strsrch->search->matchedIndex + strsrch->search->matchedLength - 1;
4908 } else {
4909 // move the start position at the end of possible match
4910 initializePatternPCETable(strsrch, status);
4911 if (!initTextProcessedIter(strsrch, status)) {
4912 setMatchNotFound(strsrch);
4913 return FALSE;
4914 }
4915 for (int32_t nPCEs = 0; nPCEs < strsrch->pattern.pcesLength - 1; nPCEs++) {
4916 int64_t pce = strsrch->textProcessedIter->nextProcessed(NULL, NULL, status);
4917 if (pce == UCOL_PROCESSED_NULLORDER) {
4918 // at the end of the text
4919 break;
4920 }
4921 }
4922 if (U_FAILURE(*status)) {
4923 setMatchNotFound(strsrch);
4924 return FALSE;
4925 }
4926 textOffset = ucol_getOffset(strsrch->textIter);
4927 }
4928 } else {
4929 textOffset = ucol_getOffset(strsrch->textIter);
4930 }
4931
4932 int32_t start = -1;
4933 int32_t end = -1;
4934
4935 if (usearch_searchBackwards(strsrch, textOffset, &start, &end, status)) {
4936 strsrch->search->matchedIndex = start;
4937 strsrch->search->matchedLength = end - start;
4938 return TRUE;
4939 } else {
4940 setMatchNotFound(strsrch);
4941 return FALSE;
4942 }
4943 #endif
4944 }
4945
4946 #endif /* #if !UCONFIG_NO_COLLATION */
4947