1 /*
2 **********************************************************************
3 * Copyright (C) 2001-2009 IBM and others. All rights reserved.
4 **********************************************************************
5 * Date Name Description
6 * 07/02/2001 synwee Creation.
7 **********************************************************************
8 */
9
10 #include "unicode/utypes.h"
11
12 #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION
13
14 #include "unicode/usearch.h"
15 #include "unicode/ustring.h"
16 #include "unicode/uchar.h"
17 #include "unormimp.h"
18 #include "ucol_imp.h"
19 #include "usrchimp.h"
20 #include "cmemory.h"
21 #include "ucln_in.h"
22 #include "uassert.h"
23
24 U_NAMESPACE_USE
25
26 // don't use Boyer-Moore
27 #define BOYER_MOORE 0
28
29 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
30
31 // internal definition ---------------------------------------------------
32
33 #define LAST_BYTE_MASK_ 0xFF
34 #define SECOND_LAST_BYTE_SHIFT_ 8
35 #define SUPPLEMENTARY_MIN_VALUE_ 0x10000
36
37 static const uint16_t *fcdTrieIndex = NULL;
38 static UChar32 fcdHighStart = 0;
39
40 // internal methods -------------------------------------------------
41
42 /**
43 * Fast collation element iterator setOffset.
44 * This function does not check for bounds.
45 * @param coleiter collation element iterator
46 * @param offset to set
47 */
48 static
setColEIterOffset(UCollationElements * elems,int32_t offset)49 inline void setColEIterOffset(UCollationElements *elems,
50 int32_t offset)
51 {
52 collIterate *ci = &(elems->iteratordata_);
53 ci->pos = ci->string + offset;
54 ci->CEpos = ci->toReturn = ci->extendCEs ? ci->extendCEs : ci->CEs;
55 if (ci->flags & UCOL_ITER_INNORMBUF) {
56 ci->flags = ci->origFlags;
57 }
58 ci->fcdPosition = NULL;
59
60 ci->offsetReturn = NULL;
61 ci->offsetStore = ci->offsetBuffer;
62 ci->offsetRepeatCount = ci->offsetRepeatValue = 0;
63 }
64
65 /**
66 * Getting the mask for collation strength
67 * @param strength collation strength
68 * @return collation element mask
69 */
70 static
getMask(UCollationStrength strength)71 inline uint32_t getMask(UCollationStrength strength)
72 {
73 switch (strength)
74 {
75 case UCOL_PRIMARY:
76 return UCOL_PRIMARYORDERMASK;
77 case UCOL_SECONDARY:
78 return UCOL_SECONDARYORDERMASK | UCOL_PRIMARYORDERMASK;
79 default:
80 return UCOL_TERTIARYORDERMASK | UCOL_SECONDARYORDERMASK |
81 UCOL_PRIMARYORDERMASK;
82 }
83 }
84
85 /**
86 * This is to squeeze the 21bit ces into a 256 table
87 * @param ce collation element
88 * @return collapsed version of the collation element
89 */
90 static
hash(uint32_t ce)91 inline int hash(uint32_t ce)
92 {
93 // the old value UCOL_PRIMARYORDER(ce) % MAX_TABLE_SIZE_ does not work
94 // well with the new collation where most of the latin 1 characters
95 // are of the value xx000xxx. their hashes will most of the time be 0
96 // to be discussed on the hash algo.
97 return UCOL_PRIMARYORDER(ce) % MAX_TABLE_SIZE_;
98 }
99
100 U_CDECL_BEGIN
101 static UBool U_CALLCONV
usearch_cleanup(void)102 usearch_cleanup(void) {
103 fcdTrieIndex = NULL;
104 return TRUE;
105 }
106 U_CDECL_END
107
108 /**
109 * Initializing the fcd tables.
110 * Internal method, status assumed to be a success.
111 * @param status output error if any, caller to check status before calling
112 * method, status assumed to be success when passed in.
113 */
114 static
initializeFCD(UErrorCode * status)115 inline void initializeFCD(UErrorCode *status)
116 {
117 if (fcdTrieIndex == NULL) {
118 fcdTrieIndex = unorm_getFCDTrieIndex(fcdHighStart, status);
119 ucln_i18n_registerCleanup(UCLN_I18N_USEARCH, usearch_cleanup);
120 }
121 }
122
123 /**
124 * Gets the fcd value for a character at the argument index.
125 * This method takes into accounts of the supplementary characters.
126 * @param str UTF16 string where character for fcd retrieval resides
127 * @param offset position of the character whose fcd is to be retrieved, to be
128 * overwritten with the next character position, taking
129 * surrogate characters into consideration.
130 * @param strlength length of the argument string
131 * @return fcd value
132 */
133 static
getFCD(const UChar * str,int32_t * offset,int32_t strlength)134 uint16_t getFCD(const UChar *str, int32_t *offset,
135 int32_t strlength)
136 {
137 const UChar *temp = str + *offset;
138 uint16_t result = unorm_nextFCD16(fcdTrieIndex, fcdHighStart, temp, str + strlength);
139 *offset = (int32_t)(temp - str);
140 return result;
141 }
142
143 /**
144 * Getting the modified collation elements taking into account the collation
145 * attributes
146 * @param strsrch string search data
147 * @param sourcece
148 * @return the modified collation element
149 */
150 static
getCE(const UStringSearch * strsrch,uint32_t sourcece)151 inline int32_t getCE(const UStringSearch *strsrch, uint32_t sourcece)
152 {
153 // note for tertiary we can't use the collator->tertiaryMask, that
154 // is a preprocessed mask that takes into account case options. since
155 // we are only concerned with exact matches, we don't need that.
156 sourcece &= strsrch->ceMask;
157
158 if (strsrch->toShift) {
159 // alternate handling here, since only the 16 most significant digits
160 // is only used, we can safely do a compare without masking
161 // if the ce is a variable, we mask and get only the primary values
162 // no shifting to quartenary is required since all primary values
163 // less than variabletop will need to be masked off anyway.
164 if (strsrch->variableTop > sourcece) {
165 if (strsrch->strength >= UCOL_QUATERNARY) {
166 sourcece &= UCOL_PRIMARYORDERMASK;
167 }
168 else {
169 sourcece = UCOL_IGNORABLE;
170 }
171 }
172 } else if (strsrch->strength >= UCOL_QUATERNARY && sourcece == UCOL_IGNORABLE) {
173 sourcece = 0xFFFF;
174 }
175
176 return sourcece;
177 }
178
179 /**
180 * Allocate a memory and returns NULL if it failed.
181 * Internal method, status assumed to be a success.
182 * @param size to allocate
183 * @param status output error if any, caller to check status before calling
184 * method, status assumed to be success when passed in.
185 * @return newly allocated array, NULL otherwise
186 */
187 static
allocateMemory(uint32_t size,UErrorCode * status)188 inline void * allocateMemory(uint32_t size, UErrorCode *status)
189 {
190 uint32_t *result = (uint32_t *)uprv_malloc(size);
191 if (result == NULL) {
192 *status = U_MEMORY_ALLOCATION_ERROR;
193 }
194 return result;
195 }
196
197 /**
198 * Adds a uint32_t value to a destination array.
199 * Creates a new array if we run out of space. The caller will have to
200 * manually deallocate the newly allocated array.
201 * Internal method, status assumed to be success, caller has to check status
202 * before calling this method. destination not to be NULL and has at least
203 * size destinationlength.
204 * @param destination target array
205 * @param offset destination offset to add value
206 * @param destinationlength target array size, return value for the new size
207 * @param value to be added
208 * @param increments incremental size expected
209 * @param status output error if any, caller to check status before calling
210 * method, status assumed to be success when passed in.
211 * @return new destination array, destination if there was no new allocation
212 */
213 static
addTouint32_tArray(int32_t * destination,uint32_t offset,uint32_t * destinationlength,uint32_t value,uint32_t increments,UErrorCode * status)214 inline int32_t * addTouint32_tArray(int32_t *destination,
215 uint32_t offset,
216 uint32_t *destinationlength,
217 uint32_t value,
218 uint32_t increments,
219 UErrorCode *status)
220 {
221 uint32_t newlength = *destinationlength;
222 if (offset + 1 == newlength) {
223 newlength += increments;
224 int32_t *temp = (int32_t *)allocateMemory(
225 sizeof(int32_t) * newlength, status);
226 if (U_FAILURE(*status)) {
227 return NULL;
228 }
229 uprv_memcpy(temp, destination, sizeof(int32_t) * offset);
230 *destinationlength = newlength;
231 destination = temp;
232 }
233 destination[offset] = value;
234 return destination;
235 }
236
237 /**
238 * Adds a uint64_t value to a destination array.
239 * Creates a new array if we run out of space. The caller will have to
240 * manually deallocate the newly allocated array.
241 * Internal method, status assumed to be success, caller has to check status
242 * before calling this method. destination not to be NULL and has at least
243 * size destinationlength.
244 * @param destination target array
245 * @param offset destination offset to add value
246 * @param destinationlength target array size, return value for the new size
247 * @param value to be added
248 * @param increments incremental size expected
249 * @param status output error if any, caller to check status before calling
250 * method, status assumed to be success when passed in.
251 * @return new destination array, destination if there was no new allocation
252 */
253 static
addTouint64_tArray(int64_t * destination,uint32_t offset,uint32_t * destinationlength,uint64_t value,uint32_t increments,UErrorCode * status)254 inline int64_t * addTouint64_tArray(int64_t *destination,
255 uint32_t offset,
256 uint32_t *destinationlength,
257 uint64_t value,
258 uint32_t increments,
259 UErrorCode *status)
260 {
261 uint32_t newlength = *destinationlength;
262 if (offset + 1 == newlength) {
263 newlength += increments;
264 int64_t *temp = (int64_t *)allocateMemory(
265 sizeof(int64_t) * newlength, status);
266
267 if (U_FAILURE(*status)) {
268 return NULL;
269 }
270
271 uprv_memcpy(temp, destination, sizeof(int64_t) * offset);
272 *destinationlength = newlength;
273 destination = temp;
274 }
275
276 destination[offset] = value;
277
278 return destination;
279 }
280
281 /**
282 * Initializing the ce table for a pattern.
283 * Stores non-ignorable collation keys.
284 * Table size will be estimated by the size of the pattern text. Table
285 * expansion will be perform as we go along. Adding 1 to ensure that the table
286 * size definitely increases.
287 * Internal method, status assumed to be a success.
288 * @param strsrch string search data
289 * @param status output error if any, caller to check status before calling
290 * method, status assumed to be success when passed in.
291 * @return total number of expansions
292 */
293 static
initializePatternCETable(UStringSearch * strsrch,UErrorCode * status)294 inline uint16_t initializePatternCETable(UStringSearch *strsrch,
295 UErrorCode *status)
296 {
297 UPattern *pattern = &(strsrch->pattern);
298 uint32_t cetablesize = INITIAL_ARRAY_SIZE_;
299 int32_t *cetable = pattern->CEBuffer;
300 uint32_t patternlength = pattern->textLength;
301 UCollationElements *coleiter = strsrch->utilIter;
302
303 if (coleiter == NULL) {
304 coleiter = ucol_openElements(strsrch->collator, pattern->text,
305 patternlength, status);
306 // status will be checked in ucol_next(..) later and if it is an
307 // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be
308 // returned.
309 strsrch->utilIter = coleiter;
310 }
311 else {
312 uprv_init_collIterate(strsrch->collator, pattern->text,
313 pattern->textLength,
314 &coleiter->iteratordata_);
315 }
316
317 if (pattern->CE != cetable && pattern->CE) {
318 uprv_free(pattern->CE);
319 }
320
321 uint16_t offset = 0;
322 uint16_t result = 0;
323 int32_t ce;
324
325 while ((ce = ucol_next(coleiter, status)) != UCOL_NULLORDER &&
326 U_SUCCESS(*status)) {
327 uint32_t newce = getCE(strsrch, ce);
328 if (newce) {
329 int32_t *temp = addTouint32_tArray(cetable, offset, &cetablesize,
330 newce,
331 patternlength - ucol_getOffset(coleiter) + 1,
332 status);
333 if (U_FAILURE(*status)) {
334 return 0;
335 }
336 offset ++;
337 if (cetable != temp && cetable != pattern->CEBuffer) {
338 uprv_free(cetable);
339 }
340 cetable = temp;
341 }
342 result += (uint16_t)(ucol_getMaxExpansion(coleiter, ce) - 1);
343 }
344
345 cetable[offset] = 0;
346 pattern->CE = cetable;
347 pattern->CELength = offset;
348
349 return result;
350 }
351
352 /**
353 * Initializing the pce table for a pattern.
354 * Stores non-ignorable collation keys.
355 * Table size will be estimated by the size of the pattern text. Table
356 * expansion will be perform as we go along. Adding 1 to ensure that the table
357 * size definitely increases.
358 * Internal method, status assumed to be a success.
359 * @param strsrch string search data
360 * @param status output error if any, caller to check status before calling
361 * method, status assumed to be success when passed in.
362 * @return total number of expansions
363 */
364 static
initializePatternPCETable(UStringSearch * strsrch,UErrorCode * status)365 inline uint16_t initializePatternPCETable(UStringSearch *strsrch,
366 UErrorCode *status)
367 {
368 UPattern *pattern = &(strsrch->pattern);
369 uint32_t pcetablesize = INITIAL_ARRAY_SIZE_;
370 int64_t *pcetable = pattern->PCEBuffer;
371 uint32_t patternlength = pattern->textLength;
372 UCollationElements *coleiter = strsrch->utilIter;
373
374 if (coleiter == NULL) {
375 coleiter = ucol_openElements(strsrch->collator, pattern->text,
376 patternlength, status);
377 // status will be checked in ucol_next(..) later and if it is an
378 // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be
379 // returned.
380 strsrch->utilIter = coleiter;
381 } else {
382 uprv_init_collIterate(strsrch->collator, pattern->text,
383 pattern->textLength,
384 &coleiter->iteratordata_);
385 }
386
387 if (pattern->PCE != pcetable && pattern->PCE != NULL) {
388 uprv_free(pattern->PCE);
389 }
390
391 uint16_t offset = 0;
392 uint16_t result = 0;
393 int64_t pce;
394
395 uprv_init_pce(coleiter);
396
397 // ** Should processed CEs be signed or unsigned?
398 // ** (the rest of the code in this file seems to play fast-and-loose with
399 // ** whether a CE is signed or unsigned. For example, look at routine above this one.)
400 while ((pce = ucol_nextProcessed(coleiter, NULL, NULL, status)) != UCOL_PROCESSED_NULLORDER &&
401 U_SUCCESS(*status)) {
402 int64_t *temp = addTouint64_tArray(pcetable, offset, &pcetablesize,
403 pce,
404 patternlength - ucol_getOffset(coleiter) + 1,
405 status);
406
407 if (U_FAILURE(*status)) {
408 return 0;
409 }
410
411 offset += 1;
412
413 if (pcetable != temp && pcetable != pattern->PCEBuffer) {
414 uprv_free(pcetable);
415 }
416
417 pcetable = temp;
418 //result += (uint16_t)(ucol_getMaxExpansion(coleiter, ce) - 1);
419 }
420
421 pcetable[offset] = 0;
422 pattern->PCE = pcetable;
423 pattern->PCELength = offset;
424
425 return result;
426 }
427
428 /**
429 * Initializes the pattern struct.
430 * Internal method, status assumed to be success.
431 * @param strsrch UStringSearch data storage
432 * @param status output error if any, caller to check status before calling
433 * method, status assumed to be success when passed in.
434 * @return expansionsize the total expansion size of the pattern
435 */
436 static
initializePattern(UStringSearch * strsrch,UErrorCode * status)437 inline int16_t initializePattern(UStringSearch *strsrch, UErrorCode *status)
438 {
439 UPattern *pattern = &(strsrch->pattern);
440 const UChar *patterntext = pattern->text;
441 int32_t length = pattern->textLength;
442 int32_t index = 0;
443
444 // Since the strength is primary, accents are ignored in the pattern.
445 if (strsrch->strength == UCOL_PRIMARY) {
446 pattern->hasPrefixAccents = 0;
447 pattern->hasSuffixAccents = 0;
448 } else {
449 pattern->hasPrefixAccents = getFCD(patterntext, &index, length) >>
450 SECOND_LAST_BYTE_SHIFT_;
451 index = length;
452 UTF_BACK_1(patterntext, 0, index);
453 pattern->hasSuffixAccents = getFCD(patterntext, &index, length) &
454 LAST_BYTE_MASK_;
455 }
456
457 // ** HACK **
458 if (strsrch->pattern.PCE != NULL) {
459 if (strsrch->pattern.PCE != strsrch->pattern.PCEBuffer) {
460 uprv_free(strsrch->pattern.PCE);
461 }
462
463 strsrch->pattern.PCE = NULL;
464 }
465
466 // since intializePattern is an internal method status is a success.
467 return initializePatternCETable(strsrch, status);
468 }
469
470 /**
471 * Initializing shift tables, with the default values.
472 * If a corresponding default value is 0, the shift table is not set.
473 * @param shift table for forwards shift
474 * @param backshift table for backwards shift
475 * @param cetable table containing pattern ce
476 * @param cesize size of the pattern ces
477 * @param expansionsize total size of the expansions
478 * @param defaultforward the default forward value
479 * @param defaultbackward the default backward value
480 */
481 static
setShiftTable(int16_t shift[],int16_t backshift[],int32_t * cetable,int32_t cesize,int16_t expansionsize,int16_t defaultforward,int16_t defaultbackward)482 inline void setShiftTable(int16_t shift[], int16_t backshift[],
483 int32_t *cetable, int32_t cesize,
484 int16_t expansionsize,
485 int16_t defaultforward,
486 int16_t defaultbackward)
487 {
488 // estimate the value to shift. to do that we estimate the smallest
489 // number of characters to give the relevant ces, ie approximately
490 // the number of ces minus their expansion, since expansions can come
491 // from a character.
492 int32_t count;
493 for (count = 0; count < MAX_TABLE_SIZE_; count ++) {
494 shift[count] = defaultforward;
495 }
496 cesize --; // down to the last index
497 for (count = 0; count < cesize; count ++) {
498 // number of ces from right of array to the count
499 int temp = defaultforward - count - 1;
500 shift[hash(cetable[count])] = temp > 1 ? temp : 1;
501 }
502 shift[hash(cetable[cesize])] = 1;
503 // for ignorables we just shift by one. see test examples.
504 shift[hash(0)] = 1;
505
506 for (count = 0; count < MAX_TABLE_SIZE_; count ++) {
507 backshift[count] = defaultbackward;
508 }
509 for (count = cesize; count > 0; count --) {
510 // the original value count does not seem to work
511 backshift[hash(cetable[count])] = count > expansionsize ?
512 (int16_t)(count - expansionsize) : 1;
513 }
514 backshift[hash(cetable[0])] = 1;
515 backshift[hash(0)] = 1;
516 }
517
518 /**
519 * Building of the pattern collation element list and the boyer moore strsrch
520 * table.
521 * The canonical match will only be performed after the default match fails.
522 * For both cases we need to remember the size of the composed and decomposed
523 * versions of the string. Since the Boyer-Moore shift calculations shifts by
524 * a number of characters in the text and tries to match the pattern from that
525 * offset, the shift value can not be too large in case we miss some
526 * characters. To choose a right shift size, we estimate the NFC form of the
527 * and use its size as a shift guide. The NFC form should be the small
528 * possible representation of the pattern. Anyways, we'll err on the smaller
529 * shift size. Hence the calculation for minlength.
530 * Canonical match will be performed slightly differently. We'll split the
531 * pattern into 3 parts, the prefix accents (PA), the middle string bounded by
532 * the first and last base character (MS), the ending accents (EA). Matches
533 * will be done on MS first, and only when we match MS then some processing
534 * will be required for the prefix and end accents in order to determine if
535 * they match PA and EA. Hence the default shift values
536 * for the canonical match will take the size of either end's accent into
537 * consideration. Forwards search will take the end accents into consideration
538 * for the default shift values and the backwards search will take the prefix
539 * accents into consideration.
540 * If pattern has no non-ignorable ce, we return a illegal argument error.
541 * Internal method, status assumed to be success.
542 * @param strsrch UStringSearch data storage
543 * @param status for output errors if it occurs, status is assumed to be a
544 * success when it is passed in.
545 */
546 static
initialize(UStringSearch * strsrch,UErrorCode * status)547 inline void initialize(UStringSearch *strsrch, UErrorCode *status)
548 {
549 int16_t expandlength = initializePattern(strsrch, status);
550 if (U_SUCCESS(*status) && strsrch->pattern.CELength > 0) {
551 UPattern *pattern = &strsrch->pattern;
552 int32_t cesize = pattern->CELength;
553
554 int16_t minlength = cesize > expandlength
555 ? (int16_t)cesize - expandlength : 1;
556 pattern->defaultShiftSize = minlength;
557 setShiftTable(pattern->shift, pattern->backShift, pattern->CE,
558 cesize, expandlength, minlength, minlength);
559 return;
560 }
561 strsrch->pattern.defaultShiftSize = 0;
562 }
563
564 #if BOYER_MOORE
565 /**
566 * Check to make sure that the match length is at the end of the character by
567 * using the breakiterator.
568 * @param strsrch string search data
569 * @param start target text start offset
570 * @param end target text end offset
571 */
572 static
checkBreakBoundary(const UStringSearch * strsrch,int32_t *,int32_t * end)573 void checkBreakBoundary(const UStringSearch *strsrch, int32_t * /*start*/,
574 int32_t *end)
575 {
576 #if !UCONFIG_NO_BREAK_ITERATION
577 UBreakIterator *breakiterator = strsrch->search->internalBreakIter;
578 if (breakiterator) {
579 int32_t matchend = *end;
580 //int32_t matchstart = *start;
581
582 if (!ubrk_isBoundary(breakiterator, matchend)) {
583 *end = ubrk_following(breakiterator, matchend);
584 }
585
586 /* Check the start of the matched text to make sure it doesn't have any accents
587 * before it. This code may not be necessary and so it is commented out */
588 /*if (!ubrk_isBoundary(breakiterator, matchstart) && !ubrk_isBoundary(breakiterator, matchstart-1)) {
589 *start = ubrk_preceding(breakiterator, matchstart);
590 }*/
591 }
592 #endif
593 }
594
595 /**
596 * Determine whether the target text in UStringSearch bounded by the offset
597 * start and end is one or more whole units of text as
598 * determined by the breakiterator in UStringSearch.
599 * @param strsrch string search data
600 * @param start target text start offset
601 * @param end target text end offset
602 */
603 static
isBreakUnit(const UStringSearch * strsrch,int32_t start,int32_t end)604 UBool isBreakUnit(const UStringSearch *strsrch, int32_t start,
605 int32_t end)
606 {
607 #if !UCONFIG_NO_BREAK_ITERATION
608 UBreakIterator *breakiterator = strsrch->search->breakIter;
609 //TODO: Add here.
610 if (breakiterator) {
611 int32_t startindex = ubrk_first(breakiterator);
612 int32_t endindex = ubrk_last(breakiterator);
613
614 // out-of-range indexes are never boundary positions
615 if (start < startindex || start > endindex ||
616 end < startindex || end > endindex) {
617 return FALSE;
618 }
619 // otherwise, we can use following() on the position before the
620 // specified one and return true of the position we get back is the
621 // one the user specified
622 UBool result = (start == startindex ||
623 ubrk_following(breakiterator, start - 1) == start) &&
624 (end == endindex ||
625 ubrk_following(breakiterator, end - 1) == end);
626 if (result) {
627 // iterates the individual ces
628 UCollationElements *coleiter = strsrch->utilIter;
629 const UChar *text = strsrch->search->text +
630 start;
631 UErrorCode status = U_ZERO_ERROR;
632 ucol_setText(coleiter, text, end - start, &status);
633 for (int32_t count = 0; count < strsrch->pattern.CELength;
634 count ++) {
635 int32_t ce = getCE(strsrch, ucol_next(coleiter, &status));
636 if (ce == UCOL_IGNORABLE) {
637 count --;
638 continue;
639 }
640 if (U_FAILURE(status) || ce != strsrch->pattern.CE[count]) {
641 return FALSE;
642 }
643 }
644 int32_t nextce = ucol_next(coleiter, &status);
645 while (ucol_getOffset(coleiter) == (end - start)
646 && getCE(strsrch, nextce) == UCOL_IGNORABLE) {
647 nextce = ucol_next(coleiter, &status);
648 }
649 if (ucol_getOffset(coleiter) == (end - start)
650 && nextce != UCOL_NULLORDER) {
651 // extra collation elements at the end of the match
652 return FALSE;
653 }
654 }
655 return result;
656 }
657 #endif
658 return TRUE;
659 }
660
661 /**
662 * Getting the next base character offset if current offset is an accent,
663 * or the current offset if the current character contains a base character.
664 * accents the following base character will be returned
665 * @param text string
666 * @param textoffset current offset
667 * @param textlength length of text string
668 * @return the next base character or the current offset
669 * if the current character is contains a base character.
670 */
671 static
getNextBaseOffset(const UChar * text,int32_t textoffset,int32_t textlength)672 inline int32_t getNextBaseOffset(const UChar *text,
673 int32_t textoffset,
674 int32_t textlength)
675 {
676 if (textoffset < textlength) {
677 int32_t temp = textoffset;
678 if (getFCD(text, &temp, textlength) >> SECOND_LAST_BYTE_SHIFT_) {
679 while (temp < textlength) {
680 int32_t result = temp;
681 if ((getFCD(text, &temp, textlength) >>
682 SECOND_LAST_BYTE_SHIFT_) == 0) {
683 return result;
684 }
685 }
686 return textlength;
687 }
688 }
689 return textoffset;
690 }
691
692 /**
693 * Gets the next base character offset depending on the string search pattern
694 * data
695 * @param strsrch string search data
696 * @param textoffset current offset, one offset away from the last character
697 * to search for.
698 * @return start index of the next base character or the current offset
699 * if the current character is contains a base character.
700 */
701 static
getNextUStringSearchBaseOffset(UStringSearch * strsrch,int32_t textoffset)702 inline int32_t getNextUStringSearchBaseOffset(UStringSearch *strsrch,
703 int32_t textoffset)
704 {
705 int32_t textlength = strsrch->search->textLength;
706 if (strsrch->pattern.hasSuffixAccents &&
707 textoffset < textlength) {
708 int32_t temp = textoffset;
709 const UChar *text = strsrch->search->text;
710 UTF_BACK_1(text, 0, temp);
711 if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) {
712 return getNextBaseOffset(text, textoffset, textlength);
713 }
714 }
715 return textoffset;
716 }
717
718 /**
719 * Shifting the collation element iterator position forward to prepare for
720 * a following match. If the last character is a unsafe character, we'll only
721 * shift by 1 to capture contractions, normalization etc.
722 * Internal method, status assumed to be success.
723 * @param text strsrch string search data
724 * @param textoffset start text position to do search
725 * @param ce the text ce which failed the match.
726 * @param patternceindex index of the ce within the pattern ce buffer which
727 * failed the match
728 * @return final offset
729 */
730 static
shiftForward(UStringSearch * strsrch,int32_t textoffset,int32_t ce,int32_t patternceindex)731 inline int32_t shiftForward(UStringSearch *strsrch,
732 int32_t textoffset,
733 int32_t ce,
734 int32_t patternceindex)
735 {
736 UPattern *pattern = &(strsrch->pattern);
737 if (ce != UCOL_NULLORDER) {
738 int32_t shift = pattern->shift[hash(ce)];
739 // this is to adjust for characters in the middle of the
740 // substring for matching that failed.
741 int32_t adjust = pattern->CELength - patternceindex;
742 if (adjust > 1 && shift >= adjust) {
743 shift -= adjust - 1;
744 }
745 textoffset += shift;
746 }
747 else {
748 textoffset += pattern->defaultShiftSize;
749 }
750
751 textoffset = getNextUStringSearchBaseOffset(strsrch, textoffset);
752 // check for unsafe characters
753 // * if it is the start or middle of a contraction: to be done after
754 // a initial match is found
755 // * thai or lao base consonant character: similar to contraction
756 // * high surrogate character: similar to contraction
757 // * next character is a accent: shift to the next base character
758 return textoffset;
759 }
760 #endif // #if BOYER_MOORE
761
762 /**
763 * sets match not found
764 * @param strsrch string search data
765 */
766 static
setMatchNotFound(UStringSearch * strsrch)767 inline void setMatchNotFound(UStringSearch *strsrch)
768 {
769 // this method resets the match result regardless of the error status.
770 strsrch->search->matchedIndex = USEARCH_DONE;
771 strsrch->search->matchedLength = 0;
772 if (strsrch->search->isForwardSearching) {
773 setColEIterOffset(strsrch->textIter, strsrch->search->textLength);
774 }
775 else {
776 setColEIterOffset(strsrch->textIter, 0);
777 }
778 }
779
780 #if BOYER_MOORE
781 /**
782 * Gets the offset to the next safe point in text.
783 * ie. not the middle of a contraction, swappable characters or supplementary
784 * characters.
785 * @param collator collation sata
786 * @param text string to work with
787 * @param textoffset offset in string
788 * @param textlength length of text string
789 * @return offset to the next safe character
790 */
791 static
getNextSafeOffset(const UCollator * collator,const UChar * text,int32_t textoffset,int32_t textlength)792 inline int32_t getNextSafeOffset(const UCollator *collator,
793 const UChar *text,
794 int32_t textoffset,
795 int32_t textlength)
796 {
797 int32_t result = textoffset; // first contraction character
798 while (result != textlength && ucol_unsafeCP(text[result], collator)) {
799 result ++;
800 }
801 return result;
802 }
803
804 /**
805 * This checks for accents in the potential match started with a .
806 * composite character.
807 * This is really painful... we have to check that composite character do not
808 * have any extra accents. We have to normalize the potential match and find
809 * the immediate decomposed character before the match.
810 * The first composite character would have been taken care of by the fcd
811 * checks in checkForwardExactMatch.
812 * This is the slow path after the fcd of the first character and
813 * the last character has been checked by checkForwardExactMatch and we
814 * determine that the potential match has extra non-ignorable preceding
815 * ces.
816 * E.g. looking for \u0301 acute in \u01FA A ring above and acute,
817 * checkExtraMatchAccent should fail since there is a middle ring in \u01FA
818 * Note here that accents checking are slow and cautioned in the API docs.
819 * Internal method, status assumed to be a success, caller should check status
820 * before calling this method
821 * @param strsrch string search data
822 * @param start index of the potential unfriendly composite character
823 * @param end index of the potential unfriendly composite character
824 * @param status output error status if any.
825 * @return TRUE if there is non-ignorable accents before at the beginning
826 * of the match, FALSE otherwise.
827 */
828
829 static
checkExtraMatchAccents(const UStringSearch * strsrch,int32_t start,int32_t end,UErrorCode * status)830 UBool checkExtraMatchAccents(const UStringSearch *strsrch, int32_t start,
831 int32_t end,
832 UErrorCode *status)
833 {
834 UBool result = FALSE;
835 if (strsrch->pattern.hasPrefixAccents) {
836 int32_t length = end - start;
837 int32_t offset = 0;
838 const UChar *text = strsrch->search->text + start;
839
840 UTF_FWD_1(text, offset, length);
841 // we are only concerned with the first composite character
842 if (unorm_quickCheck(text, offset, UNORM_NFD, status) == UNORM_NO) {
843 int32_t safeoffset = getNextSafeOffset(strsrch->collator,
844 text, 0, length);
845 if (safeoffset != length) {
846 safeoffset ++;
847 }
848 UChar *norm = NULL;
849 UChar buffer[INITIAL_ARRAY_SIZE_];
850 int32_t size = unorm_normalize(text, safeoffset, UNORM_NFD, 0,
851 buffer, INITIAL_ARRAY_SIZE_,
852 status);
853 if (U_FAILURE(*status)) {
854 return FALSE;
855 }
856 if (size >= INITIAL_ARRAY_SIZE_) {
857 norm = (UChar *)allocateMemory((size + 1) * sizeof(UChar),
858 status);
859 // if allocation failed, status will be set to
860 // U_MEMORY_ALLOCATION_ERROR and unorm_normalize internally
861 // checks for it.
862 size = unorm_normalize(text, safeoffset, UNORM_NFD, 0, norm,
863 size, status);
864 if (U_FAILURE(*status) && norm != NULL) {
865 uprv_free(norm);
866 return FALSE;
867 }
868 }
869 else {
870 norm = buffer;
871 }
872
873 UCollationElements *coleiter = strsrch->utilIter;
874 ucol_setText(coleiter, norm, size, status);
875 uint32_t firstce = strsrch->pattern.CE[0];
876 UBool ignorable = TRUE;
877 uint32_t ce = UCOL_IGNORABLE;
878 while (U_SUCCESS(*status) && ce != firstce && ce != (uint32_t)UCOL_NULLORDER) {
879 offset = ucol_getOffset(coleiter);
880 if (ce != firstce && ce != UCOL_IGNORABLE) {
881 ignorable = FALSE;
882 }
883 ce = ucol_next(coleiter, status);
884 }
885 UChar32 codepoint;
886 UTF_PREV_CHAR(norm, 0, offset, codepoint);
887 result = !ignorable && (u_getCombiningClass(codepoint) != 0);
888
889 if (norm != buffer) {
890 uprv_free(norm);
891 }
892 }
893 }
894
895 return result;
896 }
897
898 /**
899 * Used by exact matches, checks if there are accents before the match.
900 * This is really painful... we have to check that composite characters at
901 * the start of the matches have to not have any extra accents.
902 * We check the FCD of the character first, if it starts with an accent and
903 * the first pattern ce does not match the first ce of the character, we bail.
904 * Otherwise we try normalizing the first composite
905 * character and find the immediate decomposed character before the match to
906 * see if it is an non-ignorable accent.
907 * Now normalizing the first composite character is enough because we ensure
908 * that when the match is passed in here with extra beginning ces, the
909 * first or last ce that match has to occur within the first character.
910 * E.g. looking for \u0301 acute in \u01FA A ring above and acute,
911 * checkExtraMatchAccent should fail since there is a middle ring in \u01FA
912 * Note here that accents checking are slow and cautioned in the API docs.
913 * @param strsrch string search data
914 * @param start offset
915 * @param end offset
916 * @return TRUE if there are accents on either side of the match,
917 * FALSE otherwise
918 */
919 static
hasAccentsBeforeMatch(const UStringSearch * strsrch,int32_t start,int32_t end)920 UBool hasAccentsBeforeMatch(const UStringSearch *strsrch, int32_t start,
921 int32_t end)
922 {
923 if (strsrch->pattern.hasPrefixAccents) {
924 UCollationElements *coleiter = strsrch->textIter;
925 UErrorCode status = U_ZERO_ERROR;
926 // we have been iterating forwards previously
927 uint32_t ignorable = TRUE;
928 int32_t firstce = strsrch->pattern.CE[0];
929
930 setColEIterOffset(coleiter, start);
931 int32_t ce = getCE(strsrch, ucol_next(coleiter, &status));
932 if (U_FAILURE(status)) {
933 return TRUE;
934 }
935 while (ce != firstce) {
936 if (ce != UCOL_IGNORABLE) {
937 ignorable = FALSE;
938 }
939 ce = getCE(strsrch, ucol_next(coleiter, &status));
940 if (U_FAILURE(status) || ce == UCOL_NULLORDER) {
941 return TRUE;
942 }
943 }
944 if (!ignorable && inNormBuf(coleiter)) {
945 // within normalization buffer, discontiguous handled here
946 return TRUE;
947 }
948
949 // within text
950 int32_t temp = start;
951 // original code
952 // accent = (getFCD(strsrch->search->text, &temp,
953 // strsrch->search->textLength)
954 // >> SECOND_LAST_BYTE_SHIFT_);
955 // however this code does not work well with VC7 .net in release mode.
956 // maybe the inlines for getFCD combined with shifting has bugs in
957 // VC7. anyways this is a work around.
958 UBool accent = getFCD(strsrch->search->text, &temp,
959 strsrch->search->textLength) > 0xFF;
960 if (!accent) {
961 return checkExtraMatchAccents(strsrch, start, end, &status);
962 }
963 if (!ignorable) {
964 return TRUE;
965 }
966 if (start > 0) {
967 temp = start;
968 UTF_BACK_1(strsrch->search->text, 0, temp);
969 if (getFCD(strsrch->search->text, &temp,
970 strsrch->search->textLength) & LAST_BYTE_MASK_) {
971 setColEIterOffset(coleiter, start);
972 ce = ucol_previous(coleiter, &status);
973 if (U_FAILURE(status) ||
974 (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE)) {
975 return TRUE;
976 }
977 }
978 }
979 }
980
981 return FALSE;
982 }
983
984 /**
985 * Used by exact matches, checks if there are accents bounding the match.
986 * Note this is the initial boundary check. If the potential match
987 * starts or ends with composite characters, the accents in those
988 * characters will be determined later.
989 * Not doing backwards iteration here, since discontiguos contraction for
990 * backwards collation element iterator, use up too many characters.
991 * E.g. looking for \u030A ring in \u01FA A ring above and acute,
992 * should fail since there is a acute at the end of \u01FA
993 * Note here that accents checking are slow and cautioned in the API docs.
994 * @param strsrch string search data
995 * @param start offset of match
996 * @param end end offset of the match
997 * @return TRUE if there are accents on either side of the match,
998 * FALSE otherwise
999 */
1000 static
hasAccentsAfterMatch(const UStringSearch * strsrch,int32_t start,int32_t end)1001 UBool hasAccentsAfterMatch(const UStringSearch *strsrch, int32_t start,
1002 int32_t end)
1003 {
1004 if (strsrch->pattern.hasSuffixAccents) {
1005 const UChar *text = strsrch->search->text;
1006 int32_t temp = end;
1007 int32_t textlength = strsrch->search->textLength;
1008 UTF_BACK_1(text, 0, temp);
1009 if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) {
1010 int32_t firstce = strsrch->pattern.CE[0];
1011 UCollationElements *coleiter = strsrch->textIter;
1012 UErrorCode status = U_ZERO_ERROR;
1013 int32_t ce;
1014 setColEIterOffset(coleiter, start);
1015 while ((ce = getCE(strsrch, ucol_next(coleiter, &status))) != firstce) {
1016 if (U_FAILURE(status) || ce == UCOL_NULLORDER) {
1017 return TRUE;
1018 }
1019 }
1020 int32_t count = 1;
1021 while (count < strsrch->pattern.CELength) {
1022 if (getCE(strsrch, ucol_next(coleiter, &status))
1023 == UCOL_IGNORABLE) {
1024 // Thai can give an ignorable here.
1025 count --;
1026 }
1027 if (U_FAILURE(status)) {
1028 return TRUE;
1029 }
1030 count ++;
1031 }
1032
1033 ce = ucol_next(coleiter, &status);
1034 if (U_FAILURE(status)) {
1035 return TRUE;
1036 }
1037 if (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE) {
1038 ce = getCE(strsrch, ce);
1039 }
1040 if (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE) {
1041 if (ucol_getOffset(coleiter) <= end) {
1042 return TRUE;
1043 }
1044 if (getFCD(text, &end, textlength) >> SECOND_LAST_BYTE_SHIFT_) {
1045 return TRUE;
1046 }
1047 }
1048 }
1049 }
1050 return FALSE;
1051 }
1052 #endif // #if BOYER_MOORE
1053
1054 /**
1055 * Checks if the offset runs out of the text string
1056 * @param offset
1057 * @param textlength of the text string
1058 * @return TRUE if offset is out of bounds, FALSE otherwise
1059 */
1060 static
isOutOfBounds(int32_t textlength,int32_t offset)1061 inline UBool isOutOfBounds(int32_t textlength, int32_t offset)
1062 {
1063 return offset < 0 || offset > textlength;
1064 }
1065
1066 /**
1067 * Checks for identical match
1068 * @param strsrch string search data
1069 * @param start offset of possible match
1070 * @param end offset of possible match
1071 * @return TRUE if identical match is found
1072 */
1073 static
checkIdentical(const UStringSearch * strsrch,int32_t start,int32_t end)1074 inline UBool checkIdentical(const UStringSearch *strsrch, int32_t start,
1075 int32_t end)
1076 {
1077 UChar t2[32], p2[32];
1078 int32_t length = end - start;
1079 if (strsrch->strength != UCOL_IDENTICAL) {
1080 return TRUE;
1081 }
1082
1083 UErrorCode status = U_ZERO_ERROR, status2 = U_ZERO_ERROR;
1084 int32_t decomplength = unorm_decompose(t2, LENGTHOF(t2),
1085 strsrch->search->text + start, length,
1086 FALSE, 0, &status);
1087 // use separate status2 in case of buffer overflow
1088 if (decomplength != unorm_decompose(p2, LENGTHOF(p2),
1089 strsrch->pattern.text,
1090 strsrch->pattern.textLength,
1091 FALSE, 0, &status2)) {
1092 return FALSE; // lengths are different
1093 }
1094
1095 // compare contents
1096 UChar *text, *pattern;
1097 if(U_SUCCESS(status)) {
1098 text = t2;
1099 pattern = p2;
1100 } else if(status==U_BUFFER_OVERFLOW_ERROR) {
1101 status = U_ZERO_ERROR;
1102 // allocate one buffer for both decompositions
1103 text = (UChar *)uprv_malloc(decomplength * 2 * U_SIZEOF_UCHAR);
1104 // Check for allocation failure.
1105 if (text == NULL) {
1106 return FALSE;
1107 }
1108 pattern = text + decomplength;
1109 unorm_decompose(text, decomplength, strsrch->search->text + start,
1110 length, FALSE, 0, &status);
1111 unorm_decompose(pattern, decomplength, strsrch->pattern.text,
1112 strsrch->pattern.textLength, FALSE, 0, &status);
1113 } else {
1114 // NFD failed, make sure that u_memcmp() does not overrun t2 & p2
1115 // and that we don't uprv_free() an undefined text pointer
1116 text = pattern = t2;
1117 decomplength = 0;
1118 }
1119 UBool result = (UBool)(u_memcmp(pattern, text, decomplength) == 0);
1120 if(text != t2) {
1121 uprv_free(text);
1122 }
1123 // return FALSE if NFD failed
1124 return U_SUCCESS(status) && result;
1125 }
1126
1127 #if BOYER_MOORE
1128 /**
1129 * Checks to see if the match is repeated
1130 * @param strsrch string search data
1131 * @param start new match start index
1132 * @param end new match end index
1133 * @return TRUE if the the match is repeated, FALSE otherwise
1134 */
1135 static
checkRepeatedMatch(UStringSearch * strsrch,int32_t start,int32_t end)1136 inline UBool checkRepeatedMatch(UStringSearch *strsrch,
1137 int32_t start,
1138 int32_t end)
1139 {
1140 int32_t lastmatchindex = strsrch->search->matchedIndex;
1141 UBool result;
1142 if (lastmatchindex == USEARCH_DONE) {
1143 return FALSE;
1144 }
1145 if (strsrch->search->isForwardSearching) {
1146 result = start <= lastmatchindex;
1147 }
1148 else {
1149 result = start >= lastmatchindex;
1150 }
1151 if (!result && !strsrch->search->isOverlap) {
1152 if (strsrch->search->isForwardSearching) {
1153 result = start < lastmatchindex + strsrch->search->matchedLength;
1154 }
1155 else {
1156 result = end > lastmatchindex;
1157 }
1158 }
1159 return result;
1160 }
1161
1162 /**
1163 * Gets the collation element iterator's current offset.
1164 * @param coleiter collation element iterator
1165 * @param forwards flag TRUE if we are moving in th forwards direction
1166 * @return current offset
1167 */
1168 static
getColElemIterOffset(const UCollationElements * coleiter,UBool forwards)1169 inline int32_t getColElemIterOffset(const UCollationElements *coleiter,
1170 UBool forwards)
1171 {
1172 int32_t result = ucol_getOffset(coleiter);
1173 // intricacies of the the backwards collation element iterator
1174 if (FALSE && !forwards && inNormBuf(coleiter) && !isFCDPointerNull(coleiter)) {
1175 result ++;
1176 }
1177 return result;
1178 }
1179
1180 /**
1181 * Checks match for contraction.
1182 * If the match ends with a partial contraction we fail.
1183 * If the match starts too far off (because of backwards iteration) we try to
1184 * chip off the extra characters depending on whether a breakiterator has
1185 * been used.
1186 * Internal method, error assumed to be success, caller has to check status
1187 * before calling this method.
1188 * @param strsrch string search data
1189 * @param start offset of potential match, to be modified if necessary
1190 * @param end offset of potential match, to be modified if necessary
1191 * @param status output error status if any
1192 * @return TRUE if match passes the contraction test, FALSE otherwise
1193 */
1194
1195 static
checkNextExactContractionMatch(UStringSearch * strsrch,int32_t * start,int32_t * end,UErrorCode * status)1196 UBool checkNextExactContractionMatch(UStringSearch *strsrch,
1197 int32_t *start,
1198 int32_t *end, UErrorCode *status)
1199 {
1200 UCollationElements *coleiter = strsrch->textIter;
1201 int32_t textlength = strsrch->search->textLength;
1202 int32_t temp = *start;
1203 const UCollator *collator = strsrch->collator;
1204 const UChar *text = strsrch->search->text;
1205 // This part checks if either ends of the match contains potential
1206 // contraction. If so we'll have to iterate through them
1207 // The start contraction needs to be checked since ucol_previous dumps
1208 // all characters till the first safe character into the buffer.
1209 // *start + 1 is used to test for the unsafe characters instead of *start
1210 // because ucol_prev takes all unsafe characters till the first safe
1211 // character ie *start. so by testing *start + 1, we can estimate if
1212 // excess prefix characters has been included in the potential search
1213 // results.
1214 if ((*end < textlength && ucol_unsafeCP(text[*end], collator)) ||
1215 (*start + 1 < textlength
1216 && ucol_unsafeCP(text[*start + 1], collator))) {
1217 int32_t expansion = getExpansionPrefix(coleiter);
1218 UBool expandflag = expansion > 0;
1219 setColEIterOffset(coleiter, *start);
1220 while (expansion > 0) {
1221 // getting rid of the redundant ce, caused by setOffset.
1222 // since backward contraction/expansion may have extra ces if we
1223 // are in the normalization buffer, hasAccentsBeforeMatch would
1224 // have taken care of it.
1225 // E.g. the character \u01FA will have an expansion of 3, but if
1226 // we are only looking for acute and ring \u030A and \u0301, we'll
1227 // have to skip the first ce in the expansion buffer.
1228 ucol_next(coleiter, status);
1229 if (U_FAILURE(*status)) {
1230 return FALSE;
1231 }
1232 if (ucol_getOffset(coleiter) != temp) {
1233 *start = temp;
1234 temp = ucol_getOffset(coleiter);
1235 }
1236 expansion --;
1237 }
1238
1239 int32_t *patternce = strsrch->pattern.CE;
1240 int32_t patterncelength = strsrch->pattern.CELength;
1241 int32_t count = 0;
1242 while (count < patterncelength) {
1243 int32_t ce = getCE(strsrch, ucol_next(coleiter, status));
1244 if (ce == UCOL_IGNORABLE) {
1245 continue;
1246 }
1247 if (expandflag && count == 0 && ucol_getOffset(coleiter) != temp) {
1248 *start = temp;
1249 temp = ucol_getOffset(coleiter);
1250 }
1251 if (U_FAILURE(*status) || ce != patternce[count]) {
1252 (*end) ++;
1253 *end = getNextUStringSearchBaseOffset(strsrch, *end);
1254 return FALSE;
1255 }
1256 count ++;
1257 }
1258 }
1259 return TRUE;
1260 }
1261
1262 /**
1263 * Checks and sets the match information if found.
1264 * Checks
1265 * <ul>
1266 * <li> the potential match does not repeat the previous match
1267 * <li> boundaries are correct
1268 * <li> exact matches has no extra accents
1269 * <li> identical matchesb
1270 * <li> potential match does not end in the middle of a contraction
1271 * <\ul>
1272 * Otherwise the offset will be shifted to the next character.
1273 * Internal method, status assumed to be success, caller has to check status
1274 * before calling this method.
1275 * @param strsrch string search data
1276 * @param textoffset offset in the collation element text. the returned value
1277 * will be the truncated end offset of the match or the new start
1278 * search offset.
1279 * @param status output error status if any
1280 * @return TRUE if the match is valid, FALSE otherwise
1281 */
1282 static
checkNextExactMatch(UStringSearch * strsrch,int32_t * textoffset,UErrorCode * status)1283 inline UBool checkNextExactMatch(UStringSearch *strsrch,
1284 int32_t *textoffset, UErrorCode *status)
1285 {
1286 UCollationElements *coleiter = strsrch->textIter;
1287 int32_t start = getColElemIterOffset(coleiter, FALSE);
1288
1289 if (!checkNextExactContractionMatch(strsrch, &start, textoffset, status)) {
1290 return FALSE;
1291 }
1292
1293 // this totally matches, however we need to check if it is repeating
1294 if (!isBreakUnit(strsrch, start, *textoffset) ||
1295 checkRepeatedMatch(strsrch, start, *textoffset) ||
1296 hasAccentsBeforeMatch(strsrch, start, *textoffset) ||
1297 !checkIdentical(strsrch, start, *textoffset) ||
1298 hasAccentsAfterMatch(strsrch, start, *textoffset)) {
1299
1300 (*textoffset) ++;
1301 *textoffset = getNextUStringSearchBaseOffset(strsrch, *textoffset);
1302 return FALSE;
1303 }
1304
1305 //Add breakiterator boundary check for primary strength search.
1306 if (!strsrch->search->breakIter && strsrch->strength == UCOL_PRIMARY) {
1307 checkBreakBoundary(strsrch, &start, textoffset);
1308 }
1309
1310 // totally match, we will get rid of the ending ignorables.
1311 strsrch->search->matchedIndex = start;
1312 strsrch->search->matchedLength = *textoffset - start;
1313 return TRUE;
1314 }
1315
1316 /**
1317 * Getting the previous base character offset, or the current offset if the
1318 * current character is a base character
1319 * @param text string
1320 * @param textoffset one offset after the current character
1321 * @return the offset of the next character after the base character or the first
1322 * composed character with accents
1323 */
1324 static
getPreviousBaseOffset(const UChar * text,int32_t textoffset)1325 inline int32_t getPreviousBaseOffset(const UChar *text,
1326 int32_t textoffset)
1327 {
1328 if (textoffset > 0) {
1329 for (;;) {
1330 int32_t result = textoffset;
1331 UTF_BACK_1(text, 0, textoffset);
1332 int32_t temp = textoffset;
1333 uint16_t fcd = getFCD(text, &temp, result);
1334 if ((fcd >> SECOND_LAST_BYTE_SHIFT_) == 0) {
1335 if (fcd & LAST_BYTE_MASK_) {
1336 return textoffset;
1337 }
1338 return result;
1339 }
1340 if (textoffset == 0) {
1341 return 0;
1342 }
1343 }
1344 }
1345 return textoffset;
1346 }
1347
1348 /**
1349 * Getting the indexes of the accents that are not blocked in the argument
1350 * accent array
1351 * @param accents array of accents in nfd terminated by a 0.
1352 * @param accentsindex array of indexes of the accents that are not blocked
1353 */
1354 static
getUnblockedAccentIndex(UChar * accents,int32_t * accentsindex)1355 inline int getUnblockedAccentIndex(UChar *accents, int32_t *accentsindex)
1356 {
1357 int32_t index = 0;
1358 int32_t length = u_strlen(accents);
1359 UChar32 codepoint = 0;
1360 int cclass = 0;
1361 int result = 0;
1362 int32_t temp;
1363 while (index < length) {
1364 temp = index;
1365 UTF_NEXT_CHAR(accents, index, length, codepoint);
1366 if (u_getCombiningClass(codepoint) != cclass) {
1367 cclass = u_getCombiningClass(codepoint);
1368 accentsindex[result] = temp;
1369 result ++;
1370 }
1371 }
1372 accentsindex[result] = length;
1373 return result;
1374 }
1375
1376 /**
1377 * Appends 3 UChar arrays to a destination array.
1378 * Creates a new array if we run out of space. The caller will have to
1379 * manually deallocate the newly allocated array.
1380 * Internal method, status assumed to be success, caller has to check status
1381 * before calling this method. destination not to be NULL and has at least
1382 * size destinationlength.
1383 * @param destination target array
1384 * @param destinationlength target array size, returning the appended length
1385 * @param source1 null-terminated first array
1386 * @param source2 second array
1387 * @param source2length length of seond array
1388 * @param source3 null-terminated third array
1389 * @param status error status if any
1390 * @return new destination array, destination if there was no new allocation
1391 */
1392 static
addToUCharArray(UChar * destination,int32_t * destinationlength,const UChar * source1,const UChar * source2,int32_t source2length,const UChar * source3,UErrorCode * status)1393 inline UChar * addToUCharArray( UChar *destination,
1394 int32_t *destinationlength,
1395 const UChar *source1,
1396 const UChar *source2,
1397 int32_t source2length,
1398 const UChar *source3,
1399 UErrorCode *status)
1400 {
1401 int32_t source1length = source1 ? u_strlen(source1) : 0;
1402 int32_t source3length = source3 ? u_strlen(source3) : 0;
1403 if (*destinationlength < source1length + source2length + source3length +
1404 1)
1405 {
1406 destination = (UChar *)allocateMemory(
1407 (source1length + source2length + source3length + 1) * sizeof(UChar),
1408 status);
1409 // if error allocating memory, status will be
1410 // U_MEMORY_ALLOCATION_ERROR
1411 if (U_FAILURE(*status)) {
1412 *destinationlength = 0;
1413 return NULL;
1414 }
1415 }
1416 if (source1length != 0) {
1417 uprv_memcpy(destination, source1, sizeof(UChar) * source1length);
1418 }
1419 if (source2length != 0) {
1420 uprv_memcpy(destination + source1length, source2,
1421 sizeof(UChar) * source2length);
1422 }
1423 if (source3length != 0) {
1424 uprv_memcpy(destination + source1length + source2length, source3,
1425 sizeof(UChar) * source3length);
1426 }
1427 *destinationlength = source1length + source2length + source3length;
1428 return destination;
1429 }
1430
1431 /**
1432 * Running through a collation element iterator to see if the contents matches
1433 * pattern in string search data
1434 * @param strsrch string search data
1435 * @param coleiter collation element iterator
1436 * @return TRUE if a match if found, FALSE otherwise
1437 */
1438 static
checkCollationMatch(const UStringSearch * strsrch,UCollationElements * coleiter)1439 inline UBool checkCollationMatch(const UStringSearch *strsrch,
1440 UCollationElements *coleiter)
1441 {
1442 int patternceindex = strsrch->pattern.CELength;
1443 int32_t *patternce = strsrch->pattern.CE;
1444 UErrorCode status = U_ZERO_ERROR;
1445 while (patternceindex > 0) {
1446 int32_t ce = getCE(strsrch, ucol_next(coleiter, &status));
1447 if (ce == UCOL_IGNORABLE) {
1448 continue;
1449 }
1450 if (U_FAILURE(status) || ce != *patternce) {
1451 return FALSE;
1452 }
1453 patternce ++;
1454 patternceindex --;
1455 }
1456 return TRUE;
1457 }
1458
1459 /**
1460 * Rearranges the front accents to try matching.
1461 * Prefix accents in the text will be grouped according to their combining
1462 * class and the groups will be mixed and matched to try find the perfect
1463 * match with the pattern.
1464 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
1465 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
1466 * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
1467 * "\u0301\u0325".
1468 * step 2: check if any of the generated substrings matches the pattern.
1469 * Internal method, status is assumed to be success, caller has to check status
1470 * before calling this method.
1471 * @param strsrch string search match
1472 * @param start first offset of the accents to start searching
1473 * @param end start of the last accent set
1474 * @param status output error status if any
1475 * @return USEARCH_DONE if a match is not found, otherwise return the starting
1476 * offset of the match. Note this start includes all preceding accents.
1477 */
1478 static
doNextCanonicalPrefixMatch(UStringSearch * strsrch,int32_t start,int32_t end,UErrorCode * status)1479 int32_t doNextCanonicalPrefixMatch(UStringSearch *strsrch,
1480 int32_t start,
1481 int32_t end,
1482 UErrorCode *status)
1483 {
1484 const UChar *text = strsrch->search->text;
1485 int32_t textlength = strsrch->search->textLength;
1486 int32_t tempstart = start;
1487
1488 if ((getFCD(text, &tempstart, textlength) & LAST_BYTE_MASK_) == 0) {
1489 // die... failed at a base character
1490 return USEARCH_DONE;
1491 }
1492
1493 int32_t offset = getNextBaseOffset(text, tempstart, textlength);
1494 start = getPreviousBaseOffset(text, tempstart);
1495
1496 UChar accents[INITIAL_ARRAY_SIZE_];
1497 // normalizing the offensive string
1498 unorm_normalize(text + start, offset - start, UNORM_NFD, 0, accents,
1499 INITIAL_ARRAY_SIZE_, status);
1500 if (U_FAILURE(*status)) {
1501 return USEARCH_DONE;
1502 }
1503
1504 int32_t accentsindex[INITIAL_ARRAY_SIZE_];
1505 int32_t accentsize = getUnblockedAccentIndex(accents,
1506 accentsindex);
1507 int32_t count = (2 << (accentsize - 1)) - 1;
1508 UChar buffer[INITIAL_ARRAY_SIZE_];
1509 UCollationElements *coleiter = strsrch->utilIter;
1510 while (U_SUCCESS(*status) && count > 0) {
1511 UChar *rearrange = strsrch->canonicalPrefixAccents;
1512 // copy the base characters
1513 for (int k = 0; k < accentsindex[0]; k ++) {
1514 *rearrange ++ = accents[k];
1515 }
1516 // forming all possible canonical rearrangement by dropping
1517 // sets of accents
1518 for (int i = 0; i <= accentsize - 1; i ++) {
1519 int32_t mask = 1 << (accentsize - i - 1);
1520 if (count & mask) {
1521 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
1522 *rearrange ++ = accents[j];
1523 }
1524 }
1525 }
1526 *rearrange = 0;
1527 int32_t matchsize = INITIAL_ARRAY_SIZE_;
1528 UChar *match = addToUCharArray(buffer, &matchsize,
1529 strsrch->canonicalPrefixAccents,
1530 strsrch->search->text + offset,
1531 end - offset,
1532 strsrch->canonicalSuffixAccents,
1533 status);
1534
1535 // if status is a failure, ucol_setText does nothing.
1536 // run the collator iterator through this match
1537 ucol_setText(coleiter, match, matchsize, status);
1538 if (U_SUCCESS(*status)) {
1539 if (checkCollationMatch(strsrch, coleiter)) {
1540 if (match != buffer) {
1541 uprv_free(match);
1542 }
1543 return start;
1544 }
1545 }
1546 count --;
1547 }
1548 return USEARCH_DONE;
1549 }
1550
1551 /**
1552 * Gets the offset to the safe point in text before textoffset.
1553 * ie. not the middle of a contraction, swappable characters or supplementary
1554 * characters.
1555 * @param collator collation sata
1556 * @param text string to work with
1557 * @param textoffset offset in string
1558 * @param textlength length of text string
1559 * @return offset to the previous safe character
1560 */
1561 static
getPreviousSafeOffset(const UCollator * collator,const UChar * text,int32_t textoffset)1562 inline uint32_t getPreviousSafeOffset(const UCollator *collator,
1563 const UChar *text,
1564 int32_t textoffset)
1565 {
1566 int32_t result = textoffset; // first contraction character
1567 while (result != 0 && ucol_unsafeCP(text[result - 1], collator)) {
1568 result --;
1569 }
1570 if (result != 0) {
1571 // the first contraction character is consider unsafe here
1572 result --;
1573 }
1574 return result;
1575 }
1576
1577 /**
1578 * Cleaning up after we passed the safe zone
1579 * @param strsrch string search data
1580 * @param safetext safe text array
1581 * @param safebuffer safe text buffer
1582 * @param coleiter collation element iterator for safe text
1583 */
1584 static
cleanUpSafeText(const UStringSearch * strsrch,UChar * safetext,UChar * safebuffer)1585 inline void cleanUpSafeText(const UStringSearch *strsrch, UChar *safetext,
1586 UChar *safebuffer)
1587 {
1588 if (safetext != safebuffer && safetext != strsrch->canonicalSuffixAccents)
1589 {
1590 uprv_free(safetext);
1591 }
1592 }
1593
1594 /**
1595 * Take the rearranged end accents and tries matching. If match failed at
1596 * a seperate preceding set of accents (seperated from the rearranged on by
1597 * at least a base character) then we rearrange the preceding accents and
1598 * tries matching again.
1599 * We allow skipping of the ends of the accent set if the ces do not match.
1600 * However if the failure is found before the accent set, it fails.
1601 * Internal method, status assumed to be success, caller has to check status
1602 * before calling this method.
1603 * @param strsrch string search data
1604 * @param textoffset of the start of the rearranged accent
1605 * @param status output error status if any
1606 * @return USEARCH_DONE if a match is not found, otherwise return the starting
1607 * offset of the match. Note this start includes all preceding accents.
1608 */
1609 static
doNextCanonicalSuffixMatch(UStringSearch * strsrch,int32_t textoffset,UErrorCode * status)1610 int32_t doNextCanonicalSuffixMatch(UStringSearch *strsrch,
1611 int32_t textoffset,
1612 UErrorCode *status)
1613 {
1614 const UChar *text = strsrch->search->text;
1615 const UCollator *collator = strsrch->collator;
1616 int32_t safelength = 0;
1617 UChar *safetext;
1618 int32_t safetextlength;
1619 UChar safebuffer[INITIAL_ARRAY_SIZE_];
1620 UCollationElements *coleiter = strsrch->utilIter;
1621 int32_t safeoffset = textoffset;
1622
1623 if (textoffset != 0 && ucol_unsafeCP(strsrch->canonicalSuffixAccents[0],
1624 collator)) {
1625 safeoffset = getPreviousSafeOffset(collator, text, textoffset);
1626 safelength = textoffset - safeoffset;
1627 safetextlength = INITIAL_ARRAY_SIZE_;
1628 safetext = addToUCharArray(safebuffer, &safetextlength, NULL,
1629 text + safeoffset, safelength,
1630 strsrch->canonicalSuffixAccents,
1631 status);
1632 }
1633 else {
1634 safetextlength = u_strlen(strsrch->canonicalSuffixAccents);
1635 safetext = strsrch->canonicalSuffixAccents;
1636 }
1637
1638 // if status is a failure, ucol_setText does nothing
1639 ucol_setText(coleiter, safetext, safetextlength, status);
1640 // status checked in loop below
1641
1642 int32_t *ce = strsrch->pattern.CE;
1643 int32_t celength = strsrch->pattern.CELength;
1644 int ceindex = celength - 1;
1645 UBool isSafe = TRUE; // indication flag for position in safe zone
1646
1647 while (ceindex >= 0) {
1648 int32_t textce = ucol_previous(coleiter, status);
1649 if (U_FAILURE(*status)) {
1650 if (isSafe) {
1651 cleanUpSafeText(strsrch, safetext, safebuffer);
1652 }
1653 return USEARCH_DONE;
1654 }
1655 if (textce == UCOL_NULLORDER) {
1656 // check if we have passed the safe buffer
1657 if (coleiter == strsrch->textIter) {
1658 cleanUpSafeText(strsrch, safetext, safebuffer);
1659 return USEARCH_DONE;
1660 }
1661 cleanUpSafeText(strsrch, safetext, safebuffer);
1662 safetext = safebuffer;
1663 coleiter = strsrch->textIter;
1664 setColEIterOffset(coleiter, safeoffset);
1665 // status checked at the start of the loop
1666 isSafe = FALSE;
1667 continue;
1668 }
1669 textce = getCE(strsrch, textce);
1670 if (textce != UCOL_IGNORABLE && textce != ce[ceindex]) {
1671 // do the beginning stuff
1672 int32_t failedoffset = getColElemIterOffset(coleiter, FALSE);
1673 if (isSafe && failedoffset >= safelength) {
1674 // alas... no hope. failed at rearranged accent set
1675 cleanUpSafeText(strsrch, safetext, safebuffer);
1676 return USEARCH_DONE;
1677 }
1678 else {
1679 if (isSafe) {
1680 failedoffset += safeoffset;
1681 cleanUpSafeText(strsrch, safetext, safebuffer);
1682 }
1683
1684 // try rearranging the front accents
1685 int32_t result = doNextCanonicalPrefixMatch(strsrch,
1686 failedoffset, textoffset, status);
1687 if (result != USEARCH_DONE) {
1688 // if status is a failure, ucol_setOffset does nothing
1689 setColEIterOffset(strsrch->textIter, result);
1690 }
1691 if (U_FAILURE(*status)) {
1692 return USEARCH_DONE;
1693 }
1694 return result;
1695 }
1696 }
1697 if (textce == ce[ceindex]) {
1698 ceindex --;
1699 }
1700 }
1701 // set offset here
1702 if (isSafe) {
1703 int32_t result = getColElemIterOffset(coleiter, FALSE);
1704 // sets the text iterator here with the correct expansion and offset
1705 int32_t leftoverces = getExpansionPrefix(coleiter);
1706 cleanUpSafeText(strsrch, safetext, safebuffer);
1707 if (result >= safelength) {
1708 result = textoffset;
1709 }
1710 else {
1711 result += safeoffset;
1712 }
1713 setColEIterOffset(strsrch->textIter, result);
1714 strsrch->textIter->iteratordata_.toReturn =
1715 setExpansionPrefix(strsrch->textIter, leftoverces);
1716 return result;
1717 }
1718
1719 return ucol_getOffset(coleiter);
1720 }
1721
1722 /**
1723 * Trying out the substring and sees if it can be a canonical match.
1724 * This will try normalizing the end accents and arranging them into canonical
1725 * equivalents and check their corresponding ces with the pattern ce.
1726 * Suffix accents in the text will be grouped according to their combining
1727 * class and the groups will be mixed and matched to try find the perfect
1728 * match with the pattern.
1729 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
1730 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
1731 * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
1732 * "\u0301\u0325".
1733 * step 2: check if any of the generated substrings matches the pattern.
1734 * Internal method, status assumed to be success, caller has to check status
1735 * before calling this method.
1736 * @param strsrch string search data
1737 * @param textoffset end offset in the collation element text that ends with
1738 * the accents to be rearranged
1739 * @param status error status if any
1740 * @return TRUE if the match is valid, FALSE otherwise
1741 */
1742 static
doNextCanonicalMatch(UStringSearch * strsrch,int32_t textoffset,UErrorCode * status)1743 UBool doNextCanonicalMatch(UStringSearch *strsrch,
1744 int32_t textoffset,
1745 UErrorCode *status)
1746 {
1747 const UChar *text = strsrch->search->text;
1748 int32_t temp = textoffset;
1749 UTF_BACK_1(text, 0, temp);
1750 if ((getFCD(text, &temp, textoffset) & LAST_BYTE_MASK_) == 0) {
1751 UCollationElements *coleiter = strsrch->textIter;
1752 int32_t offset = getColElemIterOffset(coleiter, FALSE);
1753 if (strsrch->pattern.hasPrefixAccents) {
1754 offset = doNextCanonicalPrefixMatch(strsrch, offset, textoffset,
1755 status);
1756 if (U_SUCCESS(*status) && offset != USEARCH_DONE) {
1757 setColEIterOffset(coleiter, offset);
1758 return TRUE;
1759 }
1760 }
1761 return FALSE;
1762 }
1763
1764 if (!strsrch->pattern.hasSuffixAccents) {
1765 return FALSE;
1766 }
1767
1768 UChar accents[INITIAL_ARRAY_SIZE_];
1769 // offset to the last base character in substring to search
1770 int32_t baseoffset = getPreviousBaseOffset(text, textoffset);
1771 // normalizing the offensive string
1772 unorm_normalize(text + baseoffset, textoffset - baseoffset, UNORM_NFD,
1773 0, accents, INITIAL_ARRAY_SIZE_, status);
1774 // status checked in loop below
1775
1776 int32_t accentsindex[INITIAL_ARRAY_SIZE_];
1777 int32_t size = getUnblockedAccentIndex(accents, accentsindex);
1778
1779 // 2 power n - 1 plus the full set of accents
1780 int32_t count = (2 << (size - 1)) - 1;
1781 while (U_SUCCESS(*status) && count > 0) {
1782 UChar *rearrange = strsrch->canonicalSuffixAccents;
1783 // copy the base characters
1784 for (int k = 0; k < accentsindex[0]; k ++) {
1785 *rearrange ++ = accents[k];
1786 }
1787 // forming all possible canonical rearrangement by dropping
1788 // sets of accents
1789 for (int i = 0; i <= size - 1; i ++) {
1790 int32_t mask = 1 << (size - i - 1);
1791 if (count & mask) {
1792 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
1793 *rearrange ++ = accents[j];
1794 }
1795 }
1796 }
1797 *rearrange = 0;
1798 int32_t offset = doNextCanonicalSuffixMatch(strsrch, baseoffset,
1799 status);
1800 if (offset != USEARCH_DONE) {
1801 return TRUE; // match found
1802 }
1803 count --;
1804 }
1805 return FALSE;
1806 }
1807
1808 /**
1809 * Gets the previous base character offset depending on the string search
1810 * pattern data
1811 * @param strsrch string search data
1812 * @param textoffset current offset, current character
1813 * @return the offset of the next character after this base character or itself
1814 * if it is a composed character with accents
1815 */
1816 static
getPreviousUStringSearchBaseOffset(UStringSearch * strsrch,int32_t textoffset)1817 inline int32_t getPreviousUStringSearchBaseOffset(UStringSearch *strsrch,
1818 int32_t textoffset)
1819 {
1820 if (strsrch->pattern.hasPrefixAccents && textoffset > 0) {
1821 const UChar *text = strsrch->search->text;
1822 int32_t offset = textoffset;
1823 if (getFCD(text, &offset, strsrch->search->textLength) >>
1824 SECOND_LAST_BYTE_SHIFT_) {
1825 return getPreviousBaseOffset(text, textoffset);
1826 }
1827 }
1828 return textoffset;
1829 }
1830
1831 /**
1832 * Checks match for contraction.
1833 * If the match ends with a partial contraction we fail.
1834 * If the match starts too far off (because of backwards iteration) we try to
1835 * chip off the extra characters
1836 * Internal method, status assumed to be success, caller has to check status
1837 * before calling this method.
1838 * @param strsrch string search data
1839 * @param start offset of potential match, to be modified if necessary
1840 * @param end offset of potential match, to be modified if necessary
1841 * @param status output error status if any
1842 * @return TRUE if match passes the contraction test, FALSE otherwise
1843 */
1844 static
checkNextCanonicalContractionMatch(UStringSearch * strsrch,int32_t * start,int32_t * end,UErrorCode * status)1845 UBool checkNextCanonicalContractionMatch(UStringSearch *strsrch,
1846 int32_t *start,
1847 int32_t *end,
1848 UErrorCode *status)
1849 {
1850 UCollationElements *coleiter = strsrch->textIter;
1851 int32_t textlength = strsrch->search->textLength;
1852 int32_t temp = *start;
1853 const UCollator *collator = strsrch->collator;
1854 const UChar *text = strsrch->search->text;
1855 // This part checks if either ends of the match contains potential
1856 // contraction. If so we'll have to iterate through them
1857 if ((*end < textlength && ucol_unsafeCP(text[*end], collator)) ||
1858 (*start + 1 < textlength
1859 && ucol_unsafeCP(text[*start + 1], collator))) {
1860 int32_t expansion = getExpansionPrefix(coleiter);
1861 UBool expandflag = expansion > 0;
1862 setColEIterOffset(coleiter, *start);
1863 while (expansion > 0) {
1864 // getting rid of the redundant ce, caused by setOffset.
1865 // since backward contraction/expansion may have extra ces if we
1866 // are in the normalization buffer, hasAccentsBeforeMatch would
1867 // have taken care of it.
1868 // E.g. the character \u01FA will have an expansion of 3, but if
1869 // we are only looking for acute and ring \u030A and \u0301, we'll
1870 // have to skip the first ce in the expansion buffer.
1871 ucol_next(coleiter, status);
1872 if (U_FAILURE(*status)) {
1873 return FALSE;
1874 }
1875 if (ucol_getOffset(coleiter) != temp) {
1876 *start = temp;
1877 temp = ucol_getOffset(coleiter);
1878 }
1879 expansion --;
1880 }
1881
1882 int32_t *patternce = strsrch->pattern.CE;
1883 int32_t patterncelength = strsrch->pattern.CELength;
1884 int32_t count = 0;
1885 int32_t textlength = strsrch->search->textLength;
1886 while (count < patterncelength) {
1887 int32_t ce = getCE(strsrch, ucol_next(coleiter, status));
1888 // status checked below, note that if status is a failure
1889 // ucol_next returns UCOL_NULLORDER
1890 if (ce == UCOL_IGNORABLE) {
1891 continue;
1892 }
1893 if (expandflag && count == 0 && ucol_getOffset(coleiter) != temp) {
1894 *start = temp;
1895 temp = ucol_getOffset(coleiter);
1896 }
1897
1898 if (count == 0 && ce != patternce[0]) {
1899 // accents may have extra starting ces, this occurs when a
1900 // pure accent pattern is matched without rearrangement
1901 // text \u0325\u0300 and looking for \u0300
1902 int32_t expected = patternce[0];
1903 if (getFCD(text, start, textlength) & LAST_BYTE_MASK_) {
1904 ce = getCE(strsrch, ucol_next(coleiter, status));
1905 while (U_SUCCESS(*status) && ce != expected &&
1906 ce != UCOL_NULLORDER &&
1907 ucol_getOffset(coleiter) <= *end) {
1908 ce = getCE(strsrch, ucol_next(coleiter, status));
1909 }
1910 }
1911 }
1912 if (U_FAILURE(*status) || ce != patternce[count]) {
1913 (*end) ++;
1914 *end = getNextUStringSearchBaseOffset(strsrch, *end);
1915 return FALSE;
1916 }
1917 count ++;
1918 }
1919 }
1920 return TRUE;
1921 }
1922
1923 /**
1924 * Checks and sets the match information if found.
1925 * Checks
1926 * <ul>
1927 * <li> the potential match does not repeat the previous match
1928 * <li> boundaries are correct
1929 * <li> potential match does not end in the middle of a contraction
1930 * <li> identical matches
1931 * <\ul>
1932 * Otherwise the offset will be shifted to the next character.
1933 * Internal method, status assumed to be success, caller has to check the
1934 * status before calling this method.
1935 * @param strsrch string search data
1936 * @param textoffset offset in the collation element text. the returned value
1937 * will be the truncated end offset of the match or the new start
1938 * search offset.
1939 * @param status output error status if any
1940 * @return TRUE if the match is valid, FALSE otherwise
1941 */
1942 static
checkNextCanonicalMatch(UStringSearch * strsrch,int32_t * textoffset,UErrorCode * status)1943 inline UBool checkNextCanonicalMatch(UStringSearch *strsrch,
1944 int32_t *textoffset,
1945 UErrorCode *status)
1946 {
1947 // to ensure that the start and ends are not composite characters
1948 UCollationElements *coleiter = strsrch->textIter;
1949 // if we have a canonical accent match
1950 if ((strsrch->pattern.hasSuffixAccents &&
1951 strsrch->canonicalSuffixAccents[0]) ||
1952 (strsrch->pattern.hasPrefixAccents &&
1953 strsrch->canonicalPrefixAccents[0])) {
1954 strsrch->search->matchedIndex = getPreviousUStringSearchBaseOffset(
1955 strsrch,
1956 ucol_getOffset(coleiter));
1957 strsrch->search->matchedLength = *textoffset -
1958 strsrch->search->matchedIndex;
1959 return TRUE;
1960 }
1961
1962 int32_t start = getColElemIterOffset(coleiter, FALSE);
1963 if (!checkNextCanonicalContractionMatch(strsrch, &start, textoffset,
1964 status) || U_FAILURE(*status)) {
1965 return FALSE;
1966 }
1967
1968 start = getPreviousUStringSearchBaseOffset(strsrch, start);
1969 // this totally matches, however we need to check if it is repeating
1970 if (checkRepeatedMatch(strsrch, start, *textoffset) ||
1971 !isBreakUnit(strsrch, start, *textoffset) ||
1972 !checkIdentical(strsrch, start, *textoffset)) {
1973 (*textoffset) ++;
1974 *textoffset = getNextBaseOffset(strsrch->search->text, *textoffset,
1975 strsrch->search->textLength);
1976 return FALSE;
1977 }
1978
1979 strsrch->search->matchedIndex = start;
1980 strsrch->search->matchedLength = *textoffset - start;
1981 return TRUE;
1982 }
1983
1984 /**
1985 * Shifting the collation element iterator position forward to prepare for
1986 * a preceding match. If the first character is a unsafe character, we'll only
1987 * shift by 1 to capture contractions, normalization etc.
1988 * Internal method, status assumed to be success, caller has to check status
1989 * before calling this method.
1990 * @param text strsrch string search data
1991 * @param textoffset start text position to do search
1992 * @param ce the text ce which failed the match.
1993 * @param patternceindex index of the ce within the pattern ce buffer which
1994 * failed the match
1995 * @return final offset
1996 */
1997 static
reverseShift(UStringSearch * strsrch,int32_t textoffset,int32_t ce,int32_t patternceindex)1998 inline int32_t reverseShift(UStringSearch *strsrch,
1999 int32_t textoffset,
2000 int32_t ce,
2001 int32_t patternceindex)
2002 {
2003 if (strsrch->search->isOverlap) {
2004 if (textoffset != strsrch->search->textLength) {
2005 textoffset --;
2006 }
2007 else {
2008 textoffset -= strsrch->pattern.defaultShiftSize;
2009 }
2010 }
2011 else {
2012 if (ce != UCOL_NULLORDER) {
2013 int32_t shift = strsrch->pattern.backShift[hash(ce)];
2014
2015 // this is to adjust for characters in the middle of the substring
2016 // for matching that failed.
2017 int32_t adjust = patternceindex;
2018 if (adjust > 1 && shift > adjust) {
2019 shift -= adjust - 1;
2020 }
2021 textoffset -= shift;
2022 }
2023 else {
2024 textoffset -= strsrch->pattern.defaultShiftSize;
2025 }
2026 }
2027 textoffset = getPreviousUStringSearchBaseOffset(strsrch, textoffset);
2028 return textoffset;
2029 }
2030
2031 /**
2032 * Checks match for contraction.
2033 * If the match starts with a partial contraction we fail.
2034 * Internal method, status assumed to be success, caller has to check status
2035 * before calling this method.
2036 * @param strsrch string search data
2037 * @param start offset of potential match, to be modified if necessary
2038 * @param end offset of potential match, to be modified if necessary
2039 * @param status output error status if any
2040 * @return TRUE if match passes the contraction test, FALSE otherwise
2041 */
2042 static
checkPreviousExactContractionMatch(UStringSearch * strsrch,int32_t * start,int32_t * end,UErrorCode * status)2043 UBool checkPreviousExactContractionMatch(UStringSearch *strsrch,
2044 int32_t *start,
2045 int32_t *end, UErrorCode *status)
2046 {
2047 UCollationElements *coleiter = strsrch->textIter;
2048 int32_t textlength = strsrch->search->textLength;
2049 int32_t temp = *end;
2050 const UCollator *collator = strsrch->collator;
2051 const UChar *text = strsrch->search->text;
2052 // This part checks if either if the start of the match contains potential
2053 // contraction. If so we'll have to iterate through them
2054 // Since we used ucol_next while previously looking for the potential
2055 // match, this guarantees that our end will not be a partial contraction,
2056 // or a partial supplementary character.
2057 if (*start < textlength && ucol_unsafeCP(text[*start], collator)) {
2058 int32_t expansion = getExpansionSuffix(coleiter);
2059 UBool expandflag = expansion > 0;
2060 setColEIterOffset(coleiter, *end);
2061 while (U_SUCCESS(*status) && expansion > 0) {
2062 // getting rid of the redundant ce
2063 // since forward contraction/expansion may have extra ces
2064 // if we are in the normalization buffer, hasAccentsBeforeMatch
2065 // would have taken care of it.
2066 // E.g. the character \u01FA will have an expansion of 3, but if
2067 // we are only looking for A ring A\u030A, we'll have to skip the
2068 // last ce in the expansion buffer
2069 ucol_previous(coleiter, status);
2070 if (U_FAILURE(*status)) {
2071 return FALSE;
2072 }
2073 if (ucol_getOffset(coleiter) != temp) {
2074 *end = temp;
2075 temp = ucol_getOffset(coleiter);
2076 }
2077 expansion --;
2078 }
2079
2080 int32_t *patternce = strsrch->pattern.CE;
2081 int32_t patterncelength = strsrch->pattern.CELength;
2082 int32_t count = patterncelength;
2083 while (count > 0) {
2084 int32_t ce = getCE(strsrch, ucol_previous(coleiter, status));
2085 // status checked below, note that if status is a failure
2086 // ucol_previous returns UCOL_NULLORDER
2087 if (ce == UCOL_IGNORABLE) {
2088 continue;
2089 }
2090 if (expandflag && count == 0 &&
2091 getColElemIterOffset(coleiter, FALSE) != temp) {
2092 *end = temp;
2093 temp = ucol_getOffset(coleiter);
2094 }
2095 if (U_FAILURE(*status) || ce != patternce[count - 1]) {
2096 (*start) --;
2097 *start = getPreviousBaseOffset(text, *start);
2098 return FALSE;
2099 }
2100 count --;
2101 }
2102 }
2103 return TRUE;
2104 }
2105
2106 /**
2107 * Checks and sets the match information if found.
2108 * Checks
2109 * <ul>
2110 * <li> the current match does not repeat the last match
2111 * <li> boundaries are correct
2112 * <li> exact matches has no extra accents
2113 * <li> identical matches
2114 * <\ul>
2115 * Otherwise the offset will be shifted to the preceding character.
2116 * Internal method, status assumed to be success, caller has to check status
2117 * before calling this method.
2118 * @param strsrch string search data
2119 * @param collator
2120 * @param coleiter collation element iterator
2121 * @param text string
2122 * @param textoffset offset in the collation element text. the returned value
2123 * will be the truncated start offset of the match or the new start
2124 * search offset.
2125 * @param status output error status if any
2126 * @return TRUE if the match is valid, FALSE otherwise
2127 */
2128 static
checkPreviousExactMatch(UStringSearch * strsrch,int32_t * textoffset,UErrorCode * status)2129 inline UBool checkPreviousExactMatch(UStringSearch *strsrch,
2130 int32_t *textoffset,
2131 UErrorCode *status)
2132 {
2133 // to ensure that the start and ends are not composite characters
2134 int32_t end = ucol_getOffset(strsrch->textIter);
2135 if (!checkPreviousExactContractionMatch(strsrch, textoffset, &end, status)
2136 || U_FAILURE(*status)) {
2137 return FALSE;
2138 }
2139
2140 // this totally matches, however we need to check if it is repeating
2141 // the old match
2142 if (checkRepeatedMatch(strsrch, *textoffset, end) ||
2143 !isBreakUnit(strsrch, *textoffset, end) ||
2144 hasAccentsBeforeMatch(strsrch, *textoffset, end) ||
2145 !checkIdentical(strsrch, *textoffset, end) ||
2146 hasAccentsAfterMatch(strsrch, *textoffset, end)) {
2147 (*textoffset) --;
2148 *textoffset = getPreviousBaseOffset(strsrch->search->text,
2149 *textoffset);
2150 return FALSE;
2151 }
2152
2153 //Add breakiterator boundary check for primary strength search.
2154 if (!strsrch->search->breakIter && strsrch->strength == UCOL_PRIMARY) {
2155 checkBreakBoundary(strsrch, textoffset, &end);
2156 }
2157
2158 strsrch->search->matchedIndex = *textoffset;
2159 strsrch->search->matchedLength = end - *textoffset;
2160 return TRUE;
2161 }
2162
2163 /**
2164 * Rearranges the end accents to try matching.
2165 * Suffix accents in the text will be grouped according to their combining
2166 * class and the groups will be mixed and matched to try find the perfect
2167 * match with the pattern.
2168 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
2169 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
2170 * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
2171 * "\u0301\u0325".
2172 * step 2: check if any of the generated substrings matches the pattern.
2173 * Internal method, status assumed to be success, user has to check status
2174 * before calling this method.
2175 * @param strsrch string search match
2176 * @param start offset of the first base character
2177 * @param end start of the last accent set
2178 * @param status only error status if any
2179 * @return USEARCH_DONE if a match is not found, otherwise return the ending
2180 * offset of the match. Note this start includes all following accents.
2181 */
2182 static
doPreviousCanonicalSuffixMatch(UStringSearch * strsrch,int32_t start,int32_t end,UErrorCode * status)2183 int32_t doPreviousCanonicalSuffixMatch(UStringSearch *strsrch,
2184 int32_t start,
2185 int32_t end,
2186 UErrorCode *status)
2187 {
2188 const UChar *text = strsrch->search->text;
2189 int32_t tempend = end;
2190
2191 UTF_BACK_1(text, 0, tempend);
2192 if (!(getFCD(text, &tempend, strsrch->search->textLength) &
2193 LAST_BYTE_MASK_)) {
2194 // die... failed at a base character
2195 return USEARCH_DONE;
2196 }
2197 end = getNextBaseOffset(text, end, strsrch->search->textLength);
2198
2199 if (U_SUCCESS(*status)) {
2200 UChar accents[INITIAL_ARRAY_SIZE_];
2201 int32_t offset = getPreviousBaseOffset(text, end);
2202 // normalizing the offensive string
2203 unorm_normalize(text + offset, end - offset, UNORM_NFD, 0, accents,
2204 INITIAL_ARRAY_SIZE_, status);
2205
2206 int32_t accentsindex[INITIAL_ARRAY_SIZE_];
2207 int32_t accentsize = getUnblockedAccentIndex(accents,
2208 accentsindex);
2209 int32_t count = (2 << (accentsize - 1)) - 1;
2210 UChar buffer[INITIAL_ARRAY_SIZE_];
2211 UCollationElements *coleiter = strsrch->utilIter;
2212 while (U_SUCCESS(*status) && count > 0) {
2213 UChar *rearrange = strsrch->canonicalSuffixAccents;
2214 // copy the base characters
2215 for (int k = 0; k < accentsindex[0]; k ++) {
2216 *rearrange ++ = accents[k];
2217 }
2218 // forming all possible canonical rearrangement by dropping
2219 // sets of accents
2220 for (int i = 0; i <= accentsize - 1; i ++) {
2221 int32_t mask = 1 << (accentsize - i - 1);
2222 if (count & mask) {
2223 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
2224 *rearrange ++ = accents[j];
2225 }
2226 }
2227 }
2228 *rearrange = 0;
2229 int32_t matchsize = INITIAL_ARRAY_SIZE_;
2230 UChar *match = addToUCharArray(buffer, &matchsize,
2231 strsrch->canonicalPrefixAccents,
2232 strsrch->search->text + start,
2233 offset - start,
2234 strsrch->canonicalSuffixAccents,
2235 status);
2236
2237 // run the collator iterator through this match
2238 // if status is a failure ucol_setText does nothing
2239 ucol_setText(coleiter, match, matchsize, status);
2240 if (U_SUCCESS(*status)) {
2241 if (checkCollationMatch(strsrch, coleiter)) {
2242 if (match != buffer) {
2243 uprv_free(match);
2244 }
2245 return end;
2246 }
2247 }
2248 count --;
2249 }
2250 }
2251 return USEARCH_DONE;
2252 }
2253
2254 /**
2255 * Take the rearranged start accents and tries matching. If match failed at
2256 * a seperate following set of accents (seperated from the rearranged on by
2257 * at least a base character) then we rearrange the preceding accents and
2258 * tries matching again.
2259 * We allow skipping of the ends of the accent set if the ces do not match.
2260 * However if the failure is found before the accent set, it fails.
2261 * Internal method, status assumed to be success, caller has to check status
2262 * before calling this method.
2263 * @param strsrch string search data
2264 * @param textoffset of the ends of the rearranged accent
2265 * @param status output error status if any
2266 * @return USEARCH_DONE if a match is not found, otherwise return the ending
2267 * offset of the match. Note this start includes all following accents.
2268 */
2269 static
doPreviousCanonicalPrefixMatch(UStringSearch * strsrch,int32_t textoffset,UErrorCode * status)2270 int32_t doPreviousCanonicalPrefixMatch(UStringSearch *strsrch,
2271 int32_t textoffset,
2272 UErrorCode *status)
2273 {
2274 const UChar *text = strsrch->search->text;
2275 const UCollator *collator = strsrch->collator;
2276 int32_t safelength = 0;
2277 UChar *safetext;
2278 int32_t safetextlength;
2279 UChar safebuffer[INITIAL_ARRAY_SIZE_];
2280 int32_t safeoffset = textoffset;
2281
2282 if (textoffset &&
2283 ucol_unsafeCP(strsrch->canonicalPrefixAccents[
2284 u_strlen(strsrch->canonicalPrefixAccents) - 1
2285 ], collator)) {
2286 safeoffset = getNextSafeOffset(collator, text, textoffset,
2287 strsrch->search->textLength);
2288 safelength = safeoffset - textoffset;
2289 safetextlength = INITIAL_ARRAY_SIZE_;
2290 safetext = addToUCharArray(safebuffer, &safetextlength,
2291 strsrch->canonicalPrefixAccents,
2292 text + textoffset, safelength,
2293 NULL, status);
2294 }
2295 else {
2296 safetextlength = u_strlen(strsrch->canonicalPrefixAccents);
2297 safetext = strsrch->canonicalPrefixAccents;
2298 }
2299
2300 UCollationElements *coleiter = strsrch->utilIter;
2301 // if status is a failure, ucol_setText does nothing
2302 ucol_setText(coleiter, safetext, safetextlength, status);
2303 // status checked in loop below
2304
2305 int32_t *ce = strsrch->pattern.CE;
2306 int32_t celength = strsrch->pattern.CELength;
2307 int ceindex = 0;
2308 UBool isSafe = TRUE; // safe zone indication flag for position
2309 int32_t prefixlength = u_strlen(strsrch->canonicalPrefixAccents);
2310
2311 while (ceindex < celength) {
2312 int32_t textce = ucol_next(coleiter, status);
2313 if (U_FAILURE(*status)) {
2314 if (isSafe) {
2315 cleanUpSafeText(strsrch, safetext, safebuffer);
2316 }
2317 return USEARCH_DONE;
2318 }
2319 if (textce == UCOL_NULLORDER) {
2320 // check if we have passed the safe buffer
2321 if (coleiter == strsrch->textIter) {
2322 cleanUpSafeText(strsrch, safetext, safebuffer);
2323 return USEARCH_DONE;
2324 }
2325 cleanUpSafeText(strsrch, safetext, safebuffer);
2326 safetext = safebuffer;
2327 coleiter = strsrch->textIter;
2328 setColEIterOffset(coleiter, safeoffset);
2329 // status checked at the start of the loop
2330 isSafe = FALSE;
2331 continue;
2332 }
2333 textce = getCE(strsrch, textce);
2334 if (textce != UCOL_IGNORABLE && textce != ce[ceindex]) {
2335 // do the beginning stuff
2336 int32_t failedoffset = ucol_getOffset(coleiter);
2337 if (isSafe && failedoffset <= prefixlength) {
2338 // alas... no hope. failed at rearranged accent set
2339 cleanUpSafeText(strsrch, safetext, safebuffer);
2340 return USEARCH_DONE;
2341 }
2342 else {
2343 if (isSafe) {
2344 failedoffset = safeoffset - failedoffset;
2345 cleanUpSafeText(strsrch, safetext, safebuffer);
2346 }
2347
2348 // try rearranging the end accents
2349 int32_t result = doPreviousCanonicalSuffixMatch(strsrch,
2350 textoffset, failedoffset, status);
2351 if (result != USEARCH_DONE) {
2352 // if status is a failure, ucol_setOffset does nothing
2353 setColEIterOffset(strsrch->textIter, result);
2354 }
2355 if (U_FAILURE(*status)) {
2356 return USEARCH_DONE;
2357 }
2358 return result;
2359 }
2360 }
2361 if (textce == ce[ceindex]) {
2362 ceindex ++;
2363 }
2364 }
2365 // set offset here
2366 if (isSafe) {
2367 int32_t result = ucol_getOffset(coleiter);
2368 // sets the text iterator here with the correct expansion and offset
2369 int32_t leftoverces = getExpansionSuffix(coleiter);
2370 cleanUpSafeText(strsrch, safetext, safebuffer);
2371 if (result <= prefixlength) {
2372 result = textoffset;
2373 }
2374 else {
2375 result = textoffset + (safeoffset - result);
2376 }
2377 setColEIterOffset(strsrch->textIter, result);
2378 setExpansionSuffix(strsrch->textIter, leftoverces);
2379 return result;
2380 }
2381
2382 return ucol_getOffset(coleiter);
2383 }
2384
2385 /**
2386 * Trying out the substring and sees if it can be a canonical match.
2387 * This will try normalizing the starting accents and arranging them into
2388 * canonical equivalents and check their corresponding ces with the pattern ce.
2389 * Prefix accents in the text will be grouped according to their combining
2390 * class and the groups will be mixed and matched to try find the perfect
2391 * match with the pattern.
2392 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
2393 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
2394 * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
2395 * "\u0301\u0325".
2396 * step 2: check if any of the generated substrings matches the pattern.
2397 * Internal method, status assumed to be success, caller has to check status
2398 * before calling this method.
2399 * @param strsrch string search data
2400 * @param textoffset start offset in the collation element text that starts
2401 * with the accents to be rearranged
2402 * @param status output error status if any
2403 * @return TRUE if the match is valid, FALSE otherwise
2404 */
2405 static
doPreviousCanonicalMatch(UStringSearch * strsrch,int32_t textoffset,UErrorCode * status)2406 UBool doPreviousCanonicalMatch(UStringSearch *strsrch,
2407 int32_t textoffset,
2408 UErrorCode *status)
2409 {
2410 const UChar *text = strsrch->search->text;
2411 int32_t temp = textoffset;
2412 int32_t textlength = strsrch->search->textLength;
2413 if ((getFCD(text, &temp, textlength) >> SECOND_LAST_BYTE_SHIFT_) == 0) {
2414 UCollationElements *coleiter = strsrch->textIter;
2415 int32_t offset = ucol_getOffset(coleiter);
2416 if (strsrch->pattern.hasSuffixAccents) {
2417 offset = doPreviousCanonicalSuffixMatch(strsrch, textoffset,
2418 offset, status);
2419 if (U_SUCCESS(*status) && offset != USEARCH_DONE) {
2420 setColEIterOffset(coleiter, offset);
2421 return TRUE;
2422 }
2423 }
2424 return FALSE;
2425 }
2426
2427 if (!strsrch->pattern.hasPrefixAccents) {
2428 return FALSE;
2429 }
2430
2431 UChar accents[INITIAL_ARRAY_SIZE_];
2432 // offset to the last base character in substring to search
2433 int32_t baseoffset = getNextBaseOffset(text, textoffset, textlength);
2434 // normalizing the offensive string
2435 unorm_normalize(text + textoffset, baseoffset - textoffset, UNORM_NFD,
2436 0, accents, INITIAL_ARRAY_SIZE_, status);
2437 // status checked in loop
2438
2439 int32_t accentsindex[INITIAL_ARRAY_SIZE_];
2440 int32_t size = getUnblockedAccentIndex(accents, accentsindex);
2441
2442 // 2 power n - 1 plus the full set of accents
2443 int32_t count = (2 << (size - 1)) - 1;
2444 while (U_SUCCESS(*status) && count > 0) {
2445 UChar *rearrange = strsrch->canonicalPrefixAccents;
2446 // copy the base characters
2447 for (int k = 0; k < accentsindex[0]; k ++) {
2448 *rearrange ++ = accents[k];
2449 }
2450 // forming all possible canonical rearrangement by dropping
2451 // sets of accents
2452 for (int i = 0; i <= size - 1; i ++) {
2453 int32_t mask = 1 << (size - i - 1);
2454 if (count & mask) {
2455 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
2456 *rearrange ++ = accents[j];
2457 }
2458 }
2459 }
2460 *rearrange = 0;
2461 int32_t offset = doPreviousCanonicalPrefixMatch(strsrch,
2462 baseoffset, status);
2463 if (offset != USEARCH_DONE) {
2464 return TRUE; // match found
2465 }
2466 count --;
2467 }
2468 return FALSE;
2469 }
2470
2471 /**
2472 * Checks match for contraction.
2473 * If the match starts with a partial contraction we fail.
2474 * Internal method, status assumed to be success, caller has to check status
2475 * before calling this method.
2476 * @param strsrch string search data
2477 * @param start offset of potential match, to be modified if necessary
2478 * @param end offset of potential match, to be modified if necessary
2479 * @param status only error status if any
2480 * @return TRUE if match passes the contraction test, FALSE otherwise
2481 */
2482 static
checkPreviousCanonicalContractionMatch(UStringSearch * strsrch,int32_t * start,int32_t * end,UErrorCode * status)2483 UBool checkPreviousCanonicalContractionMatch(UStringSearch *strsrch,
2484 int32_t *start,
2485 int32_t *end, UErrorCode *status)
2486 {
2487 UCollationElements *coleiter = strsrch->textIter;
2488 int32_t textlength = strsrch->search->textLength;
2489 int32_t temp = *end;
2490 const UCollator *collator = strsrch->collator;
2491 const UChar *text = strsrch->search->text;
2492 // This part checks if either if the start of the match contains potential
2493 // contraction. If so we'll have to iterate through them
2494 // Since we used ucol_next while previously looking for the potential
2495 // match, this guarantees that our end will not be a partial contraction,
2496 // or a partial supplementary character.
2497 if (*start < textlength && ucol_unsafeCP(text[*start], collator)) {
2498 int32_t expansion = getExpansionSuffix(coleiter);
2499 UBool expandflag = expansion > 0;
2500 setColEIterOffset(coleiter, *end);
2501 while (expansion > 0) {
2502 // getting rid of the redundant ce
2503 // since forward contraction/expansion may have extra ces
2504 // if we are in the normalization buffer, hasAccentsBeforeMatch
2505 // would have taken care of it.
2506 // E.g. the character \u01FA will have an expansion of 3, but if
2507 // we are only looking for A ring A\u030A, we'll have to skip the
2508 // last ce in the expansion buffer
2509 ucol_previous(coleiter, status);
2510 if (U_FAILURE(*status)) {
2511 return FALSE;
2512 }
2513 if (ucol_getOffset(coleiter) != temp) {
2514 *end = temp;
2515 temp = ucol_getOffset(coleiter);
2516 }
2517 expansion --;
2518 }
2519
2520 int32_t *patternce = strsrch->pattern.CE;
2521 int32_t patterncelength = strsrch->pattern.CELength;
2522 int32_t count = patterncelength;
2523 while (count > 0) {
2524 int32_t ce = getCE(strsrch, ucol_previous(coleiter, status));
2525 // status checked below, note that if status is a failure
2526 // ucol_previous returns UCOL_NULLORDER
2527 if (ce == UCOL_IGNORABLE) {
2528 continue;
2529 }
2530 if (expandflag && count == 0 &&
2531 getColElemIterOffset(coleiter, FALSE) != temp) {
2532 *end = temp;
2533 temp = ucol_getOffset(coleiter);
2534 }
2535 if (count == patterncelength &&
2536 ce != patternce[patterncelength - 1]) {
2537 // accents may have extra starting ces, this occurs when a
2538 // pure accent pattern is matched without rearrangement
2539 int32_t expected = patternce[patterncelength - 1];
2540 UTF_BACK_1(text, 0, *end);
2541 if (getFCD(text, end, textlength) & LAST_BYTE_MASK_) {
2542 ce = getCE(strsrch, ucol_previous(coleiter, status));
2543 while (U_SUCCESS(*status) && ce != expected &&
2544 ce != UCOL_NULLORDER &&
2545 ucol_getOffset(coleiter) <= *start) {
2546 ce = getCE(strsrch, ucol_previous(coleiter, status));
2547 }
2548 }
2549 }
2550 if (U_FAILURE(*status) || ce != patternce[count - 1]) {
2551 (*start) --;
2552 *start = getPreviousBaseOffset(text, *start);
2553 return FALSE;
2554 }
2555 count --;
2556 }
2557 }
2558 return TRUE;
2559 }
2560
2561 /**
2562 * Checks and sets the match information if found.
2563 * Checks
2564 * <ul>
2565 * <li> the potential match does not repeat the previous match
2566 * <li> boundaries are correct
2567 * <li> potential match does not end in the middle of a contraction
2568 * <li> identical matches
2569 * <\ul>
2570 * Otherwise the offset will be shifted to the next character.
2571 * Internal method, status assumed to be success, caller has to check status
2572 * before calling this method.
2573 * @param strsrch string search data
2574 * @param textoffset offset in the collation element text. the returned value
2575 * will be the truncated start offset of the match or the new start
2576 * search offset.
2577 * @param status only error status if any
2578 * @return TRUE if the match is valid, FALSE otherwise
2579 */
2580 static
checkPreviousCanonicalMatch(UStringSearch * strsrch,int32_t * textoffset,UErrorCode * status)2581 inline UBool checkPreviousCanonicalMatch(UStringSearch *strsrch,
2582 int32_t *textoffset,
2583 UErrorCode *status)
2584 {
2585 // to ensure that the start and ends are not composite characters
2586 UCollationElements *coleiter = strsrch->textIter;
2587 // if we have a canonical accent match
2588 if ((strsrch->pattern.hasSuffixAccents &&
2589 strsrch->canonicalSuffixAccents[0]) ||
2590 (strsrch->pattern.hasPrefixAccents &&
2591 strsrch->canonicalPrefixAccents[0])) {
2592 strsrch->search->matchedIndex = *textoffset;
2593 strsrch->search->matchedLength =
2594 getNextUStringSearchBaseOffset(strsrch,
2595 getColElemIterOffset(coleiter, FALSE))
2596 - *textoffset;
2597 return TRUE;
2598 }
2599
2600 int32_t end = ucol_getOffset(coleiter);
2601 if (!checkPreviousCanonicalContractionMatch(strsrch, textoffset, &end,
2602 status) ||
2603 U_FAILURE(*status)) {
2604 return FALSE;
2605 }
2606
2607 end = getNextUStringSearchBaseOffset(strsrch, end);
2608 // this totally matches, however we need to check if it is repeating
2609 if (checkRepeatedMatch(strsrch, *textoffset, end) ||
2610 !isBreakUnit(strsrch, *textoffset, end) ||
2611 !checkIdentical(strsrch, *textoffset, end)) {
2612 (*textoffset) --;
2613 *textoffset = getPreviousBaseOffset(strsrch->search->text,
2614 *textoffset);
2615 return FALSE;
2616 }
2617
2618 strsrch->search->matchedIndex = *textoffset;
2619 strsrch->search->matchedLength = end - *textoffset;
2620 return TRUE;
2621 }
2622 #endif // #if BOYER_MOORE
2623
2624 // constructors and destructor -------------------------------------------
2625
usearch_open(const UChar * pattern,int32_t patternlength,const UChar * text,int32_t textlength,const char * locale,UBreakIterator * breakiter,UErrorCode * status)2626 U_CAPI UStringSearch * U_EXPORT2 usearch_open(const UChar *pattern,
2627 int32_t patternlength,
2628 const UChar *text,
2629 int32_t textlength,
2630 const char *locale,
2631 UBreakIterator *breakiter,
2632 UErrorCode *status)
2633 {
2634 if (U_FAILURE(*status)) {
2635 return NULL;
2636 }
2637 #if UCONFIG_NO_BREAK_ITERATION
2638 if (breakiter != NULL) {
2639 *status = U_UNSUPPORTED_ERROR;
2640 return NULL;
2641 }
2642 #endif
2643 if (locale) {
2644 // ucol_open internally checks for status
2645 UCollator *collator = ucol_open(locale, status);
2646 // pattern, text checks are done in usearch_openFromCollator
2647 UStringSearch *result = usearch_openFromCollator(pattern,
2648 patternlength, text, textlength,
2649 collator, breakiter, status);
2650
2651 if (result == NULL || U_FAILURE(*status)) {
2652 if (collator) {
2653 ucol_close(collator);
2654 }
2655 return NULL;
2656 }
2657 else {
2658 result->ownCollator = TRUE;
2659 }
2660 return result;
2661 }
2662 *status = U_ILLEGAL_ARGUMENT_ERROR;
2663 return NULL;
2664 }
2665
usearch_openFromCollator(const UChar * pattern,int32_t patternlength,const UChar * text,int32_t textlength,const UCollator * collator,UBreakIterator * breakiter,UErrorCode * status)2666 U_CAPI UStringSearch * U_EXPORT2 usearch_openFromCollator(
2667 const UChar *pattern,
2668 int32_t patternlength,
2669 const UChar *text,
2670 int32_t textlength,
2671 const UCollator *collator,
2672 UBreakIterator *breakiter,
2673 UErrorCode *status)
2674 {
2675 if (U_FAILURE(*status)) {
2676 return NULL;
2677 }
2678 #if UCONFIG_NO_BREAK_ITERATION
2679 if (breakiter != NULL) {
2680 *status = U_UNSUPPORTED_ERROR;
2681 return NULL;
2682 }
2683 #endif
2684 if (pattern == NULL || text == NULL || collator == NULL) {
2685 *status = U_ILLEGAL_ARGUMENT_ERROR;
2686 return NULL;
2687 }
2688
2689 // string search does not really work when numeric collation is turned on
2690 if(ucol_getAttribute(collator, UCOL_NUMERIC_COLLATION, status) == UCOL_ON) {
2691 *status = U_UNSUPPORTED_ERROR;
2692 return NULL;
2693 }
2694
2695 if (U_SUCCESS(*status)) {
2696 initializeFCD(status);
2697 if (U_FAILURE(*status)) {
2698 return NULL;
2699 }
2700
2701 UStringSearch *result;
2702 if (textlength == -1) {
2703 textlength = u_strlen(text);
2704 }
2705 if (patternlength == -1) {
2706 patternlength = u_strlen(pattern);
2707 }
2708 if (textlength <= 0 || patternlength <= 0) {
2709 *status = U_ILLEGAL_ARGUMENT_ERROR;
2710 return NULL;
2711 }
2712
2713 result = (UStringSearch *)uprv_malloc(sizeof(UStringSearch));
2714 if (result == NULL) {
2715 *status = U_MEMORY_ALLOCATION_ERROR;
2716 return NULL;
2717 }
2718
2719 result->collator = collator;
2720 result->strength = ucol_getStrength(collator);
2721 result->ceMask = getMask(result->strength);
2722 result->toShift =
2723 ucol_getAttribute(collator, UCOL_ALTERNATE_HANDLING, status) ==
2724 UCOL_SHIFTED;
2725 result->variableTop = ucol_getVariableTop(collator, status);
2726
2727 if (U_FAILURE(*status)) {
2728 uprv_free(result);
2729 return NULL;
2730 }
2731
2732 result->search = (USearch *)uprv_malloc(sizeof(USearch));
2733 if (result->search == NULL) {
2734 *status = U_MEMORY_ALLOCATION_ERROR;
2735 uprv_free(result);
2736 return NULL;
2737 }
2738
2739 result->search->text = text;
2740 result->search->textLength = textlength;
2741
2742 result->pattern.text = pattern;
2743 result->pattern.textLength = patternlength;
2744 result->pattern.CE = NULL;
2745 result->pattern.PCE = NULL;
2746
2747 result->search->breakIter = breakiter;
2748 #if !UCONFIG_NO_BREAK_ITERATION
2749 result->search->internalBreakIter = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(result->collator, ULOC_VALID_LOCALE, status), text, textlength, status);
2750 if (breakiter) {
2751 ubrk_setText(breakiter, text, textlength, status);
2752 }
2753 #endif
2754
2755 result->ownCollator = FALSE;
2756 result->search->matchedLength = 0;
2757 result->search->matchedIndex = USEARCH_DONE;
2758 result->utilIter = NULL;
2759 result->textIter = ucol_openElements(collator, text,
2760 textlength, status);
2761 if (U_FAILURE(*status)) {
2762 usearch_close(result);
2763 return NULL;
2764 }
2765
2766 result->search->isOverlap = FALSE;
2767 result->search->isCanonicalMatch = FALSE;
2768 result->search->isForwardSearching = TRUE;
2769 result->search->reset = TRUE;
2770
2771 initialize(result, status);
2772
2773 if (U_FAILURE(*status)) {
2774 usearch_close(result);
2775 return NULL;
2776 }
2777
2778 return result;
2779 }
2780 return NULL;
2781 }
2782
usearch_close(UStringSearch * strsrch)2783 U_CAPI void U_EXPORT2 usearch_close(UStringSearch *strsrch)
2784 {
2785 if (strsrch) {
2786 if (strsrch->pattern.CE != strsrch->pattern.CEBuffer &&
2787 strsrch->pattern.CE) {
2788 uprv_free(strsrch->pattern.CE);
2789 }
2790
2791 if (strsrch->pattern.PCE != NULL &&
2792 strsrch->pattern.PCE != strsrch->pattern.PCEBuffer) {
2793 uprv_free(strsrch->pattern.PCE);
2794 }
2795
2796 ucol_closeElements(strsrch->textIter);
2797 ucol_closeElements(strsrch->utilIter);
2798
2799 if (strsrch->ownCollator && strsrch->collator) {
2800 ucol_close((UCollator *)strsrch->collator);
2801 }
2802
2803 #if !UCONFIG_NO_BREAK_ITERATION
2804 if (strsrch->search->internalBreakIter) {
2805 ubrk_close(strsrch->search->internalBreakIter);
2806 }
2807 #endif
2808
2809 uprv_free(strsrch->search);
2810 uprv_free(strsrch);
2811 }
2812 }
2813
2814 // set and get methods --------------------------------------------------
2815
usearch_setOffset(UStringSearch * strsrch,int32_t position,UErrorCode * status)2816 U_CAPI void U_EXPORT2 usearch_setOffset(UStringSearch *strsrch,
2817 int32_t position,
2818 UErrorCode *status)
2819 {
2820 if (U_SUCCESS(*status) && strsrch) {
2821 if (isOutOfBounds(strsrch->search->textLength, position)) {
2822 *status = U_INDEX_OUTOFBOUNDS_ERROR;
2823 }
2824 else {
2825 setColEIterOffset(strsrch->textIter, position);
2826 }
2827 strsrch->search->matchedIndex = USEARCH_DONE;
2828 strsrch->search->matchedLength = 0;
2829 strsrch->search->reset = FALSE;
2830 }
2831 }
2832
usearch_getOffset(const UStringSearch * strsrch)2833 U_CAPI int32_t U_EXPORT2 usearch_getOffset(const UStringSearch *strsrch)
2834 {
2835 if (strsrch) {
2836 int32_t result = ucol_getOffset(strsrch->textIter);
2837 if (isOutOfBounds(strsrch->search->textLength, result)) {
2838 return USEARCH_DONE;
2839 }
2840 return result;
2841 }
2842 return USEARCH_DONE;
2843 }
2844
usearch_setAttribute(UStringSearch * strsrch,USearchAttribute attribute,USearchAttributeValue value,UErrorCode * status)2845 U_CAPI void U_EXPORT2 usearch_setAttribute(UStringSearch *strsrch,
2846 USearchAttribute attribute,
2847 USearchAttributeValue value,
2848 UErrorCode *status)
2849 {
2850 if (U_SUCCESS(*status) && strsrch) {
2851 switch (attribute)
2852 {
2853 case USEARCH_OVERLAP :
2854 strsrch->search->isOverlap = (value == USEARCH_ON ? TRUE : FALSE);
2855 break;
2856 case USEARCH_CANONICAL_MATCH :
2857 strsrch->search->isCanonicalMatch = (value == USEARCH_ON ? TRUE :
2858 FALSE);
2859 break;
2860 case USEARCH_ATTRIBUTE_COUNT :
2861 default:
2862 *status = U_ILLEGAL_ARGUMENT_ERROR;
2863 }
2864 }
2865 if (value == USEARCH_ATTRIBUTE_VALUE_COUNT) {
2866 *status = U_ILLEGAL_ARGUMENT_ERROR;
2867 }
2868 }
2869
usearch_getAttribute(const UStringSearch * strsrch,USearchAttribute attribute)2870 U_CAPI USearchAttributeValue U_EXPORT2 usearch_getAttribute(
2871 const UStringSearch *strsrch,
2872 USearchAttribute attribute)
2873 {
2874 if (strsrch) {
2875 switch (attribute) {
2876 case USEARCH_OVERLAP :
2877 return (strsrch->search->isOverlap == TRUE ? USEARCH_ON :
2878 USEARCH_OFF);
2879 case USEARCH_CANONICAL_MATCH :
2880 return (strsrch->search->isCanonicalMatch == TRUE ? USEARCH_ON :
2881 USEARCH_OFF);
2882 case USEARCH_ATTRIBUTE_COUNT :
2883 return USEARCH_DEFAULT;
2884 }
2885 }
2886 return USEARCH_DEFAULT;
2887 }
2888
usearch_getMatchedStart(const UStringSearch * strsrch)2889 U_CAPI int32_t U_EXPORT2 usearch_getMatchedStart(
2890 const UStringSearch *strsrch)
2891 {
2892 if (strsrch == NULL) {
2893 return USEARCH_DONE;
2894 }
2895 return strsrch->search->matchedIndex;
2896 }
2897
2898
usearch_getMatchedText(const UStringSearch * strsrch,UChar * result,int32_t resultCapacity,UErrorCode * status)2899 U_CAPI int32_t U_EXPORT2 usearch_getMatchedText(const UStringSearch *strsrch,
2900 UChar *result,
2901 int32_t resultCapacity,
2902 UErrorCode *status)
2903 {
2904 if (U_FAILURE(*status)) {
2905 return USEARCH_DONE;
2906 }
2907 if (strsrch == NULL || resultCapacity < 0 || (resultCapacity > 0 &&
2908 result == NULL)) {
2909 *status = U_ILLEGAL_ARGUMENT_ERROR;
2910 return USEARCH_DONE;
2911 }
2912
2913 int32_t copylength = strsrch->search->matchedLength;
2914 int32_t copyindex = strsrch->search->matchedIndex;
2915 if (copyindex == USEARCH_DONE) {
2916 u_terminateUChars(result, resultCapacity, 0, status);
2917 return USEARCH_DONE;
2918 }
2919
2920 if (resultCapacity < copylength) {
2921 copylength = resultCapacity;
2922 }
2923 if (copylength > 0) {
2924 uprv_memcpy(result, strsrch->search->text + copyindex,
2925 copylength * sizeof(UChar));
2926 }
2927 return u_terminateUChars(result, resultCapacity,
2928 strsrch->search->matchedLength, status);
2929 }
2930
usearch_getMatchedLength(const UStringSearch * strsrch)2931 U_CAPI int32_t U_EXPORT2 usearch_getMatchedLength(
2932 const UStringSearch *strsrch)
2933 {
2934 if (strsrch) {
2935 return strsrch->search->matchedLength;
2936 }
2937 return USEARCH_DONE;
2938 }
2939
2940 #if !UCONFIG_NO_BREAK_ITERATION
2941
usearch_setBreakIterator(UStringSearch * strsrch,UBreakIterator * breakiter,UErrorCode * status)2942 U_CAPI void U_EXPORT2 usearch_setBreakIterator(UStringSearch *strsrch,
2943 UBreakIterator *breakiter,
2944 UErrorCode *status)
2945 {
2946 if (U_SUCCESS(*status) && strsrch) {
2947 strsrch->search->breakIter = breakiter;
2948 if (breakiter) {
2949 ubrk_setText(breakiter, strsrch->search->text,
2950 strsrch->search->textLength, status);
2951 }
2952 }
2953 }
2954
2955 U_CAPI const UBreakIterator* U_EXPORT2
usearch_getBreakIterator(const UStringSearch * strsrch)2956 usearch_getBreakIterator(const UStringSearch *strsrch)
2957 {
2958 if (strsrch) {
2959 return strsrch->search->breakIter;
2960 }
2961 return NULL;
2962 }
2963
2964 #endif
2965
usearch_setText(UStringSearch * strsrch,const UChar * text,int32_t textlength,UErrorCode * status)2966 U_CAPI void U_EXPORT2 usearch_setText( UStringSearch *strsrch,
2967 const UChar *text,
2968 int32_t textlength,
2969 UErrorCode *status)
2970 {
2971 if (U_SUCCESS(*status)) {
2972 if (strsrch == NULL || text == NULL || textlength < -1 ||
2973 textlength == 0) {
2974 *status = U_ILLEGAL_ARGUMENT_ERROR;
2975 }
2976 else {
2977 if (textlength == -1) {
2978 textlength = u_strlen(text);
2979 }
2980 strsrch->search->text = text;
2981 strsrch->search->textLength = textlength;
2982 ucol_setText(strsrch->textIter, text, textlength, status);
2983 strsrch->search->matchedIndex = USEARCH_DONE;
2984 strsrch->search->matchedLength = 0;
2985 strsrch->search->reset = TRUE;
2986 #if !UCONFIG_NO_BREAK_ITERATION
2987 if (strsrch->search->breakIter != NULL) {
2988 ubrk_setText(strsrch->search->breakIter, text,
2989 textlength, status);
2990 }
2991 ubrk_setText(strsrch->search->internalBreakIter, text, textlength, status);
2992 #endif
2993 }
2994 }
2995 }
2996
usearch_getText(const UStringSearch * strsrch,int32_t * length)2997 U_CAPI const UChar * U_EXPORT2 usearch_getText(const UStringSearch *strsrch,
2998 int32_t *length)
2999 {
3000 if (strsrch) {
3001 *length = strsrch->search->textLength;
3002 return strsrch->search->text;
3003 }
3004 return NULL;
3005 }
3006
usearch_setCollator(UStringSearch * strsrch,const UCollator * collator,UErrorCode * status)3007 U_CAPI void U_EXPORT2 usearch_setCollator( UStringSearch *strsrch,
3008 const UCollator *collator,
3009 UErrorCode *status)
3010 {
3011 if (U_SUCCESS(*status)) {
3012 if (collator == NULL) {
3013 *status = U_ILLEGAL_ARGUMENT_ERROR;
3014 return;
3015 }
3016
3017 if (strsrch) {
3018 if (strsrch->ownCollator && (strsrch->collator != collator)) {
3019 ucol_close((UCollator *)strsrch->collator);
3020 strsrch->ownCollator = FALSE;
3021 }
3022 strsrch->collator = collator;
3023 strsrch->strength = ucol_getStrength(collator);
3024 strsrch->ceMask = getMask(strsrch->strength);
3025 #if !UCONFIG_NO_BREAK_ITERATION
3026 ubrk_close(strsrch->search->internalBreakIter);
3027 strsrch->search->internalBreakIter = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(collator, ULOC_VALID_LOCALE, status),
3028 strsrch->search->text, strsrch->search->textLength, status);
3029 #endif
3030 // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
3031 strsrch->toShift =
3032 ucol_getAttribute(collator, UCOL_ALTERNATE_HANDLING, status) ==
3033 UCOL_SHIFTED;
3034 // if status is a failure, ucol_getVariableTop returns 0
3035 strsrch->variableTop = ucol_getVariableTop(collator, status);
3036 if (U_SUCCESS(*status)) {
3037 initialize(strsrch, status);
3038 if (U_SUCCESS(*status)) {
3039 /* free offset buffer to avoid memory leak before initializing. */
3040 ucol_freeOffsetBuffer(&(strsrch->textIter->iteratordata_));
3041 uprv_init_collIterate(collator, strsrch->search->text,
3042 strsrch->search->textLength,
3043 &(strsrch->textIter->iteratordata_));
3044 strsrch->utilIter->iteratordata_.coll = collator;
3045 }
3046 }
3047 }
3048
3049 // **** are these calls needed?
3050 // **** we call uprv_init_pce in initializePatternPCETable
3051 // **** and the CEBuffer constructor...
3052 #if 0
3053 uprv_init_pce(strsrch->textIter);
3054 uprv_init_pce(strsrch->utilIter);
3055 #endif
3056 }
3057 }
3058
usearch_getCollator(const UStringSearch * strsrch)3059 U_CAPI UCollator * U_EXPORT2 usearch_getCollator(const UStringSearch *strsrch)
3060 {
3061 if (strsrch) {
3062 return (UCollator *)strsrch->collator;
3063 }
3064 return NULL;
3065 }
3066
usearch_setPattern(UStringSearch * strsrch,const UChar * pattern,int32_t patternlength,UErrorCode * status)3067 U_CAPI void U_EXPORT2 usearch_setPattern( UStringSearch *strsrch,
3068 const UChar *pattern,
3069 int32_t patternlength,
3070 UErrorCode *status)
3071 {
3072 if (U_SUCCESS(*status)) {
3073 if (strsrch == NULL || pattern == NULL) {
3074 *status = U_ILLEGAL_ARGUMENT_ERROR;
3075 }
3076 else {
3077 if (patternlength == -1) {
3078 patternlength = u_strlen(pattern);
3079 }
3080 if (patternlength == 0) {
3081 *status = U_ILLEGAL_ARGUMENT_ERROR;
3082 return;
3083 }
3084 strsrch->pattern.text = pattern;
3085 strsrch->pattern.textLength = patternlength;
3086 initialize(strsrch, status);
3087 }
3088 }
3089 }
3090
3091 U_CAPI const UChar* U_EXPORT2
usearch_getPattern(const UStringSearch * strsrch,int32_t * length)3092 usearch_getPattern(const UStringSearch *strsrch,
3093 int32_t *length)
3094 {
3095 if (strsrch) {
3096 *length = strsrch->pattern.textLength;
3097 return strsrch->pattern.text;
3098 }
3099 return NULL;
3100 }
3101
3102 // miscellanous methods --------------------------------------------------
3103
usearch_first(UStringSearch * strsrch,UErrorCode * status)3104 U_CAPI int32_t U_EXPORT2 usearch_first(UStringSearch *strsrch,
3105 UErrorCode *status)
3106 {
3107 if (strsrch && U_SUCCESS(*status)) {
3108 strsrch->search->isForwardSearching = TRUE;
3109 usearch_setOffset(strsrch, 0, status);
3110 if (U_SUCCESS(*status)) {
3111 return usearch_next(strsrch, status);
3112 }
3113 }
3114 return USEARCH_DONE;
3115 }
3116
usearch_following(UStringSearch * strsrch,int32_t position,UErrorCode * status)3117 U_CAPI int32_t U_EXPORT2 usearch_following(UStringSearch *strsrch,
3118 int32_t position,
3119 UErrorCode *status)
3120 {
3121 if (strsrch && U_SUCCESS(*status)) {
3122 strsrch->search->isForwardSearching = TRUE;
3123 // position checked in usearch_setOffset
3124 usearch_setOffset(strsrch, position, status);
3125 if (U_SUCCESS(*status)) {
3126 return usearch_next(strsrch, status);
3127 }
3128 }
3129 return USEARCH_DONE;
3130 }
3131
usearch_last(UStringSearch * strsrch,UErrorCode * status)3132 U_CAPI int32_t U_EXPORT2 usearch_last(UStringSearch *strsrch,
3133 UErrorCode *status)
3134 {
3135 if (strsrch && U_SUCCESS(*status)) {
3136 strsrch->search->isForwardSearching = FALSE;
3137 usearch_setOffset(strsrch, strsrch->search->textLength, status);
3138 if (U_SUCCESS(*status)) {
3139 return usearch_previous(strsrch, status);
3140 }
3141 }
3142 return USEARCH_DONE;
3143 }
3144
usearch_preceding(UStringSearch * strsrch,int32_t position,UErrorCode * status)3145 U_CAPI int32_t U_EXPORT2 usearch_preceding(UStringSearch *strsrch,
3146 int32_t position,
3147 UErrorCode *status)
3148 {
3149 if (strsrch && U_SUCCESS(*status)) {
3150 strsrch->search->isForwardSearching = FALSE;
3151 // position checked in usearch_setOffset
3152 usearch_setOffset(strsrch, position, status);
3153 if (U_SUCCESS(*status)) {
3154 return usearch_previous(strsrch, status);
3155 }
3156 }
3157 return USEARCH_DONE;
3158 }
3159
3160 /**
3161 * If a direction switch is required, we'll count the number of ces till the
3162 * beginning of the collation element iterator and iterate forwards that
3163 * number of times. This is so that we get to the correct point within the
3164 * string to continue the search in. Imagine when we are in the middle of the
3165 * normalization buffer when the change in direction is request. arrrgghh....
3166 * After searching the offset within the collation element iterator will be
3167 * shifted to the start of the match. If a match is not found, the offset would
3168 * have been set to the end of the text string in the collation element
3169 * iterator.
3170 * Okay, here's my take on normalization buffer. The only time when there can
3171 * be 2 matches within the same normalization is when the pattern is consists
3172 * of all accents. But since the offset returned is from the text string, we
3173 * should not confuse the caller by returning the second match within the
3174 * same normalization buffer. If we do, the 2 results will have the same match
3175 * offsets, and that'll be confusing. I'll return the next match that doesn't
3176 * fall within the same normalization buffer. Note this does not affect the
3177 * results of matches spanning the text and the normalization buffer.
3178 * The position to start searching is taken from the collation element
3179 * iterator. Callers of this API would have to set the offset in the collation
3180 * element iterator before using this method.
3181 */
usearch_next(UStringSearch * strsrch,UErrorCode * status)3182 U_CAPI int32_t U_EXPORT2 usearch_next(UStringSearch *strsrch,
3183 UErrorCode *status)
3184 {
3185 if (U_SUCCESS(*status) && strsrch) {
3186 // note offset is either equivalent to the start of the previous match
3187 // or is set by the user
3188 int32_t offset = usearch_getOffset(strsrch);
3189 USearch *search = strsrch->search;
3190 search->reset = FALSE;
3191 int32_t textlength = search->textLength;
3192 if (search->isForwardSearching) {
3193 #if BOYER_MOORE
3194 if (offset == textlength
3195 || (!search->isOverlap &&
3196 (offset + strsrch->pattern.defaultShiftSize > textlength ||
3197 (search->matchedIndex != USEARCH_DONE &&
3198 offset + search->matchedLength >= textlength)))) {
3199 // not enough characters to match
3200 setMatchNotFound(strsrch);
3201 return USEARCH_DONE;
3202 }
3203 #else
3204 if (offset == textlength ||
3205 (! search->isOverlap &&
3206 (search->matchedIndex != USEARCH_DONE &&
3207 offset + search->matchedLength > textlength))) {
3208 // not enough characters to match
3209 setMatchNotFound(strsrch);
3210 return USEARCH_DONE;
3211 }
3212 #endif
3213 }
3214 else {
3215 // switching direction.
3216 // if matchedIndex == USEARCH_DONE, it means that either a
3217 // setOffset has been called or that previous ran off the text
3218 // string. the iterator would have been set to offset 0 if a
3219 // match is not found.
3220 search->isForwardSearching = TRUE;
3221 if (search->matchedIndex != USEARCH_DONE) {
3222 // there's no need to set the collation element iterator
3223 // the next call to next will set the offset.
3224 return search->matchedIndex;
3225 }
3226 }
3227
3228 if (U_SUCCESS(*status)) {
3229 if (strsrch->pattern.CELength == 0) {
3230 if (search->matchedIndex == USEARCH_DONE) {
3231 search->matchedIndex = offset;
3232 }
3233 else { // moves by codepoints
3234 UTF_FWD_1(search->text, search->matchedIndex, textlength);
3235 }
3236
3237 search->matchedLength = 0;
3238 setColEIterOffset(strsrch->textIter, search->matchedIndex);
3239 // status checked below
3240 if (search->matchedIndex == textlength) {
3241 search->matchedIndex = USEARCH_DONE;
3242 }
3243 }
3244 else {
3245 if (search->matchedLength > 0) {
3246 // if matchlength is 0 we are at the start of the iteration
3247 if (search->isOverlap) {
3248 ucol_setOffset(strsrch->textIter, offset + 1, status);
3249 }
3250 else {
3251 ucol_setOffset(strsrch->textIter,
3252 offset + search->matchedLength, status);
3253 }
3254 }
3255 else {
3256 // for boundary check purposes. this will ensure that the
3257 // next match will not preceed the current offset
3258 // note search->matchedIndex will always be set to something
3259 // in the code
3260 search->matchedIndex = offset - 1;
3261 }
3262
3263 if (search->isCanonicalMatch) {
3264 // can't use exact here since extra accents are allowed.
3265 usearch_handleNextCanonical(strsrch, status);
3266 }
3267 else {
3268 usearch_handleNextExact(strsrch, status);
3269 }
3270 }
3271
3272 if (U_FAILURE(*status)) {
3273 return USEARCH_DONE;
3274 }
3275
3276 #if !BOYER_MOORE
3277 if (search->matchedIndex == USEARCH_DONE) {
3278 ucol_setOffset(strsrch->textIter, search->textLength, status);
3279 } else {
3280 ucol_setOffset(strsrch->textIter, search->matchedIndex, status);
3281 }
3282 #endif
3283
3284 return search->matchedIndex;
3285 }
3286 }
3287 return USEARCH_DONE;
3288 }
3289
usearch_previous(UStringSearch * strsrch,UErrorCode * status)3290 U_CAPI int32_t U_EXPORT2 usearch_previous(UStringSearch *strsrch,
3291 UErrorCode *status)
3292 {
3293 if (U_SUCCESS(*status) && strsrch) {
3294 int32_t offset;
3295 USearch *search = strsrch->search;
3296 if (search->reset) {
3297 offset = search->textLength;
3298 search->isForwardSearching = FALSE;
3299 search->reset = FALSE;
3300 setColEIterOffset(strsrch->textIter, offset);
3301 }
3302 else {
3303 offset = usearch_getOffset(strsrch);
3304 }
3305
3306 int32_t matchedindex = search->matchedIndex;
3307 if (search->isForwardSearching == TRUE) {
3308 // switching direction.
3309 // if matchedIndex == USEARCH_DONE, it means that either a
3310 // setOffset has been called or that next ran off the text
3311 // string. the iterator would have been set to offset textLength if
3312 // a match is not found.
3313 search->isForwardSearching = FALSE;
3314 if (matchedindex != USEARCH_DONE) {
3315 return matchedindex;
3316 }
3317 }
3318 else {
3319 #if BOYER_MOORE
3320 if (offset == 0 || matchedindex == 0 ||
3321 (!search->isOverlap &&
3322 (offset < strsrch->pattern.defaultShiftSize ||
3323 (matchedindex != USEARCH_DONE &&
3324 matchedindex < strsrch->pattern.defaultShiftSize)))) {
3325 // not enough characters to match
3326 setMatchNotFound(strsrch);
3327 return USEARCH_DONE;
3328 }
3329 #else
3330 // Could check pattern length, but the
3331 // linear search will do the right thing
3332 if (offset == 0 || matchedindex == 0) {
3333 setMatchNotFound(strsrch);
3334 return USEARCH_DONE;
3335 }
3336 #endif
3337 }
3338
3339 if (U_SUCCESS(*status)) {
3340 if (strsrch->pattern.CELength == 0) {
3341 search->matchedIndex =
3342 (matchedindex == USEARCH_DONE ? offset : matchedindex);
3343 if (search->matchedIndex == 0) {
3344 setMatchNotFound(strsrch);
3345 // status checked below
3346 }
3347 else { // move by codepoints
3348 UTF_BACK_1(search->text, 0, search->matchedIndex);
3349 setColEIterOffset(strsrch->textIter, search->matchedIndex);
3350 // status checked below
3351 search->matchedLength = 0;
3352 }
3353 }
3354 else {
3355 #if !BOYER_MOORE
3356 if (search->matchedIndex != USEARCH_DONE) {
3357 if (search->isOverlap) {
3358 ucol_setOffset(strsrch->textIter, search->matchedIndex + search->matchedLength - 2, status);
3359 }
3360 }
3361 #endif
3362
3363 if (strsrch->search->isCanonicalMatch) {
3364 // can't use exact here since extra accents are allowed.
3365 usearch_handlePreviousCanonical(strsrch, status);
3366 // status checked below
3367 }
3368 else {
3369 usearch_handlePreviousExact(strsrch, status);
3370 // status checked below
3371 }
3372 }
3373
3374 if (U_FAILURE(*status)) {
3375 return USEARCH_DONE;
3376 }
3377
3378 return search->matchedIndex;
3379 }
3380 }
3381 return USEARCH_DONE;
3382 }
3383
3384
3385
usearch_reset(UStringSearch * strsrch)3386 U_CAPI void U_EXPORT2 usearch_reset(UStringSearch *strsrch)
3387 {
3388 /*
3389 reset is setting the attributes that are already in
3390 string search, hence all attributes in the collator should
3391 be retrieved without any problems
3392 */
3393 if (strsrch) {
3394 UErrorCode status = U_ZERO_ERROR;
3395 UBool sameCollAttribute = TRUE;
3396 uint32_t ceMask;
3397 UBool shift;
3398 uint32_t varTop;
3399
3400 // **** hack to deal w/ how processed CEs encode quaternary ****
3401 UCollationStrength newStrength = ucol_getStrength(strsrch->collator);
3402 if ((strsrch->strength < UCOL_QUATERNARY && newStrength >= UCOL_QUATERNARY) ||
3403 (strsrch->strength >= UCOL_QUATERNARY && newStrength < UCOL_QUATERNARY)) {
3404 sameCollAttribute = FALSE;
3405 }
3406
3407 strsrch->strength = ucol_getStrength(strsrch->collator);
3408 ceMask = getMask(strsrch->strength);
3409 if (strsrch->ceMask != ceMask) {
3410 strsrch->ceMask = ceMask;
3411 sameCollAttribute = FALSE;
3412 }
3413
3414 // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
3415 shift = ucol_getAttribute(strsrch->collator, UCOL_ALTERNATE_HANDLING,
3416 &status) == UCOL_SHIFTED;
3417 if (strsrch->toShift != shift) {
3418 strsrch->toShift = shift;
3419 sameCollAttribute = FALSE;
3420 }
3421
3422 // if status is a failure, ucol_getVariableTop returns 0
3423 varTop = ucol_getVariableTop(strsrch->collator, &status);
3424 if (strsrch->variableTop != varTop) {
3425 strsrch->variableTop = varTop;
3426 sameCollAttribute = FALSE;
3427 }
3428 if (!sameCollAttribute) {
3429 initialize(strsrch, &status);
3430 }
3431 /* free offset buffer to avoid memory leak before initializing. */
3432 ucol_freeOffsetBuffer(&(strsrch->textIter->iteratordata_));
3433 uprv_init_collIterate(strsrch->collator, strsrch->search->text,
3434 strsrch->search->textLength,
3435 &(strsrch->textIter->iteratordata_));
3436 strsrch->search->matchedLength = 0;
3437 strsrch->search->matchedIndex = USEARCH_DONE;
3438 strsrch->search->isOverlap = FALSE;
3439 strsrch->search->isCanonicalMatch = FALSE;
3440 strsrch->search->isForwardSearching = TRUE;
3441 strsrch->search->reset = TRUE;
3442 }
3443 }
3444
3445 //
3446 // CEI Collation Element + source text index.
3447 // These structs are kept in the circular buffer.
3448 //
3449 struct CEI {
3450 int64_t ce;
3451 int32_t lowIndex;
3452 int32_t highIndex;
3453 };
3454
3455 U_NAMESPACE_BEGIN
3456
3457
3458 //
3459 // CEBuffer A circular buffer of CEs from the text being searched.
3460 //
3461 #define DEFAULT_CEBUFFER_SIZE 50
3462 struct CEBuffer {
3463 CEI defBuf[DEFAULT_CEBUFFER_SIZE];
3464 CEI *buf;
3465 int32_t bufSize;
3466 int32_t firstIx;
3467 int32_t limitIx;
3468 UCollationElements *ceIter;
3469 UStringSearch *strSearch;
3470
3471
3472
3473 CEBuffer(UStringSearch *ss, UErrorCode *status);
3474 ~CEBuffer();
3475 const CEI *get(int32_t index);
3476 const CEI *getPrevious(int32_t index);
3477 };
3478
3479
CEBuffer(UStringSearch * ss,UErrorCode * status)3480 CEBuffer::CEBuffer(UStringSearch *ss, UErrorCode *status) {
3481 buf = defBuf;
3482 strSearch = ss;
3483 bufSize = ss->pattern.CELength+10;
3484 ceIter = ss->textIter;
3485 firstIx = 0;
3486 limitIx = 0;
3487
3488 uprv_init_pce(ceIter);
3489
3490 if (bufSize>DEFAULT_CEBUFFER_SIZE) {
3491 buf = (CEI *)uprv_malloc(bufSize * sizeof(CEI));
3492 if (buf == NULL) {
3493 *status = U_MEMORY_ALLOCATION_ERROR;
3494 }
3495 }
3496 }
3497
3498 // TODO: add a reset or init function so that allocated
3499 // buffers can be retained & reused.
3500
~CEBuffer()3501 CEBuffer::~CEBuffer() {
3502 if (buf != defBuf) {
3503 uprv_free(buf);
3504 }
3505 }
3506
3507
3508 // Get the CE with the specified index.
3509 // Index must be in the range
3510 // n-history_size < index < n+1
3511 // where n is the largest index to have been fetched by some previous call to this function.
3512 // The CE value will be UCOL__PROCESSED_NULLORDER at end of input.
3513 //
get(int32_t index)3514 const CEI *CEBuffer::get(int32_t index) {
3515 int i = index % bufSize;
3516
3517 if (index>=firstIx && index<limitIx) {
3518 // The request was for an entry already in our buffer.
3519 // Just return it.
3520 return &buf[i];
3521 }
3522
3523 // Caller is requesting a new, never accessed before, CE.
3524 // Verify that it is the next one in sequence, which is all
3525 // that is allowed.
3526 if (index != limitIx) {
3527 U_ASSERT(FALSE);
3528
3529 return NULL;
3530 }
3531
3532 // Manage the circular CE buffer indexing
3533 limitIx++;
3534
3535 if (limitIx - firstIx >= bufSize) {
3536 // The buffer is full, knock out the lowest-indexed entry.
3537 firstIx++;
3538 }
3539
3540 UErrorCode status = U_ZERO_ERROR;
3541
3542 buf[i].ce = ucol_nextProcessed(ceIter, &buf[i].lowIndex, &buf[i].highIndex, &status);
3543
3544 return &buf[i];
3545 }
3546
3547 // Get the CE with the specified index.
3548 // Index must be in the range
3549 // n-history_size < index < n+1
3550 // where n is the largest index to have been fetched by some previous call to this function.
3551 // The CE value will be UCOL__PROCESSED_NULLORDER at end of input.
3552 //
getPrevious(int32_t index)3553 const CEI *CEBuffer::getPrevious(int32_t index) {
3554 int i = index % bufSize;
3555
3556 if (index>=firstIx && index<limitIx) {
3557 // The request was for an entry already in our buffer.
3558 // Just return it.
3559 return &buf[i];
3560 }
3561
3562 // Caller is requesting a new, never accessed before, CE.
3563 // Verify that it is the next one in sequence, which is all
3564 // that is allowed.
3565 if (index != limitIx) {
3566 U_ASSERT(FALSE);
3567
3568 return NULL;
3569 }
3570
3571 // Manage the circular CE buffer indexing
3572 limitIx++;
3573
3574 if (limitIx - firstIx >= bufSize) {
3575 // The buffer is full, knock out the lowest-indexed entry.
3576 firstIx++;
3577 }
3578
3579 UErrorCode status = U_ZERO_ERROR;
3580
3581 buf[i].ce = ucol_previousProcessed(ceIter, &buf[i].lowIndex, &buf[i].highIndex, &status);
3582
3583 return &buf[i];
3584 }
3585
3586 U_NAMESPACE_END
3587
3588
3589 // #define USEARCH_DEBUG
3590
3591 #ifdef USEARCH_DEBUG
3592 #include <stdio.h>
3593 #include <stdlib.h>
3594 #endif
3595
3596 /*
3597 * Find the next break boundary after startIndex. If the UStringSearch object
3598 * has an external break iterator, use that. Otherwise use the internal character
3599 * break iterator.
3600 */
nextBoundaryAfter(UStringSearch * strsrch,int32_t startIndex)3601 static int32_t nextBoundaryAfter(UStringSearch *strsrch, int32_t startIndex) {
3602 #if 0
3603 const UChar *text = strsrch->search->text;
3604 int32_t textLen = strsrch->search->textLength;
3605
3606 U_ASSERT(startIndex>=0);
3607 U_ASSERT(startIndex<=textLen);
3608
3609 if (startIndex >= textLen) {
3610 return startIndex;
3611 }
3612
3613 UChar32 c;
3614 int32_t i = startIndex;
3615 U16_NEXT(text, i, textLen, c);
3616
3617 // If we are on a control character, stop without looking for combining marks.
3618 // Control characters do not combine.
3619 int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
3620 if (gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR) {
3621 return i;
3622 }
3623
3624 // The initial character was not a control, and can thus accept trailing
3625 // combining characters. Advance over however many of them there are.
3626 int32_t indexOfLastCharChecked;
3627 for (;;) {
3628 indexOfLastCharChecked = i;
3629 if (i>=textLen) {
3630 break;
3631 }
3632 U16_NEXT(text, i, textLen, c);
3633 gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
3634 if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) {
3635 break;
3636 }
3637 }
3638 return indexOfLastCharChecked;
3639 #elif !UCONFIG_NO_BREAK_ITERATION
3640 UBreakIterator *breakiterator = strsrch->search->breakIter;
3641
3642 if (breakiterator == NULL) {
3643 breakiterator = strsrch->search->internalBreakIter;
3644 }
3645
3646 if (breakiterator != NULL) {
3647 return ubrk_following(breakiterator, startIndex);
3648 }
3649
3650 return startIndex;
3651 #else
3652 // **** or should we use the original code? ****
3653 return startIndex;
3654 #endif
3655
3656 }
3657
3658 /*
3659 * Returns TRUE if index is on a break boundary. If the UStringSearch
3660 * has an external break iterator, test using that, otherwise test
3661 * using the internal character break iterator.
3662 */
isBreakBoundary(UStringSearch * strsrch,int32_t index)3663 static UBool isBreakBoundary(UStringSearch *strsrch, int32_t index) {
3664 #if 0
3665 const UChar *text = strsrch->search->text;
3666 int32_t textLen = strsrch->search->textLength;
3667
3668 U_ASSERT(index>=0);
3669 U_ASSERT(index<=textLen);
3670
3671 if (index>=textLen || index<=0) {
3672 return FALSE;
3673 }
3674
3675 // If the character at the current index is not a GRAPHEME_EXTEND
3676 // then we can not be within a combining sequence.
3677 UChar32 c;
3678 U16_GET(text, 0, index, textLen, c);
3679 int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
3680 if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) {
3681 return FALSE;
3682 }
3683
3684 // We are at a combining mark. If the preceding character is anything
3685 // except a CONTROL, CR or LF, we are in a combining sequence.
3686 U16_PREV(text, 0, index, c);
3687 gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
3688 UBool combining = !(gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR);
3689 return combining;
3690 #elif !UCONFIG_NO_BREAK_ITERATION
3691 UBreakIterator *breakiterator = strsrch->search->breakIter;
3692
3693 if (breakiterator == NULL) {
3694 breakiterator = strsrch->search->internalBreakIter;
3695 }
3696
3697 return (breakiterator != NULL && ! ubrk_isBoundary(breakiterator, index));
3698 #else
3699 // **** or use the original code? ****
3700 return FALSE;
3701 #endif
3702 }
3703
3704 #if 0
3705 static UBool onBreakBoundaries(const UStringSearch *strsrch, int32_t start, int32_t end)
3706 {
3707 #if !UCONFIG_NO_BREAK_ITERATION
3708 UBreakIterator *breakiterator = strsrch->search->breakIter;
3709
3710 if (breakiterator != NULL) {
3711 int32_t startindex = ubrk_first(breakiterator);
3712 int32_t endindex = ubrk_last(breakiterator);
3713
3714 // out-of-range indexes are never boundary positions
3715 if (start < startindex || start > endindex ||
3716 end < startindex || end > endindex) {
3717 return FALSE;
3718 }
3719
3720 return ubrk_isBoundary(breakiterator, start) &&
3721 ubrk_isBoundary(breakiterator, end);
3722 }
3723 #endif
3724
3725 return TRUE;
3726 }
3727 #endif
3728
3729
usearch_search(UStringSearch * strsrch,int32_t startIdx,int32_t * matchStart,int32_t * matchLimit,UErrorCode * status)3730 U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch *strsrch,
3731 int32_t startIdx,
3732 int32_t *matchStart,
3733 int32_t *matchLimit,
3734 UErrorCode *status)
3735 {
3736 if (U_FAILURE(*status)) {
3737 return FALSE;
3738 }
3739
3740 // TODO: reject search patterns beginning with a combining char.
3741
3742 #ifdef USEARCH_DEBUG
3743 if (getenv("USEARCH_DEBUG") != NULL) {
3744 printf("Pattern CEs\n");
3745 for (int ii=0; ii<strsrch->pattern.CELength; ii++) {
3746 printf(" %8x", strsrch->pattern.CE[ii]);
3747 }
3748 printf("\n");
3749 }
3750
3751 #endif
3752 // Input parameter sanity check.
3753 // TODO: should input indicies clip to the text length
3754 // in the same way that UText does.
3755 if(strsrch->pattern.CELength == 0 ||
3756 startIdx < 0 ||
3757 startIdx > strsrch->search->textLength ||
3758 strsrch->pattern.CE == NULL) {
3759 *status = U_ILLEGAL_ARGUMENT_ERROR;
3760 return FALSE;
3761 }
3762
3763 if (strsrch->pattern.PCE == NULL) {
3764 initializePatternPCETable(strsrch, status);
3765 }
3766
3767 ucol_setOffset(strsrch->textIter, startIdx, status);
3768 CEBuffer ceb(strsrch, status);
3769
3770
3771 int32_t targetIx = 0;
3772 const CEI *targetCEI;
3773 int32_t patIx;
3774 UBool found;
3775
3776 int32_t mStart = -1;
3777 int32_t mLimit = -1;
3778 int32_t minLimit;
3779 int32_t maxLimit;
3780
3781
3782
3783 // Outer loop moves over match starting positions in the
3784 // target CE space.
3785 for(targetIx=0; ; targetIx++)
3786 {
3787 found = TRUE;
3788 // Inner loop checks for a match beginning at each
3789 // position from the outer loop.
3790 for (patIx=0; patIx<strsrch->pattern.PCELength; patIx++) {
3791 int64_t patCE = strsrch->pattern.PCE[patIx];
3792 targetCEI = ceb.get(targetIx+patIx);
3793 // Compare CE from target string with CE from the pattern.
3794 // Note that the target CE will be UCOL_NULLORDER if we reach the end of input,
3795 // which will fail the compare, below.
3796 if (targetCEI->ce != patCE) {
3797 found = FALSE;
3798 break;
3799 }
3800 }
3801
3802 if (!found && targetCEI->ce != UCOL_PROCESSED_NULLORDER) {
3803 // No match at this targetIx. Try again at the next.
3804 continue;
3805 }
3806
3807 if (!found) {
3808 // No match at all, we have run off the end of the target text.
3809 break;
3810 }
3811
3812
3813 // We have found a match in CE space.
3814 // Now determine the bounds in string index space.
3815 // There still is a chance of match failure if the CE range not correspond to
3816 // an acceptable character range.
3817 //
3818 const CEI *firstCEI = ceb.get(targetIx);
3819 const CEI *lastCEI = ceb.get(targetIx + strsrch->pattern.PCELength - 1);
3820 const CEI *nextCEI = ceb.get(targetIx + strsrch->pattern.PCELength);
3821
3822 mStart = firstCEI->lowIndex;
3823 minLimit = lastCEI->lowIndex;
3824 maxLimit = nextCEI->lowIndex;
3825
3826 // Look at the CE following the match. If it is UCOL_NULLORDER the match
3827 // extended to the end of input, and the match is good.
3828
3829 // Look at the high and low indices of the CE following the match. If
3830 // they are the same it means one of two things:
3831 // 1. The match extended to the last CE from the target text, which is OK, or
3832 // 2. The last CE that was part of the match is in an expansion that extends
3833 // to the first CE after the match. In this case, we reject the match.
3834 if (nextCEI->lowIndex == nextCEI->highIndex && nextCEI->ce != UCOL_PROCESSED_NULLORDER) {
3835 found = FALSE;
3836 }
3837
3838
3839 // Check for the start of the match being within a combining sequence.
3840 // This can happen if the pattern itself begins with a combining char, and
3841 // the match found combining marks in the target text that were attached
3842 // to something else.
3843 // This type of match should be rejected for not completely consuming a
3844 // combining sequence.
3845 if (isBreakBoundary(strsrch, mStart)) {
3846 found = FALSE;
3847 }
3848
3849 // Check for the start of the match being within an Collation Element Expansion,
3850 // meaning that the first char of the match is only partially matched.
3851 // With exapnsions, the first CE will report the index of the source
3852 // character, and all subsequent (expansions) CEs will report the source index of the
3853 // _following_ character.
3854 int32_t secondIx = firstCEI->highIndex;
3855 if (mStart == secondIx) {
3856 found = FALSE;
3857 }
3858
3859 // Advance the match end position to the first acceptable match boundary.
3860 // This advances the index over any combining charcters.
3861 mLimit = maxLimit;
3862 if (minLimit < maxLimit) {
3863 int32_t nba = nextBoundaryAfter(strsrch, minLimit);
3864
3865 if (nba >= lastCEI->highIndex) {
3866 mLimit = nba;
3867 }
3868 }
3869
3870 #ifdef USEARCH_DEBUG
3871 if (getenv("USEARCH_DEBUG") != NULL) {
3872 printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit, maxLimit, mLimit);
3873 }
3874 #endif
3875
3876 // If advancing to the end of a combining sequence in character indexing space
3877 // advanced us beyond the end of the match in CE space, reject this match.
3878 if (mLimit > maxLimit) {
3879 found = FALSE;
3880 }
3881
3882 if (isBreakBoundary(strsrch, mLimit)) {
3883 found = FALSE;
3884 }
3885
3886 if (! checkIdentical(strsrch, mStart, mLimit)) {
3887 found = FALSE;
3888 }
3889
3890 if (found) {
3891 break;
3892 }
3893 }
3894
3895 #ifdef USEARCH_DEBUG
3896 if (getenv("USEARCH_DEBUG") != NULL) {
3897 printf("Target CEs [%d .. %d]\n", ceb.firstIx, ceb.limitIx);
3898 int32_t lastToPrint = ceb.limitIx+2;
3899 for (int ii=ceb.firstIx; ii<lastToPrint; ii++) {
3900 printf("%8x@%d ", ceb.get(ii)->ce, ceb.get(ii)->srcIndex);
3901 }
3902 printf("\n%s\n", found? "match found" : "no match");
3903 }
3904 #endif
3905
3906 // All Done. Store back the match bounds to the caller.
3907 //
3908 if (found==FALSE) {
3909 mLimit = -1;
3910 mStart = -1;
3911 }
3912
3913 if (matchStart != NULL) {
3914 *matchStart= mStart;
3915 }
3916
3917 if (matchLimit != NULL) {
3918 *matchLimit = mLimit;
3919 }
3920
3921 return found;
3922 }
3923
3924
usearch_searchBackwards(UStringSearch * strsrch,int32_t startIdx,int32_t * matchStart,int32_t * matchLimit,UErrorCode * status)3925 U_CAPI UBool U_EXPORT2 usearch_searchBackwards(UStringSearch *strsrch,
3926 int32_t startIdx,
3927 int32_t *matchStart,
3928 int32_t *matchLimit,
3929 UErrorCode *status)
3930 {
3931 if (U_FAILURE(*status)) {
3932 return FALSE;
3933 }
3934
3935 // TODO: reject search patterns beginning with a combining char.
3936
3937 #ifdef USEARCH_DEBUG
3938 if (getenv("USEARCH_DEBUG") != NULL) {
3939 printf("Pattern CEs\n");
3940 for (int ii=0; ii<strsrch->pattern.CELength; ii++) {
3941 printf(" %8x", strsrch->pattern.CE[ii]);
3942 }
3943 printf("\n");
3944 }
3945
3946 #endif
3947 // Input parameter sanity check.
3948 // TODO: should input indicies clip to the text length
3949 // in the same way that UText does.
3950 if(strsrch->pattern.CELength == 0 ||
3951 startIdx < 0 ||
3952 startIdx > strsrch->search->textLength ||
3953 strsrch->pattern.CE == NULL) {
3954 *status = U_ILLEGAL_ARGUMENT_ERROR;
3955 return FALSE;
3956 }
3957
3958 if (strsrch->pattern.PCE == NULL) {
3959 initializePatternPCETable(strsrch, status);
3960 }
3961
3962 CEBuffer ceb(strsrch, status);
3963 int32_t targetIx = 0;
3964
3965 /*
3966 * Pre-load the buffer with the CE's for the grapheme
3967 * after our starting position so that we're sure that
3968 * we can look at the CE following the match when we
3969 * check the match boundaries.
3970 *
3971 * This will also pre-fetch the first CE that we'll
3972 * consider for the match.
3973 */
3974 if (startIdx < strsrch->search->textLength) {
3975 UBreakIterator *bi = strsrch->search->internalBreakIter;
3976 int32_t next = ubrk_following(bi, startIdx);
3977
3978 ucol_setOffset(strsrch->textIter, next, status);
3979
3980 for (targetIx = 0; ; targetIx += 1) {
3981 if (ceb.getPrevious(targetIx)->lowIndex < startIdx) {
3982 break;
3983 }
3984 }
3985 } else {
3986 ucol_setOffset(strsrch->textIter, startIdx, status);
3987 }
3988
3989
3990 const CEI *targetCEI;
3991 int32_t patIx;
3992 UBool found;
3993
3994 int32_t limitIx = targetIx;
3995 int32_t mStart = -1;
3996 int32_t mLimit = -1;
3997 int32_t minLimit;
3998 int32_t maxLimit;
3999
4000
4001
4002 // Outer loop moves over match starting positions in the
4003 // target CE space.
4004 for(targetIx = limitIx; ; targetIx += 1)
4005 {
4006 found = TRUE;
4007 // Inner loop checks for a match beginning at each
4008 // position from the outer loop.
4009 for (patIx = strsrch->pattern.PCELength - 1; patIx >= 0; patIx -= 1) {
4010 int64_t patCE = strsrch->pattern.PCE[patIx];
4011
4012 targetCEI = ceb.getPrevious(targetIx + strsrch->pattern.PCELength - 1 - patIx);
4013 // Compare CE from target string with CE from the pattern.
4014 // Note that the target CE will be UCOL_NULLORDER if we reach the end of input,
4015 // which will fail the compare, below.
4016 if (targetCEI->ce != patCE) {
4017 found = FALSE;
4018 break;
4019 }
4020 }
4021
4022 if (!found && targetCEI->ce != UCOL_PROCESSED_NULLORDER) {
4023 // No match at this targetIx. Try again at the next.
4024 continue;
4025 }
4026
4027 if (!found) {
4028 // No match at all, we have run off the end of the target text.
4029 break;
4030 }
4031
4032
4033 // We have found a match in CE space.
4034 // Now determine the bounds in string index space.
4035 // There still is a chance of match failure if the CE range not correspond to
4036 // an acceptable character range.
4037 //
4038 const CEI *firstCEI = ceb.getPrevious(targetIx + strsrch->pattern.PCELength - 1);
4039 const CEI *lastCEI = ceb.getPrevious(targetIx);
4040 const CEI *nextCEI = targetIx > 0? ceb.getPrevious(targetIx - 1) : NULL;
4041
4042 mStart = firstCEI->lowIndex;
4043 minLimit = lastCEI->lowIndex;
4044 maxLimit = targetIx > 0? nextCEI->lowIndex : lastCEI->highIndex;
4045
4046 // Look at the CE following the match. If it is UCOL_NULLORDER the match
4047 // extended to the end of input, and the match is good.
4048
4049 // Look at the high and low indices of the CE following the match. If
4050 // they are the same it means one of two things:
4051 // 1. The match extended to the last CE from the target text, which is OK, or
4052 // 2. The last CE that was part of the match is in an expansion that extends
4053 // to the first CE after the match. In this case, we reject the match.
4054 if (targetIx >= 1) {
4055 if (nextCEI->lowIndex == nextCEI->highIndex && nextCEI->ce != UCOL_PROCESSED_NULLORDER) {
4056 found = FALSE;
4057 }
4058 }
4059
4060
4061 // Check for the start of the match being within a combining sequence.
4062 // This can happen if the pattern itself begins with a combining char, and
4063 // the match found combining marks in the target text that were attached
4064 // to something else.
4065 // This type of match should be rejected for not completely consuming a
4066 // combining sequence.
4067 if (isBreakBoundary(strsrch, mStart)) {
4068 found = FALSE;
4069 }
4070
4071 // Look at the high index of the first CE in the match. If it's the same as the
4072 // low index, the first CE in the match is in the middle of an expansion.
4073 if (mStart == firstCEI->highIndex) {
4074 found = FALSE;
4075 }
4076
4077 // Advance the match end position to the first acceptable match boundary.
4078 // This advances the index over any combining charcters.
4079 mLimit = maxLimit;
4080 if (/*targetIx > 0 &&*/ minLimit < maxLimit) {
4081 int32_t nba = nextBoundaryAfter(strsrch, minLimit);
4082
4083 if (nba >= lastCEI->highIndex) {
4084 mLimit = nba;
4085 }
4086 }
4087
4088 #ifdef USEARCH_DEBUG
4089 if (getenv("USEARCH_DEBUG") != NULL) {
4090 printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit, maxLimit, mLimit);
4091 }
4092 #endif
4093
4094 // If advancing to the end of a combining sequence in character indexing space
4095 // advanced us beyond the end of the match in CE space, reject this match.
4096 if (mLimit > maxLimit) {
4097 found = FALSE;
4098 }
4099
4100 // Make sure the end of the match is on a break boundary
4101 if (isBreakBoundary(strsrch, mLimit)) {
4102 found = FALSE;
4103 }
4104
4105 if (! checkIdentical(strsrch, mStart, mLimit)) {
4106 found = FALSE;
4107 }
4108
4109 if (found) {
4110 break;
4111 }
4112 }
4113
4114 #ifdef USEARCH_DEBUG
4115 if (getenv("USEARCH_DEBUG") != NULL) {
4116 printf("Target CEs [%d .. %d]\n", ceb.firstIx, ceb.limitIx);
4117 int32_t lastToPrint = ceb.limitIx+2;
4118 for (int ii=ceb.firstIx; ii<lastToPrint; ii++) {
4119 printf("%8x@%d ", ceb.get(ii)->ce, ceb.get(ii)->srcIndex);
4120 }
4121 printf("\n%s\n", found? "match found" : "no match");
4122 }
4123 #endif
4124
4125 // All Done. Store back the match bounds to the caller.
4126 //
4127 if (found==FALSE) {
4128 mLimit = -1;
4129 mStart = -1;
4130 }
4131
4132 if (matchStart != NULL) {
4133 *matchStart= mStart;
4134 }
4135
4136 if (matchLimit != NULL) {
4137 *matchLimit = mLimit;
4138 }
4139
4140 return found;
4141 }
4142
4143
4144
4145
4146 // internal use methods declared in usrchimp.h -----------------------------
4147
usearch_handleNextExact(UStringSearch * strsrch,UErrorCode * status)4148 UBool usearch_handleNextExact(UStringSearch *strsrch, UErrorCode *status)
4149 {
4150 if (U_FAILURE(*status)) {
4151 setMatchNotFound(strsrch);
4152 return FALSE;
4153 }
4154
4155 #if BOYER_MOORE
4156 UCollationElements *coleiter = strsrch->textIter;
4157 int32_t textlength = strsrch->search->textLength;
4158 int32_t *patternce = strsrch->pattern.CE;
4159 int32_t patterncelength = strsrch->pattern.CELength;
4160 int32_t textoffset = ucol_getOffset(coleiter);
4161
4162 // status used in setting coleiter offset, since offset is checked in
4163 // shiftForward before setting the coleiter offset, status never
4164 // a failure
4165 textoffset = shiftForward(strsrch, textoffset, UCOL_NULLORDER,
4166 patterncelength);
4167 while (textoffset <= textlength)
4168 {
4169 uint32_t patternceindex = patterncelength - 1;
4170 int32_t targetce;
4171 UBool found = FALSE;
4172 int32_t lastce = UCOL_NULLORDER;
4173
4174 setColEIterOffset(coleiter, textoffset);
4175
4176 for (;;) {
4177 // finding the last pattern ce match, imagine composite characters
4178 // for example: search for pattern A in text \u00C0
4179 // we'll have to skip \u0300 the grave first before we get to A
4180 targetce = ucol_previous(coleiter, status);
4181 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4182 found = FALSE;
4183 break;
4184 }
4185 targetce = getCE(strsrch, targetce);
4186 if (targetce == UCOL_IGNORABLE && inNormBuf(coleiter)) {
4187 // this is for the text \u0315\u0300 that requires
4188 // normalization and pattern \u0300, where \u0315 is ignorable
4189 continue;
4190 }
4191 if (lastce == UCOL_NULLORDER || lastce == UCOL_IGNORABLE) {
4192 lastce = targetce;
4193 }
4194 if (targetce == patternce[patternceindex]) {
4195 // the first ce can be a contraction
4196 found = TRUE;
4197 break;
4198 }
4199 if (!hasExpansion(coleiter)) {
4200 found = FALSE;
4201 break;
4202 }
4203 }
4204
4205 //targetce = lastce;
4206
4207 while (found && patternceindex > 0) {
4208 lastce = targetce;
4209 targetce = ucol_previous(coleiter, status);
4210 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4211 found = FALSE;
4212 break;
4213 }
4214 targetce = getCE(strsrch, targetce);
4215 if (targetce == UCOL_IGNORABLE) {
4216 continue;
4217 }
4218
4219 patternceindex --;
4220 found = found && targetce == patternce[patternceindex];
4221 }
4222
4223 targetce = lastce;
4224
4225 if (!found) {
4226 if (U_FAILURE(*status)) {
4227 break;
4228 }
4229 textoffset = shiftForward(strsrch, textoffset, lastce,
4230 patternceindex);
4231 // status checked at loop.
4232 patternceindex = patterncelength;
4233 continue;
4234 }
4235
4236 if (checkNextExactMatch(strsrch, &textoffset, status)) {
4237 // status checked in ucol_setOffset
4238 setColEIterOffset(coleiter, strsrch->search->matchedIndex);
4239 return TRUE;
4240 }
4241 }
4242 setMatchNotFound(strsrch);
4243 return FALSE;
4244 #else
4245 int32_t textOffset = ucol_getOffset(strsrch->textIter);
4246 int32_t start = -1;
4247 int32_t end = -1;
4248
4249 if (usearch_search(strsrch, textOffset, &start, &end, status)) {
4250 strsrch->search->matchedIndex = start;
4251 strsrch->search->matchedLength = end - start;
4252 return TRUE;
4253 } else {
4254 setMatchNotFound(strsrch);
4255 return FALSE;
4256 }
4257 #endif
4258 }
4259
usearch_handleNextCanonical(UStringSearch * strsrch,UErrorCode * status)4260 UBool usearch_handleNextCanonical(UStringSearch *strsrch, UErrorCode *status)
4261 {
4262 if (U_FAILURE(*status)) {
4263 setMatchNotFound(strsrch);
4264 return FALSE;
4265 }
4266
4267 #if BOYER_MOORE
4268 UCollationElements *coleiter = strsrch->textIter;
4269 int32_t textlength = strsrch->search->textLength;
4270 int32_t *patternce = strsrch->pattern.CE;
4271 int32_t patterncelength = strsrch->pattern.CELength;
4272 int32_t textoffset = ucol_getOffset(coleiter);
4273 UBool hasPatternAccents =
4274 strsrch->pattern.hasSuffixAccents || strsrch->pattern.hasPrefixAccents;
4275
4276 textoffset = shiftForward(strsrch, textoffset, UCOL_NULLORDER,
4277 patterncelength);
4278 strsrch->canonicalPrefixAccents[0] = 0;
4279 strsrch->canonicalSuffixAccents[0] = 0;
4280
4281 while (textoffset <= textlength)
4282 {
4283 int32_t patternceindex = patterncelength - 1;
4284 int32_t targetce;
4285 UBool found = FALSE;
4286 int32_t lastce = UCOL_NULLORDER;
4287
4288 setColEIterOffset(coleiter, textoffset);
4289
4290 for (;;) {
4291 // finding the last pattern ce match, imagine composite characters
4292 // for example: search for pattern A in text \u00C0
4293 // we'll have to skip \u0300 the grave first before we get to A
4294 targetce = ucol_previous(coleiter, status);
4295 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4296 found = FALSE;
4297 break;
4298 }
4299 targetce = getCE(strsrch, targetce);
4300 if (lastce == UCOL_NULLORDER || lastce == UCOL_IGNORABLE) {
4301 lastce = targetce;
4302 }
4303 if (targetce == patternce[patternceindex]) {
4304 // the first ce can be a contraction
4305 found = TRUE;
4306 break;
4307 }
4308 if (!hasExpansion(coleiter)) {
4309 found = FALSE;
4310 break;
4311 }
4312 }
4313
4314 while (found && patternceindex > 0) {
4315 targetce = ucol_previous(coleiter, status);
4316 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4317 found = FALSE;
4318 break;
4319 }
4320 targetce = getCE(strsrch, targetce);
4321 if (targetce == UCOL_IGNORABLE) {
4322 continue;
4323 }
4324
4325 patternceindex --;
4326 found = found && targetce == patternce[patternceindex];
4327 }
4328
4329 // initializing the rearranged accent array
4330 if (hasPatternAccents && !found) {
4331 strsrch->canonicalPrefixAccents[0] = 0;
4332 strsrch->canonicalSuffixAccents[0] = 0;
4333 if (U_FAILURE(*status)) {
4334 break;
4335 }
4336 found = doNextCanonicalMatch(strsrch, textoffset, status);
4337 }
4338
4339 if (!found) {
4340 if (U_FAILURE(*status)) {
4341 break;
4342 }
4343 textoffset = shiftForward(strsrch, textoffset, lastce,
4344 patternceindex);
4345 // status checked at loop
4346 patternceindex = patterncelength;
4347 continue;
4348 }
4349
4350 if (checkNextCanonicalMatch(strsrch, &textoffset, status)) {
4351 setColEIterOffset(coleiter, strsrch->search->matchedIndex);
4352 return TRUE;
4353 }
4354 }
4355 setMatchNotFound(strsrch);
4356 return FALSE;
4357 #else
4358 int32_t textOffset = ucol_getOffset(strsrch->textIter);
4359 int32_t start = -1;
4360 int32_t end = -1;
4361
4362 if (usearch_search(strsrch, textOffset, &start, &end, status)) {
4363 strsrch->search->matchedIndex = start;
4364 strsrch->search->matchedLength = end - start;
4365 return TRUE;
4366 } else {
4367 setMatchNotFound(strsrch);
4368 return FALSE;
4369 }
4370 #endif
4371 }
4372
usearch_handlePreviousExact(UStringSearch * strsrch,UErrorCode * status)4373 UBool usearch_handlePreviousExact(UStringSearch *strsrch, UErrorCode *status)
4374 {
4375 if (U_FAILURE(*status)) {
4376 setMatchNotFound(strsrch);
4377 return FALSE;
4378 }
4379
4380 #if BOYER_MOORE
4381 UCollationElements *coleiter = strsrch->textIter;
4382 int32_t *patternce = strsrch->pattern.CE;
4383 int32_t patterncelength = strsrch->pattern.CELength;
4384 int32_t textoffset = ucol_getOffset(coleiter);
4385
4386 // shifting it check for setting offset
4387 // if setOffset is called previously or there was no previous match, we
4388 // leave the offset as it is.
4389 if (strsrch->search->matchedIndex != USEARCH_DONE) {
4390 textoffset = strsrch->search->matchedIndex;
4391 }
4392
4393 textoffset = reverseShift(strsrch, textoffset, UCOL_NULLORDER,
4394 patterncelength);
4395
4396 while (textoffset >= 0)
4397 {
4398 int32_t patternceindex = 1;
4399 int32_t targetce;
4400 UBool found = FALSE;
4401 int32_t firstce = UCOL_NULLORDER;
4402
4403 // if status is a failure, ucol_setOffset does nothing
4404 setColEIterOffset(coleiter, textoffset);
4405
4406 for (;;) {
4407 // finding the first pattern ce match, imagine composite
4408 // characters. for example: search for pattern \u0300 in text
4409 // \u00C0, we'll have to skip A first before we get to
4410 // \u0300 the grave accent
4411 targetce = ucol_next(coleiter, status);
4412 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4413 found = FALSE;
4414 break;
4415 }
4416 targetce = getCE(strsrch, targetce);
4417 if (firstce == UCOL_NULLORDER || firstce == UCOL_IGNORABLE) {
4418 firstce = targetce;
4419 }
4420 if (targetce == UCOL_IGNORABLE && strsrch->strength != UCOL_PRIMARY) {
4421 continue;
4422 }
4423 if (targetce == patternce[0]) {
4424 found = TRUE;
4425 break;
4426 }
4427 if (!hasExpansion(coleiter)) {
4428 // checking for accents in composite character
4429 found = FALSE;
4430 break;
4431 }
4432 }
4433
4434 //targetce = firstce;
4435
4436 while (found && (patternceindex < patterncelength)) {
4437 firstce = targetce;
4438 targetce = ucol_next(coleiter, status);
4439 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4440 found = FALSE;
4441 break;
4442 }
4443 targetce = getCE(strsrch, targetce);
4444 if (targetce == UCOL_IGNORABLE) {
4445 continue;
4446 }
4447
4448 found = found && targetce == patternce[patternceindex];
4449 patternceindex ++;
4450 }
4451
4452 targetce = firstce;
4453
4454 if (!found) {
4455 if (U_FAILURE(*status)) {
4456 break;
4457 }
4458
4459 textoffset = reverseShift(strsrch, textoffset, targetce,
4460 patternceindex);
4461 patternceindex = 0;
4462 continue;
4463 }
4464
4465 if (checkPreviousExactMatch(strsrch, &textoffset, status)) {
4466 setColEIterOffset(coleiter, textoffset);
4467 return TRUE;
4468 }
4469 }
4470 setMatchNotFound(strsrch);
4471 return FALSE;
4472 #else
4473 int32_t textOffset = ucol_getOffset(strsrch->textIter);
4474 int32_t start = -1;
4475 int32_t end = -1;
4476
4477 if (usearch_searchBackwards(strsrch, textOffset, &start, &end, status)) {
4478 strsrch->search->matchedIndex = start;
4479 strsrch->search->matchedLength = end - start;
4480 return TRUE;
4481 } else {
4482 setMatchNotFound(strsrch);
4483 return FALSE;
4484 }
4485 #endif
4486 }
4487
usearch_handlePreviousCanonical(UStringSearch * strsrch,UErrorCode * status)4488 UBool usearch_handlePreviousCanonical(UStringSearch *strsrch,
4489 UErrorCode *status)
4490 {
4491 if (U_FAILURE(*status)) {
4492 setMatchNotFound(strsrch);
4493 return FALSE;
4494 }
4495
4496 #if BOYER_MOORE
4497 UCollationElements *coleiter = strsrch->textIter;
4498 int32_t *patternce = strsrch->pattern.CE;
4499 int32_t patterncelength = strsrch->pattern.CELength;
4500 int32_t textoffset = ucol_getOffset(coleiter);
4501 UBool hasPatternAccents =
4502 strsrch->pattern.hasSuffixAccents || strsrch->pattern.hasPrefixAccents;
4503
4504 // shifting it check for setting offset
4505 // if setOffset is called previously or there was no previous match, we
4506 // leave the offset as it is.
4507 if (strsrch->search->matchedIndex != USEARCH_DONE) {
4508 textoffset = strsrch->search->matchedIndex;
4509 }
4510
4511 textoffset = reverseShift(strsrch, textoffset, UCOL_NULLORDER,
4512 patterncelength);
4513 strsrch->canonicalPrefixAccents[0] = 0;
4514 strsrch->canonicalSuffixAccents[0] = 0;
4515
4516 while (textoffset >= 0)
4517 {
4518 int32_t patternceindex = 1;
4519 int32_t targetce;
4520 UBool found = FALSE;
4521 int32_t firstce = UCOL_NULLORDER;
4522
4523 setColEIterOffset(coleiter, textoffset);
4524 for (;;) {
4525 // finding the first pattern ce match, imagine composite
4526 // characters. for example: search for pattern \u0300 in text
4527 // \u00C0, we'll have to skip A first before we get to
4528 // \u0300 the grave accent
4529 targetce = ucol_next(coleiter, status);
4530 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4531 found = FALSE;
4532 break;
4533 }
4534 targetce = getCE(strsrch, targetce);
4535 if (firstce == UCOL_NULLORDER || firstce == UCOL_IGNORABLE) {
4536 firstce = targetce;
4537 }
4538
4539 if (targetce == patternce[0]) {
4540 // the first ce can be a contraction
4541 found = TRUE;
4542 break;
4543 }
4544 if (!hasExpansion(coleiter)) {
4545 // checking for accents in composite character
4546 found = FALSE;
4547 break;
4548 }
4549 }
4550
4551 targetce = firstce;
4552
4553 while (found && patternceindex < patterncelength) {
4554 targetce = ucol_next(coleiter, status);
4555 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4556 found = FALSE;
4557 break;
4558 }
4559 targetce = getCE(strsrch, targetce);
4560 if (targetce == UCOL_IGNORABLE) {
4561 continue;
4562 }
4563
4564 found = found && targetce == patternce[patternceindex];
4565 patternceindex ++;
4566 }
4567
4568 // initializing the rearranged accent array
4569 if (hasPatternAccents && !found) {
4570 strsrch->canonicalPrefixAccents[0] = 0;
4571 strsrch->canonicalSuffixAccents[0] = 0;
4572 if (U_FAILURE(*status)) {
4573 break;
4574 }
4575 found = doPreviousCanonicalMatch(strsrch, textoffset, status);
4576 }
4577
4578 if (!found) {
4579 if (U_FAILURE(*status)) {
4580 break;
4581 }
4582 textoffset = reverseShift(strsrch, textoffset, targetce,
4583 patternceindex);
4584 patternceindex = 0;
4585 continue;
4586 }
4587
4588 if (checkPreviousCanonicalMatch(strsrch, &textoffset, status)) {
4589 setColEIterOffset(coleiter, textoffset);
4590 return TRUE;
4591 }
4592 }
4593 setMatchNotFound(strsrch);
4594 return FALSE;
4595 #else
4596 int32_t textOffset = ucol_getOffset(strsrch->textIter);
4597 int32_t start = -1;
4598 int32_t end = -1;
4599
4600 if (usearch_searchBackwards(strsrch, textOffset, &start, &end, status)) {
4601 strsrch->search->matchedIndex = start;
4602 strsrch->search->matchedLength = end - start;
4603 return TRUE;
4604 } else {
4605 setMatchNotFound(strsrch);
4606 return FALSE;
4607 }
4608 #endif
4609 }
4610
4611 #endif /* #if !UCONFIG_NO_COLLATION */
4612