• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 **********************************************************************
3 *   Copyright (C) 2001-2010 IBM and others. All rights reserved.
4 **********************************************************************
5 *   Date        Name        Description
6 *  08/13/2001   synwee      Creation.
7 **********************************************************************
8 */
9 #ifndef USRCHIMP_H
10 #define USRCHIMP_H
11 
12 #include "unicode/utypes.h"
13 
14 #if !UCONFIG_NO_COLLATION
15 
16 #include "unicode/normalizer2.h"
17 #include "unicode/ucol.h"
18 #include "unicode/ucoleitr.h"
19 #include "unicode/ubrk.h"
20 
21 #define INITIAL_ARRAY_SIZE_       256  /* element count of CEBuffer, PCEBuffer and the canonical prefix/suffix accent buffers declared in this header */
22 #define MAX_TABLE_SIZE_           257  /* element count of the shift and backShift tables in struct UPattern */
23 
24 struct USearch { // the text being searched plus the state of the current match
25     // text is stored here because the collation element iterator has no getText API
26     const UChar              *text;
27           int32_t             textLength; // exact length of text
28           UBool               isOverlap; // NOTE(review): presumably whether overlapping matches are allowed - confirm
29           UBool               isCanonicalMatch; // NOTE(review): presumably selects canonical instead of exact matching - confirm
30           int16_t             elementComparisonType; // NOTE(review): valid values are not visible in this header - confirm
31           UBreakIterator     *internalBreakIter;  // internal character break iterator
32           UBreakIterator     *breakIter; // NOTE(review): presumably the caller-supplied break iterator - confirm
33     // matchedIndex defaults to USEARCH_DONE.
34     // If matchedIndex is USEARCH_DONE while we are not at the start or the end
35     // of the text (whichever applies to the iteration direction), it means
36     // that no more matches can be found in that particular direction.
37           int32_t             matchedIndex;
38           int32_t             matchedLength; // NOTE(review): presumably the length of the match at matchedIndex - confirm
39           UBool               isForwardSearching; // NOTE(review): presumably TRUE while iterating forwards - confirm
40           UBool               reset; // NOTE(review): presumably set when the search position has been reset - confirm
41 };
42 
43 struct UPattern { // the pattern text plus data derived from it for matching
44     const UChar              *text; // pattern text
45           int32_t             textLength; // exact length of text
46           // CELength: length required for backwards ce comparison
47           int32_t             CELength;
48           int32_t            *CE; // NOTE(review): presumably points at CEBuffer or at larger storage when needed - confirm
49           int32_t             CEBuffer[INITIAL_ARRAY_SIZE_]; // fixed storage of INITIAL_ARRAY_SIZE_ 32-bit elements
50           int32_t             PCELength; // NOTE(review): presumably the element count of PCE - confirm
51           int64_t            *PCE; // NOTE(review): presumably points at PCEBuffer or at larger storage when needed - confirm
52           int64_t             PCEBuffer[INITIAL_ARRAY_SIZE_]; // fixed storage of INITIAL_ARRAY_SIZE_ 64-bit elements
53           UBool               hasPrefixAccents; // NOTE(review): presumably whether the pattern starts with accents - confirm
54           UBool               hasSuffixAccents; // NOTE(review): presumably whether the pattern ends with accents - confirm
55           int16_t             defaultShiftSize; // NOTE(review): presumably the shift used when no table entry applies - confirm
56           int16_t             shift[MAX_TABLE_SIZE_]; // NOTE(review): presumably the forward-search shift table - confirm
57           int16_t             backShift[MAX_TABLE_SIZE_]; // NOTE(review): presumably the backward-search shift table - confirm
58 };
59 
60 struct UStringSearch { // string search data handed to the usearch_handle* functions declared in this header
61     struct USearch            *search; // search state, held by pointer
62     struct UPattern            pattern; // pattern data, embedded by value
63     const  UCollator          *collator; // NOTE(review): ownership presumably indicated by ownCollator - confirm
64     const  U_NAMESPACE_QUALIFIER Normalizer2 *nfd; // NOTE(review): presumably the NFD normalizer instance - confirm
65     // positions within the collation element iterator are used to determine
66     // whether we are at the start of the text.
67            UCollationElements *textIter;
68     // utility collation element iterator, used throughout the program for
69     // temporary iteration.
70            UCollationElements *utilIter;
71            UBool               ownCollator; // NOTE(review): presumably TRUE when this object must close collator - confirm
72            UCollationStrength  strength; // NOTE(review): presumably cached from collator - confirm
73            uint32_t            ceMask; // NOTE(review): presumably the mask applied to collation elements before comparison - confirm
74            uint32_t            variableTop; // NOTE(review): presumably cached from collator - confirm
75            UBool               toShift; // NOTE(review): presumably whether variable elements are shifted - confirm
76            UChar               canonicalPrefixAccents[INITIAL_ARRAY_SIZE_]; // fixed buffer of INITIAL_ARRAY_SIZE_ UChars
77            UChar               canonicalSuffixAccents[INITIAL_ARRAY_SIZE_]; // fixed buffer of INITIAL_ARRAY_SIZE_ UChars
78 };
79 
80 /**
81 * Exact matches, without checking the ends for extra accents.
82 * The match after the position within the collation element iterator is to be
83 * found.
84 * After a match is found, the offset in the collation element iterator will be
85 * shifted to the start of the match.
86 * Implementation note:
87 * For tertiary we can't use the collator->tertiaryMask; that is a
88 * preprocessed mask that takes into account case options. Since we are only
89 * concerned with exact matches, we don't need that.
90 * Alternate handling - since only the 16 most significant digits are used,
91 * we can safely do a compare without masking; if the ce is a variable, we mask
92 * and get only the primary values. No shifting to quaternary is required since
93 * all primary values less than variabletop will need to be masked off anyway.
94 * If the end character is composite and the pattern ce does not match the text
95 * ce, we skip it until we find a match in the end composite character or until
96 * it has passed the character. This is so that we can match pattern "a" with
97 * the text "\u00e6".
98 * @param strsrch string search data
99 * @param status error status if any
100 * @return TRUE if an exact match is found, FALSE otherwise
101 */
102 U_CFUNC
103 UBool usearch_handleNextExact(UStringSearch *strsrch, UErrorCode *status);
104 
105 /**
106 * Canonical matches.
107 * According to the definition, a match found here will include the whole span
108 * of beginning and ending accents if it overlaps that region.
109 * @param strsrch string search data
110 * @param status error status if any
111 * @return TRUE if a canonical match is found, FALSE otherwise
112 */
113 U_CFUNC
114 UBool usearch_handleNextCanonical(UStringSearch *strsrch, UErrorCode *status);
115 
116 /**
117 * Gets the previous match.
118 * Comments follow from usearch_handleNextExact.
119 * @param strsrch string search data
120 * @param status error status if any
121 * @return TRUE if an exact match is found, FALSE otherwise.
122 */
123 U_CFUNC
124 UBool usearch_handlePreviousExact(UStringSearch *strsrch, UErrorCode *status);
125 
126 /**
127 * Canonical matches. NOTE(review): presumably searches backwards - confirm.
128 * According to the definition, a match found here will include the whole span
129 * of beginning and ending accents if it overlaps that region.
130 * @param strsrch string search data
131 * @param status error status if any
132 * @return TRUE if a canonical match is found, FALSE otherwise
133 */
134 U_CFUNC
135 UBool usearch_handlePreviousCanonical(UStringSearch *strsrch,
136                                       UErrorCode    *status);
137 
138 #endif /* #if !UCONFIG_NO_COLLATION */
139 
140 #endif
141