• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 *   Copyright (C) 2001-2015 IBM and others. All rights reserved.
6 **********************************************************************
7 *   Date        Name        Description
8 *  08/13/2001   synwee      Creation.
9 **********************************************************************
10 */
11 #ifndef USRCHIMP_H
12 #define USRCHIMP_H
13 
14 #include "unicode/utypes.h"
15 
16 #if !UCONFIG_NO_COLLATION
17 
18 #include "unicode/normalizer2.h"
19 #include "unicode/ucol.h"
20 #include "unicode/ucoleitr.h"
21 #include "unicode/ubrk.h"
22 
23 /* mask off anything but primary order (bits 16-31 of the CE) */
24 #define UCOL_PRIMARYORDERMASK 0xffff0000
25 /* mask off anything but secondary order (bits 8-15 of the CE) */
26 #define UCOL_SECONDARYORDERMASK 0x0000ff00
27 /* mask off anything but tertiary order (bits 0-7 of the CE) */
28 #define UCOL_TERTIARYORDERMASK 0x000000ff
29 /* primary order shift: right-shift count that brings the primary order down to bit 0 */
30 #define UCOL_PRIMARYORDERSHIFT 16
31 /* secondary order shift: right-shift count that brings the secondary order down to bit 0 */
32 #define UCOL_SECONDARYORDERSHIFT 8
33 /* NOTE(review): presumably the CE/weight value that is treated as ignorable -- confirm against users */
34 #define UCOL_IGNORABLE 0
35 
36 /* get weights from a CE; each macro yields the selected weight right-aligned at bit 0 */
37 #define UCOL_PRIMARYORDER(order) (((order) >> 16) & 0xffff)
38 #define UCOL_SECONDARYORDER(order) (((order) & UCOL_SECONDARYORDERMASK)>> UCOL_SECONDARYORDERSHIFT)
39 #define UCOL_TERTIARYORDER(order) ((order) & UCOL_TERTIARYORDERMASK)
40 /* bit pattern tested by isContinuation() */
41 #define UCOL_CONTINUATION_MARKER 0xC0
42 /* nonzero when both bits of UCOL_CONTINUATION_MARKER are set in CE; CE is evaluated once */
43 #define isContinuation(CE) (((CE) & UCOL_CONTINUATION_MARKER) == UCOL_CONTINUATION_MARKER)
44 
45 /**
46  * This indicates an error has occurred during processing or there are no more CEs
47  * to be returned. Defined as U_INT64_MAX cast to int64_t.
48  */
49 #define UCOL_PROCESSED_NULLORDER        ((int64_t)U_INT64_MAX)
50 
51 U_NAMESPACE_BEGIN
52 
53 class CollationElementIterator;
54 class Collator;
55 
56 struct PCEI  // one entry stored in a PCEBuffer: a 64-bit CE plus two int32_t indices
57 {
58     uint64_t ce;    // 64-bit collation element value
59     int32_t  low;   // NOTE(review): presumably the ixLow passed to PCEBuffer::put() -- confirm
60     int32_t  high;  // NOTE(review): presumably the ixHigh passed to PCEBuffer::put() -- confirm
61 };
62 
63 struct PCEBuffer  // buffer of PCEI entries with 16 entries of inline storage
64 {
65     PCEI    defaultBuffer[16];  // inline storage for 16 PCEI entries
66     PCEI   *buffer;             // NOTE(review): presumably points at defaultBuffer or at larger storage -- confirm
67     int32_t bufferIndex;        // NOTE(review): presumably the number of entries currently held -- confirm
68     int32_t bufferSize;         // NOTE(review): presumably the capacity of 'buffer' in entries -- confirm
69 
70     PCEBuffer();
71     ~PCEBuffer();
72 
73     void  reset();
74     UBool isEmpty() const;
75     void  put(uint64_t ce, int32_t ixLow, int32_t ixHigh, UErrorCode &errorCode);  // takes the three values of one PCEI; errorCode is an in/out reference
76     const PCEI *get();  // NOTE(review): return value for an empty buffer is not visible here -- check the implementation
77 };
78 
79 class UCollationPCE : public UMemory {  // holds a CollationElementIterator pointer and returns int64_t "processed" orderings
80 private:
81     PCEBuffer          pceBuffer;   // NOTE(review): presumably the internal buffer mentioned in the previousProcessed() doc -- confirm
82     CollationElementIterator *cei;  // NOTE(review): presumably set from the constructor / init() argument; ownership not visible here
83     UCollationStrength strength;    // NOTE(review): presumably captured from the collator in init(const Collator &) -- confirm
84     UBool              toShift;     // NOTE(review): presumably reflects the collator's alternate (shifted) handling -- confirm
85     UBool              isShifted;   // NOTE(review): presumably state carried between processCE() calls -- confirm
86     uint32_t           variableTop; // NOTE(review): presumably the collator's variable top -- confirm
87 
88 public:
89     UCollationPCE(UCollationElements *elems);
90     UCollationPCE(CollationElementIterator *iter);
91     ~UCollationPCE();
92 
93     void init(UCollationElements *elems);
94     void init(CollationElementIterator *iter);
95 
96     /**
97      * Get the processed ordering priority of the next collation element in the text.
98      * A single character may contain more than one collation element.
99      *
100      * @param ixLow A pointer to an int32_t to receive the iterator index before fetching the CE.
101      * @param ixHigh A pointer to an int32_t to receive the iterator index after fetching the CE.
102      * @param status A pointer to a UErrorCode to receive any errors.
103      * @return The next collation element's ordering, otherwise returns UCOL_PROCESSED_NULLORDER
104      *         if an error has occurred or if the end of string has been reached
105      */
106     int64_t nextProcessed(int32_t *ixLow, int32_t *ixHigh, UErrorCode *status);
107     /**
108      * Get the processed ordering priority of the previous collation element in the text.
109      * A single character may contain more than one collation element.
110      *
111      * @param ixLow A pointer to an int32_t to receive the iterator index after fetching the CE
112      * @param ixHigh A pointer to an int32_t to receive the iterator index before fetching the CE
113      * @param status A pointer to a UErrorCode to receive any errors. Notably
114      *               a U_BUFFER_OVERFLOW_ERROR is returned if the internal stack
115      *               buffer has been exhausted.
116      * @return The previous collation element's ordering, otherwise returns
117      *         UCOL_PROCESSED_NULLORDER if an error has occurred or if the start of
118      *         string has been reached.
119      */
120     int64_t previousProcessed(int32_t *ixLow, int32_t *ixHigh, UErrorCode *status);
121 
122 private:
123     void init(const Collator &coll);
124     uint64_t processCE(uint32_t ce);  // takes a 32-bit CE and returns a 64-bit value; the mapping lives in the implementation
125 };
126 
127 U_NAMESPACE_END
128 
129 #define INITIAL_ARRAY_SIZE_       256  /* element count of UPattern::cesBuffer and UPattern::pcesBuffer */
130 
131 struct USearch {  // text pointer/length, search options, break iterators, and match index/length
132     // required since collation element iterator does not have a getText API
133     const UChar              *text;
134           int32_t             textLength; // exact length
135           UBool               isOverlap;             // NOTE(review): presumably the "overlapping matches" option -- confirm
136           UBool               isCanonicalMatch;      // NOTE(review): presumably selects canonical over exact matching -- confirm
137           int16_t             elementComparisonType; // NOTE(review): meaning of the values is not visible in this header
138           UBreakIterator     *internalBreakIter;  // internal character breakiterator, lazily created.
139           UBreakIterator     *breakIter;          // caller provided character breakiterator
140     // value USEARCH_DONE is the default value
141     // if we are not at the start of the text or the end of the text,
142     // depending on the iteration direction and matchedIndex is USEARCH_DONE
143     // it means that we can't find any more matches in that particular direction
144           int32_t             matchedIndex;
145           int32_t             matchedLength;         // NOTE(review): presumably the length of the match at matchedIndex -- confirm
146           UBool               isForwardSearching;    // NOTE(review): presumably the current iteration direction -- confirm
147           UBool               reset;                 // NOTE(review): semantics not visible in this header
148 };
149 
150 struct UPattern {  // pattern text plus two CE arrays (int32_t and int64_t), each with inline storage
151     const UChar              *text;
152           int32_t             textLength; // exact length
153           // length required for backwards ce comparison
154           int32_t             cesLength;
155           int32_t            *ces;        // NOTE(review): presumably points at cesBuffer or at larger storage -- confirm
156           int32_t             cesBuffer[INITIAL_ARRAY_SIZE_];   // inline storage for INITIAL_ARRAY_SIZE_ int32_t elements
157           int32_t             pcesLength; // NOTE(review): presumably the number of elements used in pces -- confirm
158           int64_t            *pces;       // NOTE(review): presumably points at pcesBuffer or at larger storage -- confirm
159           int64_t             pcesBuffer[INITIAL_ARRAY_SIZE_];  // inline storage for INITIAL_ARRAY_SIZE_ int64_t elements
160           UBool               hasPrefixAccents;  // NOTE(review): presumably set when the pattern starts with accents -- confirm
161           UBool               hasSuffixAccents;  // NOTE(review): presumably set when the pattern ends with accents -- confirm
162 };
163 
164 struct UStringSearch {  // aggregates the search state, the pattern, the collator and its iterators
165     struct USearch            *search;
166     struct UPattern            pattern;   // stored by value, unlike 'search'
167     const  UCollator          *collator;
168     const  icu::Normalizer2   *nfd;       // NOTE(review): presumably the NFD normalizer instance -- confirm
169     // positions within the collation element iterator are used to determine
170     // if we are at the start of the text.
171            UCollationElements *textIter;
172            icu::UCollationPCE *textProcessedIter;
173     // utility collation element iterator, used throughout the program for
174     // temporary iteration.
175            UCollationElements *utilIter;
176            UBool               ownCollator;  // NOTE(review): presumably true when this object owns 'collator' -- confirm
177            UCollationStrength  strength;
178            uint32_t            ceMask;       // NOTE(review): presumably a CE mask derived from 'strength' -- confirm
179            uint32_t            variableTop;
180            UBool               toShift;
181 };
182 
183 /**
184 * Exact matches without checking the ends for extra accents.
185 * The match after the position within the collation element iterator is to be
186 * found.
187 * After a match is found, the offset in the collation element iterator will be
188 * shifted to the start of the match.
189 * Implementation note:
190 * For tertiary we can't use the collator->tertiaryMask; that is a
191 * preprocessed mask that takes into account case options. Since we are only
192 * concerned with exact matches, we don't need that.
193 * Alternate handling - since only the 16 most significant bits are used,
194 * we can safely do a compare without masking. If the ce is a variable, we mask
195 * and get only the primary values; no shifting to quaternary is required since
196 * all primary values less than variabletop will need to be masked off anyway.
197 * If the end character is composite and the pattern ce does not match the text
198 * ce, we skip it until we find a match in the end composite character or until
199 * we have passed the character. This is so that we can match pattern "a" with
200 * the text "\u00e6".
201 * @param strsrch string search data
202 * @param status error status if any
203 * @return true if an exact match is found, false otherwise
204 */
205 U_CFUNC
206 UBool usearch_handleNextExact(UStringSearch *strsrch, UErrorCode *status);
207 
208 /**
209 * Canonical matches, for the "next" search direction.
210 * According to the definition, matches found here will include the whole span
211 * of beginning and ending accents if the match overlaps that region.
212 * @param strsrch string search data
213 * @param status error status if any
214 * @return true if a canonical match is found, false otherwise
215 */
216 U_CFUNC
217 UBool usearch_handleNextCanonical(UStringSearch *strsrch, UErrorCode *status);
218 
219 /**
220 * Gets the previous match.
221 * Comments follow from usearch_handleNextExact.
222 * @param strsrch string search data
223 * @param status error status if any
224 * @return true if an exact match is found, false otherwise
225 */
226 U_CFUNC
227 UBool usearch_handlePreviousExact(UStringSearch *strsrch, UErrorCode *status);
228 
229 /**
230 * Canonical matches, for the "previous" search direction.
231 * According to the definition, matches found here will include the whole span
232 * of beginning and ending accents if the match overlaps that region.
233 * @param strsrch string search data
234 * @param status error status if any
235 * @return true if a canonical match is found, false otherwise
236 */
237 U_CFUNC
238 UBool usearch_handlePreviousCanonical(UStringSearch *strsrch,
239                                       UErrorCode    *status);
240 
241 #endif /* #if !UCONFIG_NO_COLLATION */
242 
243 #endif
244