// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
**********************************************************************
*   Copyright (C) 2001-2015 IBM and others. All rights reserved.
**********************************************************************
*   Date        Name        Description
*  08/13/2001   synwee      Creation.
**********************************************************************
*/
#ifndef USRCHIMP_H
#define USRCHIMP_H

#include "unicode/utypes.h"

#if !UCONFIG_NO_COLLATION

#include "unicode/normalizer2.h"
#include "unicode/ucol.h"
#include "unicode/ucoleitr.h"
#include "unicode/ubrk.h"

/*
 * Bit layout of a 32-bit collation element (CE) as used by the macros below:
 *   bits 31..16  primary weight
 *   bits 15..8   secondary weight
 *   bits  7..0   tertiary weight
 */

/* mask off anything but primary order */
#define UCOL_PRIMARYORDERMASK 0xffff0000
/* mask off anything but secondary order */
#define UCOL_SECONDARYORDERMASK 0x0000ff00
/* mask off anything but tertiary order */
#define UCOL_TERTIARYORDERMASK 0x000000ff
/* primary order shift */
#define UCOL_PRIMARYORDERSHIFT 16
/* secondary order shift */
#define UCOL_SECONDARYORDERSHIFT 8

/* CE value 0; NOTE(review): presumably marks a completely ignorable CE - confirm against callers */
#define UCOL_IGNORABLE 0

/* get weights from a CE; each macro yields the weight shifted down to bit 0 */
#define UCOL_PRIMARYORDER(order) (((order) >> 16) & 0xffff)
#define UCOL_SECONDARYORDER(order) (((order) & UCOL_SECONDARYORDERMASK)>> UCOL_SECONDARYORDERSHIFT)
#define UCOL_TERTIARYORDER(order) ((order) & UCOL_TERTIARYORDERMASK)

/* bit pattern tested by isContinuation() */
#define UCOL_CONTINUATION_MARKER 0xC0

/* true when every bit of UCOL_CONTINUATION_MARKER is set in CE */
#define isContinuation(CE) (((CE) & UCOL_CONTINUATION_MARKER) == UCOL_CONTINUATION_MARKER)

/**
 * This indicates an error has occurred during processing or there are no more CEs
 * to be returned.
 */
#define UCOL_PROCESSED_NULLORDER ((int64_t)U_INT64_MAX)

U_NAMESPACE_BEGIN

class CollationElementIterator;
class Collator;

/**
 * One entry held by a PCEBuffer: a 64-bit CE value plus a low and a high
 * index. The fields mirror the ce / ixLow / ixHigh arguments of
 * PCEBuffer::put().
 * NOTE(review): "PCE" presumably stands for "processed collation element" - confirm.
 */
struct PCEI
{
    uint64_t ce;
    int32_t low;
    int32_t high;
};

/**
 * Buffer of PCEI entries with inline storage for 16 entries (defaultBuffer).
 * NOTE(review): buffer presumably points at defaultBuffer until more room is
 * needed; the growth policy lives in the implementation file - confirm there.
 * NOTE(review): a destructor is declared and a raw pointer is held, but no
 * copy constructor or copy assignment is declared here - confirm that
 * instances are never copied.
 */
struct PCEBuffer
{
    PCEI defaultBuffer[16];
    PCEI *buffer;
    int32_t bufferIndex;
    int32_t bufferSize;

    PCEBuffer();
    ~PCEBuffer();

    void reset();
    UBool isEmpty() const;
    void put(uint64_t ce, int32_t ixLow, int32_t ixHigh, UErrorCode &errorCode);
    const PCEI *get();
};

/**
 * Wraps a collation element iterator and hands out 64-bit "processed"
 * collation elements in either direction (see nextProcessed() and
 * previousProcessed()).
 */
class UCollationPCE : public UMemory {
private:
    PCEBuffer pceBuffer;
    CollationElementIterator *cei;
    UCollationStrength strength;
    UBool toShift;
    UBool isShifted;
    uint32_t variableTop;

public:
    UCollationPCE(UCollationElements *elems);
    UCollationPCE(CollationElementIterator *iter);
    ~UCollationPCE();

    void init(UCollationElements *elems);
    void init(CollationElementIterator *iter);

    /**
     * Get the processed ordering priority of the next collation element in the text.
     * A single character may contain more than one collation element.
     *
     * @param ixLow A pointer to an int32_t to receive the iterator index before fetching the CE.
     * @param ixHigh A pointer to an int32_t to receive the iterator index after fetching the CE.
     * @param status A pointer to an UErrorCode to receive any errors.
     * @return The next collation element's ordering, otherwise returns UCOL_PROCESSED_NULLORDER
     *         if an error has occurred or if the end of string has been reached
     */
    int64_t nextProcessed(int32_t *ixLow, int32_t *ixHigh, UErrorCode *status);
    /**
     * Get the processed ordering priority of the previous collation element in the text.
     * A single character may contain more than one collation element.
     *
     * @param ixLow A pointer to an int32_t to receive the iterator index after fetching the CE
     * @param ixHigh A pointer to an int32_t to receive the iterator index before fetching the CE
     * @param status A pointer to an UErrorCode to receive any errors. Notably
     *               a U_BUFFER_OVERFLOW_ERROR is returned if the internal stack
     *               buffer has been exhausted.
     * @return The previous collation element's ordering, otherwise returns
     *         UCOL_PROCESSED_NULLORDER if an error has occurred or if the start of
     *         string has been reached.
     */
    int64_t previousProcessed(int32_t *ixLow, int32_t *ixHigh, UErrorCode *status);

private:
    void init(const Collator &coll);
    uint64_t processCE(uint32_t ce);
};

U_NAMESPACE_END

/* element count of the fixed-size cesBuffer and pcesBuffer arrays in UPattern */
#define INITIAL_ARRAY_SIZE_ 256

/**
 * State of the text being searched and of the most recent match.
 */
struct USearch {
    // required since collation element iterator does not have a getText API
    const UChar *text;
    int32_t textLength; // exact length
    UBool isOverlap;
    UBool isCanonicalMatch;
    int16_t elementComparisonType;
    UBreakIterator *internalBreakIter; // internal character breakiterator, lazily created.
    UBreakIterator *breakIter; // caller provided character breakiterator
    // value USEARCH_DONE is the default value
    // if we are not at the start of the text or the end of the text,
    // depending on the iteration direction and matchedIndex is USEARCH_DONE
    // it means that we can't find any more matches in that particular direction
    int32_t matchedIndex;
    int32_t matchedLength;
    UBool isForwardSearching;
    UBool reset;
};

/**
 * The search pattern together with its collation elements, kept both as
 * 32-bit values (ces) and as 64-bit values (pces). Each form has a pointer, a
 * length, and a fixed array of INITIAL_ARRAY_SIZE_ elements.
 * NOTE(review): ces / pces presumably point at the fixed arrays unless the
 * pattern needs more room - confirm against the implementation.
 */
struct UPattern {
    const UChar *text;
    int32_t textLength; // exact length
    // length required for backwards ce comparison
    int32_t cesLength;
    int32_t *ces;
    int32_t cesBuffer[INITIAL_ARRAY_SIZE_];
    int32_t pcesLength;
    int64_t *pces;
    int64_t pcesBuffer[INITIAL_ARRAY_SIZE_];
    UBool hasPrefixAccents;
    UBool hasSuffixAccents;
};

/**
 * Complete state of one string search: the text/match state, the pattern,
 * the collator and its settings, and the iterators used while matching.
 */
struct UStringSearch {
    struct USearch *search;
    struct UPattern pattern;
    const UCollator *collator;
    const icu::Normalizer2 *nfd;
    // positions within the collation element iterator are used to determine
    // if we are at the start of the text.
    UCollationElements *textIter;
    icu::UCollationPCE *textProcessedIter;
    // utility collation element iterator, used throughout the program for
    // temporary iteration.
    UCollationElements *utilIter;
    UBool ownCollator;
    UCollationStrength strength;
    uint32_t ceMask;
    uint32_t variableTop;
    UBool toShift;
};

/**
 * Exact matches without checking for the ends for extra accents.
 * The match after the position within the collation element iterator is to be
 * found.
 * After a match is found the offset in the collation element iterator will be
 * shifted to the start of the match.
 * Implementation note:
 * For tertiary we can't use the collator->tertiaryMask, that is a
 * preprocessed mask that takes into account case options. Since we are only
 * concerned with exact matches, we don't need that.
 * Alternate handling - since only the 16 most significant bits are used,
 * we can safely do a compare without masking. If the ce is a variable, we mask
 * and get only the primary values; no shifting to quaternary is required since
 * all primary values less than variabletop will need to be masked off anyway.
 * If the end character is composite and the pattern ce does not match the text
 * ce, we skip it until we find a match in the end composite character or when
 * it has passed the character. This is so that we can match pattern "a" with
 * the text "\u00e6"
 * @param strsrch string search data
 * @param status error status if any
 * @return true if an exact match is found, false otherwise
 */
U_CFUNC
UBool usearch_handleNextExact(UStringSearch *strsrch, UErrorCode *status);

/**
 * Canonical matches.
 * According to the definition, matches found here will include the whole span
 * of beginning and ending accents if it overlaps that region.
 * @param strsrch string search data
 * @param status error status if any
 * @return true if a canonical match is found, false otherwise
 */
U_CFUNC
UBool usearch_handleNextCanonical(UStringSearch *strsrch, UErrorCode *status);

/**
 * Gets the previous match.
 * Comments follow from usearch_handleNextExact.
 * @param strsrch string search data
 * @param status error status if any
 * @return true if an exact match is found, false otherwise.
 */
U_CFUNC
UBool usearch_handlePreviousExact(UStringSearch *strsrch, UErrorCode *status);

/**
 * Canonical matches.
 * According to the definition, matches found here will include the whole span
 * of beginning and ending accents if it overlaps that region.
 * @param strsrch string search data
 * @param status error status if any
 * @return true if a canonical match is found, false otherwise
 */
U_CFUNC
UBool usearch_handlePreviousCanonical(UStringSearch *strsrch,
                                      UErrorCode *status);

#endif /* #if !UCONFIG_NO_COLLATION */

#endif /* USRCHIMP_H */