1 // Copyright (C) 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * Copyright (C) 2012-2016, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ******************************************************************************* 8 * utf8collationiterator.h 9 * 10 * created on: 2012nov12 (from utf16collationiterator.h & uitercollationiterator.h) 11 * created by: Markus W. Scherer 12 */ 13 14 #ifndef __UTF8COLLATIONITERATOR_H__ 15 #define __UTF8COLLATIONITERATOR_H__ 16 17 #include "unicode/utypes.h" 18 19 #if !UCONFIG_NO_COLLATION 20 21 #include "cmemory.h" 22 #include "collation.h" 23 #include "collationdata.h" 24 #include "collationiterator.h" 25 #include "normalizer2impl.h" 26 27 U_NAMESPACE_BEGIN 28 29 /** 30 * UTF-8 collation element and character iterator. 31 * Handles normalized UTF-8 text inline, with length or NUL-terminated. 32 * Unnormalized text is handled by a subclass. 33 */ 34 class U_I18N_API UTF8CollationIterator : public CollationIterator { 35 public: UTF8CollationIterator(const CollationData * d,UBool numeric,const uint8_t * s,int32_t p,int32_t len)36 UTF8CollationIterator(const CollationData *d, UBool numeric, 37 const uint8_t *s, int32_t p, int32_t len) 38 : CollationIterator(d, numeric), 39 u8(s), pos(p), length(len) {} 40 41 virtual ~UTF8CollationIterator(); 42 43 virtual void resetToOffset(int32_t newOffset); 44 45 virtual int32_t getOffset() const; 46 47 virtual UChar32 nextCodePoint(UErrorCode &errorCode); 48 49 virtual UChar32 previousCodePoint(UErrorCode &errorCode); 50 51 protected: 52 /** 53 * For byte sequences that are illegal in UTF-8, an error value may be returned 54 * together with a bogus code point. The caller will ignore that code point. 55 * 56 * Special values may be returned for surrogate code points, which are also illegal in UTF-8, 57 * but the caller will treat them like U+FFFD because forbidSurrogateCodePoints() returns TRUE. 58 * 59 * Valid lead surrogates are returned from inside a normalized text segment, 60 * where handleGetTrailSurrogate() will return the matching trail surrogate. 61 */ 62 virtual uint32_t handleNextCE32(UChar32 &c, UErrorCode &errorCode); 63 64 virtual UBool foundNULTerminator(); 65 66 virtual UBool forbidSurrogateCodePoints() const; 67 68 virtual void forwardNumCodePoints(int32_t num, UErrorCode &errorCode); 69 70 virtual void backwardNumCodePoints(int32_t num, UErrorCode &errorCode); 71 72 const uint8_t *u8; 73 int32_t pos; 74 int32_t length; // <0 for NUL-terminated strings 75 }; 76 77 /** 78 * Incrementally checks the input text for FCD and normalizes where necessary. 79 */ 80 class U_I18N_API FCDUTF8CollationIterator : public UTF8CollationIterator { 81 public: FCDUTF8CollationIterator(const CollationData * data,UBool numeric,const uint8_t * s,int32_t p,int32_t len)82 FCDUTF8CollationIterator(const CollationData *data, UBool numeric, 83 const uint8_t *s, int32_t p, int32_t len) 84 : UTF8CollationIterator(data, numeric, s, p, len), 85 state(CHECK_FWD), start(p), 86 nfcImpl(data->nfcImpl) {} 87 88 virtual ~FCDUTF8CollationIterator(); 89 90 virtual void resetToOffset(int32_t newOffset); 91 92 virtual int32_t getOffset() const; 93 94 virtual UChar32 nextCodePoint(UErrorCode &errorCode); 95 96 virtual UChar32 previousCodePoint(UErrorCode &errorCode); 97 98 protected: 99 virtual uint32_t handleNextCE32(UChar32 &c, UErrorCode &errorCode); 100 101 virtual UChar handleGetTrailSurrogate(); 102 103 virtual UBool foundNULTerminator(); 104 105 virtual void forwardNumCodePoints(int32_t num, UErrorCode &errorCode); 106 107 virtual void backwardNumCodePoints(int32_t num, UErrorCode &errorCode); 108 109 private: 110 UBool nextHasLccc() const; 111 UBool previousHasTccc() const; 112 113 /** 114 * Switches to forward checking if possible. 115 */ 116 void switchToForward(); 117 118 /** 119 * Extends the FCD text segment forward or normalizes around pos. 120 * @return TRUE if success 121 */ 122 UBool nextSegment(UErrorCode &errorCode); 123 124 /** 125 * Switches to backward checking. 126 */ 127 void switchToBackward(); 128 129 /** 130 * Extends the FCD text segment backward or normalizes around pos. 131 * @return TRUE if success 132 */ 133 UBool previousSegment(UErrorCode &errorCode); 134 135 UBool normalize(const UnicodeString &s, UErrorCode &errorCode); 136 137 enum State { 138 /** 139 * The input text [start..pos[ passes the FCD check. 140 * Moving forward checks incrementally. 141 * limit is undefined. 142 */ 143 CHECK_FWD, 144 /** 145 * The input text [pos..limit[ passes the FCD check. 146 * Moving backward checks incrementally. 147 * start is undefined. 148 */ 149 CHECK_BWD, 150 /** 151 * The input text [start..limit[ passes the FCD check. 152 * pos tracks the current text index. 153 */ 154 IN_FCD_SEGMENT, 155 /** 156 * The input text [start..limit[ failed the FCD check and was normalized. 157 * pos tracks the current index in the normalized string. 158 */ 159 IN_NORMALIZED 160 }; 161 162 State state; 163 164 int32_t start; 165 int32_t limit; 166 167 const Normalizer2Impl &nfcImpl; 168 UnicodeString normalized; 169 }; 170 171 U_NAMESPACE_END 172 173 #endif // !UCONFIG_NO_COLLATION 174 #endif // __UTF8COLLATIONITERATOR_H__ 175