1 // © 2017 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 4 // norms.h 5 // created: 2017jun04 Markus W. Scherer 6 // (pulled out of n2builder.cpp) 7 8 // Storing & manipulating Normalizer2 builder data. 9 10 #ifndef __NORMS_H__ 11 #define __NORMS_H__ 12 13 #include "unicode/utypes.h" 14 15 #if !UCONFIG_NO_NORMALIZATION 16 17 #include "unicode/errorcode.h" 18 #include "unicode/umutablecptrie.h" 19 #include "unicode/uniset.h" 20 #include "unicode/unistr.h" 21 #include "unicode/utf16.h" 22 #include "normalizer2impl.h" 23 #include "toolutil.h" 24 #include "uvectr32.h" 25 26 U_NAMESPACE_BEGIN 27 28 class BuilderReorderingBuffer { 29 public: BuilderReorderingBuffer()30 BuilderReorderingBuffer() : fLength(0), fLastStarterIndex(-1), fDidReorder(false) {} reset()31 void reset() { 32 fLength=0; 33 fLastStarterIndex=-1; 34 fDidReorder=false; 35 } length()36 int32_t length() const { return fLength; } isEmpty()37 UBool isEmpty() const { return fLength==0; } lastStarterIndex()38 int32_t lastStarterIndex() const { return fLastStarterIndex; } charAt(int32_t i)39 UChar32 charAt(int32_t i) const { return fArray[i]>>8; } ccAt(int32_t i)40 uint8_t ccAt(int32_t i) const { return static_cast<uint8_t>(fArray[i]); } didReorder()41 UBool didReorder() const { return fDidReorder; } 42 43 void append(UChar32 c, uint8_t cc); 44 void toString(UnicodeString &dest) const; 45 46 private: 47 int32_t fArray[Normalizer2Impl::MAPPING_LENGTH_MASK]; 48 int32_t fLength; 49 int32_t fLastStarterIndex; 50 UBool fDidReorder; 51 }; 52 53 struct CompositionPair { CompositionPairCompositionPair54 CompositionPair(UChar32 t, UChar32 c) : trail(t), composite(c) {} 55 UChar32 trail, composite; 56 }; 57 58 struct Norm { 59 enum MappingType { NONE, REMOVED, ROUND_TRIP, ONE_WAY }; 60 hasMappingNorm61 UBool hasMapping() const { return mappingType>REMOVED; } 62 63 // Requires hasMapping() and well-formed mapping. setMappingCPNorm64 void setMappingCP() { 65 UChar32 c; 66 if(!mapping->isEmpty() && mapping->length()==U16_LENGTH(c=mapping->char32At(0))) { 67 mappingCP=c; 68 } else { 69 mappingCP=U_SENTINEL; 70 } 71 } 72 combinesFwdNorm73 bool combinesFwd() const { return compositions!=nullptr; } getCompositionPairsNorm74 const CompositionPair *getCompositionPairs(int32_t &length) const { 75 if(compositions==nullptr) { 76 length=0; 77 return nullptr; 78 } else { 79 length=compositions->size()/2; 80 return reinterpret_cast<const CompositionPair *>(compositions->getBuffer()); 81 } 82 } 83 UChar32 combine(UChar32 trail) const; 84 85 UnicodeString *mapping; 86 UnicodeString *rawMapping; // non-nullptr if the mapping is further decomposed 87 UChar32 mappingCP; // >=0 if mapping to 1 code point 88 int32_t mappingPhase; 89 MappingType mappingType; 90 91 UVector32 *compositions; // (trail, composite) pairs 92 uint8_t cc, leadCC, trailCC; 93 UBool combinesBack; 94 UBool hasCompBoundaryBefore, hasCompBoundaryAfter; 95 96 /** 97 * Overall type of normalization properties. 98 * Set after most processing is done. 99 * 100 * Corresponds to the rows in the chart on 101 * https://unicode-org.github.io/icu/design/normalization/custom.html 102 * in numerical (but reverse visual) order. 103 * 104 * YES_NO means composition quick check=yes, decomposition QC=no -- etc. 105 */ 106 enum Type { 107 /** Initial value until most processing is done. */ 108 UNKNOWN, 109 /** No mapping, does not combine, ccc=0. */ 110 INERT, 111 /** Starter, no mapping, has compositions. */ 112 YES_YES_COMBINES_FWD, 113 /** Starter with a round-trip mapping and compositions. */ 114 YES_NO_COMBINES_FWD, 115 /** Starter with a round-trip mapping but no compositions. */ 116 YES_NO_MAPPING_ONLY, 117 /** Has a one-way mapping which is comp-normalized. */ 118 NO_NO_COMP_YES, 119 /** Has a one-way mapping which is not comp-normalized but has a comp boundary before. */ 120 NO_NO_COMP_BOUNDARY_BEFORE, 121 /** Has a one-way mapping which does not have a comp boundary before. */ 122 NO_NO_COMP_NO_MAYBE_CC, 123 /** Has a one-way mapping to the empty string. */ 124 NO_NO_EMPTY, 125 /** Has an algorithmic one-way mapping to a single code point. */ 126 NO_NO_DELTA, 127 /** Has a two-way mapping which starts with a character that combines backward. */ 128 MAYBE_NO_MAPPING_ONLY, 129 /** 130 * Has a two-way mapping which starts with a character that combines backward. 131 * Also combines forward. 132 */ 133 MAYBE_NO_COMBINES_FWD, 134 /** Combines both backward and forward, has compositions. */ 135 MAYBE_YES_COMBINES_FWD, 136 /** Combines only backward. */ 137 MAYBE_YES_SIMPLE, 138 /** Non-zero ccc but does not combine backward. */ 139 YES_YES_WITH_CC 140 } type; 141 /** Offset into the type's part of the extra data, or the algorithmic-mapping delta. */ 142 int32_t offset; 143 144 /** 145 * Error string set by processing functions that do not have access 146 * to the code point, deferred for readable reporting. 147 */ 148 const char *error; 149 }; 150 151 class Norms { 152 public: 153 Norms(UErrorCode &errorCode); 154 ~Norms(); 155 length()156 int32_t length() const { return utm_countItems(normMem); } getNormRefByIndex(int32_t i)157 const Norm &getNormRefByIndex(int32_t i) const { return norms[i]; } getNormRefByIndex(int32_t i)158 Norm &getNormRefByIndex(int32_t i) { return norms[i]; } 159 160 Norm *allocNorm(); 161 /** Returns an existing Norm unit, or nullptr if c has no data. */ 162 Norm *getNorm(UChar32 c); 163 const Norm *getNorm(UChar32 c) const; 164 /** Returns a Norm unit, creating a new one if necessary. */ 165 Norm *createNorm(UChar32 c); 166 /** Returns an existing Norm unit, or an immutable empty object if c has no data. */ 167 const Norm &getNormRef(UChar32 c) const; getCC(UChar32 c)168 uint8_t getCC(UChar32 c) const { return getNormRef(c).cc; } combinesBack(UChar32 c)169 UBool combinesBack(UChar32 c) const { 170 return Hangul::isJamoV(c) || Hangul::isJamoT(c) || getNormRef(c).combinesBack; 171 } 172 173 void reorder(UnicodeString &mapping, BuilderReorderingBuffer &buffer) const; 174 175 // int32_t highCC not uint8_t so that we can pass in 256 as the upper limit. 176 UBool combinesWithCCBetween(const Norm &norm, uint8_t lowCC, int32_t highCC) const; 177 178 class Enumerator { 179 public: Enumerator(Norms & n)180 Enumerator(Norms &n) : norms(n) {} 181 virtual ~Enumerator(); 182 /** Called for enumerated value!=0. */ 183 virtual void rangeHandler(UChar32 start, UChar32 end, Norm &norm) = 0; 184 protected: 185 Norms &norms; 186 }; 187 188 void enumRanges(Enumerator &e); 189 190 UnicodeSet ccSet, mappingSet; 191 192 private: 193 Norms(const Norms &other) = delete; 194 Norms &operator=(const Norms &other) = delete; 195 196 UMutableCPTrie *normTrie; 197 UToolMemory *normMem; 198 Norm *norms; 199 }; 200 201 class CompositionBuilder : public Norms::Enumerator { 202 public: CompositionBuilder(Norms & n)203 CompositionBuilder(Norms &n) : Norms::Enumerator(n) {} 204 /** Adds a composition mapping for the first character in a round-trip mapping. */ 205 void rangeHandler(UChar32 start, UChar32 end, Norm &norm) override; 206 }; 207 208 class Decomposer : public Norms::Enumerator { 209 public: Decomposer(Norms & n)210 Decomposer(Norms &n) : Norms::Enumerator(n), didDecompose(false) {} 211 /** Decomposes each character of the current mapping. Sets didDecompose if any. */ 212 void rangeHandler(UChar32 start, UChar32 end, Norm &norm) override; 213 UBool didDecompose; 214 }; 215 216 U_NAMESPACE_END 217 218 #endif // #if !UCONFIG_NO_NORMALIZATION 219 220 #endif // __NORMS_H__ 221