1 // © 2017 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 4 // norms.h 5 // created: 2017jun04 Markus W. Scherer 6 // (pulled out of n2builder.cpp) 7 8 // Storing & manipulating Normalizer2 builder data. 9 10 #ifndef __NORMS_H__ 11 #define __NORMS_H__ 12 13 #include "unicode/utypes.h" 14 15 #if !UCONFIG_NO_NORMALIZATION 16 17 #include "unicode/errorcode.h" 18 #include "unicode/umutablecptrie.h" 19 #include "unicode/uniset.h" 20 #include "unicode/unistr.h" 21 #include "unicode/utf16.h" 22 #include "normalizer2impl.h" 23 #include "toolutil.h" 24 #include "uvectr32.h" 25 26 U_NAMESPACE_BEGIN 27 28 class BuilderReorderingBuffer { 29 public: BuilderReorderingBuffer()30 BuilderReorderingBuffer() : fLength(0), fLastStarterIndex(-1), fDidReorder(false) {} reset()31 void reset() { 32 fLength=0; 33 fLastStarterIndex=-1; 34 fDidReorder=false; 35 } length()36 int32_t length() const { return fLength; } isEmpty()37 UBool isEmpty() const { return fLength==0; } lastStarterIndex()38 int32_t lastStarterIndex() const { return fLastStarterIndex; } charAt(int32_t i)39 UChar32 charAt(int32_t i) const { return fArray[i]>>8; } ccAt(int32_t i)40 uint8_t ccAt(int32_t i) const { return (uint8_t)fArray[i]; } didReorder()41 UBool didReorder() const { return fDidReorder; } 42 43 void append(UChar32 c, uint8_t cc); 44 void toString(UnicodeString &dest) const; 45 46 private: 47 int32_t fArray[Normalizer2Impl::MAPPING_LENGTH_MASK]; 48 int32_t fLength; 49 int32_t fLastStarterIndex; 50 UBool fDidReorder; 51 }; 52 53 struct CompositionPair { CompositionPairCompositionPair54 CompositionPair(UChar32 t, UChar32 c) : trail(t), composite(c) {} 55 UChar32 trail, composite; 56 }; 57 58 struct Norm { 59 enum MappingType { NONE, REMOVED, ROUND_TRIP, ONE_WAY }; 60 hasMappingNorm61 UBool hasMapping() const { return mappingType>REMOVED; } 62 63 // Requires hasMapping() and well-formed mapping. setMappingCPNorm64 void setMappingCP() { 65 UChar32 c; 66 if(!mapping->isEmpty() && mapping->length()==U16_LENGTH(c=mapping->char32At(0))) { 67 mappingCP=c; 68 } else { 69 mappingCP=U_SENTINEL; 70 } 71 } 72 getCompositionPairsNorm73 const CompositionPair *getCompositionPairs(int32_t &length) const { 74 if(compositions==nullptr) { 75 length=0; 76 return nullptr; 77 } else { 78 length=compositions->size()/2; 79 return reinterpret_cast<const CompositionPair *>(compositions->getBuffer()); 80 } 81 } 82 UChar32 combine(UChar32 trail) const; 83 84 UnicodeString *mapping; 85 UnicodeString *rawMapping; // non-nullptr if the mapping is further decomposed 86 UChar32 mappingCP; // >=0 if mapping to 1 code point 87 int32_t mappingPhase; 88 MappingType mappingType; 89 90 UVector32 *compositions; // (trail, composite) pairs 91 uint8_t cc, leadCC, trailCC; 92 UBool combinesBack; 93 UBool hasCompBoundaryBefore, hasCompBoundaryAfter; 94 95 /** 96 * Overall type of normalization properties. 97 * Set after most processing is done. 98 * 99 * Corresponds to the rows in the chart on 100 * https://icu.unicode.org/design/normalization/custom 101 * in numerical (but reverse visual) order. 102 * 103 * YES_NO means composition quick check=yes, decomposition QC=no -- etc. 104 */ 105 enum Type { 106 /** Initial value until most processing is done. */ 107 UNKNOWN, 108 /** No mapping, does not combine, ccc=0. */ 109 INERT, 110 /** Starter, no mapping, has compositions. */ 111 YES_YES_COMBINES_FWD, 112 /** Starter with a round-trip mapping and compositions. */ 113 YES_NO_COMBINES_FWD, 114 /** Starter with a round-trip mapping but no compositions. */ 115 YES_NO_MAPPING_ONLY, 116 /** Has a one-way mapping which is comp-normalized. */ 117 NO_NO_COMP_YES, 118 /** Has a one-way mapping which is not comp-normalized but has a comp boundary before. */ 119 NO_NO_COMP_BOUNDARY_BEFORE, 120 /** Has a one-way mapping which does not have a comp boundary before. */ 121 NO_NO_COMP_NO_MAYBE_CC, 122 /** Has a one-way mapping to the empty string. */ 123 NO_NO_EMPTY, 124 /** Has an algorithmic one-way mapping to a single code point. */ 125 NO_NO_DELTA, 126 /** 127 * Combines both backward and forward, has compositions. 128 * Allowed, but not normally used. 129 */ 130 MAYBE_YES_COMBINES_FWD, 131 /** Combines only backward. */ 132 MAYBE_YES_SIMPLE, 133 /** Non-zero ccc but does not combine backward. */ 134 YES_YES_WITH_CC 135 } type; 136 /** Offset into the type's part of the extra data, or the algorithmic-mapping delta. */ 137 int32_t offset; 138 139 /** 140 * Error string set by processing functions that do not have access 141 * to the code point, deferred for readable reporting. 142 */ 143 const char *error; 144 }; 145 146 class Norms { 147 public: 148 Norms(UErrorCode &errorCode); 149 ~Norms(); 150 length()151 int32_t length() const { return utm_countItems(normMem); } getNormRefByIndex(int32_t i)152 const Norm &getNormRefByIndex(int32_t i) const { return norms[i]; } getNormRefByIndex(int32_t i)153 Norm &getNormRefByIndex(int32_t i) { return norms[i]; } 154 155 Norm *allocNorm(); 156 /** Returns an existing Norm unit, or nullptr if c has no data. */ 157 Norm *getNorm(UChar32 c); 158 const Norm *getNorm(UChar32 c) const; 159 /** Returns a Norm unit, creating a new one if necessary. */ 160 Norm *createNorm(UChar32 c); 161 /** Returns an existing Norm unit, or an immutable empty object if c has no data. */ 162 const Norm &getNormRef(UChar32 c) const; getCC(UChar32 c)163 uint8_t getCC(UChar32 c) const { return getNormRef(c).cc; } combinesBack(UChar32 c)164 UBool combinesBack(UChar32 c) const { 165 return Hangul::isJamoV(c) || Hangul::isJamoT(c) || getNormRef(c).combinesBack; 166 } 167 168 void reorder(UnicodeString &mapping, BuilderReorderingBuffer &buffer) const; 169 170 // int32_t highCC not uint8_t so that we can pass in 256 as the upper limit. 171 UBool combinesWithCCBetween(const Norm &norm, uint8_t lowCC, int32_t highCC) const; 172 173 class Enumerator { 174 public: Enumerator(Norms & n)175 Enumerator(Norms &n) : norms(n) {} 176 virtual ~Enumerator(); 177 /** Called for enumerated value!=0. */ 178 virtual void rangeHandler(UChar32 start, UChar32 end, Norm &norm) = 0; 179 protected: 180 Norms &norms; 181 }; 182 183 void enumRanges(Enumerator &e); 184 185 UnicodeSet ccSet, mappingSet; 186 187 private: 188 Norms(const Norms &other) = delete; 189 Norms &operator=(const Norms &other) = delete; 190 191 UMutableCPTrie *normTrie; 192 UToolMemory *normMem; 193 Norm *norms; 194 }; 195 196 class CompositionBuilder : public Norms::Enumerator { 197 public: CompositionBuilder(Norms & n)198 CompositionBuilder(Norms &n) : Norms::Enumerator(n) {} 199 /** Adds a composition mapping for the first character in a round-trip mapping. */ 200 void rangeHandler(UChar32 start, UChar32 end, Norm &norm) U_OVERRIDE; 201 }; 202 203 class Decomposer : public Norms::Enumerator { 204 public: Decomposer(Norms & n)205 Decomposer(Norms &n) : Norms::Enumerator(n), didDecompose(false) {} 206 /** Decomposes each character of the current mapping. Sets didDecompose if any. */ 207 void rangeHandler(UChar32 start, UChar32 end, Norm &norm) U_OVERRIDE; 208 UBool didDecompose; 209 }; 210 211 U_NAMESPACE_END 212 213 #endif // #if !UCONFIG_NO_NORMALIZATION 214 215 #endif // __NORMS_H__ 216