1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * Copyright (C) 2012-2014, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ******************************************************************************* 8 * collationdatabuilder.h 9 * 10 * created on: 2012apr01 11 * created by: Markus W. Scherer 12 */ 13 14 #ifndef __COLLATIONDATABUILDER_H__ 15 #define __COLLATIONDATABUILDER_H__ 16 17 #include "unicode/utypes.h" 18 19 #if !UCONFIG_NO_COLLATION 20 21 #include "unicode/uniset.h" 22 #include "unicode/unistr.h" 23 #include "unicode/uversion.h" 24 #include "collation.h" 25 #include "collationdata.h" 26 #include "collationsettings.h" 27 #include "normalizer2impl.h" 28 #include "utrie2.h" 29 #include "uvectr32.h" 30 #include "uvectr64.h" 31 #include "uvector.h" 32 33 U_NAMESPACE_BEGIN 34 35 struct ConditionalCE32; 36 37 class CollationFastLatinBuilder; 38 class CopyHelper; 39 class DataBuilderCollationIterator; 40 class UCharsTrieBuilder; 41 42 /** 43 * Low-level CollationData builder. 44 * Takes (character, CE) pairs and builds them into runtime data structures. 45 * Supports characters with context prefixes and contraction suffixes. 46 */ 47 class U_I18N_API CollationDataBuilder : public UObject { 48 public: 49 /** 50 * Collation element modifier. Interface class for a modifier 51 * that changes a tailoring builder's temporary CEs to final CEs. 52 * Called for every non-special CE32 and every expansion CE. 53 */ 54 class CEModifier : public UObject { 55 public: 56 virtual ~CEModifier(); 57 /** Returns a new CE to replace the non-special input CE32, or else Collation::NO_CE. */ 58 virtual int64_t modifyCE32(uint32_t ce32) const = 0; 59 /** Returns a new CE to replace the input CE, or else Collation::NO_CE. */ 60 virtual int64_t modifyCE(int64_t ce) const = 0; 61 }; 62 63 CollationDataBuilder(UBool icu4xMode, UErrorCode &errorCode); 64 65 virtual ~CollationDataBuilder(); 66 67 void initForTailoring(const CollationData *b, UErrorCode &errorCode); 68 69 virtual UBool isCompressibleLeadByte(uint32_t b) const; 70 isCompressiblePrimary(uint32_t p)71 inline UBool isCompressiblePrimary(uint32_t p) const { 72 return isCompressibleLeadByte(p >> 24); 73 } 74 75 /** 76 * @return true if this builder has mappings (e.g., add() has been called) 77 */ hasMappings()78 UBool hasMappings() const { return modified; } 79 80 /** 81 * @return true if c has CEs in this builder 82 */ 83 UBool isAssigned(UChar32 c) const; 84 85 /** 86 * @return the three-byte primary if c maps to a single such CE and has no context data, 87 * otherwise returns 0. 88 */ 89 uint32_t getLongPrimaryIfSingleCE(UChar32 c) const; 90 91 /** 92 * @return the single CE for c. 93 * Sets an error code if c does not have a single CE. 94 */ 95 int64_t getSingleCE(UChar32 c, UErrorCode &errorCode) const; 96 97 void add(const UnicodeString &prefix, const UnicodeString &s, 98 const int64_t ces[], int32_t cesLength, 99 UErrorCode &errorCode); 100 101 /** 102 * Encodes the ces as either the returned ce32 by itself, 103 * or by storing an expansion, with the returned ce32 referring to that. 104 * 105 * add(p, s, ces, cesLength) = addCE32(p, s, encodeCEs(ces, cesLength)) 106 */ 107 virtual uint32_t encodeCEs(const int64_t ces[], int32_t cesLength, UErrorCode &errorCode); 108 void addCE32(const UnicodeString &prefix, const UnicodeString &s, 109 uint32_t ce32, UErrorCode &errorCode); 110 111 /** 112 * Sets three-byte-primary CEs for a range of code points in code point order, 113 * if it is worth doing; otherwise no change is made. 114 * None of the code points in the range should have complex mappings so far 115 * (expansions/contractions/prefixes). 116 * @param start first code point 117 * @param end last code point (inclusive) 118 * @param primary primary weight for 'start' 119 * @param step per-code point primary-weight increment 120 * @param errorCode ICU in/out error code 121 * @return true if an OFFSET_TAG range was used for start..end 122 */ 123 UBool maybeSetPrimaryRange(UChar32 start, UChar32 end, 124 uint32_t primary, int32_t step, 125 UErrorCode &errorCode); 126 127 /** 128 * Sets three-byte-primary CEs for a range of code points in code point order. 129 * Sets range values if that is worth doing, or else individual values. 130 * None of the code points in the range should have complex mappings so far 131 * (expansions/contractions/prefixes). 132 * @param start first code point 133 * @param end last code point (inclusive) 134 * @param primary primary weight for 'start' 135 * @param step per-code point primary-weight increment 136 * @param errorCode ICU in/out error code 137 * @return the next primary after 'end': start primary incremented by ((end-start)+1)*step 138 */ 139 uint32_t setPrimaryRangeAndReturnNext(UChar32 start, UChar32 end, 140 uint32_t primary, int32_t step, 141 UErrorCode &errorCode); 142 143 /** 144 * Copies all mappings from the src builder, with modifications. 145 * This builder here must not be built yet, and should be empty. 146 */ 147 void copyFrom(const CollationDataBuilder &src, const CEModifier &modifier, 148 UErrorCode &errorCode); 149 150 void optimize(const UnicodeSet &set, UErrorCode &errorCode); 151 void suppressContractions(const UnicodeSet &set, UErrorCode &errorCode); 152 enableFastLatin()153 void enableFastLatin() { fastLatinEnabled = true; } 154 virtual void build(CollationData &data, UErrorCode &errorCode); 155 156 /** 157 * Looks up CEs for s and appends them to the ces array. 158 * Does not handle normalization: s should be in FCD form. 159 * 160 * Does not write completely ignorable CEs. 161 * Does not write beyond Collation::MAX_EXPANSION_LENGTH. 162 * 163 * @return incremented cesLength 164 */ 165 int32_t getCEs(const UnicodeString &s, int64_t ces[], int32_t cesLength); 166 int32_t getCEs(const UnicodeString &prefix, const UnicodeString &s, 167 int64_t ces[], int32_t cesLength); 168 169 protected: 170 friend class CopyHelper; 171 friend class DataBuilderCollationIterator; 172 173 uint32_t getCE32FromOffsetCE32(UBool fromBase, UChar32 c, uint32_t ce32) const; 174 175 int32_t addCE(int64_t ce, UErrorCode &errorCode); 176 int32_t addCE32(uint32_t ce32, UErrorCode &errorCode); 177 int32_t addConditionalCE32(const UnicodeString &context, uint32_t ce32, UErrorCode &errorCode); 178 getConditionalCE32(int32_t index)179 inline ConditionalCE32 *getConditionalCE32(int32_t index) const { 180 return static_cast<ConditionalCE32 *>(conditionalCE32s[index]); 181 } getConditionalCE32ForCE32(uint32_t ce32)182 inline ConditionalCE32 *getConditionalCE32ForCE32(uint32_t ce32) const { 183 return getConditionalCE32(Collation::indexFromCE32(ce32)); 184 } 185 makeBuilderContextCE32(int32_t index)186 static uint32_t makeBuilderContextCE32(int32_t index) { 187 return Collation::makeCE32FromTagAndIndex(Collation::BUILDER_DATA_TAG, index); 188 } isBuilderContextCE32(uint32_t ce32)189 static inline UBool isBuilderContextCE32(uint32_t ce32) { 190 return Collation::hasCE32Tag(ce32, Collation::BUILDER_DATA_TAG); 191 } 192 193 static uint32_t encodeOneCEAsCE32(int64_t ce); 194 uint32_t encodeOneCE(int64_t ce, UErrorCode &errorCode); 195 uint32_t encodeExpansion(const int64_t ces[], int32_t length, UErrorCode &errorCode); 196 uint32_t encodeExpansion32(const int32_t newCE32s[], int32_t length, UErrorCode &errorCode); 197 198 uint32_t copyFromBaseCE32(UChar32 c, uint32_t ce32, UBool withContext, UErrorCode &errorCode); 199 /** 200 * Copies base contractions to a list of ConditionalCE32. 201 * Sets cond->next to the index of the first new item 202 * and returns the index of the last new item. 203 */ 204 int32_t copyContractionsFromBaseCE32(UnicodeString &context, UChar32 c, uint32_t ce32, 205 ConditionalCE32 *cond, UErrorCode &errorCode); 206 207 UBool getJamoCE32s(uint32_t jamoCE32s[], UErrorCode &errorCode); 208 void setDigitTags(UErrorCode &errorCode); 209 void setLeadSurrogates(UErrorCode &errorCode); 210 211 void buildMappings(CollationData &data, UErrorCode &errorCode); 212 213 void clearContexts(); 214 void buildContexts(UErrorCode &errorCode); 215 uint32_t buildContext(ConditionalCE32 *head, UErrorCode &errorCode); 216 int32_t addContextTrie(uint32_t defaultCE32, UCharsTrieBuilder &trieBuilder, 217 UErrorCode &errorCode); 218 219 void buildFastLatinTable(CollationData &data, UErrorCode &errorCode); 220 221 int32_t getCEs(const UnicodeString &s, int32_t start, int64_t ces[], int32_t cesLength); 222 jamoCpFromIndex(int32_t i)223 static UChar32 jamoCpFromIndex(int32_t i) { 224 // 0 <= i < CollationData::JAMO_CE32S_LENGTH = 19 + 21 + 27 225 if(i < Hangul::JAMO_L_COUNT) { return Hangul::JAMO_L_BASE + i; } 226 i -= Hangul::JAMO_L_COUNT; 227 if(i < Hangul::JAMO_V_COUNT) { return Hangul::JAMO_V_BASE + i; } 228 i -= Hangul::JAMO_V_COUNT; 229 // i < 27 230 return Hangul::JAMO_T_BASE + 1 + i; 231 } 232 233 /** @see Collation::BUILDER_DATA_TAG */ 234 static const uint32_t IS_BUILDER_JAMO_CE32 = 0x100; 235 236 const Normalizer2Impl &nfcImpl; 237 const CollationData *base; 238 const CollationSettings *baseSettings; 239 UTrie2 *trie; 240 UVector32 ce32s; 241 UVector64 ce64s; 242 UVector conditionalCE32s; // vector of ConditionalCE32 243 // Characters that have context (prefixes or contraction suffixes). 244 UnicodeSet contextChars; 245 // Serialized UCharsTrie structures for finalized contexts. 246 UnicodeString contexts; 247 private: 248 /** 249 * The "era" of building intermediate contexts. 250 * When the array of cached, temporary contexts overflows, then clearContexts() 251 * removes them all and invalidates the builtCE32 that used to point to built tries. 252 * See ConditionalCE32::era. 253 */ 254 int32_t contextsEra = 0; 255 protected: 256 UnicodeSet unsafeBackwardSet; 257 UBool modified; 258 UBool icu4xMode; 259 260 UBool fastLatinEnabled; 261 CollationFastLatinBuilder *fastLatinBuilder; 262 263 DataBuilderCollationIterator *collIter; 264 }; 265 266 U_NAMESPACE_END 267 268 #endif // !UCONFIG_NO_COLLATION 269 #endif // __COLLATIONDATABUILDER_H__ 270