1 // © 2017 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * Copyright (C) 2012-2015, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ******************************************************************************* 8 * collationbasedatabuilder.h 9 * 10 * created on: 2012aug11 11 * created by: Markus W. Scherer 12 */ 13 14 #ifndef __COLLATIONBASEDATABUILDER_H__ 15 #define __COLLATIONBASEDATABUILDER_H__ 16 17 #include "unicode/utypes.h" 18 19 #if !UCONFIG_NO_COLLATION 20 21 #include "unicode/uniset.h" 22 #include "unicode/unistr.h" 23 #include "unicode/uscript.h" 24 #include "collation.h" 25 #include "collationdata.h" 26 #include "collationdatabuilder.h" 27 #include "normalizer2impl.h" 28 #include "utrie2.h" 29 #include "uvectr32.h" 30 #include "uvectr64.h" 31 #include "uvector.h" 32 33 U_NAMESPACE_BEGIN 34 35 /** 36 * Low-level base CollationData builder. 37 */ 38 class U_I18N_API CollationBaseDataBuilder : public CollationDataBuilder { 39 public: 40 CollationBaseDataBuilder(UErrorCode &errorCode); 41 42 virtual ~CollationBaseDataBuilder(); 43 44 void init(UErrorCode &errorCode); 45 46 /** 47 * Sets the Han ranges as ranges of offset CE32s. 48 * Note: Unihan extension A sorts after the other BMP ranges. 49 * See http://www.unicode.org/reports/tr10/#Implicit_Weights 50 * 51 * @param ranges array of ranges of [:Unified_Ideograph:] in collation order, 52 * as (start, end) code point pairs 53 * @param length number of code points (not pairs) 54 * @param errorCode in/out error code 55 */ 56 void initHanRanges(const UChar32 ranges[], int32_t length, UErrorCode &errorCode); 57 setNumericPrimary(uint32_t np)58 void setNumericPrimary(uint32_t np) { numericPrimary = np; } 59 60 virtual UBool isCompressibleLeadByte(uint32_t b) const; 61 62 void setCompressibleLeadByte(uint32_t b); 63 64 static int32_t diffTwoBytePrimaries(uint32_t p1, uint32_t p2, UBool isCompressible); 65 static int32_t diffThreeBytePrimaries(uint32_t p1, uint32_t p2, UBool isCompressible); 66 67 virtual uint32_t encodeCEs(const int64_t ces[], int32_t cesLength, UErrorCode &errorCode); 68 69 void addRootElements(const int64_t ces[], int32_t cesLength, UErrorCode &errorCode); 70 void addRootElement(int64_t ce, UErrorCode &errorCode); 71 72 void addScriptStart(int32_t script, uint32_t p); 73 74 virtual void build(CollationData &data, UErrorCode &errorCode); 75 76 void buildRootElementsTable(UVector32 &table, UErrorCode &errorCode); 77 78 private: 79 int32_t writeRootElementsRange( 80 uint32_t prevPrimary, uint32_t p, int32_t i, 81 UVector32 &table, UErrorCode &errorCode); 82 83 // Flags for which primary-weight lead bytes are compressible. 84 UBool compressibleBytes[256]; 85 uint32_t numericPrimary; 86 uint32_t firstHanPrimary; 87 uint32_t lastHanPrimary; 88 int32_t hanStep; 89 UVector64 rootElements; 90 uint16_t scriptsIndex[USCRIPT_CODE_LIMIT + 16]; // need exactly this many 91 uint16_t scriptStarts[USCRIPT_CODE_LIMIT + 16]; // should be safely more than needed 92 int32_t scriptStartsLength; 93 }; 94 95 U_NAMESPACE_END 96 97 #endif // !UCONFIG_NO_COLLATION 98 #endif // __COLLATIONBASEDATABUILDER_H__ 99