• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
// © 2017 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
* Copyright (C) 2012-2015, International Business Machines
* Corporation and others.  All Rights Reserved.
*******************************************************************************
* collationbasedatabuilder.h
*
* created on: 2012aug11
* created by: Markus W. Scherer
*/
13 
14 #ifndef __COLLATIONBASEDATABUILDER_H__
15 #define __COLLATIONBASEDATABUILDER_H__
16 
17 #include "unicode/utypes.h"
18 
19 #if !UCONFIG_NO_COLLATION
20 
21 #include "unicode/uniset.h"
22 #include "unicode/unistr.h"
23 #include "unicode/uscript.h"
24 #include "collation.h"
25 #include "collationdata.h"
26 #include "collationdatabuilder.h"
27 #include "normalizer2impl.h"
28 #include "utrie2.h"
29 #include "uvectr32.h"
30 #include "uvectr64.h"
31 #include "uvector.h"
32 
33 U_NAMESPACE_BEGIN
34 
35 /**
36  * Low-level base CollationData builder.
37  */
38 class U_I18N_API CollationBaseDataBuilder : public CollationDataBuilder {
39 public:
40     CollationBaseDataBuilder(UErrorCode &errorCode);
41 
42     virtual ~CollationBaseDataBuilder();
43 
44     void init(UErrorCode &errorCode);
45 
46     /**
47      * Sets the Han ranges as ranges of offset CE32s.
48      * Note: Unihan extension A sorts after the other BMP ranges.
49      * See http://www.unicode.org/reports/tr10/#Implicit_Weights
50      *
51      * @param ranges array of ranges of [:Unified_Ideograph:] in collation order,
52      *               as (start, end) code point pairs
53      * @param length number of code points (not pairs)
54      * @param errorCode in/out error code
55      */
56     void initHanRanges(const UChar32 ranges[], int32_t length, UErrorCode &errorCode);
57 
setNumericPrimary(uint32_t np)58     void setNumericPrimary(uint32_t np) { numericPrimary = np; }
59 
60     virtual UBool isCompressibleLeadByte(uint32_t b) const;
61 
62     void setCompressibleLeadByte(uint32_t b);
63 
64     static int32_t diffTwoBytePrimaries(uint32_t p1, uint32_t p2, UBool isCompressible);
65     static int32_t diffThreeBytePrimaries(uint32_t p1, uint32_t p2, UBool isCompressible);
66 
67     virtual uint32_t encodeCEs(const int64_t ces[], int32_t cesLength, UErrorCode &errorCode);
68 
69     void addRootElements(const int64_t ces[], int32_t cesLength, UErrorCode &errorCode);
70     void addRootElement(int64_t ce, UErrorCode &errorCode);
71 
72     void addScriptStart(int32_t script, uint32_t p);
73 
74     virtual void build(CollationData &data, UErrorCode &errorCode);
75 
76     void buildRootElementsTable(UVector32 &table, UErrorCode &errorCode);
77 
78 private:
79     int32_t writeRootElementsRange(
80             uint32_t prevPrimary, uint32_t p, int32_t i,
81             UVector32 &table, UErrorCode &errorCode);
82 
83     // Flags for which primary-weight lead bytes are compressible.
84     UBool compressibleBytes[256];
85     uint32_t numericPrimary;
86     uint32_t firstHanPrimary;
87     uint32_t lastHanPrimary;
88     int32_t hanStep;
89     UVector64 rootElements;
90     uint16_t scriptsIndex[USCRIPT_CODE_LIMIT + 16];  // need exactly this many
91     uint16_t scriptStarts[USCRIPT_CODE_LIMIT + 16];  // should be safely more than needed
92     int32_t scriptStartsLength;
93 };
94 
95 U_NAMESPACE_END
96 
97 #endif  // !UCONFIG_NO_COLLATION
98 #endif  // __COLLATIONBASEDATABUILDER_H__
99