• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 *******************************************************************************
3 * Copyright (C) 2013-2015, International Business Machines
4 * Corporation and others.  All Rights Reserved.
5 *******************************************************************************
6 * collationdatareader.h
7 *
8 * created on: 2013feb07
9 * created by: Markus W. Scherer
10 */
11 
12 #ifndef __COLLATIONDATAREADER_H__
13 #define __COLLATIONDATAREADER_H__
14 
15 #include "unicode/utypes.h"
16 
17 #if !UCONFIG_NO_COLLATION
18 
19 #include "unicode/udata.h"
20 
21 struct UDataMemory;
22 
23 U_NAMESPACE_BEGIN
24 
25 struct CollationTailoring;
26 
27 /**
28  * Collation binary data reader. All members are static; the enum names the entries of the int32_t indexes[] at the start of the data.
29  */
30 struct U_I18N_API CollationDataReader /* all static */ {
31     // The following constants are also copied into source/common/ucol_swp.cpp.
32     // Keep them in sync!
33     enum {
34         /**
35          * Number of int32_t indexes.
36          *
37          * Can be 2 if there are only options.
38          * Can be 7 or 8 if there are only options and a script reordering.
39          * The loader treats any index>=indexes[IX_INDEXES_LENGTH] as 0.
40          */
41         IX_INDEXES_LENGTH,  // 0
42         /**
43          * Bits 31..24: numericPrimary, for numeric collation
44          *      23..16: fast Latin format version (0 = no fast Latin table)
45          *      15.. 0: options bit set
46          */
47         IX_OPTIONS,  // 1
48         IX_RESERVED2,  // 2: unused/reserved, 0
49         IX_RESERVED3,  // 3: unused/reserved, 0
50 
51         /** Array offset to Jamo CE32s in ce32s[], or <0 if none. */
52         IX_JAMO_CE32S_START,  // 4
53 
54         // Byte offsets from the start of the data, after the generic header.
55         // The indexes[] are at byte offset 0, other data follows.
56         // Each data item is aligned properly.
57         // The data items should be in descending order of unit size,
58         // to minimize the need for padding.
59         // Each item's byte length is given by the difference between its offset and
60         // the next index/offset value.
61         /** Byte offset to int32_t reorderCodes[]. */
62         IX_REORDER_CODES_OFFSET,  // 5: first of the byte offsets
63         /**
64          * Byte offset to uint8_t reorderTable[].
65          * Empty table if <256 bytes (padding only).
66          * Otherwise 256 bytes or more (with padding).
67          */
68         IX_REORDER_TABLE_OFFSET,
69         /** Byte offset to the collation trie. Its length is a multiple of 8 bytes. */
70         IX_TRIE_OFFSET,
71 
72         IX_RESERVED8_OFFSET,  // 8 (reserved)
73         /** Byte offset to int64_t ces[]. */
74         IX_CES_OFFSET,
75         IX_RESERVED10_OFFSET,  // 10 (reserved)
76         /** Byte offset to uint32_t ce32s[]. */
77         IX_CE32S_OFFSET,
78 
79         /** Byte offset to uint32_t rootElements[]. */
80         IX_ROOT_ELEMENTS_OFFSET,  // 12
81         /** Byte offset to UChar *contexts[]. */
82         IX_CONTEXTS_OFFSET,
83         /** Byte offset to uint16_t [] with serialized unsafeBackwardSet. */
84         IX_UNSAFE_BWD_OFFSET,
85         /** Byte offset to uint16_t fastLatinTable[]. */
86         IX_FAST_LATIN_TABLE_OFFSET,
87 
88         /** Byte offset to uint16_t scripts[]. */
89         IX_SCRIPTS_OFFSET,  // 16
90         /**
91          * Byte offset to UBool compressibleBytes[].
92          * Empty table if <256 bytes (padding only).
93          * Otherwise 256 bytes or more (with padding).
94          */
95         IX_COMPRESSIBLE_BYTES_OFFSET,
96         IX_RESERVED18_OFFSET,  // 18 (reserved)
97         IX_TOTAL_SIZE  // 19: last byte offset = total size of the data, counted from the start of indexes[]
98     };
99     // NOTE(review): judging by the signature, read() parses inLength bytes at inBytes into tailoring, relative to base, and reports failure via errorCode -- confirm in the implementation.
100     static void read(const CollationTailoring *base, const uint8_t *inBytes, int32_t inLength,
101                      CollationTailoring &tailoring, UErrorCode &errorCode);
102     // NOTE(review): same parameter list as the UDataMemoryIsAcceptable callback type in unicode/udata.h; presumably passed when opening the data file -- confirm.
103     static UBool U_CALLCONV
104     isAcceptable(void *context, const char *type, const char *name, const UDataInfo *pInfo);
105 
106 private:
107     CollationDataReader();  // no constructor: private because this struct is all static
108 };
109 
110 /*
111  * Format of collation data (ucadata.icu, binary data in coll/ *.res files).
112  * Format version 5.
113  *
114  * The root collation data is stored in the ucadata.icu file.
115  * Tailorings are stored inside .res resource bundle files, with a complete file header.
116  *
117  * Collation data begins with a standard ICU data file header
118  * (DataHeader, see ucmndata.h and unicode/udata.h).
119  * The UDataInfo.dataVersion field contains the UCA and other version numbers,
120  * see the comments for CollationTailoring.version.
121  *
122  * After the header, the file contains the following parts.
123  * Constants are defined as enum values of the CollationDataReader class.
124  * See also the Collation class.
125  *
126  * int32_t indexes[indexesLength];
127  *      The indexes array has variable length.
128  *      Some tailorings only need the length and the options,
129  *      others only add reorderCodes and the reorderTable,
130  *      some need to store mappings.
131  *      Only as many indexes are stored as needed to read all of the data.
132  *
133  *      Index 0: indexesLength
134  *      Index 1: numericPrimary, CollationFastLatin::VERSION, and options: see IX_OPTIONS
135  *      Index 2..3: Unused/reserved/0.
136  *      Index 4: Index into the ce32s array where the CE32s of the conjoining Jamo
137  *               are stored in a short, contiguous part of the ce32s array.
138  *
139  *      Indexes 5..19 are byte offsets in ascending order.
140  *      Each byte offset marks the start of the next part in the data file,
141  *      and the end of the previous one.
142  *      When two consecutive byte offsets are the same (or too short),
143  *      then the corresponding part is empty.
144  *      Byte offsets are offsets from after the header,
145  *      that is, from the beginning of the indexes[].
146  *      Each part starts at an offset with proper alignment for its data.
147  *      If necessary, the previous part may include padding bytes to achieve this alignment.
148  *      The last byte offset that is stored in the indexes indicates the total size of the data
149  *      (starting with the indexes).
150  *
151  * int32_t reorderCodes[]; -- empty in root
152  *      The list of script and reordering codes.
153  *
154  *      Beginning with format version 5, this array may optionally
155  *      have trailing entries with a full list of reorder ranges
156  *      as described for CollationSettings::reorderRanges.
157  *
158  *      Script or reorder codes are first and do not exceed 16-bit values.
159  *      Range limits are stored in the upper 16 bits, and are never 0.
160  *      Split this array into reorder codes and ranges at the first entry
161  *      with non-zero upper 16 bits.
162  *
163  *      If the ranges are missing but needed for split-reordered primary lead bytes,
164  *      then they are regenerated at load time.
165  *
166  * uint8_t reorderTable[256]; -- empty in root; can be longer to include padding bytes
167  *      Primary-weight lead byte permutation table.
168  *      Normally present when the reorderCodes are, but can be built at load time.
169  *
170  *      Beginning with format version 5, a 0 entry at a non-zero index
171  *      (which is otherwise an illegal value)
172  *      means that the primary lead byte is "split"
173  *      (there are different offsets for primaries that share that lead byte)
174  *      and the reordering offset must be determined via the reorder ranges
175  *      that are either stored as part of the reorderCodes array
176  *      or regenerated at load time.
177  *
178  * UTrie2 trie; -- see utrie2_impl.h and utrie2.h
179  *      The trie holds the main collation data. Each code point is mapped to a 32-bit value.
180  *      It encodes a simple collation element (CE) in compact form, unless bits 7..6 are both set,
181  *      in which case it is a special CE32 and contains a 4-bit tag and further data.
182  *      See the Collation class for details.
183  *
184  *      The trie has a value for each lead surrogate code unit with some bits encoding
185  *      collective properties of the 1024 supplementary characters whose UTF-16 form starts with
186  *      the lead surrogate. See Collation::LEAD_SURROGATE_TAG.
187  *
188  * int64_t ces[];
189  *      64-bit CEs and expansions that cannot be stored in a more compact form.
190  *
191  * uint32_t ce32s[];
192  *      CE32s for expansions in compact form, and for characters whose trie values
193  *      contain special data.
194  *
195  * uint32_t rootElements[]; -- empty in all tailorings
196  *      Compact storage for all of the CEs that occur in the root collation.
197  *      See the CollationRootElements class.
198  *
199  * UChar *contexts[];
200  *      Serialized UCharsTrie structures with prefix (pre-context) and contraction mappings.
201  *
202  * uint16_t unsafeBackwardSet[]; -- see UnicodeSet::serialize()
203  *      Serialized form of characters that are unsafe when iterating backwards,
204  *      and at the end of an identical string prefix.
205  *      Back up to a safe character.
206  *      Lead surrogates are "unsafe" when any of their corresponding supplementary
207  *      code points are unsafe.
208  *      Does not include [:^lccc=0:][:^tccc=0:].
209  *      For each tailoring, the root unsafeBackwardSet is subtracted.
210  *      (As a result, in many tailorings no set needs to be stored.)
211  *
212  * uint16_t fastLatinTable[];
213  *      Optional optimization for Latin text.
214  *      See the CollationFastLatin class.
215  *
216  * uint16_t scripts[]; -- empty in all tailorings
217  *      Format version 5:
218  *      uint16_t numScripts;
219  *      uint16_t scriptsIndex[numScripts+16];
220  *      uint16_t scriptStarts[];
221  *      See CollationData::numScripts etc.
222  *
223  *      Format version 4:
224  *      Table of the reordering groups with their first and last lead bytes,
225  *      and their script and reordering codes.
226  *      See CollationData::scripts.
227  *
228  * UBool compressibleBytes[]; -- empty in all tailorings
229  *      Flag for getSortKey(), indicating primary weight lead bytes that are compressible.
230  *
231  * -----------------
232  * Changes for formatVersion 5 (ICU 55)
233  *
234  * Reordering moves single scripts, not groups of scripts.
235  * Reorder ranges are optionally appended to the reorderCodes,
236  * and a 0 entry in the reorderTable indicates a split lead byte.
237  * The scripts data has a new format.
238  *
239  * The rootElements may contain secondary and tertiary weights below common=05.
240  * (Used for small Hiragana letters.)
241  * Where this occurs, there is also an explicit unit with common secondary & tertiary weights.
242  * There are no other data structure changes, but builder code needs to be able to handle such data.
243  *
244  * The collation element for the merge separator code point U+FFFE
245  * does not necessarily have special, unique secondary/tertiary weights any more.
246  */
247 
248 U_NAMESPACE_END
249 
250 #endif  // !UCONFIG_NO_COLLATION
251 #endif  // __COLLATIONDATAREADER_H__
252