/* GENERATED SOURCE. DO NOT MODIFY. */
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html#License
/*
*******************************************************************************
* Copyright (C) 2013-2015, International Business Machines
* Corporation and others.  All Rights Reserved.
*******************************************************************************
* CollationDataReader.java, ported from collationdatareader.h/.cpp
*
* C++ version created on: 2013feb07
* created by: Markus W. Scherer
*/
14 
15 package ohos.global.icu.impl.coll;
16 
17 import java.io.IOException;
18 import java.nio.ByteBuffer;
19 import java.nio.CharBuffer;
20 import java.util.Arrays;
21 
22 import ohos.global.icu.impl.ICUBinary;
23 import ohos.global.icu.impl.Trie2_32;
24 import ohos.global.icu.impl.USerializedSet;
25 import ohos.global.icu.text.Collator;
26 import ohos.global.icu.text.UnicodeSet;
27 import ohos.global.icu.util.ICUException;
28 
29 /**
30  * Collation binary data reader.
31  */
32 final class CollationDataReader /* all static */ {
33     // The following constants are also copied into source/common/ucol_swp.cpp.
34     // Keep them in sync!
35     /**
36      * Number of int indexes.
37      *
38      * Can be 2 if there are only options.
39      * Can be 7 or 8 if there are only options and a script reordering.
40      * The loader treats any index>=indexes[IX_INDEXES_LENGTH] as 0.
41      */
42     static final int IX_INDEXES_LENGTH = 0;
43     /**
44      * Bits 31..24: numericPrimary, for numeric collation
45      *      23..16: fast Latin format version (0 = no fast Latin table)
46      *      15.. 0: options bit set
47      */
48     static final int IX_OPTIONS = 1;
49     static final int IX_RESERVED2 = 2;
50     static final int IX_RESERVED3 = 3;
51 
52     /** Array offset to Jamo CE32s in ce32s[], or <0 if none. */
53     static final int IX_JAMO_CE32S_START = 4;
54 
55     // Byte offsets from the start of the data, after the generic header.
56     // The indexes[] are at byte offset 0, other data follows.
57     // Each data item is aligned properly.
58     // The data items should be in descending order of unit size,
59     // to minimize the need for padding.
60     // Each item's byte length is given by the difference between its offset and
61     // the next index/offset value.
62     /** Byte offset to int reorderCodes[]. */
63     static final int IX_REORDER_CODES_OFFSET = 5;
64     /**
65      * Byte offset to uint8_t reorderTable[].
66      * Empty table if <256 bytes (padding only).
67      * Otherwise 256 bytes or more (with padding).
68      */
69     static final int IX_REORDER_TABLE_OFFSET = 6;
70     /** Byte offset to the collation trie. Its length is a multiple of 8 bytes. */
71     static final int IX_TRIE_OFFSET = 7;
72 
73     static final int IX_RESERVED8_OFFSET = 8;
74     /** Byte offset to long ces[]. */
75     static final int IX_CES_OFFSET = 9;
76     static final int IX_RESERVED10_OFFSET = 10;
77     /** Byte offset to int ce32s[]. */
78     static final int IX_CE32S_OFFSET = 11;
79 
80     /** Byte offset to uint32_t rootElements[]. */
81     static final int IX_ROOT_ELEMENTS_OFFSET = 12;
82     /** Byte offset to UChar *contexts[]. */
83     static final int IX_CONTEXTS_OFFSET = 13;
84     /** Byte offset to char [] with serialized unsafeBackwardSet. */
85     static final int IX_UNSAFE_BWD_OFFSET = 14;
86     /** Byte offset to char fastLatinTable[]. */
87     static final int IX_FAST_LATIN_TABLE_OFFSET = 15;
88 
89     /** Byte offset to char scripts[]. */
90     static final int IX_SCRIPTS_OFFSET = 16;
91     /**
92      * Byte offset to boolean compressibleBytes[].
93      * Empty table if <256 bytes (padding only).
94      * Otherwise 256 bytes or more (with padding).
95      */
96     static final int IX_COMPRESSIBLE_BYTES_OFFSET = 17;
97     static final int IX_RESERVED18_OFFSET = 18;
98     static final int IX_TOTAL_SIZE = 19;
99 
read(CollationTailoring base, ByteBuffer inBytes, CollationTailoring tailoring)100     static void read(CollationTailoring base, ByteBuffer inBytes,
101                      CollationTailoring tailoring) throws IOException {
102         tailoring.version = ICUBinary.readHeader(inBytes, DATA_FORMAT, IS_ACCEPTABLE);
103         if(base != null && base.getUCAVersion() != tailoring.getUCAVersion()) {
104             throw new ICUException("Tailoring UCA version differs from base data UCA version");
105         }
106 
107         int inLength = inBytes.remaining();
108         if(inLength < 8) {
109             throw new ICUException("not enough bytes");
110         }
111         int indexesLength = inBytes.getInt();  // inIndexes[IX_INDEXES_LENGTH]
112         if(indexesLength < 2 || inLength < indexesLength * 4) {
113             throw new ICUException("not enough indexes");
114         }
115         int[] inIndexes = new int[IX_TOTAL_SIZE + 1];
116         inIndexes[0] = indexesLength;
117         for(int i = 1; i < indexesLength && i < inIndexes.length; ++i) {
118             inIndexes[i] = inBytes.getInt();
119         }
120         for(int i = indexesLength; i < inIndexes.length; ++i) {
121             inIndexes[i] = -1;
122         }
123         if(indexesLength > inIndexes.length) {
124             ICUBinary.skipBytes(inBytes, (indexesLength - inIndexes.length) * 4);
125         }
126 
127         // Assume that the tailoring data is in initial state,
128         // with null pointers and 0 lengths.
129 
130         // Set pointers to non-empty data parts.
131         // Do this in order of their byte offsets. (Should help porting to Java.)
132 
133         int index;  // one of the indexes[] slots
134         int offset;  // byte offset for the index part
135         int length;  // number of bytes in the index part
136 
137         if(indexesLength > IX_TOTAL_SIZE) {
138             length = inIndexes[IX_TOTAL_SIZE];
139         } else if(indexesLength > IX_REORDER_CODES_OFFSET) {
140             length = inIndexes[indexesLength - 1];
141         } else {
142             length = 0;  // only indexes, and inLength was already checked for them
143         }
144         if(inLength < length) {
145             throw new ICUException("not enough bytes");
146         }
147 
148         CollationData baseData = base == null ? null : base.data;
149         int[] reorderCodes;
150         int reorderCodesLength;
151         index = IX_REORDER_CODES_OFFSET;
152         offset = inIndexes[index];
153         length = inIndexes[index + 1] - offset;
154         if(length >= 4) {
155             if(baseData == null) {
156                 // We assume for collation settings that
157                 // the base data does not have a reordering.
158                 throw new ICUException("Collation base data must not reorder scripts");
159             }
160             reorderCodesLength = length / 4;
161             reorderCodes = ICUBinary.getInts(inBytes, reorderCodesLength, length & 3);
162 
163             // The reorderRanges (if any) are the trailing reorderCodes entries.
164             // Split the array at the boundary.
165             // Script or reorder codes do not exceed 16-bit values.
166             // Range limits are stored in the upper 16 bits, and are never 0.
167             int reorderRangesLength = 0;
168             while(reorderRangesLength < reorderCodesLength &&
169                     (reorderCodes[reorderCodesLength - reorderRangesLength - 1] & 0xffff0000) != 0) {
170                 ++reorderRangesLength;
171             }
172             assert(reorderRangesLength < reorderCodesLength);
173             reorderCodesLength -= reorderRangesLength;
174         } else {
175             reorderCodes = new int[0];
176             reorderCodesLength = 0;
177             ICUBinary.skipBytes(inBytes, length);
178         }
179 
180         // There should be a reorder table only if there are reorder codes.
181         // However, when there are reorder codes the reorder table may be omitted to reduce
182         // the data size.
183         byte[] reorderTable = null;
184         index = IX_REORDER_TABLE_OFFSET;
185         offset = inIndexes[index];
186         length = inIndexes[index + 1] - offset;
187         if(length >= 256) {
188             if(reorderCodesLength == 0) {
189                 throw new ICUException("Reordering table without reordering codes");
190             }
191             reorderTable = new byte[256];
192             inBytes.get(reorderTable);
193             length -= 256;
194         } else {
195             // If we have reorder codes, then build the reorderTable at the end,
196             // when the CollationData is otherwise complete.
197         }
198         ICUBinary.skipBytes(inBytes, length);
199 
200         if(baseData != null && baseData.numericPrimary != (inIndexes[IX_OPTIONS] & 0xff000000L)) {
201             throw new ICUException("Tailoring numeric primary weight differs from base data");
202         }
203         CollationData data = null;  // Remains null if there are no mappings.
204 
205         index = IX_TRIE_OFFSET;
206         offset = inIndexes[index];
207         length = inIndexes[index + 1] - offset;
208         if(length >= 8) {
209             tailoring.ensureOwnedData();
210             data = tailoring.ownedData;
211             data.base = baseData;
212             data.numericPrimary = inIndexes[IX_OPTIONS] & 0xff000000L;
213             data.trie = tailoring.trie = Trie2_32.createFromSerialized(inBytes);
214             int trieLength = data.trie.getSerializedLength();
215             if(trieLength > length) {
216                 throw new ICUException("Not enough bytes for the mappings trie");  // No mappings.
217             }
218             length -= trieLength;
219         } else if(baseData != null) {
220             // Use the base data. Only the settings are tailored.
221             tailoring.data = baseData;
222         } else {
223             throw new ICUException("Missing collation data mappings");  // No mappings.
224         }
225         ICUBinary.skipBytes(inBytes, length);
226 
227         index = IX_RESERVED8_OFFSET;
228         offset = inIndexes[index];
229         length = inIndexes[index + 1] - offset;
230         ICUBinary.skipBytes(inBytes, length);
231 
232         index = IX_CES_OFFSET;
233         offset = inIndexes[index];
234         length = inIndexes[index + 1] - offset;
235         if(length >= 8) {
236             if(data == null) {
237                 throw new ICUException("Tailored ces without tailored trie");
238             }
239             data.ces = ICUBinary.getLongs(inBytes, length / 8, length & 7);
240         } else {
241             ICUBinary.skipBytes(inBytes, length);
242         }
243 
244         index = IX_RESERVED10_OFFSET;
245         offset = inIndexes[index];
246         length = inIndexes[index + 1] - offset;
247         ICUBinary.skipBytes(inBytes, length);
248 
249         index = IX_CE32S_OFFSET;
250         offset = inIndexes[index];
251         length = inIndexes[index + 1] - offset;
252         if(length >= 4) {
253             if(data == null) {
254                 throw new ICUException("Tailored ce32s without tailored trie");
255             }
256             data.ce32s = ICUBinary.getInts(inBytes, length / 4, length & 3);
257         } else {
258             ICUBinary.skipBytes(inBytes, length);
259         }
260 
261         int jamoCE32sStart = inIndexes[IX_JAMO_CE32S_START];
262         if(jamoCE32sStart >= 0) {
263             if(data == null || data.ce32s == null) {
264                 throw new ICUException("JamoCE32sStart index into non-existent ce32s[]");
265             }
266             data.jamoCE32s = new int[CollationData.JAMO_CE32S_LENGTH];
267             System.arraycopy(data.ce32s, jamoCE32sStart, data.jamoCE32s, 0, CollationData.JAMO_CE32S_LENGTH);
268         } else if(data == null) {
269             // Nothing to do.
270         } else if(baseData != null) {
271             data.jamoCE32s = baseData.jamoCE32s;
272         } else {
273             throw new ICUException("Missing Jamo CE32s for Hangul processing");
274         }
275 
276         index = IX_ROOT_ELEMENTS_OFFSET;
277         offset = inIndexes[index];
278         length = inIndexes[index + 1] - offset;
279         if(length >= 4) {
280             int rootElementsLength = length / 4;
281             if(data == null) {
282                 throw new ICUException("Root elements but no mappings");
283             }
284             if(rootElementsLength <= CollationRootElements.IX_SEC_TER_BOUNDARIES) {
285                 throw new ICUException("Root elements array too short");
286             }
287             data.rootElements = new long[rootElementsLength];
288             for(int i = 0; i < rootElementsLength; ++i) {
289                 data.rootElements[i] = inBytes.getInt() & 0xffffffffL;  // unsigned int -> long
290             }
291             long commonSecTer = data.rootElements[CollationRootElements.IX_COMMON_SEC_AND_TER_CE];
292             if(commonSecTer != Collation.COMMON_SEC_AND_TER_CE) {
293                 throw new ICUException("Common sec/ter weights in base data differ from the hardcoded value");
294             }
295             long secTerBoundaries = data.rootElements[CollationRootElements.IX_SEC_TER_BOUNDARIES];
296             if((secTerBoundaries >>> 24) < CollationKeys.SEC_COMMON_HIGH) {
297                 // [fixed last secondary common byte] is too low,
298                 // and secondary weights would collide with compressed common secondaries.
299                 throw new ICUException("[fixed last secondary common byte] is too low");
300             }
301             length &= 3;
302         }
303         ICUBinary.skipBytes(inBytes, length);
304 
305         index = IX_CONTEXTS_OFFSET;
306         offset = inIndexes[index];
307         length = inIndexes[index + 1] - offset;
308         if(length >= 2) {
309             if(data == null) {
310                 throw new ICUException("Tailored contexts without tailored trie");
311             }
312             data.contexts = ICUBinary.getString(inBytes, length / 2, length & 1);
313         } else {
314             ICUBinary.skipBytes(inBytes, length);
315         }
316 
317         index = IX_UNSAFE_BWD_OFFSET;
318         offset = inIndexes[index];
319         length = inIndexes[index + 1] - offset;
320         if(length >= 2) {
321             if(data == null) {
322                 throw new ICUException("Unsafe-backward-set but no mappings");
323             }
324             if(baseData == null) {
325                 // Create the unsafe-backward set for the root collator.
326                 // Include all non-zero combining marks and trail surrogates.
327                 // We do this at load time, rather than at build time,
328                 // to simplify Unicode version bootstrapping:
329                 // The root data builder only needs the new FractionalUCA.txt data,
330                 // but it need not be built with a version of ICU already updated to
331                 // the corresponding new Unicode Character Database.
332                 //
333                 // The following is an optimized version of
334                 // new UnicodeSet("[[:^lccc=0:][\\udc00-\\udfff]]").
335                 // It is faster and requires fewer code dependencies.
336                 tailoring.unsafeBackwardSet = new UnicodeSet(0xdc00, 0xdfff);  // trail surrogates
337                 data.nfcImpl.addLcccChars(tailoring.unsafeBackwardSet);
338             } else {
339                 // Clone the root collator's set contents.
340                 tailoring.unsafeBackwardSet = baseData.unsafeBackwardSet.cloneAsThawed();
341             }
342             // Add the ranges from the data file to the unsafe-backward set.
343             USerializedSet sset = new USerializedSet();
344             char[] unsafeData = ICUBinary.getChars(inBytes, length / 2, length & 1);
345             length = 0;
346             sset.getSet(unsafeData, 0);
347             int count = sset.countRanges();
348             int[] range = new int[2];
349             for(int i = 0; i < count; ++i) {
350                 sset.getRange(i, range);
351                 tailoring.unsafeBackwardSet.add(range[0], range[1]);
352             }
353             // Mark each lead surrogate as "unsafe"
354             // if any of its 1024 associated supplementary code points is "unsafe".
355             int c = 0x10000;
356             for(int lead = 0xd800; lead < 0xdc00; ++lead, c += 0x400) {
357                 if(!tailoring.unsafeBackwardSet.containsNone(c, c + 0x3ff)) {
358                     tailoring.unsafeBackwardSet.add(lead);
359                 }
360             }
361             tailoring.unsafeBackwardSet.freeze();
362             data.unsafeBackwardSet = tailoring.unsafeBackwardSet;
363         } else if(data == null) {
364             // Nothing to do.
365         } else if(baseData != null) {
366             // No tailoring-specific data: Alias the root collator's set.
367             data.unsafeBackwardSet = baseData.unsafeBackwardSet;
368         } else {
369             throw new ICUException("Missing unsafe-backward-set");
370         }
371         ICUBinary.skipBytes(inBytes, length);
372 
373         // If the fast Latin format version is different,
374         // or the version is set to 0 for "no fast Latin table",
375         // then just always use the normal string comparison path.
376         index = IX_FAST_LATIN_TABLE_OFFSET;
377         offset = inIndexes[index];
378         length = inIndexes[index + 1] - offset;
379         if(data != null) {
380             data.fastLatinTable = null;
381             data.fastLatinTableHeader = null;
382             if(((inIndexes[IX_OPTIONS] >> 16) & 0xff) == CollationFastLatin.VERSION) {
383                 if(length >= 2) {
384                     char header0 = inBytes.getChar();
385                     int headerLength = header0 & 0xff;
386                     data.fastLatinTableHeader = new char[headerLength];
387                     data.fastLatinTableHeader[0] = header0;
388                     for(int i = 1; i < headerLength; ++i) {
389                         data.fastLatinTableHeader[i] = inBytes.getChar();
390                     }
391                     int tableLength = length / 2 - headerLength;
392                     data.fastLatinTable = ICUBinary.getChars(inBytes, tableLength, length & 1);
393                     length = 0;
394                     if((header0 >> 8) != CollationFastLatin.VERSION) {
395                         throw new ICUException("Fast-Latin table version differs from version in data header");
396                     }
397                 } else if(baseData != null) {
398                     data.fastLatinTable = baseData.fastLatinTable;
399                     data.fastLatinTableHeader = baseData.fastLatinTableHeader;
400                 }
401             }
402         }
403         ICUBinary.skipBytes(inBytes, length);
404 
405         index = IX_SCRIPTS_OFFSET;
406         offset = inIndexes[index];
407         length = inIndexes[index + 1] - offset;
408         if(length >= 2) {
409             if(data == null) {
410                 throw new ICUException("Script order data but no mappings");
411             }
412             int scriptsLength = length / 2;
413             CharBuffer inChars = inBytes.asCharBuffer();
414             data.numScripts = inChars.get();
415             // There must be enough entries for both arrays, including more than two range starts.
416             int scriptStartsLength = scriptsLength - (1 + data.numScripts + 16);
417             if(scriptStartsLength <= 2) {
418                 throw new ICUException("Script order data too short");
419             }
420             inChars.get(data.scriptsIndex = new char[data.numScripts + 16]);
421             inChars.get(data.scriptStarts = new char[scriptStartsLength]);
422             if(!(data.scriptStarts[0] == 0 &&
423                     data.scriptStarts[1] == ((Collation.MERGE_SEPARATOR_BYTE + 1) << 8) &&
424                     data.scriptStarts[scriptStartsLength - 1] ==
425                             (Collation.TRAIL_WEIGHT_BYTE << 8))) {
426                 throw new ICUException("Script order data not valid");
427             }
428         } else if(data == null) {
429             // Nothing to do.
430         } else if(baseData != null) {
431             data.numScripts = baseData.numScripts;
432             data.scriptsIndex = baseData.scriptsIndex;
433             data.scriptStarts = baseData.scriptStarts;
434         }
435         ICUBinary.skipBytes(inBytes, length);
436 
437         index = IX_COMPRESSIBLE_BYTES_OFFSET;
438         offset = inIndexes[index];
439         length = inIndexes[index + 1] - offset;
440         if(length >= 256) {
441             if(data == null) {
442                 throw new ICUException("Data for compressible primary lead bytes but no mappings");
443             }
444             data.compressibleBytes = new boolean[256];
445             for(int i = 0; i < 256; ++i) {
446                 data.compressibleBytes[i] = inBytes.get() != 0;
447             }
448             length -= 256;
449         } else if(data == null) {
450             // Nothing to do.
451         } else if(baseData != null) {
452             data.compressibleBytes = baseData.compressibleBytes;
453         } else {
454             throw new ICUException("Missing data for compressible primary lead bytes");
455         }
456         ICUBinary.skipBytes(inBytes, length);
457 
458         index = IX_RESERVED18_OFFSET;
459         offset = inIndexes[index];
460         length = inIndexes[index + 1] - offset;
461         ICUBinary.skipBytes(inBytes, length);
462 
463         CollationSettings ts = tailoring.settings.readOnly();
464         int options = inIndexes[IX_OPTIONS] & 0xffff;
465         char[] fastLatinPrimaries = new char[CollationFastLatin.LATIN_LIMIT];
466         int fastLatinOptions = CollationFastLatin.getOptions(
467                 tailoring.data, ts, fastLatinPrimaries);
468         if(options == ts.options && ts.variableTop != 0 &&
469                 Arrays.equals(reorderCodes, ts.reorderCodes) &&
470                 fastLatinOptions == ts.fastLatinOptions &&
471                 (fastLatinOptions < 0 ||
472                         Arrays.equals(fastLatinPrimaries, ts.fastLatinPrimaries))) {
473             return;
474         }
475 
476         CollationSettings settings = tailoring.settings.copyOnWrite();
477         settings.options = options;
478         // Set variableTop from options and scripts data.
479         settings.variableTop = tailoring.data.getLastPrimaryForGroup(
480                 Collator.ReorderCodes.FIRST + settings.getMaxVariable());
481         if(settings.variableTop == 0) {
482             throw new ICUException("The maxVariable could not be mapped to a variableTop");
483         }
484 
485         if(reorderCodesLength != 0) {
486             settings.aliasReordering(baseData, reorderCodes, reorderCodesLength, reorderTable);
487         }
488 
489         settings.fastLatinOptions = CollationFastLatin.getOptions(
490             tailoring.data, settings,
491             settings.fastLatinPrimaries);
492     }
493 
494     private static final class IsAcceptable implements ICUBinary.Authenticate {
495         @Override
isDataVersionAcceptable(byte version[])496         public boolean isDataVersionAcceptable(byte version[]) {
497             return version[0] == 5;
498         }
499     }
500     private static final IsAcceptable IS_ACCEPTABLE = new IsAcceptable();
501     private static final int DATA_FORMAT = 0x55436f6c;  // "UCol"
502 
CollationDataReader()503     private CollationDataReader() {}  // no constructor
504 }
505 
/*
 * Format of collation data (ucadata.icu, binary data in coll/ *.res files):
 * See ICU4C source/common/collationdatareader.h.
 */