• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
* Copyright (C) 2013-2015, International Business Machines
* Corporation and others.  All Rights Reserved.
*******************************************************************************
* CollationDataReader.java, ported from collationdatareader.h/.cpp
*
* C++ version created on: 2013feb07
* created by: Markus W. Scherer
*/
13 
14 package com.ibm.icu.impl.coll;
15 
16 import java.io.IOException;
17 import java.nio.ByteBuffer;
18 import java.nio.CharBuffer;
19 import java.util.Arrays;
20 
21 import com.ibm.icu.impl.ICUBinary;
22 import com.ibm.icu.impl.Trie2_32;
23 import com.ibm.icu.impl.USerializedSet;
24 import com.ibm.icu.text.Collator;
25 import com.ibm.icu.text.UnicodeSet;
26 import com.ibm.icu.util.ICUException;
27 
28 /**
29  * Collation binary data reader.
30  */
31 final class CollationDataReader /* all static */ {
32     // The following constants are also copied into source/common/ucol_swp.cpp.
33     // Keep them in sync!
34     /**
35      * Number of int indexes.
36      *
37      * Can be 2 if there are only options.
38      * Can be 7 or 8 if there are only options and a script reordering.
39      * The loader treats any index>=indexes[IX_INDEXES_LENGTH] as 0.
40      */
41     static final int IX_INDEXES_LENGTH = 0;
42     /**
43      * Bits 31..24: numericPrimary, for numeric collation
44      *      23..16: fast Latin format version (0 = no fast Latin table)
45      *      15.. 0: options bit set
46      */
47     static final int IX_OPTIONS = 1;
48     static final int IX_RESERVED2 = 2;
49     static final int IX_RESERVED3 = 3;
50 
51     /** Array offset to Jamo CE32s in ce32s[], or <0 if none. */
52     static final int IX_JAMO_CE32S_START = 4;
53 
54     // Byte offsets from the start of the data, after the generic header.
55     // The indexes[] are at byte offset 0, other data follows.
56     // Each data item is aligned properly.
57     // The data items should be in descending order of unit size,
58     // to minimize the need for padding.
59     // Each item's byte length is given by the difference between its offset and
60     // the next index/offset value.
61     /** Byte offset to int reorderCodes[]. */
62     static final int IX_REORDER_CODES_OFFSET = 5;
63     /**
64      * Byte offset to uint8_t reorderTable[].
65      * Empty table if <256 bytes (padding only).
66      * Otherwise 256 bytes or more (with padding).
67      */
68     static final int IX_REORDER_TABLE_OFFSET = 6;
69     /** Byte offset to the collation trie. Its length is a multiple of 8 bytes. */
70     static final int IX_TRIE_OFFSET = 7;
71 
72     static final int IX_RESERVED8_OFFSET = 8;
73     /** Byte offset to long ces[]. */
74     static final int IX_CES_OFFSET = 9;
75     static final int IX_RESERVED10_OFFSET = 10;
76     /** Byte offset to int ce32s[]. */
77     static final int IX_CE32S_OFFSET = 11;
78 
79     /** Byte offset to uint32_t rootElements[]. */
80     static final int IX_ROOT_ELEMENTS_OFFSET = 12;
81     /** Byte offset to UChar *contexts[]. */
82     static final int IX_CONTEXTS_OFFSET = 13;
83     /** Byte offset to char [] with serialized unsafeBackwardSet. */
84     static final int IX_UNSAFE_BWD_OFFSET = 14;
85     /** Byte offset to char fastLatinTable[]. */
86     static final int IX_FAST_LATIN_TABLE_OFFSET = 15;
87 
88     /** Byte offset to char scripts[]. */
89     static final int IX_SCRIPTS_OFFSET = 16;
90     /**
91      * Byte offset to boolean compressibleBytes[].
92      * Empty table if <256 bytes (padding only).
93      * Otherwise 256 bytes or more (with padding).
94      */
95     static final int IX_COMPRESSIBLE_BYTES_OFFSET = 17;
96     static final int IX_RESERVED18_OFFSET = 18;
97     static final int IX_TOTAL_SIZE = 19;
98 
read(CollationTailoring base, ByteBuffer inBytes, CollationTailoring tailoring)99     static void read(CollationTailoring base, ByteBuffer inBytes,
100                      CollationTailoring tailoring) throws IOException {
101         tailoring.version = ICUBinary.readHeader(inBytes, DATA_FORMAT, IS_ACCEPTABLE);
102         if(base != null && base.getUCAVersion() != tailoring.getUCAVersion()) {
103             throw new ICUException("Tailoring UCA version differs from base data UCA version");
104         }
105 
106         int inLength = inBytes.remaining();
107         if(inLength < 8) {
108             throw new ICUException("not enough bytes");
109         }
110         int indexesLength = inBytes.getInt();  // inIndexes[IX_INDEXES_LENGTH]
111         if(indexesLength < 2 || inLength < indexesLength * 4) {
112             throw new ICUException("not enough indexes");
113         }
114         int[] inIndexes = new int[IX_TOTAL_SIZE + 1];
115         inIndexes[0] = indexesLength;
116         for(int i = 1; i < indexesLength && i < inIndexes.length; ++i) {
117             inIndexes[i] = inBytes.getInt();
118         }
119         for(int i = indexesLength; i < inIndexes.length; ++i) {
120             inIndexes[i] = -1;
121         }
122         if(indexesLength > inIndexes.length) {
123             ICUBinary.skipBytes(inBytes, (indexesLength - inIndexes.length) * 4);
124         }
125 
126         // Assume that the tailoring data is in initial state,
127         // with null pointers and 0 lengths.
128 
129         // Set pointers to non-empty data parts.
130         // Do this in order of their byte offsets. (Should help porting to Java.)
131 
132         int index;  // one of the indexes[] slots
133         int offset;  // byte offset for the index part
134         int length;  // number of bytes in the index part
135 
136         if(indexesLength > IX_TOTAL_SIZE) {
137             length = inIndexes[IX_TOTAL_SIZE];
138         } else if(indexesLength > IX_REORDER_CODES_OFFSET) {
139             length = inIndexes[indexesLength - 1];
140         } else {
141             length = 0;  // only indexes, and inLength was already checked for them
142         }
143         if(inLength < length) {
144             throw new ICUException("not enough bytes");
145         }
146 
147         CollationData baseData = base == null ? null : base.data;
148         int[] reorderCodes;
149         int reorderCodesLength;
150         index = IX_REORDER_CODES_OFFSET;
151         offset = inIndexes[index];
152         length = inIndexes[index + 1] - offset;
153         if(length >= 4) {
154             if(baseData == null) {
155                 // We assume for collation settings that
156                 // the base data does not have a reordering.
157                 throw new ICUException("Collation base data must not reorder scripts");
158             }
159             reorderCodesLength = length / 4;
160             reorderCodes = ICUBinary.getInts(inBytes, reorderCodesLength, length & 3);
161 
162             // The reorderRanges (if any) are the trailing reorderCodes entries.
163             // Split the array at the boundary.
164             // Script or reorder codes do not exceed 16-bit values.
165             // Range limits are stored in the upper 16 bits, and are never 0.
166             int reorderRangesLength = 0;
167             while(reorderRangesLength < reorderCodesLength &&
168                     (reorderCodes[reorderCodesLength - reorderRangesLength - 1] & 0xffff0000) != 0) {
169                 ++reorderRangesLength;
170             }
171             assert(reorderRangesLength < reorderCodesLength);
172             reorderCodesLength -= reorderRangesLength;
173         } else {
174             reorderCodes = new int[0];
175             reorderCodesLength = 0;
176             ICUBinary.skipBytes(inBytes, length);
177         }
178 
179         // There should be a reorder table only if there are reorder codes.
180         // However, when there are reorder codes the reorder table may be omitted to reduce
181         // the data size.
182         byte[] reorderTable = null;
183         index = IX_REORDER_TABLE_OFFSET;
184         offset = inIndexes[index];
185         length = inIndexes[index + 1] - offset;
186         if(length >= 256) {
187             if(reorderCodesLength == 0) {
188                 throw new ICUException("Reordering table without reordering codes");
189             }
190             reorderTable = new byte[256];
191             inBytes.get(reorderTable);
192             length -= 256;
193         } else {
194             // If we have reorder codes, then build the reorderTable at the end,
195             // when the CollationData is otherwise complete.
196         }
197         ICUBinary.skipBytes(inBytes, length);
198 
199         if(baseData != null && baseData.numericPrimary != (inIndexes[IX_OPTIONS] & 0xff000000L)) {
200             throw new ICUException("Tailoring numeric primary weight differs from base data");
201         }
202         CollationData data = null;  // Remains null if there are no mappings.
203 
204         index = IX_TRIE_OFFSET;
205         offset = inIndexes[index];
206         length = inIndexes[index + 1] - offset;
207         if(length >= 8) {
208             tailoring.ensureOwnedData();
209             data = tailoring.ownedData;
210             data.base = baseData;
211             data.numericPrimary = inIndexes[IX_OPTIONS] & 0xff000000L;
212             data.trie = tailoring.trie = Trie2_32.createFromSerialized(inBytes);
213             int trieLength = data.trie.getSerializedLength();
214             if(trieLength > length) {
215                 throw new ICUException("Not enough bytes for the mappings trie");  // No mappings.
216             }
217             length -= trieLength;
218         } else if(baseData != null) {
219             // Use the base data. Only the settings are tailored.
220             tailoring.data = baseData;
221         } else {
222             throw new ICUException("Missing collation data mappings");  // No mappings.
223         }
224         ICUBinary.skipBytes(inBytes, length);
225 
226         index = IX_RESERVED8_OFFSET;
227         offset = inIndexes[index];
228         length = inIndexes[index + 1] - offset;
229         ICUBinary.skipBytes(inBytes, length);
230 
231         index = IX_CES_OFFSET;
232         offset = inIndexes[index];
233         length = inIndexes[index + 1] - offset;
234         if(length >= 8) {
235             if(data == null) {
236                 throw new ICUException("Tailored ces without tailored trie");
237             }
238             data.ces = ICUBinary.getLongs(inBytes, length / 8, length & 7);
239         } else {
240             ICUBinary.skipBytes(inBytes, length);
241         }
242 
243         index = IX_RESERVED10_OFFSET;
244         offset = inIndexes[index];
245         length = inIndexes[index + 1] - offset;
246         ICUBinary.skipBytes(inBytes, length);
247 
248         index = IX_CE32S_OFFSET;
249         offset = inIndexes[index];
250         length = inIndexes[index + 1] - offset;
251         if(length >= 4) {
252             if(data == null) {
253                 throw new ICUException("Tailored ce32s without tailored trie");
254             }
255             data.ce32s = ICUBinary.getInts(inBytes, length / 4, length & 3);
256         } else {
257             ICUBinary.skipBytes(inBytes, length);
258         }
259 
260         int jamoCE32sStart = inIndexes[IX_JAMO_CE32S_START];
261         if(jamoCE32sStart >= 0) {
262             if(data == null || data.ce32s == null) {
263                 throw new ICUException("JamoCE32sStart index into non-existent ce32s[]");
264             }
265             data.jamoCE32s = new int[CollationData.JAMO_CE32S_LENGTH];
266             System.arraycopy(data.ce32s, jamoCE32sStart, data.jamoCE32s, 0, CollationData.JAMO_CE32S_LENGTH);
267         } else if(data == null) {
268             // Nothing to do.
269         } else if(baseData != null) {
270             data.jamoCE32s = baseData.jamoCE32s;
271         } else {
272             throw new ICUException("Missing Jamo CE32s for Hangul processing");
273         }
274 
275         index = IX_ROOT_ELEMENTS_OFFSET;
276         offset = inIndexes[index];
277         length = inIndexes[index + 1] - offset;
278         if(length >= 4) {
279             int rootElementsLength = length / 4;
280             if(data == null) {
281                 throw new ICUException("Root elements but no mappings");
282             }
283             if(rootElementsLength <= CollationRootElements.IX_SEC_TER_BOUNDARIES) {
284                 throw new ICUException("Root elements array too short");
285             }
286             data.rootElements = new long[rootElementsLength];
287             for(int i = 0; i < rootElementsLength; ++i) {
288                 data.rootElements[i] = inBytes.getInt() & 0xffffffffL;  // unsigned int -> long
289             }
290             long commonSecTer = data.rootElements[CollationRootElements.IX_COMMON_SEC_AND_TER_CE];
291             if(commonSecTer != Collation.COMMON_SEC_AND_TER_CE) {
292                 throw new ICUException("Common sec/ter weights in base data differ from the hardcoded value");
293             }
294             long secTerBoundaries = data.rootElements[CollationRootElements.IX_SEC_TER_BOUNDARIES];
295             if((secTerBoundaries >>> 24) < CollationKeys.SEC_COMMON_HIGH) {
296                 // [fixed last secondary common byte] is too low,
297                 // and secondary weights would collide with compressed common secondaries.
298                 throw new ICUException("[fixed last secondary common byte] is too low");
299             }
300             length &= 3;
301         }
302         ICUBinary.skipBytes(inBytes, length);
303 
304         index = IX_CONTEXTS_OFFSET;
305         offset = inIndexes[index];
306         length = inIndexes[index + 1] - offset;
307         if(length >= 2) {
308             if(data == null) {
309                 throw new ICUException("Tailored contexts without tailored trie");
310             }
311             data.contexts = ICUBinary.getString(inBytes, length / 2, length & 1);
312         } else {
313             ICUBinary.skipBytes(inBytes, length);
314         }
315 
316         index = IX_UNSAFE_BWD_OFFSET;
317         offset = inIndexes[index];
318         length = inIndexes[index + 1] - offset;
319         if(length >= 2) {
320             if(data == null) {
321                 throw new ICUException("Unsafe-backward-set but no mappings");
322             }
323             if(baseData == null) {
324                 // Create the unsafe-backward set for the root collator.
325                 // Include all non-zero combining marks and trail surrogates.
326                 // We do this at load time, rather than at build time,
327                 // to simplify Unicode version bootstrapping:
328                 // The root data builder only needs the new FractionalUCA.txt data,
329                 // but it need not be built with a version of ICU already updated to
330                 // the corresponding new Unicode Character Database.
331                 //
332                 // The following is an optimized version of
333                 // new UnicodeSet("[[:^lccc=0:][\\udc00-\\udfff]]").
334                 // It is faster and requires fewer code dependencies.
335                 tailoring.unsafeBackwardSet = new UnicodeSet(0xdc00, 0xdfff);  // trail surrogates
336                 data.nfcImpl.addLcccChars(tailoring.unsafeBackwardSet);
337             } else {
338                 // Clone the root collator's set contents.
339                 tailoring.unsafeBackwardSet = baseData.unsafeBackwardSet.cloneAsThawed();
340             }
341             // Add the ranges from the data file to the unsafe-backward set.
342             USerializedSet sset = new USerializedSet();
343             char[] unsafeData = ICUBinary.getChars(inBytes, length / 2, length & 1);
344             length = 0;
345             sset.getSet(unsafeData, 0);
346             int count = sset.countRanges();
347             int[] range = new int[2];
348             for(int i = 0; i < count; ++i) {
349                 sset.getRange(i, range);
350                 tailoring.unsafeBackwardSet.add(range[0], range[1]);
351             }
352             // Mark each lead surrogate as "unsafe"
353             // if any of its 1024 associated supplementary code points is "unsafe".
354             int c = 0x10000;
355             for(int lead = 0xd800; lead < 0xdc00; ++lead, c += 0x400) {
356                 if(!tailoring.unsafeBackwardSet.containsNone(c, c + 0x3ff)) {
357                     tailoring.unsafeBackwardSet.add(lead);
358                 }
359             }
360             tailoring.unsafeBackwardSet.freeze();
361             data.unsafeBackwardSet = tailoring.unsafeBackwardSet;
362         } else if(data == null) {
363             // Nothing to do.
364         } else if(baseData != null) {
365             // No tailoring-specific data: Alias the root collator's set.
366             data.unsafeBackwardSet = baseData.unsafeBackwardSet;
367         } else {
368             throw new ICUException("Missing unsafe-backward-set");
369         }
370         ICUBinary.skipBytes(inBytes, length);
371 
372         // If the fast Latin format version is different,
373         // or the version is set to 0 for "no fast Latin table",
374         // then just always use the normal string comparison path.
375         index = IX_FAST_LATIN_TABLE_OFFSET;
376         offset = inIndexes[index];
377         length = inIndexes[index + 1] - offset;
378         if(data != null) {
379             data.fastLatinTable = null;
380             data.fastLatinTableHeader = null;
381             if(((inIndexes[IX_OPTIONS] >> 16) & 0xff) == CollationFastLatin.VERSION) {
382                 if(length >= 2) {
383                     char header0 = inBytes.getChar();
384                     int headerLength = header0 & 0xff;
385                     data.fastLatinTableHeader = new char[headerLength];
386                     data.fastLatinTableHeader[0] = header0;
387                     for(int i = 1; i < headerLength; ++i) {
388                         data.fastLatinTableHeader[i] = inBytes.getChar();
389                     }
390                     int tableLength = length / 2 - headerLength;
391                     data.fastLatinTable = ICUBinary.getChars(inBytes, tableLength, length & 1);
392                     length = 0;
393                     if((header0 >> 8) != CollationFastLatin.VERSION) {
394                         throw new ICUException("Fast-Latin table version differs from version in data header");
395                     }
396                 } else if(baseData != null) {
397                     data.fastLatinTable = baseData.fastLatinTable;
398                     data.fastLatinTableHeader = baseData.fastLatinTableHeader;
399                 }
400             }
401         }
402         ICUBinary.skipBytes(inBytes, length);
403 
404         index = IX_SCRIPTS_OFFSET;
405         offset = inIndexes[index];
406         length = inIndexes[index + 1] - offset;
407         if(length >= 2) {
408             if(data == null) {
409                 throw new ICUException("Script order data but no mappings");
410             }
411             int scriptsLength = length / 2;
412             CharBuffer inChars = inBytes.asCharBuffer();
413             data.numScripts = inChars.get();
414             // There must be enough entries for both arrays, including more than two range starts.
415             int scriptStartsLength = scriptsLength - (1 + data.numScripts + 16);
416             if(scriptStartsLength <= 2) {
417                 throw new ICUException("Script order data too short");
418             }
419             inChars.get(data.scriptsIndex = new char[data.numScripts + 16]);
420             inChars.get(data.scriptStarts = new char[scriptStartsLength]);
421             if(!(data.scriptStarts[0] == 0 &&
422                     data.scriptStarts[1] == ((Collation.MERGE_SEPARATOR_BYTE + 1) << 8) &&
423                     data.scriptStarts[scriptStartsLength - 1] ==
424                             (Collation.TRAIL_WEIGHT_BYTE << 8))) {
425                 throw new ICUException("Script order data not valid");
426             }
427         } else if(data == null) {
428             // Nothing to do.
429         } else if(baseData != null) {
430             data.numScripts = baseData.numScripts;
431             data.scriptsIndex = baseData.scriptsIndex;
432             data.scriptStarts = baseData.scriptStarts;
433         }
434         ICUBinary.skipBytes(inBytes, length);
435 
436         index = IX_COMPRESSIBLE_BYTES_OFFSET;
437         offset = inIndexes[index];
438         length = inIndexes[index + 1] - offset;
439         if(length >= 256) {
440             if(data == null) {
441                 throw new ICUException("Data for compressible primary lead bytes but no mappings");
442             }
443             data.compressibleBytes = new boolean[256];
444             for(int i = 0; i < 256; ++i) {
445                 data.compressibleBytes[i] = inBytes.get() != 0;
446             }
447             length -= 256;
448         } else if(data == null) {
449             // Nothing to do.
450         } else if(baseData != null) {
451             data.compressibleBytes = baseData.compressibleBytes;
452         } else {
453             throw new ICUException("Missing data for compressible primary lead bytes");
454         }
455         ICUBinary.skipBytes(inBytes, length);
456 
457         index = IX_RESERVED18_OFFSET;
458         offset = inIndexes[index];
459         length = inIndexes[index + 1] - offset;
460         ICUBinary.skipBytes(inBytes, length);
461 
462         CollationSettings ts = tailoring.settings.readOnly();
463         int options = inIndexes[IX_OPTIONS] & 0xffff;
464         char[] fastLatinPrimaries = new char[CollationFastLatin.LATIN_LIMIT];
465         int fastLatinOptions = CollationFastLatin.getOptions(
466                 tailoring.data, ts, fastLatinPrimaries);
467         if(options == ts.options && ts.variableTop != 0 &&
468                 Arrays.equals(reorderCodes, ts.reorderCodes) &&
469                 fastLatinOptions == ts.fastLatinOptions &&
470                 (fastLatinOptions < 0 ||
471                         Arrays.equals(fastLatinPrimaries, ts.fastLatinPrimaries))) {
472             return;
473         }
474 
475         CollationSettings settings = tailoring.settings.copyOnWrite();
476         settings.options = options;
477         // Set variableTop from options and scripts data.
478         settings.variableTop = tailoring.data.getLastPrimaryForGroup(
479                 Collator.ReorderCodes.FIRST + settings.getMaxVariable());
480         if(settings.variableTop == 0) {
481             throw new ICUException("The maxVariable could not be mapped to a variableTop");
482         }
483 
484         if(reorderCodesLength != 0) {
485             settings.aliasReordering(baseData, reorderCodes, reorderCodesLength, reorderTable);
486         }
487 
488         settings.fastLatinOptions = CollationFastLatin.getOptions(
489             tailoring.data, settings,
490             settings.fastLatinPrimaries);
491     }
492 
493     private static final class IsAcceptable implements ICUBinary.Authenticate {
494         @Override
isDataVersionAcceptable(byte version[])495         public boolean isDataVersionAcceptable(byte version[]) {
496             return version[0] == 5;
497         }
498     }
499     private static final IsAcceptable IS_ACCEPTABLE = new IsAcceptable();
500     private static final int DATA_FORMAT = 0x55436f6c;  // "UCol"
501 
CollationDataReader()502     private CollationDataReader() {}  // no constructor
503 }
504 
/*
 * Format of collation data (ucadata.icu, binary data in coll/ *.res files):
 * See ICU4C source/common/collationdatareader.h.
 */
509