• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 *******************************************************************************
3 * Copyright (C) 2013-2014, International Business Machines
4 * Corporation and others.  All Rights Reserved.
5 *******************************************************************************
6 * collationdatareader.cpp
7 *
8 * created on: 2013feb07
9 * created by: Markus W. Scherer
10 */
11 
12 #include "unicode/utypes.h"
13 
14 #if !UCONFIG_NO_COLLATION
15 
16 #include "unicode/ucol.h"
17 #include "unicode/udata.h"
18 #include "unicode/uscript.h"
19 #include "cmemory.h"
20 #include "collation.h"
21 #include "collationdata.h"
22 #include "collationdatareader.h"
23 #include "collationfastlatin.h"
24 #include "collationkeys.h"
25 #include "collationrootelements.h"
26 #include "collationsettings.h"
27 #include "collationtailoring.h"
28 #include "normalizer2impl.h"
29 #include "uassert.h"
30 #include "ucmndata.h"
31 #include "utrie2.h"
32 
33 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
34 
35 U_NAMESPACE_BEGIN
36 
37 namespace {
38 
/**
 * Bounds-checked read of one slot of an indexes[] array.
 *
 * @param indexes the array of int32_t index values
 * @param length  the number of slots in indexes
 * @param i       the slot to read
 * @return indexes[i] if 0 <= i < length, otherwise -1
 */
int32_t getIndex(const int32_t *indexes, int32_t length, int32_t i) {
    // Reject negative slots as well as slots past the end,
    // so that a bad slot number can never read outside the array.
    return (0 <= i && i < length) ? indexes[i] : -1;
}
42 
43 }  // namespace
44 
45 void
read(const CollationTailoring * base,const uint8_t * inBytes,int32_t inLength,CollationTailoring & tailoring,UErrorCode & errorCode)46 CollationDataReader::read(const CollationTailoring *base, const uint8_t *inBytes, int32_t inLength,
47                           CollationTailoring &tailoring, UErrorCode &errorCode) {
48     if(U_FAILURE(errorCode)) { return; }
49     if(base != NULL) {
50         if(inBytes == NULL || (0 <= inLength && inLength < 24)) {
51             errorCode = U_ILLEGAL_ARGUMENT_ERROR;
52             return;
53         }
54         const DataHeader *header = reinterpret_cast<const DataHeader *>(inBytes);
55         if(!(header->dataHeader.magic1 == 0xda && header->dataHeader.magic2 == 0x27 &&
56                 isAcceptable(tailoring.version, NULL, NULL, &header->info))) {
57             errorCode = U_INVALID_FORMAT_ERROR;
58             return;
59         }
60         if(base->getUCAVersion() != tailoring.getUCAVersion()) {
61             errorCode = U_COLLATOR_VERSION_MISMATCH;
62             return;
63         }
64         int32_t headerLength = header->dataHeader.headerSize;
65         inBytes += headerLength;
66         if(inLength >= 0) {
67             inLength -= headerLength;
68         }
69     }
70 
71     if(inBytes == NULL || (0 <= inLength && inLength < 8)) {
72         errorCode = U_ILLEGAL_ARGUMENT_ERROR;
73         return;
74     }
75     const int32_t *inIndexes = reinterpret_cast<const int32_t *>(inBytes);
76     int32_t indexesLength = inIndexes[IX_INDEXES_LENGTH];
77     if(indexesLength < 2 || (0 <= inLength && inLength < indexesLength * 4)) {
78         errorCode = U_INVALID_FORMAT_ERROR;  // Not enough indexes.
79         return;
80     }
81 
82     // Assume that the tailoring data is in initial state,
83     // with NULL pointers and 0 lengths.
84 
85     // Set pointers to non-empty data parts.
86     // Do this in order of their byte offsets. (Should help porting to Java.)
87 
88     int32_t index;  // one of the indexes[] slots
89     int32_t offset;  // byte offset for the index part
90     int32_t length;  // number of bytes in the index part
91 
92     if(indexesLength > IX_TOTAL_SIZE) {
93         length = inIndexes[IX_TOTAL_SIZE];
94     } else if(indexesLength > IX_REORDER_CODES_OFFSET) {
95         length = inIndexes[indexesLength - 1];
96     } else {
97         length = 0;  // only indexes, and inLength was already checked for them
98     }
99     if(0 <= inLength && inLength < length) {
100         errorCode = U_INVALID_FORMAT_ERROR;
101         return;
102     }
103 
104     const CollationData *baseData = base == NULL ? NULL : base->data;
105     const int32_t *reorderCodes = NULL;
106     int32_t reorderCodesLength = 0;
107     index = IX_REORDER_CODES_OFFSET;
108     offset = getIndex(inIndexes, indexesLength, index);
109     length = getIndex(inIndexes, indexesLength, index + 1) - offset;
110     if(length >= 4) {
111         if(baseData == NULL) {
112             // We assume for collation settings that
113             // the base data does not have a reordering.
114             errorCode = U_INVALID_FORMAT_ERROR;
115             return;
116         }
117         reorderCodes = reinterpret_cast<const int32_t *>(inBytes + offset);
118         reorderCodesLength = length / 4;
119     }
120 
121     // There should be a reorder table only if there are reorder codes.
122     // However, when there are reorder codes the reorder table may be omitted to reduce
123     // the data size.
124     const uint8_t *reorderTable = NULL;
125     index = IX_REORDER_TABLE_OFFSET;
126     offset = getIndex(inIndexes, indexesLength, index);
127     length = getIndex(inIndexes, indexesLength, index + 1) - offset;
128     if(length >= 256) {
129         if(reorderCodesLength == 0) {
130             errorCode = U_INVALID_FORMAT_ERROR;  // Reordering table without reordering codes.
131             return;
132         }
133         reorderTable = inBytes + offset;
134     } else {
135         // If we have reorder codes, then build the reorderTable at the end,
136         // when the CollationData is otherwise complete.
137     }
138 
139     if(baseData != NULL && baseData->numericPrimary != (inIndexes[IX_OPTIONS] & 0xff000000)) {
140         errorCode = U_INVALID_FORMAT_ERROR;
141         return;
142     }
143     CollationData *data = NULL;  // Remains NULL if there are no mappings.
144 
145     index = IX_TRIE_OFFSET;
146     offset = getIndex(inIndexes, indexesLength, index);
147     length = getIndex(inIndexes, indexesLength, index + 1) - offset;
148     if(length >= 8) {
149         if(!tailoring.ensureOwnedData(errorCode)) { return; }
150         data = tailoring.ownedData;
151         data->base = baseData;
152         data->numericPrimary = inIndexes[IX_OPTIONS] & 0xff000000;
153         data->trie = tailoring.trie = utrie2_openFromSerialized(
154             UTRIE2_32_VALUE_BITS, inBytes + offset, length, NULL,
155             &errorCode);
156         if(U_FAILURE(errorCode)) { return; }
157     } else if(baseData != NULL) {
158         // Use the base data. Only the settings are tailored.
159         tailoring.data = baseData;
160     } else {
161         errorCode = U_INVALID_FORMAT_ERROR;  // No mappings.
162         return;
163     }
164 
165     index = IX_CES_OFFSET;
166     offset = getIndex(inIndexes, indexesLength, index);
167     length = getIndex(inIndexes, indexesLength, index + 1) - offset;
168     if(length >= 8) {
169         if(data == NULL) {
170             errorCode = U_INVALID_FORMAT_ERROR;  // Tailored ces without tailored trie.
171             return;
172         }
173         data->ces = reinterpret_cast<const int64_t *>(inBytes + offset);
174         data->cesLength = length / 8;
175     }
176 
177     index = IX_CE32S_OFFSET;
178     offset = getIndex(inIndexes, indexesLength, index);
179     length = getIndex(inIndexes, indexesLength, index + 1) - offset;
180     if(length >= 4) {
181         if(data == NULL) {
182             errorCode = U_INVALID_FORMAT_ERROR;  // Tailored ce32s without tailored trie.
183             return;
184         }
185         data->ce32s = reinterpret_cast<const uint32_t *>(inBytes + offset);
186         data->ce32sLength = length / 4;
187     }
188 
189     int32_t jamoCE32sStart = getIndex(inIndexes, indexesLength, IX_JAMO_CE32S_START);
190     if(jamoCE32sStart >= 0) {
191         if(data == NULL || data->ce32s == NULL) {
192             errorCode = U_INVALID_FORMAT_ERROR;  // Index into non-existent ce32s[].
193             return;
194         }
195         data->jamoCE32s = data->ce32s + jamoCE32sStart;
196     } else if(data == NULL) {
197         // Nothing to do.
198     } else if(baseData != NULL) {
199         data->jamoCE32s = baseData->jamoCE32s;
200     } else {
201         errorCode = U_INVALID_FORMAT_ERROR;  // No Jamo CE32s for Hangul processing.
202         return;
203     }
204 
205     index = IX_ROOT_ELEMENTS_OFFSET;
206     offset = getIndex(inIndexes, indexesLength, index);
207     length = getIndex(inIndexes, indexesLength, index + 1) - offset;
208     if(length >= 4) {
209         length /= 4;
210         if(data == NULL || length <= CollationRootElements::IX_SEC_TER_BOUNDARIES) {
211             errorCode = U_INVALID_FORMAT_ERROR;
212             return;
213         }
214         data->rootElements = reinterpret_cast<const uint32_t *>(inBytes + offset);
215         data->rootElementsLength = length;
216         uint32_t commonSecTer = data->rootElements[CollationRootElements::IX_COMMON_SEC_AND_TER_CE];
217         if(commonSecTer != Collation::COMMON_SEC_AND_TER_CE) {
218             errorCode = U_INVALID_FORMAT_ERROR;
219             return;
220         }
221         uint32_t secTerBoundaries = data->rootElements[CollationRootElements::IX_SEC_TER_BOUNDARIES];
222         if((secTerBoundaries >> 24) < CollationKeys::SEC_COMMON_HIGH) {
223             // [fixed last secondary common byte] is too low,
224             // and secondary weights would collide with compressed common secondaries.
225             errorCode = U_INVALID_FORMAT_ERROR;
226             return;
227         }
228     }
229 
230     index = IX_CONTEXTS_OFFSET;
231     offset = getIndex(inIndexes, indexesLength, index);
232     length = getIndex(inIndexes, indexesLength, index + 1) - offset;
233     if(length >= 2) {
234         if(data == NULL) {
235             errorCode = U_INVALID_FORMAT_ERROR;  // Tailored contexts without tailored trie.
236             return;
237         }
238         data->contexts = reinterpret_cast<const UChar *>(inBytes + offset);
239         data->contextsLength = length / 2;
240     }
241 
242     index = IX_UNSAFE_BWD_OFFSET;
243     offset = getIndex(inIndexes, indexesLength, index);
244     length = getIndex(inIndexes, indexesLength, index + 1) - offset;
245     if(length >= 2) {
246         if(data == NULL) {
247             errorCode = U_INVALID_FORMAT_ERROR;
248             return;
249         }
250         if(baseData == NULL) {
251             // Create the unsafe-backward set for the root collator.
252             // Include all non-zero combining marks and trail surrogates.
253             // We do this at load time, rather than at build time,
254             // to simplify Unicode version bootstrapping:
255             // The root data builder only needs the new FractionalUCA.txt data,
256             // but it need not be built with a version of ICU already updated to
257             // the corresponding new Unicode Character Database.
258             //
259             // The following is an optimized version of
260             // new UnicodeSet("[[:^lccc=0:][\\udc00-\\udfff]]").
261             // It is faster and requires fewer code dependencies.
262             tailoring.unsafeBackwardSet = new UnicodeSet(0xdc00, 0xdfff);  // trail surrogates
263             if(tailoring.unsafeBackwardSet == NULL) {
264                 errorCode = U_MEMORY_ALLOCATION_ERROR;
265                 return;
266             }
267             data->nfcImpl.addLcccChars(*tailoring.unsafeBackwardSet);
268         } else {
269             // Clone the root collator's set contents.
270             tailoring.unsafeBackwardSet = static_cast<UnicodeSet *>(
271                 baseData->unsafeBackwardSet->cloneAsThawed());
272             if(tailoring.unsafeBackwardSet == NULL) {
273                 errorCode = U_MEMORY_ALLOCATION_ERROR;
274                 return;
275             }
276         }
277         // Add the ranges from the data file to the unsafe-backward set.
278         USerializedSet sset;
279         const uint16_t *unsafeData = reinterpret_cast<const uint16_t *>(inBytes + offset);
280         if(!uset_getSerializedSet(&sset, unsafeData, length / 2)) {
281             errorCode = U_INVALID_FORMAT_ERROR;
282             return;
283         }
284         int32_t count = uset_getSerializedRangeCount(&sset);
285         for(int32_t i = 0; i < count; ++i) {
286             UChar32 start, end;
287             uset_getSerializedRange(&sset, i, &start, &end);
288             tailoring.unsafeBackwardSet->add(start, end);
289         }
290         // Mark each lead surrogate as "unsafe"
291         // if any of its 1024 associated supplementary code points is "unsafe".
292         UChar32 c = 0x10000;
293         for(UChar lead = 0xd800; lead < 0xdc00; ++lead, c += 0x400) {
294             if(!tailoring.unsafeBackwardSet->containsNone(c, c + 0x3ff)) {
295                 tailoring.unsafeBackwardSet->add(lead);
296             }
297         }
298         tailoring.unsafeBackwardSet->freeze();
299         data->unsafeBackwardSet = tailoring.unsafeBackwardSet;
300     } else if(data == NULL) {
301         // Nothing to do.
302     } else if(baseData != NULL) {
303         // No tailoring-specific data: Alias the root collator's set.
304         data->unsafeBackwardSet = baseData->unsafeBackwardSet;
305     } else {
306         errorCode = U_INVALID_FORMAT_ERROR;  // No unsafeBackwardSet.
307         return;
308     }
309 
310     // If the fast Latin format version is different,
311     // or the version is set to 0 for "no fast Latin table",
312     // then just always use the normal string comparison path.
313     if(data != NULL) {
314         data->fastLatinTable = NULL;
315         data->fastLatinTableLength = 0;
316         if(((inIndexes[IX_OPTIONS] >> 16) & 0xff) == CollationFastLatin::VERSION) {
317             index = IX_FAST_LATIN_TABLE_OFFSET;
318             offset = getIndex(inIndexes, indexesLength, index);
319             length = getIndex(inIndexes, indexesLength, index + 1) - offset;
320             if(length >= 2) {
321                 data->fastLatinTable = reinterpret_cast<const uint16_t *>(inBytes + offset);
322                 data->fastLatinTableLength = length / 2;
323                 if((*data->fastLatinTable >> 8) != CollationFastLatin::VERSION) {
324                     errorCode = U_INVALID_FORMAT_ERROR;  // header vs. table version mismatch
325                     return;
326                 }
327             } else if(baseData != NULL) {
328                 data->fastLatinTable = baseData->fastLatinTable;
329                 data->fastLatinTableLength = baseData->fastLatinTableLength;
330             }
331         }
332     }
333 
334     index = IX_SCRIPTS_OFFSET;
335     offset = getIndex(inIndexes, indexesLength, index);
336     length = getIndex(inIndexes, indexesLength, index + 1) - offset;
337     if(length >= 2) {
338         if(data == NULL) {
339             errorCode = U_INVALID_FORMAT_ERROR;
340             return;
341         }
342         data->scripts = reinterpret_cast<const uint16_t *>(inBytes + offset);
343         data->scriptsLength = length / 2;
344     } else if(data == NULL) {
345         // Nothing to do.
346     } else if(baseData != NULL) {
347         data->scripts = baseData->scripts;
348         data->scriptsLength = baseData->scriptsLength;
349     }
350 
351     index = IX_COMPRESSIBLE_BYTES_OFFSET;
352     offset = getIndex(inIndexes, indexesLength, index);
353     length = getIndex(inIndexes, indexesLength, index + 1) - offset;
354     if(length >= 256) {
355         if(data == NULL) {
356             errorCode = U_INVALID_FORMAT_ERROR;
357             return;
358         }
359         data->compressibleBytes = reinterpret_cast<const UBool *>(inBytes + offset);
360     } else if(data == NULL) {
361         // Nothing to do.
362     } else if(baseData != NULL) {
363         data->compressibleBytes = baseData->compressibleBytes;
364     } else {
365         errorCode = U_INVALID_FORMAT_ERROR;  // No compressibleBytes[].
366         return;
367     }
368 
369     const CollationSettings &ts = *tailoring.settings;
370     int32_t options = inIndexes[IX_OPTIONS] & 0xffff;
371     uint16_t fastLatinPrimaries[CollationFastLatin::LATIN_LIMIT];
372     int32_t fastLatinOptions = CollationFastLatin::getOptions(
373             tailoring.data, ts, fastLatinPrimaries, LENGTHOF(fastLatinPrimaries));
374     if(options == ts.options && ts.variableTop != 0 &&
375             reorderCodesLength == ts.reorderCodesLength &&
376             uprv_memcmp(reorderCodes, ts.reorderCodes, reorderCodesLength * 4) == 0 &&
377             fastLatinOptions == ts.fastLatinOptions &&
378             (fastLatinOptions < 0 ||
379                 uprv_memcmp(fastLatinPrimaries, ts.fastLatinPrimaries,
380                             sizeof(fastLatinPrimaries)) == 0)) {
381         return;
382     }
383 
384     CollationSettings *settings = SharedObject::copyOnWrite(tailoring.settings);
385     if(settings == NULL) {
386         errorCode = U_MEMORY_ALLOCATION_ERROR;
387         return;
388     }
389     settings->options = options;
390     // Set variableTop from options and scripts data.
391     settings->variableTop = tailoring.data->getLastPrimaryForGroup(
392             UCOL_REORDER_CODE_FIRST + settings->getMaxVariable());
393     if(settings->variableTop == 0) {
394         errorCode = U_INVALID_FORMAT_ERROR;
395         return;
396     }
397 
398     if(reorderCodesLength == 0 || reorderTable != NULL) {
399         settings->aliasReordering(reorderCodes, reorderCodesLength, reorderTable);
400     } else {
401         uint8_t table[256];
402         baseData->makeReorderTable(reorderCodes, reorderCodesLength, table, errorCode);
403         if(U_FAILURE(errorCode)) { return; }
404         if(!settings->setReordering(reorderCodes, reorderCodesLength,table)) {
405             errorCode = U_MEMORY_ALLOCATION_ERROR;
406             return;
407         }
408     }
409 
410     settings->fastLatinOptions = CollationFastLatin::getOptions(
411         tailoring.data, *settings,
412         settings->fastLatinPrimaries, LENGTHOF(settings->fastLatinPrimaries));
413 }
414 
415 UBool U_CALLCONV
isAcceptable(void * context,const char *,const char *,const UDataInfo * pInfo)416 CollationDataReader::isAcceptable(void *context,
417                                   const char * /* type */, const char * /*name*/,
418                                   const UDataInfo *pInfo) {
419     if(
420         pInfo->size >= 20 &&
421         pInfo->isBigEndian == U_IS_BIG_ENDIAN &&
422         pInfo->charsetFamily == U_CHARSET_FAMILY &&
423         pInfo->dataFormat[0] == 0x55 &&  // dataFormat="UCol"
424         pInfo->dataFormat[1] == 0x43 &&
425         pInfo->dataFormat[2] == 0x6f &&
426         pInfo->dataFormat[3] == 0x6c &&
427         pInfo->formatVersion[0] == 4
428     ) {
429         UVersionInfo *version = static_cast<UVersionInfo *>(context);
430         if(version != NULL) {
431             uprv_memcpy(version, pInfo->dataVersion, 4);
432         }
433         return TRUE;
434     } else {
435         return FALSE;
436     }
437 }
438 
439 U_NAMESPACE_END
440 
441 #endif  // !UCONFIG_NO_COLLATION
442