• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
/*
*******************************************************************************
* Copyright (C) 2012-2014, International Business Machines
* Corporation and others.  All Rights Reserved.
*******************************************************************************
* collationdata.cpp
*
* created on: 2012jul28
* created by: Markus W. Scherer
*/
11 
12 #include "unicode/utypes.h"
13 
14 #if !UCONFIG_NO_COLLATION
15 
16 #include "unicode/ucol.h"
17 #include "unicode/udata.h"
18 #include "unicode/uscript.h"
19 #include "cmemory.h"
20 #include "collation.h"
21 #include "collationdata.h"
22 #include "uassert.h"
23 #include "utrie2.h"
24 
25 U_NAMESPACE_BEGIN
26 
27 uint32_t
getIndirectCE32(uint32_t ce32) const28 CollationData::getIndirectCE32(uint32_t ce32) const {
29     U_ASSERT(Collation::isSpecialCE32(ce32));
30     int32_t tag = Collation::tagFromCE32(ce32);
31     if(tag == Collation::DIGIT_TAG) {
32         // Fetch the non-numeric-collation CE32.
33         ce32 = ce32s[Collation::indexFromCE32(ce32)];
34     } else if(tag == Collation::LEAD_SURROGATE_TAG) {
35         ce32 = Collation::UNASSIGNED_CE32;
36     } else if(tag == Collation::U0000_TAG) {
37         // Fetch the normal ce32 for U+0000.
38         ce32 = ce32s[0];
39     }
40     return ce32;
41 }
42 
43 uint32_t
getFinalCE32(uint32_t ce32) const44 CollationData::getFinalCE32(uint32_t ce32) const {
45     if(Collation::isSpecialCE32(ce32)) {
46         ce32 = getIndirectCE32(ce32);
47     }
48     return ce32;
49 }
50 
51 uint32_t
getFirstPrimaryForGroup(int32_t script) const52 CollationData::getFirstPrimaryForGroup(int32_t script) const {
53     int32_t index = findScript(script);
54     if(index < 0) {
55         return 0;
56     }
57     uint32_t head = scripts[index];
58     return (head & 0xff00) << 16;
59 }
60 
61 uint32_t
getLastPrimaryForGroup(int32_t script) const62 CollationData::getLastPrimaryForGroup(int32_t script) const {
63     int32_t index = findScript(script);
64     if(index < 0) {
65         return 0;
66     }
67     uint32_t head = scripts[index];
68     uint32_t lastByte = head & 0xff;
69     return ((lastByte + 1) << 24) - 1;
70 }
71 
72 int32_t
getGroupForPrimary(uint32_t p) const73 CollationData::getGroupForPrimary(uint32_t p) const {
74     p >>= 24;  // Reordering groups are distinguished by primary lead bytes.
75     for(int32_t i = 0; i < scriptsLength; i = i + 2 + scripts[i + 1]) {
76         uint32_t lastByte = scripts[i] & 0xff;
77         if(p <= lastByte) {
78             return scripts[i + 2];
79         }
80     }
81     return -1;
82 }
83 
84 int32_t
findScript(int32_t script) const85 CollationData::findScript(int32_t script) const {
86     if(script < 0 || 0xffff < script) { return -1; }
87     for(int32_t i = 0; i < scriptsLength;) {
88         int32_t limit = i + 2 + scripts[i + 1];
89         for(int32_t j = i + 2; j < limit; ++j) {
90             if(script == scripts[j]) { return i; }
91         }
92         i = limit;
93     }
94     return -1;
95 }
96 
97 int32_t
getEquivalentScripts(int32_t script,int32_t dest[],int32_t capacity,UErrorCode & errorCode) const98 CollationData::getEquivalentScripts(int32_t script,
99                                     int32_t dest[], int32_t capacity,
100                                     UErrorCode &errorCode) const {
101     if(U_FAILURE(errorCode)) { return 0; }
102     int32_t i = findScript(script);
103     if(i < 0) { return 0; }
104     int32_t length = scripts[i + 1];
105     U_ASSERT(length != 0);
106     if(length > capacity) {
107         errorCode = U_BUFFER_OVERFLOW_ERROR;
108         return length;
109     }
110     i += 2;
111     dest[0] = scripts[i++];
112     for(int32_t j = 1; j < length; ++j) {
113         script = scripts[i++];
114         // Sorted insertion.
115         for(int32_t k = j;; --k) {
116             // Invariant: dest[k] is free to receive either script or dest[k - 1].
117             if(k > 0 && script < dest[k - 1]) {
118                 dest[k] = dest[k - 1];
119             } else {
120                 dest[k] = script;
121                 break;
122             }
123         }
124     }
125     return length;
126 }
127 
128 void
makeReorderTable(const int32_t * reorder,int32_t length,uint8_t table[256],UErrorCode & errorCode) const129 CollationData::makeReorderTable(const int32_t *reorder, int32_t length,
130                                 uint8_t table[256], UErrorCode &errorCode) const {
131     if(U_FAILURE(errorCode)) { return; }
132 
133     // Initialize the table.
134     // Never reorder special low and high primary lead bytes.
135     int32_t lowByte;
136     for(lowByte = 0; lowByte <= Collation::MERGE_SEPARATOR_BYTE; ++lowByte) {
137         table[lowByte] = lowByte;
138     }
139     // lowByte == 03
140 
141     int32_t highByte;
142     for(highByte = 0xff; highByte >= Collation::TRAIL_WEIGHT_BYTE; --highByte) {
143         table[highByte] = highByte;
144     }
145     // highByte == FE
146 
147     // Set intermediate bytes to 0 to indicate that they have not been set yet.
148     for(int32_t i = lowByte; i <= highByte; ++i) {
149         table[i] = 0;
150     }
151 
152     // Get the set of special reorder codes in the input list.
153     // This supports up to 32 special reorder codes;
154     // it works for data with codes beyond UCOL_REORDER_CODE_LIMIT.
155     uint32_t specials = 0;
156     for(int32_t i = 0; i < length; ++i) {
157         int32_t reorderCode = reorder[i] - UCOL_REORDER_CODE_FIRST;
158         if(0 <= reorderCode && reorderCode <= 31) {
159             specials |= (uint32_t)1 << reorderCode;
160         }
161     }
162 
163     // Start the reordering with the special low reorder codes that do not occur in the input.
164     for(int32_t i = 0;; i += 3) {
165         if(scripts[i + 1] != 1) { break; }  // Went beyond special single-code reorder codes.
166         int32_t reorderCode = (int32_t)scripts[i + 2] - UCOL_REORDER_CODE_FIRST;
167         if(reorderCode < 0) { break; }  // Went beyond special reorder codes.
168         if((specials & ((uint32_t)1 << reorderCode)) == 0) {
169             int32_t head = scripts[i];
170             int32_t firstByte = head >> 8;
171             int32_t lastByte = head & 0xff;
172             do { table[firstByte++] = lowByte++; } while(firstByte <= lastByte);
173         }
174     }
175 
176     // Reorder according to the input scripts, continuing from the bottom of the bytes range.
177     for(int32_t i = 0; i < length;) {
178         int32_t script = reorder[i++];
179         if(script == USCRIPT_UNKNOWN) {
180             // Put the remaining scripts at the top.
181             while(i < length) {
182                 script = reorder[--length];
183                 if(script == USCRIPT_UNKNOWN ||  // Must occur at most once.
184                         script == UCOL_REORDER_CODE_DEFAULT) {
185                     errorCode = U_ILLEGAL_ARGUMENT_ERROR;
186                     return;
187                 }
188                 int32_t index = findScript(script);
189                 if(index < 0) { continue; }
190                 int32_t head = scripts[index];
191                 int32_t firstByte = head >> 8;
192                 int32_t lastByte = head & 0xff;
193                 if(table[firstByte] != 0) {  // Duplicate or equivalent script.
194                     errorCode = U_ILLEGAL_ARGUMENT_ERROR;
195                     return;
196                 }
197                 do { table[lastByte--] = highByte--; } while(firstByte <= lastByte);
198             }
199             break;
200         }
201         if(script == UCOL_REORDER_CODE_DEFAULT) {
202             // The default code must be the only one in the list, and that is handled by the caller.
203             // Otherwise it must not be used.
204             errorCode = U_ILLEGAL_ARGUMENT_ERROR;
205             return;
206         }
207         int32_t index = findScript(script);
208         if(index < 0) { continue; }
209         int32_t head = scripts[index];
210         int32_t firstByte = head >> 8;
211         int32_t lastByte = head & 0xff;
212         if(table[firstByte] != 0) {  // Duplicate or equivalent script.
213             errorCode = U_ILLEGAL_ARGUMENT_ERROR;
214             return;
215         }
216         do { table[firstByte++] = lowByte++; } while(firstByte <= lastByte);
217     }
218 
219     // Put all remaining scripts into the middle.
220     // Avoid table[0] which must remain 0.
221     for(int32_t i = 1; i <= 0xff; ++i) {
222         if(table[i] == 0) { table[i] = lowByte++; }
223     }
224     U_ASSERT(lowByte == highByte + 1);
225 }
226 
227 U_NAMESPACE_END
228 
229 #endif  // !UCONFIG_NO_COLLATION
230