1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 * Copyright (C) 1996-2015, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
8 * collationcompare.cpp
9 *
10 * created on: 2012feb14 with new and old collation code
11 * created by: Markus W. Scherer
12 */
13
14 #include "unicode/utypes.h"
15
16 #if !UCONFIG_NO_COLLATION
17
18 #include "unicode/ucol.h"
19 #include "cmemory.h"
20 #include "collation.h"
21 #include "collationcompare.h"
22 #include "collationiterator.h"
23 #include "collationsettings.h"
24 #include "uassert.h"
25
26 U_NAMESPACE_BEGIN
27
28 UCollationResult
compareUpToQuaternary(CollationIterator & left,CollationIterator & right,const CollationSettings & settings,UErrorCode & errorCode)29 CollationCompare::compareUpToQuaternary(CollationIterator &left, CollationIterator &right,
30 const CollationSettings &settings,
31 UErrorCode &errorCode) {
32 if(U_FAILURE(errorCode)) { return UCOL_EQUAL; }
33
34 int32_t options = settings.options;
35 uint32_t variableTop;
36 if((options & CollationSettings::ALTERNATE_MASK) == 0) {
37 variableTop = 0;
38 } else {
39 // +1 so that we can use "<" and primary ignorables test out early.
40 variableTop = settings.variableTop + 1;
41 }
42 UBool anyVariable = FALSE;
43
44 // Fetch CEs, compare primaries, store secondary & tertiary weights.
45 for(;;) {
46 // We fetch CEs until we get a non-ignorable primary or reach the end.
47 uint32_t leftPrimary;
48 do {
49 int64_t ce = left.nextCE(errorCode);
50 leftPrimary = (uint32_t)(ce >> 32);
51 if(leftPrimary < variableTop && leftPrimary > Collation::MERGE_SEPARATOR_PRIMARY) {
52 // Variable CE, shift it to quaternary level.
53 // Ignore all following primary ignorables, and shift further variable CEs.
54 anyVariable = TRUE;
55 do {
56 // Store only the primary of the variable CE.
57 left.setCurrentCE(ce & INT64_C(0xffffffff00000000));
58 for(;;) {
59 ce = left.nextCE(errorCode);
60 leftPrimary = (uint32_t)(ce >> 32);
61 if(leftPrimary == 0) {
62 left.setCurrentCE(0);
63 } else {
64 break;
65 }
66 }
67 } while(leftPrimary < variableTop &&
68 leftPrimary > Collation::MERGE_SEPARATOR_PRIMARY);
69 }
70 } while(leftPrimary == 0);
71
72 uint32_t rightPrimary;
73 do {
74 int64_t ce = right.nextCE(errorCode);
75 rightPrimary = (uint32_t)(ce >> 32);
76 if(rightPrimary < variableTop && rightPrimary > Collation::MERGE_SEPARATOR_PRIMARY) {
77 // Variable CE, shift it to quaternary level.
78 // Ignore all following primary ignorables, and shift further variable CEs.
79 anyVariable = TRUE;
80 do {
81 // Store only the primary of the variable CE.
82 right.setCurrentCE(ce & INT64_C(0xffffffff00000000));
83 for(;;) {
84 ce = right.nextCE(errorCode);
85 rightPrimary = (uint32_t)(ce >> 32);
86 if(rightPrimary == 0) {
87 right.setCurrentCE(0);
88 } else {
89 break;
90 }
91 }
92 } while(rightPrimary < variableTop &&
93 rightPrimary > Collation::MERGE_SEPARATOR_PRIMARY);
94 }
95 } while(rightPrimary == 0);
96
97 if(leftPrimary != rightPrimary) {
98 // Return the primary difference, with script reordering.
99 if(settings.hasReordering()) {
100 leftPrimary = settings.reorder(leftPrimary);
101 rightPrimary = settings.reorder(rightPrimary);
102 }
103 return (leftPrimary < rightPrimary) ? UCOL_LESS : UCOL_GREATER;
104 }
105 if(leftPrimary == Collation::NO_CE_PRIMARY) { break; }
106 }
107 if(U_FAILURE(errorCode)) { return UCOL_EQUAL; }
108
109 // Compare the buffered secondary & tertiary weights.
110 // We might skip the secondary level but continue with the case level
111 // which is turned on separately.
112 if(CollationSettings::getStrength(options) >= UCOL_SECONDARY) {
113 if((options & CollationSettings::BACKWARD_SECONDARY) == 0) {
114 int32_t leftIndex = 0;
115 int32_t rightIndex = 0;
116 for(;;) {
117 uint32_t leftSecondary;
118 do {
119 leftSecondary = ((uint32_t)left.getCE(leftIndex++)) >> 16;
120 } while(leftSecondary == 0);
121
122 uint32_t rightSecondary;
123 do {
124 rightSecondary = ((uint32_t)right.getCE(rightIndex++)) >> 16;
125 } while(rightSecondary == 0);
126
127 if(leftSecondary != rightSecondary) {
128 return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_GREATER;
129 }
130 if(leftSecondary == Collation::NO_CE_WEIGHT16) { break; }
131 }
132 } else {
133 // The backwards secondary level compares secondary weights backwards
134 // within segments separated by the merge separator (U+FFFE, weight 02).
135 int32_t leftStart = 0;
136 int32_t rightStart = 0;
137 for(;;) {
138 // Find the merge separator or the NO_CE terminator.
139 uint32_t p;
140 int32_t leftLimit = leftStart;
141 while((p = (uint32_t)(left.getCE(leftLimit) >> 32)) >
142 Collation::MERGE_SEPARATOR_PRIMARY ||
143 p == 0) {
144 ++leftLimit;
145 }
146 int32_t rightLimit = rightStart;
147 while((p = (uint32_t)(right.getCE(rightLimit) >> 32)) >
148 Collation::MERGE_SEPARATOR_PRIMARY ||
149 p == 0) {
150 ++rightLimit;
151 }
152
153 // Compare the segments.
154 int32_t leftIndex = leftLimit;
155 int32_t rightIndex = rightLimit;
156 for(;;) {
157 int32_t leftSecondary = 0;
158 while(leftSecondary == 0 && leftIndex > leftStart) {
159 leftSecondary = ((uint32_t)left.getCE(--leftIndex)) >> 16;
160 }
161
162 int32_t rightSecondary = 0;
163 while(rightSecondary == 0 && rightIndex > rightStart) {
164 rightSecondary = ((uint32_t)right.getCE(--rightIndex)) >> 16;
165 }
166
167 if(leftSecondary != rightSecondary) {
168 return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_GREATER;
169 }
170 if(leftSecondary == 0) { break; }
171 }
172
173 // Did we reach the end of either string?
174 // Both strings have the same number of merge separators,
175 // or else there would have been a primary-level difference.
176 U_ASSERT(left.getCE(leftLimit) == right.getCE(rightLimit));
177 if(p == Collation::NO_CE_PRIMARY) { break; }
178 // Skip both merge separators and continue.
179 leftStart = leftLimit + 1;
180 rightStart = rightLimit + 1;
181 }
182 }
183 }
184
185 if((options & CollationSettings::CASE_LEVEL) != 0) {
186 int32_t strength = CollationSettings::getStrength(options);
187 int32_t leftIndex = 0;
188 int32_t rightIndex = 0;
189 for(;;) {
190 uint32_t leftCase, leftLower32, rightCase;
191 if(strength == UCOL_PRIMARY) {
192 // Primary+caseLevel: Ignore case level weights of primary ignorables.
193 // Otherwise we would get a-umlaut > a
194 // which is not desirable for accent-insensitive sorting.
195 // Check for (lower 32 bits) == 0 as well because variable CEs are stored
196 // with only primary weights.
197 int64_t ce;
198 do {
199 ce = left.getCE(leftIndex++);
200 leftCase = (uint32_t)ce;
201 } while((uint32_t)(ce >> 32) == 0 || leftCase == 0);
202 leftLower32 = leftCase;
203 leftCase &= 0xc000;
204
205 do {
206 ce = right.getCE(rightIndex++);
207 rightCase = (uint32_t)ce;
208 } while((uint32_t)(ce >> 32) == 0 || rightCase == 0);
209 rightCase &= 0xc000;
210 } else {
211 // Secondary+caseLevel: By analogy with the above,
212 // ignore case level weights of secondary ignorables.
213 //
214 // Note: A tertiary CE has uppercase case bits (0.0.ut)
215 // to keep tertiary+caseFirst well-formed.
216 //
217 // Tertiary+caseLevel: Also ignore case level weights of secondary ignorables.
218 // Otherwise a tertiary CE's uppercase would be no greater than
219 // a primary/secondary CE's uppercase.
220 // (See UCA well-formedness condition 2.)
221 // We could construct a special case weight higher than uppercase,
222 // but it's simpler to always ignore case weights of secondary ignorables,
223 // turning 0.0.ut into 0.0.0.t.
224 // (See LDML Collation, Case Parameters.)
225 do {
226 leftCase = (uint32_t)left.getCE(leftIndex++);
227 } while(leftCase <= 0xffff);
228 leftLower32 = leftCase;
229 leftCase &= 0xc000;
230
231 do {
232 rightCase = (uint32_t)right.getCE(rightIndex++);
233 } while(rightCase <= 0xffff);
234 rightCase &= 0xc000;
235 }
236
237 // No need to handle NO_CE and MERGE_SEPARATOR specially:
238 // There is one case weight for each previous-level weight,
239 // so level length differences were handled there.
240 if(leftCase != rightCase) {
241 if((options & CollationSettings::UPPER_FIRST) == 0) {
242 return (leftCase < rightCase) ? UCOL_LESS : UCOL_GREATER;
243 } else {
244 return (leftCase < rightCase) ? UCOL_GREATER : UCOL_LESS;
245 }
246 }
247 if((leftLower32 >> 16) == Collation::NO_CE_WEIGHT16) { break; }
248 }
249 }
250 if(CollationSettings::getStrength(options) <= UCOL_SECONDARY) { return UCOL_EQUAL; }
251
252 uint32_t tertiaryMask = CollationSettings::getTertiaryMask(options);
253
254 int32_t leftIndex = 0;
255 int32_t rightIndex = 0;
256 uint32_t anyQuaternaries = 0;
257 for(;;) {
258 uint32_t leftLower32, leftTertiary;
259 do {
260 leftLower32 = (uint32_t)left.getCE(leftIndex++);
261 anyQuaternaries |= leftLower32;
262 U_ASSERT((leftLower32 & Collation::ONLY_TERTIARY_MASK) != 0 ||
263 (leftLower32 & 0xc0c0) == 0);
264 leftTertiary = leftLower32 & tertiaryMask;
265 } while(leftTertiary == 0);
266
267 uint32_t rightLower32, rightTertiary;
268 do {
269 rightLower32 = (uint32_t)right.getCE(rightIndex++);
270 anyQuaternaries |= rightLower32;
271 U_ASSERT((rightLower32 & Collation::ONLY_TERTIARY_MASK) != 0 ||
272 (rightLower32 & 0xc0c0) == 0);
273 rightTertiary = rightLower32 & tertiaryMask;
274 } while(rightTertiary == 0);
275
276 if(leftTertiary != rightTertiary) {
277 if(CollationSettings::sortsTertiaryUpperCaseFirst(options)) {
278 // Pass through NO_CE and keep real tertiary weights larger than that.
279 // Do not change the artificial uppercase weight of a tertiary CE (0.0.ut),
280 // to keep tertiary CEs well-formed.
281 // Their case+tertiary weights must be greater than those of
282 // primary and secondary CEs.
283 if(leftTertiary > Collation::NO_CE_WEIGHT16) {
284 if(leftLower32 > 0xffff) {
285 leftTertiary ^= 0xc000;
286 } else {
287 leftTertiary += 0x4000;
288 }
289 }
290 if(rightTertiary > Collation::NO_CE_WEIGHT16) {
291 if(rightLower32 > 0xffff) {
292 rightTertiary ^= 0xc000;
293 } else {
294 rightTertiary += 0x4000;
295 }
296 }
297 }
298 return (leftTertiary < rightTertiary) ? UCOL_LESS : UCOL_GREATER;
299 }
300 if(leftTertiary == Collation::NO_CE_WEIGHT16) { break; }
301 }
302 if(CollationSettings::getStrength(options) <= UCOL_TERTIARY) { return UCOL_EQUAL; }
303
304 if(!anyVariable && (anyQuaternaries & 0xc0) == 0) {
305 // If there are no "variable" CEs and no non-zero quaternary weights,
306 // then there are no quaternary differences.
307 return UCOL_EQUAL;
308 }
309
310 leftIndex = 0;
311 rightIndex = 0;
312 for(;;) {
313 uint32_t leftQuaternary;
314 do {
315 int64_t ce = left.getCE(leftIndex++);
316 leftQuaternary = (uint32_t)ce & 0xffff;
317 if(leftQuaternary <= Collation::NO_CE_WEIGHT16) {
318 // Variable primary or completely ignorable or NO_CE.
319 leftQuaternary = (uint32_t)(ce >> 32);
320 } else {
321 // Regular CE, not tertiary ignorable.
322 // Preserve the quaternary weight in bits 7..6.
323 leftQuaternary |= 0xffffff3f;
324 }
325 } while(leftQuaternary == 0);
326
327 uint32_t rightQuaternary;
328 do {
329 int64_t ce = right.getCE(rightIndex++);
330 rightQuaternary = (uint32_t)ce & 0xffff;
331 if(rightQuaternary <= Collation::NO_CE_WEIGHT16) {
332 // Variable primary or completely ignorable or NO_CE.
333 rightQuaternary = (uint32_t)(ce >> 32);
334 } else {
335 // Regular CE, not tertiary ignorable.
336 // Preserve the quaternary weight in bits 7..6.
337 rightQuaternary |= 0xffffff3f;
338 }
339 } while(rightQuaternary == 0);
340
341 if(leftQuaternary != rightQuaternary) {
342 // Return the difference, with script reordering.
343 if(settings.hasReordering()) {
344 leftQuaternary = settings.reorder(leftQuaternary);
345 rightQuaternary = settings.reorder(rightQuaternary);
346 }
347 return (leftQuaternary < rightQuaternary) ? UCOL_LESS : UCOL_GREATER;
348 }
349 if(leftQuaternary == Collation::NO_CE_PRIMARY) { break; }
350 }
351 return UCOL_EQUAL;
352 }
353
354 U_NAMESPACE_END
355
356 #endif // !UCONFIG_NO_COLLATION
357