1 /*
2 *******************************************************************************
3 * Copyright (C) 2010-2014, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 *******************************************************************************
6 * utf16collationiterator.cpp
7 *
8 * created on: 2010oct27
9 * created by: Markus W. Scherer
10 */
11
12 #include "unicode/utypes.h"
13
14 #if !UCONFIG_NO_COLLATION
15
16 #include "charstr.h"
17 #include "cmemory.h"
18 #include "collation.h"
19 #include "collationdata.h"
20 #include "collationfcd.h"
21 #include "collationiterator.h"
22 #include "normalizer2impl.h"
23 #include "uassert.h"
24 #include "utf16collationiterator.h"
25
26 U_NAMESPACE_BEGIN
27
/**
 * Copies the iteration state of `other` onto a different text buffer.
 *
 * pos and limit are re-based so that they keep the same distance from newText
 * that they had from other.start. A NULL limit in `other` stays NULL.
 *
 * @param other   iterator whose state is copied
 * @param newText start of the text that the new iterator reads
 */
UTF16CollationIterator::UTF16CollationIterator(const UTF16CollationIterator &other,
                                               const UChar *newText)
        : CollationIterator(other),
          start(newText),
          pos(newText + (other.pos - other.start)),
          limit(other.limit != NULL ? newText + (other.limit - other.start) : NULL) {
}
35
~UTF16CollationIterator()36 UTF16CollationIterator::~UTF16CollationIterator() {}
37
38 UBool
operator ==(const CollationIterator & other) const39 UTF16CollationIterator::operator==(const CollationIterator &other) const {
40 if(!CollationIterator::operator==(other)) { return FALSE; }
41 const UTF16CollationIterator &o = static_cast<const UTF16CollationIterator &>(other);
42 // Compare the iterator state but not the text: Assume that the caller does that.
43 return (pos - start) == (o.pos - o.start);
44 }
45
46 void
resetToOffset(int32_t newOffset)47 UTF16CollationIterator::resetToOffset(int32_t newOffset) {
48 reset();
49 pos = start + newOffset;
50 }
51
52 int32_t
getOffset() const53 UTF16CollationIterator::getOffset() const {
54 return (int32_t)(pos - start);
55 }
56
57 uint32_t
handleNextCE32(UChar32 & c,UErrorCode &)58 UTF16CollationIterator::handleNextCE32(UChar32 &c, UErrorCode & /*errorCode*/) {
59 if(pos == limit) {
60 c = U_SENTINEL;
61 return Collation::FALLBACK_CE32;
62 }
63 c = *pos++;
64 return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c);
65 }
66
67 UChar
handleGetTrailSurrogate()68 UTF16CollationIterator::handleGetTrailSurrogate() {
69 if(pos == limit) { return 0; }
70 UChar trail;
71 if(U16_IS_TRAIL(trail = *pos)) { ++pos; }
72 return trail;
73 }
74
75 UBool
foundNULTerminator()76 UTF16CollationIterator::foundNULTerminator() {
77 if(limit == NULL) {
78 limit = --pos;
79 return TRUE;
80 } else {
81 return FALSE;
82 }
83 }
84
85 UChar32
nextCodePoint(UErrorCode &)86 UTF16CollationIterator::nextCodePoint(UErrorCode & /*errorCode*/) {
87 if(pos == limit) {
88 return U_SENTINEL;
89 }
90 UChar32 c = *pos;
91 if(c == 0 && limit == NULL) {
92 limit = pos;
93 return U_SENTINEL;
94 }
95 ++pos;
96 UChar trail;
97 if(U16_IS_LEAD(c) && pos != limit && U16_IS_TRAIL(trail = *pos)) {
98 ++pos;
99 return U16_GET_SUPPLEMENTARY(c, trail);
100 } else {
101 return c;
102 }
103 }
104
105 UChar32
previousCodePoint(UErrorCode &)106 UTF16CollationIterator::previousCodePoint(UErrorCode & /*errorCode*/) {
107 if(pos == start) {
108 return U_SENTINEL;
109 }
110 UChar32 c = *--pos;
111 UChar lead;
112 if(U16_IS_TRAIL(c) && pos != start && U16_IS_LEAD(lead = *(pos - 1))) {
113 --pos;
114 return U16_GET_SUPPLEMENTARY(lead, c);
115 } else {
116 return c;
117 }
118 }
119
120 void
forwardNumCodePoints(int32_t num,UErrorCode &)121 UTF16CollationIterator::forwardNumCodePoints(int32_t num, UErrorCode & /*errorCode*/) {
122 while(num > 0 && pos != limit) {
123 UChar32 c = *pos;
124 if(c == 0 && limit == NULL) {
125 limit = pos;
126 break;
127 }
128 ++pos;
129 --num;
130 if(U16_IS_LEAD(c) && pos != limit && U16_IS_TRAIL(*pos)) {
131 ++pos;
132 }
133 }
134 }
135
136 void
backwardNumCodePoints(int32_t num,UErrorCode &)137 UTF16CollationIterator::backwardNumCodePoints(int32_t num, UErrorCode & /*errorCode*/) {
138 while(num > 0 && pos != start) {
139 UChar32 c = *--pos;
140 --num;
141 if(U16_IS_TRAIL(c) && pos != start && U16_IS_LEAD(*(pos-1))) {
142 --pos;
143 }
144 }
145 }
146
147 // FCDUTF16CollationIterator ----------------------------------------------- ***
148
/**
 * Copies the iteration state of `other` onto a different text buffer.
 *
 * The raw and segment pointers are re-based from other.rawStart onto newText;
 * NULL limits stay NULL. The normalized buffer is copied by value.
 *
 * @param other   iterator whose state is copied
 * @param newText start of the text that the new iterator reads
 */
FCDUTF16CollationIterator::FCDUTF16CollationIterator(const FCDUTF16CollationIterator &other,
                                                     const UChar *newText)
        : UTF16CollationIterator(other),
          rawStart(newText),
          segmentStart(newText + (other.segmentStart - other.rawStart)),
          segmentLimit(other.segmentLimit != NULL ?
                       newText + (other.segmentLimit - other.rawStart) : NULL),
          rawLimit(other.rawLimit != NULL ?
                   newText + (other.rawLimit - other.rawStart) : NULL),
          nfcImpl(other.nfcImpl),
          normalized(other.normalized),
          checkDir(other.checkDir) {
    if(checkDir == 0 && other.start != other.segmentStart) {
        // `other` was reading from its normalized buffer:
        // point into our own copy of that buffer at the same relative position.
        start = normalized.getBuffer();
        pos = start + (other.pos - other.start);
        limit = start + normalized.length();
    } else {
        // `other` was reading the input text directly: re-base onto newText.
        start = newText + (other.start - other.rawStart);
        pos = newText + (other.pos - other.rawStart);
        limit = other.limit != NULL ? newText + (other.limit - other.rawStart) : NULL;
    }
}
169
~FCDUTF16CollationIterator()170 FCDUTF16CollationIterator::~FCDUTF16CollationIterator() {}
171
172 UBool
operator ==(const CollationIterator & other) const173 FCDUTF16CollationIterator::operator==(const CollationIterator &other) const {
174 // Skip the UTF16CollationIterator and call its parent.
175 if(!CollationIterator::operator==(other)) { return FALSE; }
176 const FCDUTF16CollationIterator &o = static_cast<const FCDUTF16CollationIterator &>(other);
177 // Compare the iterator state but not the text: Assume that the caller does that.
178 if(checkDir != o.checkDir) { return FALSE; }
179 if(checkDir == 0 && (start == segmentStart) != (o.start == o.segmentStart)) { return FALSE; }
180 if(checkDir != 0 || start == segmentStart) {
181 return (pos - rawStart) == (o.pos - o.rawStart);
182 } else {
183 return (segmentStart - rawStart) == (o.segmentStart - o.rawStart) &&
184 (pos - start) == (o.pos - o.start);
185 }
186 }
187
188 void
resetToOffset(int32_t newOffset)189 FCDUTF16CollationIterator::resetToOffset(int32_t newOffset) {
190 reset();
191 start = segmentStart = pos = rawStart + newOffset;
192 limit = rawLimit;
193 checkDir = 1;
194 }
195
196 int32_t
getOffset() const197 FCDUTF16CollationIterator::getOffset() const {
198 if(checkDir != 0 || start == segmentStart) {
199 return (int32_t)(pos - rawStart);
200 } else if(pos == start) {
201 return (int32_t)(segmentStart - rawStart);
202 } else {
203 return (int32_t)(segmentLimit - rawStart);
204 }
205 }
206
207 uint32_t
handleNextCE32(UChar32 & c,UErrorCode & errorCode)208 FCDUTF16CollationIterator::handleNextCE32(UChar32 &c, UErrorCode &errorCode) {
209 for(;;) {
210 if(checkDir > 0) {
211 if(pos == limit) {
212 c = U_SENTINEL;
213 return Collation::FALLBACK_CE32;
214 }
215 c = *pos++;
216 if(CollationFCD::hasTccc(c)) {
217 if(CollationFCD::maybeTibetanCompositeVowel(c) ||
218 (pos != limit && CollationFCD::hasLccc(*pos))) {
219 --pos;
220 if(!nextSegment(errorCode)) {
221 c = U_SENTINEL;
222 return Collation::FALLBACK_CE32;
223 }
224 c = *pos++;
225 }
226 }
227 break;
228 } else if(checkDir == 0 && pos != limit) {
229 c = *pos++;
230 break;
231 } else {
232 switchToForward();
233 }
234 }
235 return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c);
236 }
237
238 UBool
foundNULTerminator()239 FCDUTF16CollationIterator::foundNULTerminator() {
240 if(limit == NULL) {
241 limit = rawLimit = --pos;
242 return TRUE;
243 } else {
244 return FALSE;
245 }
246 }
247
248 UChar32
nextCodePoint(UErrorCode & errorCode)249 FCDUTF16CollationIterator::nextCodePoint(UErrorCode &errorCode) {
250 UChar32 c;
251 for(;;) {
252 if(checkDir > 0) {
253 if(pos == limit) {
254 return U_SENTINEL;
255 }
256 c = *pos++;
257 if(CollationFCD::hasTccc(c)) {
258 if(CollationFCD::maybeTibetanCompositeVowel(c) ||
259 (pos != limit && CollationFCD::hasLccc(*pos))) {
260 --pos;
261 if(!nextSegment(errorCode)) {
262 return U_SENTINEL;
263 }
264 c = *pos++;
265 }
266 } else if(c == 0 && limit == NULL) {
267 limit = rawLimit = --pos;
268 return U_SENTINEL;
269 }
270 break;
271 } else if(checkDir == 0 && pos != limit) {
272 c = *pos++;
273 break;
274 } else {
275 switchToForward();
276 }
277 }
278 UChar trail;
279 if(U16_IS_LEAD(c) && pos != limit && U16_IS_TRAIL(trail = *pos)) {
280 ++pos;
281 return U16_GET_SUPPLEMENTARY(c, trail);
282 } else {
283 return c;
284 }
285 }
286
287 UChar32
previousCodePoint(UErrorCode & errorCode)288 FCDUTF16CollationIterator::previousCodePoint(UErrorCode &errorCode) {
289 UChar32 c;
290 for(;;) {
291 if(checkDir < 0) {
292 if(pos == start) {
293 return U_SENTINEL;
294 }
295 c = *--pos;
296 if(CollationFCD::hasLccc(c)) {
297 if(CollationFCD::maybeTibetanCompositeVowel(c) ||
298 (pos != start && CollationFCD::hasTccc(*(pos - 1)))) {
299 ++pos;
300 if(!previousSegment(errorCode)) {
301 return U_SENTINEL;
302 }
303 c = *--pos;
304 }
305 }
306 break;
307 } else if(checkDir == 0 && pos != start) {
308 c = *--pos;
309 break;
310 } else {
311 switchToBackward();
312 }
313 }
314 UChar lead;
315 if(U16_IS_TRAIL(c) && pos != start && U16_IS_LEAD(lead = *(pos - 1))) {
316 --pos;
317 return U16_GET_SUPPLEMENTARY(lead, c);
318 } else {
319 return c;
320 }
321 }
322
323 void
forwardNumCodePoints(int32_t num,UErrorCode & errorCode)324 FCDUTF16CollationIterator::forwardNumCodePoints(int32_t num, UErrorCode &errorCode) {
325 // Specify the class to avoid a virtual-function indirection.
326 // In Java, we would declare this class final.
327 while(num > 0 && FCDUTF16CollationIterator::nextCodePoint(errorCode) >= 0) {
328 --num;
329 }
330 }
331
332 void
backwardNumCodePoints(int32_t num,UErrorCode & errorCode)333 FCDUTF16CollationIterator::backwardNumCodePoints(int32_t num, UErrorCode &errorCode) {
334 // Specify the class to avoid a virtual-function indirection.
335 // In Java, we would declare this class final.
336 while(num > 0 && FCDUTF16CollationIterator::previousCodePoint(errorCode) >= 0) {
337 --num;
338 }
339 }
340
341 void
switchToForward()342 FCDUTF16CollationIterator::switchToForward() {
343 U_ASSERT(checkDir < 0 || (checkDir == 0 && pos == limit));
344 if(checkDir < 0) {
345 // Turn around from backward checking.
346 start = segmentStart = pos;
347 if(pos == segmentLimit) {
348 limit = rawLimit;
349 checkDir = 1; // Check forward.
350 } else { // pos < segmentLimit
351 checkDir = 0; // Stay in FCD segment.
352 }
353 } else {
354 // Reached the end of the FCD segment.
355 if(start == segmentStart) {
356 // The input text segment is FCD, extend it forward.
357 } else {
358 // The input text segment needed to be normalized.
359 // Switch to checking forward from it.
360 pos = start = segmentStart = segmentLimit;
361 // Note: If this segment is at the end of the input text,
362 // then it might help to return FALSE to indicate that, so that
363 // we do not have to re-check and normalize when we turn around and go backwards.
364 // However, that would complicate the call sites for an optimization of an unusual case.
365 }
366 limit = rawLimit;
367 checkDir = 1;
368 }
369 }
370
371 UBool
nextSegment(UErrorCode & errorCode)372 FCDUTF16CollationIterator::nextSegment(UErrorCode &errorCode) {
373 if(U_FAILURE(errorCode)) { return FALSE; }
374 U_ASSERT(checkDir > 0 && pos != limit);
375 // The input text [segmentStart..pos[ passes the FCD check.
376 const UChar *p = pos;
377 uint8_t prevCC = 0;
378 for(;;) {
379 // Fetch the next character's fcd16 value.
380 const UChar *q = p;
381 uint16_t fcd16 = nfcImpl.nextFCD16(p, rawLimit);
382 uint8_t leadCC = (uint8_t)(fcd16 >> 8);
383 if(leadCC == 0 && q != pos) {
384 // FCD boundary before the [q, p[ character.
385 limit = segmentLimit = q;
386 break;
387 }
388 if(leadCC != 0 && (prevCC > leadCC || CollationFCD::isFCD16OfTibetanCompositeVowel(fcd16))) {
389 // Fails FCD check. Find the next FCD boundary and normalize.
390 do {
391 q = p;
392 } while(p != rawLimit && nfcImpl.nextFCD16(p, rawLimit) > 0xff);
393 if(!normalize(pos, q, errorCode)) { return FALSE; }
394 pos = start;
395 break;
396 }
397 prevCC = (uint8_t)fcd16;
398 if(p == rawLimit || prevCC == 0) {
399 // FCD boundary after the last character.
400 limit = segmentLimit = p;
401 break;
402 }
403 }
404 U_ASSERT(pos != limit);
405 checkDir = 0;
406 return TRUE;
407 }
408
409 void
switchToBackward()410 FCDUTF16CollationIterator::switchToBackward() {
411 U_ASSERT(checkDir > 0 || (checkDir == 0 && pos == start));
412 if(checkDir > 0) {
413 // Turn around from forward checking.
414 limit = segmentLimit = pos;
415 if(pos == segmentStart) {
416 start = rawStart;
417 checkDir = -1; // Check backward.
418 } else { // pos > segmentStart
419 checkDir = 0; // Stay in FCD segment.
420 }
421 } else {
422 // Reached the start of the FCD segment.
423 if(start == segmentStart) {
424 // The input text segment is FCD, extend it backward.
425 } else {
426 // The input text segment needed to be normalized.
427 // Switch to checking backward from it.
428 pos = limit = segmentLimit = segmentStart;
429 }
430 start = rawStart;
431 checkDir = -1;
432 }
433 }
434
435 UBool
previousSegment(UErrorCode & errorCode)436 FCDUTF16CollationIterator::previousSegment(UErrorCode &errorCode) {
437 if(U_FAILURE(errorCode)) { return FALSE; }
438 U_ASSERT(checkDir < 0 && pos != start);
439 // The input text [pos..segmentLimit[ passes the FCD check.
440 const UChar *p = pos;
441 uint8_t nextCC = 0;
442 for(;;) {
443 // Fetch the previous character's fcd16 value.
444 const UChar *q = p;
445 uint16_t fcd16 = nfcImpl.previousFCD16(rawStart, p);
446 uint8_t trailCC = (uint8_t)fcd16;
447 if(trailCC == 0 && q != pos) {
448 // FCD boundary after the [p, q[ character.
449 start = segmentStart = q;
450 break;
451 }
452 if(trailCC != 0 && ((nextCC != 0 && trailCC > nextCC) ||
453 CollationFCD::isFCD16OfTibetanCompositeVowel(fcd16))) {
454 // Fails FCD check. Find the previous FCD boundary and normalize.
455 do {
456 q = p;
457 } while(fcd16 > 0xff && p != rawStart &&
458 (fcd16 = nfcImpl.previousFCD16(rawStart, p)) != 0);
459 if(!normalize(q, pos, errorCode)) { return FALSE; }
460 pos = limit;
461 break;
462 }
463 nextCC = (uint8_t)(fcd16 >> 8);
464 if(p == rawStart || nextCC == 0) {
465 // FCD boundary before the following character.
466 start = segmentStart = p;
467 break;
468 }
469 }
470 U_ASSERT(pos != start);
471 checkDir = 0;
472 return TRUE;
473 }
474
475 UBool
normalize(const UChar * from,const UChar * to,UErrorCode & errorCode)476 FCDUTF16CollationIterator::normalize(const UChar *from, const UChar *to, UErrorCode &errorCode) {
477 // NFD without argument checking.
478 U_ASSERT(U_SUCCESS(errorCode));
479 nfcImpl.decompose(from, to, normalized, (int32_t)(to - from), errorCode);
480 if(U_FAILURE(errorCode)) { return FALSE; }
481 // Switch collation processing into the FCD buffer
482 // with the result of normalizing [segmentStart, segmentLimit[.
483 segmentStart = from;
484 segmentLimit = to;
485 start = normalized.getBuffer();
486 limit = start + normalized.length();
487 return TRUE;
488 }
489
490 U_NAMESPACE_END
491
492 #endif // !UCONFIG_NO_COLLATION
493