1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 * Copyright (C) 2012-2014, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
8 * uitercollationiterator.cpp
9 *
10 * created on: 2012sep23 (from utf16collationiterator.cpp)
11 * created by: Markus W. Scherer
12 */
13
14 #include "unicode/utypes.h"
15
16 #if !UCONFIG_NO_COLLATION
17
18 #include "unicode/uiter.h"
19 #include "charstr.h"
20 #include "cmemory.h"
21 #include "collation.h"
22 #include "collationdata.h"
23 #include "collationfcd.h"
24 #include "collationiterator.h"
25 #include "normalizer2impl.h"
26 #include "uassert.h"
27 #include "uitercollationiterator.h"
28
29 U_NAMESPACE_BEGIN
30
~UIterCollationIterator()31 UIterCollationIterator::~UIterCollationIterator() {}
32
33 void
resetToOffset(int32_t newOffset)34 UIterCollationIterator::resetToOffset(int32_t newOffset) {
35 reset();
36 iter.move(&iter, newOffset, UITER_START);
37 }
38
39 int32_t
getOffset() const40 UIterCollationIterator::getOffset() const {
41 return iter.getIndex(&iter, UITER_CURRENT);
42 }
43
44 uint32_t
handleNextCE32(UChar32 & c,UErrorCode &)45 UIterCollationIterator::handleNextCE32(UChar32 &c, UErrorCode & /*errorCode*/) {
46 c = iter.next(&iter);
47 if(c < 0) {
48 return Collation::FALLBACK_CE32;
49 }
50 return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c);
51 }
52
53 UChar
handleGetTrailSurrogate()54 UIterCollationIterator::handleGetTrailSurrogate() {
55 UChar32 trail = iter.next(&iter);
56 if(!U16_IS_TRAIL(trail) && trail >= 0) { iter.previous(&iter); }
57 return (UChar)trail;
58 }
59
60 UChar32
nextCodePoint(UErrorCode &)61 UIterCollationIterator::nextCodePoint(UErrorCode & /*errorCode*/) {
62 return uiter_next32(&iter);
63 }
64
65 UChar32
previousCodePoint(UErrorCode &)66 UIterCollationIterator::previousCodePoint(UErrorCode & /*errorCode*/) {
67 return uiter_previous32(&iter);
68 }
69
70 void
forwardNumCodePoints(int32_t num,UErrorCode &)71 UIterCollationIterator::forwardNumCodePoints(int32_t num, UErrorCode & /*errorCode*/) {
72 while(num > 0 && (uiter_next32(&iter)) >= 0) {
73 --num;
74 }
75 }
76
77 void
backwardNumCodePoints(int32_t num,UErrorCode &)78 UIterCollationIterator::backwardNumCodePoints(int32_t num, UErrorCode & /*errorCode*/) {
79 while(num > 0 && (uiter_previous32(&iter)) >= 0) {
80 --num;
81 }
82 }
83
84 // FCDUIterCollationIterator ----------------------------------------------- ***
85
~FCDUIterCollationIterator()86 FCDUIterCollationIterator::~FCDUIterCollationIterator() {}
87
88 void
resetToOffset(int32_t newOffset)89 FCDUIterCollationIterator::resetToOffset(int32_t newOffset) {
90 UIterCollationIterator::resetToOffset(newOffset);
91 start = newOffset;
92 state = ITER_CHECK_FWD;
93 }
94
95 int32_t
getOffset() const96 FCDUIterCollationIterator::getOffset() const {
97 if(state <= ITER_CHECK_BWD) {
98 return iter.getIndex(&iter, UITER_CURRENT);
99 } else if(state == ITER_IN_FCD_SEGMENT) {
100 return pos;
101 } else if(pos == 0) {
102 return start;
103 } else {
104 return limit;
105 }
106 }
107
108 uint32_t
handleNextCE32(UChar32 & c,UErrorCode & errorCode)109 FCDUIterCollationIterator::handleNextCE32(UChar32 &c, UErrorCode &errorCode) {
110 for(;;) {
111 if(state == ITER_CHECK_FWD) {
112 c = iter.next(&iter);
113 if(c < 0) {
114 return Collation::FALLBACK_CE32;
115 }
116 if(CollationFCD::hasTccc(c)) {
117 if(CollationFCD::maybeTibetanCompositeVowel(c) ||
118 CollationFCD::hasLccc(iter.current(&iter))) {
119 iter.previous(&iter);
120 if(!nextSegment(errorCode)) {
121 c = U_SENTINEL;
122 return Collation::FALLBACK_CE32;
123 }
124 continue;
125 }
126 }
127 break;
128 } else if(state == ITER_IN_FCD_SEGMENT && pos != limit) {
129 c = iter.next(&iter);
130 ++pos;
131 U_ASSERT(c >= 0);
132 break;
133 } else if(state >= IN_NORM_ITER_AT_LIMIT && pos != normalized.length()) {
134 c = normalized[pos++];
135 break;
136 } else {
137 switchToForward();
138 }
139 }
140 return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c);
141 }
142
143 UChar
handleGetTrailSurrogate()144 FCDUIterCollationIterator::handleGetTrailSurrogate() {
145 if(state <= ITER_IN_FCD_SEGMENT) {
146 UChar32 trail = iter.next(&iter);
147 if(U16_IS_TRAIL(trail)) {
148 if(state == ITER_IN_FCD_SEGMENT) { ++pos; }
149 } else if(trail >= 0) {
150 iter.previous(&iter);
151 }
152 return (UChar)trail;
153 } else {
154 U_ASSERT(pos < normalized.length());
155 UChar trail;
156 if(U16_IS_TRAIL(trail = normalized[pos])) { ++pos; }
157 return trail;
158 }
159 }
160
161 UChar32
nextCodePoint(UErrorCode & errorCode)162 FCDUIterCollationIterator::nextCodePoint(UErrorCode &errorCode) {
163 UChar32 c;
164 for(;;) {
165 if(state == ITER_CHECK_FWD) {
166 c = iter.next(&iter);
167 if(c < 0) {
168 return c;
169 }
170 if(CollationFCD::hasTccc(c)) {
171 if(CollationFCD::maybeTibetanCompositeVowel(c) ||
172 CollationFCD::hasLccc(iter.current(&iter))) {
173 iter.previous(&iter);
174 if(!nextSegment(errorCode)) {
175 return U_SENTINEL;
176 }
177 continue;
178 }
179 }
180 if(U16_IS_LEAD(c)) {
181 UChar32 trail = iter.next(&iter);
182 if(U16_IS_TRAIL(trail)) {
183 return U16_GET_SUPPLEMENTARY(c, trail);
184 } else if(trail >= 0) {
185 iter.previous(&iter);
186 }
187 }
188 return c;
189 } else if(state == ITER_IN_FCD_SEGMENT && pos != limit) {
190 c = uiter_next32(&iter);
191 pos += U16_LENGTH(c);
192 U_ASSERT(c >= 0);
193 return c;
194 } else if(state >= IN_NORM_ITER_AT_LIMIT && pos != normalized.length()) {
195 c = normalized.char32At(pos);
196 pos += U16_LENGTH(c);
197 return c;
198 } else {
199 switchToForward();
200 }
201 }
202 }
203
204 UChar32
previousCodePoint(UErrorCode & errorCode)205 FCDUIterCollationIterator::previousCodePoint(UErrorCode &errorCode) {
206 UChar32 c;
207 for(;;) {
208 if(state == ITER_CHECK_BWD) {
209 c = iter.previous(&iter);
210 if(c < 0) {
211 start = pos = 0;
212 state = ITER_IN_FCD_SEGMENT;
213 return U_SENTINEL;
214 }
215 if(CollationFCD::hasLccc(c)) {
216 UChar32 prev = U_SENTINEL;
217 if(CollationFCD::maybeTibetanCompositeVowel(c) ||
218 CollationFCD::hasTccc(prev = iter.previous(&iter))) {
219 iter.next(&iter);
220 if(prev >= 0) {
221 iter.next(&iter);
222 }
223 if(!previousSegment(errorCode)) {
224 return U_SENTINEL;
225 }
226 continue;
227 }
228 // hasLccc(trail)=true for all trail surrogates
229 if(U16_IS_TRAIL(c)) {
230 if(prev < 0) {
231 prev = iter.previous(&iter);
232 }
233 if(U16_IS_LEAD(prev)) {
234 return U16_GET_SUPPLEMENTARY(prev, c);
235 }
236 }
237 if(prev >= 0) {
238 iter.next(&iter);
239 }
240 }
241 return c;
242 } else if(state == ITER_IN_FCD_SEGMENT && pos != start) {
243 c = uiter_previous32(&iter);
244 pos -= U16_LENGTH(c);
245 U_ASSERT(c >= 0);
246 return c;
247 } else if(state >= IN_NORM_ITER_AT_LIMIT && pos != 0) {
248 c = normalized.char32At(pos - 1);
249 pos -= U16_LENGTH(c);
250 return c;
251 } else {
252 switchToBackward();
253 }
254 }
255 }
256
257 void
forwardNumCodePoints(int32_t num,UErrorCode & errorCode)258 FCDUIterCollationIterator::forwardNumCodePoints(int32_t num, UErrorCode &errorCode) {
259 // Specify the class to avoid a virtual-function indirection.
260 // In Java, we would declare this class final.
261 while(num > 0 && FCDUIterCollationIterator::nextCodePoint(errorCode) >= 0) {
262 --num;
263 }
264 }
265
266 void
backwardNumCodePoints(int32_t num,UErrorCode & errorCode)267 FCDUIterCollationIterator::backwardNumCodePoints(int32_t num, UErrorCode &errorCode) {
268 // Specify the class to avoid a virtual-function indirection.
269 // In Java, we would declare this class final.
270 while(num > 0 && FCDUIterCollationIterator::previousCodePoint(errorCode) >= 0) {
271 --num;
272 }
273 }
274
275 void
switchToForward()276 FCDUIterCollationIterator::switchToForward() {
277 U_ASSERT(state == ITER_CHECK_BWD ||
278 (state == ITER_IN_FCD_SEGMENT && pos == limit) ||
279 (state >= IN_NORM_ITER_AT_LIMIT && pos == normalized.length()));
280 if(state == ITER_CHECK_BWD) {
281 // Turn around from backward checking.
282 start = pos = iter.getIndex(&iter, UITER_CURRENT);
283 if(pos == limit) {
284 state = ITER_CHECK_FWD; // Check forward.
285 } else { // pos < limit
286 state = ITER_IN_FCD_SEGMENT; // Stay in FCD segment.
287 }
288 } else {
289 // Reached the end of the FCD segment.
290 if(state == ITER_IN_FCD_SEGMENT) {
291 // The input text segment is FCD, extend it forward.
292 } else {
293 // The input text segment needed to be normalized.
294 // Switch to checking forward from it.
295 if(state == IN_NORM_ITER_AT_START) {
296 iter.move(&iter, limit - start, UITER_CURRENT);
297 }
298 start = limit;
299 }
300 state = ITER_CHECK_FWD;
301 }
302 }
303
304 UBool
nextSegment(UErrorCode & errorCode)305 FCDUIterCollationIterator::nextSegment(UErrorCode &errorCode) {
306 if(U_FAILURE(errorCode)) { return FALSE; }
307 U_ASSERT(state == ITER_CHECK_FWD);
308 // The input text [start..(iter index)[ passes the FCD check.
309 pos = iter.getIndex(&iter, UITER_CURRENT);
310 // Collect the characters being checked, in case they need to be normalized.
311 UnicodeString s;
312 uint8_t prevCC = 0;
313 for(;;) {
314 // Fetch the next character and its fcd16 value.
315 UChar32 c = uiter_next32(&iter);
316 if(c < 0) { break; }
317 uint16_t fcd16 = nfcImpl.getFCD16(c);
318 uint8_t leadCC = (uint8_t)(fcd16 >> 8);
319 if(leadCC == 0 && !s.isEmpty()) {
320 // FCD boundary before this character.
321 uiter_previous32(&iter);
322 break;
323 }
324 s.append(c);
325 if(leadCC != 0 && (prevCC > leadCC || CollationFCD::isFCD16OfTibetanCompositeVowel(fcd16))) {
326 // Fails FCD check. Find the next FCD boundary and normalize.
327 for(;;) {
328 c = uiter_next32(&iter);
329 if(c < 0) { break; }
330 if(nfcImpl.getFCD16(c) <= 0xff) {
331 uiter_previous32(&iter);
332 break;
333 }
334 s.append(c);
335 }
336 if(!normalize(s, errorCode)) { return FALSE; }
337 start = pos;
338 limit = pos + s.length();
339 state = IN_NORM_ITER_AT_LIMIT;
340 pos = 0;
341 return TRUE;
342 }
343 prevCC = (uint8_t)fcd16;
344 if(prevCC == 0) {
345 // FCD boundary after the last character.
346 break;
347 }
348 }
349 limit = pos + s.length();
350 U_ASSERT(pos != limit);
351 iter.move(&iter, -s.length(), UITER_CURRENT);
352 state = ITER_IN_FCD_SEGMENT;
353 return TRUE;
354 }
355
356 void
switchToBackward()357 FCDUIterCollationIterator::switchToBackward() {
358 U_ASSERT(state == ITER_CHECK_FWD ||
359 (state == ITER_IN_FCD_SEGMENT && pos == start) ||
360 (state >= IN_NORM_ITER_AT_LIMIT && pos == 0));
361 if(state == ITER_CHECK_FWD) {
362 // Turn around from forward checking.
363 limit = pos = iter.getIndex(&iter, UITER_CURRENT);
364 if(pos == start) {
365 state = ITER_CHECK_BWD; // Check backward.
366 } else { // pos > start
367 state = ITER_IN_FCD_SEGMENT; // Stay in FCD segment.
368 }
369 } else {
370 // Reached the start of the FCD segment.
371 if(state == ITER_IN_FCD_SEGMENT) {
372 // The input text segment is FCD, extend it backward.
373 } else {
374 // The input text segment needed to be normalized.
375 // Switch to checking backward from it.
376 if(state == IN_NORM_ITER_AT_LIMIT) {
377 iter.move(&iter, start - limit, UITER_CURRENT);
378 }
379 limit = start;
380 }
381 state = ITER_CHECK_BWD;
382 }
383 }
384
385 UBool
previousSegment(UErrorCode & errorCode)386 FCDUIterCollationIterator::previousSegment(UErrorCode &errorCode) {
387 if(U_FAILURE(errorCode)) { return FALSE; }
388 U_ASSERT(state == ITER_CHECK_BWD);
389 // The input text [(iter index)..limit[ passes the FCD check.
390 pos = iter.getIndex(&iter, UITER_CURRENT);
391 // Collect the characters being checked, in case they need to be normalized.
392 UnicodeString s;
393 uint8_t nextCC = 0;
394 for(;;) {
395 // Fetch the previous character and its fcd16 value.
396 UChar32 c = uiter_previous32(&iter);
397 if(c < 0) { break; }
398 uint16_t fcd16 = nfcImpl.getFCD16(c);
399 uint8_t trailCC = (uint8_t)fcd16;
400 if(trailCC == 0 && !s.isEmpty()) {
401 // FCD boundary after this character.
402 uiter_next32(&iter);
403 break;
404 }
405 s.append(c);
406 if(trailCC != 0 && ((nextCC != 0 && trailCC > nextCC) ||
407 CollationFCD::isFCD16OfTibetanCompositeVowel(fcd16))) {
408 // Fails FCD check. Find the previous FCD boundary and normalize.
409 while(fcd16 > 0xff) {
410 c = uiter_previous32(&iter);
411 if(c < 0) { break; }
412 fcd16 = nfcImpl.getFCD16(c);
413 if(fcd16 == 0) {
414 (void)uiter_next32(&iter);
415 break;
416 }
417 s.append(c);
418 }
419 s.reverse();
420 if(!normalize(s, errorCode)) { return FALSE; }
421 limit = pos;
422 start = pos - s.length();
423 state = IN_NORM_ITER_AT_START;
424 pos = normalized.length();
425 return TRUE;
426 }
427 nextCC = (uint8_t)(fcd16 >> 8);
428 if(nextCC == 0) {
429 // FCD boundary before the following character.
430 break;
431 }
432 }
433 start = pos - s.length();
434 U_ASSERT(pos != start);
435 iter.move(&iter, s.length(), UITER_CURRENT);
436 state = ITER_IN_FCD_SEGMENT;
437 return TRUE;
438 }
439
440 UBool
normalize(const UnicodeString & s,UErrorCode & errorCode)441 FCDUIterCollationIterator::normalize(const UnicodeString &s, UErrorCode &errorCode) {
442 // NFD without argument checking.
443 U_ASSERT(U_SUCCESS(errorCode));
444 nfcImpl.decompose(s, normalized, errorCode);
445 return U_SUCCESS(errorCode);
446 }
447
448 U_NAMESPACE_END
449
450 #endif // !UCONFIG_NO_COLLATION
451