1 /*
2 *******************************************************************************
3 *
4 * Copyright (C) 2002-2006, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *
7 *******************************************************************************
8 * file name: uiter.cpp
9 * encoding: US-ASCII
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created on: 2002jan18
14 * created by: Markus W. Scherer
15 */
16
17 #include "unicode/utypes.h"
18 #include "unicode/ustring.h"
19 #include "unicode/chariter.h"
20 #include "unicode/rep.h"
21 #include "unicode/uiter.h"
22 #include "cstring.h"
23
24 U_NAMESPACE_USE
25
/* Evaluates to true if the integer n has its lowest bit clear. */
#define IS_EVEN(n) (((n)&1)==0)
/*
 * Evaluates to true if the address p is a multiple of 2.
 * The argument is parenthesized before the cast so that an expression
 * argument such as (ptr+1) is converted as a whole; without the parentheses
 * the cast would bind to the first operand only.
 */
#define IS_POINTER_EVEN(p) IS_EVEN((size_t)(p))
28
29 U_CDECL_BEGIN
30
31 /* No-Op UCharIterator implementation for illegal input --------------------- */
32
33 static int32_t U_CALLCONV
noopGetIndex(UCharIterator *,UCharIteratorOrigin)34 noopGetIndex(UCharIterator * /*iter*/, UCharIteratorOrigin /*origin*/) {
35 return 0;
36 }
37
38 static int32_t U_CALLCONV
noopMove(UCharIterator *,int32_t,UCharIteratorOrigin)39 noopMove(UCharIterator * /*iter*/, int32_t /*delta*/, UCharIteratorOrigin /*origin*/) {
40 return 0;
41 }
42
43 static UBool U_CALLCONV
noopHasNext(UCharIterator *)44 noopHasNext(UCharIterator * /*iter*/) {
45 return FALSE;
46 }
47
48 static UChar32 U_CALLCONV
noopCurrent(UCharIterator *)49 noopCurrent(UCharIterator * /*iter*/) {
50 return U_SENTINEL;
51 }
52
53 static uint32_t U_CALLCONV
noopGetState(const UCharIterator *)54 noopGetState(const UCharIterator * /*iter*/) {
55 return UITER_NO_STATE;
56 }
57
58 static void U_CALLCONV
noopSetState(UCharIterator *,uint32_t,UErrorCode * pErrorCode)59 noopSetState(UCharIterator * /*iter*/, uint32_t /*state*/, UErrorCode *pErrorCode) {
60 *pErrorCode=U_UNSUPPORTED_ERROR;
61 }
62
63 static const UCharIterator noopIterator={
64 0, 0, 0, 0, 0, 0,
65 noopGetIndex,
66 noopMove,
67 noopHasNext,
68 noopHasNext,
69 noopCurrent,
70 noopCurrent,
71 noopCurrent,
72 NULL,
73 noopGetState,
74 noopSetState
75 };
76
77 /* UCharIterator implementation for simple strings -------------------------- */
78
79 /*
80 * This is an implementation of a code unit (UChar) iterator
81 * for UChar * strings.
82 *
83 * The UCharIterator.context field holds a pointer to the string.
84 */
85
86 static int32_t U_CALLCONV
stringIteratorGetIndex(UCharIterator * iter,UCharIteratorOrigin origin)87 stringIteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin) {
88 switch(origin) {
89 case UITER_ZERO:
90 return 0;
91 case UITER_START:
92 return iter->start;
93 case UITER_CURRENT:
94 return iter->index;
95 case UITER_LIMIT:
96 return iter->limit;
97 case UITER_LENGTH:
98 return iter->length;
99 default:
100 /* not a valid origin */
101 /* Should never get here! */
102 return -1;
103 }
104 }
105
106 static int32_t U_CALLCONV
stringIteratorMove(UCharIterator * iter,int32_t delta,UCharIteratorOrigin origin)107 stringIteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin) {
108 int32_t pos;
109
110 switch(origin) {
111 case UITER_ZERO:
112 pos=delta;
113 break;
114 case UITER_START:
115 pos=iter->start+delta;
116 break;
117 case UITER_CURRENT:
118 pos=iter->index+delta;
119 break;
120 case UITER_LIMIT:
121 pos=iter->limit+delta;
122 break;
123 case UITER_LENGTH:
124 pos=iter->length+delta;
125 break;
126 default:
127 return -1; /* Error */
128 }
129
130 if(pos<iter->start) {
131 pos=iter->start;
132 } else if(pos>iter->limit) {
133 pos=iter->limit;
134 }
135
136 return iter->index=pos;
137 }
138
139 static UBool U_CALLCONV
stringIteratorHasNext(UCharIterator * iter)140 stringIteratorHasNext(UCharIterator *iter) {
141 return iter->index<iter->limit;
142 }
143
144 static UBool U_CALLCONV
stringIteratorHasPrevious(UCharIterator * iter)145 stringIteratorHasPrevious(UCharIterator *iter) {
146 return iter->index>iter->start;
147 }
148
149 static UChar32 U_CALLCONV
stringIteratorCurrent(UCharIterator * iter)150 stringIteratorCurrent(UCharIterator *iter) {
151 if(iter->index<iter->limit) {
152 return ((const UChar *)(iter->context))[iter->index];
153 } else {
154 return U_SENTINEL;
155 }
156 }
157
158 static UChar32 U_CALLCONV
stringIteratorNext(UCharIterator * iter)159 stringIteratorNext(UCharIterator *iter) {
160 if(iter->index<iter->limit) {
161 return ((const UChar *)(iter->context))[iter->index++];
162 } else {
163 return U_SENTINEL;
164 }
165 }
166
167 static UChar32 U_CALLCONV
stringIteratorPrevious(UCharIterator * iter)168 stringIteratorPrevious(UCharIterator *iter) {
169 if(iter->index>iter->start) {
170 return ((const UChar *)(iter->context))[--iter->index];
171 } else {
172 return U_SENTINEL;
173 }
174 }
175
176 static uint32_t U_CALLCONV
stringIteratorGetState(const UCharIterator * iter)177 stringIteratorGetState(const UCharIterator *iter) {
178 return (uint32_t)iter->index;
179 }
180
181 static void U_CALLCONV
stringIteratorSetState(UCharIterator * iter,uint32_t state,UErrorCode * pErrorCode)182 stringIteratorSetState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode) {
183 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
184 /* do nothing */
185 } else if(iter==NULL) {
186 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
187 } else if((int32_t)state<iter->start || iter->limit<(int32_t)state) {
188 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
189 } else {
190 iter->index=(int32_t)state;
191 }
192 }
193
194 static const UCharIterator stringIterator={
195 0, 0, 0, 0, 0, 0,
196 stringIteratorGetIndex,
197 stringIteratorMove,
198 stringIteratorHasNext,
199 stringIteratorHasPrevious,
200 stringIteratorCurrent,
201 stringIteratorNext,
202 stringIteratorPrevious,
203 NULL,
204 stringIteratorGetState,
205 stringIteratorSetState
206 };
207
208 U_CAPI void U_EXPORT2
uiter_setString(UCharIterator * iter,const UChar * s,int32_t length)209 uiter_setString(UCharIterator *iter, const UChar *s, int32_t length) {
210 if(iter!=0) {
211 if(s!=0 && length>=-1) {
212 *iter=stringIterator;
213 iter->context=s;
214 if(length>=0) {
215 iter->length=length;
216 } else {
217 iter->length=u_strlen(s);
218 }
219 iter->limit=iter->length;
220 } else {
221 *iter=noopIterator;
222 }
223 }
224 }
225
226 /* UCharIterator implementation for UTF-16BE strings ------------------------ */
227
228 /*
229 * This is an implementation of a code unit (UChar) iterator
230 * for UTF-16BE strings, i.e., strings in byte-vectors where
231 * each UChar is stored as a big-endian pair of bytes.
232 *
233 * The UCharIterator.context field holds a pointer to the string.
234 * Everything works just like with a normal UChar iterator (uiter_setString),
235 * except that UChars are assembled from byte pairs.
236 */
237
238 /* internal helper function */
239 static inline UChar32
utf16BEIteratorGet(UCharIterator * iter,int32_t index)240 utf16BEIteratorGet(UCharIterator *iter, int32_t index) {
241 const uint8_t *p=(const uint8_t *)iter->context;
242 return ((UChar)p[2*index]<<8)|(UChar)p[2*index+1];
243 }
244
245 static UChar32 U_CALLCONV
utf16BEIteratorCurrent(UCharIterator * iter)246 utf16BEIteratorCurrent(UCharIterator *iter) {
247 int32_t index;
248
249 if((index=iter->index)<iter->limit) {
250 return utf16BEIteratorGet(iter, index);
251 } else {
252 return U_SENTINEL;
253 }
254 }
255
256 static UChar32 U_CALLCONV
utf16BEIteratorNext(UCharIterator * iter)257 utf16BEIteratorNext(UCharIterator *iter) {
258 int32_t index;
259
260 if((index=iter->index)<iter->limit) {
261 iter->index=index+1;
262 return utf16BEIteratorGet(iter, index);
263 } else {
264 return U_SENTINEL;
265 }
266 }
267
268 static UChar32 U_CALLCONV
utf16BEIteratorPrevious(UCharIterator * iter)269 utf16BEIteratorPrevious(UCharIterator *iter) {
270 int32_t index;
271
272 if((index=iter->index)>iter->start) {
273 iter->index=--index;
274 return utf16BEIteratorGet(iter, index);
275 } else {
276 return U_SENTINEL;
277 }
278 }
279
280 static const UCharIterator utf16BEIterator={
281 0, 0, 0, 0, 0, 0,
282 stringIteratorGetIndex,
283 stringIteratorMove,
284 stringIteratorHasNext,
285 stringIteratorHasPrevious,
286 utf16BEIteratorCurrent,
287 utf16BEIteratorNext,
288 utf16BEIteratorPrevious,
289 NULL,
290 stringIteratorGetState,
291 stringIteratorSetState
292 };
293
294 /*
295 * Count the number of UChars in a UTF-16BE string before a terminating UChar NUL,
296 * i.e., before a pair of 0 bytes where the first 0 byte is at an even
297 * offset from s.
298 */
299 static int32_t
utf16BE_strlen(const char * s)300 utf16BE_strlen(const char *s) {
301 if(IS_POINTER_EVEN(s)) {
302 /*
303 * even-aligned, call u_strlen(s)
304 * we are probably on a little-endian machine, but searching for UChar NUL
305 * does not care about endianness
306 */
307 return u_strlen((const UChar *)s);
308 } else {
309 /* odd-aligned, search for pair of 0 bytes */
310 const char *p=s;
311
312 while(!(*p==0 && p[1]==0)) {
313 p+=2;
314 }
315 return (int32_t)((p-s)/2);
316 }
317 }
318
319 U_CAPI void U_EXPORT2
uiter_setUTF16BE(UCharIterator * iter,const char * s,int32_t length)320 uiter_setUTF16BE(UCharIterator *iter, const char *s, int32_t length) {
321 if(iter!=NULL) {
322 /* allow only even-length strings (the input length counts bytes) */
323 if(s!=NULL && (length==-1 || (length>=0 && IS_EVEN(length)))) {
324 /* length/=2, except that >>=1 also works for -1 (-1/2==0, -1>>1==-1) */
325 length>>=1;
326
327 if(U_IS_BIG_ENDIAN && IS_POINTER_EVEN(s)) {
328 /* big-endian machine and 2-aligned UTF-16BE string: use normal UChar iterator */
329 uiter_setString(iter, (const UChar *)s, length);
330 return;
331 }
332
333 *iter=utf16BEIterator;
334 iter->context=s;
335 if(length>=0) {
336 iter->length=length;
337 } else {
338 iter->length=utf16BE_strlen(s);
339 }
340 iter->limit=iter->length;
341 } else {
342 *iter=noopIterator;
343 }
344 }
345 }
346
347 /* UCharIterator wrapper around CharacterIterator --------------------------- */
348
349 /*
350 * This is wrapper code around a C++ CharacterIterator to
351 * look like a C UCharIterator.
352 *
353 * The UCharIterator.context field holds a pointer to the CharacterIterator.
354 */
355
356 static int32_t U_CALLCONV
characterIteratorGetIndex(UCharIterator * iter,UCharIteratorOrigin origin)357 characterIteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin) {
358 switch(origin) {
359 case UITER_ZERO:
360 return 0;
361 case UITER_START:
362 return ((CharacterIterator *)(iter->context))->startIndex();
363 case UITER_CURRENT:
364 return ((CharacterIterator *)(iter->context))->getIndex();
365 case UITER_LIMIT:
366 return ((CharacterIterator *)(iter->context))->endIndex();
367 case UITER_LENGTH:
368 return ((CharacterIterator *)(iter->context))->getLength();
369 default:
370 /* not a valid origin */
371 /* Should never get here! */
372 return -1;
373 }
374 }
375
376 static int32_t U_CALLCONV
characterIteratorMove(UCharIterator * iter,int32_t delta,UCharIteratorOrigin origin)377 characterIteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin) {
378 switch(origin) {
379 case UITER_ZERO:
380 ((CharacterIterator *)(iter->context))->setIndex(delta);
381 return ((CharacterIterator *)(iter->context))->getIndex();
382 case UITER_START:
383 case UITER_CURRENT:
384 case UITER_LIMIT:
385 return ((CharacterIterator *)(iter->context))->move(delta, (CharacterIterator::EOrigin)origin);
386 case UITER_LENGTH:
387 ((CharacterIterator *)(iter->context))->setIndex(((CharacterIterator *)(iter->context))->getLength()+delta);
388 return ((CharacterIterator *)(iter->context))->getIndex();
389 default:
390 /* not a valid origin */
391 /* Should never get here! */
392 return -1;
393 }
394 }
395
396 static UBool U_CALLCONV
characterIteratorHasNext(UCharIterator * iter)397 characterIteratorHasNext(UCharIterator *iter) {
398 return ((CharacterIterator *)(iter->context))->hasNext();
399 }
400
401 static UBool U_CALLCONV
characterIteratorHasPrevious(UCharIterator * iter)402 characterIteratorHasPrevious(UCharIterator *iter) {
403 return ((CharacterIterator *)(iter->context))->hasPrevious();
404 }
405
406 static UChar32 U_CALLCONV
characterIteratorCurrent(UCharIterator * iter)407 characterIteratorCurrent(UCharIterator *iter) {
408 UChar32 c;
409
410 c=((CharacterIterator *)(iter->context))->current();
411 if(c!=0xffff || ((CharacterIterator *)(iter->context))->hasNext()) {
412 return c;
413 } else {
414 return U_SENTINEL;
415 }
416 }
417
418 static UChar32 U_CALLCONV
characterIteratorNext(UCharIterator * iter)419 characterIteratorNext(UCharIterator *iter) {
420 if(((CharacterIterator *)(iter->context))->hasNext()) {
421 return ((CharacterIterator *)(iter->context))->nextPostInc();
422 } else {
423 return U_SENTINEL;
424 }
425 }
426
427 static UChar32 U_CALLCONV
characterIteratorPrevious(UCharIterator * iter)428 characterIteratorPrevious(UCharIterator *iter) {
429 if(((CharacterIterator *)(iter->context))->hasPrevious()) {
430 return ((CharacterIterator *)(iter->context))->previous();
431 } else {
432 return U_SENTINEL;
433 }
434 }
435
436 static uint32_t U_CALLCONV
characterIteratorGetState(const UCharIterator * iter)437 characterIteratorGetState(const UCharIterator *iter) {
438 return ((CharacterIterator *)(iter->context))->getIndex();
439 }
440
441 static void U_CALLCONV
characterIteratorSetState(UCharIterator * iter,uint32_t state,UErrorCode * pErrorCode)442 characterIteratorSetState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode) {
443 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
444 /* do nothing */
445 } else if(iter==NULL || iter->context==NULL) {
446 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
447 } else if((int32_t)state<((CharacterIterator *)(iter->context))->startIndex() || ((CharacterIterator *)(iter->context))->endIndex()<(int32_t)state) {
448 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
449 } else {
450 ((CharacterIterator *)(iter->context))->setIndex((int32_t)state);
451 }
452 }
453
454 static const UCharIterator characterIteratorWrapper={
455 0, 0, 0, 0, 0, 0,
456 characterIteratorGetIndex,
457 characterIteratorMove,
458 characterIteratorHasNext,
459 characterIteratorHasPrevious,
460 characterIteratorCurrent,
461 characterIteratorNext,
462 characterIteratorPrevious,
463 NULL,
464 characterIteratorGetState,
465 characterIteratorSetState
466 };
467
468 U_CAPI void U_EXPORT2
uiter_setCharacterIterator(UCharIterator * iter,CharacterIterator * charIter)469 uiter_setCharacterIterator(UCharIterator *iter, CharacterIterator *charIter) {
470 if(iter!=0) {
471 if(charIter!=0) {
472 *iter=characterIteratorWrapper;
473 iter->context=charIter;
474 } else {
475 *iter=noopIterator;
476 }
477 }
478 }
479
480 /* UCharIterator wrapper around Replaceable --------------------------------- */
481
482 /*
483 * This is an implementation of a code unit (UChar) iterator
484 * based on a Replaceable object.
485 *
486 * The UCharIterator.context field holds a pointer to the Replaceable.
487 * UCharIterator.length and UCharIterator.index hold Replaceable.length()
488 * and the iteration index.
489 */
490
491 static UChar32 U_CALLCONV
replaceableIteratorCurrent(UCharIterator * iter)492 replaceableIteratorCurrent(UCharIterator *iter) {
493 if(iter->index<iter->limit) {
494 return ((Replaceable *)(iter->context))->charAt(iter->index);
495 } else {
496 return U_SENTINEL;
497 }
498 }
499
500 static UChar32 U_CALLCONV
replaceableIteratorNext(UCharIterator * iter)501 replaceableIteratorNext(UCharIterator *iter) {
502 if(iter->index<iter->limit) {
503 return ((Replaceable *)(iter->context))->charAt(iter->index++);
504 } else {
505 return U_SENTINEL;
506 }
507 }
508
509 static UChar32 U_CALLCONV
replaceableIteratorPrevious(UCharIterator * iter)510 replaceableIteratorPrevious(UCharIterator *iter) {
511 if(iter->index>iter->start) {
512 return ((Replaceable *)(iter->context))->charAt(--iter->index);
513 } else {
514 return U_SENTINEL;
515 }
516 }
517
518 static const UCharIterator replaceableIterator={
519 0, 0, 0, 0, 0, 0,
520 stringIteratorGetIndex,
521 stringIteratorMove,
522 stringIteratorHasNext,
523 stringIteratorHasPrevious,
524 replaceableIteratorCurrent,
525 replaceableIteratorNext,
526 replaceableIteratorPrevious,
527 NULL,
528 stringIteratorGetState,
529 stringIteratorSetState
530 };
531
532 U_CAPI void U_EXPORT2
uiter_setReplaceable(UCharIterator * iter,const Replaceable * rep)533 uiter_setReplaceable(UCharIterator *iter, const Replaceable *rep) {
534 if(iter!=0) {
535 if(rep!=0) {
536 *iter=replaceableIterator;
537 iter->context=rep;
538 iter->limit=iter->length=rep->length();
539 } else {
540 *iter=noopIterator;
541 }
542 }
543 }
544
545 /* UCharIterator implementation for UTF-8 strings --------------------------- */
546
/*
 * Possible alternative, probably necessary only for an implementation that
 * supports arbitrary converters:
 * Maintain a buffer (ring buffer?) for a piece of converted 16-bit text.
 * This would require turning reservedFn into a close function and
 * introducing a uiter_close(iter).
 */
554
/*
 * Capacity constant for the converter-buffer idea sketched in the comment above.
 * NOTE(review): no code in the visible part of this file references it - confirm before removing.
 */
#define UITER_CNV_CAPACITY 16
556
557 /*
558 * Minimal implementation:
559 * Maintain a single-UChar buffer for an additional surrogate.
560 * The caller must not modify start and limit because they are used internally.
561 *
562 * Use UCharIterator fields as follows:
563 * context pointer to UTF-8 string
564 * length UTF-16 length of the string; -1 until lazy evaluation
565 * start current UTF-8 index
566 * index current UTF-16 index; may be -1="unknown" after setState()
567 * limit UTF-8 length of the string
568 * reservedField supplementary code point
569 *
570 * Since UCharIterator delivers 16-bit code units, the iteration can be
571 * currently in the middle of the byte sequence for a supplementary code point.
572 * In this case, reservedField will contain that code point and start will
573 * point to after the corresponding byte sequence. The UTF-16 index will be
574 * one less than what it would otherwise be corresponding to the UTF-8 index.
575 * Otherwise, reservedField will be 0.
576 */
577
578 /*
579 * Possible optimization for NUL-terminated UTF-8 and UTF-16 strings:
580 * Add implementations that do not call strlen() for iteration but check for NUL.
581 */
582
583 static int32_t U_CALLCONV
utf8IteratorGetIndex(UCharIterator * iter,UCharIteratorOrigin origin)584 utf8IteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin) {
585 switch(origin) {
586 case UITER_ZERO:
587 case UITER_START:
588 return 0;
589 case UITER_CURRENT:
590 if(iter->index<0) {
591 /* the current UTF-16 index is unknown after setState(), count from the beginning */
592 const uint8_t *s;
593 UChar32 c;
594 int32_t i, limit, index;
595
596 s=(const uint8_t *)iter->context;
597 i=index=0;
598 limit=iter->start; /* count up to the UTF-8 index */
599 while(i<limit) {
600 U8_NEXT(s, i, limit, c);
601 if(c<=0xffff) {
602 ++index;
603 } else {
604 index+=2;
605 }
606 }
607
608 iter->start=i; /* just in case setState() did not get us to a code point boundary */
609 if(i==iter->limit) {
610 iter->length=index; /* in case it was <0 or wrong */
611 }
612 if(iter->reservedField!=0) {
613 --index; /* we are in the middle of a supplementary code point */
614 }
615 iter->index=index;
616 }
617 return iter->index;
618 case UITER_LIMIT:
619 case UITER_LENGTH:
620 if(iter->length<0) {
621 const uint8_t *s;
622 UChar32 c;
623 int32_t i, limit, length;
624
625 s=(const uint8_t *)iter->context;
626 if(iter->index<0) {
627 /*
628 * the current UTF-16 index is unknown after setState(),
629 * we must first count from the beginning to here
630 */
631 i=length=0;
632 limit=iter->start;
633
634 /* count from the beginning to the current index */
635 while(i<limit) {
636 U8_NEXT(s, i, limit, c);
637 if(c<=0xffff) {
638 ++length;
639 } else {
640 length+=2;
641 }
642 }
643
644 /* assume i==limit==iter->start, set the UTF-16 index */
645 iter->start=i; /* just in case setState() did not get us to a code point boundary */
646 iter->index= iter->reservedField!=0 ? length-1 : length;
647 } else {
648 i=iter->start;
649 length=iter->index;
650 if(iter->reservedField!=0) {
651 ++length;
652 }
653 }
654
655 /* count from the current index to the end */
656 limit=iter->limit;
657 while(i<limit) {
658 U8_NEXT(s, i, limit, c);
659 if(c<=0xffff) {
660 ++length;
661 } else {
662 length+=2;
663 }
664 }
665 iter->length=length;
666 }
667 return iter->length;
668 default:
669 /* not a valid origin */
670 /* Should never get here! */
671 return -1;
672 }
673 }
674
675 static int32_t U_CALLCONV
utf8IteratorMove(UCharIterator * iter,int32_t delta,UCharIteratorOrigin origin)676 utf8IteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin) {
677 const uint8_t *s;
678 UChar32 c;
679 int32_t pos; /* requested UTF-16 index */
680 int32_t i; /* UTF-8 index */
681 UBool havePos;
682
683 /* calculate the requested UTF-16 index */
684 switch(origin) {
685 case UITER_ZERO:
686 case UITER_START:
687 pos=delta;
688 havePos=TRUE;
689 /* iter->index<0 (unknown) is possible */
690 break;
691 case UITER_CURRENT:
692 if(iter->index>=0) {
693 pos=iter->index+delta;
694 havePos=TRUE;
695 } else {
696 /* the current UTF-16 index is unknown after setState(), use only delta */
697 pos=0;
698 havePos=FALSE;
699 }
700 break;
701 case UITER_LIMIT:
702 case UITER_LENGTH:
703 if(iter->length>=0) {
704 pos=iter->length+delta;
705 havePos=TRUE;
706 } else {
707 /* pin to the end, avoid counting the length */
708 iter->index=-1;
709 iter->start=iter->limit;
710 iter->reservedField=0;
711 if(delta>=0) {
712 return UITER_UNKNOWN_INDEX;
713 } else {
714 /* the current UTF-16 index is unknown, use only delta */
715 pos=0;
716 havePos=FALSE;
717 }
718 }
719 break;
720 default:
721 return -1; /* Error */
722 }
723
724 if(havePos) {
725 /* shortcuts: pinning to the edges of the string */
726 if(pos<=0) {
727 iter->index=iter->start=iter->reservedField=0;
728 return 0;
729 } else if(iter->length>=0 && pos>=iter->length) {
730 iter->index=iter->length;
731 iter->start=iter->limit;
732 iter->reservedField=0;
733 return iter->index;
734 }
735
736 /* minimize the number of U8_NEXT/PREV operations */
737 if(iter->index<0 || pos<iter->index/2) {
738 /* go forward from the start instead of backward from the current index */
739 iter->index=iter->start=iter->reservedField=0;
740 } else if(iter->length>=0 && (iter->length-pos)<(pos-iter->index)) {
741 /*
742 * if we have the UTF-16 index and length and the new position is
743 * closer to the end than the current index,
744 * then go backward from the end instead of forward from the current index
745 */
746 iter->index=iter->length;
747 iter->start=iter->limit;
748 iter->reservedField=0;
749 }
750
751 delta=pos-iter->index;
752 if(delta==0) {
753 return iter->index; /* nothing to do */
754 }
755 } else {
756 /* move relative to unknown UTF-16 index */
757 if(delta==0) {
758 return UITER_UNKNOWN_INDEX; /* nothing to do */
759 } else if(-delta>=iter->start) {
760 /* moving backwards by more UChars than there are UTF-8 bytes, pin to 0 */
761 iter->index=iter->start=iter->reservedField=0;
762 return 0;
763 } else if(delta>=(iter->limit-iter->start)) {
764 /* moving forward by more UChars than the remaining UTF-8 bytes, pin to the end */
765 iter->index=iter->length; /* may or may not be <0 (unknown) */
766 iter->start=iter->limit;
767 iter->reservedField=0;
768 return iter->index>=0 ? iter->index : (int32_t)UITER_UNKNOWN_INDEX;
769 }
770 }
771
772 /* delta!=0 */
773
774 /* move towards the requested position, pin to the edges of the string */
775 s=(const uint8_t *)iter->context;
776 pos=iter->index; /* could be <0 (unknown) */
777 i=iter->start;
778 if(delta>0) {
779 /* go forward */
780 int32_t limit=iter->limit;
781 if(iter->reservedField!=0) {
782 iter->reservedField=0;
783 ++pos;
784 --delta;
785 }
786 while(delta>0 && i<limit) {
787 U8_NEXT(s, i, limit, c);
788 if(c<0xffff) {
789 ++pos;
790 --delta;
791 } else if(delta>=2) {
792 pos+=2;
793 delta-=2;
794 } else /* delta==1 */ {
795 /* stop in the middle of a supplementary code point */
796 iter->reservedField=c;
797 ++pos;
798 break; /* delta=0; */
799 }
800 }
801 if(i==limit) {
802 if(iter->length<0 && iter->index>=0) {
803 iter->length= iter->reservedField==0 ? pos : pos+1;
804 } else if(iter->index<0 && iter->length>=0) {
805 iter->index= iter->reservedField==0 ? iter->length : iter->length-1;
806 }
807 }
808 } else /* delta<0 */ {
809 /* go backward */
810 if(iter->reservedField!=0) {
811 iter->reservedField=0;
812 i-=4; /* we stayed behind the supplementary code point; go before it now */
813 --pos;
814 ++delta;
815 }
816 while(delta<0 && i>0) {
817 U8_PREV(s, 0, i, c);
818 if(c<0xffff) {
819 --pos;
820 ++delta;
821 } else if(delta<=-2) {
822 pos-=2;
823 delta+=2;
824 } else /* delta==-1 */ {
825 /* stop in the middle of a supplementary code point */
826 i+=4; /* back to behind this supplementary code point for consistent state */
827 iter->reservedField=c;
828 --pos;
829 break; /* delta=0; */
830 }
831 }
832 }
833
834 iter->start=i;
835 if(iter->index>=0) {
836 return iter->index=pos;
837 } else {
838 /* we started with index<0 (unknown) so pos is bogus */
839 if(i<=1) {
840 return iter->index=i; /* reached the beginning */
841 } else {
842 /* we still don't know the UTF-16 index */
843 return UITER_UNKNOWN_INDEX;
844 }
845 }
846 }
847
848 static UBool U_CALLCONV
utf8IteratorHasNext(UCharIterator * iter)849 utf8IteratorHasNext(UCharIterator *iter) {
850 return iter->start<iter->limit || iter->reservedField!=0;
851 }
852
853 static UBool U_CALLCONV
utf8IteratorHasPrevious(UCharIterator * iter)854 utf8IteratorHasPrevious(UCharIterator *iter) {
855 return iter->start>0;
856 }
857
858 static UChar32 U_CALLCONV
utf8IteratorCurrent(UCharIterator * iter)859 utf8IteratorCurrent(UCharIterator *iter) {
860 if(iter->reservedField!=0) {
861 return U16_TRAIL(iter->reservedField);
862 } else if(iter->start<iter->limit) {
863 const uint8_t *s=(const uint8_t *)iter->context;
864 UChar32 c;
865 int32_t i=iter->start;
866
867 U8_NEXT(s, i, iter->limit, c);
868 if(c<0) {
869 return 0xfffd;
870 } else if(c<=0xffff) {
871 return c;
872 } else {
873 return U16_LEAD(c);
874 }
875 } else {
876 return U_SENTINEL;
877 }
878 }
879
880 static UChar32 U_CALLCONV
utf8IteratorNext(UCharIterator * iter)881 utf8IteratorNext(UCharIterator *iter) {
882 int32_t index;
883
884 if(iter->reservedField!=0) {
885 UChar trail=U16_TRAIL(iter->reservedField);
886 iter->reservedField=0;
887 if((index=iter->index)>=0) {
888 iter->index=index+1;
889 }
890 return trail;
891 } else if(iter->start<iter->limit) {
892 const uint8_t *s=(const uint8_t *)iter->context;
893 UChar32 c;
894
895 U8_NEXT(s, iter->start, iter->limit, c);
896 if((index=iter->index)>=0) {
897 iter->index=++index;
898 if(iter->length<0 && iter->start==iter->limit) {
899 iter->length= c<=0xffff ? index : index+1;
900 }
901 } else if(iter->start==iter->limit && iter->length>=0) {
902 iter->index= c<=0xffff ? iter->length : iter->length-1;
903 }
904 if(c<0) {
905 return 0xfffd;
906 } else if(c<=0xffff) {
907 return c;
908 } else {
909 iter->reservedField=c;
910 return U16_LEAD(c);
911 }
912 } else {
913 return U_SENTINEL;
914 }
915 }
916
917 static UChar32 U_CALLCONV
utf8IteratorPrevious(UCharIterator * iter)918 utf8IteratorPrevious(UCharIterator *iter) {
919 int32_t index;
920
921 if(iter->reservedField!=0) {
922 UChar lead=U16_LEAD(iter->reservedField);
923 iter->reservedField=0;
924 iter->start-=4; /* we stayed behind the supplementary code point; go before it now */
925 if((index=iter->index)>0) {
926 iter->index=index-1;
927 }
928 return lead;
929 } else if(iter->start>0) {
930 const uint8_t *s=(const uint8_t *)iter->context;
931 UChar32 c;
932
933 U8_PREV(s, 0, iter->start, c);
934 if((index=iter->index)>0) {
935 iter->index=index-1;
936 } else if(iter->start<=1) {
937 iter->index= c<=0xffff ? iter->start : iter->start+1;
938 }
939 if(c<0) {
940 return 0xfffd;
941 } else if(c<=0xffff) {
942 return c;
943 } else {
944 iter->start+=4; /* back to behind this supplementary code point for consistent state */
945 iter->reservedField=c;
946 return U16_TRAIL(c);
947 }
948 } else {
949 return U_SENTINEL;
950 }
951 }
952
953 static uint32_t U_CALLCONV
utf8IteratorGetState(const UCharIterator * iter)954 utf8IteratorGetState(const UCharIterator *iter) {
955 uint32_t state=(uint32_t)(iter->start<<1);
956 if(iter->reservedField!=0) {
957 state|=1;
958 }
959 return state;
960 }
961
962 static void U_CALLCONV
utf8IteratorSetState(UCharIterator * iter,uint32_t state,UErrorCode * pErrorCode)963 utf8IteratorSetState(UCharIterator *iter,
964 uint32_t state,
965 UErrorCode *pErrorCode)
966 {
967 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
968 /* do nothing */
969 } else if(iter==NULL) {
970 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
971 } else if(state==utf8IteratorGetState(iter)) {
972 /* setting to the current state: no-op */
973 } else {
974 int32_t index=(int32_t)(state>>1); /* UTF-8 index */
975 state&=1; /* 1 if in surrogate pair, must be index>=4 */
976
977 if((state==0 ? index<0 : index<4) || iter->limit<index) {
978 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
979 } else {
980 iter->start=index; /* restore UTF-8 byte index */
981 if(index<=1) {
982 iter->index=index;
983 } else {
984 iter->index=-1; /* unknown UTF-16 index */
985 }
986 if(state==0) {
987 iter->reservedField=0;
988 } else {
989 /* verified index>=4 above */
990 UChar32 c;
991 U8_PREV((const uint8_t *)iter->context, 0, index, c);
992 if(c<=0xffff) {
993 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
994 } else {
995 iter->reservedField=c;
996 }
997 }
998 }
999 }
1000 }
1001
1002 static const UCharIterator utf8Iterator={
1003 0, 0, 0, 0, 0, 0,
1004 utf8IteratorGetIndex,
1005 utf8IteratorMove,
1006 utf8IteratorHasNext,
1007 utf8IteratorHasPrevious,
1008 utf8IteratorCurrent,
1009 utf8IteratorNext,
1010 utf8IteratorPrevious,
1011 NULL,
1012 utf8IteratorGetState,
1013 utf8IteratorSetState
1014 };
1015
1016 U_CAPI void U_EXPORT2
uiter_setUTF8(UCharIterator * iter,const char * s,int32_t length)1017 uiter_setUTF8(UCharIterator *iter, const char *s, int32_t length) {
1018 if(iter!=0) {
1019 if(s!=0 && length>=-1) {
1020 *iter=utf8Iterator;
1021 iter->context=s;
1022 if(length>=0) {
1023 iter->limit=length;
1024 } else {
1025 iter->limit=(int32_t)uprv_strlen(s);
1026 }
1027 iter->length= iter->limit<=1 ? iter->limit : -1;
1028 } else {
1029 *iter=noopIterator;
1030 }
1031 }
1032 }
1033
1034 /* Helper functions --------------------------------------------------------- */
1035
1036 U_CAPI UChar32 U_EXPORT2
uiter_current32(UCharIterator * iter)1037 uiter_current32(UCharIterator *iter) {
1038 UChar32 c, c2;
1039
1040 c=iter->current(iter);
1041 if(UTF_IS_SURROGATE(c)) {
1042 if(UTF_IS_SURROGATE_FIRST(c)) {
1043 /*
1044 * go to the next code unit
1045 * we know that we are not at the limit because c!=U_SENTINEL
1046 */
1047 iter->move(iter, 1, UITER_CURRENT);
1048 if(UTF_IS_SECOND_SURROGATE(c2=iter->current(iter))) {
1049 c=UTF16_GET_PAIR_VALUE(c, c2);
1050 }
1051
1052 /* undo index movement */
1053 iter->move(iter, -1, UITER_CURRENT);
1054 } else {
1055 if(UTF_IS_FIRST_SURROGATE(c2=iter->previous(iter))) {
1056 c=UTF16_GET_PAIR_VALUE(c2, c);
1057 }
1058 if(c2>=0) {
1059 /* undo index movement */
1060 iter->move(iter, 1, UITER_CURRENT);
1061 }
1062 }
1063 }
1064 return c;
1065 }
1066
1067 U_CAPI UChar32 U_EXPORT2
uiter_next32(UCharIterator * iter)1068 uiter_next32(UCharIterator *iter) {
1069 UChar32 c, c2;
1070
1071 c=iter->next(iter);
1072 if(UTF_IS_FIRST_SURROGATE(c)) {
1073 if(UTF_IS_SECOND_SURROGATE(c2=iter->next(iter))) {
1074 c=UTF16_GET_PAIR_VALUE(c, c2);
1075 } else if(c2>=0) {
1076 /* unmatched first surrogate, undo index movement */
1077 iter->move(iter, -1, UITER_CURRENT);
1078 }
1079 }
1080 return c;
1081 }
1082
1083 U_CAPI UChar32 U_EXPORT2
uiter_previous32(UCharIterator * iter)1084 uiter_previous32(UCharIterator *iter) {
1085 UChar32 c, c2;
1086
1087 c=iter->previous(iter);
1088 if(UTF_IS_SECOND_SURROGATE(c)) {
1089 if(UTF_IS_FIRST_SURROGATE(c2=iter->previous(iter))) {
1090 c=UTF16_GET_PAIR_VALUE(c2, c);
1091 } else if(c2>=0) {
1092 /* unmatched second surrogate, undo index movement */
1093 iter->move(iter, 1, UITER_CURRENT);
1094 }
1095 }
1096 return c;
1097 }
1098
1099 U_CAPI uint32_t U_EXPORT2
uiter_getState(const UCharIterator * iter)1100 uiter_getState(const UCharIterator *iter) {
1101 if(iter==NULL || iter->getState==NULL) {
1102 return UITER_NO_STATE;
1103 } else {
1104 return iter->getState(iter);
1105 }
1106 }
1107
1108 U_CAPI void U_EXPORT2
uiter_setState(UCharIterator * iter,uint32_t state,UErrorCode * pErrorCode)1109 uiter_setState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode) {
1110 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
1111 /* do nothing */
1112 } else if(iter==NULL) {
1113 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1114 } else if(iter->setState==NULL) {
1115 *pErrorCode=U_UNSUPPORTED_ERROR;
1116 } else {
1117 iter->setState(iter, state, pErrorCode);
1118 }
1119 }
1120
1121 U_CDECL_END
1122