1 /*
2 *******************************************************************************
3 *
4 * Copyright (C) 2003-2006, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *
7 *******************************************************************************
8 * file name: uit_len8.c
9 * encoding: US-ASCII
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created on: 2003feb10
14 * created by: Markus W. Scherer
15 *
16 * This file contains the implementation of the "lenient UTF-8" UCharIterator
17 * as used in the uciter8 sample code.
18 * UTF-8-style macros are defined as well as the UCharIterator.
19 * The macros are incomplete (do not assemble code points from pairs of
20 * surrogates, see comment below)
21 * but sufficient for the iterator.
22 */
23
24 #include <string.h>
25 #include "unicode/utypes.h"
26 #include "unicode/uiter.h"
27
28 /* lenient UTF-8/CESU-8 macros ---------------------------------------------- */
29
/*
 * This code leniently reads 8-bit Unicode strings,
 * which could contain a mix of UTF-8 and CESU-8.
 * More precisely:
 * - supplementary code points may be encoded with dedicated 4-byte sequences
 *   (UTF-8 style)
 * - supplementary code points may be encoded with
 *   pairs of 3-byte sequences, one for each surrogate of the UTF-16 form
 *   (CESU-8 style)
 * - single surrogates are allowed, encoded with their "natural" 3-byte sequences
 *
 * Limitation:
 * Right now, the macros do not attempt to assemble code points from pairs of
 * separately encoded surrogates.
 * This would not be sufficient for general processing based on these macros,
 * but it is sufficient for a UCharIterator that returns only UChars anyway.
 *
 * The code is copied and modified from utf_impl.c and utf8.h.
 *
 * Change 2006feb08: Much of the implementation code is replaced by calling
 * the utf_impl.c functions which accept a new "strict" parameter value
 * of -2 implementing exactly this leniency.
 */
53
/*
 * L8_NEXT(s, i, length, c)
 * Reads the byte at s[i] into c and post-increments i.
 * - A byte below 0x80 is returned as is.
 * - A lead byte is handed to utf8_nextCharSafeBody() with strict=-2
 *   (the lenient mode described above), which may advance i further.
 * - Any other byte (a stray trail byte) yields U_SENTINEL.
 *
 * i must be a modifiable lvalue (its address is taken); s and i are evaluated
 * more than once, so do not pass expressions with side effects.
 * The do/while(0) wrapper makes the macro behave like a single statement.
 */
#define L8_NEXT(s, i, length, c) do { \
    (c)=(uint8_t)(s)[(i)++]; \
    if((c)>=0x80) { \
        if(U8_IS_LEAD(c)) { \
            (c)=utf8_nextCharSafeBody((const uint8_t *)(s), &(i), (int32_t)(length), (c), -2); \
        } else { \
            (c)=U_SENTINEL; \
        } \
    } \
} while(0)
64
/*
 * L8_PREV(s, start, i, c)
 * Pre-decrements i and reads the byte at s[i] into c.
 * - A byte below 0x80 is returned as is.
 * - A trail byte (0x80..0xbf) is handed to utf8_prevCharSafeBody() with
 *   strict=-2 (the lenient mode described above), which may move i further back
 *   but not before start.
 * - Any other byte (a lead byte with nothing after it) yields U_SENTINEL.
 *
 * i must be a modifiable lvalue (its address is taken); s and i are evaluated
 * more than once, so do not pass expressions with side effects.
 * The do/while(0) wrapper makes the macro behave like a single statement.
 */
#define L8_PREV(s, start, i, c) do { \
    (c)=(uint8_t)(s)[--(i)]; \
    if((c)>=0x80) { \
        if((c)<=0xbf) { \
            (c)=utf8_prevCharSafeBody((const uint8_t *)(s), (start), &(i), (c), -2); \
        } else { \
            (c)=U_SENTINEL; \
        } \
    } \
} while(0)
75
76 /* lenient-8 UCharIterator -------------------------------------------------- */
77
/*
 * This is a copy of the UTF-8 UCharIterator in uiter.cpp,
 * except that it uses the lenient-8-bit-Unicode macros above.
 */
82
/*
 * Minimal implementation:
 * Maintain a single-UChar buffer for an additional surrogate.
 * The caller must not modify start and limit because they are used internally.
 *
 * Use UCharIterator fields as follows:
 *   context        pointer to UTF-8 string
 *   length         UTF-16 length of the string; -1 until lazy evaluation
 *   start          current UTF-8 index
 *   index          current UTF-16 index; may be -1="unknown" after setState()
 *   limit          UTF-8 length of the string
 *   reservedField  supplementary code point
 *
 * Since UCharIterator delivers 16-bit code units, the iteration can be
 * currently in the middle of the byte sequence for a supplementary code point.
 * In this case, reservedField will contain that code point and start will
 * point to after the corresponding byte sequence. The UTF-16 index will be
 * one less than what it would otherwise be corresponding to the UTF-8 index.
 * Otherwise, reservedField will be 0.
 */
103
/*
 * Possible optimization for NUL-terminated UTF-8 and UTF-16 strings:
 * Add implementations that do not call strlen() for iteration but check for NUL.
 */
108
109 static int32_t U_CALLCONV
lenient8IteratorGetIndex(UCharIterator * iter,UCharIteratorOrigin origin)110 lenient8IteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin) {
111 switch(origin) {
112 case UITER_ZERO:
113 case UITER_START:
114 return 0;
115 case UITER_CURRENT:
116 if(iter->index<0) {
117 /* the current UTF-16 index is unknown after setState(), count from the beginning */
118 const uint8_t *s;
119 UChar32 c;
120 int32_t i, limit, index;
121
122 s=(const uint8_t *)iter->context;
123 i=index=0;
124 limit=iter->start; /* count up to the UTF-8 index */
125 while(i<limit) {
126 L8_NEXT(s, i, limit, c);
127 if(c<=0xffff) {
128 ++index;
129 } else {
130 index+=2;
131 }
132 }
133
134 iter->start=i; /* just in case setState() did not get us to a code point boundary */
135 if(i==iter->limit) {
136 iter->length=index; /* in case it was <0 or wrong */
137 }
138 if(iter->reservedField!=0) {
139 --index; /* we are in the middle of a supplementary code point */
140 }
141 iter->index=index;
142 }
143 return iter->index;
144 case UITER_LIMIT:
145 case UITER_LENGTH:
146 if(iter->length<0) {
147 const uint8_t *s;
148 UChar32 c;
149 int32_t i, limit, length;
150
151 s=(const uint8_t *)iter->context;
152 if(iter->index<0) {
153 /*
154 * the current UTF-16 index is unknown after setState(),
155 * we must first count from the beginning to here
156 */
157 i=length=0;
158 limit=iter->start;
159
160 /* count from the beginning to the current index */
161 while(i<limit) {
162 L8_NEXT(s, i, limit, c);
163 if(c<=0xffff) {
164 ++length;
165 } else {
166 length+=2;
167 }
168 }
169
170 /* assume i==limit==iter->start, set the UTF-16 index */
171 iter->start=i; /* just in case setState() did not get us to a code point boundary */
172 iter->index= iter->reservedField!=0 ? length-1 : length;
173 } else {
174 i=iter->start;
175 length=iter->index;
176 if(iter->reservedField!=0) {
177 ++length;
178 }
179 }
180
181 /* count from the current index to the end */
182 limit=iter->limit;
183 while(i<limit) {
184 L8_NEXT(s, i, limit, c);
185 if(c<=0xffff) {
186 ++length;
187 } else {
188 length+=2;
189 }
190 }
191 iter->length=length;
192 }
193 return iter->length;
194 default:
195 /* not a valid origin */
196 /* Should never get here! */
197 return -1;
198 }
199 }
200
201 static int32_t U_CALLCONV
lenient8IteratorMove(UCharIterator * iter,int32_t delta,UCharIteratorOrigin origin)202 lenient8IteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin) {
203 const uint8_t *s;
204 UChar32 c;
205 int32_t pos; /* requested UTF-16 index */
206 int32_t i; /* UTF-8 index */
207 UBool havePos;
208
209 /* calculate the requested UTF-16 index */
210 switch(origin) {
211 case UITER_ZERO:
212 case UITER_START:
213 pos=delta;
214 havePos=TRUE;
215 /* iter->index<0 (unknown) is possible */
216 break;
217 case UITER_CURRENT:
218 if(iter->index>=0) {
219 pos=iter->index+delta;
220 havePos=TRUE;
221 } else {
222 /* the current UTF-16 index is unknown after setState(), use only delta */
223 pos=0;
224 havePos=FALSE;
225 }
226 break;
227 case UITER_LIMIT:
228 case UITER_LENGTH:
229 if(iter->length>=0) {
230 pos=iter->length+delta;
231 havePos=TRUE;
232 } else {
233 /* pin to the end, avoid counting the length */
234 iter->index=-1;
235 iter->start=iter->limit;
236 iter->reservedField=0;
237 if(delta>=0) {
238 return UITER_UNKNOWN_INDEX;
239 } else {
240 /* the current UTF-16 index is unknown, use only delta */
241 pos=0;
242 havePos=FALSE;
243 }
244 }
245 break;
246 default:
247 return -1; /* Error */
248 }
249
250 if(havePos) {
251 /* shortcuts: pinning to the edges of the string */
252 if(pos<=0) {
253 iter->index=iter->start=iter->reservedField=0;
254 return 0;
255 } else if(iter->length>=0 && pos>=iter->length) {
256 iter->index=iter->length;
257 iter->start=iter->limit;
258 iter->reservedField=0;
259 return iter->index;
260 }
261
262 /* minimize the number of L8_NEXT/PREV operations */
263 if(iter->index<0 || pos<iter->index/2) {
264 /* go forward from the start instead of backward from the current index */
265 iter->index=iter->start=iter->reservedField=0;
266 } else if(iter->length>=0 && (iter->length-pos)<(pos-iter->index)) {
267 /*
268 * if we have the UTF-16 index and length and the new position is
269 * closer to the end than the current index,
270 * then go backward from the end instead of forward from the current index
271 */
272 iter->index=iter->length;
273 iter->start=iter->limit;
274 iter->reservedField=0;
275 }
276
277 delta=pos-iter->index;
278 if(delta==0) {
279 return iter->index; /* nothing to do */
280 }
281 } else {
282 /* move relative to unknown UTF-16 index */
283 if(delta==0) {
284 return UITER_UNKNOWN_INDEX; /* nothing to do */
285 } else if(-delta>=iter->start) {
286 /* moving backwards by more UChars than there are UTF-8 bytes, pin to 0 */
287 iter->index=iter->start=iter->reservedField=0;
288 return 0;
289 } else if(delta>=(iter->limit-iter->start)) {
290 /* moving forward by more UChars than the remaining UTF-8 bytes, pin to the end */
291 iter->index=iter->length; /* may or may not be <0 (unknown) */
292 iter->start=iter->limit;
293 iter->reservedField=0;
294 return iter->index>=0 ? iter->index : UITER_UNKNOWN_INDEX;
295 }
296 }
297
298 /* delta!=0 */
299
300 /* move towards the requested position, pin to the edges of the string */
301 s=(const uint8_t *)iter->context;
302 pos=iter->index; /* could be <0 (unknown) */
303 i=iter->start;
304 if(delta>0) {
305 /* go forward */
306 int32_t limit=iter->limit;
307 if(iter->reservedField!=0) {
308 iter->reservedField=0;
309 ++pos;
310 --delta;
311 }
312 while(delta>0 && i<limit) {
313 L8_NEXT(s, i, limit, c);
314 if(c<0xffff) {
315 ++pos;
316 --delta;
317 } else if(delta>=2) {
318 pos+=2;
319 delta-=2;
320 } else /* delta==1 */ {
321 /* stop in the middle of a supplementary code point */
322 iter->reservedField=c;
323 ++pos;
324 break; /* delta=0; */
325 }
326 }
327 if(i==limit) {
328 if(iter->length<0 && iter->index>=0) {
329 iter->length= iter->reservedField==0 ? pos : pos+1;
330 } else if(iter->index<0 && iter->length>=0) {
331 iter->index= iter->reservedField==0 ? iter->length : iter->length-1;
332 }
333 }
334 } else /* delta<0 */ {
335 /* go backward */
336 if(iter->reservedField!=0) {
337 iter->reservedField=0;
338 i-=4; /* we stayed behind the supplementary code point; go before it now */
339 --pos;
340 ++delta;
341 }
342 while(delta<0 && i>0) {
343 L8_PREV(s, 0, i, c);
344 if(c<0xffff) {
345 --pos;
346 ++delta;
347 } else if(delta<=-2) {
348 pos-=2;
349 delta+=2;
350 } else /* delta==-1 */ {
351 /* stop in the middle of a supplementary code point */
352 i+=4; /* back to behind this supplementary code point for consistent state */
353 iter->reservedField=c;
354 --pos;
355 break; /* delta=0; */
356 }
357 }
358 }
359
360 iter->start=i;
361 if(iter->index>=0) {
362 return iter->index=pos;
363 } else {
364 /* we started with index<0 (unknown) so pos is bogus */
365 if(i<=1) {
366 return iter->index=i; /* reached the beginning */
367 } else {
368 /* we still don't know the UTF-16 index */
369 return UITER_UNKNOWN_INDEX;
370 }
371 }
372 }
373
374 static UBool U_CALLCONV
lenient8IteratorHasNext(UCharIterator * iter)375 lenient8IteratorHasNext(UCharIterator *iter) {
376 return iter->reservedField!=0 || iter->start<iter->limit;
377 }
378
379 static UBool U_CALLCONV
lenient8IteratorHasPrevious(UCharIterator * iter)380 lenient8IteratorHasPrevious(UCharIterator *iter) {
381 return iter->start>0;
382 }
383
384 static UChar32 U_CALLCONV
lenient8IteratorCurrent(UCharIterator * iter)385 lenient8IteratorCurrent(UCharIterator *iter) {
386 if(iter->reservedField!=0) {
387 return U16_TRAIL(iter->reservedField);
388 } else if(iter->start<iter->limit) {
389 const uint8_t *s=(const uint8_t *)iter->context;
390 UChar32 c;
391 int32_t i=iter->start;
392
393 L8_NEXT(s, i, iter->limit, c);
394 if(c<0) {
395 return 0xfffd;
396 } else if(c<=0xffff) {
397 return c;
398 } else {
399 return U16_LEAD(c);
400 }
401 } else {
402 return U_SENTINEL;
403 }
404 }
405
406 static UChar32 U_CALLCONV
lenient8IteratorNext(UCharIterator * iter)407 lenient8IteratorNext(UCharIterator *iter) {
408 int32_t index;
409
410 if(iter->reservedField!=0) {
411 UChar trail=U16_TRAIL(iter->reservedField);
412 iter->reservedField=0;
413 if((index=iter->index)>=0) {
414 iter->index=index+1;
415 }
416 return trail;
417 } else if(iter->start<iter->limit) {
418 const uint8_t *s=(const uint8_t *)iter->context;
419 UChar32 c;
420
421 L8_NEXT(s, iter->start, iter->limit, c);
422 if((index=iter->index)>=0) {
423 iter->index=++index;
424 if(iter->length<0 && iter->start==iter->limit) {
425 iter->length= c<=0xffff ? index : index+1;
426 }
427 } else if(iter->start==iter->limit && iter->length>=0) {
428 iter->index= c<=0xffff ? iter->length : iter->length-1;
429 }
430 if(c<0) {
431 return 0xfffd;
432 } else if(c<=0xffff) {
433 return c;
434 } else {
435 iter->reservedField=c;
436 return U16_LEAD(c);
437 }
438 } else {
439 return U_SENTINEL;
440 }
441 }
442
443 static UChar32 U_CALLCONV
lenient8IteratorPrevious(UCharIterator * iter)444 lenient8IteratorPrevious(UCharIterator *iter) {
445 int32_t index;
446
447 if(iter->reservedField!=0) {
448 UChar lead=U16_LEAD(iter->reservedField);
449 iter->reservedField=0;
450 iter->start-=4; /* we stayed behind the supplementary code point; go before it now */
451 if((index=iter->index)>0) {
452 iter->index=index-1;
453 }
454 return lead;
455 } else if(iter->start>0) {
456 const uint8_t *s=(const uint8_t *)iter->context;
457 UChar32 c;
458
459 L8_PREV(s, 0, iter->start, c);
460 if((index=iter->index)>0) {
461 iter->index=index-1;
462 } else if(iter->start<=1) {
463 iter->index= c<=0xffff ? iter->start : iter->start+1;
464 }
465 if(c<0) {
466 return 0xfffd;
467 } else if(c<=0xffff) {
468 return c;
469 } else {
470 iter->start+=4; /* back to behind this supplementary code point for consistent state */
471 iter->reservedField=c;
472 return U16_TRAIL(c);
473 }
474 } else {
475 return U_SENTINEL;
476 }
477 }
478
479 static uint32_t U_CALLCONV
lenient8IteratorGetState(const UCharIterator * iter)480 lenient8IteratorGetState(const UCharIterator *iter) {
481 uint32_t state=(uint32_t)(iter->start<<1);
482 if(iter->reservedField!=0) {
483 state|=1;
484 }
485 return state;
486 }
487
488 static void U_CALLCONV
lenient8IteratorSetState(UCharIterator * iter,uint32_t state,UErrorCode * pErrorCode)489 lenient8IteratorSetState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode) {
490 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
491 /* do nothing */
492 } else if(iter==NULL) {
493 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
494 } else if(state==lenient8IteratorGetState(iter)) {
495 /* setting to the current state: no-op */
496 } else {
497 int32_t index=(int32_t)(state>>1); /* UTF-8 index */
498 state&=1; /* 1 if in surrogate pair, must be index>=4 */
499
500 if((state==0 ? index<0 : index<4) || iter->limit<index) {
501 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
502 } else {
503 iter->start=index; /* restore UTF-8 byte index */
504 if(index<=1) {
505 iter->index=index;
506 } else {
507 iter->index=-1; /* unknown UTF-16 index */
508 }
509 if(state==0) {
510 iter->reservedField=0;
511 } else {
512 /* verified index>=4 above */
513 UChar32 c;
514 L8_PREV((const uint8_t *)iter->context, 0, index, c);
515 if(c<=0xffff) {
516 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
517 } else {
518 iter->reservedField=c;
519 }
520 }
521 }
522 }
523 }
524
525 static const UCharIterator lenient8Iterator={
526 0, 0, 0, 0, 0, 0,
527 lenient8IteratorGetIndex,
528 lenient8IteratorMove,
529 lenient8IteratorHasNext,
530 lenient8IteratorHasPrevious,
531 lenient8IteratorCurrent,
532 lenient8IteratorNext,
533 lenient8IteratorPrevious,
534 NULL,
535 lenient8IteratorGetState,
536 lenient8IteratorSetState
537 };
538
539 U_CAPI void U_EXPORT2
uiter_setLenient8(UCharIterator * iter,const char * s,int32_t length)540 uiter_setLenient8(UCharIterator *iter, const char *s, int32_t length) {
541 if(iter!=0) {
542 if(s!=0 && length>=-1) {
543 *iter=lenient8Iterator;
544 iter->context=s;
545 if(length>=0) {
546 iter->limit=length;
547 } else {
548 iter->limit=strlen(s);
549 }
550 iter->length= iter->limit<=1 ? iter->limit : -1;
551 } else {
552 /* set no-op iterator */
553 uiter_setString(iter, NULL, 0);
554 }
555 }
556 }
557