• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 ******************************************************************************
3 *
4 *   Copyright (C) 1999-2006, International Business Machines
5 *   Corporation and others.  All Rights Reserved.
6 *
7 ******************************************************************************
8 *   file name:  utf_impl.c
9 *   encoding:   US-ASCII
10 *   tab size:   8 (not used)
11 *   indentation:4
12 *
13 *   created on: 1999sep13
14 *   created by: Markus W. Scherer
15 *
16 *   This file provides implementation functions for macros in the utfXX.h
17 *   that would otherwise be too long as macros.
18 */
19 
20 /* set import/export definitions */
21 #ifndef U_UTF8_IMPL
22 #   define U_UTF8_IMPL
23 #endif
24 
25 #include "unicode/utypes.h"
26 
27 /*
28  * This table could be replaced on many machines by
29  * a few lines of assembler code using an
30  * "index of first 0-bit from msb" instruction and
31  * one or two more integer instructions.
32  *
33  * For example, on an i386, do something like
34  * - MOV AL, leadByte
35  * - NOT AL         (8-bit, leave b15..b8==0..0, reverse only b7..b0)
36  * - MOV AH, 0
37  * - BSR BX, AX     (16-bit)
38  * - MOV AX, 6      (result)
39  * - JZ finish      (ZF==1 if leadByte==0xff)
40  * - SUB AX, BX (result)
41  * -finish:
42  * (BSR: Bit Scan Reverse, scans for a 1-bit, starting from the MSB)
43  *
44  * In Unicode, all UTF-8 byte sequences with more than 4 bytes are illegal;
45  * lead bytes above 0xf4 are illegal.
46  * We keep them in this table for skipping long ISO 10646-UTF-8 sequences.
47  */
48 U_EXPORT const uint8_t
49 utf8_countTrailBytes[256]={
50     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
51     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
52     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
53     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
54 
55     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
56     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
57     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
58     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
59 
60     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
61     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
62     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
63     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
64 
65     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
66     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
67 
68     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
69     3, 3, 3, 3, 3,
70     3, 3, 3,    /* illegal in Unicode */
71     4, 4, 4, 4, /* illegal in Unicode */
72     5, 5,       /* illegal in Unicode */
73     0, 0        /* illegal bytes 0xfe and 0xff */
74 };
75 
76 static const UChar32
77 utf8_minLegal[4]={ 0, 0x80, 0x800, 0x10000 };
78 
79 static const UChar32
80 utf8_errorValue[6]={
81     UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_2, UTF_ERROR_VALUE, 0x10ffff,
82     0x3ffffff, 0x7fffffff
83 };
84 
85 /*
86  * Handle the non-inline part of the U8_NEXT() macro and its obsolete sibling
87  * UTF8_NEXT_CHAR_SAFE().
88  *
89  * The "strict" parameter controls the error behavior:
90  * <0  "Safe" behavior of U8_NEXT(): All illegal byte sequences yield a negative
91  *     code point result.
92  *  0  Obsolete "safe" behavior of UTF8_NEXT_CHAR_SAFE(..., FALSE):
93  *     All illegal byte sequences yield a positive code point such that this
94  *     result code point would be encoded with the same number of bytes as
95  *     the illegal sequence.
96  * >0  Obsolete "strict" behavior of UTF8_NEXT_CHAR_SAFE(..., TRUE):
97  *     Same as the obsolete "safe" behavior, but non-characters are also treated
98  *     like illegal sequences.
99  *
100  * The special negative (<0) value -2 is used for lenient treatment of surrogate
101  * code points as legal. Some implementations use this for roundtripping of
102  * Unicode 16-bit strings that are not well-formed UTF-16, that is, they
103  * contain unpaired surrogates.
104  *
105  * Note that a UBool is the same as an int8_t.
106  */
107 U_CAPI UChar32 U_EXPORT2
utf8_nextCharSafeBody(const uint8_t * s,int32_t * pi,int32_t length,UChar32 c,UBool strict)108 utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c, UBool strict) {
109     int32_t i=*pi;
110     uint8_t count=UTF8_COUNT_TRAIL_BYTES(c);
111     if((i)+count<=(length)) {
112         uint8_t trail, illegal=0;
113 
114         UTF8_MASK_LEAD_BYTE((c), count);
115         /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
116         switch(count) {
117         /* each branch falls through to the next one */
118         case 5:
119         case 4:
120             /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
121             illegal=1;
122             break;
123         case 3:
124             trail=s[(i)++];
125             (c)=((c)<<6)|(trail&0x3f);
126             if(c<0x110) {
127                 illegal|=(trail&0xc0)^0x80;
128             } else {
129                 /* code point>0x10ffff, outside Unicode */
130                 illegal=1;
131                 break;
132             }
133         case 2:
134             trail=s[(i)++];
135             (c)=((c)<<6)|(trail&0x3f);
136             illegal|=(trail&0xc0)^0x80;
137         case 1:
138             trail=s[(i)++];
139             (c)=((c)<<6)|(trail&0x3f);
140             illegal|=(trail&0xc0)^0x80;
141             break;
142         case 0:
143             if(strict>=0) {
144                 return UTF8_ERROR_VALUE_1;
145             } else {
146                 return U_SENTINEL;
147             }
148         /* no default branch to optimize switch()  - all values are covered */
149         }
150 
151         /*
152          * All the error handling should return a value
153          * that needs count bytes so that UTF8_GET_CHAR_SAFE() works right.
154          *
155          * Starting with Unicode 3.0.1, non-shortest forms are illegal.
156          * Starting with Unicode 3.2, surrogate code points must not be
157          * encoded in UTF-8, and there are no irregular sequences any more.
158          *
159          * U8_ macros (new in ICU 2.4) return negative values for error conditions.
160          */
161 
162         /* correct sequence - all trail bytes have (b7..b6)==(10)? */
163         /* illegal is also set if count>=4 */
164         if(illegal || (c)<utf8_minLegal[count] || (UTF_IS_SURROGATE(c) && strict!=-2)) {
165             /* error handling */
166             uint8_t errorCount=count;
167             /* don't go beyond this sequence */
168             i=*pi;
169             while(count>0 && UTF8_IS_TRAIL(s[i])) {
170                 ++(i);
171                 --count;
172             }
173             if(strict>=0) {
174                 c=utf8_errorValue[errorCount-count];
175             } else {
176                 c=U_SENTINEL;
177             }
178         } else if((strict)>0 && UTF_IS_UNICODE_NONCHAR(c)) {
179             /* strict: forbid non-characters like U+fffe */
180             c=utf8_errorValue[count];
181         }
182     } else /* too few bytes left */ {
183         /* error handling */
184         int32_t i0=i;
185         /* don't just set (i)=(length) in case there is an illegal sequence */
186         while((i)<(length) && UTF8_IS_TRAIL(s[i])) {
187             ++(i);
188         }
189         if(strict>=0) {
190             c=utf8_errorValue[i-i0];
191         } else {
192             c=U_SENTINEL;
193         }
194     }
195     *pi=i;
196     return c;
197 }
198 
199 U_CAPI int32_t U_EXPORT2
utf8_appendCharSafeBody(uint8_t * s,int32_t i,int32_t length,UChar32 c,UBool * pIsError)200 utf8_appendCharSafeBody(uint8_t *s, int32_t i, int32_t length, UChar32 c, UBool *pIsError) {
201     if((uint32_t)(c)<=0x7ff) {
202         if((i)+1<(length)) {
203             (s)[(i)++]=(uint8_t)(((c)>>6)|0xc0);
204             (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80);
205             return i;
206         }
207     } else if((uint32_t)(c)<=0xffff) {
208         /* Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8. */
209         if((i)+2<(length) && !U_IS_SURROGATE(c)) {
210             (s)[(i)++]=(uint8_t)(((c)>>12)|0xe0);
211             (s)[(i)++]=(uint8_t)((((c)>>6)&0x3f)|0x80);
212             (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80);
213             return i;
214         }
215     } else if((uint32_t)(c)<=0x10ffff) {
216         if((i)+3<(length)) {
217             (s)[(i)++]=(uint8_t)(((c)>>18)|0xf0);
218             (s)[(i)++]=(uint8_t)((((c)>>12)&0x3f)|0x80);
219             (s)[(i)++]=(uint8_t)((((c)>>6)&0x3f)|0x80);
220             (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80);
221             return i;
222         }
223     }
224     /* c>0x10ffff or not enough space, write an error value */
225     if(pIsError!=NULL) {
226         *pIsError=TRUE;
227     } else {
228         length-=i;
229         if(length>0) {
230             int32_t offset;
231             if(length>3) {
232                 length=3;
233             }
234             s+=i;
235             offset=0;
236             c=utf8_errorValue[length-1];
237             UTF8_APPEND_CHAR_UNSAFE(s, offset, c);
238             i=i+offset;
239         }
240     }
241     return i;
242 }
243 
244 U_CAPI UChar32 U_EXPORT2
utf8_prevCharSafeBody(const uint8_t * s,int32_t start,int32_t * pi,UChar32 c,UBool strict)245 utf8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c, UBool strict) {
246     int32_t i=*pi;
247     uint8_t b, count=1, shift=6;
248 
249     /* extract value bits from the last trail byte */
250     c&=0x3f;
251 
252     for(;;) {
253         if(i<=start) {
254             /* no lead byte at all */
255             if(strict>=0) {
256                 return UTF8_ERROR_VALUE_1;
257             } else {
258                 return U_SENTINEL;
259             }
260             /*break;*/
261         }
262 
263         /* read another previous byte */
264         b=s[--i];
265         if((uint8_t)(b-0x80)<0x7e) { /* 0x80<=b<0xfe */
266             if(b&0x40) {
267                 /* lead byte, this will always end the loop */
268                 uint8_t shouldCount=UTF8_COUNT_TRAIL_BYTES(b);
269 
270                 if(count==shouldCount) {
271                     /* set the new position */
272                     *pi=i;
273                     UTF8_MASK_LEAD_BYTE(b, count);
274                     c|=(UChar32)b<<shift;
275                     if(count>=4 || c>0x10ffff || c<utf8_minLegal[count] || (UTF_IS_SURROGATE(c) && strict!=-2) || (strict>0 && UTF_IS_UNICODE_NONCHAR(c))) {
276                         /* illegal sequence or (strict and non-character) */
277                         if(count>=4) {
278                             count=3;
279                         }
280                         if(strict>=0) {
281                             c=utf8_errorValue[count];
282                         } else {
283                             c=U_SENTINEL;
284                         }
285                     } else {
286                         /* exit with correct c */
287                     }
288                 } else {
289                     /* the lead byte does not match the number of trail bytes */
290                     /* only set the position to the lead byte if it would
291                        include the trail byte that we started with */
292                     if(count<shouldCount) {
293                         *pi=i;
294                         if(strict>=0) {
295                             c=utf8_errorValue[count];
296                         } else {
297                             c=U_SENTINEL;
298                         }
299                     } else {
300                         if(strict>=0) {
301                             c=UTF8_ERROR_VALUE_1;
302                         } else {
303                             c=U_SENTINEL;
304                         }
305                     }
306                 }
307                 break;
308             } else if(count<5) {
309                 /* trail byte */
310                 c|=(UChar32)(b&0x3f)<<shift;
311                 ++count;
312                 shift+=6;
313             } else {
314                 /* more than 5 trail bytes is illegal */
315                 if(strict>=0) {
316                     c=UTF8_ERROR_VALUE_1;
317                 } else {
318                     c=U_SENTINEL;
319                 }
320                 break;
321             }
322         } else {
323             /* single-byte character precedes trailing bytes */
324             if(strict>=0) {
325                 c=UTF8_ERROR_VALUE_1;
326             } else {
327                 c=U_SENTINEL;
328             }
329             break;
330         }
331     }
332     return c;
333 }
334 
335 U_CAPI int32_t U_EXPORT2
utf8_back1SafeBody(const uint8_t * s,int32_t start,int32_t i)336 utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i) {
337     /* i had been decremented once before the function call */
338     int32_t I=i, Z;
339     uint8_t b;
340 
341     /* read at most the 6 bytes s[Z] to s[i], inclusively */
342     if(I-5>start) {
343         Z=I-5;
344     } else {
345         Z=start;
346     }
347 
348     /* return I if the sequence starting there is long enough to include i */
349     do {
350         b=s[I];
351         if((uint8_t)(b-0x80)>=0x7e) { /* not 0x80<=b<0xfe */
352             break;
353         } else if(b>=0xc0) {
354             if(UTF8_COUNT_TRAIL_BYTES(b)>=(i-I)) {
355                 return I;
356             } else {
357                 break;
358             }
359         }
360     } while(Z<=--I);
361 
362     /* return i itself to be consistent with the FWD_1 macro */
363     return i;
364 }
365