1 /*
2 * The authors of this software are Rob Pike and Ken Thompson.
3 * Copyright (c) 2002 by Lucent Technologies.
4 * Portions Copyright (c) 2009 The Go Authors. All rights reserved.
5 * Permission to use, copy, modify, and distribute this software for any
6 * purpose without fee is hereby granted, provided that this entire notice
7 * is included in all copies of any software which is or includes a copy
8 * or modification of this software and in all copies of the supporting
9 * documentation for such software.
10 * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
11 * WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
12 * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
13 * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
14 */
15 #include "phonenumbers/utf/utf.h"
16 #include "phonenumbers/utf/utfdef.h"
17
18 enum
19 {
20 Bit1 = 7,
21 Bitx = 6,
22 Bit2 = 5,
23 Bit3 = 4,
24 Bit4 = 3,
25 Bit5 = 2,
26
27 T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */
28 Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */
29 T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */
30 T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */
31 T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */
32 T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */
33
34 Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */
35 Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */
36 Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */
37 Rune4 = (1<<(Bit4+3*Bitx))-1, /* 0001 1111 1111 1111 1111 1111 */
38
39 Maskx = (1<<Bitx)-1, /* 0011 1111 */
40 Testx = Maskx ^ 0xFF, /* 1100 0000 */
41
42 SurrogateMin = 0xD800,
43 SurrogateMax = 0xDFFF,
44
45 Bad = Runeerror,
46 };
47
48 /*
49 * Modified by Wei-Hwa Huang, Google Inc., on 2004-09-24
50 * This is a slower but "safe" version of the old chartorune
51 * that works on strings that are not necessarily null-terminated.
52 *
53 * If you know for sure that your string is null-terminated,
54 * chartorune will be a bit faster.
55 *
56 * It is guaranteed not to attempt to access "length"
57 * past the incoming pointer. This is to avoid
58 * possible access violations. If the string appears to be
59 * well-formed but incomplete (i.e., to get the whole Rune
60 * we'd need to read past str+length) then we'll set the Rune
61 * to Bad and return 0.
62 *
63 * Note that if we have decoding problems for other
64 * reasons, we return 1 instead of 0.
65 */
66 int
charntorune(Rune * rune,const char * str,int length)67 charntorune(Rune *rune, const char *str, int length)
68 {
69 int c, c1, c2, c3;
70 long l;
71
72 /* When we're not allowed to read anything */
73 if(length <= 0) {
74 goto badlen;
75 }
76
77 /*
78 * one character sequence (7-bit value)
79 * 00000-0007F => T1
80 */
81 c = *(uchar*)str;
82 if(c < Tx) {
83 *rune = (Rune)c;
84 return 1;
85 }
86
87 // If we can't read more than one character we must stop
88 if(length <= 1) {
89 goto badlen;
90 }
91
92 /*
93 * two character sequence (11-bit value)
94 * 0080-07FF => T2 Tx
95 */
96 c1 = *(uchar*)(str+1) ^ Tx;
97 if(c1 & Testx)
98 goto bad;
99 if(c < T3) {
100 if(c < T2)
101 goto bad;
102 l = ((c << Bitx) | c1) & Rune2;
103 if(l <= Rune1)
104 goto bad;
105 *rune = (Rune)l;
106 return 2;
107 }
108
109 // If we can't read more than two characters we must stop
110 if(length <= 2) {
111 goto badlen;
112 }
113
114 /*
115 * three character sequence (16-bit value)
116 * 0800-FFFF => T3 Tx Tx
117 */
118 c2 = *(uchar*)(str+2) ^ Tx;
119 if(c2 & Testx)
120 goto bad;
121 if(c < T4) {
122 l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
123 if(l <= Rune2)
124 goto bad;
125 if (SurrogateMin <= l && l <= SurrogateMax)
126 goto bad;
127 *rune = (Rune)l;
128 return 3;
129 }
130
131 if (length <= 3)
132 goto badlen;
133
134 /*
135 * four character sequence (21-bit value)
136 * 10000-1FFFFF => T4 Tx Tx Tx
137 */
138 c3 = *(uchar*)(str+3) ^ Tx;
139 if (c3 & Testx)
140 goto bad;
141 if (c < T5) {
142 l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
143 if (l <= Rune3 || l > Runemax)
144 goto bad;
145 *rune = (Rune)l;
146 return 4;
147 }
148
149 // Support for 5-byte or longer UTF-8 would go here, but
150 // since we don't have that, we'll just fall through to bad.
151
152 /*
153 * bad decoding
154 */
155 bad:
156 *rune = Bad;
157 return 1;
158 badlen:
159 *rune = Bad;
160 return 0;
161
162 }
163
164
165 /*
166 * This is the older "unsafe" version, which works fine on
167 * null-terminated strings.
168 */
169 int
chartorune(Rune * rune,const char * str)170 chartorune(Rune *rune, const char *str)
171 {
172 int c, c1, c2, c3;
173 long l;
174
175 /*
176 * one character sequence
177 * 00000-0007F => T1
178 */
179 c = *(uchar*)str;
180 if(c < Tx) {
181 *rune = (Rune)c;
182 return 1;
183 }
184
185 /*
186 * two character sequence
187 * 0080-07FF => T2 Tx
188 */
189 c1 = *(uchar*)(str+1) ^ Tx;
190 if(c1 & Testx)
191 goto bad;
192 if(c < T3) {
193 if(c < T2)
194 goto bad;
195 l = ((c << Bitx) | c1) & Rune2;
196 if(l <= Rune1)
197 goto bad;
198 *rune = (Rune)l;
199 return 2;
200 }
201
202 /*
203 * three character sequence
204 * 0800-FFFF => T3 Tx Tx
205 */
206 c2 = *(uchar*)(str+2) ^ Tx;
207 if(c2 & Testx)
208 goto bad;
209 if(c < T4) {
210 l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
211 if(l <= Rune2)
212 goto bad;
213 if (SurrogateMin <= l && l <= SurrogateMax)
214 goto bad;
215 *rune = (Rune)l;
216 return 3;
217 }
218
219 /*
220 * four character sequence (21-bit value)
221 * 10000-1FFFFF => T4 Tx Tx Tx
222 */
223 c3 = *(uchar*)(str+3) ^ Tx;
224 if (c3 & Testx)
225 goto bad;
226 if (c < T5) {
227 l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
228 if (l <= Rune3 || l > Runemax)
229 goto bad;
230 *rune = (Rune)l;
231 return 4;
232 }
233
234 /*
235 * Support for 5-byte or longer UTF-8 would go here, but
236 * since we don't have that, we'll just fall through to bad.
237 */
238
239 /*
240 * bad decoding
241 */
242 bad:
243 *rune = Bad;
244 return 1;
245 }
246
247 int
isvalidcharntorune(const char * str,int length,Rune * rune,int * consumed)248 isvalidcharntorune(const char* str, int length, Rune* rune, int* consumed)
249 {
250 *consumed = charntorune(rune, str, length);
251 return *rune != Runeerror || *consumed == 3;
252 }
253
254 int
runetochar(char * str,const Rune * rune)255 runetochar(char *str, const Rune *rune)
256 {
257 /* Runes are signed, so convert to unsigned for range check. */
258 unsigned long c;
259
260 /*
261 * one character sequence
262 * 00000-0007F => 00-7F
263 */
264 c = *rune;
265 if(c <= Rune1) {
266 str[0] = (char)c;
267 return 1;
268 }
269
270 /*
271 * two character sequence
272 * 0080-07FF => T2 Tx
273 */
274 if(c <= Rune2) {
275 str[0] = (char)(T2 | (c >> 1*Bitx));
276 str[1] = (char)(Tx | (c & Maskx));
277 return 2;
278 }
279
280 /*
281 * If the Rune is out of range or a surrogate half, convert it to the error rune.
282 * Do this test here because the error rune encodes to three bytes.
283 * Doing it earlier would duplicate work, since an out of range
284 * Rune wouldn't have fit in one or two bytes.
285 */
286 if (c > Runemax)
287 c = Runeerror;
288 if (SurrogateMin <= c && c <= SurrogateMax)
289 c = Runeerror;
290
291 /*
292 * three character sequence
293 * 0800-FFFF => T3 Tx Tx
294 */
295 if (c <= Rune3) {
296 str[0] = (char)(T3 | (c >> 2*Bitx));
297 str[1] = (char)(Tx | ((c >> 1*Bitx) & Maskx));
298 str[2] = (char)(Tx | (c & Maskx));
299 return 3;
300 }
301
302 /*
303 * four character sequence (21-bit value)
304 * 10000-1FFFFF => T4 Tx Tx Tx
305 */
306 str[0] = (char)(T4 | (c >> 3*Bitx));
307 str[1] = (char)(Tx | ((c >> 2*Bitx) & Maskx));
308 str[2] = (char)(Tx | ((c >> 1*Bitx) & Maskx));
309 str[3] = (char)(Tx | (c & Maskx));
310 return 4;
311 }
312
313 int
runelen(Rune rune)314 runelen(Rune rune)
315 {
316 char str[10];
317
318 return runetochar(str, &rune);
319 }
320
321 int
runenlen(const Rune * r,int nrune)322 runenlen(const Rune *r, int nrune)
323 {
324 int nb, c;
325
326 nb = 0;
327 while(nrune--) {
328 c = (int)*r++;
329 if (c <= Rune1)
330 nb++;
331 else if (c <= Rune2)
332 nb += 2;
333 else if (c <= Rune3)
334 nb += 3;
335 else /* assert(c <= Rune4) */
336 nb += 4;
337 }
338 return nb;
339 }
340
341 int
fullrune(const char * str,int n)342 fullrune(const char *str, int n)
343 {
344 if (n > 0) {
345 int c = *(uchar*)str;
346 if (c < Tx)
347 return 1;
348 if (n > 1) {
349 if (c < T3)
350 return 1;
351 if (n > 2) {
352 if (c < T4 || n > 3)
353 return 1;
354 }
355 }
356 }
357 return 0;
358 }
359