1 /*
2 * The authors of this software are Rob Pike and Ken Thompson.
3 * Copyright (c) 2002 by Lucent Technologies.
4 * Permission to use, copy, modify, and distribute this software for any
5 * purpose without fee is hereby granted, provided that this entire notice
6 * is included in all copies of any software which is or includes a copy
7 * or modification of this software and in all copies of the supporting
8 * documentation for such software.
9 * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
10 * WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
11 * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
12 * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
13 */
14
15 #include <stdarg.h>
16 #include <string.h>
17
18 #include "util/utf.h"
19
20 namespace re2 {
21
22 enum
23 {
24 Bit1 = 7,
25 Bitx = 6,
26 Bit2 = 5,
27 Bit3 = 4,
28 Bit4 = 3,
29 Bit5 = 2,
30
31 T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */
32 Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */
33 T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */
34 T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */
35 T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */
36 T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */
37
38 Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */
39 Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */
40 Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */
41 Rune4 = (1<<(Bit4+3*Bitx))-1,
42 /* 0001 1111 1111 1111 1111 1111 */
43
44 Maskx = (1<<Bitx)-1, /* 0011 1111 */
45 Testx = Maskx ^ 0xFF, /* 1100 0000 */
46
47 Bad = Runeerror,
48 };
49
50 int
chartorune(Rune * rune,const char * str)51 chartorune(Rune *rune, const char *str)
52 {
53 int c, c1, c2, c3;
54 long l;
55
56 /*
57 * one character sequence
58 * 00000-0007F => T1
59 */
60 c = *(unsigned char*)str;
61 if(c < Tx) {
62 *rune = c;
63 return 1;
64 }
65
66 /*
67 * two character sequence
68 * 0080-07FF => T2 Tx
69 */
70 c1 = *(unsigned char*)(str+1) ^ Tx;
71 if(c1 & Testx)
72 goto bad;
73 if(c < T3) {
74 if(c < T2)
75 goto bad;
76 l = ((c << Bitx) | c1) & Rune2;
77 if(l <= Rune1)
78 goto bad;
79 *rune = l;
80 return 2;
81 }
82
83 /*
84 * three character sequence
85 * 0800-FFFF => T3 Tx Tx
86 */
87 c2 = *(unsigned char*)(str+2) ^ Tx;
88 if(c2 & Testx)
89 goto bad;
90 if(c < T4) {
91 l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
92 if(l <= Rune2)
93 goto bad;
94 *rune = l;
95 return 3;
96 }
97
98 /*
99 * four character sequence (21-bit value)
100 * 10000-1FFFFF => T4 Tx Tx Tx
101 */
102 c3 = *(unsigned char*)(str+3) ^ Tx;
103 if (c3 & Testx)
104 goto bad;
105 if (c < T5) {
106 l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
107 if (l <= Rune3)
108 goto bad;
109 *rune = l;
110 return 4;
111 }
112
113 /*
114 * Support for 5-byte or longer UTF-8 would go here, but
115 * since we don't have that, we'll just fall through to bad.
116 */
117
118 /*
119 * bad decoding
120 */
121 bad:
122 *rune = Bad;
123 return 1;
124 }
125
126 int
runetochar(char * str,const Rune * rune)127 runetochar(char *str, const Rune *rune)
128 {
129 /* Runes are signed, so convert to unsigned for range check. */
130 unsigned long c;
131
132 /*
133 * one character sequence
134 * 00000-0007F => 00-7F
135 */
136 c = *rune;
137 if(c <= Rune1) {
138 str[0] = static_cast<char>(c);
139 return 1;
140 }
141
142 /*
143 * two character sequence
144 * 0080-07FF => T2 Tx
145 */
146 if(c <= Rune2) {
147 str[0] = T2 | static_cast<char>(c >> 1*Bitx);
148 str[1] = Tx | (c & Maskx);
149 return 2;
150 }
151
152 /*
153 * If the Rune is out of range, convert it to the error rune.
154 * Do this test here because the error rune encodes to three bytes.
155 * Doing it earlier would duplicate work, since an out of range
156 * Rune wouldn't have fit in one or two bytes.
157 */
158 if (c > Runemax)
159 c = Runeerror;
160
161 /*
162 * three character sequence
163 * 0800-FFFF => T3 Tx Tx
164 */
165 if (c <= Rune3) {
166 str[0] = T3 | static_cast<char>(c >> 2*Bitx);
167 str[1] = Tx | ((c >> 1*Bitx) & Maskx);
168 str[2] = Tx | (c & Maskx);
169 return 3;
170 }
171
172 /*
173 * four character sequence (21-bit value)
174 * 10000-1FFFFF => T4 Tx Tx Tx
175 */
176 str[0] = T4 | static_cast<char>(c >> 3*Bitx);
177 str[1] = Tx | ((c >> 2*Bitx) & Maskx);
178 str[2] = Tx | ((c >> 1*Bitx) & Maskx);
179 str[3] = Tx | (c & Maskx);
180 return 4;
181 }
182
183 int
runelen(Rune rune)184 runelen(Rune rune)
185 {
186 char str[10];
187
188 return runetochar(str, &rune);
189 }
190
191 int
fullrune(const char * str,int n)192 fullrune(const char *str, int n)
193 {
194 if (n > 0) {
195 int c = *(unsigned char*)str;
196 if (c < Tx)
197 return 1;
198 if (n > 1) {
199 if (c < T3)
200 return 1;
201 if (n > 2) {
202 if (c < T4 || n > 3)
203 return 1;
204 }
205 }
206 }
207 return 0;
208 }
209
210
211 int
utflen(const char * s)212 utflen(const char *s)
213 {
214 int c;
215 long n;
216 Rune rune;
217
218 n = 0;
219 for(;;) {
220 c = *(unsigned char*)s;
221 if(c < Runeself) {
222 if(c == 0)
223 return n;
224 s++;
225 } else
226 s += chartorune(&rune, s);
227 n++;
228 }
229 return 0;
230 }
231
232 char*
utfrune(const char * s,Rune c)233 utfrune(const char *s, Rune c)
234 {
235 long c1;
236 Rune r;
237 int n;
238
239 if(c < Runesync) /* not part of utf sequence */
240 return strchr((char*)s, c);
241
242 for(;;) {
243 c1 = *(unsigned char*)s;
244 if(c1 < Runeself) { /* one byte rune */
245 if(c1 == 0)
246 return 0;
247 if(c1 == c)
248 return (char*)s;
249 s++;
250 continue;
251 }
252 n = chartorune(&r, s);
253 if(r == c)
254 return (char*)s;
255 s += n;
256 }
257 return 0;
258 }
259
260 } // namespace re2
261