1 // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "encodings/compact_lang_det/win/cld_utf8statetable.h"
6
7 #include "base/basictypes.h"
8
9 // Return true if current Tbl pointer is within state0 range
10 // Note that unsigned compare checks both ends of range simultaneously
InStateZero(const UTF8ScanObj * st,const uint8 * Tbl)11 static inline bool InStateZero(const UTF8ScanObj* st, const uint8* Tbl) {
12 const uint8* Tbl0 = &st->state_table[st->state0];
13 return (static_cast<uint32>(Tbl - Tbl0) < st->state0_size);
14 }
15
16
17 // Look up property of one UTF-8 character and advance over it
18 // Return 0 if input length is zero
19 // Return 0 and advance one byte if input is ill-formed
UTF8GenericProperty(const UTF8PropObj * st,const uint8 ** src,int * srclen)20 uint8 UTF8GenericProperty(const UTF8PropObj* st,
21 const uint8** src,
22 int* srclen) {
23 if (*srclen <= 0) {
24 return 0;
25 }
26
27 const uint8* lsrc = *src;
28 const uint8* Tbl_0 = &st->state_table[st->state0];
29 const uint8* Tbl = Tbl_0;
30 int e;
31 int eshift = st->entry_shift;
32
33 // Short series of tests faster than switch, optimizes 7-bit ASCII
34 unsigned char c = lsrc[0];
35 if (static_cast<signed char>(c) >= 0) { // one byte
36 e = Tbl[c];
37 *src += 1;
38 *srclen -= 1;
39 } else if (((c & 0xe0) == 0xc0) && (*srclen >= 2)) { // two bytes
40 e = Tbl[c];
41 Tbl = &Tbl_0[e << eshift];
42 e = Tbl[lsrc[1]];
43 *src += 2;
44 *srclen -= 2;
45 } else if (((c & 0xf0) == 0xe0) && (*srclen >= 3)) { // three bytes
46 e = Tbl[c];
47 Tbl = &Tbl_0[e << eshift];
48 e = Tbl[lsrc[1]];
49 Tbl = &Tbl_0[e << eshift];
50 e = Tbl[lsrc[2]];
51 *src += 3;
52 *srclen -= 3;
53 }else if (((c & 0xf8) == 0xf0) && (*srclen >= 4)) { // four bytes
54 e = Tbl[c];
55 Tbl = &Tbl_0[e << eshift];
56 e = Tbl[lsrc[1]];
57 Tbl = &Tbl_0[e << eshift];
58 e = Tbl[lsrc[2]];
59 Tbl = &Tbl_0[e << eshift];
60 e = Tbl[lsrc[3]];
61 *src += 4;
62 *srclen -= 4;
63 } else { // Ill-formed
64 e = 0;
65 *src += 1;
66 *srclen -= 1;
67 }
68 return e;
69 }
70
71 // BigOneByte versions are needed for tables > 240 states, but most
72 // won't need the TwoByte versions.
73 // Internally, to next-to-last offset is multiplied by 16 and the last
74 // offset is relative instead of absolute.
75 // Look up property of one UTF-8 character and advance over it
76 // Return 0 if input length is zero
77 // Return 0 and advance one byte if input is ill-formed
UTF8GenericPropertyBigOneByte(const UTF8PropObj * st,const uint8 ** src,int * srclen)78 uint8 UTF8GenericPropertyBigOneByte(const UTF8PropObj* st,
79 const uint8** src,
80 int* srclen) {
81 if (*srclen <= 0) {
82 return 0;
83 }
84
85 const uint8* lsrc = *src;
86 const uint8* Tbl_0 = &st->state_table[st->state0];
87 const uint8* Tbl = Tbl_0;
88 int e;
89 int eshift = st->entry_shift;
90
91 // Short series of tests faster than switch, optimizes 7-bit ASCII
92 unsigned char c = lsrc[0];
93 if (static_cast<signed char>(c) >= 0) { // one byte
94 e = Tbl[c];
95 *src += 1;
96 *srclen -= 1;
97 } else if (((c & 0xe0) == 0xc0) && (*srclen >= 2)) { // two bytes
98 e = Tbl[c];
99 Tbl = &Tbl_0[e << eshift];
100 e = Tbl[lsrc[1]];
101 *src += 2;
102 *srclen -= 2;
103 } else if (((c & 0xf0) == 0xe0) && (*srclen >= 3)) { // three bytes
104 e = Tbl[c];
105 Tbl = &Tbl_0[e << (eshift + 4)]; // 16x the range
106 e = (reinterpret_cast<const int8*>(Tbl))[lsrc[1]];
107 Tbl = &Tbl[e << eshift]; // Relative +/-
108 e = Tbl[lsrc[2]];
109 *src += 3;
110 *srclen -= 3;
111 }else if (((c & 0xf8) == 0xf0) && (*srclen >= 4)) { // four bytes
112 e = Tbl[c];
113 Tbl = &Tbl_0[e << eshift];
114 e = Tbl[lsrc[1]];
115 Tbl = &Tbl_0[e << (eshift + 4)]; // 16x the range
116 e = (reinterpret_cast<const int8*>(Tbl))[lsrc[2]];
117 Tbl = &Tbl[e << eshift]; // Relative +/-
118 e = Tbl[lsrc[3]];
119 *src += 4;
120 *srclen -= 4;
121 } else { // Ill-formed
122 e = 0;
123 *src += 1;
124 *srclen -= 1;
125 }
126 return e;
127 }
128
129 // Scan a UTF-8 stringpiece based on a state table.
130 // Always scan complete UTF-8 characters
131 // Set number of bytes scanned. Return reason for exiting
UTF8GenericScan(const UTF8ScanObj * st,const uint8 * str,const int len,int * bytes_consumed)132 int UTF8GenericScan(const UTF8ScanObj* st,
133 const uint8* str,
134 const int len,
135 int* bytes_consumed) {
136 int eshift = st->entry_shift; // 6 (space optimized) or 8
137 // int nEntries = (1 << eshift); // 64 or 256 entries per state
138
139 const uint8* isrc = str;
140 //reinterpret_cast<const uint8*>(str.data());
141 const uint8* src = isrc;
142 //const int len = str.length();
143 const uint8* srclimit = isrc + len;
144 const uint8* srclimit8 = srclimit - 7;
145 *bytes_consumed = 0;
146 if (len == 0) return kExitOK;
147
148 const uint8* Tbl_0 = &st->state_table[st->state0];
149
150 DoAgain:
151 // Do state-table scan
152 int e = 0;
153 uint8 c;
154
155 // Do fast for groups of 8 identity bytes.
156 // This covers a lot of 7-bit ASCII ~8x faster then the 1-byte loop,
157 // including slowing slightly on cr/lf/ht
158 //----------------------------
159 const uint8* Tbl2 = &st->fast_state[0];
160 uint32 losub = st->losub;
161 uint32 hiadd = st->hiadd;
162 while (src < srclimit8) {
163 uint32 s0123 = UnalignedLoad32(src);
164 uint32 s4567 = UnalignedLoad32(src + 4);
165 src += 8;
166 // This is a fast range check for all bytes in [lowsub..0x80-hiadd)
167 uint32 temp = (s0123 - losub) | (s0123 + hiadd) |
168 (s4567 - losub) | (s4567 + hiadd);
169 if ((temp & 0x80808080) != 0) {
170 // We typically end up here on cr/lf/ht; src was incremented
171 int e0123 = (Tbl2[src[-8]] | Tbl2[src[-7]]) |
172 (Tbl2[src[-6]] | Tbl2[src[-5]]);
173 if (e0123 != 0) {src -= 8; break;} // Exit on Non-interchange
174 e0123 = (Tbl2[src[-4]] | Tbl2[src[-3]]) |
175 (Tbl2[src[-2]] | Tbl2[src[-1]]);
176 if (e0123 != 0) {src -= 4; break;} // Exit on Non-interchange
177 // Else OK, go around again
178 }
179 }
180 //----------------------------
181
182 // Byte-at-a-time scan
183 //----------------------------
184 const uint8* Tbl = Tbl_0;
185 while (src < srclimit) {
186 c = *src;
187 e = Tbl[c];
188 src++;
189 if (e >= kExitIllegalStructure) {break;}
190 Tbl = &Tbl_0[e << eshift];
191 }
192 //----------------------------
193
194
195 // Exit posibilities:
196 // Some exit code, !state0, back up over last char
197 // Some exit code, state0, back up one byte exactly
198 // source consumed, !state0, back up over partial char
199 // source consumed, state0, exit OK
200 // For illegal byte in state0, avoid backup up over PREVIOUS char
201 // For truncated last char, back up to beginning of it
202
203 if (e >= kExitIllegalStructure) {
204 // Back up over exactly one byte of rejected/illegal UTF-8 character
205 src--;
206 // Back up more if needed
207 if (!InStateZero(st, Tbl)) {
208 do {src--;} while ((src > isrc) && ((src[0] & 0xc0) == 0x80));
209 }
210 } else if (!InStateZero(st, Tbl)) {
211 // Back up over truncated UTF-8 character
212 e = kExitIllegalStructure;
213 do {src--;} while ((src > isrc) && ((src[0] & 0xc0) == 0x80));
214 } else {
215 // Normal termination, source fully consumed
216 e = kExitOK;
217 }
218
219 if (e == kExitDoAgain) {
220 // Loop back up to the fast scan
221 goto DoAgain;
222 }
223
224 *bytes_consumed = src - isrc;
225 return e;
226 }
227