• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "encodings/compact_lang_det/win/cld_utf8statetable.h"
6 
7 #include "base/basictypes.h"
8 
9 // Return true if current Tbl pointer is within state0 range
10 // Note that unsigned compare checks both ends of range simultaneously
InStateZero(const UTF8ScanObj * st,const uint8 * Tbl)11 static inline bool InStateZero(const UTF8ScanObj* st, const uint8* Tbl) {
12   const uint8* Tbl0 = &st->state_table[st->state0];
13   return (static_cast<uint32>(Tbl - Tbl0) < st->state0_size);
14 }
15 
16 
17 // Look up property of one UTF-8 character and advance over it
18 // Return 0 if input length is zero
19 // Return 0 and advance one byte if input is ill-formed
UTF8GenericProperty(const UTF8PropObj * st,const uint8 ** src,int * srclen)20 uint8 UTF8GenericProperty(const UTF8PropObj* st,
21                           const uint8** src,
22                           int* srclen) {
23   if (*srclen <= 0) {
24     return 0;
25   }
26 
27   const uint8* lsrc = *src;
28   const uint8* Tbl_0 = &st->state_table[st->state0];
29   const uint8* Tbl = Tbl_0;
30   int e;
31   int eshift = st->entry_shift;
32 
33   // Short series of tests faster than switch, optimizes 7-bit ASCII
34   unsigned char c = lsrc[0];
35   if (static_cast<signed char>(c) >= 0) {           // one byte
36     e = Tbl[c];
37     *src += 1;
38     *srclen -= 1;
39   } else if (((c & 0xe0) == 0xc0) && (*srclen >= 2)) {     // two bytes
40     e = Tbl[c];
41     Tbl = &Tbl_0[e << eshift];
42     e = Tbl[lsrc[1]];
43     *src += 2;
44     *srclen -= 2;
45   } else if (((c & 0xf0) == 0xe0) && (*srclen >= 3)) {     // three bytes
46     e = Tbl[c];
47     Tbl = &Tbl_0[e << eshift];
48     e = Tbl[lsrc[1]];
49     Tbl = &Tbl_0[e << eshift];
50     e = Tbl[lsrc[2]];
51     *src += 3;
52     *srclen -= 3;
53   }else if (((c & 0xf8) == 0xf0) && (*srclen >= 4)) {     // four bytes
54     e = Tbl[c];
55     Tbl = &Tbl_0[e << eshift];
56     e = Tbl[lsrc[1]];
57     Tbl = &Tbl_0[e << eshift];
58     e = Tbl[lsrc[2]];
59     Tbl = &Tbl_0[e << eshift];
60     e = Tbl[lsrc[3]];
61     *src += 4;
62     *srclen -= 4;
63   } else {                                                // Ill-formed
64     e = 0;
65     *src += 1;
66     *srclen -= 1;
67   }
68   return e;
69 }
70 
71 // BigOneByte versions are needed for tables > 240 states, but most
72 // won't need the TwoByte versions.
73 // Internally, to next-to-last offset is multiplied by 16 and the last
74 // offset is relative instead of absolute.
75 // Look up property of one UTF-8 character and advance over it
76 // Return 0 if input length is zero
77 // Return 0 and advance one byte if input is ill-formed
UTF8GenericPropertyBigOneByte(const UTF8PropObj * st,const uint8 ** src,int * srclen)78 uint8 UTF8GenericPropertyBigOneByte(const UTF8PropObj* st,
79                           const uint8** src,
80                           int* srclen) {
81   if (*srclen <= 0) {
82     return 0;
83   }
84 
85   const uint8* lsrc = *src;
86   const uint8* Tbl_0 = &st->state_table[st->state0];
87   const uint8* Tbl = Tbl_0;
88   int e;
89   int eshift = st->entry_shift;
90 
91   // Short series of tests faster than switch, optimizes 7-bit ASCII
92   unsigned char c = lsrc[0];
93   if (static_cast<signed char>(c) >= 0) {           // one byte
94     e = Tbl[c];
95     *src += 1;
96     *srclen -= 1;
97   } else if (((c & 0xe0) == 0xc0) && (*srclen >= 2)) {     // two bytes
98     e = Tbl[c];
99     Tbl = &Tbl_0[e << eshift];
100     e = Tbl[lsrc[1]];
101     *src += 2;
102     *srclen -= 2;
103   } else if (((c & 0xf0) == 0xe0) && (*srclen >= 3)) {     // three bytes
104     e = Tbl[c];
105     Tbl = &Tbl_0[e << (eshift + 4)];  // 16x the range
106     e = (reinterpret_cast<const int8*>(Tbl))[lsrc[1]];
107     Tbl = &Tbl[e << eshift];          // Relative +/-
108     e = Tbl[lsrc[2]];
109     *src += 3;
110     *srclen -= 3;
111   }else if (((c & 0xf8) == 0xf0) && (*srclen >= 4)) {     // four bytes
112     e = Tbl[c];
113     Tbl = &Tbl_0[e << eshift];
114     e = Tbl[lsrc[1]];
115     Tbl = &Tbl_0[e << (eshift + 4)];  // 16x the range
116     e = (reinterpret_cast<const int8*>(Tbl))[lsrc[2]];
117     Tbl = &Tbl[e << eshift];          // Relative +/-
118     e = Tbl[lsrc[3]];
119     *src += 4;
120     *srclen -= 4;
121   } else {                                                // Ill-formed
122     e = 0;
123     *src += 1;
124     *srclen -= 1;
125   }
126   return e;
127 }
128 
129 // Scan a UTF-8 stringpiece based on a state table.
130 // Always scan complete UTF-8 characters
131 // Set number of bytes scanned. Return reason for exiting
UTF8GenericScan(const UTF8ScanObj * st,const uint8 * str,const int len,int * bytes_consumed)132 int UTF8GenericScan(const UTF8ScanObj* st,
133                     const uint8* str,
134                     const int len,
135                     int* bytes_consumed) {
136   int eshift = st->entry_shift;        // 6 (space optimized) or 8
137   // int nEntries = (1 << eshift);       // 64 or 256 entries per state
138 
139   const uint8* isrc = str;
140     //reinterpret_cast<const uint8*>(str.data());
141   const uint8* src = isrc;
142   //const int len = str.length();
143   const uint8* srclimit = isrc + len;
144   const uint8* srclimit8 = srclimit - 7;
145   *bytes_consumed = 0;
146   if (len == 0) return kExitOK;
147 
148   const uint8* Tbl_0 = &st->state_table[st->state0];
149 
150 DoAgain:
151   // Do state-table scan
152   int e = 0;
153   uint8 c;
154 
155   // Do fast for groups of 8 identity bytes.
156   // This covers a lot of 7-bit ASCII ~8x faster then the 1-byte loop,
157   // including slowing slightly on cr/lf/ht
158   //----------------------------
159   const uint8* Tbl2 = &st->fast_state[0];
160   uint32 losub = st->losub;
161   uint32 hiadd = st->hiadd;
162   while (src < srclimit8) {
163     uint32 s0123 = UnalignedLoad32(src);
164     uint32 s4567 = UnalignedLoad32(src + 4);
165     src += 8;
166     // This is a fast range check for all bytes in [lowsub..0x80-hiadd)
167     uint32 temp = (s0123 - losub) | (s0123 + hiadd) |
168                   (s4567 - losub) | (s4567 + hiadd);
169     if ((temp & 0x80808080) != 0) {
170       // We typically end up here on cr/lf/ht; src was incremented
171       int e0123 = (Tbl2[src[-8]] | Tbl2[src[-7]]) |
172                   (Tbl2[src[-6]] | Tbl2[src[-5]]);
173       if (e0123 != 0) {src -= 8; break;}    // Exit on Non-interchange
174       e0123 = (Tbl2[src[-4]] | Tbl2[src[-3]]) |
175               (Tbl2[src[-2]] | Tbl2[src[-1]]);
176       if (e0123 != 0) {src -= 4; break;}    // Exit on Non-interchange
177       // Else OK, go around again
178     }
179   }
180   //----------------------------
181 
182   // Byte-at-a-time scan
183   //----------------------------
184   const uint8* Tbl = Tbl_0;
185   while (src < srclimit) {
186     c = *src;
187     e = Tbl[c];
188     src++;
189     if (e >= kExitIllegalStructure) {break;}
190     Tbl = &Tbl_0[e << eshift];
191   }
192   //----------------------------
193 
194 
195   // Exit posibilities:
196   //  Some exit code, !state0, back up over last char
197   //  Some exit code, state0, back up one byte exactly
198   //  source consumed, !state0, back up over partial char
199   //  source consumed, state0, exit OK
200   // For illegal byte in state0, avoid backup up over PREVIOUS char
201   // For truncated last char, back up to beginning of it
202 
203   if (e >= kExitIllegalStructure) {
204     // Back up over exactly one byte of rejected/illegal UTF-8 character
205     src--;
206     // Back up more if needed
207     if (!InStateZero(st, Tbl)) {
208       do {src--;} while ((src > isrc) && ((src[0] & 0xc0) == 0x80));
209     }
210   } else if (!InStateZero(st, Tbl)) {
211     // Back up over truncated UTF-8 character
212     e = kExitIllegalStructure;
213     do {src--;} while ((src > isrc) && ((src[0] & 0xc0) == 0x80));
214   } else {
215     // Normal termination, source fully consumed
216     e = kExitOK;
217   }
218 
219   if (e == kExitDoAgain) {
220     // Loop back up to the fast scan
221     goto DoAgain;
222   }
223 
224   *bytes_consumed = src - isrc;
225   return e;
226 }
227