1 // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "encodings/compact_lang_det/getonescriptspan.h"
6 #include <stdio.h>
7 #include <string.h>
8
9 #include "base/basictypes.h"
10 #include "encodings/lang_enc.h"
11 #include "encodings/compact_lang_det/utf8propjustletter.h"
12 #include "encodings/compact_lang_det/utf8propletterscriptnum.h"
13 #include "encodings/compact_lang_det/utf8scannotjustletterspecial.h"
14
15 #include "encodings/compact_lang_det/win/cld_basictypes.h"
16 #include "encodings/compact_lang_det/win/cld_commandlineflags.h"
17 #include "encodings/compact_lang_det/win/cld_google.h"
18 #include "encodings/compact_lang_det/win/cld_htmlutils.h"
19 #include "encodings/compact_lang_det/win/cld_unilib.h"
20 #include "encodings/compact_lang_det/win/cld_utf8statetable.h"
21 #include "encodings/compact_lang_det/win/cld_utf8utils.h"
22
23 static const Language GRAY_LANG = (Language)254;
24
25 static const int kMaxUpToWordBoundary = 50; // span < this make longer,
26 // else make shorter
27 static const int kMaxAdvanceToWordBoundary = 10; // +/- this many bytes
28 // to round to word boundary,
29 // direction above
30
// Byte-indexed flag table: 1 for '&' (0x26), '<' (0x3c) and '>' (0x3e),
// 0 for every other byte value.
static const char kSpecialSymbol[256] = {
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // 0x00
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // 0x10
  0,0,0,0,0,0,1,0, 0,0,0,0,0,0,0,0,  // 0x20: &
  0,0,0,0,0,0,0,0, 0,0,0,0,1,0,1,0,  // 0x30: < >
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // 0x40
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // 0x50
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // 0x60
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // 0x70

  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // 0x80
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // 0x90
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // 0xa0
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // 0xb0
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // 0xc0
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // 0xd0
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // 0xe0
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // 0xf0
};
42
43
44
45 #define LT 0 // <
46 #define GT 1 // >
47 #define EX 2 // !
48 #define HY 3 // -
49 #define QU 4 // "
50 #define AP 5 // '
51 #define SL 6 // /
52 #define S_ 7
53 #define C_ 8
54 #define R_ 9
55 #define I_ 10
56 #define P_ 11
57 #define T_ 12
58 #define Y_ 13
59 #define L_ 14
60 #define E_ 15
61 #define CR 16 // <cr> or <lf>
62 #define NL 17 // non-letter: ASCII whitespace, digit, punctuation
63 #define PL 18 // possible letter, incl. &
64 #define xx 19 // <unused>
65
66 // Map byte to one of ~20 interesting categories for cheap tag parsing
67 static const uint8 kCharToSub[256] = {
68 NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,CR,NL, NL,CR,NL,NL,
69 NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
70 NL,EX,QU,NL, NL,NL,PL,AP, NL,NL,NL,NL, NL,HY,NL,SL,
71 NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, LT,NL,GT,NL,
72
73 PL,PL,PL,C_, PL,E_,PL,PL, PL,I_,PL,PL, L_,PL,PL,PL,
74 P_,PL,R_,S_, T_,PL,PL,PL, PL,Y_,PL,NL, NL,NL,NL,NL,
75 PL,PL,PL,C_, PL,E_,PL,PL, PL,I_,PL,PL, L_,PL,PL,PL,
76 P_,PL,R_,S_, T_,PL,PL,PL, PL,Y_,PL,NL, NL,NL,NL,NL,
77
78 NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
79 NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
80 NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
81 NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
82
83 PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,
84 PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,
85 PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,
86 PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,
87 };
88
89 #undef LT
90 #undef GT
91 #undef EX
92 #undef HY
93 #undef QU
94 #undef AP
95 #undef SL
96 #undef S_
97 #undef C_
98 #undef R_
99 #undef I_
100 #undef P_
101 #undef T_
102 #undef Y_
103 #undef L_
104 #undef E_
105 #undef CR
106 #undef NL
107 #undef PL
108 #undef xx
109
110
111 #define OK 0
112 #define X_ 1
113
114 // State machine to do cheap parse of non-letter strings incl. tags
115 // advances <tag>
116 // | |
117 // advances <tag> ... </tag> for <script> <style>
118 // | |
119 // advances <!-- ... <tag> ... -->
120 // | |
121 // advances <tag
122 // || (0)
123 // advances <tag <tag2>
124 // || (0)
125 static const uint8 kTagParseTbl_0[] = {
126 // < > ! - " ' / S C R I P T Y L E CR NL PL xx
127 3, 2, 2, 2, 2, 2, 2,OK, OK,OK,OK,OK, OK,OK,OK,OK, 2, 2,OK,X_, // [0] OK
128 X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, // [1] error
129 3, 2, 2, 2, 2, 2, 2,OK, OK,OK,OK,OK, OK,OK,OK,OK, 2, 2,OK,X_, // [2] NL*
130 X_, 2, 4, 9, 10,11, 9,13, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [3] <
131 X_, 2, 9, 5, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [4] <!
132 X_, 2, 9, 6, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [5] <!-
133 6, 6, 6, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,X_, // [6] <!--.*
134 6, 6, 6, 8, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,X_, // [7] <!--.*-
135 6, 2, 6, 8, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,X_, // [8] <!--.*--
136 X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [9] <.*
137 10,10,10,10, 9,10,10,10, 10,10,10,10, 10,10,10,10, 12,10,10,X_, // [10] <.*"
138 11,11,11,11, 11, 9,11,11, 11,11,11,11, 11,11,11,11, 12,11,11,X_, // [11] <.*'
139 X_, 2,12,12, 12,12,12,12, 12,12,12,12, 12,12,12,12, 12,12,12,X_, // [12] <.* no " '
140
141 // < > ! - " ' / S C R I P T Y L E CR NL PL xx
142 X_, 2, 9, 9, 10,11, 9, 9, 14, 9, 9, 9, 28, 9, 9, 9, 9, 9, 9,X_, // [13] <S
143 X_, 2, 9, 9, 10,11, 9, 9, 9,15, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [14] <SC
144 X_, 2, 9, 9, 10,11, 9, 9, 9, 9,16, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [15] <SCR
145 X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9,17, 9, 9, 9, 9, 9, 9, 9,X_, // [16] <SCRI
146 X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 18, 9, 9, 9, 9, 9, 9,X_, // [17] <SCRIP
147 X_,19, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 19,19, 9,X_, // [18] <SCRIPT
148 20,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [19] <SCRIPT .*
149 19,19,19,19, 19,19,21,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [20] <SCRIPT .*<
150 19,19,19,19, 19,19,19,22, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [21] <SCRIPT .*</
151 19,19,19,19, 19,19,19,19, 23,19,19,19, 19,19,19,19, 19,19,19,X_, // [22] <SCRIPT .*</S
152 19,19,19,19, 19,19,19,19, 19,24,19,19, 19,19,19,19, 19,19,19,X_, // [23] <SCRIPT .*</SC
153 19,19,19,19, 19,19,19,19, 19,19,25,19, 19,19,19,19, 19,19,19,X_, // [24] <SCRIPT .*</SCR
154 19,19,19,19, 19,19,19,19, 19,19,19,26, 19,19,19,19, 19,19,19,X_, // [25] <SCRIPT .*</SCRI
155 19,19,19,19, 19,19,19,19, 19,19,19,19, 27,19,19,19, 19,19,19,X_, // [26] <SCRIPT .*</SCRIP
156 19, 2,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [27] <SCRIPT .*</SCRIPT
157
158 // < > ! - " ' / S C R I P T Y L E CR NL PL xx
159 X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9,29, 9, 9, 9, 9, 9,X_, // [28] <ST
160 X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9,30, 9, 9, 9, 9,X_, // [29] <STY
161 X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9,31, 9, 9, 9,X_, // [30] <STYL
162 X_,32, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 32,32, 9,X_, // [31] <STYLE
163 33,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [32] <STYLE .*
164 32,32,32,32, 32,32,34,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [33] <STYLE .*<
165 32,32,32,32, 32,32,32,35, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [34] <STYLE .*</
166 32,32,32,32, 32,32,32,32, 32,32,32,32, 36,32,32,32, 32,32,32,X_, // [35] <STYLE .*</S
167 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,37,32,32, 32,32,32,X_, // [36] <STYLE .*</ST
168 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,38,32, 32,32,32,X_, // [37] <STYLE .*</STY
169 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,39, 32,32,32,X_, // [38] <STYLE .*</STYL
170 32, 2,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [39] <STYLE .*</STYLE
171 };
172
173 #undef OK
174 #undef X_
175
176
177 /*
178 // Convert GetTimeOfDay output to 64-bit usec
179 static inline uint64 Microseconds(const struct timeval& t) {
180 // The SumReducer uses uint64, so convert to (uint64) microseconds,
181 // not (double) seconds.
182 return t.tv_sec * 1000000ULL + t.tv_usec;
183 }
184 */
185
186
187 // Returns true if character is < > or &
IsSpecial(char c)188 bool inline IsSpecial(char c) {
189 if ((c & 0xe0) == 0x20) {
190 return kSpecialSymbol[static_cast<uint8>(c)];
191 }
192 return false;
193 }
194
195 // Quick Skip to next letter or < > & or to end of string (eos)
196 // Always return is_letter for eos
ScanToLetterOrSpecial(const char * src,int len)197 int ScanToLetterOrSpecial(const char* src, int len) {
198 int bytes_consumed;
199 cld::UTF8GenericScan(&utf8scannotjustletterspecial_obj, src, len,
200 &bytes_consumed);
201 return bytes_consumed;
202 }
203
204
205
206 // src points to non-letter, such as tag-opening '<'
207 // Return length from here to next possible letter
208 // On eos or another < before >, return 1
209 // advances <tag>
210 // | |
211 // advances <tag> ... </tag> for <script> <style>
212 // | |
213 // advances <!-- ... <tag> ... -->
214 // | |
215 // advances <tag
216 // || (1)
217 // advances <tag <tag2>
218 // || (1)
ScanToPossibleLetter(const char * isrc,int len)219 int ScanToPossibleLetter(const char* isrc, int len) {
220 const uint8* src = reinterpret_cast<const uint8*>(isrc);
221 const uint8* srclimit = src + len;
222 const uint8* tagParseTbl = kTagParseTbl_0;
223 int e = 0;
224 while (src < srclimit) {
225 e = tagParseTbl[kCharToSub[*src++]];
226 if ((e & ~1) == 0) {
227 // We overshot by one byte
228 --src;
229 break;
230 }
231 tagParseTbl = &kTagParseTbl_0[e * 20];
232 }
233
234 if (src >= srclimit) {
235 // We fell off the end of the text.
236 // It looks like the most common case for this is a truncated file, not
237 // mismatched angle brackets. So we pretend that the last char was '>'
238 return len;
239 }
240
241 // OK to be in state 0 or state 2 at exit
242 if ((e != 0) && (e != 2)) {
243 // Error, '<' followed by '<'
244 // We want to back up to first <, then advance by one byte past it
245 int offset = src - reinterpret_cast<const uint8*>(isrc);
246 // printf("ScanToPossibleLetter error at %d[%d] in '%s'\n",offset,e,isrc);
247
248 // Backscan to first '<' and return enough length to just get past it
249 --offset; // back up over the second '<', which caused us to stop
250 while ((0 < offset) && (isrc[offset] != '<')) {
251 // Find the first '<', which is unmatched
252 --offset;
253 }
254 // skip to just beyond first '<'
255 // printf(" returning %d\n", offset + 1);
256 return offset + 1;
257 }
258
259 return src - reinterpret_cast<const uint8*>(isrc);
260 }
261
262
263
ScriptScanner(const char * buffer,int buffer_length,bool is_plain_text)264 ScriptScanner::ScriptScanner(const char* buffer,
265 int buffer_length,
266 bool is_plain_text)
267 : start_byte_(buffer),
268 next_byte_(buffer),
269 next_byte_limit_(buffer + buffer_length),
270 byte_length_(buffer_length),
271 is_plain_text_(is_plain_text) {
272 script_buffer_ = new char[getone::kMaxScriptBuffer];
273 script_buffer_lower_ = new char[getone::kMaxScriptLowerBuffer];
274 }
275
~ScriptScanner()276 ScriptScanner::~ScriptScanner() {
277 delete[] script_buffer_;
278 delete[] script_buffer_lower_;
279 }
280
281
282
283
284 // Get to the first real non-tag letter or entity that is a letter
285 // Sets script of that letter
286 // Return len if no more letters
SkipToFrontOfSpan(const char * src,int len,int * script)287 int ScriptScanner::SkipToFrontOfSpan(const char* src, int len, int* script) {
288 int sc = UNKNOWN_LSCRIPT;
289 int skip = 0;
290 int tlen, plen;
291
292 // Do run of non-letters (tag | &NL | NL)*
293 while (skip < len) {
294 // Do fast scan to next interesting byte
295 // int oldskip = skip;
296 skip += ScanToLetterOrSpecial(src + skip, len - skip);
297 // TEMP
298 // printf("ScanToLetterOrSpecial[%d] 0x%02x => [%d] 0x%02x\n",
299 // oldskip, src[oldskip], skip, src[skip]);
300
301 // Check for no more letters/specials
302 if (skip >= len) {
303 // All done
304 return len;
305 }
306
307 // We are at a letter, nonletter, tag, or entity
308 if (IsSpecial(src[skip]) && !is_plain_text_) {
309 if (src[skip] == '<') {
310 // Begining of tag; skip to end and go around again
311 tlen = ScanToPossibleLetter(src + skip, len - skip);
312 sc = 0;
313 // printf("<...> ");
314 } else if (src[skip] == '>') {
315 // Unexpected end of tag; skip it and go around again
316 tlen = 1; // Over the >
317 sc = 0;
318 // printf("..> ");
319 } else if (src[skip] == '&') {
320 // Expand entity, no advance
321 char temp[4];
322 EntityToBuffer(src + skip, len - skip,
323 temp, &tlen, &plen);
324 sc = getone::GetUTF8LetterScriptNum(temp);
325 // printf("#(%02x%02x)=%d ", temp[0], temp[1], sc);
326 }
327 } else {
328 // Update 1..4 bytes
329 tlen = cld_UniLib::OneCharLen(src + skip);
330 sc = getone::GetUTF8LetterScriptNum(src + skip);
331 // printf("#(%02x%02x)=%d ", src[skip], src[skip+1], sc);
332 }
333 // TEMP
334 // printf("sc=%d ", sc);
335 if (sc != 0) {break;} // Letter found
336 skip += tlen; // Advance
337 }
338
339 *script = sc;
340 return skip;
341 }
342
// Build-time switch: true when NEED_ALIGNED_LOADS is defined. When true,
// GetOneScriptSpan does not use its 4-byte fast copy.
#if defined(NEED_ALIGNED_LOADS)
static const bool kNeedsAlignedLoads = true;
#else
static const bool kNeedsAlignedLoads = false;
#endif
348
349
350 // Copy next run of same-script non-tag letters to buffer [NUL terminated]
351 // Buffer has leading space and all text is lowercased
GetOneScriptSpan(getone::LangSpan * span)352 bool ScriptScanner::GetOneScriptSpan(getone::LangSpan* span) {
353 span->text = script_buffer_;
354 span->text_bytes = 0;
355 span->offset = next_byte_ - start_byte_;
356 span->script = UNKNOWN_LSCRIPT;
357 span->lang = UNKNOWN_LANGUAGE;
358 span->truncated = false;
359
360 // printf("GetOneScriptSpan[[ ");
361 // struct timeval script_start, script_mid, script_end;
362
363 int spanscript; // The script of this span
364 int sc = UNKNOWN_LSCRIPT; // The script of next character
365 int tlen, plen;
366
367
368 script_buffer_[0] = ' '; // Always a space at front of output
369 script_buffer_[1] = '\0';
370 int take = 0;
371 int put = 1; // Start after the initial space
372
373 // gettimeofday(&script_start, NULL);
374 // Get to the first real non-tag letter or entity that is a letter
375 int skip = SkipToFrontOfSpan(next_byte_, byte_length_, &spanscript);
376 next_byte_ += skip;
377 byte_length_ -= skip;
378 if (byte_length_ <= 0) {
379 // printf("]]\n");
380 return false; // No more letters to be found
381 }
382
383 // gettimeofday(&script_mid, NULL);
384
385 // There is at least one letter, so we know the script for this span
386 // printf("{%d} ", spanscript);
387 span->script = (UnicodeLScript)spanscript;
388
389
390 // Go over alternating spans of same-script letters and non-letters,
391 // copying letters to buffer with single spaces for each run of non-letters
392 while (take < byte_length_) {
393 // Copy run of letters in same script (&LS | LS)*
394 int letter_count = 0; // Keep track of word length
395 bool need_break = false;
396 while (take < byte_length_) {
397 // We are at a letter, nonletter, tag, or entity
398 if (IsSpecial(next_byte_[take]) && !is_plain_text_) {
399 // printf("\"%c\" ", next_byte_[take]);
400 if (next_byte_[take] == '<') {
401 // Begining of tag
402 sc = 0;
403 break;
404 } else if (next_byte_[take] == '>') {
405 // Unexpected end of tag
406 sc = 0;
407 break;
408 } else if (next_byte_[take] == '&') {
409 // Copy entity, no advance
410 EntityToBuffer(next_byte_ + take, byte_length_ - take,
411 script_buffer_ + put, &tlen, &plen);
412 sc = getone::GetUTF8LetterScriptNum(script_buffer_ + put);
413 }
414 } else {
415 // Real letter, safely copy up to 4 bytes, increment by 1..4
416 // Will update by 1..4 bytes at Advance, below
417 tlen = plen = cld_UniLib::OneCharLen(next_byte_ + take);
418 if (!kNeedsAlignedLoads && (take < (byte_length_ - 3))) {
419 // Fast case
420 *reinterpret_cast<uint32*>(script_buffer_ + put) =
421 *reinterpret_cast<const uint32*>(next_byte_ + take);
422 } else {
423 // Slow case, happens 1-3 times per input document
424 memcpy(script_buffer_ + put, next_byte_ + take, plen);
425 }
426 sc = getone::GetUTF8LetterScriptNum(next_byte_ + take);
427 }
428 // printf("sc(%c)=%d ", next_byte_[take], sc);
429 // char xtmp[8]; memcpy(xtmp,script_buffer_ + put, plen);
430 // xtmp[plen] = '\0'; printf("'%s'{%d} ", xtmp, sc);
431
432 // Allow continue across a single letter in a different script:
433 // A B D = three scripts, c = common script, i = inherited script,
434 // - = don't care, ( = take position before the += below
435 // AAA(A- continue
436 //
437 // AAA(BA continue
438 // AAA(BB break
439 // AAA(Bc continue (breaks after B)
440 // AAA(BD break
441 // AAA(Bi break
442 //
443 // AAA(c- break
444 //
445 // AAA(i- continue
446 //
447
448 if ((sc != spanscript) && (sc != ULScript_Inherited)) {
449 // Might need to break this script span
450 if (sc == ULScript_Common) {
451 need_break = true;
452 } else {
453 // Look at next following character, ignoring entity as Common
454 int sc2 = getone::GetUTF8LetterScriptNum(next_byte_ + take + tlen);
455 if ((sc2 != ULScript_Common) && (sc2 != spanscript)) {
456 need_break = true;
457 }
458 }
459 }
460 if (need_break) {break;} // Non-letter or letter in wrong script
461
462 take += tlen; // Advance
463 put += plen; // Advance
464 ++letter_count;
465 if (put >= getone::kMaxScriptBytes) {
466 // Buffer is full
467 span->truncated = true;
468 break;
469 }
470 } // End while letters
471
472 // Do run of non-letters (tag | &NL | NL)*
473 while (take < byte_length_) {
474 // Do fast scan to next interesting byte
475 take += ScanToLetterOrSpecial(next_byte_ + take, byte_length_ - take);
476
477 // Check for no more letters/specials
478 if (take >= byte_length_) {
479 take = byte_length_;
480 break;
481 }
482
483 // We are at a letter, nonletter, tag, or entity
484 if (IsSpecial(next_byte_[take]) && !is_plain_text_) {
485 // printf("\"%c\" ", next_byte_[take]);
486 if (next_byte_[take] == '<') {
487 // Begining of tag; skip to end and go around again
488 tlen = ScanToPossibleLetter(next_byte_ + take, byte_length_ - take);
489 sc = 0;
490 // printf("<...> ");
491 } else if (next_byte_[take] == '>') {
492 // Unexpected end of tag; skip it and go around again
493 tlen = 1; // Over the >
494 sc = 0;
495 // printf("..> ");
496 } else if (next_byte_[take] == '&') {
497 // Expand entity, no advance
498 EntityToBuffer(next_byte_ + take, byte_length_ - take,
499 script_buffer_ + put, &tlen, &plen);
500 sc = getone::GetUTF8LetterScriptNum(script_buffer_ + put);
501 }
502 } else {
503 // Update 1..4
504 tlen = cld_UniLib::OneCharLen(next_byte_ + take);
505 sc = getone::GetUTF8LetterScriptNum(next_byte_ + take);
506 }
507 // printf("sc[%c]=%d ", next_byte_[take], sc);
508 if (sc != 0) {break;} // Letter found
509 take += tlen; // Advance
510 } // End while not-letters
511
512 script_buffer_[put++] = ' ';
513
514 // We are at a letter again (or eos), after letter* not-letter*
515 if (sc != spanscript) {break;} // Letter in wrong script
516 if (put >= getone::kMaxScriptBytes - 8) {
517 // Buffer is almost full
518 span->truncated = true;
519 break;
520 }
521 }
522
523 // Update input position
524 next_byte_ += take;
525 byte_length_ -= take;
526
527 // Put four more spaces/NUL. Worst case is abcd _ _ _ \0
528 // kMaxScriptBytes | | put
529 script_buffer_[put + 0] = ' ';
530 script_buffer_[put + 1] = ' ';
531 script_buffer_[put + 2] = ' ';
532 script_buffer_[put + 3] = '\0';
533
534 span->text_bytes = put; // Does not include the last four chars above
535
536 // printf(" %d]]\n\n", put);
537 return true;
538 }
539
540 // Force Latin, Cyrillic, Greek scripts to be lowercase
LowerScriptSpan(getone::LangSpan * span)541 void ScriptScanner::LowerScriptSpan(getone::LangSpan* span) {
542 // On Windows, text is lowercased beforehand, so no need to do anything here.
543 #if !defined(CLD_WINDOWS)
544 // If needed, lowercase all the text. If we do it sooner, might miss
545 // lowercasing an entity such as Á
546 // We only need to do this for Latn and Cyrl scripts
547 if ((span->script == ULScript_Latin) ||
548 (span->script == ULScript_Cyrillic) ||
549 (span->script == ULScript_Greek)) {
550 // Full Unicode lowercase of the entire buffer, including
551 // four pad bytes off the end
552 int consumed, filled;
553 UniLib::ToLower(span->text, span->text_bytes + 4,
554 script_buffer_lower_, getone::kMaxScriptLowerBuffer,
555 &consumed, &filled);
556 span->text = script_buffer_lower_;
557 span->text_bytes = filled - 4;
558 }
559 #endif
560 }
561
562 // Copy next run of same-script non-tag letters to buffer [NUL terminated]
563 // Force Latin and Cyrillic scripts to be lowercase
GetOneScriptSpanLower(getone::LangSpan * span)564 bool ScriptScanner::GetOneScriptSpanLower(getone::LangSpan* span) {
565 bool ok = GetOneScriptSpan(span);
566 LowerScriptSpan(span);
567 return ok;
568 }
569
570 // Gets lscript number for letters; always returns
571 // 0 (common script) for non-letters
GetUTF8LetterScriptNum(const char * src)572 int getone::GetUTF8LetterScriptNum(const char* src) {
573 int srclen = cld_UniLib::OneCharLen(src);
574 const uint8* usrc = reinterpret_cast<const uint8*>(src);
575 return UTF8GenericProperty(&utf8propletterscriptnum_obj, &usrc, &srclen);
576 }
577