• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "encodings/compact_lang_det/getonescriptspan.h"
6 #include <stdio.h>
7 #include <string.h>
8 
9 #include "base/basictypes.h"
10 #include "encodings/lang_enc.h"
11 #include "encodings/compact_lang_det/utf8propjustletter.h"
12 #include "encodings/compact_lang_det/utf8propletterscriptnum.h"
13 #include "encodings/compact_lang_det/utf8scannotjustletterspecial.h"
14 
15 #include "encodings/compact_lang_det/win/cld_basictypes.h"
16 #include "encodings/compact_lang_det/win/cld_commandlineflags.h"
17 #include "encodings/compact_lang_det/win/cld_google.h"
18 #include "encodings/compact_lang_det/win/cld_htmlutils.h"
19 #include "encodings/compact_lang_det/win/cld_unilib.h"
20 #include "encodings/compact_lang_det/win/cld_utf8statetable.h"
21 #include "encodings/compact_lang_det/win/cld_utf8utils.h"
22 
23 static const Language GRAY_LANG = (Language)254;
24 
25 static const int kMaxUpToWordBoundary = 50;       // span < this make longer,
26                                                   // else make shorter
27 static const int kMaxAdvanceToWordBoundary = 10;  // +/- this many bytes
28                                                   // to round to word boundary,
29                                                   // direction above
30 
// Lookup table indexed by byte value: 1 for the three HTML-significant
// characters '&' (0x26), '<' (0x3c) and '>' (0x3e), 0 for every other byte.
static const char kSpecialSymbol[256] = {
  0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,   // 0x00
  0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,   // 0x10
  0,0,0,0, 0,0,1,0, 0,0,0,0, 0,0,0,0,   // 0x20  &
  0,0,0,0, 0,0,0,0, 0,0,0,0, 1,0,1,0,   // 0x30  < >
  0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,   // 0x40
  0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,   // 0x50
  0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,   // 0x60
  0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,   // 0x70

  0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,   // 0x80
  0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,   // 0x90
  0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,   // 0xa0
  0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,   // 0xb0
  0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,   // 0xc0
  0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,   // 0xd0
  0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,   // 0xe0
  0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,   // 0xf0
};
42 
43 
44 
// Byte categories.  Each value is a column index into the rows of the
// tag-parse state table (kTagParseTbl_0).  The macros are only needed while
// kCharToSub is being spelled out and are #undef'd immediately after it.
45 #define LT 0      // <
46 #define GT 1      // >
47 #define EX 2      // !
48 #define HY 3      // -
49 #define QU 4      // "
50 #define AP 5      // '
51 #define SL 6      // /
52 #define S_ 7      // S or s
53 #define C_ 8      // C or c
54 #define R_ 9      // R or r
55 #define I_ 10     // I or i
56 #define P_ 11     // P or p
57 #define T_ 12     // T or t
58 #define Y_ 13     // Y or y
59 #define L_ 14     // L or l
60 #define E_ 15     // E or e
61 #define CR 16     // <cr> or <lf>
62 #define NL 17     // non-letter: ASCII whitespace, digit, punctuation
63 #define PL 18     // possible letter, incl. &
64 #define xx 19     // <unused>
65 
66 // Map byte to one of ~20 interesting categories for cheap tag parsing
// Indexed by the raw byte value.  Upper- and lower-case forms of the letters
// in SCRIPT and STYLE share a category, so tag-name matching is
// case-insensitive.  Bytes 0x80..0xbf map to NL and bytes 0xc0..0xff map to
// PL.  No entry uses xx.
67 static const uint8 kCharToSub[256] = {
68   NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,CR,NL, NL,CR,NL,NL,   // 0x00..0x0f
69   NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,   // 0x10..0x1f
70   NL,EX,QU,NL, NL,NL,PL,AP, NL,NL,NL,NL, NL,HY,NL,SL,   // 0x20..0x2f
71   NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, LT,NL,GT,NL,   // 0x30..0x3f
72 
73   PL,PL,PL,C_, PL,E_,PL,PL, PL,I_,PL,PL, L_,PL,PL,PL,   // 0x40..0x4f
74   P_,PL,R_,S_, T_,PL,PL,PL, PL,Y_,PL,NL, NL,NL,NL,NL,   // 0x50..0x5f
75   PL,PL,PL,C_, PL,E_,PL,PL, PL,I_,PL,PL, L_,PL,PL,PL,   // 0x60..0x6f
76   P_,PL,R_,S_, T_,PL,PL,PL, PL,Y_,PL,NL, NL,NL,NL,NL,   // 0x70..0x7f
77 
78   NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,   // 0x80..0x8f
79   NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,   // 0x90..0x9f
80   NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,   // 0xa0..0xaf
81   NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,   // 0xb0..0xbf
82 
83   PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,   // 0xc0..0xcf
84   PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,   // 0xd0..0xdf
85   PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,   // 0xe0..0xef
86   PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,   // 0xf0..0xff
87 };
88 
// The category names are short and collision-prone, so drop them again.
89 #undef LT
90 #undef GT
91 #undef EX
92 #undef HY
93 #undef QU
94 #undef AP
95 #undef SL
96 #undef S_
97 #undef C_
98 #undef R_
99 #undef I_
100 #undef P_
101 #undef T_
102 #undef Y_
103 #undef L_
104 #undef E_
105 #undef CR
106 #undef NL
107 #undef PL
108 #undef xx
109 
110 
// Terminal values for the tag-parse table below.  ScanToPossibleLetter stops
// as soon as a lookup yields either of them:
//   OK  the byte just examined could be a letter
//   X_  error (for example a second '<' before the closing '>')
111 #define OK 0
112 #define X_ 1
113 
114 // State machine to do cheap parse of non-letter strings incl. tags
115 // advances <tag>
116 //          |    |
117 // advances <tag> ... </tag>  for <script> <style>
118 //          |               |
119 // advances <!-- ... <tag> ... -->
120 //          |                     |
121 // advances <tag
122 //          ||  (0)
123 // advances <tag <tag2>
124 //          ||  (0)
//
// Layout: one row of 20 entries per state, so the row for state s starts at
// index s * 20.  The column is the kCharToSub category of the current byte
// and the entry is the next state.  Scanning starts in row [0].  Because the
// scan stops on OK and X_, rows [0] and [1] are never re-entered as "next
// state" rows.  The last column (xx) is X_ in every row and no byte maps to
// it.  OK appears only in rows [0] and [2], i.e. only outside a tag.
125 static const uint8 kTagParseTbl_0[] = {
126 // <  >  !  -   "  '  /  S   C  R  I  P   T  Y  L  E  CR NL PL xx
127    3, 2, 2, 2,  2, 2, 2,OK, OK,OK,OK,OK, OK,OK,OK,OK,  2, 2,OK,X_, // [0] OK
128   X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, // [1] error
129    3, 2, 2, 2,  2, 2, 2,OK, OK,OK,OK,OK, OK,OK,OK,OK,  2, 2,OK,X_, // [2] NL*
130   X_, 2, 4, 9, 10,11, 9,13,  9, 9, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [3] <
131   X_, 2, 9, 5, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [4] <!
132   X_, 2, 9, 6, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [5] <!-
133    6, 6, 6, 7,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6,X_, // [6] <!--.*
134    6, 6, 6, 8,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6,X_, // [7] <!--.*-
135    6, 2, 6, 8,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6,X_, // [8] <!--.*--
136   X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [9] <.*
137   10,10,10,10,  9,10,10,10, 10,10,10,10, 10,10,10,10, 12,10,10,X_, // [10] <.*"
138   11,11,11,11, 11, 9,11,11, 11,11,11,11, 11,11,11,11, 12,11,11,X_, // [11] <.*'
139   X_, 2,12,12, 12,12,12,12, 12,12,12,12, 12,12,12,12, 12,12,12,X_, // [12] <.* no " '
140 
// States [13]..[27]: recognise "<SCRIPT", then skip everything up to and
// including the matching "</SCRIPT>".
141 // <  >  !  -   "  '  /  S   C  R  I  P   T  Y  L  E  CR NL PL xx
142   X_, 2, 9, 9, 10,11, 9, 9, 14, 9, 9, 9, 28, 9, 9, 9,  9, 9, 9,X_, // [13] <S
143   X_, 2, 9, 9, 10,11, 9, 9,  9,15, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [14] <SC
144   X_, 2, 9, 9, 10,11, 9, 9,  9, 9,16, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [15] <SCR
145   X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9,17,  9, 9, 9, 9,  9, 9, 9,X_, // [16] <SCRI
146   X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9, 18, 9, 9, 9,  9, 9, 9,X_, // [17] <SCRIP
147   X_,19, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9, 19,19, 9,X_, // [18] <SCRIPT
148   20,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [19] <SCRIPT .*
149   19,19,19,19, 19,19,21,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [20] <SCRIPT .*<
150   19,19,19,19, 19,19,19,22, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [21] <SCRIPT .*</
151   19,19,19,19, 19,19,19,19, 23,19,19,19, 19,19,19,19, 19,19,19,X_, // [22] <SCRIPT .*</S
152   19,19,19,19, 19,19,19,19, 19,24,19,19, 19,19,19,19, 19,19,19,X_, // [23] <SCRIPT .*</SC
153   19,19,19,19, 19,19,19,19, 19,19,25,19, 19,19,19,19, 19,19,19,X_, // [24] <SCRIPT .*</SCR
154   19,19,19,19, 19,19,19,19, 19,19,19,26, 19,19,19,19, 19,19,19,X_, // [25] <SCRIPT .*</SCRI
155   19,19,19,19, 19,19,19,19, 19,19,19,19, 27,19,19,19, 19,19,19,X_, // [26] <SCRIPT .*</SCRIP
156   19, 2,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [27] <SCRIPT .*</SCRIPT
157 
// States [28]..[39]: the same scheme for "<STYLE" ... "</STYLE>".
158 // <  >  !  -   "  '  /  S   C  R  I  P   T  Y  L  E  CR NL PL xx
159   X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9,29, 9, 9,  9, 9, 9,X_, // [28] <ST
160   X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9,30, 9,  9, 9, 9,X_, // [29] <STY
161   X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9,31,  9, 9, 9,X_, // [30] <STYL
162   X_,32, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9, 32,32, 9,X_, // [31] <STYLE
163   33,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [32] <STYLE .*
164   32,32,32,32, 32,32,34,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [33] <STYLE .*<
165   32,32,32,32, 32,32,32,35, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [34] <STYLE .*</
166   32,32,32,32, 32,32,32,32, 32,32,32,32, 36,32,32,32, 32,32,32,X_, // [35] <STYLE .*</S
167   32,32,32,32, 32,32,32,32, 32,32,32,32, 32,37,32,32, 32,32,32,X_, // [36] <STYLE .*</ST
168   32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,38,32, 32,32,32,X_, // [37] <STYLE .*</STY
169   32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,39, 32,32,32,X_, // [38] <STYLE .*</STYL
170   32, 2,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [39] <STYLE .*</STYLE
171 };
172 
173 #undef OK
174 #undef X_
175 
176 
177 /*
178 // Convert GetTimeOfDay output to 64-bit usec
179 static inline uint64 Microseconds(const struct timeval& t) {
180   // The SumReducer uses uint64, so convert to (uint64) microseconds,
181   // not (double) seconds.
182   return t.tv_sec * 1000000ULL + t.tv_usec;
183 }
184 */
185 
186 
187 // Returns true if character is < > or &
IsSpecial(char c)188 bool inline IsSpecial(char c) {
189   if ((c & 0xe0) == 0x20) {
190     return kSpecialSymbol[static_cast<uint8>(c)];
191   }
192   return false;
193 }
194 
195 // Quick Skip to next letter or < > & or to end of string (eos)
196 // Always return is_letter for eos
ScanToLetterOrSpecial(const char * src,int len)197 int ScanToLetterOrSpecial(const char* src, int len) {
198   int bytes_consumed;
199   cld::UTF8GenericScan(&utf8scannotjustletterspecial_obj, src, len,
200                        &bytes_consumed);
201   return bytes_consumed;
202 }
203 
204 
205 
206 // src points to non-letter, such as tag-opening '<'
207 // Return length from here to next possible letter
208 // On eos or another < before >, return 1
209 // advances <tag>
210 //          |    |
211 // advances <tag> ... </tag>  for <script> <style>
212 //          |               |
213 // advances <!-- ... <tag> ... -->
214 //          |                     |
215 // advances <tag
216 //          ||  (1)
217 // advances <tag <tag2>
218 //          ||  (1)
ScanToPossibleLetter(const char * isrc,int len)219 int ScanToPossibleLetter(const char* isrc, int len) {
220   const uint8* src = reinterpret_cast<const uint8*>(isrc);
221   const uint8* srclimit = src + len;
222   const uint8* tagParseTbl = kTagParseTbl_0;
223   int e = 0;
224   while (src < srclimit) {
225     e = tagParseTbl[kCharToSub[*src++]];
226     if ((e & ~1) == 0) {
227       // We overshot by one byte
228       --src;
229       break;
230     }
231     tagParseTbl = &kTagParseTbl_0[e * 20];
232   }
233 
234   if (src >= srclimit) {
235     // We fell off the end of the text.
236     // It looks like the most common case for this is a truncated file, not
237     // mismatched angle brackets. So we pretend that the last char was '>'
238     return len;
239   }
240 
241   // OK to be in state 0 or state 2 at exit
242   if ((e != 0) && (e != 2)) {
243     // Error, '<' followed by '<'
244     // We want to back up to first <, then advance by one byte past it
245     int offset = src - reinterpret_cast<const uint8*>(isrc);
246     // printf("ScanToPossibleLetter error at %d[%d] in '%s'\n",offset,e,isrc);
247 
248     // Backscan to first '<' and return enough length to just get past it
249     --offset;   // back up over the second '<', which caused us to stop
250     while ((0 < offset) && (isrc[offset] != '<')) {
251       // Find the first '<', which is unmatched
252       --offset;
253     }
254     // skip to just beyond first '<'
255     // printf("  returning %d\n", offset + 1);
256     return offset + 1;
257   }
258 
259   return src - reinterpret_cast<const uint8*>(isrc);
260 }
261 
262 
263 
ScriptScanner(const char * buffer,int buffer_length,bool is_plain_text)264 ScriptScanner::ScriptScanner(const char* buffer,
265                              int buffer_length,
266                              bool is_plain_text)
267   : start_byte_(buffer),
268   next_byte_(buffer),
269   next_byte_limit_(buffer + buffer_length),
270   byte_length_(buffer_length),
271   is_plain_text_(is_plain_text) {
272     script_buffer_ = new char[getone::kMaxScriptBuffer];
273     script_buffer_lower_ = new char[getone::kMaxScriptLowerBuffer];
274 }
275 
~ScriptScanner()276 ScriptScanner::~ScriptScanner() {
277   delete[] script_buffer_;
278   delete[] script_buffer_lower_;
279 }
280 
281 
282 
283 
284 // Get to the first real non-tag letter or entity that is a letter
285 // Sets script of that letter
286 // Return len if no more letters
SkipToFrontOfSpan(const char * src,int len,int * script)287 int ScriptScanner::SkipToFrontOfSpan(const char* src, int len, int* script) {
288   int sc = UNKNOWN_LSCRIPT;
289   int skip = 0;
290   int tlen, plen;
291 
292   // Do run of non-letters (tag | &NL | NL)*
293   while (skip < len) {
294     // Do fast scan to next interesting byte
295     // int oldskip = skip;
296     skip += ScanToLetterOrSpecial(src + skip, len - skip);
297     // TEMP
298     // printf("ScanToLetterOrSpecial[%d] 0x%02x => [%d] 0x%02x\n",
299     //       oldskip, src[oldskip], skip, src[skip]);
300 
301     // Check for no more letters/specials
302     if (skip >= len) {
303       // All done
304       return len;
305     }
306 
307     // We are at a letter, nonletter, tag, or entity
308     if (IsSpecial(src[skip]) && !is_plain_text_) {
309       if (src[skip] == '<') {
310         // Begining of tag; skip to end and go around again
311         tlen = ScanToPossibleLetter(src + skip, len - skip);
312         sc = 0;
313         // printf("<...> ");
314       } else if (src[skip] == '>') {
315         // Unexpected end of tag; skip it and go around again
316         tlen = 1;         // Over the >
317         sc = 0;
318         // printf("..> ");
319       } else if (src[skip] == '&') {
320         // Expand entity, no advance
321         char temp[4];
322         EntityToBuffer(src + skip, len - skip,
323                        temp, &tlen, &plen);
324         sc = getone::GetUTF8LetterScriptNum(temp);
325         // printf("#(%02x%02x)=%d ", temp[0], temp[1], sc);
326       }
327     } else {
328       // Update 1..4 bytes
329       tlen = cld_UniLib::OneCharLen(src + skip);
330       sc = getone::GetUTF8LetterScriptNum(src + skip);
331       // printf("#(%02x%02x)=%d ", src[skip], src[skip+1], sc);
332     }
333     // TEMP
334     // printf("sc=%d ", sc);
335     if (sc != 0) {break;}           // Letter found
336     skip += tlen;                   // Advance
337   }
338 
339   *script = sc;
340   return skip;
341 }
342 
// True when the build defines NEED_ALIGNED_LOADS, i.e. when the target must
// not be handed unaligned multi-byte loads/stores.
static const bool kNeedsAlignedLoads =
#ifdef NEED_ALIGNED_LOADS
    true;
#else
    false;
#endif
348 
349 
350 // Copy next run of same-script non-tag letters to buffer [NUL terminated]
351 // Buffer has leading space and all text is lowercased
GetOneScriptSpan(getone::LangSpan * span)352 bool ScriptScanner::GetOneScriptSpan(getone::LangSpan* span) {
353   span->text = script_buffer_;
354   span->text_bytes = 0;
355   span->offset = next_byte_ - start_byte_;
356   span->script = UNKNOWN_LSCRIPT;
357   span->lang = UNKNOWN_LANGUAGE;
358   span->truncated = false;
359 
360   // printf("GetOneScriptSpan[[ ");
361   // struct timeval script_start, script_mid, script_end;
362 
363   int spanscript;           // The script of this span
364   int sc = UNKNOWN_LSCRIPT;  // The script of next character
365   int tlen, plen;
366 
367 
368   script_buffer_[0] = ' ';  // Always a space at front of output
369   script_buffer_[1] = '\0';
370   int take = 0;
371   int put = 1;              // Start after the initial space
372 
373   // gettimeofday(&script_start, NULL);
374   // Get to the first real non-tag letter or entity that is a letter
375   int skip = SkipToFrontOfSpan(next_byte_, byte_length_, &spanscript);
376   next_byte_ += skip;
377   byte_length_ -= skip;
378   if (byte_length_ <= 0) {
379     // printf("]]\n");
380     return false;               // No more letters to be found
381   }
382 
383   // gettimeofday(&script_mid, NULL);
384 
385   // There is at least one letter, so we know the script for this span
386   // printf("{%d} ", spanscript);
387   span->script = (UnicodeLScript)spanscript;
388 
389 
390   // Go over alternating spans of same-script letters and non-letters,
391   // copying letters to buffer with single spaces for each run of non-letters
392   while (take < byte_length_) {
393     // Copy run of letters in same script (&LS | LS)*
394     int letter_count = 0;              // Keep track of word length
395     bool need_break = false;
396     while (take < byte_length_) {
397       // We are at a letter, nonletter, tag, or entity
398       if (IsSpecial(next_byte_[take]) && !is_plain_text_) {
399         // printf("\"%c\" ", next_byte_[take]);
400         if (next_byte_[take] == '<') {
401           // Begining of tag
402           sc = 0;
403           break;
404         } else if (next_byte_[take] == '>') {
405           // Unexpected end of tag
406           sc = 0;
407           break;
408         } else if (next_byte_[take] == '&') {
409           // Copy entity, no advance
410           EntityToBuffer(next_byte_ + take, byte_length_ - take,
411                          script_buffer_ + put, &tlen, &plen);
412           sc = getone::GetUTF8LetterScriptNum(script_buffer_ + put);
413         }
414       } else {
415         // Real letter, safely copy up to 4 bytes, increment by 1..4
416         // Will update by 1..4 bytes at Advance, below
417         tlen = plen = cld_UniLib::OneCharLen(next_byte_ + take);
418         if (!kNeedsAlignedLoads && (take < (byte_length_ - 3))) {
419           // Fast case
420           *reinterpret_cast<uint32*>(script_buffer_ + put) =
421             *reinterpret_cast<const uint32*>(next_byte_ + take);
422         } else {
423           // Slow case, happens 1-3 times per input document
424           memcpy(script_buffer_ + put, next_byte_ + take, plen);
425         }
426         sc = getone::GetUTF8LetterScriptNum(next_byte_ + take);
427       }
428       // printf("sc(%c)=%d ", next_byte_[take], sc);
429       // char xtmp[8]; memcpy(xtmp,script_buffer_ + put, plen);
430       // xtmp[plen] = '\0'; printf("'%s'{%d} ", xtmp, sc);
431 
432       // Allow continue across a single letter in a different script:
433       // A B D = three scripts, c = common script, i = inherited script,
434       // - = don't care, ( = take position before the += below
435       //  AAA(A-    continue
436       //
437       //  AAA(BA    continue
438       //  AAA(BB    break
439       //  AAA(Bc    continue (breaks after B)
440       //  AAA(BD    break
441       //  AAA(Bi    break
442       //
443       //  AAA(c-    break
444       //
445       //  AAA(i-    continue
446       //
447 
448       if ((sc != spanscript) && (sc != ULScript_Inherited)) {
449         // Might need to break this script span
450         if (sc == ULScript_Common) {
451           need_break = true;
452         } else {
453           // Look at next following character, ignoring entity as Common
454           int sc2 = getone::GetUTF8LetterScriptNum(next_byte_ + take + tlen);
455           if ((sc2 != ULScript_Common) && (sc2 != spanscript)) {
456             need_break = true;
457           }
458         }
459       }
460       if (need_break) {break;}  // Non-letter or letter in wrong script
461 
462       take += tlen;                   // Advance
463       put += plen;                    // Advance
464       ++letter_count;
465       if (put >= getone::kMaxScriptBytes) {
466         // Buffer is full
467         span->truncated = true;
468         break;
469       }
470     }     // End while letters
471 
472     // Do run of non-letters (tag | &NL | NL)*
473     while (take < byte_length_) {
474       // Do fast scan to next interesting byte
475       take += ScanToLetterOrSpecial(next_byte_ + take, byte_length_ - take);
476 
477       // Check for no more letters/specials
478       if (take >= byte_length_) {
479         take = byte_length_;
480         break;
481       }
482 
483       // We are at a letter, nonletter, tag, or entity
484       if (IsSpecial(next_byte_[take]) && !is_plain_text_) {
485         // printf("\"%c\" ", next_byte_[take]);
486         if (next_byte_[take] == '<') {
487           // Begining of tag; skip to end and go around again
488           tlen = ScanToPossibleLetter(next_byte_ + take, byte_length_ - take);
489           sc = 0;
490           // printf("<...> ");
491         } else if (next_byte_[take] == '>') {
492           // Unexpected end of tag; skip it and go around again
493           tlen = 1;         // Over the >
494           sc = 0;
495           // printf("..> ");
496         } else if (next_byte_[take] == '&') {
497           // Expand entity, no advance
498           EntityToBuffer(next_byte_ + take, byte_length_ - take,
499                          script_buffer_ + put, &tlen, &plen);
500           sc = getone::GetUTF8LetterScriptNum(script_buffer_ + put);
501         }
502       } else {
503         // Update 1..4
504         tlen = cld_UniLib::OneCharLen(next_byte_ + take);
505         sc = getone::GetUTF8LetterScriptNum(next_byte_ + take);
506       }
507       // printf("sc[%c]=%d ", next_byte_[take], sc);
508       if (sc != 0) {break;}           // Letter found
509       take += tlen;                   // Advance
510     }     // End while not-letters
511 
512     script_buffer_[put++] = ' ';
513 
514     // We are at a letter again (or eos), after letter* not-letter*
515     if (sc != spanscript) {break;}            // Letter in wrong script
516     if (put >= getone::kMaxScriptBytes - 8) {
517       // Buffer is almost full
518       span->truncated = true;
519       break;
520     }
521   }
522 
523   // Update input position
524   next_byte_ += take;
525   byte_length_ -= take;
526 
527   // Put four more spaces/NUL. Worst case is abcd _ _ _ \0
528   //                          kMaxScriptBytes |   | put
529   script_buffer_[put + 0] = ' ';
530   script_buffer_[put + 1] = ' ';
531   script_buffer_[put + 2] = ' ';
532   script_buffer_[put + 3] = '\0';
533 
534   span->text_bytes = put;       // Does not include the last four chars above
535 
536   // printf(" %d]]\n\n", put);
537   return true;
538 }
539 
540 // Force Latin, Cyrillic, Greek scripts to be lowercase
LowerScriptSpan(getone::LangSpan * span)541 void ScriptScanner::LowerScriptSpan(getone::LangSpan* span) {
542   // On Windows, text is lowercased beforehand, so no need to do anything here.
543 #if !defined(CLD_WINDOWS)
544   // If needed, lowercase all the text. If we do it sooner, might miss
545   // lowercasing an entity such as &Aacute;
546   // We only need to do this for Latn and Cyrl scripts
547   if ((span->script == ULScript_Latin) ||
548       (span->script == ULScript_Cyrillic) ||
549       (span->script == ULScript_Greek)) {
550     // Full Unicode lowercase of the entire buffer, including
551     // four pad bytes off the end
552     int consumed, filled;
553     UniLib::ToLower(span->text, span->text_bytes + 4,
554                     script_buffer_lower_, getone::kMaxScriptLowerBuffer,
555                     &consumed, &filled);
556     span->text = script_buffer_lower_;
557     span->text_bytes = filled - 4;
558   }
559 #endif
560 }
561 
562 // Copy next run of same-script non-tag letters to buffer [NUL terminated]
563 // Force Latin and Cyrillic scripts to be lowercase
GetOneScriptSpanLower(getone::LangSpan * span)564 bool ScriptScanner::GetOneScriptSpanLower(getone::LangSpan* span) {
565   bool ok = GetOneScriptSpan(span);
566   LowerScriptSpan(span);
567   return ok;
568 }
569 
570 // Gets lscript number for letters; always returns
571 //   0 (common script) for non-letters
GetUTF8LetterScriptNum(const char * src)572 int getone::GetUTF8LetterScriptNum(const char* src) {
573   int srclen = cld_UniLib::OneCharLen(src);
574   const uint8* usrc = reinterpret_cast<const uint8*>(src);
575   return UTF8GenericProperty(&utf8propletterscriptnum_obj, &usrc, &srclen);
576 }
577