• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (c) 2009 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "encodings/compact_lang_det/compact_lang_det.h"
6 #include "encodings/compact_lang_det/compact_lang_det_impl.h"
7 #include "encodings/compact_lang_det/win/cld_basictypes.h"
8 
// Version text returned by CompactLangDet::DetectLanguageVersion().
// String is "code_version - data_scrape_date".
// Declared as a const char array (not a mutable pointer to const chars) so
// the symbol itself cannot be re-pointed at a different string.
static const char kDetectLanguageVersion[] = "V1.6 - 20081121";
11 
12 // Large-table version for all ~160 languages (all Tiers)
13 
14 // Scan interchange-valid UTF-8 bytes and detect most likely language
DetectLanguage(const DetectionTables * tables,const char * buffer,int buffer_length,bool is_plain_text,bool * is_reliable)15 Language CompactLangDet::DetectLanguage(
16                           const DetectionTables* tables,
17                           const char* buffer,
18                           int buffer_length,
19                           bool is_plain_text,
20                           bool* is_reliable) {
21   bool allow_extended_lang = false;
22   Language language3[3];
23   int percent3[3];
24   double normalized_score3[3];
25   int text_bytes;
26   int flags = 0;
27   Language plus_one = UNKNOWN_LANGUAGE;
28   const char* tld_hint = "";
29   int encoding_hint = UNKNOWN_ENCODING;
30   Language language_hint = UNKNOWN_LANGUAGE;
31 
32   Language lang = CompactLangDetImpl::DetectLanguageSummaryV25(
33                           tables,
34                           buffer,
35                           buffer_length,
36                           is_plain_text,
37                           tld_hint,               // "id" boosts Indonesian
38                           encoding_hint,          // SJS boosts Japanese
39                           language_hint,          // ITALIAN boosts it
40                           allow_extended_lang,
41                           flags,
42                           plus_one,
43                           language3,
44                           percent3,
45                           normalized_score3,
46                           &text_bytes,
47                           is_reliable);
48   // Default to English.
49   if (lang == UNKNOWN_LANGUAGE) {
50     lang = ENGLISH;
51   }
52   return lang;
53 }
54 
55 // Scan interchange-valid UTF-8 bytes and detect list of top 3 languages.
DetectLanguageSummary(const DetectionTables * tables,const char * buffer,int buffer_length,bool is_plain_text,Language * language3,int * percent3,int * text_bytes,bool * is_reliable)56 Language CompactLangDet::DetectLanguageSummary(
57                           const DetectionTables* tables,
58                           const char* buffer,
59                           int buffer_length,
60                           bool is_plain_text,
61                           Language* language3,
62                           int* percent3,
63                           int* text_bytes,
64                           bool* is_reliable) {
65   double normalized_score3[3];
66   bool allow_extended_lang = false;
67   int flags = 0;
68   Language plus_one = UNKNOWN_LANGUAGE;
69   const char* tld_hint = "";
70   int encoding_hint = UNKNOWN_ENCODING;
71   Language language_hint = UNKNOWN_LANGUAGE;
72 
73   Language lang = CompactLangDetImpl::DetectLanguageSummaryV25(
74                           tables,
75                           buffer,
76                           buffer_length,
77                           is_plain_text,
78                           tld_hint,               // "id" boosts Indonesian
79                           encoding_hint,          // SJS boosts Japanese
80                           language_hint,          // ITALIAN boosts it
81                           allow_extended_lang,
82                           flags,
83                           plus_one,
84                           language3,
85                           percent3,
86                           normalized_score3,
87                           text_bytes,
88                           is_reliable);
89   // Default to English
90   if (lang == UNKNOWN_LANGUAGE) {
91     lang = ENGLISH;
92   }
93   return lang;
94 }
95 
96 // Same as above, with hints supplied
97 // Scan interchange-valid UTF-8 bytes and detect list of top 3 languages.
DetectLanguageSummary(const DetectionTables * tables,const char * buffer,int buffer_length,bool is_plain_text,const char * tld_hint,int encoding_hint,Language language_hint,Language * language3,int * percent3,int * text_bytes,bool * is_reliable)98 Language CompactLangDet::DetectLanguageSummary(
99                           const DetectionTables* tables,
100                           const char* buffer,
101                           int buffer_length,
102                           bool is_plain_text,
103                           const char* tld_hint,       // "id" boosts Indonesian
104                           int encoding_hint,          // SJS boosts Japanese
105                           Language language_hint,     // ITALIAN boosts it
106                           Language* language3,
107                           int* percent3,
108                           int* text_bytes,
109                           bool* is_reliable) {
110   double normalized_score3[3];
111   bool allow_extended_lang = false;
112   int flags = 0;
113   Language plus_one = UNKNOWN_LANGUAGE;
114 
115   Language lang = CompactLangDetImpl::DetectLanguageSummaryV25(
116                           tables,
117                           buffer,
118                           buffer_length,
119                           is_plain_text,
120                           tld_hint,               // "id" boosts Indonesian
121                           encoding_hint,          // SJS boosts Japanese
122                           language_hint,          // ITALIAN boosts it
123                           allow_extended_lang,
124                           flags,
125                           plus_one,
126                           language3,
127                           percent3,
128                           normalized_score3,
129                           text_bytes,
130                           is_reliable);
131   // Default to English
132   if (lang == UNKNOWN_LANGUAGE) {
133     lang = ENGLISH;
134   }
135   return lang;
136 }
137 
138 
139 // Scan interchange-valid UTF-8 bytes and detect list of top 3 extended
140 // languages.
141 // Extended languages are additional Google interface languages and Unicode
142 // single-language scripts, from ext_lang_enc.h
ExtDetectLanguageSummary(const DetectionTables * tables,const char * buffer,int buffer_length,bool is_plain_text,Language * language3,int * percent3,int * text_bytes,bool * is_reliable)143 Language CompactLangDet::ExtDetectLanguageSummary(
144                           const DetectionTables* tables,
145                           const char* buffer,
146                           int buffer_length,
147                           bool is_plain_text,
148                           Language* language3,
149                           int* percent3,
150                           int* text_bytes,
151                           bool* is_reliable) {
152   double normalized_score3[3];
153   bool allow_extended_lang = true;
154   int flags = 0;
155   Language plus_one = UNKNOWN_LANGUAGE;
156   const char* tld_hint = "";
157   int encoding_hint = UNKNOWN_ENCODING;
158   Language language_hint = UNKNOWN_LANGUAGE;
159 
160   Language lang = CompactLangDetImpl::DetectLanguageSummaryV25(
161                           tables,
162                           buffer,
163                           buffer_length,
164                           is_plain_text,
165                           tld_hint,               // "id" boosts Indonesian
166                           encoding_hint,          // SJS boosts Japanese
167                           language_hint,          // ITALIAN boosts it
168                           allow_extended_lang,
169                           flags,
170                           plus_one,
171                           language3,
172                           percent3,
173                           normalized_score3,
174                           text_bytes,
175                           is_reliable);
176   // Do not default to English
177   return lang;
178 }
179 
180 // Same as above, with hints supplied
181 // Scan interchange-valid UTF-8 bytes and detect list of top 3 extended
182 // languages.
183 // Extended languages are additional Google interface languages and Unicode
184 // single-language scripts, from ext_lang_enc.h
ExtDetectLanguageSummary(const DetectionTables * tables,const char * buffer,int buffer_length,bool is_plain_text,const char * tld_hint,int encoding_hint,Language language_hint,Language * language3,int * percent3,int * text_bytes,bool * is_reliable)185 Language CompactLangDet::ExtDetectLanguageSummary(
186                           const DetectionTables* tables,
187                           const char* buffer,
188                           int buffer_length,
189                           bool is_plain_text,
190                           const char* tld_hint,       // "id" boosts Indonesian
191                           int encoding_hint,          // SJS boosts Japanese
192                           Language language_hint,     // ITALIAN boosts it
193                           Language* language3,
194                           int* percent3,
195                           int* text_bytes,
196                           bool* is_reliable) {
197   double normalized_score3[3];
198   bool allow_extended_lang = true;
199   int flags = 0;
200   Language plus_one = UNKNOWN_LANGUAGE;
201 
202   Language lang = CompactLangDetImpl::DetectLanguageSummaryV25(
203                           tables,
204                           buffer,
205                           buffer_length,
206                           is_plain_text,
207                           tld_hint,               // "id" boosts Indonesian
208                           encoding_hint,          // SJS boosts Japanese
209                           language_hint,          // ITALIAN boosts it
210                           allow_extended_lang,
211                           flags,
212                           plus_one,
213                           language3,
214                           percent3,
215                           normalized_score3,
216                           text_bytes,
217                           is_reliable);
218   // Do not default to English
219   return lang;
220 }
221 
222 // Same as above, and also returns internal language scores as a ratio to
223 // normal score for real text in that language. Scores close to 1.0 indicate
224 // normal text, while scores far away from 1.0 indicate badly-skewed text or
225 // gibberish
226 //
ExtDetectLanguageSummary(const DetectionTables * tables,const char * buffer,int buffer_length,bool is_plain_text,const char * tld_hint,int encoding_hint,Language language_hint,Language * language3,int * percent3,double * normalized_score3,int * text_bytes,bool * is_reliable)227 Language CompactLangDet::ExtDetectLanguageSummary(
228                         const DetectionTables* tables,
229                         const char* buffer,
230                         int buffer_length,
231                         bool is_plain_text,
232                         const char* tld_hint,       // "id" boosts Indonesian
233                         int encoding_hint,          // SJS boosts Japanese
234                         Language language_hint,     // ITALIAN boosts it
235                         Language* language3,
236                         int* percent3,
237                         double* normalized_score3,
238                         int* text_bytes,
239                         bool* is_reliable) {
240   bool allow_extended_lang = true;
241   int flags = 0;
242   Language plus_one = UNKNOWN_LANGUAGE;
243 
244   Language lang = CompactLangDetImpl::DetectLanguageSummaryV25(
245                           tables,
246                           buffer,
247                           buffer_length,
248                           is_plain_text,
249                           tld_hint,               // "id" boosts Indonesian
250                           encoding_hint,          // SJS boosts Japanese
251                           language_hint,          // ITALIAN boosts it
252                           allow_extended_lang,
253                           flags,
254                           plus_one,
255                           language3,
256                           percent3,
257                           normalized_score3,
258                           text_bytes,
259                           is_reliable);
260   // Do not default to English
261   return lang;
262   }
263 
264 
265 
266 // Return version text string
267 // String is "code_version - data_scrape_date"
DetectLanguageVersion()268 const char* CompactLangDet::DetectLanguageVersion() {
269   return kDetectLanguageVersion;
270 }
271 
272