1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "encodings/compact_lang_det/win/cld_unicodetext.h"
6
7 #include <string>
8 #include <vector> // to compile bar/common/component.h
9
10 #include "encodings/compact_lang_det/compact_lang_det.h"
11 #include "encodings/compact_lang_det/string_byte_sink.h"
12 #include "base/string_util.h"
13 #include "unicode/normlzr.h"
14 #include "unicode/unistr.h"
15 #include "unicode/ustring.h"
16
NormalizeText(const UChar * text)17 std::string NormalizeText(const UChar* text) {
18 // To avoid a copy, use the read-only aliasing ctor.
19 icu::UnicodeString source(1, text, -1);
20 icu::UnicodeString normalized;
21 UErrorCode status = U_ZERO_ERROR;
22 icu::Normalizer::normalize(source, UNORM_NFC, 0, normalized, status);
23 if (U_FAILURE(status))
24 return std::string();
25 normalized.toLower();
26 std::string utf8;
27 // Internally, toUTF8 uses a 1kB stack buffer (which is not large enough
28 // for most web pages) and does pre-flighting followed by malloc for larger
29 // strings. We have to switch to obtaining the buffer with the maximum size
30 // (UTF-16 length * 3) without pre-flighting if necessary.
31 StringByteSink sink(&utf8);
32 normalized.toUTF8(sink);
33 return utf8;
34 }
35
36
37 // Detects a language of the UTF-16 encoded zero-terminated text.
38 // Returns: Language enum.
DetectLanguageOfUnicodeText(const CompactLangDet::DetectionTables * detection_tables,const UChar * text,bool is_plain_text,bool * is_reliable,int * num_languages,int * error_code,int * text_bytes)39 Language DetectLanguageOfUnicodeText(
40 const CompactLangDet::DetectionTables* detection_tables,
41 const UChar* text, bool is_plain_text,
42 bool* is_reliable, int* num_languages,
43 int* error_code, int* text_bytes) {
44 if (!text || !num_languages)
45 return NUM_LANGUAGES;
46 // Normalize text to NFC, lowercase and convert to UTF-8.
47 std::string utf8_encoded = NormalizeText(text);
48 if (utf8_encoded.empty())
49 return NUM_LANGUAGES;
50
51 // Engage core CLD library language detection.
52 Language language3[3] = {
53 UNKNOWN_LANGUAGE, UNKNOWN_LANGUAGE, UNKNOWN_LANGUAGE
54 };
55 int percent3[3] = { 0, 0, 0 };
56 int text_bytes_tmp = 0;
57 // We ignore return value here due to the problem described in bug 1800161.
58 // For example, translate.google.com was detected as Indonesian. It happened
59 // due to the heuristic in CLD, which ignores English as a top language
60 // in the presence of another reliably detected language.
61 // See the actual code in compact_lang_det_impl.cc, CalcSummaryLang function.
62 // language3 array is always set according to the detection results and
63 // is not affected by this heuristic.
64 CompactLangDet::DetectLanguageSummary(detection_tables,
65 utf8_encoded.c_str(),
66 utf8_encoded.length(),
67 is_plain_text, language3, percent3,
68 &text_bytes_tmp, is_reliable);
69
70 // Calcualte a number of languages detected in more than 20% of the text.
71 const int kMinTextPercentToCountLanguage = 20;
72 *num_languages = 0;
73 if (text_bytes)
74 *text_bytes = text_bytes_tmp;
75 COMPILE_ASSERT(arraysize(language3) == arraysize(percent3),
76 language3_and_percent3_should_be_of_the_same_size);
77 for (int i = 0; i < arraysize(language3); ++i) {
78 if (IsValidLanguage(language3[i]) && !IS_LANGUAGE_UNKNOWN(language3[i]) &&
79 percent3[i] >= kMinTextPercentToCountLanguage) {
80 ++*num_languages;
81 }
82 }
83
84 return language3[0];
85 }
86