1 /*
2 * Copyright (C) 2017 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "lang_id/lang-id.h"
18
19 #include <memory>
20 #include <string>
21 #include <utility>
22 #include <vector>
23
24 #include "base.h"
25 #include "util/base/logging.h"
26 #include "gtest/gtest.h"
27
28 namespace libtextclassifier {
29 namespace nlp_core {
30 namespace lang_id {
31
32 namespace {
33
GetModelPath()34 std::string GetModelPath() {
35 return TEST_DATA_DIR "langid.model";
36 }
37
38 // Creates a LangId with default model. Passes ownership to
39 // the caller.
CreateLanguageDetector()40 LangId *CreateLanguageDetector() { return new LangId(GetModelPath()); }
41
42 } // namespace
43
// Checks that complete sentences are mapped to the expected language code,
// including English input that carries leading and/or trailing spaces.
TEST(LangIdTest, Normal) {
  const std::unique_ptr<LangId> detector(CreateLanguageDetector());

  // Shorthand for the call under test. The literals stay inside each
  // EXPECT_EQ so that a failure report still shows the offending input.
  const auto detect = [&detector](const char *text) {
    return detector->FindLanguage(text);
  };

  // English, with and without surrounding spaces.
  EXPECT_EQ("en", detect("This text is written in English."));
  EXPECT_EQ("en", detect("This text is written in English. "));
  EXPECT_EQ("en", detect(" This text is written in English. "));

  // French and Romanian.
  EXPECT_EQ("fr", detect("Vive la France! Vive la France!"));
  EXPECT_EQ("ro", detect("Sunt foarte foarte foarte fericit!"));
}
55
56 // Test that for very small queries, we return the default language and a low
57 // confidence score.
TEST(LangIdTest,SuperSmallQueries)58 TEST(LangIdTest, SuperSmallQueries) {
59 std::unique_ptr<LangId> lang_id(CreateLanguageDetector());
60
61 // Use a default language different from any real language: to be sure the
62 // result is the default language, not a language that happens to be the
63 // default language.
64 const std::string kDefaultLanguage = "dflt-lng";
65 lang_id->SetDefaultLanguage(kDefaultLanguage);
66
67 // Test the simple FindLanguage() method: that method returns a single
68 // language.
69 EXPECT_EQ(kDefaultLanguage, lang_id->FindLanguage("y"));
70 EXPECT_EQ(kDefaultLanguage, lang_id->FindLanguage("j"));
71 EXPECT_EQ(kDefaultLanguage, lang_id->FindLanguage("l"));
72 EXPECT_EQ(kDefaultLanguage, lang_id->FindLanguage("w"));
73 EXPECT_EQ(kDefaultLanguage, lang_id->FindLanguage("z"));
74 EXPECT_EQ(kDefaultLanguage, lang_id->FindLanguage("zulu"));
75
76 // Test the more complex FindLanguages() method: that method returns a vector
77 // of (language, confidence_score) pairs.
78 std::vector<std::pair<std::string, float>> languages;
79 languages = lang_id->FindLanguages("y");
80 EXPECT_EQ(1, languages.size());
81 EXPECT_EQ(kDefaultLanguage, languages[0].first);
82 EXPECT_GT(0.01f, languages[0].second);
83
84 languages = lang_id->FindLanguages("Todoist");
85 EXPECT_EQ(1, languages.size());
86 EXPECT_EQ(kDefaultLanguage, languages[0].first);
87 EXPECT_GT(0.01f, languages[0].second);
88
89 // A few tests with a default language that is a real language code.
90 const std::string kJapanese = "ja";
91 lang_id->SetDefaultLanguage(kJapanese);
92 EXPECT_EQ(kJapanese, lang_id->FindLanguage("y"));
93 EXPECT_EQ(kJapanese, lang_id->FindLanguage("j"));
94 EXPECT_EQ(kJapanese, lang_id->FindLanguage("l"));
95 languages = lang_id->FindLanguages("y");
96 EXPECT_EQ(1, languages.size());
97 EXPECT_EQ(kJapanese, languages[0].first);
98 EXPECT_GT(0.01f, languages[0].second);
99
100 // Make sure the min text size limit is applied to the number of real
101 // characters (e.g., without spaces and punctuation chars, which don't
102 // influence language identification).
103 const std::string kWhitespaces = " \t \n \t\t\t\n \t";
104 const std::string kPunctuation = "... ?!!--- -%%^...-";
105 std::string still_small_string = kWhitespaces + "y" + kWhitespaces +
106 kPunctuation + kWhitespaces + kPunctuation +
107 kPunctuation;
108 EXPECT_LE(100, still_small_string.size());
109 lang_id->SetDefaultLanguage(kDefaultLanguage);
110 EXPECT_EQ(kDefaultLanguage, lang_id->FindLanguage(still_small_string));
111 languages = lang_id->FindLanguages(still_small_string);
112 EXPECT_EQ(1, languages.size());
113 EXPECT_EQ(kDefaultLanguage, languages[0].first);
114 EXPECT_GT(0.01f, languages[0].second);
115 }
116
117 namespace {
CheckPredictionForGibberishStrings(const std::string & default_language)118 void CheckPredictionForGibberishStrings(const std::string &default_language) {
119 static const char *const kGibberish[] = {
120 "",
121 " ",
122 " ",
123 " ___ ",
124 "123 456 789",
125 "><> (-_-) <><",
126 nullptr,
127 };
128
129 std::unique_ptr<LangId> lang_id(CreateLanguageDetector());
130 TC_LOG(INFO) << "Default language: " << default_language;
131 lang_id->SetDefaultLanguage(default_language);
132 for (int i = 0; true; ++i) {
133 const char *gibberish = kGibberish[i];
134 if (gibberish == nullptr) {
135 break;
136 }
137 const std::string predicted_language = lang_id->FindLanguage(gibberish);
138 TC_LOG(INFO) << "Predicted " << predicted_language << " for \"" << gibberish
139 << "\"";
140 EXPECT_EQ(default_language, predicted_language);
141 }
142 }
143 } // namespace
144
// Runs the gibberish-input check once per default language listed below, in
// the order given.
TEST(LangIdTest, CornerCases) {
  static const char *const kDefaultLanguages[] = {"en", "ro", "fr"};
  for (const char *language : kDefaultLanguages) {
    CheckPredictionForGibberishStrings(language);
  }
}
150
151 } // namespace lang_id
152 } // namespace nlp_core
153 } // namespace libtextclassifier
154