• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2017 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "lang_id/lang-id.h"
18 
19 #include <memory>
20 #include <string>
21 #include <utility>
22 #include <vector>
23 
24 #include "base.h"
25 #include "util/base/logging.h"
26 #include "gtest/gtest.h"
27 
28 namespace libtextclassifier {
29 namespace nlp_core {
30 namespace lang_id {
31 
32 namespace {
33 
GetModelPath()34 std::string GetModelPath() {
35   return TEST_DATA_DIR "langid.model";
36 }
37 
38 // Creates a LangId with default model.  Passes ownership to
39 // the caller.
CreateLanguageDetector()40 LangId *CreateLanguageDetector() { return new LangId(GetModelPath()); }
41 
42 }  // namespace
43 
// Checks that ordinary sentences are mapped to the expected language code,
// including inputs with repeated inner spaces and leading/trailing spaces.
TEST(LangIdTest, Normal) {
  const std::unique_ptr<LangId> detector(CreateLanguageDetector());

  // Each entry pairs the expected language code with the text to classify.
  struct Sample {
    const char *expected_language;
    const char *text;
  };
  static const Sample kSamples[] = {
      {"en", "This text is written in English."},
      {"en", "This text   is written in   English.  "},
      {"en", "  This text is written in English.  "},
      {"fr", "Vive la France!  Vive la France!"},
      {"ro", "Sunt foarte foarte foarte fericit!"},
  };

  for (const Sample &sample : kSamples) {
    EXPECT_EQ(sample.expected_language, detector->FindLanguage(sample.text))
        << "for input \"" << sample.text << "\"";
  }
}
55 
56 // Test that for very small queries, we return the default language and a low
57 // confidence score.
TEST(LangIdTest,SuperSmallQueries)58 TEST(LangIdTest, SuperSmallQueries) {
59   std::unique_ptr<LangId> lang_id(CreateLanguageDetector());
60 
61   // Use a default language different from any real language: to be sure the
62   // result is the default language, not a language that happens to be the
63   // default language.
64   const std::string kDefaultLanguage = "dflt-lng";
65   lang_id->SetDefaultLanguage(kDefaultLanguage);
66 
67   // Test the simple FindLanguage() method: that method returns a single
68   // language.
69   EXPECT_EQ(kDefaultLanguage, lang_id->FindLanguage("y"));
70   EXPECT_EQ(kDefaultLanguage, lang_id->FindLanguage("j"));
71   EXPECT_EQ(kDefaultLanguage, lang_id->FindLanguage("l"));
72   EXPECT_EQ(kDefaultLanguage, lang_id->FindLanguage("w"));
73   EXPECT_EQ(kDefaultLanguage, lang_id->FindLanguage("z"));
74   EXPECT_EQ(kDefaultLanguage, lang_id->FindLanguage("zulu"));
75 
76   // Test the more complex FindLanguages() method: that method returns a vector
77   // of (language, confidence_score) pairs.
78   std::vector<std::pair<std::string, float>> languages;
79   languages = lang_id->FindLanguages("y");
80   EXPECT_EQ(1, languages.size());
81   EXPECT_EQ(kDefaultLanguage, languages[0].first);
82   EXPECT_GT(0.01f, languages[0].second);
83 
84   languages = lang_id->FindLanguages("Todoist");
85   EXPECT_EQ(1, languages.size());
86   EXPECT_EQ(kDefaultLanguage, languages[0].first);
87   EXPECT_GT(0.01f, languages[0].second);
88 
89   // A few tests with a default language that is a real language code.
90   const std::string kJapanese = "ja";
91   lang_id->SetDefaultLanguage(kJapanese);
92   EXPECT_EQ(kJapanese, lang_id->FindLanguage("y"));
93   EXPECT_EQ(kJapanese, lang_id->FindLanguage("j"));
94   EXPECT_EQ(kJapanese, lang_id->FindLanguage("l"));
95   languages = lang_id->FindLanguages("y");
96   EXPECT_EQ(1, languages.size());
97   EXPECT_EQ(kJapanese, languages[0].first);
98   EXPECT_GT(0.01f, languages[0].second);
99 
100   // Make sure the min text size limit is applied to the number of real
101   // characters (e.g., without spaces and punctuation chars, which don't
102   // influence language identification).
103   const std::string kWhitespaces = "   \t   \n   \t\t\t\n    \t";
104   const std::string kPunctuation = "... ?!!--- -%%^...-";
105   std::string still_small_string = kWhitespaces + "y" + kWhitespaces +
106                                    kPunctuation + kWhitespaces + kPunctuation +
107                                    kPunctuation;
108   EXPECT_LE(100, still_small_string.size());
109   lang_id->SetDefaultLanguage(kDefaultLanguage);
110   EXPECT_EQ(kDefaultLanguage, lang_id->FindLanguage(still_small_string));
111   languages = lang_id->FindLanguages(still_small_string);
112   EXPECT_EQ(1, languages.size());
113   EXPECT_EQ(kDefaultLanguage, languages[0].first);
114   EXPECT_GT(0.01f, languages[0].second);
115 }
116 
117 namespace {
CheckPredictionForGibberishStrings(const std::string & default_language)118 void CheckPredictionForGibberishStrings(const std::string &default_language) {
119   static const char *const kGibberish[] = {
120     "",
121     " ",
122     "       ",
123     "  ___  ",
124     "123 456 789",
125     "><> (-_-) <><",
126     nullptr,
127   };
128 
129   std::unique_ptr<LangId> lang_id(CreateLanguageDetector());
130   TC_LOG(INFO) << "Default language: " << default_language;
131   lang_id->SetDefaultLanguage(default_language);
132   for (int i = 0; true; ++i) {
133     const char *gibberish = kGibberish[i];
134     if (gibberish == nullptr) {
135       break;
136     }
137     const std::string predicted_language = lang_id->FindLanguage(gibberish);
138     TC_LOG(INFO) << "Predicted " << predicted_language << " for \"" << gibberish
139                  << "\"";
140     EXPECT_EQ(default_language, predicted_language);
141   }
142 }
143 }  // namespace
144 
// Runs the gibberish-input check once for each of several default languages.
TEST(LangIdTest, CornerCases) {
  for (const char *default_language : {"en", "ro", "fr"}) {
    CheckPredictionForGibberishStrings(default_language);
  }
}
150 
151 }  // namespace lang_id
152 }  // namespace nlp_core
153 }  // namespace libtextclassifier
154