• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2018 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include <fstream>
18 #include <string>
19 
20 #include "gmock/gmock.h"
21 #include "gtest/gtest.h"
22 
23 #include "utils/sentencepiece/double_array_trie.h"
24 #include "utils/sentencepiece/normalizer.h"
25 #include "utils/sentencepiece/test_utils.h"
26 #include "utils/strings/stringpiece.h"
27 
28 namespace libtextclassifier3 {
29 namespace {
30 
// Returns the path of the normalizer config file that the tests in this file
// open. In this version of the file the path is empty.
// NOTE(review): presumably a placeholder that a build/export step is meant to
// fill in -- confirm before relying on these tests.
std::string GetTestConfigPath() {
  return std::string();
}
34 
// Exercises a normalizer built with the dummy prefix, extra-whitespace removal
// and whitespace escaping all switched on.
TEST(NormalizerTest, NormalizesAsReferenceNormalizer) {
  // Slurp the whole config file into a string.
  std::ifstream config_file(GetTestConfigPath());
  std::istreambuf_iterator<char> config_begin(config_file);
  std::istreambuf_iterator<char> config_end;
  std::string spec(config_begin, config_end);
  SentencePieceNormalizer sp_normalizer = NormalizerFromSpec(
      spec, /*add_dummy_prefix=*/true, /*remove_extra_whitespaces=*/true,
      /*escape_whitespaces=*/true);

  // Plain two-word input: expected output starts with "▁" and uses "▁" in
  // place of the space.
  {
    std::string output;
    EXPECT_TRUE(sp_normalizer.Normalize("hello there", &output));
    EXPECT_EQ(output, "▁hello▁there");
  }

  // Runs of two spaces are expected to collapse to a single "▁".
  {
    std::string output;
    EXPECT_TRUE(sp_normalizer.Normalize("when is  the  world cup?", &output));
    EXPECT_EQ(output, "▁when▁is▁the▁world▁cup?");
  }

  // A tab is expected to be treated like a space.
  {
    std::string output;
    EXPECT_TRUE(sp_normalizer.Normalize("general\tkenobi", &output));
    EXPECT_EQ(output, "▁general▁kenobi");
  }

  // NFKC: a single character expands to several characters.
  {
    std::string output;
    EXPECT_TRUE(sp_normalizer.Normalize("㍿", &output));
    EXPECT_EQ(output, "▁株式会社");
  }

  // Katakana wrapped in spaces (described as half-width katakana with
  // character composition by the original test); the expected output keeps
  // only the leading "▁".
  {
    std::string output;
    EXPECT_TRUE(sp_normalizer.Normalize(" グーグル ", &output));
    EXPECT_EQ(output, "▁グーグル");
  }

  // NFKC: each character maps to exactly one character.
  {
    std::string output;
    EXPECT_TRUE(sp_normalizer.Normalize("①②③", &output));
    EXPECT_EQ(output, "▁123");
  }
}
84 
// Same inputs as the reference test, but with the dummy prefix disabled:
// none of the expected outputs starts with "▁".
TEST(NormalizerTest, NoDummyPrefix) {
  // Slurp the whole config file into a string.
  std::ifstream config_file(GetTestConfigPath());
  std::istreambuf_iterator<char> config_begin(config_file);
  std::istreambuf_iterator<char> config_end;
  std::string spec(config_begin, config_end);
  SentencePieceNormalizer sp_normalizer = NormalizerFromSpec(
      spec, /*add_dummy_prefix=*/false, /*remove_extra_whitespaces=*/true,
      /*escape_whitespaces=*/true);

  // Plain two-word input: only the inner space becomes "▁".
  {
    std::string output;
    EXPECT_TRUE(sp_normalizer.Normalize("hello there", &output));
    EXPECT_EQ(output, "hello▁there");
  }

  // Runs of two spaces are expected to collapse to a single "▁".
  {
    std::string output;
    EXPECT_TRUE(sp_normalizer.Normalize("when is  the  world cup?", &output));
    EXPECT_EQ(output, "when▁is▁the▁world▁cup?");
  }

  // A tab is expected to be treated like a space.
  {
    std::string output;
    EXPECT_TRUE(sp_normalizer.Normalize("general\tkenobi", &output));
    EXPECT_EQ(output, "general▁kenobi");
  }

  // NFKC: a single character expands to several characters.
  {
    std::string output;
    EXPECT_TRUE(sp_normalizer.Normalize("㍿", &output));
    EXPECT_EQ(output, "株式会社");
  }

  // Katakana wrapped in spaces (described as half-width katakana with
  // character composition by the original test); the surrounding spaces do
  // not appear in the expected output.
  {
    std::string output;
    EXPECT_TRUE(sp_normalizer.Normalize(" グーグル ", &output));
    EXPECT_EQ(output, "グーグル");
  }

  // NFKC: each character maps to exactly one character.
  {
    std::string output;
    EXPECT_TRUE(sp_normalizer.Normalize("①②③", &output));
    EXPECT_EQ(output, "123");
  }
}
136 
// Dummy prefix and extra-whitespace removal are both disabled; whitespace is
// still escaped, so every input space is expected to show up as its own "▁".
TEST(NormalizerTest, NoRemoveExtraWhitespace) {
  // Slurp the whole config file into a string.
  std::ifstream config_file(GetTestConfigPath());
  std::istreambuf_iterator<char> config_begin(config_file);
  std::istreambuf_iterator<char> config_end;
  std::string spec(config_begin, config_end);
  SentencePieceNormalizer sp_normalizer = NormalizerFromSpec(
      spec, /*add_dummy_prefix=*/false, /*remove_extra_whitespaces=*/false,
      /*escape_whitespaces=*/true);

  // Plain two-word input.
  {
    std::string output;
    EXPECT_TRUE(sp_normalizer.Normalize("hello there", &output));
    EXPECT_EQ(output, "hello▁there");
  }

  // Runs of two spaces are kept: each run is expected to yield "▁▁".
  {
    std::string output;
    EXPECT_TRUE(sp_normalizer.Normalize("when is  the  world cup?", &output));
    EXPECT_EQ(output, "when▁is▁▁the▁▁world▁cup?");
  }

  // A tab is expected to be treated like a space.
  {
    std::string output;
    EXPECT_TRUE(sp_normalizer.Normalize("general\tkenobi", &output));
    EXPECT_EQ(output, "general▁kenobi");
  }
}
166 
// All three options are disabled: the expected outputs contain ordinary
// spaces and no "▁" characters.
TEST(NormalizerTest, NoEscapeWhitespaces) {
  // Slurp the whole config file into a string.
  std::ifstream config_file(GetTestConfigPath());
  std::istreambuf_iterator<char> config_begin(config_file);
  std::istreambuf_iterator<char> config_end;
  std::string spec(config_begin, config_end);
  SentencePieceNormalizer sp_normalizer = NormalizerFromSpec(
      spec, /*add_dummy_prefix=*/false, /*remove_extra_whitespaces=*/false,
      /*escape_whitespaces=*/false);

  // Plain two-word input is expected to come back unchanged.
  {
    std::string output;
    EXPECT_TRUE(sp_normalizer.Normalize("hello there", &output));
    EXPECT_EQ(output, "hello there");
  }

  // Runs of two spaces are expected to be preserved as-is.
  {
    std::string output;
    EXPECT_TRUE(sp_normalizer.Normalize("when is  the  world cup?", &output));
    EXPECT_EQ(output, "when is  the  world cup?");
  }

  // A tab is expected to come out as a plain space.
  {
    std::string output;
    EXPECT_TRUE(sp_normalizer.Normalize("general\tkenobi", &output));
    EXPECT_EQ(output, "general kenobi");
  }
}
196 
197 }  // namespace
198 }  // namespace libtextclassifier3
199