1 /*
2 * Copyright (C) 2018 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <fstream>
18 #include <string>
19
20 #include "gmock/gmock.h"
21 #include "gtest/gtest.h"
22
23 #include "utils/sentencepiece/double_array_trie.h"
24 #include "utils/sentencepiece/normalizer.h"
25 #include "utils/sentencepiece/test_utils.h"
26 #include "utils/strings/stringpiece.h"
27
28 namespace libtextclassifier3 {
29 namespace {
30
// Path handed to std::ifstream by the tests in this file to load the
// normalizer spec. It is the empty string here.
std::string GetTestConfigPath() {
  return std::string();
}
34
// Exercises the normalizer with every option enabled: the expected strings
// carry a leading "▁" (dummy prefix), have single separators where the input
// has runs of whitespace, and use "▁" in place of whitespace.
TEST(NormalizerTest, NormalizesAsReferenceNormalizer) {
  // Read the whole spec file into memory. The stream state is not checked, so
  // a path that cannot be opened results in an empty `config`.
  std::ifstream test_config_stream(GetTestConfigPath());
  std::string config((std::istreambuf_iterator<char>(test_config_stream)),
                     (std::istreambuf_iterator<char>()));
  SentencePieceNormalizer normalizer =
      NormalizerFromSpec(config, /*add_dummy_prefix=*/true,
                         /*remove_extra_whitespaces=*/true,
                         /*escape_whitespaces=*/true);

  // Plain ASCII text.
  {
    std::string normalized;
    EXPECT_TRUE(normalizer.Normalize("hello there", &normalized));
    EXPECT_EQ(normalized, "▁hello▁there");
  }

  // Redundant whitespace: the doubled spaces must collapse to one separator.
  {
    std::string normalized;
    EXPECT_TRUE(normalizer.Normalize("when is  the  world cup?", &normalized));
    EXPECT_EQ(normalized, "▁when▁is▁the▁world▁cup?");
  }

  // Different whitespace.
  {
    std::string normalized;
    EXPECT_TRUE(normalizer.Normalize("general\tkenobi", &normalized));
    EXPECT_EQ(normalized, "▁general▁kenobi");
  }

  // NFKC char to multi-char normalization.
  {
    std::string normalized;
    EXPECT_TRUE(normalizer.Normalize("㍿", &normalized));
    EXPECT_EQ(normalized, "▁株式会社");
  }

  // Half width katakana, character composition happens. The input must be
  // the half-width form so that the full-width expectation is meaningful.
  {
    std::string normalized;
    EXPECT_TRUE(normalizer.Normalize(" ｸﾞｰｸﾞﾙ ", &normalized));
    EXPECT_EQ(normalized, "▁グーグル");
  }

  // NFKC char to char normalization.
  {
    std::string normalized;
    EXPECT_TRUE(normalizer.Normalize("①②③", &normalized));
    EXPECT_EQ(normalized, "▁123");
  }
}
84
// Same cases as the all-options test, but with add_dummy_prefix disabled: the
// expected strings have no leading "▁".
TEST(NormalizerTest, NoDummyPrefix) {
  // Read the whole spec file into memory. The stream state is not checked, so
  // a path that cannot be opened results in an empty `config`.
  std::ifstream test_config_stream(GetTestConfigPath());
  std::string config((std::istreambuf_iterator<char>(test_config_stream)),
                     (std::istreambuf_iterator<char>()));
  SentencePieceNormalizer normalizer =
      NormalizerFromSpec(config, /*add_dummy_prefix=*/false,
                         /*remove_extra_whitespaces=*/true,
                         /*escape_whitespaces=*/true);

  // Plain ASCII text.
  {
    std::string normalized;
    EXPECT_TRUE(normalizer.Normalize("hello there", &normalized));
    EXPECT_EQ(normalized, "hello▁there");
  }

  // Redundant whitespace: the doubled spaces must collapse to one separator.
  {
    std::string normalized;
    EXPECT_TRUE(normalizer.Normalize("when is  the  world cup?", &normalized));
    EXPECT_EQ(normalized, "when▁is▁the▁world▁cup?");
  }

  // Different whitespace.
  {
    std::string normalized;
    EXPECT_TRUE(normalizer.Normalize("general\tkenobi", &normalized));
    EXPECT_EQ(normalized, "general▁kenobi");
  }

  // NFKC char to multi-char normalization.
  {
    std::string normalized;
    EXPECT_TRUE(normalizer.Normalize("㍿", &normalized));
    EXPECT_EQ(normalized, "株式会社");
  }

  // Half width katakana, character composition happens. The input must be
  // the half-width form so that the full-width expectation is meaningful.
  {
    std::string normalized;
    EXPECT_TRUE(normalizer.Normalize(" ｸﾞｰｸﾞﾙ ", &normalized));
    EXPECT_EQ(normalized, "グーグル");
  }

  // NFKC char to char normalization.
  {
    std::string normalized;
    EXPECT_TRUE(normalizer.Normalize("①②③", &normalized));
    EXPECT_EQ(normalized, "123");
  }
}
136
// With remove_extra_whitespaces disabled (and no dummy prefix), every
// whitespace character in the input is expected to show up as its own "▁".
TEST(NormalizerTest, NoRemoveExtraWhitespace) {
  // Read the whole spec file into memory. The stream state is not checked, so
  // a path that cannot be opened results in an empty `config`.
  std::ifstream test_config_stream(GetTestConfigPath());
  std::string config((std::istreambuf_iterator<char>(test_config_stream)),
                     (std::istreambuf_iterator<char>()));
  SentencePieceNormalizer normalizer =
      NormalizerFromSpec(config, /*add_dummy_prefix=*/false,
                         /*remove_extra_whitespaces=*/false,
                         /*escape_whitespaces=*/true);

  // Plain ASCII text.
  {
    std::string normalized;
    EXPECT_TRUE(normalizer.Normalize("hello there", &normalized));
    EXPECT_EQ(normalized, "hello▁there");
  }

  // Redundant whitespace: the doubled spaces in the input must survive as
  // doubled "▁" in the output.
  {
    std::string normalized;
    EXPECT_TRUE(normalizer.Normalize("when is  the  world cup?", &normalized));
    EXPECT_EQ(normalized, "when▁is▁▁the▁▁world▁cup?");
  }

  // Different whitespace.
  {
    std::string normalized;
    EXPECT_TRUE(normalizer.Normalize("general\tkenobi", &normalized));
    EXPECT_EQ(normalized, "general▁kenobi");
  }
}
166
// With all three options disabled, the expected strings keep plain spaces
// instead of "▁", and runs of spaces are expected to be left as they are.
TEST(NormalizerTest, NoEscapeWhitespaces) {
  // Read the whole spec file into memory. The stream state is not checked, so
  // a path that cannot be opened results in an empty `config`.
  std::ifstream test_config_stream(GetTestConfigPath());
  std::string config((std::istreambuf_iterator<char>(test_config_stream)),
                     (std::istreambuf_iterator<char>()));
  SentencePieceNormalizer normalizer =
      NormalizerFromSpec(config, /*add_dummy_prefix=*/false,
                         /*remove_extra_whitespaces=*/false,
                         /*escape_whitespaces=*/false);

  // Plain ASCII text.
  {
    std::string normalized;
    EXPECT_TRUE(normalizer.Normalize("hello there", &normalized));
    EXPECT_EQ(normalized, "hello there");
  }

  // Redundant whitespace: extra-whitespace removal is off, so the doubled
  // spaces are expected to be preserved.
  {
    std::string normalized;
    EXPECT_TRUE(normalizer.Normalize("when is  the  world cup?", &normalized));
    EXPECT_EQ(normalized, "when is  the  world cup?");
  }

  // Different whitespace: the tab is expected to come back as a plain space.
  {
    std::string normalized;
    EXPECT_TRUE(normalizer.Normalize("general\tkenobi", &normalized));
    EXPECT_EQ(normalized, "general kenobi");
  }
}
196
197 } // namespace
198 } // namespace libtextclassifier3
199