1 /**
2 * Copyright 2020-2021 Huawei Technologies Co., Ltd
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <memory>
18 #include <vector>
19 #include <string>
20
21 #include "common/common.h"
22 #include "include/api/status.h"
23 #include "minddata/dataset/text/vocab.h"
24
25 using mindspore::dataset::Tensor;
26 using mindspore::dataset::Vocab;
27
28 class MindDataTestVocab : public UT::DatasetOpTesting {
29 protected:
30 };
31
// Builds a vocab from a word-to-id map and verifies lookups of known and unknown words.
TEST_F(MindDataTestVocab, TestVocabFromUnorderedMap) {
  MS_LOG(INFO) << "Doing MindDataTestVocab-TestVocabFromUnorderedMap.";
  // Word-to-id mapping that serves as the vocab source
  std::unordered_map<std::string, int32_t> word_to_id = {{"banana", 0}, {"apple", 1}, {"cat", 2}, {"dog", 3}};

  // Construct the vocab from the mapping; the build is expected to succeed
  auto vocab = std::make_shared<Vocab>();
  Status status = Vocab::BuildFromUnorderedMap(word_to_id, &vocab);
  EXPECT_EQ(status, Status::OK());

  // "apple" and "dog" are in the map; "egg" is not and is expected to yield -1
  const std::vector<std::string> queries = {"apple", "dog", "egg"};
  const std::vector<int64_t> expected_ids = {1, 3, -1};
  size_t idx = 0;
  for (const std::string &word : queries) {
    const int32_t id = vocab->Lookup(word);
    EXPECT_EQ(id, expected_ids[idx]);
    ++idx;
  }
}
54
// Builds a vocab from an empty map and verifies that every lookup misses.
TEST_F(MindDataTestVocab, TestVocabFromEmptyMap) {
  MS_LOG(INFO) << "Doing MindDataTestVocab-TestVocabFromEmptyMap.";
  // An empty mapping is used as the source; the build is still expected to succeed
  std::unordered_map<std::string, int32_t> empty_map;
  auto vocab = std::make_shared<Vocab>();
  Status status = Vocab::BuildFromUnorderedMap(empty_map, &vocab);
  EXPECT_EQ(status, Status::OK());

  // No word was registered, so each query is expected to return -1
  const std::vector<std::string> queries = {"apple", "dog", "egg"};
  const std::vector<int64_t> expected_ids(queries.size(), -1);
  size_t idx = 0;
  for (const std::string &word : queries) {
    const int32_t id = vocab->Lookup(word);
    EXPECT_EQ(id, expected_ids[idx]);
    ++idx;
  }
}
72
// Verifies that building a vocab from a map containing a negative index is rejected.
TEST_F(MindDataTestVocab, TestVocabFromMapFail) {
  MS_LOG(INFO) << "Doing MindDataTestVocab-TestVocabFromMapFail.";
  // Mapping in which "apple" carries an invalid (negative) index
  std::unordered_map<std::string, int32_t> word_to_id = {{"banana", 0}, {"apple", -1}};

  // Expected failure: a word index cannot be negative
  auto vocab = std::make_shared<Vocab>();
  Status status = Vocab::BuildFromUnorderedMap(word_to_id, &vocab);
  EXPECT_NE(status, Status::OK());
}
85
// Builds a vocab from a word list with the special token "<unk>" prepended and verifies lookups.
TEST_F(MindDataTestVocab, TestVocabFromVectorPrependSpTokens) {
  MS_LOG(INFO) << "Doing MindDataTestVocab-TestVocabFromVectorPrependSpTokens.";
  // Source words plus one special token that goes in front of them
  std::vector<std::string> word_list = {"apple", "banana", "cat", "dog", "egg"};
  std::vector<std::string> special_tokens = {"<unk>"};
  const bool prepend_special = true;
  auto vocab = std::make_shared<Vocab>();
  Status status = Vocab::BuildFromVector(word_list, special_tokens, prepend_special, &vocab);
  EXPECT_EQ(status, Status::OK());

  // With "<unk>" in front, "apple" and "banana" are expected at ids 1 and 2;
  // "fox" is not in the vocab and is expected to return -1
  const std::vector<std::string> queries = {"apple", "banana", "fox"};
  const std::vector<int64_t> expected_ids = {1, 2, -1};
  size_t idx = 0;
  for (const std::string &word : queries) {
    const int32_t id = vocab->Lookup(word);
    EXPECT_EQ(id, expected_ids[idx]);
    ++idx;
  }
}
103
// Builds a vocab from a word list with the special token "<unk>" appended and verifies lookups.
TEST_F(MindDataTestVocab, TestVocabFromVectorAppendSpTokens) {
  MS_LOG(INFO) << "Doing MindDataTestVocab-TestVocabFromVectorAppendSpTokens.";
  // Source words plus one special token that goes after them
  std::vector<std::string> word_list = {"apple", "banana", "cat", "dog", "egg"};
  std::vector<std::string> special_tokens = {"<unk>"};
  const bool prepend_special = false;
  auto vocab = std::make_shared<Vocab>();
  Status status = Vocab::BuildFromVector(word_list, special_tokens, prepend_special, &vocab);
  EXPECT_EQ(status, Status::OK());

  // "apple" is expected at id 0 and the appended "<unk>" at id 5 (after the five words);
  // "fox" is not in the vocab and is expected to return -1
  const std::vector<std::string> queries = {"apple", "<unk>", "fox"};
  const std::vector<int64_t> expected_ids = {0, 5, -1};
  size_t idx = 0;
  for (const std::string &word : queries) {
    const int32_t id = vocab->Lookup(word);
    EXPECT_EQ(id, expected_ids[idx]);
    ++idx;
  }
}
120
// Builds a vocab from a word list without any special tokens and verifies lookups.
TEST_F(MindDataTestVocab, TestVocabFromVectorWithNoSpTokens) {
  MS_LOG(INFO) << "Doing MindDataTestVocab-TestVocabFromVectorWithNoSpTokens.";
  // Source words; the special-token list is deliberately left empty
  std::vector<std::string> word_list = {"apple", "banana", "cat", "dog", "egg"};
  std::vector<std::string> no_special_tokens;
  auto vocab = std::make_shared<Vocab>();
  Status status = Vocab::BuildFromVector(word_list, no_special_tokens, true, &vocab);
  EXPECT_EQ(status, Status::OK());

  // "apple" and "banana" are expected at ids 0 and 1;
  // "fox" and "<pad>" were never added and are expected to return -1
  const std::vector<std::string> queries = {"apple", "banana", "fox", "<pad>"};
  const std::vector<int64_t> expected_ids = {0, 1, -1, -1};
  size_t idx = 0;
  for (const std::string &word : queries) {
    const int32_t id = vocab->Lookup(word);
    EXPECT_EQ(id, expected_ids[idx]);
    ++idx;
  }
}
138
// Builds a vocab from an empty word list and verifies that every lookup misses.
TEST_F(MindDataTestVocab, TestVocabFromEmptyVector) {
  MS_LOG(INFO) << "Doing MindDataTestVocab-TestVocabFromEmptyVector.";
  // Neither words nor special tokens are supplied; the build is still expected to succeed
  std::vector<std::string> empty_word_list;
  std::vector<std::string> no_special_tokens;
  auto vocab = std::make_shared<Vocab>();
  Status status = Vocab::BuildFromVector(empty_word_list, no_special_tokens, false, &vocab);
  EXPECT_EQ(status, Status::OK());

  // No word was registered, so each query is expected to return -1
  const std::vector<std::string> queries = {"apple", "banana", "fox"};
  const std::vector<int64_t> expected_ids(queries.size(), -1);
  size_t idx = 0;
  for (const std::string &word : queries) {
    const int32_t id = vocab->Lookup(word);
    EXPECT_EQ(id, expected_ids[idx]);
    ++idx;
  }
}
156
// Verifies that building a vocab from a word list with repeated words is rejected.
TEST_F(MindDataTestVocab, TestVocabFromVectorFail1) {
  MS_LOG(INFO) << "Doing MindDataTestVocab-TestVocabFromVectorFail1.";
  auto vocab = std::make_shared<Vocab>();
  // Word list in which "apple" and "cat" each appear twice; no special tokens
  std::vector<std::string> word_list = {"apple", "apple", "cat", "cat", "egg"};
  std::vector<std::string> no_special_tokens;

  // Expected failure: the word list contains duplicate words
  Status status = Vocab::BuildFromVector(word_list, no_special_tokens, true, &vocab);
  EXPECT_NE(status, Status::OK());
}
168
// Verifies that building a vocab with repeated special tokens is rejected.
TEST_F(MindDataTestVocab, TestVocabFromVectorFail2) {
  MS_LOG(INFO) << "Doing MindDataTestVocab-TestVocabFromVectorFail2.";
  auto vocab = std::make_shared<Vocab>();
  // Valid word list, but "<pad>" and "<unk>" each appear twice among the special tokens
  std::vector<std::string> word_list = {"apple", "dog", "egg"};
  std::vector<std::string> special_tokens = {"<pad>", "<unk>", "<pad>", "<unk>", "<none>"};

  // Expected failure: duplicate special tokens <pad> and <unk>
  Status status = Vocab::BuildFromVector(word_list, special_tokens, true, &vocab);
  EXPECT_NE(status, Status::OK());
}
180
// Verifies that building a vocab is rejected when special tokens also occur in the word list.
TEST_F(MindDataTestVocab, TestVocabFromVectorFail3) {
  MS_LOG(INFO) << "Doing MindDataTestVocab-TestVocabFromVectorFail3.";
  auto vocab = std::make_shared<Vocab>();
  // Both special tokens ("" and "<unk>") are also present in the word list
  std::vector<std::string> word_list = {"apple", "dog", "egg", "<unk>", ""};
  std::vector<std::string> special_tokens = {"", "<unk>"};

  // Expected failure: the special tokens already exist in the word list
  Status status = Vocab::BuildFromVector(word_list, special_tokens, true, &vocab);
  EXPECT_NE(status, Status::OK());
}
192
// Builds a vocab from a local vocab file with two prepended special tokens and verifies lookups.
TEST_F(MindDataTestVocab, TestVocabFromFile) {
  MS_LOG(INFO) << "Doing MindDataTestVocab-TestVocabFromFile.";
  // Path of the vocab file under the test dataset root
  std::string vocab_path = datasets_root_path_ + "/testVocab/vocab_list.txt";
  auto vocab = std::make_shared<Vocab>();
  Status status = Vocab::BuildFromFileCpp(vocab_path, ",", -1, {"<pad>", "<unk>"}, true, &vocab);
  EXPECT_EQ(status, Status::OK());

  // "not" and "all" are expected at ids 2 and 3, i.e. right after the two special tokens
  // NOTE(review): presumably these are the first two words in vocab_list.txt - confirm against the data file
  const std::vector<std::string> queries = {"not", "all"};
  const std::vector<int64_t> expected_ids = {2, 3};
  size_t idx = 0;
  for (const std::string &word : queries) {
    const int32_t id = vocab->Lookup(word);
    EXPECT_EQ(id, expected_ids[idx]);
    ++idx;
  }
}
209
// Verifies that building a vocab from a file that does not exist is rejected.
TEST_F(MindDataTestVocab, TestVocabFromFileFail1) {
  MS_LOG(INFO) << "Doing MindDataTestVocab-TestVocabFromFileFail1.";
  auto vocab = std::make_shared<Vocab>();
  // Expected failure: the vocab file does not exist
  std::string missing_path = datasets_root_path_ + "/testVocab/not_exist.txt";
  Status status = Vocab::BuildFromFileCpp(missing_path, ",", -1, {}, true, &vocab);
  EXPECT_NE(status, Status::OK());
}
218
// Verifies that building a vocab from a file with an invalid vocab_size (-2) is rejected.
TEST_F(MindDataTestVocab, TestVocabFromFileFail2) {
  MS_LOG(INFO) << "Doing MindDataTestVocab-TestVocabFromFileFail2.";
  auto vocab = std::make_shared<Vocab>();
  // Path of the vocab file under the test dataset root
  std::string vocab_path = datasets_root_path_ + "/testVocab/vocab_list.txt";

  // Expected failure: vocab_size must be either -1 or a positive integer
  Status status = Vocab::BuildFromFileCpp(vocab_path, ",", -2, {}, true, &vocab);
  EXPECT_NE(status, Status::OK());
}
229
// Verifies that building a vocab from a file with a repeated special token is rejected.
TEST_F(MindDataTestVocab, TestVocabFromFileFail3) {
  MS_LOG(INFO) << "Doing MindDataTestVocab-TestVocabFromFileFail3.";
  auto vocab = std::make_shared<Vocab>();
  // Path of the vocab file under the test dataset root
  std::string vocab_path = datasets_root_path_ + "/testVocab/vocab_list.txt";

  // Expected failure: the special token <unk> is listed twice
  Status status = Vocab::BuildFromFileCpp(vocab_path, ",", -1, {"<unk>", "<unk>"}, true, &vocab);
  EXPECT_NE(status, Status::OK());
}
240
// Verifies that building a vocab from a file is rejected when a special token duplicates a word.
TEST_F(MindDataTestVocab, TestVocabFromFileFail4) {
  MS_LOG(INFO) << "Doing MindDataTestVocab-TestVocabFromFileFail4.";
  auto vocab = std::make_shared<Vocab>();
  // Path of the vocab file under the test dataset root
  std::string vocab_path = datasets_root_path_ + "/testVocab/vocab_list.txt";

  // Expected failure: special_tokens and the word list share a word
  // NOTE(review): presumably "home" is one of the words in vocab_list.txt - confirm against the data file
  Status status = Vocab::BuildFromFileCpp(vocab_path, ",", -1, {"home"}, true, &vocab);
  EXPECT_NE(status, Status::OK());
}
251