• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright 2020-2021 Huawei Technologies Co., Ltd
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include <memory>
18 #include <vector>
19 #include <string>
20 
21 #include "common/common.h"
22 #include "include/api/status.h"
23 #include "minddata/dataset/text/vocab.h"
24 
25 using mindspore::dataset::Tensor;
26 using mindspore::dataset::Vocab;
27 
28 class MindDataTestVocab : public UT::DatasetOpTesting {
29  protected:
30 };
31 
32 TEST_F(MindDataTestVocab, TestVocabFromUnorderedMap) {
33   MS_LOG(INFO) << "Doing MindDataTestVocab-TestVocabFromUnorderedMap.";
34   // Build a map
35   std::unordered_map<std::string, int32_t> dict;
36   dict["banana"] = 0;
37   dict["apple"] = 1;
38   dict["cat"] = 2;
39   dict["dog"] = 3;
40 
41   // Build vocab from map
42   std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
43   Status s = Vocab::BuildFromUnorderedMap(dict, &vocab);
44   EXPECT_EQ(s, Status::OK());
45 
46   // Look up specified words
47   std::vector<std::string> words = {"apple", "dog", "egg"};
48   std::vector<int64_t> expected = {1, 3, -1};
49   for (uint32_t i = 0; i < words.size(); ++i) {
50     int32_t x = vocab->Lookup(words[i]);
51     EXPECT_EQ(x, expected[i]);
52   }
53 }
54 
55 TEST_F(MindDataTestVocab, TestVocabFromEmptyMap) {
56   MS_LOG(INFO) << "Doing MindDataTestVocab-TestVocabFromEmptyMap.";
57   // Build vocab from empty map
58   std::unordered_map<std::string, int32_t> dict;
59   std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
60   Status s = Vocab::BuildFromUnorderedMap(dict, &vocab);
61   EXPECT_EQ(s, Status::OK());
62 
63   // Look up specified words
64   // Expect that we will return -1 when word is not in vocab
65   std::vector<std::string> words = {"apple", "dog", "egg"};
66   std::vector<int64_t> expected = {-1, -1, -1};
67   for (uint32_t i = 0; i < words.size(); ++i) {
68     int32_t x = vocab->Lookup(words[i]);
69     EXPECT_EQ(x, expected[i]);
70   }
71 }
72 
73 TEST_F(MindDataTestVocab, TestVocabFromMapFail) {
74   MS_LOG(INFO) << "Doing MindDataTestVocab-TestVocabFromMapFail.";
75   // Build a map
76   std::unordered_map<std::string, int32_t> dict;
77   dict["banana"] = 0;
78   dict["apple"] = -1;
79 
80   // Expected failure: index of word can not be negative
81   std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
82   Status s = Vocab::BuildFromUnorderedMap(dict, &vocab);
83   EXPECT_NE(s, Status::OK());
84 }
85 
86 TEST_F(MindDataTestVocab, TestVocabFromVectorPrependSpTokens) {
87   MS_LOG(INFO) << "Doing MindDataTestVocab-TestVocabFromVectorPrependSpTokens.";
88   // Build vocab from a vector of words, special tokens are prepended to vocab
89   std::vector<std::string> list = {"apple", "banana", "cat", "dog", "egg"};
90   std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
91   Status s = Vocab::BuildFromVector(list, {"<unk>"}, true, &vocab);
92   EXPECT_EQ(s, Status::OK());
93 
94   // Look up specified words
95   // Expect that we will return -1 when word is not in vocab
96   std::vector<std::string> words = {"apple", "banana", "fox"};
97   std::vector<int64_t> expected = {1, 2, -1};
98   for (uint32_t i = 0; i < words.size(); ++i) {
99     int32_t x = vocab->Lookup(words[i]);
100     EXPECT_EQ(x, expected[i]);
101   }
102 }
103 
104 TEST_F(MindDataTestVocab, TestVocabFromVectorAppendSpTokens) {
105   MS_LOG(INFO) << "Doing MindDataTestVocab-TestVocabFromVectorAppendSpTokens.";
106   // Build vocab from a vector of words, special tokens are appended to vocab
107   std::vector<std::string> list = {"apple", "banana", "cat", "dog", "egg"};
108   std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
109   Status s = Vocab::BuildFromVector(list, {"<unk>"}, false, &vocab);
110   EXPECT_EQ(s, Status::OK());
111 
112   // Look up specified words
113   std::vector<std::string> words = {"apple", "<unk>", "fox"};
114   std::vector<int64_t> expected = {0, 5, -1};
115   for (uint32_t i = 0; i < words.size(); ++i) {
116     int32_t x = vocab->Lookup(words[i]);
117     EXPECT_EQ(x, expected[i]);
118   }
119 }
120 
121 TEST_F(MindDataTestVocab, TestVocabFromVectorWithNoSpTokens) {
122   MS_LOG(INFO) << "Doing MindDataTestVocab-TestVocabFromVectorWithNoSpTokens.";
123   // Build vocab from a vector of words with no special tokens
124   std::vector<std::string> list = {"apple", "banana", "cat", "dog", "egg"};
125   std::vector<std::string> sp_tokens = {};
126   std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
127   Status s = Vocab::BuildFromVector(list, sp_tokens, true, &vocab);
128   EXPECT_EQ(s, Status::OK());
129 
130   // Look up specified words
131   std::vector<std::string> words = {"apple", "banana", "fox", "<pad>"};
132   std::vector<int64_t> expected = {0, 1, -1, -1};
133   for (uint32_t i = 0; i < words.size(); ++i) {
134     int32_t x = vocab->Lookup(words[i]);
135     EXPECT_EQ(x, expected[i]);
136   }
137 }
138 
139 TEST_F(MindDataTestVocab, TestVocabFromEmptyVector) {
140   MS_LOG(INFO) << "Doing MindDataTestVocab-TestVocabFromEmptyVector.";
141   // Build vocab from empty vector
142   std::vector<std::string> list = {};
143   std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
144   Status s = Vocab::BuildFromVector(list, {}, false, &vocab);
145   EXPECT_EQ(s, Status::OK());
146 
147   // Look up specified words
148   // Expect that we will return -1 when word is not in vocab
149   std::vector<std::string> words = {"apple", "banana", "fox"};
150   std::vector<int64_t> expected = {-1, -1, -1};
151   for (uint32_t i = 0; i < words.size(); ++i) {
152     int32_t x = vocab->Lookup(words[i]);
153     EXPECT_EQ(x, expected[i]);
154   }
155 }
156 
157 TEST_F(MindDataTestVocab, TestVocabFromVectorFail1) {
158   MS_LOG(INFO) << "Doing MindDataTestVocab-TestVocabFromVectorFail1.";
159   // Build vocab from a vector of words
160   std::vector<std::string> list = {"apple", "apple", "cat", "cat", "egg"};
161   std::vector<std::string> sp_tokens = {};
162   std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
163 
164   // Expected failure: duplicate word apple
165   Status s = Vocab::BuildFromVector(list, sp_tokens, true, &vocab);
166   EXPECT_NE(s, Status::OK());
167 }
168 
169 TEST_F(MindDataTestVocab, TestVocabFromVectorFail2) {
170   MS_LOG(INFO) << "Doing MindDataTestVocab-TestVocabFromVectorFail2.";
171   // Build vocab from a vector
172   std::vector<std::string> list = {"apple", "dog", "egg"};
173   std::vector<std::string> sp_tokens = {"<pad>", "<unk>", "<pad>", "<unk>", "<none>"};
174   std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
175 
176   // Expected failure: duplicate special token <pad> <unk>
177   Status s = Vocab::BuildFromVector(list, sp_tokens, true, &vocab);
178   EXPECT_NE(s, Status::OK());
179 }
180 
181 TEST_F(MindDataTestVocab, TestVocabFromVectorFail3) {
182   MS_LOG(INFO) << "Doing MindDataTestVocab-TestVocabFromVectorFail3.";
183   // Build vocab from a vector
184   std::vector<std::string> list = {"apple", "dog", "egg", "<unk>", ""};
185   std::vector<std::string> sp_tokens = {"", "<unk>"};
186   std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
187 
188   // Expected failure: special tokens are already existed in word_list
189   Status s = Vocab::BuildFromVector(list, sp_tokens, true, &vocab);
190   EXPECT_NE(s, Status::OK());
191 }
192 
193 TEST_F(MindDataTestVocab, TestVocabFromFile) {
194   MS_LOG(INFO) << "Doing MindDataTestVocab-TestVocabFromFile.";
195   // Build vocab from local file
196   std::string vocab_dir = datasets_root_path_ + "/testVocab/vocab_list.txt";
197   std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
198   Status s = Vocab::BuildFromFileCpp(vocab_dir, ",", -1, {"<pad>", "<unk>"}, true, &vocab);
199   EXPECT_EQ(s, Status::OK());
200 
201   // Look up specified words
202   std::vector<std::string> words = {"not", "all"};
203   std::vector<int64_t> expected = {2, 3};
204   for (uint32_t i = 0; i < words.size(); ++i) {
205     int32_t x = vocab->Lookup(words[i]);
206     EXPECT_EQ(x, expected[i]);
207   }
208 }
209 
210 TEST_F(MindDataTestVocab, TestVocabFromFileFail1) {
211   MS_LOG(INFO) << "Doing MindDataTestVocab-TestVocabFromFileFail1.";
212   // Build vocab from local file which is not exist
213   std::string vocab_dir = datasets_root_path_ + "/testVocab/not_exist.txt";
214   std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
215   Status s = Vocab::BuildFromFileCpp(vocab_dir, ",", -1, {}, true, &vocab);
216   EXPECT_NE(s, Status::OK());
217 }
218 
219 TEST_F(MindDataTestVocab, TestVocabFromFileFail2) {
220   MS_LOG(INFO) << "Doing MindDataTestVocab-TestVocabFromFileFail2.";
221   // Build vocab from local file
222   std::string vocab_dir = datasets_root_path_ + "/testVocab/vocab_list.txt";
223   std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
224 
225   // Expected failure: vocab_size should be either -1 or positive integer
226   Status s = Vocab::BuildFromFileCpp(vocab_dir, ",", -2, {}, true, &vocab);
227   EXPECT_NE(s, Status::OK());
228 }
229 
230 TEST_F(MindDataTestVocab, TestVocabFromFileFail3) {
231   MS_LOG(INFO) << "Doing MindDataTestVocab-TestVocabFromFileFail3.";
232   // Build vocab from local file
233   std::string vocab_dir = datasets_root_path_ + "/testVocab/vocab_list.txt";
234   std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
235 
236   // Expected failure: duplicate special token <unk>
237   Status s = Vocab::BuildFromFileCpp(vocab_dir, ",", -1, {"<unk>", "<unk>"}, true, &vocab);
238   EXPECT_NE(s, Status::OK());
239 }
240 
241 TEST_F(MindDataTestVocab, TestVocabFromFileFail4) {
242   MS_LOG(INFO) << "Doing MindDataTestVocab-TestVocabFromFileFail4.";
243   // Build vocab from local file
244   std::string vocab_dir = datasets_root_path_ + "/testVocab/vocab_list.txt";
245   std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
246 
247   // Expected failure: special_tokens and word_list contain duplicate word
248   Status s = Vocab::BuildFromFileCpp(vocab_dir, ",", -1, {"home"}, true, &vocab);
249   EXPECT_NE(s, Status::OK());
250 }
251