1 /** 2 * Copyright 2020-2021 Huawei Technologies Co., Ltd 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #include <memory> 18 #include <vector> 19 #include <string> 20 21 #include "common/common.h" 22 #include "include/api/status.h" 23 #include "minddata/dataset/text/vocab.h" 24 25 using mindspore::dataset::Tensor; 26 using mindspore::dataset::Vocab; 27 28 class MindDataTestVocab : public UT::DatasetOpTesting { 29 protected: 30 }; 31 32 TEST_F(MindDataTestVocab, TestVocabFromUnorderedMap) { 33 MS_LOG(INFO) << "Doing MindDataTestVocab-TestVocabFromUnorderedMap."; 34 // Build a map 35 std::unordered_map<std::string, int32_t> dict; 36 dict["banana"] = 0; 37 dict["apple"] = 1; 38 dict["cat"] = 2; 39 dict["dog"] = 3; 40 41 // Build vocab from map 42 std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>(); 43 Status s = Vocab::BuildFromUnorderedMap(dict, &vocab); 44 EXPECT_EQ(s, Status::OK()); 45 46 // Look up specified words 47 std::vector<std::string> words = {"apple", "dog", "egg"}; 48 std::vector<int64_t> expected = {1, 3, -1}; 49 for (uint32_t i = 0; i < words.size(); ++i) { 50 int32_t x = vocab->Lookup(words[i]); 51 EXPECT_EQ(x, expected[i]); 52 } 53 } 54 55 TEST_F(MindDataTestVocab, TestVocabFromEmptyMap) { 56 MS_LOG(INFO) << "Doing MindDataTestVocab-TestVocabFromEmptyMap."; 57 // Build vocab from empty map 58 std::unordered_map<std::string, int32_t> dict; 59 std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>(); 60 Status s = Vocab::BuildFromUnorderedMap(dict, &vocab); 61 EXPECT_EQ(s, Status::OK()); 62 63 // Look up specified words 64 // Expect that we will return -1 when word is not in vocab 65 std::vector<std::string> words = {"apple", "dog", "egg"}; 66 std::vector<int64_t> expected = {-1, -1, -1}; 67 for (uint32_t i = 0; i < words.size(); ++i) { 68 int32_t x = vocab->Lookup(words[i]); 69 EXPECT_EQ(x, expected[i]); 70 } 71 } 72 73 TEST_F(MindDataTestVocab, TestVocabFromMapFail) { 74 MS_LOG(INFO) << "Doing MindDataTestVocab-TestVocabFromMapFail."; 75 // Build a map 76 std::unordered_map<std::string, int32_t> dict; 77 dict["banana"] = 0; 78 dict["apple"] = -1; 79 80 // Expected failure: index of word can not be negative 81 std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>(); 82 Status s = Vocab::BuildFromUnorderedMap(dict, &vocab); 83 EXPECT_NE(s, Status::OK()); 84 } 85 86 TEST_F(MindDataTestVocab, TestVocabFromVectorPrependSpTokens) { 87 MS_LOG(INFO) << "Doing MindDataTestVocab-TestVocabFromVectorPrependSpTokens."; 88 // Build vocab from a vector of words, special tokens are prepended to vocab 89 std::vector<std::string> list = {"apple", "banana", "cat", "dog", "egg"}; 90 std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>(); 91 Status s = Vocab::BuildFromVector(list, {"<unk>"}, true, &vocab); 92 EXPECT_EQ(s, Status::OK()); 93 94 // Look up specified words 95 // Expect that we will return -1 when word is not in vocab 96 std::vector<std::string> words = {"apple", "banana", "fox"}; 97 std::vector<int64_t> expected = {1, 2, -1}; 98 for (uint32_t i = 0; i < words.size(); ++i) { 99 int32_t x = vocab->Lookup(words[i]); 100 EXPECT_EQ(x, expected[i]); 101 } 102 } 103 104 TEST_F(MindDataTestVocab, TestVocabFromVectorAppendSpTokens) { 105 MS_LOG(INFO) << "Doing MindDataTestVocab-TestVocabFromVectorAppendSpTokens."; 106 // Build vocab from a vector of words, special tokens are appended to vocab 107 std::vector<std::string> list = {"apple", "banana", "cat", "dog", "egg"}; 108 std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>(); 109 Status s = Vocab::BuildFromVector(list, {"<unk>"}, false, &vocab); 110 EXPECT_EQ(s, Status::OK()); 111 112 // Look up specified words 113 std::vector<std::string> words = {"apple", "<unk>", "fox"}; 114 std::vector<int64_t> expected = {0, 5, -1}; 115 for (uint32_t i = 0; i < words.size(); ++i) { 116 int32_t x = vocab->Lookup(words[i]); 117 EXPECT_EQ(x, expected[i]); 118 } 119 } 120 121 TEST_F(MindDataTestVocab, TestVocabFromVectorWithNoSpTokens) { 122 MS_LOG(INFO) << "Doing MindDataTestVocab-TestVocabFromVectorWithNoSpTokens."; 123 // Build vocab from a vector of words with no special tokens 124 std::vector<std::string> list = {"apple", "banana", "cat", "dog", "egg"}; 125 std::vector<std::string> sp_tokens = {}; 126 std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>(); 127 Status s = Vocab::BuildFromVector(list, sp_tokens, true, &vocab); 128 EXPECT_EQ(s, Status::OK()); 129 130 // Look up specified words 131 std::vector<std::string> words = {"apple", "banana", "fox", "<pad>"}; 132 std::vector<int64_t> expected = {0, 1, -1, -1}; 133 for (uint32_t i = 0; i < words.size(); ++i) { 134 int32_t x = vocab->Lookup(words[i]); 135 EXPECT_EQ(x, expected[i]); 136 } 137 } 138 139 TEST_F(MindDataTestVocab, TestVocabFromEmptyVector) { 140 MS_LOG(INFO) << "Doing MindDataTestVocab-TestVocabFromEmptyVector."; 141 // Build vocab from empty vector 142 std::vector<std::string> list = {}; 143 std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>(); 144 Status s = Vocab::BuildFromVector(list, {}, false, &vocab); 145 EXPECT_EQ(s, Status::OK()); 146 147 // Look up specified words 148 // Expect that we will return -1 when word is not in vocab 149 std::vector<std::string> words = {"apple", "banana", "fox"}; 150 std::vector<int64_t> expected = {-1, -1, -1}; 151 for (uint32_t i = 0; i < words.size(); ++i) { 152 int32_t x = vocab->Lookup(words[i]); 153 EXPECT_EQ(x, expected[i]); 154 } 155 } 156 157 TEST_F(MindDataTestVocab, TestVocabFromVectorFail1) { 158 MS_LOG(INFO) << "Doing MindDataTestVocab-TestVocabFromVectorFail1."; 159 // Build vocab from a vector of words 160 std::vector<std::string> list = {"apple", "apple", "cat", "cat", "egg"}; 161 std::vector<std::string> sp_tokens = {}; 162 std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>(); 163 164 // Expected failure: duplicate word apple 165 Status s = Vocab::BuildFromVector(list, sp_tokens, true, &vocab); 166 EXPECT_NE(s, Status::OK()); 167 } 168 169 TEST_F(MindDataTestVocab, TestVocabFromVectorFail2) { 170 MS_LOG(INFO) << "Doing MindDataTestVocab-TestVocabFromVectorFail2."; 171 // Build vocab from a vector 172 std::vector<std::string> list = {"apple", "dog", "egg"}; 173 std::vector<std::string> sp_tokens = {"<pad>", "<unk>", "<pad>", "<unk>", "<none>"}; 174 std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>(); 175 176 // Expected failure: duplicate special token <pad> <unk> 177 Status s = Vocab::BuildFromVector(list, sp_tokens, true, &vocab); 178 EXPECT_NE(s, Status::OK()); 179 } 180 181 TEST_F(MindDataTestVocab, TestVocabFromVectorFail3) { 182 MS_LOG(INFO) << "Doing MindDataTestVocab-TestVocabFromVectorFail3."; 183 // Build vocab from a vector 184 std::vector<std::string> list = {"apple", "dog", "egg", "<unk>", ""}; 185 std::vector<std::string> sp_tokens = {"", "<unk>"}; 186 std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>(); 187 188 // Expected failure: special tokens are already existed in word_list 189 Status s = Vocab::BuildFromVector(list, sp_tokens, true, &vocab); 190 EXPECT_NE(s, Status::OK()); 191 } 192 193 TEST_F(MindDataTestVocab, TestVocabFromFile) { 194 MS_LOG(INFO) << "Doing MindDataTestVocab-TestVocabFromFile."; 195 // Build vocab from local file 196 std::string vocab_dir = datasets_root_path_ + "/testVocab/vocab_list.txt"; 197 std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>(); 198 Status s = Vocab::BuildFromFileCpp(vocab_dir, ",", -1, {"<pad>", "<unk>"}, true, &vocab); 199 EXPECT_EQ(s, Status::OK()); 200 201 // Look up specified words 202 std::vector<std::string> words = {"not", "all"}; 203 std::vector<int64_t> expected = {2, 3}; 204 for (uint32_t i = 0; i < words.size(); ++i) { 205 int32_t x = vocab->Lookup(words[i]); 206 EXPECT_EQ(x, expected[i]); 207 } 208 } 209 210 TEST_F(MindDataTestVocab, TestVocabFromFileFail1) { 211 MS_LOG(INFO) << "Doing MindDataTestVocab-TestVocabFromFileFail1."; 212 // Build vocab from local file which is not exist 213 std::string vocab_dir = datasets_root_path_ + "/testVocab/not_exist.txt"; 214 std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>(); 215 Status s = Vocab::BuildFromFileCpp(vocab_dir, ",", -1, {}, true, &vocab); 216 EXPECT_NE(s, Status::OK()); 217 } 218 219 TEST_F(MindDataTestVocab, TestVocabFromFileFail2) { 220 MS_LOG(INFO) << "Doing MindDataTestVocab-TestVocabFromFileFail2."; 221 // Build vocab from local file 222 std::string vocab_dir = datasets_root_path_ + "/testVocab/vocab_list.txt"; 223 std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>(); 224 225 // Expected failure: vocab_size should be either -1 or positive integer 226 Status s = Vocab::BuildFromFileCpp(vocab_dir, ",", -2, {}, true, &vocab); 227 EXPECT_NE(s, Status::OK()); 228 } 229 230 TEST_F(MindDataTestVocab, TestVocabFromFileFail3) { 231 MS_LOG(INFO) << "Doing MindDataTestVocab-TestVocabFromFileFail3."; 232 // Build vocab from local file 233 std::string vocab_dir = datasets_root_path_ + "/testVocab/vocab_list.txt"; 234 std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>(); 235 236 // Expected failure: duplicate special token <unk> 237 Status s = Vocab::BuildFromFileCpp(vocab_dir, ",", -1, {"<unk>", "<unk>"}, true, &vocab); 238 EXPECT_NE(s, Status::OK()); 239 } 240 241 TEST_F(MindDataTestVocab, TestVocabFromFileFail4) { 242 MS_LOG(INFO) << "Doing MindDataTestVocab-TestVocabFromFileFail4."; 243 // Build vocab from local file 244 std::string vocab_dir = datasets_root_path_ + "/testVocab/vocab_list.txt"; 245 std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>(); 246 247 // Expected failure: special_tokens and word_list contain duplicate word 248 Status s = Vocab::BuildFromFileCpp(vocab_dir, ",", -1, {"home"}, true, &vocab); 249 EXPECT_NE(s, Status::OK()); 250 } 251