1 /** 2 * Copyright 2020 Huawei Technologies Co., Ltd 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 #include <memory> 17 #include <string> 18 #include <string_view> 19 20 #include "common/common.h" 21 #include "minddata/dataset/text/kernels/basic_tokenizer_op.h" 22 #include "minddata/dataset/text/kernels/case_fold_op.h" 23 #include "minddata/dataset/text/kernels/normalize_utf8_op.h" 24 #include "minddata/dataset/text/kernels/regex_replace_op.h" 25 #include "minddata/dataset/text/kernels/regex_tokenizer_op.h" 26 #include "minddata/dataset/text/kernels/unicode_char_tokenizer_op.h" 27 #include "minddata/dataset/text/kernels/unicode_script_tokenizer_op.h" 28 #include "minddata/dataset/text/kernels/whitespace_tokenizer_op.h" 29 #include "gtest/gtest.h" 30 #include "utils/log_adapter.h" 31 32 using namespace mindspore::dataset; 33 34 class MindDataTestTokenizerOp : public UT::Common { 35 public: 36 void CheckEqual(const std::shared_ptr<Tensor> &o, 37 const std::vector<dsize_t> &index, 38 const std::string &expect) { 39 std::string_view str; 40 Status s = o->GetItemAt(&str, index); 41 EXPECT_TRUE(s.IsOk()); 42 EXPECT_EQ(str, expect); 43 } 44 }; 45 46 TEST_F(MindDataTestTokenizerOp, TestUnicodeCharTokenizerOp) { 47 MS_LOG(INFO) << "Doing TestUnicodeCharTokenizerOp."; 48 std::unique_ptr<UnicodeCharTokenizerOp> op(new UnicodeCharTokenizerOp(true)); 49 std::shared_ptr<Tensor> input; 50 Tensor::CreateScalar<std::string>("Hello World!", &input); TensorRow output; 51 Status s = op->Compute(TensorRow(0, {input}), &output); 52 EXPECT_TRUE(s.IsOk()); 53 EXPECT_EQ(output[0]->Size(), 12); 54 EXPECT_EQ(output[0]->Rank(), 1); 55 MS_LOG(INFO) << "Out tensor1: " << output[0]->ToString(); 56 CheckEqual(output[0], {0}, "H"); 57 CheckEqual(output[0], {1}, "e"); 58 CheckEqual(output[0], {2}, "l"); 59 CheckEqual(output[0], {3}, "l"); 60 CheckEqual(output[0], {4}, "o"); 61 CheckEqual(output[0], {5}, " "); 62 CheckEqual(output[0], {6}, "W"); 63 CheckEqual(output[0], {7}, "o"); 64 CheckEqual(output[0], {8}, "r"); 65 CheckEqual(output[0], {9}, "l"); 66 CheckEqual(output[0], {10}, "d"); 67 CheckEqual(output[0], {11}, "!"); 68 69 Tensor::CreateScalar<std::string>("中国 你好!", &input); 70 output.clear(); 71 s = op->Compute(TensorRow(0, {input}), &output); 72 EXPECT_TRUE(s.IsOk()); 73 EXPECT_EQ(output[0]->Size(), 6); 74 EXPECT_EQ(output[0]->Rank(), 1); 75 MS_LOG(INFO) << "Out tensor2: " << output[0]->ToString(); 76 CheckEqual(output[0], {0}, "中"); 77 CheckEqual(output[0], {1}, "国"); 78 CheckEqual(output[0], {2}, " "); 79 CheckEqual(output[0], {3}, "你"); 80 CheckEqual(output[0], {4}, "好"); 81 CheckEqual(output[0], {5}, "!"); 82 83 Tensor::CreateScalar<std::string>("中", &input); 84 output.clear(); 85 s = op->Compute(TensorRow(0, {input}), &output); 86 EXPECT_TRUE(s.IsOk()); 87 EXPECT_EQ(output[0]->Size(), 1); 88 EXPECT_EQ(output[0]->Rank(), 1); 89 MS_LOG(INFO) << "Out tensor3: " << output[0]->ToString(); 90 CheckEqual(output[0], {0}, "中"); 91 92 Tensor::CreateScalar<std::string>("H", &input); 93 output.clear(); 94 s = op->Compute(TensorRow(0, {input}), &output); 95 EXPECT_TRUE(s.IsOk()); 96 EXPECT_EQ(output[0]->Size(), 1); 97 EXPECT_EQ(output[0]->Rank(), 1); 98 MS_LOG(INFO) << "Out tensor4: " << output[0]->ToString(); 99 CheckEqual(output[0], {0}, "H"); 100 101 Tensor::CreateScalar<std::string>(" ", &input); 102 output.clear(); 103 s = op->Compute(TensorRow(0, {input}), &output); 104 EXPECT_TRUE(s.IsOk()); 105 EXPECT_EQ(output[0]->Size(), 2); 106 EXPECT_EQ(output[0]->Rank(), 1); 107 MS_LOG(INFO) << "Out tensor5: " << output[0]->ToString(); 108 CheckEqual(output[0], {0}, " "); 109 CheckEqual(output[0], {1}, " "); 110 111 Tensor::CreateScalar<std::string>("", &input); 112 output.clear(); 113 s = op->Compute(TensorRow(0, {input}), &output); 114 EXPECT_TRUE(s.IsOk()); 115 EXPECT_EQ(output[0]->Size(), 1); 116 EXPECT_EQ(output[0]->Rank(), 1); 117 MS_LOG(INFO) << "Out tensor6: " << output[0]->ToString(); 118 CheckEqual(output[0], {0}, ""); 119 } 120 121 TEST_F(MindDataTestTokenizerOp, TestWhitespaceTokenizerOp) { 122 MS_LOG(INFO) << "Doing TestWhitespaceTokenizerOp."; 123 std::unique_ptr<WhitespaceTokenizerOp> op(new WhitespaceTokenizerOp(true)); 124 std::shared_ptr<Tensor> input; 125 Tensor::CreateScalar<std::string>("Welcome to China.", &input); TensorRow output; 126 Status s = op->Compute(TensorRow(0, {input}), &output); 127 EXPECT_TRUE(s.IsOk()); 128 EXPECT_EQ(output[0]->Size(), 3); 129 EXPECT_EQ(output[0]->Rank(), 1); 130 MS_LOG(INFO) << "Out tensor1: " << output[0]->ToString(); 131 CheckEqual(output[0], {0}, "Welcome"); 132 CheckEqual(output[0], {1}, "to"); 133 CheckEqual(output[0], {2}, "China."); 134 135 Tensor::CreateScalar<std::string>(" hello", &input); 136 output.clear(); 137 s = op->Compute(TensorRow(0, {input}), &output); 138 EXPECT_TRUE(s.IsOk()); 139 EXPECT_EQ(output[0]->Size(), 1); 140 EXPECT_EQ(output[0]->Rank(), 1); 141 MS_LOG(INFO) << "Out tensor2: " << output[0]->ToString(); 142 CheckEqual(output[0], {0}, "hello"); 143 144 Tensor::CreateScalar<std::string>("hello", &input); 145 output.clear(); 146 s = op->Compute(TensorRow(0, {input}), &output); 147 EXPECT_TRUE(s.IsOk()); 148 EXPECT_EQ(output[0]->Size(), 1); 149 EXPECT_EQ(output[0]->Rank(), 1); 150 MS_LOG(INFO) << "Out tensor3: " << output[0]->ToString(); 151 CheckEqual(output[0], {0}, "hello"); 152 153 Tensor::CreateScalar<std::string>("hello ", &input); 154 output.clear(); 155 s = op->Compute(TensorRow(0, {input}), &output); 156 EXPECT_TRUE(s.IsOk()); 157 EXPECT_EQ(output[0]->Size(), 1); 158 EXPECT_EQ(output[0]->Rank(), 1); 159 MS_LOG(INFO) << "Out tensor4: " << output[0]->ToString(); 160 CheckEqual(output[0], {0}, "hello"); 161 162 Tensor::CreateScalar<std::string>(" ", &input); 163 output.clear(); 164 s = op->Compute(TensorRow(0, {input}), &output); 165 EXPECT_TRUE(s.IsOk()); 166 EXPECT_EQ(output[0]->Size(), 1); 167 EXPECT_EQ(output[0]->Rank(), 1); 168 MS_LOG(INFO) << "Out tensor5: " << output[0]->ToString(); 169 CheckEqual(output[0], {0}, ""); 170 } 171 172 TEST_F(MindDataTestTokenizerOp, TestUnicodeScriptTokenizer) { 173 MS_LOG(INFO) << "Doing TestUnicodeScriptTokenizer."; 174 std::unique_ptr<UnicodeScriptTokenizerOp> keep_whitespace_op(new UnicodeScriptTokenizerOp(true, true)); 175 std::unique_ptr<UnicodeScriptTokenizerOp> skip_whitespace_op(new UnicodeScriptTokenizerOp(false, true)); 176 177 std::shared_ptr<Tensor> input; 178 Tensor::CreateScalar<std::string>("Welcome to China. \n 中国\t北京", &input); 179 TensorRow output; 180 Status s = keep_whitespace_op->Compute(TensorRow(0, {input}), &output); 181 EXPECT_TRUE(s.IsOk()); 182 EXPECT_EQ(output[0]->Size(), 10); 183 EXPECT_EQ(output[0]->Rank(), 1); 184 MS_LOG(INFO) << "Out tensor1: " << output[0]->ToString(); 185 CheckEqual(output[0], {0}, "Welcome"); 186 CheckEqual(output[0], {1}, " "); 187 CheckEqual(output[0], {2}, "to"); 188 CheckEqual(output[0], {3}, " "); 189 CheckEqual(output[0], {4}, "China"); 190 CheckEqual(output[0], {5}, "."); 191 CheckEqual(output[0], {6}, " \n "); 192 CheckEqual(output[0], {7}, "中国"); 193 CheckEqual(output[0], {8}, "\t"); 194 CheckEqual(output[0], {9}, "北京"); 195 output.clear(); 196 s = skip_whitespace_op->Compute(TensorRow(0, {input}), &output); 197 EXPECT_TRUE(s.IsOk()); 198 EXPECT_EQ(output[0]->Size(), 6); 199 EXPECT_EQ(output[0]->Rank(), 1); 200 MS_LOG(INFO) << "Out tensor2: " << output[0]->ToString(); 201 CheckEqual(output[0], {0}, "Welcome"); 202 CheckEqual(output[0], {1}, "to"); 203 CheckEqual(output[0], {2}, "China"); 204 CheckEqual(output[0], {3}, "."); 205 CheckEqual(output[0], {4}, "中国"); 206 CheckEqual(output[0], {5}, "北京"); 207 208 Tensor::CreateScalar<std::string>(" Welcome to 中国. ", &input); 209 output.clear(); 210 s = skip_whitespace_op->Compute(TensorRow(0, {input}), &output); EXPECT_TRUE(s.IsOk()); 211 EXPECT_EQ(output[0]->Size(), 4); 212 EXPECT_EQ(output[0]->Rank(), 1); 213 MS_LOG(INFO) << "Out tensor3: " << output[0]->ToString(); 214 CheckEqual(output[0], {0}, "Welcome"); 215 CheckEqual(output[0], {1}, "to"); 216 CheckEqual(output[0], {2}, "中国"); 217 CheckEqual(output[0], {3}, "."); 218 output.clear(); 219 s = keep_whitespace_op->Compute(TensorRow(0, {input}), &output); 220 EXPECT_TRUE(s.IsOk()); 221 EXPECT_EQ(output[0]->Size(), 8); 222 EXPECT_EQ(output[0]->Rank(), 1); 223 MS_LOG(INFO) << "Out tensor4: " << output[0]->ToString(); 224 CheckEqual(output[0], {0}, " "); 225 CheckEqual(output[0], {1}, "Welcome"); 226 CheckEqual(output[0], {2}, " "); 227 CheckEqual(output[0], {3}, "to"); 228 CheckEqual(output[0], {4}, " "); 229 CheckEqual(output[0], {5}, "中国"); 230 CheckEqual(output[0], {6}, "."); 231 CheckEqual(output[0], {7}, " "); 232 233 Tensor::CreateScalar<std::string>("Hello", &input); 234 output.clear(); 235 s = keep_whitespace_op->Compute(TensorRow(0, {input}), &output); EXPECT_TRUE(s.IsOk()); 236 EXPECT_EQ(output[0]->Size(), 1); 237 EXPECT_EQ(output[0]->Rank(), 1); 238 MS_LOG(INFO) << "Out tensor5: " << output[0]->ToString(); 239 CheckEqual(output[0], {0}, "Hello"); 240 241 Tensor::CreateScalar<std::string>("H", &input); 242 output.clear(); 243 s = keep_whitespace_op->Compute(TensorRow(0, {input}), &output); EXPECT_TRUE(s.IsOk()); 244 EXPECT_EQ(output[0]->Size(), 1); 245 EXPECT_EQ(output[0]->Rank(), 1); 246 MS_LOG(INFO) << "Out tensor6: " << output[0]->ToString(); 247 CheckEqual(output[0], {0}, "H"); 248 249 Tensor::CreateScalar<std::string>("", &input); 250 output.clear(); 251 s = keep_whitespace_op->Compute(TensorRow(0, {input}), &output); 252 EXPECT_TRUE(s.IsOk()); 253 EXPECT_EQ(output[0]->Size(), 1); 254 EXPECT_EQ(output[0]->Rank(), 1); 255 MS_LOG(INFO) << "Out tensor7: " << output[0]->ToString(); 256 CheckEqual(output[0], {0}, ""); 257 258 Tensor::CreateScalar<std::string>("Hello中国Hello世界", &input); 259 output.clear(); 260 s = keep_whitespace_op->Compute(TensorRow(0, {input}), &output); EXPECT_TRUE(s.IsOk()); EXPECT_EQ(output[0]->Size(), 4); 261 EXPECT_EQ(output[0]->Rank(), 1); 262 MS_LOG(INFO) << "Out tensor8: " << output[0]->ToString(); 263 CheckEqual(output[0], {0}, "Hello"); 264 CheckEqual(output[0], {1}, "中国"); 265 CheckEqual(output[0], {2}, "Hello"); 266 CheckEqual(output[0], {3}, "世界"); 267 268 Tensor::CreateScalar<std::string>(" ", &input); 269 output.clear(); 270 s = keep_whitespace_op->Compute(TensorRow(0, {input}), &output); 271 EXPECT_TRUE(s.IsOk()); 272 EXPECT_EQ(output[0]->Size(), 1); 273 EXPECT_EQ(output[0]->Rank(), 1); 274 MS_LOG(INFO) << "Out tensor10: " << output[0]->ToString(); 275 CheckEqual(output[0], {0}, " "); 276 Tensor::CreateScalar<std::string>(" ", &input); 277 output.clear(); 278 s = skip_whitespace_op->Compute(TensorRow(0, {input}), &output); 279 EXPECT_TRUE(s.IsOk()); 280 EXPECT_EQ(output[0]->Size(), 1); 281 EXPECT_EQ(output[0]->Rank(), 1); 282 MS_LOG(INFO) << "Out tensor11: " << output[0]->ToString(); 283 CheckEqual(output[0], {0}, ""); 284 } 285 286 TEST_F(MindDataTestTokenizerOp, TestCaseFold) { 287 MS_LOG(INFO) << "Doing TestCaseFold."; 288 std::unique_ptr<CaseFoldOp> case_fold_op(new CaseFoldOp()); 289 std::shared_ptr<Tensor> input; 290 Tensor::CreateScalar<std::string>("Welcome to China. \n 中国\t北京", &input); 291 292 std::shared_ptr<Tensor> output; 293 Status s = case_fold_op->Compute(input, &output); 294 EXPECT_TRUE(s.IsOk()); 295 EXPECT_EQ(output->Size(), 1); 296 EXPECT_EQ(output->Rank(), 0); 297 MS_LOG(INFO) << "Out tensor1: " << output->ToString(); 298 CheckEqual(output, {}, "welcome to china. \n 中国\t北京"); 299 } 300 301 TEST_F(MindDataTestTokenizerOp, TestNormalize) { 302 MS_LOG(INFO) << "Doing TestNormalize."; 303 std::unique_ptr<NormalizeUTF8Op> nfc_normalize_op(new NormalizeUTF8Op(NormalizeForm::kNfc)); 304 std::unique_ptr<NormalizeUTF8Op> nfkc_normalize_op(new NormalizeUTF8Op(NormalizeForm::kNfkc)); 305 std::unique_ptr<NormalizeUTF8Op> nfd_normalize_op(new NormalizeUTF8Op(NormalizeForm::kNfd)); 306 std::unique_ptr<NormalizeUTF8Op> nfkd_normalize_op(new NormalizeUTF8Op(NormalizeForm::kNfkd)); 307 std::shared_ptr<Tensor> input; 308 Tensor::CreateScalar<std::string>("ṩ", &input); 309 std::shared_ptr<Tensor> output; 310 Status s = nfc_normalize_op->Compute(input, &output); 311 EXPECT_TRUE(s.IsOk()); 312 MS_LOG(INFO) << "NFC str:" << output->ToString(); 313 314 nfkc_normalize_op->Compute(input, &output); 315 EXPECT_TRUE(s.IsOk()); 316 MS_LOG(INFO) << "NFKC str:" << output->ToString(); 317 318 nfd_normalize_op->Compute(input, &output); 319 EXPECT_TRUE(s.IsOk()); 320 MS_LOG(INFO) << "NFD str:" << output->ToString(); 321 322 nfkd_normalize_op->Compute(input, &output); 323 EXPECT_TRUE(s.IsOk()); 324 MS_LOG(INFO) << "NFKD str:" << output->ToString(); 325 } 326 327 TEST_F(MindDataTestTokenizerOp, TestRegexReplace) { 328 MS_LOG(INFO) << "Doing TestRegexReplace."; 329 std::unique_ptr<RegexReplaceOp> regex_replace_op(new RegexReplaceOp("\\s+", "_", true)); 330 std::shared_ptr<Tensor> input; 331 Tensor::CreateScalar<std::string>("Welcome to China. \n 中国\t北京", &input); 332 std::shared_ptr<Tensor> output; 333 Status s = regex_replace_op->Compute(input, &output); 334 EXPECT_TRUE(s.IsOk()); 335 EXPECT_EQ(output->Size(), 1); 336 EXPECT_EQ(output->Rank(), 0); 337 MS_LOG(INFO) << "Out tensor1: " << output->ToString(); 338 CheckEqual(output, {}, "Welcome_to_China._中国_北京"); 339 } 340 341 TEST_F(MindDataTestTokenizerOp, TestRegexTokenizer) { 342 MS_LOG(INFO) << "Doing TestRegexTokenizerOp."; 343 std::unique_ptr<RegexTokenizerOp> regex_tokenizer_op(new RegexTokenizerOp("\\p{Cc}|\\p{Cf}|\\s+", "", true)); 344 std::shared_ptr<Tensor> input; 345 Tensor::CreateScalar<std::string>("Welcome to China. \n 中国\t北京", &input); 346 TensorRow output; 347 Status s = regex_tokenizer_op->Compute(TensorRow(0, {input}), &output); 348 EXPECT_TRUE(s.IsOk()); 349 } 350 351 TEST_F(MindDataTestTokenizerOp, TestBasicTokenizer) { 352 MS_LOG(INFO) << "Doing TestBasicTokenizer."; 353 // bool lower_case, bool keep_whitespace, 354 // NormalizeForm normalization_form, bool preserve_unused_token 355 std::unique_ptr<BasicTokenizerOp> basic_tokenizer(new BasicTokenizerOp(true, true, NormalizeForm::kNone, false,true)); 356 std::shared_ptr<Tensor> input; 357 Tensor::CreateScalar<std::string>("Welcome to China. 中国\t北京", &input); 358 TensorRow output; 359 Status s = basic_tokenizer->Compute(TensorRow(0, {input}), &output); 360 EXPECT_TRUE(s.IsOk()); 361 }