/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <memory>
#include <string>
#include <string_view>
#include <unordered_map>
#include <vector>

#include "common/common.h"
#include "minddata/dataset/engine/datasetops/build_sentence_piece_vocab_op.h"
#include "minddata/dataset/text/kernels/sentence_piece_tokenizer_op.h"
#include "minddata/dataset/text/sentence_piece_vocab.h"
#include "minddata/dataset/engine/datasetops/source/text_file_op.h"
#include "gtest/gtest.h"
#include "utils/log_adapter.h"
#include "minddata/dataset/util/status.h"

using namespace mindspore::dataset;

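// Test fixture for the SentencePiece vocab and tokenizer ops. CheckEqual fetches the string at
// `index` from tensor `o` and verifies it matches `expect`.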
class MindDataTestSentencePieceVocabOp : public UT::DatasetOpTesting {
 public:
  void CheckEqual(const std::shared_ptr<Tensor> &o, const std::vector<dsize_t> &index, const std::string &expect) {
    std::string_view str;
    Status s = o->GetItemAt(&str, index);
    EXPECT_TRUE(s.IsOk());
    EXPECT_EQ(str, expect);
  }
};

// Testing helper to create TextFileOp
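// It reads the given text files into a single "text" (DE_UINT8) column, using num_workers parallel
// workers and an output connector of op_connector_size rows.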
std::shared_ptr<TextFileOp> TextFile(std::vector<std::string> text_files_list, int32_t num_workers,
                                     int32_t op_connector_size) {
  std::unique_ptr<DataSchema> schema = std::make_unique<DataSchema>();
  (void)schema->AddColumn(ColDescriptor("text", DataType(DataType::DE_UINT8), TensorImpl::kFlexible, 1));
  std::shared_ptr<ConfigManager> config_manager = GlobalContext::config_manager();
  auto worker_connector_size = config_manager->worker_connector_size();
  int32_t device_id = 0;
  int32_t num_devices = 1;
  int32_t num_rows = 0;
  bool shuffle = false;

  std::shared_ptr<TextFileOp> text_file_op =
    std::make_shared<TextFileOp>(num_workers, num_rows, worker_connector_size, std::move(schema), text_files_list,
                                 op_connector_size, shuffle, num_devices, device_id);
  (void)text_file_op->Init();
  return text_file_op;
}

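// Builds a SentencePiece vocab by running a TextFileOp -> BuildSentencePieceVocabOp pipeline over
// the botchan.txt corpus and drains the resulting rows from the tree.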
TEST_F(MindDataTestSentencePieceVocabOp, TestSentencePieceFromDatasetFunctions) {
  MS_LOG(INFO) << "Doing MindDataTestSentencePieceVocabOp TestSentencePieceFromDatasetFunctions.";
  Status rc;
  std::string dataset_path;
  dataset_path = datasets_root_path_ + "/test_sentencepiece/botchan.txt";
  auto tree = std::make_shared<ExecutionTree>();

  std::shared_ptr<TextFileOp> file_op = TextFile({dataset_path}, 1, 2);

  rc = tree->AssociateNode(file_op);
  ASSERT_TRUE(rc.IsOk());
  std::vector<std::string> cols;
  std::unordered_map<std::string, std::string> m_params;

  std::shared_ptr<SentencePieceVocab> spm = std::make_shared<SentencePieceVocab>();
  // Sample constructor for reference
  // BuildSentencePieceVocabOp(std::shared_ptr<SentencePieceVocab> vocab, std::vector<std::string> col_names,
  //                          int32_t vocab_size, float character_coverage, SentencePieceModel model_type,
  //                          const std::unordered_map<std::string, std::string> &params, int32_t op_conn_size);
  std::shared_ptr<BuildSentencePieceVocabOp> spv_op = std::make_shared<BuildSentencePieceVocabOp>(
    std::move(spm), cols, 5000, 0.9995, SentencePieceModel::kUnigram, m_params, 2);
  rc = tree->AssociateNode(spv_op);
  ASSERT_TRUE(rc.IsOk());

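  // Wire the pipeline: BuildSentencePieceVocabOp consumes the rows produced by TextFileOp.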
  rc = spv_op->AddChild(file_op);
  ASSERT_TRUE(rc.IsOk());

  file_op->SetTotalRepeats(1);
  file_op->SetNumRepeatsPerEpoch(1);
  rc = tree->AssignRoot(spv_op);
  ASSERT_TRUE(rc.IsOk());
  rc = tree->Prepare();
  ASSERT_TRUE(rc.IsOk());

  rc = tree->Launch();
  ASSERT_TRUE(rc.IsOk());

  // Start the loop of reading tensors from our pipeline
  DatasetIterator di(tree);
  TensorRow tensor_list;
  rc = di.FetchNextTensorRow(&tensor_list);
  ASSERT_TRUE(rc.IsOk());

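  // Drain the remaining rows; the iterator returns an empty row once the pipeline is exhausted.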
  while (!tensor_list.empty()) {
    rc = di.FetchNextTensorRow(&tensor_list);
  }
  ASSERT_TRUE(rc.IsOk());
}

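// Builds a SentencePiece vocab directly from a file list via SentencePieceVocab::BuildFromFile,
// bypassing the dataset pipeline, and only checks that the build succeeds.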
TEST_F(MindDataTestSentencePieceVocabOp, TestSentencePieceFromFileFunctions) {
  MS_LOG(INFO) << "Doing MindDataTestSentencePieceVocabOp TestSentencePieceFromFileFunctions.";

  std::string dataset_path;
  dataset_path = datasets_root_path_ + "/test_sentencepiece/botchan.txt";
  std::vector<std::string> path_list;
  path_list.emplace_back(dataset_path);
  std::unordered_map<std::string, std::string> param_map;
  std::shared_ptr<SentencePieceVocab> spm = std::make_shared<SentencePieceVocab>();
  Status rc = SentencePieceVocab::BuildFromFile(path_list, 5000, 0.9995, SentencePieceModel::kUnigram, param_map, &spm);
  ASSERT_TRUE(rc.IsOk());
}

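// Builds a vocab through the same pipeline as above, then uses it with SentencePieceTokenizerOp to
// tokenize a sample sentence and compares the output against the expected subword pieces.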
TEST_F(MindDataTestSentencePieceVocabOp, TestSentencePieceTokenizerFunctions) {
  MS_LOG(INFO) << "Doing MindDataTestSentencePieceVocabOp TestSentencePieceTokenizerFunctions.";

  std::string dataset_path;
  dataset_path = datasets_root_path_ + "/test_sentencepiece/botchan.txt";
  auto tree = std::make_shared<ExecutionTree>();
  std::shared_ptr<TextFileOp> file_op = TextFile({dataset_path}, 1, 2);

  Status rc = tree->AssociateNode(file_op);
  ASSERT_TRUE(rc.IsOk());

  std::shared_ptr<SentencePieceVocab> spm = std::make_shared<SentencePieceVocab>();
  std::vector<std::string> cols;
  std::unordered_map<std::string, std::string> m_params;

  std::shared_ptr<BuildSentencePieceVocabOp> spv_op = std::make_shared<BuildSentencePieceVocabOp>(
    spm, cols, 5000, 0.9995, SentencePieceModel::kUnigram, m_params, 2);
  rc = tree->AssociateNode(spv_op);
  ASSERT_TRUE(rc.IsOk());

  rc = spv_op->AddChild(file_op);
  ASSERT_TRUE(rc.IsOk());

  file_op->SetTotalRepeats(1);
  file_op->SetNumRepeatsPerEpoch(1);
  rc = tree->AssignRoot(spv_op);
  ASSERT_TRUE(rc.IsOk());
  rc = tree->Prepare();
  ASSERT_TRUE(rc.IsOk());

  rc = tree->Launch();
  ASSERT_TRUE(rc.IsOk());

  // Start the loop of reading tensors from our pipeline
  DatasetIterator di(tree);
  TensorRow tensor_list;
  rc = di.FetchNextTensorRow(&tensor_list);
  ASSERT_TRUE(rc.IsOk());

  while (!tensor_list.empty()) {
    rc = di.FetchNextTensorRow(&tensor_list);
  }
  ASSERT_TRUE(rc.IsOk());
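
  // With the vocab built, run the tokenizer directly on a scalar string tensor.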
  std::shared_ptr<Tensor> output_tensor;
  std::unique_ptr<SentencePieceTokenizerOp> op(
    new SentencePieceTokenizerOp(spm, SPieceTokenizerLoadType::kModel, SPieceTokenizerOutType::kString));
  std::shared_ptr<Tensor> input_tensor;
  Tensor::CreateScalar<std::string>("I saw a girl with a telescope.", &input_tensor);
  Status s = op->Compute(input_tensor, &output_tensor);
  EXPECT_TRUE(s.IsOk());

  std::vector<std::string> expect = {"▁I", "▁sa", "w", "▁a", "▁girl", "▁with",
                                     "▁a", "▁te", "les", "co", "pe", "."};
  ASSERT_TRUE(output_tensor->Size() == expect.size());
  for (dsize_t i = 0; i < output_tensor->Size(); i++) {
    std::string_view str;
    output_tensor->GetItemAt(&str, {i});
    std::string sentence{str};
    ASSERT_TRUE(sentence == expect[i]);
  }
}
191