/**
 * Copyright 2020-2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <unistd.h>
#include <fstream>
#include <regex>

#include "utils/file_utils.h"
#include "minddata/dataset/include/dataset/text.h"
#include "minddata/dataset/core/type_id.h"
#include "minddata/dataset/text/ir/kernels/text_ir.h"
#include "mindspore/core/ir/dtype/type_id.h"

namespace mindspore {
namespace dataset {

// Transform operations for text.
namespace text {

constexpr size_t size_two = 2;
constexpr size_t size_three = 3;
constexpr int64_t value_one = 1;
constexpr int64_t value_two = 2;
constexpr size_t kMaxLoggedRows = 10;

// FUNCTIONS TO CREATE TEXT OPERATIONS
// (In alphabetical order)

#ifndef _WIN32
// BasicTokenizer
struct BasicTokenizer::Data {
  Data(bool lower_case, bool keep_whitespace, const NormalizeForm normalize_form, bool preserve_unused_token,
       bool with_offsets)
      : lower_case_(lower_case),
        keep_whitespace_(keep_whitespace),
        normalize_form_(normalize_form),
        preserve_unused_token_(preserve_unused_token),
        with_offsets_(with_offsets) {}
  bool lower_case_;
  bool keep_whitespace_;
  NormalizeForm normalize_form_;
  bool preserve_unused_token_;
  bool with_offsets_;
};

BasicTokenizer::BasicTokenizer(bool lower_case, bool keep_whitespace, const NormalizeForm normalize_form,
                               bool preserve_unused_token, bool with_offsets)
    : data_(std::make_shared<Data>(lower_case, keep_whitespace, normalize_form, preserve_unused_token, with_offsets)) {}

std::shared_ptr<TensorOperation> BasicTokenizer::Parse() {
  return std::make_shared<BasicTokenizerOperation>(data_->lower_case_, data_->keep_whitespace_, data_->normalize_form_,
                                                   data_->preserve_unused_token_, data_->with_offsets_);
}
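
// Illustrative usage sketch (comment only, not compiled here): assuming `ds` is
// a dataset with a "text" column built elsewhere with the C++ dataset API,
// BasicTokenizer plugs into a Map like any other TensorTransform:
//   std::shared_ptr<TensorTransform> tokenizer = std::make_shared<text::BasicTokenizer>();
//   ds = ds->Map({tokenizer}, {"text"});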

// BertTokenizer
struct BertTokenizer::Data {
  Data(const std::shared_ptr<Vocab> &vocab, const std::vector<char> &suffix_indicator, int32_t max_bytes_per_token,
       const std::vector<char> &unknown_token, bool lower_case, bool keep_whitespace,
       const NormalizeForm normalize_form, bool preserve_unused_token, bool with_offsets)
      : vocab_(vocab),
        suffix_indicator_(CharToString(suffix_indicator)),
        max_bytes_per_token_(max_bytes_per_token),
        unknown_token_(CharToString(unknown_token)),
        lower_case_(lower_case),
        keep_whitespace_(keep_whitespace),
        normalize_form_(normalize_form),
        preserve_unused_token_(preserve_unused_token),
        with_offsets_(with_offsets) {}
  std::shared_ptr<Vocab> vocab_;
  std::string suffix_indicator_;
  int32_t max_bytes_per_token_;
  std::string unknown_token_;
  bool lower_case_;
  bool keep_whitespace_;
  NormalizeForm normalize_form_;
  bool preserve_unused_token_;
  bool with_offsets_;
};

BertTokenizer::BertTokenizer(const std::shared_ptr<Vocab> &vocab, const std::vector<char> &suffix_indicator,
                             int32_t max_bytes_per_token, const std::vector<char> &unknown_token, bool lower_case,
                             bool keep_whitespace, const NormalizeForm normalize_form, bool preserve_unused_token,
                             bool with_offsets)
    : data_(std::make_shared<Data>(vocab, suffix_indicator, max_bytes_per_token, unknown_token, lower_case,
                                   keep_whitespace, normalize_form, preserve_unused_token, with_offsets)) {}

std::shared_ptr<TensorOperation> BertTokenizer::Parse() {
  return std::make_shared<BertTokenizerOperation>(
    data_->vocab_, data_->suffix_indicator_, data_->max_bytes_per_token_, data_->unknown_token_, data_->lower_case_,
    data_->keep_whitespace_, data_->normalize_form_, data_->preserve_unused_token_, data_->with_offsets_);
}
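
// Illustrative usage sketch (comment only): assuming `vocab` is a pre-built
// std::shared_ptr<Vocab>, the string-based public overloads forward to the
// char-vector constructor above:
//   std::shared_ptr<TensorTransform> bert = std::make_shared<text::BertTokenizer>(vocab, "##", 100, "[UNK]");
//   ds = ds->Map({bert}, {"text"});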

// CaseFold
CaseFold::CaseFold() {}

std::shared_ptr<TensorOperation> CaseFold::Parse() { return std::make_shared<CaseFoldOperation>(); }
#endif

// JiebaTokenizer
struct JiebaTokenizer::Data {
  Data(const std::vector<char> &hmm_path, const std::vector<char> &mp_path, const JiebaMode &mode, bool with_offsets)
      : hmm_path_(CharToString(hmm_path)),
        mp_path_(CharToString(mp_path)),
        mode_(mode),
        with_offsets_(with_offsets),
        words_list_({}) {}
  std::string hmm_path_;
  std::string mp_path_;
  JiebaMode mode_;
  bool with_offsets_;
  std::vector<std::pair<std::string, int64_t>> words_list_;
};

JiebaTokenizer::JiebaTokenizer(const std::vector<char> &hmm_path, const std::vector<char> &mp_path,
                               const JiebaMode &mode, bool with_offsets)
    : data_(std::make_shared<Data>(hmm_path, mp_path, mode, with_offsets)) {}

std::shared_ptr<TensorOperation> JiebaTokenizer::Parse() {
  std::shared_ptr<JiebaTokenizerOperation> jieba_tokenizer =
    std::make_shared<JiebaTokenizerOperation>(data_->hmm_path_, data_->mp_path_, data_->mode_, data_->with_offsets_);
  for (auto &word : data_->words_list_) {
    Status rc = jieba_tokenizer->AddWord(word.first, word.second);
    if (rc.IsError()) {
      MS_LOG(ERROR) << rc;
      return {};
    }
  }
  return jieba_tokenizer;
}
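
// Illustrative usage sketch (comment only): assuming `hmm_path` and `mp_path`
// point to valid Jieba dictionary files, words registered via AddWord are
// replayed onto the IR node inside Parse() above:
//   text::JiebaTokenizer jieba(hmm_path, mp_path, JiebaMode::kMp);
//   Status rc = jieba.AddWord("MindSpore", 10);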

Status JiebaTokenizer::AddWordChar(const std::vector<char> &word, int64_t freq) {
  if (word.empty()) {
    std::string err_msg = "JiebaTokenizer : The parameter word is empty or not provided.";
    MS_LOG(ERROR) << err_msg;
    RETURN_STATUS_SYNTAX_ERROR(err_msg);
  }
  if (freq < 0) {
    std::string err_msg = "JiebaTokenizer : The parameter freq must be greater than or equal to 0.";
    MS_LOG(ERROR) << err_msg;
    RETURN_STATUS_SYNTAX_ERROR(err_msg);
  }
  (void)data_->words_list_.emplace_back(CharToString(word), freq);
  return Status::OK();
}

Status JiebaTokenizer::AddDictChar(const std::vector<std::pair<std::vector<char>, int64_t>> &user_dict) {
  for (auto &word_freq_pair : user_dict) {
    RETURN_IF_NOT_OK(AddWordChar(word_freq_pair.first, word_freq_pair.second));
  }
  return Status::OK();
}

Status JiebaTokenizer::AddDictChar(const std::vector<char> &file_path) {
  std::vector<std::pair<std::string, int64_t>> user_dict;
  RETURN_IF_NOT_OK(ParserFile(CharToString(file_path), &user_dict));
  RETURN_IF_NOT_OK(AddDictChar(PairStringInt64ToPairCharInt64(user_dict)));
  return Status::OK();
}

Status JiebaTokenizer::ParserFile(const std::string &file_path,
                                  std::vector<std::pair<std::string, int64_t>> *const user_dict) {
  auto realpath = FileUtils::GetRealPath(file_path.data());
  if (!realpath.has_value()) {
    MS_LOG(ERROR) << "Get real path failed, path=" << file_path;
    RETURN_STATUS_SYNTAX_ERROR("Get real path failed, path=" + file_path);
  }

  std::ifstream ifs(realpath.value());
  if (!ifs) {
    std::string err_msg = "JiebaTokenizer : Failed to load dictionary from the input file, please check the file path.";
    MS_LOG(ERROR) << err_msg;
    RETURN_STATUS_SYNTAX_ERROR(err_msg);
  }

  // Each non-empty line holds one word followed by an optional decimal frequency.
  std::regex regex("^\\s*([^\\s*]+?)\\s*([0-9]+)?\\s*$");
  std::string line;
  while (std::getline(ifs, line)) {
    if (line.empty()) {
      continue;
    }
    std::smatch tokens;
    if (!std::regex_match(line, tokens, regex)) {
      continue;
    }
    if (tokens.size() == size_two) {
      (void)user_dict->emplace_back(tokens.str(value_one), 0);
    } else if (tokens.size() == size_three) {
      (void)user_dict->emplace_back(tokens.str(value_one), strtoll(tokens.str(value_two).c_str(), nullptr, 0));
    }
  }
  MS_LOG(INFO) << "JiebaTokenizer::AddDict: The size of user input dictionary is: " << user_dict->size();
  MS_LOG(INFO) << "Valid rows in input dictionary (at most the first " << kMaxLoggedRows << " rows are shown):";
  for (std::size_t i = 0; i != user_dict->size() && i < kMaxLoggedRows; ++i) {
    MS_LOG(INFO) << user_dict->at(i).first << " " << user_dict->at(i).second;
  }
  return Status::OK();
}
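
// A minimal sketch of the dictionary format ParserFile accepts: one entry per
// line, a word followed by an optional non-negative decimal frequency
// (frequency defaults to 0 when omitted), e.g.:
//   天气 3
//   不错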

// Lookup
struct Lookup::Data {
  Data(const std::shared_ptr<Vocab> &vocab, const std::optional<std::vector<char>> &unknown_token,
       mindspore::DataType data_type)
      : vocab_(vocab), data_type_(dataset::MSTypeToDEType(static_cast<TypeId>(data_type))) {
    if (unknown_token == std::nullopt) {
      unknown_token_ = std::nullopt;
    } else {
      unknown_token_ = std::string(unknown_token->begin(), unknown_token->end());
    }
  }
  std::shared_ptr<Vocab> vocab_;
  std::optional<std::string> unknown_token_;
  dataset::DataType data_type_;
};

Lookup::Lookup(const std::shared_ptr<Vocab> &vocab, const std::optional<std::vector<char>> &unknown_token,
               mindspore::DataType data_type)
    : data_(std::make_shared<Data>(vocab, unknown_token, data_type)) {}

std::shared_ptr<TensorOperation> Lookup::Parse() {
  return std::make_shared<LookupOperation>(data_->vocab_, data_->unknown_token_, data_->data_type_);
}
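
// Illustrative usage sketch (comment only): assuming `vocab` is a pre-built
// std::shared_ptr<Vocab>, Lookup maps each token to its id, sending
// out-of-vocabulary tokens to the id of the given unknown token:
//   std::shared_ptr<TensorTransform> lookup =
//     std::make_shared<text::Lookup>(vocab, std::string("<unk>"));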

// Ngram
struct Ngram::Data {
  Data(const std::vector<int32_t> &ngrams, const std::pair<std::vector<char>, int32_t> &left_pad,
       const std::pair<std::vector<char>, int32_t> &right_pad, const std::vector<char> &separator)
      : ngrams_(ngrams),
        left_pad_(PairCharToString(left_pad)),
        right_pad_(PairCharToString(right_pad)),
        separator_(CharToString(separator)) {}
  std::vector<int32_t> ngrams_;
  std::pair<std::string, int32_t> left_pad_;
  std::pair<std::string, int32_t> right_pad_;
  std::string separator_;
};

Ngram::Ngram(const std::vector<int32_t> &ngrams, const std::pair<std::vector<char>, int32_t> &left_pad,
             const std::pair<std::vector<char>, int32_t> &right_pad, const std::vector<char> &separator)
    : data_(std::make_shared<Data>(ngrams, left_pad, right_pad, separator)) {}

std::shared_ptr<TensorOperation> Ngram::Parse() {
  return std::make_shared<NgramOperation>(data_->ngrams_, data_->left_pad_, data_->right_pad_, data_->separator_);
}
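
// Illustrative usage sketch (comment only): generate bigrams and trigrams from
// a 1-D string tensor, padding each end with one "&" and joining grams with a
// space:
//   std::shared_ptr<TensorTransform> ngram =
//     std::make_shared<text::Ngram>(std::vector<int32_t>{2, 3}, std::pair<std::string, int32_t>{"&", 1},
//                                   std::pair<std::string, int32_t>{"&", 1}, " ");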

#ifndef _WIN32
// NormalizeUTF8
struct NormalizeUTF8::Data {
  explicit Data(NormalizeForm normalize_form) : normalize_form_(normalize_form) {}
  NormalizeForm normalize_form_;
};

NormalizeUTF8::NormalizeUTF8(NormalizeForm normalize_form) : data_(std::make_shared<Data>(normalize_form)) {}

std::shared_ptr<TensorOperation> NormalizeUTF8::Parse() {
  return std::make_shared<NormalizeUTF8Operation>(data_->normalize_form_);
}

// RegexReplace
struct RegexReplace::Data {
  Data(const std::vector<char> &pattern, const std::vector<char> &replace, bool replace_all)
      : pattern_(CharToString(pattern)), replace_(CharToString(replace)), replace_all_(replace_all) {}
  std::string pattern_;
  std::string replace_;
  bool replace_all_;
};

RegexReplace::RegexReplace(const std::vector<char> &pattern, const std::vector<char> &replace, bool replace_all)
    : data_(std::make_shared<Data>(pattern, replace, replace_all)) {}

std::shared_ptr<TensorOperation> RegexReplace::Parse() {
  return std::make_shared<RegexReplaceOperation>(data_->pattern_, data_->replace_, data_->replace_all_);
}
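
// Illustrative usage sketch (comment only): collapse each run of whitespace
// into a single underscore across the whole string (replace_all = true):
//   std::shared_ptr<TensorTransform> replace = std::make_shared<text::RegexReplace>("\\s+", "_", true);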

// RegexTokenizer
struct RegexTokenizer::Data {
  Data(const std::vector<char> &delim_pattern, const std::vector<char> &keep_delim_pattern, bool with_offsets)
      : delim_pattern_(CharToString(delim_pattern)),
        keep_delim_pattern_(CharToString(keep_delim_pattern)),
        with_offsets_(with_offsets) {}
  std::string delim_pattern_;
  std::string keep_delim_pattern_;
  bool with_offsets_;
};

RegexTokenizer::RegexTokenizer(const std::vector<char> &delim_pattern, const std::vector<char> &keep_delim_pattern,
                               bool with_offsets)
    : data_(std::make_shared<Data>(delim_pattern, keep_delim_pattern, with_offsets)) {}

std::shared_ptr<TensorOperation> RegexTokenizer::Parse() {
  return std::make_shared<RegexTokenizerOperation>(data_->delim_pattern_, data_->keep_delim_pattern_,
                                                   data_->with_offsets_);
}
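
// Illustrative usage sketch (comment only): split on runs of whitespace; an
// empty keep_delim_pattern means the matched delimiters are discarded rather
// than emitted as tokens:
//   std::shared_ptr<TensorTransform> regex_tok = std::make_shared<text::RegexTokenizer>("\\s+", "");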
#endif

// SentencePieceTokenizer
struct SentencePieceTokenizer::Data {
  Data(const std::shared_ptr<SentencePieceVocab> &vocab, SPieceTokenizerOutType out_type)
      : vocab_(vocab), vocab_path_(""), out_type_(out_type) {}
  Data(const std::vector<char> &vocab_path, SPieceTokenizerOutType out_type)
      : vocab_(nullptr), vocab_path_(CharToString(vocab_path)), out_type_(out_type) {}
  std::shared_ptr<SentencePieceVocab> vocab_;
  std::string vocab_path_;
  SPieceTokenizerOutType out_type_;
};

SentencePieceTokenizer::SentencePieceTokenizer(const std::shared_ptr<SentencePieceVocab> &vocab,
                                               SPieceTokenizerOutType out_type)
    : data_(std::make_shared<Data>(vocab, out_type)) {}

SentencePieceTokenizer::SentencePieceTokenizer(const std::vector<char> &vocab_path, SPieceTokenizerOutType out_type)
    : data_(std::make_shared<Data>(vocab_path, out_type)) {}

std::shared_ptr<TensorOperation> SentencePieceTokenizer::Parse() {
  if (data_->vocab_ != nullptr) {
    return std::make_shared<SentencePieceTokenizerOperation>(data_->vocab_, data_->out_type_);
  } else {
    return std::make_shared<SentencePieceTokenizerOperation>(data_->vocab_path_, data_->out_type_);
  }
}
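
// Illustrative usage sketch (comment only): the tokenizer accepts either an
// in-memory SentencePieceVocab or a path to a serialized model file, and
// Parse() above picks the matching IR constructor:
//   std::shared_ptr<TensorTransform> sp =
//     std::make_shared<text::SentencePieceTokenizer>(vocab, SPieceTokenizerOutType::kString);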

// SlidingWindow
struct SlidingWindow::Data {
  Data(const int32_t width, const int32_t axis) : width_(width), axis_(axis) {}
  int32_t width_;
  int32_t axis_;
};

SlidingWindow::SlidingWindow(const int32_t width, const int32_t axis) : data_(std::make_shared<Data>(width, axis)) {}

std::shared_ptr<TensorOperation> SlidingWindow::Parse() {
  return std::make_shared<SlidingWindowOperation>(data_->width_, data_->axis_);
}
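
// Illustrative usage sketch (comment only): width 3 on axis 0 turns
// ["a", "b", "c", "d"] into [["a", "b", "c"], ["b", "c", "d"]]:
//   std::shared_ptr<TensorTransform> window = std::make_shared<text::SlidingWindow>(3, 0);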

// ToNumber
struct ToNumber::Data {
  dataset::DataType data_type_;
};

ToNumber::ToNumber(mindspore::DataType data_type) : data_(std::make_shared<Data>()) {
  data_->data_type_ = dataset::MSTypeToDEType(static_cast<TypeId>(data_type));
}

std::shared_ptr<TensorOperation> ToNumber::Parse() { return std::make_shared<ToNumberOperation>(data_->data_type_); }
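
// Illustrative usage sketch (comment only): parse decimal strings into int32
// values; the mindspore::DataType is converted to the dataset DataType in the
// constructor above:
//   std::shared_ptr<TensorTransform> to_number =
//     std::make_shared<text::ToNumber>(mindspore::DataType::kNumberTypeInt32);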

// TruncateSequencePair
struct TruncateSequencePair::Data {
  explicit Data(int32_t max_length) : max_length_(max_length) {}
  int32_t max_length_;
};

TruncateSequencePair::TruncateSequencePair(int32_t max_length) : data_(std::make_shared<Data>(max_length)) {}

std::shared_ptr<TensorOperation> TruncateSequencePair::Parse() {
  return std::make_shared<TruncateSequencePairOperation>(data_->max_length_);
}
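
// Illustrative usage sketch (comment only): truncate a pair of sequences so
// that their combined length does not exceed max_length:
//   std::shared_ptr<TensorTransform> truncate = std::make_shared<text::TruncateSequencePair>(20);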

// UnicodeCharTokenizer
struct UnicodeCharTokenizer::Data {
  explicit Data(bool with_offsets) : with_offsets_(with_offsets) {}
  bool with_offsets_;
};

UnicodeCharTokenizer::UnicodeCharTokenizer(bool with_offsets) : data_(std::make_shared<Data>(with_offsets)) {}

std::shared_ptr<TensorOperation> UnicodeCharTokenizer::Parse() {
  return std::make_shared<UnicodeCharTokenizerOperation>(data_->with_offsets_);
}

// WordpieceTokenizer
struct WordpieceTokenizer::Data {
  Data(const std::shared_ptr<Vocab> &vocab, const std::vector<char> &suffix_indicator, int32_t max_bytes_per_token,
       const std::vector<char> &unknown_token, bool with_offsets)
      : vocab_(vocab),
        suffix_indicator_(CharToString(suffix_indicator)),
        max_bytes_per_token_(max_bytes_per_token),
        unknown_token_(CharToString(unknown_token)),
        with_offsets_(with_offsets) {}
  std::shared_ptr<Vocab> vocab_;
  std::string suffix_indicator_;
  int32_t max_bytes_per_token_;
  std::string unknown_token_;
  bool with_offsets_;
};

WordpieceTokenizer::WordpieceTokenizer(const std::shared_ptr<Vocab> &vocab, const std::vector<char> &suffix_indicator,
                                       int32_t max_bytes_per_token, const std::vector<char> &unknown_token,
                                       bool with_offsets)
    : data_(std::make_shared<Data>(vocab, suffix_indicator, max_bytes_per_token, unknown_token, with_offsets)) {}

std::shared_ptr<TensorOperation> WordpieceTokenizer::Parse() {
  return std::make_shared<WordpieceTokenizerOperation>(
    data_->vocab_, data_->suffix_indicator_, data_->max_bytes_per_token_, data_->unknown_token_, data_->with_offsets_);
}
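
// Illustrative usage sketch (comment only): assuming `vocab` is a pre-built
// std::shared_ptr<Vocab>, tokens are split into sub-word pieces found in the
// vocab, with continuation pieces prefixed by the suffix indicator:
//   std::shared_ptr<TensorTransform> wordpiece =
//     std::make_shared<text::WordpieceTokenizer>(vocab, "##", 100, "[UNK]");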

#ifndef _WIN32
// UnicodeScriptTokenizer
struct UnicodeScriptTokenizer::Data {
  Data(bool keep_whitespace, bool with_offsets) : keep_whitespace_(keep_whitespace), with_offsets_(with_offsets) {}
  bool keep_whitespace_;
  bool with_offsets_;
};

UnicodeScriptTokenizer::UnicodeScriptTokenizer(bool keep_whitespace, bool with_offsets)
    : data_(std::make_shared<Data>(keep_whitespace, with_offsets)) {}

std::shared_ptr<TensorOperation> UnicodeScriptTokenizer::Parse() {
  return std::make_shared<UnicodeScriptTokenizerOperation>(data_->keep_whitespace_, data_->with_offsets_);
}

// WhitespaceTokenizer
struct WhitespaceTokenizer::Data {
  explicit Data(bool with_offsets) : with_offsets_(with_offsets) {}
  bool with_offsets_;
};

WhitespaceTokenizer::WhitespaceTokenizer(bool with_offsets) : data_(std::make_shared<Data>(with_offsets)) {}

std::shared_ptr<TensorOperation> WhitespaceTokenizer::Parse() {
  return std::make_shared<WhitespaceTokenizerOperation>(data_->with_offsets_);
}
#endif
}  // namespace text
}  // namespace dataset
}  // namespace mindspore