/**
 * Copyright 2020-2022 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "minddata/dataset/include/dataset/text.h"

#include <fstream>
#include <regex>

#include "minddata/dataset/core/type_id.h"
#include "minddata/dataset/text/ir/kernels/text_ir.h"
#include "mindspore/core/ir/dtype/type_id.h"
#include "utils/file_utils.h"

namespace mindspore {
namespace dataset {
// Transform operations for text.
namespace text {
constexpr size_t size_two = 2;
constexpr size_t size_three = 3;
constexpr int64_t value_one = 1;
constexpr int64_t value_two = 2;
constexpr size_t kMaxLoggedRows = 10;

// FUNCTIONS TO CREATE TEXT OPERATIONS
// (In alphabetical order)
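//
// Implementation note: every transform below follows the same PIMPL pattern. A private
// Data struct captures the validated constructor arguments, and Parse() builds the
// matching IR node (a TensorOperation) from that stored state when the pipeline is built.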

// AddToken
struct AddToken::Data {
  Data(const std::string &token, bool begin) : token_(token), begin_(begin) {}
  std::string token_;
  bool begin_;
};

AddToken::AddToken(const std::string &token, bool begin) : data_(std::make_shared<Data>(token, begin)) {}

std::shared_ptr<TensorOperation> AddToken::Parse() {
  return std::make_shared<AddTokenOperation>(data_->token_, data_->begin_);
}
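
// A minimal usage sketch for AddToken (illustrative only; it assumes the
// std::shared_ptr<TensorTransform> overload of Dataset::Map from the dataset C++ API):
//   auto add_token = std::make_shared<text::AddToken>("[CLS]", true);
//   ds = ds->Map({add_token});  // prepends "[CLS]" to each text sample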
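// The operations in the #ifndef _WIN32 block below are excluded from Windows builds,
// presumably because their underlying tokenizer kernels depend on ICU, which the
// Windows build does not provide.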
#ifndef _WIN32
// BasicTokenizer
struct BasicTokenizer::Data {
  Data(bool lower_case, bool keep_whitespace, const NormalizeForm normalize_form, bool preserve_unused_token,
       bool with_offsets)
      : lower_case_(lower_case),
        keep_whitespace_(keep_whitespace),
        normalize_form_(normalize_form),
        preserve_unused_token_(preserve_unused_token),
        with_offsets_(with_offsets) {}
  bool lower_case_;
  bool keep_whitespace_;
  NormalizeForm normalize_form_;
  bool preserve_unused_token_;
  bool with_offsets_;
};

BasicTokenizer::BasicTokenizer(bool lower_case, bool keep_whitespace, const NormalizeForm normalize_form,
                               bool preserve_unused_token, bool with_offsets)
    : data_(std::make_shared<Data>(lower_case, keep_whitespace, normalize_form, preserve_unused_token, with_offsets)) {}

std::shared_ptr<TensorOperation> BasicTokenizer::Parse() {
  return std::make_shared<BasicTokenizerOperation>(data_->lower_case_, data_->keep_whitespace_, data_->normalize_form_,
                                                   data_->preserve_unused_token_, data_->with_offsets_);
}

// BertTokenizer
struct BertTokenizer::Data {
  Data(const std::shared_ptr<Vocab> &vocab, const std::vector<char> &suffix_indicator, int32_t max_bytes_per_token,
       const std::vector<char> &unknown_token, bool lower_case, bool keep_whitespace,
       const NormalizeForm normalize_form, bool preserve_unused_token, bool with_offsets)
      : vocab_(vocab),
        suffix_indicator_(CharToString(suffix_indicator)),
        max_bytes_per_token_(max_bytes_per_token),
        unknown_token_(CharToString(unknown_token)),
        lower_case_(lower_case),
        keep_whitespace_(keep_whitespace),
        normalize_form_(normalize_form),
        preserve_unused_token_(preserve_unused_token),
        with_offsets_(with_offsets) {}
  std::shared_ptr<Vocab> vocab_;
  std::string suffix_indicator_;
  int32_t max_bytes_per_token_;
  std::string unknown_token_;
  bool lower_case_;
  bool keep_whitespace_;
  NormalizeForm normalize_form_;
  bool preserve_unused_token_;
  bool with_offsets_;
};

BertTokenizer::BertTokenizer(const std::shared_ptr<Vocab> &vocab, const std::vector<char> &suffix_indicator,
                             int32_t max_bytes_per_token, const std::vector<char> &unknown_token, bool lower_case,
                             bool keep_whitespace, const NormalizeForm normalize_form, bool preserve_unused_token,
                             bool with_offsets)
    : data_(std::make_shared<Data>(vocab, suffix_indicator, max_bytes_per_token, unknown_token, lower_case,
                                   keep_whitespace, normalize_form, preserve_unused_token, with_offsets)) {}

std::shared_ptr<TensorOperation> BertTokenizer::Parse() {
  return std::make_shared<BertTokenizerOperation>(
    data_->vocab_, data_->suffix_indicator_, data_->max_bytes_per_token_, data_->unknown_token_, data_->lower_case_,
    data_->keep_whitespace_, data_->normalize_form_, data_->preserve_unused_token_, data_->with_offsets_);
}

// CaseFold
CaseFold::CaseFold() = default;

std::shared_ptr<TensorOperation> CaseFold::Parse() { return std::make_shared<CaseFoldOperation>(); }

// FilterWikipediaXML
FilterWikipediaXML::FilterWikipediaXML() = default;

std::shared_ptr<TensorOperation> FilterWikipediaXML::Parse() { return std::make_shared<FilterWikipediaXMLOperation>(); }
#endif

// JiebaTokenizer
struct JiebaTokenizer::Data {
  Data(const std::vector<char> &hmm_path, const std::vector<char> &mp_path, const JiebaMode &mode, bool with_offsets)
      : hmm_path_(CharToString(hmm_path)),
        mp_path_(CharToString(mp_path)),
        mode_(mode),
        with_offsets_(with_offsets),
        words_list_({}) {}
  std::string hmm_path_;
  std::string mp_path_;
  JiebaMode mode_;
  bool with_offsets_;
  std::vector<std::pair<std::string, int64_t>> words_list_;
};

JiebaTokenizer::JiebaTokenizer(const std::vector<char> &hmm_path, const std::vector<char> &mp_path,
                               const JiebaMode &mode, bool with_offsets)
    : data_(std::make_shared<Data>(hmm_path, mp_path, mode, with_offsets)) {}

std::shared_ptr<TensorOperation> JiebaTokenizer::Parse() {
  std::shared_ptr<JiebaTokenizerOperation> jieba_tokenizer =
    std::make_shared<JiebaTokenizerOperation>(data_->hmm_path_, data_->mp_path_, data_->mode_, data_->with_offsets_);
  for (auto &word : data_->words_list_) {
    Status rc = jieba_tokenizer->AddWord(word.first, word.second);
    if (rc.IsError()) {
      MS_LOG(ERROR) << rc;
      return {};
    }
  }
  return jieba_tokenizer;
}
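
// Note: words registered through AddWordChar/AddDictChar are buffered in words_list_ and only
// replayed onto the IR node in Parse(), i.e. they take effect when the pipeline is built.
// A hedged usage sketch (the model paths are placeholders):
//   text::JiebaTokenizer jieba("/path/to/hmm_model.utf8", "/path/to/jieba.dict.utf8", JiebaMode::kMp);
//   Status rc = jieba.AddWord("新词", 10);  // queued now, applied during Parse()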

Status JiebaTokenizer::AddWordChar(const std::vector<char> &word, int64_t freq) {
  if (word.empty()) {
    std::string err_msg = "JiebaTokenizer: The parameter word is empty or not provided.";
    LOG_AND_RETURN_STATUS_SYNTAX_ERROR(err_msg);
  }
  if (freq < 0) {
    std::string err_msg = "JiebaTokenizer: The parameter freq must be greater than or equal to 0.";
    LOG_AND_RETURN_STATUS_SYNTAX_ERROR(err_msg);
  }
  (void)data_->words_list_.emplace_back(CharToString(word), freq);
  return Status::OK();
}

Status JiebaTokenizer::AddDictChar(const std::vector<std::pair<std::vector<char>, int64_t>> &user_dict) {
  for (auto &word_freq_pair : user_dict) {
    RETURN_IF_NOT_OK(AddWordChar(word_freq_pair.first, word_freq_pair.second));
  }
  return Status::OK();
}

Status JiebaTokenizer::AddDictChar(const std::vector<char> &file_path) {
  std::vector<std::pair<std::string, int64_t>> user_dict;
  RETURN_IF_NOT_OK(ParserFile(CharToString(file_path), &user_dict));
  RETURN_IF_NOT_OK(AddDictChar(PairStringInt64ToPairCharInt64(user_dict)));
  return Status::OK();
}

Status JiebaTokenizer::ParserFile(const std::string &file_path,
                                  std::vector<std::pair<std::string, int64_t>> *const user_dict) {
  RETURN_UNEXPECTED_IF_NULL(user_dict);
  auto realpath = FileUtils::GetRealPath(file_path.c_str());
  if (!realpath.has_value()) {
    std::string err_msg = "Get real path failed, path: " + file_path;
    LOG_AND_RETURN_STATUS_SYNTAX_ERROR(err_msg);
  }

  std::ifstream ifs(realpath.value(), std::ios::in);
  if (!ifs) {
    std::string err_msg = "JiebaTokenizer: Failed to load the dictionary from the input file, check the file path.";
    LOG_AND_RETURN_STATUS_SYNTAX_ERROR(err_msg);
  }

  // Each line is "<word>" or "<word> <freq>"; compile the pattern once, outside the loop.
  const std::regex regex("^\\s*([^\\s*]+?)\\s*([0-9]+)?\\s*$");
  std::string line;
  while (std::getline(ifs, line)) {
    if (line.empty()) {
      continue;
    }
    std::smatch tokens;
    if (!std::regex_match(line, tokens, regex)) {
      continue;
    }
    if (tokens.size() == size_two) {
      (void)user_dict->emplace_back(tokens.str(value_one), 0);
    } else if (tokens.size() == size_three) {
      (void)user_dict->emplace_back(tokens.str(value_one), strtoll(tokens.str(value_two).c_str(), nullptr, 0));
    }
  }
  ifs.close();
  MS_LOG(INFO) << "JiebaTokenizer::AddDict: The size of the user input dictionary is: " << user_dict->size();
  MS_LOG(INFO) << "Valid rows in the input dictionary (at most the first 10 rows are shown):";
  for (std::size_t i = 0; i != user_dict->size(); ++i) {
    if (i >= kMaxLoggedRows) {
      break;
    }
    MS_LOG(INFO) << user_dict->at(i).first << " " << user_dict->at(i).second;
  }
  return Status::OK();
}
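
// For reference: ParserFile above accepts a plain-text dictionary with one entry per line,
// in the form "<word>" or "<word> <freq>". A hypothetical input file:
//   江大桥 20000
//   湖南长沙
// Note that std::smatch::size() is mark_count() + 1 on any successful match, so tokens.size()
// is always size_three here; when the optional frequency group is absent, its capture is empty
// and strtoll on the empty string yields 0, i.e. the default frequency.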

// Lookup
struct Lookup::Data {
  Data(const std::shared_ptr<Vocab> &vocab, const std::optional<std::vector<char>> &unknown_token,
       mindspore::DataType data_type)
      : vocab_(vocab), data_type_(dataset::MSTypeToDEType(static_cast<TypeId>(data_type))) {
    if (unknown_token == std::nullopt) {
      unknown_token_ = std::nullopt;
    } else {
      unknown_token_ = std::string(unknown_token->begin(), unknown_token->end());
    }
  }
  std::shared_ptr<Vocab> vocab_;
  std::optional<std::string> unknown_token_;
  dataset::DataType data_type_;
};

// The Data constructor already converts data_type, so no further work is needed here.
Lookup::Lookup(const std::shared_ptr<Vocab> &vocab, const std::optional<std::vector<char>> &unknown_token,
               mindspore::DataType data_type)
    : data_(std::make_shared<Data>(vocab, unknown_token, data_type)) {}

std::shared_ptr<TensorOperation> Lookup::Parse() {
  return std::make_shared<LookupOperation>(data_->vocab_, data_->unknown_token_, data_->data_type_);
}

// Ngram
struct Ngram::Data {
  Data(const std::vector<int32_t> &ngrams, const std::pair<std::vector<char>, int32_t> &left_pad,
       const std::pair<std::vector<char>, int32_t> &right_pad, const std::vector<char> &separator)
      : ngrams_(ngrams),
        left_pad_(PairCharToString(left_pad)),
        right_pad_(PairCharToString(right_pad)),
        separator_(CharToString(separator)) {}
  std::vector<int32_t> ngrams_;
  std::pair<std::string, int32_t> left_pad_;
  std::pair<std::string, int32_t> right_pad_;
  std::string separator_;
};

Ngram::Ngram(const std::vector<int32_t> &ngrams, const std::pair<std::vector<char>, int32_t> &left_pad,
             const std::pair<std::vector<char>, int32_t> &right_pad, const std::vector<char> &separator)
    : data_(std::make_shared<Data>(ngrams, left_pad, right_pad, separator)) {}

std::shared_ptr<TensorOperation> Ngram::Parse() {
  return std::make_shared<NgramOperation>(data_->ngrams_, data_->left_pad_, data_->right_pad_, data_->separator_);
}
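
// A hedged usage sketch: emit 2-grams and 3-grams, padding both ends with a single "_" and
// joining the tokens of each n-gram with a space:
//   auto ngram = std::make_shared<text::Ngram>(std::vector<int32_t>{2, 3}, std::make_pair("_", 1),
//                                              std::make_pair("_", 1), " ");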

#ifndef _WIN32
// NormalizeUTF8
struct NormalizeUTF8::Data {
  explicit Data(NormalizeForm normalize_form) : normalize_form_(normalize_form) {}
  NormalizeForm normalize_form_;
};

NormalizeUTF8::NormalizeUTF8(NormalizeForm normalize_form) : data_(std::make_shared<Data>(normalize_form)) {}

std::shared_ptr<TensorOperation> NormalizeUTF8::Parse() {
  return std::make_shared<NormalizeUTF8Operation>(data_->normalize_form_);
}

// RegexReplace
struct RegexReplace::Data {
  Data(const std::vector<char> &pattern, const std::vector<char> &replace, bool replace_all)
      : pattern_(CharToString(pattern)), replace_(CharToString(replace)), replace_all_(replace_all) {}
  std::string pattern_;
  std::string replace_;
  bool replace_all_;
};

RegexReplace::RegexReplace(const std::vector<char> &pattern, const std::vector<char> &replace, bool replace_all)
    : data_(std::make_shared<Data>(pattern, replace, replace_all)) {}

std::shared_ptr<TensorOperation> RegexReplace::Parse() {
  return std::make_shared<RegexReplaceOperation>(data_->pattern_, data_->replace_, data_->replace_all_);
}

// RegexTokenizer
struct RegexTokenizer::Data {
  Data(const std::vector<char> &delim_pattern, const std::vector<char> &keep_delim_pattern, bool with_offsets)
      : delim_pattern_(CharToString(delim_pattern)),
        keep_delim_pattern_(CharToString(keep_delim_pattern)),
        with_offsets_(with_offsets) {}
  std::string delim_pattern_;
  std::string keep_delim_pattern_;
  bool with_offsets_;
};

RegexTokenizer::RegexTokenizer(const std::vector<char> &delim_pattern, const std::vector<char> &keep_delim_pattern,
                               bool with_offsets)
    : data_(std::make_shared<Data>(delim_pattern, keep_delim_pattern, with_offsets)) {}

std::shared_ptr<TensorOperation> RegexTokenizer::Parse() {
  return std::make_shared<RegexTokenizerOperation>(data_->delim_pattern_, data_->keep_delim_pattern_,
                                                   data_->with_offsets_);
}
#endif

// SentencePieceTokenizer
struct SentencePieceTokenizer::Data {
  Data(const std::shared_ptr<SentencePieceVocab> &vocab, SPieceTokenizerOutType out_type)
      : vocab_(vocab), vocab_path_(""), out_type_(out_type) {}
  Data(const std::vector<char> &vocab_path, SPieceTokenizerOutType out_type)
      : vocab_(nullptr), vocab_path_(CharToString(vocab_path)), out_type_(out_type) {}
  std::shared_ptr<SentencePieceVocab> vocab_;
  std::string vocab_path_;
  SPieceTokenizerOutType out_type_;
};

SentencePieceTokenizer::SentencePieceTokenizer(const std::shared_ptr<SentencePieceVocab> &vocab,
                                               SPieceTokenizerOutType out_type)
    : data_(std::make_shared<Data>(vocab, out_type)) {}

SentencePieceTokenizer::SentencePieceTokenizer(const std::vector<char> &vocab_path, SPieceTokenizerOutType out_type)
    : data_(std::make_shared<Data>(vocab_path, out_type)) {}

std::shared_ptr<TensorOperation> SentencePieceTokenizer::Parse() {
  if (data_->vocab_ != nullptr) {
    return std::make_shared<SentencePieceTokenizerOperation>(data_->vocab_, data_->out_type_);
  } else {
    return std::make_shared<SentencePieceTokenizerOperation>(data_->vocab_path_, data_->out_type_);
  }
}
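
// Design note: an in-memory SentencePieceVocab takes precedence; vocab_path_ is only consulted
// when the path-based constructor was used, which leaves vocab_ null.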

// SlidingWindow
struct SlidingWindow::Data {
  Data(const int32_t width, const int32_t axis) : width_(width), axis_(axis) {}
  int32_t width_;
  int32_t axis_;
};

SlidingWindow::SlidingWindow(const int32_t width, const int32_t axis) : data_(std::make_shared<Data>(width, axis)) {}

std::shared_ptr<TensorOperation> SlidingWindow::Parse() {
  return std::make_shared<SlidingWindowOperation>(data_->width_, data_->axis_);
}

// ToNumber
struct ToNumber::Data {
  dataset::DataType data_type_;
};

ToNumber::ToNumber(mindspore::DataType data_type) : data_(std::make_shared<Data>()) {
  data_->data_type_ = dataset::MSTypeToDEType(static_cast<TypeId>(data_type));
}

std::shared_ptr<TensorOperation> ToNumber::Parse() { return std::make_shared<ToNumberOperation>(data_->data_type_); }
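
// A hedged usage sketch: parse string tensors such as "123" into int32 values.
//   auto to_number = std::make_shared<text::ToNumber>(mindspore::DataType::kNumberTypeInt32);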

// ToVectors
struct ToVectors::Data {
  Data(const std::shared_ptr<Vectors> &vectors, const std::vector<float> &unk_init, bool lower_case_backup)
      : vectors_(vectors), unk_init_(unk_init), lower_case_backup_(lower_case_backup) {}
  std::shared_ptr<Vectors> vectors_;
  std::vector<float> unk_init_;
  bool lower_case_backup_;
};

ToVectors::ToVectors(const std::shared_ptr<Vectors> &vectors, const std::vector<float> &unk_init,
                     bool lower_case_backup)
    : data_(std::make_shared<Data>(vectors, unk_init, lower_case_backup)) {}

std::shared_ptr<TensorOperation> ToVectors::Parse() {
  return std::make_shared<ToVectorsOperation>(data_->vectors_, data_->unk_init_, data_->lower_case_backup_);
}

// Truncate
struct Truncate::Data {
  explicit Data(int32_t max_seq_len) : max_seq_len_(max_seq_len) {}
  int32_t max_seq_len_;
};

Truncate::Truncate(int32_t max_seq_len) : data_(std::make_shared<Data>(max_seq_len)) {}

std::shared_ptr<TensorOperation> Truncate::Parse() { return std::make_shared<TruncateOperation>(data_->max_seq_len_); }

// TruncateSequencePair
struct TruncateSequencePair::Data {
  explicit Data(int32_t max_length) : max_length_(max_length) {}
  int32_t max_length_;
};

TruncateSequencePair::TruncateSequencePair(int32_t max_length) : data_(std::make_shared<Data>(max_length)) {}

std::shared_ptr<TensorOperation> TruncateSequencePair::Parse() {
  return std::make_shared<TruncateSequencePairOperation>(data_->max_length_);
}

// UnicodeCharTokenizer
struct UnicodeCharTokenizer::Data {
  explicit Data(bool with_offsets) : with_offsets_(with_offsets) {}
  bool with_offsets_;
};

UnicodeCharTokenizer::UnicodeCharTokenizer(bool with_offsets) : data_(std::make_shared<Data>(with_offsets)) {}

std::shared_ptr<TensorOperation> UnicodeCharTokenizer::Parse() {
  return std::make_shared<UnicodeCharTokenizerOperation>(data_->with_offsets_);
}

// WordpieceTokenizer
struct WordpieceTokenizer::Data {
  Data(const std::shared_ptr<Vocab> &vocab, const std::vector<char> &suffix_indicator, int32_t max_bytes_per_token,
       const std::vector<char> &unknown_token, bool with_offsets)
      : vocab_(vocab),
        suffix_indicator_(CharToString(suffix_indicator)),
        max_bytes_per_token_(max_bytes_per_token),
        unknown_token_(CharToString(unknown_token)),
        with_offsets_(with_offsets) {}
  std::shared_ptr<Vocab> vocab_;
  std::string suffix_indicator_;
  int32_t max_bytes_per_token_;
  std::string unknown_token_;
  bool with_offsets_;
};

WordpieceTokenizer::WordpieceTokenizer(const std::shared_ptr<Vocab> &vocab, const std::vector<char> &suffix_indicator,
                                       int32_t max_bytes_per_token, const std::vector<char> &unknown_token,
                                       bool with_offsets)
    : data_(std::make_shared<Data>(vocab, suffix_indicator, max_bytes_per_token, unknown_token, with_offsets)) {}

std::shared_ptr<TensorOperation> WordpieceTokenizer::Parse() {
  return std::make_shared<WordpieceTokenizerOperation>(
    data_->vocab_, data_->suffix_indicator_, data_->max_bytes_per_token_, data_->unknown_token_, data_->with_offsets_);
}
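
// A hedged usage sketch, assuming a prebuilt std::shared_ptr<Vocab> named vocab; the argument
// values echo common BERT-style settings and are illustrative ("##" marks sub-word pieces):
//   auto wordpiece = std::make_shared<text::WordpieceTokenizer>(vocab, "##", 100, "[UNK]", false);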

#ifndef _WIN32
// UnicodeScriptTokenizer
struct UnicodeScriptTokenizer::Data {
  Data(bool keep_whitespace, bool with_offsets) : keep_whitespace_(keep_whitespace), with_offsets_(with_offsets) {}
  bool keep_whitespace_;
  bool with_offsets_;
};

UnicodeScriptTokenizer::UnicodeScriptTokenizer(bool keep_whitespace, bool with_offsets)
    : data_(std::make_shared<Data>(keep_whitespace, with_offsets)) {}

std::shared_ptr<TensorOperation> UnicodeScriptTokenizer::Parse() {
  return std::make_shared<UnicodeScriptTokenizerOperation>(data_->keep_whitespace_, data_->with_offsets_);
}

// WhitespaceTokenizer
struct WhitespaceTokenizer::Data {
  explicit Data(bool with_offsets) : with_offsets_(with_offsets) {}
  bool with_offsets_;
};

WhitespaceTokenizer::WhitespaceTokenizer(bool with_offsets) : data_(std::make_shared<Data>(with_offsets)) {}

std::shared_ptr<TensorOperation> WhitespaceTokenizer::Parse() {
  return std::make_shared<WhitespaceTokenizerOperation>(data_->with_offsets_);
}
#endif
}  // namespace text
}  // namespace dataset
}  // namespace mindspore