/**
 * Copyright 2020-2022 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "minddata/dataset/include/dataset/text.h"

#include <fstream>
#include <regex>

#include "minddata/dataset/core/type_id.h"
#include "minddata/dataset/text/ir/kernels/text_ir.h"
#include "mindspore/core/ir/dtype/type_id.h"
#include "utils/file_utils.h"

namespace mindspore {
namespace dataset {
// Transform operations for text.
namespace text {
constexpr size_t size_two = 2;
constexpr size_t size_three = 3;
constexpr int64_t value_one = 1;
constexpr int64_t value_two = 2;
constexpr size_t kMaxLoggedRows = 10;

// FUNCTIONS TO CREATE TEXT OPERATIONS
// (In alphabetical order)
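
// Usage sketch (illustrative only, not part of the upstream file): every
// transform below is a thin pimpl-style wrapper that stores its arguments in a
// private Data struct, and Parse() materializes the matching TensorOperation
// IR node. Assuming the public dataset API, a caller might write:
//   text::AddToken add_token("<cls>", /*begin=*/true);  // "<cls>" is a hypothetical token
//   std::shared_ptr<TensorOperation> node = add_token.Parse();
// In a pipeline, Parse() is normally invoked on the transform by Dataset::Map.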

// AddToken
struct AddToken::Data {
  Data(const std::string &token, bool begin) : token_(token), begin_(begin) {}
  std::string token_;
  bool begin_;
};

AddToken::AddToken(const std::string &token, bool begin) : data_(std::make_shared<Data>(token, begin)) {}

std::shared_ptr<TensorOperation> AddToken::Parse() {
  return std::make_shared<AddTokenOperation>(data_->token_, data_->begin_);
}

#ifndef _WIN32
// BasicTokenizer
struct BasicTokenizer::Data {
  Data(bool lower_case, bool keep_whitespace, const NormalizeForm normalize_form, bool preserve_unused_token,
       bool with_offsets)
      : lower_case_(lower_case),
        keep_whitespace_(keep_whitespace),
        normalize_form_(normalize_form),
        preserve_unused_token_(preserve_unused_token),
        with_offsets_(with_offsets) {}
  bool lower_case_;
  bool keep_whitespace_;
  NormalizeForm normalize_form_;
  bool preserve_unused_token_;
  bool with_offsets_;
};

BasicTokenizer::BasicTokenizer(bool lower_case, bool keep_whitespace, const NormalizeForm normalize_form,
                               bool preserve_unused_token, bool with_offsets)
    : data_(std::make_shared<Data>(lower_case, keep_whitespace, normalize_form, preserve_unused_token, with_offsets)) {}

std::shared_ptr<TensorOperation> BasicTokenizer::Parse() {
  return std::make_shared<BasicTokenizerOperation>(data_->lower_case_, data_->keep_whitespace_, data_->normalize_form_,
                                                   data_->preserve_unused_token_, data_->with_offsets_);
}

// BertTokenizer
struct BertTokenizer::Data {
  Data(const std::shared_ptr<Vocab> &vocab, const std::vector<char> &suffix_indicator, int32_t max_bytes_per_token,
       const std::vector<char> &unknown_token, bool lower_case, bool keep_whitespace,
       const NormalizeForm normalize_form, bool preserve_unused_token, bool with_offsets)
      : vocab_(vocab),
        suffix_indicator_(CharToString(suffix_indicator)),
        max_bytes_per_token_(max_bytes_per_token),
        unknown_token_(CharToString(unknown_token)),
        lower_case_(lower_case),
        keep_whitespace_(keep_whitespace),
        normalize_form_(normalize_form),
        preserve_unused_token_(preserve_unused_token),
        with_offsets_(with_offsets) {}
  std::shared_ptr<Vocab> vocab_;
  std::string suffix_indicator_;
  int32_t max_bytes_per_token_;
  std::string unknown_token_;
  bool lower_case_;
  bool keep_whitespace_;
  NormalizeForm normalize_form_;
  bool preserve_unused_token_;
  bool with_offsets_;
};

BertTokenizer::BertTokenizer(const std::shared_ptr<Vocab> &vocab, const std::vector<char> &suffix_indicator,
                             int32_t max_bytes_per_token, const std::vector<char> &unknown_token, bool lower_case,
                             bool keep_whitespace, const NormalizeForm normalize_form, bool preserve_unused_token,
                             bool with_offsets)
    : data_(std::make_shared<Data>(vocab, suffix_indicator, max_bytes_per_token, unknown_token, lower_case,
                                   keep_whitespace, normalize_form, preserve_unused_token, with_offsets)) {}

std::shared_ptr<TensorOperation> BertTokenizer::Parse() {
  return std::make_shared<BertTokenizerOperation>(
    data_->vocab_, data_->suffix_indicator_, data_->max_bytes_per_token_, data_->unknown_token_, data_->lower_case_,
    data_->keep_whitespace_, data_->normalize_form_, data_->preserve_unused_token_, data_->with_offsets_);
}

// CaseFold
CaseFold::CaseFold() = default;

std::shared_ptr<TensorOperation> CaseFold::Parse() { return std::make_shared<CaseFoldOperation>(); }

// FilterWikipediaXML
FilterWikipediaXML::FilterWikipediaXML() = default;

std::shared_ptr<TensorOperation> FilterWikipediaXML::Parse() { return std::make_shared<FilterWikipediaXMLOperation>(); }
#endif

// JiebaTokenizer
struct JiebaTokenizer::Data {
  Data(const std::vector<char> &hmm_path, const std::vector<char> &mp_path, const JiebaMode &mode, bool with_offsets)
      : hmm_path_(CharToString(hmm_path)),
        mp_path_(CharToString(mp_path)),
        mode_(mode),
        with_offsets_(with_offsets),
        words_list_({}) {}
  std::string hmm_path_;
  std::string mp_path_;
  JiebaMode mode_;
  bool with_offsets_;
  std::vector<std::pair<std::string, int64_t>> words_list_;
};

JiebaTokenizer::JiebaTokenizer(const std::vector<char> &hmm_path, const std::vector<char> &mp_path,
                               const JiebaMode &mode, bool with_offsets)
    : data_(std::make_shared<Data>(hmm_path, mp_path, mode, with_offsets)) {}

std::shared_ptr<TensorOperation> JiebaTokenizer::Parse() {
  std::shared_ptr<JiebaTokenizerOperation> jieba_tokenizer =
    std::make_shared<JiebaTokenizerOperation>(data_->hmm_path_, data_->mp_path_, data_->mode_, data_->with_offsets_);
  for (auto &word : data_->words_list_) {
    Status rc = jieba_tokenizer->AddWord(word.first, word.second);
    if (rc.IsError()) {
      MS_LOG(ERROR) << rc;
      return {};
    }
  }
  return jieba_tokenizer;
}

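// Words registered via AddWordChar/AddDictChar are buffered in words_list_ and
// replayed onto every IR node created by Parse() above.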
Status JiebaTokenizer::AddWordChar(const std::vector<char> &word, int64_t freq) {
  if (word.empty()) {
    std::string err_msg = "JiebaTokenizer : The parameter word is empty or not provided.";
    LOG_AND_RETURN_STATUS_SYNTAX_ERROR(err_msg);
  }
  if (freq < 0) {
    std::string err_msg = "JiebaTokenizer : The parameter freq must be greater than or equal to 0.";
    LOG_AND_RETURN_STATUS_SYNTAX_ERROR(err_msg);
  }
  (void)data_->words_list_.emplace_back(CharToString(word), freq);
  return Status::OK();
}

Status JiebaTokenizer::AddDictChar(const std::vector<std::pair<std::vector<char>, int64_t>> &user_dict) {
  for (auto &word_freq_pair : user_dict) {
    RETURN_IF_NOT_OK(AddWordChar(word_freq_pair.first, word_freq_pair.second));
  }
  return Status::OK();
}

Status JiebaTokenizer::AddDictChar(const std::vector<char> &file_path) {
  std::vector<std::pair<std::string, int64_t>> user_dict;
  RETURN_IF_NOT_OK(ParserFile(CharToString(file_path), &user_dict));
  RETURN_IF_NOT_OK(AddDictChar(PairStringInt64ToPairCharInt64(user_dict)));
  return Status::OK();
}

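// Parses a user dictionary file. Each non-empty line is expected to hold a word
// optionally followed by a decimal frequency, e.g. "深度学习 10" or just
// "深度学习" (example entries, not from the upstream file); lines matching
// neither form are skipped.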
Status JiebaTokenizer::ParserFile(const std::string &file_path,
                                  std::vector<std::pair<std::string, int64_t>> *const user_dict) {
  RETURN_UNEXPECTED_IF_NULL(user_dict);
  auto realpath = FileUtils::GetRealPath(file_path.c_str());
  if (!realpath.has_value()) {
    std::string err_msg = "Get real path failed, path: " + file_path;
    LOG_AND_RETURN_STATUS_SYNTAX_ERROR(err_msg);
  }

  std::ifstream ifs(realpath.value(), std::ios::in);
  if (!ifs) {
    std::string err_msg = "JiebaTokenizer : Failed to load the dictionary from the input file, check the file path.";
    LOG_AND_RETURN_STATUS_SYNTAX_ERROR(err_msg);
  }

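  // Matches "<word>" or "<word> <freq>": capture group 1 is the word, the
  // optional capture group 2 a decimal frequency.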
  std::regex regex("^\\s*([^\\s*]+?)\\s*([0-9]+)?\\s*$");
  std::string line;
  while (std::getline(ifs, line)) {
    if (line.empty()) {
      continue;
    }
    std::smatch tokens;
    if (!std::regex_match(line, tokens, regex)) {
      continue;
    }
    if (tokens.size() == size_two) {
      (void)user_dict->emplace_back(tokens.str(value_one), 0);
    } else if (tokens.size() == size_three) {
      (void)user_dict->emplace_back(tokens.str(value_one), strtoll(tokens.str(value_two).c_str(), nullptr, 0));
    }
  }
  ifs.close();
  MS_LOG(INFO) << "JiebaTokenizer::AddDict: The size of the user input dictionary is: " << user_dict->size();
  MS_LOG(INFO) << "Valid rows in the input dictionary (at most the first " << kMaxLoggedRows << " rows are shown):";
  for (std::size_t i = 0; i != user_dict->size(); ++i) {
    if (i >= kMaxLoggedRows) {
      break;
    }
    MS_LOG(INFO) << user_dict->at(i).first << " " << user_dict->at(i).second;
  }
  return Status::OK();
}

// Lookup
struct Lookup::Data {
  Data(const std::shared_ptr<Vocab> &vocab, const std::optional<std::vector<char>> &unknown_token,
       mindspore::DataType data_type)
      : vocab_(vocab), data_type_(dataset::MSTypeToDEType(static_cast<TypeId>(data_type))) {
    if (unknown_token == std::nullopt) {
      unknown_token_ = std::nullopt;
    } else {
      unknown_token_ = std::string(unknown_token->begin(), unknown_token->end());
    }
  }
  std::shared_ptr<Vocab> vocab_;
  std::optional<std::string> unknown_token_;
  dataset::DataType data_type_;
};

Lookup::Lookup(const std::shared_ptr<Vocab> &vocab, const std::optional<std::vector<char>> &unknown_token,
               mindspore::DataType data_type)
    : data_(std::make_shared<Data>(vocab, unknown_token, data_type)) {}

std::shared_ptr<TensorOperation> Lookup::Parse() {
  return std::make_shared<LookupOperation>(data_->vocab_, data_->unknown_token_, data_->data_type_);
}

// Ngram
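// Per the public API, left_pad and right_pad are (pad token, pad width) pairs
// applied to the ends of the sequence; separator joins the tokens of each
// generated n-gram.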
struct Ngram::Data {
  Data(const std::vector<int32_t> &ngrams, const std::pair<std::vector<char>, int32_t> &left_pad,
       const std::pair<std::vector<char>, int32_t> &right_pad, const std::vector<char> &separator)
      : ngrams_(ngrams),
        left_pad_(PairCharToString(left_pad)),
        right_pad_(PairCharToString(right_pad)),
        separator_(CharToString(separator)) {}
  std::vector<int32_t> ngrams_;
  std::pair<std::string, int32_t> left_pad_;
  std::pair<std::string, int32_t> right_pad_;
  std::string separator_;
};

Ngram::Ngram(const std::vector<int32_t> &ngrams, const std::pair<std::vector<char>, int32_t> &left_pad,
             const std::pair<std::vector<char>, int32_t> &right_pad, const std::vector<char> &separator)
    : data_(std::make_shared<Data>(ngrams, left_pad, right_pad, separator)) {}

std::shared_ptr<TensorOperation> Ngram::Parse() {
  return std::make_shared<NgramOperation>(data_->ngrams_, data_->left_pad_, data_->right_pad_, data_->separator_);
}

#ifndef _WIN32
// NormalizeUTF8
struct NormalizeUTF8::Data {
  explicit Data(NormalizeForm normalize_form) : normalize_form_(normalize_form) {}
  NormalizeForm normalize_form_;
};

NormalizeUTF8::NormalizeUTF8(NormalizeForm normalize_form) : data_(std::make_shared<Data>(normalize_form)) {}

std::shared_ptr<TensorOperation> NormalizeUTF8::Parse() {
  return std::make_shared<NormalizeUTF8Operation>(data_->normalize_form_);
}

// RegexReplace
struct RegexReplace::Data {
  Data(const std::vector<char> &pattern, const std::vector<char> &replace, bool replace_all)
      : pattern_(CharToString(pattern)), replace_(CharToString(replace)), replace_all_(replace_all) {}
  std::string pattern_;
  std::string replace_;
  bool replace_all_;
};

RegexReplace::RegexReplace(const std::vector<char> &pattern, const std::vector<char> &replace, bool replace_all)
    : data_(std::make_shared<Data>(pattern, replace, replace_all)) {}

std::shared_ptr<TensorOperation> RegexReplace::Parse() {
  return std::make_shared<RegexReplaceOperation>(data_->pattern_, data_->replace_, data_->replace_all_);
}

// RegexTokenizer
struct RegexTokenizer::Data {
  Data(const std::vector<char> &delim_pattern, const std::vector<char> &keep_delim_pattern, bool with_offsets)
      : delim_pattern_(CharToString(delim_pattern)),
        keep_delim_pattern_(CharToString(keep_delim_pattern)),
        with_offsets_(with_offsets) {}
  std::string delim_pattern_;
  std::string keep_delim_pattern_;
  bool with_offsets_;
};

RegexTokenizer::RegexTokenizer(const std::vector<char> &delim_pattern, const std::vector<char> &keep_delim_pattern,
                               bool with_offsets)
    : data_(std::make_shared<Data>(delim_pattern, keep_delim_pattern, with_offsets)) {}

std::shared_ptr<TensorOperation> RegexTokenizer::Parse() {
  return std::make_shared<RegexTokenizerOperation>(data_->delim_pattern_, data_->keep_delim_pattern_,
                                                   data_->with_offsets_);
}
#endif

// SentencePieceTokenizer
struct SentencePieceTokenizer::Data {
  Data(const std::shared_ptr<SentencePieceVocab> &vocab, SPieceTokenizerOutType out_type)
      : vocab_(vocab), vocab_path_(""), out_type_(out_type) {}
  Data(const std::vector<char> &vocab_path, SPieceTokenizerOutType out_type)
      : vocab_(nullptr), vocab_path_(CharToString(vocab_path)), out_type_(out_type) {}
  std::shared_ptr<SentencePieceVocab> vocab_;
  std::string vocab_path_;
  SPieceTokenizerOutType out_type_;
};

SentencePieceTokenizer::SentencePieceTokenizer(const std::shared_ptr<SentencePieceVocab> &vocab,
                                               SPieceTokenizerOutType out_type)
    : data_(std::make_shared<Data>(vocab, out_type)) {}

SentencePieceTokenizer::SentencePieceTokenizer(const std::vector<char> &vocab_path, SPieceTokenizerOutType out_type)
    : data_(std::make_shared<Data>(vocab_path, out_type)) {}

std::shared_ptr<TensorOperation> SentencePieceTokenizer::Parse() {
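  // A non-null vocab_ indicates the in-memory vocab constructor was used;
  // otherwise build the operation from the vocab file path.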
  if (data_->vocab_ != nullptr) {
    return std::make_shared<SentencePieceTokenizerOperation>(data_->vocab_, data_->out_type_);
  } else {
    return std::make_shared<SentencePieceTokenizerOperation>(data_->vocab_path_, data_->out_type_);
  }
}

// SlidingWindow
struct SlidingWindow::Data {
  Data(const int32_t width, const int32_t axis) : width_(width), axis_(axis) {}
  int32_t width_;
  int32_t axis_;
};

SlidingWindow::SlidingWindow(const int32_t width, const int32_t axis) : data_(std::make_shared<Data>(width, axis)) {}

std::shared_ptr<TensorOperation> SlidingWindow::Parse() {
  return std::make_shared<SlidingWindowOperation>(data_->width_, data_->axis_);
}

// ToNumber
struct ToNumber::Data {
  dataset::DataType data_type_;
};

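// The public mindspore::DataType is converted to the internal dataset DataType
// via MSTypeToDEType before it is stored.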
ToNumber::ToNumber(mindspore::DataType data_type) : data_(std::make_shared<Data>()) {
  data_->data_type_ = dataset::MSTypeToDEType(static_cast<TypeId>(data_type));
}

std::shared_ptr<TensorOperation> ToNumber::Parse() { return std::make_shared<ToNumberOperation>(data_->data_type_); }

// ToVectors
struct ToVectors::Data {
  Data(const std::shared_ptr<Vectors> &vectors, const std::vector<float> &unk_init, bool lower_case_backup)
      : vectors_(vectors), unk_init_(unk_init), lower_case_backup_(lower_case_backup) {}
  std::shared_ptr<Vectors> vectors_;
  std::vector<float> unk_init_;
  bool lower_case_backup_;
};

ToVectors::ToVectors(const std::shared_ptr<Vectors> &vectors, const std::vector<float> &unk_init,
                     bool lower_case_backup)
    : data_(std::make_shared<Data>(vectors, unk_init, lower_case_backup)) {}

std::shared_ptr<TensorOperation> ToVectors::Parse() {
  return std::make_shared<ToVectorsOperation>(data_->vectors_, data_->unk_init_, data_->lower_case_backup_);
}

// Truncate
struct Truncate::Data {
  explicit Data(int32_t max_seq_len) : max_seq_len_(max_seq_len) {}
  int32_t max_seq_len_;
};

Truncate::Truncate(int32_t max_seq_len) : data_(std::make_shared<Data>(max_seq_len)) {}

std::shared_ptr<TensorOperation> Truncate::Parse() { return std::make_shared<TruncateOperation>(data_->max_seq_len_); }

// TruncateSequencePair
struct TruncateSequencePair::Data {
  explicit Data(int32_t max_length) : max_length_(max_length) {}
  int32_t max_length_;
};

TruncateSequencePair::TruncateSequencePair(int32_t max_length) : data_(std::make_shared<Data>(max_length)) {}

std::shared_ptr<TensorOperation> TruncateSequencePair::Parse() {
  return std::make_shared<TruncateSequencePairOperation>(data_->max_length_);
}

// UnicodeCharTokenizer
struct UnicodeCharTokenizer::Data {
  explicit Data(bool with_offsets) : with_offsets_(with_offsets) {}
  bool with_offsets_;
};

UnicodeCharTokenizer::UnicodeCharTokenizer(bool with_offsets) : data_(std::make_shared<Data>(with_offsets)) {}

std::shared_ptr<TensorOperation> UnicodeCharTokenizer::Parse() {
  return std::make_shared<UnicodeCharTokenizerOperation>(data_->with_offsets_);
}

// WordpieceTokenizer
struct WordpieceTokenizer::Data {
  Data(const std::shared_ptr<Vocab> &vocab, const std::vector<char> &suffix_indicator, int32_t max_bytes_per_token,
       const std::vector<char> &unknown_token, bool with_offsets)
      : vocab_(vocab),
        suffix_indicator_(CharToString(suffix_indicator)),
        max_bytes_per_token_(max_bytes_per_token),
        unknown_token_(CharToString(unknown_token)),
        with_offsets_(with_offsets) {}
  std::shared_ptr<Vocab> vocab_;
  std::string suffix_indicator_;
  int32_t max_bytes_per_token_;
  std::string unknown_token_;
  bool with_offsets_;
};

WordpieceTokenizer::WordpieceTokenizer(const std::shared_ptr<Vocab> &vocab, const std::vector<char> &suffix_indicator,
                                       int32_t max_bytes_per_token, const std::vector<char> &unknown_token,
                                       bool with_offsets)
    : data_(std::make_shared<Data>(vocab, suffix_indicator, max_bytes_per_token, unknown_token, with_offsets)) {}

std::shared_ptr<TensorOperation> WordpieceTokenizer::Parse() {
  return std::make_shared<WordpieceTokenizerOperation>(
    data_->vocab_, data_->suffix_indicator_, data_->max_bytes_per_token_, data_->unknown_token_, data_->with_offsets_);
}

#ifndef _WIN32
// UnicodeScriptTokenizer
struct UnicodeScriptTokenizer::Data {
  Data(bool keep_whitespace, bool with_offsets) : keep_whitespace_(keep_whitespace), with_offsets_(with_offsets) {}
  bool keep_whitespace_;
  bool with_offsets_;
};

UnicodeScriptTokenizer::UnicodeScriptTokenizer(bool keep_whitespace, bool with_offsets)
    : data_(std::make_shared<Data>(keep_whitespace, with_offsets)) {}

std::shared_ptr<TensorOperation> UnicodeScriptTokenizer::Parse() {
  return std::make_shared<UnicodeScriptTokenizerOperation>(data_->keep_whitespace_, data_->with_offsets_);
}

// WhitespaceTokenizer
struct WhitespaceTokenizer::Data {
  explicit Data(bool with_offsets) : with_offsets_(with_offsets) {}
  bool with_offsets_;
};

WhitespaceTokenizer::WhitespaceTokenizer(bool with_offsets) : data_(std::make_shared<Data>(with_offsets)) {}

std::shared_ptr<TensorOperation> WhitespaceTokenizer::Parse() {
  return std::make_shared<WhitespaceTokenizerOperation>(data_->with_offsets_);
}
#endif
}  // namespace text
}  // namespace dataset
}  // namespace mindspore