/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "minddata/dataset/text/kernels/basic_tokenizer_op.h"
#include <memory>
#include <queue>
#include <string>
#include <string_view>
#include <utility>
#include <vector>

#include "unicode/errorcode.h"
#include "unicode/normalizer2.h"

namespace mindspore {
namespace dataset {

const bool BasicTokenizerOp::kDefLowerCase = false;
const bool BasicTokenizerOp::kDefKeepWhitespace = false;
const NormalizeForm BasicTokenizerOp::kDefNormalizationForm = NormalizeForm::kNone;
const bool BasicTokenizerOp::kDefPreserveUnusedToken = true;
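// Delimiter pattern: ASCII punctuation ranges ([!-/], [:-@], [\[-`], [{-~]), any Unicode
// punctuation (\p{P}), and the CJK ideograph blocks (Unified Ideographs, Extensions A-E, and the
// compatibility ideographs), so punctuation and each CJK character are split out as single tokens.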
const char BasicTokenizerOp::kCommonPattern[] =
  "[!-/]"
  "|[:-@]"
  "|[\\[-`]"
  "|[{-~]"
  "|[\\p{P}]"
  "|[\\x{4E00}-\\x{9FFF}]"
  "|[\\x{3400}-\\x{4DBF}]"
  "|[\\x{20000}-\\x{2A6DF}]"
  "|[\\x{2A700}-\\x{2B73F}]"
  "|[\\x{2B740}-\\x{2B81F}]"
  "|[\\x{2B820}-\\x{2CEAF}]"
  "|[\\x{F900}-\\x{FAFF}]"
  "|[\\x{2F800}-\\x{2FA1F}]";
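// BERT-style special tokens ([CLS], [SEP], [UNK], [PAD], [MASK], [unusedN]); when
// preserve_unused_token is true they are matched as whole tokens and exempted from case folding.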
const char BasicTokenizerOp::kUnusedPattern[] = "\\[CLS\\]|\\[SEP\\]|\\[UNK\\]|\\[PAD\\]|\\[MASK\\]|\\[unused\\d+\\]|";
const std::unordered_set<std::string> BasicTokenizerOp::kUnusedWords{"[CLS]", "[SEP]", "[UNK]", "[PAD]", "[MASK]"};

BasicTokenizerOp::BasicTokenizerOp(const bool &lower_case, const bool &keep_whitespace,
                                   const NormalizeForm &normalization_form, const bool &preserve_unused_token,
                                   const bool &with_offsets)
    : TokenizerOp(with_offsets),
      lower_case_(lower_case),
      keep_whitespace_(keep_whitespace),
      normalization_form_(normalization_form),
      preserve_unused_token_(preserve_unused_token),
      case_fold_(std::make_unique<CaseFoldOp>()),
      nfd_normalize_(std::make_unique<NormalizeUTF8Op>(NormalizeForm::kNfd)),
      common_normalize_(std::make_unique<NormalizeUTF8Op>(normalization_form)),
      replace_accent_chars_(std::make_unique<RegexReplaceOp>("\\p{Mn}", "")),
      replace_control_chars_(std::make_unique<RegexReplaceOp>("\\p{Cc}|\\p{Cf}", " ")) {
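  // Build the splitting rules for RegexTokenizerOp: delim_pattern decides where the text is
  // split, keep_delim_pattern decides which of those delimiters are emitted as tokens
  // (whitespace is dropped unless keep_whitespace_ is set).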
  std::string delim_pattern = std::string("\\s+|") + kCommonPattern;
  std::string keep_delim_pattern;
  if (keep_whitespace_) {
    keep_delim_pattern = delim_pattern;
  } else {
    keep_delim_pattern = kCommonPattern;
  }
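  // Prepend the special-token alternation so [CLS]/[SEP]/... are matched first and survive as
  // whole tokens instead of being split at '[' and ']'.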
  if (preserve_unused_token_) {
    keep_delim_pattern = kUnusedPattern + keep_delim_pattern;
    delim_pattern = kUnusedPattern + delim_pattern;
  }
  regex_tokenizer_ = std::make_unique<RegexTokenizerOp>(delim_pattern, keep_delim_pattern, with_offsets_);
}

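// Case fold `text` with ICU NFKC_Casefold, leaving any substring listed in `unused_words`
// untouched, and write the result to `output`.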
Status BasicTokenizerOp::CaseFoldWithoutUnusedWords(const std::string_view &text,
                                                    const std::unordered_set<std::string> &unused_words,
                                                    std::string *output) {
  icu::ErrorCode error;
  const icu::Normalizer2 *nfkc_case_fold = icu::Normalizer2::getNFKCCasefoldInstance(error);
  CHECK_FAIL_RETURN_UNEXPECTED(error.isSuccess(), "BasicTokenizer: getNFKCCasefoldInstance failed.");
  RETURN_UNEXPECTED_IF_NULL(output);
  output->clear();

  // 1. Get the start and end offsets of the substrings that must not be case folded
  std::queue<std::pair<int, int>> offsets;  // offsets of the preserved special tokens
  int start = -1;
  int len = 0;
  for (int i = 0; i < text.length(); i++) {
    if (text[i] == '[') {
      // '[' (re)starts a candidate special token at the current position
      start = i;
      len = 1;
    } else if (text[i] == ']' && start >= 0) {
      ++len;
      std::string word(text.substr(start, len));
      if (unused_words.find(word) != unused_words.end()) {
        offsets.push(std::make_pair(start, start + len - 1));
      }
      start = -1;
      len = 0;
    } else if (start >= 0) {
      ++len;
    }
  }

  // 2. Do not apply case fold on `unused_words`
  start = 0;
  for (int i = 0; i < text.length();) {
    std::string_view process_text;
    std::string preserve_token;
    if (offsets.empty()) {
      i = text.length();
      process_text = text.substr(start, i - start);
    } else {
      preserve_token = text.substr(offsets.front().first, offsets.front().second - offsets.front().first + 1);
      process_text = text.substr(start, offsets.front().first - start);
      i = offsets.front().second + 1;
      start = i;  // resume case folding right after the preserved token
      offsets.pop();
    }
    std::string temp;
    icu::StringByteSink<std::string> sink(&temp);
    nfkc_case_fold->normalizeUTF8(0, icu::StringPiece(process_text.data(), process_text.size()), sink, nullptr, error);
    *output += temp + preserve_token;
  }
  return Status::OK();
}

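// Tensor overload: case fold every string element of `input` (preserving kUnusedWords) and
// rebuild a string tensor of the same shape.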
Status BasicTokenizerOp::CaseFoldWithoutUnusedWords(const std::shared_ptr<Tensor> &input,
                                                    std::shared_ptr<Tensor> *output) {
  IO_CHECK(input, output);
  CHECK_FAIL_RETURN_UNEXPECTED(input->type() == DataType::DE_STRING, "BasicTokenizer: input is not string datatype.");
  std::vector<std::string> strs(input->Size());
  size_t i = 0;
  for (auto iter = input->begin<std::string_view>(); iter != input->end<std::string_view>(); iter++) {
    RETURN_IF_NOT_OK(CaseFoldWithoutUnusedWords(*iter, kUnusedWords, &strs[i++]));
  }
  return Tensor::CreateFromVector(strs, input->shape(), output);
}

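// Pipeline: (optional) case folding -> NFD + accent stripping, or the user-selected Unicode
// normalization -> control characters replaced with spaces -> regex tokenization.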
Status BasicTokenizerOp::Compute(const TensorRow &input, TensorRow *output) {
  IO_CHECK_VECTOR(input, output);
  CHECK_FAIL_RETURN_UNEXPECTED(input.size() == 1, "BasicTokenizer: the input should have only one column.");
  if (input[0]->Rank() != 0 || input[0]->type() != DataType::DE_STRING) {
    RETURN_STATUS_UNEXPECTED("BasicTokenizer: the input should be a scalar with string datatype.");
  }
  std::shared_ptr<Tensor> cur_input;
  std::shared_ptr<Tensor> processed_tensor;
  if (lower_case_) {
    if (!preserve_unused_token_) {
      // to lower case
      RETURN_IF_NOT_OK(case_fold_->Compute(input[0], &processed_tensor));
    } else {
      // to lower case except words in kUnusedWords
      RETURN_IF_NOT_OK(CaseFoldWithoutUnusedWords(input[0], &processed_tensor));
    }
    cur_input = processed_tensor;
    // strip accent characters
    RETURN_IF_NOT_OK(nfd_normalize_->Compute(cur_input, &processed_tensor));
    cur_input = processed_tensor;
    RETURN_IF_NOT_OK(replace_accent_chars_->Compute(cur_input, &processed_tensor));
  } else {
    RETURN_IF_NOT_OK(common_normalize_->Compute(input[0], &processed_tensor));
  }
  // strip control characters
  cur_input = processed_tensor;
  RETURN_IF_NOT_OK(replace_control_chars_->Compute(cur_input, &processed_tensor));
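  // Finally, split the normalized text into tokens (and offsets when with_offsets_ is set).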
  return regex_tokenizer_->Compute(TensorRow(0, {std::move(processed_tensor)}), output);
}
}  // namespace dataset
}  // namespace mindspore