• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright 2020 Huawei Technologies Co., Ltd
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 #ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_KERNELS_BASIC_TOKENIZER_OP_H_
17 #define MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_KERNELS_BASIC_TOKENIZER_OP_H_
18 #include <memory>
19 #include <string>
20 #include <unordered_set>
21 
22 #include "minddata/dataset/core/tensor.h"
23 #include "minddata/dataset/kernels/tensor_op.h"
24 #include "minddata/dataset/text/kernels/case_fold_op.h"
25 #include "minddata/dataset/text/kernels/normalize_utf8_op.h"
26 #include "minddata/dataset/text/kernels/regex_replace_op.h"
27 #include "minddata/dataset/text/kernels/regex_tokenizer_op.h"
28 #include "minddata/dataset/text/kernels/tokenizer_op.h"
29 #include "minddata/dataset/util/status.h"
30 
31 namespace mindspore {
32 namespace dataset {
33 
// Tokenizer op that publicly derives from TokenizerOp. This header only declares the
// interface; every member function except Name() and the defaulted destructor is
// defined out of line.
// NOTE(review): std::string_view is used below, but <string_view> is not among the
// headers included directly by this file -- presumably it arrives transitively; confirm.
34 class BasicTokenizerOp : public TokenizerOp {
35  public:
     // Default values for the constructor arguments below. They are declared here
     // without initializers, so their values are defined out of line.
36   static const bool kDefLowerCase;
37   static const bool kDefKeepWhitespace;
38   static const NormalizeForm kDefNormalizationForm;
39   static const bool kDefPreserveUnusedToken;
40 
     // Constructs the op. Every argument is optional and falls back to the matching
     // kDef* constant. kDefWithOffsets is not declared in this class -- presumably it
     // is inherited from TokenizerOp; confirm.
     // The signature (including the by-const-reference bools) must stay in sync with
     // the out-of-line definition.
41   explicit BasicTokenizerOp(const bool &lower_case = kDefLowerCase, const bool &keep_whitespace = kDefKeepWhitespace,
42                             const NormalizeForm &normalization_form = kDefNormalizationForm,
43                             const bool &preserve_unused_token = kDefPreserveUnusedToken,
44                             const bool &with_offsets = kDefWithOffsets);
45 
     // Defaulted destructor; the std::unique_ptr members release the helper ops they own.
46   ~BasicTokenizerOp() override = default;
47 
     // Overrides the base-class Compute. Takes the input row by const reference and
     // an output row by pointer, and returns a Status. The processing itself is
     // implemented outside this header.
48   Status Compute(const TensorRow &input, TensorRow *output) override;
49 
50  protected:
     // String overload: takes a text view plus a set of words and writes its result
     // to *output. Judging by the name, it case-folds the text while leaving the
     // listed words untouched -- confirm against the implementation.
51   Status CaseFoldWithoutUnusedWords(const std::string_view &text, const std::unordered_set<std::string> &unused_words,
52                                     std::string *output);
     // Tensor overload: takes an input tensor and writes its result tensor to *output.
     // Presumably applies the string overload to the tensor's elements -- confirm.
53   Status CaseFoldWithoutUnusedWords(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output);
54 
     // Returns kBasicTokenizerOp (a constant declared outside this header) as the op name.
     // NOTE(review): this override sits under `protected`; confirm that access level
     // is intended.
Name()55   std::string Name() const override { return kBasicTokenizerOp; }
56 
57  private:
     // Class-wide constants, declared here and defined out of line. The names suggest
     // regex patterns and the set of "unused" words to preserve -- confirm in the .cc.
58   static const char kCommonPattern[];
59   static const char kUnusedPattern[];
60   static const std::unordered_set<std::string> kUnusedWords;
     // Per-instance configuration; presumably copies of the like-named constructor
     // arguments -- confirm in the constructor definition.
     // Member order determines construction order, so do not reorder casually.
61   bool lower_case_;
62   bool keep_whitespace_;
63   NormalizeForm normalization_form_;
64   bool preserve_unused_token_;
     // Helper ops exclusively owned by this object (std::unique_ptr). How each one is
     // configured and when it is invoked is decided outside this header.
65   std::unique_ptr<CaseFoldOp> case_fold_;
66   std::unique_ptr<NormalizeUTF8Op> nfd_normalize_;
67   std::unique_ptr<NormalizeUTF8Op> common_normalize_;
68   std::unique_ptr<RegexReplaceOp> replace_accent_chars_;
69   std::unique_ptr<RegexReplaceOp> replace_control_chars_;
70   std::unique_ptr<RegexTokenizerOp> regex_tokenizer_;
71 };
72 }  // namespace dataset
73 }  // namespace mindspore
74 #endif  // MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_KERNELS_BASIC_TOKENIZER_OP_H_
75