1 /** 2 * Copyright 2020 Huawei Technologies Co., Ltd 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_SENTENCE_PIECE_VOCAB_H_ 18 #define MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_SENTENCE_PIECE_VOCAB_H_ 19 20 #include <string> 21 #include <memory> 22 #include <vector> 23 #include <unordered_map> 24 #include "minddata/dataset/util/status.h" 25 #include "minddata/dataset/include/dataset/constants.h" 26 27 namespace mindspore { 28 namespace dataset { 29 30 class SentencePieceVocab { 31 public: 32 static Status BuildFromFile(const std::vector<std::string> &path_list, const int32_t vocab_size, 33 const float character_coverage, const SentencePieceModel model_type, 34 const std::unordered_map<std::string, std::string> ¶ms, 35 std::shared_ptr<SentencePieceVocab> *vocab); 36 static Status SaveModel(const std::shared_ptr<SentencePieceVocab> *vocab, std::string path, std::string filename); 37 SentencePieceVocab(); 38 39 ~SentencePieceVocab() = default; 40 41 const std::string &model_proto(); 42 43 void set_model_proto(const std::string model_proto); 44 45 private: 46 std::string model_proto_; 47 }; 48 } // namespace dataset 49 } // namespace mindspore 50 #endif // MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_SENTENCE_PIECE_VOCAB_H_ 51