• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright 2021 Huawei Technologies Co., Ltd
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_VECTORS_H_
18 #define MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_VECTORS_H_
19 
20 #include <algorithm>
21 #include <limits>
22 #include <memory>
23 #include <string>
24 #include <unordered_map>
25 #include <utility>
26 #include <vector>
27 
28 #include "minddata/dataset/core/tensor.h"
29 #include "minddata/dataset/include/dataset/iterator.h"
30 
31 namespace mindspore {
32 namespace dataset {
33 /// \brief Pre-train word vectors.
34 class Vectors {
35  public:
36   /// Constructor.
37   Vectors() = default;
38 
39   /// Constructor.
40   /// \param[in] map A map between string and vector.
41   /// \param[in] dim Dimension of the vectors.
42   Vectors(const std::unordered_map<std::string, std::vector<float>> &map, int32_t dim);
43 
44   /// Destructor.
45   virtual ~Vectors() = default;
46 
47   /// \brief Build Vectors from reading a pre-train vector file.
48   /// \param[out] vectors Vectors object which contains the pre-train vectors.
49   /// \param[in] path Path to the pre-trained word vector file.
50   /// \param[in] max_vectors This can be used to limit the number of pre-trained vectors loaded (default=0, no limit).
51   static Status BuildFromFile(std::shared_ptr<Vectors> *vectors, const std::string &path, int32_t max_vectors = 0);
52 
53   /// \brief Look up embedding vectors of token.
54   /// \param[in] token A token to be looked up.
55   /// \param[in] unk_init In case of the token is out-of-vectors (OOV), the result will be initialized with `unk_init`.
56   ///     (default={}, means to initialize with zero vectors).
57   /// \param[in] lower_case_backup Whether to look up the token in the lower case (Default = false).
58   /// \return The vector of the input token.
59   virtual std::vector<float> Lookup(const std::string &token, const std::vector<float> &unk_init = {},
60                                     bool lower_case_backup = false);
61 
62   /// \brief Getter of dimension.
Dim()63   const int32_t &Dim() const { return dim_; }
64 
65  protected:
66   /// \brief Infer the shape of the pre-trained word vector file.
67   /// \param[in] path Path to the pre-trained word vector file.
68   /// \param[in] max_vectors Maximum number of pre-trained word vectors to be read.
69   /// \param[out] num_lines The number of lines of the file.
70   /// \param[out] header_num_lines The number of lines of file header.
71   /// \param[out] vector_dim The dimension of the vectors in the file.
72   static Status InferShape(const std::string &path, int32_t max_vectors, int32_t *num_lines, int32_t *header_num_lines,
73                            int32_t *vector_dim);
74 
75   /// \brief Load map from reading a pre-train vector file.
76   /// \param[in] path Path to the pre-trained word vector file.
77   /// \param[in] max_vectors This can be used to limit the number of pre-trained vectors loaded, must be non negative.
78   /// \param[out] map The map between words and vectors.
79   /// \param[out] vector_dim The dimension of the vectors in the file.
80   static Status Load(const std::string &path, int32_t max_vectors,
81                      std::unordered_map<std::string, std::vector<float>> *map, int32_t *vector_dim);
82 
83   int32_t dim_;
84   std::unordered_map<std::string, std::vector<float>> map_;
85 };
86 }  // namespace dataset
87 }  // namespace mindspore
88 #endif  // MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_VECTORS_H_
89