• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16 
17 #ifndef MINDSPORE_CCSRC_MINDDATA_MINDRECORD_INCLUDE_SHARD_COLUMN_H_
18 #define MINDSPORE_CCSRC_MINDDATA_MINDRECORD_INCLUDE_SHARD_COLUMN_H_
19 
20 #include <memory>
21 #include <string>
22 #include <unordered_map>
23 #include <utility>
24 #include <vector>
25 #include "minddata/mindrecord/include/shard_header.h"
26 
27 namespace mindspore {
28 namespace mindrecord {
29 const uint64_t kUnsignedOne = 1;
30 const uint64_t kBitsOfByte = 8;
31 const uint64_t kDataTypeBits = 2;
32 const uint64_t kNumDataOfByte = 4;
33 const uint64_t kBytesOfColumnLen = 4;
34 const uint64_t kDataTypeBitMask = 3;
35 const uint64_t kDataTypes = 6;
36 
37 enum IntegerType { kInt8Type = 0, kInt16Type, kInt32Type, kInt64Type };
38 
39 enum ColumnCategory { ColumnInRaw, ColumnInBlob, ColumnNotFound };
40 
41 enum ColumnDataType {
42   ColumnBytes = 0,
43   ColumnString = 1,
44   ColumnInt32 = 2,
45   ColumnInt64 = 3,
46   ColumnFloat32 = 4,
47   ColumnFloat64 = 5,
48   ColumnNoDataType = 6
49 };
50 
51 const uint32_t ColumnDataTypeSize[kDataTypes] = {1, 1, 4, 8, 4, 8};
52 
53 const std::vector<std::string> ColumnDataTypeNameNormalized = {"uint8", "string",  "int32",
54                                                                "int64", "float32", "float64"};
55 
56 const std::unordered_map<std::string, ColumnDataType> ColumnDataTypeMap = {
57   {"bytes", ColumnBytes}, {"string", ColumnString},   {"int32", ColumnInt32},
58   {"int64", ColumnInt64}, {"float32", ColumnFloat32}, {"float64", ColumnFloat64}};
59 
60 class __attribute__((visibility("default"))) ShardColumn {
61  public:
62   explicit ShardColumn(const std::shared_ptr<ShardHeader> &shard_header, bool compress_integer = true);
63   explicit ShardColumn(const json &schema_json, bool compress_integer = true);
64 
65   ~ShardColumn() = default;
66 
67   /// \brief get column value by column name
68   Status GetColumnValueByName(const std::string &column_name, const std::vector<uint8_t> &columns_blob,
69                               const json &columns_json, const unsigned char **data,
70                               std::unique_ptr<unsigned char[]> *data_ptr, uint64_t *const n_bytes,
71                               ColumnDataType *column_data_type, uint64_t *column_data_type_size,
72                               std::vector<int64_t> *column_shape);
73 
74   /// \brief compress blob
75   std::vector<uint8_t> CompressBlob(const std::vector<uint8_t> &blob, int64_t *compression_size);
76 
77   /// \brief check if blob compressed
CheckCompressBlob()78   bool CheckCompressBlob() const { return has_compress_blob_; }
79 
80   /// \brief getter
GetNumBlobColumn()81   uint64_t GetNumBlobColumn() const { return num_blob_column_; }
82 
83   /// \brief getter
GetColumnName()84   std::vector<std::string> GetColumnName() { return column_name_; }
85 
86   /// \brief getter
GeColumnDataType()87   std::vector<ColumnDataType> GeColumnDataType() { return column_data_type_; }
88 
89   /// \brief getter
GetColumnShape()90   std::vector<std::vector<int64_t>> GetColumnShape() { return column_shape_; }
91 
92   /// \brief get column value from blob
93   Status GetColumnFromBlob(const std::string &column_name, const std::vector<uint8_t> &columns_blob,
94                            const unsigned char **data, std::unique_ptr<unsigned char[]> *data_ptr,
95                            uint64_t *const n_bytes);
96 
97   /// \brief get column type
98   Status GetColumnTypeByName(const std::string &column_name, ColumnDataType *column_data_type,
99                              uint64_t *column_data_type_size, std::vector<int64_t> *column_shape,
100                              ColumnCategory *column_category);
101 
102   /// \brief get column value from json
103   Status GetColumnFromJson(const std::string &column_name, const json &columns_json,
104                            std::unique_ptr<unsigned char[]> *data_ptr, uint64_t *n_bytes);
105 
106  private:
107   /// \brief initialization
108   void Init(const json &schema_json, bool compress_integer = true);
109 
110   /// \brief get float value from json
111   template <typename T>
112   Status GetFloat(std::unique_ptr<unsigned char[]> *data_ptr, const json &json_column_value, bool use_double);
113 
114   /// \brief get integer value from json
115   template <typename T>
116   Status GetInt(std::unique_ptr<unsigned char[]> *data_ptr, const json &json_column_value);
117 
118   /// \brief get column offset address and size from blob
119   Status GetColumnAddressInBlock(const uint64_t &column_id, const std::vector<uint8_t> &columns_blob,
120                                  uint64_t *num_bytes, uint64_t *shift_idx);
121 
122   /// \brief check if column name is available
123   ColumnCategory CheckColumnName(const std::string &column_name);
124 
125   /// \brief compress integer column
126   static vector<uint8_t> CompressInt(const vector<uint8_t> &src_bytes, const IntegerType &int_type);
127 
128   /// \brief uncompress integer array column
129   template <typename T>
130   static Status UncompressInt(const uint64_t &column_id, std::unique_ptr<unsigned char[]> *const data_ptr,
131                               const std::vector<uint8_t> &columns_blob, uint64_t *num_bytes, uint64_t shift_idx);
132 
133   /// \brief convert big-endian bytes to unsigned int
134   /// \param bytes_array bytes array
135   /// \param pos shift address in bytes array
136   /// \param i_type integer type
137   /// \return unsigned int
138   static uint64_t BytesBigToUInt64(const std::vector<uint8_t> &bytes_array, const uint64_t &pos,
139                                    const IntegerType &i_type);
140 
141   /// \brief convert unsigned int to big-endian bytes
142   /// \param value integer value
143   /// \param i_type integer type
144   /// \return bytes
145   static std::vector<uint8_t> UIntToBytesBig(uint64_t value, const IntegerType &i_type);
146 
147   /// \brief convert unsigned int to little-endian bytes
148   /// \param value integer value
149   /// \param i_type integer type
150   /// \return bytes
151   static std::vector<uint8_t> UIntToBytesLittle(uint64_t value, const IntegerType &i_type);
152 
153   /// \brief convert unsigned int to little-endian bytes
154   /// \param bytes_array bytes array
155   /// \param pos shift address in bytes array
156   /// \param src_i_type source integer typ0e
157   /// \param dst_i_type (output), destination integer type
158   /// \return integer
159   static int64_t BytesLittleToMinIntType(const std::vector<uint8_t> &bytes_array, const uint64_t &pos,
160                                          const IntegerType &src_i_type, IntegerType *dst_i_type = nullptr);
161 
162  private:
163   std::vector<std::string> column_name_;                      // column name list
164   std::vector<ColumnDataType> column_data_type_;              // column data type list
165   std::vector<std::vector<int64_t>> column_shape_;            // column shape list
166   std::unordered_map<string, uint64_t> column_name_id_;       // column name id map
167   std::vector<std::string> blob_column_;                      // blob column list
168   std::unordered_map<std::string, uint64_t> blob_column_id_;  // blob column name id map
169   bool has_compress_blob_;                                    // if has compress blob
170   uint64_t num_blob_column_;                                  // number of blob columns
171 };
172 }  // namespace mindrecord
173 }  // namespace mindspore
174 
175 #endif  // MINDSPORE_CCSRC_MINDDATA_MINDRECORD_INCLUDE_SHARD_COLUMN_H_
176