1 /** 2 * Copyright 2020 Huawei Technologies Co., Ltd 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #ifndef MINDSPORE_CCSRC_MINDDATA_MINDRECORD_INCLUDE_SHARD_COLUMN_H_ 18 #define MINDSPORE_CCSRC_MINDDATA_MINDRECORD_INCLUDE_SHARD_COLUMN_H_ 19 20 #include <memory> 21 #include <string> 22 #include <unordered_map> 23 #include <utility> 24 #include <vector> 25 #include "minddata/mindrecord/include/shard_header.h" 26 27 namespace mindspore { 28 namespace mindrecord { 29 const uint64_t kUnsignedOne = 1; 30 const uint64_t kBitsOfByte = 8; 31 const uint64_t kDataTypeBits = 2; 32 const uint64_t kNumDataOfByte = 4; 33 const uint64_t kBytesOfColumnLen = 4; 34 const uint64_t kDataTypeBitMask = 3; 35 const uint64_t kDataTypes = 6; 36 37 enum IntegerType { kInt8Type = 0, kInt16Type, kInt32Type, kInt64Type }; 38 39 enum ColumnCategory { ColumnInRaw, ColumnInBlob, ColumnNotFound }; 40 41 enum ColumnDataType { 42 ColumnBytes = 0, 43 ColumnString = 1, 44 ColumnInt32 = 2, 45 ColumnInt64 = 3, 46 ColumnFloat32 = 4, 47 ColumnFloat64 = 5, 48 ColumnNoDataType = 6 49 }; 50 51 const uint32_t ColumnDataTypeSize[kDataTypes] = {1, 1, 4, 8, 4, 8}; 52 53 const std::vector<std::string> ColumnDataTypeNameNormalized = {"uint8", "string", "int32", 54 "int64", "float32", "float64"}; 55 56 const std::unordered_map<std::string, ColumnDataType> ColumnDataTypeMap = { 57 {"bytes", ColumnBytes}, {"string", ColumnString}, {"int32", ColumnInt32}, 58 {"int64", ColumnInt64}, {"float32", ColumnFloat32}, {"float64", ColumnFloat64}}; 59 60 class __attribute__((visibility("default"))) ShardColumn { 61 public: 62 explicit ShardColumn(const std::shared_ptr<ShardHeader> &shard_header, bool compress_integer = true); 63 explicit ShardColumn(const json &schema_json, bool compress_integer = true); 64 65 ~ShardColumn() = default; 66 67 /// \brief get column value by column name 68 Status GetColumnValueByName(const std::string &column_name, const std::vector<uint8_t> &columns_blob, 69 const json &columns_json, const unsigned char **data, 70 std::unique_ptr<unsigned char[]> *data_ptr, uint64_t *const n_bytes, 71 ColumnDataType *column_data_type, uint64_t *column_data_type_size, 72 std::vector<int64_t> *column_shape); 73 74 /// \brief compress blob 75 std::vector<uint8_t> CompressBlob(const std::vector<uint8_t> &blob, int64_t *compression_size); 76 77 /// \brief check if blob compressed CheckCompressBlob()78 bool CheckCompressBlob() const { return has_compress_blob_; } 79 80 /// \brief getter GetNumBlobColumn()81 uint64_t GetNumBlobColumn() const { return num_blob_column_; } 82 83 /// \brief getter GetColumnName()84 std::vector<std::string> GetColumnName() { return column_name_; } 85 86 /// \brief getter GeColumnDataType()87 std::vector<ColumnDataType> GeColumnDataType() { return column_data_type_; } 88 89 /// \brief getter GetColumnShape()90 std::vector<std::vector<int64_t>> GetColumnShape() { return column_shape_; } 91 92 /// \brief get column value from blob 93 Status GetColumnFromBlob(const std::string &column_name, const std::vector<uint8_t> &columns_blob, 94 const unsigned char **data, std::unique_ptr<unsigned char[]> *data_ptr, 95 uint64_t *const n_bytes); 96 97 /// \brief get column type 98 Status GetColumnTypeByName(const std::string &column_name, ColumnDataType *column_data_type, 99 uint64_t *column_data_type_size, std::vector<int64_t> *column_shape, 100 ColumnCategory *column_category); 101 102 /// \brief get column value from json 103 Status GetColumnFromJson(const std::string &column_name, const json &columns_json, 104 std::unique_ptr<unsigned char[]> *data_ptr, uint64_t *n_bytes); 105 106 private: 107 /// \brief initialization 108 void Init(const json &schema_json, bool compress_integer = true); 109 110 /// \brief get float value from json 111 template <typename T> 112 Status GetFloat(std::unique_ptr<unsigned char[]> *data_ptr, const json &json_column_value, bool use_double); 113 114 /// \brief get integer value from json 115 template <typename T> 116 Status GetInt(std::unique_ptr<unsigned char[]> *data_ptr, const json &json_column_value); 117 118 /// \brief get column offset address and size from blob 119 Status GetColumnAddressInBlock(const uint64_t &column_id, const std::vector<uint8_t> &columns_blob, 120 uint64_t *num_bytes, uint64_t *shift_idx); 121 122 /// \brief check if column name is available 123 ColumnCategory CheckColumnName(const std::string &column_name); 124 125 /// \brief compress integer column 126 static vector<uint8_t> CompressInt(const vector<uint8_t> &src_bytes, const IntegerType &int_type); 127 128 /// \brief uncompress integer array column 129 template <typename T> 130 static Status UncompressInt(const uint64_t &column_id, std::unique_ptr<unsigned char[]> *const data_ptr, 131 const std::vector<uint8_t> &columns_blob, uint64_t *num_bytes, uint64_t shift_idx); 132 133 /// \brief convert big-endian bytes to unsigned int 134 /// \param bytes_array bytes array 135 /// \param pos shift address in bytes array 136 /// \param i_type integer type 137 /// \return unsigned int 138 static uint64_t BytesBigToUInt64(const std::vector<uint8_t> &bytes_array, const uint64_t &pos, 139 const IntegerType &i_type); 140 141 /// \brief convert unsigned int to big-endian bytes 142 /// \param value integer value 143 /// \param i_type integer type 144 /// \return bytes 145 static std::vector<uint8_t> UIntToBytesBig(uint64_t value, const IntegerType &i_type); 146 147 /// \brief convert unsigned int to little-endian bytes 148 /// \param value integer value 149 /// \param i_type integer type 150 /// \return bytes 151 static std::vector<uint8_t> UIntToBytesLittle(uint64_t value, const IntegerType &i_type); 152 153 /// \brief convert unsigned int to little-endian bytes 154 /// \param bytes_array bytes array 155 /// \param pos shift address in bytes array 156 /// \param src_i_type source integer typ0e 157 /// \param dst_i_type (output), destination integer type 158 /// \return integer 159 static int64_t BytesLittleToMinIntType(const std::vector<uint8_t> &bytes_array, const uint64_t &pos, 160 const IntegerType &src_i_type, IntegerType *dst_i_type = nullptr); 161 162 private: 163 std::vector<std::string> column_name_; // column name list 164 std::vector<ColumnDataType> column_data_type_; // column data type list 165 std::vector<std::vector<int64_t>> column_shape_; // column shape list 166 std::unordered_map<string, uint64_t> column_name_id_; // column name id map 167 std::vector<std::string> blob_column_; // blob column list 168 std::unordered_map<std::string, uint64_t> blob_column_id_; // blob column name id map 169 bool has_compress_blob_; // if has compress blob 170 uint64_t num_blob_column_; // number of blob columns 171 }; 172 } // namespace mindrecord 173 } // namespace mindspore 174 175 #endif // MINDSPORE_CCSRC_MINDDATA_MINDRECORD_INCLUDE_SHARD_COLUMN_H_ 176