1 /** 2 * Copyright 2019 Huawei Technologies Co., Ltd 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #ifndef MINDSPORE_CCSRC_MINDDATA_MINDRECORD_INCLUDE_COMMON_SHARD_UTILS_H_ 18 #define MINDSPORE_CCSRC_MINDDATA_MINDRECORD_INCLUDE_COMMON_SHARD_UTILS_H_ 19 20 #ifndef _CRT_RAND_S 21 #define _CRT_RAND_S 22 #endif 23 #include <stdlib.h> 24 #ifndef _MSC_VER 25 #include <libgen.h> 26 #endif 27 #include <limits.h> 28 #include <sys/stat.h> 29 #if !defined(_WIN32) && !defined(_WIN64) && !defined(__APPLE__) 30 #include <sys/statfs.h> 31 #include <sys/wait.h> 32 #endif 33 #include <cassert> 34 #include <cmath> 35 #include <cstdio> 36 #include <ctime> 37 #include <future> 38 #include <iostream> 39 #include <limits> 40 #include <map> 41 #include <memory> 42 #include <random> 43 #include <set> 44 #include <sstream> 45 #include <string> 46 #include <thread> 47 #include <unordered_map> 48 #include <utility> 49 #include <vector> 50 51 #include "minddata/mindrecord/include/common/log_adapter.h" 52 #include "minddata/mindrecord/include/shard_error.h" 53 #include "nlohmann/json.hpp" 54 #include "sqlite3.h" 55 56 /* To be used when dlog is ok #include "./slog.h" */ 57 #ifdef DEBUG 58 #define MS_ASSERT(f) assert(f) 59 #else 60 #define MS_ASSERT(f) ((void)0) 61 #endif 62 63 namespace mindspore { 64 namespace mindrecord { 65 using json = nlohmann::json; 66 67 const int kInt0 = 0; 68 const int kInt1 = 1; 69 const int kInt2 = 2; 70 const int kInt3 = 3; 71 const int kUnsignedInt4 = 4; 72 73 enum LabelCategory { kSchemaLabel, kStatisticsLabel, kIndexLabel }; 74 75 const char kVersion[] = "3.0"; 76 const std::vector<std::string> kSupportedVersion = {"2.0", kVersion}; 77 78 enum ShardType { 79 kNLP = 0, 80 kCV = 1, 81 }; 82 83 enum TaskType { 84 kCommonTask = 0, 85 kPaddedTask = 1, 86 }; 87 enum SamplerType { kCustomTopNSampler, kCustomTopPercentSampler, kSubsetRandomSampler, kPKSampler, kSubsetSampler }; 88 89 enum ShuffleType { kShuffleCategory, kShuffleSample }; 90 91 const double kEpsilon = 1e-7; 92 93 const int kThreadNumber = 14; 94 95 // Shard default parameters 96 const uint64_t kDefaultHeaderSize = 1 << 24; // 16MB 97 const uint64_t kDefaultPageSize = 1 << 25; // 32MB 98 99 // HeaderSize [16KB, 128MB] 100 const int kMinHeaderSize = 1 << 14; // 16KB 101 const int kMaxHeaderSize = 1 << 27; // 128MB 102 103 // PageSize [32KB, 256MB] 104 const int kMinPageSize = 1 << 15; // 32KB 105 const int kMaxPageSize = 1 << 28; // 256MB 106 107 // used by value length / schema id length / statistic id length ... 108 const uint64_t kInt64Len = 8; 109 110 // Minimum file size 111 const uint64_t kMinFileSize = kInt64Len; 112 113 const int kMinShardCount = 1; 114 const int kMaxShardCount = 1000; // write 115 116 const int kMinConsumerCount = 1; 117 const int kMaxConsumerCount = 128; 118 119 const int kMaxSchemaCount = 1; 120 const int kMaxThreadCount = 32; 121 const int kMaxFieldCount = 100; 122 123 // Minimum free disk size 124 const int kMinFreeDiskSize = 10; // 10M 125 126 // dummy json 127 const json kDummyId = R"({"id": 0})"_json; 128 129 // translate type in schema to type in sqlite3(NULL, INTEGER, REAL, TEXT, BLOB) 130 const std::unordered_map<std::string, std::string> kDbJsonMap = { 131 {"string", "TEXT"}, {"date", "DATE"}, {"date-time", "DATETIME"}, {"null", "NULL"}, {"integer", "INTEGER"}, 132 {"boolean", "BOOLEAN"}, {"array", "BLOB"}, {"number", "NUMERIC"}, {"int32", "INTEGER"}, {"int64", "INTEGER"}, 133 {"float32", "REAL"}, {"float64", "REAL"}, {"bytes", "BLOB"}}; 134 135 const char kPoint = '.'; 136 137 const char kPathSeparator = 138 #if defined(_WIN32) || defined(_WIN64) 139 '\\'; 140 #else 141 '/'; 142 #endif 143 144 // field type used by check schema validation 145 const std::set<std::string> kFieldTypeSet = {"bytes", "string", "int32", "int64", "float32", "float64"}; 146 147 // can be searched field list 148 const std::set<std::string> kScalarFieldTypeSet = {"string", "int32", "int64", "float32", "float64"}; 149 150 // number field list 151 const std::set<std::string> kNumberFieldTypeSet = {"int32", "int64", "float32", "float64"}; 152 153 const std::unordered_map<std::string, std::string> kTypesMap = { 154 {"bool", "int32"}, {"int8", "int32"}, {"uint8", "int32"}, {"int16", "int32"}, {"uint16", "int32"}, 155 {"int32", "int32"}, {"uint32", "int64"}, {"int64", "int64"}, {"uint64", "int64"}, {"float16", "float32"}, 156 {"float32", "float32"}, {"float64", "float64"}, {"string", "string"}, {"bytes", "bytes"}}; 157 158 /// \brief the max number of samples to enable lazy load 159 const uint32_t LAZY_LOAD_THRESHOLD = 5000000; 160 161 /// \brief the max number of samples 162 const uint32_t SLOW_LOAD_THRESHOLD = 100000000; 163 164 enum LoadMode { 165 kFast = 0, // use std::tuple<TaskType, std::tuple<int, int>, std::vector<uint64_t>, json>; to cache meta data 166 kLazy = 1, // >5,000,000 samples, use std::tuple<TaskType, std::tuple<int, int>, {}, {}> to cache meta data 167 kSlow = 2 // >100,000,000 samples, don't cache meta data which is too large 168 }; 169 170 /// \brief parallel convert from vector<py::bytes> to vector<vector<uint8_t>> 171 const uint32_t kParallelConvert = 4; 172 173 /// \brief split a string using a character 174 /// \param[in] field target string 175 /// \param[in] separator a character for splitting 176 /// \return vector type result 177 std::vector<std::string> StringSplit(const std::string &field, char separator); 178 179 /// \brief validate field name is composed of '0-9' or 'a-z' or 'A-Z' or '_' or '-' 180 /// \param[in] str target string 181 /// \return 182 bool ValidateFieldName(const std::string &str); 183 184 /// \brief get the filename by the path 185 /// \param s file path 186 /// \param fn_ptr shared ptr of file name 187 /// \return Status 188 Status GetFileName(const std::string &path, std::shared_ptr<std::string> *fn_ptr); 189 190 /// \brief get parent dir 191 /// \param path file path 192 /// \param pd_ptr shared ptr of parent path 193 /// \return Status 194 Status GetParentDir(const std::string &path, std::shared_ptr<std::string> *pd_ptr); 195 196 bool CheckIsValidUtf8(const std::string &str); 197 198 /// \brief judge if a path is legal file 199 /// \param path file path 200 /// \return Whether the path is legal or not 201 Status CheckFile(const std::string &path); 202 203 enum DiskSizeType { kTotalSize = 0, kFreeSize }; 204 205 /// \brief get the free space about the disk 206 /// \param str_dir file path 207 /// \param disk_type: kTotalSize / kFreeSize 208 /// \param size: shared ptr of size in Megabytes 209 /// \return Status 210 Status GetDiskSize(const std::string &str_dir, const DiskSizeType &disk_type, std::shared_ptr<uint64_t> *size); 211 212 /// \brief get the max hardware concurrency 213 /// \return max concurrency 214 uint32_t GetMaxThreadNum(); 215 216 /// \brief get absolute path of all mindrecord files 217 /// \param path path to one fo mindrecord files 218 /// \param addresses relative path of all mindrecord files 219 /// \param ds shared ptr of vector of absolute path 220 /// \return Status 221 Status GetDatasetFiles(const std::string &path, const json &addresses, std::shared_ptr<std::vector<std::string>> *ds); 222 223 /// \brief get random 224 /// \return std::mt19937 225 std::mt19937 GetRandomDevice(); 226 } // namespace mindrecord 227 } // namespace mindspore 228 229 #endif // MINDSPORE_CCSRC_MINDDATA_MINDRECORD_INCLUDE_COMMON_SHARD_UTILS_H_ 230