1 /** 2 * Copyright 2019 Huawei Technologies Co., Ltd 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #ifndef MINDSPORE_CCSRC_MINDDATA_MINDRECORD_INCLUDE_COMMON_SHARD_UTILS_H_ 18 #define MINDSPORE_CCSRC_MINDDATA_MINDRECORD_INCLUDE_COMMON_SHARD_UTILS_H_ 19 20 #include <libgen.h> 21 #include <limits.h> 22 #include <stdlib.h> 23 #include <sys/stat.h> 24 #if !defined(_WIN32) && !defined(_WIN64) && !defined(__APPLE__) 25 #include <sys/statfs.h> 26 #include <sys/wait.h> 27 #endif 28 #include <unistd.h> 29 #include <cassert> 30 #include <cmath> 31 #include <cstdio> 32 #include <ctime> 33 #include <future> 34 #include <iostream> 35 #include <map> 36 #include <memory> 37 #include <random> 38 #include <set> 39 #include <sstream> 40 #include <string> 41 #include <thread> 42 #include <unordered_map> 43 #include <utility> 44 #include <vector> 45 #include "minddata/mindrecord/include/shard_error.h" 46 #include "nlohmann/json.hpp" 47 #include "./sqlite3.h" 48 #include "utils/log_adapter.h" 49 50 /* To be used when dlog is ok #include "./slog.h" */ 51 #ifdef DEBUG 52 #define MS_ASSERT(f) assert(f) 53 #else 54 #define MS_ASSERT(f) ((void)0) 55 #endif 56 57 namespace mindspore { 58 namespace mindrecord { 59 using json = nlohmann::json; 60 61 const int kInt0 = 0; 62 const int kInt1 = 1; 63 const int kInt2 = 2; 64 const int kInt3 = 3; 65 const int kUnsignedInt4 = 4; 66 67 enum LabelCategory { kSchemaLabel, kStatisticsLabel, kIndexLabel }; 68 69 const char kVersion[] = "3.0"; 70 const std::vector<std::string> kSupportedVersion = {"2.0", kVersion}; 71 72 enum ShardType { 73 kNLP = 0, 74 kCV = 1, 75 }; 76 77 enum TaskType { 78 kCommonTask = 0, 79 kPaddedTask = 1, 80 }; 81 enum SamplerType { kCustomTopNSampler, kCustomTopPercentSampler, kSubsetRandomSampler, kPKSampler, kSubsetSampler }; 82 83 enum ShuffleType { kShuffleCategory, kShuffleSample }; 84 85 const double kEpsilon = 1e-7; 86 87 const int kThreadNumber = 14; 88 89 // Shard default parameters 90 const uint64_t kDefaultHeaderSize = 1 << 24; // 16MB 91 const uint64_t kDefaultPageSize = 1 << 25; // 32MB 92 93 // HeaderSize [16KB, 128MB] 94 const int kMinHeaderSize = 1 << 14; // 16KB 95 const int kMaxHeaderSize = 1 << 27; // 128MB 96 97 // PageSize [32KB, 256MB] 98 const int kMinPageSize = 1 << 15; // 32KB 99 const int kMaxPageSize = 1 << 28; // 256MB 100 101 // used by value length / schema id length / statistic id length ... 102 const uint64_t kInt64Len = 8; 103 104 // Minimum file size 105 const uint64_t kMinFileSize = kInt64Len; 106 107 const int kMinShardCount = 1; 108 const int kMaxShardCount = 1000; // write 109 const int kMaxFileCount = 4096; // read 110 111 const int kMinConsumerCount = 1; 112 const int kMaxConsumerCount = 128; 113 114 const int kMaxSchemaCount = 1; 115 const int kMaxThreadCount = 32; 116 const int kMaxFieldCount = 100; 117 118 // Minimum free disk size 119 const int kMinFreeDiskSize = 10; // 10M 120 121 // dummy json 122 const json kDummyId = R"({"id": 0})"_json; 123 124 // translate type in schema to type in sqlite3(NULL, INTEGER, REAL, TEXT, BLOB) 125 const std::unordered_map<std::string, std::string> kDbJsonMap = { 126 {"string", "TEXT"}, {"date", "DATE"}, {"date-time", "DATETIME"}, {"null", "NULL"}, 127 {"integer", "INTEGER"}, {"boolean", "BOOLEAN"}, {"array", "BLOB"}, {"number", "NUMERIC"}, 128 {"int32", "INTEGER"}, {"int64", "INTEGER"}, {"float32", "NUMERIC"}, {"float64", "NUMERIC"}, 129 {"bytes", "BLOB"}}; 130 131 const char kPoint = '.'; 132 133 const char kPathSeparator = 134 #if defined(_WIN32) || defined(_WIN64) 135 '\\'; 136 #else 137 '/'; 138 #endif 139 140 // field type used by check schema validation 141 const std::set<std::string> kFieldTypeSet = {"bytes", "string", "int32", "int64", "float32", "float64"}; 142 143 // can be searched field list 144 const std::set<std::string> kScalarFieldTypeSet = {"string", "int32", "int64", "float32", "float64"}; 145 146 // number field list 147 const std::set<std::string> kNumberFieldTypeSet = {"int32", "int64", "float32", "float64"}; 148 149 const std::unordered_map<std::string, std::string> kTypesMap = { 150 {"bool", "int32"}, {"int8", "int32"}, {"uint8", "bytes"}, {"int16", "int32"}, 151 {"uint16", "int32"}, {"int32", "int32"}, {"uint32", "int64"}, {"int64", "int64"}, 152 {"float16", "float32"}, {"float32", "float32"}, {"float64", "float64"}, {"string", "string"}}; 153 154 /// \brief the max number of samples to enable lazy load 155 const uint32_t LAZY_LOAD_THRESHOLD = 5000000; 156 157 /// \brief split a string using a character 158 /// \param[in] field target string 159 /// \param[in] separator a character for splitting 160 /// \return vector type result 161 std::vector<std::string> StringSplit(const std::string &field, char separator); 162 163 /// \brief validate field name is composed of '0-9' or 'a-z' or 'A-Z' or '_' or '-' 164 /// \param[in] str target string 165 /// \return 166 bool ValidateFieldName(const std::string &str); 167 168 /// \brief get the filename by the path 169 /// \param s file path 170 /// \param fn_ptr shared ptr of file name 171 /// \return Status 172 Status GetFileName(const std::string &path, std::shared_ptr<std::string> *fn_ptr); 173 174 /// \brief get parent dir 175 /// \param path file path 176 /// \param pd_ptr shared ptr of parent path 177 /// \return Status 178 Status GetParentDir(const std::string &path, std::shared_ptr<std::string> *pd_ptr); 179 180 bool CheckIsValidUtf8(const std::string &str); 181 182 /// \brief judge if a path is legal file 183 /// \param path file path 184 /// \return Whether the path is legal or not 185 bool IsLegalFile(const std::string &path); 186 187 enum DiskSizeType { kTotalSize = 0, kFreeSize }; 188 189 /// \brief get the free space about the disk 190 /// \param str_dir file path 191 /// \param disk_type: kTotalSize / kFreeSize 192 /// \param size: shared ptr of size in Megabytes 193 /// \return Status 194 Status GetDiskSize(const std::string &str_dir, const DiskSizeType &disk_type, std::shared_ptr<uint64_t> *size); 195 196 /// \brief get the max hardware concurrency 197 /// \return max concurrency 198 uint32_t GetMaxThreadNum(); 199 200 /// \brief get absolute path of all mindrecord files 201 /// \param path path to one fo mindrecord files 202 /// \param addresses relative path of all mindrecord files 203 /// \param ds shared ptr of vector of absolute path 204 /// \return Status 205 Status GetDatasetFiles(const std::string &path, const json &addresses, std::shared_ptr<std::vector<std::string>> *ds); 206 } // namespace mindrecord 207 } // namespace mindspore 208 209 #endif // MINDSPORE_CCSRC_MINDDATA_MINDRECORD_INCLUDE_COMMON_SHARD_UTILS_H_ 210