• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright 2019 Huawei Technologies Co., Ltd
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifndef MINDSPORE_CCSRC_MINDDATA_MINDRECORD_INCLUDE_COMMON_SHARD_UTILS_H_
18 #define MINDSPORE_CCSRC_MINDDATA_MINDRECORD_INCLUDE_COMMON_SHARD_UTILS_H_
19 
20 #ifndef _CRT_RAND_S
21 #define _CRT_RAND_S
22 #endif
23 #include <stdlib.h>
24 #ifndef _MSC_VER
25 #include <libgen.h>
26 #endif
27 #include <limits.h>
28 #include <sys/stat.h>
29 #if !defined(_WIN32) && !defined(_WIN64) && !defined(__APPLE__)
30 #include <sys/statfs.h>
31 #include <sys/wait.h>
32 #endif
33 #include <cassert>
34 #include <cmath>
35 #include <cstdio>
36 #include <ctime>
37 #include <future>
38 #include <iostream>
39 #include <limits>
40 #include <map>
41 #include <memory>
42 #include <random>
43 #include <set>
44 #include <sstream>
45 #include <string>
46 #include <thread>
47 #include <unordered_map>
48 #include <utility>
49 #include <vector>
50 
51 #include "minddata/mindrecord/include/common/log_adapter.h"
52 #include "minddata/mindrecord/include/shard_error.h"
53 #include "nlohmann/json.hpp"
54 #include "sqlite3.h"
55 
56 /* To be used once dlog is available: #include "./slog.h" */
// MS_ASSERT(f) forwards to assert(f) when DEBUG is defined; otherwise it expands to ((void)0),
// so the expression f is not evaluated in non-DEBUG builds.
57 #ifdef DEBUG
58 #define MS_ASSERT(f) assert(f)
59 #else
60 #define MS_ASSERT(f) ((void)0)
61 #endif
62 
63 namespace mindspore {
64 namespace mindrecord {
65 using json = nlohmann::json;  // short alias for the nlohmann JSON type (see kDummyId and GetDatasetFiles below)
66 
// Named small integer literals; this header does not show where they are used.
67 const int kInt0 = 0;
68 const int kInt1 = 1;
69 const int kInt2 = 2;
70 const int kInt3 = 3;
71 const int kUnsignedInt4 = 4;  // NOTE(review): declared as (signed) int despite "Unsigned" in the name
72 
73 enum LabelCategory { kSchemaLabel, kStatisticsLabel, kIndexLabel };  // unscoped enum; implicit values 0, 1, 2
74 
75 const char kVersion[] = "3.0";  // version string; also listed in kSupportedVersion
76 const std::vector<std::string> kSupportedVersion = {"2.0", kVersion};  // supported versions: "2.0" and kVersion
77 
// Kind of shard; unscoped enum with explicit values.
78 enum ShardType {
79   kNLP = 0,  // presumably natural-language data (TODO confirm)
80   kCV = 1,  // presumably computer-vision data (TODO confirm)
81 };
82 
// Kind of task; appears as the first element of the meta data tuples described in the LoadMode comments.
83 enum TaskType {
84   kCommonTask = 0,
85   kPaddedTask = 1,  // presumably a task for a padded sample (TODO confirm)
86 };
87 enum SamplerType { kCustomTopNSampler, kCustomTopPercentSampler, kSubsetRandomSampler, kPKSampler, kSubsetSampler };  // sampler kinds; unscoped enum, implicit values 0..4
88 
89 enum ShuffleType { kShuffleCategory, kShuffleSample };  // what is shuffled: categories or samples, judging by the names (TODO confirm); implicit values 0, 1
90 
91 const double kEpsilon = 1e-7;  // small tolerance; presumably for floating-point comparisons (TODO confirm)
92 
93 const int kThreadNumber = 14;  // NOTE(review): purpose not visible in this header; distinct from kMaxThreadCount (32)
94 
95 // Shard default parameters: header size and page size used when none is specified
96 const uint64_t kDefaultHeaderSize = 1 << 24;  // 16MB (2^24)
97 const uint64_t kDefaultPageSize = 1 << 25;    // 32MB (2^25)
98 
99 // Allowed HeaderSize range: [16KB, 128MB]. NOTE(review): these bounds are int while kDefaultHeaderSize is uint64_t
100 const int kMinHeaderSize = 1 << 14;  // 16KB (2^14)
101 const int kMaxHeaderSize = 1 << 27;  // 128MB (2^27)
102 
103 // Allowed PageSize range: [32KB, 256MB]. NOTE(review): these bounds are int while kDefaultPageSize is uint64_t
104 const int kMinPageSize = 1 << 15;  // 32KB (2^15)
105 const int kMaxPageSize = 1 << 28;  // 256MB (2^28)
106 
107 // 8, i.e. the size in bytes of a 64-bit integer; used by value length / schema id length / statistic id length ...
108 const uint64_t kInt64Len = 8;
109 
110 // Minimum file size; equal to kInt64Len (8)
111 const uint64_t kMinFileSize = kInt64Len;
112 
// Allowed shard count range: [1, 1000]
113 const int kMinShardCount = 1;
114 const int kMaxShardCount = 1000;  // write (presumably the limit applies when writing; TODO confirm)
115 
// Allowed consumer count range: [1, 128]
116 const int kMinConsumerCount = 1;
117 const int kMaxConsumerCount = 128;
118 
// Upper limits for the number of schemas, threads and fields
119 const int kMaxSchemaCount = 1;
120 const int kMaxThreadCount = 32;
121 const int kMaxFieldCount = 100;
122 
123 // Minimum free disk size
124 const int kMinFreeDiskSize = 10;  // 10M; presumably megabytes, the unit documented for GetDiskSize (TODO confirm)
125 
126 // dummy json: the object {"id": 0}, built from a raw string through the _json literal
127 const json kDummyId = R"({"id": 0})"_json;
128 
129 // translate type in schema to type in sqlite3(NULL, INTEGER, REAL, TEXT, BLOB)
// NOTE(review): besides the five names listed above, the mapped values also include
// DATE, DATETIME, BOOLEAN and NUMERIC.
130 const std::unordered_map<std::string, std::string> kDbJsonMap = {
131   {"string", "TEXT"},     {"date", "DATE"},    {"date-time", "DATETIME"}, {"null", "NULL"},     {"integer", "INTEGER"},
132   {"boolean", "BOOLEAN"}, {"array", "BLOB"},   {"number", "NUMERIC"},     {"int32", "INTEGER"}, {"int64", "INTEGER"},
133   {"float32", "REAL"},    {"float64", "REAL"}, {"bytes", "BLOB"}};
134 
135 const char kPoint = '.';  // the dot character
136 
// Path separator chosen at compile time: a backslash when _WIN32 or _WIN64 is defined,
// a forward slash otherwise.
137 const char kPathSeparator =
138 #if defined(_WIN32) || defined(_WIN64)
139   '\\';
140 #else
141   '/';
142 #endif
143 
144 // field types accepted by schema validation
145 const std::set<std::string> kFieldTypeSet = {"bytes", "string", "int32", "int64", "float32", "float64"};
146 
147 // field types that can be searched: every kFieldTypeSet entry except "bytes"
148 const std::set<std::string> kScalarFieldTypeSet = {"string", "int32", "int64", "float32", "float64"};
149 
150 // numeric field types: the scalar field types without "string"
151 const std::set<std::string> kNumberFieldTypeSet = {"int32", "int64", "float32", "float64"};
152 
// Maps a data type name to a field type name; every mapped value is a member of kFieldTypeSet
// (e.g. "uint32" -> "int64", "float16" -> "float32"). Presumably the key is the type of the
// incoming data and the value is the type used to store it (TODO confirm against callers).
153 const std::unordered_map<std::string, std::string> kTypesMap = {
154   {"bool", "int32"},      {"int8", "int32"},      {"uint8", "int32"},   {"int16", "int32"},  {"uint16", "int32"},
155   {"int32", "int32"},     {"uint32", "int64"},    {"int64", "int64"},   {"uint64", "int64"}, {"float16", "float32"},
156   {"float32", "float32"}, {"float64", "float64"}, {"string", "string"}, {"bytes", "bytes"}};
157 
158 /// \brief sample-count threshold (5,000,000) for lazy load; the LoadMode comments tie kLazy to sample counts above it
159 const uint32_t LAZY_LOAD_THRESHOLD = 5000000;
160 
161 /// \brief sample-count threshold (100,000,000) for slow load; the LoadMode comments tie kSlow to sample counts above it
162 const uint32_t SLOW_LOAD_THRESHOLD = 100000000;
163 
// How sample meta data is cached; the per-value comments tie each mode to a sample count.
164 enum LoadMode {
165   kFast = 0,  // cache meta data as std::tuple<TaskType, std::tuple<int, int>, std::vector<uint64_t>, json>
166   kLazy = 1,  // >5,000,000 samples: cache meta data as std::tuple<TaskType, std::tuple<int, int>, {}, {}>
167   kSlow = 2   // >100,000,000 samples: do not cache meta data, which is too large
168 };
169 
170 /// \brief parallelism (4) for the conversion from vector<py::bytes> to vector<vector<uint8_t>>; presumably a worker count (TODO confirm)
171 const uint32_t kParallelConvert = 4;
172 
173 /// \brief split a string at every occurrence of a separator character
174 /// \param[in] field target string to split
175 /// \param[in] separator the character to split on
176 /// \return the resulting parts as a vector of strings
177 std::vector<std::string> StringSplit(const std::string &field, char separator);
178 
179 /// \brief validate that a field name is composed only of '0-9' or 'a-z' or 'A-Z' or '_' or '-'
180 /// \param[in]  str target string
181 /// \return bool result of the validation (presumably true when the name is valid; TODO confirm)
182 bool ValidateFieldName(const std::string &str);
183 
184 /// \brief get the file name from a path
185 /// \param path file path
186 /// \param fn_ptr pointer to a shared ptr of the file name (presumably filled in as the output; TODO confirm)
187 /// \return Status
188 Status GetFileName(const std::string &path, std::shared_ptr<std::string> *fn_ptr);
189 
190 /// \brief get the parent directory of a path
191 /// \param path file path
192 /// \param pd_ptr pointer to a shared ptr of the parent path (presumably filled in as the output; TODO confirm)
193 /// \return Status
194 Status GetParentDir(const std::string &path, std::shared_ptr<std::string> *pd_ptr);
195 
/// \brief check a string for UTF-8 validity (judging by the name; the rules applied are not visible in this header)
/// \param str string to check
/// \return bool result of the check (presumably true when str is valid UTF-8; TODO confirm)
196 bool CheckIsValidUtf8(const std::string &str);
197 
198 /// \brief judge whether a path is a legal file
199 /// \param path file path
200 /// \return Status describing whether the path is legal or not
201 Status CheckFile(const std::string &path);
202 
203 enum DiskSizeType { kTotalSize = 0, kFreeSize };  // disk_type selector for GetDiskSize; kFreeSize is implicitly 1
204 
205 /// \brief get the total or free space of the disk, as selected by disk_type
206 /// \param str_dir file path
207 /// \param disk_type kTotalSize / kFreeSize
208 /// \param size pointer to a shared ptr of the size in Megabytes
209 /// \return Status
210 Status GetDiskSize(const std::string &str_dir, const DiskSizeType &disk_type, std::shared_ptr<uint64_t> *size);
211 
212 /// \brief get the max hardware concurrency
213 /// \return max concurrency, as a uint32_t
214 uint32_t GetMaxThreadNum();
215 
216 /// \brief get the absolute paths of all mindrecord files
217 /// \param path path to one of the mindrecord files
218 /// \param addresses json holding the relative paths of all mindrecord files
219 /// \param ds pointer to a shared ptr of the vector of absolute paths
220 /// \return Status
221 Status GetDatasetFiles(const std::string &path, const json &addresses, std::shared_ptr<std::vector<std::string>> *ds);
222 
223 /// \brief get a random number engine; NOTE(review): despite the name, the return type is std::mt19937, not std::random_device
224 /// \return a std::mt19937 engine (how it is seeded is not visible in this header)
225 std::mt19937 GetRandomDevice();
226 }  // namespace mindrecord
227 }  // namespace mindspore
228 
229 #endif  // MINDSPORE_CCSRC_MINDDATA_MINDRECORD_INCLUDE_COMMON_SHARD_UTILS_H_
230