• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright 2019 Huawei Technologies Co., Ltd
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifndef MINDSPORE_CCSRC_MINDDATA_MINDRECORD_INCLUDE_COMMON_SHARD_UTILS_H_
18 #define MINDSPORE_CCSRC_MINDDATA_MINDRECORD_INCLUDE_COMMON_SHARD_UTILS_H_
19 
20 #include <libgen.h>
21 #include <limits.h>
22 #include <stdlib.h>
23 #include <sys/stat.h>
24 #if !defined(_WIN32) && !defined(_WIN64) && !defined(__APPLE__)
25 #include <sys/statfs.h>
26 #include <sys/wait.h>
27 #endif
28 #include <unistd.h>
29 #include <cassert>
30 #include <cmath>
31 #include <cstdio>
32 #include <ctime>
33 #include <future>
34 #include <iostream>
35 #include <map>
36 #include <memory>
37 #include <random>
38 #include <set>
39 #include <sstream>
40 #include <string>
41 #include <thread>
42 #include <unordered_map>
43 #include <utility>
44 #include <vector>
45 #include "minddata/mindrecord/include/shard_error.h"
46 #include "nlohmann/json.hpp"
47 #include "./sqlite3.h"
48 #include "utils/log_adapter.h"
49 
50 /* To be used when dlog is ok #include "./slog.h" */
/* MS_ASSERT(f): forwards to assert(f) when DEBUG is defined; otherwise it expands to ((void)0),
 * so the expression `f` is not evaluated at all in non-DEBUG builds. */
51 #ifdef DEBUG
52 #define MS_ASSERT(f) assert(f)
53 #else
54 #define MS_ASSERT(f) ((void)0)
55 #endif
56 
57 namespace mindspore {
58 namespace mindrecord {
59 using json = nlohmann::json;
60 
// Named small integer constants (0 through 4).
61 const int kInt0 = 0;
62 const int kInt1 = 1;
63 const int kInt2 = 2;
64 const int kInt3 = 3;
65 const int kUnsignedInt4 = 4;  // NOTE: despite "Unsigned" in the name, this is declared as a signed `const int`
66 
// Label categories; as an unscoped enum the enumerators take the implicit values 0, 1, 2 in declaration order.
67 enum LabelCategory { kSchemaLabel, kStatisticsLabel, kIndexLabel };
68 
// kVersion is the version string defined by this header ("3.0"); kSupportedVersion holds "2.0" plus kVersion.
// NOTE(review): presumably the set of format versions a reader accepts — confirm against callers.
69 const char kVersion[] = "3.0";
70 const std::vector<std::string> kSupportedVersion = {"2.0", kVersion};
71 
// Shard type with explicitly assigned values: kNLP is 0, kCV is 1.
72 enum ShardType {
73   kNLP = 0,
74   kCV = 1,
75 };
76 
// Task type with explicitly assigned values: kCommonTask is 0, kPaddedTask is 1.
77 enum TaskType {
78   kCommonTask = 0,
79   kPaddedTask = 1,
80 };
// Sampler kinds; enumerators take implicit values starting at 0 in declaration order.
81 enum SamplerType { kCustomTopNSampler, kCustomTopPercentSampler, kSubsetRandomSampler, kPKSampler, kSubsetSampler };
82 
// Shuffle granularity: kShuffleCategory is 0, kShuffleSample is 1 (implicit values).
83 enum ShuffleType { kShuffleCategory, kShuffleSample };
84 
85 const double kEpsilon = 1e-7;  // small tolerance; presumably for floating-point comparisons (usage not shown here)
86 
87 const int kThreadNumber = 14;  // fixed thread count; NOTE(review): where it is applied is not visible in this header
88 
89 // Shard default parameters (each default lies inside the matching [min, max] range declared below)
90 const uint64_t kDefaultHeaderSize = 1 << 24;  // 16MB
91 const uint64_t kDefaultPageSize = 1 << 25;    // 32MB
92 
93 // HeaderSize [16KB, 128MB]
// NOTE: the bounds are `int` while the defaults above are `uint64_t`; mind signed/unsigned mixing when comparing.
94 const int kMinHeaderSize = 1 << 14;  // 16KB
95 const int kMaxHeaderSize = 1 << 27;  // 128MB
96 
97 // PageSize [32KB, 256MB]
98 const int kMinPageSize = 1 << 15;  // 32KB
99 const int kMaxPageSize = 1 << 28;  // 256MB
100 
101 // length (8, matching the byte size of a 64-bit integer) used by value length / schema id length / statistic id length ...
102 const uint64_t kInt64Len = 8;
103 
104 // Minimum file size: defined as kInt64Len, i.e. 8
105 const uint64_t kMinFileSize = kInt64Len;
106 
// Allowed range for the shard count and an upper bound on the file count.
107 const int kMinShardCount = 1;
108 const int kMaxShardCount = 1000;  // write: maximum number of shards
109 const int kMaxFileCount = 4096;   // read: maximum number of files
110 
// Allowed range for the consumer count.
111 const int kMinConsumerCount = 1;
112 const int kMaxConsumerCount = 128;
113 
// Upper bounds on schema, thread and field counts.
114 const int kMaxSchemaCount = 1;
115 const int kMaxThreadCount = 32;
116 const int kMaxFieldCount = 100;
117 
118 // Minimum free disk size
119 const int kMinFreeDiskSize = 10;  // 10M (presumably megabytes, the unit GetDiskSize documents for its result)
120 
121 // dummy json: an object with the single key "id" set to 0
122 const json kDummyId = R"({"id": 0})"_json;
123 
124 // translate type in schema to type in sqlite3(NULL, INTEGER, REAL, TEXT, BLOB)
// NOTE: several mapped values (DATE, DATETIME, BOOLEAN, NUMERIC) are not among the five names listed above;
// presumably they are declared column type names that SQLite resolves through type affinity — confirm.
125 const std::unordered_map<std::string, std::string> kDbJsonMap = {
126   {"string", "TEXT"},     {"date", "DATE"},       {"date-time", "DATETIME"}, {"null", "NULL"},
127   {"integer", "INTEGER"}, {"boolean", "BOOLEAN"}, {"array", "BLOB"},         {"number", "NUMERIC"},
128   {"int32", "INTEGER"},   {"int64", "INTEGER"},   {"float32", "NUMERIC"},    {"float64", "NUMERIC"},
129   {"bytes", "BLOB"}};
130 
131 const char kPoint = '.';  // the dot character
132 
// Path separator selected at compile time: backslash when _WIN32 or _WIN64 is defined, forward slash otherwise.
133 const char kPathSeparator =
134 #if defined(_WIN32) || defined(_WIN64)
135   '\\';
136 #else
137   '/';
138 #endif
139 
140 // field types accepted by schema validation
141 const std::set<std::string> kFieldTypeSet = {"bytes", "string", "int32", "int64", "float32", "float64"};
142 
143 // field types that can be searched (kFieldTypeSet without "bytes")
144 const std::set<std::string> kScalarFieldTypeSet = {"string", "int32", "int64", "float32", "float64"};
145 
146 // numeric field types (kScalarFieldTypeSet without "string")
147 const std::set<std::string> kNumberFieldTypeSet = {"int32", "int64", "float32", "float64"};
148 
// Maps a source type name to one of the field types in kFieldTypeSet. Narrow integer types and "bool" map to
// "int32", "uint32" maps to "int64", "float16" maps to "float32", and "uint8" maps to "bytes".
// NOTE(review): the origin of the key names (they look like array dtype names) is not shown here — confirm.
149 const std::unordered_map<std::string, std::string> kTypesMap = {
150   {"bool", "int32"},      {"int8", "int32"},      {"uint8", "bytes"},     {"int16", "int32"},
151   {"uint16", "int32"},    {"int32", "int32"},     {"uint32", "int64"},    {"int64", "int64"},
152   {"float16", "float32"}, {"float32", "float32"}, {"float64", "float64"}, {"string", "string"}};
153 
154 /// \brief the max number of samples to enable lazy load (5,000,000)
/// NOTE(review): how callers compare against this threshold is not shown in this header — confirm.
155 const uint32_t LAZY_LOAD_THRESHOLD = 5000000;
156 
157 /// \brief split a string using a separator character
158 /// \param[in] field target string
159 /// \param[in] separator the character to split on
160 /// \return vector holding the resulting substrings
161 std::vector<std::string> StringSplit(const std::string &field, char separator);
162 
163 /// \brief validate that a field name is composed only of '0-9', 'a-z', 'A-Z', '_' or '-'
164 /// \param[in]  str target string
165 /// \return presumably true when str contains only the allowed characters (implementation not shown here)
166 bool ValidateFieldName(const std::string &str);
167 
168 /// \brief get the file name from a path
169 /// \param path file path
170 /// \param fn_ptr pointer to the shared ptr of the file name (presumably filled in on success)
171 /// \return Status
172 Status GetFileName(const std::string &path, std::shared_ptr<std::string> *fn_ptr);
173 
174 /// \brief get the parent directory of a path
175 /// \param path file path
176 /// \param pd_ptr pointer to the shared ptr of the parent path (presumably filled in on success)
177 /// \return Status
178 Status GetParentDir(const std::string &path, std::shared_ptr<std::string> *pd_ptr);
179 
/// \brief check whether a string is valid UTF-8 (per the function name; implementation not shown here)
/// \param str string to check
/// \return presumably true when str is valid UTF-8
180 bool CheckIsValidUtf8(const std::string &str);
181 
182 /// \brief judge whether a path is a legal file
183 /// \param path file path
184 /// \return Whether the path is legal or not
185 bool IsLegalFile(const std::string &path);
186 
// Selects which figure GetDiskSize reports: kTotalSize is 0, kFreeSize is 1.
187 enum DiskSizeType { kTotalSize = 0, kFreeSize };
188 
189 /// \brief get the total or free space of the disk, as selected by disk_type
190 /// \param str_dir file path
191 /// \param disk_type: kTotalSize / kFreeSize
192 /// \param size: shared ptr of size in Megabytes
193 /// \return Status
194 Status GetDiskSize(const std::string &str_dir, const DiskSizeType &disk_type, std::shared_ptr<uint64_t> *size);
195 
196 /// \brief get the max hardware concurrency
197 /// \return max concurrency as an unsigned 32-bit count
198 uint32_t GetMaxThreadNum();
199 
200 /// \brief get absolute path of all mindrecord files
201 /// \param path path to one of the mindrecord files
202 /// \param addresses relative path of all mindrecord files
203 /// \param ds shared ptr of vector of absolute path
204 /// \return Status
205 Status GetDatasetFiles(const std::string &path, const json &addresses, std::shared_ptr<std::vector<std::string>> *ds);
206 }  // namespace mindrecord
207 }  // namespace mindspore
208 
209 #endif  // MINDSPORE_CCSRC_MINDDATA_MINDRECORD_INCLUDE_COMMON_SHARD_UTILS_H_
210