1 /**
2 * Copyright 2019 Huawei Technologies Co., Ltd
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "minddata/mindrecord/include/common/shard_utils.h"
18 #include "utils/file_utils.h"
19 #include "utils/ms_utils.h"
20 #include "securec.h"
21
22 #ifdef _MSC_VER
23 #define stat _stat64 // for file size exceeds (1<<31)-1 bytes
24 #endif
25
26 namespace mindspore {
27 namespace mindrecord {
// Split `field` into the pieces delimited by `separator`.
// Empty pieces between adjacent separators (and before a leading separator) are kept.
// Scanning stops once the cursor reaches the end of `field`, so an empty input yields an
// empty vector and a trailing separator does not produce a trailing empty piece.
std::vector<std::string> StringSplit(const std::string &field, char separator) {
  std::vector<std::string> pieces;
  const size_t total = field.length();
  size_t cursor = 0;
  while (cursor < total) {
    const size_t hit = field.find(separator, cursor);
    if (hit == std::string::npos) {
      // No further separator: the remainder of the string is the last piece.
      pieces.push_back(field.substr(cursor));
      break;
    }
    pieces.push_back(field.substr(cursor, hit - cursor));
    cursor = hit + 1;
  }
  return pieces;
}
44
// Return true when `str` is non-empty and every character is an ASCII letter,
// an ASCII digit or an underscore; return false otherwise.
bool ValidateFieldName(const std::string &str) {
  if (str.empty()) {
    return false;
  }
  for (const char ch : str) {
    const bool is_digit = (ch >= '0') && (ch <= '9');
    const bool is_upper = (ch >= 'A') && (ch <= 'Z');
    const bool is_lower = (ch >= 'a') && (ch <= 'z');
    if (!(ch == '_' || is_digit || is_upper || is_lower)) {
      return false;
    }
  }
  return true;
}
59
GetFileName(const std::string & path,std::shared_ptr<std::string> * fn_ptr)60 Status GetFileName(const std::string &path, std::shared_ptr<std::string> *fn_ptr) {
61 RETURN_UNEXPECTED_IF_NULL_MR(fn_ptr);
62
63 std::optional<std::string> prefix_path;
64 std::optional<std::string> file_name;
65 FileUtils::SplitDirAndFileName(path, &prefix_path, &file_name);
66 if (!file_name.has_value()) {
67 RETURN_STATUS_UNEXPECTED_MR(
68 "Invalid file, failed to get the filename of mindrecord file. Please check file path: " + path);
69 }
70 *fn_ptr = std::make_shared<std::string>(file_name.value());
71
72 return Status::OK();
73 }
74
GetParentDir(const std::string & path,std::shared_ptr<std::string> * pd_ptr)75 Status GetParentDir(const std::string &path, std::shared_ptr<std::string> *pd_ptr) {
76 RETURN_UNEXPECTED_IF_NULL_MR(pd_ptr);
77
78 std::optional<std::string> prefix_path;
79 std::optional<std::string> file_name;
80 FileUtils::SplitDirAndFileName(path, &prefix_path, &file_name);
81 if (!prefix_path.has_value()) {
82 prefix_path = ".";
83 }
84
85 auto realpath = FileUtils::GetRealPath(prefix_path.value().c_str());
86 CHECK_FAIL_RETURN_UNEXPECTED_MR(
87 realpath.has_value(), "Invalid file, failed to get the parent dir of mindrecord file. Please check file: " + path);
88
89 *pd_ptr = std::make_shared<std::string>(realpath.value() + kPathSeparator);
90 return Status::OK();
91 }
92
// Return true when `str` is a well-formed UTF-8 byte sequence (an empty string is valid).
//
// Each lead byte fixes the number of continuation bytes and, for a few lead bytes, a
// narrower range for the first continuation byte. Those narrower ranges reject:
//   - overlong encodings (lead 0xC0/0xC1, 0xE0 followed by < 0xA0, 0xF0 followed by < 0x90),
//   - UTF-16 surrogates U+D800..U+DFFF (0xED followed by > 0x9F),
//   - code points above U+10FFFF (0xF4 followed by > 0x8F, and leads 0xF5..0xFF).
// Truncated sequences and stray continuation bytes are rejected as well.
// Indices are size_t so strings longer than INT_MAX bytes are scanned in full.
bool CheckIsValidUtf8(const std::string &str) {
  const size_t len = str.length();
  size_t pos = 0;
  while (pos < len) {
    const uint8_t lead = static_cast<uint8_t>(str[pos]);
    size_t trail_count = 0;
    // Allowed range for the FIRST continuation byte; later ones are always 0x80..0xBF.
    uint8_t low = 0x80;
    uint8_t high = 0xBF;

    if (lead <= 0x7F) {
      trail_count = 0;
    } else if (lead >= 0xC2 && lead <= 0xDF) {
      trail_count = 1;
    } else if (lead >= 0xE0 && lead <= 0xEF) {
      trail_count = 2;
      if (lead == 0xE0) {
        low = 0xA0;  // below this the code point would fit in two bytes (overlong)
      } else if (lead == 0xED) {
        high = 0x9F;  // above this the code point is a UTF-16 surrogate
      }
    } else if (lead >= 0xF0 && lead <= 0xF4) {
      trail_count = 3;
      if (lead == 0xF0) {
        low = 0x90;  // below this the code point would fit in three bytes (overlong)
      } else if (lead == 0xF4) {
        high = 0x8F;  // above this the code point exceeds U+10FFFF
      }
    } else {
      // 0x80..0xBF (stray continuation), 0xC0/0xC1 (overlong), 0xF5..0xFF (out of range).
      return false;
    }

    // The sequence must not run past the end of the string.
    if (len - pos - 1 < trail_count) {
      return false;
    }
    for (size_t k = 1; k <= trail_count; ++k) {
      const uint8_t next = static_cast<uint8_t>(str[pos + k]);
      if (next < low || next > high) {
        return false;
      }
      low = 0x80;
      high = 0xBF;
    }
    pos += trail_count + 1;
  }
  return true;
}
119
CheckFile(const std::string & path)120 Status CheckFile(const std::string &path) {
121 struct stat s;
122 #if defined(_WIN32) || defined(_WIN64)
123 if (stat(FileUtils::UTF_8ToGB2312(path.data()).data(), &s) == 0) {
124 #else
125 if (stat(common::SafeCStr(path), &s) == 0) {
126 #endif
127 if (S_ISDIR(s.st_mode)) {
128 RETURN_STATUS_UNEXPECTED_MR("Invalid file, " + path + " is not a mindrecord file, but got directory.");
129 }
130 return Status::OK();
131 }
132 RETURN_STATUS_UNEXPECTED_MR(
133 "Invalid file, mindrecord file: " + path +
134 " can not be found. Please check whether the mindrecord file exists and do not rename the mindrecord file.");
135 }
136
137 Status GetDiskSize(const std::string &str_dir, const DiskSizeType &disk_type, std::shared_ptr<uint64_t> *size_ptr) {
138 RETURN_UNEXPECTED_IF_NULL_MR(size_ptr);
139 #if defined(_WIN32) || defined(_WIN64) || defined(__APPLE__)
140 *size_ptr = std::make_shared<uint64_t>(100);
141 return Status::OK();
142 #else
143 uint64_t ll_count = 0;
144 struct statfs64 disk_info;
145 if (statfs64(common::SafeCStr(str_dir), &disk_info) == -1) {
146 RETURN_STATUS_UNEXPECTED_MR("[Internal ERROR] Failed to get free disk size.");
147 }
148
149 switch (disk_type) {
150 case kTotalSize:
151 ll_count = disk_info.f_bsize * disk_info.f_blocks;
152 ll_count = ll_count >> 20;
153 break;
154 case kFreeSize:
155 ll_count = disk_info.f_bsize * disk_info.f_bavail;
156 ll_count = ll_count >> 20;
157 break;
158 default:
159 ll_count = 0;
160 break;
161 }
162 *size_ptr = std::make_shared<uint64_t>(ll_count);
163 return Status::OK();
164 #endif
165 }
166
167 uint32_t GetMaxThreadNum() {
168 // define the number of thread
169 uint32_t thread_num = std::thread::hardware_concurrency();
170 if (thread_num == 0) {
171 thread_num = kMaxConsumerCount;
172 }
173 return thread_num;
174 }
175
176 Status GetDatasetFiles(const std::string &path, const json &addresses, std::shared_ptr<std::vector<std::string>> *ds) {
177 RETURN_UNEXPECTED_IF_NULL_MR(ds);
178 std::shared_ptr<std::string> parent_dir;
179 RETURN_IF_NOT_OK_MR(GetParentDir(path, &parent_dir));
180 for (const auto &p : addresses) {
181 std::string abs_path = *parent_dir + std::string(p);
182 (*ds)->emplace_back(abs_path);
183 }
184 return Status::OK();
185 }
186
187 std::mt19937 GetRandomDevice() {
188 #if defined(_WIN32) || defined(_WIN64)
189 unsigned int number;
190 rand_s(&number);
191 std::mt19937 random_device{static_cast<uint32_t>(number)};
192 #else
193 int i = 0;
194 while (i < 5) {
195 try {
196 std::mt19937 random_device{std::random_device("/dev/urandom")()};
197 return random_device;
198 } catch (const std::exception &e) {
199 MS_LOG(WARNING) << "Get std::random_device failed, retry: " << i << ", error: " << e.what();
200 std::this_thread::sleep_for(std::chrono::milliseconds(10));
201 i++;
202 }
203 }
204 std::mt19937 random_device{std::random_device("/dev/urandom")()};
205 #endif
206 return random_device;
207 }
208 } // namespace mindrecord
209 } // namespace mindspore
210