• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright 2019 Huawei Technologies Co., Ltd
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "minddata/mindrecord/include/common/shard_utils.h"
18 #include "utils/file_utils.h"
19 #include "utils/ms_utils.h"
20 #include "securec.h"
21 
22 #ifdef _MSC_VER
23 #define stat _stat64  //  for file size exceeds (1<<31)-1 bytes
24 #endif
25 
26 namespace mindspore {
27 namespace mindrecord {
// Split `field` on every occurrence of `separator` and return the pieces in order.
// Two adjacent separators produce an empty piece between them. A separator that is
// the very last character does not add a trailing empty piece, and an empty input
// yields an empty vector.
std::vector<std::string> StringSplit(const std::string &field, char separator) {
  std::vector<std::string> pieces;
  const size_t total = field.length();
  size_t begin = 0;
  while (begin < total) {
    const size_t hit = field.find_first_of(separator, begin);
    if (hit == std::string::npos) {
      // No further separator: everything that is left forms the last piece.
      pieces.push_back(field.substr(begin));
      break;
    }
    pieces.push_back(field.substr(begin, hit - begin));
    // Resume scanning right after the separator that was just consumed.
    begin = hit + 1;
  }
  return pieces;
}
44 
// Check that `str` is a usable field name: it must be non-empty and consist only of
// ASCII letters, ASCII digits and underscores. Returns true when the name is valid.
bool ValidateFieldName(const std::string &str) {
  if (str.empty()) {
    return false;
  }
  for (const char ch : str) {
    const bool is_digit = (ch >= '0') && (ch <= '9');
    const bool is_upper = (ch >= 'A') && (ch <= 'Z');
    const bool is_lower = (ch >= 'a') && (ch <= 'z');
    if (ch != '_' && !is_digit && !is_upper && !is_lower) {
      // Any character outside [A-Za-z0-9_] makes the whole name invalid.
      return false;
    }
  }
  return true;
}
59 
GetFileName(const std::string & path,std::shared_ptr<std::string> * fn_ptr)60 Status GetFileName(const std::string &path, std::shared_ptr<std::string> *fn_ptr) {
61   RETURN_UNEXPECTED_IF_NULL_MR(fn_ptr);
62 
63   std::optional<std::string> prefix_path;
64   std::optional<std::string> file_name;
65   FileUtils::SplitDirAndFileName(path, &prefix_path, &file_name);
66   if (!file_name.has_value()) {
67     RETURN_STATUS_UNEXPECTED_MR(
68       "Invalid file, failed to get the filename of mindrecord file. Please check file path: " + path);
69   }
70   *fn_ptr = std::make_shared<std::string>(file_name.value());
71 
72   return Status::OK();
73 }
74 
GetParentDir(const std::string & path,std::shared_ptr<std::string> * pd_ptr)75 Status GetParentDir(const std::string &path, std::shared_ptr<std::string> *pd_ptr) {
76   RETURN_UNEXPECTED_IF_NULL_MR(pd_ptr);
77 
78   std::optional<std::string> prefix_path;
79   std::optional<std::string> file_name;
80   FileUtils::SplitDirAndFileName(path, &prefix_path, &file_name);
81   if (!prefix_path.has_value()) {
82     prefix_path = ".";
83   }
84 
85   auto realpath = FileUtils::GetRealPath(prefix_path.value().c_str());
86   CHECK_FAIL_RETURN_UNEXPECTED_MR(
87     realpath.has_value(), "Invalid file, failed to get the parent dir of mindrecord file. Please check file: " + path);
88 
89   *pd_ptr = std::make_shared<std::string>(realpath.value() + kPathSeparator);
90   return Status::OK();
91 }
92 
// Return true when `str` is a well-formed UTF-8 byte sequence (the empty string is
// well-formed).
//
// The check follows the well-formed byte ranges of the Unicode Standard / RFC 3629:
//   00..7F
//   C2..DF  80..BF
//   E0      A0..BF  80..BF          (80..9F would be an overlong encoding)
//   E1..EC  80..BF  80..BF
//   ED      80..9F  80..BF          (A0..BF would encode a UTF-16 surrogate)
//   EE..EF  80..BF  80..BF
//   F0      90..BF  80..BF  80..BF  (80..8F would be an overlong encoding)
//   F1..F3  80..BF  80..BF  80..BF
//   F4      80..8F  80..BF  80..BF  (90..BF would exceed U+10FFFF)
// Any other lead byte (stray continuation bytes, C0, C1, F5..FF) and any truncated
// sequence is rejected.
bool CheckIsValidUtf8(const std::string &str) {
  const size_t len = str.length();
  size_t i = 0;
  while (i < len) {
    const uint8_t lead = static_cast<uint8_t>(str[i]);
    if (lead <= 0x7F) {
      ++i;
      continue;
    }

    // Number of continuation bytes and the permitted range of the first one.
    size_t trail = 0;
    uint8_t second_min = 0x80;
    uint8_t second_max = 0xBF;
    if (lead >= 0xC2 && lead <= 0xDF) {
      trail = 1;
    } else if (lead >= 0xE0 && lead <= 0xEF) {
      trail = 2;
      if (lead == 0xE0) {
        second_min = 0xA0;
      } else if (lead == 0xED) {
        second_max = 0x9F;
      }
    } else if (lead >= 0xF0 && lead <= 0xF4) {
      trail = 3;
      if (lead == 0xF0) {
        second_min = 0x90;
      } else if (lead == 0xF4) {
        second_max = 0x8F;
      }
    } else {
      return false;
    }

    // The sequence must not run past the end of the string.
    if (len - i <= trail) {
      return false;
    }

    const uint8_t second = static_cast<uint8_t>(str[i + 1]);
    if (second < second_min || second > second_max) {
      return false;
    }
    for (size_t k = 2; k <= trail; ++k) {
      if ((static_cast<uint8_t>(str[i + k]) & 0xC0) != 0x80) {
        return false;
      }
    }
    i += trail + 1;
  }
  return true;
}
119 
CheckFile(const std::string & path)120 Status CheckFile(const std::string &path) {
121   struct stat s;
122 #if defined(_WIN32) || defined(_WIN64)
123   if (stat(FileUtils::UTF_8ToGB2312(path.data()).data(), &s) == 0) {
124 #else
125   if (stat(common::SafeCStr(path), &s) == 0) {
126 #endif
127     if (S_ISDIR(s.st_mode)) {
128       RETURN_STATUS_UNEXPECTED_MR("Invalid file, " + path + " is not a mindrecord file, but got directory.");
129     }
130     return Status::OK();
131   }
132   RETURN_STATUS_UNEXPECTED_MR(
133     "Invalid file, mindrecord file: " + path +
134     " can not be found. Please check whether the mindrecord file exists and do not rename the mindrecord file.");
135 }
136 
137 Status GetDiskSize(const std::string &str_dir, const DiskSizeType &disk_type, std::shared_ptr<uint64_t> *size_ptr) {
138   RETURN_UNEXPECTED_IF_NULL_MR(size_ptr);
139 #if defined(_WIN32) || defined(_WIN64) || defined(__APPLE__)
140   *size_ptr = std::make_shared<uint64_t>(100);
141   return Status::OK();
142 #else
143   uint64_t ll_count = 0;
144   struct statfs64 disk_info;
145   if (statfs64(common::SafeCStr(str_dir), &disk_info) == -1) {
146     RETURN_STATUS_UNEXPECTED_MR("[Internal ERROR] Failed to get free disk size.");
147   }
148 
149   switch (disk_type) {
150     case kTotalSize:
151       ll_count = disk_info.f_bsize * disk_info.f_blocks;
152       ll_count = ll_count >> 20;
153       break;
154     case kFreeSize:
155       ll_count = disk_info.f_bsize * disk_info.f_bavail;
156       ll_count = ll_count >> 20;
157       break;
158     default:
159       ll_count = 0;
160       break;
161   }
162   *size_ptr = std::make_shared<uint64_t>(ll_count);
163   return Status::OK();
164 #endif
165 }
166 
167 uint32_t GetMaxThreadNum() {
168   // define the number of thread
169   uint32_t thread_num = std::thread::hardware_concurrency();
170   if (thread_num == 0) {
171     thread_num = kMaxConsumerCount;
172   }
173   return thread_num;
174 }
175 
176 Status GetDatasetFiles(const std::string &path, const json &addresses, std::shared_ptr<std::vector<std::string>> *ds) {
177   RETURN_UNEXPECTED_IF_NULL_MR(ds);
178   std::shared_ptr<std::string> parent_dir;
179   RETURN_IF_NOT_OK_MR(GetParentDir(path, &parent_dir));
180   for (const auto &p : addresses) {
181     std::string abs_path = *parent_dir + std::string(p);
182     (*ds)->emplace_back(abs_path);
183   }
184   return Status::OK();
185 }
186 
187 std::mt19937 GetRandomDevice() {
188 #if defined(_WIN32) || defined(_WIN64)
189   unsigned int number;
190   rand_s(&number);
191   std::mt19937 random_device{static_cast<uint32_t>(number)};
192 #else
193   int i = 0;
194   while (i < 5) {
195     try {
196       std::mt19937 random_device{std::random_device("/dev/urandom")()};
197       return random_device;
198     } catch (const std::exception &e) {
199       MS_LOG(WARNING) << "Get std::random_device failed, retry: " << i << ", error: " << e.what();
200       std::this_thread::sleep_for(std::chrono::milliseconds(10));
201       i++;
202     }
203   }
204   std::mt19937 random_device{std::random_device("/dev/urandom")()};
205 #endif
206   return random_device;
207 }
208 }  // namespace mindrecord
209 }  // namespace mindspore
210