1 /**
2 * Copyright 2021 Huawei Technologies Co., Ltd
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "thread/core_affinity.h"
18 #include <string.h>
19 #include <stdlib.h>
20 #include <string>
21 #include <algorithm>
22 #ifdef MS_COMPILE_IOS
23 #include <sys/types.h>
24 #include <sys/sysctl.h>
25 #include <mach/machine.h>
26 #endif // MS_COMPILE_IOS
27 #include "thread/threadpool.h"
28
29 namespace mindspore {
30
// ARM core micro-architectures that GetArch() can report.
// The enumerator values follow declaration order, and InitHardwareCoreInfo() compares them to
// break ties between cores with the same maximum frequency, so the order is significant.
enum Arch {
  // Returned when the CPU part number is not recognised.
  UnKnown_Arch = 0,
  // Cores listed on the ARMv7-A comparison page referenced in GetArch().
  Cortex_A5,
  Cortex_A7,
  Cortex_A8,
  Cortex_A9,
  Cortex_A12,
  Cortex_A15,
  Cortex_A17,
  // Cores listed on the ARMv8-A comparison page referenced in GetArch().
  Cortex_A32,
  Cortex_A34,
  Cortex_A35,
  Cortex_A53,
  Cortex_A55,
  Cortex_A57,
  Cortex_A65,
  Cortex_A72,
  Cortex_A73,
  Cortex_A75,
  Cortex_A76,
  Cortex_A77,
  Cortex_A78,
  Cortex_X1
};
55
56 typedef struct {
57 int core_id;
58 int max_freq;
59 enum Arch arch;
60 } CpuInfo;
61
GetArch(int cpu_part)62 enum Arch GetArch(int cpu_part) {
63 typedef struct {
64 int part;
65 enum Arch arch;
66 } ArchSet;
67 // https://en.wikipedia.org/wiki/Comparison_of_ARMv7-A_cores
68 // https://en.wikipedia.org/wiki/Comparison_of_ARMv8-A_cores
69 std::vector<ArchSet> arch_set = {
70 {0x800, Cortex_A73}, // High-performance Kryo 260 (r10p2) / Kryo 280 (r10p1) "Gold" -> Cortex-A73
71 {0x801, Cortex_A53}, // Low-power Kryo 260 / 280 "Silver" -> Cortex-A53
72 {0x802, Cortex_A75}, // High-performance Kryo 385 "Gold" -> Cortex-A75
73 {0x803, Cortex_A55}, // Low-power Kryo 385 "Silver" -> Cortex-A55r0
74 {0x804, Cortex_A76}, // High-performance Kryo 485 "Gold" / "Gold Prime" -> Cortex-A76
75 {0x805, Cortex_A55}, // Low-performance Kryo 485 "Silver" -> Cortex-A55
76 {0xC05, Cortex_A5},
77 {0xC07, Cortex_A7},
78 {0xC08, Cortex_A8},
79 {0xC09, Cortex_A9},
80 {0xC0C, Cortex_A12},
81 {0xC0D, Cortex_A12},
82 {0xC0E, Cortex_A17},
83 {0xC0F, Cortex_A15},
84 {0xD01, Cortex_A32}, // also Huawei Kunpeng 920
85 // series taishan_v110 when not
86 // on android
87 {0xD02, Cortex_A34},
88 {0xD03, Cortex_A53},
89 {0xD04, Cortex_A35},
90 {0xD05, Cortex_A55},
91 {0xD06, Cortex_A65},
92 {0xD07, Cortex_A57},
93 {0xD08, Cortex_A72},
94 {0xD09, Cortex_A73},
95 {0xD0A, Cortex_A75},
96 {0xD0B, Cortex_A76},
97 {0xD0D, Cortex_A77},
98 {0xD0E, Cortex_A76}, // Cortex-A76AE
99 {0xD40, Cortex_A76}, // Kirin 980 Big/Medium cores -> Cortex-A76
100 {0xD41, Cortex_A78},
101 {0xD43, Cortex_A65}, // Cortex-A65AE
102 {0xD44, Cortex_X1}};
103 auto item =
104 std::find_if(arch_set.begin(), arch_set.end(), [&cpu_part](const ArchSet &a) { return a.part == cpu_part; });
105 return item != arch_set.end() ? item->arch : UnKnown_Arch;
106 }
107
ParseCpuPart(const char * line,int start,int size)108 int ParseCpuPart(const char *line, int start, int size) {
109 int cpu_part = 0;
110 for (int i = start; i < size && i < start + 3; i++) {
111 char c = line[i];
112 int d;
113 if (c >= '0' && c <= '9') {
114 d = c - '0';
115 } else if ((c - 'A') < 6) {
116 d = 10 + (c - 'A');
117 } else if ((c - 'a') < 6) {
118 d = 10 + (c - 'a');
119 } else {
120 THREAD_ERROR("CPU part in /proc/cpuinfo is ignored due to unexpected non-hex character");
121 break;
122 }
123 cpu_part = cpu_part * 16 + d;
124 }
125 return cpu_part;
126 }
127
SetArch(std::vector<CpuInfo> * freq_set,int core_num)128 int SetArch(std::vector<CpuInfo> *freq_set, int core_num) {
129 if (core_num <= 0) {
130 THREAD_ERROR("core_num must be greater than 0.");
131 return THREAD_ERROR;
132 }
133 FILE *fp = fopen("/proc/cpuinfo", "r");
134 if (fp == nullptr) {
135 THREAD_ERROR("read /proc/cpuinfo error.");
136 return THREAD_ERROR;
137 }
138 std::vector<Arch> archs;
139 archs.resize(core_num);
140 const int max_line_size = 1024;
141 char line[max_line_size] = {0};
142 int count = 0;
143 while (!feof(fp)) {
144 if (fgets(line, max_line_size, fp)) {
145 // line start with "CPU part"
146 if (0 == memcmp(line, "CPU part", 8)) {
147 // get number like 0xD03
148 for (int i = 0; i < max_line_size - 4; ++i) {
149 if (line[i] == '0' && line[i + 1] == 'x') {
150 int cpu_part = ParseCpuPart(line, i + 2, max_line_size);
151 enum Arch arch = GetArch(cpu_part);
152 if (arch == UnKnown_Arch) {
153 THREAD_ERROR("cpu's architecture is unknown.");
154 (void)fclose(fp);
155 return THREAD_ERROR;
156 }
157 count++;
158 if (count > core_num) {
159 THREAD_ERROR("number of cpu_part in /proc/cpuinfo is more than core_num.");
160 (void)fclose(fp);
161 return THREAD_ERROR;
162 }
163 archs[count - 1] = arch;
164 }
165 }
166 }
167 }
168 }
169 if (count < core_num) {
170 THREAD_ERROR("number of cpu_part in /proc/cpuinfo is less than core_num.");
171 (void)fclose(fp);
172 return THREAD_ERROR;
173 }
174 for (int i = 0; i < core_num; ++i) {
175 (*freq_set)[i].arch = archs[i];
176 }
177 (void)fclose(fp);
178 return THREAD_OK;
179 }
180
GetMaxFrequency(int core_id)181 int GetMaxFrequency(int core_id) {
182 FILE *fp;
183 std::vector<std::string> paths = {"/sys/devices/system/cpu/cpufreq/stats/cpu",
184 "/sys/devices/system/cpu/cpufreq/stats/cpu", "/sys/devices/system/cpu/cpu"};
185 std::vector<std::string> files = {"/time_in_state", "/cpufreq/stats/time_in_state", "/cpufreq/cpuinfo_max_freq"};
186 for (size_t i = 0; i < paths.size(); ++i) {
187 std::string file = paths[i] + std::to_string(core_id) + files[i];
188 fp = fopen(file.c_str(), "rb");
189 if (fp != nullptr) {
190 break;
191 }
192 }
193 int max_freq = -1;
194 if (fp == nullptr) {
195 THREAD_ERROR("open system file failed");
196 return max_freq;
197 }
198 while (feof(fp) == 0) {
199 int freq = 0;
200 int tmp = fscanf(fp, "%d", &freq);
201 if (tmp != 1) {
202 break;
203 }
204 if (freq > max_freq) {
205 max_freq = freq;
206 }
207 }
208 (void)fclose(fp);
209 return max_freq;
210 }
211
InitHardwareCoreInfo()212 int CoreAffinity::InitHardwareCoreInfo() {
213 core_num_ = std::thread::hardware_concurrency();
214 std::vector<CpuInfo> freq_set;
215 freq_set.resize(core_num_);
216 core_freq_.resize(core_num_);
217 for (size_t i = 0; i < core_num_; ++i) {
218 int max_freq = GetMaxFrequency(i);
219 core_freq_[i] = max_freq;
220 freq_set[i].core_id = i;
221 freq_set[i].max_freq = max_freq;
222 freq_set[i].arch = UnKnown_Arch;
223 }
224 int err_code = SetArch(&freq_set, core_num_);
225 if (err_code != THREAD_OK) {
226 THREAD_INFO("set arch failed, ignoring arch.");
227 }
228 // sort core id by frequency into descending order
229 for (size_t i = 0; i < core_num_; ++i) {
230 for (size_t j = i + 1; j < core_num_; ++j) {
231 if (freq_set[i].max_freq < freq_set[j].max_freq ||
232 (freq_set[i].max_freq == freq_set[j].max_freq && freq_set[i].arch <= freq_set[j].arch)) {
233 CpuInfo temp = freq_set[i];
234 freq_set[i] = freq_set[j];
235 freq_set[j] = temp;
236 }
237 }
238 }
239 higher_num_ = 0;
240 sorted_id_.clear();
241 int max_freq = freq_set.front().max_freq;
242 for (const auto &info : freq_set) {
243 THREAD_INFO("sorted core id: %d, max frequency: %d, arch: %d", info.core_id, info.max_freq, info.arch);
244 sorted_id_.push_back(info.core_id);
245 higher_num_ += info.max_freq == max_freq ? 1 : 0;
246 }
247 return THREAD_OK;
248 }
249
GetCoreId(size_t thread_num,BindMode bind_mode)250 std::vector<int> CoreAffinity::GetCoreId(size_t thread_num, BindMode bind_mode) {
251 std::vector<int> bind_id;
252 if (core_num_ != sorted_id_.size()) {
253 THREAD_ERROR("init sorted core id failed");
254 return bind_id;
255 }
256 if (bind_mode == Power_Higher || bind_mode == Power_NoBind) {
257 for (size_t i = 0; i < thread_num; ++i) {
258 bind_id.push_back(sorted_id_[i % core_num_]);
259 }
260 } else if (bind_mode == Power_Middle) {
261 for (size_t i = 0; i < thread_num; ++i) {
262 bind_id.push_back(sorted_id_[(i + higher_num_) % core_num_]);
263 }
264 } else {
265 return bind_id;
266 }
267 return bind_id;
268 }
SetCoreId(const std::vector<int> & core_list)269 void CoreAffinity::SetCoreId(const std::vector<int> &core_list) { bind_id_ = core_list; }
270
InitBindCoreId(size_t thread_num,BindMode bind_mode)271 int CoreAffinity::InitBindCoreId(size_t thread_num, BindMode bind_mode) {
272 bind_id_.clear();
273 bind_id_ = GetCoreId(thread_num, bind_mode);
274 if (bind_id_.empty()) {
275 return THREAD_ERROR;
276 }
277 return THREAD_OK;
278 }
279
#ifdef BIND_CORE
// Applies the affinity mask `cpu_set` to the thread `thread_id`.
//   - Android: uses sched_setaffinity() on the thread's tid when the API level is at least 21;
//     for lower API levels nothing is done and THREAD_OK is returned.
//   - Apple: binding is not attempted and THREAD_ERROR is returned.
//   - Otherwise: uses pthread_setaffinity_np().
// Returns THREAD_OK on success and THREAD_ERROR when the underlying call reports a failure.
int CoreAffinity::SetAffinity(const pthread_t &thread_id, cpu_set_t *cpu_set) const {
#if defined(__ANDROID__)
#if __ANDROID_API__ >= 21
  const auto tid = pthread_gettid_np(thread_id);
  THREAD_INFO("thread: %d, mask: %lu", tid, cpu_set->__bits[0]);
  const int status = sched_setaffinity(tid, sizeof(cpu_set_t), cpu_set);
  if (status != THREAD_OK) {
    THREAD_ERROR("bind thread %d to cpu failed. ERROR %d", tid, status);
    return THREAD_ERROR;
  }
#endif  // __ANDROID_API__ >= 21
  return THREAD_OK;
#elif defined(__APPLE__)
  THREAD_ERROR("not bind thread to apple's cpu.");
  return THREAD_ERROR;
#else
  const int status = pthread_setaffinity_np(thread_id, sizeof(cpu_set_t), cpu_set);
  if (status != THREAD_OK) {
    THREAD_ERROR("set thread: %lu to cpu failed", thread_id);
    return THREAD_ERROR;
  }
  return THREAD_OK;
#endif  // __ANDROID__ / __APPLE__
}
#endif  // BIND_CORE
306
FreeScheduleThreads(const std::vector<Worker * > & workers) const307 int CoreAffinity::FreeScheduleThreads(const std::vector<Worker *> &workers) const {
308 #ifdef BIND_CORE
309 cpu_set_t mask;
310 CPU_ZERO(&mask);
311 for (int i : bind_id_) {
312 CPU_SET(i, &mask);
313 }
314 for (auto worker : workers) {
315 int ret = SetAffinity(worker->handle(), &mask);
316 if (ret != THREAD_OK) {
317 return THREAD_ERROR;
318 }
319 }
320 #endif // BIND_CORE
321 return THREAD_OK;
322 }
323
BindThreadsToCoreList(const std::vector<Worker * > & workers) const324 int CoreAffinity::BindThreadsToCoreList(const std::vector<Worker *> &workers) const {
325 #ifdef BIND_CORE
326 if (bind_id_.empty()) {
327 THREAD_ERROR("bind id is empty");
328 return THREAD_ERROR;
329 }
330 size_t window = bind_id_.size();
331 size_t thread_num = workers.size();
332 for (size_t i = 0; i < thread_num; ++i) {
333 cpu_set_t mask;
334 CPU_ZERO(&mask);
335 CPU_SET(bind_id_[i % window], &mask);
336 // affinity mask determines the CPU core which it is eligible to run
337 int ret = SetAffinity(workers[i]->handle(), &mask);
338 if (ret != THREAD_OK) {
339 return THREAD_ERROR;
340 }
341 THREAD_INFO("set thread[%zu] affinity to core[%d] success", i, bind_id_[i % window]);
342 workers[i]->set_frequency(core_freq_[bind_id_[i]]);
343 }
344 #endif // BIND_CORE
345 return THREAD_OK;
346 }
347
BindProcess(BindMode bind_mode) const348 int CoreAffinity::BindProcess(BindMode bind_mode) const {
349 #ifdef BIND_CORE
350 if (bind_id_.empty()) {
351 // initializes bind id before bind currently process
352 THREAD_ERROR("bind id is empty");
353 return THREAD_ERROR;
354 }
355 cpu_set_t mask;
356 CPU_ZERO(&mask);
357 if (bind_mode != Power_NoBind) {
358 CPU_SET(bind_id_.front(), &mask);
359 } else {
360 for (int id : bind_id_) {
361 CPU_SET(id, &mask);
362 }
363 }
364 return SetAffinity(pthread_self(), &mask);
365 #else
366 return THREAD_OK;
367 #endif // BIND_CORE
368 }
369
BindThreads(const std::vector<Worker * > & workers,BindMode bind_mode)370 int CoreAffinity::BindThreads(const std::vector<Worker *> &workers, BindMode bind_mode) {
371 if (bind_id_.empty()) {
372 int ret = InitBindCoreId(workers.size(), bind_mode);
373 if (ret != THREAD_OK) {
374 THREAD_ERROR("init bind id failed");
375 return THREAD_ERROR;
376 }
377 }
378 if (bind_mode == Power_NoBind) {
379 return FreeScheduleThreads(workers);
380 } else {
381 return BindThreadsToCoreList(workers);
382 }
383 }
384
BindThreads(const std::vector<Worker * > & workers,const std::vector<int> & core_list)385 int CoreAffinity::BindThreads(const std::vector<Worker *> &workers, const std::vector<int> &core_list) {
386 // the size of core_list doesn't have to be the same as the size of workers(thread_num)
387 bind_id_ = core_list;
388 return BindThreadsToCoreList(workers);
389 }
390 } // namespace mindspore
391