/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "plugin/device/ascend/hal/device/ascend_memory_adapter.h"

#include <algorithm>
#include <set>
#include "ir/func_graph.h"
#include "utils/ms_context.h"
#include "utils/convert_utils_base.h"
#include "plugin/device/ascend/hal/common/ascend_utils.h"
#include "plugin/device/ascend/hal/device/ascend_gmem_adapter.h"
#include "plugin/device/ascend/hal/device/ascend_vmm_adapter.h"
#include "transform/symbol/acl_rt_symbol.h"
#include "transform/symbol/symbol_utils.h"

namespace mindspore {
namespace device {
namespace ascend {

constexpr uint64_t kAscendMemAlignSize = 512;
constexpr double kMSMemoryRatio = 0.9375;           // 15/16
constexpr double kReservedMemoryRatio = 0.0625;     // 1/16
constexpr size_t kPerHugePageMemorySize = 2097152;  // 2mb
constexpr size_t kExtraReservedMemory = 10485760;   // 10mb
constexpr double kHalfRatio = 0.5;
constexpr uint64_t kOverflowAddrSize = 512;
constexpr char kGlobalOverflowWorkspace[] = "GLOBAL_OVERFLOW_WORKSPACE";

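// Every allocation handed out by this adapter is aligned to kAscendMemAlignSize (512 bytes):
// GetRoundDownAlignSize() truncates to the previous 512-byte boundary, GetRoundUpAlignSize() pads to the
// next one (for example, a 1000-byte request is rounded up to 1024 bytes).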
size_t AscendMemAdapter::GetRoundDownAlignSize(size_t input_size) {
  return (input_size / kAscendMemAlignSize) * kAscendMemAlignSize;
}

size_t AscendMemAdapter::GetRoundUpAlignSize(size_t input_size) {
  return ((input_size + kAscendMemAlignSize - 1) / kAscendMemAlignSize) * kAscendMemAlignSize;
}

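// Initialize() reserves the device memory block that MindSpore will manage:
//   1. Query total/free HBM via aclrtGetMemInfo and abort if less than half of the card is free
//      (usually another process is occupying the device).
//   2. Determine the MindSpore-managed size: the user-configured value if
//      max_device_memory/variable_memory_max_size is set, otherwise 15/16 of free HBM rounded down to a
//      2 MB huge-page multiple minus an extra 10 MB, leaving roughly 1/16 for HCCL/runtime components.
//   3. Round the size down to the granularity required by the active backend (VMM, GMEM eager-free, or
//      plain aclrtMalloc) and reserve the whole block once via MallocFromRts.
// Static allocations are then carved from the top of this block, dynamic allocations from the bottom.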
bool AscendMemAdapter::Initialize() {
  if (initialized_) {
    return true;
  }

  auto ret = CALL_ASCEND_API(aclrtGetMemInfo, ACL_HBM_MEM, &device_hbm_free_size_, &device_hbm_total_size_);
  if (ret != ACL_ERROR_NONE || device_hbm_total_size_ == 0) {
    MS_LOG(EXCEPTION) << "Internal Error: Get Device HBM memory size failed, ret = " << ret
                      << ", total HBM size: " << device_hbm_total_size_;
  }

  if (device_hbm_free_size_ < LongToSize(DoubleToLong(device_hbm_total_size_ * kHalfRatio))) {
    auto context_ptr = MsContext::GetInstance();
    MS_EXCEPTION_IF_NULL(context_ptr);
    unsigned int device_id = context_ptr->get_param<uint32_t>(MS_CTX_DEVICE_ID);
    MS_LOG(EXCEPTION) << "#umsg#Framework Error Message:#umsg#Malloc device memory failed, free memory size is less "
                         "than half of total memory size. "
                      << "Device " << device_id << " Device HBM total size:" << device_hbm_total_size_
                      << " Device HBM free size:" << device_hbm_free_size_
                      << " may be other processes occupying this card, check as: ps -ef|grep python";
  }

  // get the user-defined max backend memory size
  auto user_define_ms_size = GetDeviceMemSizeFromContext();
  auto recommend_mem_size_for_others = LongToSize(DoubleToLong(device_hbm_free_size_ * kReservedMemoryRatio));
  size_t reserved_mem_size_for_others;
  if (user_define_ms_size == 0) {
    ms_used_hbm_size_ = DoubleToLong(device_hbm_free_size_ * kMSMemoryRatio);
    // subtract the extra reserved 10mb after rounding down to the 2mb huge-page size
    ms_used_hbm_size_ = (ms_used_hbm_size_ / kPerHugePageMemorySize) * kPerHugePageMemorySize - kExtraReservedMemory;
    reserved_mem_size_for_others = device_hbm_free_size_ - LongToSize(ms_used_hbm_size_);
  } else {
    if (user_define_ms_size >= device_hbm_free_size_) {
      MS_LOG(EXCEPTION)
        << "#umsg#Framework Error Message:#umsg#The Free Device Memory Size is "
        << (SizeToFloat(device_hbm_free_size_) / kGBToByte)
        << " GB, variable_memory_max_size/max_device_memory should be in range (0-"
        << (SizeToFloat(device_hbm_free_size_) / kMBToByte) << "]MB, but got "
        << (SizeToFloat(user_define_ms_size) / kMBToByte)
        << "MB, please set the context key 'variable_memory_max_size'/'max_device_memory' in valid range.";
    }
    ms_used_hbm_size_ = SizeToLong(user_define_ms_size);

    reserved_mem_size_for_others = device_hbm_total_size_ - LongToSize(ms_used_hbm_size_);
    if (reserved_mem_size_for_others < recommend_mem_size_for_others) {
      MS_LOG(WARNING) << "Reserved memory size for other components(" << reserved_mem_size_for_others
                      << ") is less than the recommended size(" << recommend_mem_size_for_others
                      << "), it may lead to Out Of Memory in HCCL or other components, please double check context key "
                         "'variable_memory_max_size'/'max_device_memory'";
    }
  }

  if (AscendVmmAdapter::GetInstance().IsEnabled()) {
    ms_used_hbm_size_ = SizeToLong(AscendVmmAdapter::GetInstance().GetRoundDownAlignSize(ms_used_hbm_size_));
  } else if (AscendGmemAdapter::GetInstance().is_eager_free_enabled()) {
    ms_used_hbm_size_ = SizeToLong(AscendGmemAdapter::GetInstance().GetRoundDownAlignSize(ms_used_hbm_size_));
  } else {
    ms_used_hbm_size_ = SizeToLong(GetRoundDownAlignSize(ms_used_hbm_size_));
  }
  max_available_ms_hbm_size_ = ms_used_hbm_size_;
  MS_LOG(INFO) << "Device HBM Size:" << device_hbm_total_size_ / kMBToByte
               << "M, Device free HBM Size:" << device_hbm_free_size_ / kMBToByte
               << "M, Reserved HBM size for Other Components(HCCL/rts/etc.):"
               << reserved_mem_size_for_others / kMBToByte
               << "M, Recommended Reserved HBM size for Other Components:" << recommend_mem_size_for_others / kMBToByte
               << "M, User-defined MindSpore HBM Size:" << user_define_ms_size / kGBToByte
               << "G, MindSpore Used HBM Size:" << ms_used_hbm_size_ / kMBToByte << "M.";

  device_mem_base_addr_ = MallocFromRts(ms_used_hbm_size_);
  static_mem_offset_ = ms_used_hbm_size_;
  cur_dynamic_mem_offset_ = 0;
  max_dynamic_mem_offset_ = 0;
  history_max_dynamic_mem_offset_ = 0;
  MS_LOG(INFO) << "Ascend Memory Adapter initialize success, Memory Statistics:" << DevMemStatistics();
  initialized_ = true;
  return true;
}

bool AscendMemAdapter::DeInitialize() {
  if (!initialized_) {
    MS_LOG(INFO) << "DeInitialize Ascend Memory Adapter when it is not initialized";
    return false;
  }

  auto ret = FreeToRts(device_mem_base_addr_, ms_used_hbm_size_);
  if (ret) {
    std::ostringstream oss_buf;
    oss_buf << "Ascend Memory Adapter deinitialize success, statistics:" << DevMemStatistics();
    MS_LOG(INFO) << oss_buf.str();
    if (common::IsNeedProfileMemory() || common::IsNeedMemoryStatistic()) {
      MS_LOG(WARNING) << oss_buf.str();
    }
    if (common::IsEnableRuntimeConfig(common::kRuntimeMemoryStat) ||
        common::IsEnableRuntimeConfig(common::kRuntimeMemoryTrack)) {
      std::cout << "[MS_RUNTIME_PROF]" << oss_buf.str() << std::endl;
    }

    device_hbm_total_size_ = 0;
    device_hbm_free_size_ = 0;
    max_available_ms_hbm_size_ = 0;
    device_mem_base_addr_ = nullptr;
    ms_used_hbm_size_ = 0;

    cur_dynamic_mem_offset_ = 0;
    max_dynamic_mem_offset_ = 0;
    history_max_dynamic_mem_offset_ = 0;
    dynamic_memory_block_list_.clear();

    static_mem_offset_ = 0;
    static_memory_block_list_.clear();

    initialized_ = false;
  }

  return ret;
}

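// Static allocations are carved top-down from the end of the reserved block: static_mem_offset_ starts at
// ms_used_hbm_size_ and moves toward 0. A request is rejected when it would underflow the offset or collide
// with the highest dynamic offset seen so far (max_dynamic_mem_offset_). This path is not supported when
// the VMM backend is enabled, since no block is pre-reserved in that mode.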
uint8_t *AscendMemAdapter::MallocStaticDevMem(size_t size, const std::string &tag) {
  std::lock_guard<std::mutex> locker(mutex_);
  if (AscendVmmAdapter::GetInstance().IsEnabled()) {
    MS_LOG(ERROR) << "The device virtual memory doesn't support the O2 jit level, please set "
                     "MS_ALLOC_CONF=enable_vmm:False to disable the device virtual memory.";
    return nullptr;
  }
  size = GetRoundUpAlignSize(size);
  if (!common::IsNeedProfileMemory() &&
      (static_mem_offset_ < static_cast<int64_t>(size) ||
       (static_mem_offset_ - static_cast<int64_t>(size)) < max_dynamic_mem_offset_)) {
    MS_LOG(INFO) << DevMemDetailInfo();
    MS_LOG(EXCEPTION) << "#umsg#Framework Error Message:#umsg#Out of Memory!!! Request memory size: " << size
                      << "B, Memory Statistic:" << DevMemStatistics()
                      << "\nPlease try to reduce 'batch_size' or check whether there is an extra large shape. For "
                         "more details, please refer to 'Out of Memory' at https://www.mindspore.cn .";
  }
  int64_t new_static_offset = static_mem_offset_ - static_cast<int64_t>(size);
  auto memory_block_ptr = device_mem_base_addr_ + new_static_offset;
  static_mem_offset_ = new_static_offset;
  static_memory_block_list_.push_back(std::make_shared<MemoryBlock>(memory_block_ptr, size, tag));
  return memory_block_ptr;
}

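// Dynamic allocations grow upward from the base address. A request fails (throws) once the new dynamic
// offset would cross static_mem_offset_, i.e. when the bottom-up and top-down regions would collide.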
uint8_t *AscendMemAdapter::MallocDynamicDevMem(size_t size, const std::string &tag) {
  std::lock_guard<std::mutex> locker(mutex_);
  if (AscendVmmAdapter::GetInstance().IsEnabled()) {
    MS_LOG(EXCEPTION) << "VMM is enabled, can not allocate dynamic memory.";
  }
  size = GetRoundUpAlignSize(size);
  int64_t new_dynamic_offset = cur_dynamic_mem_offset_ + static_cast<int64_t>(size);
  if (!common::IsNeedProfileMemory() && new_dynamic_offset > static_mem_offset_) {
    MS_LOG(INFO) << DevMemDetailInfo();
    MS_LOG(EXCEPTION) << "#umsg#Framework Error Message:#umsg#Out of Memory!!! Request memory size: " << size
                      << "B, Memory Statistic:" << DevMemStatistics()
                      << "\nPlease try to reduce 'batch_size' or check whether there is an extra large shape. For "
                         "more details, please refer to 'Out of Memory' at https://www.mindspore.cn .";
  }

  auto memory_block_ptr = device_mem_base_addr_ + cur_dynamic_mem_offset_;
  cur_dynamic_mem_offset_ = new_dynamic_offset;
  max_dynamic_mem_offset_ = std::max(cur_dynamic_mem_offset_, max_dynamic_mem_offset_);
  history_max_dynamic_mem_offset_ = std::max(max_dynamic_mem_offset_, history_max_dynamic_mem_offset_);
  dynamic_memory_block_list_.push_back(std::make_shared<MemoryBlock>(memory_block_ptr, size, tag));

  return memory_block_ptr;
}

uint8_t *AscendMemAdapter::GetBaseAddr() const { return device_mem_base_addr_; }

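// Resets the dynamic (bottom-up) region so the next graph reuses it from offset 0. In memory-pool-recycle
// mode the high-water mark is reset as well; with VMM or GMEM eager-free enabled, the backend is also asked
// to clear or eager-free the backing memory.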
void AscendMemAdapter::ResetDynamicMemory() {
  cur_dynamic_mem_offset_ = 0;
  if (IsMemoryPoolRecycle()) {
    max_dynamic_mem_offset_ = 0;
  }
  if (AscendVmmAdapter::GetInstance().IsEnabled()) {
    AscendVmmAdapter::GetInstance().ClearAllMemory();
  } else if (AscendGmemAdapter::GetInstance().is_eager_free_enabled()) {
    AscendGmemAdapter::GetInstance().EagerFreeDeviceMem(device_mem_base_addr_, ms_used_hbm_size_);
  }
}

std::string AscendMemAdapter::DevMemStatistics() const {
  auto context = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(context);
  std::ostringstream oss;
  oss << "\nDevice HBM memory size: " << device_hbm_total_size_ / kMBToByte << "M";
  oss << "\nMindSpore Used memory size: " << ms_used_hbm_size_ / kMBToByte << "M";
  oss << "\nMindSpore memory base address: " << reinterpret_cast<void *>(device_mem_base_addr_);
  if (!context->IsKByKExecutorMode()) {
    oss << "\nTotal Static Memory size: " << (ms_used_hbm_size_ - static_mem_offset_) / kMBToByte << "M";
    oss << "\nTotal Dynamic memory size: " << history_max_dynamic_mem_offset_ / kMBToByte << "M";
  }
  if (IsMemoryPoolRecycle()) {
    size_t max_actual = std::max(actual_peak_memory_, (ms_used_hbm_size_ - static_mem_offset_));
    oss << "\nActual peak memory usage: " << max_actual / kMBToByte << "M";
  } else if (context->IsKByKExecutorMode()) {
    oss << "\nUsed peak memory usage (without fragments): " << used_peak_memory_ / kMBToByte << "M";
    oss << "\nActual peak memory usage (with fragments): " << actual_peak_memory_ / kMBToByte << "M";
  }
  if (!context->IsKByKExecutorMode()) {
    oss << "\nDynamic memory size of this graph: " << cur_dynamic_mem_offset_ / kMBToByte << "M";
  }
  oss << std::endl;
  return oss.str();
}

std::string AscendMemAdapter::DevMemDetailInfo() const {
  std::ostringstream oss;
  oss << "\nMemory Detail Info:";
  oss << "\nStatic Memory Blocks:";
  oss << "\nAddress \t Size \t tag \t";
  for (const auto &blk : static_memory_block_list_) {
    oss << "\n" << blk->mem_ptr << "\t" << blk->mem_size << "\t" << blk->mem_tag;
  }

  oss << "\nDynamic Memory Blocks:";
  oss << "\nAddress \t Size \t tag \t";
  for (const auto &blk : dynamic_memory_block_list_) {
    oss << "\n" << blk->mem_ptr << "\t" << blk->mem_size << "\t" << blk->mem_tag;
  }
  return oss.str();
}

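// Returns the user-configured backend memory size in bytes, or 0 if nothing was configured.
// 'max_device_memory' (a float, in GB) takes effect when it does not exceed the nominal capacity used here
// (64 GB for 910B/910C SoCs, otherwise 32 GB); larger values are treated as unset, and the legacy
// 'variable_memory_max_size' string is parsed instead, taking the GB count before the first '*'.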
size_t AscendMemAdapter::GetDeviceMemSizeFromContext() const {
  auto context = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(context);
  size_t size_from_context;
  auto max_device_memory = context->get_param<float>(MS_CTX_MAX_DEVICE_MEMORY);
  float total_device_memory = 32.0f;
  if (context->ascend_soc_version() == kAscendVersion910b || context->ascend_soc_version() == kAscendVersion910c) {
    total_device_memory = 64.0f;
  }
  if (max_device_memory <= total_device_memory) {
    MS_LOG(INFO) << "context max_device_memory:" << max_device_memory;
    size_from_context = FloatToSize(max_device_memory * kGBToByte);
  } else {
    auto variable_memory_max_size = context->get_param<std::string>(MS_CTX_VARIABLE_MEMORY_MAX_SIZE);
    if (variable_memory_max_size == "0") {
      return 0;
    }
    MS_LOG(INFO) << "context variable_memory_max_size:" << variable_memory_max_size;
    auto pos = variable_memory_max_size.find('*');
    if (pos == std::string::npos) {
      MS_LOG(EXCEPTION) << "Invalid variable_memory_max_size";
    }
    auto gb_str = variable_memory_max_size.substr(0, pos);
    auto gb_var = std::stoull(gb_str);
    MS_LOG(INFO) << "variable_memory_max_size(GB):" << gb_var;
    size_from_context = gb_var * kGBToByte;
  }

  return size_from_context;
}

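// Reserves the MindSpore-managed block from the runtime. Three paths:
//   - VMM enabled: no upfront reservation; returns nullptr and leaves allocation to the VMM adapter.
//   - GMEM eager-free enabled: the block is mmap-ed through AscendGmemAdapter.
//   - Otherwise: a single aclrtMalloc of the whole block with ACL_MEM_TYPE_HIGH_BAND_WIDTH.
// On ACL_ERROR_RT_MEMORY_ALLOCATION, the current free/total HBM is queried again to report a more
// meaningful out-of-memory message.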
uint8_t *AscendMemAdapter::MallocFromRts(size_t size) const {
  uint8_t *ptr = nullptr;
  if (AscendVmmAdapter::GetInstance().IsEnabled()) {
    return nullptr;
  }
  if (AscendGmemAdapter::GetInstance().is_eager_free_enabled()) {
    return AscendGmemAdapter::GetInstance().MmapMemory(size, reinterpret_cast<void *>(ptr));
  }

  auto ret = CALL_ASCEND_API(aclrtMalloc, reinterpret_cast<void **>(&ptr), size, ACL_MEM_TYPE_HIGH_BAND_WIDTH);
  if (ret != ACL_RT_SUCCESS) {
    if (ret == ACL_ERROR_RT_MEMORY_ALLOCATION) {
      auto context_ptr = MsContext::GetInstance();
      MS_EXCEPTION_IF_NULL(context_ptr);
      unsigned int device_id = context_ptr->get_param<uint32_t>(MS_CTX_DEVICE_ID);
      size_t free = 0;
      size_t total = 0;
      (void)CALL_ASCEND_API(aclrtGetMemInfo, ACL_HBM_MEM, &free, &total);
      MS_LOG(EXCEPTION) << "#umsg#Framework Error Message:#umsg#Malloc device memory failed, size[" << size << "], ret["
                        << ret << "], "
                        << "Device " << device_id << " Available HBM size:" << total << " free size:" << free
                        << " may be other processes occupying this card, check as: ps -ef|grep python";
    } else {
      MS_EXCEPTION(DeviceProcessError) << "rtMalloc mem size[" << size << "] fail, ret[" << ret << "]";
    }
  } else {
    MS_LOG(INFO) << "Call rtMalloc to allocate device memory Success, size: " << size
                 << " bytes, address start: " << reinterpret_cast<void *>(ptr)
                 << " end: " << reinterpret_cast<void *>(ptr + size);
  }
  return ptr;
}

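// Counterpart of MallocFromRts: munmap through the GMEM adapter when eager-free is enabled, otherwise
// aclrtFree. A null pointer (e.g. when VMM was used and nothing was reserved) is treated as success.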
bool AscendMemAdapter::FreeToRts(void *devPtr, const size_t size) const {
  if (devPtr != nullptr) {
    if (AscendGmemAdapter::GetInstance().is_eager_free_enabled()) {
      return AscendGmemAdapter::GetInstance().MunmapMemory(devPtr, size);
    }
    auto ret = CALL_ASCEND_API(aclrtFree, devPtr);
    if (ret != ACL_ERROR_NONE) {
      MS_LOG(ERROR) << "aclrtFree mem [" << devPtr << "] fail, ret[" << ret << "]";
      return false;
    }
  }
  return true;
}
}  // namespace ascend
}  // namespace device
}  // namespace mindspore