• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright 2021 Huawei Technologies Co., Ltd
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "plugin/device/ascend/hal/device/ascend_memory_adapter.h"
18 
#include <algorithm>
#include <exception>
#include <set>
#include "ir/func_graph.h"
#include "utils/ms_context.h"
#include "utils/convert_utils_base.h"
#include "plugin/device/ascend/hal/common/ascend_utils.h"
#include "plugin/device/ascend/hal/device/ascend_gmem_adapter.h"
#include "plugin/device/ascend/hal/device/ascend_vmm_adapter.h"
#include "transform/symbol/acl_rt_symbol.h"
#include "transform/symbol/symbol_utils.h"
29 
30 namespace mindspore {
31 namespace device {
32 namespace ascend {
33 
// Alignment, in bytes, applied by GetRoundDownAlignSize()/GetRoundUpAlignSize().
constexpr uint64_t kAscendMemAlignSize = 512;
// Share of the free HBM taken by MindSpore when the user configured no limit.
constexpr double kMSMemoryRatio = 15.0 / 16.0;
// Share of the free HBM recommended to be left for other components.
constexpr double kReservedMemoryRatio = 1.0 / 16.0;
// The default MindSpore budget is rounded down to a multiple of this size (2mb).
constexpr size_t kPerHugePageMemorySize = 2 * 1024 * 1024;
// Extra bytes subtracted from the default MindSpore budget after that rounding (10mb).
constexpr size_t kExtraReservedMemory = 10 * 1024 * 1024;
// Initialize() fails when the free HBM is below this share of the total HBM.
constexpr double kHalfRatio = 1.0 / 2.0;
constexpr uint64_t kOverflowAddrSize = 512;
constexpr char kGlobalOverflowWorkspace[] = "GLOBAL_OVERFLOW_WORKSPACE";
42 
GetRoundDownAlignSize(size_t input_size)43 size_t AscendMemAdapter::GetRoundDownAlignSize(size_t input_size) {
44   return (input_size / kAscendMemAlignSize) * kAscendMemAlignSize;
45 }
46 
GetRoundUpAlignSize(size_t input_size)47 size_t AscendMemAdapter::GetRoundUpAlignSize(size_t input_size) {
48   return ((input_size + kAscendMemAlignSize - 1) / kAscendMemAlignSize) * kAscendMemAlignSize;
49 }
50 
Initialize()51 bool AscendMemAdapter::Initialize() {
52   if (initialized_) {
53     return true;
54   }
55 
56   auto ret = CALL_ASCEND_API(aclrtGetMemInfo, ACL_HBM_MEM, &device_hbm_free_size_, &device_hbm_total_size_);
57   if (ret != ACL_ERROR_NONE || device_hbm_total_size_ == 0) {
58     MS_LOG(EXCEPTION) << "Internal Error: Get Device HBM memory size failed, ret = " << ret
59                       << ", total HBM size :" << device_hbm_total_size_;
60   }
61 
62   if (device_hbm_free_size_ < LongToSize(DoubleToLong(device_hbm_total_size_ * kHalfRatio))) {
63     auto context_ptr = MsContext::GetInstance();
64     MS_EXCEPTION_IF_NULL(context_ptr);
65     unsigned int device_id = context_ptr->get_param<uint32_t>(MS_CTX_DEVICE_ID);
66     MS_LOG(EXCEPTION) << "#umsg#Framework Error Message:#umsg#Malloc device memory failed, free memory size is less "
67                          "than half of total memory size."
68                       << "Device " << device_id << " Device HBM total size:" << device_hbm_total_size_
69                       << " Device HBM free size:" << device_hbm_free_size_
70                       << " may be other processes occupying this card, check as: ps -ef|grep python";
71   }
72 
73   // get user define max backend memory
74   auto user_define_ms_size = GetDeviceMemSizeFromContext();
75   auto recommend_mem_size_for_others = LongToSize(DoubleToLong(device_hbm_free_size_ * kReservedMemoryRatio));
76   size_t reserved_mem_size_for_others;
77   if (user_define_ms_size == 0) {
78     ms_used_hbm_size_ = DoubleToLong(device_hbm_free_size_ * kMSMemoryRatio);
79     // sub the extra reserved 10mb after rounding down the 2mb
80     ms_used_hbm_size_ = (ms_used_hbm_size_ / kPerHugePageMemorySize) * kPerHugePageMemorySize - kExtraReservedMemory;
81     reserved_mem_size_for_others = device_hbm_free_size_ - SizeToLong(ms_used_hbm_size_);
82   } else {
83     if (user_define_ms_size >= device_hbm_free_size_) {
84       MS_LOG(EXCEPTION)
85         << "#umsg#Framework Error Message:#umsg#The Free Device Memory Size is "
86         << (SizeToFloat(device_hbm_free_size_) / kGBToByte)
87         << " GB, variable_memory_max_size/max_device_memory should be in range (0-"
88         << (SizeToFloat(device_hbm_free_size_) / kMBToByte) << "]MB, but got "
89         << (SizeToFloat(user_define_ms_size) / kMBToByte)
90         << "MB, please set the context key 'variable_memory_max_size'/'max_device_memory' in valid range.";
91     }
92     ms_used_hbm_size_ = SizeToLong(user_define_ms_size);
93 
94     reserved_mem_size_for_others = device_hbm_total_size_ - LongToSize(ms_used_hbm_size_);
95     if (reserved_mem_size_for_others < recommend_mem_size_for_others) {
96       MS_LOG(WARNING) << "Reserved memory size for other components(" << reserved_mem_size_for_others
97                       << ") is less than recommend size(" << recommend_mem_size_for_others
98                       << "), It may lead to Out Of Memory in HCCL or other components, Please double check context key "
99                          "'variable_memory_max_size'/'max_device_memory'";
100     }
101   }
102 
103   if (AscendVmmAdapter::GetInstance().IsEnabled()) {
104     ms_used_hbm_size_ = SizeToLong(AscendVmmAdapter::GetInstance().GetRoundDownAlignSize(ms_used_hbm_size_));
105   } else if (AscendGmemAdapter::GetInstance().is_eager_free_enabled()) {
106     ms_used_hbm_size_ = SizeToLong(AscendGmemAdapter::GetInstance().GetRoundDownAlignSize(ms_used_hbm_size_));
107   } else {
108     ms_used_hbm_size_ = SizeToLong(GetRoundDownAlignSize(ms_used_hbm_size_));
109   }
110   max_available_ms_hbm_size_ = ms_used_hbm_size_;
111   MS_LOG(INFO) << "Device HBM Size:" << device_hbm_total_size_ / kMBToByte
112                << "M, Device free HBM Size:" << device_hbm_free_size_ / kMBToByte
113                << "M, Reserved HBM size for Other Components(HCCL/rts/etc.):"
114                << reserved_mem_size_for_others / kMBToByte
115                << "M, Recommend Reserved HBM size for Other Components:" << recommend_mem_size_for_others / kMBToByte
116                << "M, User define MindSpore HBM Size:" << user_define_ms_size / kGBToByte
117                << "G, MindSpore Used HBM Size:" << ms_used_hbm_size_ / kMBToByte << "M.";
118 
119   device_mem_base_addr_ = MallocFromRts(ms_used_hbm_size_);
120   static_mem_offset_ = ms_used_hbm_size_;
121   cur_dynamic_mem_offset_ = 0;
122   max_dynamic_mem_offset_ = 0;
123   history_max_dynamic_mem_offset_ = 0;
124   MS_LOG(INFO) << "Ascend Memory Adapter initialize success, Memory Statistics:" << DevMemStatistics();
125   initialized_ = true;
126   return true;
127 }
128 
DeInitialize()129 bool AscendMemAdapter::DeInitialize() {
130   if (!initialized_) {
131     MS_LOG(INFO) << "DeInitialize Ascend Memory Adapter when it is not initialize";
132     return false;
133   }
134 
135   auto ret = FreeToRts(device_mem_base_addr_, ms_used_hbm_size_);
136   if (ret) {
137     std::ostringstream oss_buf;
138     oss_buf << "Ascend Memory Adapter deinitialize success, statistics:" << DevMemStatistics();
139     MS_LOG(INFO) << oss_buf.str();
140     if (common::IsNeedProfileMemory() || common::IsNeedMemoryStatistic()) {
141       MS_LOG(WARNING) << oss_buf.str();
142     }
143     if (common::IsEnableRuntimeConfig(common::kRuntimeMemoryStat) ||
144         common::IsEnableRuntimeConfig(common::kRuntimeMemoryTrack)) {
145       std::cout << "[MS_RUNTIME_PROF]" << oss_buf.str() << std::endl;
146     }
147 
148     device_hbm_total_size_ = 0;
149     device_hbm_free_size_ = 0;
150     max_available_ms_hbm_size_ = 0;
151     device_mem_base_addr_ = nullptr;
152     ms_used_hbm_size_ = 0;
153 
154     cur_dynamic_mem_offset_ = 0;
155     max_dynamic_mem_offset_ = 0;
156     history_max_dynamic_mem_offset_ = 0;
157     dynamic_memory_block_list_.clear();
158 
159     static_mem_offset_ = 0;
160     static_memory_block_list_.clear();
161 
162     initialized_ = false;
163   }
164 
165   return ret;
166 }
167 
MallocStaticDevMem(size_t size,const std::string & tag)168 uint8_t *AscendMemAdapter::MallocStaticDevMem(size_t size, const std::string &tag) {
169   std::lock_guard<std::mutex> locker(mutex_);
170   if (AscendVmmAdapter::GetInstance().IsEnabled()) {
171     MS_LOG(ERROR) << "The device virtual memory doesn't support the O2 jit level, please set "
172                      "MS_ALLOC_CONF=enable_vmm:False to disable the device virtual memory.";
173     return nullptr;
174   }
175   size = GetRoundUpAlignSize(size);
176   if (!common::IsNeedProfileMemory() && (static_mem_offset_ < static_cast<int64_t>(size) ||
177                                          (static_mem_offset_ - static_cast<int64_t>(size)) < max_dynamic_mem_offset_)) {
178     MS_LOG(INFO) << DevMemDetailInfo();
179     MS_LOG(EXCEPTION) << "#umsg#Framework Error Message:#umsg#Out of Memory!!! Request memory size: " << size
180                       << "B, Memory Statistic:" << DevMemStatistics()
181                       << "\nPlease try to reduce 'batch_size' or check whether exists extra large shape. For more "
182                          "details, please refer to 'Out of Memory' at https://www.mindspore.cn .";
183   }
184   int64_t new_static_offset = static_mem_offset_ - static_cast<int64_t>(size);
185   auto memory_block_ptr = device_mem_base_addr_ + new_static_offset;
186   static_mem_offset_ = new_static_offset;
187   static_memory_block_list_.push_back(std::make_shared<MemoryBlock>(memory_block_ptr, size, tag));
188   return memory_block_ptr;
189 }
190 
MallocDynamicDevMem(size_t size,const std::string & tag)191 uint8_t *AscendMemAdapter::MallocDynamicDevMem(size_t size, const std::string &tag) {
192   std::lock_guard<std::mutex> locker(mutex_);
193   if (AscendVmmAdapter::GetInstance().IsEnabled()) {
194     MS_LOG(EXCEPTION) << "VMM is enabled, can not allocate dynamic memory.";
195   }
196   size = GetRoundUpAlignSize(size);
197   int64_t new_dynamic_offset = cur_dynamic_mem_offset_ + static_cast<int64_t>(size);
198   if (!common::IsNeedProfileMemory() && new_dynamic_offset > static_mem_offset_) {
199     MS_LOG(INFO) << DevMemDetailInfo();
200     MS_LOG(EXCEPTION) << "#umsg#Framework Error Message:#umsg#Out of Memory!!! Request memory size: " << size
201                       << "B, Memory Statistic:" << DevMemStatistics()
202                       << "\nPlease try to reduce 'batch_size' or check whether exists extra large shape. For more "
203                          "details, please refer to 'Out of Memory' at https://www.mindspore.cn .";
204   }
205 
206   auto memory_block_ptr = device_mem_base_addr_ + cur_dynamic_mem_offset_;
207   cur_dynamic_mem_offset_ = new_dynamic_offset;
208   max_dynamic_mem_offset_ = std::max(cur_dynamic_mem_offset_, max_dynamic_mem_offset_);
209   history_max_dynamic_mem_offset_ = std::max(max_dynamic_mem_offset_, history_max_dynamic_mem_offset_);
210   dynamic_memory_block_list_.push_back(std::make_shared<MemoryBlock>(memory_block_ptr, size, tag));
211 
212   return memory_block_ptr;
213 }
214 
GetBaseAddr() const215 uint8_t *AscendMemAdapter::GetBaseAddr() const { return device_mem_base_addr_; }
216 
ResetDynamicMemory()217 void AscendMemAdapter::ResetDynamicMemory() {
218   cur_dynamic_mem_offset_ = 0;
219   if (IsMemoryPoolRecycle()) {
220     max_dynamic_mem_offset_ = 0;
221   }
222   if (AscendVmmAdapter::GetInstance().IsEnabled()) {
223     AscendVmmAdapter::GetInstance().ClearAllMemory();
224   } else if (AscendGmemAdapter::GetInstance().is_eager_free_enabled()) {
225     AscendGmemAdapter::GetInstance().EagerFreeDeviceMem(device_mem_base_addr_, ms_used_hbm_size_);
226   }
227 }
228 
DevMemStatistics() const229 std::string AscendMemAdapter::DevMemStatistics() const {
230   auto context = MsContext::GetInstance();
231   MS_EXCEPTION_IF_NULL(context);
232   std::ostringstream oss;
233   oss << "\nDevice HBM memory size: " << device_hbm_total_size_ / kMBToByte << "M";
234   oss << "\nMindSpore Used memory size: " << ms_used_hbm_size_ / kMBToByte << "M";
235   oss << "\nMindSpore memory base address: " << reinterpret_cast<void *>(device_mem_base_addr_);
236   if (!context->IsKByKExecutorMode()) {
237     oss << "\nTotal Static Memory size: " << (ms_used_hbm_size_ - static_mem_offset_) / kMBToByte << "M";
238     oss << "\nTotal Dynamic memory size: " << history_max_dynamic_mem_offset_ / kMBToByte << "M";
239   }
240   if (IsMemoryPoolRecycle()) {
241     size_t max_actual = std::max(actual_peak_memory_, (ms_used_hbm_size_ - static_mem_offset_));
242     oss << "\nActual peak memory usage: " << max_actual / kMBToByte << "M";
243   } else if (context->IsKByKExecutorMode()) {
244     oss << "\nUsed peak memory usage (without fragments): " << used_peak_memory_ / kMBToByte << "M";
245     oss << "\nActual peak memory usage (with fragments): " << actual_peak_memory_ / kMBToByte << "M";
246   }
247   if (!context->IsKByKExecutorMode()) {
248     oss << "\nDynamic memory size of this graph: " << cur_dynamic_mem_offset_ / kMBToByte << "M";
249   }
250   oss << std::endl;
251   return oss.str();
252 }
253 
DevMemDetailInfo() const254 std::string AscendMemAdapter::DevMemDetailInfo() const {
255   std::ostringstream oss;
256   oss << "\nMemory Detail Info:";
257   oss << "\nStatic Memory Blocks:";
258   oss << "\nAddress \t Size \t tag \t";
259   for (const auto &blk : static_memory_block_list_) {
260     oss << "\n" << blk->mem_ptr << "\t" << blk->mem_size << "\t" << blk->mem_tag;
261   }
262 
263   oss << "\nDynamic Memory Blocks:";
264   oss << "\nAddress \t Size \t tag \t";
265   for (const auto &blk : dynamic_memory_block_list_) {
266     oss << "\n" << blk->mem_ptr << "\t" << blk->mem_size << "\t" << blk->mem_tag;
267   }
268   return oss.str();
269 }
270 
GetDeviceMemSizeFromContext() const271 size_t AscendMemAdapter::GetDeviceMemSizeFromContext() const {
272   auto context = MsContext::GetInstance();
273   MS_EXCEPTION_IF_NULL(context);
274   size_t size_from_context;
275   auto max_device_memory = context->get_param<float>(MS_CTX_MAX_DEVICE_MEMORY);
276   float total_device_memory = 32.0f;
277   if (context->ascend_soc_version() == kAscendVersion910b || context->ascend_soc_version() == kAscendVersion910c) {
278     total_device_memory = 64.0f;
279   }
280   if (max_device_memory <= total_device_memory) {
281     MS_LOG(INFO) << "context max_device_memory:" << max_device_memory;
282     size_from_context = FloatToSize(max_device_memory * kGBToByte);
283   } else {
284     auto variable_memory_max_size = context->get_param<std::string>(MS_CTX_VARIABLE_MEMORY_MAX_SIZE);
285     if (variable_memory_max_size == "0") {
286       return 0;
287     }
288     MS_LOG(INFO) << "context variable_memory_max_size:" << variable_memory_max_size;
289     auto pos = variable_memory_max_size.find('*');
290     if (pos == std::string::npos) {
291       MS_LOG(EXCEPTION) << "Invalid variable_memory_max_size";
292     }
293     auto gb_str = variable_memory_max_size.substr(0, pos);
294     auto gb_var = std::stoull(gb_str);
295     MS_LOG(INFO) << "variable_memory_max_size(GB):" << gb_var;
296     size_from_context = gb_var * kGBToByte;
297   }
298 
299   return size_from_context;
300 }
301 
MallocFromRts(size_t size) const302 uint8_t *AscendMemAdapter::MallocFromRts(size_t size) const {
303   uint8_t *ptr = nullptr;
304   if (AscendVmmAdapter::GetInstance().IsEnabled()) {
305     return nullptr;
306   }
307   if (AscendGmemAdapter::GetInstance().is_eager_free_enabled()) {
308     return AscendGmemAdapter::GetInstance().MmapMemory(size, reinterpret_cast<void *>(ptr));
309   }
310 
311   auto ret = CALL_ASCEND_API(aclrtMalloc, reinterpret_cast<void **>(&ptr), size, ACL_MEM_TYPE_HIGH_BAND_WIDTH);
312   if (ret != ACL_RT_SUCCESS) {
313     if (ret == ACL_ERROR_RT_MEMORY_ALLOCATION) {
314       auto context_ptr = MsContext::GetInstance();
315       MS_EXCEPTION_IF_NULL(context_ptr);
316       unsigned int device_id = context_ptr->get_param<uint32_t>(MS_CTX_DEVICE_ID);
317       size_t free = 0;
318       size_t total = 0;
319       (void)CALL_ASCEND_API(aclrtGetMemInfo, ACL_HBM_MEM, &free, &total);
320       MS_LOG(EXCEPTION) << "#umsg#Framework Error Message:#umsg#Malloc device memory failed, size[" << size << "], ret["
321                         << ret << "], "
322                         << "Device " << device_id << " Available HBM size:" << total << " free size:" << free
323                         << " may be other processes occupying this card, check as: ps -ef|grep python";
324     } else {
325       MS_EXCEPTION(DeviceProcessError) << "rtMalloc mem size[" << size << "] fail, ret[" << ret << "]";
326     }
327   } else {
328     MS_LOG(INFO) << "Call rtMalloc to allocate device memory Success, size: " << size
329                  << " bytes, address start: " << reinterpret_cast<void *>(ptr)
330                  << " end: " << reinterpret_cast<void *>(ptr + size);
331   }
332   return ptr;
333 }
334 
FreeToRts(void * devPtr,const size_t size) const335 bool AscendMemAdapter::FreeToRts(void *devPtr, const size_t size) const {
336   if (devPtr != nullptr) {
337     if (AscendGmemAdapter::GetInstance().is_eager_free_enabled()) {
338       return AscendGmemAdapter::GetInstance().MunmapMemory(devPtr, size);
339     }
340     auto ret = CALL_ASCEND_API(aclrtFree, devPtr);
341     if (ret != ACL_ERROR_NONE) {
342       MS_LOG(ERROR) << "aclrtFree mem [" << devPtr << "] fail, ret[" << ret << "]";
343       return false;
344     }
345   }
346   return true;
347 }
348 }  // namespace ascend
349 }  // namespace device
350 }  // namespace mindspore
351