OpenHarmony-v3.2.1-Release/s

/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "runtime/device/bucket.h"

#include "runtime/device/kernel_runtime_manager.h"
#include "frontend/parallel/context.h"
#include "utils/profile.h"

namespace mindspore::device {
void Bucket::AddGradTensor(const tensor::TensorPtr &tensor) {
  if (grad_tensor_list_.size() >= bucket_size_) {
    MS_LOG(EXCEPTION) << "bucket is full";
  }
  grad_tensor_list_.emplace_back(tensor);
  if (grad_tensor_list_.size() > bucket_size_) {
    MS_LOG(EXCEPTION) << "too many tensor add to the bucket, bucket_size_:" << bucket_size_
                      << " total tensor size:" << grad_tensor_list_.size();
  }
  MS_LOG(INFO) << "current bucket tensors size:" << grad_tensor_list_.size();
  // bucket is full, start to launch allreduce
  if (grad_tensor_list_.size() == bucket_size_) {
    full_ = true;
  }
}

void Bucket::Launch() {
  auto start = GetTime();
  if (grad_tensor_list_.size() != bucket_size_) {
    MS_LOG(EXCEPTION) << "Bucket is not full, grad_tensor_list_ size:" << grad_tensor_list_.size()
                      << " bucket_size_:" << bucket_size_;
  }
  MS_LOG(INFO) << "Bucket is full, start to launch AllReduce";
  MS_EXCEPTION_IF_NULL(pre_event_);
  MS_EXCEPTION_IF_NULL(post_event_);
  AllocateAllReduceAddr();
  CopyTensorToContiguousMemory();
  pre_event_->RecordEvent();
  pre_event_->WaitEvent();
  LaunchAllReduce();
  // mul fusion
  CalculateMean();
  post_event_->RecordEvent();
  UpdateTensorAddr();
  // pass event to the tensor
  for (auto &tensor : grad_tensor_list_) {
    MS_EXCEPTION_IF_NULL(tensor);
    tensor->SetDeviceEvent(post_event_);
  }
  MS_LOG(INFO) << "Bucket launch cost:" << (GetTime() - start) * 1e6 << " us";
}

void Bucket::UpdateTensorAddr() {
  if (grad_tensor_list_.size() != bucket_size_ || new_tensor_output_addrs_.size() != bucket_size_) {
    MS_LOG(EXCEPTION) << "grad_tensor_list size:" << grad_tensor_list_.size()
                      << " tensor output addr size:" << new_tensor_output_addrs_.size()
                      << " bucket size:" << bucket_size_;
  }

  for (size_t i = 0; i < bucket_size_; ++i) {
    auto &tensor = grad_tensor_list_[i];
    MS_EXCEPTION_IF_NULL(tensor);
    auto device_address = std::dynamic_pointer_cast<DeviceAddress>(tensor->device_address());
    // release old addr and manage addr by this Bucket.
    MS_EXCEPTION_IF_NULL(device_address);
    auto origin_dev_ptr = device_address->GetMutablePtr();
    tensor_old_addr_list_.emplace_back(origin_dev_ptr);
    device_address->from_mem_pool_ = false;
    device_address->set_ptr(new_tensor_output_addrs_[i]);
  }
}

void Bucket::CalculateMean() {
  auto parallel_context = parallel::ParallelContext::GetInstance();
  MS_EXCEPTION_IF_NULL(parallel_context);
  auto grad_mean = parallel_context->gradients_mean();
  if (!grad_mean) {
    UpdateTensorOutputAddr(ar_output_addr_);
    return;
  }
  if (launch_mul_ == nullptr) {
    launch_mul_ = CreateLaunchMul();
    MS_EXCEPTION_IF_NULL(launch_mul_);
  }
  // set mul input1 addr
  launch_mul_->SetInputAddr(ar_output_addr_);
  // launch mean
  launch_mul_->LaunchOpKernel();
  // store tensor output addr
  auto launch_output = launch_mul_->GetKernelOutputAddr();
  if (launch_output.size() != 1) {
    MS_LOG(EXCEPTION) << "launch mul outputs should have one output";
  }
  UpdateTensorOutputAddr(launch_output[0]);
}

void Bucket::UpdateTensorOutputAddr(uint8_t *addr) {
  uint8_t *tensor_output = addr;
  for (size_t i = 0; i < bucket_size_; ++i) {
    (void)new_tensor_output_addrs_.emplace_back(tensor_output);
    tensor_output += align_size_list_[i];
  }
}

void Bucket::LazyDeleteOldAddr() {
  MS_LOG(INFO) << "Lazy delete old grad address";
  for (auto old_addr : tensor_old_addr_list_) {
    FreeDeviceMem(old_addr);
  }
  tensor_old_addr_list_.clear();
}

void Bucket::Release() {
  MS_LOG(INFO) << "Clear bucket:" << id_;
  grad_tensor_list_.clear();
  align_size_list_.clear();
  new_tensor_output_addrs_.clear();
  memcpy_input_addrs_.clear();
  memcpy_output_addrs_.clear();
  tensor_type_list_.clear();
  LazyDeleteOldAddr();
  FreeAllDeviceMem();
  full_ = false;
}
}  // namespace mindspore::device