1 /** 2 * Copyright 2021 Huawei Technologies Co., Ltd 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #ifndef MINDSPORE_CCSRC_RUNTIME_FRAMEWORK_ACTOR_OUTPUT_ACTOR_H_ 18 #define MINDSPORE_CCSRC_RUNTIME_FRAMEWORK_ACTOR_OUTPUT_ACTOR_H_ 19 20 #include <vector> 21 #include <string> 22 #include <memory> 23 #include <utility> 24 #include <algorithm> 25 #include <map> 26 #include "utils/hash_map.h" 27 #include "runtime/graph_scheduler/control_node_parser.h" 28 #include "runtime/graph_scheduler/device_tensor_store.h" 29 #include "runtime/graph_scheduler/actor/actor_common.h" 30 #include "runtime/graph_scheduler/actor/abstract_actor.h" 31 #include "runtime/hardware/device_context.h" 32 #include "include/backend/anf_runtime_algorithm.h" 33 #include "include/common/utils/anfalgo.h" 34 #include "ir/tensor.h" 35 36 namespace mindspore { 37 namespace runtime { 38 using mindspore::device::DeviceContext; 39 using mindspore::session::KernelWithIndex; 40 using mindspore::tensor::TensorPtr; 41 42 // The output actor is used to receive the output result of actor which represents the graph output. 43 class OutputActor : public AbstractActor { 44 public: OutputActor(const std::string & name,size_t loop_count,size_t outputs_num,const std::vector<KernelWithIndex> & summary_nodes)45 OutputActor(const std::string &name, size_t loop_count, size_t outputs_num, 46 const std::vector<KernelWithIndex> &summary_nodes) 47 : AbstractActor(name, KernelTransformType::kOutputActor, nullptr), 48 loop_count_(loop_count), 49 current_count_(0), 50 summary_nodes_(summary_nodes), 51 outputs_num_(outputs_num), 52 current_outputs_num_(0) { 53 outputs_.resize(outputs_num); 54 output_nodes_.resize(outputs_num); 55 output_device_tensors_.resize(outputs_num); 56 device_contexts_.resize(outputs_num); 57 } 58 ~OutputActor() override = default; 59 60 // The output actor collects loop count when receive the input control of loop count actor. 61 void RunOpControl(AID *const input_control, OpContext<DeviceTensor> *const context) override; 62 63 // The output actor collects output result when receive the data of actor. 64 void RunOpData(OpData<DeviceTensor> *const input_data, OpContext<DeviceTensor> *const context) override; 65 66 // The graph output need be set new device address every step or loop, to avoid that the device address 67 // context of tensor be rewritten in the next step or next loop. 68 void UpdateOutputDeviceAddress(); 69 70 // Summary node will keep the inputs, so if the input(except parameter, weight) size changes in dynamic shape, 71 // the input device address will be reuse in a wrong way. So we should free summary node inputs after usage. 72 void FreeSummaryNodeMem(); 73 74 // Get the member. loop_count()75 size_t loop_count() const { return loop_count_; } outputs_num()76 size_t outputs_num() const { return outputs_num_; } outputs()77 const std::vector<TensorPtr> &outputs() const { return outputs_; } 78 79 protected: 80 void Init() override; 81 82 private: 83 friend class GraphScheduler; 84 friend class ControlNodeScheduler; 85 86 TensorPtr CreateOutputTensor(const AnfNodePtr &output_node, size_t output_index, size_t output_position, 87 OpContext<DeviceTensor> *const context); 88 89 // The output device memory will be taken over by tensor in the last loop, otherwise needs to free the memory. 90 // 1.Avoid the memory leak when memory used by dynamic ref count in the control flow scene. 91 // 2.Alloc the new memory in the next step using the new shape size in the dynamic shape scene. 92 void FreeOutputNodeMem(); 93 94 // Clear output nodes and tensors in cache. 95 void ClearOutputCache(); 96 97 // The loop count is constant, the current count is increased after each step running finished. 98 // Collect the output result in the last loop which is represented by "loop_count_ - current_count_ == 1". 99 size_t loop_count_; 100 size_t current_count_; 101 102 // The outputs. 103 std::vector<KernelWithIndex> summary_nodes_; 104 std::vector<TensorPtr> outputs_; 105 std::vector<KernelWithIndex> output_nodes_; 106 std::vector<DeviceTensor *> output_device_tensors_; 107 size_t outputs_num_; 108 size_t current_outputs_num_; 109 110 std::map<KernelWithIndex, DeviceTensorPtr> output_node_to_tensor_device_address_; 111 }; 112 113 using OutputActorPtr = std::shared_ptr<OutputActor>; 114 } // namespace runtime 115 } // namespace mindspore 116 117 #endif // MINDSPORE_CCSRC_RUNTIME_FRAMEWORK_ACTOR_OUTPUT_ACTOR_H_ 118