1 /** 2 * Copyright 2020-2021 Huawei Technologies Co., Ltd 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #ifndef MINDSPORE_CCSRC_PROFILER_DEVICE_GPU_GPU_PROFILING_H 18 #define MINDSPORE_CCSRC_PROFILER_DEVICE_GPU_GPU_PROFILING_H 19 #include <cuda.h> 20 #include <cupti.h> 21 #include <algorithm> 22 #include <cstdio> 23 #include <map> 24 #include <memory> 25 #include <mutex> 26 #include <string> 27 #include <unordered_map> 28 #include <utility> 29 #include <vector> 30 #include "profiler/device/profiling.h" 31 #include "profiler/device/gpu/gpu_profiling_utils.h" 32 33 namespace mindspore { 34 namespace profiler { 35 namespace gpu { 36 enum class CUPTIApiType { kCallback = 0, kActivity = 1 }; 37 enum class ActivityType { 38 kKernel = 0, 39 kMemcpyH2D = 1, 40 kMemcpyD2H = 2, 41 kMemcpyH2A = 3, 42 kMemcpyA2H = 4, 43 kMemcpyA2D = 5, 44 kMemcpyD2A = 6, 45 kMemcpyD2D = 7, 46 kMemcpyP2P = 8, 47 kMemcpyH2H = 9, 48 kMemset = 10, 49 kMemcpyUnknown = 11 50 }; 51 52 struct MemcpyInfo { 53 size_t bytes; 54 unsigned char src_kind; 55 unsigned char dst_kind; 56 }; 57 58 struct KernelInfo { 59 uint64_t registers_per_thread; 60 uint64_t static_shared_memory; 61 uint64_t dynamic_shared_memory; 62 uint64_t block_x; 63 uint64_t block_y; 64 uint64_t block_z; 65 uint64_t grid_x; 66 uint64_t grid_y; 67 uint64_t grid_z; 68 }; 69 70 struct Event { 71 std::string kernel_name; 72 std::string kernel_type; 73 CUPTIApiType api_type; 74 ActivityType activity_type; 75 uint64_t start_time_stamp; 76 uint64_t end_time_stamp; 77 std::string op_name; 78 uint32_t device_id; 79 uint32_t correlation_id; 80 uint32_t thread_id; 81 uint32_t context_id; 82 uint32_t stream_id; 83 CUpti_CallbackId cb_id; 84 union { 85 MemcpyInfo memcpy_info; 86 KernelInfo kernel_info; 87 }; 88 }; 89 90 struct BaseTime { 91 // nanosecond 92 uint64_t host_start_time = 0l; 93 uint64_t host_start_monotonic_raw_time = 0l; 94 uint64_t gpu_start_time = 0l; 95 }; 96 97 const float kTimeUnit = 1000; 98 99 class ProfilingOp { 100 public: 101 ProfilingOp() = default; 102 virtual ~ProfilingOp() = default; 103 virtual void SaveProfilingData() = 0; 104 virtual void Init() = 0; Name()105 std::string Name() const { return op_name_; } 106 107 protected: 108 std::string op_name_; 109 }; 110 111 class GPUProfiler : public Profiler { 112 public: 113 static std::shared_ptr<GPUProfiler> &GetInstance(); 114 GPUProfiler() = default; ~GPUProfiler()115 ~GPUProfiler() { StopCUPTI(); } 116 GPUProfiler(const GPUProfiler &) = delete; 117 GPUProfiler &operator=(const GPUProfiler &) = delete; 118 119 void Init(const std::string &profileDataPath) override; 120 void Stop() override; 121 void StopCUPTI(); 122 void StepProfilingEnable(const bool enable_flag) override; 123 void SyncEnable(const bool enable_flag); GetEnableFlag()124 bool GetEnableFlag() const { return enable_flag_; } GetSyncEnableFlag()125 bool GetSyncEnableFlag() const { return sync_enable_flag_; } 126 void EventHandleProcess(CUpti_CallbackId cbid, const CUpti_CallbackData *cbdata, const std::string &typestring, 127 uint64_t startTimestamp, uint64_t endTimestamp); 128 void CUPTIAPI AllocBuffer(uint8_t **buffer, size_t *size, size_t *maxNumRecords); 129 void CUPTIAPI ProcessBuffer(CUcontext ctx, uint32_t streamId, uint8_t *buffer, size_t size, size_t validSize); 130 void OpDataProducerBegin(const std::string op_name, void *stream); 131 void OpDataProducerEnd() override; 132 void ProcessEvents(); 133 void RegisterProfilingOp(std::shared_ptr<ProfilingOp> node); 134 void SetStepTraceOpName(ProfilingTraceInfo trace_op_name); ProfileDataPath()135 std::string ProfileDataPath() const { return profile_data_path_; } 136 137 private: 138 void SingleOpLaunchTimeProcess(float op_time_elapsed); 139 void OpsParser(); 140 void EventLog(const Event &event); 141 void ClearInst() override; 142 void HandleActivityRecord(CUpti_Activity *record); 143 void AddEvent(Event &&event); 144 void SetRunTimeData(const std::string &op_name, void *stream); 145 void FixOpNameByCorrelationId(Event *event); 146 147 static std::shared_ptr<GPUProfiler> profiler_inst_; 148 bool enable_flag_ = false; 149 bool sync_enable_flag_ = true; 150 std::unordered_map<uint32_t, std::string> op_name_map_; 151 std::vector<Event> events_; 152 BaseTime base_time_; 153 std::string op_name_; 154 void *stream_; 155 void SaveProfileData() override; 156 void SaveExtraProfileData(); 157 std::mutex event_mutex_; 158 159 std::vector<CUpti_ActivityKind> activities_enable_; 160 161 uint64_t cupti_callback_events_count_ = 0l; 162 uint64_t cupti_callback_events_drop_count_ = 0l; 163 uint64_t max_cupti_callback_events_ = 2 * 1024 * 10000; 164 165 uint64_t cupti_activity_events_count_ = 0l; 166 uint64_t cupti_activity_events_drop_count_ = 0l; 167 uint64_t max_cupti_activity_events_ = 2 * 1024 * 10000; 168 169 CUpti_SubscriberHandle subscriber_ = nullptr; 170 cudaEvent_t op_event_start_; 171 cudaEvent_t op_event_stop_; 172 uint64_t op_host_time_start_; 173 uint64_t op_host_time_stop_; 174 uint64_t op_cupti_time_start_; 175 std::string profile_data_path_; 176 std::map<std::string, std::shared_ptr<ProfilingOp>> profiling_op_; 177 ProfilingTraceInfo step_trace_op_name_; 178 }; 179 } // namespace gpu 180 } // namespace profiler 181 } // namespace mindspore 182 183 #endif // MINDSPORE_CCSRC_PROFILER_DEVICE_GPU_PROFILING_H 184