1 /* Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 3 Licensed under the Apache License, Version 2.0 (the "License"); 4 you may not use this file except in compliance with the License. 5 You may obtain a copy of the License at 6 7 http://www.apache.org/licenses/LICENSE-2.0 8 9 Unless required by applicable law or agreed to in writing, software 10 distributed under the License is distributed on an "AS IS" BASIS, 11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 See the License for the specific language governing permissions and 13 limitations under the License. 14 ==============================================================================*/ 15 16 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_THUNK_H_ 17 #define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_THUNK_H_ 18 19 #include <memory> 20 #include <vector> 21 22 #include "tensorflow/compiler/xla/executable_run_options.h" 23 #include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h" 24 #include "tensorflow/compiler/xla/service/gpu/gpu_executable_run_options.h" 25 #include "tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h" 26 #include "tensorflow/compiler/xla/service/hlo_instruction.h" 27 #include "tensorflow/core/lib/core/status.h" 28 #include "tensorflow/core/platform/stream_executor_no_cuda.h" 29 30 namespace xla { 31 namespace gpu { 32 33 class GpuExecutable; 34 35 // Thunk acts as the bridge between IrEmitter and GpuExecutable. It stores the 36 // metadata IrEmitter generates for GpuExecutable to invoke an HloInstruction. 37 // 38 // Thunk provides the Initialize and ExecuteOnStream interface for GpuExecutable 39 // to initialize and execute the invocation respectively. Its subclasses are 40 // supposed to override these interfaces to launch a generated kernel or call an 41 // external library function (such as operations in cuBLAS). 42 // 43 // This is thread-compatible. 
44 class Thunk { 45 public: 46 enum Kind { 47 kCholesky, 48 kCollectivePermute, 49 kConditional, 50 kConvolution, 51 kCopy, 52 kCudnnBatchNormBackward, 53 kCudnnBatchNormForwardInference, 54 kCudnnBatchNormForwardTraining, 55 kCustomCall, 56 kFft, 57 kGemm, 58 kInfeed, 59 kKernel, 60 kMemset32BitValue, 61 kMemzero, 62 kNcclAllGather, 63 kNcclAllReduce, 64 kNcclAllToAll, 65 kOutfeed, 66 kReplicaId, 67 kPartitionId, 68 kSequential, 69 kTriangularSolve, 70 kTuple, 71 kWhile, 72 }; 73 74 struct ThunkInfo { 75 absl::optional<int64> profile_index; 76 std::string profile_annotation; 77 }; 78 79 // The hlo_instruction argument is meant to be the instruction this thunk was 80 // generated from, but Thunk never uses this argument other than to save it 81 // to Thunk::hlo_instruction, so it can be null. Thunk(Kind kind,ThunkInfo thunk_info)82 explicit Thunk(Kind kind, ThunkInfo thunk_info) 83 : kind_(kind), 84 profile_index_(thunk_info.profile_index), 85 profile_annotation_(thunk_info.profile_annotation) {} ~Thunk()86 virtual ~Thunk() {} 87 Thunk(const Thunk&) = delete; 88 Thunk& operator=(const Thunk&) = delete; 89 kind()90 Kind kind() const { return kind_; } profile_annotation()91 string profile_annotation() const { return profile_annotation_; } 92 93 // Prepares the thunk for execution on the given StreamExecutor. 94 // 95 // This may be called multiple times. Its main purpose is to give us a chance 96 // to do initialization outside of ExecuteOnStream() so that the 97 // time spent initializing doesn't count towards our execution profile. Initialize(const GpuExecutable &,se::StreamExecutor *)98 virtual Status Initialize(const GpuExecutable& /*executable*/, 99 se::StreamExecutor* /*executor*/) { 100 return Status::OK(); 101 } 102 103 // Parameters passed to ExecuteOnStream. Encapsulated in a struct so that 104 // when we add something we don't have to change every subclass of Thunk. 
105 struct ExecuteParams { 106 const BufferAllocations* buffer_allocations; // never null 107 se::Stream* stream; 108 RunId run_id; 109 HloExecutionProfiler* profiler; // never null 110 const DeviceAssignment* device_assn; // never null 111 std::vector<std::function<void()>>* deferred_host_callbacks; // never null 112 const std::vector<GlobalDeviceId>* gpu_global_device_ids; // may be null 113 const NcclUniqueIdCallback* nccl_unique_id_callback; // may be null 114 115 StatusOr<GlobalDeviceId> GetGlobalDeviceId() const; 116 }; 117 118 // Execute the kernel for the thunk on the given stream. This method must be 119 // called after Initialize and can be called multiple times over Thunk's 120 // lifetime. 121 // 122 // Precondition: Initialize(stream->parent()) has been called. 123 virtual Status ExecuteOnStream(const ExecuteParams& params) = 0; 124 125 protected: profile_index()126 absl::optional<int64> profile_index() const { return profile_index_; } 127 128 // Safely copies the given buffer to the GPU, deleting it on the host only 129 // after the copy has completed. 130 template <typename T> SafeH2DMemcpy(se::DeviceMemory<T> dest,std::unique_ptr<T[]> buf,int64 count,se::Stream * stream,std::vector<std::function<void ()>> * deferred_host_callbacks)131 void SafeH2DMemcpy( 132 se::DeviceMemory<T> dest, std::unique_ptr<T[]> buf, int64 count, 133 se::Stream* stream, 134 std::vector<std::function<void()>>* deferred_host_callbacks) { 135 stream->ThenMemcpy(&dest, buf.get(), count * sizeof(T)); 136 auto* buf_raw = buf.release(); 137 deferred_host_callbacks->push_back([buf_raw] { delete[] buf_raw; }); 138 } 139 140 private: 141 Kind kind_; 142 absl::optional<int64> profile_index_; 143 std::string profile_annotation_; 144 }; 145 146 // A sequence of thunks. 
using ThunkSequence = std::vector<std::unique_ptr<Thunk>>;

// Returns a string representation of the given thunk kind (the enumerator
// name); operator<< streams the same representation.
absl::string_view ThunkKindToString(Thunk::Kind);
std::ostream& operator<<(std::ostream& os, Thunk::Kind kind);

// A struct that defines a shaped slice, i.e., a BufferAllocation::Slice and its
// shape.
struct ShapedSlice {
  BufferAllocation::Slice slice;
  Shape shape;
};

}  // namespace gpu
}  // namespace xla

#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_THUNK_H_