/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_THUNK_H_
#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_THUNK_H_

#include <cstdint>
#include <functional>
#include <memory>
#include <optional>
#include <ostream>
#include <string>
#include <vector>

#include "absl/strings/string_view.h"
#include "tensorflow/compiler/xla/executable_run_options.h"
#include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h"
#include "tensorflow/compiler/xla/service/gpu/gpu_executable_run_options.h"
#include "tensorflow/compiler/xla/service/hlo_instruction.h"
#include "tensorflow/compiler/xla/service/service_executable_run_options.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"

namespace xla {
namespace gpu {

class GpuExecutable;

// Thunk acts as the bridge between IrEmitter and GpuExecutable. It stores the
// metadata IrEmitter generates for GpuExecutable to invoke an HloInstruction.
//
// Thunk provides the Initialize and ExecuteOnStream interfaces that
// GpuExecutable uses to initialize and execute an invocation, respectively.
// Subclasses override these interfaces to launch a generated kernel or to
// call an external library function (such as a cuBLAS operation).
//
// This is thread-compatible.
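//
// A typical subclass overrides Initialize() and ExecuteOnStream(). The sketch
// below is illustrative only; SketchThunk is a hypothetical name, not a thunk
// that exists in XLA:
//
//   class SketchThunk : public Thunk {
//    public:
//     explicit SketchThunk(ThunkInfo thunk_info)
//         : Thunk(kKernel, std::move(thunk_info)) {}
//
//     Status Initialize(const GpuExecutable& executable,
//                       se::StreamExecutor* executor) override {
//       // One-time setup (e.g. loading a kernel) goes here so that it does
//       // not count towards the execution profile.
//       return OkStatus();
//     }
//
//     Status ExecuteOnStream(const ExecuteParams& params) override {
//       // Enqueue the real work on params.stream; buffers are resolved
//       // through params.buffer_allocations.
//       return OkStatus();
//     }
//   };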
class Thunk {
 public:
  enum Kind {
    kCholesky,
    kCollectivePermute,
    kConditional,
    kConvolution,
    kCopy,
    kCublasLtMatmul,
    kCustomCall,
    kFft,
    kGemm,
    kInfeed,
    kKernel,
    kMemset32BitValue,
    kMemzero,
    kNcclAllGather,
    kNcclAllReduce,
    kNcclAllReduceStart,
    kNcclAllReduceDone,
    kNcclReduceScatter,
    kNcclAllToAll,
    kOutfeed,
    kReplicaId,
    kPartitionId,
    kSequential,
    kTriangularSolve,
    kWhile,
  };

  struct ThunkInfo {
    std::optional<int64_t> profile_index;
    std::string profile_annotation;
  };

  // The ThunkInfo argument carries profiling metadata; the constructor only
  // copies its fields into the corresponding members.
  explicit Thunk(Kind kind, ThunkInfo thunk_info)
      : kind_(kind),
        profile_index_(thunk_info.profile_index),
        profile_annotation_(thunk_info.profile_annotation) {}
  virtual ~Thunk() {}
  Thunk(const Thunk&) = delete;
  Thunk& operator=(const Thunk&) = delete;

  virtual std::string ToStringExtra(int indent) const { return ""; }
  Kind kind() const { return kind_; }
  std::string profile_annotation() const { return profile_annotation_; }

  // Prepares the thunk for execution on the given StreamExecutor.
  //
  // This may be called multiple times. Its main purpose is to give us a
  // chance to do initialization outside of ExecuteOnStream() so that the
  // time spent initializing doesn't count towards our execution profile.
  virtual Status Initialize(const GpuExecutable& /*executable*/,
                            se::StreamExecutor* /*executor*/) {
    return OkStatus();
  }

  // Parameters passed to ExecuteOnStream. Encapsulated in a struct so that
  // when we add something we don't have to change every subclass of Thunk.
  struct ExecuteParams {
    ExecuteParams(const ServiceExecutableRunOptions& run_options,
                  const BufferAllocations& buffer_allocations,
                  se::Stream* stream, se::Stream* async_comms_stream);

    const BufferAllocations* buffer_allocations;  // never null
    se::Stream* stream;
    se::Stream* async_comms_stream;
    NcclExecuteParams nccl_params;
  };

  // Executes the kernel for the thunk on the given stream. This method must
  // be called after Initialize and may be called multiple times over the
  // Thunk's lifetime.
  //
  // Precondition: Initialize(stream->parent()) has been called.
  virtual Status ExecuteOnStream(const ExecuteParams& params) = 0;

  static absl::string_view KindToString(Thunk::Kind kind);

 protected:
  std::optional<int64_t> profile_index() const { return profile_index_; }

 private:
  Kind kind_;
  std::optional<int64_t> profile_index_;
  std::string profile_annotation_;
};

// A sequence of thunks.
class ThunkSequence : public std::vector<std::unique_ptr<Thunk>> {
 public:
  std::string ToString(int indent = 0,
                       std::function<std::string(const Thunk*)>
                           get_thunk_annotation = nullptr) const;
};

std::ostream& operator<<(std::ostream& os, Thunk::Kind kind);

// A struct that defines a shaped slice, i.e., a BufferAllocation::Slice and
// its shape.
struct ShapedSlice {
  BufferAllocation::Slice slice;
  Shape shape;
};

}  // namespace gpu
}  // namespace xla

#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_THUNK_H_