/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_THUNK_H_
#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_THUNK_H_

#include <cstdint>
#include <functional>
#include <memory>
#include <optional>
#include <ostream>
#include <string>
#include <vector>

#include "absl/strings/string_view.h"
#include "tensorflow/compiler/xla/executable_run_options.h"
#include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h"
#include "tensorflow/compiler/xla/service/gpu/gpu_executable_run_options.h"
#include "tensorflow/compiler/xla/service/hlo_instruction.h"
#include "tensorflow/compiler/xla/service/service_executable_run_options.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"

namespace xla {
namespace gpu {

class GpuExecutable;

// Thunk acts as the bridge between IrEmitter and GpuExecutable. It stores the
// metadata IrEmitter generates for GpuExecutable to invoke an HloInstruction.
//
// Thunk provides the Initialize and ExecuteOnStream interface for GpuExecutable
// to initialize and execute the invocation respectively. Its subclasses are
// supposed to override these interfaces to launch a generated kernel or call an
// external library function (such as operations in cuBLAS).
//
// This is thread-compatible.
class Thunk {
 public:
  enum Kind {
    kCholesky,
    kCollectivePermute,
    kConditional,
    kConvolution,
    kCopy,
    kCublasLtMatmul,
    kCustomCall,
    kFft,
    kGemm,
    kInfeed,
    kKernel,
    kMemset32BitValue,
    kMemzero,
    kNcclAllGather,
    kNcclAllReduce,
    kNcclAllReduceStart,
    kNcclAllReduceDone,
    kNcclReduceScatter,
    kNcclAllToAll,
    kOutfeed,
    kReplicaId,
    kPartitionId,
    kSequential,
    kTriangularSolve,
    kWhile,
  };

  struct ThunkInfo {
    std::optional<int64_t> profile_index;
    std::string profile_annotation;
  };

  // The ThunkInfo argument carries optional profiling metadata (a profile
  // index and annotation) for the HloInstruction this thunk was generated
  // from. Thunk only stores these values, so they may be left unset.
  explicit Thunk(Kind kind, ThunkInfo thunk_info)
      : kind_(kind),
        profile_index_(thunk_info.profile_index),
        profile_annotation_(thunk_info.profile_annotation) {}
  virtual ~Thunk() {}
  Thunk(const Thunk&) = delete;
  Thunk& operator=(const Thunk&) = delete;

  virtual std::string ToStringExtra(int indent) const { return ""; }
  Kind kind() const { return kind_; }
  std::string profile_annotation() const { return profile_annotation_; }

  // Prepares the thunk for execution on the given StreamExecutor.
  //
  // This may be called multiple times.  Its main purpose is to give us a chance
  // to do initialization outside of ExecuteOnStream() so that the
  // time spent initializing doesn't count towards our execution profile.
  virtual Status Initialize(const GpuExecutable& /*executable*/,
                            se::StreamExecutor* /*executor*/) {
    return OkStatus();
  }

  // Parameters passed to ExecuteOnStream.  Encapsulated in a struct so that
  // when we add something we don't have to change every subclass of Thunk.
  struct ExecuteParams {
    ExecuteParams(const ServiceExecutableRunOptions& run_options,
                  const BufferAllocations& buffer_allocations,
                  se::Stream* stream, se::Stream* async_comms_stream);

    const BufferAllocations* buffer_allocations;  // never null
    se::Stream* stream;
    se::Stream* async_comms_stream;
    NcclExecuteParams nccl_params;
  };

  // Execute the kernel for the thunk on the given stream. This method must be
  // called after Initialize and can be called multiple times over Thunk's
  // lifetime.
  //
  // Precondition: Initialize(stream->parent()) has been called.
  virtual Status ExecuteOnStream(const ExecuteParams& params) = 0;

  static absl::string_view KindToString(Thunk::Kind kind);

 protected:
  std::optional<int64_t> profile_index() const { return profile_index_; }

 private:
  Kind kind_;
  std::optional<int64_t> profile_index_;
  std::string profile_annotation_;
};
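
// Illustrative sketch, not part of the original header: a minimal Thunk
// subclass overrides ExecuteOnStream and, when it needs one-time per-executor
// setup, Initialize. The class below is hypothetical and only shows the shape
// of the contract described in the class comment above.
//
//   class ExampleNoOpThunk : public Thunk {
//    public:
//     explicit ExampleNoOpThunk(ThunkInfo thunk_info)
//         : Thunk(Thunk::kKernel, thunk_info) {}
//
//     Status Initialize(const GpuExecutable& executable,
//                       se::StreamExecutor* executor) override {
//       // One-time setup (e.g. loading a kernel) would go here; Initialize
//       // may be called multiple times, so it must stay idempotent.
//       return OkStatus();
//     }
//
//     Status ExecuteOnStream(const ExecuteParams& params) override {
//       // Enqueue the thunk's work on params.stream; this sketch does nothing.
//       return OkStatus();
//     }
//   };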

// A sequence of thunks.
class ThunkSequence : public std::vector<std::unique_ptr<Thunk>> {
 public:
  std::string ToString(int indent = 0,
                       std::function<std::string(const Thunk*)>
                           get_thunk_annotation = nullptr) const;
};
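
// Illustrative sketch, not part of the original header: a caller such as
// GpuExecutable typically initializes every thunk in a ThunkSequence and then
// executes them in order on a stream. The helper name is hypothetical;
// TF_RETURN_IF_ERROR is TensorFlow's usual status-propagation macro, and
// `params` stands for a fully populated Thunk::ExecuteParams.
//
//   Status ExampleRunThunks(const ThunkSequence& thunks,
//                           const GpuExecutable& executable,
//                           const Thunk::ExecuteParams& params) {
//     for (const std::unique_ptr<Thunk>& thunk : thunks) {
//       TF_RETURN_IF_ERROR(
//           thunk->Initialize(executable, params.stream->parent()));
//     }
//     for (const std::unique_ptr<Thunk>& thunk : thunks) {
//       TF_RETURN_IF_ERROR(thunk->ExecuteOnStream(params));
//     }
//     return OkStatus();
//   }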

std::ostream& operator<<(std::ostream& os, Thunk::Kind kind);

// A struct that defines a shaped slice, i.e., a BufferAllocation::Slice and its
// shape.
struct ShapedSlice {
  BufferAllocation::Slice slice;
  Shape shape;
};

}  // namespace gpu
}  // namespace xla

#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_THUNK_H_