/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_EXECUTABLE_H_
#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_EXECUTABLE_H_

#include <memory>
#include <string>

#include "absl/container/flat_hash_map.h"
#include "absl/strings/string_view.h"
#include "absl/types/optional.h"
#include "absl/types/span.h"
#include "tensorflow/compiler/xla/service/buffer_assignment.h"
#include "tensorflow/compiler/xla/service/executable.h"
#include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h"
#include "tensorflow/compiler/xla/service/gpu/gpu_types.h"
#include "tensorflow/compiler/xla/service/gpu/stream_assignment.h"
#include "tensorflow/compiler/xla/service/gpu/thunk.h"
#include "tensorflow/compiler/xla/service/gpu/thunk_schedule.h"
#include "tensorflow/compiler/xla/service/hlo_dataflow_analysis.h"
#include "tensorflow/compiler/xla/service/hlo_execution_profile.h"
#include "tensorflow/compiler/xla/service/hlo_module.h"
#include "tensorflow/compiler/xla/service/shaped_buffer.h"
#include "tensorflow/compiler/xla/statusor.h"
#include "tensorflow/core/platform/macros.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"
#include "tensorflow/stream_executor/device_memory_allocator.h"

namespace xla {
namespace gpu {

// GPU-targeting implementation of the XLA Executable interface.
//
// Launches the given GPU kernel via the StreamExecutor.
//
// This is an immutable data type after initialization, and thus thread safe.
class GpuExecutable : public Executable {
 public:
  // We need to share ownership of hlo_module and assignment with the profiler
  // to safely keep a reference to these objects during the tracing period, so
  // they are passed as shared pointers.
  GpuExecutable(const string& text, const std::vector<uint8>& binary,
                GpuVersion gpu_version,
                std::unique_ptr<const ThunkSchedule> thunk_schedule,
                std::shared_ptr<HloModule> hlo_module,
                std::shared_ptr<const BufferAssignment> assignment,
                std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data,
                std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map);
  ~GpuExecutable() override;

  int64 SizeOfGeneratedCodeInBytes() override;

  // This should be called after set_ir_module_string.
  const string& ir_module_string() const { return ir_module_string_; }

  // This should be called before ExecuteOnStream.
  void set_ir_module_string(const string& ir_module_string) {
    ir_module_string_ = ir_module_string;
  }
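  //
  // Illustrative call ordering implied by the comments above (a sketch with
  // hypothetical variable names, not code from this file):
  //
  //   gpu_executable->set_ir_module_string(llvm_ir_text);  // before execution
  //   // ... run the executable ...
  //   const string& ir = gpu_executable->ir_module_string();  // after it is set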

  // Returns the compiled code for the computation. The compiled code is PTX in
  // Cuda and an unused empty string in ROCm.
  const string& text() const { return text_; }

  // Returns the binary stored in this GpuExecutable. The binary is a cubin in
  // Cuda, and an HSA code object in ROCm. It may be empty, in which case
  // compilation is left up to the GPU driver.
  const std::vector<uint8>& binary() const { return binary_; }

  // ExecuteAsyncOnStream will fail if the compute capability of the stream
  // doesn't match the compute capability passed to this object's constructor.
  StatusOr<ExecutionOutput> ExecuteAsyncOnStream(
      const ServiceExecutableRunOptions* run_options,
      std::vector<ShapeTree<MaybeOwningDeviceMemory>> arguments,
      HloExecutionProfile* hlo_execution_profile) override;
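  //
  // A minimal usage sketch (hypothetical setup; in practice the executable and
  // run options come from the XLA service, and the variable names below are
  // placeholders):
  //
  //   GpuExecutable* executable = ...;  // produced by the GPU compiler backend
  //   ServiceExecutableRunOptions run_options = ...;
  //   std::vector<ShapeTree<MaybeOwningDeviceMemory>> arguments = ...;
  //   StatusOr<ExecutionOutput> result = executable->ExecuteAsyncOnStream(
  //       &run_options, std::move(arguments), /*hlo_execution_profile=*/nullptr);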

  std::shared_ptr<const BufferAssignment> GetBufferAssignment() const {
    return assignment_;
  }

 private:
  // If `block_host_until_done` is false, execution will not block the host
  // until the kernels have completed. This is used as an optimization for
  // clients, such as TensorFlow, that use a single stream of execution for
  // computations, and allow host-side deallocation from the allocator before
  // GPU execution completes.
  Status ExecuteThunks(const ServiceExecutableRunOptions* run_options,
                       const BufferAllocations& buffer_allocations,
                       bool block_host_until_done,
                       HloExecutionProfile* hlo_execution_profile);
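  //
  // Sketch of the intended distinction (illustrative values only):
  //
  //   // Host returns only after the stream has drained.
  //   ExecuteThunks(run_options, allocations,
  //                 /*block_host_until_done=*/true, profile);
  //   // Host may return while kernels are still running on the stream.
  //   ExecuteThunks(run_options, allocations,
  //                 /*block_host_until_done=*/false, profile);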

  // Returns the value set of the root instruction of the entry
  // computation. Uses dataflow analysis from buffer assignment.
  const InstructionValueSet& GetRootValueSet() const;

  using BufferAllocToDeviceMemoryMap =
      absl::flat_hash_map<BufferAllocation::Index, se::DeviceMemoryBase>;

  // Loads the PTX or CUBIN for this executable into the stream's executor and
  // resolves the globals corresponding to constant buffers. Returns a map from
  // buffer allocation indices to GPU pointers.
  StatusOr<const BufferAllocToDeviceMemoryMap*> ResolveConstantGlobals(
      stream_executor::Stream* stream);
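  //
  // An illustrative sketch (hypothetical names): before launching thunks, the
  // executable can look up the device address backing a constant allocation:
  //
  //   TF_ASSIGN_OR_RETURN(const BufferAllocToDeviceMemoryMap* globals,
  //                       ResolveConstantGlobals(run_options->stream()));
  //   auto it = globals->find(allocation_index);
  //   if (it != globals->end()) {
  //     se::DeviceMemoryBase constant_buffer = it->second;
  //   }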

  // Computes annotations for each thunk and stores them in thunk_annotations_.
  void ComputeThunkAnnotations();

  // Checks the compatibility of this GpuExecutable with the device, using
  // either AMD's ISA version or Nvidia's major/minor compute capability
  // version, depending on the hardware.
  Status CheckCompatibilityWithServiceExecutableRunOptions(
      const ServiceExecutableRunOptions* run_options);

  // The LLVM IR, in string format, of the unoptimized module generated for this
  // GpuExecutable. We save a string instead of an llvm::Module* because leaving
  // llvm::Module* in a singleton can cause the heap checker to emit false
  // positives.
  //
  // This string should be modified only before ExecuteOnStream.
  string ir_module_string_;

  // The compiled code for the computation.
  const string text_;

  // The GPU machine code for the computation, targeting GPUs at
  // gpu_version_.
  //
  // May be empty, in which case we leave compilation up to the GPU driver.
  const std::vector<uint8> binary_;

  // The GPU version used for the compute compatibility check.
  GpuVersion gpu_version_;

  // The thunks to be invoked by this GpuExecutable. They are generated by the
  // IrEmitter.
  const std::unique_ptr<const ThunkSchedule> thunk_schedule_;

  // Owns the buffer data at runtime. It provides information to allocate
  // memory for every output/temp buffer.
  const std::shared_ptr<const BufferAssignment> assignment_;

  // Maps a thunk to a string describing the thunk.  This is useful when
  // constructing ScopeAnnotation objects.
  absl::flat_hash_map<Thunk*, string> thunk_annotations_;

  // Cache of module handles and constant buffer allocation maps used by
  // `ResolveConstantGlobals`.
  tensorflow::mutex module_handle_mutex_;
  std::map<stream_executor::StreamExecutor*, se::ScopedModuleHandle>
      module_handles_ GUARDED_BY(module_handle_mutex_);
  std::map<stream_executor::StreamExecutor*, BufferAllocToDeviceMemoryMap>
      module_globals_ GUARDED_BY(module_handle_mutex_);

  TF_DISALLOW_COPY_AND_ASSIGN(GpuExecutable);
};

}  // namespace gpu
}  // namespace xla

#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_EXECUTABLE_H_