• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
15 
16 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_EXECUTABLE_H_
17 #define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_EXECUTABLE_H_
18 
#include <map>
#include <memory>
#include <string>
#include <utility>
#include <vector>

#include "absl/container/flat_hash_map.h"
#include "absl/strings/string_view.h"
#include "absl/types/optional.h"
#include "absl/types/span.h"
#include "tensorflow/compiler/xla/service/buffer_assignment.h"
#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
#include "tensorflow/compiler/xla/service/executable.h"
#include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h"
#include "tensorflow/compiler/xla/service/gpu/stream_assignment.h"
#include "tensorflow/compiler/xla/service/gpu/thunk.h"
#include "tensorflow/compiler/xla/service/gpu/thunk_schedule.h"
#include "tensorflow/compiler/xla/service/hlo_execution_profile.h"
#include "tensorflow/compiler/xla/service/hlo_module.h"
#include "tensorflow/compiler/xla/service/shaped_buffer.h"
#include "tensorflow/compiler/xla/service/tuple_points_to_analysis.h"
#include "tensorflow/compiler/xla/statusor.h"
#include "tensorflow/compiler/xla/types.h"
#include "tensorflow/core/platform/macros.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"
41 
42 namespace xla {
43 namespace gpu {
44 
45 // GPU-targeting implementation of the XLA Executable interface.
46 //
47 // Launches the given CUDA kernel via the StreamExecutor.
48 //
49 // This is an immutable data type after initialization, and thus thread safe.
50 class GpuExecutable : public Executable {
51  public:
52   // cubin (i.e. the compiled ptx) may be empty, in which case we leave
53   // compilation up to the GPU driver.
54   GpuExecutable(const string& ptx, const std::vector<uint8>& cubin,
55                 std::pair<int, int> compute_capability,
56                 std::unique_ptr<const ThunkSchedule> thunk_schedule,
57                 std::unique_ptr<HloModule> hlo_module,
58                 std::unique_ptr<const BufferAssignment> assignment,
59                 std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data,
60                 std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map);
61 
62   // This should be called after set_ir_module_string.
ir_module_string()63   const string& ir_module_string() const { return ir_module_string_; }
64 
65   // This should be called before ExecuteOnStream.
set_ir_module_string(const string & ir_module_string)66   void set_ir_module_string(const string& ir_module_string) {
67     ir_module_string_ = ir_module_string;
68   }
69 
70   // Returns the compiled PTX for the computation.
ptx()71   const string& ptx() const { return ptx_; }
72 
73   // Returns the cubin (compiled PTX) stored in this GpuExecutable.  May be
74   // empty, in which case compilation is left up to the GPU driver.
cubin()75   const std::vector<uint8>& cubin() const { return cubin_; }
76 
77   // ExecuteOnStream will fail if the compute capability of the stream doesn't
78   // match the compute capability passed to this object's constructor.
79   StatusOr<ScopedShapedBuffer> ExecuteOnStream(
80       const ServiceExecutableRunOptions* run_options,
81       absl::Span<const ShapedBuffer* const> arguments,
82       HloExecutionProfile* hlo_execution_profile) override;
83 
84   StatusOr<ScopedShapedBuffer> ExecuteAsyncOnStream(
85       const ServiceExecutableRunOptions* run_options,
86       absl::Span<const ShapedBuffer* const> arguments) override;
87 
88  private:
89   // If `block_host_until_done` is false, execution will not block the host
90   // until the kernels have completed. This is used as an optimization for
91   // clients, such as Tensorflow, that use a single stream of execution for
92   // computations, and allow host-side deallocation from the allocator before
93   // GPU execution completes.
94   Status ExecuteThunks(const ServiceExecutableRunOptions* run_options,
95                        const BufferAllocations& buffer_allocations,
96                        bool block_host_until_done,
97                        HloExecutionProfile* hlo_execution_profile);
98 
99   // Returns the points-to set of the root instruction of the entry
100   // computation. Uses points-to analysis from buffer assignment.
101   const PointsToSet& GetRootPointsToSet() const;
102 
103   using BufferAllocToDeviceMemoryMap =
104       absl::flat_hash_map<BufferAllocation::Index, se::DeviceMemoryBase>;
105 
106   // Loads the PTX or CUBIN for this executable into `executor` and resolves the
107   // globals corresponding to constant buffers.  Returns a map mapping buffer
108   // allocation indices to GPU pointers.
109   StatusOr<const BufferAllocToDeviceMemoryMap*> ResolveConstantGlobals(
110       stream_executor::StreamExecutor* executor);
111 
112   // The LLVM IR, in string format, of the unoptimized module generated for this
113   // GpuExecutable. We save a string instead of an llvm::Module* because leaving
114   // llvm::Module* in a singleton can cause the heap checker to emit false
115   // positives.
116   //
117   // This string should be modified only before ExecuteOnStream.
118   string ir_module_string_;
119 
120   // The PTX for the computation.
121   const string ptx_;
122 
123   // The GPU machine code for the computation, targeting GPUs at
124   // compute_capability_.
125   //
126   // May be empty, in which case we leave compilation up to the GPU driver.
127   const std::vector<uint8> cubin_;
128 
129   // The compute capability of the GPU we're targeting with this GpuExecutable.
130   std::pair<int, int> compute_capability_;
131 
132   // The thunks to be invoked by this GpuExecutable. They are generated by the
133   // IrEmitter.
134   const std::unique_ptr<const ThunkSchedule> thunk_schedule_;
135 
136   // Owns the buffer data at runtime. It provides information to allocate
137   // memory for every output/temp buffers.
138   const std::unique_ptr<const BufferAssignment> assignment_;
139 
140   // Cache of module handles and constant buffer allocation maps used by
141   // `ResolveConstantGlobals`.
142   tensorflow::mutex module_handle_mutex_;
143   std::map<stream_executor::StreamExecutor*, se::ScopedModuleHandle>
144       module_handles_ GUARDED_BY(module_handle_mutex_);
145   std::map<stream_executor::StreamExecutor*, BufferAllocToDeviceMemoryMap>
146       module_globals_ GUARDED_BY(module_handle_mutex_);
147 
148   TF_DISALLOW_COPY_AND_ASSIGN(GpuExecutable);
149 };
150 
151 }  // namespace gpu
152 }  // namespace xla
153 
154 #endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_EXECUTABLE_H_
155