/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// Contains utilities for launching compiled XLA kernels for a KernelContext.

#ifndef TENSORFLOW_COMPILER_JIT_XLA_LAUNCH_UTIL_H_
#define TENSORFLOW_COMPILER_JIT_XLA_LAUNCH_UTIL_H_

#include "absl/base/thread_annotations.h"
#include "tensorflow/compiler/jit/xla_compilation_cache.h"
#include "tensorflow/compiler/jit/xla_tensor.h"
#include "tensorflow/compiler/tf2xla/xla_compiler.h"
#include "tensorflow/compiler/xla/client/local_client.h"
#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
#include "tensorflow/compiler/xla/service/owning_device_memory.h"
#include "tensorflow/core/framework/allocation_description.pb.h"
#include "tensorflow/core/framework/resource_var.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/lib/gtl/array_slice.h"

namespace tensorflow {
class XlaAllocator;

// Struct that represents a possibly-absent Tensor.
struct OptionalTensor {
  string name;           // A descriptive name
  bool present = false;  // Is the tensor present?
  Tensor value;          // If present, what is the Tensor's value?
};

// Takes a snapshot of the values of resource variable arguments, whose indices
// are specified in the `variable_indices` argument. We snapshot tensors that
// back resource variables since concurrent updates may modify the shape, and
// it is important that the shapes used for compilation match the true shapes
// of the buffers.
//
// We snapshot the entire set of resource variables as one atomic operation.
// This models Read->* dependencies between resource variable operations. See
// jit/resource_operation_safety_analysis for details.
//
// Returns a map from TensorFlow argument index to resource variable. If a
// resource variable is not initialized, the corresponding OptionalTensor
// will have its `present` field set to false.
Status SnapshotResourceVariables(OpKernelContext* ctx,
                                 absl::Span<const int> variable_indices,
                                 std::map<int, OptionalTensor>* result);
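
// A minimal usage sketch of SnapshotResourceVariables (hypothetical caller
// code, not part of this header): it assumes an op kernel whose DT_RESOURCE
// inputs sit at indices 2 and 3; those index values and the surrounding kernel
// are illustrative assumptions only.
//
//   std::map<int, OptionalTensor> variables;
//   TF_RETURN_IF_ERROR(SnapshotResourceVariables(
//       ctx, /*variable_indices=*/{2, 3}, &variables));
//   if (variables[2].present) {
//     // Shapes read here match the snapshotted buffers used for compilation.
//     const TensorShape& shape = variables[2].value.shape();
//   }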

// Information about the state of a variable passed as input to the _XlaCompile
// and _XlaRun operators. Unlocks the resource variable and decrements its
// refcount on destruction.
class VariableInfo {
 public:
  explicit VariableInfo(int index, Var* var);
  VariableInfo(VariableInfo&& other);

  VariableInfo& operator=(VariableInfo&& other);

  VariableInfo(const VariableInfo&) = delete;
  VariableInfo& operator=(const VariableInfo&) = delete;

  // The index of the DT_RESOURCE input to the _XlaCompile/_XlaRun operator.
  // Note that the indices can be different between _XlaCompile and _XlaRun.
  int index() const { return index_; }

  // A pointer to the resource variable. May be null if this VariableInfo is
  // "empty", i.e. it does not track a resource variable.
  Var* var() const { return var_; }

  // Returns true if the resource variable lock was successfully acquired by
  // this thread.
  bool lock_held() const { return lock_held_; }
  void set_lock_held() { lock_held_ = true; }

  ~VariableInfo();

 private:
  int index_;
  Var* var_;

  // We can't use an optional<mutex_lock> here because it confuses the
  // compiler's thread safety analysis. Instead we use a boolean flag and
  // release the lock in the VariableInfo destructor.
  bool lock_held_ = false;
};

// Acquires the mutexes for all the variables in `variables` using a
// deadlock-safe protocol (acquire the mutexes in increasing-address order).
//
// `variables` is allowed to contain instances that don't track a resource
// variable (i.e. variables[i].var() can be null for some i).
Status LockVariables(absl::Span<VariableInfo> variables)
    EXCLUSIVE_LOCK_FUNCTION();
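
// A rough sketch of how VariableInfo and LockVariables are meant to be used
// together (hypothetical caller code; `resource_input_indices` and the use of
// LookupResource/HandleFromInput are assumptions about the calling kernel, not
// part of this header):
//
//   std::vector<VariableInfo> variables;
//   for (int i : resource_input_indices) {
//     Var* var = nullptr;
//     TF_RETURN_IF_ERROR(LookupResource(ctx, HandleFromInput(ctx, i), &var));
//     variables.emplace_back(i, var);  // VariableInfo now owns the refcount.
//   }
//   TF_RETURN_IF_ERROR(LockVariables(absl::MakeSpan(variables)));
//   // All variable mutexes stay held until `variables` is destroyed.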

// Adapter class that wraps a Tensorflow allocator as an XLA allocator.
// Assumes that the Tensorflow allocator permits asynchronous deallocation:
// see comment on `AllowsAsynchronousDeallocation()`.
class XlaAllocator : public xla::DeviceMemoryAllocator {
 public:
  XlaAllocator(const se::Platform* platform, Allocator* wrapped);
  ~XlaAllocator() override;
  xla::StatusOr<xla::OwningDeviceMemory> Allocate(
      int device_ordinal, uint64 size, bool retry_on_failure) override;
  Status Deallocate(int device_ordinal, se::DeviceMemoryBase mem) override;

  // The Tensorflow BFC allocator used on GPU allows host-side deallocation
  // before GPU execution takes place. Tensorflow uses the ordering of the main
  // compute stream to enforce a happens-before relationship between a memory
  // allocation and code that reuses the same memory. If Tensorflow adds
  // support for multiple GPU streams or allocators with different ordering
  // requirements, this code may need to change.
  // (This attribute has no effect on CPU.)
  bool AllowsAsynchronousDeallocation() const override { return true; }

 private:
  Allocator* wrapped_;
};

// Helper class to perform the marshalling of TensorFlow inputs and outputs to
// ShapedBuffers suitable for passing to an XLA computation.
class XlaComputationLaunchContext {
 public:
  // Create a new launch context. 'allocate_xla_tensors' is true if allocated
  // output tensors and variables are always XlaTensors. If false they are
  // assumed to be "normal" device pointers.
  // If 'use_multiple_streams' is true, tensors may be defined and used on
  // multiple streams and so se::Events must be defined and waited for. If
  // 'use_multiple_streams' is true, 'allocate_xla_tensors' must also be true
  // because we track inter-stream dependencies through events inside XlaTensor
  // objects.
  XlaComputationLaunchContext(xla::LocalClient* client,
                              xla::DeviceMemoryAllocator* xla_allocator,
                              bool allocate_xla_tensors,
                              bool use_multiple_streams);

  // Builds a XlaCompiler::Argument vector from the arguments to an XlaLaunch
  // op.
  static Status BuildXlaCompilerArguments(
      const std::map<int, Tensor>& constant_args,
      const std::map<int, OptionalTensor>& variable_args, OpKernelContext* ctx,
      std::vector<XlaCompiler::Argument>* args);

  // Add all inputs within `ctx` as XLA arguments (returned by arguments()).
  // `variables` is a map from TensorFlow argument number to resource variable.
  //
  // Assumes that the first `missing_ctx_input_prefix` inputs to the kernel are
  // missing and adjusts input indices accordingly. All elements in the
  // kernel's input_mapping must be greater than or equal to
  // `missing_ctx_input_prefix` (in other words, no inputs actually required by
  // the kernel can be missing).
  void PopulateInputs(OpKernelContext* ctx,
                      const XlaCompiler::CompilationResult* kernel,
                      const std::map<int, OptionalTensor>& variables,
                      int missing_ctx_input_prefix);

  // Given the XLA output in `output`, populate all outputs of `ctx`. Also
  // writes out the resource variable updates.
  //
  // Updates to all resource variables are written in a single atomic
  // operation. This models *->Write dependencies between resource variable
  // operations. See jit/resource_operation_safety_analysis for details.
  //
  // Assumes that the first `missing_ctx_input_prefix` inputs to the kernel are
  // missing and adjusts input indices accordingly.
  Status PopulateOutputs(OpKernelContext* ctx,
                         const XlaCompiler::CompilationResult* kernel,
                         xla::ScopedShapedBuffer output,
                         int missing_ctx_input_prefix);

  // Return the argument list. Only valid after PopulateInputs() has been
  // called.
  const std::vector<xla::ShapedBuffer*>& arguments() const { return arg_ptrs_; }

 private:
  xla::LocalClient* client_;
  xla::DeviceMemoryAllocator* xla_allocator_;
  bool allocate_xla_tensors_;
  bool use_multiple_streams_;
  std::vector<std::unique_ptr<xla::ShapedBuffer>> arg_buffers_;
  std::vector<xla::ShapedBuffer*> arg_ptrs_;
};
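
// A rough end-to-end sketch of a launch (hypothetical caller code; `client`,
// `xla_allocator`, `executable`, `run_options`, `compilation_result` and
// `variables` are assumptions about the calling kernel, not part of this
// header):
//
//   XlaComputationLaunchContext launch_context(
//       client, xla_allocator, /*allocate_xla_tensors=*/true,
//       /*use_multiple_streams=*/true);
//   launch_context.PopulateInputs(ctx, compilation_result, variables,
//                                 /*missing_ctx_input_prefix=*/0);
//   auto run_result =
//       executable->Run(launch_context.arguments(), run_options);
//   TF_RETURN_IF_ERROR(run_result.status());
//   TF_RETURN_IF_ERROR(launch_context.PopulateOutputs(
//       ctx, compilation_result, run_result.ConsumeValueOrDie(),
//       /*missing_ctx_input_prefix=*/0));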

// A simple TensorBuffer implementation that allows us to create Tensors that
// take ownership of pre-allocated memory.
class XlaTensorBuffer : public TensorBuffer {
 public:
  XlaTensorBuffer(const void* ptr, size_t expected_size, size_t actual_size,
                  Allocator* allocator)
      : TensorBuffer(const_cast<void*>(ptr)),
        expected_size_(expected_size),
        actual_size_(actual_size),
        allocator_(allocator) {}

  ~XlaTensorBuffer() override {
    if (data()) {
      allocator_->DeallocateRaw(data());
    }
  }

  size_t size() const override { return expected_size_; }

  TensorBuffer* root_buffer() override { return this; }

  void FillAllocationDescription(AllocationDescription* proto) const override {
    proto->set_allocated_bytes(actual_size_);
  }

  static Tensor MakeTensor(DataType dtype, const TensorShape& shape,
                           se::DeviceMemoryBase buffer, Allocator* allocator) {
    size_t expected_size = shape.num_elements() * DataTypeSize(dtype);
    auto* tensor_buffer = new XlaTensorBuffer(buffer.opaque(), expected_size,
                                              buffer.size(), allocator);
    Tensor t(dtype, shape, tensor_buffer);
    tensor_buffer->Unref();
    return t;
  }

 private:
  size_t expected_size_;
  size_t actual_size_;
  Allocator* allocator_;
};

}  // namespace tensorflow

#endif  // TENSORFLOW_COMPILER_JIT_XLA_LAUNCH_UTIL_H_