1 /* Copyright 2018 The TensorFlow Authors. All Rights Reserved. 2 3 Licensed under the Apache License, Version 2.0 (the "License"); 4 you may not use this file except in compliance with the License. 5 You may obtain a copy of the License at 6 7 http://www.apache.org/licenses/LICENSE-2.0 8 9 Unless required by applicable law or agreed to in writing, software 10 distributed under the License is distributed on an "AS IS" BASIS, 11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 See the License for the specific language governing permissions and 13 limitations under the License. 14 ==============================================================================*/ 15 16 // Contains utilities for launching compiled XLA kernels for a KernelContext. 17 18 #ifndef TENSORFLOW_COMPILER_JIT_XLA_LAUNCH_UTIL_H_ 19 #define TENSORFLOW_COMPILER_JIT_XLA_LAUNCH_UTIL_H_ 20 21 #include "tensorflow/compiler/jit/xla_compilation_cache.h" 22 #include "tensorflow/compiler/jit/xla_tensor.h" 23 #include "tensorflow/compiler/tf2xla/xla_compiler.h" 24 #include "tensorflow/compiler/xla/client/local_client.h" 25 #include "tensorflow/compiler/xla/service/shaped_buffer.h" 26 #include "tensorflow/core/framework/allocation_description.pb.h" 27 #include "tensorflow/core/framework/resource_var.h" 28 #include "tensorflow/core/framework/tensor.h" 29 #include "tensorflow/core/framework/types.h" 30 #include "tensorflow/core/lib/core/status.h" 31 #include "tensorflow/core/lib/gtl/array_slice.h" 32 #include "tensorflow/core/platform/thread_annotations.h" 33 #include "tensorflow/stream_executor/device_memory_allocator.h" 34 35 namespace tensorflow { 36 37 // Snapshot of resource variables for a TF kernel invocation, mapping from 38 // parameter number to values at execution time. If the resource variable is not 39 // initialized, the value will not be present. 40 using ResourceVarsSnapshot = absl::flat_hash_map<int, absl::optional<Tensor>>; 41 42 // Information about the state of a variable passed as input to the _XlaCompile 43 // and _XlaRun operators. Unlocks the resource variable and decrements its 44 // refcount on destruction. 45 class VariableInfo { 46 public: 47 explicit VariableInfo(int index, absl::string_view name, Var* var, 48 const absl::optional<ManagedStackTrace>& 49 definition_stack_trace = absl::nullopt); 50 VariableInfo(VariableInfo&& other); 51 52 VariableInfo& operator=(VariableInfo&& other); 53 54 VariableInfo(const VariableInfo&) = delete; 55 VariableInfo& operator=(const VariableInfo&) = delete; 56 57 // The index of the DT_RESOURCE input to the _XlaCompile/_XlaRun operator. 58 // Note that the indices can be different between _XlaCompile and _XlaRun. index()59 int index() const { return index_; } 60 61 // A pointer to the resource variable. May be null if this VariableInfo is 62 // "empty", i.e. it does not track a resource variable. var()63 Var* var() const { return var_; } 64 65 // Returns the variable name. name()66 absl::string_view name() const { return name_; } 67 68 // Returns true if the resource variable lock was successfully acquired by 69 // this thread. lock_held()70 bool lock_held() const { return lock_held_; } set_lock_held()71 void set_lock_held() { lock_held_ = true; } 72 definition_stack_trace()73 const absl::optional<ManagedStackTrace>& definition_stack_trace() const { 74 return definition_stack_trace_; 75 } 76 77 ~VariableInfo(); 78 79 private: 80 int index_; 81 std::string name_; 82 Var* var_; 83 absl::optional<ManagedStackTrace> definition_stack_trace_; 84 85 // We can't use a optional<mutex_lock> here because it confuses the compiler's 86 // thread safety analysis. Instead we use a boolean flag and release the lock 87 // in the VariableInfo destructor. 88 bool lock_held_ = false; 89 }; 90 91 // Creates a list of updated resource variables. 92 StatusOr<std::vector<VariableInfo>> GatherVariableInfo( 93 OpKernelContext* ctx, 94 const XlaCompiler::CompilationResult& compilation_result, 95 int missing_ctx_input_prefix); 96 97 // Takes a snapshot of the values of resource variable arguments, whose indices 98 // are specified in `variable_indices` argument. We snapshot tensors that back 99 // resource variables since concurrent updates may modify the shape, and it is 100 // important that the shapes used for compilation match the true shapes of the 101 // buffers. 102 // 103 // We snapshot the entire set of resource variables as one atomic operation. 104 // This models Read->* dependencies between resource variable operations. See 105 // jit/resource_operation_safety_analysis for details. 106 Status SnapshotResourceVariables(OpKernelContext* ctx, 107 absl::Span<const int> variable_indices, 108 absl::Span<VariableInfo const> variable_infos, 109 ResourceVarsSnapshot* result); 110 111 // Acquires the mutexes for all the variables in `variables` using a 112 // deadlock-safe protocol (acquire the mutexes in increasing-address order). 113 // 114 // `variables` is allowed to contain instances that don't track a resource 115 // variable (i.e. variables[i].var() can be null for some i). 116 Status LockVariables(absl::Span<VariableInfo> variables) 117 TF_EXCLUSIVE_LOCK_FUNCTION(); 118 119 // Returns a vector of VariableInfo instances for the resource variable inputs, 120 // given that *all* inputs are in `inputs`. The input indices for the resource 121 // variable inputs are in `variable_indices`. 122 Status GetVariableInfosFromInputs(ResourceMgr* rm, DeviceBase* dev, 123 absl::Span<const Tensor* const> inputs, 124 absl::Span<const int> variable_indices, 125 std::vector<VariableInfo>* result); 126 127 // Returns pointers to inputs stored in `ctx`. 128 std::vector<const Tensor*> InputsFromContext(OpKernelContext* ctx); 129 130 // Helper class to perform the marshalling of TensorFlow inputs and outputs to 131 // ShapedBuffers suitable for passing to an XLA computation. 132 class XlaComputationLaunchContext { 133 public: 134 // Create a new launch context. 'allocate_xla_tensors' is true if allocated 135 // output tensors and variables are always XlaTensors. If false they are 136 // assumed to be "normal" device pointers. 137 // If 'use_multiple_streams' is true, tensors may be defined and used on 138 // multiple streams and so se::Events must be defined and waited for. If 139 // 'use_multiple_streams' is true, 'allocate_xla_tensors' must also be true 140 // because we track inter-stream dependencies through events inside XlaTensor 141 // objects. 142 XlaComputationLaunchContext(xla::LocalClient* client, 143 se::DeviceMemoryAllocator* xla_allocator, 144 int device_ordinal, bool allocate_xla_tensors, 145 bool use_multiple_streams); 146 147 // Builds a XlaCompiler::Argument vector from the arguments to an XlaLaunch 148 // op. 149 // Precondition: variables in `variable_args` are locked. 150 static StatusOr<std::vector<XlaCompiler::Argument>> BuildXlaCompilerArguments( 151 absl::Span<int const> must_be_constant_idxs, 152 absl::Span<const Tensor* const> inputs, 153 absl::Span<VariableInfo const> variable_args, Device* device); 154 155 // Add all inputs within `ctx` as XLA arguments (returned by arguments()). 156 // `variables` is a map from TensorFlow argument number to resource variable. 157 // 158 // Assumes that the first `missing_ctx_input_prefix` inputs to the kernel are 159 // missing and adjusts input indices accordingly. All elements in kernel's 160 // input_mapping must be greater than or equal to `missing_ctx_input_prefix` 161 // (in other words, no inputs actually required by the kernel can be missing). 162 StatusOr<std::vector<xla::ExecutionInput>> PopulateInputs( 163 OpKernelContext* ctx, 164 const XlaCompiler::CompilationResult* compilation_result, 165 const std::map<int, const Tensor*>& resource_vars, 166 int missing_ctx_input_prefix, 167 const xla::HloInputOutputAliasConfig& input_output_alias); 168 169 // Given the XLA output in `output`, populate all outputs of `ctx`. Also 170 // writes out the resource variable updates. 171 // 172 // Updates to all resource variables are written in a single atomic operation. 173 // This models *->Write dependencies between resource variable operations. 174 // See jit/resource_operation_safety_analysis for details. 175 // 176 // 177 // Assumes that the first `missing_ctx_input_prefix` inputs to the 178 // compilation_result are missing and adjusts input indices accordingly. 179 Status PopulateOutputs( 180 OpKernelContext* ctx, 181 const XlaCompiler::CompilationResult* compilation_result, 182 xla::ScopedShapedBuffer output, int missing_ctx_input_prefix, 183 absl::Span<VariableInfo> variable_infos, 184 const xla::HloInputOutputAliasConfig& input_output_alias, 185 const std::map<int, const Tensor*>& resource_vars); 186 187 private: 188 xla::LocalClient* client_; 189 se::DeviceMemoryAllocator* xla_allocator_; 190 bool allocate_xla_tensors_; 191 bool use_multiple_streams_; 192 int device_ordinal_; 193 }; 194 195 // A simple TensorBuffer implementation that allows us to create Tensors that 196 // take ownership of pre-allocated memory. 197 class XlaTensorBuffer : public TensorBuffer { 198 public: XlaTensorBuffer(const void * ptr,size_t expected_size,size_t actual_size,Allocator * allocator)199 XlaTensorBuffer(const void* ptr, size_t expected_size, size_t actual_size, 200 Allocator* allocator) 201 : TensorBuffer(const_cast<void*>(ptr)), 202 expected_size_(expected_size), 203 actual_size_(actual_size), 204 allocator_(allocator) {} 205 ~XlaTensorBuffer()206 ~XlaTensorBuffer() override { 207 if (data()) { 208 allocator_->DeallocateRaw(data()); 209 } 210 } 211 size()212 size_t size() const override { return expected_size_; } 213 root_buffer()214 TensorBuffer* root_buffer() override { return this; } 215 FillAllocationDescription(AllocationDescription * proto)216 void FillAllocationDescription(AllocationDescription* proto) const override { 217 proto->set_requested_bytes(static_cast<int64>(expected_size_)); 218 proto->set_allocator_name(allocator_->Name()); 219 proto->set_ptr(reinterpret_cast<uintptr_t>(data())); 220 if (allocator_->TracksAllocationSizes()) { 221 auto ab = static_cast<int64>(allocator_->AllocatedSize(data())); 222 proto->set_allocated_bytes(ab); 223 int64_t id = allocator_->AllocationId(data()); 224 if (id > 0) { 225 proto->set_allocation_id(id); 226 } 227 if (RefCountIsOne()) { 228 proto->set_has_single_reference(true); 229 } 230 } 231 } 232 233 private: 234 size_t expected_size_; 235 size_t actual_size_; 236 Allocator* allocator_; 237 }; 238 239 } // namespace tensorflow 240 241 #endif // TENSORFLOW_COMPILER_JIT_XLA_LAUNCH_UTIL_H_ 242