1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved. 2 3 Licensed under the Apache License, Version 2.0 (the "License"); 4 you may not use this file except in compliance with the License. 5 You may obtain a copy of the License at 6 7 http://www.apache.org/licenses/LICENSE-2.0 8 9 Unless required by applicable law or agreed to in writing, software 10 distributed under the License is distributed on an "AS IS" BASIS, 11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 See the License for the specific language governing permissions and 13 limitations under the License. 14 ==============================================================================*/ 15 16 #ifndef TENSORFLOW_CORE_COMMON_RUNTIME_COLOCATION_GRAPH_H_ 17 #define TENSORFLOW_CORE_COMMON_RUNTIME_COLOCATION_GRAPH_H_ 18 19 #include <unordered_map> 20 #include <vector> 21 22 #include "absl/strings/str_join.h" 23 #include "tensorflow/core/common_runtime/device.h" 24 #include "tensorflow/core/common_runtime/inspecting_placer.h" 25 #include "tensorflow/core/common_runtime/placer_inspection_required_ops_utils.h" 26 #include "tensorflow/core/framework/function.h" 27 #include "tensorflow/core/framework/types.h" 28 #include "tensorflow/core/lib/core/stringpiece.h" 29 #include "tensorflow/core/util/device_name_utils.h" 30 #include "tensorflow/core/util/port.h" 31 32 namespace tensorflow { 33 34 // Represents a node in the disjoint node forest and the 35 // accumulated constraints on the device used by that node. 36 class Member { 37 public: 38 Member() = default; 39 40 Status SetParentAndSupportedDevices( 41 const Node& node, const std::vector<DeviceType>& types, 42 const DeviceNameUtils::ParsedName* local_address_spec); 43 requested_device_name()44 const DeviceNameUtils::ParsedName& requested_device_name() const { 45 return requested_device_name_; 46 } 47 48 Status SetAssignedDeviceName(const string& device_name); 49 Status SetResourceDeviceName(const Node& node); 50 Status SetRequestedDeviceName(const Node& node); 51 52 Status FillPossibleDevices(PossibleDevices* possible_device) const; 53 54 // Returns whether `src_root` is assigned to a CompositeDevice and `this` is 55 // assigned to a physical device. 56 bool IsEdgeFromCompositeDeviceToPhysicalDevice(const Member& src_root) const; 57 58 Status EnsureCompatibilityAcrossResourceEdge( 59 const Node& src, const Member& src_root, 60 const Node& dst, /*dst_root is this*/ 61 bool log_device_placement); 62 supported_device_types()63 const PrioritizedDeviceTypeVector& supported_device_types() const { 64 return supported_device_types_; 65 } 66 67 // If `dry_run` is true, just sets `new_root` and `old_root` and does not 68 // actually modify anything in the `tree`. 69 static void Merge(std::vector<Member>* tree, int x_root, int y_root, 70 Member** new_root, Member** old_root, bool dry_run); 71 72 // Returns the root node of the disjoint tree to which the node with the 73 // given id is connected. 74 // FindRoot should be called only for debugging or after the members have 75 // been updated with direct root pointers because it does not update 76 // root pointers and can traverse many links. It exists to have 77 // a const version of FindAndUpdateRoot 78 static int FindRoot(const std::vector<Member>& tree, int node_id); 79 static int FindAndUpdateRoot(std::vector<Member>* tree, int node_id); 80 81 Status MergeDeviceNames(const Member& other, bool allow_soft_placement); 82 83 // Updates this to contain the intersection of the device types in 84 // this and "other". If the intersection is empty, returns false and does 85 // not update this. Else returns true and updates this. 86 bool MergeSupportedDevices(const Member& other); 87 88 Status AssignDevice(const Node& node); 89 90 // If user does not explicitly request XLA device and non-XLA device is 91 // supported for this node, use only the non-XLA device. See b/140896502. 92 void MaybeExcludeXlaDevices(); 93 94 // Limit the possible devices of this (should be a root) to the device 95 // specifications in `devices`. 96 Status LimitToPossibleDevices(const PossibleDevices& devices, 97 bool allow_soft_placement); 98 set_possible_devices(std::vector<Device * > && devices)99 void set_possible_devices(std::vector<Device*>&& devices) { 100 possible_devices_ = devices; 101 } possible_devices()102 const std::vector<Device*>& possible_devices() { return possible_devices_; } 103 104 // Returns a (parsed) device name that is based on requested_device_name() 105 // but with potentially cleared device type and ID fields. A field is cleared 106 // if the assigned_device_name does not specify it. If it does, the field 107 // is not cleared because soft placement cannot violate assigned device names. 108 DeviceNameUtils::ParsedName GetSoftDeviceName() const; 109 110 // Same as GetSoftDeviceName but device type and device ID fields are not 111 // cleared if resource device has them set. 112 DeviceNameUtils::ParsedName GetPreferredSoftDeviceName() const; 113 114 string DebugString() const; 115 116 private: 117 // Updates this to contain the intersection of the device types in 118 // this and `other_devices`. 119 bool MergeSupportedDevices(const PrioritizedDeviceTypeVector& other_devices); 120 121 // The id of the node that is the parent of this one, or its own 122 // id if it is a root. parent <= 0 indicates that this member is invalid. 123 int parent_ = -1; 124 125 // A proxy for the depth of the tree that is used to prefer 126 // connecting smaller trees to larger trees when merging disjoint 127 // sets. 128 int rank_ = 0; 129 130 // Once colocation groups have been formed, the Placer starts actually 131 // choosing devices. All nodes in a group must be assigned to the same 132 // device. Once we assigned the first device to some node in this group, 133 // we set assigned_device_name_index to this device name's index in the 134 // graph. 135 // The `*_device_name_` fields will contain the parsed name of this device 136 // and `possible_devices`, if computed, will contain just this device. 137 // `assigned_device_name_index` is an optimization to avoid parsing and 138 // comparing device names. The value of -1 signals that a single device 139 // has not been chosen yet. 140 int assigned_device_name_index_ = -1; 141 142 // The merged form of the device requested for this node, with those of all of 143 // its children. requested_device_name_ is always kept a specialization (i.e. 144 // DeviceNameUtils::IsSpecification) of assigned_device_name_. When no device 145 // is requested, this field is set to assigned_device_name_. As a 146 // specialization of assigned_device_name_, requested_device_name_ represents 147 // the most specific form of all assigned and requested devices of this node 148 // and its children, if this node is a root. requested_device_name_ is used 149 // to finally select devices for nodes. We can override requested devices due 150 // to resource colocation constraints but not assigned devices (unless soft 151 // placement is on). 152 // INVARIANT: requested_device_name_ is always kept a 153 // DeviceNameUtils::IsSpecification of assigned_device_name_ and 154 // resource_device_name_. This makes requested_device_name_ the "accumulation 155 // of all wishes" about the device. 156 DeviceNameUtils::ParsedName requested_device_name_; 157 158 // The merged form of the device assigned for this node, with 159 // those of all of its children. 160 // This field is used to raise errors due to unsatisfiable constraints. 161 // Can be a partial specification. 162 DeviceNameUtils::ParsedName assigned_device_name_; 163 164 // The merged form of the requested resource device assigned for this node, 165 // with those of all of its children. 166 // This field is used to raise errors due to unsatisfiable constraints. 167 // Can be a partial specification. 168 // resource_device_name_ is initialized with user-requested device on nodes 169 // producing resources, e.g. VarHandleOp. 170 // For historical reasons, with soft placement enabled, Placer can "move" 171 // resources (place resource producing ops on a device different from what 172 // the user explicitly requested) when the colocation group of a resource 173 // producing op contains ops that are not supported on the user-requested 174 // resource device. A classic example of this is a sparse optimizer (only 175 // supported on CPU) used on a GPU variable. In this case, the whole group 176 // will be assigned to some device supported by all ops in the colocation 177 // group. This is a surprising and unfortunate behavior because: 178 // 1. Since soft_placement is on by default, users don't know that their 179 // variables are created on a different device than what they requested. 180 // Among other things, this can lead to surprising poor performance. 181 // 2. Eager runtime cannot "move" resources. The same code can "work" when 182 // wrapped in tf.function but will fail when run eagerly. 183 // 3. Extra complexity here to preserve these resource moving capabilities. 184 DeviceNameUtils::ParsedName resource_device_name_; 185 186 // The intersection of all device types supported by this node, 187 // and those of all of its children, in priority order 188 // of the preferred device. 189 // It is possible that supported_device_types_ has an empty intersection with 190 // requested/assigned/resource devices. We could have detected such cases 191 // as soon as they happen and raise an error. Instead, for historical reasons, 192 // we leave such error detection to the final device picking stage. 193 PrioritizedDeviceTypeVector supported_device_types_; 194 195 // If this node is a root, stores a list of Devices to which this node 196 // and all of its children can be assigned. 197 // `possible_devices` is empty if they have not yet been computed. 198 std::vector<Device*> possible_devices_; 199 }; 200 201 // This class maintains the connected components of a colocation 202 // constraint graph, and uses this information to assign a satisfying 203 // device placement to the nodes of the graph. 204 // 205 // This implementation uses the Union-Find algorithm to efficiently maintain the 206 // connected components and incrementally adds edges via 207 // ColocationGraph::ColocateNodes() invocations. 208 // 209 // ColocationGraph does not assign any devices to graph nodes. The 210 // `log_device_placement` argument is used to log messages when requested 211 // device is ignored. 212 class ColocationGraph { 213 public: 214 // graph, flib_def, and device_set must not be null and must outlive 215 // this ColocationGraph. default_local_device can be null. If not, must 216 // outlive this. 217 ColocationGraph(const Graph* graph, const FunctionStack& stack, 218 const FunctionLibraryDefinition* flib_def, 219 const DeviceSet* device_set, 220 const Device* default_local_device, bool allow_soft_placement, 221 bool log_device_placement); 222 223 Status Initialize(); 224 members()225 const std::vector<Member>& members() const { return members_; } 226 227 // Limit the group containing `node` to the device specifications in 228 // `devices`. 229 Status LimitToPossibleDevices(const Node& node, 230 const PossibleDevices& devices); 231 232 // Limits the possible devices of `node`'s colocation group to the device 233 // to which `node` is assigned. This makes sure that all nodes in this 234 // colocation group will be assigned to the same device. Without this 235 // explicit restriction, heuristics can choose a different possible device 236 // for other nodes in the group. 237 Status LimitToAssignedDevice(const Node& node); 238 239 // Returns the root node of the disjoint tree to which the node with the 240 // given id is connected. 241 // Updates the internal pointers so that future calls will returns faster. FindAndUpdateRoot(int node_id)242 int FindAndUpdateRoot(int node_id) { 243 return Member::FindAndUpdateRoot(&members_, node_id); 244 } 245 246 // For the given node, subject to the constraints previously given 247 // to this ColocationGraph, set its assigned_device_name. Returns OK 248 // if a satisfying device can be found, otherwise an error. 249 // 250 // Note: This method returns a pointer to a field within members_. 251 // The caller must not use the returned pointer after there is any possibility 252 // that the members_[i].possible_devices field has been modified. 253 Status GetDevicesForNode(Node* node, 254 const std::vector<Device*>** possible_devices); 255 256 // Returns debugging info for the node referred to by 'node_root'. 257 string DebugInfo(const int node_root) const; 258 259 string DebugString() const; 260 261 // Returns a list of devices having type in supported_device_types. The 262 // returned list is sorted by preferred type (higher numeric type is 263 // preferred). 264 static std::vector<Device*> FilterSupportedDevices( 265 const std::vector<Device*>& devices, 266 const PrioritizedDeviceTypeVector& supported_device_types, 267 const Device* default_local_device); 268 269 private: 270 // Adds each node of the Graph to this ColocationGraph as a singleton. 271 // 272 // NOTE: The implementation assumes that the ids of nodes passed to 273 // this method are dense and zero-based; the memory used will be linear in 274 // the largest node ID. 275 // NOTE: If this method returns an error, *this is left in an undefined 276 // state. 277 Status ColocateAllNodes(); 278 279 Status ColocateResourceOrRefEdge(const Node* src, const Node* dst); 280 281 // Updates this ColocationGraph by making sure that all nodes 282 // touching resource and/or ref tensors are colocated. 283 // As it iterates over the edges, fills the `inspection_required` set with 284 // the nodes that 285 // PlacerInspectionRequiredOpChecker::IsPlacerInspectionRequired 286 // deems as requiring deep inspection by placer. This is an optimization. 287 Status ColocateResourceAndRefEdges( 288 std::unordered_set<Node*>* inspection_required); 289 290 // Updates this ColocationGraph by making sure that all nodes having inputs of 291 // a DT_VARIANT data type with a host-only underlying types (e.g. strings) can 292 // be placed only on CPU device. We do that by reverse-DFS traversal from all 293 // nodes that take variant inputs to the node that produces that variant. 294 // TODO(ezhulenev): This function does not yet support "deep op" inspection, 295 // that we have for DT_RESOURCE edges. 296 Status AddHostOnlyDataTypesConstraints(); 297 298 Status AddInspectionConstraints( 299 const std::unordered_set<Node*>& inspection_required); 300 301 // Applies colocation groups for `node`'s inputs and outputs to this 302 // ColocationGraph. 303 // `groups` are the colocation groups to which `nodes`'s inputs and outputs 304 // belong. 305 // `node` is a node requiring deep inspection (e.g. a node calling 306 // a function) 307 // 308 // For example, consider a `node` taking two inputs and producing one output 309 // a b 310 // | | 311 // v v 312 // node 313 // | 314 // v 315 // c 316 // 317 // `groups` can tell us that `a` and `c` must be colocated and their device 318 // must be a GPU. `b` might be in a group by itself without any device 319 // restrictions. 320 // 321 // ApplyIOColocationGroups will have an effect of calling 322 // ColocateNodes(a, c) and LimitToPossibleDevices(`a`, "GPU"). The colocation 323 // group of the `node` itself is not directly impacted. 324 // 325 Status ApplyIOColocationGroups(const IOColocationGroups& groups, 326 const Node& node); 327 328 Status ColocateNodeToGroup( 329 std::unordered_map<StringPiece, const Node*, StringPieceHasher>* 330 colocation_group_root, 331 const Node* node, StringPiece colocation_group); 332 333 // Merge the (possibly disjoint) sets containing nodes "x" and 334 // "y". Returns OK if the all nodes in the union of these sets can 335 // be placed on the same device type. 336 // 337 // If this method returns an error, *this is unchanged. 338 Status ColocateNodes(const Node& x, const Node& y); 339 340 // This overload of ColocateNodes() allows a caller to provide the root node 341 // ids for the two nodes. For large graphs, this noticeably reduces the 342 // graph load time. 343 // If this method returns an error, *this is unchanged. 344 Status ColocateNodes(const Node& x, int x_root, const Node& y, int y_root); 345 346 void GetSoftDeviceCandidates(const Node& node, const Member& root_member, 347 int root_id, 348 std::vector<Device*>* possible_devices); 349 350 Status InitializeMembers(); 351 352 Status InitializeMemberWithAssignedDevice(const string& assigned_device_name, 353 const string& node_type, 354 Member* member); 355 356 Status InitializeMember(const Node& node, Member* member); 357 358 // Returns the root node of the disjoint tree to which the node with the 359 // given id is connected. 360 // FindRoot should be called only for debugging or after the members have 361 // been updated with direct root pointers because it does not update 362 // root pointers and can traverse many links. It exists to have 363 // a const version of FindAndUpdateRoot FindRoot(int node_id)364 int FindRoot(int node_id) const { 365 return Member::FindRoot(members_, node_id); 366 } 367 368 const Graph& graph_; 369 const FunctionStack stack_; 370 const FunctionLibraryDefinition& flib_def_; 371 std::vector<Member> members_; 372 InspectingPlacer inspecting_placer_; 373 PlacerInspectionRequiredOpChecker inspection_required_checker_; 374 const DeviceSet& device_set_; 375 const std::vector<DeviceType> device_types_; 376 const DeviceNameUtils::ParsedName local_address_spec_; 377 const Device* default_local_device_; 378 const bool allow_soft_placement_; 379 const bool log_device_placement_; 380 381 TF_DISALLOW_COPY_AND_ASSIGN(ColocationGraph); 382 }; 383 384 } // namespace tensorflow 385 386 #endif // TENSORFLOW_CORE_COMMON_RUNTIME_COLOCATION_GRAPH_H_ 387