/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_XLA_DEBUG_INFO_MANAGER_H_
#define TENSORFLOW_COMPILER_XLA_SERVICE_XLA_DEBUG_INFO_MANAGER_H_

#include <memory>
#include <set>
#include <vector>

#include "absl/container/flat_hash_map.h"
#include "tensorflow/compiler/xla/service/hlo.pb.h"
#include "tensorflow/compiler/xla/service/hlo_module.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/platform/mutex.h"
#include "tensorflow/core/platform/thread_annotations.h"

namespace xla {

using ModuleIdentifier = string;

struct XlaModuleDebugInfo {
  ModuleIdentifier module_id;
  // The HLO proto associated with this XLA program.
  std::unique_ptr<HloProto> hlo_proto;
  // TODO(b/133503446): We might need to add performance info from cost
  // analysis and DeviceDescription, which contains peak memory bandwidth,
  // clock speed, core count, and other device characteristics.
};

// The debug info manager keeps track of all the debug information (symbol
// table, HLO proto, etc.) during a tracing period. Because a tracing period
// can start while modules are already executing, we maintain a minimum level
// of monitoring even when tracing is off (i.e. which programs have run
// recently).
// Multiple programs may share the same module_id; from the tracing debug
// information perspective, the same module_id implies the same debug
// information, so we only keep track of unique debug information, identified
// by module_id.
// This class is thread-safe.
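//
// A rough usage sketch (illustrative only; `module_id`, `hlo_module`, and
// `buffer_assignment` below are placeholder values supplied by the caller):
//
//   auto* manager = XlaDebugInfoManager::Get();
//   manager->RegisterModule(module_id, hlo_module, buffer_assignment);
//   manager->OnModuleStart(module_id);   // the module begins executing
//   manager->StartTracing();             // tracing may start mid-execution
//   manager->OnModuleStop(module_id);
//   std::vector<XlaModuleDebugInfo> debug_info;
//   manager->StopTracing(&debug_info);   // collects info for traced modules
//   manager->UnregisterModule(module_id, hlo_module, buffer_assignment);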
class XlaDebugInfoManager {
 public:
  static XlaDebugInfoManager* Get() {
    static XlaDebugInfoManager* singleton = new XlaDebugInfoManager();
    return singleton;
  }

  // Registers an active module with the XlaDebugInfoManager. We keep track of
  // all existing HloModules within the process.
  // Modules with the same module_id can be registered and tracked separately.
  void RegisterModule(
      const ModuleIdentifier& module_id, std::shared_ptr<HloModule> hlo_module,
      std::shared_ptr<const BufferAssignmentProto> buffer_assignment);

  // Unregisters an active module. When the last active module with the same
  // module_id goes out of scope, we remove it from our database.
  // During tracing, however, we defer the cleanup until after serialization.
  void UnregisterModule(
      const ModuleIdentifier& module_id, std::shared_ptr<HloModule> hlo_module,
      std::shared_ptr<const BufferAssignmentProto> buffer_assignment);

  // Registers that the module has started execution on a certain device.
  // TODO(jiesun): Do we need to track which device this is?
  void OnModuleStart(ModuleIdentifier module_id);
  // Registers that the module has stopped execution on a certain device.
  void OnModuleStop(ModuleIdentifier module_id);

  // Starts tracing; begins collecting debug information for all modules that
  // run during the tracing period.
  void StartTracing();

  // Stops tracing and drops all instances that were stopped during tracing,
  // then drops all modules that have no instances registered. Dumps debug
  // information for all the running modules to module_debug_info if specified.
  void StopTracing(
      std::vector<XlaModuleDebugInfo>* module_debug_info = nullptr);

  friend class XlaDebugInfoManagerTest;

 private:
  XlaDebugInfoManager() {}

  // Test accessors.
  std::set<ModuleIdentifier> GetRunningModules() {
    tensorflow::mutex_lock lock(mutex_);
    std::set<ModuleIdentifier> running;
    for (const auto& id : running_module_ids_) {
      running.insert(id.first);
    }
    return running;
  }
  std::set<ModuleIdentifier> GetActiveModules() {
    tensorflow::mutex_lock lock(mutex_);
    std::set<ModuleIdentifier> active;
    for (const auto& id : active_modules_) {
      active.insert(id.first);
    }
    return active;
  }

  // We track each instance of GpuExecutable. Multiple GpuExecutables can have
  // the same unique id if they are actually the same program. From the symbol
  // table's perspective they are identical, but for lifetime tracking they
  // need to be tracked separately.
  struct XlaModuleInstance {
    XlaModuleInstance(std::shared_ptr<HloModule> m,
                      std::shared_ptr<const BufferAssignmentProto> b)
        : hlo_module(std::move(m)), buffer_assignment(std::move(b)) {}
    std::shared_ptr<HloModule> hlo_module;
    std::shared_ptr<const BufferAssignmentProto> buffer_assignment;
    bool active = true;
  };

  // Each XlaModuleEntry can have multiple XlaModuleInstances if XLA registers
  // them with the same ModuleIdentifier.
  struct XlaModuleEntry {
    // The module symbol table/debug info shared by all instances.
    ModuleIdentifier module_id;
    std::vector<XlaModuleInstance> instances;
  };

  tensorflow::mutex mutex_;
  bool tracing_active_ TF_GUARDED_BY(mutex_) = false;
  // Modules that are currently running. Because multiple instances of a
  // module can be running at the same time, a reference count is maintained
  // as the map value.
  absl::flat_hash_map<ModuleIdentifier, int> running_module_ids_
      TF_GUARDED_BY(mutex_);
  // Active modules are those still tracked by us. There can be many more
  // active modules than running modules; we reduce the trace size by only
  // transferring the modules that were running during the tracing period.
  absl::flat_hash_map<ModuleIdentifier, XlaModuleEntry> active_modules_
      TF_GUARDED_BY(mutex_);
};

}  // namespace xla

#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_XLA_DEBUG_INFO_MANAGER_H_