• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
15 
#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_XLA_DEBUG_INFO_MANAGER_H_
#define TENSORFLOW_COMPILER_XLA_SERVICE_XLA_DEBUG_INFO_MANAGER_H_

#include <memory>
#include <set>
#include <string>
#include <vector>

#include "absl/container/flat_hash_map.h"
#include "tensorflow/compiler/xla/service/hlo.pb.h"
#include "tensorflow/compiler/xla/service/hlo_module.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/platform/mutex.h"

namespace xla {

// Unique key identifying an XLA program's debug information.
// Spelled std::string explicitly rather than the unqualified `string`,
// which only resolves through a transitive using-alias.
using ModuleIdentifier = std::string;

28 struct XlaModuleDebugInfo {
29   ModuleIdentifier module_id;
30   // The hlo proto associated with this xla program.
31   std::unique_ptr<HloProto> hlo_proto;
32   // TODO(b/133503446): We might need add performance info from cost analysis
33   // and DeviceDescription which contains peak memory bandwidth, clock speed,
34   // core count, and other device characteristics.
35 };
36 
37 // Debug info manager keeps track of all the debug information (symbol table,
38 // HLO proto etc) during tracing period. Because tracing period can start
39 // during module execution, therefore even when tracing is off, we still need
40 // minimum level of monitoring (i.e. which program is running lately).
41 // We allow multiple programs with the same module_id, however from tracing
42 // debug information perspective, same module id implies the same debug
43 // information. We will only keep track unique debug information, identified
44 // by module_id.
45 // This class is thread-safe.
46 class XlaDebugInfoManager {
47  public:
Get()48   static XlaDebugInfoManager* Get() {
49     static XlaDebugInfoManager* singleton = new XlaDebugInfoManager();
50     return singleton;
51   }
52 
53   // Register an active module to XlaDebugInfoManager. We will keep track all
54   // existing HloModules within the process.
55   // Modules with same module id can be registered and tracked separately.
56   void RegisterModule(
57       const ModuleIdentifier& module_id, std::shared_ptr<HloModule> hlo_module,
58       std::shared_ptr<const BufferAssignmentProto> buffer_assignment);
59 
60   // Unregister an active module. When the last active module of the same
61   // module id is out of scope, we remove it from our database.
62   // However during tracing, we will defer the cleanup after serialization.
63   void UnregisterModule(
64       const ModuleIdentifier& module_id, std::shared_ptr<HloModule> hlo_module,
65       std::shared_ptr<const BufferAssignmentProto> buffer_assignment);
66 
67   // Register when the module start execution on certain device.
68   // TODO(jiesun): Do we need to track which device this is?
69   void OnModuleStart(ModuleIdentifier module_id);
70   // Register when the module stop execution on certain device.
71   void OnModuleStop(ModuleIdentifier module_id);
72 
73   // Start tracing, began to collecting debug information for all the running
74   // modules during the tracing period.
75   void StartTracing();
76 
77   // Stop tracing and drop all instances that have been stoped during tracing,
78   // Then drop all modules that have no instances registered. Dump debug
79   // information for all the running modules to module_debug_info if specified.
80   void StopTracing(
81       std::vector<XlaModuleDebugInfo>* module_debug_info = nullptr);
82 
83   friend class XlaDebugInfoManagerTest;
84 
85  private:
XlaDebugInfoManager()86   XlaDebugInfoManager() {}
87 
88   // Test accessors.
GetRunningModules()89   std::set<ModuleIdentifier> GetRunningModules() {
90     tensorflow::mutex_lock lock(mutex_);
91     std::set<ModuleIdentifier> running;
92     for (const auto& id : running_module_ids_) {
93       running.insert(id.first);
94     }
95     return running;
96   }
GetActiveModules()97   std::set<ModuleIdentifier> GetActiveModules() {
98     tensorflow::mutex_lock lock(mutex_);
99     std::set<ModuleIdentifier> active;
100     for (const auto& id : active_modules_) {
101       active.insert(id.first);
102     }
103     return active;
104   }
105 
106   // We track each instance of GpuExecutable. Assuming multiple GpuExecutable
107   // can have same unique id if they are actually same program. From the
108   // perspective of symbol table, they are identical, but for the life time
109   // tracking, they need to be tracked separately.
110   struct XlaModuleInstance {
XlaModuleInstanceXlaModuleInstance111     XlaModuleInstance(std::shared_ptr<HloModule> m,
112                       std::shared_ptr<const BufferAssignmentProto> b)
113         : hlo_module(std::move(m)), buffer_assignment(std::move(b)) {}
114     std::shared_ptr<HloModule> hlo_module;
115     std::shared_ptr<const BufferAssignmentProto> buffer_assignment;
116     bool active = true;
117   };
118 
119   // Each XlaModuleEntry can have multiple XlaModuleInstance's if XlA registers
120   // them with the same ModuleIdentifier.
121   struct XlaModuleEntry {
122     // The module symbol table/debug info that shared by all instances.
123     ModuleIdentifier module_id;
124     std::vector<XlaModuleInstance> instances;
125   };
126 
127   tensorflow::mutex mutex_;
128   bool tracing_active_ TF_GUARDED_BY(mutex_) = false;
129   // Modules that was running currently. Because multiple instances of the
130   // modules can be running in the same time, a reference count is maintained
131   // as map value.
132   absl::flat_hash_map<ModuleIdentifier, int> running_module_ids_
133       TF_GUARDED_BY(mutex_);
134   // Active modules are those still tracked by us. There could be much more
135   // active modules than running modules, we will try to reduce the trace size
136   // by only transfer those modules that were running during tracing period.
137   absl::flat_hash_map<ModuleIdentifier, XlaModuleEntry> active_modules_
138       TF_GUARDED_BY(mutex_);
139 };
}  // namespace xla

#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_XLA_DEBUG_INFO_MANAGER_H_