1 /* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15
16 #include "tensorflow/compiler/xla/service/fusion_node_indexing_evaluation.h"
17
18 #include "absl/container/flat_hash_map.h"
19 #include "absl/container/flat_hash_set.h"
20 #include "tensorflow/compiler/xla/service/hlo_computation.h"
21 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
22 #include "tensorflow/compiler/xla/types.h"
23 #include "tensorflow/core/platform/logging.h"
24
25 namespace xla {
26
FusionNodeIndexingEvaluation(const HloInstruction * fusion,int64 root_usage_count)27 FusionNodeIndexingEvaluation::FusionNodeIndexingEvaluation(
28 const HloInstruction* fusion, int64 root_usage_count)
29 : fusion_(fusion) {
30 HloInstruction* root = fusion->fused_expression_root();
31 indexing_users_[root].insert(fusion);
32 index_usage_count_[fusion] = root_usage_count;
33 RecomputeCache();
34 }
35
36 // This constant is arbitrarily chosen. Essentially we don't want to have too
37 // much code duplication, because it slows down the compilation time. There is
38 // a tradeoff between compilation time and runtime here.
39 const int64 FusionNodeIndexingEvaluation::kAllowedCodeDuplication = 15;
40
41 namespace {
42
43 // Returns which ops invalidate the cache of emitted instructions by creating a
44 // new BasicBlock and setting the insertion point to the newly created
45 // BasicBlock. We can only reuse cached values if they were emitted in the same
46 // BasicBlock as the current BasicBlock.
OpInvalidatesCache(const HloInstruction * hlo)47 bool OpInvalidatesCache(const HloInstruction* hlo) {
48 switch (hlo->opcode()) {
49 // This list of ops was created by inspecting the code. There is no
50 // guarantee that it is complete.
51 case HloOpcode::kConcatenate:
52 case HloOpcode::kDot:
53 case HloOpcode::kDynamicUpdateSlice:
54 case HloOpcode::kPad:
55 case HloOpcode::kReduce:
56 case HloOpcode::kReduceWindow:
57 return true;
58 default:
59 return false;
60 }
61 }
62
63 // Counts the number of "real" users of 'hlo'. When 'hlo' has a fusion node as
64 // user, we consider the users of the fusion parameter corresponding to 'hlo' as
65 // the real users.
UserCount(const HloInstruction * hlo)66 int64 UserCount(const HloInstruction* hlo) {
67 int64 cnt = 0;
68 for (HloInstruction* user : hlo->users()) {
69 if (user->opcode() == HloOpcode::kFusion) {
70 // Count the number of users of the parameter corresponding to the fusion
71 // operand.
72 int64 operand_index = user->operand_index(hlo);
73 cnt += user->fused_parameter(operand_index)->user_count();
74 } else {
75 ++cnt;
76 }
77 }
78 return cnt;
79 }
80 } // namespace
81
CodeDuplicationTooHigh(const HloInstruction * producer) const82 bool FusionNodeIndexingEvaluation::CodeDuplicationTooHigh(
83 const HloInstruction* producer) const {
84 int64 emitted_instructions = EvaluateEmittedInstructions(producer);
85 return emitted_instructions > kAllowedCodeDuplication ||
86 (OpInvalidatesCache(producer) &&
87 (emitted_instructions > 1 || UserCount(producer) > 1));
88 }
89
MaxCodeDuplicationTooHigh() const90 bool FusionNodeIndexingEvaluation::MaxCodeDuplicationTooHigh() const {
91 for (const auto& entry : index_usage_count_) {
92 if (entry.second > kAllowedCodeDuplication ||
93 (OpInvalidatesCache(entry.first) &&
94 (entry.second > 1 || UserCount(entry.first) > 1))) {
95 return true;
96 }
97 }
98 return false;
99 }
100
EvaluateEmittedInstructions(const HloInstruction * producer) const101 int64 FusionNodeIndexingEvaluation::EvaluateEmittedInstructions(
102 const HloInstruction* producer) const {
103 int64 total = 0;
104 for (const auto* user : indexing_users_.at(producer)) {
105 total += index_usage_count_.at(user);
106 }
107 return total;
108 }
109
UpdateEvaluationCache(const HloInstruction * producer,absl::flat_hash_set<const HloInstruction * > indexing_users_of_producer)110 void FusionNodeIndexingEvaluation::UpdateEvaluationCache(
111 const HloInstruction* producer,
112 absl::flat_hash_set<const HloInstruction*> indexing_users_of_producer) {
113 CHECK(!indexing_users_.contains(producer));
114 indexing_users_[producer] = std::move(indexing_users_of_producer);
115 UpdateIndexUsageCount(producer);
116 UpdateIndexingUsersOfOperands(producer);
117 }
118
119 absl::flat_hash_set<const HloInstruction*>
RemoveFusionOperand(HloInstruction * fusion_operand)120 FusionNodeIndexingEvaluation::RemoveFusionOperand(
121 HloInstruction* fusion_operand) {
122 auto indexing_users_of_operand =
123 std::move(indexing_users_.at(fusion_operand));
124 indexing_users_.erase(fusion_operand);
125 CHECK(!index_usage_count_.contains(fusion_operand));
126 return indexing_users_of_operand;
127 }
128
RecomputeCache()129 void FusionNodeIndexingEvaluation::RecomputeCache() {
130 auto postorder =
131 fusion_->fused_instructions_computation()->MakeInstructionPostOrder();
132 std::reverse(postorder.begin(), postorder.end());
133 for (const auto* instruction : postorder) {
134 if (instruction->opcode() == HloOpcode::kParameter) {
135 continue;
136 }
137 UpdateIndexUsageCount(instruction);
138 UpdateIndexingUsersOfOperands(instruction);
139 }
140 }
141
UpdateIndexUsageCount(const HloInstruction * instruction)142 void FusionNodeIndexingEvaluation::UpdateIndexUsageCount(
143 const HloInstruction* instruction) {
144 int64 total = 0;
145 for (const auto* user : indexing_users_[instruction]) {
146 total += index_usage_count_.at(user);
147 }
148 CHECK(index_usage_count_.emplace(instruction, total).second);
149 }
150
UpdateIndexingUsersOfOperands(const HloInstruction * instruction)151 void FusionNodeIndexingEvaluation::UpdateIndexingUsersOfOperands(
152 const HloInstruction* instruction) {
153 for (const auto* operand : instruction->operands()) {
154 if (operand->opcode() == HloOpcode::kParameter) {
155 // Although actually the parameter gets indexed, we store it as indexing
156 // of the corresponding fusion operand instead because parameter
157 // instruction pointers can be invalidated when we fuse another
158 // instruction into 'fusion_'.
159 operand = fusion_->operand(operand->parameter_number());
160 }
161 // For simplicity we assume that all shape and layout changing
162 // operations except Transposes invalidate index reuse. Transposes are
163 // special: although they are shape changing, we can reuse the
164 // multi-dimensional index for the operand by permuting it.
165 if (instruction->opcode() == HloOpcode::kTranspose ||
166 Shape::Equal().IgnoreElementType()(operand->shape(),
167 instruction->shape())) {
168 // If the index is reused, it means the operand gets index values
169 // from the same set of (indirect) users as 'instruction' itself.
170 indexing_users_[operand].insert(indexing_users_[instruction].begin(),
171 indexing_users_[instruction].end());
172 } else {
173 // If the index is not reused, it means 'instruction' computes a
174 // new index derived from the index it gets.
175 indexing_users_[operand].insert(instruction);
176 }
177 }
178 }
179
180 } // namespace xla
181