/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// For Google-internal use only.
#include "tensorflow/core/util/autotune_maps/autotune_serialize.h"

#include <map>
#include <set>
#include <string>
#include <unordered_map>
#include <vector>

#include "tensorflow/compiler/xla/stream_executor/dnn.pb.h"
#include "tensorflow/core/platform/str_util.h"
#include "tensorflow/core/util/activation_mode.h"
#include "tensorflow/core/util/autotune_maps/autotune_map.pb.h"
#include "tensorflow/core/util/autotune_maps/autotune_maps_utils.h"
#include "tensorflow/core/util/autotune_maps/conv_autotune_maps.h"
#include "tensorflow/core/util/autotune_maps/conv_parameters.h"
#include "tensorflow/core/util/autotune_maps/conv_parameters.pb.h"
#include "tensorflow/stream_executor/dnn.h"

namespace tensorflow {

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
namespace {
using stream_executor::dnn::AlgorithmConfig;
using stream_executor::dnn::AlgorithmConfigProto;
using stream_executor::dnn::AlgorithmDesc;
using stream_executor::dnn::AlgorithmProto;

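// Converts the in-memory autotune map for conv ops into a ConvMapProto,
// sorting and de-duplicating entries so that the resulting proto serializes
// deterministically.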
template <typename Op>
ConvMapProto ConvMapToProto(
    const AutotuneMap<ConvParameters, AutotuneEntry<Op>> &autotune_map) {
  ConvMapProto proto;

  // Deterministically sort the entries in the autotune map according to the
  // serialized string of ConvParametersProto, in order to enable
  // deterministic serialization. The actual order is meaningless.
  //
  // This step also filters out duplicate entries (differing only in
  // device_id) in the autotune map, so that there is only one entry for a
  // convolution operation with a specific GPU device type.
  std::map<string, ConvMapProto::Entry> sorted_map;

  for (auto const &p : autotune_map.GetMap()) {
    const ConvParameters &params = p.first;
    const ConvParametersProto &params_proto = params.proto();
    VLOG(1) << "Reading: " << params.ToString();

    ConvMapProto::Entry kv;
    *kv.mutable_key() = params_proto;

    if (p.second.is_algorithm_config()) {
      *kv.mutable_value() = p.second.GetAlgorithmConfig().ToProto();
    } else {
      const auto &runners = p.second.GetOpRunners();
      *kv.mutable_value()->mutable_algorithm() =
          runners.primary->ToAlgorithmDesc().ToProto();
      if (runners.no_scratch_fallback) {
        *kv.mutable_value()->mutable_algorithm_no_scratch() =
            runners.no_scratch_fallback->ToAlgorithmDesc().ToProto();
      }
    }

    sorted_map.insert(std::make_pair(
        autotune_maps_utils::SerializeProtoDeterministic(params_proto), kv));
  }

  for (auto const &p : sorted_map) {
    ConvMapProto::Entry *kv = proto.add_kv_pairs();
    *kv = p.second;
  }
  return proto;
}

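// Merges the entries of a serialized ConvMapProto into `autotune_map`,
// mapping each entry's device_identifier onto the ids of matching local GPU
// devices. Returns an Aborted error on a ConvParameters version mismatch and
// a NotFound error if no entry matches any local device.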
template <typename Op>
Status PopulateConvMap(
    const ConvMapProto &m,
    AutotuneMap<ConvParameters, AutotuneEntry<Op>> *autotune_map) {
  if (m.kv_pairs().size() == 0) {
    return OkStatus();
  }
  std::set<std::string> unmatched_device_ids;
  // Map device_id's to corresponding device_identifiers.
  std::vector<string> device_ids_map =
      autotune_maps_utils::GetDeviceIdToIdentifierMap();
  // Map device_identifiers to device_ids whose corresponding GPU devices have
  // the given device_identifier.
  std::unordered_map<string, std::vector<int>> device_identifiers_map;
  bool devices_matched = false;
  for (const ConvMapProto::Entry &kv : m.kv_pairs()) {
    const ConvParametersProto &params_proto = kv.key();
    // Abort the loading process whenever there is an entry whose version
    // number doesn't match the runtime version, because the autotune results
    // may be incorrect.
    if (params_proto.version() != ConvParameters::kVersion) {
      VLOG(1) << "ConvParametersProto with the incompatible version:"
              << params_proto.DebugString();
      return errors::Aborted(
          "Aborted because the loaded autotune results for convolution "
          "operations have a version different "
          "from runtime's version. Expected version: ",
          ConvParameters::kVersion,
          ". Actual version: ", params_proto.version());
    }

    auto iter = device_identifiers_map.find(params_proto.device_identifier());
    std::vector<int> device_ids;
    if (iter == device_identifiers_map.end()) {
      for (int i = 0; i < device_ids_map.size(); i++) {
        if (device_ids_map[i] == params_proto.device_identifier()) {
          device_ids.push_back(i);
        }
      }
      device_identifiers_map.insert(
          std::make_pair(params_proto.device_identifier(), device_ids));
    } else {
      device_ids = iter->second;
    }

    if (device_ids.empty()) {
      unmatched_device_ids.insert(params_proto.device_identifier());
    } else {
      devices_matched = true;
    }

    const AlgorithmConfigProto &algorithm_config_proto = kv.value();
    const AlgorithmDesc primary(algorithm_config_proto.algorithm());
    const absl::optional<AlgorithmDesc> fallback =
        algorithm_config_proto.has_algorithm_no_scratch()
            ? absl::optional<AlgorithmDesc>(
                  AlgorithmDesc(algorithm_config_proto.algorithm_no_scratch()))
            : absl::nullopt;

    for (int device_id : device_ids) {
      AutotuneEntry<Op> entry;
#if TENSORFLOW_USE_ROCM
      // ROCm doesn't yet support the OpRunner-based API, so for the time being
      // we still need legacy AlgorithmDesc entries in the autotune map.
      // Long-term, this should be folded into the next case.
      entry = AutotuneEntry<Op>(AlgorithmConfig(algorithm_config_proto));
#else
      entry = AutotuneEntry<Op>(primary, fallback);
#endif

      autotune_map->Insert(ConvParameters(device_id, params_proto), entry);
    }
  }

  if (!unmatched_device_ids.empty()) {
    LOG(WARNING) << "Unmatched device id's from AoT autotuning data: "
                 << str_util::Join(unmatched_device_ids, ", ")
                 << "; existing devices: "
                 << str_util::Join(device_ids_map, ", ");
  }

  // When no matching devices are found, the autotune maps are not populated.
  // Instead of silently reporting an OK status, return an error.
  if (!devices_matched) {
    return errors::NotFound("No matching devices found for ",
                            str_util::Join(device_ids_map, ", "));
  }
  return OkStatus();
}

}  // namespace
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

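// Serializes the conv and fused-conv autotune maps into `output` using
// deterministic proto serialization; without CUDA/ROCm support the result is
// an empty AutotuneMapsProto.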
Status SerializeAutotuneMaps(std::string *output) {
  AutotuneMapsProto proto;
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
  *proto.mutable_conv_map() = ConvMapToProto(*ConvAutotuneMap::GetInstance());
  *proto.mutable_fused_conv_map() =
      ConvMapToProto(*FusedConvAutotuneMap::GetInstance());
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
  *output = autotune_maps_utils::SerializeProtoDeterministic(proto);
  return OkStatus();
}

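// Parses `s` as an AutotuneMapsProto and populates the conv and fused-conv
// autotune maps from it; a no-op on builds without CUDA or ROCm support.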
Status LoadSerializedAutotuneMaps(absl::string_view s) {
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
  AutotuneMapsProto proto;
  // The explicit string conversion here is a workaround for the fact that the
  // OSS proto library's ParseFromString only accepts std::string.
  if (!proto.ParseFromString(string(s))) {
    return errors::InvalidArgument(
        "Failed to parse the autotune maps from string.");
  }
  TF_RETURN_IF_ERROR(
      PopulateConvMap(proto.conv_map(), ConvAutotuneMap::GetInstance()));
  TF_RETURN_IF_ERROR(PopulateConvMap(proto.fused_conv_map(),
                                     FusedConvAutotuneMap::GetInstance()));
  // TODO(b/189530096): Populate autotune maps for more ops.
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
  return OkStatus();
}

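// Clears all entries from the conv and fused-conv autotune maps.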
void ResetAutotuneMaps() {
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
  ConvAutotuneMap::GetInstance()->ClearMap();
  FusedConvAutotuneMap::GetInstance()->ClearMap();
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
}

}  // namespace tensorflow