/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// For Google-internal use only.
#include "tensorflow/core/util/autotune_maps/autotune_serialize.h"

#include <map>
#include <set>
#include <string>
#include <unordered_map>
#include <vector>

#include "tensorflow/compiler/xla/stream_executor/dnn.pb.h"
#include "tensorflow/core/platform/str_util.h"
#include "tensorflow/core/util/activation_mode.h"
#include "tensorflow/core/util/autotune_maps/autotune_map.pb.h"
#include "tensorflow/core/util/autotune_maps/autotune_maps_utils.h"
#include "tensorflow/core/util/autotune_maps/conv_autotune_maps.h"
#include "tensorflow/core/util/autotune_maps/conv_parameters.h"
#include "tensorflow/core/util/autotune_maps/conv_parameters.pb.h"
#include "tensorflow/stream_executor/dnn.h"

namespace tensorflow {

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
namespace {
using stream_executor::dnn::AlgorithmConfig;
using stream_executor::dnn::AlgorithmConfigProto;
using stream_executor::dnn::AlgorithmDesc;
using stream_executor::dnn::AlgorithmProto;

template <typename Op>
ConvMapProto ConvMapToProto(
    const AutotuneMap<ConvParameters, AutotuneEntry<Op>> &autotune_map) {
  ConvMapProto proto;

  // Sort the entries by the serialized string of their ConvParametersProto so
  // that serialization is deterministic. The actual order is meaningless.
  //
  // This step also filters out duplicate entries (ones that differ only in
  // device_id), so that there is only one entry per convolution operation for
  // a given GPU device type.
  std::map<string, ConvMapProto::Entry> sorted_map;

  for (auto const &p : autotune_map.GetMap()) {
    const ConvParameters &params = p.first;
    const ConvParametersProto &params_proto = params.proto();
    VLOG(1) << "Reading: " << params.ToString();

    ConvMapProto::Entry kv;
    *kv.mutable_key() = params_proto;

    if (p.second.is_algorithm_config()) {
      *kv.mutable_value() = p.second.GetAlgorithmConfig().ToProto();
    } else {
      const auto &runners = p.second.GetOpRunners();
      *kv.mutable_value()->mutable_algorithm() =
          runners.primary->ToAlgorithmDesc().ToProto();
      if (runners.no_scratch_fallback) {
        *kv.mutable_value()->mutable_algorithm_no_scratch() =
            runners.no_scratch_fallback->ToAlgorithmDesc().ToProto();
      }
    }

    sorted_map.insert(std::make_pair(
        autotune_maps_utils::SerializeProtoDeterministic(params_proto), kv));
  }

  for (auto const &p : sorted_map) {
    ConvMapProto::Entry *kv = proto.add_kv_pairs();
    *kv = p.second;
  }
  return proto;
}

template <typename Op>
Status PopulateConvMap(
    const ConvMapProto &m,
    AutotuneMap<ConvParameters, AutotuneEntry<Op>> *autotune_map) {
  if (m.kv_pairs().size() == 0) {
    return OkStatus();
  }
  std::set<std::string> unmatched_device_ids;
  // Maps each device_id to its corresponding device_identifier.
  std::vector<string> device_ids_map =
      autotune_maps_utils::GetDeviceIdToIdentifierMap();
  // Maps each device_identifier to the device_ids of the GPU devices that
  // share that identifier.
  std::unordered_map<string, std::vector<int>> device_identifiers_map;
  bool devices_matched = false;
  for (const ConvMapProto::Entry &kv : m.kv_pairs()) {
    const ConvParametersProto &params_proto = kv.key();
    // Abort the loading process whenever there is an entry whose version
    // number doesn't match the runtime version, because the autotune results
    // may be incorrect.
    if (params_proto.version() != ConvParameters::kVersion) {
      VLOG(1) << "ConvParametersProto with an incompatible version: "
              << params_proto.DebugString();
      return errors::Aborted(
          "Aborted because the loaded autotune results for convolution "
          "operations have a version different from the runtime's version. "
          "Expected version: ",
          ConvParameters::kVersion,
          ". Actual version: ", params_proto.version());
    }

    auto iter = device_identifiers_map.find(params_proto.device_identifier());
    std::vector<int> device_ids;
    if (iter == device_identifiers_map.end()) {
      for (int i = 0; i < device_ids_map.size(); i++) {
        if (device_ids_map[i] == params_proto.device_identifier()) {
          device_ids.push_back(i);
        }
      }
      device_identifiers_map.insert(
          std::make_pair(params_proto.device_identifier(), device_ids));
    } else {
      device_ids = iter->second;
    }

    if (device_ids.empty()) {
      unmatched_device_ids.insert(params_proto.device_identifier());
    } else {
      devices_matched = true;
    }

    const AlgorithmConfigProto &algorithm_config_proto = kv.value();
    const AlgorithmDesc primary(algorithm_config_proto.algorithm());
    const absl::optional<AlgorithmDesc> fallback =
        algorithm_config_proto.has_algorithm_no_scratch()
            ? absl::optional<AlgorithmDesc>(
                  AlgorithmDesc(algorithm_config_proto.algorithm_no_scratch()))
            : absl::nullopt;

    for (int device_id : device_ids) {
      AutotuneEntry<Op> entry;
#if TENSORFLOW_USE_ROCM
      // ROCm doesn't yet support the OpRunner-based API, so for the time being
      // we still need legacy AlgorithmDesc entries in the autotune map.
      // Long-term, this should be folded into the next case.
      entry = AutotuneEntry<Op>(AlgorithmConfig(algorithm_config_proto));
#else
      entry = AutotuneEntry<Op>(primary, fallback);
#endif

      autotune_map->Insert(ConvParameters(device_id, params_proto), entry);
    }
  }

  if (!unmatched_device_ids.empty()) {
    LOG(WARNING) << "Unmatched device ids from AoT autotuning data: "
                 << str_util::Join(unmatched_device_ids, ", ")
                 << "; existing devices: "
                 << str_util::Join(device_ids_map, ", ");
  }

  // When no matching devices are found, the autotune map is not populated at
  // all. Instead of silently returning an OK status, report an error back to
  // the caller.
  if (!devices_matched) {
    return errors::NotFound("No matching devices found for ",
                            str_util::Join(device_ids_map, ", "));
  }
  return OkStatus();
}

}  // namespace
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

Status SerializeAutotuneMaps(std::string *output) {
  AutotuneMapsProto proto;
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
  *proto.mutable_conv_map() = ConvMapToProto(*ConvAutotuneMap::GetInstance());
  *proto.mutable_fused_conv_map() =
      ConvMapToProto(*FusedConvAutotuneMap::GetInstance());
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
  *output = autotune_maps_utils::SerializeProtoDeterministic(proto);
  return OkStatus();
}
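
// Example (hypothetical caller sketch, not part of this file): persist the
// serialized autotune state so it can be reloaded ahead of time in a later
// run. WriteStringToFile/Env come from tensorflow/core/platform/env.h, which
// this file does not include; the file path is made up for illustration.
//
//   std::string serialized;
//   TF_RETURN_IF_ERROR(SerializeAutotuneMaps(&serialized));
//   TF_RETURN_IF_ERROR(WriteStringToFile(Env::Default(),
//                                        "/tmp/conv_autotune_maps.pb",
//                                        serialized));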

Status LoadSerializedAutotuneMaps(absl::string_view s) {
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
  AutotuneMapsProto proto;
  // The explicit string conversion here works around the fact that the OSS
  // proto library's ParseFromString only accepts std::string.
  if (!proto.ParseFromString(string(s))) {
    return errors::InvalidArgument(
        "Failed to parse the autotune maps from string.");
  }
  TF_RETURN_IF_ERROR(
      PopulateConvMap(proto.conv_map(), ConvAutotuneMap::GetInstance()));
  TF_RETURN_IF_ERROR(PopulateConvMap(proto.fused_conv_map(),
                                     FusedConvAutotuneMap::GetInstance()));
  // TODO(b/189530096): Populate autotune maps for more ops.
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
  return OkStatus();
}
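
// Example (hypothetical caller sketch, not part of this file): read a
// previously serialized snapshot back in and populate the autotune maps.
// ReadFileToString/Env come from tensorflow/core/platform/env.h, which this
// file does not include; the file path is made up for illustration.
//
//   std::string serialized;
//   TF_RETURN_IF_ERROR(ReadFileToString(Env::Default(),
//                                       "/tmp/conv_autotune_maps.pb",
//                                       &serialized));
//   TF_RETURN_IF_ERROR(LoadSerializedAutotuneMaps(serialized));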

void ResetAutotuneMaps() {
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
  ConvAutotuneMap::GetInstance()->ClearMap();
  FusedConvAutotuneMap::GetInstance()->ClearMap();
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
}
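
// Example (sketch): clear any previously autotuned or loaded results before
// loading a fresh snapshot, e.g. between test cases. `serialized` is assumed
// to hold data produced by SerializeAutotuneMaps.
//
//   ResetAutotuneMaps();
//   TF_RETURN_IF_ERROR(LoadSerializedAutotuneMaps(serialized));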

}  // namespace tensorflow