/* Copyright 2019 Google LLC. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "ruy/ctx.h"

#include <cstdlib>
#include <functional>
#include <string>

#include "ruy/check_macros.h"
#include "ruy/cpuinfo.h"
#include "ruy/ctx_impl.h"
#include "ruy/have_built_path_for.h"
#include "ruy/path.h"
#include "ruy/performance_advisory.h"
#include "ruy/platform.h"
#include "ruy/prepacked_cache.h"
#include "ruy/trace.h"

namespace ruy {
const CtxImpl& Ctx::impl() const { return static_cast<const CtxImpl&>(*this); }
CtxImpl* Ctx::mutable_impl() { return static_cast<CtxImpl*>(this); }

Path Ctx::last_used_path() const { return impl().last_used_path_; }
Tuning Ctx::explicit_tuning() const { return impl().explicit_tuning_; }
void Ctx::set_explicit_tuning(Tuning value) {
  mutable_impl()->explicit_tuning_ = value;
}
const ThreadPool& Ctx::thread_pool() const { return impl().thread_pool_; }
ThreadPool* Ctx::mutable_thread_pool() { return &mutable_impl()->thread_pool_; }
int Ctx::max_num_threads() const { return impl().max_num_threads_; }
void Ctx::set_max_num_threads(int value) {
  mutable_impl()->max_num_threads_ = value;
}
void Ctx::clear_performance_advisories() {
  mutable_impl()->performance_advisory_ = PerformanceAdvisory::kNone;
}
void Ctx::set_performance_advisory(PerformanceAdvisory advisory) {
  mutable_impl()->performance_advisory_ =
      mutable_impl()->performance_advisory_ | advisory;
}
bool Ctx::performance_advisory(PerformanceAdvisory advisory) const {
  return (impl().performance_advisory_ & advisory) !=
         PerformanceAdvisory::kNone;
}
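
// Illustrative sketch of how the advisory accessors above compose; the
// enumerator name is hypothetical, standing in for a real advisory bit:
//
//   ctx->set_performance_advisory(PerformanceAdvisory::kSomeAdvisory);
//   // Advisories accumulate as bits in one bitfield, so the query
//   //   ctx->performance_advisory(PerformanceAdvisory::kSomeAdvisory)
//   // now returns true, until clear_performance_advisories() resets it.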

void Ctx::SetRuntimeEnabledPaths(Path paths) {
  if (paths == Path::kNone) {
    // Revert to default behavior using runtime detection.
    mutable_impl()->runtime_enabled_paths_ = Path::kNone;
  } else {
    // Explicitly set enabled paths. Ensure that non-arch paths are always
    // enabled (they are needed for fallbacks).
    mutable_impl()->runtime_enabled_paths_ = paths | kNonArchPaths;
  }
}
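
// A minimal usage sketch (illustrative): restrict ruy to the portable C++
// path, e.g. when debugging a suspected SIMD kernel issue. kNonArchPaths is
// OR'ed back in above, so non-arch fallbacks remain available:
//
//   ctx->SetRuntimeEnabledPaths(Path::kStandardCpp);
//   // Passing Path::kNone instead reverts to runtime detection.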

CpuInfo* Ctx::mutable_cpuinfo() { return &mutable_impl()->cpuinfo_; }

namespace {

int GetHexIntEnvVarOrZero(const char* name) {
  const char* val = getenv(name);
  if (!val) {
    return 0;
  }
  return std::stoi(val, nullptr, 16);
}
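
// For example (illustrative): `RUY_PATHS=30` parses as hexadecimal 0x30,
// i.e. an integer with bits 4 and 5 set, which the caller below reinterprets
// as a Path bitfield. Note that std::stoi throws if the value is not a valid
// hex string.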

// For each Path bit set in `paths_to_detect`, performs runtime detection and
// sets the corresponding bit in the return value if and only if it is
// supported. Path bits that are not set in the input `paths_to_detect` value
// are left unset in the return value.
Path DetectRuntimeSupportedPaths(Path paths_to_detect, CpuInfo* cpuinfo) {
  // Paths in kNonArchPathsIncludingInternalVariants are always implicitly
  // supported. Further logic below may add more bits to `result`.
  Path result = kNonArchPathsIncludingInternalVariants;

  // Conditionally sets the `path` bit in `result`, if reported as supported
  // by the `is_supported` predicate.
  auto maybe_add = [&](Path path, std::function<bool(void)> is_supported) {
    if ((paths_to_detect & path) != Path::kNone) {
      if (is_supported()) {
        result = result | path;
      }
    }
  };

#if RUY_PLATFORM_ARM
  // NEON is unconditionally available on ARM64.
  // On ARM32 it's technically possible for NEON to be unavailable, but we've
  // always chosen to simply crash on such devices. We could revisit that, but
  // actually supporting non-NEON devices would also require dealing with
  // compiler-generated NEON code: we would have to remove -mfpu=neon from
  // ruy_copts, use that flag only in select NEON translation units, and
  // implement have_built_path_for_neon, similar to the x86 SIMD paths.
  maybe_add(Path::kNeon, []() { return true; });

  // NEON dotprod requires runtime detection. However, unlike the x86 SIMD
  // paths, it does not require have_built_path_for, because we currently
  // build it unconditionally. That is largely because we have had to
  // machine-encode the dotprod instructions, so we don't actually rely on
  // toolchain support for them.
  maybe_add(Path::kNeonDotprod, [=]() { return cpuinfo->NeonDotprod(); });
#elif RUY_PLATFORM_X86
  // x86 SIMD paths currently require both runtime detection, and detection of
  // whether we're building the path at all.
  maybe_add(Path::kAvx,
            [=]() { return HaveBuiltPathForAvx() && cpuinfo->Avx(); });
  maybe_add(Path::kAvx2Fma,
            [=]() { return HaveBuiltPathForAvx2Fma() && cpuinfo->Avx2Fma(); });
  maybe_add(Path::kAvx512,
            [=]() { return HaveBuiltPathForAvx512() && cpuinfo->Avx512(); });
#else
  (void)maybe_add;
  (void)cpuinfo;
#endif

  // Sanity checks.
  RUY_DCHECK_EQ(kNonArchPaths & ~result, Path::kNone);
  RUY_DCHECK_EQ(
      result & ~(kNonArchPathsIncludingInternalVariants | paths_to_detect),
      Path::kNone);
  return result;
}
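
// Illustrative example of the above (assuming an x86-64 CPU with AVX2+FMA
// but not AVX-512, and a build where all three x86 paths were compiled in):
//
//   Path p = DetectRuntimeSupportedPaths(kAllPaths, cpuinfo);
//   // p == kNonArchPathsIncludingInternalVariants | Path::kAvx |
//   //      Path::kAvx2Fma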

}  // namespace

Path Ctx::GetRuntimeEnabledPaths() {
  RUY_TRACE_SCOPE;
  // Just a shorthand alias. Using a pointer to make it clear we're mutating
  // this value in-place.
  Path* paths = &mutable_impl()->runtime_enabled_paths_;

  // The value Path::kNone indicates the initial state before detection has
  // been performed.
  if (*paths != Path::kNone) {
    RUY_TRACE_INFO(GET_RUNTIME_ENABLED_PATHS_USING_SET_VALUE);
    return *paths;
  }
  // The user may have set paths explicitly in the RUY_PATHS env var.
  Path paths_bitfield = static_cast<Path>(GetHexIntEnvVarOrZero("RUY_PATHS"));
  if (paths_bitfield != Path::kNone) {
    *paths = paths_bitfield;
    RUY_TRACE_INFO(GET_RUNTIME_ENABLED_PATHS_USING_ENV_VAR);
    return *paths;
  }
  // Finally, use runtime detection.
  *paths = DetectRuntimeSupportedPaths(kAllPaths, mutable_cpuinfo());
  RUY_TRACE_INFO(GET_RUNTIME_ENABLED_PATHS_USING_DETECTION);
  return *paths;
}
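
// Note on precedence in GetRuntimeEnabledPaths() above: a previously set or
// cached value wins, then the RUY_PATHS env var, then runtime detection.
// Also note that, unlike SetRuntimeEnabledPaths(), the env-var value is
// stored as-is, without OR-ing in kNonArchPaths.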

Path Ctx::SelectPath(Path compiled_paths) {
  return mutable_impl()->last_used_path_ =
             GetMostSignificantPath(compiled_paths & GetRuntimeEnabledPaths());
}
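
// Illustrative example: if a call site was compiled with
// compiled_paths == (Path::kStandardCpp | Path::kAvx | Path::kAvx2Fma), and
// runtime detection enabled all of them, then GetMostSignificantPath picks
// the highest-order set bit, so SelectPath returns Path::kAvx2Fma and
// records it as last_used_path_.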

void Ctx::EnsureThreadSpecificResources(int thread_count) {
  auto& resources = mutable_impl()->thread_specific_resources_;
  while (thread_count > static_cast<int>(resources.size())) {
    resources.emplace_back(new ThreadSpecificResource);
  }
  RUY_DCHECK_LE(thread_count, static_cast<int>(resources.size()));
}

TuningResolver* Ctx::GetThreadSpecificTuningResolver(int thread_index) const {
  const auto& resources = impl().thread_specific_resources_;
  RUY_DCHECK_LT(thread_index, static_cast<int>(resources.size()));
  return &resources[thread_index]->tuning_resolver;
}

Allocator* Ctx::GetThreadSpecificAllocator(int thread_index) const {
  const auto& resources = impl().thread_specific_resources_;
  RUY_DCHECK_LT(thread_index, static_cast<int>(resources.size()));
  return &resources[thread_index]->allocator;
}

Allocator* Ctx::GetMainAllocator() {
  if (!impl().main_allocator_) {
    mutable_impl()->main_allocator_.reset(new Allocator);
  }
  return impl().main_allocator_.get();
}

PrepackedCache* Ctx::GetPrepackedCache() {
  if (!impl().prepacked_cache_) {
    mutable_impl()->prepacked_cache_.reset(new PrepackedCache);
  }
  return impl().prepacked_cache_.get();
}

Tuning Ctx::GetMainThreadTuning() {
  EnsureThreadSpecificResources(1);
  TuningResolver* tuning_resolver = GetThreadSpecificTuningResolver(0);
  tuning_resolver->SetTuning(explicit_tuning());
  return tuning_resolver->Resolve(mutable_cpuinfo());
}

void Ctx::ClearPrepackedCache() { mutable_impl()->prepacked_cache_ = nullptr; }

}  // namespace ruy