/* Copyright 2019 Google LLC. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// Library doing minimal CPU detection to decide what to tune asm code for.
//
// # Tuning vs Path
//
// Tunings are merely local variations of optimized code paths, that are
// drop-in replacements for each other --- the input and output data layouts
// are identical.  By contrast, what ruy calls a Path dictates its own
// data layouts. For example, Path::kNeonDotprod will use different
// layouts compared to Path::kNeon; but within each, different tunings
// will share that same layout.
//
// # Tuning is for now only based on 1 bit: Generic / A55ish
//
// In practice, each of our asm code paths only needs one bit of information
// to decide on tuning: whether the CPU is out-of-order or in-order.
// That is because out-of-order CPUs are by definition relatively insensitive
// to small-scale asm details (which is what "tuning" is about); and for each
// asm code path, there tends to be one main in-order CPU architecture that
// we focus our tuning effort on.
Examples: 35 // * For Path::kNeon, the main in-order CPU is Cortex-A53/A55 (pre-dotprod) 36 // * For Path::kNeonDotprod, the main in-order CPU is Cortex-A55r1 (dotprod) 37 // 38 // Because having tuned code paths is a compromise of efficiency gains 39 // versus implementation effort and code size, we are happy to stop at just this 40 // single bit of information, Generic / A55ish, at least in the current CPU 41 // landscape. This could change in the future. 42 #ifndef RUY_RUY_TUNE_H_ 43 #define RUY_RUY_TUNE_H_ 44 45 #include "ruy/cpuinfo.h" 46 #include "ruy/opt_set.h" 47 #include "ruy/platform.h" 48 #include "ruy/time.h" 49 50 namespace ruy { 51 52 enum class Tuning { 53 // kAuto means please use auto-detection. It's the default in the 54 // user-visible parts (see Context). It's meant to be resolved to an 55 // actual tuning at some point by means of TuningResolver. 56 kAuto, 57 // Use code not tuned for any particular CPU, typically performing well 58 // on out-of-order cores that don't require as much tuning. 59 kGeneric, 60 // Use code tuned for "Cortex-A55-ish" CPUs, by which we mean mostly: 61 // A53, A55r0 (pre-dotprod), A55r1 (with dotprod). These CPUs have in common 62 // that they are in-order CPU cores with largely similar requirements of code 63 // tuning. The most important such requirement is to use only 64-bit loads 64 // to maximize dual-issuing. 65 // 66 // A55r1 differs from A55r0 and A53 in that it dual-issues 64-bit NEON loads 67 // whereas A55r0 and A53 require using non-NEON ARM 64-bit loads together with 68 // INS instructions to insert 64bit lanes into NEON registers. However, since 69 // A55r1 supports dotprod unlike A55r0 and A53, they are not using the same 70 // kernels in practice anyway, so there was no need to distinguish them with 71 // separate Tuning values. 72 kA55ish 73 }; 74 75 // Why a TuningResolver class? 
76 // 77 // Ideally, this Library would offer a single function, 78 // Tuning GetCurrentCPUTuning(); 79 // 80 // However, determining information about the current CPU is not necessarily 81 // cheap, so we currently cache that and only invalidate/reevaluate after 82 // a fixed amount of time. This need to store state is why this library 83 // has to expose a class, TuningResolver, not just a function. 84 class TuningResolver { 85 public: 86 TuningResolver(); 87 88 // Allows the user to specify an explicit Tuning value, bypassing auto 89 // detection; or to specify Tuning::kAuto, reverting to auto detection. SetTuning(Tuning tuning)90 void SetTuning(Tuning tuning) { unresolved_tuning_ = tuning; } 91 92 // Get an actual tuning --- that is the function that this class wanted to be. 93 Tuning Resolve(CpuInfo* cpuinfo); 94 95 private: 96 TuningResolver(const TuningResolver&) = delete; 97 98 // Perform the tuning resolution now. That may typically use EvalRatio and 99 // ThresholdRatio, but an implementation may use a different approach instead. 100 Tuning ResolveNow(CpuInfo* cpuinfo); 101 102 // The tuning as specified by the user, before actual resolution happens 103 // i.e. before querying any specifics of the current CPU. 104 // The default value kAuto means try to auto-detect. Other values mean 105 // bypass auto-detect, use explicit value instead. See SetTuning(). 106 Tuning unresolved_tuning_ = Tuning::kAuto; 107 // Cached last resolved tuning. 108 Tuning last_resolved_tuning_ = Tuning::kAuto; 109 // Timepoint of cached last resolved tuning, for invalidation purposes. 110 TimePoint last_resolved_timepoint_; 111 // Cached last resolved tunings that are older than this age are invalid. 112 const Duration expiry_duration_; 113 }; 114 115 } // namespace ruy 116 117 #endif // RUY_RUY_TUNE_H_ 118