1 /* Copyright 2019 Google LLC. All Rights Reserved. 2 3 Licensed under the Apache License, Version 2.0 (the "License"); 4 you may not use this file except in compliance with the License. 5 You may obtain a copy of the License at 6 7 http://www.apache.org/licenses/LICENSE-2.0 8 9 Unless required by applicable law or agreed to in writing, software 10 distributed under the License is distributed on an "AS IS" BASIS, 11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 See the License for the specific language governing permissions and 13 limitations under the License. 14 ==============================================================================*/ 15 16 // Library doing minimal CPU detection to decide what to tune asm code for. 17 // 18 // # Tuning vs Path 19 // 20 // Tunings are merely local variations of optimized code paths, that are 21 // drop-in replacements for each other --- the input and output data layouts 22 // are identical. By contrast, what ruy calls a Path dictates its own 23 // data layouts. For example, Path::kNeonDotprod will use different 24 // layouts compared to Path::kNeon; but within each, different tunings 25 // will share that same layout. 26 // 27 // # Tuning is for now only based on 1 bit: Generic / A55ish 28 // 29 // In practice, each of our asm code paths only needs one bit information to 30 // decide on tuning: whether the CPU is out-of-order or in-order. 31 // That is because out-of-order CPUs are by definition relatively insensitive 32 // to small-scale asm details (which is what "tuning" is about); and for each 33 // asm code path, there tends to be one main in-order CPU architecture that 34 // we focus our tuning effort on. Examples: 35 // * For Path::kNeon, the main in-order CPU is Cortex-A53/A55 (pre-dotprod) 36 // * For Path::kNeonDotprod, the main in-order CPU is Cortex-A55r1 (dotprod) 37 // 38 // Because having tuned code paths is a compromise of efficiency gains 39 // versus implementation effort and code size, we are happy to stop at just this 40 // single bit of information, Generic / A55ish, at least in the current CPU 41 // landscape. This could change in the future. 42 #ifndef RUY_RUY_TUNE_H_ 43 #define RUY_RUY_TUNE_H_ 44 45 #include "ruy/cpuinfo.h" 46 #include "ruy/opt_set.h" 47 #include "ruy/platform.h" 48 #include "ruy/time.h" 49 50 namespace ruy { 51 52 enum class Tuning { 53 // kAuto means please use auto-detection. It's the default in the 54 // user-visible parts (see Context). It's meant to be resolved to an 55 // actual tuning at some point by means of TuningResolver. 56 kAuto, 57 // Use code not tuned for any particular CPU, typically performing well 58 // on out-of-order cores that don't require as much tuning. 59 kGeneric, 60 // Use code tuned for "Cortex-A55-ish" CPUs, by which we mean mostly: 61 // A53, A55r0 (pre-dotprod), A55r1 (with dotprod). These CPUs have in common 62 // that they are in-order CPU cores with largely similar requirements of code 63 // tuning. The most important such requirement is to use only 64-bit loads 64 // to maximize dual-issuing. 65 // 66 // A55r1 differs from A55r0 and A53 in that it dual-issues 64-bit NEON loads 67 // whereas A55r0 and A53 require using non-NEON ARM 64-bit loads together with 68 // INS instructions to insert 64bit lanes into NEON registers. However, since 69 // A55r1 supports dotprod unlike A55r0 and A53, they are not using the same 70 // kernels in practice anyway, so there was no need to distinguish them with 71 // separate Tuning values. 72 kA55ish, 73 // Use code tuned for Cortex-X1 CPUs. Currently, the driver to distinguish 74 // this CPU is the get maximum performance on the dotprod kernels, where we 75 // attain high performance simply by avoiding any manual loop unrolling. As a 76 // purely performance oriented microarchitecture, there will likely be 77 // additional reasons to distinguish the X1 from other CPUs. 78 kX1 79 }; 80 81 // Why a TuningResolver class? 82 // 83 // Ideally, this Library would offer a single function, 84 // Tuning GetCurrentCPUTuning(); 85 // 86 // However, determining information about the current CPU is not necessarily 87 // cheap, so we currently cache that and only invalidate/reevaluate after 88 // a fixed amount of time. This need to store state is why this library 89 // has to expose a class, TuningResolver, not just a function. 90 class TuningResolver { 91 public: 92 TuningResolver(); 93 94 // Allows the user to specify an explicit Tuning value, bypassing auto 95 // detection; or to specify Tuning::kAuto, reverting to auto detection. SetTuning(Tuning tuning)96 void SetTuning(Tuning tuning) { unresolved_tuning_ = tuning; } 97 98 // Get an actual tuning --- that is the function that this class wanted to be. 99 Tuning Resolve(CpuInfo* cpuinfo); 100 101 private: 102 TuningResolver(const TuningResolver&) = delete; 103 104 // Perform the tuning resolution now. That may typically use EvalRatio and 105 // ThresholdRatio, but an implementation may use a different approach instead. 106 Tuning ResolveNow(CpuInfo* cpuinfo); 107 108 // The tuning as specified by the user, before actual resolution happens 109 // i.e. before querying any specifics of the current CPU. 110 // The default value kAuto means try to auto-detect. Other values mean 111 // bypass auto-detect, use explicit value instead. See SetTuning(). 112 Tuning unresolved_tuning_ = Tuning::kAuto; 113 // Cached last resolved tuning. 114 Tuning last_resolved_tuning_ = Tuning::kAuto; 115 // Timepoint of cached last resolved tuning, for invalidation purposes. 116 TimePoint last_resolved_timepoint_; 117 // Cached last resolved tunings that are older than this age are invalid. 118 const Duration expiry_duration_; 119 }; 120 121 } // namespace ruy 122 123 #endif // RUY_RUY_TUNE_H_ 124