• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* Copyright 2019 Google LLC. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 // Library doing minimal CPU detection to decide what to tune asm code for.
17 //
18 // # Tuning vs Path
19 //
20 // Tunings are merely local variations of optimized code paths, that are
21 // drop-in replacements for each other --- the input and output data layouts
22 // are identical.  By contrast, what ruy calls a Path dictates its own
23 // data layouts. For example, Path::kNeonDotprod will use different
24 // layouts compared to Path::kNeon; but within each, different tunings
25 // will share that same layout.
26 //
27 // # Tuning is for now only based on 1 bit: Generic / A55ish
28 //
29 // In practice, each of our asm code paths only needs one bit of information
30 // to decide on tuning: whether the CPU is out-of-order or in-order.
31 // That is because out-of-order CPUs are by definition relatively insensitive
32 // to small-scale asm details (which is what "tuning" is about); and for each
33 // asm code path, there tends to be one main in-order CPU architecture that
34 // we focus our tuning effort on. Examples:
35 //  * For Path::kNeon, the main in-order CPU is Cortex-A53/A55 (pre-dotprod)
36 //  * For Path::kNeonDotprod, the main in-order CPU is Cortex-A55r1 (dotprod)
37 //
38 // Because having tuned code paths is a compromise of efficiency gains
39 // versus implementation effort and code size, we are happy to stop at just this
40 // single bit of information, Generic / A55ish, at least in the current CPU
41 // landscape. This could change in the future.
42 #ifndef RUY_RUY_TUNE_H_
43 #define RUY_RUY_TUNE_H_
44 
45 #include "ruy/cpuinfo.h"
46 #include "ruy/opt_set.h"
47 #include "ruy/platform.h"
48 #include "ruy/time.h"
49 
50 namespace ruy {
51 
// Identifies which locally-tuned variant of an optimized code path to use.
enum class Tuning {
  // Request auto-detection. This is the default in the user-visible parts
  // (see Context) and is expected to be turned into one of the concrete
  // values below by TuningResolver before it is acted upon.
  kAuto,
  // Code that is not tuned for any particular CPU. It typically performs
  // well on out-of-order cores, which do not require as much tuning.
  kGeneric,
  // Code tuned for "Cortex-A55-ish" CPUs, meaning mostly: A53, A55r0
  // (pre-dotprod) and A55r1 (with dotprod). What these have in common is that
  // they are in-order cores with largely similar code tuning requirements,
  // the most important of which is to use only 64-bit loads so as to maximize
  // dual-issuing.
  //
  // A55r1 dual-issues 64-bit NEON loads, whereas A55r0 and A53 need non-NEON
  // ARM 64-bit loads combined with INS instructions to insert 64bit lanes into
  // NEON registers. Since A55r1 supports dotprod while A55r0 and A53 do not,
  // they do not share kernels in practice anyway, so separate Tuning values
  // to tell them apart were not needed.
  kA55ish,
};
74 
75 // Why a TuningResolver class?
76 //
77 // Ideally, this Library would offer a single function,
78 //   Tuning GetCurrentCPUTuning();
79 //
80 // However, determining information about the current CPU is not necessarily
81 // cheap, so we currently cache that and only invalidate/reevaluate after
82 // a fixed amount of time. This need to store state is why this library
83 // has to expose a class, TuningResolver, not just a function.
84 class TuningResolver {
85  public:
86   TuningResolver();
87 
88   // Allows the user to specify an explicit Tuning value, bypassing auto
89   // detection; or to specify Tuning::kAuto, reverting to auto detection.
SetTuning(Tuning tuning)90   void SetTuning(Tuning tuning) { unresolved_tuning_ = tuning; }
91 
92   // Get an actual tuning --- that is the function that this class wanted to be.
93   Tuning Resolve(CpuInfo* cpuinfo);
94 
95  private:
96   TuningResolver(const TuningResolver&) = delete;
97 
98   // Perform the tuning resolution now. That may typically use EvalRatio and
99   // ThresholdRatio, but an implementation may use a different approach instead.
100   Tuning ResolveNow(CpuInfo* cpuinfo);
101 
102   // The tuning as specified by the user, before actual resolution happens
103   // i.e. before querying any specifics of the current CPU.
104   // The default value kAuto means try to auto-detect. Other values mean
105   // bypass auto-detect, use explicit value instead. See SetTuning().
106   Tuning unresolved_tuning_ = Tuning::kAuto;
107   // Cached last resolved tuning.
108   Tuning last_resolved_tuning_ = Tuning::kAuto;
109   // Timepoint of cached last resolved tuning, for invalidation purposes.
110   TimePoint last_resolved_timepoint_;
111   // Cached last resolved tunings that are older than this age are invalid.
112   const Duration expiry_duration_;
113 };
114 
115 }  // namespace ruy
116 
117 #endif  // RUY_RUY_TUNE_H_
118