1 /*
2 * Copyright 2025 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <functional>
18 #include <random>
19 #include <vector>
20
21 #include <benchmark/benchmark.h>
22
23 /*
24 Pixel 6 Pro Android 14
25 ------------------------------------------------------------------------------
26 Benchmark Time CPU Iterations
27 ------------------------------------------------------------------------------
28 BM_VectorTestLoopFloat/1 1216 ns 1212 ns 580560
29 BM_VectorTestLoopFloat/2 2272 ns 2264 ns 309745
30 BM_VectorTestLoopFloat/3 3366 ns 3354 ns 209391
31 BM_VectorTestLoopFloat/4 4495 ns 4478 ns 157291
32 BM_VectorTestLoopFloat/5 5660 ns 5627 ns 124649
33 BM_VectorTestLoopFloat/6 6776 ns 6750 ns 104102
34 BM_VectorTestLoopFloat/7 7942 ns 7916 ns 89257
35 BM_VectorTestLoopFloat/8 9120 ns 9086 ns 77234
36 BM_VectorTestLoopFloat/9 10252 ns 10212 ns 69253
37 BM_VectorTestLoopFloat/10 11475 ns 11432 ns 61646
38 BM_VectorTestLoopFloat/11 12704 ns 12658 ns 55493
39 BM_VectorTestLoopFloat/12 13864 ns 13812 ns 50944
40 BM_VectorTestLoopFloat/13 15024 ns 14967 ns 47169
41 BM_VectorTestLoopFloat/14 16340 ns 16282 ns 43531
42 BM_VectorTestLoopFloat/15 17422 ns 17356 ns 40328
43 BM_VectorTestLoopFloat/16 18680 ns 18609 ns 37820
44 BM_VectorTestLoopFloat/17 19892 ns 19819 ns 35348
45 BM_VectorTestLoopFloat/18 21099 ns 21015 ns 33253
46 BM_VectorTestLoopFloat/19 22238 ns 22154 ns 31681
47 BM_VectorTestLoopFloat/20 23551 ns 23433 ns 29829
48 BM_VectorTestLoopFloat/21 24707 ns 24612 ns 28525
49 BM_VectorTestLoopFloat/22 26041 ns 25916 ns 27004
50 BM_VectorTestLoopFloat/23 27236 ns 27122 ns 25123
51 BM_VectorTestLoopFloat/24 28535 ns 28409 ns 24505
52 BM_VectorTestLoopFloat/25 29715 ns 29542 ns 23744
53 BM_VectorTestLoopFloat/26 31163 ns 31002 ns 22640
54 BM_VectorTestLoopFloat/27 32259 ns 32065 ns 21859
55 BM_VectorTestLoopFloat/28 33580 ns 33391 ns 20702
56 BM_VectorTestLoopFloat/29 34891 ns 34699 ns 20281
57 BM_VectorTestLoopFloat/30 36242 ns 36007 ns 19400
58 BM_VectorTestLoopFloat/31 37423 ns 37154 ns 18875
59 BM_VectorTestLoopFloat/32 38858 ns 38608 ns 17699
60 BM_VectorTestConstArraySizeFloat/1 185 ns 184 ns 3771794
61 BM_VectorTestConstArraySizeFloat/2 663 ns 660 ns 1068518
62 BM_VectorTestConstArraySizeFloat/3 2159 ns 2152 ns 318170
63 BM_VectorTestConstArraySizeFloat/4 3919 ns 3905 ns 179267
64 BM_VectorTestConstArraySizeFloat/5 1861 ns 1854 ns 374407
65 BM_VectorTestConstArraySizeFloat/6 1964 ns 1956 ns 362563
66 BM_VectorTestConstArraySizeFloat/7 2789 ns 2779 ns 252684
67 BM_VectorTestConstArraySizeFloat/8 2070 ns 2062 ns 342189
68 BM_VectorTestConstArraySizeFloat/9 3191 ns 3179 ns 220216
69 BM_VectorTestConstArraySizeFloat/10 3128 ns 3117 ns 225340
70 BM_VectorTestConstArraySizeFloat/11 4049 ns 4025 ns 174288
71 BM_VectorTestConstArraySizeFloat/12 3124 ns 3106 ns 225711
72 BM_VectorTestConstArraySizeFloat/13 4440 ns 4424 ns 158540
73 BM_VectorTestConstArraySizeFloat/14 4276 ns 4256 ns 164144
74 BM_VectorTestConstArraySizeFloat/15 5325 ns 5306 ns 132282
75 BM_VectorTestConstArraySizeFloat/16 4091 ns 4072 ns 172111
76 BM_VectorTestConstArraySizeFloat/17 5711 ns 5682 ns 122226
77 BM_VectorTestConstArraySizeFloat/18 5373 ns 5349 ns 129827
78 BM_VectorTestConstArraySizeFloat/19 6500 ns 6474 ns 108150
79 BM_VectorTestConstArraySizeFloat/20 5131 ns 5109 ns 136649
80 BM_VectorTestConstArraySizeFloat/21 6896 ns 6867 ns 99598
81 BM_VectorTestConstArraySizeFloat/22 6579 ns 6529 ns 108221
82 BM_VectorTestConstArraySizeFloat/23 7752 ns 7705 ns 91673
83 BM_VectorTestConstArraySizeFloat/24 6129 ns 6102 ns 114269
84 BM_VectorTestConstArraySizeFloat/25 8151 ns 8120 ns 85643
85 BM_VectorTestConstArraySizeFloat/26 7512 ns 7474 ns 94708
86 BM_VectorTestConstArraySizeFloat/27 9100 ns 9047 ns 79200
87 BM_VectorTestConstArraySizeFloat/28 7191 ns 7149 ns 97121
88 BM_VectorTestConstArraySizeFloat/29 9417 ns 9362 ns 74720
89 BM_VectorTestConstArraySizeFloat/30 8952 ns 8893 ns 80378
90 BM_VectorTestConstArraySizeFloat/31 10342 ns 10284 ns 66481
91 BM_VectorTestConstArraySizeFloat/32 8189 ns 8132 ns 85186
92 BM_VectorTestForcedIntrinsics/1 189 ns 189 ns 3629410
93 BM_VectorTestForcedIntrinsics/2 1192 ns 1188 ns 572025
94 BM_VectorTestForcedIntrinsics/3 1701 ns 1695 ns 412319
95 BM_VectorTestForcedIntrinsics/4 1234 ns 1229 ns 563105
96 BM_VectorTestForcedIntrinsics/5 1936 ns 1929 ns 367124
97 BM_VectorTestForcedIntrinsics/6 2002 ns 1994 ns 350985
98 BM_VectorTestForcedIntrinsics/7 2826 ns 2814 ns 247821
99 BM_VectorTestForcedIntrinsics/8 2106 ns 2098 ns 332577
100 BM_VectorTestForcedIntrinsics/9 3240 ns 3229 ns 216567
101 BM_VectorTestForcedIntrinsics/10 3176 ns 3164 ns 219614
102 BM_VectorTestForcedIntrinsics/11 4086 ns 4065 ns 173103
103 BM_VectorTestForcedIntrinsics/12 3095 ns 3083 ns 226427
104 BM_VectorTestForcedIntrinsics/13 4459 ns 4441 ns 157019
105 BM_VectorTestForcedIntrinsics/14 4298 ns 4281 ns 162819
106 BM_VectorTestForcedIntrinsics/15 5232 ns 5211 ns 130653
107 BM_VectorTestForcedIntrinsics/16 4166 ns 4150 ns 168336
108 BM_VectorTestForcedIntrinsics/17 5713 ns 5687 ns 122828
109 BM_VectorTestForcedIntrinsics/18 5424 ns 5403 ns 131831
110 BM_VectorTestForcedIntrinsics/19 6517 ns 6487 ns 107246
111 BM_VectorTestForcedIntrinsics/20 5208 ns 5179 ns 135608
112 BM_VectorTestForcedIntrinsics/21 6927 ns 6882 ns 101059
113 BM_VectorTestForcedIntrinsics/22 6593 ns 6542 ns 108036
114 BM_VectorTestForcedIntrinsics/23 7789 ns 7745 ns 90793
115 BM_VectorTestForcedIntrinsics/24 6241 ns 6200 ns 113967
116 BM_VectorTestForcedIntrinsics/25 8178 ns 8130 ns 84883
117 BM_VectorTestForcedIntrinsics/26 7768 ns 7724 ns 91931
118 BM_VectorTestForcedIntrinsics/27 9017 ns 8954 ns 78657
119 BM_VectorTestForcedIntrinsics/28 7250 ns 7206 ns 98287
120 BM_VectorTestForcedIntrinsics/29 9419 ns 9365 ns 74588
121 BM_VectorTestForcedIntrinsics/30 8943 ns 8885 ns 77512
122 BM_VectorTestForcedIntrinsics/31 10217 ns 10159 ns 69207
123 BM_VectorTestForcedIntrinsics/32 8271 ns 8221 ns 86206
124
125 Pixel 6 Pro (1/29/2025)
126 ------------------------------------------------------------------------------
127 Benchmark Time CPU Iterations
128 ------------------------------------------------------------------------------
129 BM_VectorTestLoopFloat/1 1522 ns 1514 ns 459906
130 BM_VectorTestLoopFloat/2 2391 ns 2383 ns 293707
131 BM_VectorTestLoopFloat/3 3437 ns 3426 ns 205663
132 BM_VectorTestLoopFloat/4 4482 ns 4468 ns 157406
133 BM_VectorTestLoopFloat/5 5665 ns 5645 ns 125564
134 BM_VectorTestLoopFloat/6 6784 ns 6762 ns 105112
135 BM_VectorTestLoopFloat/7 7930 ns 7902 ns 89104
136 BM_VectorTestLoopFloat/8 9043 ns 9011 ns 77654
137 BM_VectorTestLoopFloat/9 10178 ns 10145 ns 68967
138 BM_VectorTestLoopFloat/10 11338 ns 11296 ns 61958
139 BM_VectorTestLoopFloat/11 12500 ns 12456 ns 56104
140 BM_VectorTestLoopFloat/12 13686 ns 13634 ns 51361
141 BM_VectorTestLoopFloat/13 14794 ns 14744 ns 47477
142 BM_VectorTestLoopFloat/14 16040 ns 15979 ns 43158
143 BM_VectorTestLoopFloat/15 17098 ns 17036 ns 40926
144 BM_VectorTestLoopFloat/16 18413 ns 18343 ns 37962
145 BM_VectorTestLoopFloat/17 19462 ns 19382 ns 36093
146 BM_VectorTestLoopFloat/18 20788 ns 20704 ns 33897
147 BM_VectorTestLoopFloat/19 22168 ns 21967 ns 31994
148 BM_VectorTestLoopFloat/20 23420 ns 23322 ns 30136
149 BM_VectorTestLoopFloat/21 24424 ns 24316 ns 28773
150 BM_VectorTestLoopFloat/22 25789 ns 25686 ns 27195
151 BM_VectorTestLoopFloat/23 26980 ns 26870 ns 25939
152 BM_VectorTestLoopFloat/24 28349 ns 28238 ns 24906
153 BM_VectorTestLoopFloat/25 29486 ns 29355 ns 23815
154 BM_VectorTestLoopFloat/26 30686 ns 30554 ns 22853
155 BM_VectorTestLoopFloat/27 31781 ns 31630 ns 22034
156 BM_VectorTestLoopFloat/28 33161 ns 33008 ns 21133
157 BM_VectorTestLoopFloat/29 34482 ns 34329 ns 20290
158 BM_VectorTestLoopFloat/30 35676 ns 35531 ns 19434
159 BM_VectorTestLoopFloat/31 37037 ns 36835 ns 19033
160 BM_VectorTestLoopFloat/32 38379 ns 38178 ns 18409
161 BM_VectorTestConstArraySizeFloat/1 1138 ns 1134 ns 605601
162 BM_VectorTestConstArraySizeFloat/2 1551 ns 1546 ns 451139
163 BM_VectorTestConstArraySizeFloat/3 2157 ns 2149 ns 326085
164 BM_VectorTestConstArraySizeFloat/4 3082 ns 3070 ns 228235
165 BM_VectorTestConstArraySizeFloat/5 3694 ns 3668 ns 191253
166 BM_VectorTestConstArraySizeFloat/6 4708 ns 4691 ns 149290
167 BM_VectorTestConstArraySizeFloat/7 5255 ns 5236 ns 133227
168 BM_VectorTestConstArraySizeFloat/8 6239 ns 6217 ns 115033
169 BM_VectorTestConstArraySizeFloat/9 7087 ns 7058 ns 99388
170 BM_VectorTestConstArraySizeFloat/10 7640 ns 7613 ns 91195
171 BM_VectorTestConstArraySizeFloat/11 8471 ns 8438 ns 83724
172 BM_VectorTestConstArraySizeFloat/12 9132 ns 9101 ns 77836
173 BM_VectorTestConstArraySizeFloat/13 9963 ns 9928 ns 71043
174 BM_VectorTestConstArraySizeFloat/14 10601 ns 10565 ns 67362
175 BM_VectorTestConstArraySizeFloat/15 11428 ns 11384 ns 61646
176 BM_VectorTestConstArraySizeFloat/16 12061 ns 12017 ns 58708
177 BM_VectorTestConstArraySizeFloat/17 13094 ns 13043 ns 53478
178 BM_VectorTestConstArraySizeFloat/18 13624 ns 13553 ns 52138
179 BM_VectorTestConstArraySizeFloat/19 15633 ns 15541 ns 45464
180 BM_VectorTestConstArraySizeFloat/20 17379 ns 17299 ns 40665
181 BM_VectorTestConstArraySizeFloat/21 20772 ns 20675 ns 34104
182 BM_VectorTestConstArraySizeFloat/22 23613 ns 23485 ns 29856
183 BM_VectorTestConstArraySizeFloat/23 24967 ns 24800 ns 28081
184 BM_VectorTestConstArraySizeFloat/24 27395 ns 27278 ns 25481
185 BM_VectorTestConstArraySizeFloat/25 28858 ns 28701 ns 24520
186 BM_VectorTestConstArraySizeFloat/26 29251 ns 29068 ns 24195
187 BM_VectorTestConstArraySizeFloat/27 31487 ns 31293 ns 22507
188 BM_VectorTestConstArraySizeFloat/28 33355 ns 33137 ns 20929
189 BM_VectorTestConstArraySizeFloat/29 34385 ns 34229 ns 20417
190 BM_VectorTestConstArraySizeFloat/30 36031 ns 35811 ns 19543
191 BM_VectorTestConstArraySizeFloat/31 37079 ns 36905 ns 19051
192 BM_VectorTestConstArraySizeFloat/32 36857 ns 36715 ns 19077
193 BM_VectorTestForcedIntrinsics/1 1163 ns 1159 ns 598027
194 BM_VectorTestForcedIntrinsics/2 1175 ns 1170 ns 599275
195 BM_VectorTestForcedIntrinsics/3 1680 ns 1673 ns 419149
196 BM_VectorTestForcedIntrinsics/4 1210 ns 1205 ns 581791
197 BM_VectorTestForcedIntrinsics/5 1874 ns 1867 ns 374320
198 BM_VectorTestForcedIntrinsics/6 1954 ns 1946 ns 364700
199 BM_VectorTestForcedIntrinsics/7 2763 ns 2753 ns 253086
200 BM_VectorTestForcedIntrinsics/8 2057 ns 2049 ns 347318
201 BM_VectorTestForcedIntrinsics/9 3186 ns 3175 ns 218684
202 BM_VectorTestForcedIntrinsics/10 3112 ns 3101 ns 225780
203 BM_VectorTestForcedIntrinsics/11 4044 ns 4023 ns 175125
204 BM_VectorTestForcedIntrinsics/12 3088 ns 3077 ns 229106
205 BM_VectorTestForcedIntrinsics/13 4405 ns 4388 ns 159480
206 BM_VectorTestForcedIntrinsics/14 4248 ns 4232 ns 164753
207 BM_VectorTestForcedIntrinsics/15 5018 ns 4983 ns 140497
208 BM_VectorTestForcedIntrinsics/16 4131 ns 4095 ns 172113
209 BM_VectorTestForcedIntrinsics/17 5714 ns 5679 ns 123282
210 BM_VectorTestForcedIntrinsics/18 5387 ns 5358 ns 132204
211 BM_VectorTestForcedIntrinsics/19 6515 ns 6481 ns 110209
212 BM_VectorTestForcedIntrinsics/20 5108 ns 5081 ns 100000
213 BM_VectorTestForcedIntrinsics/21 6913 ns 6876 ns 101935
214 BM_VectorTestForcedIntrinsics/22 6564 ns 6517 ns 108434
215 BM_VectorTestForcedIntrinsics/23 7763 ns 7718 ns 92602
216 BM_VectorTestForcedIntrinsics/24 6184 ns 6132 ns 115958
217 BM_VectorTestForcedIntrinsics/25 8152 ns 8099 ns 87568
218 BM_VectorTestForcedIntrinsics/26 7720 ns 7674 ns 93561
219 BM_VectorTestForcedIntrinsics/27 8977 ns 8919 ns 78819
220 BM_VectorTestForcedIntrinsics/28 7206 ns 7153 ns 99046
221 BM_VectorTestForcedIntrinsics/29 9373 ns 9310 ns 74948
222 BM_VectorTestForcedIntrinsics/30 8888 ns 8830 ns 79500
223 BM_VectorTestForcedIntrinsics/31 10233 ns 10163 ns 70094
224 BM_VectorTestForcedIntrinsics/32 8209 ns 8139 ns 84943
225
226 */
227
228 // A small subset of code from audio_utils/intrinsic_utils.h
229
230 // We conditionally include neon optimizations for ARM devices
231 #pragma push_macro("USE_NEON")
232 #undef USE_NEON
233
234 #if defined(__ARM_NEON__) || defined(__aarch64__)
235 #include <arm_neon.h>
236 #define USE_NEON
237 #endif
238
// Always-false constant whose value nominally depends on a template parameter.
// Typically used as static_assert(dependent_false_v<T>) in the final branch of an
// if-constexpr chain; it is not referenced elsewhere in this file.
template <typename>
inline constexpr bool dependent_false_v{false};
241
// Fixed-size array wrapped in a struct so that it can be passed and returned by value
// and unpacked with a single structured binding by the vector template functions below.
// The sole data member `v` must satisfy std::is_array_v<>.
template<typename T, size_t N>
struct internal_array_t {
    // Number of elements held in v.
    static constexpr size_t size() { return N; }

    T v[N];
};
249
250 #ifdef USE_NEON
251
252 template<int N>
253 struct vfloat_struct {};
254
255 template<int N>
256 using vfloat_t = typename vfloat_struct<N>::t; // typnemae required for Android 14 and earlier.
257
258 template<typename F, int N>
259 using vector_hw_t = std::conditional_t<
260 std::is_same_v<F, float>, vfloat_t<N>, internal_array_t<F, N>>;
261
262 // Recursively define the NEON types required for a given vector size.
263 // intrinsic_utils.h allows structurally recursive type definitions based on
264 // pairs of types (much like Lisp list cons pairs).
265 template<>
266 struct vfloat_struct<1> { using t = float; };
267 template<>
268 struct vfloat_struct<2> { using t = float32x2_t; };
269 template<>
270 struct vfloat_struct<3> { using t = struct { struct __attribute__((packed)) {
271 vfloat_t<2> a; vfloat_t<1> b; } s; }; };
272 template<>
273 struct vfloat_struct<4> { using t = float32x4_t; };
274 template<>
275 struct vfloat_struct<5> { using t = struct { struct __attribute__((packed)) {
276 vfloat_t<4> a; vfloat_t<1> b; } s; }; };
277 template<>
278 struct vfloat_struct<6> { using t = struct { struct __attribute__((packed)) {
279 vfloat_t<4> a; vfloat_t<2> b; } s; }; };
280 template<>
281 struct vfloat_struct<7> { using t = struct { struct __attribute__((packed)) {
282 vfloat_t<4> a; vfloat_t<3> b; } s; }; };
283 template<>
284 struct vfloat_struct<8> { using t = float32x4x2_t; };
285 template<>
286 struct vfloat_struct<9> { using t = struct { struct __attribute__((packed)) {
287 vfloat_t<8> a; vfloat_t<1> b; } s; }; };
288 template<>
289 struct vfloat_struct<10> { using t = struct { struct __attribute__((packed)) {
290 vfloat_t<8> a; vfloat_t<2> b; } s; }; };
291 template<>
292 struct vfloat_struct<11> { using t = struct { struct __attribute__((packed)) {
293 vfloat_t<8> a; vfloat_t<3> b; } s; }; };
294 template<>
295 struct vfloat_struct<12> { using t = struct { struct __attribute__((packed)) {
296 vfloat_t<8> a; vfloat_t<4> b; } s; }; };
297 template<>
298 struct vfloat_struct<13> { using t = struct { struct __attribute__((packed)) {
299 vfloat_t<8> a; vfloat_t<5> b; } s; }; };
300 template<>
301 struct vfloat_struct<14> { using t = struct { struct __attribute__((packed)) {
302 vfloat_t<8> a; vfloat_t<6> b; } s; }; };
303 template<>
304 struct vfloat_struct<15> { using t = struct { struct __attribute__((packed)) {
305 vfloat_t<8> a; vfloat_t<7> b; } s; }; };
306 template<>
307 struct vfloat_struct<16> { using t = float32x4x4_t; };
308 template<>
309 struct vfloat_struct<17> { using t = struct { struct __attribute__((packed)) {
310 vfloat_t<16> a; vfloat_t<1> b; } s; }; };
311 template<>
312 struct vfloat_struct<18> { using t = struct { struct __attribute__((packed)) {
313 vfloat_t<16> a; vfloat_t<2> b; } s; }; };
314 template<>
315 struct vfloat_struct<19> { using t = struct { struct __attribute__((packed)) {
316 vfloat_t<16> a; vfloat_t<3> b; } s; }; };
317 template<>
318 struct vfloat_struct<20> { using t = struct { struct __attribute__((packed)) {
319 vfloat_t<16> a; vfloat_t<4> b; } s; }; };
320 template<>
321 struct vfloat_struct<21> { using t = struct { struct __attribute__((packed)) {
322 vfloat_t<16> a; vfloat_t<5> b; } s; }; };
323 template<>
324 struct vfloat_struct<22> { using t = struct { struct __attribute__((packed)) {
325 vfloat_t<16> a; vfloat_t<6> b; } s; }; };
326 template<>
327 struct vfloat_struct<23> { using t = struct { struct __attribute__((packed)) {
328 vfloat_t<16> a; vfloat_t<7> b; } s; }; };
329 template<>
330 struct vfloat_struct<24> { using t = struct { struct __attribute__((packed)) {
331 vfloat_t<16> a; vfloat_t<8> b; } s; }; };
332 template<>
333 struct vfloat_struct<25> { using t = struct { struct __attribute__((packed)) {
334 vfloat_t<16> a; vfloat_t<9> b; } s; }; };
335 template<>
336 struct vfloat_struct<26> { using t = struct { struct __attribute__((packed)) {
337 vfloat_t<16> a; vfloat_t<10> b; } s; }; };
338 template<>
339 struct vfloat_struct<27> { using t = struct { struct __attribute__((packed)) {
340 vfloat_t<16> a; vfloat_t<11> b; } s; }; };
341 template<>
342 struct vfloat_struct<28> { using t = struct { struct __attribute__((packed)) {
343 vfloat_t<16> a; vfloat_t<12> b; } s; }; };
344 template<>
345 struct vfloat_struct<29> { using t = struct { struct __attribute__((packed)) {
346 vfloat_t<16> a; vfloat_t<13> b; } s; }; };
347 template<>
348 struct vfloat_struct<30> { using t = struct { struct __attribute__((packed)) {
349 vfloat_t<16> a; vfloat_t<14> b; } s; }; };
350 template<>
351 struct vfloat_struct<31> { using t = struct { struct __attribute__((packed)) {
352 vfloat_t<16> a; vfloat_t<15> b; } s; }; };
353 template<>
354 struct vfloat_struct<32> { using t = struct { struct __attribute__((packed)) {
355 vfloat_t<16> a; vfloat_t<16> b; } s; }; };
356
357 #else
358
359 // use loop vectorization if no HW type exists.
360 template<typename F, int N>
361 using vector_hw_t = internal_array_t<F, N>;
362
363 #endif
364
// Element-wise product of a and b.
//
// T may be:
//  - float or double: multiplied directly;
//  - a native NEON vector type (only when USE_NEON is defined): multiplied with the
//    matching intrinsic;
//  - any single-member struct whose member is either an array (each element is
//    multiplied recursively) or a two-member struct (each half is multiplied
//    recursively).
template<typename T>
static inline T vmul(T a, T b) {
    if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) {
        return a * b;

#ifdef USE_NEON
    } else if constexpr (std::is_same_v<T, float32x2_t>) {
        return vmul_f32(a, b);
    } else if constexpr (std::is_same_v<T, float32x4_t>) {
        return vmulq_f32(a, b);
#if defined(__aarch64__)
    } else if constexpr (std::is_same_v<T, float64x2_t>) {
        return vmulq_f64(a, b);
#endif
#endif // USE_NEON

    } else /* constexpr */ {
        T product;
        // Unwrap the single member of each operand and of the result.
        auto &[dst] = product;
        const auto &[lhs] = a;
        const auto &[rhs] = b;
        if constexpr (!std::is_array_v<decltype(dst)>) {
            // The member is a pair: multiply the two halves independently.
            auto &[dstFirst, dstSecond] = dst;
            const auto &[lhsFirst, lhsSecond] = lhs;
            const auto &[rhsFirst, rhsSecond] = rhs;
            dstFirst = vmul(lhsFirst, rhsFirst);
            dstSecond = vmul(lhsSecond, rhsSecond);
        } else /* constexpr */ {
            // The member is an array: multiply element by element.
#pragma unroll
            for (size_t idx = 0; idx < std::size(lhs); ++idx) {
                dst[idx] = vmul(lhs[idx], rhs[idx]);
            }
        }
        return product;
    }
}
402
403 #pragma pop_macro("USE_NEON")
404
405 // end intrinsics subset
406
// Number of frames in each benchmark buffer; every buffer holds
// kDataSize * channelCount samples.
static constexpr size_t kDataSize{2048};
408
TestArgs(benchmark::internal::Benchmark * b)409 static void TestArgs(benchmark::internal::Benchmark* b) {
410 constexpr int kChannelCountMin = 1;
411 constexpr int kChannelCountMax = 32;
412 for (int i = kChannelCountMin; i <= kChannelCountMax; ++i) {
413 b->Args({i});
414 }
415 }
416
// Macro test operator
//
// Multiplies one N-wide vector: `in1` and `in2` are reinterpreted as V<F, N>, their
// vmul() product is stored through `out` (also reinterpreted as V<F, N>), and all
// three pointers are then advanced by N elements.
//
// The expansion is several statements and refers to `out`, `in1`, `in2`, `V` and `F`
// by name, so it must be used inside a braced block where those names are in scope.

#define OPERATOR(N) \
    *reinterpret_cast<V<F, N>*>(out) = vmul( \
    *reinterpret_cast<const V<F, N>*>(in1), \
    *reinterpret_cast<const V<F, N>*>(in2)); \
    out += N; \
    in1 += N; \
    in2 += N;
426
// Macro to instantiate switch case statements.
//
// Expands to `case N:` followed by an assignment to `mFunc` of a capture-less lambda
// that applies OPERATOR(N) once per frame for `count` frames, and then `break;`.
// The static_assert rejects any V<F, N> whose size is not exactly N elements of F,
// since the lambda steps through the buffers N elements at a time.
// Must be used inside a switch in a scope where `mFunc`, `V` and `F` are visible.

#define INSTANTIATE(N) \
    case N: \
    mFunc = [](F* out, const F* in1, const F* in2, size_t count) { \
        static_assert(sizeof(V<F, N>) == N * sizeof(F)); \
        for (size_t i = 0; i < count; ++i) { \
            OPERATOR(N); \
        } \
    }; \
    break;
438
439 template <typename Traits>
440 class Processor {
441 public:
442 // shorthand aliases
443 using F = typename Traits::data_t;
444 template <typename T, int N>
445 using V = typename Traits::template container_t<T, N>;
446
Processor(int channelCount)447 Processor(int channelCount)
448 : mChannelCount(channelCount) {
449
450 if constexpr (Traits::loop_) {
451 mFunc = [channelCount](F* out, const F* in1, const F* in2, size_t count) {
452 for (size_t i = 0; i < count; ++i) {
453 for (size_t j = 0; j < channelCount; ++j) {
454 OPERATOR(1);
455 }
456 }
457 };
458 return;
459 }
460 switch (channelCount) {
461 INSTANTIATE(1);
462 INSTANTIATE(2);
463 INSTANTIATE(3);
464 INSTANTIATE(4);
465 INSTANTIATE(5);
466 INSTANTIATE(6);
467 INSTANTIATE(7);
468 INSTANTIATE(8);
469 INSTANTIATE(9);
470 INSTANTIATE(10);
471 INSTANTIATE(11);
472 INSTANTIATE(12);
473 INSTANTIATE(13);
474 INSTANTIATE(14);
475 INSTANTIATE(15);
476 INSTANTIATE(16);
477 INSTANTIATE(17);
478 INSTANTIATE(18);
479 INSTANTIATE(19);
480 INSTANTIATE(20);
481 INSTANTIATE(21);
482 INSTANTIATE(22);
483 INSTANTIATE(23);
484 INSTANTIATE(24);
485 INSTANTIATE(25);
486 INSTANTIATE(26);
487 INSTANTIATE(27);
488 INSTANTIATE(28);
489 INSTANTIATE(29);
490 INSTANTIATE(30);
491 INSTANTIATE(31);
492 INSTANTIATE(32);
493 }
494 }
495
process(F * out,const F * in1,const F * in2,size_t frames)496 void process(F* out, const F* in1, const F* in2, size_t frames) {
497 mFunc(out, in1, in2, frames);
498 }
499
500 const size_t mChannelCount;
501 /* const */ std::function<void(F*, const F*, const F*, size_t)> mFunc;
502 };
503
504 template <typename Traits>
BM_VectorTest(benchmark::State & state)505 static void BM_VectorTest(benchmark::State& state) {
506 using F = typename Traits::data_t;
507 const size_t channelCount = state.range(0);
508
509 std::vector<F> input1(kDataSize * channelCount);
510 std::vector<F> input2(kDataSize * channelCount);
511 std::vector<F> output(kDataSize * channelCount);
512
513 // Initialize input buffer and coefs with deterministic pseudo-random values
514 std::minstd_rand gen(42);
515 const F amplitude = 1.;
516 std::uniform_real_distribution<> dis(-amplitude, amplitude);
517 for (auto& in : input1) {
518 in = dis(gen);
519 }
520 for (auto& in : input2) {
521 in = dis(gen);
522 }
523
524 Processor<Traits> processor(channelCount);
525
526 // Run the test
527 while (state.KeepRunning()) {
528 benchmark::DoNotOptimize(input1.data());
529 benchmark::DoNotOptimize(input2.data());
530 benchmark::DoNotOptimize(output.data());
531 processor.process(output.data(), input1.data(), input2.data(), kDataSize);
532 benchmark::ClobberMemory();
533 }
534 state.SetComplexityN(channelCount);
535 }
536
537 // Clang has an issue with -frelaxed-template-template-args where
538 // it may not follow the C++17 guidelines. Use a traits struct to
539 // pass in parameters.
540
541 // Test using two loops.
542 struct LoopFloatTraits {
543 template <typename F, int N>
544 using container_t = internal_array_t<F, N>;
545 using data_t = float;
546 static constexpr bool loop_ = true;
547 };
BM_VectorTestLoopFloat(benchmark::State & state)548 static void BM_VectorTestLoopFloat(benchmark::State& state) {
549 BM_VectorTest<LoopFloatTraits>(state);
550 }
551
552 // Test using two loops, the inner loop is constexpr size.
553 struct ConstArraySizeFloatTraits {
554 template <typename F, int N>
555 using container_t = internal_array_t<F, N>;
556 using data_t = float;
557 static constexpr bool loop_ = false;
558 };
BM_VectorTestConstArraySizeFloat(benchmark::State & state)559 static void BM_VectorTestConstArraySizeFloat(benchmark::State& state) {
560 BM_VectorTest<ConstArraySizeFloatTraits>(state);
561 }
562
563 // Test using intrinsics, if available.
564 struct ForcedIntrinsicsTraits {
565 template <typename F, int N>
566 using container_t = vector_hw_t<F, N>;
567 using data_t = float;
568 static constexpr bool loop_ = false;
569 };
BM_VectorTestForcedIntrinsics(benchmark::State & state)570 static void BM_VectorTestForcedIntrinsics(benchmark::State& state) {
571 BM_VectorTest<ForcedIntrinsicsTraits>(state);
572 }
573
// Register each variant; TestArgs supplies the channel counts to run.
BENCHMARK(BM_VectorTestLoopFloat)->Apply(TestArgs);

BENCHMARK(BM_VectorTestConstArraySizeFloat)->Apply(TestArgs);

BENCHMARK(BM_VectorTestForcedIntrinsics)->Apply(TestArgs);

// Google Benchmark macro that defines main() for this executable.
BENCHMARK_MAIN();
581