• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright 2025 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include <functional>
18 #include <random>
19 #include <vector>
20 
21 #include <benchmark/benchmark.h>
22 
23 /*
24 Pixel 6 Pro Android 14
25 ------------------------------------------------------------------------------
26 Benchmark                                    Time             CPU   Iterations
27 ------------------------------------------------------------------------------
28 BM_VectorTestLoopFloat/1                  1216 ns         1212 ns       580560
29 BM_VectorTestLoopFloat/2                  2272 ns         2264 ns       309745
30 BM_VectorTestLoopFloat/3                  3366 ns         3354 ns       209391
31 BM_VectorTestLoopFloat/4                  4495 ns         4478 ns       157291
32 BM_VectorTestLoopFloat/5                  5660 ns         5627 ns       124649
33 BM_VectorTestLoopFloat/6                  6776 ns         6750 ns       104102
34 BM_VectorTestLoopFloat/7                  7942 ns         7916 ns        89257
35 BM_VectorTestLoopFloat/8                  9120 ns         9086 ns        77234
36 BM_VectorTestLoopFloat/9                 10252 ns        10212 ns        69253
37 BM_VectorTestLoopFloat/10                11475 ns        11432 ns        61646
38 BM_VectorTestLoopFloat/11                12704 ns        12658 ns        55493
39 BM_VectorTestLoopFloat/12                13864 ns        13812 ns        50944
40 BM_VectorTestLoopFloat/13                15024 ns        14967 ns        47169
41 BM_VectorTestLoopFloat/14                16340 ns        16282 ns        43531
42 BM_VectorTestLoopFloat/15                17422 ns        17356 ns        40328
43 BM_VectorTestLoopFloat/16                18680 ns        18609 ns        37820
44 BM_VectorTestLoopFloat/17                19892 ns        19819 ns        35348
45 BM_VectorTestLoopFloat/18                21099 ns        21015 ns        33253
46 BM_VectorTestLoopFloat/19                22238 ns        22154 ns        31681
47 BM_VectorTestLoopFloat/20                23551 ns        23433 ns        29829
48 BM_VectorTestLoopFloat/21                24707 ns        24612 ns        28525
49 BM_VectorTestLoopFloat/22                26041 ns        25916 ns        27004
50 BM_VectorTestLoopFloat/23                27236 ns        27122 ns        25123
51 BM_VectorTestLoopFloat/24                28535 ns        28409 ns        24505
52 BM_VectorTestLoopFloat/25                29715 ns        29542 ns        23744
53 BM_VectorTestLoopFloat/26                31163 ns        31002 ns        22640
54 BM_VectorTestLoopFloat/27                32259 ns        32065 ns        21859
55 BM_VectorTestLoopFloat/28                33580 ns        33391 ns        20702
56 BM_VectorTestLoopFloat/29                34891 ns        34699 ns        20281
57 BM_VectorTestLoopFloat/30                36242 ns        36007 ns        19400
58 BM_VectorTestLoopFloat/31                37423 ns        37154 ns        18875
59 BM_VectorTestLoopFloat/32                38858 ns        38608 ns        17699
60 BM_VectorTestConstArraySizeFloat/1         185 ns          184 ns      3771794
61 BM_VectorTestConstArraySizeFloat/2         663 ns          660 ns      1068518
62 BM_VectorTestConstArraySizeFloat/3        2159 ns         2152 ns       318170
63 BM_VectorTestConstArraySizeFloat/4        3919 ns         3905 ns       179267
64 BM_VectorTestConstArraySizeFloat/5        1861 ns         1854 ns       374407
65 BM_VectorTestConstArraySizeFloat/6        1964 ns         1956 ns       362563
66 BM_VectorTestConstArraySizeFloat/7        2789 ns         2779 ns       252684
67 BM_VectorTestConstArraySizeFloat/8        2070 ns         2062 ns       342189
68 BM_VectorTestConstArraySizeFloat/9        3191 ns         3179 ns       220216
69 BM_VectorTestConstArraySizeFloat/10       3128 ns         3117 ns       225340
70 BM_VectorTestConstArraySizeFloat/11       4049 ns         4025 ns       174288
71 BM_VectorTestConstArraySizeFloat/12       3124 ns         3106 ns       225711
72 BM_VectorTestConstArraySizeFloat/13       4440 ns         4424 ns       158540
73 BM_VectorTestConstArraySizeFloat/14       4276 ns         4256 ns       164144
74 BM_VectorTestConstArraySizeFloat/15       5325 ns         5306 ns       132282
75 BM_VectorTestConstArraySizeFloat/16       4091 ns         4072 ns       172111
76 BM_VectorTestConstArraySizeFloat/17       5711 ns         5682 ns       122226
77 BM_VectorTestConstArraySizeFloat/18       5373 ns         5349 ns       129827
78 BM_VectorTestConstArraySizeFloat/19       6500 ns         6474 ns       108150
79 BM_VectorTestConstArraySizeFloat/20       5131 ns         5109 ns       136649
80 BM_VectorTestConstArraySizeFloat/21       6896 ns         6867 ns        99598
81 BM_VectorTestConstArraySizeFloat/22       6579 ns         6529 ns       108221
82 BM_VectorTestConstArraySizeFloat/23       7752 ns         7705 ns        91673
83 BM_VectorTestConstArraySizeFloat/24       6129 ns         6102 ns       114269
84 BM_VectorTestConstArraySizeFloat/25       8151 ns         8120 ns        85643
85 BM_VectorTestConstArraySizeFloat/26       7512 ns         7474 ns        94708
86 BM_VectorTestConstArraySizeFloat/27       9100 ns         9047 ns        79200
87 BM_VectorTestConstArraySizeFloat/28       7191 ns         7149 ns        97121
88 BM_VectorTestConstArraySizeFloat/29       9417 ns         9362 ns        74720
89 BM_VectorTestConstArraySizeFloat/30       8952 ns         8893 ns        80378
90 BM_VectorTestConstArraySizeFloat/31      10342 ns        10284 ns        66481
91 BM_VectorTestConstArraySizeFloat/32       8189 ns         8132 ns        85186
92 BM_VectorTestForcedIntrinsics/1            189 ns          189 ns      3629410
93 BM_VectorTestForcedIntrinsics/2           1192 ns         1188 ns       572025
94 BM_VectorTestForcedIntrinsics/3           1701 ns         1695 ns       412319
95 BM_VectorTestForcedIntrinsics/4           1234 ns         1229 ns       563105
96 BM_VectorTestForcedIntrinsics/5           1936 ns         1929 ns       367124
97 BM_VectorTestForcedIntrinsics/6           2002 ns         1994 ns       350985
98 BM_VectorTestForcedIntrinsics/7           2826 ns         2814 ns       247821
99 BM_VectorTestForcedIntrinsics/8           2106 ns         2098 ns       332577
100 BM_VectorTestForcedIntrinsics/9           3240 ns         3229 ns       216567
101 BM_VectorTestForcedIntrinsics/10          3176 ns         3164 ns       219614
102 BM_VectorTestForcedIntrinsics/11          4086 ns         4065 ns       173103
103 BM_VectorTestForcedIntrinsics/12          3095 ns         3083 ns       226427
104 BM_VectorTestForcedIntrinsics/13          4459 ns         4441 ns       157019
105 BM_VectorTestForcedIntrinsics/14          4298 ns         4281 ns       162819
106 BM_VectorTestForcedIntrinsics/15          5232 ns         5211 ns       130653
107 BM_VectorTestForcedIntrinsics/16          4166 ns         4150 ns       168336
108 BM_VectorTestForcedIntrinsics/17          5713 ns         5687 ns       122828
109 BM_VectorTestForcedIntrinsics/18          5424 ns         5403 ns       131831
110 BM_VectorTestForcedIntrinsics/19          6517 ns         6487 ns       107246
111 BM_VectorTestForcedIntrinsics/20          5208 ns         5179 ns       135608
112 BM_VectorTestForcedIntrinsics/21          6927 ns         6882 ns       101059
113 BM_VectorTestForcedIntrinsics/22          6593 ns         6542 ns       108036
114 BM_VectorTestForcedIntrinsics/23          7789 ns         7745 ns        90793
115 BM_VectorTestForcedIntrinsics/24          6241 ns         6200 ns       113967
116 BM_VectorTestForcedIntrinsics/25          8178 ns         8130 ns        84883
117 BM_VectorTestForcedIntrinsics/26          7768 ns         7724 ns        91931
118 BM_VectorTestForcedIntrinsics/27          9017 ns         8954 ns        78657
119 BM_VectorTestForcedIntrinsics/28          7250 ns         7206 ns        98287
120 BM_VectorTestForcedIntrinsics/29          9419 ns         9365 ns        74588
121 BM_VectorTestForcedIntrinsics/30          8943 ns         8885 ns        77512
122 BM_VectorTestForcedIntrinsics/31         10217 ns        10159 ns        69207
123 BM_VectorTestForcedIntrinsics/32          8271 ns         8221 ns        86206
124 
125 Pixel 6 Pro (1/29/2025)
126 ------------------------------------------------------------------------------
127 Benchmark                                    Time             CPU   Iterations
128 ------------------------------------------------------------------------------
129 BM_VectorTestLoopFloat/1                  1522 ns         1514 ns       459906
130 BM_VectorTestLoopFloat/2                  2391 ns         2383 ns       293707
131 BM_VectorTestLoopFloat/3                  3437 ns         3426 ns       205663
132 BM_VectorTestLoopFloat/4                  4482 ns         4468 ns       157406
133 BM_VectorTestLoopFloat/5                  5665 ns         5645 ns       125564
134 BM_VectorTestLoopFloat/6                  6784 ns         6762 ns       105112
135 BM_VectorTestLoopFloat/7                  7930 ns         7902 ns        89104
136 BM_VectorTestLoopFloat/8                  9043 ns         9011 ns        77654
137 BM_VectorTestLoopFloat/9                 10178 ns        10145 ns        68967
138 BM_VectorTestLoopFloat/10                11338 ns        11296 ns        61958
139 BM_VectorTestLoopFloat/11                12500 ns        12456 ns        56104
140 BM_VectorTestLoopFloat/12                13686 ns        13634 ns        51361
141 BM_VectorTestLoopFloat/13                14794 ns        14744 ns        47477
142 BM_VectorTestLoopFloat/14                16040 ns        15979 ns        43158
143 BM_VectorTestLoopFloat/15                17098 ns        17036 ns        40926
144 BM_VectorTestLoopFloat/16                18413 ns        18343 ns        37962
145 BM_VectorTestLoopFloat/17                19462 ns        19382 ns        36093
146 BM_VectorTestLoopFloat/18                20788 ns        20704 ns        33897
147 BM_VectorTestLoopFloat/19                22168 ns        21967 ns        31994
148 BM_VectorTestLoopFloat/20                23420 ns        23322 ns        30136
149 BM_VectorTestLoopFloat/21                24424 ns        24316 ns        28773
150 BM_VectorTestLoopFloat/22                25789 ns        25686 ns        27195
151 BM_VectorTestLoopFloat/23                26980 ns        26870 ns        25939
152 BM_VectorTestLoopFloat/24                28349 ns        28238 ns        24906
153 BM_VectorTestLoopFloat/25                29486 ns        29355 ns        23815
154 BM_VectorTestLoopFloat/26                30686 ns        30554 ns        22853
155 BM_VectorTestLoopFloat/27                31781 ns        31630 ns        22034
156 BM_VectorTestLoopFloat/28                33161 ns        33008 ns        21133
157 BM_VectorTestLoopFloat/29                34482 ns        34329 ns        20290
158 BM_VectorTestLoopFloat/30                35676 ns        35531 ns        19434
159 BM_VectorTestLoopFloat/31                37037 ns        36835 ns        19033
160 BM_VectorTestLoopFloat/32                38379 ns        38178 ns        18409
161 BM_VectorTestConstArraySizeFloat/1        1138 ns         1134 ns       605601
162 BM_VectorTestConstArraySizeFloat/2        1551 ns         1546 ns       451139
163 BM_VectorTestConstArraySizeFloat/3        2157 ns         2149 ns       326085
164 BM_VectorTestConstArraySizeFloat/4        3082 ns         3070 ns       228235
165 BM_VectorTestConstArraySizeFloat/5        3694 ns         3668 ns       191253
166 BM_VectorTestConstArraySizeFloat/6        4708 ns         4691 ns       149290
167 BM_VectorTestConstArraySizeFloat/7        5255 ns         5236 ns       133227
168 BM_VectorTestConstArraySizeFloat/8        6239 ns         6217 ns       115033
169 BM_VectorTestConstArraySizeFloat/9        7087 ns         7058 ns        99388
170 BM_VectorTestConstArraySizeFloat/10       7640 ns         7613 ns        91195
171 BM_VectorTestConstArraySizeFloat/11       8471 ns         8438 ns        83724
172 BM_VectorTestConstArraySizeFloat/12       9132 ns         9101 ns        77836
173 BM_VectorTestConstArraySizeFloat/13       9963 ns         9928 ns        71043
174 BM_VectorTestConstArraySizeFloat/14      10601 ns        10565 ns        67362
175 BM_VectorTestConstArraySizeFloat/15      11428 ns        11384 ns        61646
176 BM_VectorTestConstArraySizeFloat/16      12061 ns        12017 ns        58708
177 BM_VectorTestConstArraySizeFloat/17      13094 ns        13043 ns        53478
178 BM_VectorTestConstArraySizeFloat/18      13624 ns        13553 ns        52138
179 BM_VectorTestConstArraySizeFloat/19      15633 ns        15541 ns        45464
180 BM_VectorTestConstArraySizeFloat/20      17379 ns        17299 ns        40665
181 BM_VectorTestConstArraySizeFloat/21      20772 ns        20675 ns        34104
182 BM_VectorTestConstArraySizeFloat/22      23613 ns        23485 ns        29856
183 BM_VectorTestConstArraySizeFloat/23      24967 ns        24800 ns        28081
184 BM_VectorTestConstArraySizeFloat/24      27395 ns        27278 ns        25481
185 BM_VectorTestConstArraySizeFloat/25      28858 ns        28701 ns        24520
186 BM_VectorTestConstArraySizeFloat/26      29251 ns        29068 ns        24195
187 BM_VectorTestConstArraySizeFloat/27      31487 ns        31293 ns        22507
188 BM_VectorTestConstArraySizeFloat/28      33355 ns        33137 ns        20929
189 BM_VectorTestConstArraySizeFloat/29      34385 ns        34229 ns        20417
190 BM_VectorTestConstArraySizeFloat/30      36031 ns        35811 ns        19543
191 BM_VectorTestConstArraySizeFloat/31      37079 ns        36905 ns        19051
192 BM_VectorTestConstArraySizeFloat/32      36857 ns        36715 ns        19077
193 BM_VectorTestForcedIntrinsics/1           1163 ns         1159 ns       598027
194 BM_VectorTestForcedIntrinsics/2           1175 ns         1170 ns       599275
195 BM_VectorTestForcedIntrinsics/3           1680 ns         1673 ns       419149
196 BM_VectorTestForcedIntrinsics/4           1210 ns         1205 ns       581791
197 BM_VectorTestForcedIntrinsics/5           1874 ns         1867 ns       374320
198 BM_VectorTestForcedIntrinsics/6           1954 ns         1946 ns       364700
199 BM_VectorTestForcedIntrinsics/7           2763 ns         2753 ns       253086
200 BM_VectorTestForcedIntrinsics/8           2057 ns         2049 ns       347318
201 BM_VectorTestForcedIntrinsics/9           3186 ns         3175 ns       218684
202 BM_VectorTestForcedIntrinsics/10          3112 ns         3101 ns       225780
203 BM_VectorTestForcedIntrinsics/11          4044 ns         4023 ns       175125
204 BM_VectorTestForcedIntrinsics/12          3088 ns         3077 ns       229106
205 BM_VectorTestForcedIntrinsics/13          4405 ns         4388 ns       159480
206 BM_VectorTestForcedIntrinsics/14          4248 ns         4232 ns       164753
207 BM_VectorTestForcedIntrinsics/15          5018 ns         4983 ns       140497
208 BM_VectorTestForcedIntrinsics/16          4131 ns         4095 ns       172113
209 BM_VectorTestForcedIntrinsics/17          5714 ns         5679 ns       123282
210 BM_VectorTestForcedIntrinsics/18          5387 ns         5358 ns       132204
211 BM_VectorTestForcedIntrinsics/19          6515 ns         6481 ns       110209
212 BM_VectorTestForcedIntrinsics/20          5108 ns         5081 ns       100000
213 BM_VectorTestForcedIntrinsics/21          6913 ns         6876 ns       101935
214 BM_VectorTestForcedIntrinsics/22          6564 ns         6517 ns       108434
215 BM_VectorTestForcedIntrinsics/23          7763 ns         7718 ns        92602
216 BM_VectorTestForcedIntrinsics/24          6184 ns         6132 ns       115958
217 BM_VectorTestForcedIntrinsics/25          8152 ns         8099 ns        87568
218 BM_VectorTestForcedIntrinsics/26          7720 ns         7674 ns        93561
219 BM_VectorTestForcedIntrinsics/27          8977 ns         8919 ns        78819
220 BM_VectorTestForcedIntrinsics/28          7206 ns         7153 ns        99046
221 BM_VectorTestForcedIntrinsics/29          9373 ns         9310 ns        74948
222 BM_VectorTestForcedIntrinsics/30          8888 ns         8830 ns        79500
223 BM_VectorTestForcedIntrinsics/31         10233 ns        10163 ns        70094
224 BM_VectorTestForcedIntrinsics/32          8209 ns         8139 ns        84943
225 
226 */
227 
228 // A small subset of code from audio_utils/intrinsic_utils.h
229 
230 // We conditionally include neon optimizations for ARM devices
231 #pragma push_macro("USE_NEON")
232 #undef USE_NEON
233 
234 #if defined(__ARM_NEON__) || defined(__aarch64__)
235 #include <arm_neon.h>
236 #define USE_NEON
237 #endif
238 
// Compile-time `false` whose value formally depends on a template argument.
// NOTE(review): not referenced anywhere else in this file; presumably carried
// over from intrinsic_utils.h for use in static_assert() inside the final
// branch of an `if constexpr` chain -- confirm before removing.
template <typename>
inline constexpr bool dependent_false_v{false};
241 
// Fixed-size array of N elements of type T wrapped in a struct so that it can
// be passed, returned and assigned by value in the vector template functions
// below.  The struct's only non-static data member is the raw array `v`; that
// embedded array type is what must satisfy std::is_array_v<>.
template <typename T, size_t N>
struct internal_array_t {
    // Number of elements, usable in constant expressions.
    static constexpr size_t size() { return N; }

    T v[N];
};
249 
250 #ifdef USE_NEON
251 
252 template<int N>
253 struct vfloat_struct {};
254 
255 template<int N>
256 using vfloat_t = typename vfloat_struct<N>::t;  // typnemae required for Android 14 and earlier.
257 
258 template<typename F, int N>
259 using vector_hw_t = std::conditional_t<
260         std::is_same_v<F, float>, vfloat_t<N>, internal_array_t<F, N>>;
261 
262 // Recursively define the NEON types required for a given vector size.
263 // intrinsic_utils.h allows structurally recursive type definitions based on
264 // pairs of types (much like Lisp list cons pairs).
265 template<>
266 struct vfloat_struct<1> { using t = float; };
267 template<>
268 struct vfloat_struct<2> { using t = float32x2_t; };
269 template<>
270 struct vfloat_struct<3> { using t = struct { struct __attribute__((packed)) {
271     vfloat_t<2> a; vfloat_t<1> b; } s; }; };
272 template<>
273 struct vfloat_struct<4> { using t = float32x4_t; };
274 template<>
275 struct vfloat_struct<5> { using t = struct { struct __attribute__((packed)) {
276     vfloat_t<4> a; vfloat_t<1> b; } s; }; };
277 template<>
278 struct vfloat_struct<6> { using t = struct { struct __attribute__((packed)) {
279     vfloat_t<4> a; vfloat_t<2> b; } s; }; };
280 template<>
281 struct vfloat_struct<7> { using t = struct { struct __attribute__((packed)) {
282     vfloat_t<4> a; vfloat_t<3> b; } s; }; };
283 template<>
284 struct vfloat_struct<8> { using t = float32x4x2_t; };
285 template<>
286 struct vfloat_struct<9> { using t = struct { struct __attribute__((packed)) {
287     vfloat_t<8> a; vfloat_t<1> b; } s; }; };
288 template<>
289 struct vfloat_struct<10> { using t = struct { struct __attribute__((packed)) {
290     vfloat_t<8> a; vfloat_t<2> b; } s; }; };
291 template<>
292 struct vfloat_struct<11> { using t = struct { struct __attribute__((packed)) {
293     vfloat_t<8> a; vfloat_t<3> b; } s; }; };
294 template<>
295 struct vfloat_struct<12> { using t = struct { struct __attribute__((packed)) {
296     vfloat_t<8> a; vfloat_t<4> b; } s; }; };
297 template<>
298 struct vfloat_struct<13> { using t = struct { struct __attribute__((packed)) {
299     vfloat_t<8> a; vfloat_t<5> b; } s; }; };
300 template<>
301 struct vfloat_struct<14> { using t = struct { struct __attribute__((packed)) {
302     vfloat_t<8> a; vfloat_t<6> b; } s; }; };
303 template<>
304 struct vfloat_struct<15> { using t = struct { struct __attribute__((packed)) {
305     vfloat_t<8> a; vfloat_t<7> b; } s; }; };
306 template<>
307 struct vfloat_struct<16> { using t = float32x4x4_t; };
308 template<>
309 struct vfloat_struct<17> { using t = struct { struct __attribute__((packed)) {
310     vfloat_t<16> a; vfloat_t<1> b; } s; }; };
311 template<>
312 struct vfloat_struct<18> { using t = struct { struct __attribute__((packed)) {
313     vfloat_t<16> a; vfloat_t<2> b; } s; }; };
314 template<>
315 struct vfloat_struct<19> { using t = struct { struct __attribute__((packed)) {
316     vfloat_t<16> a; vfloat_t<3> b; } s; }; };
317 template<>
318 struct vfloat_struct<20> { using t = struct { struct __attribute__((packed)) {
319     vfloat_t<16> a; vfloat_t<4> b; } s; }; };
320 template<>
321 struct vfloat_struct<21> { using t = struct { struct __attribute__((packed)) {
322     vfloat_t<16> a; vfloat_t<5> b; } s; }; };
323 template<>
324 struct vfloat_struct<22> { using t = struct { struct __attribute__((packed)) {
325     vfloat_t<16> a; vfloat_t<6> b; } s; }; };
326 template<>
327 struct vfloat_struct<23> { using t = struct { struct __attribute__((packed)) {
328     vfloat_t<16> a; vfloat_t<7> b; } s; }; };
329 template<>
330 struct vfloat_struct<24> { using t = struct { struct __attribute__((packed)) {
331     vfloat_t<16> a; vfloat_t<8> b; } s; }; };
332 template<>
333 struct vfloat_struct<25> { using t = struct { struct __attribute__((packed)) {
334     vfloat_t<16> a; vfloat_t<9> b; } s; }; };
335 template<>
336 struct vfloat_struct<26> { using t = struct { struct __attribute__((packed)) {
337     vfloat_t<16> a; vfloat_t<10> b; } s; }; };
338 template<>
339 struct vfloat_struct<27> { using t = struct { struct __attribute__((packed)) {
340     vfloat_t<16> a; vfloat_t<11> b; } s; }; };
341 template<>
342 struct vfloat_struct<28> { using t = struct { struct __attribute__((packed)) {
343     vfloat_t<16> a; vfloat_t<12> b; } s; }; };
344 template<>
345 struct vfloat_struct<29> { using t = struct { struct __attribute__((packed)) {
346     vfloat_t<16> a; vfloat_t<13> b; } s; }; };
347 template<>
348 struct vfloat_struct<30> { using t = struct { struct __attribute__((packed)) {
349     vfloat_t<16> a; vfloat_t<14> b; } s; }; };
350 template<>
351 struct vfloat_struct<31> { using t = struct { struct __attribute__((packed)) {
352     vfloat_t<16> a; vfloat_t<15> b; } s; }; };
353 template<>
354 struct vfloat_struct<32> { using t = struct { struct __attribute__((packed)) {
355     vfloat_t<16> a; vfloat_t<16> b; } s; }; };
356 
357 #else
358 
359 // use loop vectorization if no HW type exists.
360 template<typename F, int N>
361 using vector_hw_t = internal_array_t<F, N>;
362 
363 #endif
364 
// Element-wise multiply of two values of the same type T, returned by value.
//
// T may be:
//  - float or double: multiplied with operator*;
//  - when USE_NEON is defined: float32x2_t or float32x4_t (and float64x2_t on
//    aarch64), multiplied with the matching NEON intrinsic;
//  - anything else: must be a struct with exactly one member.  If that member
//    is an array, its elements are multiplied one by one (recursively through
//    vmul); otherwise the member must decompose into exactly two sub-members,
//    each of which is multiplied recursively.
template<typename T>
static inline T vmul(T a, T b) {
    if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) {
        return a * b;

#ifdef USE_NEON
    } else if constexpr (std::is_same_v<T, float32x2_t>) {
        return vmul_f32(a, b);
    } else if constexpr (std::is_same_v<T, float32x4_t>) {
        return vmulq_f32(a, b);
#if defined(__aarch64__)
    } else if constexpr (std::is_same_v<T, float64x2_t>) {
        return vmulq_f64(a, b);
#endif
#endif // USE_NEON

    } else /* constexpr */ {
        T product;
        auto &[dst] = product;  // unwrap the single-member struct
        const auto &[lhs] = a;
        const auto &[rhs] = b;
        if constexpr (std::is_array_v<decltype(dst)>) {
            // Array payload: multiply element by element.
#pragma unroll
            for (size_t lane = 0; lane < std::size(lhs); ++lane) {
                dst[lane] = vmul(lhs[lane], rhs[lane]);
            }
        } else /* constexpr */ {
            // Pair payload: multiply the two halves independently.
            auto &[dstHead, dstTail] = dst;
            const auto &[lhsHead, lhsTail] = lhs;
            const auto &[rhsHead, rhsTail] = rhs;
            dstHead = vmul(lhsHead, rhsHead);
            dstTail = vmul(lhsTail, rhsTail);
        }
        return product;
    }
}
402 
403 #pragma pop_macro("USE_NEON")
404 
405 // end intrinsics subset
406 
// Number of frames handed to the processor per benchmark iteration; each
// buffer in the benchmark holds kDataSize * channelCount samples.
static constexpr size_t kDataSize{2048};
408 
TestArgs(benchmark::internal::Benchmark * b)409 static void TestArgs(benchmark::internal::Benchmark* b) {
410     constexpr int kChannelCountMin = 1;
411     constexpr int kChannelCountMax = 32;
412     for (int i = kChannelCountMin; i <= kChannelCountMax; ++i) {
413         b->Args({i});
414     }
415 }
416 
// Macro test operator.
//
// Multiplies one group of N samples: reads N elements of F at `in1` and at
// `in2` as V<F, N> values, stores their vmul() product at `out`, then advances
// all three pointers by N.  `F`, `V`, `out`, `in1` and `in2` must be in scope
// where the macro is used.
//
// The body is wrapped in do { } while (0) so that `OPERATOR(N);` expands to
// exactly one statement and stays correct in an unbraced if/else or loop body;
// N is parenthesized so an expression argument cannot change the meaning.

#define OPERATOR(N) \
    do { \
        *reinterpret_cast<V<F, (N)>*>(out) = vmul( \
                *reinterpret_cast<const V<F, (N)>*>(in1), \
                *reinterpret_cast<const V<F, (N)>*>(in2)); \
        out += (N); \
        in1 += (N); \
        in2 += (N); \
    } while (0)
426 
// Macro to instantiate switch case statements.
//
// Expands to `case N:` which stores in mFunc a capture-less kernel that
// applies OPERATOR(N) `count` times, followed by `break`.  The static_assert
// checks that V<F, N> occupies exactly N elements of F, which the pointer
// increments of N inside OPERATOR depend on.  Requires `F`, `V` and `mFunc`
// to be in scope.

#define INSTANTIATE(N) \
    case N: \
    mFunc = [](F* out, const F* in1, const F* in2, size_t count) { \
        static_assert(sizeof(V<F, N>) == N * sizeof(F)); \
        for (size_t frame = 0; frame < count; ++frame) { \
            OPERATOR(N); \
        } \
    }; \
    break;
438 
439 template <typename Traits>
440 class Processor {
441 public:
442     // shorthand aliases
443     using F = typename Traits::data_t;
444     template <typename T, int N>
445     using V = typename Traits::template container_t<T, N>;
446 
Processor(int channelCount)447     Processor(int channelCount)
448         : mChannelCount(channelCount) {
449 
450         if constexpr (Traits::loop_) {
451             mFunc = [channelCount](F* out, const F* in1, const F* in2, size_t count) {
452                 for (size_t i = 0; i < count; ++i) {
453                     for (size_t j = 0; j < channelCount; ++j) {
454                         OPERATOR(1);
455                     }
456                 }
457             };
458             return;
459         }
460         switch (channelCount) {
461         INSTANTIATE(1);
462         INSTANTIATE(2);
463         INSTANTIATE(3);
464         INSTANTIATE(4);
465         INSTANTIATE(5);
466         INSTANTIATE(6);
467         INSTANTIATE(7);
468         INSTANTIATE(8);
469         INSTANTIATE(9);
470         INSTANTIATE(10);
471         INSTANTIATE(11);
472         INSTANTIATE(12);
473         INSTANTIATE(13);
474         INSTANTIATE(14);
475         INSTANTIATE(15);
476         INSTANTIATE(16);
477         INSTANTIATE(17);
478         INSTANTIATE(18);
479         INSTANTIATE(19);
480         INSTANTIATE(20);
481         INSTANTIATE(21);
482         INSTANTIATE(22);
483         INSTANTIATE(23);
484         INSTANTIATE(24);
485         INSTANTIATE(25);
486         INSTANTIATE(26);
487         INSTANTIATE(27);
488         INSTANTIATE(28);
489         INSTANTIATE(29);
490         INSTANTIATE(30);
491         INSTANTIATE(31);
492         INSTANTIATE(32);
493         }
494     }
495 
process(F * out,const F * in1,const F * in2,size_t frames)496     void process(F* out, const F* in1, const F* in2, size_t frames) {
497         mFunc(out, in1, in2, frames);
498     }
499 
500     const size_t mChannelCount;
501     /* const */ std::function<void(F*, const F*, const F*, size_t)> mFunc;
502 };
503 
504 template <typename Traits>
BM_VectorTest(benchmark::State & state)505 static void BM_VectorTest(benchmark::State& state) {
506     using F = typename Traits::data_t;
507     const size_t channelCount = state.range(0);
508 
509     std::vector<F> input1(kDataSize * channelCount);
510     std::vector<F> input2(kDataSize * channelCount);
511     std::vector<F> output(kDataSize * channelCount);
512 
513     // Initialize input buffer and coefs with deterministic pseudo-random values
514     std::minstd_rand gen(42);
515     const F amplitude = 1.;
516     std::uniform_real_distribution<> dis(-amplitude, amplitude);
517     for (auto& in : input1) {
518         in = dis(gen);
519     }
520     for (auto& in : input2) {
521         in = dis(gen);
522     }
523 
524     Processor<Traits> processor(channelCount);
525 
526     // Run the test
527     while (state.KeepRunning()) {
528         benchmark::DoNotOptimize(input1.data());
529         benchmark::DoNotOptimize(input2.data());
530         benchmark::DoNotOptimize(output.data());
531         processor.process(output.data(), input1.data(), input2.data(), kDataSize);
532         benchmark::ClobberMemory();
533     }
534     state.SetComplexityN(channelCount);
535 }
536 
537 // Clang has an issue with -frelaxed-template-template-args where
538 // it may not follow the C++17 guidelines.  Use a traits struct to
539 // pass in parameters.
540 
541 // Test using two loops.
542 struct LoopFloatTraits {
543     template <typename F, int N>
544     using container_t = internal_array_t<F, N>;
545     using data_t = float;
546     static constexpr bool loop_ = true;
547 };
BM_VectorTestLoopFloat(benchmark::State & state)548 static void BM_VectorTestLoopFloat(benchmark::State& state) {
549     BM_VectorTest<LoopFloatTraits>(state);
550 }
551 
552 // Test using two loops, the inner loop is constexpr size.
553 struct ConstArraySizeFloatTraits {
554     template <typename F, int N>
555     using container_t = internal_array_t<F, N>;
556     using data_t = float;
557     static constexpr bool loop_ = false;
558 };
BM_VectorTestConstArraySizeFloat(benchmark::State & state)559 static void BM_VectorTestConstArraySizeFloat(benchmark::State& state) {
560     BM_VectorTest<ConstArraySizeFloatTraits>(state);
561 }
562 
563 // Test using intrinsics, if available.
564 struct ForcedIntrinsicsTraits {
565     template <typename F, int N>
566     using container_t = vector_hw_t<F, N>;
567     using data_t = float;
568     static constexpr bool loop_ = false;
569 };
BM_VectorTestForcedIntrinsics(benchmark::State & state)570 static void BM_VectorTestForcedIntrinsics(benchmark::State& state) {
571     BM_VectorTest<ForcedIntrinsicsTraits>(state);
572 }
573 
574 BENCHMARK(BM_VectorTestLoopFloat)->Apply(TestArgs);
575 
576 BENCHMARK(BM_VectorTestConstArraySizeFloat)->Apply(TestArgs);
577 
578 BENCHMARK(BM_VectorTestForcedIntrinsics)->Apply(TestArgs);
579 
580 BENCHMARK_MAIN();
581