• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2019 Google LLC
2 //
3 // This source code is licensed under the BSD-style license found in the
4 // LICENSE file in the root directory of this source tree.
5 
6 #include <algorithm>
7 #include <cfloat>
8 #include <cmath>
9 #include <functional>
10 #include <random>
11 #include <vector>
12 
13 #include <cpuinfo.h>
14 
15 #include <benchmark/benchmark.h>
16 #include <fp16/fp16.h>
17 #include "bench/conv.h"
18 #include "bench/utils.h"
19 #include <xnnpack/AlignedAllocator.h>
20 #include <xnnpack/common.h>
21 #include <xnnpack/igemm.h>
22 #include <xnnpack/indirection.h>
23 #include <xnnpack/operator.h>
24 #include <xnnpack/pack.h>
25 #include <xnnpack/params-init.h>
26 #include <xnnpack/params.h>
27 
28 
IGEMMBenchmark(benchmark::State & state,xnn_f16_igemm_minmax_ukernel_function f16_igemm,uint32_t mr,uint32_t nr,uint32_t kr,uint32_t sr,xnn_init_f16_scaleminmax_params_fn init_params,benchmark::utils::IsaCheckFunction isa_check=nullptr)29 static void IGEMMBenchmark(benchmark::State& state,
30   xnn_f16_igemm_minmax_ukernel_function f16_igemm,
31   uint32_t mr, uint32_t nr, uint32_t kr, uint32_t sr,
32   xnn_init_f16_scaleminmax_params_fn init_params,
33   benchmark::utils::IsaCheckFunction isa_check = nullptr)
34 {
35   if (isa_check && !isa_check(state)) {
36     return;
37   }
38 
39   const size_t input_height = state.range(0);
40   const size_t input_width = state.range(1);
41   const size_t kernel_height = state.range(2);
42   const size_t kernel_width = state.range(3);
43   const size_t kernel_size = kernel_height * kernel_width;
44   const size_t padding_height = state.range(4);
45   const size_t padding_width = state.range(5);
46   const size_t subsampling = state.range(6);
47   const size_t dilation = state.range(7);
48   const size_t group_input_channels = state.range(8);
49   const size_t group_output_channels = state.range(9);
50 
51   std::random_device random_device;
52   auto rng = std::mt19937(random_device());
53   auto f32rng = std::bind(std::uniform_real_distribution<float>(), std::ref(rng));
54   auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);
55 
56   const size_t output_pixel_stride = group_output_channels;
57   const size_t input_pixel_stride = group_input_channels;
58   const size_t effective_kernel_height = (kernel_height - 1) * dilation + 1;
59   const size_t effective_kernel_width = (kernel_width - 1) * dilation + 1;
60   const size_t padding_left = padding_width / 2;
61   const size_t padding_top = padding_height / 2;
62   const size_t output_height = (input_height + padding_height - effective_kernel_height) / subsampling + 1;
63   const size_t output_width = (input_width + padding_width - effective_kernel_width) / subsampling + 1;
64   const size_t output_size = output_height * output_width;
65 
66   const size_t mc_stride = benchmark::utils::RoundUp<size_t>(output_size, mr);
67   const size_t nc_stride = benchmark::utils::RoundUp<size_t>(group_output_channels, nr);
68   const size_t kc_stride = benchmark::utils::RoundUp<size_t>(group_input_channels, kr * sr);
69 
70   std::vector<uint16_t> a(input_height * input_width * input_pixel_stride);
71   std::generate(a.begin(), a.end(), std::ref(f16rng));
72   std::vector<uint16_t> k(group_output_channels * kernel_height * kernel_width * group_input_channels);
73   std::generate(k.begin(), k.end(), std::ref(f16rng));
74   std::vector<uint16_t> b(group_output_channels);
75   std::generate(b.begin(), b.end(), std::ref(f16rng));
76 
77   std::vector<uint16_t> z(group_input_channels);
78 
79   const size_t w_elements = (kernel_size * kc_stride + 1) * nc_stride;
80   const size_t i_elements = mc_stride * kernel_size;
81   const size_t c_elements = output_height * output_width * output_pixel_stride;
82   const size_t num_buffers = 1 +
83     benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
84       sizeof(uint16_t) * (w_elements + c_elements) + sizeof(void*) * i_elements);
85 
86   std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> w(w_elements * num_buffers);
87   std::fill(w.begin(), w.end(), 0);
88   xnn_pack_f16_conv_goki_w(
89     1 /* groups */, group_output_channels, kernel_size, group_input_channels,
90     nr, kr, sr, k.data(), b.data(), w.data(), 0 /* extra bytes */, nullptr);
91   for (size_t n = 1; n < num_buffers; n++) {
92     std::copy(w.cbegin(), w.cbegin() + w_elements, w.begin() + n * w_elements);
93   }
94 
95   std::vector<const uint16_t*> i(i_elements * num_buffers);
96   xnn_operator convolution_op = { };
97   convolution_op.indirection_buffer   = reinterpret_cast<const void**>(i.data());
98   convolution_op.input                = a.data();
99   convolution_op.input_pixel_stride   = input_pixel_stride;
100   convolution_op.zero_buffer          = z.data();
101   convolution_op.groups               = 1;
102   convolution_op.group_input_channels = group_input_channels;
103   convolution_op.batch_size           = 1;
104   convolution_op.input_height         = input_height;
105   convolution_op.input_width          = input_width;
106   convolution_op.output_height        = output_height;
107   convolution_op.output_width         = output_width;
108   convolution_op.kernel_height        = kernel_height;
109   convolution_op.kernel_width         = kernel_width;
110   convolution_op.stride_height        = subsampling;
111   convolution_op.stride_width         = subsampling;
112   convolution_op.dilation_height      = dilation;
113   convolution_op.dilation_width       = dilation;
114   convolution_op.padding_top          = padding_top;
115   convolution_op.padding_left         = padding_left;
116   xnn_indirection_init_conv2d(&convolution_op, mr, 1 /* log2(sizeof(uint16_t)) */);
117   for (size_t n = 1; n < num_buffers; n++) {
118     std::copy(i.cbegin(), i.cbegin() + i_elements, i.begin() + n * i_elements);
119   }
120 
121   std::vector<uint16_t> c(c_elements * num_buffers);
122   std::fill(c.begin(), c.end(), std::nanf(""));
123 
124   // Prepare minmax parameters.
125   xnn_f16_scaleminmax_params params;
126   init_params(&params,
127     UINT16_C(0x3C00) /* 1.0 */, UINT16_C(0x7C00) /* inf */, UINT16_C(0xFC00) /* -inf */);
128 
129   size_t buffer_index = 0;
130   for (auto _ : state) {
131     state.PauseTiming();
132     benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(uint16_t));
133     buffer_index = (buffer_index + 1) % num_buffers;
134     state.ResumeTiming();
135 
136     for (uint32_t m = 0; m < output_size; m += mr) {
137       const uint32_t mb = min(output_size - m, mr);
138       for (uint32_t n = 0; n < group_output_channels; n += nr) {
139         const uint32_t nb = min(group_output_channels - n, nr);
140         f16_igemm(
141           mb, nb, group_input_channels * sizeof(uint16_t), kernel_size * mr * sizeof(void*),
142           reinterpret_cast<const void**>(i.data()) + buffer_index * i_elements + m,
143           w.data() + buffer_index * w_elements + n * (kc_stride * kernel_size + 1),
144           c.data() + buffer_index * c_elements + m * group_output_channels + n, group_output_channels * sizeof(uint16_t), nr * sizeof(uint16_t),
145           0, z.data(), &params);
146       }
147     }
148   }
149 
150   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
151   if (cpu_frequency != 0) {
152     state.counters["cpufreq"] = cpu_frequency;
153   }
154 
155   state.counters["FLOPS"] = benchmark::Counter(
156     uint64_t(state.iterations()) * 2 *
157       output_height * output_width *
158       group_input_channels * group_output_channels *
159       kernel_height * kernel_width,
160     benchmark::Counter::kIsRate);
161 }
162 
163 #if XNN_ARCH_ARM64
f16_igemm_1x8__neonfp16arith_ld64(benchmark::State & state,const char * net)164   static void f16_igemm_1x8__neonfp16arith_ld64(benchmark::State& state, const char* net) {
165     IGEMMBenchmark(state, xnn_f16_igemm_minmax_ukernel_1x8__neonfp16arith_ld64, 1, 8, 1, 1,
166       xnn_init_f16_scaleminmax_neon_params, benchmark::utils::CheckNEONFP16ARITH);
167   }
f16_igemm_4x8__neonfp16arith_ld64(benchmark::State & state,const char * net)168   static void f16_igemm_4x8__neonfp16arith_ld64(benchmark::State& state, const char* net) {
169     IGEMMBenchmark(state, xnn_f16_igemm_minmax_ukernel_4x8__neonfp16arith_ld64, 4, 8, 1, 1,
170       xnn_init_f16_scaleminmax_neon_params, benchmark::utils::CheckNEONFP16ARITH);
171   }
f16_igemm_6x8__neonfp16arith_ld64(benchmark::State & state,const char * net)172   static void f16_igemm_6x8__neonfp16arith_ld64(benchmark::State& state, const char* net) {
173     IGEMMBenchmark(state, xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64, 6, 8, 1, 1,
174       xnn_init_f16_scaleminmax_neon_params, benchmark::utils::CheckNEONFP16ARITH);
175   }
f16_igemm_8x8__neonfp16arith_ld64(benchmark::State & state,const char * net)176   static void f16_igemm_8x8__neonfp16arith_ld64(benchmark::State& state, const char* net) {
177     IGEMMBenchmark(state, xnn_f16_igemm_minmax_ukernel_8x8__neonfp16arith_ld64, 8, 8, 1, 1,
178       xnn_init_f16_scaleminmax_neon_params, benchmark::utils::CheckNEONFP16ARITH);
179   }
f16_igemm_1x16__neonfp16arith_ld64(benchmark::State & state,const char * net)180   static void f16_igemm_1x16__neonfp16arith_ld64(benchmark::State& state, const char* net) {
181     IGEMMBenchmark(state, xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64, 1, 16, 1, 1,
182       xnn_init_f16_scaleminmax_neon_params, benchmark::utils::CheckNEONFP16ARITH);
183   }
f16_igemm_4x16__neonfp16arith_ld64(benchmark::State & state,const char * net)184   static void f16_igemm_4x16__neonfp16arith_ld64(benchmark::State& state, const char* net) {
185     IGEMMBenchmark(state, xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64, 4, 16, 1, 1,
186       xnn_init_f16_scaleminmax_neon_params, benchmark::utils::CheckNEONFP16ARITH);
187   }
f16_igemm_6x16__neonfp16arith_ld64(benchmark::State & state,const char * net)188   static void f16_igemm_6x16__neonfp16arith_ld64(benchmark::State& state, const char* net) {
189     IGEMMBenchmark(state, xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64, 6, 16, 1, 1,
190       xnn_init_f16_scaleminmax_neon_params, benchmark::utils::CheckNEONFP16ARITH);
191   }
f16_igemm_8x16__neonfp16arith_ld64(benchmark::State & state,const char * net)192   static void f16_igemm_8x16__neonfp16arith_ld64(benchmark::State& state, const char* net) {
193     IGEMMBenchmark(state, xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64, 8, 16, 1, 1,
194       xnn_init_f16_scaleminmax_neon_params, benchmark::utils::CheckNEONFP16ARITH);
195   }
196 
// Hand each ARM64 neonfp16arith_ld64 wrapper to BENCHMARK_CONV, in the same
// order the wrappers are defined. The macro is not defined in this file
// (presumably it comes from "bench/conv.h" -- confirm); the wrappers take a
// `const char* net` argument in addition to the benchmark state.
197   BENCHMARK_CONV(f16_igemm_1x8__neonfp16arith_ld64)
BENCHMARK_CONV(f16_igemm_4x8__neonfp16arith_ld64)198   BENCHMARK_CONV(f16_igemm_4x8__neonfp16arith_ld64)
199   BENCHMARK_CONV(f16_igemm_6x8__neonfp16arith_ld64)
200   BENCHMARK_CONV(f16_igemm_8x8__neonfp16arith_ld64)
201   BENCHMARK_CONV(f16_igemm_1x16__neonfp16arith_ld64)
202   BENCHMARK_CONV(f16_igemm_4x16__neonfp16arith_ld64)
203   BENCHMARK_CONV(f16_igemm_6x16__neonfp16arith_ld64)
204   BENCHMARK_CONV(f16_igemm_8x16__neonfp16arith_ld64)
205 #endif  // XNN_ARCH_ARM64
206 
207 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
208   static void f16_igemm_1x8__avx2_broadcast(benchmark::State& state, const char* net) {
209     IGEMMBenchmark(state, xnn_f16_igemm_minmax_ukernel_1x8__avx2_broadcast, 1, 8, 1, 1,
210       xnn_init_f16_scaleminmax_avx_params, benchmark::utils::CheckAVX2);
211   }
f16_igemm_4x8__avx2_broadcast(benchmark::State & state,const char * net)212   static void f16_igemm_4x8__avx2_broadcast(benchmark::State& state, const char* net) {
213     IGEMMBenchmark(state, xnn_f16_igemm_minmax_ukernel_4x8__avx2_broadcast, 4, 8, 1, 1,
214       xnn_init_f16_scaleminmax_avx_params, benchmark::utils::CheckAVX2);
215   }
f16_igemm_5x8__avx2_broadcast(benchmark::State & state,const char * net)216   static void f16_igemm_5x8__avx2_broadcast(benchmark::State& state, const char* net) {
217     IGEMMBenchmark(state, xnn_f16_igemm_minmax_ukernel_5x8__avx2_broadcast, 5, 8, 1, 1,
218       xnn_init_f16_scaleminmax_avx_params, benchmark::utils::CheckAVX2);
219   }
f16_igemm_6x8__avx2_broadcast(benchmark::State & state,const char * net)220   static void f16_igemm_6x8__avx2_broadcast(benchmark::State& state, const char* net) {
221     IGEMMBenchmark(state, xnn_f16_igemm_minmax_ukernel_6x8__avx2_broadcast, 6, 8, 1, 1,
222       xnn_init_f16_scaleminmax_avx_params, benchmark::utils::CheckAVX2);
223   }
f16_igemm_7x8__avx2_broadcast(benchmark::State & state,const char * net)224   static void f16_igemm_7x8__avx2_broadcast(benchmark::State& state, const char* net) {
225     IGEMMBenchmark(state, xnn_f16_igemm_minmax_ukernel_7x8__avx2_broadcast, 7, 8, 1, 1,
226       xnn_init_f16_scaleminmax_avx_params, benchmark::utils::CheckAVX2);
227   }
f16_igemm_1x16__avx2_broadcast(benchmark::State & state,const char * net)228   static void f16_igemm_1x16__avx2_broadcast(benchmark::State& state, const char* net) {
229     IGEMMBenchmark(state, xnn_f16_igemm_minmax_ukernel_1x16__avx2_broadcast, 1, 16, 1, 1,
230       xnn_init_f16_scaleminmax_avx_params, benchmark::utils::CheckAVX2);
231   }
f16_igemm_3x16__avx2_broadcast(benchmark::State & state,const char * net)232   static void f16_igemm_3x16__avx2_broadcast(benchmark::State& state, const char* net) {
233     IGEMMBenchmark(state, xnn_f16_igemm_minmax_ukernel_3x16__avx2_broadcast, 3, 16, 1, 1,
234       xnn_init_f16_scaleminmax_avx_params, benchmark::utils::CheckAVX2);
235   }
f16_igemm_4x16__avx2_broadcast(benchmark::State & state,const char * net)236   static void f16_igemm_4x16__avx2_broadcast(benchmark::State& state, const char* net) {
237     IGEMMBenchmark(state, xnn_f16_igemm_minmax_ukernel_4x16__avx2_broadcast, 4, 16, 1, 1,
238       xnn_init_f16_scaleminmax_avx_params, benchmark::utils::CheckAVX2);
239   }
f16_igemm_5x16__avx2_broadcast(benchmark::State & state,const char * net)240   static void f16_igemm_5x16__avx2_broadcast(benchmark::State& state, const char* net) {
241     IGEMMBenchmark(state, xnn_f16_igemm_minmax_ukernel_5x16__avx2_broadcast, 5, 16, 1, 1,
242       xnn_init_f16_scaleminmax_avx_params, benchmark::utils::CheckAVX2);
243   }
244 
// Hand each x86 avx2_broadcast wrapper to BENCHMARK_CONV, in the same order
// the wrappers are defined. The macro is not defined in this file (presumably
// it comes from "bench/conv.h" -- confirm); the wrappers take a
// `const char* net` argument in addition to the benchmark state.
245   BENCHMARK_CONV(f16_igemm_1x8__avx2_broadcast)
246   BENCHMARK_CONV(f16_igemm_4x8__avx2_broadcast)
247   BENCHMARK_CONV(f16_igemm_5x8__avx2_broadcast)
248   BENCHMARK_CONV(f16_igemm_6x8__avx2_broadcast)
249   BENCHMARK_CONV(f16_igemm_7x8__avx2_broadcast)
250   BENCHMARK_CONV(f16_igemm_1x16__avx2_broadcast)
251   BENCHMARK_CONV(f16_igemm_3x16__avx2_broadcast)
252   BENCHMARK_CONV(f16_igemm_4x16__avx2_broadcast)
253   BENCHMARK_CONV(f16_igemm_5x16__avx2_broadcast)
254 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
255 
256 #ifndef XNNPACK_BENCHMARK_NO_MAIN
257 BENCHMARK_MAIN();
258 #endif
259