// Copyright (c) Facebook, Inc. and its affiliates.
// All rights reserved.
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#pragma once

#include <gtest/gtest.h>

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstddef>
#include <cstdlib>
#include <functional>
#include <limits>
#include <random>
#include <vector>

#include <fp16.h>

#include <xnnpack.h>
#include <xnnpack/AlignedAllocator.h>
#include <xnnpack/pack.h>
#include <xnnpack/params-init.h>
#include <xnnpack/params.h>
#include <xnnpack/requantization.h>

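// Tester for unipass depthwise-convolution (DWConv) micro-kernels. The fluent
// setters below configure the test case; each Test() overload then packs the
// weights, builds an indirection buffer, computes a reference result, runs the
// micro-kernel under test, and checks its output against the reference.
//
// Typical usage (a minimal sketch; the micro-kernel name below is a
// placeholder for whichever implementation the instantiating test passes in):
//
//   DWConvMicrokernelTester()
//     .cr(4)        // channel tile of the micro-kernel
//     .kr(9)        // kernel tile, e.g. 9 taps for a 3x3 filter
//     .channels(8)
//     .width(5)
//     .Test(xnn_f32_dwconv_minmax_ukernel_up4x9__scalar);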
class DWConvMicrokernelTester {
 public:
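  // Selects which parameter-initialization path a Test() overload exercises:
  // Native uses the default (architecture-specific) initializer, Scalar the
  // portable scalar one.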
  enum class Variant {
    Native,
    Scalar,
  };

  inline DWConvMicrokernelTester& width(uint32_t width) {
    assert(width >= 1);
    this->width_ = width;
    return *this;
  }

  inline uint32_t width() const {
    return this->width_;
  }

  inline DWConvMicrokernelTester& step(uint32_t step) {
    assert(step >= 1);
    this->step_ = step;
    return *this;
  }

  inline uint32_t step() const {
    return this->step_;
  }

  inline DWConvMicrokernelTester& channels(uint32_t channels) {
    assert(channels >= 1);
    this->channels_ = channels;
    return *this;
  }

  inline uint32_t channels() const {
    return this->channels_;
  }

  inline DWConvMicrokernelTester& cr(uint32_t cr) {
    assert(cr != 0);
    this->cr_ = cr;
    return *this;
  }

  inline uint32_t cr() const {
    return this->cr_;
  }

  inline DWConvMicrokernelTester& kr(uint32_t kr) {
    assert(kr != 0);
    this->kr_ = kr;
    return *this;
  }

  inline uint32_t kr() const {
    return this->kr_;
  }

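  // Number of channels rounded up to the next multiple of the channel tile
  // cr(), i.e. the channel dimension of the packed weights.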
  inline uint32_t packed_channels() const {
    return (channels() / cr() + !!(channels() % cr())) * cr();
  }

  inline DWConvMicrokernelTester& output_stride(uint32_t output_stride) {
    assert(output_stride != 0);
    this->output_stride_ = output_stride;
    return *this;
  }

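  // Output stride in elements; the default value 0 means "dense output",
  // i.e. a stride of channels().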
  inline uint32_t output_stride() const {
    if (this->output_stride_ == 0) {
      return channels();
    } else {
      assert(this->output_stride_ >= channels());
      return this->output_stride_;
    }
  }

  inline DWConvMicrokernelTester& input_zero_point(uint8_t input_zero_point) {
    this->input_zero_point_ = input_zero_point;
    return *this;
  }

  inline uint8_t input_zero_point() const {
    return this->input_zero_point_;
  }

  inline DWConvMicrokernelTester& kernel_zero_point(uint8_t kernel_zero_point) {
    this->kernel_zero_point_ = kernel_zero_point;
    return *this;
  }

  inline uint8_t kernel_zero_point() const {
    return this->kernel_zero_point_;
  }

  inline DWConvMicrokernelTester& qmin(uint8_t qmin) {
    this->qmin_ = qmin;
    return *this;
  }

  inline uint8_t qmin() const {
    return this->qmin_;
  }

  inline DWConvMicrokernelTester& qmax(uint8_t qmax) {
    this->qmax_ = qmax;
    return *this;
  }

  inline uint8_t qmax() const {
    return this->qmax_;
  }

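  // Offset (in elements) subtracted from each indirection pointer when the
  // buffer is built; the micro-kernel is expected to add it back.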
  inline DWConvMicrokernelTester& input_offset(size_t input_offset) {
    this->input_offset_ = input_offset;
    return *this;
  }

  inline size_t input_offset() const {
    return this->input_offset_;
  }

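  // If set (i.e. not SIZE_MAX), the indirection entry at this index within
  // each group of kr() pointers is redirected to the zero buffer, exercising
  // the micro-kernel's handling of padding.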
  inline DWConvMicrokernelTester& zero_index(size_t zero_index) {
    this->zero_index_ = zero_index;
    return *this;
  }

  inline size_t zero_index() const {
    return this->zero_index_;
  }

  inline DWConvMicrokernelTester& iterations(size_t iterations) {
    this->iterations_ = iterations;
    return *this;
  }

  inline size_t iterations() const {
    return this->iterations_;
  }

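  // Tests a QU8 (asymmetric uint8) minmax unipass DWConv micro-kernel: packs
  // quantized weights and bias, computes int32 reference accumulators, derives
  // requantization parameters from the observed accumulator range, and checks
  // the micro-kernel output bit-exactly against the requantized reference.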
  void Test(xnn_qu8_dwconv_minmax_unipass_ukernel_function dwconv_minmax, Variant variant = Variant::Native) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    auto i32rng = std::bind(std::uniform_int_distribution<int32_t>(-10000, 10000), rng);
    auto u8rng = std::bind(std::uniform_int_distribution<uint32_t>(0, std::numeric_limits<uint8_t>::max()), rng);

    std::vector<const uint8_t*> indirection((width() - 1) * step() + kr());
    std::vector<uint8_t> input(XNN_EXTRA_BYTES / sizeof(uint8_t) + indirection.size() * channels());
    std::vector<uint8_t> kernel(channels() * kr());
    std::vector<int32_t> bias(channels());
    std::vector<uint8_t, AlignedAllocator<uint8_t, 64>> packed_weights((kr() + sizeof(int32_t) / sizeof(uint8_t)) * packed_channels());
    std::vector<uint8_t> zero(channels() + XNN_EXTRA_BYTES / sizeof(uint8_t));
    std::vector<uint8_t> output((width() - 1) * output_stride() + channels());
    std::vector<int32_t> accumulators(width() * channels());
    std::vector<uint8_t> output_ref(width() * channels());

    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      do {
        std::generate(input.begin(), input.end(), std::ref(u8rng));
      } while (input.size() > 1 && *std::max_element(input.cbegin(), input.cend()) == *std::min_element(input.cbegin(), input.cend()));
      do {
        std::generate(kernel.begin(), kernel.end(), std::ref(u8rng));
      } while (kernel.size() > 1 && *std::max_element(kernel.cbegin(), kernel.cend()) == *std::min_element(kernel.cbegin(), kernel.cend()));
      std::generate(bias.begin(), bias.end(), std::ref(i32rng));
      std::fill(zero.begin(), zero.end(), input_zero_point());
      std::fill(output.begin(), output.end(), 0xA5);

      std::fill(packed_weights.begin(), packed_weights.end(), 0);
      const xnn_qu8_packing_params packing_params = { input_zero_point(), kernel_zero_point() };
      xnn_pack_qu8_dwconv_ghw_w(
        kr(), 1, channels(), cr(),
        kernel.data(), bias.data(), packed_weights.data(), &packing_params);
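      // Each indirection entry points at one input pixel, pre-biased by
      // -input_offset() (the micro-kernel adds input_offset() back). Entries
      // are shuffled so the kernel cannot rely on any pointer ordering, and,
      // if requested, the zero_index()-th tap of every group is redirected to
      // the zero buffer.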
      for (size_t i = 0; i < indirection.size(); i++) {
        indirection[i] = input.data() + i * channels() - input_offset();
      }
      std::shuffle(indirection.begin(), indirection.end(), rng);
      if (zero_index() != SIZE_MAX) {
        for (size_t i = 0; i < indirection.size(); i += kr()) {
          indirection[i + zero_index()] = zero.data();
        }
      }

      // Compute reference results, without renormalization.
      for (size_t x = 0; x < width(); x++) {
        for (size_t c = 0; c < channels(); c++) {
          int32_t acc = bias[c];
          for (size_t k = 0; k < kr(); k++) {
            if (indirection[x * step() + k] != zero.data()) {
              acc +=
                (int32_t(indirection[x * step() + k][c + input_offset()]) - int32_t(input_zero_point())) *
                (int32_t(kernel[c * kr() + k]) - int32_t(kernel_zero_point()));
            }
          }
          accumulators[x * channels() + c] = acc;
        }
      }

      // Compute renormalization parameters.
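      // The scale maps the observed accumulator range onto the 255 steps of
      // the uint8 grid (or stays ~1 for narrow ranges), and the zero point
      // centers that range so requantized outputs span the quantized domain.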
      const int32_t accumulated_min = *std::min_element(accumulators.cbegin(), accumulators.cend());
      const int32_t accumulated_max = *std::max_element(accumulators.cbegin(), accumulators.cend());
      const uint32_t accumulated_range = uint32_t(accumulated_max) - uint32_t(accumulated_min);
      const double output_scale = accumulated_range >= 256 ? double(accumulated_range) / 255.0 : 1.00001;
      const uint8_t output_zero_point = uint8_t(std::max(std::min(
        lrint(127.5 - 0.5 * double(accumulated_min + accumulated_max) / output_scale),
        long(std::numeric_limits<uint8_t>::max())), long(std::numeric_limits<uint8_t>::min())));

      // Prepare parameters.
      const float requantization_scale = 1.0f / float(output_scale);
      union xnn_qu8_gemm_params quantization_params = { };
      switch (variant) {
        case Variant::Native:
          quantization_params = xnn_init_qu8_gemm_params(
            kernel_zero_point(), requantization_scale, output_zero_point, qmin(), qmax());
          break;
        case Variant::Scalar:
          quantization_params = xnn_init_scalar_qu8_gemm_params(
            kernel_zero_point(), requantization_scale, output_zero_point, qmin(), qmax());
          break;
      }
      const union xnn_qu8_requantization_params scalar_requantization_params =
        xnn_init_scalar_qu8_requantization_params(requantization_scale, output_zero_point, qmin(), qmax());

      // Renormalize reference results.
      for (size_t x = 0; x < width(); x++) {
        for (size_t c = 0; c < channels(); c++) {
          output_ref[x * channels() + c] = xnn_qu8_requantize_q31(accumulators[x * channels() + c], scalar_requantization_params);
        }
      }

      // Call optimized micro-kernel.
      dwconv_minmax(
        channels(), width(),
        indirection.data(), packed_weights.data(), output.data(),
        step() * sizeof(void*),
        (output_stride() - channels()) * sizeof(uint8_t),
        input_offset() * sizeof(uint8_t), zero.data(),
        &quantization_params);

      // Verify results.
      for (size_t x = 0; x < width(); x++) {
        for (size_t c = 0; c < channels(); c++) {
          ASSERT_GE(uint32_t(output[x * output_stride() + c]), uint32_t(qmin()))
            << "x = " << x << ", channel = " << c;
          ASSERT_LE(uint32_t(output[x * output_stride() + c]), uint32_t(qmax()))
            << "x = " << x << ", channel = " << c;
          ASSERT_EQ(uint32_t(output[x * output_stride() + c]), uint32_t(output_ref[x * channels() + c]))
            << "x = " << x << ", channel = " << c << ", accumulator = " << accumulators[x * channels() + c];
        }
      }
    }
  }

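  // Tests a QS8 (symmetric int8) minmax unipass DWConv micro-kernel. The flow
  // mirrors the QU8 test above, except that the kernel zero point is fixed at
  // zero and the uint8-centric knobs (zero point, qmin/qmax) are shifted by
  // 0x80 into the int8 domain.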
  void Test(xnn_qs8_dwconv_minmax_unipass_ukernel_function dwconv_minmax, Variant variant = Variant::Native) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    auto i32rng = std::bind(std::uniform_int_distribution<int32_t>(-10000, 10000), rng);
    auto i8rng = std::bind(
      std::uniform_int_distribution<int32_t>(std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max()), rng);

    std::vector<const int8_t*> indirection((width() - 1) * step() + kr());
    std::vector<int8_t> input(XNN_EXTRA_BYTES / sizeof(int8_t) + indirection.size() * channels());
    std::vector<int8_t> kernel(channels() * kr());
    std::vector<int32_t> bias(channels());
    std::vector<int8_t, AlignedAllocator<int8_t, 64>> packed_weights((kr() + sizeof(int32_t) / sizeof(int8_t)) * packed_channels());
    std::vector<int8_t> zero(channels() + XNN_EXTRA_BYTES / sizeof(int8_t));
    std::vector<int8_t> output((width() - 1) * output_stride() + channels());
    std::vector<int32_t> accumulators(width() * channels());
    std::vector<int8_t> output_ref(width() * channels());

    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      do {
        std::generate(input.begin(), input.end(), std::ref(i8rng));
      } while (input.size() > 1 && *std::max_element(input.cbegin(), input.cend()) == *std::min_element(input.cbegin(), input.cend()));
      do {
        std::generate(kernel.begin(), kernel.end(), std::ref(i8rng));
      } while (kernel.size() > 1 && *std::max_element(kernel.cbegin(), kernel.cend()) == *std::min_element(kernel.cbegin(), kernel.cend()));
      std::generate(bias.begin(), bias.end(), std::ref(i32rng));
      std::fill(zero.begin(), zero.end(), int8_t(input_zero_point() - 0x80));
      std::fill(output.begin(), output.end(), 0xA5);

      std::fill(packed_weights.begin(), packed_weights.end(), 0);
      const xnn_qs8_packing_params packing_params = { int8_t(input_zero_point() - 0x80) };
      xnn_pack_qs8_dwconv_ghw_w(
        kr(), 1, channels(), cr(),
        kernel.data(), bias.data(), packed_weights.data(), &packing_params);
      for (size_t i = 0; i < indirection.size(); i++) {
        indirection[i] = input.data() + i * channels() - input_offset();
      }
      std::shuffle(indirection.begin(), indirection.end(), rng);
      if (zero_index() != SIZE_MAX) {
        for (size_t i = 0; i < indirection.size(); i += kr()) {
          indirection[i + zero_index()] = zero.data();
        }
      }

      // Compute reference results, without renormalization.
      for (size_t x = 0; x < width(); x++) {
        for (size_t c = 0; c < channels(); c++) {
          int32_t acc = bias[c];
          for (size_t k = 0; k < kr(); k++) {
            if (indirection[x * step() + k] != zero.data()) {
              acc +=
                (int32_t(indirection[x * step() + k][c + input_offset()]) - int32_t(input_zero_point() - 0x80)) *
                int32_t(kernel[c * kr() + k]);
            }
          }
          accumulators[x * channels() + c] = acc;
        }
      }

      // Compute renormalization parameters.
      const int32_t accumulated_min = *std::min_element(accumulators.cbegin(), accumulators.cend());
      const int32_t accumulated_max = *std::max_element(accumulators.cbegin(), accumulators.cend());
      const uint32_t accumulated_range = uint32_t(accumulated_max) - uint32_t(accumulated_min);
      const double output_scale = accumulated_range >= 256 ? double(accumulated_range) / 255.0 : 1.00001;
      const int8_t output_zero_point = int8_t(std::max(std::min(
        lrint(-0.5 - 0.5 * double(accumulated_min + accumulated_max) / output_scale),
        long(std::numeric_limits<int8_t>::max())), long(std::numeric_limits<int8_t>::min())));

      // Prepare parameters.
      const float requantization_scale = 1.0f / float(output_scale);
      union xnn_qs8_gemm_params quantization_params = { };
      switch (variant) {
        case Variant::Native:
          quantization_params = xnn_init_qs8_gemm_params(
            requantization_scale, output_zero_point, int8_t(qmin() - 0x80), int8_t(qmax() - 0x80));
          break;
        case Variant::Scalar:
          quantization_params = xnn_init_scalar_qs8_gemm_params(
            requantization_scale, output_zero_point, int8_t(qmin() - 0x80), int8_t(qmax() - 0x80));
          break;
      }
      const union xnn_qs8_requantization_params scalar_requantization_params =
        xnn_init_scalar_qs8_requantization_params(requantization_scale, output_zero_point, int8_t(qmin() - 0x80), int8_t(qmax() - 0x80));

      // Renormalize reference results.
      for (size_t x = 0; x < width(); x++) {
        for (size_t c = 0; c < channels(); c++) {
          output_ref[x * channels() + c] = xnn_qs8_requantize_q31(accumulators[x * channels() + c], scalar_requantization_params);
        }
      }

      // Call optimized micro-kernel.
      dwconv_minmax(
        channels(), width(),
        indirection.data(), packed_weights.data(), output.data(),
        step() * sizeof(void*),
        (output_stride() - channels()) * sizeof(int8_t),
        input_offset() * sizeof(int8_t), zero.data(),
        &quantization_params);

      // Verify results.
      for (size_t x = 0; x < width(); x++) {
        for (size_t c = 0; c < channels(); c++) {
          ASSERT_GE(int32_t(output[x * output_stride() + c]), int32_t(qmin()) - 0x80)
            << "x = " << x << ", channel = " << c;
          ASSERT_LE(int32_t(output[x * output_stride() + c]), int32_t(qmax()) - 0x80)
            << "x = " << x << ", channel = " << c;
          ASSERT_EQ(int32_t(output[x * output_stride() + c]), int32_t(output_ref[x * channels() + c]))
            << "x = " << x << ", channel = " << c << ", accumulator = " << accumulators[x * channels() + c];
        }
      }
    }
  }

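  // Tests an F16 (IEEE half-precision) minmax unipass DWConv micro-kernel.
  // Reference results are accumulated in fp32; clamp bounds are derived from
  // the observed output range via qmin()/qmax(), and the comparison uses a
  // relative tolerance appropriate for half precision.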
  void Test(xnn_f16_dwconv_minmax_unipass_ukernel_function dwconv_minmax, Variant variant = Variant::Native) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    auto f32rng = std::bind(std::uniform_real_distribution<float>(0.0f, 1.0f), rng);
    auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);

    std::vector<const uint16_t*> indirection((width() - 1) * step() + kr());
    std::vector<uint16_t> input(XNN_EXTRA_BYTES / sizeof(uint16_t) + indirection.size() * channels());
    std::vector<uint16_t> kernel(channels() * kr());
    std::vector<uint16_t> bias(channels());
    std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> packed_weights((kr() + 1) * packed_channels());
    std::vector<uint16_t> zero(channels() + XNN_EXTRA_BYTES / sizeof(uint16_t));
    std::vector<uint16_t> output((width() - 1) * output_stride() + channels());
    std::vector<float> output_ref(width() * channels());

    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      std::generate(input.begin(), input.end(), std::ref(f16rng));
      std::generate(kernel.begin(), kernel.end(), std::ref(f16rng));
      std::generate(bias.begin(), bias.end(), std::ref(f16rng));
      std::fill(zero.begin(), zero.end(), 0);
      std::fill(output_ref.begin(), output_ref.end(), 0.0f);
      std::fill(output.begin(), output.end(), UINT16_C(0x7E00) /* NaN */);

      std::fill(packed_weights.begin(), packed_weights.end(), 0);
      xnn_pack_f16_dwconv_ghw_w(
        kr(), 1, channels(), cr(),
        kernel.data(), bias.data(), packed_weights.data(), nullptr);
      for (size_t i = 0; i < indirection.size(); i++) {
        indirection[i] = input.data() + i * channels() - input_offset();
      }
      std::shuffle(indirection.begin(), indirection.end(), rng);
      if (zero_index() != SIZE_MAX) {
        for (size_t i = 0; i < indirection.size(); i += kr()) {
          indirection[i + zero_index()] = zero.data();
        }
      }

      // Compute reference results, without clamping.
      for (size_t x = 0; x < width(); x++) {
        for (size_t c = 0; c < channels(); c++) {
          float acc = fp16_ieee_to_fp32_value(bias[c]);
          for (size_t k = 0; k < kr(); k++) {
            if (indirection[x * step() + k] != zero.data()) {
              acc += fp16_ieee_to_fp32_value(indirection[x * step() + k][c + input_offset()]) * fp16_ieee_to_fp32_value(kernel[c * kr() + k]);
            }
          }
          output_ref[x * channels() + c] = acc;
        }
      }

      // Compute clamping parameters.
      const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend());
      const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend());
      const float accumulated_range = accumulated_max - accumulated_min;
      const float output_min = fp16_ieee_to_fp32_value(fp16_ieee_from_fp32_value(accumulated_min + accumulated_range / 255.0f * float(qmin())));
      const float output_max = fp16_ieee_to_fp32_value(fp16_ieee_from_fp32_value(accumulated_max - accumulated_range / 255.0f * float(255 - qmax())));

      // Prepare parameters.
      xnn_f16_minmax_params params = xnn_init_f16_minmax_params(
        fp16_ieee_from_fp32_value(output_min),
        fp16_ieee_from_fp32_value(output_max));

      // Clamp reference results.
      for (float& output_val : output_ref) {
        output_val = std::max(std::min(output_val, output_max), output_min);
      }

      // Call optimized micro-kernel.
      dwconv_minmax(
        channels(), width(),
        reinterpret_cast<const void**>(indirection.data()), packed_weights.data(), output.data(),
        step() * sizeof(void*),
        (output_stride() - channels()) * sizeof(uint16_t),
        input_offset() * sizeof(uint16_t), zero.data(),
        &params);

      // Verify results.
      for (size_t x = 0; x < width(); x++) {
        for (size_t c = 0; c < channels(); c++) {
          ASSERT_GE(fp16_ieee_to_fp32_value(output[x * output_stride() + c]), output_min)
            << "x = " << x << ", channel = " << c;
          ASSERT_LE(fp16_ieee_to_fp32_value(output[x * output_stride() + c]), output_max)
            << "x = " << x << ", channel = " << c;
          ASSERT_NEAR(output_ref[x * channels() + c], fp16_ieee_to_fp32_value(output[x * output_stride() + c]), std::max(1.0e-4f, std::abs(output_ref[x * channels() + c]) * 1.0e-2f))
            << "x = " << x << ", channel = " << c;
        }
      }
    }
  }

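  // Tests an F32 unipass DWConv micro-kernel without min/max clamping; the
  // output is compared against an fp32 reference with a small relative
  // tolerance.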
  void Test(xnn_f32_dwconv_unipass_ukernel_function dwconv) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    auto f32rng = std::bind(std::uniform_real_distribution<float>(0.0f, 1.0f), rng);

    std::vector<const float*> indirection((width() - 1) * step() + kr());
    std::vector<float> input(XNN_EXTRA_BYTES / sizeof(float) + indirection.size() * channels());
    std::vector<float> kernel(channels() * kr());
    std::vector<float> bias(channels());
    std::vector<float, AlignedAllocator<float, 64>> packed_weights((kr() + 1) * packed_channels());
    std::vector<float> zero(channels() + XNN_EXTRA_BYTES / sizeof(float));
    std::vector<float> output((width() - 1) * output_stride() + channels());
    std::vector<float> output_ref(width() * channels());

    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      std::generate(input.begin(), input.end(), std::ref(f32rng));
      std::generate(kernel.begin(), kernel.end(), std::ref(f32rng));
      std::generate(bias.begin(), bias.end(), std::ref(f32rng));
      std::fill(zero.begin(), zero.end(), 0.0f);
      std::fill(output_ref.begin(), output_ref.end(), nanf(""));
      std::fill(output.begin(), output.end(), nanf(""));

      std::fill(packed_weights.begin(), packed_weights.end(), 0.0f);
      xnn_pack_f32_dwconv_ghw_w(
        kr(), 1, channels(), cr(),
        kernel.data(), bias.data(), packed_weights.data(), nullptr);
      for (size_t i = 0; i < indirection.size(); i++) {
        indirection[i] = input.data() + i * channels() - input_offset();
      }
      std::shuffle(indirection.begin(), indirection.end(), rng);
      if (zero_index() != SIZE_MAX) {
        for (size_t i = 0; i < indirection.size(); i += kr()) {
          indirection[i + zero_index()] = zero.data();
        }
      }

      // Compute reference results, without clamping.
      for (size_t x = 0; x < width(); x++) {
        for (size_t c = 0; c < channels(); c++) {
          float acc = bias[c];
          for (size_t k = 0; k < kr(); k++) {
            if (indirection[x * step() + k] != zero.data()) {
              acc += indirection[x * step() + k][c + input_offset()] * kernel[c * kr() + k];
            }
          }
          output_ref[x * channels() + c] = acc;
        }
      }

      // Call optimized micro-kernel.
      dwconv(
        channels(), width(),
        indirection.data(), packed_weights.data(), output.data(),
        step() * sizeof(void*),
        (output_stride() - channels()) * sizeof(float),
        input_offset() * sizeof(float), zero.data(),
        nullptr);

      // Verify results.
      for (size_t x = 0; x < width(); x++) {
        for (size_t c = 0; c < channels(); c++) {
          ASSERT_NEAR(
              output_ref[x * channels() + c],
              output[x * output_stride() + c],
              std::abs(output_ref[x * channels() + c]) * 1.0e-5)
            << "x = " << x << ", channel = " << c;
        }
      }
    }
  }

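  // Tests an F32 minmax unipass DWConv micro-kernel: same flow as the plain
  // F32 test above, plus clamp bounds derived from the observed output range
  // and checked on every output element.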
  void Test(xnn_f32_dwconv_minmax_unipass_ukernel_function dwconv_minmax, Variant variant = Variant::Native) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    auto f32rng = std::bind(std::uniform_real_distribution<float>(0.0f, 1.0f), rng);

    std::vector<const float*> indirection((width() - 1) * step() + kr());
    std::vector<float> input(XNN_EXTRA_BYTES / sizeof(float) + indirection.size() * channels());
    std::vector<float> kernel(channels() * kr());
    std::vector<float> bias(channels());
    std::vector<float, AlignedAllocator<float, 64>> packed_weights((kr() + 1) * packed_channels());
    std::vector<float> zero(channels() + XNN_EXTRA_BYTES / sizeof(float));
    std::vector<float> output((width() - 1) * output_stride() + channels());
    std::vector<float> output_ref(width() * channels());

    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      std::generate(input.begin(), input.end(), std::ref(f32rng));
      std::generate(kernel.begin(), kernel.end(), std::ref(f32rng));
      std::generate(bias.begin(), bias.end(), std::ref(f32rng));
      std::fill(zero.begin(), zero.end(), 0.0f);
      std::fill(output_ref.begin(), output_ref.end(), nanf(""));
      std::fill(output.begin(), output.end(), nanf(""));

      std::fill(packed_weights.begin(), packed_weights.end(), 0.0f);
      xnn_pack_f32_dwconv_ghw_w(
        kr(), 1, channels(), cr(),
        kernel.data(), bias.data(), packed_weights.data(), nullptr);
      for (size_t i = 0; i < indirection.size(); i++) {
        indirection[i] = input.data() + i * channels() - input_offset();
      }
      std::shuffle(indirection.begin(), indirection.end(), rng);
      if (zero_index() != SIZE_MAX) {
        for (size_t i = 0; i < indirection.size(); i += kr()) {
          indirection[i + zero_index()] = zero.data();
        }
      }

      // Compute reference results, without clamping.
      for (size_t x = 0; x < width(); x++) {
        for (size_t c = 0; c < channels(); c++) {
          float acc = bias[c];
          for (size_t k = 0; k < kr(); k++) {
            if (indirection[x * step() + k] != zero.data()) {
              acc += indirection[x * step() + k][c + input_offset()] * kernel[c * kr() + k];
            }
          }
          output_ref[x * channels() + c] = acc;
        }
      }

      // Compute clamping parameters.
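      // qmin() and qmax() are interpreted as fractions of the observed output
      // range: the bounds cut qmin/255 off the bottom and (255 - qmax)/255 off
      // the top, so non-default settings guarantee that clamping takes effect.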
      const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend());
      const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend());
      const float accumulated_range = accumulated_max - accumulated_min;
      const float output_min = accumulated_min + accumulated_range / 255.0f * float(qmin());
      const float output_max = accumulated_max - accumulated_range / 255.0f * float(255 - qmax());

      // Prepare parameters.
      xnn_f32_minmax_params params = { };
      switch (variant) {
        case Variant::Native:
          params = xnn_init_f32_minmax_params(output_min, output_max);
          break;
        case Variant::Scalar:
          params = xnn_init_scalar_f32_minmax_params(output_min, output_max);
          break;
      }

      // Clamp reference results.
      for (float& output_val : output_ref) {
        output_val = std::max(std::min(output_val, output_max), output_min);
      }

      // Call optimized micro-kernel.
      dwconv_minmax(
        channels(), width(),
        indirection.data(), packed_weights.data(), output.data(),
        step() * sizeof(void*),
        (output_stride() - channels()) * sizeof(float),
        input_offset() * sizeof(float), zero.data(),
        &params);

      // Verify results.
      for (size_t x = 0; x < width(); x++) {
        for (size_t c = 0; c < channels(); c++) {
          ASSERT_GE(output[x * output_stride() + c], output_min)
            << "x = " << x << ", channel = " << c;
          ASSERT_LE(output[x * output_stride() + c], output_max)
            << "x = " << x << ", channel = " << c;
          ASSERT_NEAR(
              output_ref[x * channels() + c],
              output[x * output_stride() + c],
              std::abs(output_ref[x * channels() + c]) * 1.0e-5)
            << "x = " << x << ", channel = " << c;
        }
      }
    }
  }

 private:
  uint32_t channels_{1};
  uint32_t cr_{1};
  uint32_t kr_{1};
  uint32_t width_{1};
  uint32_t step_{1};
  uint32_t output_stride_{0};
  uint8_t input_zero_point_{127};
  uint8_t kernel_zero_point_{127};
  uint8_t qmin_{0};
  uint8_t qmax_{255};
  size_t input_offset_{0};
  size_t zero_index_{SIZE_MAX};
  size_t iterations_{3};
};