// Copyright (c) Facebook, Inc. and its affiliates.
// All rights reserved.
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#pragma once

#include <gtest/gtest.h>

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <limits>
#include <random>
#include <vector>

#include <fp16.h>

#include <xnnpack.h>
#include <xnnpack/aligned-allocator.h>
#include <xnnpack/pack.h>
#include <xnnpack/microfnptr.h>
#include <xnnpack/microparams-init.h>
#include <xnnpack/requantization.h>

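// Tester for XNNPACK unipass depthwise-convolution (DWCONV) micro-kernels.
// It packs randomized weights and biases, builds an indirection buffer of
// input-row pointers, invokes the micro-kernel under test, and compares the
// output element-for-element against a scalar reference computation.
//
// A minimal usage sketch (the kernel and init-function names below are
// hypothetical placeholders for whichever micro-kernel is under test):
//
//   DWConvMicrokernelTester()
//     .cr(8)           // channel tile of the micro-kernel
//     .kr(9)           // kernel tile, e.g. 3x3 = 9 taps
//     .channels(17)
//     .width(5)
//     .Test(my_f32_dwconv_minmax_ukernel, my_init_f32_minmax_params);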
class DWConvMicrokernelTester {
 public:
  inline DWConvMicrokernelTester& width(uint32_t width) {
    assert(width >= 1);
    this->width_ = width;
    return *this;
  }

  inline uint32_t width() const {
    return this->width_;
  }

  inline DWConvMicrokernelTester& step(uint32_t step) {
    assert(step >= 1);
    this->step_ = step;
    return *this;
  }

  inline uint32_t step() const {
    return this->step_;
  }

  inline DWConvMicrokernelTester& channels(uint32_t channels) {
    assert(channels >= 1);
    this->channels_ = channels;
    return *this;
  }

  inline uint32_t channels() const {
    return this->channels_;
  }

  inline DWConvMicrokernelTester& cr(uint32_t cr) {
    assert(cr != 0);
    this->cr_ = cr;
    return *this;
  }

  inline uint32_t cr() const {
    return this->cr_;
  }

  inline DWConvMicrokernelTester& kr(uint32_t kr) {
    assert(kr != 0);
    this->kr_ = kr;
    return *this;
  }

  inline uint32_t kr() const {
    return this->kr_;
  }

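  // Number of channels rounded up to the next multiple of the channel tile
  // cr(), i.e. ceil(channels / cr) * cr; this is the channel count the
  // packed-weights buffer is sized for.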
  inline uint32_t packed_channels() const {
    return (channels() / cr() + !!(channels() % cr())) * cr();
  }

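  // Stride, in elements, between consecutive output pixels. Defaults to
  // channels() (densely packed output) when left unset.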
  inline DWConvMicrokernelTester& output_stride(uint32_t output_stride) {
    assert(output_stride != 0);
    this->output_stride_ = output_stride;
    return *this;
  }

  inline uint32_t output_stride() const {
    if (this->output_stride_ == 0) {
      return channels();
    } else {
      assert(this->output_stride_ >= channels());
      return this->output_stride_;
    }
  }

  inline DWConvMicrokernelTester& input_zero_point(uint8_t input_zero_point) {
    this->input_zero_point_ = input_zero_point;
    return *this;
  }

  inline uint8_t input_zero_point() const {
    return this->input_zero_point_;
  }

  inline DWConvMicrokernelTester& kernel_zero_point(uint8_t kernel_zero_point) {
    this->kernel_zero_point_ = kernel_zero_point;
    return *this;
  }

  inline uint8_t kernel_zero_point() const {
    return this->kernel_zero_point_;
  }

  inline DWConvMicrokernelTester& qmin(uint8_t qmin) {
    this->qmin_ = qmin;
    return *this;
  }

  inline uint8_t qmin() const {
    return this->qmin_;
  }

  inline DWConvMicrokernelTester& qmax(uint8_t qmax) {
    this->qmax_ = qmax;
    return *this;
  }

  inline uint8_t qmax() const {
    return this->qmax_;
  }

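  // Offset, in elements, subtracted from every indirection pointer when the
  // tester builds the indirection buffer; the same offset, converted to
  // bytes, is passed to the micro-kernel, which adds it back before loading.
  // A nonzero value exercises the kernel's input_offset handling.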
  inline DWConvMicrokernelTester& input_offset(size_t input_offset) {
    this->input_offset_ = input_offset;
    return *this;
  }

  inline size_t input_offset() const {
    return this->input_offset_;
  }

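  // When set (i.e. not SIZE_MAX), entry zero_index() of every kr()-sized
  // group of indirection pointers is redirected to the zero buffer,
  // exercising the kernel's handling of padded (zeroed) taps.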
  inline DWConvMicrokernelTester& zero_index(size_t zero_index) {
    this->zero_index_ = zero_index;
    return *this;
  }

  inline size_t zero_index() const {
    return this->zero_index_;
  }

  inline DWConvMicrokernelTester& iterations(size_t iterations) {
    this->iterations_ = iterations;
    return *this;
  }

  inline size_t iterations() const {
    return this->iterations_;
  }

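  // Tests a QU8 (asymmetric uint8) minmax unipass DWCONV micro-kernel:
  // generates random inputs, weights, and biases, computes int32 reference
  // accumulators, derives requantization parameters from the observed
  // accumulator range, and checks the kernel output element-for-element
  // against the requantized reference.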
  void Test(
    xnn_qu8_dwconv_minmax_unipass_ukernel_function dwconv_minmax,
    xnn_init_qu8_conv_minmax_params_fn init_params,
    xnn_qu8_requantize_fn requantize) const
  {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    std::uniform_int_distribution<int32_t> i32dist(-10000, 10000);
    std::uniform_int_distribution<int32_t> u8dist(
      std::numeric_limits<uint8_t>::min(), std::numeric_limits<uint8_t>::max());

    std::vector<const uint8_t*> indirection((width() - 1) * step() + kr());
    std::vector<uint8_t> input(XNN_EXTRA_BYTES / sizeof(uint8_t) + indirection.size() * channels());
    std::vector<uint8_t> kernel(channels() * kr());
    std::vector<int32_t> bias(channels());
    std::vector<uint8_t, AlignedAllocator<uint8_t, 64>> packed_weights((kr() + sizeof(int32_t) / sizeof(uint8_t)) * packed_channels());
    std::vector<uint8_t> zero(channels() + XNN_EXTRA_BYTES / sizeof(uint8_t));
    std::vector<uint8_t> output((width() - 1) * output_stride() + channels());
    std::vector<int32_t> accumulators(width() * channels());
    std::vector<uint8_t> output_ref(width() * channels());

    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      do {
        std::generate(input.begin(), input.end(), [&]() { return u8dist(rng); });
      } while (input.size() > 1 && *std::max_element(input.cbegin(), input.cend()) == *std::min_element(input.cbegin(), input.cend()));
      do {
        std::generate(kernel.begin(), kernel.end(), [&]() { return u8dist(rng); });
      } while (kernel.size() > 1 && *std::max_element(kernel.cbegin(), kernel.cend()) == *std::min_element(kernel.cbegin(), kernel.cend()));
      std::generate(bias.begin(), bias.end(), [&]() { return i32dist(rng); });
      std::fill(zero.begin(), zero.end(), input_zero_point());
      std::fill(output.begin(), output.end(), UINT8_C(0xA5));

      std::fill(packed_weights.begin(), packed_weights.end(), 0);
      const xnn_qu8_packing_params packing_params = { input_zero_point(), kernel_zero_point() };
      xnn_pack_qu8_dwconv_ghw_w(
        kr(), kr(), 1, channels(), cr(),
        kernel.data(), bias.data(), packed_weights.data(),
        0 /* extra bytes */, &packing_params);
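      // Point each indirection entry at one input row, pre-biased by
      // input_offset() so that the micro-kernel's own offset addition
      // lands back on valid data.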
      for (size_t i = 0; i < indirection.size(); i++) {
        indirection[i] = input.data() + i * channels() - input_offset();
      }
      std::shuffle(indirection.begin(), indirection.end(), rng);
      if (zero_index() != SIZE_MAX) {
        for (size_t i = 0; i < indirection.size(); i += kr()) {
          indirection[i + zero_index()] = zero.data();
        }
      }

      // Compute reference results, without renormalization.
      for (size_t x = 0; x < width(); x++) {
        for (size_t c = 0; c < channels(); c++) {
          int32_t acc = bias[c];
          for (size_t k = 0; k < kr(); k++) {
            if (indirection[x * step() + k] != zero.data()) {
              acc +=
                (int32_t(indirection[x * step() + k][c + input_offset()]) - int32_t(input_zero_point())) *
                (int32_t(kernel[c * kr() + k]) - int32_t(kernel_zero_point()));
            }
          }
          accumulators[x * channels() + c] = acc;
        }
      }

      // Compute renormalization parameters.
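      // The output scale maps the observed accumulator range onto the 8-bit
      // output range (kept above 1.0 so the derived requantization scale
      // stays below 1.0), and the zero point centers that range.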
      const int32_t accumulated_min = *std::min_element(accumulators.cbegin(), accumulators.cend());
      const int32_t accumulated_max = *std::max_element(accumulators.cbegin(), accumulators.cend());
      const uint32_t accumulated_range = uint32_t(accumulated_max) - uint32_t(accumulated_min);
      const double output_scale = accumulated_range >= 256 ? double(accumulated_range) / 255.0 : 1.00001;
      const uint8_t output_zero_point = uint8_t(std::max(std::min(
        lrint(127.5 - 0.5 * double(accumulated_min + accumulated_max) / output_scale),
        long(std::numeric_limits<uint8_t>::max())), long(std::numeric_limits<uint8_t>::min())));

      // Prepare parameters.
      const float requantization_scale = 1.0f / float(output_scale);
      union xnn_qu8_conv_minmax_params quantization_params;
      init_params(&quantization_params,
        kernel_zero_point(), requantization_scale, output_zero_point, qmin(), qmax());

      // Renormalize reference results.
      for (size_t x = 0; x < width(); x++) {
        for (size_t c = 0; c < channels(); c++) {
          output_ref[x * channels() + c] = requantize(
            accumulators[x * channels() + c], requantization_scale, output_zero_point, qmin(), qmax());
        }
      }

      // Call optimized micro-kernel.
      dwconv_minmax(
        channels(), width(),
        indirection.data(), packed_weights.data(), output.data(),
        step() * sizeof(void*),
        (output_stride() - channels()) * sizeof(uint8_t),
        input_offset() * sizeof(uint8_t), zero.data(),
        &quantization_params);

      // Verify results.
      for (size_t x = 0; x < width(); x++) {
        for (size_t c = 0; c < channels(); c++) {
          ASSERT_GE(uint32_t(output[x * output_stride() + c]), uint32_t(qmin()))
            << "x = " << x << ", channel = " << c;
          ASSERT_LE(uint32_t(output[x * output_stride() + c]), uint32_t(qmax()))
            << "x = " << x << ", channel = " << c;
          ASSERT_EQ(uint32_t(output[x * output_stride() + c]), uint32_t(output_ref[x * channels() + c]))
            << "x = " << x << ", channel = " << c << ", accumulator = " << accumulators[x * channels() + c];
        }
      }
    }
  }

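  // Tests a QC8 (per-channel-quantized int8) minmax unipass DWCONV
  // micro-kernel. Unlike the QU8/QS8 paths, each channel gets its own
  // requantization scale, packed alongside the weights.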
  void Test(
    xnn_qc8_dwconv_minmax_unipass_ukernel_function dwconv_minmax,
    xnn_init_qc8_conv_minmax_params_fn init_params,
    xnn_qs8_requantize_fn requantize) const
  {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    std::uniform_int_distribution<int32_t> i32dist(-10000, 10000);
    std::uniform_int_distribution<int32_t> i8dist(
      std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max());
    std::uniform_int_distribution<int32_t> w8dist(
      -std::numeric_limits<int8_t>::max(), std::numeric_limits<int8_t>::max());

    std::vector<const int8_t*> indirection((width() - 1) * step() + kr());
    std::vector<int8_t> input(XNN_EXTRA_BYTES / sizeof(int8_t) + indirection.size() * channels());
    std::vector<int8_t> kernel(channels() * kr());
    std::vector<int32_t> bias(channels());
    std::vector<int8_t, AlignedAllocator<int8_t, 64>> packed_weights((kr() + (sizeof(int32_t) + sizeof(float)) / sizeof(int8_t)) * packed_channels());
    std::vector<int8_t> zero(channels() + XNN_EXTRA_BYTES / sizeof(int8_t));
    std::vector<int8_t> output((width() - 1) * output_stride() + channels());
    std::vector<int32_t> accumulators(width() * channels());
    std::vector<float> scale(channels());
    std::vector<int8_t> output_ref(width() * channels());

    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      do {
        std::generate(input.begin(), input.end(), [&]() { return i8dist(rng); });
      } while (input.size() > 1 && *std::max_element(input.cbegin(), input.cend()) == *std::min_element(input.cbegin(), input.cend()));
      do {
        std::generate(kernel.begin(), kernel.end(), [&]() { return w8dist(rng); });
      } while (kernel.size() > 1 && *std::max_element(kernel.cbegin(), kernel.cend()) == *std::min_element(kernel.cbegin(), kernel.cend()));
      std::generate(bias.begin(), bias.end(), [&]() { return i32dist(rng); });
      std::fill(zero.begin(), zero.end(), int8_t(input_zero_point() - 0x80));
      std::fill(output.begin(), output.end(), INT8_C(0xA5));

      std::fill(packed_weights.begin(), packed_weights.end(), 0);
      const xnn_qs8_packing_params packing_params = { int8_t(input_zero_point() - 0x80) };
      xnn_pack_qs8_dwconv_ghw_w(
        kr(), kr(), 1, channels(), cr(),
        kernel.data(), bias.data(), packed_weights.data(), cr() * sizeof(float),
        &packing_params);
      for (size_t i = 0; i < indirection.size(); i++) {
        indirection[i] = input.data() + i * channels() - input_offset();
      }
      std::shuffle(indirection.begin(), indirection.end(), rng);
      if (zero_index() != SIZE_MAX) {
        for (size_t i = 0; i < indirection.size(); i += kr()) {
          indirection[i + zero_index()] = zero.data();
        }
      }

      // Compute reference results, without renormalization.
      for (size_t x = 0; x < width(); x++) {
        for (size_t c = 0; c < channels(); c++) {
          int32_t acc = bias[c];
          for (size_t k = 0; k < kr(); k++) {
            if (indirection[x * step() + k] != zero.data()) {
              acc +=
                (int32_t(indirection[x * step() + k][c + input_offset()]) - int32_t(input_zero_point() - 0x80)) *
                int32_t(kernel[c * kr() + k]);
            }
          }
          accumulators[x * channels() + c] = acc;
        }
      }

      // Compute renormalization parameters.
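      // QC8 derives an independent scale per channel from that channel's
      // accumulator range; xnn_init_qc8_scale_fp32_params then writes the
      // scales into the packed-weights buffer, after each channel group's
      // bias and weights.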
      const int8_t output_zero_point = -1;
      for (size_t c = 0; c < channels(); c++) {
        int32_t accumulated_min = accumulators[c];
        int32_t accumulated_max = accumulators[c];
        for (size_t x = 0; x < width(); x++) {
          accumulated_min = std::min(accumulated_min, accumulators[x * channels() + c]);
          accumulated_max = std::max(accumulated_max, accumulators[x * channels() + c]);
        }
        const uint32_t accumulated_range = uint32_t(accumulated_max - accumulated_min);
        const double output_scale = accumulated_range >= 256 ? double(accumulated_range) / 255.0 : 1.00001;
        scale[c] = 1.0f / float(output_scale);
      }
      xnn_init_qc8_scale_fp32_params(
        channels(), cr(),
        cr() * (kr() * sizeof(int8_t) + sizeof(int32_t) + sizeof(float)), scale.data(),
        (void*) ((uintptr_t) packed_weights.data() + cr() * (kr() * sizeof(int8_t) + sizeof(int32_t))));

      // Prepare parameters.
      union xnn_qc8_conv_minmax_params minmax_params;
      init_params(&minmax_params,
        output_zero_point, int8_t(qmin() - 0x80), int8_t(qmax() - 0x80));

      // Renormalize reference results.
      for (size_t x = 0; x < width(); x++) {
        for (size_t c = 0; c < channels(); c++) {
          output_ref[x * channels() + c] = requantize(
            accumulators[x * channels() + c], scale[c], output_zero_point, int8_t(qmin() - 0x80), int8_t(qmax() - 0x80));
        }
      }

      // Call optimized micro-kernel.
      dwconv_minmax(
        channels(), width(),
        indirection.data(), packed_weights.data(), output.data(),
        step() * sizeof(void*),
        (output_stride() - channels()) * sizeof(int8_t),
        input_offset() * sizeof(int8_t), zero.data(),
        &minmax_params);

      // Verify results.
      for (size_t x = 0; x < width(); x++) {
        for (size_t c = 0; c < channels(); c++) {
          ASSERT_GE(int32_t(output[x * output_stride() + c]), int32_t(qmin()) - 0x80)
            << "x = " << x << ", channel = " << c;
          ASSERT_LE(int32_t(output[x * output_stride() + c]), int32_t(qmax()) - 0x80)
            << "x = " << x << ", channel = " << c;
          ASSERT_EQ(int32_t(output[x * output_stride() + c]), int32_t(output_ref[x * channels() + c]))
            << "x = " << x << ", channel = " << c << ", accumulator = " << accumulators[x * channels() + c];
        }
      }
    }
  }

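  // Tests a QS8 (int8 with symmetric weights and a single per-tensor
  // requantization scale) minmax unipass DWCONV micro-kernel; the flow
  // mirrors the QU8 test with signed arithmetic.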
  void Test(
    xnn_qs8_dwconv_minmax_unipass_ukernel_function dwconv_minmax,
    xnn_init_qs8_conv_minmax_params_fn init_params,
    xnn_qs8_requantize_fn requantize) const
  {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    std::uniform_int_distribution<int32_t> i32dist(-10000, 10000);
    std::uniform_int_distribution<int32_t> i8dist(
      std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max());
    std::uniform_int_distribution<int32_t> w8dist(
      -std::numeric_limits<int8_t>::max(), std::numeric_limits<int8_t>::max());

    std::vector<const int8_t*> indirection((width() - 1) * step() + kr());
    std::vector<int8_t> input(XNN_EXTRA_BYTES / sizeof(int8_t) + indirection.size() * channels());
    std::vector<int8_t> kernel(channels() * kr());
    std::vector<int32_t> bias(channels());
    std::vector<int8_t, AlignedAllocator<int8_t, 64>> packed_weights((kr() + sizeof(int32_t) / sizeof(int8_t)) * packed_channels());
    std::vector<int8_t> zero(channels() + XNN_EXTRA_BYTES / sizeof(int8_t));
    std::vector<int8_t> output((width() - 1) * output_stride() + channels());
    std::vector<int32_t> accumulators(width() * channels());
    std::vector<int8_t> output_ref(width() * channels());

    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      do {
        std::generate(input.begin(), input.end(), [&]() { return i8dist(rng); });
      } while (input.size() > 1 && *std::max_element(input.cbegin(), input.cend()) == *std::min_element(input.cbegin(), input.cend()));
      do {
        std::generate(kernel.begin(), kernel.end(), [&]() { return w8dist(rng); });
      } while (kernel.size() > 1 && *std::max_element(kernel.cbegin(), kernel.cend()) == *std::min_element(kernel.cbegin(), kernel.cend()));
      std::generate(bias.begin(), bias.end(), [&]() { return i32dist(rng); });
      std::fill(zero.begin(), zero.end(), int8_t(input_zero_point() - 0x80));
      std::fill(output.begin(), output.end(), INT8_C(0xA5));

      std::fill(packed_weights.begin(), packed_weights.end(), 0);
      const xnn_qs8_packing_params packing_params = { int8_t(input_zero_point() - 0x80) };
      xnn_pack_qs8_dwconv_ghw_w(
        kr(), kr(), 1, channels(), cr(),
        kernel.data(), bias.data(), packed_weights.data(),
        0 /* extra bytes */, &packing_params);
      for (size_t i = 0; i < indirection.size(); i++) {
        indirection[i] = input.data() + i * channels() - input_offset();
      }
      std::shuffle(indirection.begin(), indirection.end(), rng);
      if (zero_index() != SIZE_MAX) {
        for (size_t i = 0; i < indirection.size(); i += kr()) {
          indirection[i + zero_index()] = zero.data();
        }
      }

      // Compute reference results, without renormalization.
      for (size_t x = 0; x < width(); x++) {
        for (size_t c = 0; c < channels(); c++) {
          int32_t acc = bias[c];
          for (size_t k = 0; k < kr(); k++) {
            if (indirection[x * step() + k] != zero.data()) {
              acc +=
                (int32_t(indirection[x * step() + k][c + input_offset()]) - int32_t(input_zero_point() - 0x80)) *
                int32_t(kernel[c * kr() + k]);
            }
          }
          accumulators[x * channels() + c] = acc;
        }
      }

      // Compute renormalization parameters.
      const int32_t accumulated_min = *std::min_element(accumulators.cbegin(), accumulators.cend());
      const int32_t accumulated_max = *std::max_element(accumulators.cbegin(), accumulators.cend());
      const uint32_t accumulated_range = uint32_t(accumulated_max) - uint32_t(accumulated_min);
      const double output_scale = accumulated_range >= 256 ? double(accumulated_range) / 255.0 : 1.00001;
      const int8_t output_zero_point = int8_t(std::max(std::min(
        lrint(-0.5 - 0.5 * double(accumulated_min + accumulated_max) / output_scale),
        long(std::numeric_limits<int8_t>::max())), long(std::numeric_limits<int8_t>::min())));

      // Prepare parameters.
      const float requantization_scale = 1.0f / float(output_scale);
      union xnn_qs8_conv_minmax_params quantization_params;
      init_params(&quantization_params,
        requantization_scale, output_zero_point, int8_t(qmin() - 0x80), int8_t(qmax() - 0x80));

      // Renormalize reference results.
      for (size_t x = 0; x < width(); x++) {
        for (size_t c = 0; c < channels(); c++) {
          output_ref[x * channels() + c] = requantize(
            accumulators[x * channels() + c], requantization_scale, output_zero_point, int8_t(qmin() - 0x80), int8_t(qmax() - 0x80));
        }
      }

      // Call optimized micro-kernel.
      dwconv_minmax(
        channels(), width(),
        indirection.data(), packed_weights.data(), output.data(),
        step() * sizeof(void*),
        (output_stride() - channels()) * sizeof(int8_t),
        input_offset() * sizeof(int8_t), zero.data(),
        &quantization_params);

      // Verify results.
      for (size_t x = 0; x < width(); x++) {
        for (size_t c = 0; c < channels(); c++) {
          ASSERT_GE(int32_t(output[x * output_stride() + c]), int32_t(qmin()) - 0x80)
            << "x = " << x << ", channel = " << c;
          ASSERT_LE(int32_t(output[x * output_stride() + c]), int32_t(qmax()) - 0x80)
            << "x = " << x << ", channel = " << c;
          ASSERT_EQ(int32_t(output[x * output_stride() + c]), int32_t(output_ref[x * channels() + c]))
            << "x = " << x << ", channel = " << c << ", accumulator = " << accumulators[x * channels() + c];
        }
      }
    }
  }

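  // Tests an F16 (IEEE half-precision) minmax unipass DWCONV micro-kernel
  // against a float reference, with clamping bounds derived from the
  // observed output range via qmin()/qmax().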
  void Test(xnn_f16_dwconv_minmax_unipass_ukernel_function dwconv_minmax, xnn_init_f16_minmax_params_fn init_params) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    std::uniform_real_distribution<float> f32dist;

    std::vector<const uint16_t*> indirection((width() - 1) * step() + kr());
    std::vector<uint16_t> input(XNN_EXTRA_BYTES / sizeof(uint16_t) + indirection.size() * channels());
    std::vector<uint16_t> kernel(channels() * kr());
    std::vector<uint16_t> bias(channels());
    std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> packed_weights((kr() + 1) * packed_channels());
    std::vector<uint16_t> zero(channels() + XNN_EXTRA_BYTES / sizeof(uint16_t));
    std::vector<uint16_t> output((width() - 1) * output_stride() + channels());
    std::vector<float> output_ref(width() * channels());

    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      std::generate(input.begin(), input.end(), [&]() { return fp16_ieee_from_fp32_value(f32dist(rng)); });
      std::generate(kernel.begin(), kernel.end(), [&]() { return fp16_ieee_from_fp32_value(f32dist(rng)); });
      std::generate(bias.begin(), bias.end(), [&]() { return fp16_ieee_from_fp32_value(f32dist(rng)); });
      std::fill(zero.begin(), zero.end(), 0);
      std::fill(output_ref.begin(), output_ref.end(), 0.0f);
      std::fill(output.begin(), output.end(), UINT16_C(0x7E00) /* NaN */);

      std::fill(packed_weights.begin(), packed_weights.end(), 0);
      xnn_pack_f16_dwconv_ghw_w(
        kr(), kr(), 1, channels(), cr(),
        kernel.data(), bias.data(), packed_weights.data(),
        0 /* extra bytes */, nullptr);
      for (size_t i = 0; i < indirection.size(); i++) {
        indirection[i] = input.data() + i * channels() - input_offset();
      }
      std::shuffle(indirection.begin(), indirection.end(), rng);
      if (zero_index() != SIZE_MAX) {
        for (size_t i = 0; i < indirection.size(); i += kr()) {
          indirection[i + zero_index()] = zero.data();
        }
      }

      // Compute reference results, without clamping.
      for (size_t x = 0; x < width(); x++) {
        for (size_t c = 0; c < channels(); c++) {
          float acc = fp16_ieee_to_fp32_value(bias[c]);
          for (size_t k = 0; k < kr(); k++) {
            if (indirection[x * step() + k] != zero.data()) {
              acc += fp16_ieee_to_fp32_value(indirection[x * step() + k][c + input_offset()]) * fp16_ieee_to_fp32_value(kernel[c * kr() + k]);
            }
          }
          output_ref[x * channels() + c] = acc;
        }
      }

      // Compute clamping parameters.
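      // qmin()/qmax() pick clamping bounds as fractions of the observed
      // output range (qmin = 0 / qmax = 255 means no clamping on that side),
      // rounded through fp16 so the bounds are exactly representable in
      // half precision.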
      const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend());
      const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend());
      const float accumulated_range = accumulated_max - accumulated_min;
      const float output_min = fp16_ieee_to_fp32_value(fp16_ieee_from_fp32_value(accumulated_min + accumulated_range / 255.0f * float(qmin())));
      const float output_max = fp16_ieee_to_fp32_value(fp16_ieee_from_fp32_value(accumulated_max - accumulated_range / 255.0f * float(255 - qmax())));

      // Prepare parameters.
      xnn_f16_minmax_params params;
      init_params(&params,
        fp16_ieee_from_fp32_value(output_min),
        fp16_ieee_from_fp32_value(output_max));

      // Clamp reference results.
      for (float& output_val : output_ref) {
        output_val = std::max(std::min(output_val, output_max), output_min);
      }

      // Call optimized micro-kernel.
      dwconv_minmax(
        channels(), width(),
        reinterpret_cast<const void**>(indirection.data()), packed_weights.data(), output.data(),
        step() * sizeof(void*),
        (output_stride() - channels()) * sizeof(uint16_t),
        input_offset() * sizeof(uint16_t), zero.data(),
        &params);

      // Verify results.
      for (size_t x = 0; x < width(); x++) {
        for (size_t c = 0; c < channels(); c++) {
          ASSERT_GE(fp16_ieee_to_fp32_value(output[x * output_stride() + c]), output_min)
            << "x = " << x << ", channel = " << c;
          ASSERT_LE(fp16_ieee_to_fp32_value(output[x * output_stride() + c]), output_max)
            << "x = " << x << ", channel = " << c;
          ASSERT_NEAR(output_ref[x * channels() + c], fp16_ieee_to_fp32_value(output[x * output_stride() + c]), std::max(1.0e-4f, std::abs(output_ref[x * channels() + c]) * 1.0e-2f))
            << "x = " << x << ", channel = " << c;
        }
      }
    }
  }

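  // Tests an F32 unipass DWCONV micro-kernel without min/max clamping;
  // the output is compared against a float reference with a small
  // relative tolerance.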
  void Test(xnn_f32_dwconv_unipass_ukernel_function dwconv) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    std::uniform_real_distribution<float> f32dist;

    std::vector<const float*> indirection((width() - 1) * step() + kr());
    std::vector<float> input(XNN_EXTRA_BYTES / sizeof(float) + indirection.size() * channels());
    std::vector<float> kernel(channels() * kr());
    std::vector<float> bias(channels());
    std::vector<float, AlignedAllocator<float, 64>> packed_weights((kr() + 1) * packed_channels());
    std::vector<float> zero(channels() + XNN_EXTRA_BYTES / sizeof(float));
    std::vector<float> output((width() - 1) * output_stride() + channels());
    std::vector<float> output_ref(width() * channels());

    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); });
      std::generate(kernel.begin(), kernel.end(), [&]() { return f32dist(rng); });
      std::generate(bias.begin(), bias.end(), [&]() { return f32dist(rng); });
      std::fill(zero.begin(), zero.end(), 0.0f);
      std::fill(output_ref.begin(), output_ref.end(), nanf(""));
      std::fill(output.begin(), output.end(), nanf(""));

      std::fill(packed_weights.begin(), packed_weights.end(), 0.0f);
      xnn_pack_f32_dwconv_ghw_w(
        kr(), kr(), 1, channels(), cr(),
        kernel.data(), bias.data(), packed_weights.data(),
        0 /* extra bytes */, nullptr);
      for (size_t i = 0; i < indirection.size(); i++) {
        indirection[i] = input.data() + i * channels() - input_offset();
      }
      std::shuffle(indirection.begin(), indirection.end(), rng);
      if (zero_index() != SIZE_MAX) {
        for (size_t i = 0; i < indirection.size(); i += kr()) {
          indirection[i + zero_index()] = zero.data();
        }
      }

      // Compute reference results, without clamping.
      for (size_t x = 0; x < width(); x++) {
        for (size_t c = 0; c < channels(); c++) {
          float acc = bias[c];
          for (size_t k = 0; k < kr(); k++) {
            if (indirection[x * step() + k] != zero.data()) {
              acc += indirection[x * step() + k][c + input_offset()] * kernel[c * kr() + k];
            }
          }
          output_ref[x * channels() + c] = acc;
        }
      }

      // Call optimized micro-kernel.
      dwconv(
        channels(), width(),
        indirection.data(), packed_weights.data(), output.data(),
        step() * sizeof(void*),
        (output_stride() - channels()) * sizeof(float),
        input_offset() * sizeof(float), zero.data(),
        nullptr);

      // Verify results.
      for (size_t x = 0; x < width(); x++) {
        for (size_t c = 0; c < channels(); c++) {
          ASSERT_NEAR(
              output_ref[x * channels() + c],
              output[x * output_stride() + c],
              std::abs(output_ref[x * channels() + c]) * 1.0e-5)
            << "x = " << x << ", channel = " << c;
        }
      }
    }
  }

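  // Tests an F32 minmax unipass DWCONV micro-kernel; identical to the
  // unclamped F32 test except that clamping bounds are derived from the
  // reference output range and also verified.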
  void Test(xnn_f32_dwconv_minmax_unipass_ukernel_function dwconv_minmax, xnn_init_f32_minmax_params_fn init_params) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    std::uniform_real_distribution<float> f32dist;

    std::vector<const float*> indirection((width() - 1) * step() + kr());
    std::vector<float> input(XNN_EXTRA_BYTES / sizeof(float) + indirection.size() * channels());
    std::vector<float> kernel(channels() * kr());
    std::vector<float> bias(channels());
    std::vector<float, AlignedAllocator<float, 64>> packed_weights((kr() + 1) * packed_channels());
    std::vector<float> zero(channels() + XNN_EXTRA_BYTES / sizeof(float));
    std::vector<float> output((width() - 1) * output_stride() + channels());
    std::vector<float> output_ref(width() * channels());

    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); });
      std::generate(kernel.begin(), kernel.end(), [&]() { return f32dist(rng); });
      std::generate(bias.begin(), bias.end(), [&]() { return f32dist(rng); });
      std::fill(zero.begin(), zero.end(), 0.0f);
      std::fill(output_ref.begin(), output_ref.end(), nanf(""));
      std::fill(output.begin(), output.end(), nanf(""));

      std::fill(packed_weights.begin(), packed_weights.end(), 0.0f);
      xnn_pack_f32_dwconv_ghw_w(
        kr(), kr(), 1, channels(), cr(),
        kernel.data(), bias.data(), packed_weights.data(),
        0 /* extra bytes */, nullptr);
      for (size_t i = 0; i < indirection.size(); i++) {
        indirection[i] = input.data() + i * channels() - input_offset();
      }
      std::shuffle(indirection.begin(), indirection.end(), rng);
      if (zero_index() != SIZE_MAX) {
        for (size_t i = 0; i < indirection.size(); i += kr()) {
          indirection[i + zero_index()] = zero.data();
        }
      }

      // Compute reference results, without clamping.
      for (size_t x = 0; x < width(); x++) {
        for (size_t c = 0; c < channels(); c++) {
          float acc = bias[c];
          for (size_t k = 0; k < kr(); k++) {
            if (indirection[x * step() + k] != zero.data()) {
              acc += indirection[x * step() + k][c + input_offset()] * kernel[c * kr() + k];
            }
          }
          output_ref[x * channels() + c] = acc;
        }
      }

      // Compute clamping parameters.
      const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend());
      const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend());
      const float accumulated_range = accumulated_max - accumulated_min;
      const float output_min = accumulated_min + accumulated_range / 255.0f * float(qmin());
      const float output_max = accumulated_max - accumulated_range / 255.0f * float(255 - qmax());

      // Prepare parameters.
      xnn_f32_minmax_params params;
      init_params(&params, output_min, output_max);

      // Clamp reference results.
      for (float& output_val : output_ref) {
        output_val = std::max(std::min(output_val, output_max), output_min);
      }

      // Call optimized micro-kernel.
      dwconv_minmax(
        channels(), width(),
        indirection.data(), packed_weights.data(), output.data(),
        step() * sizeof(void*),
        (output_stride() - channels()) * sizeof(float),
        input_offset() * sizeof(float), zero.data(),
        &params);

      // Verify results.
      for (size_t x = 0; x < width(); x++) {
        for (size_t c = 0; c < channels(); c++) {
          ASSERT_GE(output[x * output_stride() + c], output_min)
            << "x = " << x << ", channel = " << c;
          ASSERT_LE(output[x * output_stride() + c], output_max)
            << "x = " << x << ", channel = " << c;
          ASSERT_NEAR(
              output_ref[x * channels() + c],
              output[x * output_stride() + c],
              std::abs(output_ref[x * channels() + c]) * 1.0e-5)
            << "x = " << x << ", channel = " << c;
        }
      }
    }
  }

 private:
  uint32_t channels_{1};
  uint32_t cr_{1};
  uint32_t kr_{1};
  uint32_t width_{1};
  uint32_t step_{1};
  uint32_t output_stride_{0};
  uint8_t input_zero_point_{127};
  uint8_t kernel_zero_point_{127};
  uint8_t qmin_{0};
  uint8_t qmax_{255};
  size_t input_offset_{0};
  size_t zero_index_{SIZE_MAX};
  size_t iterations_{3};
};