• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
// Copyright (c) Facebook, Inc. and its affiliates.
// All rights reserved.
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#pragma once

#include <gtest/gtest.h>

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstddef>
#include <cstdlib>
#include <functional>
#include <limits>
#include <random>
#include <vector>

#include <fp16.h>

#include <xnnpack.h>
#include <xnnpack/AlignedAllocator.h>
#include <xnnpack/params-init.h>
#include <xnnpack/params.h>
#include <xnnpack/requantization.h>
31 
// Test helper for XNNPACK global-average-pooling (GAvgPool) micro-kernels.
//
// Configure the tester through the fluent setters (rows, channels,
// input_stride, quantization parameters, qmin/qmax clamping range, ...),
// then invoke the Test() overload matching the micro-kernel's data type and
// pass structure (unipass vs. multipass).  Each Test() overload:
//   1. generates random input,
//   2. computes a portable reference result,
//   3. runs the micro-kernel under test,
//   4. verifies the output with gtest assertions.
class GAvgPoolMicrokernelTester {
 public:
  // Selects how the micro-kernel parameter structure is initialized:
  //  - Native: architecture-specific parameter layout (xnn_init_*_params).
  //  - Scalar: portable scalar parameter layout (xnn_init_scalar_*_params).
  enum class Variant {
    Native,
    Scalar,
  };

  // Number of input rows (pooling elements) reduced into one output row.
  // Must be non-zero.
  inline GAvgPoolMicrokernelTester& rows(size_t rows) {
    assert(rows != 0);
    this->rows_ = rows;
    return *this;
  }

  inline size_t rows() const {
    return this->rows_;
  }

  // Number of channels per pixel.  Must be non-zero.
  inline GAvgPoolMicrokernelTester& channels(size_t channels) {
    assert(channels != 0);
    this->channels_ = channels;
    return *this;
  }

  inline size_t channels() const {
    return this->channels_;
  }

  // Channel tile of the micro-kernel under test.  Stored for test
  // parameterization; not read by the Test() methods in this file.
  inline GAvgPoolMicrokernelTester& channel_tile(size_t channel_tile) {
    assert(channel_tile != 0);
    this->channel_tile_ = channel_tile;
    return *this;
  }

  inline size_t channel_tile() const {
    return this->channel_tile_;
  }

  // Stride, in elements, between consecutive input rows.  Must be non-zero
  // and at least channels(); when left unset (0) the getter falls back to
  // channels() (i.e. densely packed rows).
  inline GAvgPoolMicrokernelTester& input_stride(size_t input_stride) {
    assert(input_stride != 0);
    this->input_stride_ = input_stride;
    return *this;
  }

  inline size_t input_stride() const {
    if (this->input_stride_ == 0) {
      return channels();
    } else {
      assert(this->input_stride_ >= channels());
      return this->input_stride_;
    }
  }

  // Quantization scale of the input (quantized tests only).  Must be a
  // positive, normal (non-denormal) float.
  inline GAvgPoolMicrokernelTester& input_scale(float input_scale) {
    assert(input_scale > 0.0f);
    assert(std::isnormal(input_scale));
    this->input_scale_ = input_scale;
    return *this;
  }

  inline float input_scale() const {
    return this->input_scale_;
  }

  // Quantization zero point of the input (quantized tests only).
  inline GAvgPoolMicrokernelTester& input_zero_point(uint8_t input_zero_point) {
    this->input_zero_point_ = input_zero_point;
    return *this;
  }

  inline uint8_t input_zero_point() const {
    return this->input_zero_point_;
  }

  // Quantization scale of the output (quantized tests only).  Must be a
  // positive, normal (non-denormal) float.
  inline GAvgPoolMicrokernelTester& output_scale(float output_scale) {
    assert(output_scale > 0.0f);
    assert(std::isnormal(output_scale));
    this->output_scale_ = output_scale;
    return *this;
  }

  inline float output_scale() const {
    return this->output_scale_;
  }

  // Quantization zero point of the output (quantized tests only).
  inline GAvgPoolMicrokernelTester& output_zero_point(uint8_t output_zero_point) {
    this->output_zero_point_ = output_zero_point;
    return *this;
  }

  inline uint8_t output_zero_point() const {
    return this->output_zero_point_;
  }

  // Lower clamping bound for the output, expressed in the [0, 255] domain.
  // Float tests map this onto the accumulated value range; QS8 tests shift
  // it by 0x80 into the signed domain.
  inline GAvgPoolMicrokernelTester& qmin(uint8_t qmin) {
    this->qmin_ = qmin;
    return *this;
  }

  inline uint8_t qmin() const {
    return this->qmin_;
  }

  // Upper clamping bound for the output, expressed in the [0, 255] domain.
  inline GAvgPoolMicrokernelTester& qmax(uint8_t qmax) {
    this->qmax_ = qmax;
    return *this;
  }

  inline uint8_t qmax() const {
    return this->qmax_;
  }

  // Number of test iterations; each iteration uses fresh random input.
  inline GAvgPoolMicrokernelTester& iterations(size_t iterations) {
    this->iterations_ = iterations;
    return *this;
  }

  inline size_t iterations() const {
    return this->iterations_;
  }

  // Tests a QU8 (unsigned 8-bit quantized) single-pass GAvgPool micro-kernel.
  void Test(xnn_qu8_gavgpool_minmax_unipass_ukernel_function gavgpool_minmax, Variant variant = Variant::Native) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    auto u8rng = std::bind(std::uniform_int_distribution<uint32_t>(0, std::numeric_limits<uint8_t>::max()), rng);

    // XNN_EXTRA_BYTES of slack lets the micro-kernel safely over-read past
    // the logical end of each buffer.
    std::vector<uint8_t> input(XNN_EXTRA_BYTES / sizeof(uint8_t) +
      (rows() - 1) * input_stride() + channels());
    std::vector<uint8_t> zero(channels() + XNN_EXTRA_BYTES / sizeof(uint8_t));
    std::vector<uint8_t> output(channels());
    std::vector<uint8_t> output_ref(channels());
    std::vector<float> output_fp(channels());
    std::vector<int32_t> accumulators(channels());
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      std::generate(input.begin(), input.end(), std::ref(u8rng));
      // Poison the output so stale data cannot accidentally pass verification.
      std::fill(output.begin(), output.end(), 0xA5);

      // Prepare parameters.
      // The bias pre-subtracts the input zero point for all rows:
      //   sum(x[i]) - rows * zero_point == sum(x[i] - zero_point).
      union xnn_qu8_avgpool_params quantization_params = { };
      switch (variant) {
        case Variant::Native:
          quantization_params = xnn_init_qu8_avgpool_params(
            -int32_t(input_zero_point()) * int32_t(rows()),
            input_scale() / (output_scale() * float(rows())),
            output_zero_point(), qmin(), qmax());
          break;
        case Variant::Scalar:
          quantization_params = xnn_init_scalar_qu8_avgpool_params(
            -int32_t(input_zero_point()) * int32_t(rows()),
            input_scale() / (output_scale() * float(rows())),
            output_zero_point(), qmin(), qmax());
          break;
      }
      // The reference computation always uses the scalar parameter layout,
      // independent of the layout passed to the micro-kernel.
      const union xnn_qu8_avgpool_params scalar_quantization_params =
        xnn_init_scalar_qu8_avgpool_params(
          -int32_t(input_zero_point()) * int32_t(rows()),
          input_scale() / (output_scale() * float(rows())),
          output_zero_point(), qmin(), qmax());

      // Compute reference results.
      for (size_t c = 0; c < channels(); c++) {
        int32_t acc = scalar_quantization_params.scalar.bias;
        for (size_t n = 0; n < rows(); n++) {
          acc += input[n * input_stride() + c];
        }
        accumulators[c] = acc;
        // Exact quantized reference, plus a float reference used for the
        // "within half a quantization step" sanity check below.
        output_ref[c] = xnn_qu8_quantize_avgpool(acc, scalar_quantization_params);
        output_fp[c] = float(acc) * (input_scale() / (output_scale() * float(rows()))) + float(output_zero_point());
        output_fp[c] = std::min<float>(output_fp[c], float(qmax()));
        output_fp[c] = std::max<float>(output_fp[c], float(qmin()));
      }

      // Call optimized micro-kernel.
      gavgpool_minmax(rows(), channels(),
        input.data(), input_stride() * sizeof(uint8_t),
        zero.data(),
        output.data(),
        &quantization_params);

      // Verify results: clamped into [qmin, qmax], within 0.5 of the float
      // reference, and bit-exact against the quantized reference.
      for (size_t c = 0; c < channels(); c++) {
        ASSERT_LE(uint32_t(output[c]), uint32_t(qmax()))
          << "at position " << c << ", rows = " << rows() << ", channels = " << channels();
        ASSERT_GE(uint32_t(output[c]), uint32_t(qmin()))
          << "at position " << c << ", rows = " << rows() << ", channels = " << channels();
        ASSERT_NEAR(float(int32_t(output[c])), output_fp[c], 0.5f)
          << "at position " << c << ", rows = " << rows() << ", channels = " << channels()
          << ", acc = " << accumulators[c];
        ASSERT_EQ(uint32_t(output_ref[c]), uint32_t(output[c]))
          << "at position " << c << ", rows = " << rows() << ", channels = " << channels()
          << ", acc = " << accumulators[c];
      }
    }
  }

  // Tests a QU8 multi-pass GAvgPool micro-kernel.  Identical to the unipass
  // test except the kernel also receives a 64-byte-aligned int32 scratch
  // buffer for intermediate accumulators.
  void Test(xnn_qu8_gavgpool_minmax_multipass_ukernel_function gavgpool_minmax, Variant variant = Variant::Native) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    auto u8rng = std::bind(std::uniform_int_distribution<uint32_t>(0, std::numeric_limits<uint8_t>::max()), rng);

    std::vector<uint8_t> input(XNN_EXTRA_BYTES / sizeof(uint8_t) +
      (rows() - 1) * input_stride() + channels());
    // Scratch accumulator buffer; 64-byte alignment for vector loads/stores.
    std::vector<int32_t, AlignedAllocator<int32_t, 64>> buffer(channels() + XNN_EXTRA_BYTES / sizeof(uint8_t));
    std::vector<uint8_t> zero(channels() + XNN_EXTRA_BYTES / sizeof(uint8_t));
    std::vector<uint8_t> output(channels());
    std::vector<uint8_t> output_ref(channels());
    std::vector<float> output_fp(channels());
    std::vector<int32_t> accumulators(channels());
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      std::generate(input.begin(), input.end(), std::ref(u8rng));
      std::fill(output.begin(), output.end(), 0xA5);

      // Prepare parameters.
      union xnn_qu8_avgpool_params quantization_params = { };
      switch (variant) {
        case Variant::Native:
          quantization_params = xnn_init_qu8_avgpool_params(
            -int32_t(input_zero_point()) * int32_t(rows()),
            input_scale() / (output_scale() * float(rows())),
            output_zero_point(), qmin(), qmax());
          break;
        case Variant::Scalar:
          quantization_params = xnn_init_scalar_qu8_avgpool_params(
            -int32_t(input_zero_point()) * int32_t(rows()),
            input_scale() / (output_scale() * float(rows())),
            output_zero_point(), qmin(), qmax());
          break;
      }
      const union xnn_qu8_avgpool_params scalar_quantization_params =
        xnn_init_scalar_qu8_avgpool_params(
          -int32_t(input_zero_point()) * int32_t(rows()),
          input_scale() / (output_scale() * float(rows())),
          output_zero_point(), qmin(), qmax());

      // Compute reference results.
      for (size_t c = 0; c < channels(); c++) {
        int32_t acc = scalar_quantization_params.scalar.bias;
        for (size_t n = 0; n < rows(); n++) {
          acc += input[n * input_stride() + c];
        }

        accumulators[c] = acc;
        output_ref[c] = xnn_qu8_quantize_avgpool(acc, scalar_quantization_params);
        output_fp[c] = float(acc) * (input_scale() / (output_scale() * float(rows()))) + float(output_zero_point());
        output_fp[c] = std::min<float>(output_fp[c], float(qmax()));
        output_fp[c] = std::max<float>(output_fp[c], float(qmin()));
      }

      // Call optimized micro-kernel.
      gavgpool_minmax(rows(), channels(),
        input.data(), input_stride() * sizeof(uint8_t),
        zero.data(),
        buffer.data(),
        output.data(),
        &quantization_params);

      // Verify results.
      for (size_t c = 0; c < channels(); c++) {
        ASSERT_LE(uint32_t(output[c]), uint32_t(qmax()))
          << "at position " << c << ", rows = " << rows() << ", channels = " << channels();
        ASSERT_GE(uint32_t(output[c]), uint32_t(qmin()))
          << "at position " << c << ", rows = " << rows() << ", channels = " << channels();
        ASSERT_NEAR(float(int32_t(output[c])), output_fp[c], 0.5f)
          << "at position " << c << ", rows = " << rows() << ", channels = " << channels()
          << ", acc = " << accumulators[c];
        ASSERT_EQ(uint32_t(output_ref[c]), uint32_t(output[c]))
          << "at position " << c << ", rows = " << rows() << ", channels = " << channels()
          << ", acc = " << accumulators[c];
      }
    }
  }

  // Tests a QS8 (signed 8-bit quantized) single-pass GAvgPool micro-kernel.
  // The tester's zero-point/qmin/qmax knobs are kept in the unsigned
  // [0, 255] domain; the 0x80 offsets below translate them into the signed
  // [-128, 127] domain the QS8 kernels operate in.
  void Test(xnn_qs8_gavgpool_minmax_unipass_ukernel_function gavgpool_minmax, Variant variant = Variant::Native) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    // Distribution over int32_t: std::uniform_int_distribution<int8_t> is
    // not required to be supported by the standard.
    auto i8rng = std::bind(
      std::uniform_int_distribution<int32_t>(std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max()), rng);

    std::vector<int8_t> input(XNN_EXTRA_BYTES / sizeof(int8_t) +
      (rows() - 1) * input_stride() + channels());
    std::vector<int8_t> zero(channels() + XNN_EXTRA_BYTES / sizeof(int8_t));
    std::vector<int8_t> output(channels());
    std::vector<int8_t> output_ref(channels());
    std::vector<float> output_fp(channels());
    std::vector<int32_t> accumulators(channels());
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      std::generate(input.begin(), input.end(), std::ref(i8rng));
      std::fill(output.begin(), output.end(), 0xA5);

      // Prepare parameters.
      union xnn_qs8_avgpool_params quantization_params = { };
      switch (variant) {
        case Variant::Native:
          quantization_params = xnn_init_qs8_avgpool_params(
            -int32_t(input_zero_point() - 0x80) * int32_t(rows()),
            input_scale() / (output_scale() * float(rows())),
            int8_t(output_zero_point() - 0x80), int8_t(qmin() - 0x80), int8_t(qmax() - 0x80));
          break;
        case Variant::Scalar:
          quantization_params = xnn_init_scalar_qs8_avgpool_params(
            -int32_t(input_zero_point() - 0x80) * int32_t(rows()),
            input_scale() / (output_scale() * float(rows())),
            int8_t(output_zero_point() - 0x80), int8_t(qmin() - 0x80), int8_t(qmax() - 0x80));
          break;
      }
      const union xnn_qs8_avgpool_params scalar_quantization_params =
        xnn_init_scalar_qs8_avgpool_params(
          -int32_t(input_zero_point() - 0x80) * int32_t(rows()),
          input_scale() / (output_scale() * float(rows())),
          int8_t(output_zero_point() - 0x80), int8_t(qmin() - 0x80), int8_t(qmax() - 0x80));

      // Compute reference results.
      for (size_t c = 0; c < channels(); c++) {
        int32_t acc = scalar_quantization_params.scalar.bias;
        for (size_t n = 0; n < rows(); n++) {
          acc += input[n * input_stride() + c];
        }
        accumulators[c] = acc;
        output_ref[c] = xnn_qs8_quantize_avgpool(acc, scalar_quantization_params);
        output_fp[c] = float(acc) * (input_scale() / (output_scale() * float(rows()))) + float(output_zero_point() - 0x80);
        output_fp[c] = std::min<float>(output_fp[c], float(qmax() - 0x80));
        output_fp[c] = std::max<float>(output_fp[c], float(qmin() - 0x80));
      }

      // Call optimized micro-kernel.
      gavgpool_minmax(rows(), channels(),
        input.data(), input_stride() * sizeof(int8_t),
        zero.data(),
        output.data(),
        &quantization_params);

      // Verify results.
      for (size_t c = 0; c < channels(); c++) {
        ASSERT_LE(int32_t(output[c]), int32_t(qmax() - 0x80))
          << "at channel " << c << " / " << channels() << ", rows = " << rows();
        ASSERT_GE(int32_t(output[c]), int32_t(qmin() - 0x80))
          << "at channel " << c << " / " << channels() << ", rows = " << rows();
        ASSERT_NEAR(float(int32_t(output[c])), output_fp[c], 0.5f)
          << "at channel " << c << " / " << channels() << ", rows = " << rows()
          << ", accumulator = " << accumulators[c];
        ASSERT_EQ(int32_t(output_ref[c]), int32_t(output[c]))
          << "at channel " << c << " / " << channels() << ", rows = " << rows()
          << ", accumulator = " << accumulators[c];
      }
    }
  }

  // Tests a QS8 multi-pass GAvgPool micro-kernel (signed counterpart of the
  // QU8 multi-pass test; see the QS8 unipass test for the 0x80 convention).
  void Test(xnn_qs8_gavgpool_minmax_multipass_ukernel_function gavgpool_minmax, Variant variant = Variant::Native) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    auto i8rng = std::bind(
      std::uniform_int_distribution<int32_t>(std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max()), rng);

    std::vector<int8_t> input(XNN_EXTRA_BYTES / sizeof(int8_t) +
      (rows() - 1) * input_stride() + channels());
    std::vector<int32_t, AlignedAllocator<int32_t, 64>> buffer(channels() + XNN_EXTRA_BYTES / sizeof(int8_t));
    std::vector<int8_t> zero(channels() + XNN_EXTRA_BYTES / sizeof(int8_t));
    std::vector<int8_t> output(channels());
    std::vector<int8_t> output_ref(channels());
    std::vector<float> output_fp(channels());
    std::vector<int32_t> accumulators(channels());
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      std::generate(input.begin(), input.end(), std::ref(i8rng));
      std::fill(output.begin(), output.end(), 0xA5);

      // Prepare parameters.
      union xnn_qs8_avgpool_params quantization_params = { };
      switch (variant) {
        case Variant::Native:
          quantization_params = xnn_init_qs8_avgpool_params(
            -int32_t(input_zero_point() - 0x80) * int32_t(rows()),
            input_scale() / (output_scale() * float(rows())),
            int8_t(output_zero_point() - 0x80), int8_t(qmin() - 0x80), int8_t(qmax() - 0x80));
          break;
        case Variant::Scalar:
          quantization_params = xnn_init_scalar_qs8_avgpool_params(
            -int32_t(input_zero_point() - 0x80) * int32_t(rows()),
            input_scale() / (output_scale() * float(rows())),
            int8_t(output_zero_point() - 0x80), int8_t(qmin() - 0x80), int8_t(qmax() - 0x80));
          break;
      }
      const union xnn_qs8_avgpool_params scalar_quantization_params =
        xnn_init_scalar_qs8_avgpool_params(
          -int32_t(input_zero_point() - 0x80) * int32_t(rows()),
          input_scale() / (output_scale() * float(rows())),
          int8_t(output_zero_point() - 0x80), int8_t(qmin() - 0x80), int8_t(qmax() - 0x80));

      // Compute reference results.
      for (size_t c = 0; c < channels(); c++) {
        int32_t acc = scalar_quantization_params.scalar.bias;
        for (size_t n = 0; n < rows(); n++) {
          acc += input[n * input_stride() + c];
        }
        accumulators[c] = acc;
        output_ref[c] = xnn_qs8_quantize_avgpool(acc, scalar_quantization_params);
        output_fp[c] = float(acc) * (input_scale() / (output_scale() * float(rows()))) + float(output_zero_point() - 0x80);
        output_fp[c] = std::min<float>(output_fp[c], float(qmax() - 0x80));
        output_fp[c] = std::max<float>(output_fp[c], float(qmin() - 0x80));
      }

      // Call optimized micro-kernel.
      gavgpool_minmax(rows(), channels(),
        input.data(), input_stride() * sizeof(int8_t),
        zero.data(),
        buffer.data(),
        output.data(),
        &quantization_params);

      // Verify results.
      for (size_t c = 0; c < channels(); c++) {
        ASSERT_LE(int32_t(output[c]), int32_t(qmax() - 0x80))
          << "at channel " << c << " / " << channels() << ", rows = " << rows();
        ASSERT_GE(int32_t(output[c]), int32_t(qmin() - 0x80))
          << "at channel " << c << " / " << channels() << ", rows = " << rows();
        ASSERT_NEAR(float(int32_t(output[c])), output_fp[c], 0.5f)
          << "at channel " << c << " / " << channels() << ", rows = " << rows()
          << ", accumulator = " << accumulators[c];
        ASSERT_EQ(int32_t(output_ref[c]), int32_t(output[c]))
          << "at channel " << c << " / " << channels() << ", rows = " << rows()
          << ", accumulator = " << accumulators[c];
      }
    }
  }

  // Tests an F16 (IEEE half-precision, stored as uint16_t) single-pass
  // GAvgPool micro-kernel.  Reference math is done in fp32 via the fp16
  // conversion helpers.
  // NOTE(review): `variant` is accepted for signature consistency with the
  // other Test() overloads but is not used here.
  void Test(xnn_f16_gavgpool_minmax_unipass_ukernel_function gavgpool_minmax, Variant variant = Variant::Native) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    auto f32rng = std::bind(std::uniform_real_distribution<float>(), rng);
    auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);

    std::vector<uint16_t> input((rows() - 1) * input_stride() + channels() + XNN_EXTRA_BYTES / sizeof(uint16_t));
    std::vector<uint16_t> zero(channels() + XNN_EXTRA_BYTES / sizeof(uint16_t));
    std::vector<uint16_t> output(channels());
    std::vector<float> output_ref(channels());

    std::fill(zero.begin(), zero.end(), 0);
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      std::generate(input.begin(), input.end(), std::ref(f16rng));
      // Poison the output with half-precision NaN.
      std::fill(output.begin(), output.end(), UINT16_C(0x7E00) /* NaN */);

      // Compute reference results, without clamping.
      for (size_t c = 0; c < channels(); c++) {
        float acc = 0.0f;
        for (size_t n = 0; n < rows(); n++) {
          acc += fp16_ieee_to_fp32_value(input[n * input_stride() + c]);
        }
        output_ref[c] = acc / float(rows());
      }

      // Compute clamping parameters: map qmin/qmax from [0, 255] onto the
      // observed range of reference outputs so that clamping is actually
      // exercised, then round the bounds through fp16 so the reference and
      // the kernel clamp to the same representable values.
      const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend());
      const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend());
      const float accumulated_range = accumulated_max - accumulated_min;
      const float output_min = fp16_ieee_to_fp32_value(fp16_ieee_from_fp32_value(accumulated_min + float(qmin()) / 255.0f * accumulated_range));
      const float output_max = fp16_ieee_to_fp32_value(fp16_ieee_from_fp32_value(accumulated_max - float(255 - qmax()) / 255.0f * accumulated_range));

      // Clamp reference results.
      for (float& output_values : output_ref) {
        output_values = std::max(std::min(output_values, output_max), output_min);
      }

      // Prepare parameters: scale = 1/rows implements averaging inside the
      // kernel.
      xnn_f16_scaleminmax_params params = xnn_init_f16_scaleminmax_params(
        fp16_ieee_from_fp32_value(1.0f / float(rows())),
        fp16_ieee_from_fp32_value(output_min),
        fp16_ieee_from_fp32_value(output_max));

      // Call optimized micro-kernel.
      gavgpool_minmax(rows(), channels(),
        input.data(), input_stride() * sizeof(uint16_t),
        zero.data(),
        output.data(),
        &params);

      // Verify results (1% relative tolerance, 1e-4 absolute floor).
      for (size_t c = 0; c < channels(); c++) {
        ASSERT_LE(fp16_ieee_to_fp32_value(output[c]), output_max)
          << "at position " << c << ", rows = " << rows() << ", channels = " << channels();
        ASSERT_GE(fp16_ieee_to_fp32_value(output[c]), output_min)
          << "at position " << c << ", rows = " << rows() << ", channels = " << channels();
        ASSERT_NEAR(fp16_ieee_to_fp32_value(output[c]), output_ref[c], std::max(1.0e-4f, std::abs(output_ref[c]) * 1.0e-2f))
          << "at position " << c << ", rows = " << rows() << ", channels = " << channels();
      }
    }
  }

  // Tests an F16 multi-pass GAvgPool micro-kernel.
  // NOTE(review): `variant` is accepted for signature consistency with the
  // other Test() overloads but is not used here.
  void Test(xnn_f16_gavgpool_minmax_multipass_ukernel_function gavgpool_minmax, Variant variant = Variant::Native) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    auto f32rng = std::bind(std::uniform_real_distribution<float>(), rng);
    auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);

    std::vector<uint16_t> input((rows() - 1) * input_stride() + channels() + XNN_EXTRA_BYTES / sizeof(uint16_t));
    std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> buffer(channels() + XNN_EXTRA_BYTES / sizeof(uint16_t));
    // Note: the zero buffer is value-initialized to 0 by the vector
    // constructor (no explicit fill needed here).
    std::vector<uint16_t> zero(channels() + XNN_EXTRA_BYTES / sizeof(uint16_t));
    std::vector<uint16_t> output(channels());
    std::vector<float> output_ref(channels());
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      std::generate(input.begin(), input.end(), std::ref(f16rng));
      std::fill(output.begin(), output.end(), UINT16_C(0x7E00) /* NaN */);

      // Compute reference results, without clamping.
      for (size_t c = 0; c < channels(); c++) {
        float acc = 0.0f;
        for (size_t n = 0; n < rows(); n++) {
          acc += fp16_ieee_to_fp32_value(input[n * input_stride() + c]);
        }
        output_ref[c] = acc / float(rows());
      }

      // Compute clamping parameters.
      const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend());
      const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend());
      const float accumulated_range = accumulated_max - accumulated_min;
      const float output_min = fp16_ieee_to_fp32_value(fp16_ieee_from_fp32_value(accumulated_min + float(qmin()) / 255.0f * accumulated_range));
      const float output_max = fp16_ieee_to_fp32_value(fp16_ieee_from_fp32_value(accumulated_max - float(255 - qmax()) / 255.0f * accumulated_range));

      // Prepare parameters.
      xnn_f16_scaleminmax_params params = xnn_init_f16_scaleminmax_params(
        fp16_ieee_from_fp32_value(1.0f / float(rows())),
        fp16_ieee_from_fp32_value(output_min),
        fp16_ieee_from_fp32_value(output_max));

      // Clamp reference results.
      for (float& output_values : output_ref) {
        output_values = std::max(std::min(output_values, output_max), output_min);
      }

      // Call optimized micro-kernel.
      gavgpool_minmax(rows(), channels(),
        input.data(), input_stride() * sizeof(uint16_t),
        zero.data(),
        buffer.data(),
        output.data(),
        &params);

      // Verify results.
      // NOTE(review): the tolerance here is |output_ref| * 1.0e-0f, i.e.
      // 100% relative error, far looser than the 1% (with 1e-4 floor) used
      // by the unipass F16 test above — confirm this is intentional and not
      // a typo for 1.0e-2f.
      for (size_t c = 0; c < channels(); c++) {
        ASSERT_LE(fp16_ieee_to_fp32_value(output[c]), output_max)
          << "at position " << c << ", rows = " << rows() << ", channels = " << channels();
        ASSERT_GE(fp16_ieee_to_fp32_value(output[c]), output_min)
          << "at position " << c << ", rows = " << rows() << ", channels = " << channels();
        ASSERT_NEAR(fp16_ieee_to_fp32_value(output[c]), output_ref[c], std::abs(output_ref[c]) * 1.0e-0f)
          << "at position " << c << ", rows = " << rows() << ", channels = " << channels();
      }
    }
  }

  // Tests an F32 (single-precision float) single-pass GAvgPool micro-kernel.
  void Test(xnn_f32_gavgpool_minmax_unipass_ukernel_function gavgpool_minmax, Variant variant = Variant::Native) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    auto f32rng = std::bind(std::uniform_real_distribution<float>(), rng);

    std::vector<float> input((rows() - 1) * input_stride() + channels() + XNN_EXTRA_BYTES / sizeof(float));
    std::vector<float> zero(channels() + XNN_EXTRA_BYTES / sizeof(float));
    std::vector<float> output(channels());
    std::vector<float> output_ref(channels());

    std::fill(zero.begin(), zero.end(), 0.0f);
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      std::generate(input.begin(), input.end(), std::ref(f32rng));
      // Poison the output with NaN so unwritten lanes fail verification.
      std::fill(output.begin(), output.end(), std::nanf(""));

      // Compute reference results, without clamping.
      for (size_t c = 0; c < channels(); c++) {
        float acc = 0.0f;
        for (size_t n = 0; n < rows(); n++) {
          acc += input[n * input_stride() + c];
        }
        output_ref[c] = acc / float(rows());
      }

      // Compute clamping parameters: map qmin/qmax from [0, 255] onto the
      // observed output range so the clamping path is exercised.
      const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend());
      const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend());
      const float accumulated_range = accumulated_max - accumulated_min;
      const float output_min = accumulated_min + float(qmin()) / 255.0f * accumulated_range;
      const float output_max = accumulated_max - float(255 - qmax()) / 255.0f * accumulated_range;

      // Clamp reference results.
      for (float& output_values : output_ref) {
        output_values = std::max(std::min(output_values, output_max), output_min);
      }

      // Prepare parameters (scale = 1/rows implements averaging in-kernel).
      union xnn_f32_scaleminmax_params params = { };
      switch (variant) {
        case Variant::Native:
          params = xnn_init_f32_scaleminmax_params(
            1.0f / float(rows()), output_min, output_max);
          break;
        case Variant::Scalar:
          params = xnn_init_scalar_f32_scaleminmax_params(
            1.0f / float(rows()), output_min, output_max);
          break;
      }

      // Call optimized micro-kernel.
      gavgpool_minmax(rows(), channels(),
        input.data(), input_stride() * sizeof(float),
        zero.data(),
        output.data(),
        &params);

      // Verify results (1e-6 relative tolerance).
      for (size_t c = 0; c < channels(); c++) {
        ASSERT_LE(output[c], output_max)
          << "at position " << c << ", rows = " << rows() << ", channels = " << channels();
        ASSERT_GE(output[c], output_min)
          << "at position " << c << ", rows = " << rows() << ", channels = " << channels();
        ASSERT_NEAR(output[c], output_ref[c], std::abs(output_ref[c]) * 1.0e-6f)
          << "at position " << c << ", rows = " << rows() << ", channels = " << channels();
      }
    }
  }

  // Tests an F32 multi-pass GAvgPool micro-kernel (adds a 64-byte-aligned
  // float scratch buffer for intermediate accumulators).
  void Test(xnn_f32_gavgpool_minmax_multipass_ukernel_function gavgpool_minmax, Variant variant = Variant::Native) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    auto f32rng = std::bind(std::uniform_real_distribution<float>(), rng);

    std::vector<float> input((rows() - 1) * input_stride() + channels() + XNN_EXTRA_BYTES / sizeof(float));
    std::vector<float, AlignedAllocator<float, 64>> buffer(channels() + XNN_EXTRA_BYTES / sizeof(float));
    // zero buffer is value-initialized to 0.0f by the vector constructor.
    std::vector<float> zero(channels() + XNN_EXTRA_BYTES / sizeof(float));
    std::vector<float> output(channels());
    std::vector<float> output_ref(channels());
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      std::generate(input.begin(), input.end(), std::ref(f32rng));
      std::fill(output.begin(), output.end(), std::nanf(""));

      // Compute reference results, without clamping.
      for (size_t c = 0; c < channels(); c++) {
        float acc = 0.0f;
        for (size_t n = 0; n < rows(); n++) {
          acc += input[n * input_stride() + c];
        }
        output_ref[c] = acc / float(rows());
      }

      // Compute clamping parameters.
      const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend());
      const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend());
      const float accumulated_range = accumulated_max - accumulated_min;
      const float output_min = accumulated_min + float(qmin()) / 255.0f * accumulated_range;
      const float output_max = accumulated_max - float(255 - qmax()) / 255.0f * accumulated_range;

      // Prepare parameters.
      union xnn_f32_scaleminmax_params params = { };
      switch (variant) {
        case Variant::Native:
          params = xnn_init_f32_scaleminmax_params(
            1.0f / float(rows()), output_min, output_max);
          break;
        case Variant::Scalar:
          params = xnn_init_scalar_f32_scaleminmax_params(
            1.0f / float(rows()), output_min, output_max);
          break;
      }

      // Clamp reference results.
      for (float& output_values : output_ref) {
        output_values = std::max(std::min(output_values, output_max), output_min);
      }

      // Call optimized micro-kernel.
      gavgpool_minmax(rows(), channels(),
        input.data(), input_stride() * sizeof(float),
        zero.data(),
        buffer.data(),
        output.data(),
        &params);

      // Verify results.
      for (size_t c = 0; c < channels(); c++) {
        ASSERT_LE(output[c], output_max)
          << "at position " << c << ", rows = " << rows() << ", channels = " << channels();
        ASSERT_GE(output[c], output_min)
          << "at position " << c << ", rows = " << rows() << ", channels = " << channels();
        ASSERT_NEAR(output[c], output_ref[c], std::abs(output_ref[c]) * 1.0e-6f)
          << "at position " << c << ", rows = " << rows() << ", channels = " << channels();
      }
    }
  }

 private:
  size_t rows_{1};                  // number of pooled rows
  size_t channels_{1};              // channels per pixel
  size_t channel_tile_{1};          // kernel channel tile (informational)
  size_t input_stride_{0};          // 0 means "default to channels()"
  float input_scale_{1.25f};        // input quantization scale
  float output_scale_{0.75f};       // output quantization scale
  uint8_t input_zero_point_{121};   // input quantization zero point
  uint8_t output_zero_point_{133};  // output quantization zero point
  uint8_t qmin_{0};                 // output clamp lower bound ([0, 255] domain)
  uint8_t qmax_{255};               // output clamp upper bound ([0, 255] domain)
  size_t iterations_{15};           // random-input test repetitions
};
727