• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (c) Facebook, Inc. and its affiliates.
2 // All rights reserved.
3 //
4 // Copyright 2019 Google LLC
5 //
6 // This source code is licensed under the BSD-style license found in the
7 // LICENSE file in the root directory of this source tree.
8 
9 #pragma once
10 
11 #include <gtest/gtest.h>
12 
13 #include <algorithm>
14 #include <cassert>
15 #include <cmath>
16 #include <cstddef>
17 #include <cstdlib>
18 #include <functional>
19 #include <limits>
20 #include <random>
21 #include <vector>
22 
23 #include <xnnpack.h>
24 #include <xnnpack/AlignedAllocator.h>
25 #include <xnnpack/params-init.h>
26 #include <xnnpack/params.h>
27 #include <xnnpack/requantization.h>
28 
29 
30 class AvgPoolMicrokernelTester {
31  public:
output_pixels(size_t output_pixels)32   inline AvgPoolMicrokernelTester& output_pixels(size_t output_pixels) {
33     assert(output_pixels != 0);
34     this->output_pixels_ = output_pixels;
35     return *this;
36   }
37 
output_pixels()38   inline size_t output_pixels() const {
39     return this->output_pixels_;
40   }
41 
step(size_t step)42   inline AvgPoolMicrokernelTester& step(size_t step) {
43     assert(step != 0);
44     this->step_ = step;
45     return *this;
46   }
47 
step()48   inline size_t step() const {
49     return this->step_;
50   }
51 
input_offset(size_t input_offset)52   inline AvgPoolMicrokernelTester& input_offset(size_t input_offset) {
53     assert(input_offset != 0);
54     this->input_offset_ = input_offset;
55     return *this;
56   }
57 
input_offset()58   inline size_t input_offset() const {
59     return this->input_offset_;
60   }
61 
zero_index(size_t zero_index)62   inline AvgPoolMicrokernelTester& zero_index(size_t zero_index) {
63     this->zero_index_ = zero_index;
64     return *this;
65   }
66 
zero_index()67   inline size_t zero_index() const {
68     return this->zero_index_;
69   }
70 
pooling_elements(size_t pooling_elements)71   inline AvgPoolMicrokernelTester& pooling_elements(size_t pooling_elements) {
72     assert(pooling_elements != 0);
73     this->pooling_elements_ = pooling_elements;
74     return *this;
75   }
76 
pooling_elements()77   inline size_t pooling_elements() const {
78     return this->pooling_elements_;
79   }
80 
packed_pooling_elements()81   inline size_t packed_pooling_elements() const {
82     if (pooling_elements() <= primary_pooling_tile()) {
83       return primary_pooling_tile();
84     } else {
85       return (pooling_elements() - primary_pooling_tile()) % incremental_pooling_tile() == 0 ? pooling_elements() : ((pooling_elements() - primary_pooling_tile()) / incremental_pooling_tile() + 1) * incremental_pooling_tile() + primary_pooling_tile();
86     }
87   }
88 
89   inline AvgPoolMicrokernelTester& pooling_tile(size_t primary_tile, size_t incremental_tile = 0) {
90     assert(primary_tile != 0);
91     this->primary_pooling_tile_ = primary_tile;
92     this->incremental_pooling_tile_ = incremental_tile;
93     return *this;
94   }
95 
primary_pooling_tile(size_t primary_pooling_tile)96   inline AvgPoolMicrokernelTester& primary_pooling_tile(size_t primary_pooling_tile) {
97     assert(primary_pooling_tile != 0);
98     this->primary_pooling_tile_ = primary_pooling_tile;
99     return *this;
100   }
101 
primary_pooling_tile()102   inline size_t primary_pooling_tile() const {
103     return this->primary_pooling_tile_;
104   }
105 
incremental_pooling_tile(size_t incremental_pooling_tile)106   inline AvgPoolMicrokernelTester& incremental_pooling_tile(size_t incremental_pooling_tile) {
107     assert(incremental_pooling_tile != 0);
108     this->incremental_pooling_tile_ = incremental_pooling_tile;
109     return *this;
110   }
111 
incremental_pooling_tile()112   inline size_t incremental_pooling_tile() const {
113     return this->incremental_pooling_tile_;
114   }
115 
channels(size_t channels)116   inline AvgPoolMicrokernelTester& channels(size_t channels) {
117     assert(channels != 0);
118     this->channels_ = channels;
119     return *this;
120   }
121 
channels()122   inline size_t channels() const {
123     return this->channels_;
124   }
125 
output_stride(size_t output_stride)126   inline AvgPoolMicrokernelTester& output_stride(size_t output_stride) {
127     assert(output_stride != 0);
128     this->output_stride_ = output_stride;
129     return *this;
130   }
131 
output_stride()132   inline size_t output_stride() const {
133     if (this->output_stride_ == 0) {
134       return channels();
135     } else {
136       assert(this->output_stride_ >= channels());
137       return this->output_stride_;
138     }
139   }
140 
input_scale(float input_scale)141   inline AvgPoolMicrokernelTester& input_scale(float input_scale) {
142     assert(input_scale > 0.0f);
143     assert(std::isnormal(input_scale));
144     this->input_scale_ = input_scale;
145     return *this;
146   }
147 
input_scale()148   inline float input_scale() const {
149     return this->input_scale_;
150   }
151 
input_zero_point(uint8_t input_zero_point)152   inline AvgPoolMicrokernelTester& input_zero_point(uint8_t input_zero_point) {
153     this->input_zero_point_ = input_zero_point;
154     return *this;
155   }
156 
input_zero_point()157   inline uint8_t input_zero_point() const {
158     return this->input_zero_point_;
159   }
160 
output_scale(float output_scale)161   inline AvgPoolMicrokernelTester& output_scale(float output_scale) {
162     assert(output_scale > 0.0f);
163     assert(std::isnormal(output_scale));
164     this->output_scale_ = output_scale;
165     return *this;
166   }
167 
output_scale()168   inline float output_scale() const {
169     return this->output_scale_;
170   }
171 
output_zero_point(uint8_t output_zero_point)172   inline AvgPoolMicrokernelTester& output_zero_point(uint8_t output_zero_point) {
173     this->output_zero_point_ = output_zero_point;
174     return *this;
175   }
176 
output_zero_point()177   inline uint8_t output_zero_point() const {
178     return this->output_zero_point_;
179   }
180 
qmin(uint8_t qmin)181   inline AvgPoolMicrokernelTester& qmin(uint8_t qmin) {
182     this->qmin_ = qmin;
183     return *this;
184   }
185 
qmin()186   inline uint8_t qmin() const {
187     return this->qmin_;
188   }
189 
qmax(uint8_t qmax)190   inline AvgPoolMicrokernelTester& qmax(uint8_t qmax) {
191     this->qmax_ = qmax;
192     return *this;
193   }
194 
qmax()195   inline uint8_t qmax() const {
196     return this->qmax_;
197   }
198 
iterations(size_t iterations)199   inline AvgPoolMicrokernelTester& iterations(size_t iterations) {
200     this->iterations_ = iterations;
201     return *this;
202   }
203 
iterations()204   inline size_t iterations() const {
205     return this->iterations_;
206   }
207 
  // Runs a QU8 average-pooling minmax unipass microkernel on random inputs and
  // verifies it against a scalar requantization reference: outputs must be
  // clamped to [qmin, qmax], within 0.5 of the real-valued average, and
  // bit-exact against xnn_qu8_requantize_rndna.
  void Test(xnn_qu8_avgpool_minmax_unipass_ukernel_function avgpool_minmax, xnn_init_qu8_avgpool_minmax_params_fn init_params) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    auto u8rng = std::bind(std::uniform_int_distribution<uint32_t>(0, std::numeric_limits<uint8_t>::max()), rng);

    // Indirection buffer: one input-row pointer per (output pixel, pooling
    // element), padded out to the packed tile size.
    std::vector<const uint8_t*> indirect_input((output_pixels() - 1) * step() + packed_pooling_elements());
    std::vector<uint8_t> input(XNN_EXTRA_BYTES / sizeof(uint8_t) +
      input_offset() + indirect_input.size() * channels());
    // Value-initialized, i.e. an all-zeros row the kernel may substitute.
    std::vector<uint8_t> zero(channels() + XNN_EXTRA_BYTES / sizeof(uint8_t));
    std::vector<uint8_t> output((output_pixels() - 1) * output_stride() + channels());
    std::vector<uint8_t> output_ref(output_pixels() * channels());   // quantized reference
    std::vector<float> output_real(output_pixels() * channels());    // unrounded float reference
    std::vector<int32_t> accumulator(output_pixels() * channels());  // raw sums, for failure messages
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      // Regenerate until the input contains at least two distinct values.
      do {
        std::generate(input.begin(), input.end(), std::ref(u8rng));
      } while (input.size() > 1 && *std::max_element(input.cbegin(), input.cend()) == *std::min_element(input.cbegin(), input.cend()));
      // Fill the offset prefix, the padding tail, and the output with a 0xA5
      // sentinel so stray reads/writes are detectable.
      std::fill(input.begin(), input.begin() + input_offset(), 0xA5);
      std::fill(input.end() - XNN_EXTRA_BYTES / sizeof(uint8_t), input.end(), 0xA5);
      std::fill(output.begin(), output.end(), 0xA5);

      for (size_t i = 0; i < (output_pixels() - 1) * step() + pooling_elements(); i++) {
        indirect_input[i] = input.data() + i * channels();
      }
      // Shuffle only the pointers the kernel actually dereferences; the tile
      // padding at the end is left as-is.
      std::shuffle(indirect_input.begin(),
        indirect_input.begin() + (output_pixels() - 1) * step() + pooling_elements(), rng);
      if (zero_index() != SIZE_MAX) {
        indirect_input[zero_index()] = zero.data();
      }

      // Prepare parameters. The bias pre-subtracts the input zero point once
      // per pooling element; the scale folds in the averaging divisor.
      xnn_qu8_avgpool_minmax_params params;
      init_params(
        &params,
        -int32_t(input_zero_point()) * int32_t(pooling_elements()),
        input_scale() / (output_scale() * float(pooling_elements())),
        output_zero_point(), qmin(), qmax());

      // Compute reference results.
      for (size_t x = 0; x < output_pixels(); x++) {
        for (size_t c = 0; c < channels(); c++) {
          int32_t acc = 0;
          for (size_t p = 0; p < pooling_elements(); p++) {
            const uint8_t* row = indirect_input[x * step() + p];
            if (row != zero.data()) {
              acc += int32_t(row[c + input_offset()]);
            }
            // The zero point is subtracted even for the zero row, mirroring
            // the per-element bias passed to init_params above.
            acc -= int32_t(input_zero_point());
          }
          accumulator[x * channels() + c] = acc;
          output_ref[x * channels() + c] = xnn_qu8_requantize_rndna(
            acc, input_scale() / (output_scale() * float(pooling_elements())), output_zero_point(), qmin(), qmax());
          const float scaled_acc =
            float(acc) * input_scale() / (output_scale() * float(pooling_elements())) + float(output_zero_point());
          output_real[x * channels() + c] = std::min(std::max(scaled_acc, float(qmin())), float(qmax()));
        }
      }

      // Call optimized micro-kernel. Offsets/increments are passed in bytes.
      avgpool_minmax(output_pixels(), pooling_elements(), channels(),
        indirect_input.data(), input_offset() * sizeof(uint8_t), zero.data(),
        output.data(),
        step() * sizeof(void*),
        (output_stride() - channels()) * sizeof(uint8_t),
        &params);

      // Verify results.
      for (size_t x = 0; x < output_pixels(); x++) {
        for (size_t c = 0; c < channels(); c++) {
          ASSERT_GE(uint32_t(output[x * output_stride() + c]), uint32_t(qmin()))
            << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
            << ", pooling elements = " << pooling_elements() << ", step = " << step()
            << ", input offset = " << input_offset();
          ASSERT_LE(uint32_t(output[x * output_stride() + c]), uint32_t(qmax()))
            << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
            << ", pooling elements = " << pooling_elements() << ", step = " << step()
            << ", input offset = " << input_offset();
          // The quantized output must land within 0.5 of the exact average.
          ASSERT_NEAR(float(int32_t(output[x * output_stride() + c])), output_real[x * channels() + c], 0.5f)
            << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
            << ", pooling elements = " << pooling_elements() << ", step = " << step()
            << ", input offset = " << input_offset() << ", accumulator = " << accumulator[x * channels() + c];
          ASSERT_EQ(uint32_t(output_ref[x * channels() + c]), uint32_t(output[x * output_stride() + c]))
            << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
            << ", pooling elements = " << pooling_elements() << ", step = " << step()
            << ", input offset = " << input_offset() << ", accumulator = " << accumulator[x * channels() + c];
        }
      }
    }
  }
297 
  // Runs a QU8 average-pooling minmax multipass microkernel and verifies it
  // against the same scalar reference as the unipass variant. Differs from
  // the unipass test only in the scratch buffer and the pointer increment
  // passed to the kernel.
  void Test(xnn_qu8_avgpool_minmax_multipass_ukernel_function avgpool_minmax, xnn_init_qu8_avgpool_minmax_params_fn init_params) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    auto u8rng = std::bind(std::uniform_int_distribution<uint32_t>(0, std::numeric_limits<uint8_t>::max()), rng);

    // Indirection buffer: one input-row pointer per (output pixel, pooling
    // element), padded out to the packed tile size.
    std::vector<const uint8_t*> indirect_input((output_pixels() - 1) * step() + packed_pooling_elements());
    std::vector<uint8_t> input(XNN_EXTRA_BYTES / sizeof(uint8_t) +
      input_offset() + indirect_input.size() * channels());
    // Value-initialized, i.e. an all-zeros row the kernel may substitute.
    std::vector<uint8_t> zero(channels() + XNN_EXTRA_BYTES / sizeof(uint8_t));
    std::vector<uint8_t> output((output_pixels() - 1) * output_stride() + channels());
    std::vector<uint8_t> output_ref(output_pixels() * channels());   // quantized reference
    std::vector<float> output_real(output_pixels() * channels());    // unrounded float reference
    std::vector<int32_t> accumulator(output_pixels() * channels());  // raw sums, for failure messages
    // 64-byte-aligned scratch space for the kernel's intermediate accumulators.
    std::vector<int32_t, AlignedAllocator<int32_t, 64>> buffer(XNN_EXTRA_BYTES / sizeof(uint8_t) + channels());
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      // Regenerate until the input contains at least two distinct values.
      do {
        std::generate(input.begin(), input.end(), std::ref(u8rng));
      } while (input.size() > 1 && *std::max_element(input.cbegin(), input.cend()) == *std::min_element(input.cbegin(), input.cend()));
      // Fill the offset prefix, the padding tail, and the output with a 0xA5
      // sentinel so stray reads/writes are detectable.
      std::fill(input.begin(), input.begin() + input_offset(), 0xA5);
      std::fill(input.end() - XNN_EXTRA_BYTES / sizeof(uint8_t), input.end(), 0xA5);
      std::fill(output.begin(), output.end(), 0xA5);

      for (size_t i = 0; i < (output_pixels() - 1) * step() + pooling_elements(); i++) {
        indirect_input[i] = input.data() + i * channels();
      }
      // Shuffle only the pointers the kernel actually dereferences.
      std::shuffle(indirect_input.begin(),
        indirect_input.begin() + (output_pixels() - 1) * step() + pooling_elements(), rng);
      if (zero_index() != SIZE_MAX) {
        indirect_input[zero_index()] = zero.data();
      }

      // Prepare parameters. The bias pre-subtracts the input zero point once
      // per pooling element; the scale folds in the averaging divisor.
      xnn_qu8_avgpool_minmax_params params;
      init_params(
        &params,
        -int32_t(input_zero_point()) * int32_t(pooling_elements()),
        input_scale() / (output_scale() * float(pooling_elements())),
        output_zero_point(), qmin(), qmax());

      // Compute reference results.
      for (size_t x = 0; x < output_pixels(); x++) {
        for (size_t c = 0; c < channels(); c++) {
          int32_t acc = 0;
          for (size_t p = 0; p < pooling_elements(); p++) {
            const uint8_t* row = indirect_input[x * step() + p];
            if (row != zero.data()) {
              acc += int32_t(row[c + input_offset()]);
            }
            // Zero point is subtracted even for the zero row, mirroring the
            // per-element bias passed to init_params above.
            acc -= int32_t(input_zero_point());
          }
          accumulator[x * channels() + c] = acc;
          output_ref[x * channels() + c] = xnn_qu8_requantize_rndna(
            acc, input_scale() / (output_scale() * float(pooling_elements())), output_zero_point(), qmin(), qmax());
          const float scaled_acc =
            float(acc) * input_scale() / (output_scale() * float(pooling_elements())) + float(output_zero_point());
          output_real[x * channels() + c] = std::min(std::max(scaled_acc, float(qmin())), float(qmax()));
        }
      }

      // Call optimized micro-kernel. The increment is net of the indirection
      // entries the multipass kernel already consumed per pixel (in bytes).
      avgpool_minmax(output_pixels(), pooling_elements(), channels(),
        indirect_input.data(), input_offset() * sizeof(uint8_t), zero.data(),
        buffer.data(), output.data(),
        (step() - (packed_pooling_elements() - incremental_pooling_tile())) * sizeof(void*),
        (output_stride() - channels()) * sizeof(uint8_t),
        &params);

      // Verify results.
      for (size_t x = 0; x < output_pixels(); x++) {
        for (size_t c = 0; c < channels(); c++) {
          ASSERT_GE(uint32_t(output[x * output_stride() + c]), uint32_t(qmin()))
            << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
            << ", pooling elements = " << pooling_elements() << ", step = " << step()
            << ", input offset = " << input_offset();
          ASSERT_LE(uint32_t(output[x * output_stride() + c]), uint32_t(qmax()))
            << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
            << ", pooling elements = " << pooling_elements() << ", step = " << step()
            << ", input offset = " << input_offset();
          // The quantized output must land within 0.5 of the exact average.
          ASSERT_NEAR(float(int32_t(output[x * output_stride() + c])), output_real[x * channels() + c], 0.5f)
            << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
            << ", pooling elements = " << pooling_elements() << ", step = " << step()
            << ", input offset = " << input_offset() << ", accumulator = " << accumulator[x * channels() + c];
          ASSERT_EQ(uint32_t(output_ref[x * channels() + c]), uint32_t(output[x * output_stride() + c]))
            << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
            << ", pooling elements = " << pooling_elements() << ", step = " << step()
            << ", input offset = " << input_offset() << ", accumulator = " << accumulator[x * channels() + c];
        }
      }
    }
  }
388 
  // Runs an F32 average-pooling minmax unipass microkernel on random inputs
  // and verifies it against a float reference with relative tolerance 1e-6.
  // Clamping bounds are derived from the observed output range so that
  // nontrivial qmin/qmax actually clip some values.
  void Test(xnn_f32_avgpool_minmax_unipass_ukernel_function avgpool_minmax, xnn_init_f32_scaleminmax_params_fn init_params) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    auto f32rng = std::bind(std::uniform_real_distribution<float>(), rng);

    // Indirection buffer: one input-row pointer per (output pixel, pooling
    // element), padded out to the packed tile size.
    std::vector<const float*> indirect_input((output_pixels() - 1) * step() + packed_pooling_elements());
    std::vector<float> input(XNN_EXTRA_BYTES / sizeof(float) +
      input_offset() + indirect_input.size() * channels());
    // Value-initialized, i.e. an all-zeros row the kernel may substitute.
    std::vector<float> zero(channels() + XNN_EXTRA_BYTES / sizeof(float));
    std::vector<float> output((output_pixels() - 1) * output_stride() + channels());
    std::vector<float> output_ref(output_pixels() * channels());
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      std::generate(input.begin(), input.end(), std::ref(f32rng));
      // Poison the offset prefix, padding tail, and output with NaN so stray
      // reads/writes propagate into the assertions.
      std::fill(input.begin(), input.begin() + input_offset(), std::nanf(""));
      std::fill(input.end() - XNN_EXTRA_BYTES / sizeof(float), input.end(), std::nanf(""));
      std::fill(output.begin(), output.end(), std::nanf(""));

      for (size_t i = 0; i < (output_pixels() - 1) * step() + pooling_elements(); i++) {
        indirect_input[i] = input.data() + i * channels();
      }
      // Shuffle only the pointers the kernel actually dereferences.
      std::shuffle(indirect_input.begin(),
        indirect_input.begin() + (output_pixels() - 1) * step() + pooling_elements(), rng);
      if (zero_index() != SIZE_MAX) {
        indirect_input[zero_index()] = zero.data();
      }

      // Compute reference results, without clamping.
      for (size_t x = 0; x < output_pixels(); x++) {
        for (size_t c = 0; c < channels(); c++) {
          float acc = 0.0f;
          for (size_t p = 0; p < pooling_elements(); p++) {
            const float* row = indirect_input[x * step() + p];
            if (row != zero.data()) {
              acc += row[c + input_offset()];
            }
          }
          output_ref[x * channels() + c] = acc / float(pooling_elements());
        }
      }

      // Compute clamping parameters: qmin/qmax select the bounds as fractions
      // (out of 255) of the observed output range.
      const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend());
      const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend());
      const float accumulated_range = accumulated_max - accumulated_min;
      const float output_min = accumulated_min + float(qmin()) / 255.0f * accumulated_range;
      const float output_max = accumulated_max - float(255 - qmax()) / 255.0f * accumulated_range;

      // Clamp reference results.
      for (float& output_value : output_ref) {
        output_value = std::max(std::min(output_value, output_max), output_min);
      }

      // Prepare parameters: scale is the averaging divisor.
      xnn_f32_scaleminmax_params params;
      init_params(&params, 1.0f / float(pooling_elements()), output_min, output_max);

      // Call optimized micro-kernel. Offsets/increments are passed in bytes.
      avgpool_minmax(output_pixels(), pooling_elements(), channels(),
        indirect_input.data(), input_offset() * sizeof(float), zero.data(),
        output.data(),
        step() * sizeof(void*),
        (output_stride() - channels()) * sizeof(float),
        &params);

      // Verify results.
      for (size_t x = 0; x < output_pixels(); x++) {
        for (size_t c = 0; c < channels(); c++) {
          ASSERT_GE(output[x * output_stride() + c], output_min)
            << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
            << ", pooling elements = " << pooling_elements() << ", step = " << step()
            << ", input offset = " << input_offset();
          ASSERT_LE(output[x * output_stride() + c], output_max)
            << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
            << ", pooling elements = " << pooling_elements() << ", step = " << step()
            << ", input offset = " << input_offset();
          ASSERT_NEAR(
              output[x * output_stride() + c],
              output_ref[x * channels() + c],
              std::abs(output_ref[x * channels() + c]) * 1.0e-6f)
            << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
            << ", pooling elements = " << pooling_elements() << ", step = " << step()
            << ", input offset = " << input_offset();
        }
      }
    }
  }
475 
  // Runs an F32 average-pooling minmax multipass microkernel and verifies it
  // against the same float reference as the unipass variant. Differs from the
  // unipass test only in the scratch buffer and the pointer increment passed
  // to the kernel.
  void Test(xnn_f32_avgpool_minmax_multipass_ukernel_function avgpool_minmax, xnn_init_f32_scaleminmax_params_fn init_params) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    auto f32rng = std::bind(std::uniform_real_distribution<float>(), rng);

    // Indirection buffer: one input-row pointer per (output pixel, pooling
    // element), padded out to the packed tile size.
    std::vector<const float*> indirect_input((output_pixels() - 1) * step() + packed_pooling_elements());
    std::vector<float> input(XNN_EXTRA_BYTES / sizeof(float) +
      input_offset() + indirect_input.size() * channels());
    // Value-initialized, i.e. an all-zeros row the kernel may substitute.
    std::vector<float> zero(channels() + XNN_EXTRA_BYTES / sizeof(float));
    std::vector<float> output((output_pixels() - 1) * output_stride() + channels());
    std::vector<float> output_ref(output_pixels() * channels());
    // 64-byte-aligned scratch space for the kernel's intermediate accumulators.
    std::vector<float, AlignedAllocator<float, 64>> buffer(XNN_EXTRA_BYTES / sizeof(float) + channels());
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      std::generate(input.begin(), input.end(), std::ref(f32rng));
      // Poison the offset prefix, padding tail, and output with NaN so stray
      // reads/writes propagate into the assertions.
      std::fill(input.begin(), input.begin() + input_offset(), std::nanf(""));
      std::fill(input.end() - XNN_EXTRA_BYTES / sizeof(float), input.end(), std::nanf(""));
      std::fill(output.begin(), output.end(), std::nanf(""));

      for (size_t i = 0; i < (output_pixels() - 1) * step() + pooling_elements(); i++) {
        indirect_input[i] = input.data() + i * channels();
      }
      // Shuffle only the pointers the kernel actually dereferences.
      std::shuffle(indirect_input.begin(),
        indirect_input.begin() + (output_pixels() - 1) * step() + pooling_elements(), rng);
      if (zero_index() != SIZE_MAX) {
        indirect_input[zero_index()] = zero.data();
      }

      // Compute reference results, without clamping.
      for (size_t x = 0; x < output_pixels(); x++) {
        for (size_t c = 0; c < channels(); c++) {
          float acc = 0.0f;
          for (size_t p = 0; p < pooling_elements(); p++) {
            const float* row = indirect_input[x * step() + p];
            if (row != zero.data()) {
              acc += row[c + input_offset()];
            }
          }
          output_ref[x * channels() + c] = acc / float(pooling_elements());
        }
      }

      // Compute clamping parameters: qmin/qmax select the bounds as fractions
      // (out of 255) of the observed output range.
      const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend());
      const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend());
      const float accumulated_range = accumulated_max - accumulated_min;
      const float output_min = accumulated_min + float(qmin()) / 255.0f * accumulated_range;
      const float output_max = accumulated_max - float(255 - qmax()) / 255.0f * accumulated_range;

      // Clamp reference results.
      for (float& output_value : output_ref) {
        output_value = std::max(std::min(output_value, output_max), output_min);
      }

      // Prepare parameters: scale is the averaging divisor.
      xnn_f32_scaleminmax_params params;
      init_params(&params, 1.0f / float(pooling_elements()), output_min, output_max);

      // Call optimized micro-kernel. The increment is net of the indirection
      // entries the multipass kernel already consumed per pixel (in bytes).
      avgpool_minmax(output_pixels(), pooling_elements(), channels(),
        indirect_input.data(), input_offset() * sizeof(float), zero.data(),
        buffer.data(), output.data(),
        (step() - (packed_pooling_elements() - incremental_pooling_tile())) * sizeof(void*),
        (output_stride() - channels()) * sizeof(float),
        &params);

      // Verify results.
      for (size_t x = 0; x < output_pixels(); x++) {
        for (size_t c = 0; c < channels(); c++) {
          ASSERT_GE(output[x * output_stride() + c], output_min)
            << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
            << ", pooling elements = " << pooling_elements() << ", step = " << step()
            << ", input offset = " << input_offset();
          ASSERT_LE(output[x * output_stride() + c], output_max)
            << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
            << ", pooling elements = " << pooling_elements() << ", step = " << step()
            << ", input offset = " << input_offset();
          ASSERT_NEAR(
              output[x * output_stride() + c],
              output_ref[x * channels() + c],
              std::abs(output_ref[x * channels() + c]) * 1.0e-6f)
            << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
            << ", pooling elements = " << pooling_elements() << ", step = " << step()
            << ", input offset = " << input_offset();
        }
      }
    }
  }
563 
  // Runs an F32 pavgpool (average pooling with an explicit per-output-pixel
  // multiplier instead of a fixed 1/pooling_elements scale) unipass
  // microkernel and verifies it against a float reference.
  void Test(xnn_f32_pavgpool_minmax_unipass_ukernel_function pavgpool_minmax, xnn_init_f32_minmax_params_fn init_params) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    auto f32irng = std::bind(std::uniform_real_distribution<float>(), rng);
    // Multipliers are kept in [0.1, 0.5) — positive and away from zero.
    auto f32mrng = std::bind(std::uniform_real_distribution<float>(0.1f, 0.5f), rng);

    // Indirection buffer: one input-row pointer per (output pixel, pooling
    // element), padded out to the packed tile size.
    std::vector<const float*> indirect_input((output_pixels() - 1) * step() + packed_pooling_elements());
    std::vector<float> input(XNN_EXTRA_BYTES / sizeof(float) +
      input_offset() + indirect_input.size() * channels());
    // Value-initialized, i.e. an all-zeros row the kernel may substitute.
    std::vector<float> zero(channels() + XNN_EXTRA_BYTES / sizeof(float));
    std::vector<float> multiplier(output_pixels());  // one scale per output pixel
    std::vector<float> output((output_pixels() - 1) * output_stride() + channels());
    std::vector<float> output_ref(output_pixels() * channels());
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      std::generate(input.begin(), input.end(), std::ref(f32irng));
      // Poison the offset prefix, padding tail, and output with NaN so stray
      // reads/writes propagate into the assertions.
      std::fill(input.begin(), input.begin() + input_offset(), std::nanf(""));
      std::fill(input.end() - XNN_EXTRA_BYTES / sizeof(float), input.end(), std::nanf(""));
      std::generate(multiplier.begin(), multiplier.end(), std::ref(f32mrng));
      std::fill(output.begin(), output.end(), std::nanf(""));

      for (size_t i = 0; i < (output_pixels() - 1) * step() + pooling_elements(); i++) {
        indirect_input[i] = input.data() + i * channels();
      }
      // Shuffle only the pointers the kernel actually dereferences.
      std::shuffle(indirect_input.begin(),
        indirect_input.begin() + (output_pixels() - 1) * step() + pooling_elements(), rng);
      if (zero_index() != SIZE_MAX) {
        indirect_input[zero_index()] = zero.data();
      }

      // Compute reference results, without clamping: sum scaled by the
      // per-pixel multiplier rather than divided by pooling_elements().
      for (size_t x = 0; x < output_pixels(); x++) {
        for (size_t c = 0; c < channels(); c++) {
          float acc = 0.0f;
          for (size_t p = 0; p < pooling_elements(); p++) {
            const float* row = indirect_input[x * step() + p];
            if (row != zero.data()) {
              acc += row[c + input_offset()];
            }
          }
          output_ref[x * channels() + c] = acc * multiplier[x];
        }
      }

      // Compute clamping parameters: qmin/qmax select the bounds as fractions
      // (out of 255) of the observed output range.
      const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend());
      const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend());
      const float accumulated_range = accumulated_max - accumulated_min;
      const float output_min = accumulated_min + float(qmin()) / 255.0f * accumulated_range;
      const float output_max = accumulated_max - float(255 - qmax()) / 255.0f * accumulated_range;

      // Clamp reference results.
      for (float& output_value : output_ref) {
        output_value = std::max(std::min(output_value, output_max), output_min);
      }

      // Prepare parameters.
      xnn_f32_minmax_params params;
      init_params(&params, output_min, output_max);

      // Call optimized micro-kernel. Offsets/increments are passed in bytes.
      pavgpool_minmax(output_pixels(), pooling_elements(), channels(),
        indirect_input.data(), input_offset() * sizeof(float), zero.data(),
        multiplier.data(), output.data(),
        step() * sizeof(void*),
        (output_stride() - channels()) * sizeof(float),
        &params);

      // Verify results.
      for (size_t x = 0; x < output_pixels(); x++) {
        for (size_t c = 0; c < channels(); c++) {
          ASSERT_GE(output[x * output_stride() + c], output_min)
            << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
            << ", pooling elements = " << pooling_elements() << ", step = " << step()
            << ", input offset = " << input_offset();
          ASSERT_LE(output[x * output_stride() + c], output_max)
            << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
            << ", pooling elements = " << pooling_elements() << ", step = " << step()
            << ", input offset = " << input_offset();
          ASSERT_NEAR(
              output[x * output_stride() + c],
              output_ref[x * channels() + c],
              std::abs(output_ref[x * channels() + c]) * 1.0e-6f)
            << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
            << ", pooling elements = " << pooling_elements() << ", step = " << step()
            << ", input offset = " << input_offset();
        }
      }
    }
  }
653 
Test(xnn_f32_pavgpool_minmax_multipass_ukernel_function pavgpool_minmax,xnn_init_f32_minmax_params_fn init_params)654   void Test(xnn_f32_pavgpool_minmax_multipass_ukernel_function pavgpool_minmax, xnn_init_f32_minmax_params_fn init_params) const {
655     std::random_device random_device;
656     auto rng = std::mt19937(random_device());
657     auto f32irng = std::bind(std::uniform_real_distribution<float>(), rng);
658     auto f32mrng = std::bind(std::uniform_real_distribution<float>(0.1f, 0.5f), rng);
659 
660     std::vector<const float*> indirect_input((output_pixels() - 1) * step() + packed_pooling_elements());
661     std::vector<float> input(XNN_EXTRA_BYTES / sizeof(float) +
662       input_offset() + indirect_input.size() * channels());
663     std::vector<float> zero(channels() + XNN_EXTRA_BYTES / sizeof(float));
664     std::vector<float> multiplier(output_pixels());
665     std::vector<float> output((output_pixels() - 1) * output_stride() + channels());
666     std::vector<float> output_ref(output_pixels() * channels());
667     std::vector<float, AlignedAllocator<float, 64>> buffer(XNN_EXTRA_BYTES / sizeof(float) + channels());
668     for (size_t iteration = 0; iteration < iterations(); iteration++) {
669       std::generate(input.begin(), input.end(), std::ref(f32irng));
670       std::fill(input.begin(), input.begin() + input_offset(), std::nanf(""));
671       std::fill(input.end() - XNN_EXTRA_BYTES / sizeof(float), input.end(), std::nanf(""));
672       std::generate(multiplier.begin(), multiplier.end(), std::ref(f32mrng));
673       std::fill(output.begin(), output.end(), std::nanf(""));
674 
675       for (size_t i = 0; i < (output_pixels() - 1) * step() + pooling_elements(); i++) {
676         indirect_input[i] = input.data() + i * channels();
677       }
678       std::shuffle(indirect_input.begin(),
679         indirect_input.begin() + (output_pixels() - 1) * step() + pooling_elements(), rng);
680       if (zero_index() != SIZE_MAX) {
681         indirect_input[zero_index()] = zero.data();
682       }
683 
684       // Compute reference results, without clamping.
685       for (size_t x = 0; x < output_pixels(); x++) {
686         for (size_t c = 0; c < channels(); c++) {
687           float acc = 0.0f;
688           for (size_t p = 0; p < pooling_elements(); p++) {
689             const float* row = indirect_input[x * step() + p];
690             if (row != zero.data()) {
691               acc += row[c + input_offset()];
692             }
693           }
694           output_ref[x * channels() + c] = acc * multiplier[x];
695         }
696       }
697 
698       // Compute clamping parameters.
699       const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend());
700       const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend());
701       const float accumulated_range = accumulated_max - accumulated_min;
702       const float output_min = accumulated_min + float(qmin()) / 255.0f * accumulated_range;
703       const float output_max = accumulated_max - float(255 - qmax()) / 255.0f * accumulated_range;
704 
705       // Clamp reference results.
706       for (float& output_value : output_ref) {
707         output_value = std::max(std::min(output_value, output_max), output_min);
708       }
709 
710       // Prepare parameters.
711       xnn_f32_minmax_params params;
712       init_params(&params, output_min, output_max);
713 
714       // Call optimized micro-kernel.
715       pavgpool_minmax(output_pixels(), pooling_elements(), channels(),
716         indirect_input.data(), input_offset() * sizeof(float), zero.data(),
717         multiplier.data(), buffer.data(), output.data(),
718         (step() - (packed_pooling_elements() - incremental_pooling_tile())) * sizeof(void*),
719         (output_stride() - channels()) * sizeof(float),
720         &params);
721 
722       // Verify results.
723       for (size_t x = 0; x < output_pixels(); x++) {
724         for (size_t c = 0; c < channels(); c++) {
725           ASSERT_GE(output[x * output_stride() + c], output_min)
726             << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
727             << ", pooling elements = " << pooling_elements() << ", step = " << step()
728             << ", input offset = " << input_offset();
729           ASSERT_LE(output[x * output_stride() + c], output_max)
730             << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
731             << ", pooling elements = " << pooling_elements() << ", step = " << step()
732             << ", input offset = " << input_offset();
733           ASSERT_NEAR(
734               output[x * output_stride() + c],
735               output_ref[x * channels() + c],
736               std::abs(output_ref[x * channels() + c]) * 1.0e-6f)
737             << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
738             << ", pooling elements = " << pooling_elements() << ", step = " << step()
739             << ", input offset = " << input_offset();
740         }
741       }
742     }
743   }
744 
 private:
  // Number of output pixels the kernel produces per invocation.
  size_t output_pixels_{1};
  // Pooling window size: number of input rows averaged per output pixel.
  size_t pooling_elements_{1};
  // Number of channels processed for each pixel.
  size_t channels_{1};
  // Element offset added to every non-zero input row pointer before reading.
  size_t input_offset_{0};
  // Slot of the indirection buffer to replace with the zero buffer;
  // SIZE_MAX (the default) means no slot is replaced.
  size_t zero_index_{SIZE_MAX};
  // Stride, in pointers, between consecutive pixels in the indirection buffer.
  size_t step_{1};
  // Pooling tile processed by the kernel's first pass — presumably feeds the
  // packed_pooling_elements() computation; accessor not visible in this chunk.
  size_t primary_pooling_tile_{1};
  // Pooling tile processed by each subsequent pass of a multipass kernel;
  // subtracted from the packed tile when computing the input increment.
  size_t incremental_pooling_tile_{1};
  // Output row stride in elements; 0 presumably means "default to channels()"
  // inside the output_stride() accessor — accessor not visible here, confirm.
  size_t output_stride_{0};
  // Quantization parameters — not used by the f32 paths visible in this
  // chunk; presumably for the quantized (q8) test variants.
  float input_scale_{1.25f};
  float output_scale_{0.75f};
  uint8_t input_zero_point_{121};
  uint8_t output_zero_point_{133};
  // qmin/qmax select the clamping sub-range of the accumulated output values
  // (0/255 — the defaults — leave the full range unclamped).
  uint8_t qmin_{0};
  uint8_t qmax_{255};
  // Number of randomized rounds each Test() overload executes.
  size_t iterations_{3};
};
763