// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#pragma once

#include <gtest/gtest.h>

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <functional>
#include <limits>
#include <random>
#include <vector>

#include <fp16.h>

#include <xnnpack.h>
#include <xnnpack/microfnptr.h>
#include <xnnpack/microparams-init.h>


class VUnaryMicrokernelTester {
 public:
  enum class OpType {
    ReLU,
    RoundToNearestEven,
    RoundTowardsZero,
    RoundUp,
    RoundDown,
  };

  enum class Variant {
    Native,
    Scalar,
  };

  inline VUnaryMicrokernelTester& batch_size(size_t batch_size) {
    assert(batch_size != 0);
    this->batch_size_ = batch_size;
    return *this;
  }

  inline size_t batch_size() const {
    return this->batch_size_;
  }

  inline VUnaryMicrokernelTester& inplace(bool inplace) {
    this->inplace_ = inplace;
    return *this;
  }

  inline bool inplace() const {
    return this->inplace_;
  }

  inline VUnaryMicrokernelTester& slope(float slope) {
    this->slope_ = slope;
    return *this;
  }

  inline float slope() const {
    return this->slope_;
  }

  inline VUnaryMicrokernelTester& prescale(float prescale) {
    this->prescale_ = prescale;
    return *this;
  }

  inline float prescale() const {
    return this->prescale_;
  }

  inline VUnaryMicrokernelTester& alpha(float alpha) {
    this->alpha_ = alpha;
    return *this;
  }

  inline float alpha() const {
    return this->alpha_;
  }

  inline VUnaryMicrokernelTester& beta(float beta) {
    this->beta_ = beta;
    return *this;
  }

  inline float beta() const {
    return this->beta_;
  }

  inline VUnaryMicrokernelTester& shift(uint32_t shift) {
    this->shift_ = shift;
    return *this;
  }

  inline uint32_t shift() const {
    return this->shift_;
  }

  inline VUnaryMicrokernelTester& qmin(uint8_t qmin) {
    this->qmin_ = qmin;
    return *this;
  }

  inline uint8_t qmin() const {
    return this->qmin_;
  }

  inline VUnaryMicrokernelTester& qmax(uint8_t qmax) {
    this->qmax_ = qmax;
    return *this;
  }

  inline uint8_t qmax() const {
    return this->qmax_;
  }

  inline VUnaryMicrokernelTester& iterations(size_t iterations) {
    this->iterations_ = iterations;
    return *this;
  }

  inline size_t iterations() const {
    return this->iterations_;
  }

  void Test(xnn_f32_vrelu_ukernel_function vrelu) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    std::uniform_real_distribution<float> f32dist(-1.0f, 1.0f);

    std::vector<float> x(batch_size() + XNN_EXTRA_BYTES / sizeof(float));
    std::vector<float> y(batch_size() + (inplace() ? XNN_EXTRA_BYTES / sizeof(float) : 0));
    std::vector<double> y_ref(batch_size());
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      if (inplace()) {
        std::generate(y.begin(), y.end(), [&]() { return f32dist(rng); });
      } else {
        std::generate(x.begin(), x.end(), [&]() { return f32dist(rng); });
        std::fill(y.begin(), y.end(), nanf(""));
      }
      const float* x_data = inplace() ? y.data() : x.data();

      // Compute reference results.
      for (size_t i = 0; i < batch_size(); i++) {
        y_ref[i] = std::max(x_data[i], 0.0f);
      }

      // Call optimized micro-kernel.
      vrelu(batch_size() * sizeof(float), x_data, y.data(), nullptr);

      // Verify results.
      for (size_t i = 0; i < batch_size(); i++) {
        ASSERT_EQ(y[i], y_ref[i])
          << "at " << i << " / " << batch_size() << ", x[" << i << "] = " << x[i];
      }
    }
  }

  void Test(xnn_f16_vabs_ukernel_function vabs, xnn_init_f16_abs_params_fn init_params = nullptr) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    std::uniform_real_distribution<float> f32dist(-1.0f, 1.0f);

    std::vector<uint16_t> x(batch_size() + XNN_EXTRA_BYTES / sizeof(uint16_t));
    std::vector<uint16_t> y(batch_size() + (inplace() ? XNN_EXTRA_BYTES / sizeof(uint16_t) : 0));
    std::vector<uint16_t> y_ref(batch_size());
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      if (inplace()) {
        std::generate(y.begin(), y.end(), [&]() { return fp16_ieee_from_fp32_value(f32dist(rng)); });
      } else {
        std::generate(x.begin(), x.end(), [&]() { return fp16_ieee_from_fp32_value(f32dist(rng)); });
        std::fill(y.begin(), y.end(), UINT16_C(0x7E00) /* NaN */);
      }
      const uint16_t* x_data = inplace() ? y.data() : x.data();

      // Compute reference results.
      for (size_t i = 0; i < batch_size(); i++) {
        y_ref[i] = x_data[i] & UINT16_C(0x7FFF);
      }

      // Prepare parameters.
      union xnn_f16_abs_params params;
      if (init_params != nullptr) {
        init_params(&params);
      }

      // Call optimized micro-kernel.
      vabs(batch_size() * sizeof(uint16_t), x_data, y.data(), &params);

      // Verify results.
      for (size_t i = 0; i < batch_size(); i++) {
        ASSERT_EQ(y[i], y_ref[i])
          << "at " << i << " / " << batch_size() << ", x[" << i << "] = " << x[i];
      }
    }
  }

  void Test(xnn_f32_vabs_ukernel_function vabs, xnn_init_f32_abs_params_fn init_params = nullptr) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    std::uniform_real_distribution<float> f32dist(-1.0f, 1.0f);

    std::vector<float> x(batch_size() + XNN_EXTRA_BYTES / sizeof(float));
    std::vector<float> y(batch_size() + (inplace() ? XNN_EXTRA_BYTES / sizeof(float) : 0));
    std::vector<float> y_ref(batch_size());
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      if (inplace()) {
        std::generate(y.begin(), y.end(), [&]() { return f32dist(rng); });
      } else {
        std::generate(x.begin(), x.end(), [&]() { return f32dist(rng); });
        std::fill(y.begin(), y.end(), nanf(""));
      }
      const float* x_data = inplace() ? y.data() : x.data();

      // Compute reference results.
      for (size_t i = 0; i < batch_size(); i++) {
        y_ref[i] = std::abs(x_data[i]);
      }

      // Prepare parameters.
      union xnn_f32_abs_params params;
      if (init_params != nullptr) {
        init_params(&params);
      }

      // Call optimized micro-kernel.
      vabs(batch_size() * sizeof(float), x_data, y.data(), &params);

      // Verify results.
      for (size_t i = 0; i < batch_size(); i++) {
        ASSERT_EQ(y[i], y_ref[i])
          << "at " << i << " / " << batch_size() << ", x[" << i << "] = " << x[i];
      }
    }
  }

  void Test(xnn_f32_vclamp_ukernel_function vclamp, xnn_init_f32_minmax_params_fn init_params) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    std::uniform_real_distribution<float> f32dist(0.0f, 255.0f);

    std::vector<float> x(batch_size() + XNN_EXTRA_BYTES / sizeof(float));
    std::vector<float> y(batch_size() + (inplace() ? XNN_EXTRA_BYTES / sizeof(float) : 0));
    std::vector<float> y_ref(batch_size());
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      if (inplace()) {
        std::generate(y.begin(), y.end(), [&]() { return f32dist(rng); });
      } else {
        std::generate(x.begin(), x.end(), [&]() { return f32dist(rng); });
        std::fill(y.begin(), y.end(), nanf(""));
      }
      const float* x_data = inplace() ? y.data() : x.data();

      // Compute reference results.
      for (size_t i = 0; i < batch_size(); i++) {
        y_ref[i] = std::max(std::min(x_data[i], float(qmax())), float(qmin()));
      }

      // Prepare parameters.
      union xnn_f32_minmax_params params;
      init_params(&params, float(qmin()), float(qmax()));

      // Call optimized micro-kernel.
      vclamp(batch_size() * sizeof(float), x_data, y.data(), &params);

      // Verify results.
      for (size_t i = 0; i < batch_size(); i++) {
        ASSERT_EQ(y[i], y_ref[i])
          << "at " << i << " / " << batch_size() << ", x[" << i << "] = " << x[i];
      }
    }
  }

  void Test(xnn_f16_velu_ukernel_function velu, xnn_init_f16_elu_params_fn init_params) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    std::uniform_real_distribution<float> f32dist(-9.0f, 9.0f);

    std::vector<uint16_t> x(batch_size() + XNN_EXTRA_BYTES / sizeof(uint16_t));
    std::vector<uint16_t> y(batch_size() + (inplace() ? XNN_EXTRA_BYTES / sizeof(uint16_t) : 0));
    std::vector<float> y_ref(batch_size());
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      if (inplace()) {
        std::generate(y.begin(), y.end(), [&]() { return fp16_ieee_from_fp32_value(f32dist(rng)); });
      } else {
        std::generate(x.begin(), x.end(), [&]() { return fp16_ieee_from_fp32_value(f32dist(rng)); });
        std::fill(y.begin(), y.end(), UINT16_C(0x7E00) /* NaN */);
      }
      const uint16_t* x_data = inplace() ? y.data() : x.data();

      // Compute reference results.
      for (size_t i = 0; i < batch_size(); i++) {
        const float x_value = fp16_ieee_to_fp32_value(x_data[i]);
        y_ref[i] = std::signbit(x_value) ? alpha() * std::expm1(x_value * prescale()) : x_value * beta();
      }

      // Prepare parameters.
      union xnn_f16_elu_params params;
      init_params(&params, fp16_ieee_from_fp32_value(prescale()), fp16_ieee_from_fp32_value(alpha()), fp16_ieee_from_fp32_value(beta()));

      // Call optimized micro-kernel.
      velu(batch_size() * sizeof(uint16_t), x_data, y.data(), &params);

      // Verify results.
      for (size_t i = 0; i < batch_size(); i++) {
        ASSERT_NEAR(
            fp16_ieee_to_fp32_value(y[i]),
            y_ref[i],
            std::max(1.0e-4f, std::abs(y_ref[i]) * 5.0e-3f))
          << "at " << i << " / " << batch_size() << ", x[" << i << "] = " << fp16_ieee_to_fp32_value(x[i]);
      }
    }
  }

  void Test(xnn_f32_velu_ukernel_function velu, xnn_init_f32_elu_params_fn init_params) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    std::uniform_real_distribution<float> f32dist(-20.0f, 20.0f);

    std::vector<float> x(batch_size() + XNN_EXTRA_BYTES / sizeof(float));
    std::vector<float> y(batch_size() + (inplace() ? XNN_EXTRA_BYTES / sizeof(float) : 0));
    std::vector<double> y_ref(batch_size());
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      if (inplace()) {
        std::generate(y.begin(), y.end(), [&]() { return f32dist(rng); });
      } else {
        std::generate(x.begin(), x.end(), [&]() { return f32dist(rng); });
        std::fill(y.begin(), y.end(), nanf(""));
      }
      const float* x_data = inplace() ? y.data() : x.data();

      // Compute reference results.
      for (size_t i = 0; i < batch_size(); i++) {
        y_ref[i] = std::signbit(x_data[i]) ? alpha() * std::expm1(double(x_data[i]) * prescale()) : double(x_data[i]) * beta();
      }

      // Prepare parameters.
      union xnn_f32_elu_params params;
      init_params(&params, prescale(), alpha(), beta());

      // Call optimized micro-kernel.
      velu(batch_size() * sizeof(float), x_data, y.data(), &params);

      // Verify results.
      for (size_t i = 0; i < batch_size(); i++) {
        ASSERT_NEAR(y[i], y_ref[i], std::max(5.0e-6, std::abs(y_ref[i]) * 1.0e-5))
          << "at " << i << " / " << batch_size() << ", x[" << i << "] = " << x[i];
      }
    }
  }

  void Test(xnn_f16_vhswish_ukernel_function vhswish, xnn_init_f16_hswish_params_fn init_params) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    auto f32rng = std::bind(std::uniform_real_distribution<float>(-4.0f, 4.0f), std::ref(rng));
    auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);

    std::vector<uint16_t> x(batch_size() + XNN_EXTRA_BYTES / sizeof(uint16_t));
    std::vector<uint16_t> y(batch_size() + (inplace() ? XNN_EXTRA_BYTES / sizeof(uint16_t) : 0));
    std::vector<float> y_ref(batch_size());
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      std::generate(x.begin(), x.end(), std::ref(f16rng));
      if (inplace()) {
        std::generate(y.begin(), y.end(), std::ref(f16rng));
      } else {
        std::fill(y.begin(), y.end(), UINT16_C(0x7E00) /* NaN */);
      }
      const uint16_t* x_data = inplace() ? y.data() : x.data();

      // Compute reference results.
      for (size_t i = 0; i < batch_size(); i++) {
        const float x_value = fp16_ieee_to_fp32_value(x_data[i]);
        y_ref[i] = (x_value / 6.0f) * std::max(std::min(x_value + 3.0f, 6.0f), 0.0f);
      }

      // Prepare parameters.
      union xnn_f16_hswish_params params;
      init_params(&params);

      // Call optimized micro-kernel.
      vhswish(batch_size() * sizeof(uint16_t), x_data, y.data(), &params);

      // Verify results.
      for (size_t i = 0; i < batch_size(); i++) {
        ASSERT_NEAR(y_ref[i], fp16_ieee_to_fp32_value(y[i]), std::max(1.0e-3f, std::abs(y_ref[i]) * 1.0e-2f))
          << "at " << i << " / " << batch_size() << ", x[" << i << "] = " << fp16_ieee_to_fp32_value(x[i]);
      }
    }
  }

  void Test(xnn_f32_vhswish_ukernel_function vhswish, xnn_init_f32_hswish_params_fn init_params) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    std::uniform_real_distribution<float> f32dist(-4.0f, 4.0f);

    std::vector<float> x(batch_size() + XNN_EXTRA_BYTES / sizeof(float));
    std::vector<float> y(batch_size() + (inplace() ? XNN_EXTRA_BYTES / sizeof(float) : 0));
    std::vector<double> y_ref(batch_size());
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      if (inplace()) {
        std::generate(y.begin(), y.end(), [&]() { return f32dist(rng); });
      } else {
        std::generate(x.begin(), x.end(), [&]() { return f32dist(rng); });
        std::fill(y.begin(), y.end(), nanf(""));
      }
      const float* x_data = inplace() ? y.data() : x.data();

      // Compute reference results.
      for (size_t i = 0; i < batch_size(); i++) {
        y_ref[i] = (x_data[i] / 6.0f) * std::max(std::min(x_data[i] + 3.0f, 6.0f), 0.0f);
      }

      // Prepare parameters.
      union xnn_f32_hswish_params params;
      init_params(&params);

      // Call optimized micro-kernel.
      vhswish(batch_size() * sizeof(float), x_data, y.data(), &params);

      // Verify results.
      for (size_t i = 0; i < batch_size(); i++) {
        ASSERT_NEAR(y[i], y_ref[i], std::max(5.0e-6, std::abs(y_ref[i]) * 1.0e-5))
          << "at " << i << " / " << batch_size() << ", x[" << i << "] = " << x[i];
      }
    }
  }

  void Test(xnn_f16_vlrelu_ukernel_function vlrelu, xnn_init_f16_lrelu_params_fn init_params) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    auto f32rng = std::bind(std::uniform_real_distribution<float>(-125.0f, 125.0f), std::ref(rng));
    auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);

    std::vector<uint16_t> x(batch_size() + XNN_EXTRA_BYTES / sizeof(uint16_t));
    std::vector<uint16_t> y(batch_size() + (inplace() ? XNN_EXTRA_BYTES / sizeof(uint16_t) : 0));
    std::vector<float> y_ref(batch_size());
    const uint16_t slope_as_half = fp16_ieee_from_fp32_value(slope());
    const float slope_as_float = fp16_ieee_to_fp32_value(slope_as_half);
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      if (inplace()) {
        std::generate(y.begin(), y.end(), std::ref(f16rng));
      } else {
        std::generate(x.begin(), x.end(), std::ref(f16rng));
        std::fill(y.begin(), y.end(), UINT16_C(0x7E00) /* NaN */);
      }
      const uint16_t* x_data = inplace() ? y.data() : x.data();

      // Compute reference results.
      for (size_t i = 0; i < batch_size(); i++) {
        const float x_value = fp16_ieee_to_fp32_value(x_data[i]);
        y_ref[i] = std::signbit(x_value) ? x_value * slope_as_float : x_value;
      }

      // Prepare parameters.
      union xnn_f16_lrelu_params params;
      init_params(&params, slope_as_half);

      // Call optimized micro-kernel.
      vlrelu(batch_size() * sizeof(uint16_t), x_data, y.data(), &params);

      // Verify results.
      for (size_t i = 0; i < batch_size(); i++) {
        ASSERT_NEAR(
            fp16_ieee_to_fp32_value(y[i]),
            y_ref[i],
            std::max(1.0e-4f, std::abs(y_ref[i]) * 1.0e-3f))
          << "at " << i << " / " << batch_size() << ", x[" << i << "] = " << fp16_ieee_to_fp32_value(x[i]);
      }
    }
  }

  void Test(xnn_f32_vlrelu_ukernel_function vlrelu, xnn_init_f32_lrelu_params_fn init_params) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    std::uniform_real_distribution<float> f32dist(-125.0f, 125.0f);

    std::vector<float> x(batch_size() + XNN_EXTRA_BYTES / sizeof(float));
    std::vector<float> y(batch_size() + (inplace() ? XNN_EXTRA_BYTES / sizeof(float) : 0));
    std::vector<double> y_ref(batch_size());
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      if (inplace()) {
        std::generate(y.begin(), y.end(), [&]() { return f32dist(rng); });
      } else {
        std::generate(x.begin(), x.end(), [&]() { return f32dist(rng); });
        std::fill(y.begin(), y.end(), nanf(""));
      }
      const float* x_data = inplace() ? y.data() : x.data();

      // Compute reference results.
      for (size_t i = 0; i < batch_size(); i++) {
        y_ref[i] = std::signbit(x_data[i]) ? x_data[i] * slope() : x_data[i];
      }

      // Prepare parameters.
      union xnn_f32_lrelu_params params;
      init_params(&params, slope());

      // Call optimized micro-kernel.
      vlrelu(batch_size() * sizeof(float), x_data, y.data(), &params);

      // Verify results.
      for (size_t i = 0; i < batch_size(); i++) {
        ASSERT_EQ(y[i], y_ref[i])
          << "at " << i << " / " << batch_size() << ", x[" << i << "] = " << x[i];
      }
    }
  }

  void Test(xnn_f16_vneg_ukernel_function vneg, xnn_init_f16_neg_params_fn init_params = nullptr) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    std::uniform_real_distribution<float> f32dist(-1.0f, 1.0f);

    std::vector<uint16_t> x(batch_size() + XNN_EXTRA_BYTES / sizeof(uint16_t));
    std::vector<uint16_t> y(batch_size() + (inplace() ? XNN_EXTRA_BYTES / sizeof(uint16_t) : 0));
    std::vector<uint16_t> y_ref(batch_size());
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      if (inplace()) {
        std::generate(y.begin(), y.end(), [&]() { return fp16_ieee_from_fp32_value(f32dist(rng)); });
      } else {
        std::generate(x.begin(), x.end(), [&]() { return fp16_ieee_from_fp32_value(f32dist(rng)); });
        std::fill(y.begin(), y.end(), UINT16_C(0x7E00) /* NaN */);
      }
      const uint16_t* x_data = inplace() ? y.data() : x.data();

      // Compute reference results.
      for (size_t i = 0; i < batch_size(); i++) {
        y_ref[i] = x_data[i] ^ UINT16_C(0x8000);
      }

      // Prepare parameters.
      union xnn_f16_neg_params params;
      if (init_params != nullptr) {
        init_params(&params);
      }

      // Call optimized micro-kernel.
      vneg(batch_size() * sizeof(uint16_t), x_data, y.data(), &params);

      // Verify results.
      for (size_t i = 0; i < batch_size(); i++) {
        ASSERT_EQ(y[i], y_ref[i])
          << "at " << i << " / " << batch_size() << ", x[" << i << "] = " << x[i];
      }
    }
  }

  void Test(xnn_f32_vneg_ukernel_function vneg, xnn_init_f32_neg_params_fn init_params = nullptr) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    std::uniform_real_distribution<float> f32dist(-1.0f, 1.0f);

    std::vector<float> x(batch_size() + XNN_EXTRA_BYTES / sizeof(float));
    std::vector<float> y(batch_size() + (inplace() ? XNN_EXTRA_BYTES / sizeof(float) : 0));
    std::vector<float> y_ref(batch_size());
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      if (inplace()) {
        std::generate(y.begin(), y.end(), [&]() { return f32dist(rng); });
      } else {
        std::generate(x.begin(), x.end(), [&]() { return f32dist(rng); });
        std::fill(y.begin(), y.end(), nanf(""));
      }
      const float* x_data = inplace() ? y.data() : x.data();

      // Compute reference results.
      for (size_t i = 0; i < batch_size(); i++) {
        y_ref[i] = -x_data[i];
      }

      // Prepare parameters.
      union xnn_f32_neg_params params;
      if (init_params != nullptr) {
        init_params(&params);
      }

      // Call optimized micro-kernel.
      vneg(batch_size() * sizeof(float), x_data, y.data(), &params);

      // Verify results.
      for (size_t i = 0; i < batch_size(); i++) {
        ASSERT_EQ(y[i], y_ref[i])
          << "at " << i << " / " << batch_size() << ", x[" << i << "] = " << x[i];
      }
    }
  }

  void Test(xnn_f16_vround_ukernel_function vrnd, OpType op_type, xnn_init_f16_rnd_params_fn init_params = nullptr) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    std::uniform_real_distribution<float> f32dist(-5.0f, 5.0f);

    std::vector<uint16_t> x(batch_size() + XNN_EXTRA_BYTES / sizeof(uint16_t));
    std::vector<uint16_t> y(batch_size() + (inplace() ? XNN_EXTRA_BYTES / sizeof(uint16_t) : 0));
    std::vector<uint16_t> y_ref(batch_size());
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      if (inplace()) {
        std::generate(y.begin(), y.end(), [&]() { return fp16_ieee_from_fp32_value(f32dist(rng)); });
      } else {
        std::generate(x.begin(), x.end(), [&]() { return fp16_ieee_from_fp32_value(f32dist(rng)); });
        std::fill(y.begin(), y.end(), UINT16_C(0x7E00) /* NaN */);
      }
      const uint16_t* x_data = inplace() ? y.data() : x.data();

      // Compute reference results.
      for (size_t i = 0; i < batch_size(); i++) {
        switch (op_type) {
          case OpType::RoundToNearestEven:
            y_ref[i] = fp16_ieee_from_fp32_value(std::nearbyint(fp16_ieee_to_fp32_value(x_data[i])));
            break;
          case OpType::RoundTowardsZero:
            y_ref[i] = fp16_ieee_from_fp32_value(std::trunc(fp16_ieee_to_fp32_value(x_data[i])));
            break;
          case OpType::RoundUp:
            y_ref[i] = fp16_ieee_from_fp32_value(std::ceil(fp16_ieee_to_fp32_value(x_data[i])));
            break;
          case OpType::RoundDown:
            y_ref[i] = fp16_ieee_from_fp32_value(std::floor(fp16_ieee_to_fp32_value(x_data[i])));
            break;
          default:
            GTEST_FAIL() << "Unexpected operation type";
            return;
        }
      }

      // Prepare parameters.
      xnn_f16_rnd_params params;
      if (init_params != nullptr) {
        init_params(&params);
      }

      // Call optimized micro-kernel.
      vrnd(batch_size() * sizeof(uint16_t), x_data, y.data(), &params);

      // Verify results.
      for (size_t i = 0; i < batch_size(); i++) {
        ASSERT_EQ(y[i], y_ref[i])
          << "at " << i << " / " << batch_size() << ", x[" << i << "] = " << x[i];
      }
    }
  }

  void Test(xnn_f32_vround_ukernel_function vrnd, OpType op_type, xnn_init_f32_rnd_params_fn init_params = nullptr) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    std::uniform_real_distribution<float> f32dist(-5.0f, 5.0f);

    std::vector<float> x(batch_size() + XNN_EXTRA_BYTES / sizeof(float));
    std::vector<float> y(batch_size() + (inplace() ? XNN_EXTRA_BYTES / sizeof(float) : 0));
    std::vector<float> y_ref(batch_size());
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      if (inplace()) {
        std::generate(y.begin(), y.end(), [&]() { return f32dist(rng); });
      } else {
        std::generate(x.begin(), x.end(), [&]() { return f32dist(rng); });
        std::fill(y.begin(), y.end(), nanf(""));
      }
      const float* x_data = inplace() ? y.data() : x.data();

      // Compute reference results.
      for (size_t i = 0; i < batch_size(); i++) {
        switch (op_type) {
          case OpType::RoundToNearestEven:
            y_ref[i] = std::nearbyint(x_data[i]);
            break;
          case OpType::RoundTowardsZero:
            y_ref[i] = std::trunc(x_data[i]);
            break;
          case OpType::RoundUp:
            y_ref[i] = std::ceil(x_data[i]);
            break;
          case OpType::RoundDown:
            y_ref[i] = std::floor(x_data[i]);
            break;
          default:
            GTEST_FAIL() << "Unexpected operation type";
            return;
        }
      }

      // Prepare parameters.
      xnn_f32_rnd_params params;
      if (init_params != nullptr) {
        init_params(&params);
      }

      // Call optimized micro-kernel.
      vrnd(batch_size() * sizeof(float), x_data, y.data(), &params);

      // Verify results.
      for (size_t i = 0; i < batch_size(); i++) {
        ASSERT_EQ(y[i], y_ref[i])
          << "at " << i << " / " << batch_size() << ", x[" << i << "] = " << x[i];
      }
    }
  }

  void Test(xnn_f16_vsigmoid_ukernel_function vsigmoid, xnn_init_f16_sigmoid_params_fn init_params) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    auto distribution = std::uniform_real_distribution<float>(-25.0f, 25.0f);
    auto f32rng = std::bind(distribution, std::ref(rng));
    auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);

    std::vector<uint16_t> x(batch_size() + XNN_EXTRA_BYTES / sizeof(uint16_t));
    std::vector<uint16_t> y(batch_size() + (inplace() ? XNN_EXTRA_BYTES / sizeof(uint16_t) : 0));
    std::vector<float> y_ref(batch_size());
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      if (inplace()) {
        std::generate(y.begin(), y.end(), std::ref(f16rng));
      } else {
        std::generate(x.begin(), x.end(), std::ref(f16rng));
        std::fill(y.begin(), y.end(), UINT16_C(0x7E00) /* NaN */);
      }
      const uint16_t* x_data = inplace() ? y.data() : x.data();

      // Compute reference results.
      for (size_t i = 0; i < batch_size(); i++) {
        const float e = std::exp(fp16_ieee_to_fp32_value(x_data[i]));
        y_ref[i] = e / (1.0f + e);
      }

      // Prepare parameters.
      union xnn_f16_sigmoid_params params;
      init_params(&params);

      // Call optimized micro-kernel.
      vsigmoid(batch_size() * sizeof(uint16_t), x_data, y.data(), &params);

      // Verify results.
      for (size_t i = 0; i < batch_size(); i++) {
        ASSERT_NEAR(
            fp16_ieee_to_fp32_value(y[i]),
            y_ref[i],
            std::max(1.0e-4f, std::abs(y_ref[i]) * 5.0e-3f))
          << "at " << i << " / " << batch_size() << ", x[" << i << "] = " << fp16_ieee_to_fp32_value(x[i]);
      }
    }
  }

  void Test(xnn_f32_vsigmoid_ukernel_function vsigmoid, xnn_init_f32_sigmoid_params_fn init_params) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    std::uniform_real_distribution<float> f32dist(-125.0f, 125.0f);

    std::vector<float> x(batch_size() + XNN_EXTRA_BYTES / sizeof(float));
    std::vector<float> y(batch_size() + (inplace() ? XNN_EXTRA_BYTES / sizeof(float) : 0));
    std::vector<double> y_ref(batch_size());
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      if (inplace()) {
        std::generate(y.begin(), y.end(), [&]() { return f32dist(rng); });
      } else {
        std::generate(x.begin(), x.end(), [&]() { return f32dist(rng); });
        std::fill(y.begin(), y.end(), nanf(""));
      }
      const float* x_data = inplace() ? y.data() : x.data();

      // Compute reference results.
      for (size_t i = 0; i < batch_size(); i++) {
        const double e = std::exp(double(x_data[i]));
        y_ref[i] = e / (1.0 + e);
      }

      // Prepare parameters.
      union xnn_f32_sigmoid_params params;
      init_params(&params);

      // Call optimized micro-kernel.
      vsigmoid(batch_size() * sizeof(float), x_data, y.data(), &params);

      // Verify results.
      for (size_t i = 0; i < batch_size(); i++) {
        ASSERT_NEAR(y[i], y_ref[i], std::max(5.0e-6, std::abs(y_ref[i]) * 1.0e-5))
          << "at " << i << " / " << batch_size() << ", x[" << i << "] = " << x[i];
      }
    }
  }

  void Test(xnn_f16_vsqr_ukernel_function vsqr, xnn_init_f16_default_params_fn init_params = nullptr) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    std::uniform_real_distribution<float> f32dist(-10.0f, 10.0f);

    std::vector<uint16_t> x(batch_size() + XNN_EXTRA_BYTES / sizeof(uint16_t));
    std::vector<uint16_t> y(batch_size() + (inplace() ? XNN_EXTRA_BYTES / sizeof(uint16_t) : 0));
    std::vector<float> y_ref(batch_size());
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      if (inplace()) {
        std::generate(y.begin(), y.end(), [&]() { return fp16_ieee_from_fp32_value(f32dist(rng)); });
      } else {
        std::generate(x.begin(), x.end(), [&]() { return fp16_ieee_from_fp32_value(f32dist(rng)); });
        std::fill(y.begin(), y.end(), UINT16_C(0x7E00) /* NaN */);
      }
      const uint16_t* x_data = inplace() ? y.data() : x.data();

      // Compute reference results.
      for (size_t i = 0; i < batch_size(); i++) {
        const float x_value = fp16_ieee_to_fp32_value(x_data[i]);
        y_ref[i] = x_value * x_value;
      }

      // Prepare parameters.
      union xnn_f16_default_params params;
      if (init_params != nullptr) {
        init_params(&params);
      }

      // Call optimized micro-kernel.
      vsqr(batch_size() * sizeof(uint16_t), x_data, y.data(), &params);

      // Verify results.
      for (size_t i = 0; i < batch_size(); i++) {
        ASSERT_NEAR(
            fp16_ieee_to_fp32_value(y[i]),
            y_ref[i],
            std::max(1.0e-4f, std::abs(y_ref[i]) * 5.0e-3f))
          << "at " << i << " / " << batch_size() << ", x[" << i << "] = " << fp16_ieee_to_fp32_value(x[i]);
      }
    }
  }

  void Test(xnn_f32_vsqr_ukernel_function vsqr, xnn_init_f32_default_params_fn init_params = nullptr) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    std::uniform_real_distribution<float> f32dist(-10.0f, 10.0f);

    std::vector<float> x(batch_size() + XNN_EXTRA_BYTES / sizeof(float));
    std::vector<float> y(batch_size() + (inplace() ? XNN_EXTRA_BYTES / sizeof(float) : 0));
    std::vector<float> y_ref(batch_size());
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      if (inplace()) {
        std::generate(y.begin(), y.end(), [&]() { return f32dist(rng); });
      } else {
        std::generate(x.begin(), x.end(), [&]() { return f32dist(rng); });
        std::fill(y.begin(), y.end(), nanf(""));
      }
      const float* x_data = inplace() ? y.data() : x.data();

      // Compute reference results.
      for (size_t i = 0; i < batch_size(); i++) {
        y_ref[i] = x_data[i] * x_data[i];
      }

      // Prepare parameters.
      union xnn_f32_default_params params;
      if (init_params != nullptr) {
        init_params(&params);
      }

      // Call optimized micro-kernel.
      vsqr(batch_size() * sizeof(float), x_data, y.data(), &params);

      // Verify results.
      for (size_t i = 0; i < batch_size(); i++) {
        ASSERT_EQ(y[i], y_ref[i])
          << "at " << i << " / " << batch_size() << ", x[" << i << "] = " << x[i];
      }
    }
  }

  void Test(xnn_f16_vsqrt_ukernel_function vsqrt, xnn_init_f16_sqrt_params_fn init_params = nullptr) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    std::uniform_real_distribution<float> f32dist(0.0f, 10.0f);

    std::vector<uint16_t> x(batch_size() + XNN_EXTRA_BYTES / sizeof(uint16_t));
    std::vector<uint16_t> y(batch_size() + (inplace() ? XNN_EXTRA_BYTES / sizeof(uint16_t) : 0));
    std::vector<float> y_ref(batch_size());
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      if (inplace()) {
        std::generate(y.begin(), y.end(), [&]() { return fp16_ieee_from_fp32_value(f32dist(rng)); });
      } else {
        std::generate(x.begin(), x.end(), [&]() { return fp16_ieee_from_fp32_value(f32dist(rng)); });
        std::fill(y.begin(), y.end(), UINT16_C(0x7E00) /* NaN */);
      }
      const uint16_t* x_data = inplace() ? y.data() : x.data();

      // Compute reference results.
      for (size_t i = 0; i < batch_size(); i++) {
        y_ref[i] = std::sqrt(fp16_ieee_to_fp32_value(x_data[i]));
      }

      // Prepare parameters.
      union xnn_f16_sqrt_params params;
      if (init_params != nullptr) {
        init_params(&params);
      }

      // Call optimized micro-kernel.
      vsqrt(batch_size() * sizeof(uint16_t), x_data, y.data(), init_params != nullptr ? &params : nullptr);

      // Verify results.
      for (size_t i = 0; i < batch_size(); i++) {
        ASSERT_NEAR(
            fp16_ieee_to_fp32_value(y[i]),
            y_ref[i],
            std::max(1.0e-4f, std::abs(y_ref[i]) * 5.0e-3f))
          << "at " << i << " / " << batch_size() << ", x[" << i << "] = " << fp16_ieee_to_fp32_value(x[i]);
      }
    }
  }

  void Test(xnn_f32_vsqrt_ukernel_function vsqrt, xnn_init_f32_sqrt_params_fn init_params = nullptr) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    std::uniform_real_distribution<float> f32dist(0.0f, 10.0f);

    std::vector<float> x(batch_size() + XNN_EXTRA_BYTES / sizeof(float));
    std::vector<float> y(batch_size() + (inplace() ? XNN_EXTRA_BYTES / sizeof(float) : 0));
    std::vector<float> y_ref(batch_size());
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      if (inplace()) {
        std::generate(y.begin(), y.end(), [&]() { return f32dist(rng); });
      } else {
        std::generate(x.begin(), x.end(), [&]() { return f32dist(rng); });
        std::fill(y.begin(), y.end(), nanf(""));
      }
      const float* x_data = inplace() ? y.data() : x.data();

      // Compute reference results.
      for (size_t i = 0; i < batch_size(); i++) {
        y_ref[i] = std::sqrt(x_data[i]);
      }

      // Prepare parameters.
      union xnn_f32_sqrt_params params;
      if (init_params != nullptr) {
        init_params(&params);
      }

      // Call optimized micro-kernel.
      vsqrt(batch_size() * sizeof(float), x_data, y.data(), init_params != nullptr ? &params : nullptr);

      // Verify results.
      for (size_t i = 0; i < batch_size(); i++) {
        ASSERT_EQ(y[i], y_ref[i])
          << "at " << i << " / " << batch_size() << ", x[" << i << "] = " << x[i];
      }
    }
  }

  void Test(xnn_f16_vclamp_ukernel_function vclamp, xnn_init_f16_minmax_params_fn init_params) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    auto f32rng = std::bind(std::uniform_real_distribution<float>(0.0f, 255.0f), std::ref(rng));
    auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);

    std::vector<uint16_t> x(batch_size() + XNN_EXTRA_BYTES / sizeof(uint16_t));
    std::vector<uint16_t> y(batch_size() + (inplace() ? XNN_EXTRA_BYTES / sizeof(uint16_t) : 0));
    std::vector<float> y_ref(batch_size());
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      std::generate(x.begin(), x.end(), std::ref(f16rng));
      if (inplace()) {
        std::generate(y.begin(), y.end(), std::ref(f16rng));
      } else {
        std::fill(y.begin(), y.end(), UINT16_C(0x7E00) /* NaN */);
      }
      const uint16_t* x_data = inplace() ? y.data() : x.data();

      // Compute reference results.
      for (size_t i = 0; i < batch_size(); i++) {
        y_ref[i] = std::max(std::min(fp16_ieee_to_fp32_value(x_data[i]), float(qmax())), float(qmin()));
      }

      // Prepare parameters.
      union xnn_f16_minmax_params params;
      init_params(&params, fp16_ieee_from_fp32_value(float(qmin())), fp16_ieee_from_fp32_value(float(qmax())));

      // Call optimized micro-kernel.
      vclamp(batch_size() * sizeof(uint16_t), x_data, y.data(), &params);

      // Verify results.
      for (size_t i = 0; i < batch_size(); i++) {
        ASSERT_NEAR(y_ref[i], fp16_ieee_to_fp32_value(y[i]), std::max(1.0e-3f, std::abs(y_ref[i]) * 1.0e-2f))
          << "at " << i << " / " << batch_size() << ", x[" << i << "] = " << fp16_ieee_to_fp32_value(x[i]);
      }
    }
  }

  void Test(xnn_s8_vclamp_ukernel_function vclamp, xnn_init_s8_minmax_params_fn init_params) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    auto i8rng = std::bind(
      std::uniform_int_distribution<int32_t>(std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max()),
      std::ref(rng));

    std::vector<int8_t> x(batch_size() + XNN_EXTRA_BYTES / sizeof(int8_t));
    std::vector<int8_t> y(batch_size() + (inplace() ? XNN_EXTRA_BYTES / sizeof(int8_t) : 0));
    std::vector<int8_t> y_ref(batch_size());
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      std::generate(x.begin(), x.end(), std::ref(i8rng));
      if (inplace()) {
        std::copy(x.cbegin(), x.cend(), y.begin());
      } else {
        std::fill(y.begin(), y.end(), INT8_C(0xA5));
      }
      const int8_t* x_data = inplace() ? y.data() : x.data();

      // Compute reference results.
      for (size_t i = 0; i < batch_size(); i++) {
        y_ref[i] = std::min(std::max(x_data[i], int8_t(qmin() - 0x80)), int8_t(qmax() - 0x80));
      }

      // Prepare parameters.
      union xnn_s8_minmax_params params;
      init_params(&params, int8_t(qmin() - 0x80), int8_t(qmax() - 0x80));

      // Call optimized micro-kernel.
      vclamp(batch_size() * sizeof(int8_t), x_data, y.data(), &params);

      // Verify results.
      for (size_t i = 0; i < batch_size(); i++) {
        ASSERT_EQ(int32_t(y_ref[i]), int32_t(y[i]))
          << "at " << i << " / " << batch_size() << ", x[" << i << "] = " << int32_t(x[i]);
      }
    }
  }

  void Test(xnn_u8_vclamp_ukernel_function vclamp, xnn_init_u8_minmax_params_fn init_params) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    auto u8rng = std::bind(
      std::uniform_int_distribution<int32_t>(0, std::numeric_limits<uint8_t>::max()), std::ref(rng));

    std::vector<uint8_t> x(batch_size() + XNN_EXTRA_BYTES / sizeof(uint8_t));
    std::vector<uint8_t> y(batch_size() + (inplace() ? XNN_EXTRA_BYTES / sizeof(uint8_t) : 0));
    std::vector<uint8_t> y_ref(batch_size());
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      std::generate(x.begin(), x.end(), std::ref(u8rng));
      if (inplace()) {
        std::copy(x.cbegin(), x.cend(), y.begin());
      } else {
        std::fill(y.begin(), y.end(), UINT8_C(0xA5));
      }
      const uint8_t* x_data = inplace() ? y.data() : x.data();

      // Compute reference results.
      for (size_t i = 0; i < batch_size(); i++) {
        y_ref[i] = std::min(std::max(x_data[i], qmin()), qmax());
      }

      // Prepare parameters.
      union xnn_u8_minmax_params params;
      init_params(&params, qmin(), qmax());

      // Call optimized micro-kernel.
      vclamp(batch_size() * sizeof(uint8_t), x_data, y.data(), &params);

      // Verify results.
      for (size_t i = 0; i < batch_size(); i++) {
        ASSERT_EQ(uint32_t(y_ref[i]), uint32_t(y[i]))
          << "at " << i << " / " << batch_size() << ", x[" << i << "] = " << uint32_t(x[i]);
      }
    }
  }

  void Test(xnn_u64_u32_vsqrtshift_ukernel_function vsqrtshift) const {
    ASSERT_FALSE(inplace());

    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    auto u64rng = std::bind(std::uniform_int_distribution<uint64_t>(), std::ref(rng));

    std::vector<uint64_t> x(batch_size() + XNN_EXTRA_BYTES / sizeof(uint64_t));
    std::vector<uint32_t> y(batch_size());
    std::vector<uint32_t> y_ref(batch_size());
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      std::generate(x.begin(), x.end(), std::ref(u64rng));
      std::fill(y.begin(), y.end(), UINT32_C(0xDEADBEEF));

      // Compute reference results.
      for (size_t i = 0; i < batch_size(); i++) {
        const uint64_t x_value = x[i];
        uint32_t y_value = 0;
        // Match TFLM semantics, including bugs
        if (uint32_t(x_value) == x_value) {
          y_value = (uint32_t) std::lrint(std::sqrt(double(int64_t(uint64_t(x_value)))));
          y_value = std::min<uint32_t>(y_value, std::numeric_limits<uint16_t>::max());
        } else if (x_value != 0) {
          uint64_t y0 = x_value >> 1;
          uint64_t y1 = (y0 + x_value / y0) >> 1;
          do {
            y0 = y1;
            y1 = (y0 + x_value / y0) >> 1;
          } while (y1 < y0);

          // y0 is sqrt(x_value) rounded down, round up if needed
          if (int64_t(y0 * y0 + y0 - x_value) < 0) {
            y0 += 1;
          }
          y_value = static_cast<uint32_t>(std::min<uint64_t>(y0, std::numeric_limits<uint32_t>::max()));
        }
        y_ref[i] = y_value >> shift();
      }

      // Call optimized micro-kernel.
      vsqrtshift(batch_size() * sizeof(uint64_t), x.data(), y.data(), shift());

      // Verify results.
      for (size_t i = 0; i < batch_size(); i++) {
        ASSERT_EQ(y_ref[i], y[i])
          << "at " << i << " / " << batch_size()
          << ", x[" << i << "]: " << x[i]
          << ", shift: " << shift();
      }
    }
  }

 private:
  size_t batch_size_ = 1;
  bool inplace_ = false;
  float slope_ = 0.5f;
  float prescale_ = 1.0f;
  float alpha_ = 1.0f;
  float beta_ = 1.0f;
  uint32_t shift_ = 1;
  uint8_t qmin_ = 0;
  uint8_t qmax_ = 255;
  size_t iterations_ = 15;
};
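
// Example usage (an illustrative sketch, not part of this header's API): a
// gtest test case typically constructs the tester, configures it with the
// fluent setters above, and calls the Test() overload that matches the
// micro-kernel's function-pointer type. The kernel symbol below is a
// hypothetical stand-in for an actual XNNPACK micro-kernel:
//
//   TEST(F32_VRELU__SCALAR_X4, batch_div_4) {
//     for (size_t batch_size = 8; batch_size <= 32; batch_size += 4) {
//       VUnaryMicrokernelTester()
//         .batch_size(batch_size)
//         .Test(xnn_f32_vrelu_ukernel__scalar_x4);
//     }
//   }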