// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#pragma once

#include <gtest/gtest.h>

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <functional>
#include <limits>
#include <random>
#include <vector>

#include <fp16.h>

#include <xnnpack.h>
#include <xnnpack/microfnptr.h>
#include <xnnpack/microparams-init.h>


// Tester for element-wise unary microkernels (ReLU, abs, clamp, ELU, HardSwish,
// Leaky ReLU, negate, round, sigmoid, square, square root, and the u64-to-u32
// sqrt-shift kernel). Configure it with the chainable setters below, then call
// the Test() overload that matches the microkernel's function-pointer type.
class VUnaryMicrokernelTester {
 public:
  enum class OpType {
    ReLU,
    RoundToNearestEven,
    RoundTowardsZero,
    RoundUp,
    RoundDown,
  };

  enum class Variant {
    Native,
    Scalar,
  };

  inline VUnaryMicrokernelTester& batch_size(size_t batch_size) {
    assert(batch_size != 0);
    this->batch_size_ = batch_size;
    return *this;
  }

  inline size_t batch_size() const {
    return this->batch_size_;
  }

  inline VUnaryMicrokernelTester& inplace(bool inplace) {
    this->inplace_ = inplace;
    return *this;
  }

  inline bool inplace() const {
    return this->inplace_;
  }

  inline VUnaryMicrokernelTester& slope(float slope) {
    this->slope_ = slope;
    return *this;
  }

  inline float slope() const {
    return this->slope_;
  }

  inline VUnaryMicrokernelTester& prescale(float prescale) {
    this->prescale_ = prescale;
    return *this;
  }

  inline float prescale() const {
    return this->prescale_;
  }

  inline VUnaryMicrokernelTester& alpha(float alpha) {
    this->alpha_ = alpha;
    return *this;
  }

  inline float alpha() const {
    return this->alpha_;
  }

  inline VUnaryMicrokernelTester& beta(float beta) {
    this->beta_ = beta;
    return *this;
  }

  inline float beta() const {
    return this->beta_;
  }

  inline VUnaryMicrokernelTester& shift(uint32_t shift) {
    this->shift_ = shift;
    return *this;
  }

  inline uint32_t shift() const {
    return this->shift_;
  }

  inline VUnaryMicrokernelTester& qmin(uint8_t qmin) {
    this->qmin_ = qmin;
    return *this;
  }

  inline uint8_t qmin() const {
    return this->qmin_;
  }

  inline VUnaryMicrokernelTester& qmax(uint8_t qmax) {
    this->qmax_ = qmax;
    return *this;
  }

  inline uint8_t qmax() const {
    return this->qmax_;
  }

  inline VUnaryMicrokernelTester& iterations(size_t iterations) {
    this->iterations_ = iterations;
    return *this;
  }

  inline size_t iterations() const {
    return this->iterations_;
  }
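
  // Illustrative usage sketch (not part of the tester): the setters above are
  // chainable, so a test typically configures the tester inline and then calls
  // the Test() overload matching the microkernel under test. The microkernel
  // symbol below is an assumed example name, not something defined here.
  //
  //   VUnaryMicrokernelTester()
  //     .batch_size(7)     // deliberately not a multiple of the vector width
  //     .inplace(true)     // run with the input and output buffers aliased
  //     .iterations(3)
  //     .Test(xnn_f32_vrelu_ukernel__scalar_x4);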

  void Test(xnn_f32_vrelu_ukernel_function vrelu) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    std::uniform_real_distribution<float> f32dist(-1.0f, 1.0f);

    std::vector<float> x(batch_size() + XNN_EXTRA_BYTES / sizeof(float));
    std::vector<float> y(batch_size() + (inplace() ? XNN_EXTRA_BYTES / sizeof(float) : 0));
    std::vector<double> y_ref(batch_size());
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      if (inplace()) {
        std::generate(y.begin(), y.end(), [&]() { return f32dist(rng); });
      } else {
        std::generate(x.begin(), x.end(), [&]() { return f32dist(rng); });
        std::fill(y.begin(), y.end(), nanf(""));
      }
      const float* x_data = inplace() ? y.data() : x.data();

      // Compute reference results.
      for (size_t i = 0; i < batch_size(); i++) {
        y_ref[i] = std::max(x_data[i], 0.0f);
      }

      // Call optimized micro-kernel.
      vrelu(batch_size() * sizeof(float), x_data, y.data(), nullptr);

      // Verify results.
      for (size_t i = 0; i < batch_size(); i++) {
        ASSERT_EQ(y[i], y_ref[i])
          << "at " << i << " / " << batch_size() << ", x[" << i << "] = " << x[i];
      }
    }
  }

  void Test(xnn_f16_vabs_ukernel_function vabs, xnn_init_f16_abs_params_fn init_params = nullptr) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    std::uniform_real_distribution<float> f32dist(-1.0f, 1.0f);

    std::vector<uint16_t> x(batch_size() + XNN_EXTRA_BYTES / sizeof(uint16_t));
    std::vector<uint16_t> y(batch_size() + (inplace() ? XNN_EXTRA_BYTES / sizeof(uint16_t) : 0));
    std::vector<uint16_t> y_ref(batch_size());
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      if (inplace()) {
        std::generate(y.begin(), y.end(), [&]() { return fp16_ieee_from_fp32_value(f32dist(rng)); });
      } else {
        std::generate(x.begin(), x.end(), [&]() { return fp16_ieee_from_fp32_value(f32dist(rng)); });
        std::fill(y.begin(), y.end(), UINT16_C(0x7E00) /* NaN */);
      }
      const uint16_t* x_data = inplace() ? y.data() : x.data();

      // Compute reference results.
      for (size_t i = 0; i < batch_size(); i++) {
        y_ref[i] = x_data[i] & UINT16_C(0x7FFF);
      }

      // Prepare parameters.
      union xnn_f16_abs_params params;
      if (init_params != nullptr) {
        init_params(&params);
      }

      // Call optimized micro-kernel.
      vabs(batch_size() * sizeof(uint16_t), x_data, y.data(), &params);

      // Verify results.
      for (size_t i = 0; i < batch_size(); i++) {
        ASSERT_EQ(y[i], y_ref[i])
          << "at " << i << " / " << batch_size() << ", x[" << i << "] = " << x[i];
      }
    }
  }

  void Test(xnn_f32_vabs_ukernel_function vabs, xnn_init_f32_abs_params_fn init_params = nullptr) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    std::uniform_real_distribution<float> f32dist(-1.0f, 1.0f);

    std::vector<float> x(batch_size() + XNN_EXTRA_BYTES / sizeof(float));
    std::vector<float> y(batch_size() + (inplace() ? XNN_EXTRA_BYTES / sizeof(float) : 0));
    std::vector<float> y_ref(batch_size());
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      if (inplace()) {
        std::generate(y.begin(), y.end(), [&]() { return f32dist(rng); });
      } else {
        std::generate(x.begin(), x.end(), [&]() { return f32dist(rng); });
        std::fill(y.begin(), y.end(), nanf(""));
      }
      const float* x_data = inplace() ? y.data() : x.data();

      // Compute reference results.
      for (size_t i = 0; i < batch_size(); i++) {
        y_ref[i] = std::abs(x_data[i]);
      }

      // Prepare parameters.
      union xnn_f32_abs_params params;
      if (init_params != nullptr) {
        init_params(&params);
      }

      // Call optimized micro-kernel.
      vabs(batch_size() * sizeof(float), x_data, y.data(), &params);

      // Verify results.
      for (size_t i = 0; i < batch_size(); i++) {
        ASSERT_EQ(y[i], y_ref[i])
          << "at " << i << " / " << batch_size() << ", x[" << i << "] = " << x[i];
      }
    }
  }

  void Test(xnn_f32_vclamp_ukernel_function vclamp, xnn_init_f32_minmax_params_fn init_params) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    std::uniform_real_distribution<float> f32dist(0.0f, 255.0f);

    std::vector<float> x(batch_size() + XNN_EXTRA_BYTES / sizeof(float));
    std::vector<float> y(batch_size() + (inplace() ? XNN_EXTRA_BYTES / sizeof(float) : 0));
    std::vector<float> y_ref(batch_size());
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      if (inplace()) {
        std::generate(y.begin(), y.end(), [&]() { return f32dist(rng); });
      } else {
        std::generate(x.begin(), x.end(), [&]() { return f32dist(rng); });
        std::fill(y.begin(), y.end(), nanf(""));
      }
      const float* x_data = inplace() ? y.data() : x.data();

      // Compute reference results.
      for (size_t i = 0; i < batch_size(); i++) {
        y_ref[i] = std::max(std::min(x_data[i], float(qmax())), float(qmin()));
      }

      // Prepare parameters.
      union xnn_f32_minmax_params params;
      init_params(&params, float(qmin()), float(qmax()));

      // Call optimized micro-kernel.
      vclamp(batch_size() * sizeof(float), x_data, y.data(), &params);

      // Verify results.
      for (size_t i = 0; i < batch_size(); i++) {
        ASSERT_EQ(y[i], y_ref[i])
          << "at " << i << " / " << batch_size() << ", x[" << i << "] = " << x[i];
      }
    }
  }

  void Test(xnn_f16_velu_ukernel_function velu, xnn_init_f16_elu_params_fn init_params) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    std::uniform_real_distribution<float> f32dist(-9.0f, 9.0f);

    std::vector<uint16_t> x(batch_size() + XNN_EXTRA_BYTES / sizeof(uint16_t));
    std::vector<uint16_t> y(batch_size() + (inplace() ? XNN_EXTRA_BYTES / sizeof(uint16_t) : 0));
    std::vector<float> y_ref(batch_size());
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      if (inplace()) {
        std::generate(y.begin(), y.end(), [&]() { return fp16_ieee_from_fp32_value(f32dist(rng)); });
      } else {
        std::generate(x.begin(), x.end(), [&]() { return fp16_ieee_from_fp32_value(f32dist(rng)); });
        std::fill(y.begin(), y.end(), UINT16_C(0x7E00) /* NaN */);
      }
      const uint16_t* x_data = inplace() ? y.data() : x.data();

      // Compute reference results.
      for (size_t i = 0; i < batch_size(); i++) {
        const float x_value = fp16_ieee_to_fp32_value(x_data[i]);
        y_ref[i] = std::signbit(x_value) ? alpha() * std::expm1(x_value * prescale()) : x_value * beta();
      }

      // Prepare parameters.
      union xnn_f16_elu_params params;
      init_params(&params, fp16_ieee_from_fp32_value(prescale()), fp16_ieee_from_fp32_value(alpha()), fp16_ieee_from_fp32_value(beta()));

      // Call optimized micro-kernel.
      velu(batch_size() * sizeof(uint16_t), x_data, y.data(), &params);

      // Verify results.
      for (size_t i = 0; i < batch_size(); i++) {
        ASSERT_NEAR(
            fp16_ieee_to_fp32_value(y[i]),
            y_ref[i],
            std::max(1.0e-4f, std::abs(y_ref[i]) * 5.0e-3f))
          << "at " << i << " / " << batch_size() << ", x[" << i << "] = " << fp16_ieee_to_fp32_value(x[i]);
      }
    }
  }

  void Test(xnn_f32_velu_ukernel_function velu, xnn_init_f32_elu_params_fn init_params) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    std::uniform_real_distribution<float> f32dist(-20.0f, 20.0f);

    std::vector<float> x(batch_size() + XNN_EXTRA_BYTES / sizeof(float));
    std::vector<float> y(batch_size() + (inplace() ? XNN_EXTRA_BYTES / sizeof(float) : 0));
    std::vector<double> y_ref(batch_size());
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      if (inplace()) {
        std::generate(y.begin(), y.end(), [&]() { return f32dist(rng); });
      } else {
        std::generate(x.begin(), x.end(), [&]() { return f32dist(rng); });
        std::fill(y.begin(), y.end(), nanf(""));
      }
      const float* x_data = inplace() ? y.data() : x.data();

      // Compute reference results.
      for (size_t i = 0; i < batch_size(); i++) {
        y_ref[i] = std::signbit(x_data[i]) ? alpha() * std::expm1(double(x_data[i]) * prescale()) : double(x_data[i]) * beta();
      }

      // Prepare parameters.
      union xnn_f32_elu_params params;
      init_params(&params, prescale(), alpha(), beta());

      // Call optimized micro-kernel.
      velu(batch_size() * sizeof(float), x_data, y.data(), &params);

      // Verify results.
      for (size_t i = 0; i < batch_size(); i++) {
        ASSERT_NEAR(y[i], y_ref[i], std::max(5.0e-6, std::abs(y_ref[i]) * 1.0e-5))
          << "at " << i << " / " << batch_size() << ", x[" << i << "] = " << x[i];
      }
    }
  }

  void Test(xnn_f16_vhswish_ukernel_function vhswish, xnn_init_f16_hswish_params_fn init_params) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    auto f32rng = std::bind(std::uniform_real_distribution<float>(-4.0f, 4.0f), std::ref(rng));
    auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);

    std::vector<uint16_t> x(batch_size() + XNN_EXTRA_BYTES / sizeof(uint16_t));
    std::vector<uint16_t> y(batch_size() + (inplace() ? XNN_EXTRA_BYTES / sizeof(uint16_t) : 0));
    std::vector<float> y_ref(batch_size());
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      std::generate(x.begin(), x.end(), std::ref(f16rng));
      if (inplace()) {
        std::generate(y.begin(), y.end(), std::ref(f16rng));
      } else {
        std::fill(y.begin(), y.end(), UINT16_C(0x7E00) /* NaN */);
      }
      const uint16_t* x_data = inplace() ? y.data() : x.data();

      // Compute reference results.
      for (size_t i = 0; i < batch_size(); i++) {
        const float x_value = fp16_ieee_to_fp32_value(x_data[i]);
        y_ref[i] = (x_value / 6.0f) * std::max(std::min(x_value + 3.0f, 6.0f), 0.0f);
      }

      // Prepare parameters.
      union xnn_f16_hswish_params params;
      init_params(&params);

      // Call optimized micro-kernel.
      vhswish(batch_size() * sizeof(uint16_t), x_data, y.data(), &params);

      // Verify results.
      for (size_t i = 0; i < batch_size(); i++) {
        ASSERT_NEAR(y_ref[i], fp16_ieee_to_fp32_value(y[i]), std::max(1.0e-3f, std::abs(y_ref[i]) * 1.0e-2f))
          << "at " << i << " / " << batch_size() << ", x[" << i << "] = " << fp16_ieee_to_fp32_value(x[i]);
      }
    }
  }

  void Test(xnn_f32_vhswish_ukernel_function vhswish, xnn_init_f32_hswish_params_fn init_params) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    std::uniform_real_distribution<float> f32dist(-4.0f, 4.0f);

    std::vector<float> x(batch_size() + XNN_EXTRA_BYTES / sizeof(float));
    std::vector<float> y(batch_size() + (inplace() ? XNN_EXTRA_BYTES / sizeof(float) : 0));
    std::vector<double> y_ref(batch_size());
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      if (inplace()) {
        std::generate(y.begin(), y.end(), [&]() { return f32dist(rng); });
      } else {
        std::generate(x.begin(), x.end(), [&]() { return f32dist(rng); });
        std::fill(y.begin(), y.end(), nanf(""));
      }
      const float* x_data = inplace() ? y.data() : x.data();

      // Compute reference results.
      for (size_t i = 0; i < batch_size(); i++) {
        y_ref[i] = (x_data[i] / 6.0f) * std::max(std::min(x_data[i] + 3.0f, 6.0f), 0.0f);
      }

      // Prepare parameters.
      union xnn_f32_hswish_params params;
      init_params(&params);

      // Call optimized micro-kernel.
      vhswish(batch_size() * sizeof(float), x_data, y.data(), &params);

      // Verify results.
      for (size_t i = 0; i < batch_size(); i++) {
        ASSERT_NEAR(y[i], y_ref[i], std::max(5.0e-6, std::abs(y_ref[i]) * 1.0e-5))
          << "at " << i << " / " << batch_size() << ", x[" << i << "] = " << x[i];
      }
    }
  }

  void Test(xnn_f16_vlrelu_ukernel_function vlrelu, xnn_init_f16_lrelu_params_fn init_params) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    auto f32rng = std::bind(std::uniform_real_distribution<float>(-125.0f, 125.0f), std::ref(rng));
    auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);

    std::vector<uint16_t> x(batch_size() + XNN_EXTRA_BYTES / sizeof(uint16_t));
    std::vector<uint16_t> y(batch_size() + (inplace() ? XNN_EXTRA_BYTES / sizeof(uint16_t) : 0));
    std::vector<float> y_ref(batch_size());
    const uint16_t slope_as_half = fp16_ieee_from_fp32_value(slope());
    const float slope_as_float = fp16_ieee_to_fp32_value(slope_as_half);
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      if (inplace()) {
        std::generate(y.begin(), y.end(), std::ref(f16rng));
      } else {
        std::generate(x.begin(), x.end(), std::ref(f16rng));
        std::fill(y.begin(), y.end(), UINT16_C(0x7E00) /* NaN */);
      }
      const uint16_t* x_data = inplace() ? y.data() : x.data();

      // Compute reference results.
      for (size_t i = 0; i < batch_size(); i++) {
        const float x_value = fp16_ieee_to_fp32_value(x_data[i]);
        y_ref[i] = std::signbit(x_value) ? x_value * slope_as_float : x_value;
      }

      // Prepare parameters.
      union xnn_f16_lrelu_params params;
      init_params(&params, slope_as_half);

      // Call optimized micro-kernel.
      vlrelu(batch_size() * sizeof(uint16_t), x_data, y.data(), &params);

      // Verify results.
      for (size_t i = 0; i < batch_size(); i++) {
        ASSERT_NEAR(
            fp16_ieee_to_fp32_value(y[i]),
            y_ref[i],
            std::max(1.0e-4f, std::abs(y_ref[i]) * 1.0e-3f))
          << "at " << i << " / " << batch_size() << ", x[" << i << "] = " << fp16_ieee_to_fp32_value(x[i]);
      }
    }
  }

  void Test(xnn_f32_vlrelu_ukernel_function vlrelu, xnn_init_f32_lrelu_params_fn init_params) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    std::uniform_real_distribution<float> f32dist(-125.0f, 125.0f);

    std::vector<float> x(batch_size() + XNN_EXTRA_BYTES / sizeof(float));
    std::vector<float> y(batch_size() + (inplace() ? XNN_EXTRA_BYTES / sizeof(float) : 0));
    std::vector<double> y_ref(batch_size());
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      if (inplace()) {
        std::generate(y.begin(), y.end(), [&]() { return f32dist(rng); });
      } else {
        std::generate(x.begin(), x.end(), [&]() { return f32dist(rng); });
        std::fill(y.begin(), y.end(), nanf(""));
      }
      const float* x_data = inplace() ? y.data() : x.data();

      // Compute reference results.
      for (size_t i = 0; i < batch_size(); i++) {
        y_ref[i] = std::signbit(x_data[i]) ? x_data[i] * slope() : x_data[i];
      }

      // Prepare parameters.
      union xnn_f32_lrelu_params params;
      init_params(&params, slope());

      // Call optimized micro-kernel.
      vlrelu(batch_size() * sizeof(float), x_data, y.data(), &params);

      // Verify results.
      for (size_t i = 0; i < batch_size(); i++) {
        ASSERT_EQ(y[i], y_ref[i])
          << "at " << i << " / " << batch_size() << ", x[" << i << "] = " << x[i];
      }
    }
  }

  void Test(xnn_f16_vneg_ukernel_function vneg, xnn_init_f16_neg_params_fn init_params = nullptr) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    std::uniform_real_distribution<float> f32dist(-1.0f, 1.0f);

    std::vector<uint16_t> x(batch_size() + XNN_EXTRA_BYTES / sizeof(uint16_t));
    std::vector<uint16_t> y(batch_size() + (inplace() ? XNN_EXTRA_BYTES / sizeof(uint16_t) : 0));
    std::vector<uint16_t> y_ref(batch_size());
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      if (inplace()) {
        std::generate(y.begin(), y.end(), [&]() { return fp16_ieee_from_fp32_value(f32dist(rng)); });
      } else {
        std::generate(x.begin(), x.end(), [&]() { return fp16_ieee_from_fp32_value(f32dist(rng)); });
        std::fill(y.begin(), y.end(), UINT16_C(0x7E00) /* NaN */);
      }
      const uint16_t* x_data = inplace() ? y.data() : x.data();

      // Compute reference results.
      for (size_t i = 0; i < batch_size(); i++) {
        y_ref[i] = x_data[i] ^ UINT16_C(0x8000);
      }

      // Prepare parameters.
      union xnn_f16_neg_params params;
      if (init_params != nullptr) {
        init_params(&params);
      }

      // Call optimized micro-kernel.
      vneg(batch_size() * sizeof(uint16_t), x_data, y.data(), &params);

      // Verify results.
      for (size_t i = 0; i < batch_size(); i++) {
        ASSERT_EQ(y[i], y_ref[i])
          << "at " << i << " / " << batch_size() << ", x[" << i << "] = " << x[i];
      }
    }
  }

  void Test(xnn_f32_vneg_ukernel_function vneg, xnn_init_f32_neg_params_fn init_params = nullptr) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    std::uniform_real_distribution<float> f32dist(-1.0f, 1.0f);

    std::vector<float> x(batch_size() + XNN_EXTRA_BYTES / sizeof(float));
    std::vector<float> y(batch_size() + (inplace() ? XNN_EXTRA_BYTES / sizeof(float) : 0));
    std::vector<float> y_ref(batch_size());
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      if (inplace()) {
        std::generate(y.begin(), y.end(), [&]() { return f32dist(rng); });
      } else {
        std::generate(x.begin(), x.end(), [&]() { return f32dist(rng); });
        std::fill(y.begin(), y.end(), nanf(""));
      }
      const float* x_data = inplace() ? y.data() : x.data();

      // Compute reference results.
      for (size_t i = 0; i < batch_size(); i++) {
        y_ref[i] = -x_data[i];
      }

      // Prepare parameters.
      union xnn_f32_neg_params params;
      if (init_params != nullptr) {
        init_params(&params);
      }

      // Call optimized micro-kernel.
      vneg(batch_size() * sizeof(float), x_data, y.data(), &params);

      // Verify results.
      for (size_t i = 0; i < batch_size(); i++) {
        ASSERT_EQ(y[i], y_ref[i])
          << "at " << i << " / " << batch_size() << ", x[" << i << "] = " << x[i];
      }
    }
  }

  void Test(xnn_f16_vround_ukernel_function vrnd, OpType op_type, xnn_init_f16_rnd_params_fn init_params = nullptr) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    std::uniform_real_distribution<float> f32dist(-5.0f, 5.0f);

    std::vector<uint16_t> x(batch_size() + XNN_EXTRA_BYTES / sizeof(uint16_t));
    std::vector<uint16_t> y(batch_size() + (inplace() ? XNN_EXTRA_BYTES / sizeof(uint16_t) : 0));
    std::vector<uint16_t> y_ref(batch_size());
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      if (inplace()) {
        std::generate(y.begin(), y.end(), [&]() { return fp16_ieee_from_fp32_value(f32dist(rng)); });
      } else {
        std::generate(x.begin(), x.end(), [&]() { return fp16_ieee_from_fp32_value(f32dist(rng)); });
        std::fill(y.begin(), y.end(), UINT16_C(0x7E00) /* NaN */);
      }
      const uint16_t* x_data = inplace() ? y.data() : x.data();

      // Compute reference results.
      for (size_t i = 0; i < batch_size(); i++) {
        switch (op_type) {
          case OpType::RoundToNearestEven:
            y_ref[i] = fp16_ieee_from_fp32_value(std::nearbyint(fp16_ieee_to_fp32_value(x_data[i])));
            break;
          case OpType::RoundTowardsZero:
            y_ref[i] = fp16_ieee_from_fp32_value(std::trunc(fp16_ieee_to_fp32_value(x_data[i])));
            break;
          case OpType::RoundUp:
            y_ref[i] = fp16_ieee_from_fp32_value(std::ceil(fp16_ieee_to_fp32_value(x_data[i])));
            break;
          case OpType::RoundDown:
            y_ref[i] = fp16_ieee_from_fp32_value(std::floor(fp16_ieee_to_fp32_value(x_data[i])));
            break;
          default:
            GTEST_FAIL() << "Unexpected operation type";
            return;
        }
      }

      // Prepare parameters.
      xnn_f16_rnd_params params;
      if (init_params != nullptr) {
        init_params(&params);
      }

      // Call optimized micro-kernel.
      vrnd(batch_size() * sizeof(uint16_t), x_data, y.data(), &params);

      // Verify results.
      for (size_t i = 0; i < batch_size(); i++) {
        ASSERT_EQ(y[i], y_ref[i])
          << "at " << i << " / " << batch_size() << ", x[" << i << "] = " << x[i];
      }
    }
  }

  void Test(xnn_f32_vround_ukernel_function vrnd, OpType op_type, xnn_init_f32_rnd_params_fn init_params = nullptr) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    std::uniform_real_distribution<float> f32dist(-5.0f, 5.0f);

    std::vector<float> x(batch_size() + XNN_EXTRA_BYTES / sizeof(float));
    std::vector<float> y(batch_size() + (inplace() ? XNN_EXTRA_BYTES / sizeof(float) : 0));
    std::vector<float> y_ref(batch_size());
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      if (inplace()) {
        std::generate(y.begin(), y.end(), [&]() { return f32dist(rng); });
      } else {
        std::generate(x.begin(), x.end(), [&]() { return f32dist(rng); });
        std::fill(y.begin(), y.end(), nanf(""));
      }
      const float* x_data = inplace() ? y.data() : x.data();

      // Compute reference results.
      for (size_t i = 0; i < batch_size(); i++) {
        switch (op_type) {
          case OpType::RoundToNearestEven:
            y_ref[i] = std::nearbyint(x_data[i]);
            break;
          case OpType::RoundTowardsZero:
            y_ref[i] = std::trunc(x_data[i]);
            break;
          case OpType::RoundUp:
            y_ref[i] = std::ceil(x_data[i]);
            break;
          case OpType::RoundDown:
            y_ref[i] = std::floor(x_data[i]);
            break;
          default:
            GTEST_FAIL() << "Unexpected operation type";
            return;
        }
      }

      // Prepare parameters.
      xnn_f32_rnd_params params;
      if (init_params != nullptr) {
        init_params(&params);
      }

      // Call optimized micro-kernel.
      vrnd(batch_size() * sizeof(float), x_data, y.data(), &params);

      // Verify results.
      for (size_t i = 0; i < batch_size(); i++) {
        ASSERT_EQ(y[i], y_ref[i])
          << "at " << i << " / " << batch_size() << ", x[" << i << "] = " << x[i];
      }
    }
  }

  void Test(xnn_f16_vsigmoid_ukernel_function vsigmoid, xnn_init_f16_sigmoid_params_fn init_params) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    auto distribution = std::uniform_real_distribution<float>(-25.0f, 25.0f);
    auto f32rng = std::bind(distribution, std::ref(rng));
    auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);

    std::vector<uint16_t> x(batch_size() + XNN_EXTRA_BYTES / sizeof(uint16_t));
    std::vector<uint16_t> y(batch_size() + (inplace() ? XNN_EXTRA_BYTES / sizeof(uint16_t) : 0));
    std::vector<float> y_ref(batch_size());
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      if (inplace()) {
        std::generate(y.begin(), y.end(), std::ref(f16rng));
      } else {
        std::generate(x.begin(), x.end(), std::ref(f16rng));
        std::fill(y.begin(), y.end(), UINT16_C(0x7E00) /* NaN */);
      }
      const uint16_t* x_data = inplace() ? y.data() : x.data();

      // Compute reference results.
      for (size_t i = 0; i < batch_size(); i++) {
        const float e = std::exp(fp16_ieee_to_fp32_value(x_data[i]));
        y_ref[i] = e / (1.0f + e);
      }

      // Prepare parameters.
      union xnn_f16_sigmoid_params params;
      init_params(&params);

      // Call optimized micro-kernel.
      vsigmoid(batch_size() * sizeof(uint16_t), x_data, y.data(), &params);

      // Verify results.
      for (size_t i = 0; i < batch_size(); i++) {
        ASSERT_NEAR(
            fp16_ieee_to_fp32_value(y[i]),
            y_ref[i],
            std::max(1.0e-4f, std::abs(y_ref[i]) * 5.0e-3f))
          << "at " << i << " / " << batch_size() << ", x[" << i << "] = " << fp16_ieee_to_fp32_value(x[i]);
      }
    }
  }

  void Test(xnn_f32_vsigmoid_ukernel_function vsigmoid, xnn_init_f32_sigmoid_params_fn init_params) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    std::uniform_real_distribution<float> f32dist(-125.0f, 125.0f);

    std::vector<float> x(batch_size() + XNN_EXTRA_BYTES / sizeof(float));
    std::vector<float> y(batch_size() + (inplace() ? XNN_EXTRA_BYTES / sizeof(float) : 0));
    std::vector<double> y_ref(batch_size());
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      if (inplace()) {
        std::generate(y.begin(), y.end(), [&]() { return f32dist(rng); });
      } else {
        std::generate(x.begin(), x.end(), [&]() { return f32dist(rng); });
        std::fill(y.begin(), y.end(), nanf(""));
      }
      const float* x_data = inplace() ? y.data() : x.data();

      // Compute reference results.
      for (size_t i = 0; i < batch_size(); i++) {
        const double e = std::exp(double(x_data[i]));
        y_ref[i] = e / (1.0 + e);
      }

      // Prepare parameters.
      union xnn_f32_sigmoid_params params;
      init_params(&params);

      // Call optimized micro-kernel.
      vsigmoid(batch_size() * sizeof(float), x_data, y.data(), &params);

      // Verify results.
      for (size_t i = 0; i < batch_size(); i++) {
        ASSERT_NEAR(y[i], y_ref[i], std::max(5.0e-6, std::abs(y_ref[i]) * 1.0e-5))
          << "at " << i << " / " << batch_size() << ", x[" << i << "] = " << x[i];
      }
    }
  }

  void Test(xnn_f16_vsqr_ukernel_function vsqr, xnn_init_f16_default_params_fn init_params = nullptr) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    std::uniform_real_distribution<float> f32dist(-10.0f, 10.0f);

    std::vector<uint16_t> x(batch_size() + XNN_EXTRA_BYTES / sizeof(uint16_t));
    std::vector<uint16_t> y(batch_size() + (inplace() ? XNN_EXTRA_BYTES / sizeof(uint16_t) : 0));
    std::vector<float> y_ref(batch_size());
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      if (inplace()) {
        std::generate(y.begin(), y.end(), [&]() { return fp16_ieee_from_fp32_value(f32dist(rng)); });
      } else {
        std::generate(x.begin(), x.end(), [&]() { return fp16_ieee_from_fp32_value(f32dist(rng)); });
        std::fill(y.begin(), y.end(), UINT16_C(0x7E00) /* NaN */);
      }
      const uint16_t* x_data = inplace() ? y.data() : x.data();

      // Compute reference results.
      for (size_t i = 0; i < batch_size(); i++) {
        const float x_value = fp16_ieee_to_fp32_value(x_data[i]);
        y_ref[i] = x_value * x_value;
      }

      // Prepare parameters.
      union xnn_f16_default_params params;
      if (init_params != nullptr) {
        init_params(&params);
      }

      // Call optimized micro-kernel.
      vsqr(batch_size() * sizeof(uint16_t), x_data, y.data(), &params);

      // Verify results.
      for (size_t i = 0; i < batch_size(); i++) {
        ASSERT_NEAR(
            fp16_ieee_to_fp32_value(y[i]),
            y_ref[i],
            std::max(1.0e-4f, std::abs(y_ref[i]) * 5.0e-3f))
          << "at " << i << " / " << batch_size() << ", x[" << i << "] = " << fp16_ieee_to_fp32_value(x[i]);
      }
    }
  }

  void Test(xnn_f32_vsqr_ukernel_function vsqr, xnn_init_f32_default_params_fn init_params = nullptr) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    std::uniform_real_distribution<float> f32dist(-10.0f, 10.0f);

    std::vector<float> x(batch_size() + XNN_EXTRA_BYTES / sizeof(float));
    std::vector<float> y(batch_size() + (inplace() ? XNN_EXTRA_BYTES / sizeof(float) : 0));
    std::vector<float> y_ref(batch_size());
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      if (inplace()) {
        std::generate(y.begin(), y.end(), [&]() { return f32dist(rng); });
      } else {
        std::generate(x.begin(), x.end(), [&]() { return f32dist(rng); });
        std::fill(y.begin(), y.end(), nanf(""));
      }
      const float* x_data = inplace() ? y.data() : x.data();

      // Compute reference results.
      for (size_t i = 0; i < batch_size(); i++) {
        y_ref[i] = x_data[i] * x_data[i];
      }

      // Prepare parameters.
      union xnn_f32_default_params params;
      if (init_params != nullptr) {
        init_params(&params);
      }

      // Call optimized micro-kernel.
      vsqr(batch_size() * sizeof(float), x_data, y.data(), &params);

      // Verify results.
      for (size_t i = 0; i < batch_size(); i++) {
        ASSERT_EQ(y[i], y_ref[i])
          << "at " << i << " / " << batch_size() << ", x[" << i << "] = " << x[i];
      }
    }
  }

  void Test(xnn_f16_vsqrt_ukernel_function vsqrt, xnn_init_f16_sqrt_params_fn init_params = nullptr) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    std::uniform_real_distribution<float> f32dist(0.0f, 10.0f);

    std::vector<uint16_t> x(batch_size() + XNN_EXTRA_BYTES / sizeof(uint16_t));
    std::vector<uint16_t> y(batch_size() + (inplace() ? XNN_EXTRA_BYTES / sizeof(uint16_t) : 0));
    std::vector<float> y_ref(batch_size());
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      if (inplace()) {
        std::generate(y.begin(), y.end(), [&]() { return fp16_ieee_from_fp32_value(f32dist(rng)); });
      } else {
        std::generate(x.begin(), x.end(), [&]() { return fp16_ieee_from_fp32_value(f32dist(rng)); });
        std::fill(y.begin(), y.end(), UINT16_C(0x7E00) /* NaN */);
      }
      const uint16_t* x_data = inplace() ? y.data() : x.data();

      // Compute reference results.
      for (size_t i = 0; i < batch_size(); i++) {
        y_ref[i] = std::sqrt(fp16_ieee_to_fp32_value(x_data[i]));
      }

      // Prepare parameters.
      union xnn_f16_sqrt_params params;
      if (init_params != nullptr) {
        init_params(&params);
      }

      // Call optimized micro-kernel.
      vsqrt(batch_size() * sizeof(uint16_t), x_data, y.data(), init_params != nullptr ? &params : nullptr);

      // Verify results.
      for (size_t i = 0; i < batch_size(); i++) {
        ASSERT_NEAR(
            fp16_ieee_to_fp32_value(y[i]),
            y_ref[i],
            std::max(1.0e-4f, std::abs(y_ref[i]) * 5.0e-3f))
          << "at " << i << " / " << batch_size() << ", x[" << i << "] = " << fp16_ieee_to_fp32_value(x[i]);
      }
    }
  }

  void Test(xnn_f32_vsqrt_ukernel_function vsqrt, xnn_init_f32_sqrt_params_fn init_params = nullptr) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    std::uniform_real_distribution<float> f32dist(0.0f, 10.0f);

    std::vector<float> x(batch_size() + XNN_EXTRA_BYTES / sizeof(float));
    std::vector<float> y(batch_size() + (inplace() ? XNN_EXTRA_BYTES / sizeof(float) : 0));
    std::vector<float> y_ref(batch_size());
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      if (inplace()) {
        std::generate(y.begin(), y.end(), [&]() { return f32dist(rng); });
      } else {
        std::generate(x.begin(), x.end(), [&]() { return f32dist(rng); });
        std::fill(y.begin(), y.end(), nanf(""));
      }
      const float* x_data = inplace() ? y.data() : x.data();

      // Compute reference results.
      for (size_t i = 0; i < batch_size(); i++) {
        y_ref[i] = std::sqrt(x_data[i]);
      }

      // Prepare parameters.
      union xnn_f32_sqrt_params params;
      if (init_params != nullptr) {
        init_params(&params);
      }

      // Call optimized micro-kernel.
      vsqrt(batch_size() * sizeof(float), x_data, y.data(), init_params != nullptr ? &params : nullptr);

      // Verify results.
      for (size_t i = 0; i < batch_size(); i++) {
        ASSERT_EQ(y[i], y_ref[i])
          << "at " << i << " / " << batch_size() << ", x[" << i << "] = " << x[i];
      }
    }
  }

  void Test(xnn_f16_vclamp_ukernel_function vclamp, xnn_init_f16_minmax_params_fn init_params) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    auto f32rng = std::bind(std::uniform_real_distribution<float>(0.0f, 255.0f), std::ref(rng));
    auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);

    std::vector<uint16_t> x(batch_size() + XNN_EXTRA_BYTES / sizeof(uint16_t));
    std::vector<uint16_t> y(batch_size() + (inplace() ? XNN_EXTRA_BYTES / sizeof(uint16_t) : 0));
    std::vector<float> y_ref(batch_size());
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      std::generate(x.begin(), x.end(), std::ref(f16rng));
      if (inplace()) {
        std::generate(y.begin(), y.end(), std::ref(f16rng));
      } else {
        std::fill(y.begin(), y.end(), UINT16_C(0x7E00) /* NaN */);
      }
      const uint16_t* x_data = inplace() ? y.data() : x.data();

      // Compute reference results.
      for (size_t i = 0; i < batch_size(); i++) {
        y_ref[i] = std::max(std::min(fp16_ieee_to_fp32_value(x_data[i]), float(qmax())), float(qmin()));
      }

      // Prepare parameters.
      union xnn_f16_minmax_params params;
      init_params(&params, fp16_ieee_from_fp32_value(float(qmin())), fp16_ieee_from_fp32_value(float(qmax())));

      // Call optimized micro-kernel.
      vclamp(batch_size() * sizeof(uint16_t), x_data, y.data(), &params);

      // Verify results.
      for (size_t i = 0; i < batch_size(); i++) {
        ASSERT_NEAR(y_ref[i], fp16_ieee_to_fp32_value(y[i]), std::max(1.0e-3f, std::abs(y_ref[i]) * 1.0e-2f))
          << "at " << i << " / " << batch_size() << ", x[" << i << "] = " << fp16_ieee_to_fp32_value(x[i]);
      }
    }
  }

  void Test(xnn_s8_vclamp_ukernel_function vclamp, xnn_init_s8_minmax_params_fn init_params) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    auto i8rng = std::bind(
      std::uniform_int_distribution<int32_t>(std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max()),
      std::ref(rng));

    std::vector<int8_t> x(batch_size() + XNN_EXTRA_BYTES / sizeof(int8_t));
    std::vector<int8_t> y(batch_size() + (inplace() ? XNN_EXTRA_BYTES / sizeof(int8_t) : 0));
    std::vector<int8_t> y_ref(batch_size());
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      std::generate(x.begin(), x.end(), std::ref(i8rng));
      if (inplace()) {
        std::copy(x.cbegin(), x.cend(), y.begin());
      } else {
        std::fill(y.begin(), y.end(), INT8_C(0xA5));
      }
      const int8_t* x_data = inplace() ? y.data() : x.data();

      // Compute reference results.
      for (size_t i = 0; i < batch_size(); i++) {
        y_ref[i] = std::min(std::max(x_data[i], int8_t(qmin() - 0x80)), int8_t(qmax() - 0x80));
      }

      // Prepare parameters.
      union xnn_s8_minmax_params params;
      init_params(&params, int8_t(qmin() - 0x80), int8_t(qmax() - 0x80));

      // Call optimized micro-kernel.
      vclamp(batch_size() * sizeof(int8_t), x_data, y.data(), &params);

      // Verify results.
      for (size_t i = 0; i < batch_size(); i++) {
        ASSERT_EQ(int32_t(y_ref[i]), int32_t(y[i]))
          << "at " << i << " / " << batch_size() << ", x[" << i << "] = " << int32_t(x[i]);
      }
    }
  }

  void Test(xnn_u8_vclamp_ukernel_function vclamp, xnn_init_u8_minmax_params_fn init_params) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    auto u8rng = std::bind(
      std::uniform_int_distribution<int32_t>(0, std::numeric_limits<uint8_t>::max()), std::ref(rng));

    std::vector<uint8_t> x(batch_size() + XNN_EXTRA_BYTES / sizeof(uint8_t));
    std::vector<uint8_t> y(batch_size() + (inplace() ? XNN_EXTRA_BYTES / sizeof(uint8_t) : 0));
    std::vector<uint8_t> y_ref(batch_size());
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      std::generate(x.begin(), x.end(), std::ref(u8rng));
      if (inplace()) {
        std::copy(x.cbegin(), x.cend(), y.begin());
      } else {
        std::fill(y.begin(), y.end(), UINT8_C(0xA5));
      }
      const uint8_t* x_data = inplace() ? y.data() : x.data();

      // Compute reference results.
      for (size_t i = 0; i < batch_size(); i++) {
        y_ref[i] = std::min(std::max(x_data[i], qmin()), qmax());
      }

      // Prepare parameters.
      union xnn_u8_minmax_params params;
      init_params(&params, qmin(), qmax());

      // Call optimized micro-kernel.
      vclamp(batch_size() * sizeof(uint8_t), x_data, y.data(), &params);

      // Verify results.
      for (size_t i = 0; i < batch_size(); i++) {
        ASSERT_EQ(uint32_t(y_ref[i]), uint32_t(y[i]))
          << "at " << i << " / " << batch_size() << ", x[" << i << "] = " << uint32_t(x[i]);
      }
    }
  }

  void Test(xnn_u64_u32_vsqrtshift_ukernel_function vsqrtshift) const {
    ASSERT_FALSE(inplace());

    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    auto u64rng = std::bind(std::uniform_int_distribution<uint64_t>(), std::ref(rng));

    std::vector<uint64_t> x(batch_size() + XNN_EXTRA_BYTES / sizeof(uint64_t));
    std::vector<uint32_t> y(batch_size());
    std::vector<uint32_t> y_ref(batch_size());
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      std::generate(x.begin(), x.end(), std::ref(u64rng));
      std::fill(y.begin(), y.end(), UINT32_C(0xDEADBEEF));

      // Compute reference results.
      for (size_t i = 0; i < batch_size(); i++) {
        const uint64_t x_value = x[i];
        uint32_t y_value = 0;
        // Match TFLM semantics, including bugs
        if (uint32_t(x_value) == x_value) {
          y_value = (uint32_t) std::lrint(std::sqrt(double(int64_t(uint64_t(x_value)))));
          y_value = std::min<uint32_t>(y_value, std::numeric_limits<uint16_t>::max());
        } else if (x_value != 0) {
          uint64_t y0 = x_value >> 1;
          uint64_t y1 = (y0 + x_value / y0) >> 1;
          do {
            y0 = y1;
            y1 = (y0 + x_value / y0) >> 1;
          } while (y1 < y0);

          // y0 is sqrt(x_value) rounded down, round up if needed
          if (int64_t(y0 * y0 + y0 - x_value) < 0) {
            y0 += 1;
          }
          y_value = static_cast<uint32_t>(std::min<uint64_t>(y0, std::numeric_limits<uint32_t>::max()));
        }
        y_ref[i] = y_value >> shift();
      }

      // Call optimized micro-kernel.
      vsqrtshift(batch_size() * sizeof(uint64_t), x.data(), y.data(), shift());

      // Verify results.
      for (size_t i = 0; i < batch_size(); i++) {
        ASSERT_EQ(y_ref[i], y[i])
          << "at " << i << " / " << batch_size()
          << ", x[" << i << "]: " << x[i]
          << ", shift: " << shift();
      }
    }
  }

 private:
  size_t batch_size_ = 1;
  bool inplace_ = false;
  float slope_ = 0.5f;
  float prescale_ = 1.0f;
  float alpha_ = 1.0f;
  float beta_ = 1.0f;
  uint32_t shift_ = 1;
  uint8_t qmin_ = 0;
  uint8_t qmax_ = 255;
  size_t iterations_ = 15;
};
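
// Illustrative sketch of a gtest case built on this tester. The microkernel
// symbol and its assumed vector width (4 elements) are example values only;
// real tests pair each kernel with its actual tile size and any architecture
// guards it requires.
//
//   TEST(F32_VRELU__SCALAR_X4, batch_div_4) {
//     for (size_t batch_size = 8; batch_size <= 64; batch_size += 4) {
//       VUnaryMicrokernelTester()
//         .batch_size(batch_size)
//         .Test(xnn_f32_vrelu_ukernel__scalar_x4);
//     }
//   }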