// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#pragma once

#include <gtest/gtest.h>

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <functional>
#include <limits>
#include <random>
#include <vector>

#include <fp16.h>

#include <xnnpack.h>
#include <xnnpack/params-init.h>
#include <xnnpack/params.h>


class VUnaryMicrokernelTester {
 public:
  enum class OpType {
    ReLU,
    RoundToNearestEven,
    RoundTowardsZero,
    RoundUp,
    RoundDown,
  };

  enum class Variant {
    Native,
    Scalar,
  };

  inline VUnaryMicrokernelTester& batch_size(size_t batch_size) {
    assert(batch_size != 0);
    this->batch_size_ = batch_size;
    return *this;
  }

  inline size_t batch_size() const {
    return this->batch_size_;
  }

  inline VUnaryMicrokernelTester& inplace(bool inplace) {
    this->inplace_ = inplace;
    return *this;
  }

  inline bool inplace() const {
    return this->inplace_;
  }

  inline VUnaryMicrokernelTester& slope(float slope) {
    this->slope_ = slope;
    return *this;
  }

  inline float slope() const {
    return this->slope_;
  }

  inline VUnaryMicrokernelTester& prescale(float prescale) {
    this->prescale_ = prescale;
    return *this;
  }

  inline float prescale() const {
    return this->prescale_;
  }

  inline VUnaryMicrokernelTester& alpha(float alpha) {
    this->alpha_ = alpha;
    return *this;
  }

  inline float alpha() const {
    return this->alpha_;
  }

  inline VUnaryMicrokernelTester& beta(float beta) {
    this->beta_ = beta;
    return *this;
  }

  inline float beta() const {
    return this->beta_;
  }

  inline VUnaryMicrokernelTester& qmin(uint8_t qmin) {
    this->qmin_ = qmin;
    return *this;
  }

  inline uint8_t qmin() const {
    return this->qmin_;
  }

  inline VUnaryMicrokernelTester& qmax(uint8_t qmax) {
    this->qmax_ = qmax;
    return *this;
  }

  inline uint8_t qmax() const {
    return this->qmax_;
  }

  inline VUnaryMicrokernelTester& iterations(size_t iterations) {
    this->iterations_ = iterations;
    return *this;
  }

  inline size_t iterations() const {
    return this->iterations_;
  }

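  // Each Test() overload below follows the same pattern: fill the input (or
  // the output buffer when testing in-place) with random values, compute a
  // reference result on the host (in double precision for transcendental ops),
  // initialize the kernel parameters, invoke the micro-kernel with the batch
  // size expressed in bytes, and compare against the reference, either exactly
  // or within a mixed absolute/relative tolerance.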
  void Test(xnn_f32_vunary_ukernel_function vunary, OpType op_type, Variant variant = Variant::Native) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    auto distribution = std::uniform_real_distribution<float>(-125.0f, 125.0f);
    auto f32rng = std::bind(distribution, std::ref(rng));

    std::vector<float> x(batch_size() + XNN_EXTRA_BYTES / sizeof(float));
    std::vector<float> y(batch_size() + (inplace() ? XNN_EXTRA_BYTES / sizeof(float) : 0));
    std::vector<double> y_ref(batch_size());
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      if (inplace()) {
        std::generate(y.begin(), y.end(), std::ref(f32rng));
      } else {
        std::generate(x.begin(), x.end(), std::ref(f32rng));
        std::fill(y.begin(), y.end(), nanf(""));
      }
      const float* x_data = inplace() ? y.data() : x.data();

      // Compute reference results.
      for (size_t i = 0; i < batch_size(); i++) {
        switch (op_type) {
          case OpType::ReLU:
            y_ref[i] = std::max(x_data[i], 0.0f);
            break;
          default:
            GTEST_FAIL() << "Unexpected operation type";
            return;
        }
      }

      // Call optimized micro-kernel.
      vunary(batch_size() * sizeof(float), x_data, y.data(), nullptr);

      // Verify results.
      for (size_t i = 0; i < batch_size(); i++) {
        ASSERT_NEAR(y[i], y_ref[i], std::max(5.0e-6, std::abs(y_ref[i]) * 1.0e-5))
          << "at " << i << " / " << batch_size() << ", x[" << i << "] = " << x[i];
      }
    }
  }

  void Test(xnn_f32_vabs_ukernel_function vabs, xnn_init_f32_abs_params_fn init_params = nullptr) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    auto f32rng = std::bind(std::uniform_real_distribution<float>(-1.0f, 1.0f), std::ref(rng));

    std::vector<float> x(batch_size() + XNN_EXTRA_BYTES / sizeof(float));
    std::vector<float> y(batch_size() + (inplace() ? XNN_EXTRA_BYTES / sizeof(float) : 0));
    std::vector<float> y_ref(batch_size());
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      if (inplace()) {
        std::generate(y.begin(), y.end(), std::ref(f32rng));
      } else {
        std::generate(x.begin(), x.end(), std::ref(f32rng));
        std::fill(y.begin(), y.end(), nanf(""));
      }
      const float* x_data = inplace() ? y.data() : x.data();

      // Compute reference results.
      for (size_t i = 0; i < batch_size(); i++) {
        y_ref[i] = std::abs(x_data[i]);
      }

      // Prepare parameters.
      union xnn_f32_abs_params params;
      if (init_params != nullptr) {
        init_params(&params);
      }

      // Call optimized micro-kernel.
      vabs(batch_size() * sizeof(float), x_data, y.data(), &params);

      // Verify results.
      for (size_t i = 0; i < batch_size(); i++) {
        ASSERT_EQ(y[i], y_ref[i])
          << "at " << i << " / " << batch_size() << ", x[" << i << "] = " << x[i];
      }
    }
  }

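  // CLAMP: the reference result clamps each input to [qmin(), qmax()], with
  // the bounds converted to float. ELU: the reference is computed in double as
  // alpha() * expm1(prescale() * x) for negative inputs and beta() * x
  // otherwise, and compared within tolerance.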
  void Test(xnn_f32_vclamp_ukernel_function vclamp, xnn_init_f32_minmax_params_fn init_params) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    auto f32rng = std::bind(std::uniform_real_distribution<float>(0.0f, 255.0f), std::ref(rng));

    std::vector<float> x(batch_size() + XNN_EXTRA_BYTES / sizeof(float));
    std::vector<float> y(batch_size() + (inplace() ? XNN_EXTRA_BYTES / sizeof(float) : 0));
    std::vector<float> y_ref(batch_size());
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      if (inplace()) {
        std::generate(y.begin(), y.end(), std::ref(f32rng));
      } else {
        std::generate(x.begin(), x.end(), std::ref(f32rng));
        std::fill(y.begin(), y.end(), nanf(""));
      }
      const float* x_data = inplace() ? y.data() : x.data();

      // Compute reference results.
      for (size_t i = 0; i < batch_size(); i++) {
        y_ref[i] = std::max(std::min(x_data[i], float(qmax())), float(qmin()));
      }

      // Prepare parameters.
      union xnn_f32_minmax_params params;
      init_params(&params, float(qmin()), float(qmax()));

      // Call optimized micro-kernel.
      vclamp(batch_size() * sizeof(float), x_data, y.data(), &params);

      // Verify results.
      for (size_t i = 0; i < batch_size(); i++) {
        ASSERT_EQ(y[i], y_ref[i])
          << "at " << i << " / " << batch_size() << ", x[" << i << "] = " << x[i];
      }
    }
  }

  void Test(xnn_f32_velu_ukernel_function velu, xnn_init_f32_elu_params_fn init_params) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    auto f32rng = std::bind(std::uniform_real_distribution<float>(-20.0f, 20.0f), std::ref(rng));

    std::vector<float> x(batch_size() + XNN_EXTRA_BYTES / sizeof(float));
    std::vector<float> y(batch_size() + (inplace() ? XNN_EXTRA_BYTES / sizeof(float) : 0));
    std::vector<double> y_ref(batch_size());
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      if (inplace()) {
        std::generate(y.begin(), y.end(), std::ref(f32rng));
      } else {
        std::generate(x.begin(), x.end(), std::ref(f32rng));
        std::fill(y.begin(), y.end(), nanf(""));
      }
      const float* x_data = inplace() ? y.data() : x.data();

      // Compute reference results.
      for (size_t i = 0; i < batch_size(); i++) {
        y_ref[i] = std::signbit(x_data[i]) ? alpha() * std::expm1(double(x_data[i]) * prescale()) : double(x_data[i]) * beta();
      }

      // Prepare parameters.
      union xnn_f32_elu_params params;
      init_params(&params, prescale(), alpha(), beta());

      // Call optimized micro-kernel.
      velu(batch_size() * sizeof(float), x_data, y.data(), &params);

      // Verify results.
      for (size_t i = 0; i < batch_size(); i++) {
        ASSERT_NEAR(y[i], y_ref[i], std::max(5.0e-6, std::abs(y_ref[i]) * 1.0e-5))
          << "at " << i << " / " << batch_size() << ", x[" << i << "] = " << x[i];
      }
    }
  }

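  // HSWISH: the reference is x / 6 * min(max(x + 3, 0), 6). LRELU: negative
  // inputs are scaled by slope(), non-negative inputs pass through unchanged,
  // and results must match the reference exactly.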
  void Test(xnn_f32_vhswish_ukernel_function vhswish, xnn_init_f32_hswish_params_fn init_params) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    auto f32rng = std::bind(std::uniform_real_distribution<float>(-4.0f, 4.0f), std::ref(rng));

    std::vector<float> x(batch_size() + XNN_EXTRA_BYTES / sizeof(float));
    std::vector<float> y(batch_size() + (inplace() ? XNN_EXTRA_BYTES / sizeof(float) : 0));
    std::vector<double> y_ref(batch_size());
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      if (inplace()) {
        std::generate(y.begin(), y.end(), std::ref(f32rng));
      } else {
        std::generate(x.begin(), x.end(), std::ref(f32rng));
        std::fill(y.begin(), y.end(), nanf(""));
      }
      const float* x_data = inplace() ? y.data() : x.data();

      // Compute reference results.
      for (size_t i = 0; i < batch_size(); i++) {
        y_ref[i] = (x_data[i] / 6.0f) * std::max(std::min(x_data[i] + 3.0f, 6.0f), 0.0f);
      }

      // Prepare parameters.
      union xnn_f32_hswish_params params;
      init_params(&params);

      // Call optimized micro-kernel.
      vhswish(batch_size() * sizeof(float), x_data, y.data(), &params);

      // Verify results.
      for (size_t i = 0; i < batch_size(); i++) {
        ASSERT_NEAR(y[i], y_ref[i], std::max(5.0e-6, std::abs(y_ref[i]) * 1.0e-5))
          << "at " << i << " / " << batch_size() << ", x[" << i << "] = " << x[i];
      }
    }
  }

  void Test(xnn_f32_vlrelu_ukernel_function vlrelu, xnn_init_f32_lrelu_params_fn init_params) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    auto f32rng = std::bind(std::uniform_real_distribution<float>(-125.0f, 125.0f), std::ref(rng));

    std::vector<float> x(batch_size() + XNN_EXTRA_BYTES / sizeof(float));
    std::vector<float> y(batch_size() + (inplace() ? XNN_EXTRA_BYTES / sizeof(float) : 0));
    std::vector<double> y_ref(batch_size());
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      if (inplace()) {
        std::generate(y.begin(), y.end(), std::ref(f32rng));
      } else {
        std::generate(x.begin(), x.end(), std::ref(f32rng));
        std::fill(y.begin(), y.end(), nanf(""));
      }
      const float* x_data = inplace() ? y.data() : x.data();

      // Compute reference results.
      for (size_t i = 0; i < batch_size(); i++) {
        y_ref[i] = std::signbit(x_data[i]) ? x_data[i] * slope() : x_data[i];
      }

      // Prepare parameters.
      union xnn_f32_lrelu_params params;
      init_params(&params, slope());

      // Call optimized micro-kernel.
      vlrelu(batch_size() * sizeof(float), x_data, y.data(), &params);

      // Verify results.
      for (size_t i = 0; i < batch_size(); i++) {
        ASSERT_EQ(y[i], y_ref[i])
          << "at " << i << " / " << batch_size() << ", x[" << i << "] = " << x[i];
      }
    }
  }

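  // NEG results are checked exactly. The rounding tests select the libm
  // reference matching the OpType: nearbyint (to nearest even), trunc
  // (towards zero), ceil (up), or floor (down).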
  void Test(xnn_f32_vneg_ukernel_function vneg, xnn_init_f32_neg_params_fn init_params = nullptr) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    auto f32rng = std::bind(std::uniform_real_distribution<float>(-1.0f, 1.0f), std::ref(rng));

    std::vector<float> x(batch_size() + XNN_EXTRA_BYTES / sizeof(float));
    std::vector<float> y(batch_size() + (inplace() ? XNN_EXTRA_BYTES / sizeof(float) : 0));
    std::vector<float> y_ref(batch_size());
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      if (inplace()) {
        std::generate(y.begin(), y.end(), std::ref(f32rng));
      } else {
        std::generate(x.begin(), x.end(), std::ref(f32rng));
        std::fill(y.begin(), y.end(), nanf(""));
      }
      const float* x_data = inplace() ? y.data() : x.data();

      // Compute reference results.
      for (size_t i = 0; i < batch_size(); i++) {
        y_ref[i] = -x_data[i];
      }

      // Prepare parameters.
      union xnn_f32_neg_params params;
      if (init_params != nullptr) {
        init_params(&params);
      }

      // Call optimized micro-kernel.
      vneg(batch_size() * sizeof(float), x_data, y.data(), &params);

      // Verify results.
      for (size_t i = 0; i < batch_size(); i++) {
        ASSERT_EQ(y[i], y_ref[i])
          << "at " << i << " / " << batch_size() << ", x[" << i << "] = " << x[i];
      }
    }
  }

  void Test(xnn_f32_vround_ukernel_function vrnd, OpType op_type, xnn_init_f32_rnd_params_fn init_params = nullptr) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    auto distribution = std::uniform_real_distribution<float>(-5.0f, 5.0f);
    auto f32rng = std::bind(distribution, std::ref(rng));

    std::vector<float> x(batch_size() + XNN_EXTRA_BYTES / sizeof(float));
    std::vector<float> y(batch_size() + (inplace() ? XNN_EXTRA_BYTES / sizeof(float) : 0));
    std::vector<float> y_ref(batch_size());
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      if (inplace()) {
        std::generate(y.begin(), y.end(), std::ref(f32rng));
      } else {
        std::generate(x.begin(), x.end(), std::ref(f32rng));
        std::fill(y.begin(), y.end(), nanf(""));
      }
      const float* x_data = inplace() ? y.data() : x.data();

      // Compute reference results.
      for (size_t i = 0; i < batch_size(); i++) {
        switch (op_type) {
          case OpType::RoundToNearestEven:
            y_ref[i] = std::nearbyint(double(x_data[i]));
            break;
          case OpType::RoundTowardsZero:
            y_ref[i] = std::trunc(double(x_data[i]));
            break;
          case OpType::RoundUp:
            y_ref[i] = std::ceil(double(x_data[i]));
            break;
          case OpType::RoundDown:
            y_ref[i] = std::floor(double(x_data[i]));
            break;
          default:
            GTEST_FAIL() << "Unexpected operation type";
            return;
        }
      }

      // Prepare parameters.
      xnn_f32_rnd_params params;
      if (init_params != nullptr) {
        init_params(&params);
      }

      // Call optimized micro-kernel.
      vrnd(batch_size() * sizeof(float), x_data, y.data(), &params);

      // Verify results.
      for (size_t i = 0; i < batch_size(); i++) {
        ASSERT_EQ(y[i], y_ref[i])
          << "at " << i << " / " << batch_size() << ", x[" << i << "] = " << x[i];
      }
    }
  }

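  // SIGMOID: the reference is computed in double as e^x / (1 + e^x) and
  // compared within tolerance. SQR results are checked exactly.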
  void Test(xnn_f32_vsigmoid_ukernel_function vsigmoid, xnn_init_f32_sigmoid_params_fn init_params) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    auto distribution = std::uniform_real_distribution<float>(-125.0f, 125.0f);
    auto f32rng = std::bind(distribution, std::ref(rng));

    std::vector<float> x(batch_size() + XNN_EXTRA_BYTES / sizeof(float));
    std::vector<float> y(batch_size() + (inplace() ? XNN_EXTRA_BYTES / sizeof(float) : 0));
    std::vector<double> y_ref(batch_size());
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      if (inplace()) {
        std::generate(y.begin(), y.end(), std::ref(f32rng));
      } else {
        std::generate(x.begin(), x.end(), std::ref(f32rng));
        std::fill(y.begin(), y.end(), nanf(""));
      }
      const float* x_data = inplace() ? y.data() : x.data();

      // Compute reference results.
      for (size_t i = 0; i < batch_size(); i++) {
        const double e = std::exp(double(x_data[i]));
        y_ref[i] = e / (1.0 + e);
      }

      // Prepare parameters.
      union xnn_f32_sigmoid_params params;
      init_params(&params);

      // Call optimized micro-kernel.
      vsigmoid(batch_size() * sizeof(float), x_data, y.data(), &params);

      // Verify results.
      for (size_t i = 0; i < batch_size(); i++) {
        ASSERT_NEAR(y[i], y_ref[i], std::max(5.0e-6, std::abs(y_ref[i]) * 1.0e-5))
          << "at " << i << " / " << batch_size() << ", x[" << i << "] = " << x[i];
      }
    }
  }

  void Test(xnn_f32_vsqr_ukernel_function vsqr, xnn_init_f32_default_params_fn init_params = nullptr) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    auto f32rng = std::bind(std::uniform_real_distribution<float>(-10.0f, 10.0f), std::ref(rng));

    std::vector<float> x(batch_size() + XNN_EXTRA_BYTES / sizeof(float));
    std::vector<float> y(batch_size() + (inplace() ? XNN_EXTRA_BYTES / sizeof(float) : 0));
    std::vector<float> y_ref(batch_size());
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      if (inplace()) {
        std::generate(y.begin(), y.end(), std::ref(f32rng));
      } else {
        std::generate(x.begin(), x.end(), std::ref(f32rng));
        std::fill(y.begin(), y.end(), nanf(""));
      }
      const float* x_data = inplace() ? y.data() : x.data();

      // Compute reference results.
      for (size_t i = 0; i < batch_size(); i++) {
        y_ref[i] = x_data[i] * x_data[i];
      }

      // Prepare parameters.
      union xnn_f32_default_params params;
      if (init_params != nullptr) {
        init_params(&params);
      }

      // Call optimized micro-kernel.
      vsqr(batch_size() * sizeof(float), x_data, y.data(), &params);

      // Verify results.
      for (size_t i = 0; i < batch_size(); i++) {
        ASSERT_EQ(y[i], y_ref[i])
          << "at " << i << " / " << batch_size() << ", x[" << i << "] = " << x[i];
      }
    }
  }

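  // SQRT results are checked exactly against std::sqrt; the params pointer is
  // forwarded to the kernel only when an init function is supplied. The
  // overloads that follow adapt typed kernel pointers to the generic
  // xnn_f32_vunary_ukernel_function entry point.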
  void Test(xnn_f32_vsqrt_ukernel_function vsqrt, xnn_init_f32_sqrt_params_fn init_params = nullptr) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    auto f32rng = std::bind(std::uniform_real_distribution<float>(0.0f, 10.0f), std::ref(rng));

    std::vector<float> x(batch_size() + XNN_EXTRA_BYTES / sizeof(float));
    std::vector<float> y(batch_size() + (inplace() ? XNN_EXTRA_BYTES / sizeof(float) : 0));
    std::vector<float> y_ref(batch_size());
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      if (inplace()) {
        std::generate(y.begin(), y.end(), std::ref(f32rng));
      } else {
        std::generate(x.begin(), x.end(), std::ref(f32rng));
        std::fill(y.begin(), y.end(), nanf(""));
      }
      const float* x_data = inplace() ? y.data() : x.data();

      // Compute reference results.
      for (size_t i = 0; i < batch_size(); i++) {
        y_ref[i] = std::sqrt(x_data[i]);
      }

      // Prepare parameters.
      union xnn_f32_sqrt_params params;
      if (init_params != nullptr) {
        init_params(&params);
      }

      // Call optimized micro-kernel.
      vsqrt(batch_size() * sizeof(float), x_data, y.data(), init_params != nullptr ? &params : nullptr);

      // Verify results.
      for (size_t i = 0; i < batch_size(); i++) {
        ASSERT_EQ(y[i], y_ref[i])
          << "at " << i << " / " << batch_size() << ", x[" << i << "] = " << x[i];
      }
    }
  }

  inline void Test(xnn_f32_vabs_ukernel_function vunary, OpType op_type, Variant variant = Variant::Native) const {
    Test(xnn_f32_vunary_ukernel_function(vunary), op_type, variant);
  }

  inline void Test(xnn_f32_velu_ukernel_function vunary, OpType op_type, Variant variant = Variant::Native) const {
    Test(xnn_f32_vunary_ukernel_function(vunary), op_type, variant);
  }

  inline void Test(xnn_f32_vneg_ukernel_function vunary, OpType op_type, Variant variant = Variant::Native) const {
    Test(xnn_f32_vunary_ukernel_function(vunary), op_type, variant);
  }

  inline void Test(xnn_f32_vrelu_ukernel_function vunary, OpType op_type, Variant variant = Variant::Native) const {
    Test(xnn_f32_vunary_ukernel_function(vunary), op_type, variant);
  }

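  // The F16 tests keep half-precision values in uint16_t buffers, converting
  // through fp16_ieee_from_fp32_value / fp16_ieee_to_fp32_value, and use a
  // looser tolerance (1.0e-3 absolute, 1.0e-2 relative).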
  void Test(xnn_f16_vclamp_ukernel_function vclamp, xnn_init_f16_minmax_params_fn init_params) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    auto f32rng = std::bind(std::uniform_real_distribution<float>(0.0f, 255.0f), std::ref(rng));
    auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);

    std::vector<uint16_t> x(batch_size() + XNN_EXTRA_BYTES / sizeof(uint16_t));
    std::vector<uint16_t> y(batch_size() + (inplace() ? XNN_EXTRA_BYTES / sizeof(uint16_t) : 0));
    std::vector<float> y_ref(batch_size());
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      std::generate(x.begin(), x.end(), std::ref(f16rng));
      if (inplace()) {
        std::generate(y.begin(), y.end(), std::ref(f16rng));
      } else {
        std::fill(y.begin(), y.end(), UINT16_C(0x7E00) /* NaN */);
      }
      const uint16_t* x_data = inplace() ? y.data() : x.data();

      // Compute reference results.
      for (size_t i = 0; i < batch_size(); i++) {
        y_ref[i] = std::max(std::min(fp16_ieee_to_fp32_value(x_data[i]), float(qmax())), float(qmin()));
      }

      // Prepare parameters.
      union xnn_f16_minmax_params params;
      init_params(&params, fp16_ieee_from_fp32_value(float(qmin())), fp16_ieee_from_fp32_value(float(qmax())));

      // Call optimized micro-kernel.
      vclamp(batch_size() * sizeof(uint16_t), x_data, y.data(), &params);

      // Verify results.
      for (size_t i = 0; i < batch_size(); i++) {
        ASSERT_NEAR(y_ref[i], fp16_ieee_to_fp32_value(y[i]), std::max(1.0e-3f, std::abs(y_ref[i]) * 1.0e-2f))
          << "at " << i << " / " << batch_size() << ", x[" << i << "] = " << fp16_ieee_to_fp32_value(x[i]);
      }
    }
  }

  void Test(xnn_f16_vhswish_ukernel_function vhswish, xnn_init_f16_hswish_params_fn init_params) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    auto f32rng = std::bind(std::uniform_real_distribution<float>(-4.0f, 4.0f), std::ref(rng));
    auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);

    std::vector<uint16_t> x(batch_size() + XNN_EXTRA_BYTES / sizeof(uint16_t));
    std::vector<uint16_t> y(batch_size() + (inplace() ? XNN_EXTRA_BYTES / sizeof(uint16_t) : 0));
    std::vector<float> y_ref(batch_size());
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      std::generate(x.begin(), x.end(), std::ref(f16rng));
      if (inplace()) {
        std::generate(y.begin(), y.end(), std::ref(f16rng));
      } else {
        std::fill(y.begin(), y.end(), UINT16_C(0x7E00) /* NaN */);
      }
      const uint16_t* x_data = inplace() ? y.data() : x.data();

      // Compute reference results.
      for (size_t i = 0; i < batch_size(); i++) {
        const float x_value = fp16_ieee_to_fp32_value(x_data[i]);
        y_ref[i] = (x_value / 6.0f) * std::max(std::min(x_value + 3.0f, 6.0f), 0.0f);
      }

      // Prepare parameters.
      union xnn_f16_hswish_params params;
      init_params(&params);

      // Call optimized micro-kernel.
      vhswish(batch_size() * sizeof(uint16_t), x_data, y.data(), &params);

      // Verify results.
      for (size_t i = 0; i < batch_size(); i++) {
        ASSERT_NEAR(y_ref[i], fp16_ieee_to_fp32_value(y[i]), std::max(1.0e-3f, std::abs(y_ref[i]) * 1.0e-2f))
          << "at " << i << " / " << batch_size() << ", x[" << i << "] = " << fp16_ieee_to_fp32_value(x[i]);
      }
    }
  }

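  // The S8 clamp test shifts qmin()/qmax() by -0x80 to map the unsigned
  // [0, 255] settings onto the signed int8 range; the U8 test uses them
  // directly.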
  void Test(xnn_s8_vclamp_ukernel_function vclamp, xnn_init_s8_minmax_params_fn init_params) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    auto i8rng = std::bind(
      std::uniform_int_distribution<int32_t>(std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max()),
      std::ref(rng));

    std::vector<int8_t> x(batch_size() + XNN_EXTRA_BYTES / sizeof(int8_t));
    std::vector<int8_t> y(batch_size() + (inplace() ? XNN_EXTRA_BYTES / sizeof(int8_t) : 0));
    std::vector<int8_t> y_ref(batch_size());
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      std::generate(x.begin(), x.end(), std::ref(i8rng));
      if (inplace()) {
        std::copy(x.cbegin(), x.cend(), y.begin());
      } else {
        std::fill(y.begin(), y.end(), INT8_C(0xA5));
      }
      const int8_t* x_data = inplace() ? y.data() : x.data();

      // Compute reference results.
      for (size_t i = 0; i < batch_size(); i++) {
        y_ref[i] = std::min(std::max(x_data[i], int8_t(qmin() - 0x80)), int8_t(qmax() - 0x80));
      }

      // Prepare parameters.
      union xnn_s8_minmax_params params;
      init_params(&params, int8_t(qmin() - 0x80), int8_t(qmax() - 0x80));

      // Call optimized micro-kernel.
      vclamp(batch_size() * sizeof(int8_t), x_data, y.data(), &params);

      // Verify results.
      for (size_t i = 0; i < batch_size(); i++) {
        ASSERT_EQ(int32_t(y_ref[i]), int32_t(y[i]))
          << "at " << i << " / " << batch_size() << ", x[" << i << "] = " << int32_t(x[i]);
      }
    }
  }

  void Test(xnn_u8_vclamp_ukernel_function vclamp, xnn_init_u8_minmax_params_fn init_params) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    auto u8rng = std::bind(
      std::uniform_int_distribution<int32_t>(0, std::numeric_limits<uint8_t>::max()), std::ref(rng));

    std::vector<uint8_t> x(batch_size() + XNN_EXTRA_BYTES / sizeof(uint8_t));
    std::vector<uint8_t> y(batch_size() + (inplace() ? XNN_EXTRA_BYTES / sizeof(uint8_t) : 0));
    std::vector<uint8_t> y_ref(batch_size());
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      std::generate(x.begin(), x.end(), std::ref(u8rng));
      if (inplace()) {
        std::copy(x.cbegin(), x.cend(), y.begin());
      } else {
        std::fill(y.begin(), y.end(), UINT8_C(0xA5));
      }
      const uint8_t* x_data = inplace() ? y.data() : x.data();

      // Compute reference results.
      for (size_t i = 0; i < batch_size(); i++) {
        y_ref[i] = std::min(std::max(x_data[i], qmin()), qmax());
      }

      // Prepare parameters.
      union xnn_u8_minmax_params params;
      init_params(&params, qmin(), qmax());

      // Call optimized micro-kernel.
      vclamp(batch_size() * sizeof(uint8_t), x_data, y.data(), &params);

      // Verify results.
      for (size_t i = 0; i < batch_size(); i++) {
        ASSERT_EQ(uint32_t(y_ref[i]), uint32_t(y[i]))
          << "at " << i << " / " << batch_size() << ", x[" << i << "] = " << uint32_t(x[i]);
      }
    }
  }

 private:
  size_t batch_size_ = 1;
  bool inplace_ = false;
  float slope_ = 0.5f;
  float prescale_ = 1.0f;
  float alpha_ = 1.0f;
  float beta_ = 1.0f;
  uint8_t qmin_ = 0;
  uint8_t qmax_ = 255;
  size_t iterations_ = 15;
};
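
// Example usage (illustrative sketch, not part of the tester): a gtest case
// typically configures the tester with the builder-style setters and then
// calls the Test() overload matching the micro-kernel under test. The kernel
// and parameter-initialization function names below are assumptions for the
// sake of the example and may differ per architecture and build.
//
//   TEST(F32_VCLAMP__SCALAR_X4, batch_div_4_with_qmin) {
//     for (size_t batch_size = 8; batch_size <= 64; batch_size += 4) {
//       VUnaryMicrokernelTester()
//         .batch_size(batch_size)
//         .qmin(128)
//         .Test(xnn_f32_vclamp_ukernel__scalar_x4, xnn_init_f32_minmax_scalar_params);
//     }
//   }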