// Copyright (c) Facebook, Inc. and its affiliates.
// All rights reserved.
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#pragma once

#include <gtest/gtest.h>

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstddef>
#include <cstdlib>
#include <functional>
#include <limits>
#include <random>
#include <vector>

#include <fp16.h>

#include <xnnpack.h>
#include <xnnpack/AlignedAllocator.h>
#include <xnnpack/pack.h>
#include <xnnpack/params-init.h>
#include <xnnpack/params.h>
#include <xnnpack/requantization.h>


class DWConvMicrokernelTester {
 public:
  enum class Variant {
    Native,
    Scalar,
  };

  inline DWConvMicrokernelTester& width(uint32_t width) {
    assert(width >= 1);
    this->width_ = width;
    return *this;
  }

  inline uint32_t width() const {
    return this->width_;
  }

  inline DWConvMicrokernelTester& step(uint32_t step) {
    assert(step >= 1);
    this->step_ = step;
    return *this;
  }

  inline uint32_t step() const {
    return this->step_;
  }

  inline DWConvMicrokernelTester& channels(uint32_t channels) {
    assert(channels >= 1);
    this->channels_ = channels;
    return *this;
  }

  inline uint32_t channels() const {
    return this->channels_;
  }

  inline DWConvMicrokernelTester& cr(uint32_t cr) {
    assert(cr != 0);
    this->cr_ = cr;
    return *this;
  }

  inline uint32_t cr() const {
    return this->cr_;
  }

  inline DWConvMicrokernelTester& kr(uint32_t kr) {
    assert(kr != 0);
    this->kr_ = kr;
    return *this;
  }

  inline uint32_t kr() const {
    return this->kr_;
  }

  inline uint32_t packed_channels() const {
    return (channels() / cr() + !!(channels() % cr())) * cr();
  }

  inline DWConvMicrokernelTester& output_stride(uint32_t output_stride) {
    assert(output_stride != 0);
    this->output_stride_ = output_stride;
    return *this;
  }

  inline uint32_t output_stride() const {
    if (this->output_stride_ == 0) {
      return channels();
    } else {
      assert(this->output_stride_ >= channels());
      return this->output_stride_;
    }
  }

  inline DWConvMicrokernelTester& input_zero_point(uint8_t input_zero_point) {
    this->input_zero_point_ = input_zero_point;
    return *this;
  }

  inline uint8_t input_zero_point() const {
    return this->input_zero_point_;
  }

  inline DWConvMicrokernelTester& kernel_zero_point(uint8_t kernel_zero_point) {
    this->kernel_zero_point_ = kernel_zero_point;
    return *this;
  }

  inline uint8_t kernel_zero_point() const {
    return this->kernel_zero_point_;
  }

  inline DWConvMicrokernelTester& qmin(uint8_t qmin) {
    this->qmin_ = qmin;
    return *this;
  }

  inline uint8_t qmin() const {
    return this->qmin_;
  }

  inline DWConvMicrokernelTester& qmax(uint8_t qmax) {
    this->qmax_ = qmax;
    return *this;
  }

  inline uint8_t qmax() const {
    return this->qmax_;
  }
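
  // The two knobs below are the least obvious ones; their semantics follow
  // directly from how the Test() methods use them. input_offset() emulates
  // micro-kernels that expect input pointers biased by -input_offset elements:
  // the indirection entries are set up with that bias, and the micro-kernel
  // (and the reference loops) add input_offset() back to every pointer that is
  // not the zero pointer. zero_index() selects which of the kr() taps of each
  // output pixel is redirected to the zero buffer, exercising the
  // implicit-padding path; SIZE_MAX (the default) disables the redirection.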

  inline DWConvMicrokernelTester& input_offset(size_t input_offset) {
    this->input_offset_ = input_offset;
    return *this;
  }

  inline size_t input_offset() const {
    return this->input_offset_;
  }

  inline DWConvMicrokernelTester& zero_index(size_t zero_index) {
    this->zero_index_ = zero_index;
    return *this;
  }

  inline size_t zero_index() const {
    return this->zero_index_;
  }

  inline DWConvMicrokernelTester& iterations(size_t iterations) {
    this->iterations_ = iterations;
    return *this;
  }

  inline size_t iterations() const {
    return this->iterations_;
  }

  void Test(xnn_qu8_dwconv_minmax_unipass_ukernel_function dwconv_minmax, Variant variant = Variant::Native) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    auto i32rng = std::bind(std::uniform_int_distribution<int32_t>(-10000, 10000), rng);
    auto u8rng = std::bind(std::uniform_int_distribution<uint32_t>(0, std::numeric_limits<uint8_t>::max()), rng);

    std::vector<const uint8_t*> indirection((width() - 1) * step() + kr());
    std::vector<uint8_t> input(XNN_EXTRA_BYTES / sizeof(uint8_t) + indirection.size() * channels());
    std::vector<uint8_t> kernel(channels() * kr());
    std::vector<int32_t> bias(channels());
    std::vector<uint8_t, AlignedAllocator<uint8_t, 64>> packed_weights((kr() + sizeof(int32_t) / sizeof(uint8_t)) * packed_channels());
    std::vector<uint8_t> zero(channels() + XNN_EXTRA_BYTES / sizeof(uint8_t));
    std::vector<uint8_t> output((width() - 1) * output_stride() + channels());
    std::vector<int32_t> accumulators(width() * channels());
    std::vector<uint8_t> output_ref(width() * channels());

    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      do {
        std::generate(input.begin(), input.end(), std::ref(u8rng));
      } while (input.size() > 1 && *std::max_element(input.cbegin(), input.cend()) == *std::min_element(input.cbegin(), input.cend()));
      do {
        std::generate(kernel.begin(), kernel.end(), std::ref(u8rng));
      } while (kernel.size() > 1 && *std::max_element(kernel.cbegin(), kernel.cend()) == *std::min_element(kernel.cbegin(), kernel.cend()));
      std::generate(bias.begin(), bias.end(), std::ref(i32rng));
      std::fill(zero.begin(), zero.end(), input_zero_point());
      std::fill(output.begin(), output.end(), 0xA5);

      std::fill(packed_weights.begin(), packed_weights.end(), 0);
      const xnn_qu8_packing_params packing_params = { input_zero_point(), kernel_zero_point() };
      xnn_pack_qu8_dwconv_ghw_w(
        kr(), 1, channels(), cr(),
        kernel.data(), bias.data(), packed_weights.data(), &packing_params);
      for (size_t i = 0; i < indirection.size(); i++) {
        indirection[i] = input.data() + i * channels() - input_offset();
      }
      std::shuffle(indirection.begin(), indirection.end(), rng);
      if (zero_index() != SIZE_MAX) {
        for (size_t i = 0; i < indirection.size(); i += kr()) {
          indirection[i + zero_index()] = zero.data();
        }
      }
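
      // Each indirection entry was pre-biased by -input_offset(), so once the
      // micro-kernel (and the reference loop below) adds input_offset() back,
      // entry i resolves to the start of its row:
      //   indirection[i][c + input_offset()] == input[i * channels() + c]
      // Entries replaced with zero.data() above are exempt from the bias and
      // model implicitly padded taps.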

      // Compute reference results, without renormalization.
      for (size_t x = 0; x < width(); x++) {
        for (size_t c = 0; c < channels(); c++) {
          int32_t acc = bias[c];
          for (size_t k = 0; k < kr(); k++) {
            if (indirection[x * step() + k] != zero.data()) {
              acc +=
                (int32_t(indirection[x * step() + k][c + input_offset()]) - int32_t(input_zero_point())) *
                (int32_t(kernel[c * kr() + k]) - int32_t(kernel_zero_point()));
            }
          }
          accumulators[x * channels() + c] = acc;
        }
      }

      // Compute renormalization parameters.
      const int32_t accumulated_min = *std::min_element(accumulators.cbegin(), accumulators.cend());
      const int32_t accumulated_max = *std::max_element(accumulators.cbegin(), accumulators.cend());
      const uint32_t accumulated_range = uint32_t(accumulated_max) - uint32_t(accumulated_min);
      const double output_scale = accumulated_range >= 256 ? double(accumulated_range) / 255.0 : 1.00001;
      const uint8_t output_zero_point = uint8_t(std::max(std::min(
        lrint(127.5 - 0.5 * double(accumulated_min + accumulated_max) / output_scale),
        long(std::numeric_limits<uint8_t>::max())), long(std::numeric_limits<uint8_t>::min())));

      // Prepare parameters.
      const float requantization_scale = 1.0f / float(output_scale);
      union xnn_qu8_gemm_params quantization_params = { };
      switch (variant) {
        case Variant::Native:
          quantization_params = xnn_init_qu8_gemm_params(
            kernel_zero_point(), requantization_scale, output_zero_point, qmin(), qmax());
          break;
        case Variant::Scalar:
          quantization_params = xnn_init_scalar_qu8_gemm_params(
            kernel_zero_point(), requantization_scale, output_zero_point, qmin(), qmax());
          break;
      }
      const union xnn_qu8_requantization_params scalar_requantization_params =
        xnn_init_scalar_qu8_requantization_params(requantization_scale, output_zero_point, qmin(), qmax());

      // Renormalize reference results.
      for (size_t x = 0; x < width(); x++) {
        for (size_t c = 0; c < channels(); c++) {
          output_ref[x * channels() + c] = xnn_qu8_requantize_q31(accumulators[x * channels() + c], scalar_requantization_params);
        }
      }
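
      // The micro-kernel takes its strides in bytes: the indirection buffer
      // advances by step() pointers per output pixel, the output pointer is
      // bumped by (output_stride() - channels()) elements after each pixel,
      // and input_offset is added to every input pointer that is not the
      // zero pointer.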
      // Call optimized micro-kernel.
      dwconv_minmax(
        channels(), width(),
        indirection.data(), packed_weights.data(), output.data(),
        step() * sizeof(void*),
        (output_stride() - channels()) * sizeof(uint8_t),
        input_offset() * sizeof(uint8_t), zero.data(),
        &quantization_params);

      // Verify results.
      for (size_t x = 0; x < width(); x++) {
        for (size_t c = 0; c < channels(); c++) {
          ASSERT_GE(uint32_t(output[x * output_stride() + c]), uint32_t(qmin()))
            << "x = " << x << ", channel = " << c;
          ASSERT_LE(uint32_t(output[x * output_stride() + c]), uint32_t(qmax()))
            << "x = " << x << ", channel = " << c;
          ASSERT_EQ(uint32_t(output[x * output_stride() + c]), uint32_t(output_ref[x * channels() + c]))
            << "x = " << x << ", channel = " << c << ", accumulator = " << accumulators[x * channels() + c];
        }
      }
    }
  }

  void Test(xnn_qs8_dwconv_minmax_unipass_ukernel_function dwconv_minmax, Variant variant = Variant::Native) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    auto i32rng = std::bind(std::uniform_int_distribution<int32_t>(-10000, 10000), rng);
    auto i8rng = std::bind(
      std::uniform_int_distribution<int32_t>(std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max()), rng);

    std::vector<const int8_t*> indirection((width() - 1) * step() + kr());
    std::vector<int8_t> input(XNN_EXTRA_BYTES / sizeof(int8_t) + indirection.size() * channels());
    std::vector<int8_t> kernel(channels() * kr());
    std::vector<int32_t> bias(channels());
    std::vector<int8_t, AlignedAllocator<int8_t, 64>> packed_weights((kr() + sizeof(int32_t) / sizeof(int8_t)) * packed_channels());
    std::vector<int8_t> zero(channels() + XNN_EXTRA_BYTES / sizeof(int8_t));
    std::vector<int8_t> output((width() - 1) * output_stride() + channels());
    std::vector<int32_t> accumulators(width() * channels());
    std::vector<int8_t> output_ref(width() * channels());

    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      do {
        std::generate(input.begin(), input.end(), std::ref(i8rng));
      } while (input.size() > 1 && *std::max_element(input.cbegin(), input.cend()) == *std::min_element(input.cbegin(), input.cend()));
      do {
        std::generate(kernel.begin(), kernel.end(), std::ref(i8rng));
      } while (kernel.size() > 1 && *std::max_element(kernel.cbegin(), kernel.cend()) == *std::min_element(kernel.cbegin(), kernel.cend()));
      std::generate(bias.begin(), bias.end(), std::ref(i32rng));
      std::fill(zero.begin(), zero.end(), int8_t(input_zero_point() - 0x80));
      std::fill(output.begin(), output.end(), 0xA5);

      std::fill(packed_weights.begin(), packed_weights.end(), 0);
      const xnn_qs8_packing_params packing_params = { int8_t(input_zero_point() - 0x80) };
      xnn_pack_qs8_dwconv_ghw_w(
        kr(), 1, channels(), cr(),
        kernel.data(), bias.data(), packed_weights.data(), &packing_params);
      for (size_t i = 0; i < indirection.size(); i++) {
        indirection[i] = input.data() + i * channels() - input_offset();
      }
      std::shuffle(indirection.begin(), indirection.end(), rng);
      if (zero_index() != SIZE_MAX) {
        for (size_t i = 0; i < indirection.size(); i += kr()) {
          indirection[i + zero_index()] = zero.data();
        }
      }

      // Compute reference results, without renormalization.
      for (size_t x = 0; x < width(); x++) {
        for (size_t c = 0; c < channels(); c++) {
          int32_t acc = bias[c];
          for (size_t k = 0; k < kr(); k++) {
            if (indirection[x * step() + k] != zero.data()) {
              acc +=
                (int32_t(indirection[x * step() + k][c + input_offset()]) - int32_t(input_zero_point() - 0x80)) *
                int32_t(kernel[c * kr() + k]);
            }
          }
          accumulators[x * channels() + c] = acc;
        }
      }
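
      // Unlike the QU8 path above, QS8 weights are quantized symmetrically
      // (the packing params carry no kernel zero point), so the reference
      // computation multiplies raw kernel values; only the input zero point,
      // stored unsigned and shifted by 0x80 into the signed domain, is
      // subtracted from the inputs.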

      // Compute renormalization parameters.
      const int32_t accumulated_min = *std::min_element(accumulators.cbegin(), accumulators.cend());
      const int32_t accumulated_max = *std::max_element(accumulators.cbegin(), accumulators.cend());
      const uint32_t accumulated_range = uint32_t(accumulated_max) - uint32_t(accumulated_min);
      const double output_scale = accumulated_range >= 256 ? double(accumulated_range) / 255.0 : 1.00001;
      const int8_t output_zero_point = int8_t(std::max(std::min(
        lrint(-0.5 - 0.5 * double(accumulated_min + accumulated_max) / output_scale),
        long(std::numeric_limits<int8_t>::max())), long(std::numeric_limits<int8_t>::min())));

      // Prepare parameters.
      const float requantization_scale = 1.0f / float(output_scale);
      union xnn_qs8_gemm_params quantization_params = { };
      switch (variant) {
        case Variant::Native:
          quantization_params = xnn_init_qs8_gemm_params(
            requantization_scale, output_zero_point, int8_t(qmin() - 0x80), int8_t(qmax() - 0x80));
          break;
        case Variant::Scalar:
          quantization_params = xnn_init_scalar_qs8_gemm_params(
            requantization_scale, output_zero_point, int8_t(qmin() - 0x80), int8_t(qmax() - 0x80));
          break;
      }
      const union xnn_qs8_requantization_params scalar_requantization_params =
        xnn_init_scalar_qs8_requantization_params(requantization_scale, output_zero_point, int8_t(qmin() - 0x80), int8_t(qmax() - 0x80));

      // Renormalize reference results.
      for (size_t x = 0; x < width(); x++) {
        for (size_t c = 0; c < channels(); c++) {
          output_ref[x * channels() + c] = xnn_qs8_requantize_q31(accumulators[x * channels() + c], scalar_requantization_params);
        }
      }

      // Call optimized micro-kernel.
      dwconv_minmax(
        channels(), width(),
        indirection.data(), packed_weights.data(), output.data(),
        step() * sizeof(void*),
        (output_stride() - channels()) * sizeof(int8_t),
        input_offset() * sizeof(int8_t), zero.data(),
        &quantization_params);
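
      // qmin()/qmax() are specified in the unsigned [0, 255] domain; the
      // checks below shift them by 0x80 into the int8 domain, mirroring the
      // int8_t(qmin() - 0x80) arguments passed to the params initializers
      // above.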
      // Verify results.
      for (size_t x = 0; x < width(); x++) {
        for (size_t c = 0; c < channels(); c++) {
          ASSERT_GE(int32_t(output[x * output_stride() + c]), int32_t(qmin()) - 0x80)
            << "x = " << x << ", channel = " << c;
          ASSERT_LE(int32_t(output[x * output_stride() + c]), int32_t(qmax()) - 0x80)
            << "x = " << x << ", channel = " << c;
          ASSERT_EQ(int32_t(output[x * output_stride() + c]), int32_t(output_ref[x * channels() + c]))
            << "x = " << x << ", channel = " << c << ", accumulator = " << accumulators[x * channels() + c];
        }
      }
    }
  }

  void Test(xnn_f16_dwconv_minmax_unipass_ukernel_function dwconv_minmax, Variant variant = Variant::Native) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    auto f32rng = std::bind(std::uniform_real_distribution<float>(0.0f, 1.0f), rng);
    auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);

    std::vector<const uint16_t*> indirection((width() - 1) * step() + kr());
    std::vector<uint16_t> input(XNN_EXTRA_BYTES / sizeof(uint16_t) + indirection.size() * channels());
    std::vector<uint16_t> kernel(channels() * kr());
    std::vector<uint16_t> bias(channels());
    std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> packed_weights((kr() + 1) * packed_channels());
    std::vector<uint16_t> zero(channels() + XNN_EXTRA_BYTES / sizeof(uint16_t));
    std::vector<uint16_t> output((width() - 1) * output_stride() + channels());
    std::vector<float> output_ref(width() * channels());

    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      std::generate(input.begin(), input.end(), std::ref(f16rng));
      std::generate(kernel.begin(), kernel.end(), std::ref(f16rng));
      std::generate(bias.begin(), bias.end(), std::ref(f16rng));
      std::fill(zero.begin(), zero.end(), 0);
      std::fill(output_ref.begin(), output_ref.end(), 0.0f);
      std::fill(output.begin(), output.end(), UINT16_C(0x7E00) /* NaN */);

      std::fill(packed_weights.begin(), packed_weights.end(), 0);
      xnn_pack_f16_dwconv_ghw_w(
        kr(), 1, channels(), cr(),
        kernel.data(), bias.data(), packed_weights.data(), nullptr);
      for (size_t i = 0; i < indirection.size(); i++) {
        indirection[i] = input.data() + i * channels() - input_offset();
      }
      std::shuffle(indirection.begin(), indirection.end(), rng);
      if (zero_index() != SIZE_MAX) {
        for (size_t i = 0; i < indirection.size(); i += kr()) {
          indirection[i + zero_index()] = zero.data();
        }
      }

      // Compute reference results, without clamping.
      for (size_t x = 0; x < width(); x++) {
        for (size_t c = 0; c < channels(); c++) {
          float acc = fp16_ieee_to_fp32_value(bias[c]);
          for (size_t k = 0; k < kr(); k++) {
            if (indirection[x * step() + k] != zero.data()) {
              acc += fp16_ieee_to_fp32_value(indirection[x * step() + k][c + input_offset()]) * fp16_ieee_to_fp32_value(kernel[c * kr() + k]);
            }
          }
          output_ref[x * channels() + c] = acc;
        }
      }
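
      // The clamping bounds below are derived from the observed fp32 range and
      // then round-tripped through fp16 (fp16_ieee_from_fp32_value followed by
      // fp16_ieee_to_fp32_value), so the reference clamping uses exactly the
      // values the micro-kernel receives in its fp16 minmax params.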
      // Compute clamping parameters.
      const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend());
      const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend());
      const float accumulated_range = accumulated_max - accumulated_min;
      const float output_min = fp16_ieee_to_fp32_value(fp16_ieee_from_fp32_value(accumulated_min + accumulated_range / 255.0f * float(qmin())));
      const float output_max = fp16_ieee_to_fp32_value(fp16_ieee_from_fp32_value(accumulated_max - accumulated_range / 255.0f * float(255 - qmax())));

      // Prepare parameters.
      xnn_f16_minmax_params params = xnn_init_f16_minmax_params(
        fp16_ieee_from_fp32_value(output_min),
        fp16_ieee_from_fp32_value(output_max));

      // Clamp reference results.
      for (float& output_val : output_ref) {
        output_val = std::max(std::min(output_val, output_max), output_min);
      }

      // Call optimized micro-kernel.
      dwconv_minmax(
        channels(), width(),
        reinterpret_cast<const void**>(indirection.data()), packed_weights.data(), output.data(),
        step() * sizeof(void*),
        (output_stride() - channels()) * sizeof(uint16_t),
        input_offset() * sizeof(uint16_t), zero.data(),
        &params);

      // Verify results.
      for (size_t x = 0; x < width(); x++) {
        for (size_t c = 0; c < channels(); c++) {
          ASSERT_GE(fp16_ieee_to_fp32_value(output[x * output_stride() + c]), output_min)
            << "x = " << x << ", channel = " << c;
          ASSERT_LE(fp16_ieee_to_fp32_value(output[x * output_stride() + c]), output_max)
            << "x = " << x << ", channel = " << c;
          ASSERT_NEAR(output_ref[x * channels() + c], fp16_ieee_to_fp32_value(output[x * output_stride() + c]), std::max(1.0e-4f, std::abs(output_ref[x * channels() + c]) * 1.0e-2f))
            << "x = " << x << ", channel = " << c;
        }
      }
    }
  }

  void Test(xnn_f32_dwconv_unipass_ukernel_function dwconv) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    auto f32rng = std::bind(std::uniform_real_distribution<float>(0.0f, 1.0f), rng);

    std::vector<const float*> indirection((width() - 1) * step() + kr());
    std::vector<float> input(XNN_EXTRA_BYTES / sizeof(float) + indirection.size() * channels());
    std::vector<float> kernel(channels() * kr());
    std::vector<float> bias(channels());
    std::vector<float, AlignedAllocator<float, 64>> packed_weights((kr() + 1) * packed_channels());
    std::vector<float> zero(channels() + XNN_EXTRA_BYTES / sizeof(float));
    std::vector<float> output((width() - 1) * output_stride() + channels());
    std::vector<float> output_ref(width() * channels());

    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      std::generate(input.begin(), input.end(), std::ref(f32rng));
      std::generate(kernel.begin(), kernel.end(), std::ref(f32rng));
      std::generate(bias.begin(), bias.end(), std::ref(f32rng));
      std::fill(zero.begin(), zero.end(), 0.0f);
      std::fill(output_ref.begin(), output_ref.end(), nanf(""));
      std::fill(output.begin(), output.end(), nanf(""));

      std::fill(packed_weights.begin(), packed_weights.end(), 0.0f);
      xnn_pack_f32_dwconv_ghw_w(
        kr(), 1, channels(), cr(),
        kernel.data(), bias.data(), packed_weights.data(), nullptr);
      for (size_t i = 0; i < indirection.size(); i++) {
        indirection[i] = input.data() + i * channels() - input_offset();
      }
      std::shuffle(indirection.begin(), indirection.end(), rng);
      if (zero_index() != SIZE_MAX) {
        for (size_t i = 0; i < indirection.size(); i += kr()) {
          indirection[i + zero_index()] = zero.data();
        }
      }

      // Compute reference results, without clamping.
      for (size_t x = 0; x < width(); x++) {
        for (size_t c = 0; c < channels(); c++) {
          float acc = bias[c];
          for (size_t k = 0; k < kr(); k++) {
            if (indirection[x * step() + k] != zero.data()) {
              acc += indirection[x * step() + k][c + input_offset()] * kernel[c * kr() + k];
            }
          }
          output_ref[x * channels() + c] = acc;
        }
      }

      // Call optimized micro-kernel.
      dwconv(
        channels(), width(),
        indirection.data(), packed_weights.data(), output.data(),
        step() * sizeof(void*),
        (output_stride() - channels()) * sizeof(float),
        input_offset() * sizeof(float), zero.data(),
        nullptr);

      // Verify results.
      for (size_t x = 0; x < width(); x++) {
        for (size_t c = 0; c < channels(); c++) {
          ASSERT_NEAR(
            output_ref[x * channels() + c],
            output[x * output_stride() + c],
            std::abs(output_ref[x * channels() + c]) * 1.0e-5)
            << "x = " << x << ", channel = " << c;
        }
      }
    }
  }

  void Test(xnn_f32_dwconv_minmax_unipass_ukernel_function dwconv_minmax, Variant variant = Variant::Native) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    auto f32rng = std::bind(std::uniform_real_distribution<float>(0.0f, 1.0f), rng);

    std::vector<const float*> indirection((width() - 1) * step() + kr());
    std::vector<float> input(XNN_EXTRA_BYTES / sizeof(float) + indirection.size() * channels());
    std::vector<float> kernel(channels() * kr());
    std::vector<float> bias(channels());
    std::vector<float, AlignedAllocator<float, 64>> packed_weights((kr() + 1) * packed_channels());
    std::vector<float> zero(channels() + XNN_EXTRA_BYTES / sizeof(float));
    std::vector<float> output((width() - 1) * output_stride() + channels());
    std::vector<float> output_ref(width() * channels());

    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      std::generate(input.begin(), input.end(), std::ref(f32rng));
      std::generate(kernel.begin(), kernel.end(), std::ref(f32rng));
      std::generate(bias.begin(), bias.end(), std::ref(f32rng));
      std::fill(zero.begin(), zero.end(), 0.0f);
      std::fill(output_ref.begin(), output_ref.end(), nanf(""));
      std::fill(output.begin(), output.end(), nanf(""));

      std::fill(packed_weights.begin(), packed_weights.end(), 0.0f);
      xnn_pack_f32_dwconv_ghw_w(
        kr(), 1, channels(), cr(),
        kernel.data(), bias.data(), packed_weights.data(), nullptr);
      for (size_t i = 0; i < indirection.size(); i++) {
        indirection[i] = input.data() + i * channels() - input_offset();
      }
      std::shuffle(indirection.begin(), indirection.end(), rng);
      if (zero_index() != SIZE_MAX) {
        for (size_t i = 0; i < indirection.size(); i += kr()) {
          indirection[i + zero_index()] = zero.data();
        }
      }

      // Compute reference results, without clamping.
      for (size_t x = 0; x < width(); x++) {
        for (size_t c = 0; c < channels(); c++) {
          float acc = bias[c];
          for (size_t k = 0; k < kr(); k++) {
            if (indirection[x * step() + k] != zero.data()) {
              acc += indirection[x * step() + k][c + input_offset()] * kernel[c * kr() + k];
            }
          }
          output_ref[x * channels() + c] = acc;
        }
      }
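
      // The qmin()/qmax() knobs carve the clamping window out of the observed
      // output range. For example (illustrative values), qmin() == 51 and
      // qmax() == 204 give
      //   output_min = accumulated_min + 0.2f * accumulated_range
      //   output_max = accumulated_max - 0.2f * accumulated_range
      // so the middle 60% of the accumulated range survives clamping.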
      // Compute clamping parameters.
      const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend());
      const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend());
      const float accumulated_range = accumulated_max - accumulated_min;
      const float output_min = accumulated_min + accumulated_range / 255.0f * float(qmin());
      const float output_max = accumulated_max - accumulated_range / 255.0f * float(255 - qmax());

      // Prepare parameters.
      xnn_f32_minmax_params params = { };
      switch (variant) {
        case Variant::Native:
          params = xnn_init_f32_minmax_params(output_min, output_max);
          break;
        case Variant::Scalar:
          params = xnn_init_scalar_f32_minmax_params(output_min, output_max);
          break;
      }

      // Clamp reference results.
      for (float& output_val : output_ref) {
        output_val = std::max(std::min(output_val, output_max), output_min);
      }

      // Call optimized micro-kernel.
      dwconv_minmax(
        channels(), width(),
        indirection.data(), packed_weights.data(), output.data(),
        step() * sizeof(void*),
        (output_stride() - channels()) * sizeof(float),
        input_offset() * sizeof(float), zero.data(),
        &params);

      // Verify results.
      for (size_t x = 0; x < width(); x++) {
        for (size_t c = 0; c < channels(); c++) {
          ASSERT_GE(output[x * output_stride() + c], output_min)
            << "x = " << x << ", channel = " << c;
          ASSERT_LE(output[x * output_stride() + c], output_max)
            << "x = " << x << ", channel = " << c;
          ASSERT_NEAR(
            output_ref[x * channels() + c],
            output[x * output_stride() + c],
            std::abs(output_ref[x * channels() + c]) * 1.0e-5)
            << "x = " << x << ", channel = " << c;
        }
      }
    }
  }

 private:
  uint32_t channels_{1};
  uint32_t cr_{1};
  uint32_t kr_{1};
  uint32_t width_{1};
  uint32_t step_{1};
  uint32_t output_stride_{0};
  uint8_t input_zero_point_{127};
  uint8_t kernel_zero_point_{127};
  uint8_t qmin_{0};
  uint8_t qmax_{255};
  size_t input_offset_{0};
  size_t zero_index_{SIZE_MAX};
  size_t iterations_{3};
};
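
// A minimal usage sketch. The micro-kernel symbol below is hypothetical; real
// tests pass whichever xnn_*_dwconv_*_ukernel_function they exercise, with
// cr()/kr() matching that kernel's channel tile and kernel tile sizes:
//
//   TEST(F32_DWCONV_MINMAX_UP4X9, c_eq_4) {
//     DWConvMicrokernelTester()
//       .cr(4)
//       .kr(9)
//       .channels(4)
//       .width(5)
//       .Test(xnn_f32_dwconv_minmax_ukernel_up4x9__example);  // hypothetical symbol
//   }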