// Copyright (c) Facebook, Inc. and its affiliates.
// All rights reserved.
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#pragma once

#include <gtest/gtest.h>

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstddef>
#include <cstdlib>
#include <limits>
#include <random>
#include <vector>

#include <fp16.h>

#include <xnnpack.h>
#include <xnnpack/aligned-allocator.h>
#include <xnnpack/pack.h>
#include <xnnpack/microfnptr.h>
#include <xnnpack/microparams-init.h>
#include <xnnpack/requantization.h>


class DWConvMicrokernelTester {
 public:
  inline DWConvMicrokernelTester& width(uint32_t width) {
    assert(width >= 1);
    this->width_ = width;
    return *this;
  }

  inline uint32_t width() const {
    return this->width_;
  }

  inline DWConvMicrokernelTester& step(uint32_t step) {
    assert(step >= 1);
    this->step_ = step;
    return *this;
  }

  inline uint32_t step() const {
    return this->step_;
  }

  inline DWConvMicrokernelTester& channels(uint32_t channels) {
    assert(channels >= 1);
    this->channels_ = channels;
    return *this;
  }

  inline uint32_t channels() const {
    return this->channels_;
  }

  inline DWConvMicrokernelTester& cr(uint32_t cr) {
    assert(cr != 0);
    this->cr_ = cr;
    return *this;
  }

  inline uint32_t cr() const {
    return this->cr_;
  }

  inline DWConvMicrokernelTester& kr(uint32_t kr) {
    assert(kr != 0);
    this->kr_ = kr;
    return *this;
  }

  inline uint32_t kr() const {
    return this->kr_;
  }

  // Number of channels rounded up to a multiple of cr(), i.e. the channel
  // count the packed weights are laid out for.
  inline uint32_t packed_channels() const {
    return (channels() / cr() + !!(channels() % cr())) * cr();
  }

  inline DWConvMicrokernelTester& output_stride(uint32_t output_stride) {
    assert(output_stride != 0);
    this->output_stride_ = output_stride;
    return *this;
  }

  inline uint32_t output_stride() const {
    if (this->output_stride_ == 0) {
      return channels();
    } else {
      assert(this->output_stride_ >= channels());
      return this->output_stride_;
    }
  }

  inline DWConvMicrokernelTester& input_zero_point(uint8_t input_zero_point) {
    this->input_zero_point_ = input_zero_point;
    return *this;
  }

  inline uint8_t input_zero_point() const {
    return this->input_zero_point_;
  }

  inline DWConvMicrokernelTester& kernel_zero_point(uint8_t kernel_zero_point) {
    this->kernel_zero_point_ = kernel_zero_point;
    return *this;
  }

  inline uint8_t kernel_zero_point() const {
    return this->kernel_zero_point_;
  }

  inline DWConvMicrokernelTester& qmin(uint8_t qmin) {
    this->qmin_ = qmin;
    return *this;
  }

  inline uint8_t qmin() const {
    return this->qmin_;
  }

  inline DWConvMicrokernelTester& qmax(uint8_t qmax) {
    this->qmax_ = qmax;
    return *this;
  }

  inline uint8_t qmax() const {
    return this->qmax_;
  }

  inline DWConvMicrokernelTester& input_offset(size_t input_offset) {
    this->input_offset_ = input_offset;
    return *this;
  }

  inline size_t input_offset() const {
    return this->input_offset_;
  }

  inline DWConvMicrokernelTester& zero_index(size_t zero_index) {
    this->zero_index_ = zero_index;
    return *this;
  }

  inline size_t zero_index() const {
    return this->zero_index_;
  }

  inline DWConvMicrokernelTester& iterations(size_t iterations) {
    this->iterations_ = iterations;
    return *this;
  }

  inline size_t iterations() const {
    return this->iterations_;
  }

  void Test(
    xnn_qu8_dwconv_minmax_unipass_ukernel_function dwconv_minmax,
    xnn_init_qu8_conv_minmax_params_fn init_params,
    xnn_qu8_requantize_fn requantize) const
  {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    std::uniform_int_distribution<int32_t> i32dist(-10000, 10000);
    std::uniform_int_distribution<int32_t> u8dist(
      std::numeric_limits<uint8_t>::min(), std::numeric_limits<uint8_t>::max());

    std::vector<const uint8_t*> indirection((width() - 1) * step() + kr());
    std::vector<uint8_t> input(XNN_EXTRA_BYTES / sizeof(uint8_t) + indirection.size() * channels());
    std::vector<uint8_t> kernel(channels() * kr());
    std::vector<int32_t> bias(channels());
    std::vector<uint8_t, AlignedAllocator<uint8_t, 64>> packed_weights((kr() + sizeof(int32_t) / sizeof(uint8_t)) * packed_channels());
    std::vector<uint8_t> zero(channels() + XNN_EXTRA_BYTES / sizeof(uint8_t));
    std::vector<uint8_t> output((width() - 1) * output_stride() + channels());
    std::vector<int32_t> accumulators(width() * channels());
    std::vector<uint8_t> output_ref(width() * channels());

    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      do {
        std::generate(input.begin(), input.end(), [&]() { return u8dist(rng); });
      } while (input.size() > 1 && *std::max_element(input.cbegin(), input.cend()) == *std::min_element(input.cbegin(), input.cend()));
      do {
        std::generate(kernel.begin(), kernel.end(), [&]() { return u8dist(rng); });
      } while (kernel.size() > 1 && *std::max_element(kernel.cbegin(), kernel.cend()) == *std::min_element(kernel.cbegin(), kernel.cend()));
      std::generate(bias.begin(), bias.end(), [&]() { return i32dist(rng); });
      std::fill(zero.begin(), zero.end(), input_zero_point());
      std::fill(output.begin(), output.end(), UINT8_C(0xA5));

      std::fill(packed_weights.begin(), packed_weights.end(), 0);
      const xnn_qu8_packing_params packing_params = { input_zero_point(), kernel_zero_point() };
      xnn_pack_qu8_dwconv_ghw_w(
        kr(), kr(), 1, channels(), cr(),
        kernel.data(), bias.data(), packed_weights.data(),
        0 /* extra bytes */, &packing_params);
      for (size_t i = 0; i < indirection.size(); i++) {
        indirection[i] = input.data() + i * channels() - input_offset();
      }
      std::shuffle(indirection.begin(), indirection.end(), rng);
      if (zero_index() != SIZE_MAX) {
        for (size_t i = 0; i < indirection.size(); i += kr()) {
          indirection[i + zero_index()] = zero.data();
        }
      }

      // Compute reference results, without renormalization.
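      // Each output pixel x reads kr() taps through the indirection buffer;
      // taps redirected to the zero buffer contribute nothing, and every
      // other input and weight is accumulated relative to its zero point:
      //   acc[x][c] = bias[c]
      //             + sum_k (input[x][k][c] - input_zp) * (kernel[c][k] - kernel_zp)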
      for (size_t x = 0; x < width(); x++) {
        for (size_t c = 0; c < channels(); c++) {
          // Accumulate in int32_t: accumulators are exact and the output is
          // later compared with ASSERT_EQ against the requantized reference.
          int32_t acc = bias[c];
          for (size_t k = 0; k < kr(); k++) {
            if (indirection[x * step() + k] != zero.data()) {
              acc +=
                (int32_t(indirection[x * step() + k][c + input_offset()]) - int32_t(input_zero_point())) *
                (int32_t(kernel[c * kr() + k]) - int32_t(kernel_zero_point()));
            }
          }
          accumulators[x * channels() + c] = acc;
        }
      }

      // Compute renormalization parameters.
      const int32_t accumulated_min = *std::min_element(accumulators.cbegin(), accumulators.cend());
      const int32_t accumulated_max = *std::max_element(accumulators.cbegin(), accumulators.cend());
      const uint32_t accumulated_range = uint32_t(accumulated_max) - uint32_t(accumulated_min);
      const double output_scale = accumulated_range >= 256 ? double(accumulated_range) / 255.0 : 1.00001;
      const uint8_t output_zero_point = uint8_t(std::max(std::min(
        lrint(127.5 - 0.5 * double(accumulated_min + accumulated_max) / output_scale),
        long(std::numeric_limits<uint8_t>::max())), long(std::numeric_limits<uint8_t>::min())));

      // Prepare parameters.
      const float requantization_scale = 1.0f / float(output_scale);
      union xnn_qu8_conv_minmax_params quantization_params;
      init_params(&quantization_params,
        kernel_zero_point(), requantization_scale, output_zero_point, qmin(), qmax());

      // Renormalize reference results.
      for (size_t x = 0; x < width(); x++) {
        for (size_t c = 0; c < channels(); c++) {
          output_ref[x * channels() + c] = requantize(
            accumulators[x * channels() + c], requantization_scale, output_zero_point, qmin(), qmax());
        }
      }

      // Call optimized micro-kernel.
      dwconv_minmax(
        channels(), width(),
        indirection.data(), packed_weights.data(), output.data(),
        step() * sizeof(void*),
        (output_stride() - channels()) * sizeof(uint8_t),
        input_offset() * sizeof(uint8_t), zero.data(),
        &quantization_params);

      // Verify results.
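      // Quantized outputs must match exactly: the reference was requantized
      // with the same scale and zero point passed to the micro-kernel, so any
      // difference indicates a genuine kernel bug rather than rounding noise.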
      for (size_t x = 0; x < width(); x++) {
        for (size_t c = 0; c < channels(); c++) {
          ASSERT_GE(uint32_t(output[x * output_stride() + c]), uint32_t(qmin()))
            << "x = " << x << ", channel = " << c;
          ASSERT_LE(uint32_t(output[x * output_stride() + c]), uint32_t(qmax()))
            << "x = " << x << ", channel = " << c;
          ASSERT_EQ(uint32_t(output[x * output_stride() + c]), uint32_t(output_ref[x * channels() + c]))
            << "x = " << x << ", channel = " << c << ", accumulator = " << accumulators[x * channels() + c];
        }
      }
    }
  }

  void Test(
    xnn_qc8_dwconv_minmax_unipass_ukernel_function dwconv_minmax,
    xnn_init_qc8_conv_minmax_params_fn init_params,
    xnn_qs8_requantize_fn requantize) const
  {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    std::uniform_int_distribution<int32_t> i32dist(-10000, 10000);
    std::uniform_int_distribution<int32_t> i8dist(
      std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max());
    std::uniform_int_distribution<int32_t> w8dist(
      -std::numeric_limits<int8_t>::max(), std::numeric_limits<int8_t>::max());

    std::vector<const int8_t*> indirection((width() - 1) * step() + kr());
    std::vector<int8_t> input(XNN_EXTRA_BYTES / sizeof(int8_t) + indirection.size() * channels());
    std::vector<int8_t> kernel(channels() * kr());
    std::vector<int32_t> bias(channels());
    std::vector<int8_t, AlignedAllocator<int8_t, 64>> packed_weights((kr() + (sizeof(int32_t) + sizeof(float)) / sizeof(int8_t)) * packed_channels());
    std::vector<int8_t> zero(channels() + XNN_EXTRA_BYTES / sizeof(int8_t));
    std::vector<int8_t> output((width() - 1) * output_stride() + channels());
    std::vector<int32_t> accumulators(width() * channels());
    std::vector<float> scale(channels());
    std::vector<int8_t> output_ref(width() * channels());

    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      do {
        std::generate(input.begin(), input.end(), [&]() { return i8dist(rng); });
      } while (input.size() > 1 && *std::max_element(input.cbegin(), input.cend()) == *std::min_element(input.cbegin(), input.cend()));
      do {
        std::generate(kernel.begin(), kernel.end(), [&]() { return w8dist(rng); });
      } while (kernel.size() > 1 && *std::max_element(kernel.cbegin(), kernel.cend()) == *std::min_element(kernel.cbegin(), kernel.cend()));
      std::generate(bias.begin(), bias.end(), [&]() { return i32dist(rng); });
      std::fill(zero.begin(), zero.end(), int8_t(input_zero_point() - 0x80));
      std::fill(output.begin(), output.end(), INT8_C(0xA5));

      std::fill(packed_weights.begin(), packed_weights.end(), 0);
      const xnn_qs8_packing_params packing_params = { int8_t(input_zero_point() - 0x80) };
      xnn_pack_qs8_dwconv_ghw_w(
        kr(), kr(), 1, channels(), cr(),
        kernel.data(), bias.data(), packed_weights.data(), cr() * sizeof(float),
        &packing_params);
      for (size_t i = 0; i < indirection.size(); i++) {
        indirection[i] = input.data() + i * channels() - input_offset();
      }
      std::shuffle(indirection.begin(), indirection.end(), rng);
      if (zero_index() != SIZE_MAX) {
        for (size_t i = 0; i < indirection.size(); i += kr()) {
          indirection[i + zero_index()] = zero.data();
        }
      }

      // Compute reference results, without renormalization.
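      // In the per-channel (QC8) scheme the kernel zero point is always zero,
      // so only the input zero point is subtracted. The per-channel scales are
      // computed below and written into the cr() * sizeof(float) extra bytes
      // reserved per channel group when the weights were packed above.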
      for (size_t x = 0; x < width(); x++) {
        for (size_t c = 0; c < channels(); c++) {
          // Accumulate in int32_t to keep the reference exact.
          int32_t acc = bias[c];
          for (size_t k = 0; k < kr(); k++) {
            if (indirection[x * step() + k] != zero.data()) {
              acc +=
                (int32_t(indirection[x * step() + k][c + input_offset()]) - int32_t(input_zero_point() - 0x80)) *
                int32_t(kernel[c * kr() + k]);
            }
          }
          accumulators[x * channels() + c] = acc;
        }
      }

      // Compute renormalization parameters.
      const int8_t output_zero_point = -1;
      for (size_t c = 0; c < channels(); c++) {
        int32_t accumulated_min = accumulators[c];
        int32_t accumulated_max = accumulators[c];
        for (size_t x = 0; x < width(); x++) {
          accumulated_min = std::min(accumulated_min, accumulators[x * channels() + c]);
          accumulated_max = std::max(accumulated_max, accumulators[x * channels() + c]);
        }
        const uint32_t accumulated_range = uint32_t(accumulated_max - accumulated_min);
        const float output_scale = accumulated_range >= 256 ? double(accumulated_range) / 255.0 : 1.00001;
        scale[c] = 1.0f / output_scale;
      }
      xnn_init_qc8_scale_fp32_params(
        channels(), cr(),
        cr() * (kr() * sizeof(int8_t) + sizeof(int32_t) + sizeof(float)), scale.data(),
        (void*) ((uintptr_t) packed_weights.data() + cr() * (kr() * sizeof(int8_t) + sizeof(int32_t))));

      // Prepare parameters.
      union xnn_qc8_conv_minmax_params minmax_params;
      init_params(&minmax_params,
        output_zero_point, int8_t(qmin() - 0x80), int8_t(qmax() - 0x80));

      // Renormalize reference results.
      for (size_t x = 0; x < width(); x++) {
        for (size_t c = 0; c < channels(); c++) {
          output_ref[x * channels() + c] = requantize(
            accumulators[x * channels() + c], scale[c], output_zero_point, int8_t(qmin() - 0x80), int8_t(qmax() - 0x80));
        }
      }

      // Call optimized micro-kernel.
      dwconv_minmax(
        channels(), width(),
        indirection.data(), packed_weights.data(), output.data(),
        step() * sizeof(void*),
        (output_stride() - channels()) * sizeof(int8_t),
        input_offset() * sizeof(int8_t), zero.data(),
        &minmax_params);

      // Verify results.
      for (size_t x = 0; x < width(); x++) {
        for (size_t c = 0; c < channels(); c++) {
          ASSERT_GE(int32_t(output[x * output_stride() + c]), int32_t(qmin()) - 0x80)
            << "x = " << x << ", channel = " << c;
          ASSERT_LE(int32_t(output[x * output_stride() + c]), int32_t(qmax()) - 0x80)
            << "x = " << x << ", channel = " << c;
          ASSERT_EQ(int32_t(output[x * output_stride() + c]), int32_t(output_ref[x * channels() + c]))
            << "x = " << x << ", channel = " << c << ", accumulator = " << accumulators[x * channels() + c];
        }
      }
    }
  }

  void Test(
    xnn_qs8_dwconv_minmax_unipass_ukernel_function dwconv_minmax,
    xnn_init_qs8_conv_minmax_params_fn init_params,
    xnn_qs8_requantize_fn requantize) const
  {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    std::uniform_int_distribution<int32_t> i32dist(-10000, 10000);
    std::uniform_int_distribution<int32_t> i8dist(
      std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max());
    std::uniform_int_distribution<int32_t> w8dist(
      -std::numeric_limits<int8_t>::max(), std::numeric_limits<int8_t>::max());

    std::vector<const int8_t*> indirection((width() - 1) * step() + kr());
    std::vector<int8_t> input(XNN_EXTRA_BYTES / sizeof(int8_t) + indirection.size() * channels());
    std::vector<int8_t> kernel(channels() * kr());
    std::vector<int32_t> bias(channels());
    std::vector<int8_t, AlignedAllocator<int8_t, 64>> packed_weights((kr() + sizeof(int32_t) / sizeof(int8_t)) * packed_channels());
    std::vector<int8_t> zero(channels() + XNN_EXTRA_BYTES / sizeof(int8_t));
    std::vector<int8_t> output((width() - 1) * output_stride() + channels());
    std::vector<int32_t> accumulators(width() * channels());
    std::vector<int8_t> output_ref(width() * channels());

    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      do {
        std::generate(input.begin(), input.end(), [&]() { return i8dist(rng); });
      } while (input.size() > 1 && *std::max_element(input.cbegin(), input.cend()) == *std::min_element(input.cbegin(), input.cend()));
      do {
        std::generate(kernel.begin(), kernel.end(), [&]() { return w8dist(rng); });
      } while (kernel.size() > 1 && *std::max_element(kernel.cbegin(), kernel.cend()) == *std::min_element(kernel.cbegin(), kernel.cend()));
      std::generate(bias.begin(), bias.end(), [&]() { return i32dist(rng); });
      std::fill(zero.begin(), zero.end(), int8_t(input_zero_point() - 0x80));
      std::fill(output.begin(), output.end(), INT8_C(0xA5));

      std::fill(packed_weights.begin(), packed_weights.end(), 0);
      const xnn_qs8_packing_params packing_params = { int8_t(input_zero_point() - 0x80) };
      xnn_pack_qs8_dwconv_ghw_w(
        kr(), kr(), 1, channels(), cr(),
        kernel.data(), bias.data(), packed_weights.data(),
        0 /* extra bytes */, &packing_params);
      for (size_t i = 0; i < indirection.size(); i++) {
        indirection[i] = input.data() + i * channels() - input_offset();
      }
      std::shuffle(indirection.begin(), indirection.end(), rng);
      if (zero_index() != SIZE_MAX) {
        for (size_t i = 0; i < indirection.size(); i += kr()) {
          indirection[i + zero_index()] = zero.data();
        }
      }

      // Compute reference results, without renormalization.
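      // The tester keeps zero points and qmin/qmax as uint8_t for all
      // quantized paths; the signed (QS8/QC8) paths subtract 0x80 to map the
      // [0, 255] defaults onto the int8 range [-128, 127].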
      for (size_t x = 0; x < width(); x++) {
        for (size_t c = 0; c < channels(); c++) {
          // Accumulate in int32_t to keep the reference exact.
          int32_t acc = bias[c];
          for (size_t k = 0; k < kr(); k++) {
            if (indirection[x * step() + k] != zero.data()) {
              acc +=
                (int32_t(indirection[x * step() + k][c + input_offset()]) - int32_t(input_zero_point() - 0x80)) *
                int32_t(kernel[c * kr() + k]);
            }
          }
          accumulators[x * channels() + c] = acc;
        }
      }

      // Compute renormalization parameters.
      const int32_t accumulated_min = *std::min_element(accumulators.cbegin(), accumulators.cend());
      const int32_t accumulated_max = *std::max_element(accumulators.cbegin(), accumulators.cend());
      const uint32_t accumulated_range = uint32_t(accumulated_max) - uint32_t(accumulated_min);
      const double output_scale = accumulated_range >= 256 ? double(accumulated_range) / 255.0 : 1.00001;
      const int8_t output_zero_point = int8_t(std::max(std::min(
        lrint(-0.5 - 0.5 * double(accumulated_min + accumulated_max) / output_scale),
        long(std::numeric_limits<int8_t>::max())), long(std::numeric_limits<int8_t>::min())));

      // Prepare parameters.
      const float requantization_scale = 1.0f / float(output_scale);
      union xnn_qs8_conv_minmax_params quantization_params;
      init_params(&quantization_params,
        requantization_scale, output_zero_point, int8_t(qmin() - 0x80), int8_t(qmax() - 0x80));

      // Renormalize reference results.
      for (size_t x = 0; x < width(); x++) {
        for (size_t c = 0; c < channels(); c++) {
          output_ref[x * channels() + c] = requantize(
            accumulators[x * channels() + c], requantization_scale, output_zero_point, int8_t(qmin() - 0x80), int8_t(qmax() - 0x80));
        }
      }

      // Call optimized micro-kernel.
      dwconv_minmax(
        channels(), width(),
        indirection.data(), packed_weights.data(), output.data(),
        step() * sizeof(void*),
        (output_stride() - channels()) * sizeof(int8_t),
        input_offset() * sizeof(int8_t), zero.data(),
        &quantization_params);

      // Verify results.
      for (size_t x = 0; x < width(); x++) {
        for (size_t c = 0; c < channels(); c++) {
          ASSERT_GE(int32_t(output[x * output_stride() + c]), int32_t(qmin()) - 0x80)
            << "x = " << x << ", channel = " << c;
          ASSERT_LE(int32_t(output[x * output_stride() + c]), int32_t(qmax()) - 0x80)
            << "x = " << x << ", channel = " << c;
          ASSERT_EQ(int32_t(output[x * output_stride() + c]), int32_t(output_ref[x * channels() + c]))
            << "x = " << x << ", channel = " << c << ", accumulator = " << accumulators[x * channels() + c];
        }
      }
    }
  }

  void Test(xnn_f16_dwconv_minmax_unipass_ukernel_function dwconv_minmax, xnn_init_f16_minmax_params_fn init_params) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    std::uniform_real_distribution<float> f32dist;

    std::vector<const uint16_t*> indirection((width() - 1) * step() + kr());
    std::vector<uint16_t> input(XNN_EXTRA_BYTES / sizeof(uint16_t) + indirection.size() * channels());
    std::vector<uint16_t> kernel(channels() * kr());
    std::vector<uint16_t> bias(channels());
    std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> packed_weights((kr() + 1) * packed_channels());
    std::vector<uint16_t> zero(channels() + XNN_EXTRA_BYTES / sizeof(uint16_t));
    std::vector<uint16_t> output((width() - 1) * output_stride() + channels());
    std::vector<float> output_ref(width() * channels());

    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      std::generate(input.begin(), input.end(), [&]() { return fp16_ieee_from_fp32_value(f32dist(rng)); });
      std::generate(kernel.begin(), kernel.end(), [&]() { return fp16_ieee_from_fp32_value(f32dist(rng)); });
      std::generate(bias.begin(), bias.end(), [&]() { return fp16_ieee_from_fp32_value(f32dist(rng)); });
      std::fill(zero.begin(), zero.end(), 0);
      std::fill(output_ref.begin(), output_ref.end(), 0.0f);
      std::fill(output.begin(), output.end(), UINT16_C(0x7E00) /* NaN */);

      std::fill(packed_weights.begin(), packed_weights.end(), 0);
      xnn_pack_f16_dwconv_ghw_w(
        kr(), kr(), 1, channels(), cr(),
        kernel.data(), bias.data(), packed_weights.data(),
        0 /* extra bytes */, nullptr);
      for (size_t i = 0; i < indirection.size(); i++) {
        indirection[i] = input.data() + i * channels() - input_offset();
      }
      std::shuffle(indirection.begin(), indirection.end(), rng);
      if (zero_index() != SIZE_MAX) {
        for (size_t i = 0; i < indirection.size(); i += kr()) {
          indirection[i + zero_index()] = zero.data();
        }
      }

      // Compute reference results, without clamping.
      for (size_t x = 0; x < width(); x++) {
        for (size_t c = 0; c < channels(); c++) {
          float acc = fp16_ieee_to_fp32_value(bias[c]);
          for (size_t k = 0; k < kr(); k++) {
            if (indirection[x * step() + k] != zero.data()) {
              acc += fp16_ieee_to_fp32_value(indirection[x * step() + k][c + input_offset()]) * fp16_ieee_to_fp32_value(kernel[c * kr() + k]);
            }
          }
          output_ref[x * channels() + c] = acc;
        }
      }

      // Compute clamping parameters.
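      // qmin()/qmax() are reused as fractional positions within the observed
      // output range: qmin() == 0 and qmax() == 255 leave the bounds at the
      // range extremes (effectively no clamping), while tighter values force
      // the micro-kernel to actually clamp. The bounds are rounded through
      // fp16 so the reference clamps to exactly representable values.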
      const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend());
      const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend());
      const float accumulated_range = accumulated_max - accumulated_min;
      const float output_min = fp16_ieee_to_fp32_value(fp16_ieee_from_fp32_value(accumulated_min + accumulated_range / 255.0f * float(qmin())));
      const float output_max = fp16_ieee_to_fp32_value(fp16_ieee_from_fp32_value(accumulated_max - accumulated_range / 255.0f * float(255 - qmax())));

      // Prepare parameters.
      xnn_f16_minmax_params params;
      init_params(&params,
        fp16_ieee_from_fp32_value(output_min),
        fp16_ieee_from_fp32_value(output_max));

      // Clamp reference results.
      for (float& output_val : output_ref) {
        output_val = std::max(std::min(output_val, output_max), output_min);
      }

      // Call optimized micro-kernel.
      dwconv_minmax(
        channels(), width(),
        reinterpret_cast<const void**>(indirection.data()), packed_weights.data(), output.data(),
        step() * sizeof(void*),
        (output_stride() - channels()) * sizeof(uint16_t),
        input_offset() * sizeof(uint16_t), zero.data(),
        &params);

      // Verify results.
      for (size_t x = 0; x < width(); x++) {
        for (size_t c = 0; c < channels(); c++) {
          ASSERT_GE(fp16_ieee_to_fp32_value(output[x * output_stride() + c]), output_min)
            << "x = " << x << ", channel = " << c;
          ASSERT_LE(fp16_ieee_to_fp32_value(output[x * output_stride() + c]), output_max)
            << "x = " << x << ", channel = " << c;
          ASSERT_NEAR(output_ref[x * channels() + c], fp16_ieee_to_fp32_value(output[x * output_stride() + c]), std::max(1.0e-4f, std::abs(output_ref[x * channels() + c]) * 1.0e-2f))
            << "x = " << x << ", channel = " << c;
        }
      }
    }
  }

  void Test(xnn_f32_dwconv_unipass_ukernel_function dwconv) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    std::uniform_real_distribution<float> f32dist;

    std::vector<const float*> indirection((width() - 1) * step() + kr());
    std::vector<float> input(XNN_EXTRA_BYTES / sizeof(float) + indirection.size() * channels());
    std::vector<float> kernel(channels() * kr());
    std::vector<float> bias(channels());
    std::vector<float, AlignedAllocator<float, 64>> packed_weights((kr() + 1) * packed_channels());
    std::vector<float> zero(channels() + XNN_EXTRA_BYTES / sizeof(float));
    std::vector<float> output((width() - 1) * output_stride() + channels());
    std::vector<float> output_ref(width() * channels());

    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); });
      std::generate(kernel.begin(), kernel.end(), [&]() { return f32dist(rng); });
      std::generate(bias.begin(), bias.end(), [&]() { return f32dist(rng); });
      std::fill(zero.begin(), zero.end(), 0.0f);
      std::fill(output_ref.begin(), output_ref.end(), nanf(""));
      std::fill(output.begin(), output.end(), nanf(""));

      std::fill(packed_weights.begin(), packed_weights.end(), 0.0f);
      xnn_pack_f32_dwconv_ghw_w(
        kr(), kr(), 1, channels(), cr(),
        kernel.data(), bias.data(), packed_weights.data(),
        0 /* extra bytes */, nullptr);
      for (size_t i = 0; i < indirection.size(); i++) {
        indirection[i] = input.data() + i * channels() - input_offset();
      }
      std::shuffle(indirection.begin(), indirection.end(), rng);
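      // Indirection entries redirected to the zero buffer stand in for padded
      // taps: the reference computation skips them, and the micro-kernel is
      // expected to recognize the zero pointer and not add input_offset to it.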
      if (zero_index() != SIZE_MAX) {
        for (size_t i = 0; i < indirection.size(); i += kr()) {
          indirection[i + zero_index()] = zero.data();
        }
      }

      // Compute reference results, without clamping.
      for (size_t x = 0; x < width(); x++) {
        for (size_t c = 0; c < channels(); c++) {
          float acc = bias[c];
          for (size_t k = 0; k < kr(); k++) {
            if (indirection[x * step() + k] != zero.data()) {
              acc += indirection[x * step() + k][c + input_offset()] * kernel[c * kr() + k];
            }
          }
          output_ref[x * channels() + c] = acc;
        }
      }

      // Call optimized micro-kernel.
      dwconv(
        channels(), width(),
        indirection.data(), packed_weights.data(), output.data(),
        step() * sizeof(void*),
        (output_stride() - channels()) * sizeof(float),
        input_offset() * sizeof(float), zero.data(),
        nullptr);

      // Verify results.
      for (size_t x = 0; x < width(); x++) {
        for (size_t c = 0; c < channels(); c++) {
          ASSERT_NEAR(
            output_ref[x * channels() + c],
            output[x * output_stride() + c],
            std::abs(output_ref[x * channels() + c]) * 1.0e-5)
            << "x = " << x << ", channel = " << c;
        }
      }
    }
  }

  void Test(xnn_f32_dwconv_minmax_unipass_ukernel_function dwconv_minmax, xnn_init_f32_minmax_params_fn init_params) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    std::uniform_real_distribution<float> f32dist;

    std::vector<const float*> indirection((width() - 1) * step() + kr());
    std::vector<float> input(XNN_EXTRA_BYTES / sizeof(float) + indirection.size() * channels());
    std::vector<float> kernel(channels() * kr());
    std::vector<float> bias(channels());
    std::vector<float, AlignedAllocator<float, 64>> packed_weights((kr() + 1) * packed_channels());
    std::vector<float> zero(channels() + XNN_EXTRA_BYTES / sizeof(float));
    std::vector<float> output((width() - 1) * output_stride() + channels());
    std::vector<float> output_ref(width() * channels());

    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); });
      std::generate(kernel.begin(), kernel.end(), [&]() { return f32dist(rng); });
      std::generate(bias.begin(), bias.end(), [&]() { return f32dist(rng); });
      std::fill(zero.begin(), zero.end(), 0.0f);
      std::fill(output_ref.begin(), output_ref.end(), nanf(""));
      std::fill(output.begin(), output.end(), nanf(""));

      std::fill(packed_weights.begin(), packed_weights.end(), 0.0f);
      xnn_pack_f32_dwconv_ghw_w(
        kr(), kr(), 1, channels(), cr(),
        kernel.data(), bias.data(), packed_weights.data(),
        0 /* extra bytes */, nullptr);
      for (size_t i = 0; i < indirection.size(); i++) {
        indirection[i] = input.data() + i * channels() - input_offset();
      }
      std::shuffle(indirection.begin(), indirection.end(), rng);
      if (zero_index() != SIZE_MAX) {
        for (size_t i = 0; i < indirection.size(); i += kr()) {
          indirection[i + zero_index()] = zero.data();
        }
      }

      // Compute reference results, without clamping.
      for (size_t x = 0; x < width(); x++) {
        for (size_t c = 0; c < channels(); c++) {
          float acc = bias[c];
          for (size_t k = 0; k < kr(); k++) {
            if (indirection[x * step() + k] != zero.data()) {
              acc += indirection[x * step() + k][c + input_offset()] * kernel[c * kr() + k];
            }
          }
          output_ref[x * channels() + c] = acc;
        }
      }

      // Compute clamping parameters.
      const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend());
      const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend());
      const float accumulated_range = accumulated_max - accumulated_min;
      const float output_min = accumulated_min + accumulated_range / 255.0f * float(qmin());
      const float output_max = accumulated_max - accumulated_range / 255.0f * float(255 - qmax());

      // Prepare parameters.
      xnn_f32_minmax_params params;
      init_params(&params, output_min, output_max);

      // Clamp reference results.
      for (float& output_val : output_ref) {
        output_val = std::max(std::min(output_val, output_max), output_min);
      }

      // Call optimized micro-kernel.
      dwconv_minmax(
        channels(), width(),
        indirection.data(), packed_weights.data(), output.data(),
        step() * sizeof(void*),
        (output_stride() - channels()) * sizeof(float),
        input_offset() * sizeof(float), zero.data(),
        &params);

      // Verify results.
      for (size_t x = 0; x < width(); x++) {
        for (size_t c = 0; c < channels(); c++) {
          ASSERT_GE(output[x * output_stride() + c], output_min)
            << "x = " << x << ", channel = " << c;
          ASSERT_LE(output[x * output_stride() + c], output_max)
            << "x = " << x << ", channel = " << c;
          ASSERT_NEAR(
            output_ref[x * channels() + c],
            output[x * output_stride() + c],
            std::abs(output_ref[x * channels() + c]) * 1.0e-5)
            << "x = " << x << ", channel = " << c;
        }
      }
    }
  }

 private:
  uint32_t channels_{1};
  uint32_t cr_{1};
  uint32_t kr_{1};
  uint32_t width_{1};
  uint32_t step_{1};
  uint32_t output_stride_{0};
  uint8_t input_zero_point_{127};
  uint8_t kernel_zero_point_{127};
  uint8_t qmin_{0};
  uint8_t qmax_{255};
  size_t input_offset_{0};
  size_t zero_index_{SIZE_MAX};
  size_t iterations_{3};
};
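
// Usage sketch (illustrative, not part of the tester): a gtest case sets the
// dimensions a micro-kernel was built for and hands it over together with its
// params-init function. The ukernel name below is a hypothetical placeholder;
// substitute any matching unipass dwconv micro-kernel from XNNPACK.
//
//   TEST(F32_DWCONV_MINMAX_EXAMPLE, channels_eq_cr) {
//     DWConvMicrokernelTester()
//       .cr(4)        // channel tile the kernel processes per iteration
//       .kr(9)        // kernel taps per output pixel, e.g. a 3x3 window
//       .channels(4)
//       .width(3)
//       .qmin(128)    // exercise the lower clamping bound
//       .Test(xnn_f32_dwconv_minmax_ukernel_placeholder,  // hypothetical name
//             xnn_init_f32_minmax_scalar_params);
//   }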