1 // Copyright (c) Facebook, Inc. and its affiliates. 2 // All rights reserved. 3 // 4 // Copyright 2019 Google LLC 5 // 6 // This source code is licensed under the BSD-style license found in the 7 // LICENSE file in the root directory of this source tree. 8 9 #pragma once 10 11 #include <gtest/gtest.h> 12 13 #include <algorithm> 14 #include <cassert> 15 #include <cmath> 16 #include <cstddef> 17 #include <cstdlib> 18 #include <functional> 19 #include <random> 20 #include <vector> 21 22 #include <xnnpack.h> 23 #include <xnnpack/AlignedAllocator.h> 24 #include <xnnpack/pack.h> 25 #include <xnnpack/params-init.h> 26 #include <xnnpack/params.h> 27 #include <xnnpack/requantization.h> 28 29 30 class DWConvMicrokernelTester { 31 public: 32 enum class Variant { 33 Native, 34 Scalar, 35 }; 36 width(uint32_t width)37 inline DWConvMicrokernelTester& width(uint32_t width) { 38 assert(width >= 1); 39 this->width_ = width; 40 return *this; 41 } 42 width()43 inline uint32_t width() const { 44 return this->width_; 45 } 46 step(uint32_t step)47 inline DWConvMicrokernelTester& step(uint32_t step) { 48 assert(step >= 1); 49 this->step_ = step; 50 return *this; 51 } 52 step()53 inline uint32_t step() const { 54 return this->step_; 55 } 56 channels(uint32_t channels)57 inline DWConvMicrokernelTester& channels(uint32_t channels) { 58 assert(channels >= 1); 59 this->channels_ = channels; 60 return *this; 61 } 62 channels()63 inline uint32_t channels() const { 64 return this->channels_; 65 } 66 cr(uint32_t cr)67 inline DWConvMicrokernelTester& cr(uint32_t cr) { 68 assert(cr != 0); 69 assert((cr & (cr - 1)) == 0); 70 this->cr_ = cr; 71 return *this; 72 } 73 cr()74 inline uint32_t cr() const { 75 return this->cr_; 76 } 77 kr(uint32_t kr)78 inline DWConvMicrokernelTester& kr(uint32_t kr) { 79 assert(kr != 0); 80 this->kr_ = kr; 81 return *this; 82 } 83 kr()84 inline uint32_t kr() const { 85 return this->kr_; 86 } 87 packed_channels()88 inline uint32_t packed_channels() const { 89 return (channels() / cr() + !!(channels() % cr())) * cr(); 90 } 91 output_stride(uint32_t output_stride)92 inline DWConvMicrokernelTester& output_stride(uint32_t output_stride) { 93 assert(output_stride != 0); 94 this->output_stride_ = output_stride; 95 return *this; 96 } 97 output_stride()98 inline uint32_t output_stride() const { 99 if (this->output_stride_ == 0) { 100 return channels(); 101 } else { 102 assert(this->output_stride_ >= channels()); 103 return this->output_stride_; 104 } 105 } 106 input_zero_point(uint8_t input_zero_point)107 inline DWConvMicrokernelTester& input_zero_point(uint8_t input_zero_point) { 108 this->input_zero_point_ = input_zero_point; 109 return *this; 110 } 111 input_zero_point()112 inline uint8_t input_zero_point() const { 113 return this->input_zero_point_; 114 } 115 kernel_zero_point(uint8_t kernel_zero_point)116 inline DWConvMicrokernelTester& kernel_zero_point(uint8_t kernel_zero_point) { 117 this->kernel_zero_point_ = kernel_zero_point; 118 return *this; 119 } 120 kernel_zero_point()121 inline uint8_t kernel_zero_point() const { 122 return this->kernel_zero_point_; 123 } 124 qmin(uint8_t qmin)125 inline DWConvMicrokernelTester& qmin(uint8_t qmin) { 126 this->qmin_ = qmin; 127 return *this; 128 } 129 qmin()130 inline uint8_t qmin() const { 131 return this->qmin_; 132 } 133 qmax(uint8_t qmax)134 inline DWConvMicrokernelTester& qmax(uint8_t qmax) { 135 this->qmax_ = qmax; 136 return *this; 137 } 138 qmax()139 inline uint8_t qmax() const { 140 return this->qmax_; 141 } 142 iterations(size_t iterations)143 inline DWConvMicrokernelTester& iterations(size_t iterations) { 144 this->iterations_ = iterations; 145 return *this; 146 } 147 iterations()148 inline size_t iterations() const { 149 return this->iterations_; 150 } 151 152 void Test(xnn_q8_dwconv_up_ukernel_function dwconv, Variant variant = Variant::Native) const { 153 std::random_device random_device; 154 auto rng = std::mt19937(random_device()); 155 auto s32rng = std::bind(std::uniform_int_distribution<int32_t>(-10000, 10000), rng); 156 auto u8rng = std::bind(std::uniform_int_distribution<uint8_t>(), rng); 157 158 std::vector<const uint8_t*> indirection((width() - 1) * step() + kr()); 159 std::vector<uint8_t> input(XNN_EXTRA_BYTES / sizeof(uint8_t) + indirection.size() * channels()); 160 std::vector<uint8_t> kernel(channels() * kr()); 161 std::vector<int32_t> bias(channels()); 162 std::vector<uint8_t, AlignedAllocator<uint8_t, 64>> packed_weights((kr() + sizeof(int32_t) / sizeof(uint8_t)) * packed_channels()); 163 std::vector<uint8_t> output((width() - 1) * output_stride() + channels()); 164 std::vector<int32_t> accumulators(width() * channels()); 165 std::vector<uint8_t> output_ref(width() * channels()); 166 167 for (size_t iteration = 0; iteration < iterations(); iteration++) { 168 do { 169 std::generate(input.begin(), input.end(), std::ref(u8rng)); 170 } while (input.size() > 1 && *std::max_element(input.cbegin(), input.cend()) == *std::min_element(input.cbegin(), input.cend())); 171 do { 172 std::generate(kernel.begin(), kernel.end(), std::ref(u8rng)); 173 } while (kernel.size() > 1 && *std::max_element(kernel.cbegin(), kernel.cend()) == *std::min_element(kernel.cbegin(), kernel.cend())); 174 std::generate(bias.begin(), bias.end(), std::ref(s32rng)); 175 std::fill(output.begin(), output.end(), 0xA5); 176 177 std::fill(packed_weights.begin(), packed_weights.end(), 0); 178 xnn_pack_q8_dwconv_ghw_w( 179 kr(), 1, channels(), cr(), 180 input_zero_point(), kernel_zero_point(), 181 kernel.data(), bias.data(), packed_weights.data()); 182 for (size_t i = 0; i < indirection.size(); i++) { 183 indirection[i] = input.data() + i * channels(); 184 } 185 std::shuffle(indirection.begin(), indirection.end(), rng); 186 187 // Compute reference results, without renormalization. 188 for (size_t x = 0; x < width(); x++) { 189 for (size_t c = 0; c < channels(); c++) { 190 float acc = bias[c]; 191 for (size_t k = 0; k < kr(); k++) { 192 acc += 193 (int32_t(indirection[x * step() + k][c]) - int32_t(input_zero_point())) * 194 (int32_t(kernel[c * kr() + k]) - int32_t(kernel_zero_point())); 195 } 196 accumulators[x * channels() + c] = acc; 197 } 198 } 199 200 // Compute renormalization parameters. 201 const int32_t accumulated_min = *std::min_element(accumulators.cbegin(), accumulators.cend()); 202 const int32_t accumulated_max = *std::max_element(accumulators.cbegin(), accumulators.cend()); 203 const uint32_t accumulated_range = uint32_t(accumulated_max) - uint32_t(accumulated_min); 204 const double output_scale = accumulated_range >= 256 ? double(accumulated_range) / 255.0 : 1.00001; 205 const uint8_t output_zero_point = uint8_t(std::max(std::min( 206 lrint(127.5 - 0.5 * double(accumulated_min + accumulated_max) / output_scale), 207 long(std::numeric_limits<uint8_t>::max())), long(std::numeric_limits<uint8_t>::min()))); 208 209 // Prepare convolution parameters. 210 const float requantization_scale = 1.0f / float(output_scale); 211 union xnn_q8_gemm_params quantization_params = { }; 212 switch (variant) { 213 case Variant::Native: 214 quantization_params = xnn_init_q8_gemm_params( 215 input_zero_point(), kernel_zero_point(), 216 requantization_scale, output_zero_point, qmin(), qmax()); 217 break; 218 case Variant::Scalar: 219 quantization_params = xnn_init_scalar_q8_gemm_params( 220 input_zero_point(), kernel_zero_point(), 221 requantization_scale, output_zero_point, qmin(), qmax()); 222 break; 223 } 224 const union xnn_q31_requantization_params scalar_requantization_params = 225 xnn_init_scalar_requantization_params( 226 requantization_scale, output_zero_point, qmin(), qmax()); 227 228 // Renormalize reference results. 229 for (size_t x = 0; x < width(); x++) { 230 for (size_t c = 0; c < channels(); c++) { 231 output_ref[x * channels() + c] = xnn_q31_requantize(accumulators[x * channels() + c], scalar_requantization_params); 232 } 233 } 234 235 // Call optimized micro-kernel. 236 dwconv( 237 channels(), width(), 238 indirection.data(), packed_weights.data(), output.data(), 239 step() * sizeof(void*), 240 (output_stride() - channels()) * sizeof(uint8_t), 241 &quantization_params); 242 243 // Verify results. 244 for (size_t x = 0; x < width(); x++) { 245 for (size_t c = 0; c < channels(); c++) { 246 ASSERT_GE(uint32_t(output[x * output_stride() + c]), uint32_t(qmin())) 247 << "x = " << x << ", channel = " << c; 248 ASSERT_LE(uint32_t(output[x * output_stride() + c]), uint32_t(qmax())) 249 << "x = " << x << ", channel = " << c; 250 ASSERT_EQ(uint32_t(output[x * output_stride() + c]), uint32_t(output_ref[x * channels() + c])) 251 << "x = " << x << ", channel = " << c << ", accumulator = " << accumulators[x * channels() + c]; 252 } 253 } 254 } 255 } 256 257 void Test(xnn_f32_dwconv_up_ukernel_function dwconv, Variant variant = Variant::Native) const { 258 std::random_device random_device; 259 auto rng = std::mt19937(random_device()); 260 auto f32rng = std::bind(std::uniform_real_distribution<float>(0.0f, 1.0f), rng); 261 262 std::vector<const float*> indirection((width() - 1) * step() + kr()); 263 std::vector<float> input(XNN_EXTRA_BYTES / sizeof(float) + indirection.size() * channels()); 264 std::vector<float> kernel(channels() * kr()); 265 std::vector<float> bias(channels()); 266 std::vector<float, AlignedAllocator<float, 64>> packed_weights((kr() + 1) * packed_channels()); 267 std::vector<float> output((width() - 1) * output_stride() + channels()); 268 std::vector<float> output_ref(width() * channels()); 269 270 for (size_t iteration = 0; iteration < iterations(); iteration++) { 271 std::generate(input.begin(), input.end(), std::ref(f32rng)); 272 std::generate(kernel.begin(), kernel.end(), std::ref(f32rng)); 273 std::generate(bias.begin(), bias.end(), std::ref(f32rng)); 274 std::fill(output_ref.begin(), output_ref.end(), nanf("")); 275 std::fill(output.begin(), output.end(), nanf("")); 276 277 std::fill(packed_weights.begin(), packed_weights.end(), 0.0f); 278 xnn_pack_f32_dwconv_ghw_w( 279 kr(), 1, channels(), cr(), 280 kernel.data(), bias.data(), packed_weights.data()); 281 for (size_t i = 0; i < indirection.size(); i++) { 282 indirection[i] = input.data() + i * channels(); 283 } 284 std::shuffle(indirection.begin(), indirection.end(), rng); 285 286 // Compute reference results, without clamping. 287 for (size_t x = 0; x < width(); x++) { 288 for (size_t c = 0; c < channels(); c++) { 289 float acc = bias[c]; 290 for (size_t k = 0; k < kr(); k++) { 291 acc += indirection[x * step() + k][c] * kernel[c * kr() + k]; 292 } 293 output_ref[x * channels() + c] = acc; 294 } 295 } 296 297 // Compute clamping parameters. 298 const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend()); 299 const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend()); 300 const float accumulated_range = accumulated_max - accumulated_min; 301 const float output_min = accumulated_min + accumulated_range / 255.0f * float(qmin()); 302 const float output_max = accumulated_max - accumulated_range / 255.0f * float(255 - qmax()); 303 304 // Prepare output parameters. 305 xnn_f32_output_params output_params = { }; 306 switch (variant) { 307 case Variant::Native: 308 output_params = xnn_init_f32_output_params(output_min, output_max); 309 break; 310 case Variant::Scalar: 311 output_params = xnn_init_scalar_f32_output_params(output_min, output_max); 312 break; 313 } 314 315 // Clamp reference results. 316 for (float& output_val : output_ref) { 317 output_val = std::max(std::min(output_val, output_max), output_min); 318 } 319 320 // Call optimized micro-kernel. 321 dwconv( 322 channels(), width(), 323 indirection.data(), packed_weights.data(), output.data(), 324 step() * sizeof(void*), 325 (output_stride() - channels()) * sizeof(float), 326 &output_params); 327 328 // Verify results. 329 for (size_t x = 0; x < width(); x++) { 330 for (size_t c = 0; c < channels(); c++) { 331 ASSERT_GE(output[x * output_stride() + c], output_min) 332 << "x = " << x << ", channel = " << c; 333 ASSERT_LE(output[x * output_stride() + c], output_max) 334 << "x = " << x << ", channel = " << c; 335 ASSERT_NEAR( 336 output_ref[x * channels() + c], 337 output[x * output_stride() + c], 338 std::abs(output_ref[x * channels() + c]) * 1.0e-5) 339 << "x = " << x << ", channel = " << c; 340 } 341 } 342 } 343 } 344 345 private: 346 uint32_t channels_{1}; 347 uint32_t cr_{1}; 348 uint32_t kr_{1}; 349 uint32_t width_{1}; 350 uint32_t step_{1}; 351 uint32_t output_stride_{0}; 352 uint8_t input_zero_point_{127}; 353 uint8_t kernel_zero_point_{127}; 354 uint8_t qmin_{0}; 355 uint8_t qmax_{255}; 356 size_t iterations_{3}; 357 }; 358