/*
 * Copyright (c) 2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#pragma once

#include "arm_gemm.hpp"
#include "utils.hpp"

#include "mergeresults.hpp"
#include "transform.hpp"

#ifdef CYCLE_PROFILING
#include "profiler.hpp"
#endif

#include <algorithm>
#include <cassert>
#include <cmath>

// Some macros used to decide how much working space to allocate.
// Round allocations up to the next cache line.
#define ALLOC_ROUND 64
#define ROUND_UP(x) ((((x) + ALLOC_ROUND-1) / ALLOC_ROUND) * ALLOC_ROUND)

// Implementation of the GemmCommon abstract class.
//
// This implementation interleaves the source matrices in blocks - good for
// larger matrices.
namespace arm_gemm {

template<typename strategy, typename To, typename Tr>
class GemmInterleavedPretransposed2d : public GemmCommon<To, Tr> {
    typedef typename strategy::operand_type Toi;
    typedef typename strategy::result_type Tri;

    /* const properties set by constructor */
    const CPUInfo * const _ci;

    const unsigned int _Msize;
    const unsigned int _Nsize;
    const unsigned int _Ksize;

    const unsigned int _nbatches;
    const unsigned int _nmulti;

    const Activation _act;

    const int _maxthreads;
    int _nthreads;

    /* Blocking info */
    unsigned int _k_block=0;
    unsigned int _x_block=0;

    unsigned int _Mround_div=0;
    unsigned int _Mround=0;
    unsigned int _Nround_div=0;
    unsigned int _Nround=0;

    /* Working space, pretransposed buffer */
    const Toi *_B_transposed=nullptr;
    void *_working_space=nullptr;
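
    /* Note on the blocking parameters above (see get_k_block_size() and the
     * constructor below for how they are chosen):
     *  - _k_block: size of the K-dimension slice processed per pass, sized
     *    against half the L1 cache.
     *  - _x_block: size of the N-dimension slice of B kept live at once, sized
     *    against the L2 cache.
     *  - _Mround/_Nround: problem dimensions rounded up to whole kernel tiles
     *    of out_height() x out_width(). */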

    /* We will need to walk through the blocks of B in a few contexts, so
     * factor that out. */
    class blockwalker {
    private:
        /* Size loops, etc. based on our parent's configuration */
        const GemmInterleavedPretransposed2d<strategy, To, Tr> &_parent;

        /* K, X and multi parameters for the current iteration. */
        unsigned int _k0=0, _x0=0, _xmin=0, _xmax=0, _multi=0;

        unsigned int _index=0;
        bool _done=false;
        bool _newkblock=true;
        bool _newmulti=true;

    public:
        blockwalker(const GemmInterleavedPretransposed2d<strategy, To, Tr> &parent)
        : _parent(parent)
        , _xmax { parent._Nsize }
        { }

        blockwalker(const GemmInterleavedPretransposed2d<strategy, To, Tr> &parent, unsigned int x0, unsigned int xmax)
        : _parent(parent)
        , _x0 { x0 }
        , _xmin { x0 }
        , _xmax { xmax }
        {
            assert(_x0 <= _xmax);
        }

        unsigned int xmax() {
            return std::min(_x0 + _parent._x_block, _xmax);
        }

        unsigned int kmax() {
            return std::min(_k0 + _parent._k_block, _parent._Ksize);
        }

        /* Advance to the next block, return false at the end. */
        bool advance(void) {
            if (_done) {
                return false;
            }

            _newkblock=false;
            _x0 += _parent._x_block;
            if (_x0 >= _xmax) {
                _x0=_xmin;
                _k0 += _parent._k_block;
                if (_k0 >= _parent._Ksize) {
                    _k0=0;
                    _multi++;
                    if (_multi >= _parent._nmulti) {
                        _done=true;
                        return false;
                    }
                    _newmulti=true;
                }
                _newkblock=true;
            }
            _index++;

            return true;
        }

        unsigned int k0(void) { return _k0; }
        unsigned int x0(void) { return _x0; }
        unsigned int multi(void) { return _multi; }
        unsigned int index(void) { return _index; }
        bool done(void) { return _done; }
        bool newkblock(void) { return _newkblock; }
    };

    // A working size: One of these needed, regardless of thread count. Divided according to window.
    size_t get_a_working_size() const {
        return ROUND_UP(sizeof(Toi) * _k_block * _Mround * _nbatches) * 2;
    }

    // As B will be pretransposed, we do not need to allocate any space for it.
    size_t get_b_working_size() const {
        return 0;
    }

    // C working size: One needed per thread.
    size_t get_c_working_size() const {
        return ROUND_UP(sizeof(Tri) * _x_block * strategy::out_height());
    }
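
    /* Working space layout (as carved up in execute_pretranspose() and sized in
     * get_working_size() below): _maxthreads C panels first, then _maxthreads A
     * panels, each rounded up to a whole cache line by ROUND_UP.  For example
     * (hypothetical figures, for illustration only): with sizeof(Tri)==4,
     * _x_block==528 and out_height()==8, each C panel occupies
     * ROUND_UP(4*528*8) == 16896 bytes. */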

    // Internal execute function.
    // This implements the "pretransposed" interface; B must already have been packed via pretranspose_B_array().
    void execute_pretranspose(unsigned int m_start, unsigned int m_end, unsigned int n_start, unsigned int n_end, int threadid, int, int) {
        /* Make sure we've been set up correctly. */
        assert(_B_transposed);
        assert(_working_space);
        assert(this->_Aptr);
        assert(this->_Cptr);

#ifdef CYCLE_PROFILING
        profiler prof;
#endif
        strategy strat(_ci);

        /* Translate 'start' and 'end' into a position within the batches and rows. */
        const unsigned int window_per_batch = _Mround / strategy::out_height();
        unsigned int batch_0   = m_start / window_per_batch;
        unsigned int batch_end = m_end / window_per_batch;

        /* Compute the M values to operate on */
        unsigned int m_0   = (m_start - (batch_0 * window_per_batch)) * strategy::out_height();
        unsigned int m_max = (m_end - (batch_end * window_per_batch)) * strategy::out_height();

        unsigned int n_0   = std::min(this->_Nsize, strategy::out_width() * n_start);
        unsigned int n_max = std::min(this->_Nsize, strategy::out_width() * n_end);

        blockwalker current(*this, n_0, n_max);

        int8_t *working_space_bytes = reinterpret_cast<int8_t *>(_working_space);

        auto c_panel_start = working_space_bytes;
        auto a_panel_start = c_panel_start + get_c_working_size() * _maxthreads;

        auto c_panel = reinterpret_cast<Tri *>(c_panel_start + get_c_working_size() * threadid);
        auto a_panel = reinterpret_cast<Toi *>(a_panel_start + get_a_working_size() * threadid);

        /* B^t is stored in interleaved panels separated by their K-block component.
         * We keep a pointer to the start of the current k-page; when we move on
         * to the next k-block, we simply add the size of the previous page to
         * this base pointer. */
        const Toi *b_panel_start = _B_transposed;
        // b_panel points to the start of our current block inside of the k-block
        const Toi *b_panel = b_panel_start;

        // newkblock() is always true on the first iteration, so these will be set properly on the first pass of the loop.
        unsigned b_page_size = 0;
        int kern_k = 0;
        for (;!current.done();current.advance()) {
            int bblocks = iceildiv(current.xmax() - current.x0(), strategy::out_width());

            if (current.newkblock()) {
                kern_k = iceildiv(current.kmax() - current.k0(), strategy::k_unroll());
                kern_k *= strat.k_unroll();

                unsigned b_thread_start_offset = iceildiv(current.x0(), strategy::out_width());

                b_panel_start += b_page_size;
                b_panel        = b_panel_start + (b_thread_start_offset * strat.out_width() * kern_k);
                b_page_size    = _Nround * kern_k;
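
                /* The A panels for every batch in this thread's M range are
                 * re-packed once per k-block here, then re-used unchanged for
                 * every x-block visited within that k-block (the blockwalker
                 * advances the x dimension innermost). */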

                for (unsigned int batch = batch_0; batch <= batch_end; batch++) {
                    unsigned int first_m = (batch == batch_0)   ? m_0   : 0;
                    unsigned int last_m  = (batch == batch_end) ? m_max : _Msize;

                    if (first_m >= last_m)
                        continue;

                    auto a_thread_panel_in = this->_Aptr
                                           + (batch * this->_A_batch_stride)
                                           + (current.multi() * this->_A_multi_stride);

                    auto a_thread_panel_out = a_panel + ((batch * _Mround + first_m) * _k_block);

                    strat.transforms.PrepareA(
                        a_thread_panel_out,
                        a_thread_panel_in,
                        this->_lda,
                        first_m,
                        last_m,
                        current.k0(),
                        current.kmax(),
                        0);
                }
            }

            /* Do the actual work. */
            for (unsigned int batch = batch_0; batch <= batch_end; batch++) {
                unsigned int first_m = (batch == batch_0)   ? m_0   : 0;
                unsigned int last_m  = (batch == batch_end) ? m_max : _Msize;

                const Toi *a_ptr = a_panel + (batch * _Mround + first_m) * _k_block;

                if (first_m >= last_m)
                    continue;

                for (unsigned int y=first_m; y<last_m; y+=strategy::out_height()) {
                    unsigned int ymax = std::min(_Msize, y + strategy::out_height());

                    strat.kernel(a_ptr, b_panel, c_panel, 1, bblocks, kern_k);
                    a_ptr += (strategy::out_height() * kern_k);

                    /* Only activate on last pass, only add bias on first pass, ask for accumulation on any non-first pass */
                    const bool first_pass = current.k0()==0;
                    const bool last_pass  = current.kmax()==_Ksize;

                    auto c_panel_out = this->_Cptr
                                     + this->_C_batch_stride * batch
                                     + this->_C_multi_stride * current.multi();

                    auto bias = (first_pass && this->_bias)
                              ? this->_bias + (current.multi() * this->_bias_multi_stride)
                              : nullptr;

                    auto act = last_pass ? _act : Activation();

                    strat.transforms.Merge(
                        c_panel_out,
                        c_panel,
                        this->_ldc,
                        y,
                        ymax,
                        current.x0(),
                        current.xmax(),
                        bias,
                        act,
                        !first_pass); // Append
                }
            }

            b_panel += (bblocks * strat.out_width() * kern_k);
        }
    }

    static unsigned int get_k_block_size(const GemmArgs &args) {
        // Work out blocking parameters, or override from provided GemmConfig
        if (args._cfg && args._cfg->inner_block_size) {
            return args._cfg->inner_block_size;
        }

        const unsigned int L1_size = args._ci->get_L1_cache_size();
        unsigned int k_block;

        // k_block: Find out how much of the larger array can be loaded into half the cache.
        // This should account for associative caches.
        k_block = (L1_size / 2) / (sizeof(Toi) * (std::max(strategy::out_width(), strategy::out_height())));

        // Needs to be (at least a single) multiple of the K unroll level.
        k_block /= strategy::k_unroll();
        k_block = std::max(k_block, 1U) * strategy::k_unroll();

        // Now tune to presented problem size; this is how many blocks we need.
        unsigned int numk_blocks = iceildiv(args._Ksize, k_block);

        // So divide the space equally into that many blocks.
        k_block = iceildiv(args._Ksize, numk_blocks);

        // And round UP to the K unroll level required.
        k_block = iceildiv(k_block, strategy::k_unroll());
        k_block *= strategy::k_unroll();

        return k_block;
    }
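
    /* Illustrative example of the k_block calculation above (the figures are
     * hypothetical, not taken from any particular CPU): with a 32KB L1, 4-byte
     * operands, out_width()==12, out_height()==8 and k_unroll()==1, the cache
     * target gives k_block = 16384 / (4 * 12) = 341.  For _Ksize==1000 that is
     * then split into iceildiv(1000, 341) == 3 blocks of iceildiv(1000, 3) == 334
     * each, i.e. three roughly equal k-blocks rather than 341+341+318. */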

public:
    GemmInterleavedPretransposed2d(GemmInterleavedPretransposed2d &) = delete;
    GemmInterleavedPretransposed2d & operator= (GemmInterleavedPretransposed2d &) = delete;

    /* Constructor */
    GemmInterleavedPretransposed2d(const GemmArgs &args)
    : _ci(args._ci)
    , _Msize(args._Msize)
    , _Nsize(args._Nsize)
    , _Ksize(args._Ksize)
    , _nbatches(args._nbatches)
    , _nmulti(args._nmulti)
    , _act(args._act)
    , _maxthreads(args._maxthreads)
    , _nthreads(args._maxthreads)
    , _k_block(get_k_block_size(args))
    // Work out the rounded size of M - needed for some buffers.
    , _Mround_div ( iceildiv(_Msize, strategy::out_height()) )
    , _Mround     ( _Mround_div * strategy::out_height()     )

    , _Nround_div ( iceildiv(_Nsize, strategy::out_width()) )
    , _Nround     ( _Nround_div * strategy::out_width()     )
    {
        assert(_maxthreads > 0);

        const unsigned int L2_size = _ci->get_L2_cache_size();

        if (args._cfg && args._cfg->outer_block_size) {
            _x_block = args._cfg->outer_block_size;
        } else {
            // x_block: Work out how many rows (of length k_block) will fit in the L2.
            // Don't allocate more than 90% of the L2 to allow for overheads, and subtract off the L1 contents.
            _x_block = (((L2_size * 9) / 10) - (_k_block * sizeof(Toi) * (strategy::out_width() + strategy::out_height()))) /
                       (sizeof(Toi) * _k_block);

            // Needs to be (at least a single) multiple of the kernel output width.
            _x_block /= strategy::out_width();
            _x_block = std::max(_x_block, 1U) * strategy::out_width();

            // And tune to the presented problem size.
            unsigned int num_x_blocks = iceildiv(_Nsize, _x_block);
            _x_block = iceildiv(_Nsize, num_x_blocks);

            _x_block = iceildiv(_x_block, strategy::out_width());
            _x_block *= strategy::out_width();
        }
    }

    // Interface implementation - Compulsory functions
    ndrange_t get_window_size() const override {
        unsigned m = (_Mround / strategy::out_height()) * _nbatches;
        unsigned n = _Nround_div;

        return { m, n };
    }

    bool supports_dynamic_scheduling() const override {
        return true;
    }

    // set_nthreads: pass on to buffer manager to avoid it waiting for non-existent threads.
    void set_nthreads(int nthreads) override {
        _nthreads = std::min(nthreads, _maxthreads);
    }

    void execute(const ndcoord_t& work_range, const ndcoord_t& thread_locator, int threadid) override {
        /* This particular GEMM implementation can only be broken up over the M and N
         * dimensions; we inform the framework of this limitation via the
         * get_window_size() function. */
        const auto m_start = work_range.get_position(0);
        const auto n_start = work_range.get_position(1);
        const auto m_size  = work_range.get_size(0);
        const auto n_size  = work_range.get_size(1);
        const auto m_end   = m_start + m_size;
        const auto n_end   = n_start + n_size;

        const auto m_threadid = thread_locator.get_position(0);
        const auto n_threadid = thread_locator.get_position(1);

        execute_pretranspose(m_start, m_end, n_start, n_end, threadid, m_threadid, n_threadid);
    }
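
    /* Illustrative example of the 2-D window (hypothetical figures): with
     * _Msize==100, out_height()==8 and _nbatches==2, the M dimension of the
     * window is (104/8)*2 == 26; with _Nsize==60 and out_width()==12 the N
     * dimension is 5.  A scheduler can hand each thread any rectangular
     * sub-range of that 26x5 iteration space, and execute() translates it back
     * into batch/row/column ranges for execute_pretranspose(). */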

    std::size_t get_working_size() const override {
        /* Because we do not know how the scheduler will break up the task, we
         * allocate enough space for the worst case: every thread gets its own
         * C working buffer and its own A working buffer, plus a little extra
         * to allow the whole area to be aligned to a cache line. */
        return get_c_working_size() * _maxthreads
             + get_a_working_size() * _maxthreads
             + 64; // to account for cacheline alignment
    }

    void set_working_space(void *working_space) override {
        // Make sure everything ends up cache line aligned
        int8_t *working_space_bytes = reinterpret_cast<int8_t *>(working_space);
        intptr_t working_space_int  = reinterpret_cast<intptr_t>(working_space);

        size_t diff=0;

        if (working_space_int & 0x3F) {
            diff = 0x40 - (working_space_int & 0x3F);
        }

        working_space_bytes += diff;

        _working_space = reinterpret_cast<void *>(working_space_bytes);
    }

    // Interface implementation - pretransposed
    bool B_is_pretransposed() const override {
        return true;
    }

    bool B_pretranspose_required() const override {
        return _B_transposed==nullptr;
    }

    // TODO: this could almost certainly be considerably simpler.
    size_t get_B_pretransposed_array_size() const override {
        size_t total=0;
        blockwalker current(*this);

        do {
            /* Figure out the size of each block. */
            unsigned int x_size = (current.xmax() - current.x0());
            unsigned int k_size = (current.kmax() - current.k0());

            /* Round sizes up as needed. */
            x_size = iceildiv(x_size, strategy::out_width());
            x_size *= strategy::out_width();

            k_size = iceildiv(k_size, strategy::k_unroll());
            k_size *= strategy::k_unroll();

            total += x_size * k_size * sizeof(Toi);
        } while (current.advance());

        return total;
    }
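
    /* Note on layout: pretranspose_B_array() below writes the blocks in the same
     * order that blockwalker visits them (all x-blocks of a k-block, then the
     * next k-block, then the next multi), with each block rounded up to a whole
     * number of out_width() columns and k_unroll() rows.
     * get_B_pretransposed_array_size() above sums exactly the same sequence of
     * blocks, so the two always agree on the buffer size. */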

    void pretranspose_B_array(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override {
        blockwalker current(*this);
        Toi *buffer = reinterpret_cast<Toi *>(in_buffer);
        _B_transposed = buffer;
        strategy strat(_ci);

        do {
            /* Figure out the size of each block. */
            unsigned int x_size = (current.xmax() - current.x0());
            unsigned int k_size = (current.kmax() - current.k0());

            /* Round sizes up as needed. */
            x_size = iceildiv(x_size, strategy::out_width());
            x_size *= strategy::out_width();

            k_size = iceildiv(k_size, strategy::k_unroll());
            k_size *= strategy::k_unroll();

            strat.transforms.PrepareB(buffer, B + (current.multi() * B_multi_stride), ldb,
                                      current.x0(), current.xmax(), current.k0(), current.kmax());

            buffer += (x_size * k_size);
        } while (current.advance());
    }

    void set_pretransposed_B_data(void *in_buffer) override {
        _B_transposed = reinterpret_cast<Toi *>(in_buffer);
    }

    // Estimate cycles for the given problem, using the provided performance parameters.
    static uint64_t estimate_cycles(const GemmArgs &args, const PerformanceParameters &params) {
        unsigned int k_blocks = iceildiv(args._Ksize, get_k_block_size(args));
        unsigned int m_blocks = iceildiv(args._Msize, strategy::out_height()) * args._nbatches;
        unsigned int n_blocks = iceildiv(args._Nsize, strategy::out_width());

        uint64_t total_macs    = static_cast<uint64_t>(args._nbatches) * args._nmulti * roundup(args._Msize, strategy::out_height()) * roundup(args._Nsize, strategy::out_width()) * roundup(args._Ksize, strategy::k_unroll());
        uint64_t prepare_bytes = static_cast<uint64_t>(args._nbatches) * args._nmulti * roundup(args._Msize, strategy::out_height()) * roundup(args._Ksize, strategy::k_unroll()) * sizeof(Toi);
        uint64_t merge_bytes   = static_cast<uint64_t>(args._nbatches) * args._nmulti * k_blocks * roundup(args._Msize, strategy::out_height()) * roundup(args._Nsize, strategy::out_width()) * sizeof(Tr);

        // Wide problems incur extra preparation cost, as it is done per thread.
        // Duplicate the logic the scheduler will later use to figure out how much that will affect us.
        float ratio = m_blocks / static_cast<float>(n_blocks);

        unsigned int ideal_height = static_cast<unsigned int>(std::sqrt(args._maxthreads * ratio) + 0.5);
        unsigned int height = 1;

        if (ideal_height == 0) {
            height = 1;
        } else {
            for (unsigned int adj=0; adj<ideal_height; adj++) {
                const unsigned int round_down = ideal_height - adj;
                if (args._maxthreads % round_down == 0) {
                    height = round_down;
                    break;
                }

                const unsigned int round_up = ideal_height + adj;
                if (args._maxthreads % round_up == 0) {
                    height = round_up;
                    break;
                }
            }
        }

        // We've computed the height here - we need to multiply the amount of preparation effort by the width (which is total threads / height).
        prepare_bytes *= (args._maxthreads / height);

        float mac_cycles     = static_cast<float>(total_macs) / params.kernel_macs_cycle;
        float prepare_cycles = static_cast<float>(prepare_bytes) / params.prepare_bytes_cycle;
        float merge_cycles   = static_cast<float>(merge_bytes) / params.merge_bytes_cycle;

        float total_cycles = mac_cycles + prepare_cycles + merge_cycles;

        // We can't thread over multis, which might be a problem in some
        // threaded cases. Penalize that here.
        float parallelism_available = static_cast<float>(iceildiv(args._Msize, strategy::out_height()) * args._nbatches * iceildiv(args._Nsize, strategy::out_width())) * 0.9;

        if (parallelism_available < args._maxthreads) {
            total_cycles *= (static_cast<float>(args._maxthreads) / parallelism_available);
        }

        return static_cast<uint64_t>(total_cycles);
    }
};

} // namespace arm_gemm
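
/* Illustrative usage sketch (not part of the original header).  'SomeStrategy'
 * is a placeholder for a real arm_gemm kernel strategy; the calls that populate
 * the A/B/C/bias pointers come from the GemmCommon base interface and are not
 * shown here.
 *
 *     arm_gemm::GemmInterleavedPretransposed2d<SomeStrategy, float, float> gemm(args);
 *
 *     // 1. Pretranspose B into a caller-provided buffer.
 *     std::vector<uint8_t> b_buf(gemm.get_B_pretransposed_array_size());
 *     gemm.pretranspose_B_array(b_buf.data(), B, ldb, B_multi_stride);
 *
 *     // 2. Provide working space (cache-line alignment is handled internally).
 *     std::vector<uint8_t> ws(gemm.get_working_size());
 *     gemm.set_working_space(ws.data());
 *
 *     // 3. Split the 2-D window from get_window_size() into sub-ranges and
 *     //    call execute() on each, one sub-range per thread.
 */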