1 // Copyright 2015 The Gemmlowp Authors. All Rights Reserved. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // kernel_neon.h: a collection of NEON optimized kernels. 16 // Check in kernel_default.h which one(s) are actually used by default. 17 // Others are mere experiments; they are still covered by tests 18 // in case they might be useful some day. 19 20 #ifndef GEMMLOWP_INTERNAL_KERNEL_NEON_H_ 21 #define GEMMLOWP_INTERNAL_KERNEL_NEON_H_ 22 23 #include "kernel.h" 24 25 #include <arm_neon.h> 26 #include <cassert> 27 28 namespace gemmlowp { 29 30 // The kernels here are specifically arm 32bit assembly, not arm 64bit. 31 #ifdef GEMMLOWP_NEON_32 32 33 // Our main GEMM kernel. 34 struct NEON_32_Kernel12x4Depth2 : KernelBase { 35 typedef KernelFormat<KernelSideFormat<CellFormat<4, 2>, 3>, 36 KernelSideFormat<CellFormat<4, 2>, 1> > 37 Format; 38 NameNEON_32_Kernel12x4Depth239 const char* Name() const override { return "NEON, 12x4, depth 2"; } 40 41 // TODO(benoitjacob): reorder function arguments so dst comes last RunNEON_32_Kernel12x4Depth242 void Run(std::int32_t* dst_ptr, std::size_t dst_row_stride, 43 std::size_t dst_col_stride, const std::uint8_t* lhs_ptr, 44 const std::uint8_t* rhs_ptr, std::size_t start_depth, 45 std::size_t run_depth) const override { 46 ScopedProfilingLabel label("optimized kernel (NEON 12x4)"); 47 48 // For iOS assembler, the %= style of local labels cause compilation errors, 49 // so use numerical ones instead. See 50 // http://stackoverflow.com/questions/3898435/labels-in-gcc-inline-assembly 51 // If you add any labels, remember to undef them at the end. 52 #define GEMMLOWP_LABEL_CLEAR_ACCUMULATORS "1" 53 #define GEMMLOWP_LABEL_BEFORE_LOOP "2" 54 #define GEMMLOWP_LABEL_LOOP "3" 55 #define GEMMLOWP_LABEL_AFTER_LOOP "4" 56 57 assert(dst_row_stride == 1); 58 (void)dst_row_stride; 59 asm volatile( 60 // Overview of register layout: 61 // 62 // A 2x4 cell of Rhs is stored in 16bit in d0--d1 (q0). 63 // A 12x2 block of 3 4x2 cells Lhs is stored in 16bit in d2--d7 64 // (q1--q3). 65 // A 12x4 block of accumulators is stored in 32bit in q4--q15. 66 // 67 // +-----+-----+-----+-----+ 68 // |d0[0]|d0[1]|d0[2]|d0[3]| 69 // Rhs +-----+-----+-----+-----+ 70 // |d1[0]|d1[1]|d1[2]|d1[3]| 71 // +-----+-----+-----+-----+ 72 // 73 // | | | | | 74 // 75 // Lhs | | | | | 76 // 77 // +--+--+ - - - - +-----+-----+-----+-----+ 78 // |d2|d3| | q4 | q5 | q6 | q7 | 79 // |d2|d3| | q4 | q5 | q6 | q7 | 80 // |d2|d3| | q4 | q5 | q6 | q7 | 81 // |d2|d3| | q4 | q5 | q6 | q7 | 82 // +--+--+ - - - - +-----+-----+-----+-----+ 83 // |d4|d5| | q8 | q9 | q10 | q11 | 84 // |d4|d5| | q8 | q9 | q10 | q11 | 85 // |d4|d5| | q8 | q9 | q10 | q11 | 86 // |d4|d5| | q8 | q9 | q10 | q11 | 87 // +--+--+ - - - - +-----+-----+-----+-----+ 88 // |d6|d7| | q12 | q13 | q14 | q15 | 89 // |d6|d7| | q12 | q13 | q14 | q15 | 90 // |d6|d7| | q12 | q13 | q14 | q15 | 91 // |d6|d7| | q12 | q13 | q14 | q15 | 92 // +--+--+ - - - - +-----+-----+-----+-----+ 93 // 94 // Accumulator 95 96 // Load 1 Rhs cell of size 2x4 97 "vld1.8 {d0}, [%[rhs_ptr]]!\n" 98 // Load 3 Lhs cells of size 4x2 each 99 "vld1.8 {d2}, [%[lhs_ptr]]!\n" 100 "vld1.8 {d4}, [%[lhs_ptr]]!\n" 101 "vld1.8 {d6}, [%[lhs_ptr]]!\n" 102 103 // Check if start_depth==0 to decide whether we will clear 104 // accumulators or load existing accumulators. 105 "cmp %[start_depth], #0\n" 106 107 // Multiply dst_col_stride by 4 == sizeof(int32) to use 108 // it as a byte offset below. 109 "lsl %[dst_col_stride], #2\n" 110 111 "beq " GEMMLOWP_LABEL_CLEAR_ACCUMULATORS 112 "f\n" 113 114 // Load accumulators (start_depth != 0) 115 "mov r1, %[dst_ptr]\n" 116 "subs %[run_depth], #2\n" 117 "mov r0, r1\n" 118 "vld1.32 {d8, d9}, [r0]!\n" 119 "add r1, %[dst_col_stride]\n" 120 "vld1.32 {d16, d17}, [r0]!\n" 121 "vld1.32 {d24, d25}, [r0]\n" 122 "mov r0, r1\n" 123 "vld1.32 {d10, d11}, [r0]!\n" 124 "add r1, %[dst_col_stride]\n" 125 "vld1.32 {d18, d19}, [r0]!\n" 126 "vld1.32 {d26, d27}, [r0]\n" 127 "mov r0, r1\n" 128 "vld1.32 {d12, d13}, [r0]!\n" 129 "add r1, %[dst_col_stride]\n" 130 "vld1.32 {d20, d21}, [r0]!\n" 131 "vld1.32 {d28, d29}, [r0]\n" 132 "mov r0, r1\n" 133 "vld1.32 {d14, d15}, [r0]!\n" 134 "vld1.32 {d22, d23}, [r0]!\n" 135 "vld1.32 {d30, d31}, [r0]\n" 136 137 "b " GEMMLOWP_LABEL_BEFORE_LOOP "f\n" 138 139 GEMMLOWP_LABEL_CLEAR_ACCUMULATORS 140 ":\n" 141 142 // Clear accumulators (start_depth == 0) 143 "vmov.s32 q4, #0\n" 144 "subs %[run_depth], #2\n" 145 "vmov.s32 q8, q4\n" 146 "vmov.s32 q12, q4\n" 147 "vmov.s32 q5, q4\n" 148 "vmov.s32 q9, q4\n" 149 "vmov.s32 q13, q4\n" 150 "vmov.s32 q6, q4\n" 151 "vmov.s32 q10, q4\n" 152 "vmov.s32 q14, q4\n" 153 "vmov.s32 q7, q4\n" 154 "vmov.s32 q11, q4\n" 155 "vmov.s32 q15, q4\n" 156 157 GEMMLOWP_LABEL_BEFORE_LOOP 158 ":\n" 159 160 // If there are only two levels of depth, skip the loop. 161 "beq " GEMMLOWP_LABEL_AFTER_LOOP "f\n" 162 163 GEMMLOWP_LABEL_LOOP 164 ":\n" 165 // Expand Lhs/Rhs cells to 16 bit. 166 // Note: moving theses vmovls further down to allow for 167 // longer data pipelining helps a little on A57 but is 168 // harmful on A53 --- It looks as if A53 doesn't like 169 // interleaving vmovl's into the vmlal's. 170 "vmovl.u8 q0, d0\n" 171 "vmovl.u8 q1, d2\n" 172 "vmovl.u8 q2, d4\n" 173 "vmovl.u8 q3, d6\n" 174 175 // Multiply-accumulate, level of depth 0 176 "vmlal.u16 q4, d2, d0[0]\n" 177 "vmlal.u16 q5, d2, d0[1]\n" 178 "vmlal.u16 q6, d2, d0[2]\n" 179 "vmlal.u16 q7, d2, d0[3]\n" 180 "vldr d2, [%[lhs_ptr]]\n" 181 "vmlal.u16 q8, d4, d0[0]\n" 182 "vmlal.u16 q9, d4, d0[1]\n" 183 "vmlal.u16 q10, d4, d0[2]\n" 184 "vmlal.u16 q11, d4, d0[3]\n" 185 "vldr d4, [%[lhs_ptr], #8]\n" 186 "vmlal.u16 q12, d6, d0[0]\n" 187 "vmlal.u16 q13, d6, d0[1]\n" 188 "vmlal.u16 q14, d6, d0[2]\n" 189 "vmlal.u16 q15, d6, d0[3]\n" 190 "vldr d6, [%[lhs_ptr], #16]\n" 191 "vldr d0, [%[rhs_ptr]]\n" 192 193 // Multiply-accumulate, level of depth 1 194 "vmlal.u16 q4, d3, d1[0]\n" 195 "vmlal.u16 q5, d3, d1[1]\n" 196 "add %[lhs_ptr], #24\n" 197 "vmlal.u16 q6, d3, d1[2]\n" 198 "vmlal.u16 q7, d3, d1[3]\n" 199 "add %[rhs_ptr], #8\n" 200 "vmlal.u16 q8, d5, d1[0]\n" 201 "vmlal.u16 q9, d5, d1[1]\n" 202 "subs %[run_depth], #2\n" 203 "vmlal.u16 q10, d5, d1[2]\n" 204 "vmlal.u16 q11, d5, d1[3]\n" 205 "vmlal.u16 q12, d7, d1[0]\n" 206 "vmlal.u16 q13, d7, d1[1]\n" 207 "vmlal.u16 q14, d7, d1[2]\n" 208 "vmlal.u16 q15, d7, d1[3]\n" 209 210 "bne " GEMMLOWP_LABEL_LOOP "b\n" 211 212 GEMMLOWP_LABEL_AFTER_LOOP 213 ":\n" 214 215 // Do remaining arithmetic for the last 2 levels of depth. 216 217 // Expand Lhs/Rhs cells to 16 bit. 218 "vmovl.u8 q0, d0\n" 219 "vmovl.u8 q1, d2\n" 220 "vmovl.u8 q2, d4\n" 221 "vmovl.u8 q3, d6\n" 222 223 // Multiply-accumulate, level of depth 0 224 "vmlal.u16 q4, d2, d0[0]\n" 225 "vmlal.u16 q5, d2, d0[1]\n" 226 "vmlal.u16 q6, d2, d0[2]\n" 227 "vmlal.u16 q7, d2, d0[3]\n" 228 "vmlal.u16 q8, d4, d0[0]\n" 229 "vmlal.u16 q9, d4, d0[1]\n" 230 "vmlal.u16 q10, d4, d0[2]\n" 231 "vmlal.u16 q11, d4, d0[3]\n" 232 "vmlal.u16 q12, d6, d0[0]\n" 233 "vmlal.u16 q13, d6, d0[1]\n" 234 "vmlal.u16 q14, d6, d0[2]\n" 235 "vmlal.u16 q15, d6, d0[3]\n" 236 237 // Multiply-accumulate, level of depth 1 238 "vmlal.u16 q4, d3, d1[0]\n" 239 "vmlal.u16 q5, d3, d1[1]\n" 240 "vmlal.u16 q6, d3, d1[2]\n" 241 "vmlal.u16 q7, d3, d1[3]\n" 242 "vmlal.u16 q8, d5, d1[0]\n" 243 "vmlal.u16 q9, d5, d1[1]\n" 244 "vmlal.u16 q10, d5, d1[2]\n" 245 "vmlal.u16 q11, d5, d1[3]\n" 246 "vmlal.u16 q12, d7, d1[0]\n" 247 "vmlal.u16 q13, d7, d1[1]\n" 248 "vmlal.u16 q14, d7, d1[2]\n" 249 "vmlal.u16 q15, d7, d1[3]\n" 250 251 // Store accumulators 252 "mov r1, %[dst_ptr]\n" 253 "mov r0, r1\n" 254 "vst1.32 {d8, d9}, [r0]!\n" 255 "add r1, %[dst_col_stride]\n" 256 "vst1.32 {d16, d17}, [r0]!\n" 257 "vst1.32 {d24, d25}, [r0]\n" 258 "mov r0, r1\n" 259 "vst1.32 {d10, d11}, [r0]!\n" 260 "add r1, %[dst_col_stride]\n" 261 "vst1.32 {d18, d19}, [r0]!\n" 262 "vst1.32 {d26, d27}, [r0]\n" 263 "mov r0, r1\n" 264 "vst1.32 {d12, d13}, [r0]!\n" 265 "add r1, %[dst_col_stride]\n" 266 "vst1.32 {d20, d21}, [r0]!\n" 267 "vst1.32 {d28, d29}, [r0]\n" 268 "mov r0, r1\n" 269 "vst1.32 {d14, d15}, [r0]!\n" 270 "vst1.32 {d22, d23}, [r0]!\n" 271 "vst1.32 {d30, d31}, [r0]\n" 272 : // outputs 273 [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr), 274 [dst_ptr] "+r"(dst_ptr), 275 [run_depth] "+r"(run_depth) 276 : // inputs 277 [start_depth] "r"(start_depth), 278 [dst_col_stride] "r"(dst_col_stride) 279 : // clobbers 280 "cc", "memory", "r0", "r1", 281 // note: someone on internet says that quad registers are 282 // unsupported in the clobber list! 283 "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10", 284 "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20", 285 "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30", 286 "d31"); 287 #undef GEMMLOWP_LABEL_CLEAR_ACCUMULATORS 288 #undef GEMMLOWP_LABEL_BEFORE_LOOP 289 #undef GEMMLOWP_LABEL_LOOP 290 #undef GEMMLOWP_LABEL_AFTER_LOOP 291 } 292 }; 293 294 struct NEON_32_Kernel12x4Depth2Assuming12BitProducts : KernelBase { 295 typedef KernelFormat< 296 KernelSideFormat<CellFormat<4, 2, CellOrder::WidthMajor>, 3>, 297 KernelSideFormat<CellFormat<4, 2, CellOrder::WidthMajor>, 1> > 298 Format; 299 NameNEON_32_Kernel12x4Depth2Assuming12BitProducts300 const char* Name() const override { 301 return "NEON, 12x4, depth 2, assuming 12-bit products"; 302 } 303 304 // TODO(benoitjacob): reorder function arguments so dst comes last RunNEON_32_Kernel12x4Depth2Assuming12BitProducts305 void Run(std::int32_t* dst_ptr, std::size_t dst_row_stride, 306 std::size_t dst_col_stride, const std::uint8_t* lhs_ptr, 307 const std::uint8_t* rhs_ptr, std::size_t start_depth, 308 std::size_t run_depth) const override { 309 ScopedProfilingLabel label( 310 "optimized kernel (NEON 12x4, assuming 12-bit products)"); 311 assert(dst_row_stride == 1); 312 (void)dst_row_stride; 313 314 // See comments above for why we need local numerical labels in our asm. 315 #define GEMMLOWP_LOOP_NEON_32_KERNEL_12X4_DEPTH2_ASSUMING_12BIT_PRODUCTS "1" 316 #define GEMMLOWP_LOAD_GLOBAL_ACCUMULATORS_NEON_32_KERNEL_12X4_DEPTH2_12BIT "2" 317 #define GEMMLOWP_LABEL_32 "3" 318 #define GEMMLOWP_LABEL_24 "4" 319 #define GEMMLOWP_LABEL_16 "5" 320 #define GEMMLOWP_LABEL_8 "6" 321 #define GEMMLOWP_LABEL_2 "7" 322 323 // This kernel is special in that it uses local 16-bit accumulators. 324 // Because it assumes that each product fits in 12 bits, it can accumulate 325 // 16 products into a local 16-bit accumulator without risking overflow. 326 // At that point, it must accumulate these local 16-bit accumulators back 327 // into global 32-bit accumulators, which have to be stored in memory for 328 // lack of register space. 329 // This 12x4 block of global accumulators is laid out as 3 cells of size 4x4 330 // stored in diagonal-major order like this for the first 4x4 cell: 331 // 332 // 0 4 8 12 333 // 13 1 5 9 334 // 10 14 2 6 335 // 7 11 15 3 336 // 337 // and likewise for the 2nd cell (16--31) and 3rd cell (32--47) 338 std::int32_t global_accumulators[3 * 4 * 4]; 339 asm volatile( 340 // Compute stride between consecutive columns, in bytes 341 "mov r0, #4\n" // multiply by 4 = sizeof(int32) 342 "mul %[dst_col_stride], r0\n" 343 344 "cmp %[start_depth], #0\n" 345 "bne" 346 " " GEMMLOWP_LOAD_GLOBAL_ACCUMULATORS_NEON_32_KERNEL_12X4_DEPTH2_12BIT 347 "f\n" 348 349 // If start_depth==0, we need to clear our global accumulators 350 "mov r0, %[global_accumulators]\n" 351 "vmov.s32 q8, #0\n" 352 "vmov.s32 q9, q8\n" 353 "vst1.32 {d16,d17,d18,d19}, [r0]!\n" 354 "vst1.32 {d16,d17,d18,d19}, [r0]!\n" 355 "vst1.32 {d16,d17,d18,d19}, [r0]!\n" 356 "vst1.32 {d16,d17,d18,d19}, [r0]!\n" 357 "vst1.32 {d16,d17,d18,d19}, [r0]!\n" 358 "vst1.32 {d16,d17,d18,d19}, [r0]!\n" 359 "b " GEMMLOWP_LOOP_NEON_32_KERNEL_12X4_DEPTH2_ASSUMING_12BIT_PRODUCTS 360 "f\n" 361 362 // If start_depth!=0, we need to load our existing global accumulators 363 GEMMLOWP_LOAD_GLOBAL_ACCUMULATORS_NEON_32_KERNEL_12X4_DEPTH2_12BIT 364 ":\n" 365 // Load global accumulators from destination matrix, column-major 366 "mov r1, %[dst_ptr]\n" 367 "mov r0, %[dst_col_stride]\n" 368 "sub r0, #32\n" 369 "vld1.32 {d0,d1}, [r1]!\n" 370 "vld1.32 {d8,d9}, [r1]!\n" 371 "vld1.32 {d16,d17}, [r1], r0\n" 372 "vld1.32 {d2,d3}, [r1]!\n" 373 "vld1.32 {d10,d11}, [r1]!\n" 374 "vld1.32 {d18,d19}, [r1], r0\n" 375 "vld1.32 {d4,d5}, [r1]!\n" 376 "vld1.32 {d12,d13}, [r1]!\n" 377 "vld1.32 {d20,d21}, [r1], r0\n" 378 "vld1.32 {d6,d7}, [r1]!\n" 379 "vld1.32 {d14,d15}, [r1]!\n" 380 "vld1.32 {d22,d23}, [r1], r0\n" 381 // Now we need to convert the global accumulator registers to 382 // 4x4-block-wise diagonal-major order. What we effectively want to do 383 // is to rotate the rows, however the accumulators are stored in 384 // column-major order in registers. So we achieve this by 385 // transposing, rotating the registers, and transposing again each 386 // 4x4 block. 387 // 388 // Transpose 3 4x4 blocks separately 389 "vtrn.32 q0, q1\n" 390 "vtrn.32 q2, q3\n" 391 "vswp d1, d4\n" 392 "vswp d3, d6\n" 393 "vtrn.32 q4, q5\n" 394 "vtrn.32 q6, q7\n" 395 "vswp d9, d12\n" 396 "vswp d11, d14\n" 397 "vtrn.32 q8, q9\n" 398 "vtrn.32 q10, q11\n" 399 "vswp d17, d20\n" 400 "vswp d19, d22\n" 401 // Rotate the registers 402 "vext.32 q1, q1, q1, #1\n" 403 "vext.32 q2, q2, q2, #2\n" 404 "vext.32 q3, q3, q3, #3\n" 405 "vext.32 q5, q5, q5, #1\n" 406 "vext.32 q6, q6, q6, #2\n" 407 "vext.32 q7, q7, q7, #3\n" 408 "vext.32 q9, q9, q9, #1\n" 409 "vext.32 q10, q10, q10, #2\n" 410 "vext.32 q11, q11, q11, #3\n" 411 // Transpose again and store into our global accumulators 412 // buffer. These two operations are done at once using vst4. 413 "mov r0, %[global_accumulators]\n" 414 "vst4.32 {d0,d2,d4,d6}, [r0]!\n" 415 "vst4.32 {d1,d3,d5,d7}, [r0]!\n" 416 "vst4.32 {d8,d10,d12,d14}, [r0]!\n" 417 "vst4.32 {d9,d11,d13,d15}, [r0]!\n" 418 "vst4.32 {d16,d18,d20,d22}, [r0]!\n" 419 "vst4.32 {d17,d19,d21,d23}, [r0]!\n" 420 421 /* Main loop */ 422 423 GEMMLOWP_LOOP_NEON_32_KERNEL_12X4_DEPTH2_ASSUMING_12BIT_PRODUCTS 424 ":\n" 425 426 // Overview of register layout: 427 // 428 // Registers q4--q16 are the local 16-bit accumulators. 429 // However, each entry in the result matrix is represented 430 // by *two* local 16-bit accumulators: one for even levels 431 // of depth and one for odd levels of depth. These correspond 432 // to the scalars at even and odd indices within each q-register. 433 // Thus we effectively use 32 bits of register space for each 434 // entry in the result matrix. The accumulators register layout 435 // is the same as was described above for the global 32-bit 436 // accumulators (3 cells of size 4x4 in diagonal-major order) 437 // with the only difference that instead of 32bit values we have 438 // pairs of 16bit values. 439 // 440 // A 2x4 cell of Rhs is stored in 8bit in d0. 441 // A 12x2 block of 3 4x2 cells Lhs is stored in 8bit in d1--d3. 442 // 443 // +--------+--------+--------+--------+ 444 // |d0[0] |d0[2] |d0[4] |d0[6] | 445 // Rhs +--------+--------+--------+--------+ 446 // |d0[1] |d0[3] |d0[5] |d0[7] | 447 // +--------+--------+--------+--------+ 448 // 449 // | | | | | 450 // 451 // Lhs | | | | | 452 // 453 // +-----+-----+ - - - +--------+--------+--------+--------+ 454 // |d1[0]|d1[1]| |q4[0,1] |q5[0,1] |q6[0,1] |q7[0,1] | 455 // |d1[2]|d1[3]| |q7[2,3] |q4[2,3] |q5[2,3] |q6[2,3] | 456 // |d1[4]|d1[5]| |q6[4,5] |q7[4,5] |q4[4,5] |q5[4,5] | 457 // |d1[6]|d1[7]| |q5[6,7] |q6[6,7] |q7[6,7] |q4[6,7] | 458 // +-----+-----+ - - - +--------+--------+--------+--------+ 459 // |d2[0]|d2[1]| |q8[0,1] |q8[0,1] |q8[0,1] |q8[0,1] | 460 // |d2[2]|d2[3]| |q9[2,3] |q9[2,3] |q9[2,3] |q9[2,3] | 461 // |d2[4]|d2[5]| |q10[4,5]|q10[4,5]|q10[4,5]|q10[4,5]| 462 // |d2[6]|d2[7]| |q11[6,7]|q11[6,7]|q11[6,7]|q11[6,7]| 463 // +-----+-----+ - - - +--------+--------+--------+--------+ 464 // |d3[0]|d3[1]| |q12[0,1]|q12[0,1]|q12[0,1]|q12[0,1]| 465 // |d3[2]|d3[3]| |q13[2,3]|q13[2,3]|q13[2,3]|q13[2,3]| 466 // |d3[4]|d3[5]| |q14[4,5]|q14[4,5]|q14[4,5]|q14[4,5]| 467 // |d3[6]|d3[7]| |q15[6,7]|q15[6,7]|q15[6,7]|q15[6,7]| 468 // +-----+-----+ - - - +--------+--------+--------+--------+ 469 // 470 // Local 16-bit accumulators 471 // Note: 2 scalars per matrix entry 472 473 #define GEMMLOWP_ACCUMULATE_2_LEVELS_OF_DEPTH \ 474 /* Load 3 Lhs cells of size 4x2 */ \ 475 "vld1.8 {d1,d2,d3}, [%[lhs_ptr]:64]!\n" \ 476 \ 477 /* Load 1 Rhs cell of size 2x4 */ \ 478 "vld1.8 {d0}, [%[rhs_ptr]:64]!\n" \ 479 \ 480 /* Multiply-accumulate */ \ 481 "vmlal.u8 q4, d1, d0\n" \ 482 "vmlal.u8 q8, d2, d0\n" \ 483 "vmlal.u8 q12, d3, d0\n" \ 484 "vext.8 d0, d0, d0, #2\n" \ 485 "vmlal.u8 q5, d1, d0\n" \ 486 "vmlal.u8 q9, d2, d0\n" \ 487 "vmlal.u8 q13, d3, d0\n" \ 488 "vext.8 d0, d0, d0, #2\n" \ 489 "vmlal.u8 q6, d1, d0\n" \ 490 "vmlal.u8 q10, d2, d0\n" \ 491 "vmlal.u8 q14, d3, d0\n" \ 492 "vext.8 d0, d0, d0, #2\n" \ 493 "vmlal.u8 q7, d1, d0\n" \ 494 "vmlal.u8 q11, d2, d0\n" \ 495 "vmlal.u8 q15, d3, d0\n" \ 496 \ 497 "sub %[run_depth], #2\n" 498 499 #define GEMMLOWP_ACCUMULATE_8_LEVELS_OF_DEPTH \ 500 GEMMLOWP_ACCUMULATE_2_LEVELS_OF_DEPTH \ 501 GEMMLOWP_ACCUMULATE_2_LEVELS_OF_DEPTH \ 502 GEMMLOWP_ACCUMULATE_2_LEVELS_OF_DEPTH \ 503 GEMMLOWP_ACCUMULATE_2_LEVELS_OF_DEPTH 504 505 // Clear local 16-bit accumulators 506 "vmov.s32 q4, #0\n" 507 "vmov.s32 q5, q4\n" 508 "vmov.s32 q6, q4\n" 509 "vmov.s32 q7, q4\n" 510 "vmov.s32 q8, q4\n" 511 "vmov.s32 q9, q4\n" 512 "vmov.s32 q10, q4\n" 513 "vmov.s32 q11, q4\n" 514 "vmov.s32 q12, q4\n" 515 "vmov.s32 q13, q4\n" 516 "vmov.s32 q14, q4\n" 517 "vmov.s32 q15, q4\n" 518 519 // Select a suitable number of depth levels 520 // to process at this iteration. TODO (benoitjacob) I guess that 521 // someone who really knows asm should make this a jump table. 522 "cmp %[run_depth], #32\n" 523 "bge " GEMMLOWP_LABEL_32 524 "f\n" 525 "cmp %[run_depth], #24\n" 526 "bge " GEMMLOWP_LABEL_24 527 "f\n" 528 "cmp %[run_depth], #16\n" 529 "bge " GEMMLOWP_LABEL_16 530 "f\n" 531 "cmp %[run_depth], #8\n" 532 "bge " GEMMLOWP_LABEL_8 533 "f\n" 534 "b " GEMMLOWP_LABEL_2 "f\n" 535 536 GEMMLOWP_LABEL_32 537 ":\n" GEMMLOWP_ACCUMULATE_8_LEVELS_OF_DEPTH GEMMLOWP_LABEL_24 538 ":\n" GEMMLOWP_ACCUMULATE_8_LEVELS_OF_DEPTH GEMMLOWP_LABEL_16 539 ":\n" GEMMLOWP_ACCUMULATE_8_LEVELS_OF_DEPTH GEMMLOWP_LABEL_8 540 ":\n" GEMMLOWP_ACCUMULATE_2_LEVELS_OF_DEPTH 541 GEMMLOWP_ACCUMULATE_2_LEVELS_OF_DEPTH 542 GEMMLOWP_ACCUMULATE_2_LEVELS_OF_DEPTH GEMMLOWP_LABEL_2 543 ":\n" GEMMLOWP_ACCUMULATE_2_LEVELS_OF_DEPTH 544 545 // Accumulate the local accumulators into the global accumulators. 546 // This is about summing adjacent pairs of 16-bit scalars into 547 // single 32-bit scalars, so we use pairwise long addition (vpadal). 548 "mov r0, %[global_accumulators]\n" 549 "mov r1, %[global_accumulators]\n" 550 "vld1.32 {d0,d1,d2,d3}, [r0]!\n" 551 "vld1.32 {d4,d5,d6,d7}, [r0]!\n" 552 "vpadal.u16 q0, q4\n" 553 "vpadal.u16 q1, q5\n" 554 "vpadal.u16 q2, q6\n" 555 "vpadal.u16 q3, q7\n" 556 "vst1.32 {d0,d1,d2,d3}, [r1]!\n" 557 "vst1.32 {d4,d5,d6,d7}, [r1]!\n" 558 "vld1.32 {d0,d1,d2,d3}, [r0]!\n" 559 "vld1.32 {d4,d5,d6,d7}, [r0]!\n" 560 "vpadal.u16 q0, q8\n" 561 "vpadal.u16 q1, q9\n" 562 "vpadal.u16 q2, q10\n" 563 "vpadal.u16 q3, q11\n" 564 "vst1.32 {d0,d1,d2,d3}, [r1]!\n" 565 "vst1.32 {d4,d5,d6,d7}, [r1]!\n" 566 "vld1.32 {d0,d1,d2,d3}, [r0]!\n" 567 "vld1.32 {d4,d5,d6,d7}, [r0]!\n" 568 "vpadal.u16 q0, q12\n" 569 "vpadal.u16 q1, q13\n" 570 "vpadal.u16 q2, q14\n" 571 "vpadal.u16 q3, q15\n" 572 "vst1.32 {d0,d1,d2,d3}, [r1]!\n" 573 "vst1.32 {d4,d5,d6,d7}, [r1]!\n" 574 575 // Loop. 576 "cmp %[run_depth], #0\n" 577 "bne " GEMMLOWP_LOOP_NEON_32_KERNEL_12X4_DEPTH2_ASSUMING_12BIT_PRODUCTS 578 "b\n" 579 580 #undef GEMMLOWP_CLEAR_LOCAL_ACCUMULATORS 581 #undef GEMMLOWP_ACCUMULATE_8_LEVELS_OF_DEPTH 582 #undef GEMMLOWP_ACCUMULATE_2_LEVELS_OF_DEPTH 583 #undef GEMMLOWP_ADD_TO_GLOBAL_ACCUMULATORS 584 585 /* end of main loop */ 586 587 // Store the global accumulators to the destination matrix 588 // (column-major) 589 // This is the reverse of the steps that we followed at the beginning 590 // when we load the global accumulators from the destination matrix. 591 // The problem is the same: how to convert 4x4 blocks 592 // between column-major and diagonal-major orders. 593 // Like above, we do this by rotating rows, and we achieve that by 594 // tranposing, rotating columns, and transposing again. 595 // 596 // Load and transpose 4x4 blocks of global accumulators 597 // These two steps are done at once by the vld4 instruction. 598 "mov r0, %[global_accumulators]\n" 599 "vld4.32 {d0,d2,d4,d6}, [r0]!\n" 600 "vld4.32 {d1,d3,d5,d7}, [r0]!\n" 601 "vld4.32 {d8,d10,d12,d14}, [r0]!\n" 602 "vld4.32 {d9,d11,d13,d15}, [r0]!\n" 603 "vld4.32 {d16,d18,d20,d22}, [r0]!\n" 604 "vld4.32 {d17,d19,d21,d23}, [r0]!\n" 605 // Rotate the rows of each 4x4 block 606 "vext.32 q1, q1, q1, #3\n" 607 "vext.32 q2, q2, q2, #2\n" 608 "vext.32 q3, q3, q3, #1\n" 609 "vext.32 q5, q5, q5, #3\n" 610 "vext.32 q6, q6, q6, #2\n" 611 "vext.32 q7, q7, q7, #1\n" 612 "vext.32 q9, q9, q9, #3\n" 613 "vext.32 q10, q10, q10, #2\n" 614 "vext.32 q11, q11, q11, #1\n" 615 // Transpose again each 4x4 block 616 "vtrn.32 q0, q1\n" 617 "vtrn.32 q2, q3\n" 618 "vswp d1, d4\n" 619 "vswp d3, d6\n" 620 "vtrn.32 q4, q5\n" 621 "vtrn.32 q6, q7\n" 622 "vswp d9, d12\n" 623 "vswp d11, d14\n" 624 "vtrn.32 q8, q9\n" 625 "vtrn.32 q10, q11\n" 626 "vswp d17, d20\n" 627 "vswp d19, d22\n" 628 // Store into the column-major destination matrix 629 "mov r1, %[dst_ptr]\n" 630 "mov r0, %[dst_col_stride]\n" 631 "sub r0, #32\n" 632 "vst1.32 {d0,d1}, [r1]!\n" 633 "vst1.32 {d8,d9}, [r1]!\n" 634 "vst1.32 {d16,d17}, [r1], r0\n" 635 "vst1.32 {d2,d3}, [r1]!\n" 636 "vst1.32 {d10,d11}, [r1]!\n" 637 "vst1.32 {d18,d19}, [r1], r0\n" 638 "vst1.32 {d4,d5}, [r1]!\n" 639 "vst1.32 {d12,d13}, [r1]!\n" 640 "vst1.32 {d20,d21}, [r1], r0\n" 641 "vst1.32 {d6,d7}, [r1]!\n" 642 "vst1.32 {d14,d15}, [r1]!\n" 643 "vst1.32 {d22,d23}, [r1], r0\n" 644 : // outputs 645 [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr), 646 [dst_ptr] "+r"(dst_ptr), 647 [run_depth] "+r"(run_depth) 648 : // inputs 649 [start_depth] "r"(start_depth), [dst_col_stride] "r"(dst_col_stride), 650 [global_accumulators] "r"(&global_accumulators[0]) 651 : // clobbers 652 "cc", "memory", "r0", "r1", 653 // note: someone on internet says that quad registers are 654 // unsupported in the clobber list! 655 "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10", 656 "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20", 657 "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30", 658 "d31"); 659 #undef GEMMLOWP_LOOP_NEON_32_KERNEL_12X4_DEPTH2_ASSUMING_12BIT_PRODUCTS 660 #undef GEMMLOWP_LOAD_GLOBAL_ACCUMULATORS_NEON_32_KERNEL_12X4_DEPTH2_12BIT 661 #undef GEMMLOWP_LABEL_32 662 #undef GEMMLOWP_LABEL_24 663 #undef GEMMLOWP_LABEL_16 664 #undef GEMMLOWP_LABEL_8 665 #undef GEMMLOWP_LABEL_2 666 } 667 }; 668 669 struct NEON_32bit_GEMM_Int8Operands_LhsNonzero : KernelBase { 670 typedef KernelFormat< 671 KernelSideFormatInt8<CellFormat<4, 16, CellOrder::WidthMajor>, 1>, 672 KernelSideFormatInt8<CellFormat<2, 16, CellOrder::WidthMajor>, 1> > 673 Format; NameNEON_32bit_GEMM_Int8Operands_LhsNonzero674 const char* Name() const override { 675 return "NEON, 4x2, depth 16, accumulating two within signed int16"; 676 } 677 678 // TODO(benoitjacob): reorder function arguments so dst comes last RunNEON_32bit_GEMM_Int8Operands_LhsNonzero679 void Run(std::int32_t* dst_ptr, std::size_t dst_row_stride, 680 std::size_t dst_col_stride, const std::uint8_t* lhs_ptr, 681 const std::uint8_t* rhs_ptr, std::size_t start_depth, 682 std::size_t run_depth) const override { 683 (void)dst_row_stride; 684 #define GEMMLOWP_LABEL_AFTER_LOOP "1" 685 #define GEMMLOWP_LABEL_LOOP "2" 686 #define GEMMLOWP_LABEL_ACCUMULATE_EXISTING_DST_VALUES "3" 687 #define GEMMLOWP_LABEL_STORE "4" 688 asm volatile( 689 // Multiply dst_col_stride by 4 == sizeof(int32) to use 690 // it as a byte offset below. 691 "lsl %[dst_col_stride], %[dst_col_stride], #2\n" 692 693 // Overview of register layout: 694 // 695 // A 2x16 block of Rhs is stored in 8 bit in d0--d3. 696 // A 4x16 block of Lhs is stored in 8 bit in d4--d7. That is only 697 // half of the register space required, so we loop over these registers 698 // twice. Only half of it, a 2x16 block, is stored in d4--d7 at 699 // any given time. 700 // 701 // A 4x2 block of accumulators is stored in q8--q15 (as 4x32 bit 702 // components which need to be horizontally-added at the end) 703 // 704 // The Lhs vectors are multiplied by the Rhs vectors with a widening 705 // multiply over the 8 first levels of depth, producing int16x8 706 // vectors of products for each position in the accumulator matrix. 707 // Here comes the special trick: since the operands are signed int8, 708 // their range being [ -2^7 , 2^7 ), their products are in range 709 // [ -2^14 , 2^14 - 1 ), meaning that we can add two such values 710 // without any risk of overflowing int16. 711 // We thus proceed with the 8 next levels of depth, multiplying 712 // again Lhs by Rhs, accumulating into this existing int16x8 vector. 713 // 714 // Only then, having processed 16 levels of depth, do we need to 715 // horizontally add these int16x8 accumulators into the final 716 // int32x4 accumulators. 717 // 718 // As we do not have enough registers to store all 16 int16x8 719 // temporary-16bit-accumulators, we have them cycle through q4--q7. 720 // 721 // 722 // Register layout (ignoring the q4--q7 temporary 16bit accumulators): 723 // 724 // +----+----+ 725 // | d0 | d2 | 726 // | . | . | 727 // | . | . | 728 // | . | . | 729 // Rhs +----+----+ 730 // | d1 | d3 | 731 // | . | . | 732 // | . | . | 733 // | . | . | 734 // +----+----+ 735 // 736 // | | | 737 // 738 // Lhs | | | 739 // 740 // +--------+--------+ - - - - +----+----+ 741 // | d4 ... | d5 ... | | q8 | q9 | 742 // | d6 ... | d7 ... | | q10| q11| 743 // | d4 ... | d5 ... | | q12| q13| 744 // | d6 ... | d7 ... | | q14| q15| 745 // +--------+--------+ - - - - +----+----+ 746 // 747 // Accumulator 748 // 749 750 // Clear accumulators, and, interleaved with it, 751 // initial loads of the first loop iteration, 752 // taken out of the loop so that in the loop itself we have 753 // optimal streaming of data from memory. 754 "vldr d0, [%[rhs_ptr], #0]\n" 755 "vmov.i32 q8, #0\n" 756 "vldr d4, [%[lhs_ptr], #0]\n" 757 "vmov.i32 q9, #0\n" 758 "vldr d2, [%[rhs_ptr], #16]\n" 759 "vmov.i32 q10, q8\n" 760 "vldr d6, [%[lhs_ptr], #16]\n" 761 "vmov.i32 q11, q8\n" 762 "vldr d1, [%[rhs_ptr], #8]\n" 763 "vmov.i32 q12, q8\n" 764 "vldr d5, [%[lhs_ptr], #8]\n" 765 "vmov.i32 q13, q8\n" 766 "vldr d3, [%[rhs_ptr], #24]\n" 767 "vmov.i32 q14, q8\n" 768 "vldr d7, [%[lhs_ptr], #24]\n" 769 "vmov.i32 q15, q8\n" 770 771 // General loop. 772 GEMMLOWP_LABEL_LOOP 773 ":\n" 774 775 // Multiply 8 first levels of depth. 776 "vmull.s8 q4, d0, d4\n" 777 "add %[rhs_ptr], %[rhs_ptr], #32\n" 778 "vmull.s8 q5, d2, d4\n" 779 "vldr d4, [%[lhs_ptr], #32]\n" 780 "vmull.s8 q6, d0, d6\n" 781 "vmull.s8 q7, d2, d6\n" 782 "vldr d6, [%[lhs_ptr], #48]\n" 783 784 // Multiply-accumulate second-half, again into the same 785 // 16bit local accumulator registers. This is where we 786 // take advantage of having int8 instead of uint8 and therefore 787 // being able to accumulate two products into int16. 788 "vmlal.s8 q4, d1, d5\n" 789 "vmlal.s8 q5, d3, d5\n" 790 "vldr d5, [%[lhs_ptr], #40]\n" 791 "vmlal.s8 q6, d1, d7\n" 792 "vmlal.s8 q7, d3, d7\n" 793 "vldr d7, [%[lhs_ptr], #56]\n" 794 795 // Add pairwise, accumulate into 32-bit accumulators. 796 "vpadal.s16 q8, q4\n" 797 "add %[lhs_ptr], %[lhs_ptr], #64\n" 798 "vpadal.s16 q9, q5\n" 799 "subs %[run_depth], %[run_depth], #16\n" 800 "vpadal.s16 q10, q6\n" 801 "vpadal.s16 q11, q7\n" 802 803 "beq " GEMMLOWP_LABEL_AFTER_LOOP 804 "f\n" 805 806 // Multiply first half. 807 "vmull.s8 q4, d0, d4\n" 808 "vmull.s8 q5, d2, d4\n" 809 "vldr d4, [%[lhs_ptr], #0]\n" 810 "vmull.s8 q6, d0, d6\n" 811 "vldr d0, [%[rhs_ptr], #0]\n" 812 "vmull.s8 q7, d2, d6\n" 813 "vldr d2, [%[rhs_ptr], #16]\n" 814 815 // Multiply-accumulate second-half, again into the same 816 // 16bit local accumulator registers. This is where we 817 // take advantage of having int8 instead of uint8 and therefore 818 // being able to accumulate two products into int16. 819 "vmlal.s8 q4, d1, d5\n" 820 "vldr d6, [%[lhs_ptr], #16]\n" 821 "vmlal.s8 q5, d3, d5\n" 822 "vldr d5, [%[lhs_ptr], #8]\n" 823 "vmlal.s8 q6, d1, d7\n" 824 "vldr d1, [%[rhs_ptr], #8]\n" 825 "vmlal.s8 q7, d3, d7\n" 826 "vldr d3, [%[rhs_ptr], #24]\n" 827 828 // Add pairwise, accumulate into 32-bit accumulators. 829 "vpadal.s16 q12, q4\n" 830 "vldr d7, [%[lhs_ptr], #24]\n" 831 "vpadal.s16 q13, q5\n" 832 "vpadal.s16 q14, q6\n" 833 "vpadal.s16 q15, q7\n" 834 835 "b " GEMMLOWP_LABEL_LOOP "b\n" 836 837 GEMMLOWP_LABEL_AFTER_LOOP 838 ":\n" 839 840 // Multiply first half. 841 "vmull.s8 q4, d0, d4\n" 842 "vmull.s8 q5, d2, d4\n" 843 "vmull.s8 q6, d0, d6\n" 844 "vmull.s8 q7, d2, d6\n" 845 846 // Multiply-accumulate second-half, again into the same 847 // 16bit local accumulator registers. This is where we 848 // take advantage of having int8 instead of uint8 and therefore 849 // being able to accumulate two products into int16. 850 "vmlal.s8 q4, d1, d5\n" 851 "vmlal.s8 q5, d3, d5\n" 852 "vmlal.s8 q6, d1, d7\n" 853 "vmlal.s8 q7, d3, d7\n" 854 855 // Add pairwise, accumulate into 32-bit accumulators. 856 "vpadal.s16 q12, q4\n" 857 "vpadal.s16 q13, q5\n" 858 "vpadal.s16 q14, q6\n" 859 "vpadal.s16 q15, q7\n" 860 "cmp %[start_depth], #0\n" 861 862 // Reduce 32bit accumulators horizontally. 863 "vpadd.s32 d0, d16, d17\n" 864 "vpadd.s32 d1, d18, d19\n" 865 "vpadd.s32 d2, d20, d21\n" 866 "vpadd.s32 d3, d22, d23\n" 867 "vpadd.s32 d4, d24, d25\n" 868 "vpadd.s32 d5, d26, d27\n" 869 "vpadd.s32 d6, d28, d29\n" 870 "vpadd.s32 d7, d30, d31\n" 871 872 "bne " GEMMLOWP_LABEL_ACCUMULATE_EXISTING_DST_VALUES 873 "f\n" 874 875 // Reduce 32bit accumulators horizontally, second pass 876 // (each pass adds pairwise. we need to add 4-wise). 877 "vpadd.s32 d8, d0, d2\n" 878 "vpadd.s32 d9, d4, d6\n" 879 "vpadd.s32 d10, d1, d3\n" 880 "vpadd.s32 d11, d5, d7\n" 881 882 "b " GEMMLOWP_LABEL_STORE "f\n" 883 884 GEMMLOWP_LABEL_ACCUMULATE_EXISTING_DST_VALUES 885 ":\n" 886 887 // Reduce 32bit accumulators horizontally, second pass 888 // (each pass adds pairwise. we need to add 4-wise), 889 // and load destination values from memory. 890 "mov r0, %[dst_ptr]\n" 891 "vld1.32 {d16, d17}, [r0], %[dst_col_stride]\n" 892 "vpadd.s32 d8, d0, d2\n" 893 "vpadd.s32 d9, d4, d6\n" 894 "vld1.32 {d18, d19}, [r0]\n" 895 "vpadd.s32 d10, d1, d3\n" 896 "vpadd.s32 d11, d5, d7\n" 897 898 // Add horizontally-reduced accumulators into 899 // the values loaded from memory 900 "vadd.s32 q4, q8, q4\n" 901 "vadd.s32 q5, q9, q5\n" 902 903 GEMMLOWP_LABEL_STORE 904 ":\n" 905 // Store back into memory 906 "mov r0, %[dst_ptr]\n" 907 "vst1.32 {d8, d9}, [r0], %[dst_col_stride]\n" 908 "vst1.32 {d10, d11}, [r0]\n" 909 : // outputs 910 [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr), 911 [dst_ptr] "+r"(dst_ptr), [run_depth] "+r"(run_depth) 912 : // inputs 913 [start_depth] "r"(start_depth), 914 [dst_col_stride] "r"(dst_col_stride) 915 : // clobbers 916 "cc", "memory", "r0", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", 917 "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17", 918 "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", 919 "d28", "d29", "d30", "d31"); 920 #undef GEMMLOWP_LABEL_LOOP 921 #undef GEMMLOWP_LABEL_AFTER_LOOP 922 #undef GEMMLOWP_LABEL_ACCUMULATE_EXISTING_DST_VALUES 923 #undef GEMMLOWP_LABEL_STORE 924 } 925 }; 926 927 // Same as NEON_32bit_GEMM_Int8Operands_LhsNonzero, but uses a side format that 928 // requires that user inputs were originally int8. This avoids the uint8->int8 929 // conversion in the pack step. 930 struct NEON_32bit_GEMM_Int8Operands_LhsNonzero_Int8Inputs 931 : NEON_32bit_GEMM_Int8Operands_LhsNonzero { 932 typedef KernelFormat< 933 KernelSideFormatInt8Inputs<CellFormat<4, 16, CellOrder::WidthMajor>, 1>, 934 KernelSideFormatInt8Inputs<CellFormat<2, 16, CellOrder::WidthMajor>, 1> > 935 Format; 936 }; 937 938 #endif // GEMMLOWP_NEON_32 939 940 // The kernels here are specifically arm 64bit assembly, not arm 32bit. 941 #ifdef GEMMLOWP_NEON_64 942 943 struct NEON_64bit_GEMM_Int8Operands_LhsNonzero : KernelBase { 944 typedef KernelFormat< 945 KernelSideFormatInt8<CellFormat<4, 16, CellOrder::WidthMajor>, 1>, 946 KernelSideFormatInt8<CellFormat<4, 16, CellOrder::WidthMajor>, 1> > 947 Format; NameNEON_64bit_GEMM_Int8Operands_LhsNonzero948 const char* Name() const override { 949 return "NEON, 4x4, depth 16, accumulating two within signed int16"; 950 } 951 952 // TODO(benoitjacob): reorder function arguments so dst comes last RunNEON_64bit_GEMM_Int8Operands_LhsNonzero953 void Run(std::int32_t* dst_ptr, std::size_t dst_row_stride, 954 std::size_t dst_col_stride, const std::uint8_t* lhs_ptr, 955 const std::uint8_t* rhs_ptr, std::size_t start_depth, 956 std::size_t run_depth) const override { 957 (void)dst_row_stride; 958 #define GEMMLOWP_LABEL_AFTER_LOOP_LAST16 "1" 959 #define GEMMLOWP_LABEL_LOOP "2" 960 #define GEMMLOWP_LABEL_ACCUMULATE_EXISTING_DST_VALUES "3" 961 #define GEMMLOWP_LABEL_STORE "4" 962 asm volatile( 963 // Clear accumulators, and, interleaved with it, 964 // initial loads of the first loop iteration, 965 // taken out of the loop so that in the loop itself we have 966 // optimal streaming of data from memory. 967 "ld1 {v0.16b}, [%[rhs_ptr]], #16\n" 968 "dup v16.4s, wzr\n" 969 "ld1 {v4.16b}, [%[lhs_ptr]], #16\n" 970 "dup v17.4s, wzr\n" 971 "ld1 {v1.16b}, [%[rhs_ptr]], #16\n" 972 "dup v18.4s, wzr\n" 973 "ld1 {v5.16b}, [%[lhs_ptr]], #16\n" 974 "dup v19.4s, wzr\n" 975 "ld1 {v2.16b}, [%[rhs_ptr]], #16\n" 976 "dup v20.4s, wzr\n" 977 "ld1 {v3.16b}, [%[rhs_ptr]], #16\n" 978 "dup v21.4s, wzr\n" 979 "ld1 {v6.16b}, [%[lhs_ptr]], #16\n" 980 "dup v22.4s, wzr\n" 981 "ld1 {v7.16b}, [%[lhs_ptr]], #16\n" 982 "dup v23.4s, wzr\n" 983 "dup v24.4s, wzr\n" 984 "dup v25.4s, wzr\n" 985 "dup v26.4s, wzr\n" 986 "dup v27.4s, wzr\n" 987 "dup v28.4s, wzr\n" 988 "dup v29.4s, wzr\n" 989 "dup v30.4s, wzr\n" 990 "dup v31.4s, wzr\n" 991 992 // Multiply dst_col_stride by 4 == sizeof(int32) to use 993 // it as a byte offset below. 994 "lsl %[dst_col_stride], %[dst_col_stride], #2\n" 995 996 // Initial arithmetic of the first loop iteration, 997 // taken out of the loop so that in the loop itself we have 998 // optimal streaming of data from memory. 999 "smull v8.8h, v0.8b, v4.8b\n" 1000 "smull v9.8h, v1.8b, v4.8b\n" 1001 "smull v10.8h, v2.8b, v4.8b\n" 1002 "smull v11.8h, v3.8b, v4.8b\n" 1003 "smull v12.8h, v0.8b, v5.8b\n" 1004 "smull v13.8h, v1.8b, v5.8b\n" 1005 "smull v14.8h, v2.8b, v5.8b\n" 1006 "smull v15.8h, v3.8b, v5.8b\n" 1007 1008 // Multiply-accumulate second-half, again into the same 1009 // 16bit local accumulator registers. This is where we 1010 // take advantage of having int8 instead of uint8 and therefore 1011 // being able to accumulate two products into int16. 1012 "smlal2 v8.8h, v0.16b, v4.16b\n" 1013 "smlal2 v9.8h, v1.16b, v4.16b\n" 1014 "smlal2 v10.8h, v2.16b, v4.16b\n" 1015 "smlal2 v11.8h, v3.16b, v4.16b\n" 1016 "smlal2 v12.8h, v0.16b, v5.16b\n" 1017 "smlal2 v13.8h, v1.16b, v5.16b\n" 1018 "smlal2 v14.8h, v2.16b, v5.16b\n" 1019 "smlal2 v15.8h, v3.16b, v5.16b\n" 1020 1021 "subs %[run_depth], %[run_depth], #16\n" 1022 1023 // If the loop depth is only 16, then we can skip the general loop 1024 // and go straight to the final part of the code. 1025 "beq " GEMMLOWP_LABEL_AFTER_LOOP_LAST16 "f\n" 1026 1027 // General loop. 1028 GEMMLOWP_LABEL_LOOP 1029 ":\n" 1030 1031 // Overview of register layout: 1032 // 1033 // A 4x16 block of Rhs is stored in 8 bit in v0--v3. 1034 // A 4x16 block of Lhs is stored in 8 bit in v4--v7. 1035 // 1036 // A 4x4 block of accumulators is stored in v16-v31 (as 4x32 bit 1037 // components which need to be horizontally-added at the end) 1038 // 1039 // The Lhs vectors are multiplied by the Rhs vectors with a widening 1040 // multiply over the 8 first levels of depth, producing int16x8 1041 // vectors of products for each position in the accumulator matrix. 1042 // Here comes the special trick: since the operands are signed int8, 1043 // their range being [ -2^7 , 2^7 ), their products are in range 1044 // [ -2^14 , 2^14 - 1 ), meaning that we can add two such values 1045 // without any risk of overflowing int16. 1046 // We thus proceed with the 8 next levels of depth, multiplying 1047 // again Lhs by Rhs, accumulating into this existing int16x8 vector. 1048 // 1049 // Only then, having processed 16 levels of depth, do we need to 1050 // horizontally add these int16x8 accumulators into the final 1051 // int32x4 accumulators. 1052 // 1053 // As we do not have enough registers to store all 16 int16x8 1054 // temporary-16bit-accumulators, we have them cycle through v8--v15. 1055 // 1056 // 1057 // Register layout (ignoring the v8--v15 temporary 16bit accumulators): 1058 // 1059 // +--------+--------+--------+--------+ 1060 // |v0.b[0] |v1.b[0] |v2.b[0] |v3.b[0] | 1061 // Rhs +--------+--------+--------+--------+ 1062 // | ... | ... | ... | ... | 1063 // +--------+--------+--------+--------| 1064 // |v0.b[15]|v1.b[15]|v2.b[15]|v3.b[15]| 1065 // +--------+--------+--------+--------+ 1066 // 1067 // | | | | | 1068 // 1069 // Lhs | | | | | 1070 // 1071 // +-------+-----+--------+ - - +--------+--------+--------+--------+ 1072 // |v4.b[0]| ... |v4.b[15]| | v16.4s | v17.4s | v18.4s | v19.4s | 1073 // |v5.b[0]| ... |v5.b[15]| | v20.4s | v21.4s | v22.4s | v23.4s | 1074 // |v6.b[0]| ... |v6.b[15]| | v24.4s | v25.4s | v26.4s | v27.4s | 1075 // |v7.b[0]| ... |v7.b[15]| | v28.4s | v29.4s | v30.4s | v31.4s | 1076 // +-------+--------------+ - - +--------+--------+--------+--------+ 1077 // 1078 // Accumulator 1079 // 1080 1081 // Some multiplications and 16-bit accumulation were already done above, 1082 // so we start right away in the middle. 1083 "sadalp v16.4s, v8.8h\n" 1084 "ld1 {v4.16b}, [%[lhs_ptr]], #16\n" 1085 "smull v8.8h, v0.8b, v6.8b\n" 1086 "sadalp v17.4s, v9.8h\n" 1087 "ld1 {v5.16b}, [%[lhs_ptr]], #16\n" 1088 "smull v9.8h, v1.8b, v6.8b\n" 1089 "sadalp v18.4s, v10.8h\n" 1090 "smull v10.8h, v2.8b, v6.8b\n" 1091 "sadalp v19.4s, v11.8h\n" 1092 "smull v11.8h, v3.8b, v6.8b\n" 1093 "sadalp v20.4s, v12.8h\n" 1094 "smull v12.8h, v0.8b, v7.8b\n" 1095 "sadalp v21.4s, v13.8h\n" 1096 "smull v13.8h, v1.8b, v7.8b\n" 1097 "sadalp v22.4s, v14.8h\n" 1098 "smull v14.8h, v2.8b, v7.8b\n" 1099 "sadalp v23.4s, v15.8h\n" 1100 "smull v15.8h, v3.8b, v7.8b\n" 1101 1102 // Multiply-accumulate second-half, again into the same 1103 // 16bit local accumulator registers. This is where we 1104 // take advantage of having int8 instead of uint8 and therefore 1105 // being able to accumulate two products into int16. 1106 "smlal2 v8.8h, v0.16b, v6.16b\n" 1107 "smlal2 v9.8h, v1.16b, v6.16b\n" 1108 "smlal2 v10.8h, v2.16b, v6.16b\n" 1109 "smlal2 v11.8h, v3.16b, v6.16b\n" 1110 1111 "ld1 {v6.16b}, [%[lhs_ptr]], #16\n" 1112 1113 "smlal2 v12.8h, v0.16b, v7.16b\n" 1114 "ld1 {v0.16b}, [%[rhs_ptr]], #16\n" 1115 "smlal2 v13.8h, v1.16b, v7.16b\n" 1116 "ld1 {v1.16b}, [%[rhs_ptr]], #16\n" 1117 "smlal2 v14.8h, v2.16b, v7.16b\n" 1118 "ld1 {v2.16b}, [%[rhs_ptr]], #16\n" 1119 "smlal2 v15.8h, v3.16b, v7.16b\n" 1120 "ld1 {v3.16b}, [%[rhs_ptr]], #16\n" 1121 1122 "sadalp v24.4s, v8.8h\n" 1123 "smull v8.8h, v0.8b, v4.8b\n" 1124 "sadalp v25.4s, v9.8h\n" 1125 "ld1 {v7.16b}, [%[lhs_ptr]], #16\n" 1126 "smull v9.8h, v1.8b, v4.8b\n" 1127 "sadalp v26.4s, v10.8h\n" 1128 "smull v10.8h, v2.8b, v4.8b\n" 1129 "sadalp v27.4s, v11.8h\n" 1130 "smull v11.8h, v3.8b, v4.8b\n" 1131 "sadalp v28.4s, v12.8h\n" 1132 "smull v12.8h, v0.8b, v5.8b\n" 1133 "sadalp v29.4s, v13.8h\n" 1134 "smull v13.8h, v1.8b, v5.8b\n" 1135 "sadalp v30.4s, v14.8h\n" 1136 "smull v14.8h, v2.8b, v5.8b\n" 1137 "sadalp v31.4s, v15.8h\n" 1138 "smull v15.8h, v3.8b, v5.8b\n" 1139 1140 // Multiply-accumulate second-half, again into the same 1141 // 16bit local accumulator registers. This is where we 1142 // take advantage of having int8 instead of uint8 and therefore 1143 // being able to accumulate two products into int16. 1144 "smlal2 v8.8h, v0.16b, v4.16b\n" 1145 "smlal2 v9.8h, v1.16b, v4.16b\n" 1146 "smlal2 v10.8h, v2.16b, v4.16b\n" 1147 "smlal2 v11.8h, v3.16b, v4.16b\n" 1148 1149 // Loop. Decrement loop index (depth) by 16, since we just handled 1150 // 16 levels of depth. Do this subs a bit before the end of the loop 1151 // for better dispatch on A57. 1152 "subs %[run_depth], %[run_depth], #16\n" 1153 1154 "smlal2 v12.8h, v0.16b, v5.16b\n" 1155 "smlal2 v13.8h, v1.16b, v5.16b\n" 1156 "smlal2 v14.8h, v2.16b, v5.16b\n" 1157 "smlal2 v15.8h, v3.16b, v5.16b\n" 1158 1159 "bne " GEMMLOWP_LABEL_LOOP "b\n" 1160 1161 // Final code for the last 16 levels of depth. 1162 // There is nothing to load anymore, only some arithmetic to finish. 1163 GEMMLOWP_LABEL_AFTER_LOOP_LAST16 1164 ":\n" 1165 1166 // Some multiplications and 16-bit accumulation were already done above, 1167 // so we start right away in the middle. 1168 "sadalp v16.4s, v8.8h\n" 1169 "smull v8.8h, v0.8b, v6.8b\n" 1170 "sadalp v17.4s, v9.8h\n" 1171 "smull v9.8h, v1.8b, v6.8b\n" 1172 "sadalp v18.4s, v10.8h\n" 1173 "smull v10.8h, v2.8b, v6.8b\n" 1174 "sadalp v19.4s, v11.8h\n" 1175 "smull v11.8h, v3.8b, v6.8b\n" 1176 "sadalp v20.4s, v12.8h\n" 1177 "smull v12.8h, v0.8b, v7.8b\n" 1178 "sadalp v21.4s, v13.8h\n" 1179 "smull v13.8h, v1.8b, v7.8b\n" 1180 "sadalp v22.4s, v14.8h\n" 1181 "smull v14.8h, v2.8b, v7.8b\n" 1182 "sadalp v23.4s, v15.8h\n" 1183 "smull v15.8h, v3.8b, v7.8b\n" 1184 1185 // Multiply-accumulate second-half, again into the same 1186 // 16bit local accumulator registers. This is where we 1187 // take advantage of having int8 instead of uint8 and therefore 1188 // being able to accumulate two products into int16. 1189 "smlal2 v8.8h, v0.16b, v6.16b\n" 1190 "smlal2 v9.8h, v1.16b, v6.16b\n" 1191 "smlal2 v10.8h, v2.16b, v6.16b\n" 1192 "smlal2 v11.8h, v3.16b, v6.16b\n" 1193 "smlal2 v12.8h, v0.16b, v7.16b\n" 1194 "smlal2 v13.8h, v1.16b, v7.16b\n" 1195 "smlal2 v14.8h, v2.16b, v7.16b\n" 1196 "smlal2 v15.8h, v3.16b, v7.16b\n" 1197 1198 "sadalp v24.4s, v8.8h\n" 1199 "sadalp v25.4s, v9.8h\n" 1200 "sadalp v26.4s, v10.8h\n" 1201 "sadalp v27.4s, v11.8h\n" 1202 "sadalp v28.4s, v12.8h\n" 1203 "sadalp v29.4s, v13.8h\n" 1204 "sadalp v30.4s, v14.8h\n" 1205 "sadalp v31.4s, v15.8h\n" 1206 1207 // Reduce 32bit accumulators horizontally. 1208 "addp v0.4s, v16.4s, v20.4s\n" 1209 "addp v2.4s, v17.4s, v21.4s\n" 1210 "addp v4.4s, v18.4s, v22.4s\n" 1211 "addp v6.4s, v19.4s, v23.4s\n" 1212 "addp v1.4s, v24.4s, v28.4s\n" 1213 "addp v3.4s, v25.4s, v29.4s\n" 1214 "addp v5.4s, v26.4s, v30.4s\n" 1215 "addp v7.4s, v27.4s, v31.4s\n" 1216 1217 "cmp %[start_depth], #0\n" 1218 "bne " GEMMLOWP_LABEL_ACCUMULATE_EXISTING_DST_VALUES 1219 "f\n" 1220 1221 // Reduce 32bit accumulators horizontally, second pass 1222 // (each pass adds pairwise. we need to add 4-wise). 1223 "addp v12.4s, v0.4s, v1.4s\n" 1224 "addp v13.4s, v2.4s, v3.4s\n" 1225 "addp v14.4s, v4.4s, v5.4s\n" 1226 "addp v15.4s, v6.4s, v7.4s\n" 1227 1228 "b " GEMMLOWP_LABEL_STORE "f\n" 1229 1230 GEMMLOWP_LABEL_ACCUMULATE_EXISTING_DST_VALUES 1231 ":\n" 1232 1233 // Reduce 32bit accumulators horizontally, second pass 1234 // (each pass adds pairwise. we need to add 4-wise), 1235 // and load destination values from memory. 1236 "mov x0, %[dst_ptr]\n" 1237 "ld1 {v12.16b}, [x0], %[dst_col_stride]\n" 1238 "addp v8.4s, v0.4s, v1.4s\n" 1239 "ld1 {v13.16b}, [x0], %[dst_col_stride]\n" 1240 "addp v9.4s, v2.4s, v3.4s\n" 1241 "ld1 {v14.16b}, [x0], %[dst_col_stride]\n" 1242 "addp v10.4s, v4.4s, v5.4s\n" 1243 "ld1 {v15.16b}, [x0]\n" 1244 "addp v11.4s, v6.4s, v7.4s\n" 1245 1246 // Add horizontally-reduced accumulators into 1247 // the values loaded from memory 1248 "add v12.4s, v12.4s, v8.4s\n" 1249 "add v13.4s, v13.4s, v9.4s\n" 1250 "add v14.4s, v14.4s, v10.4s\n" 1251 "add v15.4s, v15.4s, v11.4s\n" 1252 1253 GEMMLOWP_LABEL_STORE 1254 ":\n" 1255 // Store back into memory 1256 "mov x0, %[dst_ptr]\n" 1257 "st1 {v12.16b}, [x0], %[dst_col_stride]\n" 1258 "st1 {v13.16b}, [x0], %[dst_col_stride]\n" 1259 "st1 {v14.16b}, [x0], %[dst_col_stride]\n" 1260 "st1 {v15.16b}, [x0]\n" 1261 : // outputs 1262 [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr), 1263 [dst_ptr] "+r"(dst_ptr), [run_depth] "+r"(run_depth), 1264 [dst_col_stride] "+r"(dst_col_stride) 1265 : // inputs 1266 [start_depth] "r"(start_depth) 1267 : // clobbers 1268 "cc", "memory", "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", 1269 "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", 1270 "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", 1271 "v28", "v29", "v30", "v31"); 1272 #undef GEMMLOWP_LABEL_LOOP 1273 #undef GEMMLOWP_LABEL_AFTER_LOOP_LAST16 1274 #undef GEMMLOWP_LABEL_ACCUMULATE_EXISTING_DST_VALUES 1275 #undef GEMMLOWP_LABEL_STORE 1276 } 1277 }; 1278 1279 // Same as NEON_32bit_GEMM_Int8Operands_LhsNonzero, but uses a side format that 1280 // requires that user inputs were originally int8. This avoids the uint8->int8 1281 // conversion in the pack step. 1282 struct NEON_64bit_GEMM_Int8Operands_LhsNonzero_Int8Inputs 1283 : NEON_64bit_GEMM_Int8Operands_LhsNonzero { 1284 typedef KernelFormat< 1285 KernelSideFormatInt8Inputs<CellFormat<4, 16, CellOrder::WidthMajor>, 1>, 1286 KernelSideFormatInt8Inputs<CellFormat<4, 16, CellOrder::WidthMajor>, 1> > 1287 Format; 1288 }; 1289 1290 // Our main GEMM kernel. 1291 struct NEON_64_Kernel12x8Depth2 : KernelBase { 1292 typedef KernelFormat<KernelSideFormat<CellFormat<4, 2>, 3>, 1293 KernelSideFormat<CellFormat<4, 2>, 2> > 1294 Format; 1295 NameNEON_64_Kernel12x8Depth21296 const char* Name() const override { return "NEON, 12x8, depth 2"; } 1297 1298 // TODO(benoitjacob): reorder function arguments so dst comes last RunNEON_64_Kernel12x8Depth21299 void Run(std::int32_t* dst_ptr, std::size_t dst_row_stride, 1300 std::size_t dst_col_stride, const std::uint8_t* lhs_ptr, 1301 const std::uint8_t* rhs_ptr, std::size_t start_depth, 1302 std::size_t run_depth) const override { 1303 (void)dst_row_stride; 1304 ScopedProfilingLabel label("optimized kernel (NEON 12x8)"); 1305 // See comments above for why we need local numerical labels in our asm. 1306 #define GEMMLOWP_LABEL_CLEAR_ACCUMULATORS "1" 1307 #define GEMMLOWP_LABEL_BEFORE_LOOP "2" 1308 #define GEMMLOWP_LABEL_LOOP "3" 1309 #define GEMMLOWP_LABEL_AFTER_LOOP "4" 1310 1311 assert(dst_row_stride == 1); 1312 asm volatile( 1313 // Load 1 Rhs cell of size 2x8 1314 "ld1 {v5.8b}, [%[rhs_ptr]], #8\n" 1315 "ld1 {v6.8b}, [%[rhs_ptr]], #8\n" 1316 1317 // Load 3 Lhs cells of size 4x2 each 1318 "ld1 {v2.8b}, [%[lhs_ptr]], #8\n" 1319 "ld1 {v3.8b}, [%[lhs_ptr]], #8\n" 1320 "ld1 {v4.8b}, [%[lhs_ptr]], #8\n" 1321 1322 // Multiply dst_col_stride by 4 == sizeof(int32) to use 1323 // it as a byte offset below. 1324 "lsl %[dst_col_stride], %[dst_col_stride], #2\n" 1325 1326 "cmp %[start_depth], #0\n" 1327 "beq " GEMMLOWP_LABEL_CLEAR_ACCUMULATORS 1328 "f\n" 1329 1330 // Load accumulators 1331 "mov x1, %[dst_ptr]\n" 1332 "mov x0, x1\n" 1333 "ld1 {v8.16b}, [x0], #16\n" 1334 "subs %[run_depth], %[run_depth], #2\n" 1335 "ld1 {v16.16b}, [x0], #16\n" 1336 "add x1, x1, %[dst_col_stride]\n" 1337 "ld1 {v24.16b}, [x0]\n" 1338 "mov x0, x1\n" 1339 "ld1 {v9.16b}, [x0], #16\n" 1340 "add x1, x1, %[dst_col_stride]\n" 1341 "ld1 {v17.16b}, [x0], #16\n" 1342 "ld1 {v25.16b}, [x0]\n" 1343 "mov x0, x1\n" 1344 "ld1 {v10.16b}, [x0], #16\n" 1345 "add x1, x1, %[dst_col_stride]\n" 1346 "ld1 {v18.16b}, [x0], #16\n" 1347 "ld1 {v26.16b}, [x0]\n" 1348 "mov x0, x1\n" 1349 "ld1 {v11.16b}, [x0], #16\n" 1350 "add x1, x1, %[dst_col_stride]\n" 1351 "ld1 {v19.16b}, [x0], #16\n" 1352 "ld1 {v27.16b}, [x0]\n" 1353 "mov x0, x1\n" 1354 "ld1 {v12.16b}, [x0], #16\n" 1355 "add x1, x1, %[dst_col_stride]\n" 1356 "ld1 {v20.16b}, [x0], #16\n" 1357 "ld1 {v28.16b}, [x0]\n" 1358 "mov x0, x1\n" 1359 "ld1 {v13.16b}, [x0], #16\n" 1360 "add x1, x1, %[dst_col_stride]\n" 1361 "ld1 {v21.16b}, [x0], #16\n" 1362 "ld1 {v29.16b}, [x0]\n" 1363 "mov x0, x1\n" 1364 "ld1 {v14.16b}, [x0], #16\n" 1365 "add x1, x1, %[dst_col_stride]\n" 1366 "ld1 {v22.16b}, [x0], #16\n" 1367 "ld1 {v30.16b}, [x0]\n" 1368 "mov x0, x1\n" 1369 "ld1 {v15.16b}, [x0], #16\n" 1370 "ld1 {v23.16b}, [x0], #16\n" 1371 "ld1 {v31.16b}, [x0]\n" 1372 1373 "b " GEMMLOWP_LABEL_BEFORE_LOOP "f\n" 1374 1375 GEMMLOWP_LABEL_CLEAR_ACCUMULATORS 1376 ":\n" 1377 1378 // Clear accumulator registers (see layout below) 1379 "dup v8.4s, wzr\n" 1380 "subs %[run_depth], %[run_depth], #2\n" 1381 "dup v9.4s, wzr\n" 1382 "dup v10.4s, wzr\n" 1383 "dup v11.4s, wzr\n" 1384 "dup v12.4s, wzr\n" 1385 "dup v13.4s, wzr\n" 1386 "dup v14.4s, wzr\n" 1387 "dup v15.4s, wzr\n" 1388 "dup v16.4s, wzr\n" 1389 "dup v17.4s, wzr\n" 1390 "dup v18.4s, wzr\n" 1391 "dup v19.4s, wzr\n" 1392 "dup v20.4s, wzr\n" 1393 "dup v21.4s, wzr\n" 1394 "dup v22.4s, wzr\n" 1395 "dup v23.4s, wzr\n" 1396 "dup v24.4s, wzr\n" 1397 "dup v25.4s, wzr\n" 1398 "dup v26.4s, wzr\n" 1399 "dup v27.4s, wzr\n" 1400 "dup v28.4s, wzr\n" 1401 "dup v29.4s, wzr\n" 1402 "dup v30.4s, wzr\n" 1403 "dup v31.4s, wzr\n" 1404 1405 GEMMLOWP_LABEL_BEFORE_LOOP 1406 ":\n" 1407 1408 "beq " GEMMLOWP_LABEL_AFTER_LOOP "f\n" 1409 1410 GEMMLOWP_LABEL_LOOP 1411 ":\n" 1412 1413 // Overview of register layout: 1414 // 1415 // A 2x8 block of 2 2x4 cells of Rhs is stored in 16bit in v0--v1. 1416 // A 12x2 block of 3 4x2 cells Lhs is stored in 16bit in v2--v4. 1417 // A 12x8 block of accumulators is stored in 32bit in v8--v31. 1418 // 1419 // +--------+--------+-----+--------+--------+ 1420 // |v0.h[0] |v0.h[1] | ... |v1.h[2] |v1.h[3] | 1421 // Rhs +--------+--------+-----+--------+--------+ 1422 // |v0.h[4] |v0.h[5] | ... |v1.h[6] |v1.h[7] | 1423 // +--------+--------+-----+--------+--------+ 1424 // 1425 // | | | | | | 1426 // 1427 // Lhs | | | | | | 1428 // 1429 // +-------+-------+ - - +--------+--------+-----+--------+--------+ 1430 // |v2.h[0]|v2.h[4]| |v8.s[0] |v9.s[0] | ... |v14.s[0]|v15.s[0]| 1431 // |v2.h[1]|v2.h[5]| |v8.s[1] |v9.s[1] | ... |v14.s[1]|v15.s[1]| 1432 // |v2.h[2]|v2.h[6]| |v8.s[2] |v9.s[2] | ... |v14.s[2]|v15.s[2]| 1433 // |v2.h[3]|v2.h[7]| |v8.s[3] |v9.s[3] | ... |v14.s[3]|v15.s[3]| 1434 // +-------+-------+ - - +--------+--------+-----+--------+--------+ 1435 // |v3.h[0]|v3.h[4]| |v16.s[0]|v17.s[0]| ... |v22.s[0]|v23.s[0]| 1436 // |v3.h[1]|v3.h[5]| |v16.s[1]|v17.s[1]| ... |v22.s[1]|v23.s[1]| 1437 // |v3.h[2]|v3.h[6]| |v16.s[2]|v17.s[2]| ... |v22.s[2]|v23.s[2]| 1438 // |v3.h[3]|v3.h[7]| |v16.s[3]|v17.s[3]| ... |v22.s[3]|v23.s[3]| 1439 // +-------+-------+ - - +--------+--------+-----+--------+--------+ 1440 // |v4.h[0]|v4.h[4]| |v24.s[0]|v25.s[0]| ... |v30.s[0]|v31.s[0]| 1441 // |v4.h[1]|v4.h[5]| |v24.s[1]|v25.s[1]| ... |v30.s[1]|v31.s[1]| 1442 // |v4.h[2]|v4.h[6]| |v24.s[2]|v25.s[2]| ... |v30.s[2]|v31.s[2]| 1443 // |v4.h[3]|v4.h[7]| |v24.s[3]|v25.s[3]| ... |v30.s[3]|v31.s[3]| 1444 // +-------+-------+ - - +--------+--------+-----+--------+--------+ 1445 // 1446 // Accumulator 1447 1448 // Expand Lhs/Rhs cells to 16 bit. 1449 "uxtl v0.8h, v5.8b\n" 1450 "ld1 {v5.8b}, [%[rhs_ptr]], #8\n" 1451 "uxtl v1.8h, v6.8b\n" 1452 "ld1 {v6.8b}, [%[rhs_ptr]], #8\n" 1453 "uxtl v2.8h, v2.8b\n" 1454 "uxtl v3.8h, v3.8b\n" 1455 "uxtl v4.8h, v4.8b\n" 1456 1457 // Multiply-accumulate, top third 1458 "umlal v8.4s, v2.4h, v0.h[0]\n" 1459 "umlal v9.4s, v2.4h, v0.h[1]\n" 1460 "umlal v10.4s, v2.4h, v0.h[2]\n" 1461 "umlal v11.4s, v2.4h, v0.h[3]\n" 1462 "umlal v12.4s, v2.4h, v1.h[0]\n" 1463 "umlal v13.4s, v2.4h, v1.h[1]\n" 1464 "umlal v14.4s, v2.4h, v1.h[2]\n" 1465 "umlal v15.4s, v2.4h, v1.h[3]\n" 1466 "umlal2 v8.4s, v2.8h, v0.h[4]\n" 1467 "umlal2 v9.4s, v2.8h, v0.h[5]\n" 1468 "umlal2 v10.4s, v2.8h, v0.h[6]\n" 1469 "umlal2 v11.4s, v2.8h, v0.h[7]\n" 1470 "umlal2 v12.4s, v2.8h, v1.h[4]\n" 1471 "umlal2 v13.4s, v2.8h, v1.h[5]\n" 1472 "umlal2 v14.4s, v2.8h, v1.h[6]\n" 1473 "umlal2 v15.4s, v2.8h, v1.h[7]\n" 1474 "ld1 {v2.8b}, [%[lhs_ptr]], #8\n" 1475 1476 // Multiply-accumulate, middle third 1477 "umlal v16.4s, v3.4h, v0.h[0]\n" 1478 "umlal v17.4s, v3.4h, v0.h[1]\n" 1479 "umlal v18.4s, v3.4h, v0.h[2]\n" 1480 "umlal v19.4s, v3.4h, v0.h[3]\n" 1481 "umlal v20.4s, v3.4h, v1.h[0]\n" 1482 "umlal v21.4s, v3.4h, v1.h[1]\n" 1483 "umlal v22.4s, v3.4h, v1.h[2]\n" 1484 "umlal v23.4s, v3.4h, v1.h[3]\n" 1485 "umlal2 v16.4s, v3.8h, v0.h[4]\n" 1486 "umlal2 v17.4s, v3.8h, v0.h[5]\n" 1487 "umlal2 v18.4s, v3.8h, v0.h[6]\n" 1488 "umlal2 v19.4s, v3.8h, v0.h[7]\n" 1489 "umlal2 v20.4s, v3.8h, v1.h[4]\n" 1490 "umlal2 v21.4s, v3.8h, v1.h[5]\n" 1491 "umlal2 v22.4s, v3.8h, v1.h[6]\n" 1492 "umlal2 v23.4s, v3.8h, v1.h[7]\n" 1493 "ld1 {v3.8b}, [%[lhs_ptr]], #8\n" 1494 1495 "subs %[run_depth], %[run_depth], #2\n" 1496 1497 // Multiply-accumulate, bottom third 1498 "umlal v24.4s, v4.4h, v0.h[0]\n" 1499 "umlal v25.4s, v4.4h, v0.h[1]\n" 1500 "umlal v26.4s, v4.4h, v0.h[2]\n" 1501 "umlal v27.4s, v4.4h, v0.h[3]\n" 1502 "umlal v28.4s, v4.4h, v1.h[0]\n" 1503 "umlal v29.4s, v4.4h, v1.h[1]\n" 1504 "umlal v30.4s, v4.4h, v1.h[2]\n" 1505 "umlal v31.4s, v4.4h, v1.h[3]\n" 1506 "umlal2 v24.4s, v4.8h, v0.h[4]\n" 1507 "umlal2 v25.4s, v4.8h, v0.h[5]\n" 1508 "umlal2 v26.4s, v4.8h, v0.h[6]\n" 1509 "umlal2 v27.4s, v4.8h, v0.h[7]\n" 1510 "umlal2 v28.4s, v4.8h, v1.h[4]\n" 1511 "umlal2 v29.4s, v4.8h, v1.h[5]\n" 1512 "umlal2 v30.4s, v4.8h, v1.h[6]\n" 1513 "umlal2 v31.4s, v4.8h, v1.h[7]\n" 1514 "ld1 {v4.8b}, [%[lhs_ptr]], #8\n" 1515 1516 "bne " GEMMLOWP_LABEL_LOOP "b\n" 1517 1518 GEMMLOWP_LABEL_AFTER_LOOP 1519 ":\n" 1520 1521 // Expand Lhs/Rhs cells to 16 bit. 1522 "uxtl v0.8h, v5.8b\n" 1523 "uxtl v1.8h, v6.8b\n" 1524 "uxtl v2.8h, v2.8b\n" 1525 "uxtl v3.8h, v3.8b\n" 1526 "uxtl v4.8h, v4.8b\n" 1527 1528 // Multiply-accumulate, level of depth 0 1529 "umlal v8.4s, v2.4h, v0.h[0]\n" 1530 "umlal v9.4s, v2.4h, v0.h[1]\n" 1531 "umlal v10.4s, v2.4h, v0.h[2]\n" 1532 "umlal v11.4s, v2.4h, v0.h[3]\n" 1533 "umlal v12.4s, v2.4h, v1.h[0]\n" 1534 "umlal v13.4s, v2.4h, v1.h[1]\n" 1535 "umlal v14.4s, v2.4h, v1.h[2]\n" 1536 "umlal v15.4s, v2.4h, v1.h[3]\n" 1537 "umlal v16.4s, v3.4h, v0.h[0]\n" 1538 "umlal v17.4s, v3.4h, v0.h[1]\n" 1539 "umlal v18.4s, v3.4h, v0.h[2]\n" 1540 "umlal v19.4s, v3.4h, v0.h[3]\n" 1541 "umlal v20.4s, v3.4h, v1.h[0]\n" 1542 "umlal v21.4s, v3.4h, v1.h[1]\n" 1543 "umlal v22.4s, v3.4h, v1.h[2]\n" 1544 "umlal v23.4s, v3.4h, v1.h[3]\n" 1545 "umlal v24.4s, v4.4h, v0.h[0]\n" 1546 "umlal v25.4s, v4.4h, v0.h[1]\n" 1547 "umlal v26.4s, v4.4h, v0.h[2]\n" 1548 "umlal v27.4s, v4.4h, v0.h[3]\n" 1549 "umlal v28.4s, v4.4h, v1.h[0]\n" 1550 "umlal v29.4s, v4.4h, v1.h[1]\n" 1551 "umlal v30.4s, v4.4h, v1.h[2]\n" 1552 "umlal v31.4s, v4.4h, v1.h[3]\n" 1553 1554 // Multiply-accumulate, level of depth 1 1555 "umlal2 v8.4s, v2.8h, v0.h[4]\n" 1556 "umlal2 v9.4s, v2.8h, v0.h[5]\n" 1557 "umlal2 v10.4s, v2.8h, v0.h[6]\n" 1558 "umlal2 v11.4s, v2.8h, v0.h[7]\n" 1559 "umlal2 v12.4s, v2.8h, v1.h[4]\n" 1560 "umlal2 v13.4s, v2.8h, v1.h[5]\n" 1561 "umlal2 v14.4s, v2.8h, v1.h[6]\n" 1562 "umlal2 v15.4s, v2.8h, v1.h[7]\n" 1563 "umlal2 v16.4s, v3.8h, v0.h[4]\n" 1564 "umlal2 v17.4s, v3.8h, v0.h[5]\n" 1565 "umlal2 v18.4s, v3.8h, v0.h[6]\n" 1566 "umlal2 v19.4s, v3.8h, v0.h[7]\n" 1567 "umlal2 v20.4s, v3.8h, v1.h[4]\n" 1568 "umlal2 v21.4s, v3.8h, v1.h[5]\n" 1569 "umlal2 v22.4s, v3.8h, v1.h[6]\n" 1570 "umlal2 v23.4s, v3.8h, v1.h[7]\n" 1571 "umlal2 v24.4s, v4.8h, v0.h[4]\n" 1572 "umlal2 v25.4s, v4.8h, v0.h[5]\n" 1573 "umlal2 v26.4s, v4.8h, v0.h[6]\n" 1574 "umlal2 v27.4s, v4.8h, v0.h[7]\n" 1575 "umlal2 v28.4s, v4.8h, v1.h[4]\n" 1576 "umlal2 v29.4s, v4.8h, v1.h[5]\n" 1577 "umlal2 v30.4s, v4.8h, v1.h[6]\n" 1578 "umlal2 v31.4s, v4.8h, v1.h[7]\n" 1579 1580 // Store accumulators 1581 "mov x1, %[dst_ptr]\n" 1582 "mov x0, x1\n" 1583 "st1 {v8.16b}, [x0], #16\n" 1584 "subs %[run_depth], %[run_depth], #2\n" 1585 "st1 {v16.16b}, [x0], #16\n" 1586 "add x1, x1, %[dst_col_stride]\n" 1587 "st1 {v24.16b}, [x0]\n" 1588 "mov x0, x1\n" 1589 "st1 {v9.16b}, [x0], #16\n" 1590 "add x1, x1, %[dst_col_stride]\n" 1591 "st1 {v17.16b}, [x0], #16\n" 1592 "st1 {v25.16b}, [x0]\n" 1593 "mov x0, x1\n" 1594 "st1 {v10.16b}, [x0], #16\n" 1595 "add x1, x1, %[dst_col_stride]\n" 1596 "st1 {v18.16b}, [x0], #16\n" 1597 "st1 {v26.16b}, [x0]\n" 1598 "mov x0, x1\n" 1599 "st1 {v11.16b}, [x0], #16\n" 1600 "add x1, x1, %[dst_col_stride]\n" 1601 "st1 {v19.16b}, [x0], #16\n" 1602 "st1 {v27.16b}, [x0]\n" 1603 "mov x0, x1\n" 1604 "st1 {v12.16b}, [x0], #16\n" 1605 "add x1, x1, %[dst_col_stride]\n" 1606 "st1 {v20.16b}, [x0], #16\n" 1607 "st1 {v28.16b}, [x0]\n" 1608 "mov x0, x1\n" 1609 "st1 {v13.16b}, [x0], #16\n" 1610 "add x1, x1, %[dst_col_stride]\n" 1611 "st1 {v21.16b}, [x0], #16\n" 1612 "st1 {v29.16b}, [x0]\n" 1613 "mov x0, x1\n" 1614 "st1 {v14.16b}, [x0], #16\n" 1615 "add x1, x1, %[dst_col_stride]\n" 1616 "st1 {v22.16b}, [x0], #16\n" 1617 "st1 {v30.16b}, [x0]\n" 1618 "mov x0, x1\n" 1619 "st1 {v15.16b}, [x0], #16\n" 1620 "st1 {v23.16b}, [x0], #16\n" 1621 "st1 {v31.16b}, [x0]\n" 1622 #undef GEMMLOWP_LABEL_CLEAR_ACCUMULATORS 1623 #undef GEMMLOWP_LABEL_BEFORE_LOOP 1624 #undef GEMMLOWP_LABEL_LOOP 1625 #undef GEMMLOWP_LABEL_AFTER_LOOP 1626 : // outputs 1627 [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr), 1628 [dst_ptr] "+r"(dst_ptr), 1629 [run_depth] "+r"(run_depth) 1630 : // inputs 1631 [start_depth] "r"(start_depth), 1632 [dst_col_stride] "r"(dst_col_stride) 1633 : // clobbers 1634 "cc", "memory", "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", 1635 "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", 1636 "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", 1637 "v27", "v28", "v29", "v30", "v31"); 1638 } 1639 }; 1640 1641 #ifdef GEMMLOWP_DOTPROD_KERNEL 1642 #ifndef __ARM_FEATURE_DOTPROD 1643 #error This kernel requires ARM dot-product instructions. Enable them by \ 1644 adding '+dotprod' to a compiler flag, e.g. -march=armv8.2-a+dotprod . \ 1645 Note that Clang up to version 7 fails to define the corresponding \ 1646 preprocessor token __ARM_FEATURE_DOTPROD, so you will still have to define \ 1647 it manually. 1648 #endif 1649 // Kernels utilizing the Armv8.2 Dot Product extension. 1650 // 1651 // The dot product instructions work by taking 4 consecutive 8-bit depth 1652 // values from each operand, multiplying the 4 pairs together and 1653 // accumulating all the results into the corresponding 32-bit accumulator 1654 // lane. As such, the operation is identical to a 32-bit instruction (like 1655 // FMLA used in SGEMM), except that 4 depth values are processed at a time 1656 // instead of 1. 1657 1658 // Thus, this first kernel is a carbon copy of 1659 // "NEON_64bit_GEMM_Float32_WithScalar_A57" (which should provide good 1660 // performance for most processors) below with the opcode (fmla -> udot) and 1661 // types (float32 -> uint8/uint32) changed. 1662 // 1663 // A signed version of this kernel could be produced by replacing "udot" 1664 // with "sdot" - performance should be identical to this udot kernel. 1665 struct NEON_64_Kernel12x8Depth4_dotprod : KernelBase { 1666 typedef KernelFormat<KernelSideFormat<CellFormat<4, 4, CellOrder::WidthMajor>, 3>, 1667 KernelSideFormat<CellFormat<4, 4, CellOrder::WidthMajor>, 2> > 1668 Format; 1669 NameNEON_64_Kernel12x8Depth4_dotprod1670 const char* Name() const override { return "NEON, 12x8, depth 4, dotprod"; } 1671 RunNEON_64_Kernel12x8Depth4_dotprod1672 void Run(std::int32_t* dst_ptr, std::size_t dst_row_stride, std::size_t dst_col_stride, 1673 const std::uint8_t* lhs_ptr, const std::uint8_t* rhs_ptr, std::size_t start_depth, 1674 std::size_t depth) const override { 1675 (void)dst_row_stride; 1676 ScopedProfilingLabel label("optimized kernel (NEON 12x8, depth 4, dotprod)"); 1677 // See comments above for why we need local numerical labels in our asm. 1678 #define GEMMLOWP_LABEL_CLEAR_ACCUMULATORS "1" 1679 #define GEMMLOWP_LABEL_BEFORE_LOOP "2" 1680 #define GEMMLOWP_LABEL_LOOP "3" 1681 #define GEMMLOWP_LABEL_AFTER_LOOP "4" 1682 1683 assert(dst_row_stride == 1); 1684 asm volatile( 1685 // Multiply dst_col_stride by 4 == sizeof(int32) to use 1686 // it as a byte offset below. 1687 "lsl %[dst_col_stride], %[dst_col_stride], #2\n" 1688 1689 "cmp %[start_depth], #0\n" 1690 "beq " GEMMLOWP_LABEL_CLEAR_ACCUMULATORS "f\n" 1691 1692 // Load accumulators 1693 "mov x1, %[dst_ptr]\n" 1694 "mov x0, x1\n" 1695 "ld1 {v8.16b}, [x0], #16\n" 1696 "ld1 {v16.16b}, [x0], #16\n" 1697 "add x1, x1, %[dst_col_stride]\n" 1698 "ld1 {v24.16b}, [x0]\n" 1699 "mov x0, x1\n" 1700 "ld1 {v9.16b}, [x0], #16\n" 1701 "add x1, x1, %[dst_col_stride]\n" 1702 "ld1 {v17.16b}, [x0], #16\n" 1703 "ld1 {v25.16b}, [x0]\n" 1704 "mov x0, x1\n" 1705 "ld1 {v10.16b}, [x0], #16\n" 1706 "add x1, x1, %[dst_col_stride]\n" 1707 "ld1 {v18.16b}, [x0], #16\n" 1708 "ld1 {v26.16b}, [x0]\n" 1709 "mov x0, x1\n" 1710 "ld1 {v11.16b}, [x0], #16\n" 1711 "add x1, x1, %[dst_col_stride]\n" 1712 "ld1 {v19.16b}, [x0], #16\n" 1713 "ld1 {v27.16b}, [x0]\n" 1714 "mov x0, x1\n" 1715 "ld1 {v12.16b}, [x0], #16\n" 1716 "add x1, x1, %[dst_col_stride]\n" 1717 "ld1 {v20.16b}, [x0], #16\n" 1718 "ld1 {v28.16b}, [x0]\n" 1719 "mov x0, x1\n" 1720 "ld1 {v13.16b}, [x0], #16\n" 1721 "add x1, x1, %[dst_col_stride]\n" 1722 "ld1 {v21.16b}, [x0], #16\n" 1723 "ld1 {v29.16b}, [x0]\n" 1724 "mov x0, x1\n" 1725 "ld1 {v14.16b}, [x0], #16\n" 1726 "add x1, x1, %[dst_col_stride]\n" 1727 "ld1 {v22.16b}, [x0], #16\n" 1728 "ld1 {v30.16b}, [x0]\n" 1729 "mov x0, x1\n" 1730 "ld1 {v15.16b}, [x0], #16\n" 1731 "ld1 {v23.16b}, [x0], #16\n" 1732 "ld1 {v31.16b}, [x0]\n" 1733 1734 "b " GEMMLOWP_LABEL_BEFORE_LOOP "f\n" 1735 1736 GEMMLOWP_LABEL_CLEAR_ACCUMULATORS ":\n" 1737 1738 // Clear accumulator registers (see layout below) 1739 "dup v8.4s, wzr\n" 1740 "dup v9.4s, wzr\n" 1741 "dup v10.4s, wzr\n" 1742 "dup v11.4s, wzr\n" 1743 "dup v12.4s, wzr\n" 1744 "dup v13.4s, wzr\n" 1745 "dup v14.4s, wzr\n" 1746 "dup v15.4s, wzr\n" 1747 "dup v16.4s, wzr\n" 1748 "dup v17.4s, wzr\n" 1749 "dup v18.4s, wzr\n" 1750 "dup v19.4s, wzr\n" 1751 "dup v20.4s, wzr\n" 1752 "dup v21.4s, wzr\n" 1753 "dup v22.4s, wzr\n" 1754 "dup v23.4s, wzr\n" 1755 "dup v24.4s, wzr\n" 1756 "dup v25.4s, wzr\n" 1757 "dup v26.4s, wzr\n" 1758 "dup v27.4s, wzr\n" 1759 "dup v28.4s, wzr\n" 1760 "dup v29.4s, wzr\n" 1761 "dup v30.4s, wzr\n" 1762 "dup v31.4s, wzr\n" 1763 1764 GEMMLOWP_LABEL_BEFORE_LOOP ":\n" 1765 1766 "subs %w[depth], %w[depth], #4\n" 1767 1768 // The start of the loop assumes first Rhs cell is already loaded, so 1769 // do it here for first iteration. 1770 "ld1 {v0.16b}, [%[rhs_ptr]], #16\n" 1771 1772 // And the same for the first Lhs cell. 1773 "ld1 {v2.16b}, [%[lhs_ptr]], #16\n" 1774 1775 "beq " GEMMLOWP_LABEL_AFTER_LOOP "f\n" 1776 1777 GEMMLOWP_LABEL_LOOP ":\n" 1778 1779 // Start the MACs at the head of the loop - 1st cell from each side 1780 // already loaded. 1781 ".word 0x6f80e048 // udot v8.4s, v2.16b, v0.4b[0]\n" 1782 ".word 0x6fa0e049 // udot v9.4s, v2.16b, v0.4b[1]\n" 1783 "ld1 {v1.16b}, [%[rhs_ptr]], #16\n" // Load second Rhs cell. 1784 ".word 0x6f80e84a // udot v10.4s, v2.16b, v0.4b[2]\n" 1785 ".word 0x6fa0e84b // udot v11.4s, v2.16b, v0.4b[3]\n" 1786 "ld1 {v3.16b}, [%[lhs_ptr]], #16\n" // Load second Lhs cell. 1787 ".word 0x6f81e04c // udot v12.4s, v2.16b, v1.4b[0]\n" 1788 ".word 0x6fa1e04d // udot v13.4s, v2.16b, v1.4b[1]\n" 1789 "ld1 {v4.16b}, [%[lhs_ptr]], #16\n" // Load third Lhs cell. 1790 ".word 0x6f81e84e // udot v14.4s, v2.16b, v1.4b[2]\n" 1791 ".word 0x6fa1e84f // udot v15.4s, v2.16b, v1.4b[3]\n" 1792 "ld1 {v2.16b}, [%[lhs_ptr]], #16\n" // Done with first Lhs cell - load 1793 // for the next iteration early. 1794 ".word 0x6f80e070 // udot v16.4s, v3.16b, v0.4b[0]\n" 1795 ".word 0x6fa0e071 // udot v17.4s, v3.16b, v0.4b[1]\n" 1796 ".word 0x6f80e872 // udot v18.4s, v3.16b, v0.4b[2]\n" 1797 ".word 0x6fa0e873 // udot v19.4s, v3.16b, v0.4b[3]\n" 1798 ".word 0x6f81e074 // udot v20.4s, v3.16b, v1.4b[0]\n" 1799 ".word 0x6fa1e075 // udot v21.4s, v3.16b, v1.4b[1]\n" 1800 ".word 0x6f81e876 // udot v22.4s, v3.16b, v1.4b[2]\n" 1801 ".word 0x6fa1e877 // udot v23.4s, v3.16b, v1.4b[3]\n" 1802 ".word 0x6f80e098 // udot v24.4s, v4.16b, v0.4b[0]\n" 1803 ".word 0x6fa0e099 // udot v25.4s, v4.16b, v0.4b[1]\n" 1804 ".word 0x6f80e89a // udot v26.4s, v4.16b, v0.4b[2]\n" 1805 ".word 0x6fa0e89b // udot v27.4s, v4.16b, v0.4b[3]\n" 1806 "ld1 {v0.16b}, [%[rhs_ptr]], #16\n" // Done with the first Rhs cell - 1807 // load for the next iteration early. 1808 ".word 0x6f81e09c // udot v28.4s, v4.16b, v1.4b[0]\n" 1809 ".word 0x6fa1e09d // udot v29.4s, v4.16b, v1.4b[1]\n" 1810 1811 // Loop. Decrement loop index (depth) by 4 as udot processes 4 1812 // depth values. 1813 "subs %w[depth], %w[depth], #4\n" 1814 ".word 0x6f81e89e // udot v30.4s, v4.16b, v1.4b[2]\n" 1815 ".word 0x6fa1e89f // udot v31.4s, v4.16b, v1.4b[3]\n" 1816 1817 "bne " GEMMLOWP_LABEL_LOOP "b\n" 1818 1819 GEMMLOWP_LABEL_AFTER_LOOP ":\n" 1820 1821 // Final iteration. v0 and v2 were already loaded, don't load 1822 // them again, don't read past the end of buffers. 1823 ".word 0x6f80e048 // udot v8.4s, v2.16b, v0.4b[0]\n" 1824 ".word 0x6fa0e049 // udot v9.4s, v2.16b, v0.4b[1]\n" 1825 "ld1 {v1.16b}, [%[rhs_ptr]], #16\n" // Load second Rhs cell. 1826 ".word 0x6f80e84a // udot v10.4s, v2.16b, v0.4b[2]\n" 1827 ".word 0x6fa0e84b // udot v11.4s, v2.16b, v0.4b[3]\n" 1828 "ld1 {v3.16b}, [%[lhs_ptr]], #16\n" // Load second Lhs cell. 1829 ".word 0x6f81e04c // udot v12.4s, v2.16b, v1.4b[0]\n" 1830 ".word 0x6fa1e04d // udot v13.4s, v2.16b, v1.4b[1]\n" 1831 "ld1 {v4.16b}, [%[lhs_ptr]], #16\n" // Load third Lhs cell. 1832 ".word 0x6f81e84e // udot v14.4s, v2.16b, v1.4b[2]\n" 1833 ".word 0x6fa1e84f // udot v15.4s, v2.16b, v1.4b[3]\n" 1834 ".word 0x6f80e070 // udot v16.4s, v3.16b, v0.4b[0]\n" 1835 ".word 0x6fa0e071 // udot v17.4s, v3.16b, v0.4b[1]\n" 1836 ".word 0x6f80e872 // udot v18.4s, v3.16b, v0.4b[2]\n" 1837 ".word 0x6fa0e873 // udot v19.4s, v3.16b, v0.4b[3]\n" 1838 ".word 0x6f81e074 // udot v20.4s, v3.16b, v1.4b[0]\n" 1839 ".word 0x6fa1e075 // udot v21.4s, v3.16b, v1.4b[1]\n" 1840 ".word 0x6f81e876 // udot v22.4s, v3.16b, v1.4b[2]\n" 1841 ".word 0x6fa1e877 // udot v23.4s, v3.16b, v1.4b[3]\n" 1842 ".word 0x6f80e098 // udot v24.4s, v4.16b, v0.4b[0]\n" 1843 ".word 0x6fa0e099 // udot v25.4s, v4.16b, v0.4b[1]\n" 1844 ".word 0x6f80e89a // udot v26.4s, v4.16b, v0.4b[2]\n" 1845 ".word 0x6fa0e89b // udot v27.4s, v4.16b, v0.4b[3]\n" 1846 ".word 0x6f81e09c // udot v28.4s, v4.16b, v1.4b[0]\n" 1847 ".word 0x6fa1e09d // udot v29.4s, v4.16b, v1.4b[1]\n" 1848 1849 // Loop. Decrement loop index (depth) by 4 as udot processes 4 1850 // depth values. 1851 "subs %w[depth], %w[depth], #4\n" 1852 ".word 0x6f81e89e // udot v30.4s, v4.16b, v1.4b[2]\n" 1853 ".word 0x6fa1e89f // udot v31.4s, v4.16b, v1.4b[3]\n" 1854 1855 // Store accumulators 1856 "mov x1, %[dst_ptr]\n" 1857 "mov x0, x1\n" 1858 "st1 {v8.16b}, [x0], #16\n" 1859 "st1 {v16.16b}, [x0], #16\n" 1860 "add x1, x1, %[dst_col_stride]\n" 1861 "st1 {v24.16b}, [x0]\n" 1862 "mov x0, x1\n" 1863 "st1 {v9.16b}, [x0], #16\n" 1864 "add x1, x1, %[dst_col_stride]\n" 1865 "st1 {v17.16b}, [x0], #16\n" 1866 "st1 {v25.16b}, [x0]\n" 1867 "mov x0, x1\n" 1868 "st1 {v10.16b}, [x0], #16\n" 1869 "add x1, x1, %[dst_col_stride]\n" 1870 "st1 {v18.16b}, [x0], #16\n" 1871 "st1 {v26.16b}, [x0]\n" 1872 "mov x0, x1\n" 1873 "st1 {v11.16b}, [x0], #16\n" 1874 "add x1, x1, %[dst_col_stride]\n" 1875 "st1 {v19.16b}, [x0], #16\n" 1876 "st1 {v27.16b}, [x0]\n" 1877 "mov x0, x1\n" 1878 "st1 {v12.16b}, [x0], #16\n" 1879 "add x1, x1, %[dst_col_stride]\n" 1880 "st1 {v20.16b}, [x0], #16\n" 1881 "st1 {v28.16b}, [x0]\n" 1882 "mov x0, x1\n" 1883 "st1 {v13.16b}, [x0], #16\n" 1884 "add x1, x1, %[dst_col_stride]\n" 1885 "st1 {v21.16b}, [x0], #16\n" 1886 "st1 {v29.16b}, [x0]\n" 1887 "mov x0, x1\n" 1888 "st1 {v14.16b}, [x0], #16\n" 1889 "add x1, x1, %[dst_col_stride]\n" 1890 "st1 {v22.16b}, [x0], #16\n" 1891 "st1 {v30.16b}, [x0]\n" 1892 "mov x0, x1\n" 1893 "st1 {v15.16b}, [x0], #16\n" 1894 "st1 {v23.16b}, [x0], #16\n" 1895 "st1 {v31.16b}, [x0]\n" 1896 : // outputs 1897 [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr), 1898 [depth] "+r"(depth) 1899 : // inputs 1900 [dst_ptr] "r"(dst_ptr), [dst_col_stride] "r"(dst_col_stride), [start_depth] "r"(start_depth) 1901 : // clobbers 1902 "cc", "memory", "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", 1903 "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", 1904 "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); 1905 } 1906 }; 1907 #endif // GEMMLOWP_DOTPROD_KERNEL 1908 1909 #endif // GEMMLOWP_NEON_64 1910 1911 } // namespace gemmlowp 1912 1913 #endif // GEMMLOWP_INTERNAL_KERNEL_NEON_H_ 1914