1 // Copyright 2016 The Gemmlowp Authors. All Rights Reserved. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 #ifndef GEMMLOWP_META_QUANTIZED_MUL_KERNELS_ARM_64_H_ 16 #define GEMMLOWP_META_QUANTIZED_MUL_KERNELS_ARM_64_H_ 17 18 #ifdef GEMMLOWP_NEON_64 19 20 #include <cassert> 21 #include <cstdint> 22 23 namespace gemmlowp { 24 namespace meta { 25 26 template <> 27 inline void 28 MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 1, 1, Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessed,RowMajor> & params,uint8_t * result)29 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, 30 const FusedKernelParams<QuantizedStaticPreprocessed, 31 RowMajor>& params, 32 uint8_t* result) { 33 #ifdef DEBUG 34 #ifdef DEBUG_METAGEMM_VERBOSE 35 std::cout << __FILE__ << "(" << __LINE__ 36 << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, " 37 "QuantizedStaticPreprocessed, RowMajor, 1, 1, 8>::Multiply()" 38 << std::endl 39 << std::flush; 40 #endif 41 #endif 42 asm volatile( 43 "prfm pldl1keep, [%x[lhs]]\n" 44 "prfm pldl1keep, [%x[rhs]]\n" 45 46 // Clear aggregators. 47 "movi v0.4s, #0\n" 48 49 // General NxM lanes loop. 50 "1:" 51 52 // Subtract counter. 53 "subs %x[count], %x[count], #8\n" 54 55 "ld1 {v1.2s}, [%x[lhs]], #8\n" 56 "ld1 {v2.2s}, [%x[rhs]], #8\n" 57 "prfm pldl1keep, [%x[lhs], #64]\n" 58 "prfm pldl1keep, [%x[rhs], #64]\n" 59 "umull v3.8h, v2.8b, v1.8b\n" 60 "uadalp v0.4s, v3.8h\n" 61 62 // Loop break. 63 "bgt 1b\n" 64 65 // StaticQuantization::Prepare 66 "ld1 {v4.4s}, [%x[lhs]], #16\n" 67 "ld1 {v5.4s}, [%x[rhs]], #16\n" 68 "dup v6.4s, %w[multiplicative_offset]\n" 69 "dup v7.4s, %w[rounding_offset]\n" 70 "dup v8.4s, %w[shift]\n" 71 "dup v4.4s, v4.s[0]\n" 72 73 // RowMajorOutput::Prepare 74 75 // Reduce aggregators. 76 "addp v0.4s, v0.4s, v0.4s\n" 77 "addp v0.4s, v0.4s, v0.4s\n" 78 79 // StaticQuantization::Transform 80 "add v0.4s, v0.4s, v4.4s\n" 81 "add v0.4s, v0.4s, v5.4s\n" 82 "mul v0.4s, v0.4s, v6.4s\n" 83 "add v0.4s, v0.4s, v7.4s\n" 84 "sshl v0.4s, v0.4s, v8.4s\n" 85 "sqxtn v0.4h, v0.4s\n" 86 "sqxtun v0.8b, v0.8h\n" 87 88 // RowMajorOutput::Output 89 "st1 {v0.b}[0], [%x[result]], #1\n" 90 : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) 91 : [count] "r"(params.kernel.count), 92 [multiplicative_offset] "r"(params.kernel.multiplicative_offset), 93 [shift] "r"(params.kernel.shift), 94 [stride] "r"(params.output_stream.stride), 95 [rounding_offset] "r"(params.kernel.rounding_offset) 96 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "cc", "memory"); 97 } 98 99 template <> 100 inline void 101 MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 1, 2, Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessed,RowMajor> & params,uint8_t * result)102 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, 103 const FusedKernelParams<QuantizedStaticPreprocessed, 104 RowMajor>& params, 105 uint8_t* result) { 106 #ifdef DEBUG 107 #ifdef DEBUG_METAGEMM_VERBOSE 108 std::cout << __FILE__ << "(" << __LINE__ 109 << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, " 110 "QuantizedStaticPreprocessed, RowMajor, 1, 2, 8>::Multiply()" 111 << std::endl 112 << std::flush; 113 #endif 114 #endif 115 asm volatile( 116 "prfm pldl1keep, [%x[lhs]]\n" 117 "prfm pldl1keep, [%x[rhs]]\n" 118 119 // Clear aggregators. 120 "movi v0.4s, #0\n" 121 "movi v1.4s, #0\n" 122 123 // General NxM lanes loop. 124 "1:" 125 126 // Subtract counter. 127 "subs %x[count], %x[count], #8\n" 128 129 "ld1 {v2.2s}, [%x[lhs]], #8\n" 130 "ld1 {v3.2s, v4.2s}, [%x[rhs]], #16\n" 131 "prfm pldl1keep, [%x[lhs], #64]\n" 132 "prfm pldl1keep, [%x[rhs], #64]\n" 133 "umull v5.8h, v3.8b, v2.8b\n" 134 "umull v6.8h, v4.8b, v2.8b\n" 135 "uadalp v0.4s, v5.8h\n" 136 "uadalp v1.4s, v6.8h\n" 137 138 // Loop break. 139 "bgt 1b\n" 140 141 // StaticQuantization::Prepare 142 "ld1 {v4.4s}, [%x[lhs]], #16\n" 143 "ld1 {v5.4s}, [%x[rhs]], #16\n" 144 "dup v6.4s, %w[multiplicative_offset]\n" 145 "dup v7.4s, %w[rounding_offset]\n" 146 "dup v8.4s, %w[shift]\n" 147 "dup v4.4s, v4.s[0]\n" 148 149 // RowMajorOutput::Prepare 150 151 // Reduce aggregators. 152 "addp v0.4s, v0.4s, v1.4s\n" 153 "addp v0.4s, v0.4s, v0.4s\n" 154 155 // StaticQuantization::Transform 156 "add v0.4s, v0.4s, v4.4s\n" 157 "add v0.4s, v0.4s, v5.4s\n" 158 "mul v0.4s, v0.4s, v6.4s\n" 159 "add v0.4s, v0.4s, v7.4s\n" 160 "sshl v0.4s, v0.4s, v8.4s\n" 161 "sqxtn v0.4h, v0.4s\n" 162 "sqxtun v0.8b, v0.8h\n" 163 164 // RowMajorOutput::Output 165 "st1 {v0.h}[0], [%x[result]], #2\n" 166 : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) 167 : [count] "r"(params.kernel.count), 168 [multiplicative_offset] "r"(params.kernel.multiplicative_offset), 169 [shift] "r"(params.kernel.shift), 170 [stride] "r"(params.output_stream.stride), 171 [rounding_offset] "r"(params.kernel.rounding_offset) 172 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "cc", "memory"); 173 } 174 175 template <> 176 inline void 177 MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 1, 3, Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessed,RowMajor> & params,uint8_t * result)178 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, 179 const FusedKernelParams<QuantizedStaticPreprocessed, 180 RowMajor>& params, 181 uint8_t* result) { 182 #ifdef DEBUG 183 #ifdef DEBUG_METAGEMM_VERBOSE 184 std::cout << __FILE__ << "(" << __LINE__ 185 << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, " 186 "QuantizedStaticPreprocessed, RowMajor, 1, 3, 8>::Multiply()" 187 << std::endl 188 << std::flush; 189 #endif 190 #endif 191 asm volatile( 192 "prfm pldl1keep, [%x[lhs]]\n" 193 "prfm pldl1keep, [%x[rhs]]\n" 194 195 // Clear aggregators. 196 "movi v0.4s, #0\n" 197 "movi v1.4s, #0\n" 198 "movi v2.4s, #0\n" 199 200 // General NxM lanes loop. 201 "1:" 202 203 // Subtract counter. 204 "subs %x[count], %x[count], #8\n" 205 206 "ld1 {v3.2s}, [%x[lhs]], #8\n" 207 "ld1 {v4.2s, v5.2s, v6.2s}, [%x[rhs]], #24\n" 208 "prfm pldl1keep, [%x[lhs], #64]\n" 209 "prfm pldl1keep, [%x[rhs], #64]\n" 210 "umull v7.8h, v4.8b, v3.8b\n" 211 "umull v8.8h, v5.8b, v3.8b\n" 212 "umull v9.8h, v6.8b, v3.8b\n" 213 "uadalp v0.4s, v7.8h\n" 214 "uadalp v1.4s, v8.8h\n" 215 "uadalp v2.4s, v9.8h\n" 216 217 // Loop break. 218 "bgt 1b\n" 219 220 // StaticQuantization::Prepare 221 "ld1 {v4.4s}, [%x[lhs]], #16\n" 222 "ld1 {v5.4s}, [%x[rhs]], #16\n" 223 "dup v6.4s, %w[multiplicative_offset]\n" 224 "dup v7.4s, %w[rounding_offset]\n" 225 "dup v8.4s, %w[shift]\n" 226 "dup v4.4s, v4.s[0]\n" 227 228 // RowMajorOutput::Prepare 229 230 // Reduce aggregators. 231 "addp v0.4s, v0.4s, v1.4s\n" 232 "addp v2.4s, v2.4s, v2.4s\n" 233 "addp v0.4s, v0.4s, v2.4s\n" 234 235 // StaticQuantization::Transform 236 "add v0.4s, v0.4s, v4.4s\n" 237 "add v0.4s, v0.4s, v5.4s\n" 238 "mul v0.4s, v0.4s, v6.4s\n" 239 "add v0.4s, v0.4s, v7.4s\n" 240 "sshl v0.4s, v0.4s, v8.4s\n" 241 "sqxtn v0.4h, v0.4s\n" 242 "sqxtun v0.8b, v0.8h\n" 243 244 // RowMajorOutput::Output 245 "st1 {v0.h}[0], [%x[result]], #2\n" 246 "st1 {v0.b}[2], [%x[result]], #1\n" 247 : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) 248 : [count] "r"(params.kernel.count), 249 [multiplicative_offset] "r"(params.kernel.multiplicative_offset), 250 [shift] "r"(params.kernel.shift), 251 [stride] "r"(params.output_stream.stride), 252 [rounding_offset] "r"(params.kernel.rounding_offset) 253 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "cc", 254 "memory"); 255 } 256 257 template <> 258 inline void 259 MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 1, 4, Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessed,RowMajor> & params,uint8_t * result)260 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, 261 const FusedKernelParams<QuantizedStaticPreprocessed, 262 RowMajor>& params, 263 uint8_t* result) { 264 #ifdef DEBUG 265 #ifdef DEBUG_METAGEMM_VERBOSE 266 std::cout << __FILE__ << "(" << __LINE__ 267 << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, " 268 "QuantizedStaticPreprocessed, RowMajor, 1, 4, 8>::Multiply()" 269 << std::endl 270 << std::flush; 271 #endif 272 #endif 273 asm volatile( 274 "prfm pldl1keep, [%x[lhs]]\n" 275 "prfm pldl1keep, [%x[rhs]]\n" 276 277 // Clear aggregators. 278 "movi v0.4s, #0\n" 279 "movi v1.4s, #0\n" 280 "movi v2.4s, #0\n" 281 "mov v3.16b, v0.16b\n" 282 283 // General NxM lanes loop. 284 "1:" 285 286 // Subtract counter. 287 "subs %x[count], %x[count], #8\n" 288 289 "ld1 {v4.2s}, [%x[lhs]], #8\n" 290 "ld1 {v5.2s, v6.2s, v7.2s, v8.2s}, [%x[rhs]], #32\n" 291 "prfm pldl1keep, [%x[lhs], #64]\n" 292 "prfm pldl1keep, [%x[rhs], #64]\n" 293 "umull v9.8h, v5.8b, v4.8b\n" 294 "umull v10.8h, v6.8b, v4.8b\n" 295 "umull v11.8h, v7.8b, v4.8b\n" 296 "umull v12.8h, v8.8b, v4.8b\n" 297 "uadalp v0.4s, v9.8h\n" 298 "uadalp v1.4s, v10.8h\n" 299 "uadalp v2.4s, v11.8h\n" 300 "uadalp v3.4s, v12.8h\n" 301 302 // Loop break. 303 "bgt 1b\n" 304 305 // StaticQuantization::Prepare 306 "ld1 {v4.4s}, [%x[lhs]], #16\n" 307 "ld1 {v5.4s}, [%x[rhs]], #16\n" 308 "dup v6.4s, %w[multiplicative_offset]\n" 309 "dup v7.4s, %w[rounding_offset]\n" 310 "dup v8.4s, %w[shift]\n" 311 "dup v4.4s, v4.s[0]\n" 312 313 // RowMajorOutput::Prepare 314 315 // Reduce aggregators. 316 "addp v0.4s, v0.4s, v1.4s\n" 317 "addp v2.4s, v2.4s, v3.4s\n" 318 "addp v0.4s, v0.4s, v2.4s\n" 319 320 // StaticQuantization::Transform 321 "add v0.4s, v0.4s, v4.4s\n" 322 "add v0.4s, v0.4s, v5.4s\n" 323 "mul v0.4s, v0.4s, v6.4s\n" 324 "add v0.4s, v0.4s, v7.4s\n" 325 "sshl v0.4s, v0.4s, v8.4s\n" 326 "sqxtn v0.4h, v0.4s\n" 327 "sqxtun v0.8b, v0.8h\n" 328 329 // RowMajorOutput::Output 330 "st1 {v0.s}[0], [%x[result]], #4\n" 331 : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) 332 : [count] "r"(params.kernel.count), 333 [multiplicative_offset] "r"(params.kernel.multiplicative_offset), 334 [shift] "r"(params.kernel.shift), 335 [stride] "r"(params.output_stream.stride), 336 [rounding_offset] "r"(params.kernel.rounding_offset) 337 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", 338 "v11", "v12", "cc", "memory"); 339 } 340 341 template <> 342 inline void 343 MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 1, 5, Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessed,RowMajor> & params,uint8_t * result)344 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, 345 const FusedKernelParams<QuantizedStaticPreprocessed, 346 RowMajor>& params, 347 uint8_t* result) { 348 #ifdef DEBUG 349 #ifdef DEBUG_METAGEMM_VERBOSE 350 std::cout << __FILE__ << "(" << __LINE__ 351 << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, " 352 "QuantizedStaticPreprocessed, RowMajor, 1, 5, 8>::Multiply()" 353 << std::endl 354 << std::flush; 355 #endif 356 #endif 357 asm volatile( 358 "prfm pldl1keep, [%x[lhs]]\n" 359 "prfm pldl1keep, [%x[rhs]]\n" 360 361 // Clear aggregators. 362 "movi v0.4s, #0\n" 363 "movi v1.4s, #0\n" 364 "movi v2.4s, #0\n" 365 "mov v3.16b, v0.16b\n" 366 "mov v4.16b, v1.16b\n" 367 368 // General 1xM lanes loop. 369 "1:" 370 371 // Subtract counter. 372 "subs %x[count], %x[count], #8\n" 373 374 "ld1 {v5.2s, v6.2s, v7.2s, v8.2s}, [%x[rhs]], #32\n" 375 "ld1 {v9.2s}, [%x[lhs]], #8\n" 376 "prfm pldl1keep, [%x[lhs], #64]\n" 377 "umull v10.8h, v5.8b, v9.8b\n" 378 "umull v11.8h, v6.8b, v9.8b\n" 379 "umull v12.8h, v7.8b, v9.8b\n" 380 "umull v13.8h, v8.8b, v9.8b\n" 381 "ld1 {v5.2s}, [%x[rhs]], #8\n" 382 "prfm pldl1keep, [%x[rhs], #128]\n" 383 "uadalp v0.4s, v10.8h\n" 384 "uadalp v1.4s, v11.8h\n" 385 "uadalp v2.4s, v12.8h\n" 386 "uadalp v3.4s, v13.8h\n" 387 "umull v10.8h, v5.8b, v9.8b\n" 388 "uadalp v4.4s, v10.8h\n" 389 390 // Loop break. 391 "bgt 1b\n" 392 393 // StaticQuantization::Prepare 394 "ld1 {v5.4s}, [%x[lhs]], #16\n" 395 "ld1 {v6.4s, v7.4s}, [%x[rhs]], #32\n" 396 "dup v8.4s, %w[multiplicative_offset]\n" 397 "dup v9.4s, %w[rounding_offset]\n" 398 "dup v10.4s, %w[shift]\n" 399 "dup v5.4s, v5.s[0]\n" 400 401 // RowMajorOutput::Prepare 402 403 // Reduce aggregators. 404 "addp v0.4s, v0.4s, v1.4s\n" 405 "addp v2.4s, v2.4s, v3.4s\n" 406 "addp v4.4s, v4.4s, v4.4s\n" 407 "addp v0.4s, v0.4s, v2.4s\n" 408 "addp v1.4s, v4.4s, v4.4s\n" 409 410 // StaticQuantization::Transform 411 "add v0.4s, v0.4s, v5.4s\n" 412 "add v1.4s, v1.4s, v5.4s\n" 413 "add v0.4s, v0.4s, v6.4s\n" 414 "add v1.4s, v1.4s, v7.4s\n" 415 "mul v0.4s, v0.4s, v8.4s\n" 416 "mul v1.4s, v1.4s, v8.4s\n" 417 "add v0.4s, v0.4s, v9.4s\n" 418 "add v1.4s, v1.4s, v9.4s\n" 419 "sshl v0.4s, v0.4s, v10.4s\n" 420 "sshl v1.4s, v1.4s, v10.4s\n" 421 "sqxtn v0.4h, v0.4s\n" 422 "sqxtn2 v0.8h, v1.4s\n" 423 "sqxtun v0.8b, v0.8h\n" 424 425 // RowMajorOutput::Output 426 "st1 {v0.s}[0], [%x[result]], #4\n" 427 "st1 {v0.b}[4], [%x[result]], #1\n" 428 : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) 429 : [count] "r"(params.kernel.count), 430 [multiplicative_offset] "r"(params.kernel.multiplicative_offset), 431 [shift] "r"(params.kernel.shift), 432 [stride] "r"(params.output_stream.stride), 433 [rounding_offset] "r"(params.kernel.rounding_offset) 434 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", 435 "v11", "v12", "v13", "cc", "memory"); 436 } 437 438 template <> 439 inline void 440 MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 1, 6, Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessed,RowMajor> & params,uint8_t * result)441 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, 442 const FusedKernelParams<QuantizedStaticPreprocessed, 443 RowMajor>& params, 444 uint8_t* result) { 445 #ifdef DEBUG 446 #ifdef DEBUG_METAGEMM_VERBOSE 447 std::cout << __FILE__ << "(" << __LINE__ 448 << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, " 449 "QuantizedStaticPreprocessed, RowMajor, 1, 6, 8>::Multiply()" 450 << std::endl 451 << std::flush; 452 #endif 453 #endif 454 asm volatile( 455 "prfm pldl1keep, [%x[lhs]]\n" 456 "prfm pldl1keep, [%x[rhs]]\n" 457 458 // Clear aggregators. 459 "movi v0.4s, #0\n" 460 "movi v1.4s, #0\n" 461 "movi v2.4s, #0\n" 462 "mov v3.16b, v0.16b\n" 463 "mov v4.16b, v1.16b\n" 464 "mov v5.16b, v2.16b\n" 465 466 // General 1xM lanes loop. 467 "1:" 468 469 // Subtract counter. 470 "subs %x[count], %x[count], #8\n" 471 472 "ld1 {v6.2s, v7.2s, v8.2s, v9.2s}, [%x[rhs]], #32\n" 473 "ld1 {v10.2s}, [%x[lhs]], #8\n" 474 "prfm pldl1keep, [%x[lhs], #64]\n" 475 "umull v11.8h, v6.8b, v10.8b\n" 476 "umull v12.8h, v7.8b, v10.8b\n" 477 "umull v13.8h, v8.8b, v10.8b\n" 478 "umull v14.8h, v9.8b, v10.8b\n" 479 "ld1 {v6.2s, v7.2s}, [%x[rhs]], #16\n" 480 "prfm pldl1keep, [%x[rhs], #128]\n" 481 "uadalp v0.4s, v11.8h\n" 482 "uadalp v1.4s, v12.8h\n" 483 "uadalp v2.4s, v13.8h\n" 484 "uadalp v3.4s, v14.8h\n" 485 "umull v11.8h, v6.8b, v10.8b\n" 486 "umull v12.8h, v7.8b, v10.8b\n" 487 "uadalp v4.4s, v11.8h\n" 488 "uadalp v5.4s, v12.8h\n" 489 490 // Loop break. 491 "bgt 1b\n" 492 493 // StaticQuantization::Prepare 494 "ld1 {v6.4s}, [%x[lhs]], #16\n" 495 "ld1 {v7.4s, v8.4s}, [%x[rhs]], #32\n" 496 "dup v9.4s, %w[multiplicative_offset]\n" 497 "dup v10.4s, %w[rounding_offset]\n" 498 "dup v11.4s, %w[shift]\n" 499 "dup v6.4s, v6.s[0]\n" 500 501 // RowMajorOutput::Prepare 502 503 // Reduce aggregators. 504 "addp v0.4s, v0.4s, v1.4s\n" 505 "addp v2.4s, v2.4s, v3.4s\n" 506 "addp v4.4s, v4.4s, v5.4s\n" 507 "addp v0.4s, v0.4s, v2.4s\n" 508 "addp v1.4s, v4.4s, v4.4s\n" 509 510 // StaticQuantization::Transform 511 "add v0.4s, v0.4s, v6.4s\n" 512 "add v1.4s, v1.4s, v6.4s\n" 513 "add v0.4s, v0.4s, v7.4s\n" 514 "add v1.4s, v1.4s, v8.4s\n" 515 "mul v0.4s, v0.4s, v9.4s\n" 516 "mul v1.4s, v1.4s, v9.4s\n" 517 "add v0.4s, v0.4s, v10.4s\n" 518 "add v1.4s, v1.4s, v10.4s\n" 519 "sshl v0.4s, v0.4s, v11.4s\n" 520 "sshl v1.4s, v1.4s, v11.4s\n" 521 "sqxtn v0.4h, v0.4s\n" 522 "sqxtn2 v0.8h, v1.4s\n" 523 "sqxtun v0.8b, v0.8h\n" 524 525 // RowMajorOutput::Output 526 "st1 {v0.s}[0], [%x[result]], #4\n" 527 "st1 {v0.h}[2], [%x[result]], #2\n" 528 : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) 529 : [count] "r"(params.kernel.count), 530 [multiplicative_offset] "r"(params.kernel.multiplicative_offset), 531 [shift] "r"(params.kernel.shift), 532 [stride] "r"(params.output_stream.stride), 533 [rounding_offset] "r"(params.kernel.rounding_offset) 534 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", 535 "v11", "v12", "v13", "v14", "cc", "memory"); 536 } 537 538 template <> 539 inline void 540 MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 1, 7, Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessed,RowMajor> & params,uint8_t * result)541 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, 542 const FusedKernelParams<QuantizedStaticPreprocessed, 543 RowMajor>& params, 544 uint8_t* result) { 545 #ifdef DEBUG 546 #ifdef DEBUG_METAGEMM_VERBOSE 547 std::cout << __FILE__ << "(" << __LINE__ 548 << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, " 549 "QuantizedStaticPreprocessed, RowMajor, 1, 7, 8>::Multiply()" 550 << std::endl 551 << std::flush; 552 #endif 553 #endif 554 asm volatile( 555 "prfm pldl1keep, [%x[lhs]]\n" 556 "prfm pldl1keep, [%x[rhs]]\n" 557 558 // Clear aggregators. 559 "movi v0.4s, #0\n" 560 "movi v1.4s, #0\n" 561 "movi v2.4s, #0\n" 562 "mov v3.16b, v0.16b\n" 563 "mov v4.16b, v1.16b\n" 564 "mov v5.16b, v2.16b\n" 565 "mov v6.16b, v3.16b\n" 566 567 // General 1xM lanes loop. 568 "1:" 569 570 // Subtract counter. 571 "subs %x[count], %x[count], #8\n" 572 573 "ld1 {v7.2s, v8.2s, v9.2s, v10.2s}, [%x[rhs]], #32\n" 574 "ld1 {v11.2s}, [%x[lhs]], #8\n" 575 "prfm pldl1keep, [%x[lhs], #64]\n" 576 "umull v12.8h, v7.8b, v11.8b\n" 577 "umull v13.8h, v8.8b, v11.8b\n" 578 "umull v14.8h, v9.8b, v11.8b\n" 579 "umull v15.8h, v10.8b, v11.8b\n" 580 "ld1 {v7.2s, v8.2s, v9.2s}, [%x[rhs]], #24\n" 581 "prfm pldl1keep, [%x[rhs], #128]\n" 582 "uadalp v0.4s, v12.8h\n" 583 "uadalp v1.4s, v13.8h\n" 584 "uadalp v2.4s, v14.8h\n" 585 "uadalp v3.4s, v15.8h\n" 586 "umull v12.8h, v7.8b, v11.8b\n" 587 "umull v13.8h, v8.8b, v11.8b\n" 588 "umull v14.8h, v9.8b, v11.8b\n" 589 "uadalp v4.4s, v12.8h\n" 590 "uadalp v5.4s, v13.8h\n" 591 "uadalp v6.4s, v14.8h\n" 592 593 // Loop break. 594 "bgt 1b\n" 595 596 // StaticQuantization::Prepare 597 "ld1 {v7.4s}, [%x[lhs]], #16\n" 598 "ld1 {v8.4s, v9.4s}, [%x[rhs]], #32\n" 599 "dup v10.4s, %w[multiplicative_offset]\n" 600 "dup v11.4s, %w[rounding_offset]\n" 601 "dup v12.4s, %w[shift]\n" 602 "dup v7.4s, v7.s[0]\n" 603 604 // RowMajorOutput::Prepare 605 606 // Reduce aggregators. 607 "addp v0.4s, v0.4s, v1.4s\n" 608 "addp v2.4s, v2.4s, v3.4s\n" 609 "addp v4.4s, v4.4s, v5.4s\n" 610 "addp v6.4s, v6.4s, v6.4s\n" 611 "addp v0.4s, v0.4s, v2.4s\n" 612 "addp v1.4s, v4.4s, v6.4s\n" 613 614 // StaticQuantization::Transform 615 "add v0.4s, v0.4s, v7.4s\n" 616 "add v1.4s, v1.4s, v7.4s\n" 617 "add v0.4s, v0.4s, v8.4s\n" 618 "add v1.4s, v1.4s, v9.4s\n" 619 "mul v0.4s, v0.4s, v10.4s\n" 620 "mul v1.4s, v1.4s, v10.4s\n" 621 "add v0.4s, v0.4s, v11.4s\n" 622 "add v1.4s, v1.4s, v11.4s\n" 623 "sshl v0.4s, v0.4s, v12.4s\n" 624 "sshl v1.4s, v1.4s, v12.4s\n" 625 "sqxtn v0.4h, v0.4s\n" 626 "sqxtn2 v0.8h, v1.4s\n" 627 "sqxtun v0.8b, v0.8h\n" 628 629 // RowMajorOutput::Output 630 "st1 {v0.s}[0], [%x[result]], #4\n" 631 "st1 {v0.h}[2], [%x[result]], #2\n" 632 "st1 {v0.b}[6], [%x[result]], #1\n" 633 : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) 634 : [count] "r"(params.kernel.count), 635 [multiplicative_offset] "r"(params.kernel.multiplicative_offset), 636 [shift] "r"(params.kernel.shift), 637 [stride] "r"(params.output_stream.stride), 638 [rounding_offset] "r"(params.kernel.rounding_offset) 639 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", 640 "v11", "v12", "v13", "v14", "v15", "cc", "memory"); 641 } 642 643 template <> 644 inline void 645 MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 1, 8, Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessed,RowMajor> & params,uint8_t * result)646 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, 647 const FusedKernelParams<QuantizedStaticPreprocessed, 648 RowMajor>& params, 649 uint8_t* result) { 650 #ifdef DEBUG 651 #ifdef DEBUG_METAGEMM_VERBOSE 652 std::cout << __FILE__ << "(" << __LINE__ 653 << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, " 654 "QuantizedStaticPreprocessed, RowMajor, 1, 8, 8>::Multiply()" 655 << std::endl 656 << std::flush; 657 #endif 658 #endif 659 asm volatile( 660 "prfm pldl1keep, [%x[lhs]]\n" 661 "prfm pldl1keep, [%x[rhs]]\n" 662 663 // Clear aggregators. 664 "movi v0.4s, #0\n" 665 "movi v1.4s, #0\n" 666 "movi v2.4s, #0\n" 667 "mov v3.16b, v0.16b\n" 668 "mov v4.16b, v1.16b\n" 669 "mov v5.16b, v2.16b\n" 670 "mov v6.16b, v3.16b\n" 671 "mov v7.16b, v4.16b\n" 672 673 // 1x8 lanes loop. 674 "1:" 675 676 "ld1 {v9.2s, v10.2s, v11.2s, v12.2s}, [%x[rhs]], #32\n" 677 "ld1 {v8.2s}, [%x[lhs]], #8\n" 678 "umull v13.8h, v8.8b, v9.8b\n" 679 "umull v14.8h, v8.8b, v10.8b\n" 680 "umull v15.8h, v8.8b, v11.8b\n" 681 "umull v16.8h, v8.8b, v12.8b\n" 682 "ld1 {v9.2s, v10.2s, v11.2s, v12.2s}, [%x[rhs]], #32\n" 683 "uadalp v0.4s, v13.8h\n" 684 "uadalp v1.4s, v14.8h\n" 685 "uadalp v2.4s, v15.8h\n" 686 "uadalp v3.4s, v16.8h\n" 687 "prfm pldl1keep, [%x[rhs], #256]\n" 688 "umull v17.8h, v8.8b, v9.8b\n" 689 "umull v13.8h, v8.8b, v10.8b\n" 690 "umull v14.8h, v8.8b, v11.8b\n" 691 "umull v15.8h, v8.8b, v12.8b\n" 692 "prfm pldl1keep, [%x[lhs], #32]\n" 693 694 // Subtract counter. 695 "subs %x[count], %x[count], #8\n" 696 697 "uadalp v4.4s, v17.8h\n" 698 "uadalp v5.4s, v13.8h\n" 699 "uadalp v6.4s, v14.8h\n" 700 "uadalp v7.4s, v15.8h\n" 701 702 // Loop break. 703 "bgt 1b\n" 704 705 // StaticQuantization::Prepare 706 "ld1 {v8.4s}, [%x[lhs]], #16\n" 707 "ld1 {v9.4s, v10.4s}, [%x[rhs]], #32\n" 708 "dup v11.4s, %w[multiplicative_offset]\n" 709 "dup v12.4s, %w[rounding_offset]\n" 710 "dup v13.4s, %w[shift]\n" 711 "dup v8.4s, v8.s[0]\n" 712 713 // RowMajorOutput::Prepare 714 715 // Reduce aggregators. 716 "addp v0.4s, v0.4s, v1.4s\n" 717 "addp v2.4s, v2.4s, v3.4s\n" 718 "addp v4.4s, v4.4s, v5.4s\n" 719 "addp v6.4s, v6.4s, v7.4s\n" 720 "addp v0.4s, v0.4s, v2.4s\n" 721 "addp v1.4s, v4.4s, v6.4s\n" 722 723 // StaticQuantization::Transform 724 "add v0.4s, v0.4s, v8.4s\n" 725 "add v1.4s, v1.4s, v8.4s\n" 726 "add v0.4s, v0.4s, v9.4s\n" 727 "add v1.4s, v1.4s, v10.4s\n" 728 "mul v0.4s, v0.4s, v11.4s\n" 729 "mul v1.4s, v1.4s, v11.4s\n" 730 "add v0.4s, v0.4s, v12.4s\n" 731 "add v1.4s, v1.4s, v12.4s\n" 732 "sshl v0.4s, v0.4s, v13.4s\n" 733 "sshl v1.4s, v1.4s, v13.4s\n" 734 "sqxtn v0.4h, v0.4s\n" 735 "sqxtn2 v0.8h, v1.4s\n" 736 "sqxtun v0.8b, v0.8h\n" 737 738 // RowMajorOutput::Output 739 "st1 {v0.2s}, [%x[result]], #8\n" 740 : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) 741 : [count] "r"(params.kernel.count), 742 [multiplicative_offset] "r"(params.kernel.multiplicative_offset), 743 [shift] "r"(params.kernel.shift), 744 [stride] "r"(params.output_stream.stride), 745 [rounding_offset] "r"(params.kernel.rounding_offset) 746 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", 747 "v11", "v12", "v13", "v14", "v15", "v16", "v17", "cc", "memory"); 748 } 749 750 template <> 751 inline void 752 MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 2, 1, Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessed,RowMajor> & params,uint8_t * result)753 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, 754 const FusedKernelParams<QuantizedStaticPreprocessed, 755 RowMajor>& params, 756 uint8_t* result) { 757 #ifdef DEBUG 758 #ifdef DEBUG_METAGEMM_VERBOSE 759 std::cout << __FILE__ << "(" << __LINE__ 760 << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, " 761 "QuantizedStaticPreprocessed, RowMajor, 2, 1, 8>::Multiply()" 762 << std::endl 763 << std::flush; 764 #endif 765 #endif 766 asm volatile( 767 "prfm pldl1keep, [%x[lhs]]\n" 768 "prfm pldl1keep, [%x[rhs]]\n" 769 770 // Clear aggregators. 771 "movi v0.4s, #0\n" 772 "movi v1.4s, #0\n" 773 774 // General NxM lanes loop. 775 "1:" 776 777 // Subtract counter. 778 "subs %x[count], %x[count], #8\n" 779 780 "ld1 {v2.2s, v3.2s}, [%x[lhs]], #16\n" 781 "ld1 {v4.2s}, [%x[rhs]], #8\n" 782 "prfm pldl1keep, [%x[lhs], #64]\n" 783 "prfm pldl1keep, [%x[rhs], #64]\n" 784 "umull v5.8h, v4.8b, v2.8b\n" 785 "umull v6.8h, v4.8b, v3.8b\n" 786 "uadalp v0.4s, v5.8h\n" 787 "uadalp v1.4s, v6.8h\n" 788 789 // Loop break. 790 "bgt 1b\n" 791 792 // StaticQuantization::Prepare 793 "ld1 {v4.4s}, [%x[lhs]], #16\n" 794 "ld1 {v5.4s}, [%x[rhs]], #16\n" 795 "dup v6.4s, %w[multiplicative_offset]\n" 796 "dup v7.4s, %w[rounding_offset]\n" 797 "dup v8.4s, %w[shift]\n" 798 "dup v2.4s, v4.s[0]\n" 799 "dup v4.4s, v4.s[1]\n" 800 801 // RowMajorOutput::Prepare 802 "add x0, %x[result], %x[stride]\n" 803 804 // Reduce aggregators. 805 "addp v0.4s, v0.4s, v0.4s\n" 806 "addp v0.4s, v0.4s, v0.4s\n" 807 "addp v1.4s, v1.4s, v1.4s\n" 808 "addp v1.4s, v1.4s, v1.4s\n" 809 810 // StaticQuantization::Transform 811 "add v0.4s, v0.4s, v2.4s\n" 812 "add v1.4s, v1.4s, v4.4s\n" 813 "add v0.4s, v0.4s, v5.4s\n" 814 "add v1.4s, v1.4s, v5.4s\n" 815 "mul v0.4s, v0.4s, v6.4s\n" 816 "mul v1.4s, v1.4s, v6.4s\n" 817 "add v0.4s, v0.4s, v7.4s\n" 818 "add v1.4s, v1.4s, v7.4s\n" 819 "sshl v0.4s, v0.4s, v8.4s\n" 820 "sshl v1.4s, v1.4s, v8.4s\n" 821 "sqxtn v0.4h, v0.4s\n" 822 "sqxtn v1.4h, v1.4s\n" 823 "sqxtun v0.8b, v0.8h\n" 824 "sqxtun v1.8b, v1.8h\n" 825 826 // RowMajorOutput::Output 827 "st1 {v0.b}[0], [%x[result]], #1\n" 828 "st1 {v1.b}[0], [x0], #1\n" 829 : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) 830 : [count] "r"(params.kernel.count), 831 [multiplicative_offset] "r"(params.kernel.multiplicative_offset), 832 [shift] "r"(params.kernel.shift), 833 [stride] "r"(params.output_stream.stride), 834 [rounding_offset] "r"(params.kernel.rounding_offset) 835 : "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "cc", 836 "memory"); 837 } 838 839 template <> 840 inline void 841 MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 2, 2, Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessed,RowMajor> & params,uint8_t * result)842 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, 843 const FusedKernelParams<QuantizedStaticPreprocessed, 844 RowMajor>& params, 845 uint8_t* result) { 846 #ifdef DEBUG 847 #ifdef DEBUG_METAGEMM_VERBOSE 848 std::cout << __FILE__ << "(" << __LINE__ 849 << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, " 850 "QuantizedStaticPreprocessed, RowMajor, 2, 2, 8>::Multiply()" 851 << std::endl 852 << std::flush; 853 #endif 854 #endif 855 asm volatile( 856 "prfm pldl1keep, [%x[lhs]]\n" 857 "prfm pldl1keep, [%x[rhs]]\n" 858 859 // Clear aggregators. 860 "movi v0.4s, #0\n" 861 "movi v1.4s, #0\n" 862 "movi v2.4s, #0\n" 863 "mov v3.16b, v0.16b\n" 864 865 // General NxM lanes loop. 866 "1:" 867 868 // Subtract counter. 869 "subs %x[count], %x[count], #8\n" 870 871 "ld1 {v4.2s, v5.2s}, [%x[lhs]], #16\n" 872 "ld1 {v6.2s, v7.2s}, [%x[rhs]], #16\n" 873 "prfm pldl1keep, [%x[lhs], #64]\n" 874 "prfm pldl1keep, [%x[rhs], #64]\n" 875 "umull v8.8h, v6.8b, v4.8b\n" 876 "umull v9.8h, v7.8b, v4.8b\n" 877 "umull v10.8h, v6.8b, v5.8b\n" 878 "umull v11.8h, v7.8b, v5.8b\n" 879 "uadalp v0.4s, v8.8h\n" 880 "uadalp v1.4s, v9.8h\n" 881 "uadalp v2.4s, v10.8h\n" 882 "uadalp v3.4s, v11.8h\n" 883 884 // Loop break. 885 "bgt 1b\n" 886 887 // StaticQuantization::Prepare 888 "ld1 {v4.4s}, [%x[lhs]], #16\n" 889 "ld1 {v5.4s}, [%x[rhs]], #16\n" 890 "dup v6.4s, %w[multiplicative_offset]\n" 891 "dup v7.4s, %w[rounding_offset]\n" 892 "dup v8.4s, %w[shift]\n" 893 "dup v9.4s, v4.s[0]\n" 894 "dup v4.4s, v4.s[1]\n" 895 896 // RowMajorOutput::Prepare 897 "add x0, %x[result], %x[stride]\n" 898 899 // Reduce aggregators. 900 "addp v0.4s, v0.4s, v1.4s\n" 901 "addp v0.4s, v0.4s, v0.4s\n" 902 "addp v2.4s, v2.4s, v3.4s\n" 903 "addp v2.4s, v2.4s, v2.4s\n" 904 905 // StaticQuantization::Transform 906 "add v0.4s, v0.4s, v9.4s\n" 907 "add v2.4s, v2.4s, v4.4s\n" 908 "add v0.4s, v0.4s, v5.4s\n" 909 "add v2.4s, v2.4s, v5.4s\n" 910 "mul v0.4s, v0.4s, v6.4s\n" 911 "mul v2.4s, v2.4s, v6.4s\n" 912 "add v0.4s, v0.4s, v7.4s\n" 913 "add v2.4s, v2.4s, v7.4s\n" 914 "sshl v0.4s, v0.4s, v8.4s\n" 915 "sshl v2.4s, v2.4s, v8.4s\n" 916 "sqxtn v0.4h, v0.4s\n" 917 "sqxtn v2.4h, v2.4s\n" 918 "sqxtun v0.8b, v0.8h\n" 919 "sqxtun v2.8b, v2.8h\n" 920 921 // RowMajorOutput::Output 922 "st1 {v0.h}[0], [%x[result]], #2\n" 923 "st1 {v2.h}[0], [x0], #2\n" 924 : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) 925 : [count] "r"(params.kernel.count), 926 [multiplicative_offset] "r"(params.kernel.multiplicative_offset), 927 [shift] "r"(params.kernel.shift), 928 [stride] "r"(params.output_stream.stride), 929 [rounding_offset] "r"(params.kernel.rounding_offset) 930 : "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", 931 "v11", "cc", "memory"); 932 } 933 934 template <> 935 inline void 936 MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 2, 3, Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessed,RowMajor> & params,uint8_t * result)937 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, 938 const FusedKernelParams<QuantizedStaticPreprocessed, 939 RowMajor>& params, 940 uint8_t* result) { 941 #ifdef DEBUG 942 #ifdef DEBUG_METAGEMM_VERBOSE 943 std::cout << __FILE__ << "(" << __LINE__ 944 << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, " 945 "QuantizedStaticPreprocessed, RowMajor, 2, 3, 8>::Multiply()" 946 << std::endl 947 << std::flush; 948 #endif 949 #endif 950 asm volatile( 951 "prfm pldl1keep, [%x[lhs]]\n" 952 "prfm pldl1keep, [%x[rhs]]\n" 953 954 // Clear aggregators. 955 "movi v0.4s, #0\n" 956 "movi v1.4s, #0\n" 957 "movi v2.4s, #0\n" 958 "mov v3.16b, v0.16b\n" 959 "mov v4.16b, v1.16b\n" 960 "mov v5.16b, v2.16b\n" 961 962 // General NxM lanes loop. 963 "1:" 964 965 // Subtract counter. 966 "subs %x[count], %x[count], #8\n" 967 968 "ld1 {v6.2s, v7.2s}, [%x[lhs]], #16\n" 969 "ld1 {v8.2s, v9.2s, v10.2s}, [%x[rhs]], #24\n" 970 "prfm pldl1keep, [%x[lhs], #64]\n" 971 "prfm pldl1keep, [%x[rhs], #64]\n" 972 "umull v11.8h, v8.8b, v6.8b\n" 973 "umull v12.8h, v9.8b, v6.8b\n" 974 "umull v13.8h, v10.8b, v6.8b\n" 975 "umull v14.8h, v8.8b, v7.8b\n" 976 "umull v15.8h, v9.8b, v7.8b\n" 977 "umull v16.8h, v10.8b, v7.8b\n" 978 "uadalp v0.4s, v11.8h\n" 979 "uadalp v1.4s, v12.8h\n" 980 "uadalp v2.4s, v13.8h\n" 981 "uadalp v3.4s, v14.8h\n" 982 "uadalp v4.4s, v15.8h\n" 983 "uadalp v5.4s, v16.8h\n" 984 985 // Loop break. 986 "bgt 1b\n" 987 988 // StaticQuantization::Prepare 989 "ld1 {v6.4s}, [%x[lhs]], #16\n" 990 "ld1 {v7.4s}, [%x[rhs]], #16\n" 991 "dup v8.4s, %w[multiplicative_offset]\n" 992 "dup v9.4s, %w[rounding_offset]\n" 993 "dup v10.4s, %w[shift]\n" 994 "dup v11.4s, v6.s[0]\n" 995 "dup v6.4s, v6.s[1]\n" 996 997 // RowMajorOutput::Prepare 998 "add x0, %x[result], %x[stride]\n" 999 1000 // Reduce aggregators. 1001 "addp v0.4s, v0.4s, v1.4s\n" 1002 "addp v2.4s, v2.4s, v2.4s\n" 1003 "addp v0.4s, v0.4s, v2.4s\n" 1004 "addp v3.4s, v3.4s, v4.4s\n" 1005 "addp v5.4s, v5.4s, v5.4s\n" 1006 "addp v3.4s, v3.4s, v5.4s\n" 1007 1008 // StaticQuantization::Transform 1009 "add v0.4s, v0.4s, v11.4s\n" 1010 "add v3.4s, v3.4s, v6.4s\n" 1011 "add v0.4s, v0.4s, v7.4s\n" 1012 "add v3.4s, v3.4s, v7.4s\n" 1013 "mul v0.4s, v0.4s, v8.4s\n" 1014 "mul v3.4s, v3.4s, v8.4s\n" 1015 "add v0.4s, v0.4s, v9.4s\n" 1016 "add v3.4s, v3.4s, v9.4s\n" 1017 "sshl v0.4s, v0.4s, v10.4s\n" 1018 "sshl v3.4s, v3.4s, v10.4s\n" 1019 "sqxtn v0.4h, v0.4s\n" 1020 "sqxtn v3.4h, v3.4s\n" 1021 "sqxtun v0.8b, v0.8h\n" 1022 "sqxtun v3.8b, v3.8h\n" 1023 1024 // RowMajorOutput::Output 1025 "st1 {v0.h}[0], [%x[result]], #2\n" 1026 "st1 {v0.b}[2], [%x[result]], #1\n" 1027 "st1 {v3.h}[0], [x0], #2\n" 1028 "st1 {v3.b}[2], [x0], #1\n" 1029 : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) 1030 : [count] "r"(params.kernel.count), 1031 [multiplicative_offset] "r"(params.kernel.multiplicative_offset), 1032 [shift] "r"(params.kernel.shift), 1033 [stride] "r"(params.output_stream.stride), 1034 [rounding_offset] "r"(params.kernel.rounding_offset) 1035 : "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", 1036 "v11", "v12", "v13", "v14", "v15", "v16", "cc", "memory"); 1037 } 1038 1039 template <> 1040 inline void 1041 MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 2, 4, Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessed,RowMajor> & params,uint8_t * result)1042 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, 1043 const FusedKernelParams<QuantizedStaticPreprocessed, 1044 RowMajor>& params, 1045 uint8_t* result) { 1046 #ifdef DEBUG 1047 #ifdef DEBUG_METAGEMM_VERBOSE 1048 std::cout << __FILE__ << "(" << __LINE__ 1049 << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, " 1050 "QuantizedStaticPreprocessed, RowMajor, 2, 4, 8>::Multiply()" 1051 << std::endl 1052 << std::flush; 1053 #endif 1054 #endif 1055 asm volatile( 1056 "prfm pldl1keep, [%x[lhs]]\n" 1057 "prfm pldl1keep, [%x[rhs]]\n" 1058 1059 // Clear aggregators. 1060 "movi v0.4s, #0\n" 1061 "movi v1.4s, #0\n" 1062 "movi v2.4s, #0\n" 1063 "mov v3.16b, v0.16b\n" 1064 "mov v4.16b, v1.16b\n" 1065 "mov v5.16b, v2.16b\n" 1066 "mov v6.16b, v3.16b\n" 1067 "mov v7.16b, v4.16b\n" 1068 1069 // 2x4 lanes loop. 1070 "1:" 1071 1072 "ld1 {v10.8b, v11.8b, v12.8b, v13.8b}, [%x[rhs]], #32\n" 1073 "ld1 {v8.8b}, [%x[lhs]], #8\n" 1074 "umull v14.8h, v8.8b, v10.8b\n" 1075 "ld1 {v9.8b}, [%x[lhs]], #8\n" 1076 "umull v15.8h, v8.8b, v11.8b\n" 1077 "prfm pldl1keep, [%x[rhs], #64]\n" 1078 "umull v16.8h, v8.8b, v12.8b\n" 1079 "prfm pldl1keep, [%x[lhs], #64]\n" 1080 "umull v17.8h, v8.8b, v13.8b\n" 1081 "umull v18.8h, v9.8b, v10.8b\n" 1082 "uadalp v0.4s, v14.8h\n" 1083 "uadalp v1.4s, v15.8h\n" 1084 "uadalp v2.4s, v16.8h\n" 1085 "umull v14.8h, v9.8b, v11.8b\n" 1086 "umull v15.8h, v9.8b, v12.8b\n" 1087 "umull v16.8h, v9.8b, v13.8b\n" 1088 1089 // Subtract counter. 1090 "subs %x[count], %x[count], #8\n" 1091 1092 "uadalp v3.4s, v17.8h\n" 1093 "uadalp v4.4s, v18.8h\n" 1094 "uadalp v5.4s, v14.8h\n" 1095 "uadalp v6.4s, v15.8h\n" 1096 "uadalp v7.4s, v16.8h\n" 1097 1098 // Loop break. 1099 "bgt 1b\n" 1100 1101 // StaticQuantization::Prepare 1102 "ld1 {v8.4s}, [%x[lhs]], #16\n" 1103 "ld1 {v9.4s}, [%x[rhs]], #16\n" 1104 "dup v10.4s, %w[multiplicative_offset]\n" 1105 "dup v11.4s, %w[rounding_offset]\n" 1106 "dup v12.4s, %w[shift]\n" 1107 "dup v13.4s, v8.s[0]\n" 1108 "dup v8.4s, v8.s[1]\n" 1109 1110 // RowMajorOutput::Prepare 1111 "add x0, %x[result], %x[stride]\n" 1112 1113 // Reduce aggregators. 1114 "addp v0.4s, v0.4s, v1.4s\n" 1115 "addp v2.4s, v2.4s, v3.4s\n" 1116 "addp v0.4s, v0.4s, v2.4s\n" 1117 "addp v4.4s, v4.4s, v5.4s\n" 1118 "addp v6.4s, v6.4s, v7.4s\n" 1119 "addp v4.4s, v4.4s, v6.4s\n" 1120 1121 // StaticQuantization::Transform 1122 "add v0.4s, v0.4s, v13.4s\n" 1123 "add v4.4s, v4.4s, v8.4s\n" 1124 "add v0.4s, v0.4s, v9.4s\n" 1125 "add v4.4s, v4.4s, v9.4s\n" 1126 "mul v0.4s, v0.4s, v10.4s\n" 1127 "mul v4.4s, v4.4s, v10.4s\n" 1128 "add v0.4s, v0.4s, v11.4s\n" 1129 "add v4.4s, v4.4s, v11.4s\n" 1130 "sshl v0.4s, v0.4s, v12.4s\n" 1131 "sshl v4.4s, v4.4s, v12.4s\n" 1132 "sqxtn v0.4h, v0.4s\n" 1133 "sqxtn v4.4h, v4.4s\n" 1134 "sqxtun v0.8b, v0.8h\n" 1135 "sqxtun v4.8b, v4.8h\n" 1136 1137 // RowMajorOutput::Output 1138 "st1 {v0.s}[0], [%x[result]], #4\n" 1139 "st1 {v4.s}[0], [x0], #4\n" 1140 : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) 1141 : [count] "r"(params.kernel.count), 1142 [multiplicative_offset] "r"(params.kernel.multiplicative_offset), 1143 [shift] "r"(params.kernel.shift), 1144 [stride] "r"(params.output_stream.stride), 1145 [rounding_offset] "r"(params.kernel.rounding_offset) 1146 : "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", 1147 "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "cc", "memory"); 1148 } 1149 1150 template <> 1151 inline void 1152 MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 3, 1, Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessed,RowMajor> & params,uint8_t * result)1153 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, 1154 const FusedKernelParams<QuantizedStaticPreprocessed, 1155 RowMajor>& params, 1156 uint8_t* result) { 1157 #ifdef DEBUG 1158 #ifdef DEBUG_METAGEMM_VERBOSE 1159 std::cout << __FILE__ << "(" << __LINE__ 1160 << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, " 1161 "QuantizedStaticPreprocessed, RowMajor, 3, 1, 8>::Multiply()" 1162 << std::endl 1163 << std::flush; 1164 #endif 1165 #endif 1166 asm volatile( 1167 "prfm pldl1keep, [%x[lhs]]\n" 1168 "prfm pldl1keep, [%x[rhs]]\n" 1169 1170 // Clear aggregators. 1171 "movi v0.4s, #0\n" 1172 "movi v1.4s, #0\n" 1173 "movi v2.4s, #0\n" 1174 1175 // General NxM lanes loop. 1176 "1:" 1177 1178 // Subtract counter. 1179 "subs %x[count], %x[count], #8\n" 1180 1181 "ld1 {v3.2s, v4.2s, v5.2s}, [%x[lhs]], #24\n" 1182 "ld1 {v6.2s}, [%x[rhs]], #8\n" 1183 "prfm pldl1keep, [%x[lhs], #64]\n" 1184 "prfm pldl1keep, [%x[rhs], #64]\n" 1185 "umull v7.8h, v6.8b, v3.8b\n" 1186 "umull v8.8h, v6.8b, v4.8b\n" 1187 "umull v9.8h, v6.8b, v5.8b\n" 1188 "uadalp v0.4s, v7.8h\n" 1189 "uadalp v1.4s, v8.8h\n" 1190 "uadalp v2.4s, v9.8h\n" 1191 1192 // Loop break. 1193 "bgt 1b\n" 1194 1195 // StaticQuantization::Prepare 1196 "ld1 {v4.4s}, [%x[lhs]], #16\n" 1197 "ld1 {v5.4s}, [%x[rhs]], #16\n" 1198 "dup v6.4s, %w[multiplicative_offset]\n" 1199 "dup v7.4s, %w[rounding_offset]\n" 1200 "dup v8.4s, %w[shift]\n" 1201 "dup v3.4s, v4.s[0]\n" 1202 "dup v9.4s, v4.s[1]\n" 1203 "dup v4.4s, v4.s[2]\n" 1204 1205 // RowMajorOutput::Prepare 1206 "add x0, %x[result], %x[stride]\n" 1207 "add x1, x0, %x[stride]\n" 1208 1209 // Reduce aggregators. 1210 "addp v0.4s, v0.4s, v0.4s\n" 1211 "addp v0.4s, v0.4s, v0.4s\n" 1212 "addp v1.4s, v1.4s, v1.4s\n" 1213 "addp v1.4s, v1.4s, v1.4s\n" 1214 "addp v2.4s, v2.4s, v2.4s\n" 1215 "addp v2.4s, v2.4s, v2.4s\n" 1216 1217 // StaticQuantization::Transform 1218 "add v0.4s, v0.4s, v3.4s\n" 1219 "add v1.4s, v1.4s, v9.4s\n" 1220 "add v2.4s, v2.4s, v4.4s\n" 1221 "add v0.4s, v0.4s, v5.4s\n" 1222 "add v1.4s, v1.4s, v5.4s\n" 1223 "add v2.4s, v2.4s, v5.4s\n" 1224 "mul v0.4s, v0.4s, v6.4s\n" 1225 "mul v1.4s, v1.4s, v6.4s\n" 1226 "mul v2.4s, v2.4s, v6.4s\n" 1227 "add v0.4s, v0.4s, v7.4s\n" 1228 "add v1.4s, v1.4s, v7.4s\n" 1229 "add v2.4s, v2.4s, v7.4s\n" 1230 "sshl v0.4s, v0.4s, v8.4s\n" 1231 "sshl v1.4s, v1.4s, v8.4s\n" 1232 "sshl v2.4s, v2.4s, v8.4s\n" 1233 "sqxtn v0.4h, v0.4s\n" 1234 "sqxtn v1.4h, v1.4s\n" 1235 "sqxtn v2.4h, v2.4s\n" 1236 "sqxtun v0.8b, v0.8h\n" 1237 "sqxtun v1.8b, v1.8h\n" 1238 "sqxtun v2.8b, v2.8h\n" 1239 1240 // RowMajorOutput::Output 1241 "st1 {v0.b}[0], [%x[result]], #1\n" 1242 "st1 {v1.b}[0], [x0], #1\n" 1243 "st1 {v2.b}[0], [x1], #1\n" 1244 : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) 1245 : [count] "r"(params.kernel.count), 1246 [multiplicative_offset] "r"(params.kernel.multiplicative_offset), 1247 [shift] "r"(params.kernel.shift), 1248 [stride] "r"(params.output_stream.stride), 1249 [rounding_offset] "r"(params.kernel.rounding_offset) 1250 : "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", 1251 "cc", "memory"); 1252 } 1253 1254 template <> 1255 inline void 1256 MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 3, 2, Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessed,RowMajor> & params,uint8_t * result)1257 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, 1258 const FusedKernelParams<QuantizedStaticPreprocessed, 1259 RowMajor>& params, 1260 uint8_t* result) { 1261 #ifdef DEBUG 1262 #ifdef DEBUG_METAGEMM_VERBOSE 1263 std::cout << __FILE__ << "(" << __LINE__ 1264 << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, " 1265 "QuantizedStaticPreprocessed, RowMajor, 3, 2, 8>::Multiply()" 1266 << std::endl 1267 << std::flush; 1268 #endif 1269 #endif 1270 asm volatile( 1271 "prfm pldl1keep, [%x[lhs]]\n" 1272 "prfm pldl1keep, [%x[rhs]]\n" 1273 1274 // Clear aggregators. 1275 "movi v0.4s, #0\n" 1276 "movi v1.4s, #0\n" 1277 "movi v2.4s, #0\n" 1278 "mov v3.16b, v0.16b\n" 1279 "mov v4.16b, v1.16b\n" 1280 "mov v5.16b, v2.16b\n" 1281 1282 // General NxM lanes loop. 1283 "1:" 1284 1285 // Subtract counter. 1286 "subs %x[count], %x[count], #8\n" 1287 1288 "ld1 {v6.2s, v7.2s, v8.2s}, [%x[lhs]], #24\n" 1289 "ld1 {v9.2s, v10.2s}, [%x[rhs]], #16\n" 1290 "prfm pldl1keep, [%x[lhs], #64]\n" 1291 "prfm pldl1keep, [%x[rhs], #64]\n" 1292 "umull v11.8h, v9.8b, v6.8b\n" 1293 "umull v12.8h, v10.8b, v6.8b\n" 1294 "umull v13.8h, v9.8b, v7.8b\n" 1295 "umull v14.8h, v10.8b, v7.8b\n" 1296 "umull v15.8h, v9.8b, v8.8b\n" 1297 "umull v16.8h, v10.8b, v8.8b\n" 1298 "uadalp v0.4s, v11.8h\n" 1299 "uadalp v1.4s, v12.8h\n" 1300 "uadalp v2.4s, v13.8h\n" 1301 "uadalp v3.4s, v14.8h\n" 1302 "uadalp v4.4s, v15.8h\n" 1303 "uadalp v5.4s, v16.8h\n" 1304 1305 // Loop break. 1306 "bgt 1b\n" 1307 1308 // StaticQuantization::Prepare 1309 "ld1 {v6.4s}, [%x[lhs]], #16\n" 1310 "ld1 {v7.4s}, [%x[rhs]], #16\n" 1311 "dup v8.4s, %w[multiplicative_offset]\n" 1312 "dup v9.4s, %w[rounding_offset]\n" 1313 "dup v10.4s, %w[shift]\n" 1314 "dup v11.4s, v6.s[0]\n" 1315 "dup v12.4s, v6.s[1]\n" 1316 "dup v6.4s, v6.s[2]\n" 1317 1318 // RowMajorOutput::Prepare 1319 "add x0, %x[result], %x[stride]\n" 1320 "add x1, x0, %x[stride]\n" 1321 1322 // Reduce aggregators. 1323 "addp v0.4s, v0.4s, v1.4s\n" 1324 "addp v0.4s, v0.4s, v0.4s\n" 1325 "addp v2.4s, v2.4s, v3.4s\n" 1326 "addp v2.4s, v2.4s, v2.4s\n" 1327 "addp v4.4s, v4.4s, v5.4s\n" 1328 "addp v4.4s, v4.4s, v4.4s\n" 1329 1330 // StaticQuantization::Transform 1331 "add v0.4s, v0.4s, v11.4s\n" 1332 "add v2.4s, v2.4s, v12.4s\n" 1333 "add v4.4s, v4.4s, v6.4s\n" 1334 "add v0.4s, v0.4s, v7.4s\n" 1335 "add v2.4s, v2.4s, v7.4s\n" 1336 "add v4.4s, v4.4s, v7.4s\n" 1337 "mul v0.4s, v0.4s, v8.4s\n" 1338 "mul v2.4s, v2.4s, v8.4s\n" 1339 "mul v4.4s, v4.4s, v8.4s\n" 1340 "add v0.4s, v0.4s, v9.4s\n" 1341 "add v2.4s, v2.4s, v9.4s\n" 1342 "add v4.4s, v4.4s, v9.4s\n" 1343 "sshl v0.4s, v0.4s, v10.4s\n" 1344 "sshl v2.4s, v2.4s, v10.4s\n" 1345 "sshl v4.4s, v4.4s, v10.4s\n" 1346 "sqxtn v0.4h, v0.4s\n" 1347 "sqxtn v2.4h, v2.4s\n" 1348 "sqxtn v4.4h, v4.4s\n" 1349 "sqxtun v0.8b, v0.8h\n" 1350 "sqxtun v2.8b, v2.8h\n" 1351 "sqxtun v4.8b, v4.8h\n" 1352 1353 // RowMajorOutput::Output 1354 "st1 {v0.h}[0], [%x[result]], #2\n" 1355 "st1 {v2.h}[0], [x0], #2\n" 1356 "st1 {v4.h}[0], [x1], #2\n" 1357 : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) 1358 : [count] "r"(params.kernel.count), 1359 [multiplicative_offset] "r"(params.kernel.multiplicative_offset), 1360 [shift] "r"(params.kernel.shift), 1361 [stride] "r"(params.output_stream.stride), 1362 [rounding_offset] "r"(params.kernel.rounding_offset) 1363 : "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", 1364 "v10", "v11", "v12", "v13", "v14", "v15", "v16", "cc", "memory"); 1365 } 1366 1367 template <> 1368 inline void 1369 MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 3, 3, Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessed,RowMajor> & params,uint8_t * result)1370 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, 1371 const FusedKernelParams<QuantizedStaticPreprocessed, 1372 RowMajor>& params, 1373 uint8_t* result) { 1374 #ifdef DEBUG 1375 #ifdef DEBUG_METAGEMM_VERBOSE 1376 std::cout << __FILE__ << "(" << __LINE__ 1377 << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, " 1378 "QuantizedStaticPreprocessed, RowMajor, 3, 3, 8>::Multiply()" 1379 << std::endl 1380 << std::flush; 1381 #endif 1382 #endif 1383 asm volatile( 1384 "prfm pldl1keep, [%x[lhs]]\n" 1385 "prfm pldl1keep, [%x[rhs]]\n" 1386 1387 // Clear aggregators. 1388 "movi v0.4s, #0\n" 1389 "movi v1.4s, #0\n" 1390 "movi v2.4s, #0\n" 1391 "mov v3.16b, v0.16b\n" 1392 "mov v4.16b, v1.16b\n" 1393 "mov v5.16b, v2.16b\n" 1394 "mov v6.16b, v3.16b\n" 1395 "mov v7.16b, v4.16b\n" 1396 "mov v8.16b, v5.16b\n" 1397 1398 // 3x3 lanes loop. 1399 "1:" 1400 1401 "ld1 {v12.8b, v13.8b, v14.8b}, [%x[rhs]], #24\n" 1402 "ld1 {v9.8b}, [%x[lhs]], #8\n" 1403 "umull v15.8h, v9.8b, v12.8b\n" 1404 "ld1 {v10.8b}, [%x[lhs]], #8\n" 1405 "umull v16.8h, v9.8b, v13.8b\n" 1406 "ld1 {v11.8b}, [%x[lhs]], #8\n" 1407 "umull v17.8h, v9.8b, v14.8b\n" 1408 "prfm pldl1keep, [%x[lhs], #64]\n" 1409 "umull v18.8h, v10.8b, v12.8b\n" 1410 "prfm pldl1keep, [%x[rhs], #64]\n" 1411 "uadalp v0.4s, v15.8h\n" 1412 "uadalp v1.4s, v16.8h\n" 1413 "uadalp v2.4s, v17.8h\n" 1414 "uadalp v3.4s, v18.8h\n" 1415 "umull v15.8h, v10.8b, v13.8b\n" 1416 "umull v16.8h, v10.8b, v14.8b\n" 1417 "umull v17.8h, v11.8b, v12.8b\n" 1418 "umull v18.8h, v11.8b, v13.8b\n" 1419 1420 // Subtract counter. 1421 "subs %x[count], %x[count], #8\n" 1422 1423 "umull v9.8h, v11.8b, v14.8b\n" 1424 "uadalp v4.4s, v15.8h\n" 1425 "uadalp v5.4s, v16.8h\n" 1426 "uadalp v6.4s, v17.8h\n" 1427 "uadalp v7.4s, v18.8h\n" 1428 "uadalp v8.4s, v9.8h\n" 1429 1430 // Loop break. 1431 "bgt 1b\n" 1432 1433 // StaticQuantization::Prepare 1434 "ld1 {v9.4s}, [%x[lhs]], #16\n" 1435 "ld1 {v10.4s}, [%x[rhs]], #16\n" 1436 "dup v11.4s, %w[multiplicative_offset]\n" 1437 "dup v12.4s, %w[rounding_offset]\n" 1438 "dup v13.4s, %w[shift]\n" 1439 "dup v14.4s, v9.s[0]\n" 1440 "dup v15.4s, v9.s[1]\n" 1441 "dup v9.4s, v9.s[2]\n" 1442 1443 // RowMajorOutput::Prepare 1444 "add x0, %x[result], %x[stride]\n" 1445 "add x1, x0, %x[stride]\n" 1446 1447 // Reduce aggregators. 1448 "addp v0.4s, v0.4s, v1.4s\n" 1449 "addp v2.4s, v2.4s, v2.4s\n" 1450 "addp v0.4s, v0.4s, v2.4s\n" 1451 "addp v3.4s, v3.4s, v4.4s\n" 1452 "addp v5.4s, v5.4s, v5.4s\n" 1453 "addp v3.4s, v3.4s, v5.4s\n" 1454 "addp v6.4s, v6.4s, v7.4s\n" 1455 "addp v8.4s, v8.4s, v8.4s\n" 1456 "addp v6.4s, v6.4s, v8.4s\n" 1457 1458 // StaticQuantization::Transform 1459 "add v0.4s, v0.4s, v14.4s\n" 1460 "add v3.4s, v3.4s, v15.4s\n" 1461 "add v6.4s, v6.4s, v9.4s\n" 1462 "add v0.4s, v0.4s, v10.4s\n" 1463 "add v3.4s, v3.4s, v10.4s\n" 1464 "add v6.4s, v6.4s, v10.4s\n" 1465 "mul v0.4s, v0.4s, v11.4s\n" 1466 "mul v3.4s, v3.4s, v11.4s\n" 1467 "mul v6.4s, v6.4s, v11.4s\n" 1468 "add v0.4s, v0.4s, v12.4s\n" 1469 "add v3.4s, v3.4s, v12.4s\n" 1470 "add v6.4s, v6.4s, v12.4s\n" 1471 "sshl v0.4s, v0.4s, v13.4s\n" 1472 "sshl v3.4s, v3.4s, v13.4s\n" 1473 "sshl v6.4s, v6.4s, v13.4s\n" 1474 "sqxtn v0.4h, v0.4s\n" 1475 "sqxtn v3.4h, v3.4s\n" 1476 "sqxtn v6.4h, v6.4s\n" 1477 "sqxtun v0.8b, v0.8h\n" 1478 "sqxtun v3.8b, v3.8h\n" 1479 "sqxtun v6.8b, v6.8h\n" 1480 1481 // RowMajorOutput::Output 1482 "st1 {v0.h}[0], [%x[result]], #2\n" 1483 "st1 {v0.b}[2], [%x[result]], #1\n" 1484 "st1 {v3.h}[0], [x0], #2\n" 1485 "st1 {v3.b}[2], [x0], #1\n" 1486 "st1 {v6.h}[0], [x1], #2\n" 1487 "st1 {v6.b}[2], [x1], #1\n" 1488 : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) 1489 : [count] "r"(params.kernel.count), 1490 [multiplicative_offset] "r"(params.kernel.multiplicative_offset), 1491 [shift] "r"(params.kernel.shift), 1492 [stride] "r"(params.output_stream.stride), 1493 [rounding_offset] "r"(params.kernel.rounding_offset) 1494 : "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", 1495 "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "cc", 1496 "memory"); 1497 } 1498 1499 template <> 1500 inline void MulKernel< 1501 uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 1, Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,RowMajor> & params,int32_t * result)1502 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, 1503 const FusedKernelParams<QuantizedStaticPreprocessedAsInt32, 1504 RowMajor>& params, 1505 int32_t* result) { 1506 #ifdef DEBUG 1507 #ifdef DEBUG_METAGEMM_VERBOSE 1508 std::cout << __FILE__ << "(" << __LINE__ 1509 << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, " 1510 "QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 1, " 1511 "8>::Multiply()" 1512 << std::endl 1513 << std::flush; 1514 #endif 1515 #endif 1516 asm volatile( 1517 "prfm pldl1keep, [%x[lhs]]\n" 1518 "prfm pldl1keep, [%x[rhs]]\n" 1519 1520 // Clear aggregators. 1521 "movi v0.4s, #0\n" 1522 1523 // General NxM lanes loop. 1524 "1:" 1525 1526 // Subtract counter. 1527 "subs %x[count], %x[count], #8\n" 1528 1529 "ld1 {v1.2s}, [%x[lhs]], #8\n" 1530 "ld1 {v2.2s}, [%x[rhs]], #8\n" 1531 "prfm pldl1keep, [%x[lhs], #64]\n" 1532 "prfm pldl1keep, [%x[rhs], #64]\n" 1533 "umull v3.8h, v2.8b, v1.8b\n" 1534 "uadalp v0.4s, v3.8h\n" 1535 1536 // Loop break. 1537 "bgt 1b\n" 1538 1539 // StaticQuantizationInt32::Prepare 1540 "ld1 {v4.4s}, [%x[lhs]], #16\n" 1541 "ld1 {v5.4s}, [%x[rhs]], #16\n" 1542 "dup v4.4s, v4.s[0]\n" 1543 1544 // RowMajorOutput::Prepare 1545 1546 // Reduce aggregators. 1547 "addp v0.4s, v0.4s, v0.4s\n" 1548 "addp v0.4s, v0.4s, v0.4s\n" 1549 1550 // StaticQuantizationInt32::Transform 1551 "add v0.4s, v0.4s, v4.4s\n" 1552 "add v0.4s, v0.4s, v5.4s\n" 1553 1554 // RowMajorOutput::Output 1555 "st1 {v0.s}[0], [%x[result]], #4\n" 1556 : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) 1557 : [count] "r"(params.kernel.count), 1558 [stride] "r"(params.output_stream.stride) 1559 : "v0", "v1", "v2", "v3", "v4", "v5", "cc", "memory"); 1560 } 1561 1562 template <> 1563 inline void MulKernel< 1564 uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 2, Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,RowMajor> & params,int32_t * result)1565 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, 1566 const FusedKernelParams<QuantizedStaticPreprocessedAsInt32, 1567 RowMajor>& params, 1568 int32_t* result) { 1569 #ifdef DEBUG 1570 #ifdef DEBUG_METAGEMM_VERBOSE 1571 std::cout << __FILE__ << "(" << __LINE__ 1572 << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, " 1573 "QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 2, " 1574 "8>::Multiply()" 1575 << std::endl 1576 << std::flush; 1577 #endif 1578 #endif 1579 asm volatile( 1580 "prfm pldl1keep, [%x[lhs]]\n" 1581 "prfm pldl1keep, [%x[rhs]]\n" 1582 1583 // Clear aggregators. 1584 "movi v0.4s, #0\n" 1585 "movi v1.4s, #0\n" 1586 1587 // General NxM lanes loop. 1588 "1:" 1589 1590 // Subtract counter. 1591 "subs %x[count], %x[count], #8\n" 1592 1593 "ld1 {v2.2s}, [%x[lhs]], #8\n" 1594 "ld1 {v3.2s, v4.2s}, [%x[rhs]], #16\n" 1595 "prfm pldl1keep, [%x[lhs], #64]\n" 1596 "prfm pldl1keep, [%x[rhs], #64]\n" 1597 "umull v5.8h, v3.8b, v2.8b\n" 1598 "umull v6.8h, v4.8b, v2.8b\n" 1599 "uadalp v0.4s, v5.8h\n" 1600 "uadalp v1.4s, v6.8h\n" 1601 1602 // Loop break. 1603 "bgt 1b\n" 1604 1605 // StaticQuantizationInt32::Prepare 1606 "ld1 {v4.4s}, [%x[lhs]], #16\n" 1607 "ld1 {v5.4s}, [%x[rhs]], #16\n" 1608 "dup v4.4s, v4.s[0]\n" 1609 1610 // RowMajorOutput::Prepare 1611 1612 // Reduce aggregators. 1613 "addp v0.4s, v0.4s, v1.4s\n" 1614 "addp v0.4s, v0.4s, v0.4s\n" 1615 1616 // StaticQuantizationInt32::Transform 1617 "add v0.4s, v0.4s, v4.4s\n" 1618 "add v0.4s, v0.4s, v5.4s\n" 1619 1620 // RowMajorOutput::Output 1621 "st1 {v0.2s}, [%x[result]], #8\n" 1622 : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) 1623 : [count] "r"(params.kernel.count), 1624 [stride] "r"(params.output_stream.stride) 1625 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory"); 1626 } 1627 1628 template <> 1629 inline void MulKernel< 1630 uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 3, Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,RowMajor> & params,int32_t * result)1631 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, 1632 const FusedKernelParams<QuantizedStaticPreprocessedAsInt32, 1633 RowMajor>& params, 1634 int32_t* result) { 1635 #ifdef DEBUG 1636 #ifdef DEBUG_METAGEMM_VERBOSE 1637 std::cout << __FILE__ << "(" << __LINE__ 1638 << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, " 1639 "QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 3, " 1640 "8>::Multiply()" 1641 << std::endl 1642 << std::flush; 1643 #endif 1644 #endif 1645 asm volatile( 1646 "prfm pldl1keep, [%x[lhs]]\n" 1647 "prfm pldl1keep, [%x[rhs]]\n" 1648 1649 // Clear aggregators. 1650 "movi v0.4s, #0\n" 1651 "movi v1.4s, #0\n" 1652 "movi v2.4s, #0\n" 1653 1654 // General NxM lanes loop. 1655 "1:" 1656 1657 // Subtract counter. 1658 "subs %x[count], %x[count], #8\n" 1659 1660 "ld1 {v3.2s}, [%x[lhs]], #8\n" 1661 "ld1 {v4.2s, v5.2s, v6.2s}, [%x[rhs]], #24\n" 1662 "prfm pldl1keep, [%x[lhs], #64]\n" 1663 "prfm pldl1keep, [%x[rhs], #64]\n" 1664 "umull v7.8h, v4.8b, v3.8b\n" 1665 "umull v8.8h, v5.8b, v3.8b\n" 1666 "umull v9.8h, v6.8b, v3.8b\n" 1667 "uadalp v0.4s, v7.8h\n" 1668 "uadalp v1.4s, v8.8h\n" 1669 "uadalp v2.4s, v9.8h\n" 1670 1671 // Loop break. 1672 "bgt 1b\n" 1673 1674 // StaticQuantizationInt32::Prepare 1675 "ld1 {v4.4s}, [%x[lhs]], #16\n" 1676 "ld1 {v5.4s}, [%x[rhs]], #16\n" 1677 "dup v4.4s, v4.s[0]\n" 1678 1679 // RowMajorOutput::Prepare 1680 1681 // Reduce aggregators. 1682 "addp v0.4s, v0.4s, v1.4s\n" 1683 "addp v2.4s, v2.4s, v2.4s\n" 1684 "addp v0.4s, v0.4s, v2.4s\n" 1685 1686 // StaticQuantizationInt32::Transform 1687 "add v0.4s, v0.4s, v4.4s\n" 1688 "add v0.4s, v0.4s, v5.4s\n" 1689 1690 // RowMajorOutput::Output 1691 "st1 {v0.2s}, [%x[result]], #8\n" 1692 "st1 {v0.s}[2], [%x[result]], #4\n" 1693 : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) 1694 : [count] "r"(params.kernel.count), 1695 [stride] "r"(params.output_stream.stride) 1696 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "cc", 1697 "memory"); 1698 } 1699 1700 template <> 1701 inline void MulKernel< 1702 uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 4, Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,RowMajor> & params,int32_t * result)1703 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, 1704 const FusedKernelParams<QuantizedStaticPreprocessedAsInt32, 1705 RowMajor>& params, 1706 int32_t* result) { 1707 #ifdef DEBUG 1708 #ifdef DEBUG_METAGEMM_VERBOSE 1709 std::cout << __FILE__ << "(" << __LINE__ 1710 << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, " 1711 "QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 4, " 1712 "8>::Multiply()" 1713 << std::endl 1714 << std::flush; 1715 #endif 1716 #endif 1717 asm volatile( 1718 "prfm pldl1keep, [%x[lhs]]\n" 1719 "prfm pldl1keep, [%x[rhs]]\n" 1720 1721 // Clear aggregators. 1722 "movi v0.4s, #0\n" 1723 "movi v1.4s, #0\n" 1724 "movi v2.4s, #0\n" 1725 "mov v3.16b, v0.16b\n" 1726 1727 // General NxM lanes loop. 1728 "1:" 1729 1730 // Subtract counter. 1731 "subs %x[count], %x[count], #8\n" 1732 1733 "ld1 {v4.2s}, [%x[lhs]], #8\n" 1734 "ld1 {v5.2s, v6.2s, v7.2s, v8.2s}, [%x[rhs]], #32\n" 1735 "prfm pldl1keep, [%x[lhs], #64]\n" 1736 "prfm pldl1keep, [%x[rhs], #64]\n" 1737 "umull v9.8h, v5.8b, v4.8b\n" 1738 "umull v10.8h, v6.8b, v4.8b\n" 1739 "umull v11.8h, v7.8b, v4.8b\n" 1740 "umull v12.8h, v8.8b, v4.8b\n" 1741 "uadalp v0.4s, v9.8h\n" 1742 "uadalp v1.4s, v10.8h\n" 1743 "uadalp v2.4s, v11.8h\n" 1744 "uadalp v3.4s, v12.8h\n" 1745 1746 // Loop break. 1747 "bgt 1b\n" 1748 1749 // StaticQuantizationInt32::Prepare 1750 "ld1 {v4.4s}, [%x[lhs]], #16\n" 1751 "ld1 {v5.4s}, [%x[rhs]], #16\n" 1752 "dup v4.4s, v4.s[0]\n" 1753 1754 // RowMajorOutput::Prepare 1755 1756 // Reduce aggregators. 1757 "addp v0.4s, v0.4s, v1.4s\n" 1758 "addp v2.4s, v2.4s, v3.4s\n" 1759 "addp v0.4s, v0.4s, v2.4s\n" 1760 1761 // StaticQuantizationInt32::Transform 1762 "add v0.4s, v0.4s, v4.4s\n" 1763 "add v0.4s, v0.4s, v5.4s\n" 1764 1765 // RowMajorOutput::Output 1766 "st1 {v0.4s}, [%x[result]], #16\n" 1767 : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) 1768 : [count] "r"(params.kernel.count), 1769 [stride] "r"(params.output_stream.stride) 1770 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", 1771 "v11", "v12", "cc", "memory"); 1772 } 1773 1774 template <> 1775 inline void MulKernel< 1776 uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 5, Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,RowMajor> & params,int32_t * result)1777 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, 1778 const FusedKernelParams<QuantizedStaticPreprocessedAsInt32, 1779 RowMajor>& params, 1780 int32_t* result) { 1781 #ifdef DEBUG 1782 #ifdef DEBUG_METAGEMM_VERBOSE 1783 std::cout << __FILE__ << "(" << __LINE__ 1784 << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, " 1785 "QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 5, " 1786 "8>::Multiply()" 1787 << std::endl 1788 << std::flush; 1789 #endif 1790 #endif 1791 asm volatile( 1792 "prfm pldl1keep, [%x[lhs]]\n" 1793 "prfm pldl1keep, [%x[rhs]]\n" 1794 1795 // Clear aggregators. 1796 "movi v0.4s, #0\n" 1797 "movi v1.4s, #0\n" 1798 "movi v2.4s, #0\n" 1799 "mov v3.16b, v0.16b\n" 1800 "mov v4.16b, v1.16b\n" 1801 1802 // General 1xM lanes loop. 1803 "1:" 1804 1805 // Subtract counter. 1806 "subs %x[count], %x[count], #8\n" 1807 1808 "ld1 {v5.2s, v6.2s, v7.2s, v8.2s}, [%x[rhs]], #32\n" 1809 "ld1 {v9.2s}, [%x[lhs]], #8\n" 1810 "prfm pldl1keep, [%x[lhs], #64]\n" 1811 "umull v10.8h, v5.8b, v9.8b\n" 1812 "umull v11.8h, v6.8b, v9.8b\n" 1813 "umull v12.8h, v7.8b, v9.8b\n" 1814 "umull v13.8h, v8.8b, v9.8b\n" 1815 "ld1 {v5.2s}, [%x[rhs]], #8\n" 1816 "prfm pldl1keep, [%x[rhs], #128]\n" 1817 "uadalp v0.4s, v10.8h\n" 1818 "uadalp v1.4s, v11.8h\n" 1819 "uadalp v2.4s, v12.8h\n" 1820 "uadalp v3.4s, v13.8h\n" 1821 "umull v10.8h, v5.8b, v9.8b\n" 1822 "uadalp v4.4s, v10.8h\n" 1823 1824 // Loop break. 1825 "bgt 1b\n" 1826 1827 // StaticQuantizationInt32::Prepare 1828 "ld1 {v5.4s}, [%x[lhs]], #16\n" 1829 "ld1 {v6.4s, v7.4s}, [%x[rhs]], #32\n" 1830 "dup v5.4s, v5.s[0]\n" 1831 1832 // RowMajorOutput::Prepare 1833 1834 // Reduce aggregators. 1835 "addp v0.4s, v0.4s, v1.4s\n" 1836 "addp v2.4s, v2.4s, v3.4s\n" 1837 "addp v4.4s, v4.4s, v4.4s\n" 1838 "addp v0.4s, v0.4s, v2.4s\n" 1839 "addp v1.4s, v4.4s, v4.4s\n" 1840 1841 // StaticQuantizationInt32::Transform 1842 "add v0.4s, v0.4s, v5.4s\n" 1843 "add v1.4s, v1.4s, v5.4s\n" 1844 "add v0.4s, v0.4s, v6.4s\n" 1845 "add v1.4s, v1.4s, v7.4s\n" 1846 1847 // RowMajorOutput::Output 1848 "st1 {v0.4s}, [%x[result]], #16\n" 1849 "st1 {v1.s}[0], [%x[result]], #4\n" 1850 : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) 1851 : [count] "r"(params.kernel.count), 1852 [stride] "r"(params.output_stream.stride) 1853 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", 1854 "v11", "v12", "v13", "cc", "memory"); 1855 } 1856 1857 template <> 1858 inline void MulKernel< 1859 uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 6, Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,RowMajor> & params,int32_t * result)1860 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, 1861 const FusedKernelParams<QuantizedStaticPreprocessedAsInt32, 1862 RowMajor>& params, 1863 int32_t* result) { 1864 #ifdef DEBUG 1865 #ifdef DEBUG_METAGEMM_VERBOSE 1866 std::cout << __FILE__ << "(" << __LINE__ 1867 << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, " 1868 "QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 6, " 1869 "8>::Multiply()" 1870 << std::endl 1871 << std::flush; 1872 #endif 1873 #endif 1874 asm volatile( 1875 "prfm pldl1keep, [%x[lhs]]\n" 1876 "prfm pldl1keep, [%x[rhs]]\n" 1877 1878 // Clear aggregators. 1879 "movi v0.4s, #0\n" 1880 "movi v1.4s, #0\n" 1881 "movi v2.4s, #0\n" 1882 "mov v3.16b, v0.16b\n" 1883 "mov v4.16b, v1.16b\n" 1884 "mov v5.16b, v2.16b\n" 1885 1886 // General 1xM lanes loop. 1887 "1:" 1888 1889 // Subtract counter. 1890 "subs %x[count], %x[count], #8\n" 1891 1892 "ld1 {v6.2s, v7.2s, v8.2s, v9.2s}, [%x[rhs]], #32\n" 1893 "ld1 {v10.2s}, [%x[lhs]], #8\n" 1894 "prfm pldl1keep, [%x[lhs], #64]\n" 1895 "umull v11.8h, v6.8b, v10.8b\n" 1896 "umull v12.8h, v7.8b, v10.8b\n" 1897 "umull v13.8h, v8.8b, v10.8b\n" 1898 "umull v14.8h, v9.8b, v10.8b\n" 1899 "ld1 {v6.2s, v7.2s}, [%x[rhs]], #16\n" 1900 "prfm pldl1keep, [%x[rhs], #128]\n" 1901 "uadalp v0.4s, v11.8h\n" 1902 "uadalp v1.4s, v12.8h\n" 1903 "uadalp v2.4s, v13.8h\n" 1904 "uadalp v3.4s, v14.8h\n" 1905 "umull v11.8h, v6.8b, v10.8b\n" 1906 "umull v12.8h, v7.8b, v10.8b\n" 1907 "uadalp v4.4s, v11.8h\n" 1908 "uadalp v5.4s, v12.8h\n" 1909 1910 // Loop break. 1911 "bgt 1b\n" 1912 1913 // StaticQuantizationInt32::Prepare 1914 "ld1 {v6.4s}, [%x[lhs]], #16\n" 1915 "ld1 {v7.4s, v8.4s}, [%x[rhs]], #32\n" 1916 "dup v6.4s, v6.s[0]\n" 1917 1918 // RowMajorOutput::Prepare 1919 1920 // Reduce aggregators. 1921 "addp v0.4s, v0.4s, v1.4s\n" 1922 "addp v2.4s, v2.4s, v3.4s\n" 1923 "addp v4.4s, v4.4s, v5.4s\n" 1924 "addp v0.4s, v0.4s, v2.4s\n" 1925 "addp v1.4s, v4.4s, v4.4s\n" 1926 1927 // StaticQuantizationInt32::Transform 1928 "add v0.4s, v0.4s, v6.4s\n" 1929 "add v1.4s, v1.4s, v6.4s\n" 1930 "add v0.4s, v0.4s, v7.4s\n" 1931 "add v1.4s, v1.4s, v8.4s\n" 1932 1933 // RowMajorOutput::Output 1934 "st1 {v0.4s}, [%x[result]], #16\n" 1935 "st1 {v1.2s}, [%x[result]], #8\n" 1936 : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) 1937 : [count] "r"(params.kernel.count), 1938 [stride] "r"(params.output_stream.stride) 1939 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", 1940 "v11", "v12", "v13", "v14", "cc", "memory"); 1941 } 1942 1943 template <> 1944 inline void MulKernel< 1945 uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 7, Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,RowMajor> & params,int32_t * result)1946 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, 1947 const FusedKernelParams<QuantizedStaticPreprocessedAsInt32, 1948 RowMajor>& params, 1949 int32_t* result) { 1950 #ifdef DEBUG 1951 #ifdef DEBUG_METAGEMM_VERBOSE 1952 std::cout << __FILE__ << "(" << __LINE__ 1953 << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, " 1954 "QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 7, " 1955 "8>::Multiply()" 1956 << std::endl 1957 << std::flush; 1958 #endif 1959 #endif 1960 asm volatile( 1961 "prfm pldl1keep, [%x[lhs]]\n" 1962 "prfm pldl1keep, [%x[rhs]]\n" 1963 1964 // Clear aggregators. 1965 "movi v0.4s, #0\n" 1966 "movi v1.4s, #0\n" 1967 "movi v2.4s, #0\n" 1968 "mov v3.16b, v0.16b\n" 1969 "mov v4.16b, v1.16b\n" 1970 "mov v5.16b, v2.16b\n" 1971 "mov v6.16b, v3.16b\n" 1972 1973 // General 1xM lanes loop. 1974 "1:" 1975 1976 // Subtract counter. 1977 "subs %x[count], %x[count], #8\n" 1978 1979 "ld1 {v7.2s, v8.2s, v9.2s, v10.2s}, [%x[rhs]], #32\n" 1980 "ld1 {v11.2s}, [%x[lhs]], #8\n" 1981 "prfm pldl1keep, [%x[lhs], #64]\n" 1982 "umull v12.8h, v7.8b, v11.8b\n" 1983 "umull v13.8h, v8.8b, v11.8b\n" 1984 "umull v14.8h, v9.8b, v11.8b\n" 1985 "umull v15.8h, v10.8b, v11.8b\n" 1986 "ld1 {v7.2s, v8.2s, v9.2s}, [%x[rhs]], #24\n" 1987 "prfm pldl1keep, [%x[rhs], #128]\n" 1988 "uadalp v0.4s, v12.8h\n" 1989 "uadalp v1.4s, v13.8h\n" 1990 "uadalp v2.4s, v14.8h\n" 1991 "uadalp v3.4s, v15.8h\n" 1992 "umull v12.8h, v7.8b, v11.8b\n" 1993 "umull v13.8h, v8.8b, v11.8b\n" 1994 "umull v14.8h, v9.8b, v11.8b\n" 1995 "uadalp v4.4s, v12.8h\n" 1996 "uadalp v5.4s, v13.8h\n" 1997 "uadalp v6.4s, v14.8h\n" 1998 1999 // Loop break. 2000 "bgt 1b\n" 2001 2002 // StaticQuantizationInt32::Prepare 2003 "ld1 {v7.4s}, [%x[lhs]], #16\n" 2004 "ld1 {v8.4s, v9.4s}, [%x[rhs]], #32\n" 2005 "dup v7.4s, v7.s[0]\n" 2006 2007 // RowMajorOutput::Prepare 2008 2009 // Reduce aggregators. 2010 "addp v0.4s, v0.4s, v1.4s\n" 2011 "addp v2.4s, v2.4s, v3.4s\n" 2012 "addp v4.4s, v4.4s, v5.4s\n" 2013 "addp v6.4s, v6.4s, v6.4s\n" 2014 "addp v0.4s, v0.4s, v2.4s\n" 2015 "addp v1.4s, v4.4s, v6.4s\n" 2016 2017 // StaticQuantizationInt32::Transform 2018 "add v0.4s, v0.4s, v7.4s\n" 2019 "add v1.4s, v1.4s, v7.4s\n" 2020 "add v0.4s, v0.4s, v8.4s\n" 2021 "add v1.4s, v1.4s, v9.4s\n" 2022 2023 // RowMajorOutput::Output 2024 "st1 {v0.4s}, [%x[result]], #16\n" 2025 "st1 {v1.2s}, [%x[result]], #8\n" 2026 "st1 {v1.s}[2], [%x[result]], #4\n" 2027 : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) 2028 : [count] "r"(params.kernel.count), 2029 [stride] "r"(params.output_stream.stride) 2030 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", 2031 "v11", "v12", "v13", "v14", "v15", "cc", "memory"); 2032 } 2033 2034 template <> 2035 inline void MulKernel< 2036 uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 8, Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,RowMajor> & params,int32_t * result)2037 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, 2038 const FusedKernelParams<QuantizedStaticPreprocessedAsInt32, 2039 RowMajor>& params, 2040 int32_t* result) { 2041 #ifdef DEBUG 2042 #ifdef DEBUG_METAGEMM_VERBOSE 2043 std::cout << __FILE__ << "(" << __LINE__ 2044 << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, " 2045 "QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 8, " 2046 "8>::Multiply()" 2047 << std::endl 2048 << std::flush; 2049 #endif 2050 #endif 2051 asm volatile( 2052 "prfm pldl1keep, [%x[lhs]]\n" 2053 "prfm pldl1keep, [%x[rhs]]\n" 2054 2055 // Clear aggregators. 2056 "movi v0.4s, #0\n" 2057 "movi v1.4s, #0\n" 2058 "movi v2.4s, #0\n" 2059 "mov v3.16b, v0.16b\n" 2060 "mov v4.16b, v1.16b\n" 2061 "mov v5.16b, v2.16b\n" 2062 "mov v6.16b, v3.16b\n" 2063 "mov v7.16b, v4.16b\n" 2064 2065 // 1x8 lanes loop. 2066 "1:" 2067 2068 "ld1 {v9.2s, v10.2s, v11.2s, v12.2s}, [%x[rhs]], #32\n" 2069 "ld1 {v8.2s}, [%x[lhs]], #8\n" 2070 "umull v13.8h, v8.8b, v9.8b\n" 2071 "umull v14.8h, v8.8b, v10.8b\n" 2072 "umull v15.8h, v8.8b, v11.8b\n" 2073 "umull v16.8h, v8.8b, v12.8b\n" 2074 "ld1 {v9.2s, v10.2s, v11.2s, v12.2s}, [%x[rhs]], #32\n" 2075 "uadalp v0.4s, v13.8h\n" 2076 "uadalp v1.4s, v14.8h\n" 2077 "uadalp v2.4s, v15.8h\n" 2078 "uadalp v3.4s, v16.8h\n" 2079 "prfm pldl1keep, [%x[rhs], #256]\n" 2080 "umull v17.8h, v8.8b, v9.8b\n" 2081 "umull v13.8h, v8.8b, v10.8b\n" 2082 "umull v14.8h, v8.8b, v11.8b\n" 2083 "umull v15.8h, v8.8b, v12.8b\n" 2084 "prfm pldl1keep, [%x[lhs], #32]\n" 2085 2086 // Subtract counter. 2087 "subs %x[count], %x[count], #8\n" 2088 2089 "uadalp v4.4s, v17.8h\n" 2090 "uadalp v5.4s, v13.8h\n" 2091 "uadalp v6.4s, v14.8h\n" 2092 "uadalp v7.4s, v15.8h\n" 2093 2094 // Loop break. 2095 "bgt 1b\n" 2096 2097 // StaticQuantizationInt32::Prepare 2098 "ld1 {v8.4s}, [%x[lhs]], #16\n" 2099 "ld1 {v9.4s, v10.4s}, [%x[rhs]], #32\n" 2100 "dup v8.4s, v8.s[0]\n" 2101 2102 // RowMajorOutput::Prepare 2103 2104 // Reduce aggregators. 2105 "addp v0.4s, v0.4s, v1.4s\n" 2106 "addp v2.4s, v2.4s, v3.4s\n" 2107 "addp v4.4s, v4.4s, v5.4s\n" 2108 "addp v6.4s, v6.4s, v7.4s\n" 2109 "addp v0.4s, v0.4s, v2.4s\n" 2110 "addp v1.4s, v4.4s, v6.4s\n" 2111 2112 // StaticQuantizationInt32::Transform 2113 "add v0.4s, v0.4s, v8.4s\n" 2114 "add v1.4s, v1.4s, v8.4s\n" 2115 "add v0.4s, v0.4s, v9.4s\n" 2116 "add v1.4s, v1.4s, v10.4s\n" 2117 2118 // RowMajorOutput::Output 2119 "st1 {v0.4s, v1.4s}, [%x[result]], #32\n" 2120 : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) 2121 : [count] "r"(params.kernel.count), 2122 [stride] "r"(params.output_stream.stride) 2123 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", 2124 "v11", "v12", "v13", "v14", "v15", "v16", "v17", "cc", "memory"); 2125 } 2126 2127 template <> 2128 inline void MulKernel< 2129 uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 2, 1, Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,RowMajor> & params,int32_t * result)2130 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, 2131 const FusedKernelParams<QuantizedStaticPreprocessedAsInt32, 2132 RowMajor>& params, 2133 int32_t* result) { 2134 #ifdef DEBUG 2135 #ifdef DEBUG_METAGEMM_VERBOSE 2136 std::cout << __FILE__ << "(" << __LINE__ 2137 << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, " 2138 "QuantizedStaticPreprocessedAsInt32, RowMajor, 2, 1, " 2139 "8>::Multiply()" 2140 << std::endl 2141 << std::flush; 2142 #endif 2143 #endif 2144 asm volatile( 2145 "prfm pldl1keep, [%x[lhs]]\n" 2146 "prfm pldl1keep, [%x[rhs]]\n" 2147 2148 // Clear aggregators. 2149 "movi v0.4s, #0\n" 2150 "movi v1.4s, #0\n" 2151 2152 // General NxM lanes loop. 2153 "1:" 2154 2155 // Subtract counter. 2156 "subs %x[count], %x[count], #8\n" 2157 2158 "ld1 {v2.2s, v3.2s}, [%x[lhs]], #16\n" 2159 "ld1 {v4.2s}, [%x[rhs]], #8\n" 2160 "prfm pldl1keep, [%x[lhs], #64]\n" 2161 "prfm pldl1keep, [%x[rhs], #64]\n" 2162 "umull v5.8h, v4.8b, v2.8b\n" 2163 "umull v6.8h, v4.8b, v3.8b\n" 2164 "uadalp v0.4s, v5.8h\n" 2165 "uadalp v1.4s, v6.8h\n" 2166 2167 // Loop break. 2168 "bgt 1b\n" 2169 2170 // StaticQuantizationInt32::Prepare 2171 "ld1 {v4.4s}, [%x[lhs]], #16\n" 2172 "ld1 {v5.4s}, [%x[rhs]], #16\n" 2173 "dup v2.4s, v4.s[0]\n" 2174 "dup v4.4s, v4.s[1]\n" 2175 2176 // RowMajorOutput::Prepare 2177 "add x0, %x[result], %x[stride]\n" 2178 2179 // Reduce aggregators. 2180 "addp v0.4s, v0.4s, v0.4s\n" 2181 "addp v0.4s, v0.4s, v0.4s\n" 2182 "addp v1.4s, v1.4s, v1.4s\n" 2183 "addp v1.4s, v1.4s, v1.4s\n" 2184 2185 // StaticQuantizationInt32::Transform 2186 "add v0.4s, v0.4s, v2.4s\n" 2187 "add v1.4s, v1.4s, v4.4s\n" 2188 "add v0.4s, v0.4s, v5.4s\n" 2189 "add v1.4s, v1.4s, v5.4s\n" 2190 2191 // RowMajorOutput::Output 2192 "st1 {v0.s}[0], [%x[result]], #4\n" 2193 "st1 {v1.s}[0], [x0], #4\n" 2194 : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) 2195 : [count] "r"(params.kernel.count), 2196 [stride] "r"(params.output_stream.stride) 2197 : "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory"); 2198 } 2199 2200 template <> 2201 inline void MulKernel< 2202 uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 2, 2, Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,RowMajor> & params,int32_t * result)2203 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, 2204 const FusedKernelParams<QuantizedStaticPreprocessedAsInt32, 2205 RowMajor>& params, 2206 int32_t* result) { 2207 #ifdef DEBUG 2208 #ifdef DEBUG_METAGEMM_VERBOSE 2209 std::cout << __FILE__ << "(" << __LINE__ 2210 << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, " 2211 "QuantizedStaticPreprocessedAsInt32, RowMajor, 2, 2, " 2212 "8>::Multiply()" 2213 << std::endl 2214 << std::flush; 2215 #endif 2216 #endif 2217 asm volatile( 2218 "prfm pldl1keep, [%x[lhs]]\n" 2219 "prfm pldl1keep, [%x[rhs]]\n" 2220 2221 // Clear aggregators. 2222 "movi v0.4s, #0\n" 2223 "movi v1.4s, #0\n" 2224 "movi v2.4s, #0\n" 2225 "mov v3.16b, v0.16b\n" 2226 2227 // General NxM lanes loop. 2228 "1:" 2229 2230 // Subtract counter. 2231 "subs %x[count], %x[count], #8\n" 2232 2233 "ld1 {v4.2s, v5.2s}, [%x[lhs]], #16\n" 2234 "ld1 {v6.2s, v7.2s}, [%x[rhs]], #16\n" 2235 "prfm pldl1keep, [%x[lhs], #64]\n" 2236 "prfm pldl1keep, [%x[rhs], #64]\n" 2237 "umull v8.8h, v6.8b, v4.8b\n" 2238 "umull v9.8h, v7.8b, v4.8b\n" 2239 "umull v10.8h, v6.8b, v5.8b\n" 2240 "umull v11.8h, v7.8b, v5.8b\n" 2241 "uadalp v0.4s, v8.8h\n" 2242 "uadalp v1.4s, v9.8h\n" 2243 "uadalp v2.4s, v10.8h\n" 2244 "uadalp v3.4s, v11.8h\n" 2245 2246 // Loop break. 2247 "bgt 1b\n" 2248 2249 // StaticQuantizationInt32::Prepare 2250 "ld1 {v4.4s}, [%x[lhs]], #16\n" 2251 "ld1 {v5.4s}, [%x[rhs]], #16\n" 2252 "dup v6.4s, v4.s[0]\n" 2253 "dup v4.4s, v4.s[1]\n" 2254 2255 // RowMajorOutput::Prepare 2256 "add x0, %x[result], %x[stride]\n" 2257 2258 // Reduce aggregators. 2259 "addp v0.4s, v0.4s, v1.4s\n" 2260 "addp v0.4s, v0.4s, v0.4s\n" 2261 "addp v2.4s, v2.4s, v3.4s\n" 2262 "addp v2.4s, v2.4s, v2.4s\n" 2263 2264 // StaticQuantizationInt32::Transform 2265 "add v0.4s, v0.4s, v6.4s\n" 2266 "add v2.4s, v2.4s, v4.4s\n" 2267 "add v0.4s, v0.4s, v5.4s\n" 2268 "add v2.4s, v2.4s, v5.4s\n" 2269 2270 // RowMajorOutput::Output 2271 "st1 {v0.2s}, [%x[result]], #8\n" 2272 "st1 {v2.2s}, [x0], #8\n" 2273 : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) 2274 : [count] "r"(params.kernel.count), 2275 [stride] "r"(params.output_stream.stride) 2276 : "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", 2277 "v11", "cc", "memory"); 2278 } 2279 2280 template <> 2281 inline void MulKernel< 2282 uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 2, 3, Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,RowMajor> & params,int32_t * result)2283 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, 2284 const FusedKernelParams<QuantizedStaticPreprocessedAsInt32, 2285 RowMajor>& params, 2286 int32_t* result) { 2287 #ifdef DEBUG 2288 #ifdef DEBUG_METAGEMM_VERBOSE 2289 std::cout << __FILE__ << "(" << __LINE__ 2290 << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, " 2291 "QuantizedStaticPreprocessedAsInt32, RowMajor, 2, 3, " 2292 "8>::Multiply()" 2293 << std::endl 2294 << std::flush; 2295 #endif 2296 #endif 2297 asm volatile( 2298 "prfm pldl1keep, [%x[lhs]]\n" 2299 "prfm pldl1keep, [%x[rhs]]\n" 2300 2301 // Clear aggregators. 2302 "movi v0.4s, #0\n" 2303 "movi v1.4s, #0\n" 2304 "movi v2.4s, #0\n" 2305 "mov v3.16b, v0.16b\n" 2306 "mov v4.16b, v1.16b\n" 2307 "mov v5.16b, v2.16b\n" 2308 2309 // General NxM lanes loop. 2310 "1:" 2311 2312 // Subtract counter. 2313 "subs %x[count], %x[count], #8\n" 2314 2315 "ld1 {v6.2s, v7.2s}, [%x[lhs]], #16\n" 2316 "ld1 {v8.2s, v9.2s, v10.2s}, [%x[rhs]], #24\n" 2317 "prfm pldl1keep, [%x[lhs], #64]\n" 2318 "prfm pldl1keep, [%x[rhs], #64]\n" 2319 "umull v11.8h, v8.8b, v6.8b\n" 2320 "umull v12.8h, v9.8b, v6.8b\n" 2321 "umull v13.8h, v10.8b, v6.8b\n" 2322 "umull v14.8h, v8.8b, v7.8b\n" 2323 "umull v15.8h, v9.8b, v7.8b\n" 2324 "umull v16.8h, v10.8b, v7.8b\n" 2325 "uadalp v0.4s, v11.8h\n" 2326 "uadalp v1.4s, v12.8h\n" 2327 "uadalp v2.4s, v13.8h\n" 2328 "uadalp v3.4s, v14.8h\n" 2329 "uadalp v4.4s, v15.8h\n" 2330 "uadalp v5.4s, v16.8h\n" 2331 2332 // Loop break. 2333 "bgt 1b\n" 2334 2335 // StaticQuantizationInt32::Prepare 2336 "ld1 {v6.4s}, [%x[lhs]], #16\n" 2337 "ld1 {v7.4s}, [%x[rhs]], #16\n" 2338 "dup v8.4s, v6.s[0]\n" 2339 "dup v6.4s, v6.s[1]\n" 2340 2341 // RowMajorOutput::Prepare 2342 "add x0, %x[result], %x[stride]\n" 2343 2344 // Reduce aggregators. 2345 "addp v0.4s, v0.4s, v1.4s\n" 2346 "addp v2.4s, v2.4s, v2.4s\n" 2347 "addp v0.4s, v0.4s, v2.4s\n" 2348 "addp v3.4s, v3.4s, v4.4s\n" 2349 "addp v5.4s, v5.4s, v5.4s\n" 2350 "addp v3.4s, v3.4s, v5.4s\n" 2351 2352 // StaticQuantizationInt32::Transform 2353 "add v0.4s, v0.4s, v8.4s\n" 2354 "add v3.4s, v3.4s, v6.4s\n" 2355 "add v0.4s, v0.4s, v7.4s\n" 2356 "add v3.4s, v3.4s, v7.4s\n" 2357 2358 // RowMajorOutput::Output 2359 "st1 {v0.2s}, [%x[result]], #8\n" 2360 "st1 {v0.s}[2], [%x[result]], #4\n" 2361 "st1 {v3.2s}, [x0], #8\n" 2362 "st1 {v3.s}[2], [x0], #4\n" 2363 : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) 2364 : [count] "r"(params.kernel.count), 2365 [stride] "r"(params.output_stream.stride) 2366 : "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", 2367 "v11", "v12", "v13", "v14", "v15", "v16", "cc", "memory"); 2368 } 2369 2370 template <> 2371 inline void MulKernel< 2372 uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 2, 4, Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,RowMajor> & params,int32_t * result)2373 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, 2374 const FusedKernelParams<QuantizedStaticPreprocessedAsInt32, 2375 RowMajor>& params, 2376 int32_t* result) { 2377 #ifdef DEBUG 2378 #ifdef DEBUG_METAGEMM_VERBOSE 2379 std::cout << __FILE__ << "(" << __LINE__ 2380 << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, " 2381 "QuantizedStaticPreprocessedAsInt32, RowMajor, 2, 4, " 2382 "8>::Multiply()" 2383 << std::endl 2384 << std::flush; 2385 #endif 2386 #endif 2387 asm volatile( 2388 "prfm pldl1keep, [%x[lhs]]\n" 2389 "prfm pldl1keep, [%x[rhs]]\n" 2390 2391 // Clear aggregators. 2392 "movi v0.4s, #0\n" 2393 "movi v1.4s, #0\n" 2394 "movi v2.4s, #0\n" 2395 "mov v3.16b, v0.16b\n" 2396 "mov v4.16b, v1.16b\n" 2397 "mov v5.16b, v2.16b\n" 2398 "mov v6.16b, v3.16b\n" 2399 "mov v7.16b, v4.16b\n" 2400 2401 // 2x4 lanes loop. 2402 "1:" 2403 2404 "ld1 {v10.8b, v11.8b, v12.8b, v13.8b}, [%x[rhs]], #32\n" 2405 "ld1 {v8.8b}, [%x[lhs]], #8\n" 2406 "umull v14.8h, v8.8b, v10.8b\n" 2407 "ld1 {v9.8b}, [%x[lhs]], #8\n" 2408 "umull v15.8h, v8.8b, v11.8b\n" 2409 "prfm pldl1keep, [%x[rhs], #64]\n" 2410 "umull v16.8h, v8.8b, v12.8b\n" 2411 "prfm pldl1keep, [%x[lhs], #64]\n" 2412 "umull v17.8h, v8.8b, v13.8b\n" 2413 "umull v18.8h, v9.8b, v10.8b\n" 2414 "uadalp v0.4s, v14.8h\n" 2415 "uadalp v1.4s, v15.8h\n" 2416 "uadalp v2.4s, v16.8h\n" 2417 "umull v14.8h, v9.8b, v11.8b\n" 2418 "umull v15.8h, v9.8b, v12.8b\n" 2419 "umull v16.8h, v9.8b, v13.8b\n" 2420 2421 // Subtract counter. 2422 "subs %x[count], %x[count], #8\n" 2423 2424 "uadalp v3.4s, v17.8h\n" 2425 "uadalp v4.4s, v18.8h\n" 2426 "uadalp v5.4s, v14.8h\n" 2427 "uadalp v6.4s, v15.8h\n" 2428 "uadalp v7.4s, v16.8h\n" 2429 2430 // Loop break. 2431 "bgt 1b\n" 2432 2433 // StaticQuantizationInt32::Prepare 2434 "ld1 {v8.4s}, [%x[lhs]], #16\n" 2435 "ld1 {v9.4s}, [%x[rhs]], #16\n" 2436 "dup v10.4s, v8.s[0]\n" 2437 "dup v8.4s, v8.s[1]\n" 2438 2439 // RowMajorOutput::Prepare 2440 "add x0, %x[result], %x[stride]\n" 2441 2442 // Reduce aggregators. 2443 "addp v0.4s, v0.4s, v1.4s\n" 2444 "addp v2.4s, v2.4s, v3.4s\n" 2445 "addp v0.4s, v0.4s, v2.4s\n" 2446 "addp v4.4s, v4.4s, v5.4s\n" 2447 "addp v6.4s, v6.4s, v7.4s\n" 2448 "addp v4.4s, v4.4s, v6.4s\n" 2449 2450 // StaticQuantizationInt32::Transform 2451 "add v0.4s, v0.4s, v10.4s\n" 2452 "add v4.4s, v4.4s, v8.4s\n" 2453 "add v0.4s, v0.4s, v9.4s\n" 2454 "add v4.4s, v4.4s, v9.4s\n" 2455 2456 // RowMajorOutput::Output 2457 "st1 {v0.4s}, [%x[result]], #16\n" 2458 "st1 {v4.4s}, [x0], #16\n" 2459 : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) 2460 : [count] "r"(params.kernel.count), 2461 [stride] "r"(params.output_stream.stride) 2462 : "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", 2463 "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "cc", "memory"); 2464 } 2465 2466 template <> 2467 inline void MulKernel< 2468 uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 3, 1, Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,RowMajor> & params,int32_t * result)2469 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, 2470 const FusedKernelParams<QuantizedStaticPreprocessedAsInt32, 2471 RowMajor>& params, 2472 int32_t* result) { 2473 #ifdef DEBUG 2474 #ifdef DEBUG_METAGEMM_VERBOSE 2475 std::cout << __FILE__ << "(" << __LINE__ 2476 << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, " 2477 "QuantizedStaticPreprocessedAsInt32, RowMajor, 3, 1, " 2478 "8>::Multiply()" 2479 << std::endl 2480 << std::flush; 2481 #endif 2482 #endif 2483 asm volatile( 2484 "prfm pldl1keep, [%x[lhs]]\n" 2485 "prfm pldl1keep, [%x[rhs]]\n" 2486 2487 // Clear aggregators. 2488 "movi v0.4s, #0\n" 2489 "movi v1.4s, #0\n" 2490 "movi v2.4s, #0\n" 2491 2492 // General NxM lanes loop. 2493 "1:" 2494 2495 // Subtract counter. 2496 "subs %x[count], %x[count], #8\n" 2497 2498 "ld1 {v3.2s, v4.2s, v5.2s}, [%x[lhs]], #24\n" 2499 "ld1 {v6.2s}, [%x[rhs]], #8\n" 2500 "prfm pldl1keep, [%x[lhs], #64]\n" 2501 "prfm pldl1keep, [%x[rhs], #64]\n" 2502 "umull v7.8h, v6.8b, v3.8b\n" 2503 "umull v8.8h, v6.8b, v4.8b\n" 2504 "umull v9.8h, v6.8b, v5.8b\n" 2505 "uadalp v0.4s, v7.8h\n" 2506 "uadalp v1.4s, v8.8h\n" 2507 "uadalp v2.4s, v9.8h\n" 2508 2509 // Loop break. 2510 "bgt 1b\n" 2511 2512 // StaticQuantizationInt32::Prepare 2513 "ld1 {v4.4s}, [%x[lhs]], #16\n" 2514 "ld1 {v5.4s}, [%x[rhs]], #16\n" 2515 "dup v3.4s, v4.s[0]\n" 2516 "dup v6.4s, v4.s[1]\n" 2517 "dup v4.4s, v4.s[2]\n" 2518 2519 // RowMajorOutput::Prepare 2520 "add x0, %x[result], %x[stride]\n" 2521 "add x1, x0, %x[stride]\n" 2522 2523 // Reduce aggregators. 2524 "addp v0.4s, v0.4s, v0.4s\n" 2525 "addp v0.4s, v0.4s, v0.4s\n" 2526 "addp v1.4s, v1.4s, v1.4s\n" 2527 "addp v1.4s, v1.4s, v1.4s\n" 2528 "addp v2.4s, v2.4s, v2.4s\n" 2529 "addp v2.4s, v2.4s, v2.4s\n" 2530 2531 // StaticQuantizationInt32::Transform 2532 "add v0.4s, v0.4s, v3.4s\n" 2533 "add v1.4s, v1.4s, v6.4s\n" 2534 "add v2.4s, v2.4s, v4.4s\n" 2535 "add v0.4s, v0.4s, v5.4s\n" 2536 "add v1.4s, v1.4s, v5.4s\n" 2537 "add v2.4s, v2.4s, v5.4s\n" 2538 2539 // RowMajorOutput::Output 2540 "st1 {v0.s}[0], [%x[result]], #4\n" 2541 "st1 {v1.s}[0], [x0], #4\n" 2542 "st1 {v2.s}[0], [x1], #4\n" 2543 : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) 2544 : [count] "r"(params.kernel.count), 2545 [stride] "r"(params.output_stream.stride) 2546 : "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", 2547 "cc", "memory"); 2548 } 2549 2550 template <> 2551 inline void MulKernel< 2552 uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 3, 2, Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,RowMajor> & params,int32_t * result)2553 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, 2554 const FusedKernelParams<QuantizedStaticPreprocessedAsInt32, 2555 RowMajor>& params, 2556 int32_t* result) { 2557 #ifdef DEBUG 2558 #ifdef DEBUG_METAGEMM_VERBOSE 2559 std::cout << __FILE__ << "(" << __LINE__ 2560 << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, " 2561 "QuantizedStaticPreprocessedAsInt32, RowMajor, 3, 2, " 2562 "8>::Multiply()" 2563 << std::endl 2564 << std::flush; 2565 #endif 2566 #endif 2567 asm volatile( 2568 "prfm pldl1keep, [%x[lhs]]\n" 2569 "prfm pldl1keep, [%x[rhs]]\n" 2570 2571 // Clear aggregators. 2572 "movi v0.4s, #0\n" 2573 "movi v1.4s, #0\n" 2574 "movi v2.4s, #0\n" 2575 "mov v3.16b, v0.16b\n" 2576 "mov v4.16b, v1.16b\n" 2577 "mov v5.16b, v2.16b\n" 2578 2579 // General NxM lanes loop. 2580 "1:" 2581 2582 // Subtract counter. 2583 "subs %x[count], %x[count], #8\n" 2584 2585 "ld1 {v6.2s, v7.2s, v8.2s}, [%x[lhs]], #24\n" 2586 "ld1 {v9.2s, v10.2s}, [%x[rhs]], #16\n" 2587 "prfm pldl1keep, [%x[lhs], #64]\n" 2588 "prfm pldl1keep, [%x[rhs], #64]\n" 2589 "umull v11.8h, v9.8b, v6.8b\n" 2590 "umull v12.8h, v10.8b, v6.8b\n" 2591 "umull v13.8h, v9.8b, v7.8b\n" 2592 "umull v14.8h, v10.8b, v7.8b\n" 2593 "umull v15.8h, v9.8b, v8.8b\n" 2594 "umull v16.8h, v10.8b, v8.8b\n" 2595 "uadalp v0.4s, v11.8h\n" 2596 "uadalp v1.4s, v12.8h\n" 2597 "uadalp v2.4s, v13.8h\n" 2598 "uadalp v3.4s, v14.8h\n" 2599 "uadalp v4.4s, v15.8h\n" 2600 "uadalp v5.4s, v16.8h\n" 2601 2602 // Loop break. 2603 "bgt 1b\n" 2604 2605 // StaticQuantizationInt32::Prepare 2606 "ld1 {v6.4s}, [%x[lhs]], #16\n" 2607 "ld1 {v7.4s}, [%x[rhs]], #16\n" 2608 "dup v8.4s, v6.s[0]\n" 2609 "dup v9.4s, v6.s[1]\n" 2610 "dup v6.4s, v6.s[2]\n" 2611 2612 // RowMajorOutput::Prepare 2613 "add x0, %x[result], %x[stride]\n" 2614 "add x1, x0, %x[stride]\n" 2615 2616 // Reduce aggregators. 2617 "addp v0.4s, v0.4s, v1.4s\n" 2618 "addp v0.4s, v0.4s, v0.4s\n" 2619 "addp v2.4s, v2.4s, v3.4s\n" 2620 "addp v2.4s, v2.4s, v2.4s\n" 2621 "addp v4.4s, v4.4s, v5.4s\n" 2622 "addp v4.4s, v4.4s, v4.4s\n" 2623 2624 // StaticQuantizationInt32::Transform 2625 "add v0.4s, v0.4s, v8.4s\n" 2626 "add v2.4s, v2.4s, v9.4s\n" 2627 "add v4.4s, v4.4s, v6.4s\n" 2628 "add v0.4s, v0.4s, v7.4s\n" 2629 "add v2.4s, v2.4s, v7.4s\n" 2630 "add v4.4s, v4.4s, v7.4s\n" 2631 2632 // RowMajorOutput::Output 2633 "st1 {v0.2s}, [%x[result]], #8\n" 2634 "st1 {v2.2s}, [x0], #8\n" 2635 "st1 {v4.2s}, [x1], #8\n" 2636 : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) 2637 : [count] "r"(params.kernel.count), 2638 [stride] "r"(params.output_stream.stride) 2639 : "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", 2640 "v10", "v11", "v12", "v13", "v14", "v15", "v16", "cc", "memory"); 2641 } 2642 2643 template <> 2644 inline void MulKernel< 2645 uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 3, 3, Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,RowMajor> & params,int32_t * result)2646 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, 2647 const FusedKernelParams<QuantizedStaticPreprocessedAsInt32, 2648 RowMajor>& params, 2649 int32_t* result) { 2650 #ifdef DEBUG 2651 #ifdef DEBUG_METAGEMM_VERBOSE 2652 std::cout << __FILE__ << "(" << __LINE__ 2653 << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, " 2654 "QuantizedStaticPreprocessedAsInt32, RowMajor, 3, 3, " 2655 "8>::Multiply()" 2656 << std::endl 2657 << std::flush; 2658 #endif 2659 #endif 2660 asm volatile( 2661 "prfm pldl1keep, [%x[lhs]]\n" 2662 "prfm pldl1keep, [%x[rhs]]\n" 2663 2664 // Clear aggregators. 2665 "movi v0.4s, #0\n" 2666 "movi v1.4s, #0\n" 2667 "movi v2.4s, #0\n" 2668 "mov v3.16b, v0.16b\n" 2669 "mov v4.16b, v1.16b\n" 2670 "mov v5.16b, v2.16b\n" 2671 "mov v6.16b, v3.16b\n" 2672 "mov v7.16b, v4.16b\n" 2673 "mov v8.16b, v5.16b\n" 2674 2675 // 3x3 lanes loop. 2676 "1:" 2677 2678 "ld1 {v12.8b, v13.8b, v14.8b}, [%x[rhs]], #24\n" 2679 "ld1 {v9.8b}, [%x[lhs]], #8\n" 2680 "umull v15.8h, v9.8b, v12.8b\n" 2681 "ld1 {v10.8b}, [%x[lhs]], #8\n" 2682 "umull v16.8h, v9.8b, v13.8b\n" 2683 "ld1 {v11.8b}, [%x[lhs]], #8\n" 2684 "umull v17.8h, v9.8b, v14.8b\n" 2685 "prfm pldl1keep, [%x[lhs], #64]\n" 2686 "umull v18.8h, v10.8b, v12.8b\n" 2687 "prfm pldl1keep, [%x[rhs], #64]\n" 2688 "uadalp v0.4s, v15.8h\n" 2689 "uadalp v1.4s, v16.8h\n" 2690 "uadalp v2.4s, v17.8h\n" 2691 "uadalp v3.4s, v18.8h\n" 2692 "umull v15.8h, v10.8b, v13.8b\n" 2693 "umull v16.8h, v10.8b, v14.8b\n" 2694 "umull v17.8h, v11.8b, v12.8b\n" 2695 "umull v18.8h, v11.8b, v13.8b\n" 2696 2697 // Subtract counter. 2698 "subs %x[count], %x[count], #8\n" 2699 2700 "umull v9.8h, v11.8b, v14.8b\n" 2701 "uadalp v4.4s, v15.8h\n" 2702 "uadalp v5.4s, v16.8h\n" 2703 "uadalp v6.4s, v17.8h\n" 2704 "uadalp v7.4s, v18.8h\n" 2705 "uadalp v8.4s, v9.8h\n" 2706 2707 // Loop break. 2708 "bgt 1b\n" 2709 2710 // StaticQuantizationInt32::Prepare 2711 "ld1 {v9.4s}, [%x[lhs]], #16\n" 2712 "ld1 {v10.4s}, [%x[rhs]], #16\n" 2713 "dup v11.4s, v9.s[0]\n" 2714 "dup v12.4s, v9.s[1]\n" 2715 "dup v9.4s, v9.s[2]\n" 2716 2717 // RowMajorOutput::Prepare 2718 "add x0, %x[result], %x[stride]\n" 2719 "add x1, x0, %x[stride]\n" 2720 2721 // Reduce aggregators. 2722 "addp v0.4s, v0.4s, v1.4s\n" 2723 "addp v2.4s, v2.4s, v2.4s\n" 2724 "addp v0.4s, v0.4s, v2.4s\n" 2725 "addp v3.4s, v3.4s, v4.4s\n" 2726 "addp v5.4s, v5.4s, v5.4s\n" 2727 "addp v3.4s, v3.4s, v5.4s\n" 2728 "addp v6.4s, v6.4s, v7.4s\n" 2729 "addp v8.4s, v8.4s, v8.4s\n" 2730 "addp v6.4s, v6.4s, v8.4s\n" 2731 2732 // StaticQuantizationInt32::Transform 2733 "add v0.4s, v0.4s, v11.4s\n" 2734 "add v3.4s, v3.4s, v12.4s\n" 2735 "add v6.4s, v6.4s, v9.4s\n" 2736 "add v0.4s, v0.4s, v10.4s\n" 2737 "add v3.4s, v3.4s, v10.4s\n" 2738 "add v6.4s, v6.4s, v10.4s\n" 2739 2740 // RowMajorOutput::Output 2741 "st1 {v0.2s}, [%x[result]], #8\n" 2742 "st1 {v0.s}[2], [%x[result]], #4\n" 2743 "st1 {v3.2s}, [x0], #8\n" 2744 "st1 {v3.s}[2], [x0], #4\n" 2745 "st1 {v6.2s}, [x1], #8\n" 2746 "st1 {v6.s}[2], [x1], #4\n" 2747 : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) 2748 : [count] "r"(params.kernel.count), 2749 [stride] "r"(params.output_stream.stride) 2750 : "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", 2751 "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "cc", 2752 "memory"); 2753 } 2754 2755 template <> 2756 inline void MulKernel< 2757 uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 1, Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,RowMajor> & params,float * result)2758 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, 2759 const FusedKernelParams<QuantizedStaticPreprocessedAsFloat, 2760 RowMajor>& params, 2761 float* result) { 2762 #ifdef DEBUG 2763 #ifdef DEBUG_METAGEMM_VERBOSE 2764 std::cout << __FILE__ << "(" << __LINE__ 2765 << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, " 2766 "QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 1, " 2767 "8>::Multiply()" 2768 << std::endl 2769 << std::flush; 2770 #endif 2771 #endif 2772 asm volatile( 2773 "prfm pldl1keep, [%x[lhs]]\n" 2774 "prfm pldl1keep, [%x[rhs]]\n" 2775 2776 // Clear aggregators. 2777 "movi v0.4s, #0\n" 2778 2779 // General NxM lanes loop. 2780 "1:" 2781 2782 // Subtract counter. 2783 "subs %x[count], %x[count], #8\n" 2784 2785 "ld1 {v1.2s}, [%x[lhs]], #8\n" 2786 "ld1 {v2.2s}, [%x[rhs]], #8\n" 2787 "prfm pldl1keep, [%x[lhs], #64]\n" 2788 "prfm pldl1keep, [%x[rhs], #64]\n" 2789 "umull v3.8h, v2.8b, v1.8b\n" 2790 "uadalp v0.4s, v3.8h\n" 2791 2792 // Loop break. 2793 "bgt 1b\n" 2794 2795 // StaticQuantizationFloat::Prepare 2796 "ld1 {v4.4s}, [%x[lhs]], #16\n" 2797 "ld1 {v5.4s}, [%x[rhs]], #16\n" 2798 "dup v6.4s, %w[scale]\n" 2799 "dup v4.4s, v4.s[0]\n" 2800 2801 // RowMajorOutput::Prepare 2802 2803 // Reduce aggregators. 2804 "addp v0.4s, v0.4s, v0.4s\n" 2805 "addp v0.4s, v0.4s, v0.4s\n" 2806 2807 // StaticQuantizationFloat::Transform 2808 "add v0.4s, v0.4s, v4.4s\n" 2809 "add v0.4s, v0.4s, v5.4s\n" 2810 "scvtf v0.4s, v0.4s\n" 2811 "fmul v0.4s, v0.4s, v6.4s\n" 2812 2813 // RowMajorOutput::Output 2814 "st1 {v0.s}[0], [%x[result]], #4\n" 2815 : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) 2816 : [count] "r"(params.kernel.count), 2817 [stride] "r"(params.output_stream.stride), 2818 [scale] "r"(params.kernel.scale) 2819 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory"); 2820 } 2821 2822 template <> 2823 inline void MulKernel< 2824 uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 2, Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,RowMajor> & params,float * result)2825 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, 2826 const FusedKernelParams<QuantizedStaticPreprocessedAsFloat, 2827 RowMajor>& params, 2828 float* result) { 2829 #ifdef DEBUG 2830 #ifdef DEBUG_METAGEMM_VERBOSE 2831 std::cout << __FILE__ << "(" << __LINE__ 2832 << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, " 2833 "QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 2, " 2834 "8>::Multiply()" 2835 << std::endl 2836 << std::flush; 2837 #endif 2838 #endif 2839 asm volatile( 2840 "prfm pldl1keep, [%x[lhs]]\n" 2841 "prfm pldl1keep, [%x[rhs]]\n" 2842 2843 // Clear aggregators. 2844 "movi v0.4s, #0\n" 2845 "movi v1.4s, #0\n" 2846 2847 // General NxM lanes loop. 2848 "1:" 2849 2850 // Subtract counter. 2851 "subs %x[count], %x[count], #8\n" 2852 2853 "ld1 {v2.2s}, [%x[lhs]], #8\n" 2854 "ld1 {v3.2s, v4.2s}, [%x[rhs]], #16\n" 2855 "prfm pldl1keep, [%x[lhs], #64]\n" 2856 "prfm pldl1keep, [%x[rhs], #64]\n" 2857 "umull v5.8h, v3.8b, v2.8b\n" 2858 "umull v6.8h, v4.8b, v2.8b\n" 2859 "uadalp v0.4s, v5.8h\n" 2860 "uadalp v1.4s, v6.8h\n" 2861 2862 // Loop break. 2863 "bgt 1b\n" 2864 2865 // StaticQuantizationFloat::Prepare 2866 "ld1 {v4.4s}, [%x[lhs]], #16\n" 2867 "ld1 {v5.4s}, [%x[rhs]], #16\n" 2868 "dup v6.4s, %w[scale]\n" 2869 "dup v4.4s, v4.s[0]\n" 2870 2871 // RowMajorOutput::Prepare 2872 2873 // Reduce aggregators. 2874 "addp v0.4s, v0.4s, v1.4s\n" 2875 "addp v0.4s, v0.4s, v0.4s\n" 2876 2877 // StaticQuantizationFloat::Transform 2878 "add v0.4s, v0.4s, v4.4s\n" 2879 "add v0.4s, v0.4s, v5.4s\n" 2880 "scvtf v0.4s, v0.4s\n" 2881 "fmul v0.4s, v0.4s, v6.4s\n" 2882 2883 // RowMajorOutput::Output 2884 "st1 {v0.2s}, [%x[result]], #8\n" 2885 : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) 2886 : [count] "r"(params.kernel.count), 2887 [stride] "r"(params.output_stream.stride), 2888 [scale] "r"(params.kernel.scale) 2889 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory"); 2890 } 2891 2892 template <> 2893 inline void MulKernel< 2894 uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 3, Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,RowMajor> & params,float * result)2895 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, 2896 const FusedKernelParams<QuantizedStaticPreprocessedAsFloat, 2897 RowMajor>& params, 2898 float* result) { 2899 #ifdef DEBUG 2900 #ifdef DEBUG_METAGEMM_VERBOSE 2901 std::cout << __FILE__ << "(" << __LINE__ 2902 << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, " 2903 "QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 3, " 2904 "8>::Multiply()" 2905 << std::endl 2906 << std::flush; 2907 #endif 2908 #endif 2909 asm volatile( 2910 "prfm pldl1keep, [%x[lhs]]\n" 2911 "prfm pldl1keep, [%x[rhs]]\n" 2912 2913 // Clear aggregators. 2914 "movi v0.4s, #0\n" 2915 "movi v1.4s, #0\n" 2916 "movi v2.4s, #0\n" 2917 2918 // General NxM lanes loop. 2919 "1:" 2920 2921 // Subtract counter. 2922 "subs %x[count], %x[count], #8\n" 2923 2924 "ld1 {v3.2s}, [%x[lhs]], #8\n" 2925 "ld1 {v4.2s, v5.2s, v6.2s}, [%x[rhs]], #24\n" 2926 "prfm pldl1keep, [%x[lhs], #64]\n" 2927 "prfm pldl1keep, [%x[rhs], #64]\n" 2928 "umull v7.8h, v4.8b, v3.8b\n" 2929 "umull v8.8h, v5.8b, v3.8b\n" 2930 "umull v9.8h, v6.8b, v3.8b\n" 2931 "uadalp v0.4s, v7.8h\n" 2932 "uadalp v1.4s, v8.8h\n" 2933 "uadalp v2.4s, v9.8h\n" 2934 2935 // Loop break. 2936 "bgt 1b\n" 2937 2938 // StaticQuantizationFloat::Prepare 2939 "ld1 {v4.4s}, [%x[lhs]], #16\n" 2940 "ld1 {v5.4s}, [%x[rhs]], #16\n" 2941 "dup v6.4s, %w[scale]\n" 2942 "dup v4.4s, v4.s[0]\n" 2943 2944 // RowMajorOutput::Prepare 2945 2946 // Reduce aggregators. 2947 "addp v0.4s, v0.4s, v1.4s\n" 2948 "addp v2.4s, v2.4s, v2.4s\n" 2949 "addp v0.4s, v0.4s, v2.4s\n" 2950 2951 // StaticQuantizationFloat::Transform 2952 "add v0.4s, v0.4s, v4.4s\n" 2953 "add v0.4s, v0.4s, v5.4s\n" 2954 "scvtf v0.4s, v0.4s\n" 2955 "fmul v0.4s, v0.4s, v6.4s\n" 2956 2957 // RowMajorOutput::Output 2958 "st1 {v0.2s}, [%x[result]], #8\n" 2959 "st1 {v0.s}[2], [%x[result]], #4\n" 2960 : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) 2961 : [count] "r"(params.kernel.count), 2962 [stride] "r"(params.output_stream.stride), 2963 [scale] "r"(params.kernel.scale) 2964 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "cc", 2965 "memory"); 2966 } 2967 2968 template <> 2969 inline void MulKernel< 2970 uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 4, Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,RowMajor> & params,float * result)2971 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, 2972 const FusedKernelParams<QuantizedStaticPreprocessedAsFloat, 2973 RowMajor>& params, 2974 float* result) { 2975 #ifdef DEBUG 2976 #ifdef DEBUG_METAGEMM_VERBOSE 2977 std::cout << __FILE__ << "(" << __LINE__ 2978 << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, " 2979 "QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 4, " 2980 "8>::Multiply()" 2981 << std::endl 2982 << std::flush; 2983 #endif 2984 #endif 2985 asm volatile( 2986 "prfm pldl1keep, [%x[lhs]]\n" 2987 "prfm pldl1keep, [%x[rhs]]\n" 2988 2989 // Clear aggregators. 2990 "movi v0.4s, #0\n" 2991 "movi v1.4s, #0\n" 2992 "movi v2.4s, #0\n" 2993 "mov v3.16b, v0.16b\n" 2994 2995 // General NxM lanes loop. 2996 "1:" 2997 2998 // Subtract counter. 2999 "subs %x[count], %x[count], #8\n" 3000 3001 "ld1 {v4.2s}, [%x[lhs]], #8\n" 3002 "ld1 {v5.2s, v6.2s, v7.2s, v8.2s}, [%x[rhs]], #32\n" 3003 "prfm pldl1keep, [%x[lhs], #64]\n" 3004 "prfm pldl1keep, [%x[rhs], #64]\n" 3005 "umull v9.8h, v5.8b, v4.8b\n" 3006 "umull v10.8h, v6.8b, v4.8b\n" 3007 "umull v11.8h, v7.8b, v4.8b\n" 3008 "umull v12.8h, v8.8b, v4.8b\n" 3009 "uadalp v0.4s, v9.8h\n" 3010 "uadalp v1.4s, v10.8h\n" 3011 "uadalp v2.4s, v11.8h\n" 3012 "uadalp v3.4s, v12.8h\n" 3013 3014 // Loop break. 3015 "bgt 1b\n" 3016 3017 // StaticQuantizationFloat::Prepare 3018 "ld1 {v4.4s}, [%x[lhs]], #16\n" 3019 "ld1 {v5.4s}, [%x[rhs]], #16\n" 3020 "dup v6.4s, %w[scale]\n" 3021 "dup v4.4s, v4.s[0]\n" 3022 3023 // RowMajorOutput::Prepare 3024 3025 // Reduce aggregators. 3026 "addp v0.4s, v0.4s, v1.4s\n" 3027 "addp v2.4s, v2.4s, v3.4s\n" 3028 "addp v0.4s, v0.4s, v2.4s\n" 3029 3030 // StaticQuantizationFloat::Transform 3031 "add v0.4s, v0.4s, v4.4s\n" 3032 "add v0.4s, v0.4s, v5.4s\n" 3033 "scvtf v0.4s, v0.4s\n" 3034 "fmul v0.4s, v0.4s, v6.4s\n" 3035 3036 // RowMajorOutput::Output 3037 "st1 {v0.4s}, [%x[result]], #16\n" 3038 : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) 3039 : [count] "r"(params.kernel.count), 3040 [stride] "r"(params.output_stream.stride), 3041 [scale] "r"(params.kernel.scale) 3042 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", 3043 "v11", "v12", "cc", "memory"); 3044 } 3045 3046 template <> 3047 inline void MulKernel< 3048 uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 5, Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,RowMajor> & params,float * result)3049 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, 3050 const FusedKernelParams<QuantizedStaticPreprocessedAsFloat, 3051 RowMajor>& params, 3052 float* result) { 3053 #ifdef DEBUG 3054 #ifdef DEBUG_METAGEMM_VERBOSE 3055 std::cout << __FILE__ << "(" << __LINE__ 3056 << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, " 3057 "QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 5, " 3058 "8>::Multiply()" 3059 << std::endl 3060 << std::flush; 3061 #endif 3062 #endif 3063 asm volatile( 3064 "prfm pldl1keep, [%x[lhs]]\n" 3065 "prfm pldl1keep, [%x[rhs]]\n" 3066 3067 // Clear aggregators. 3068 "movi v0.4s, #0\n" 3069 "movi v1.4s, #0\n" 3070 "movi v2.4s, #0\n" 3071 "mov v3.16b, v0.16b\n" 3072 "mov v4.16b, v1.16b\n" 3073 3074 // General 1xM lanes loop. 3075 "1:" 3076 3077 // Subtract counter. 3078 "subs %x[count], %x[count], #8\n" 3079 3080 "ld1 {v5.2s, v6.2s, v7.2s, v8.2s}, [%x[rhs]], #32\n" 3081 "ld1 {v9.2s}, [%x[lhs]], #8\n" 3082 "prfm pldl1keep, [%x[lhs], #64]\n" 3083 "umull v10.8h, v5.8b, v9.8b\n" 3084 "umull v11.8h, v6.8b, v9.8b\n" 3085 "umull v12.8h, v7.8b, v9.8b\n" 3086 "umull v13.8h, v8.8b, v9.8b\n" 3087 "ld1 {v5.2s}, [%x[rhs]], #8\n" 3088 "prfm pldl1keep, [%x[rhs], #128]\n" 3089 "uadalp v0.4s, v10.8h\n" 3090 "uadalp v1.4s, v11.8h\n" 3091 "uadalp v2.4s, v12.8h\n" 3092 "uadalp v3.4s, v13.8h\n" 3093 "umull v10.8h, v5.8b, v9.8b\n" 3094 "uadalp v4.4s, v10.8h\n" 3095 3096 // Loop break. 3097 "bgt 1b\n" 3098 3099 // StaticQuantizationFloat::Prepare 3100 "ld1 {v5.4s}, [%x[lhs]], #16\n" 3101 "ld1 {v6.4s, v7.4s}, [%x[rhs]], #32\n" 3102 "dup v8.4s, %w[scale]\n" 3103 "dup v5.4s, v5.s[0]\n" 3104 3105 // RowMajorOutput::Prepare 3106 3107 // Reduce aggregators. 3108 "addp v0.4s, v0.4s, v1.4s\n" 3109 "addp v2.4s, v2.4s, v3.4s\n" 3110 "addp v4.4s, v4.4s, v4.4s\n" 3111 "addp v0.4s, v0.4s, v2.4s\n" 3112 "addp v1.4s, v4.4s, v4.4s\n" 3113 3114 // StaticQuantizationFloat::Transform 3115 "add v0.4s, v0.4s, v5.4s\n" 3116 "add v1.4s, v1.4s, v5.4s\n" 3117 "add v0.4s, v0.4s, v6.4s\n" 3118 "add v1.4s, v1.4s, v7.4s\n" 3119 "scvtf v0.4s, v0.4s\n" 3120 "scvtf v1.4s, v1.4s\n" 3121 "fmul v0.4s, v0.4s, v8.4s\n" 3122 "fmul v1.4s, v1.4s, v8.4s\n" 3123 3124 // RowMajorOutput::Output 3125 "st1 {v0.4s}, [%x[result]], #16\n" 3126 "st1 {v1.s}[0], [%x[result]], #4\n" 3127 : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) 3128 : [count] "r"(params.kernel.count), 3129 [stride] "r"(params.output_stream.stride), 3130 [scale] "r"(params.kernel.scale) 3131 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", 3132 "v11", "v12", "v13", "cc", "memory"); 3133 } 3134 3135 template <> 3136 inline void MulKernel< 3137 uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 6, Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,RowMajor> & params,float * result)3138 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, 3139 const FusedKernelParams<QuantizedStaticPreprocessedAsFloat, 3140 RowMajor>& params, 3141 float* result) { 3142 #ifdef DEBUG 3143 #ifdef DEBUG_METAGEMM_VERBOSE 3144 std::cout << __FILE__ << "(" << __LINE__ 3145 << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, " 3146 "QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 6, " 3147 "8>::Multiply()" 3148 << std::endl 3149 << std::flush; 3150 #endif 3151 #endif 3152 asm volatile( 3153 "prfm pldl1keep, [%x[lhs]]\n" 3154 "prfm pldl1keep, [%x[rhs]]\n" 3155 3156 // Clear aggregators. 3157 "movi v0.4s, #0\n" 3158 "movi v1.4s, #0\n" 3159 "movi v2.4s, #0\n" 3160 "mov v3.16b, v0.16b\n" 3161 "mov v4.16b, v1.16b\n" 3162 "mov v5.16b, v2.16b\n" 3163 3164 // General 1xM lanes loop. 3165 "1:" 3166 3167 // Subtract counter. 3168 "subs %x[count], %x[count], #8\n" 3169 3170 "ld1 {v6.2s, v7.2s, v8.2s, v9.2s}, [%x[rhs]], #32\n" 3171 "ld1 {v10.2s}, [%x[lhs]], #8\n" 3172 "prfm pldl1keep, [%x[lhs], #64]\n" 3173 "umull v11.8h, v6.8b, v10.8b\n" 3174 "umull v12.8h, v7.8b, v10.8b\n" 3175 "umull v13.8h, v8.8b, v10.8b\n" 3176 "umull v14.8h, v9.8b, v10.8b\n" 3177 "ld1 {v6.2s, v7.2s}, [%x[rhs]], #16\n" 3178 "prfm pldl1keep, [%x[rhs], #128]\n" 3179 "uadalp v0.4s, v11.8h\n" 3180 "uadalp v1.4s, v12.8h\n" 3181 "uadalp v2.4s, v13.8h\n" 3182 "uadalp v3.4s, v14.8h\n" 3183 "umull v11.8h, v6.8b, v10.8b\n" 3184 "umull v12.8h, v7.8b, v10.8b\n" 3185 "uadalp v4.4s, v11.8h\n" 3186 "uadalp v5.4s, v12.8h\n" 3187 3188 // Loop break. 3189 "bgt 1b\n" 3190 3191 // StaticQuantizationFloat::Prepare 3192 "ld1 {v6.4s}, [%x[lhs]], #16\n" 3193 "ld1 {v7.4s, v8.4s}, [%x[rhs]], #32\n" 3194 "dup v9.4s, %w[scale]\n" 3195 "dup v6.4s, v6.s[0]\n" 3196 3197 // RowMajorOutput::Prepare 3198 3199 // Reduce aggregators. 3200 "addp v0.4s, v0.4s, v1.4s\n" 3201 "addp v2.4s, v2.4s, v3.4s\n" 3202 "addp v4.4s, v4.4s, v5.4s\n" 3203 "addp v0.4s, v0.4s, v2.4s\n" 3204 "addp v1.4s, v4.4s, v4.4s\n" 3205 3206 // StaticQuantizationFloat::Transform 3207 "add v0.4s, v0.4s, v6.4s\n" 3208 "add v1.4s, v1.4s, v6.4s\n" 3209 "add v0.4s, v0.4s, v7.4s\n" 3210 "add v1.4s, v1.4s, v8.4s\n" 3211 "scvtf v0.4s, v0.4s\n" 3212 "scvtf v1.4s, v1.4s\n" 3213 "fmul v0.4s, v0.4s, v9.4s\n" 3214 "fmul v1.4s, v1.4s, v9.4s\n" 3215 3216 // RowMajorOutput::Output 3217 "st1 {v0.4s}, [%x[result]], #16\n" 3218 "st1 {v1.2s}, [%x[result]], #8\n" 3219 : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) 3220 : [count] "r"(params.kernel.count), 3221 [stride] "r"(params.output_stream.stride), 3222 [scale] "r"(params.kernel.scale) 3223 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", 3224 "v11", "v12", "v13", "v14", "cc", "memory"); 3225 } 3226 3227 template <> 3228 inline void MulKernel< 3229 uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 7, Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,RowMajor> & params,float * result)3230 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, 3231 const FusedKernelParams<QuantizedStaticPreprocessedAsFloat, 3232 RowMajor>& params, 3233 float* result) { 3234 #ifdef DEBUG 3235 #ifdef DEBUG_METAGEMM_VERBOSE 3236 std::cout << __FILE__ << "(" << __LINE__ 3237 << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, " 3238 "QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 7, " 3239 "8>::Multiply()" 3240 << std::endl 3241 << std::flush; 3242 #endif 3243 #endif 3244 asm volatile( 3245 "prfm pldl1keep, [%x[lhs]]\n" 3246 "prfm pldl1keep, [%x[rhs]]\n" 3247 3248 // Clear aggregators. 3249 "movi v0.4s, #0\n" 3250 "movi v1.4s, #0\n" 3251 "movi v2.4s, #0\n" 3252 "mov v3.16b, v0.16b\n" 3253 "mov v4.16b, v1.16b\n" 3254 "mov v5.16b, v2.16b\n" 3255 "mov v6.16b, v3.16b\n" 3256 3257 // General 1xM lanes loop. 3258 "1:" 3259 3260 // Subtract counter. 3261 "subs %x[count], %x[count], #8\n" 3262 3263 "ld1 {v7.2s, v8.2s, v9.2s, v10.2s}, [%x[rhs]], #32\n" 3264 "ld1 {v11.2s}, [%x[lhs]], #8\n" 3265 "prfm pldl1keep, [%x[lhs], #64]\n" 3266 "umull v12.8h, v7.8b, v11.8b\n" 3267 "umull v13.8h, v8.8b, v11.8b\n" 3268 "umull v14.8h, v9.8b, v11.8b\n" 3269 "umull v15.8h, v10.8b, v11.8b\n" 3270 "ld1 {v7.2s, v8.2s, v9.2s}, [%x[rhs]], #24\n" 3271 "prfm pldl1keep, [%x[rhs], #128]\n" 3272 "uadalp v0.4s, v12.8h\n" 3273 "uadalp v1.4s, v13.8h\n" 3274 "uadalp v2.4s, v14.8h\n" 3275 "uadalp v3.4s, v15.8h\n" 3276 "umull v12.8h, v7.8b, v11.8b\n" 3277 "umull v13.8h, v8.8b, v11.8b\n" 3278 "umull v14.8h, v9.8b, v11.8b\n" 3279 "uadalp v4.4s, v12.8h\n" 3280 "uadalp v5.4s, v13.8h\n" 3281 "uadalp v6.4s, v14.8h\n" 3282 3283 // Loop break. 3284 "bgt 1b\n" 3285 3286 // StaticQuantizationFloat::Prepare 3287 "ld1 {v7.4s}, [%x[lhs]], #16\n" 3288 "ld1 {v8.4s, v9.4s}, [%x[rhs]], #32\n" 3289 "dup v10.4s, %w[scale]\n" 3290 "dup v7.4s, v7.s[0]\n" 3291 3292 // RowMajorOutput::Prepare 3293 3294 // Reduce aggregators. 3295 "addp v0.4s, v0.4s, v1.4s\n" 3296 "addp v2.4s, v2.4s, v3.4s\n" 3297 "addp v4.4s, v4.4s, v5.4s\n" 3298 "addp v6.4s, v6.4s, v6.4s\n" 3299 "addp v0.4s, v0.4s, v2.4s\n" 3300 "addp v1.4s, v4.4s, v6.4s\n" 3301 3302 // StaticQuantizationFloat::Transform 3303 "add v0.4s, v0.4s, v7.4s\n" 3304 "add v1.4s, v1.4s, v7.4s\n" 3305 "add v0.4s, v0.4s, v8.4s\n" 3306 "add v1.4s, v1.4s, v9.4s\n" 3307 "scvtf v0.4s, v0.4s\n" 3308 "scvtf v1.4s, v1.4s\n" 3309 "fmul v0.4s, v0.4s, v10.4s\n" 3310 "fmul v1.4s, v1.4s, v10.4s\n" 3311 3312 // RowMajorOutput::Output 3313 "st1 {v0.4s}, [%x[result]], #16\n" 3314 "st1 {v1.2s}, [%x[result]], #8\n" 3315 "st1 {v1.s}[2], [%x[result]], #4\n" 3316 : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) 3317 : [count] "r"(params.kernel.count), 3318 [stride] "r"(params.output_stream.stride), 3319 [scale] "r"(params.kernel.scale) 3320 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", 3321 "v11", "v12", "v13", "v14", "v15", "cc", "memory"); 3322 } 3323 3324 template <> 3325 inline void MulKernel< 3326 uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 8, Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,RowMajor> & params,float * result)3327 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, 3328 const FusedKernelParams<QuantizedStaticPreprocessedAsFloat, 3329 RowMajor>& params, 3330 float* result) { 3331 #ifdef DEBUG 3332 #ifdef DEBUG_METAGEMM_VERBOSE 3333 std::cout << __FILE__ << "(" << __LINE__ 3334 << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, " 3335 "QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 8, " 3336 "8>::Multiply()" 3337 << std::endl 3338 << std::flush; 3339 #endif 3340 #endif 3341 asm volatile( 3342 "prfm pldl1keep, [%x[lhs]]\n" 3343 "prfm pldl1keep, [%x[rhs]]\n" 3344 3345 // Clear aggregators. 3346 "movi v0.4s, #0\n" 3347 "movi v1.4s, #0\n" 3348 "movi v2.4s, #0\n" 3349 "mov v3.16b, v0.16b\n" 3350 "mov v4.16b, v1.16b\n" 3351 "mov v5.16b, v2.16b\n" 3352 "mov v6.16b, v3.16b\n" 3353 "mov v7.16b, v4.16b\n" 3354 3355 // 1x8 lanes loop. 3356 "1:" 3357 3358 "ld1 {v9.2s, v10.2s, v11.2s, v12.2s}, [%x[rhs]], #32\n" 3359 "ld1 {v8.2s}, [%x[lhs]], #8\n" 3360 "umull v13.8h, v8.8b, v9.8b\n" 3361 "umull v14.8h, v8.8b, v10.8b\n" 3362 "umull v15.8h, v8.8b, v11.8b\n" 3363 "umull v16.8h, v8.8b, v12.8b\n" 3364 "ld1 {v9.2s, v10.2s, v11.2s, v12.2s}, [%x[rhs]], #32\n" 3365 "uadalp v0.4s, v13.8h\n" 3366 "uadalp v1.4s, v14.8h\n" 3367 "uadalp v2.4s, v15.8h\n" 3368 "uadalp v3.4s, v16.8h\n" 3369 "prfm pldl1keep, [%x[rhs], #256]\n" 3370 "umull v17.8h, v8.8b, v9.8b\n" 3371 "umull v13.8h, v8.8b, v10.8b\n" 3372 "umull v14.8h, v8.8b, v11.8b\n" 3373 "umull v15.8h, v8.8b, v12.8b\n" 3374 "prfm pldl1keep, [%x[lhs], #32]\n" 3375 3376 // Subtract counter. 3377 "subs %x[count], %x[count], #8\n" 3378 3379 "uadalp v4.4s, v17.8h\n" 3380 "uadalp v5.4s, v13.8h\n" 3381 "uadalp v6.4s, v14.8h\n" 3382 "uadalp v7.4s, v15.8h\n" 3383 3384 // Loop break. 3385 "bgt 1b\n" 3386 3387 // StaticQuantizationFloat::Prepare 3388 "ld1 {v8.4s}, [%x[lhs]], #16\n" 3389 "ld1 {v9.4s, v10.4s}, [%x[rhs]], #32\n" 3390 "dup v11.4s, %w[scale]\n" 3391 "dup v8.4s, v8.s[0]\n" 3392 3393 // RowMajorOutput::Prepare 3394 3395 // Reduce aggregators. 3396 "addp v0.4s, v0.4s, v1.4s\n" 3397 "addp v2.4s, v2.4s, v3.4s\n" 3398 "addp v4.4s, v4.4s, v5.4s\n" 3399 "addp v6.4s, v6.4s, v7.4s\n" 3400 "addp v0.4s, v0.4s, v2.4s\n" 3401 "addp v1.4s, v4.4s, v6.4s\n" 3402 3403 // StaticQuantizationFloat::Transform 3404 "add v0.4s, v0.4s, v8.4s\n" 3405 "add v1.4s, v1.4s, v8.4s\n" 3406 "add v0.4s, v0.4s, v9.4s\n" 3407 "add v1.4s, v1.4s, v10.4s\n" 3408 "scvtf v0.4s, v0.4s\n" 3409 "scvtf v1.4s, v1.4s\n" 3410 "fmul v0.4s, v0.4s, v11.4s\n" 3411 "fmul v1.4s, v1.4s, v11.4s\n" 3412 3413 // RowMajorOutput::Output 3414 "st1 {v0.4s, v1.4s}, [%x[result]], #32\n" 3415 : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) 3416 : [count] "r"(params.kernel.count), 3417 [stride] "r"(params.output_stream.stride), 3418 [scale] "r"(params.kernel.scale) 3419 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", 3420 "v11", "v12", "v13", "v14", "v15", "v16", "v17", "cc", "memory"); 3421 } 3422 3423 template <> 3424 inline void MulKernel< 3425 uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 2, 1, Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,RowMajor> & params,float * result)3426 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, 3427 const FusedKernelParams<QuantizedStaticPreprocessedAsFloat, 3428 RowMajor>& params, 3429 float* result) { 3430 #ifdef DEBUG 3431 #ifdef DEBUG_METAGEMM_VERBOSE 3432 std::cout << __FILE__ << "(" << __LINE__ 3433 << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, " 3434 "QuantizedStaticPreprocessedAsFloat, RowMajor, 2, 1, " 3435 "8>::Multiply()" 3436 << std::endl 3437 << std::flush; 3438 #endif 3439 #endif 3440 asm volatile( 3441 "prfm pldl1keep, [%x[lhs]]\n" 3442 "prfm pldl1keep, [%x[rhs]]\n" 3443 3444 // Clear aggregators. 3445 "movi v0.4s, #0\n" 3446 "movi v1.4s, #0\n" 3447 3448 // General NxM lanes loop. 3449 "1:" 3450 3451 // Subtract counter. 3452 "subs %x[count], %x[count], #8\n" 3453 3454 "ld1 {v2.2s, v3.2s}, [%x[lhs]], #16\n" 3455 "ld1 {v4.2s}, [%x[rhs]], #8\n" 3456 "prfm pldl1keep, [%x[lhs], #64]\n" 3457 "prfm pldl1keep, [%x[rhs], #64]\n" 3458 "umull v5.8h, v4.8b, v2.8b\n" 3459 "umull v6.8h, v4.8b, v3.8b\n" 3460 "uadalp v0.4s, v5.8h\n" 3461 "uadalp v1.4s, v6.8h\n" 3462 3463 // Loop break. 3464 "bgt 1b\n" 3465 3466 // StaticQuantizationFloat::Prepare 3467 "ld1 {v4.4s}, [%x[lhs]], #16\n" 3468 "ld1 {v5.4s}, [%x[rhs]], #16\n" 3469 "dup v6.4s, %w[scale]\n" 3470 "dup v2.4s, v4.s[0]\n" 3471 "dup v4.4s, v4.s[1]\n" 3472 3473 // RowMajorOutput::Prepare 3474 "add x0, %x[result], %x[stride]\n" 3475 3476 // Reduce aggregators. 3477 "addp v0.4s, v0.4s, v0.4s\n" 3478 "addp v0.4s, v0.4s, v0.4s\n" 3479 "addp v1.4s, v1.4s, v1.4s\n" 3480 "addp v1.4s, v1.4s, v1.4s\n" 3481 3482 // StaticQuantizationFloat::Transform 3483 "add v0.4s, v0.4s, v2.4s\n" 3484 "add v1.4s, v1.4s, v4.4s\n" 3485 "add v0.4s, v0.4s, v5.4s\n" 3486 "add v1.4s, v1.4s, v5.4s\n" 3487 "scvtf v0.4s, v0.4s\n" 3488 "scvtf v1.4s, v1.4s\n" 3489 "fmul v0.4s, v0.4s, v6.4s\n" 3490 "fmul v1.4s, v1.4s, v6.4s\n" 3491 3492 // RowMajorOutput::Output 3493 "st1 {v0.s}[0], [%x[result]], #4\n" 3494 "st1 {v1.s}[0], [x0], #4\n" 3495 : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) 3496 : [count] "r"(params.kernel.count), 3497 [stride] "r"(params.output_stream.stride), 3498 [scale] "r"(params.kernel.scale) 3499 : "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory"); 3500 } 3501 3502 template <> 3503 inline void MulKernel< 3504 uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 2, 2, Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,RowMajor> & params,float * result)3505 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, 3506 const FusedKernelParams<QuantizedStaticPreprocessedAsFloat, 3507 RowMajor>& params, 3508 float* result) { 3509 #ifdef DEBUG 3510 #ifdef DEBUG_METAGEMM_VERBOSE 3511 std::cout << __FILE__ << "(" << __LINE__ 3512 << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, " 3513 "QuantizedStaticPreprocessedAsFloat, RowMajor, 2, 2, " 3514 "8>::Multiply()" 3515 << std::endl 3516 << std::flush; 3517 #endif 3518 #endif 3519 asm volatile( 3520 "prfm pldl1keep, [%x[lhs]]\n" 3521 "prfm pldl1keep, [%x[rhs]]\n" 3522 3523 // Clear aggregators. 3524 "movi v0.4s, #0\n" 3525 "movi v1.4s, #0\n" 3526 "movi v2.4s, #0\n" 3527 "mov v3.16b, v0.16b\n" 3528 3529 // General NxM lanes loop. 3530 "1:" 3531 3532 // Subtract counter. 3533 "subs %x[count], %x[count], #8\n" 3534 3535 "ld1 {v4.2s, v5.2s}, [%x[lhs]], #16\n" 3536 "ld1 {v6.2s, v7.2s}, [%x[rhs]], #16\n" 3537 "prfm pldl1keep, [%x[lhs], #64]\n" 3538 "prfm pldl1keep, [%x[rhs], #64]\n" 3539 "umull v8.8h, v6.8b, v4.8b\n" 3540 "umull v9.8h, v7.8b, v4.8b\n" 3541 "umull v10.8h, v6.8b, v5.8b\n" 3542 "umull v11.8h, v7.8b, v5.8b\n" 3543 "uadalp v0.4s, v8.8h\n" 3544 "uadalp v1.4s, v9.8h\n" 3545 "uadalp v2.4s, v10.8h\n" 3546 "uadalp v3.4s, v11.8h\n" 3547 3548 // Loop break. 3549 "bgt 1b\n" 3550 3551 // StaticQuantizationFloat::Prepare 3552 "ld1 {v4.4s}, [%x[lhs]], #16\n" 3553 "ld1 {v5.4s}, [%x[rhs]], #16\n" 3554 "dup v6.4s, %w[scale]\n" 3555 "dup v7.4s, v4.s[0]\n" 3556 "dup v4.4s, v4.s[1]\n" 3557 3558 // RowMajorOutput::Prepare 3559 "add x0, %x[result], %x[stride]\n" 3560 3561 // Reduce aggregators. 3562 "addp v0.4s, v0.4s, v1.4s\n" 3563 "addp v0.4s, v0.4s, v0.4s\n" 3564 "addp v2.4s, v2.4s, v3.4s\n" 3565 "addp v2.4s, v2.4s, v2.4s\n" 3566 3567 // StaticQuantizationFloat::Transform 3568 "add v0.4s, v0.4s, v7.4s\n" 3569 "add v2.4s, v2.4s, v4.4s\n" 3570 "add v0.4s, v0.4s, v5.4s\n" 3571 "add v2.4s, v2.4s, v5.4s\n" 3572 "scvtf v0.4s, v0.4s\n" 3573 "scvtf v2.4s, v2.4s\n" 3574 "fmul v0.4s, v0.4s, v6.4s\n" 3575 "fmul v2.4s, v2.4s, v6.4s\n" 3576 3577 // RowMajorOutput::Output 3578 "st1 {v0.2s}, [%x[result]], #8\n" 3579 "st1 {v2.2s}, [x0], #8\n" 3580 : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) 3581 : [count] "r"(params.kernel.count), 3582 [stride] "r"(params.output_stream.stride), 3583 [scale] "r"(params.kernel.scale) 3584 : "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", 3585 "v11", "cc", "memory"); 3586 } 3587 3588 template <> 3589 inline void MulKernel< 3590 uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 2, 3, Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,RowMajor> & params,float * result)3591 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, 3592 const FusedKernelParams<QuantizedStaticPreprocessedAsFloat, 3593 RowMajor>& params, 3594 float* result) { 3595 #ifdef DEBUG 3596 #ifdef DEBUG_METAGEMM_VERBOSE 3597 std::cout << __FILE__ << "(" << __LINE__ 3598 << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, " 3599 "QuantizedStaticPreprocessedAsFloat, RowMajor, 2, 3, " 3600 "8>::Multiply()" 3601 << std::endl 3602 << std::flush; 3603 #endif 3604 #endif 3605 asm volatile( 3606 "prfm pldl1keep, [%x[lhs]]\n" 3607 "prfm pldl1keep, [%x[rhs]]\n" 3608 3609 // Clear aggregators. 3610 "movi v0.4s, #0\n" 3611 "movi v1.4s, #0\n" 3612 "movi v2.4s, #0\n" 3613 "mov v3.16b, v0.16b\n" 3614 "mov v4.16b, v1.16b\n" 3615 "mov v5.16b, v2.16b\n" 3616 3617 // General NxM lanes loop. 3618 "1:" 3619 3620 // Subtract counter. 3621 "subs %x[count], %x[count], #8\n" 3622 3623 "ld1 {v6.2s, v7.2s}, [%x[lhs]], #16\n" 3624 "ld1 {v8.2s, v9.2s, v10.2s}, [%x[rhs]], #24\n" 3625 "prfm pldl1keep, [%x[lhs], #64]\n" 3626 "prfm pldl1keep, [%x[rhs], #64]\n" 3627 "umull v11.8h, v8.8b, v6.8b\n" 3628 "umull v12.8h, v9.8b, v6.8b\n" 3629 "umull v13.8h, v10.8b, v6.8b\n" 3630 "umull v14.8h, v8.8b, v7.8b\n" 3631 "umull v15.8h, v9.8b, v7.8b\n" 3632 "umull v16.8h, v10.8b, v7.8b\n" 3633 "uadalp v0.4s, v11.8h\n" 3634 "uadalp v1.4s, v12.8h\n" 3635 "uadalp v2.4s, v13.8h\n" 3636 "uadalp v3.4s, v14.8h\n" 3637 "uadalp v4.4s, v15.8h\n" 3638 "uadalp v5.4s, v16.8h\n" 3639 3640 // Loop break. 3641 "bgt 1b\n" 3642 3643 // StaticQuantizationFloat::Prepare 3644 "ld1 {v6.4s}, [%x[lhs]], #16\n" 3645 "ld1 {v7.4s}, [%x[rhs]], #16\n" 3646 "dup v8.4s, %w[scale]\n" 3647 "dup v9.4s, v6.s[0]\n" 3648 "dup v6.4s, v6.s[1]\n" 3649 3650 // RowMajorOutput::Prepare 3651 "add x0, %x[result], %x[stride]\n" 3652 3653 // Reduce aggregators. 3654 "addp v0.4s, v0.4s, v1.4s\n" 3655 "addp v2.4s, v2.4s, v2.4s\n" 3656 "addp v0.4s, v0.4s, v2.4s\n" 3657 "addp v3.4s, v3.4s, v4.4s\n" 3658 "addp v5.4s, v5.4s, v5.4s\n" 3659 "addp v3.4s, v3.4s, v5.4s\n" 3660 3661 // StaticQuantizationFloat::Transform 3662 "add v0.4s, v0.4s, v9.4s\n" 3663 "add v3.4s, v3.4s, v6.4s\n" 3664 "add v0.4s, v0.4s, v7.4s\n" 3665 "add v3.4s, v3.4s, v7.4s\n" 3666 "scvtf v0.4s, v0.4s\n" 3667 "scvtf v3.4s, v3.4s\n" 3668 "fmul v0.4s, v0.4s, v8.4s\n" 3669 "fmul v3.4s, v3.4s, v8.4s\n" 3670 3671 // RowMajorOutput::Output 3672 "st1 {v0.2s}, [%x[result]], #8\n" 3673 "st1 {v0.s}[2], [%x[result]], #4\n" 3674 "st1 {v3.2s}, [x0], #8\n" 3675 "st1 {v3.s}[2], [x0], #4\n" 3676 : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) 3677 : [count] "r"(params.kernel.count), 3678 [stride] "r"(params.output_stream.stride), 3679 [scale] "r"(params.kernel.scale) 3680 : "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", 3681 "v11", "v12", "v13", "v14", "v15", "v16", "cc", "memory"); 3682 } 3683 3684 template <> 3685 inline void MulKernel< 3686 uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 2, 4, Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,RowMajor> & params,float * result)3687 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, 3688 const FusedKernelParams<QuantizedStaticPreprocessedAsFloat, 3689 RowMajor>& params, 3690 float* result) { 3691 #ifdef DEBUG 3692 #ifdef DEBUG_METAGEMM_VERBOSE 3693 std::cout << __FILE__ << "(" << __LINE__ 3694 << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, " 3695 "QuantizedStaticPreprocessedAsFloat, RowMajor, 2, 4, " 3696 "8>::Multiply()" 3697 << std::endl 3698 << std::flush; 3699 #endif 3700 #endif 3701 asm volatile( 3702 "prfm pldl1keep, [%x[lhs]]\n" 3703 "prfm pldl1keep, [%x[rhs]]\n" 3704 3705 // Clear aggregators. 3706 "movi v0.4s, #0\n" 3707 "movi v1.4s, #0\n" 3708 "movi v2.4s, #0\n" 3709 "mov v3.16b, v0.16b\n" 3710 "mov v4.16b, v1.16b\n" 3711 "mov v5.16b, v2.16b\n" 3712 "mov v6.16b, v3.16b\n" 3713 "mov v7.16b, v4.16b\n" 3714 3715 // 2x4 lanes loop. 3716 "1:" 3717 3718 "ld1 {v10.8b, v11.8b, v12.8b, v13.8b}, [%x[rhs]], #32\n" 3719 "ld1 {v8.8b}, [%x[lhs]], #8\n" 3720 "umull v14.8h, v8.8b, v10.8b\n" 3721 "ld1 {v9.8b}, [%x[lhs]], #8\n" 3722 "umull v15.8h, v8.8b, v11.8b\n" 3723 "prfm pldl1keep, [%x[rhs], #64]\n" 3724 "umull v16.8h, v8.8b, v12.8b\n" 3725 "prfm pldl1keep, [%x[lhs], #64]\n" 3726 "umull v17.8h, v8.8b, v13.8b\n" 3727 "umull v18.8h, v9.8b, v10.8b\n" 3728 "uadalp v0.4s, v14.8h\n" 3729 "uadalp v1.4s, v15.8h\n" 3730 "uadalp v2.4s, v16.8h\n" 3731 "umull v14.8h, v9.8b, v11.8b\n" 3732 "umull v15.8h, v9.8b, v12.8b\n" 3733 "umull v16.8h, v9.8b, v13.8b\n" 3734 3735 // Subtract counter. 3736 "subs %x[count], %x[count], #8\n" 3737 3738 "uadalp v3.4s, v17.8h\n" 3739 "uadalp v4.4s, v18.8h\n" 3740 "uadalp v5.4s, v14.8h\n" 3741 "uadalp v6.4s, v15.8h\n" 3742 "uadalp v7.4s, v16.8h\n" 3743 3744 // Loop break. 3745 "bgt 1b\n" 3746 3747 // StaticQuantizationFloat::Prepare 3748 "ld1 {v8.4s}, [%x[lhs]], #16\n" 3749 "ld1 {v9.4s}, [%x[rhs]], #16\n" 3750 "dup v10.4s, %w[scale]\n" 3751 "dup v11.4s, v8.s[0]\n" 3752 "dup v8.4s, v8.s[1]\n" 3753 3754 // RowMajorOutput::Prepare 3755 "add x0, %x[result], %x[stride]\n" 3756 3757 // Reduce aggregators. 3758 "addp v0.4s, v0.4s, v1.4s\n" 3759 "addp v2.4s, v2.4s, v3.4s\n" 3760 "addp v0.4s, v0.4s, v2.4s\n" 3761 "addp v4.4s, v4.4s, v5.4s\n" 3762 "addp v6.4s, v6.4s, v7.4s\n" 3763 "addp v4.4s, v4.4s, v6.4s\n" 3764 3765 // StaticQuantizationFloat::Transform 3766 "add v0.4s, v0.4s, v11.4s\n" 3767 "add v4.4s, v4.4s, v8.4s\n" 3768 "add v0.4s, v0.4s, v9.4s\n" 3769 "add v4.4s, v4.4s, v9.4s\n" 3770 "scvtf v0.4s, v0.4s\n" 3771 "scvtf v4.4s, v4.4s\n" 3772 "fmul v0.4s, v0.4s, v10.4s\n" 3773 "fmul v4.4s, v4.4s, v10.4s\n" 3774 3775 // RowMajorOutput::Output 3776 "st1 {v0.4s}, [%x[result]], #16\n" 3777 "st1 {v4.4s}, [x0], #16\n" 3778 : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) 3779 : [count] "r"(params.kernel.count), 3780 [stride] "r"(params.output_stream.stride), 3781 [scale] "r"(params.kernel.scale) 3782 : "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", 3783 "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "cc", "memory"); 3784 } 3785 3786 template <> 3787 inline void MulKernel< 3788 uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 3, 1, Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,RowMajor> & params,float * result)3789 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, 3790 const FusedKernelParams<QuantizedStaticPreprocessedAsFloat, 3791 RowMajor>& params, 3792 float* result) { 3793 #ifdef DEBUG 3794 #ifdef DEBUG_METAGEMM_VERBOSE 3795 std::cout << __FILE__ << "(" << __LINE__ 3796 << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, " 3797 "QuantizedStaticPreprocessedAsFloat, RowMajor, 3, 1, " 3798 "8>::Multiply()" 3799 << std::endl 3800 << std::flush; 3801 #endif 3802 #endif 3803 asm volatile( 3804 "prfm pldl1keep, [%x[lhs]]\n" 3805 "prfm pldl1keep, [%x[rhs]]\n" 3806 3807 // Clear aggregators. 3808 "movi v0.4s, #0\n" 3809 "movi v1.4s, #0\n" 3810 "movi v2.4s, #0\n" 3811 3812 // General NxM lanes loop. 3813 "1:" 3814 3815 // Subtract counter. 3816 "subs %x[count], %x[count], #8\n" 3817 3818 "ld1 {v3.2s, v4.2s, v5.2s}, [%x[lhs]], #24\n" 3819 "ld1 {v6.2s}, [%x[rhs]], #8\n" 3820 "prfm pldl1keep, [%x[lhs], #64]\n" 3821 "prfm pldl1keep, [%x[rhs], #64]\n" 3822 "umull v7.8h, v6.8b, v3.8b\n" 3823 "umull v8.8h, v6.8b, v4.8b\n" 3824 "umull v9.8h, v6.8b, v5.8b\n" 3825 "uadalp v0.4s, v7.8h\n" 3826 "uadalp v1.4s, v8.8h\n" 3827 "uadalp v2.4s, v9.8h\n" 3828 3829 // Loop break. 3830 "bgt 1b\n" 3831 3832 // StaticQuantizationFloat::Prepare 3833 "ld1 {v4.4s}, [%x[lhs]], #16\n" 3834 "ld1 {v5.4s}, [%x[rhs]], #16\n" 3835 "dup v6.4s, %w[scale]\n" 3836 "dup v3.4s, v4.s[0]\n" 3837 "dup v7.4s, v4.s[1]\n" 3838 "dup v4.4s, v4.s[2]\n" 3839 3840 // RowMajorOutput::Prepare 3841 "add x0, %x[result], %x[stride]\n" 3842 "add x1, x0, %x[stride]\n" 3843 3844 // Reduce aggregators. 3845 "addp v0.4s, v0.4s, v0.4s\n" 3846 "addp v0.4s, v0.4s, v0.4s\n" 3847 "addp v1.4s, v1.4s, v1.4s\n" 3848 "addp v1.4s, v1.4s, v1.4s\n" 3849 "addp v2.4s, v2.4s, v2.4s\n" 3850 "addp v2.4s, v2.4s, v2.4s\n" 3851 3852 // StaticQuantizationFloat::Transform 3853 "add v0.4s, v0.4s, v3.4s\n" 3854 "add v1.4s, v1.4s, v7.4s\n" 3855 "add v2.4s, v2.4s, v4.4s\n" 3856 "add v0.4s, v0.4s, v5.4s\n" 3857 "add v1.4s, v1.4s, v5.4s\n" 3858 "add v2.4s, v2.4s, v5.4s\n" 3859 "scvtf v0.4s, v0.4s\n" 3860 "scvtf v1.4s, v1.4s\n" 3861 "scvtf v2.4s, v2.4s\n" 3862 "fmul v0.4s, v0.4s, v6.4s\n" 3863 "fmul v1.4s, v1.4s, v6.4s\n" 3864 "fmul v2.4s, v2.4s, v6.4s\n" 3865 3866 // RowMajorOutput::Output 3867 "st1 {v0.s}[0], [%x[result]], #4\n" 3868 "st1 {v1.s}[0], [x0], #4\n" 3869 "st1 {v2.s}[0], [x1], #4\n" 3870 : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) 3871 : [count] "r"(params.kernel.count), 3872 [stride] "r"(params.output_stream.stride), 3873 [scale] "r"(params.kernel.scale) 3874 : "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", 3875 "cc", "memory"); 3876 } 3877 3878 template <> 3879 inline void MulKernel< 3880 uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 3, 2, Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,RowMajor> & params,float * result)3881 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, 3882 const FusedKernelParams<QuantizedStaticPreprocessedAsFloat, 3883 RowMajor>& params, 3884 float* result) { 3885 #ifdef DEBUG 3886 #ifdef DEBUG_METAGEMM_VERBOSE 3887 std::cout << __FILE__ << "(" << __LINE__ 3888 << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, " 3889 "QuantizedStaticPreprocessedAsFloat, RowMajor, 3, 2, " 3890 "8>::Multiply()" 3891 << std::endl 3892 << std::flush; 3893 #endif 3894 #endif 3895 asm volatile( 3896 "prfm pldl1keep, [%x[lhs]]\n" 3897 "prfm pldl1keep, [%x[rhs]]\n" 3898 3899 // Clear aggregators. 3900 "movi v0.4s, #0\n" 3901 "movi v1.4s, #0\n" 3902 "movi v2.4s, #0\n" 3903 "mov v3.16b, v0.16b\n" 3904 "mov v4.16b, v1.16b\n" 3905 "mov v5.16b, v2.16b\n" 3906 3907 // General NxM lanes loop. 3908 "1:" 3909 3910 // Subtract counter. 3911 "subs %x[count], %x[count], #8\n" 3912 3913 "ld1 {v6.2s, v7.2s, v8.2s}, [%x[lhs]], #24\n" 3914 "ld1 {v9.2s, v10.2s}, [%x[rhs]], #16\n" 3915 "prfm pldl1keep, [%x[lhs], #64]\n" 3916 "prfm pldl1keep, [%x[rhs], #64]\n" 3917 "umull v11.8h, v9.8b, v6.8b\n" 3918 "umull v12.8h, v10.8b, v6.8b\n" 3919 "umull v13.8h, v9.8b, v7.8b\n" 3920 "umull v14.8h, v10.8b, v7.8b\n" 3921 "umull v15.8h, v9.8b, v8.8b\n" 3922 "umull v16.8h, v10.8b, v8.8b\n" 3923 "uadalp v0.4s, v11.8h\n" 3924 "uadalp v1.4s, v12.8h\n" 3925 "uadalp v2.4s, v13.8h\n" 3926 "uadalp v3.4s, v14.8h\n" 3927 "uadalp v4.4s, v15.8h\n" 3928 "uadalp v5.4s, v16.8h\n" 3929 3930 // Loop break. 3931 "bgt 1b\n" 3932 3933 // StaticQuantizationFloat::Prepare 3934 "ld1 {v6.4s}, [%x[lhs]], #16\n" 3935 "ld1 {v7.4s}, [%x[rhs]], #16\n" 3936 "dup v8.4s, %w[scale]\n" 3937 "dup v9.4s, v6.s[0]\n" 3938 "dup v10.4s, v6.s[1]\n" 3939 "dup v6.4s, v6.s[2]\n" 3940 3941 // RowMajorOutput::Prepare 3942 "add x0, %x[result], %x[stride]\n" 3943 "add x1, x0, %x[stride]\n" 3944 3945 // Reduce aggregators. 3946 "addp v0.4s, v0.4s, v1.4s\n" 3947 "addp v0.4s, v0.4s, v0.4s\n" 3948 "addp v2.4s, v2.4s, v3.4s\n" 3949 "addp v2.4s, v2.4s, v2.4s\n" 3950 "addp v4.4s, v4.4s, v5.4s\n" 3951 "addp v4.4s, v4.4s, v4.4s\n" 3952 3953 // StaticQuantizationFloat::Transform 3954 "add v0.4s, v0.4s, v9.4s\n" 3955 "add v2.4s, v2.4s, v10.4s\n" 3956 "add v4.4s, v4.4s, v6.4s\n" 3957 "add v0.4s, v0.4s, v7.4s\n" 3958 "add v2.4s, v2.4s, v7.4s\n" 3959 "add v4.4s, v4.4s, v7.4s\n" 3960 "scvtf v0.4s, v0.4s\n" 3961 "scvtf v2.4s, v2.4s\n" 3962 "scvtf v4.4s, v4.4s\n" 3963 "fmul v0.4s, v0.4s, v8.4s\n" 3964 "fmul v2.4s, v2.4s, v8.4s\n" 3965 "fmul v4.4s, v4.4s, v8.4s\n" 3966 3967 // RowMajorOutput::Output 3968 "st1 {v0.2s}, [%x[result]], #8\n" 3969 "st1 {v2.2s}, [x0], #8\n" 3970 "st1 {v4.2s}, [x1], #8\n" 3971 : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) 3972 : [count] "r"(params.kernel.count), 3973 [stride] "r"(params.output_stream.stride), 3974 [scale] "r"(params.kernel.scale) 3975 : "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", 3976 "v10", "v11", "v12", "v13", "v14", "v15", "v16", "cc", "memory"); 3977 } 3978 3979 template <> 3980 inline void MulKernel< 3981 uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 3, 3, Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,RowMajor> & params,float * result)3982 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, 3983 const FusedKernelParams<QuantizedStaticPreprocessedAsFloat, 3984 RowMajor>& params, 3985 float* result) { 3986 #ifdef DEBUG 3987 #ifdef DEBUG_METAGEMM_VERBOSE 3988 std::cout << __FILE__ << "(" << __LINE__ 3989 << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, " 3990 "QuantizedStaticPreprocessedAsFloat, RowMajor, 3, 3, " 3991 "8>::Multiply()" 3992 << std::endl 3993 << std::flush; 3994 #endif 3995 #endif 3996 asm volatile( 3997 "prfm pldl1keep, [%x[lhs]]\n" 3998 "prfm pldl1keep, [%x[rhs]]\n" 3999 4000 // Clear aggregators. 4001 "movi v0.4s, #0\n" 4002 "movi v1.4s, #0\n" 4003 "movi v2.4s, #0\n" 4004 "mov v3.16b, v0.16b\n" 4005 "mov v4.16b, v1.16b\n" 4006 "mov v5.16b, v2.16b\n" 4007 "mov v6.16b, v3.16b\n" 4008 "mov v7.16b, v4.16b\n" 4009 "mov v8.16b, v5.16b\n" 4010 4011 // 3x3 lanes loop. 4012 "1:" 4013 4014 "ld1 {v12.8b, v13.8b, v14.8b}, [%x[rhs]], #24\n" 4015 "ld1 {v9.8b}, [%x[lhs]], #8\n" 4016 "umull v15.8h, v9.8b, v12.8b\n" 4017 "ld1 {v10.8b}, [%x[lhs]], #8\n" 4018 "umull v16.8h, v9.8b, v13.8b\n" 4019 "ld1 {v11.8b}, [%x[lhs]], #8\n" 4020 "umull v17.8h, v9.8b, v14.8b\n" 4021 "prfm pldl1keep, [%x[lhs], #64]\n" 4022 "umull v18.8h, v10.8b, v12.8b\n" 4023 "prfm pldl1keep, [%x[rhs], #64]\n" 4024 "uadalp v0.4s, v15.8h\n" 4025 "uadalp v1.4s, v16.8h\n" 4026 "uadalp v2.4s, v17.8h\n" 4027 "uadalp v3.4s, v18.8h\n" 4028 "umull v15.8h, v10.8b, v13.8b\n" 4029 "umull v16.8h, v10.8b, v14.8b\n" 4030 "umull v17.8h, v11.8b, v12.8b\n" 4031 "umull v18.8h, v11.8b, v13.8b\n" 4032 4033 // Subtract counter. 4034 "subs %x[count], %x[count], #8\n" 4035 4036 "umull v9.8h, v11.8b, v14.8b\n" 4037 "uadalp v4.4s, v15.8h\n" 4038 "uadalp v5.4s, v16.8h\n" 4039 "uadalp v6.4s, v17.8h\n" 4040 "uadalp v7.4s, v18.8h\n" 4041 "uadalp v8.4s, v9.8h\n" 4042 4043 // Loop break. 4044 "bgt 1b\n" 4045 4046 // StaticQuantizationFloat::Prepare 4047 "ld1 {v9.4s}, [%x[lhs]], #16\n" 4048 "ld1 {v10.4s}, [%x[rhs]], #16\n" 4049 "dup v11.4s, %w[scale]\n" 4050 "dup v12.4s, v9.s[0]\n" 4051 "dup v13.4s, v9.s[1]\n" 4052 "dup v9.4s, v9.s[2]\n" 4053 4054 // RowMajorOutput::Prepare 4055 "add x0, %x[result], %x[stride]\n" 4056 "add x1, x0, %x[stride]\n" 4057 4058 // Reduce aggregators. 4059 "addp v0.4s, v0.4s, v1.4s\n" 4060 "addp v2.4s, v2.4s, v2.4s\n" 4061 "addp v0.4s, v0.4s, v2.4s\n" 4062 "addp v3.4s, v3.4s, v4.4s\n" 4063 "addp v5.4s, v5.4s, v5.4s\n" 4064 "addp v3.4s, v3.4s, v5.4s\n" 4065 "addp v6.4s, v6.4s, v7.4s\n" 4066 "addp v8.4s, v8.4s, v8.4s\n" 4067 "addp v6.4s, v6.4s, v8.4s\n" 4068 4069 // StaticQuantizationFloat::Transform 4070 "add v0.4s, v0.4s, v12.4s\n" 4071 "add v3.4s, v3.4s, v13.4s\n" 4072 "add v6.4s, v6.4s, v9.4s\n" 4073 "add v0.4s, v0.4s, v10.4s\n" 4074 "add v3.4s, v3.4s, v10.4s\n" 4075 "add v6.4s, v6.4s, v10.4s\n" 4076 "scvtf v0.4s, v0.4s\n" 4077 "scvtf v3.4s, v3.4s\n" 4078 "scvtf v6.4s, v6.4s\n" 4079 "fmul v0.4s, v0.4s, v11.4s\n" 4080 "fmul v3.4s, v3.4s, v11.4s\n" 4081 "fmul v6.4s, v6.4s, v11.4s\n" 4082 4083 // RowMajorOutput::Output 4084 "st1 {v0.2s}, [%x[result]], #8\n" 4085 "st1 {v0.s}[2], [%x[result]], #4\n" 4086 "st1 {v3.2s}, [x0], #8\n" 4087 "st1 {v3.s}[2], [x0], #4\n" 4088 "st1 {v6.2s}, [x1], #8\n" 4089 "st1 {v6.s}[2], [x1], #4\n" 4090 : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) 4091 : [count] "r"(params.kernel.count), 4092 [stride] "r"(params.output_stream.stride), 4093 [scale] "r"(params.kernel.scale) 4094 : "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", 4095 "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "cc", 4096 "memory"); 4097 } 4098 4099 } // namespace meta 4100 } // namespace gemmlowp 4101 4102 #else 4103 #warning "Meta gemm for arm64 requires: GEMMLOWP_NEON_64!" 4104 #endif 4105 4106 #endif // GEMMLOWP_META_QUANTIZED_MUL_KERNELS_ARM_64_H_ 4107