// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_dwconv_ukernel_up4x9__aarch64_neonfma_cortex_a55(
#     size_t channels,
#     size_t output_width,
#     const float** input,
#     const float* weights,
#     float* output,
#     size_t input_stride,
#     size_t output_increment,
#     const union xnn_f32_output_params params[restrict static 1])
BEGIN_FUNCTION xnn_f32_dwconv_ukernel_up4x9__aarch64_neonfma_cortex_a55

        # Save d8-d15 on stack
        STP d8, d9, [sp, -64]!
        STP d10, d11, [sp, 16]
        STP d12, d13, [sp, 32]
        STP d14, d15, [sp, 48]

        # Load clamping params: v30.4s := vmax, v31.4s := vmin
        LD2R {v30.4s, v31.4s}, [x7]

0:
        # x7 := i0
        # x8 := i1
        LDP x7, x8, [x2]
        # x9 := i2
        # x10 := i3
        LDP x9, x10, [x2, 16]
        # x11 := i4
        # x12 := i5
        LDP x11, x12, [x2, 32]
        # x13 := i6
        # x14 := i7
        LDP x13, x14, [x2, 48]
        # x15 := i8
        LDR x15, [x2, 64]
        # input += input_stride
        ADD x2, x2, x5

        # x16 := c = channels
        # c -= 8
        SUBS x16, x0, 8
        # x17 := w = weights
        MOV x17, x3

        # skip main loop if c < 8
        B.LO 3f
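        # Register usage in the pipelined loop:
        #   x7-x15  - i0-i8, the nine input row pointers
        #   x16     - c, remaining channel count
        #   x17     - w, pointer into the packed weights
        #   v30/v31 - vmax/vmin clamping bounds
        #
        # The main loop is software-pipelined (SWP): every 64-bit FMLA is
        # paired with a 64-bit load so that on Cortex-A55, which can
        # dual-issue a 64-bit NEON arithmetic op with a load, the load and
        # FMA pipes stay busy at the same time. The prologue fills the
        # pipeline, each iteration retires 8 channels, and the epilogue
        # drains the 4-channel group still in flight.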
        # SWP prologue

        # Load vbias.lo
        LD1 {v0.2S}, [x17], 8

        # Load vbias.hi
        LD1 {v1.2S}, [x17], 8

        # Load vi0.lo
        LD1 {v4.2S}, [x7], 8

        # Load vk0.lo
        LD1 {v5.2S}, [x17], 8

        # Load vi0.hi
        LD1 {v6.2S}, [x7], 8

        # Load vk0.hi
        LD1 {v7.2S}, [x17], 8

        # Load vi1.lo
        LD1 {v8.2S}, [x8], 8

        # Load vk1.lo
        LD1 {v9.2S}, [x17], 8

        # Load vi1.hi
        LD1 {v10.2S}, [x8], 8

        # Load vk1.hi
        LD1 {v11.2S}, [x17], 8

        # Load vi2.lo
        LD1 {v12.2S}, [x9], 8

        # Load vk2.lo
        LD1 {v13.2S}, [x17], 8

        # Load vi2.hi
        LD1 {v14.2S}, [x9], 8

        # Load vk2.hi
        LD1 {v15.2S}, [x17], 8

        # Load vi3.lo
        LD1 {v16.2S}, [x10], 8

        # Load vk3.lo
        LD1 {v17.2S}, [x17], 8

        # Load vi3.hi
        LD1 {v18.2S}, [x10], 8

        # Load vk3.hi
        LD1 {v19.2S}, [x17], 8

        # Load vi4.lo
        LD1 {v20.2S}, [x11], 8

        # Load vk4.lo
        LD1 {v21.2S}, [x17], 8

        # Load vi4.hi
        LD1 {v22.2S}, [x11], 8

        # Load vk4.hi
        LD1 {v23.2S}, [x17], 8

        # Load vi5.lo
        LD1 {v24.2S}, [x12], 8

        # Load vk5.lo
        LD1 {v25.2S}, [x17], 8

        # Load vi5.hi
        LD1 {v26.2S}, [x12], 8

        # Load vk5.hi
        LD1 {v27.2S}, [x17], 8

        # vacc.lo += vi0.lo * vk0.lo
        FMLA v0.2S, v4.2S, v5.2S
        # Load vi6.lo
        LD1 {v4.2S}, [x13], 8

        # Load vk6.lo
        LD1 {v5.2S}, [x17], 8

        # vacc.hi += vi0.hi * vk0.hi
        FMLA v1.2S, v6.2S, v7.2S
        # Load vi6.hi
        LD1 {v6.2S}, [x13], 8

        # Load vk6.hi
        LD1 {v7.2S}, [x17], 8

        # vacc.lo += vi1.lo * vk1.lo
        FMLA v0.2S, v8.2S, v9.2S
        # Load vi7.lo
        LD1 {v8.2S}, [x14], 8

        # Load vk7.lo
        LD1 {v9.2S}, [x17], 8

        # vacc.hi += vi1.hi * vk1.hi
        FMLA v1.2S, v10.2S, v11.2S
        # Load vi7.hi
        LD1 {v10.2S}, [x14], 8

        # Load vk7.hi
        LD1 {v11.2S}, [x17], 8

        # vacc.lo += vi2.lo * vk2.lo
        FMLA v0.2S, v12.2S, v13.2S
        # Load vi8.lo
        LD1 {v12.2S}, [x15], 8

        # Load vk8.lo
        LD1 {v13.2S}, [x17], 8

        # vacc.hi += vi2.hi * vk2.hi
        FMLA v1.2S, v14.2S, v15.2S
        # Load vi8.hi
        LD1 {v14.2S}, [x15], 8

        # Load vk8.hi
        LD1 {v15.2S}, [x17], 8

        # Load vbias_next.lo
        LD1 {v2.2S}, [x17], 8

        # Load vbias_next.hi
        LD1 {v3.2S}, [x17], 8

        # vacc.lo += vi3.lo * vk3.lo
        FMLA v0.2S, v16.2S, v17.2S
        # Load vi0_next.lo
        LD1 {v16.2S}, [x7], 8

        # Load vk0_next.lo
        LD1 {v17.2S}, [x17], 8

        # vacc.hi += vi3.hi * vk3.hi
        FMLA v1.2S, v18.2S, v19.2S
        # Load vi0_next.hi
        LD1 {v18.2S}, [x7], 8

        # Load vk0_next.hi
        LD1 {v19.2S}, [x17], 8

        # vacc.lo += vi4.lo * vk4.lo
        FMLA v0.2S, v20.2S, v21.2S
        # Load vi1_next.lo
        LD1 {v20.2S}, [x8], 8

        # Load vk1_next.lo
        LD1 {v21.2S}, [x17], 8

        # vacc.hi += vi4.hi * vk4.hi
        FMLA v1.2S, v22.2S, v23.2S
        # Load vi1_next.hi
        LD1 {v22.2S}, [x8], 8

        # Load vk1_next.hi
        LD1 {v23.2S}, [x17], 8

        # vacc.lo += vi5.lo * vk5.lo
        FMLA v0.2S, v24.2S, v25.2S
        # Load vi2_next.lo
        LD1 {v24.2S}, [x9], 8

        # Load vk2_next.lo
        LD1 {v25.2S}, [x17], 8

        # vacc.hi += vi5.hi * vk5.hi
        FMLA v1.2S, v26.2S, v27.2S
        # Load vi2_next.hi
        LD1 {v26.2S}, [x9], 8

        # Load vk2_next.hi
        LD1 {v27.2S}, [x17], 8

        # vacc.lo += vi6.lo * vk6.lo
        FMLA v0.2S, v4.2S, v5.2S
        # Load vi3_next.lo
        LD1 {v4.2S}, [x10], 8

        # Load vk3_next.lo
        LD1 {v5.2S}, [x17], 8

        # vacc.hi += vi6.hi * vk6.hi
        FMLA v1.2S, v6.2S, v7.2S
        # Load vi3_next.hi
        LD1 {v6.2S}, [x10], 8

        # Load vk3_next.hi
        LD1 {v7.2S}, [x17], 8

        # vacc.lo += vi7.lo * vk7.lo
        FMLA v0.2S, v8.2S, v9.2S
        # Load vi4_next.lo
        LD1 {v8.2S}, [x11], 8

        # Load vk4_next.lo
        LD1 {v9.2S}, [x17], 8

        # vacc.hi += vi7.hi * vk7.hi
        FMLA v1.2S, v10.2S, v11.2S
        # Load vi4_next.hi
        LD1 {v10.2S}, [x11], 8

        # Load vk4_next.hi
        LD1 {v11.2S}, [x17], 8

        # vacc.lo += vi8.lo * vk8.lo
        FMLA v0.2S, v12.2S, v13.2S
        # Load vi5_next.lo
        LD1 {v12.2S}, [x12], 8

        # Load vk5_next.lo
        LD1 {v13.2S}, [x17], 8

        # vacc.hi += vi8.hi * vk8.hi
        FMLA v1.2S, v14.2S, v15.2S
        # Load vi5_next.hi
        LD1 {v14.2S}, [x12], 8

        # Load vk5_next.hi
        LD1 {v15.2S}, [x17], 8

        # vacc_next.lo += vi0_next.lo * vk0_next.lo
        FMLA v2.2S, v16.2S, v17.2S
        # Load vi6_next.lo
        LD1 {v16.2S}, [x13], 8

        # vacc.lo = min(vacc.lo, vmax)
        FMIN v0.2S, v0.2S, v30.2S
        # Load vk6_next.lo
        LD1 {v17.2S}, [x17], 8

        # vacc_next.hi += vi0_next.hi * vk0_next.hi
        FMLA v3.2S, v18.2S, v19.2S
        # Load vi6_next.hi
        LD1 {v18.2S}, [x13], 8

        # vacc.hi = min(vacc.hi, vmax)
        FMIN v1.2S, v1.2S, v30.2S
        # Load vk6_next.hi
        LD1 {v19.2S}, [x17], 8

        # vacc_next.lo += vi1_next.lo * vk1_next.lo
        FMLA v2.2S, v20.2S, v21.2S
        # Load vi7_next.lo
        LD1 {v20.2S}, [x14], 8

        # vacc.lo = max(vacc.lo, vmin)
        FMAX v0.2S, v0.2S, v31.2S
        # Load vk7_next.lo
        LD1 {v21.2S}, [x17], 8

        # vacc_next.hi += vi1_next.hi * vk1_next.hi
        FMLA v3.2S, v22.2S, v23.2S
        # Load vi7_next.hi
        LD1 {v22.2S}, [x14], 8

        # vacc.hi = max(vacc.hi, vmin)
        FMAX v1.2S, v1.2S, v31.2S
        # Load vk7_next.hi
        LD1 {v23.2S}, [x17], 8

        # vacc_next.lo += vi2_next.lo * vk2_next.lo
        FMLA v2.2S, v24.2S, v25.2S
        # Load vi8_next.lo
        LD1 {v24.2S}, [x15], 8

        # Load vk8_next.lo
        LD1 {v25.2S}, [x17], 8

        # vacc_next.hi += vi2_next.hi * vk2_next.hi
        FMLA v3.2S, v26.2S, v27.2S
        # Load vi8_next.hi
        LD1 {v26.2S}, [x15], 8

        # Store vacc
        STP d0, d1, [x4], 16

        # c -= 8
        SUBS x16, x16, 8
        # Load vk8_next.hi
        LD1 {v27.2S}, [x17], 8

        B.LO 2f
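        # Main software-pipelined loop: each iteration completes taps 3-8 of
        # the 4-channel group started previously (then clamps and stores it),
        # fully processes another 4 channels, and begins bias + taps 0-2 of
        # the following group, retiring 8 channels per pass.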
1:
        # SWP iteration

        # Load vbias.lo
        LD1 {v0.2S}, [x17], 8

        # Load vbias.hi
        LD1 {v1.2S}, [x17], 8

        # vacc_prev.lo += vi3_prev.lo * vk3_prev.lo
        FMLA v2.2S, v4.2S, v5.2S
        # Load vi0.lo
        LD1 {v4.2S}, [x7], 8

        # Load vk0.lo
        LD1 {v5.2S}, [x17], 8

        # vacc_prev.hi += vi3_prev.hi * vk3_prev.hi
        FMLA v3.2S, v6.2S, v7.2S
        # Load vi0.hi
        LD1 {v6.2S}, [x7], 8

        # Load vk0.hi
        LD1 {v7.2S}, [x17], 8

        # vacc_prev.lo += vi4_prev.lo * vk4_prev.lo
        FMLA v2.2S, v8.2S, v9.2S
        # Load vi1.lo
        LD1 {v8.2S}, [x8], 8

        # Load vk1.lo
        LD1 {v9.2S}, [x17], 8

        # vacc_prev.hi += vi4_prev.hi * vk4_prev.hi
        FMLA v3.2S, v10.2S, v11.2S
        # Load vi1.hi
        LD1 {v10.2S}, [x8], 8

        # Load vk1.hi
        LD1 {v11.2S}, [x17], 8

        # vacc_prev.lo += vi5_prev.lo * vk5_prev.lo
        FMLA v2.2S, v12.2S, v13.2S
        # Load vi2.lo
        LD1 {v12.2S}, [x9], 8

        # Load vk2.lo
        LD1 {v13.2S}, [x17], 8

        # vacc_prev.hi += vi5_prev.hi * vk5_prev.hi
        FMLA v3.2S, v14.2S, v15.2S
        # Load vi2.hi
        LD1 {v14.2S}, [x9], 8

        # Load vk2.hi
        LD1 {v15.2S}, [x17], 8

        # vacc_prev.lo += vi6_prev.lo * vk6_prev.lo
        FMLA v2.2S, v16.2S, v17.2S
        # Load vi3.lo
        LD1 {v16.2S}, [x10], 8

        # Load vk3.lo
        LD1 {v17.2S}, [x17], 8

        # vacc_prev.hi += vi6_prev.hi * vk6_prev.hi
        FMLA v3.2S, v18.2S, v19.2S
        # Load vi3.hi
        LD1 {v18.2S}, [x10], 8

        # Load vk3.hi
        LD1 {v19.2S}, [x17], 8

        # vacc_prev.lo += vi7_prev.lo * vk7_prev.lo
        FMLA v2.2S, v20.2S, v21.2S
        # Load vi4.lo
        LD1 {v20.2S}, [x11], 8

        # Load vk4.lo
        LD1 {v21.2S}, [x17], 8

        # vacc_prev.hi += vi7_prev.hi * vk7_prev.hi
        FMLA v3.2S, v22.2S, v23.2S
        # Load vi4.hi
        LD1 {v22.2S}, [x11], 8

        # Load vk4.hi
        LD1 {v23.2S}, [x17], 8

        # vacc_prev.lo += vi8_prev.lo * vk8_prev.lo
        FMLA v2.2S, v24.2S, v25.2S
        # Load vi5.lo
        LD1 {v24.2S}, [x12], 8

        # Load vk5.lo
        LD1 {v25.2S}, [x17], 8

        # vacc_prev.hi += vi8_prev.hi * vk8_prev.hi
        FMLA v3.2S, v26.2S, v27.2S
        # Load vi5.hi
        LD1 {v26.2S}, [x12], 8

        # Load vk5.hi
        LD1 {v27.2S}, [x17], 8

        # vacc.lo += vi0.lo * vk0.lo
        FMLA v0.2S, v4.2S, v5.2S
        # Load vi6.lo
        LD1 {v4.2S}, [x13], 8

        # vacc_prev.lo = min(vacc_prev.lo, vmax)
        FMIN v2.2S, v2.2S, v30.2S
        # Load vk6.lo
        LD1 {v5.2S}, [x17], 8

        # vacc.hi += vi0.hi * vk0.hi
        FMLA v1.2S, v6.2S, v7.2S
        # Load vi6.hi
        LD1 {v6.2S}, [x13], 8

        # vacc_prev.hi = min(vacc_prev.hi, vmax)
        FMIN v3.2S, v3.2S, v30.2S
        # Load vk6.hi
        LD1 {v7.2S}, [x17], 8

        # vacc.lo += vi1.lo * vk1.lo
        FMLA v0.2S, v8.2S, v9.2S
        # Load vi7.lo
        LD1 {v8.2S}, [x14], 8

        # vacc_prev.lo = max(vacc_prev.lo, vmin)
        FMAX v2.2S, v2.2S, v31.2S
        # Load vk7.lo
        LD1 {v9.2S}, [x17], 8

        # vacc.hi += vi1.hi * vk1.hi
        FMLA v1.2S, v10.2S, v11.2S
        # Load vi7.hi
        LD1 {v10.2S}, [x14], 8

        # vacc_prev.hi = max(vacc_prev.hi, vmin)
        FMAX v3.2S, v3.2S, v31.2S
        # Load vk7.hi
        LD1 {v11.2S}, [x17], 8

        # vacc.lo += vi2.lo * vk2.lo
        FMLA v0.2S, v12.2S, v13.2S
        # Load vi8.lo
        LD1 {v12.2S}, [x15], 8

        # Load vk8.lo
        LD1 {v13.2S}, [x17], 8

        # vacc.hi += vi2.hi * vk2.hi
        FMLA v1.2S, v14.2S, v15.2S
        # Load vi8.hi
        LD1 {v14.2S}, [x15], 8

        # Store vacc_prev
        STP d2, d3, [x4], 16

        # Load vk8.hi
        LD1 {v15.2S}, [x17], 8

        # Load vbias_next.lo
        LD1 {v2.2S}, [x17], 8

        # Load vbias_next.hi
        LD1 {v3.2S}, [x17], 8

        # vacc.lo += vi3.lo * vk3.lo
        FMLA v0.2S, v16.2S, v17.2S
        # Load vi0_next.lo
        LD1 {v16.2S}, [x7], 8

        # Load vk0_next.lo
        LD1 {v17.2S}, [x17], 8

        # vacc.hi += vi3.hi * vk3.hi
        FMLA v1.2S, v18.2S, v19.2S
        # Load vi0_next.hi
        LD1 {v18.2S}, [x7], 8

        # Load vk0_next.hi
        LD1 {v19.2S}, [x17], 8

        # vacc.lo += vi4.lo * vk4.lo
        FMLA v0.2S, v20.2S, v21.2S
        # Load vi1_next.lo
        LD1 {v20.2S}, [x8], 8

        # Load vk1_next.lo
        LD1 {v21.2S}, [x17], 8

        # vacc.hi += vi4.hi * vk4.hi
        FMLA v1.2S, v22.2S, v23.2S
        # Load vi1_next.hi
        LD1 {v22.2S}, [x8], 8

        # Load vk1_next.hi
        LD1 {v23.2S}, [x17], 8

        # vacc.lo += vi5.lo * vk5.lo
        FMLA v0.2S, v24.2S, v25.2S
        # Load vi2_next.lo
        LD1 {v24.2S}, [x9], 8

        # Load vk2_next.lo
        LD1 {v25.2S}, [x17], 8

        # vacc.hi += vi5.hi * vk5.hi
        FMLA v1.2S, v26.2S, v27.2S
        # Load vi2_next.hi
        LD1 {v26.2S}, [x9], 8

        # Load vk2_next.hi
        LD1 {v27.2S}, [x17], 8

        # vacc.lo += vi6.lo * vk6.lo
        FMLA v0.2S, v4.2S, v5.2S
        # Load vi3_next.lo
        LD1 {v4.2S}, [x10], 8

        # Load vk3_next.lo
        LD1 {v5.2S}, [x17], 8

        # vacc.hi += vi6.hi * vk6.hi
        FMLA v1.2S, v6.2S, v7.2S
        # Load vi3_next.hi
        LD1 {v6.2S}, [x10], 8

        # Load vk3_next.hi
        LD1 {v7.2S}, [x17], 8

        # vacc.lo += vi7.lo * vk7.lo
        FMLA v0.2S, v8.2S, v9.2S
        # Load vi4_next.lo
        LD1 {v8.2S}, [x11], 8

        # Load vk4_next.lo
        LD1 {v9.2S}, [x17], 8

        # vacc.hi += vi7.hi * vk7.hi
        FMLA v1.2S, v10.2S, v11.2S
        # Load vi4_next.hi
        LD1 {v10.2S}, [x11], 8

        # Load vk4_next.hi
        LD1 {v11.2S}, [x17], 8

        # vacc.lo += vi8.lo * vk8.lo
        FMLA v0.2S, v12.2S, v13.2S
        # Load vi5_next.lo
        LD1 {v12.2S}, [x12], 8

        # Load vk5_next.lo
        LD1 {v13.2S}, [x17], 8

        # vacc.hi += vi8.hi * vk8.hi
        FMLA v1.2S, v14.2S, v15.2S
        # Load vi5_next.hi
        LD1 {v14.2S}, [x12], 8

        # Load vk5_next.hi
        LD1 {v15.2S}, [x17], 8

        # vacc_next.lo += vi0_next.lo * vk0_next.lo
        FMLA v2.2S, v16.2S, v17.2S
        # Load vi6_next.lo
        LD1 {v16.2S}, [x13], 8

        # vacc.lo = min(vacc.lo, vmax)
        FMIN v0.2S, v0.2S, v30.2S
        # Load vk6_next.lo
        LD1 {v17.2S}, [x17], 8

        # vacc_next.hi += vi0_next.hi * vk0_next.hi
        FMLA v3.2S, v18.2S, v19.2S
        # Load vi6_next.hi
        LD1 {v18.2S}, [x13], 8

        # vacc.hi = min(vacc.hi, vmax)
        FMIN v1.2S, v1.2S, v30.2S
        # Load vk6_next.hi
        LD1 {v19.2S}, [x17], 8

        # vacc_next.lo += vi1_next.lo * vk1_next.lo
        FMLA v2.2S, v20.2S, v21.2S
        # Load vi7_next.lo
        LD1 {v20.2S}, [x14], 8

        # vacc.lo = max(vacc.lo, vmin)
        FMAX v0.2S, v0.2S, v31.2S
        # Load vk7_next.lo
        LD1 {v21.2S}, [x17], 8

        # vacc_next.hi += vi1_next.hi * vk1_next.hi
        FMLA v3.2S, v22.2S, v23.2S
        # Load vi7_next.hi
        LD1 {v22.2S}, [x14], 8

        # vacc.hi = max(vacc.hi, vmin)
        FMAX v1.2S, v1.2S, v31.2S
        # Load vk7_next.hi
        LD1 {v23.2S}, [x17], 8

        # vacc_next.lo += vi2_next.lo * vk2_next.lo
        FMLA v2.2S, v24.2S, v25.2S
        # Load vi8_next.lo
        LD1 {v24.2S}, [x15], 8

        # Load vk8_next.lo
        LD1 {v25.2S}, [x17], 8

        # vacc_next.hi += vi2_next.hi * vk2_next.hi
        FMLA v3.2S, v26.2S, v27.2S
        # Load vi8_next.hi
        LD1 {v26.2S}, [x15], 8

        # Store vacc
        STP d0, d1, [x4], 16

        # c -= 8
        SUBS x16, x16, 8
        # Load vk8_next.hi
        LD1 {v27.2S}, [x17], 8

        B.HS 1b
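        # Pipeline drain: the loop exits with one 4-channel group still in
        # flight (bias and taps 0-2 already accumulated in v2/v3); the
        # epilogue applies its remaining taps 3-8, clamps, and stores it.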
2:
        # SWP epilogue

        # vacc_prev.lo += vi3_prev.lo * vk3_prev.lo
        FMLA v2.2S, v4.2S, v5.2S

        # vacc_prev.hi += vi3_prev.hi * vk3_prev.hi
        FMLA v3.2S, v6.2S, v7.2S

        # vacc_prev.lo += vi4_prev.lo * vk4_prev.lo
        FMLA v2.2S, v8.2S, v9.2S

        # vacc_prev.hi += vi4_prev.hi * vk4_prev.hi
        FMLA v3.2S, v10.2S, v11.2S

        # vacc_prev.lo += vi5_prev.lo * vk5_prev.lo
        FMLA v2.2S, v12.2S, v13.2S

        # vacc_prev.hi += vi5_prev.hi * vk5_prev.hi
        FMLA v3.2S, v14.2S, v15.2S

        # vacc_prev.lo += vi6_prev.lo * vk6_prev.lo
        FMLA v2.2S, v16.2S, v17.2S

        # vacc_prev.hi += vi6_prev.hi * vk6_prev.hi
        FMLA v3.2S, v18.2S, v19.2S

        # vacc_prev.lo += vi7_prev.lo * vk7_prev.lo
        FMLA v2.2S, v20.2S, v21.2S

        # vacc_prev.hi += vi7_prev.hi * vk7_prev.hi
        FMLA v3.2S, v22.2S, v23.2S

        # vacc_prev.lo += vi8_prev.lo * vk8_prev.lo
        FMLA v2.2S, v24.2S, v25.2S

        # vacc_prev.hi += vi8_prev.hi * vk8_prev.hi
        FMLA v3.2S, v26.2S, v27.2S

        # vacc_prev.lo = min(vacc_prev.lo, vmax)
        FMIN v2.2S, v2.2S, v30.2S

        # vacc_prev.hi = min(vacc_prev.hi, vmax)
        FMIN v3.2S, v3.2S, v30.2S

        # vacc_prev.lo = max(vacc_prev.lo, vmin)
        FMAX v2.2S, v2.2S, v31.2S

        # vacc_prev.hi = max(vacc_prev.hi, vmin)
        FMAX v3.2S, v3.2S, v31.2S

        # Store vacc_prev
        STP d2, d3, [x4], 16

3:
        # skip processing 4 channels if (c & 4) == 0; note ((c - 8) & 4) == (c & 4)
        TBZ x16, 2, 4f

        # Load vbias (q0) and the nine weight taps vk0-vk8 (q1-q9)
        LDP q0, q1, [x17], 32
        LDP q2, q3, [x17], 32
        LDP q4, q5, [x17], 32
        LDP q6, q7, [x17], 32
        LDP q8, q9, [x17], 32
        # Load 4 channels from each of the nine input rows
        LDR q10, [x7], 16
        LDR q11, [x8], 16
        LDR q12, [x9], 16
        LDR q13, [x10], 16
        LDR q14, [x11], 16
        LDR q15, [x12], 16
        LDR q16, [x13], 16
        LDR q17, [x14], 16
        LDR q18, [x15], 16

        # vacc = vbias + vk0*vi0 + ... + vk8*vi8
        FMLA v0.4S, v1.4S, v10.4S
        FMLA v0.4S, v2.4S, v11.4S
        FMLA v0.4S, v3.4S, v12.4S
        FMLA v0.4S, v4.4S, v13.4S
        FMLA v0.4S, v5.4S, v14.4S
        FMLA v0.4S, v6.4S, v15.4S
        FMLA v0.4S, v7.4S, v16.4S
        FMLA v0.4S, v8.4S, v17.4S
        FMLA v0.4S, v9.4S, v18.4S

        # clamp and store 4 channels
        FMIN v0.4S, v0.4S, v30.4S
        FMAX v0.4S, v0.4S, v31.4S

        STR q0, [x4], 16

4:
        # restore actual c value
        ADD x16, x16, 8
        # skip processing remainder channels if c == 0
        CBZ x16, 6f

        # Load vbias (q0) and the nine weight taps vk0-vk8 (q1-q9)
        LDP q0, q1, [x17], 32
        LDP q2, q3, [x17], 32
        LDP q4, q5, [x17], 32
        LDP q6, q7, [x17], 32
        LDP q8, q9, [x17], 32
        # Load 4 channels from each of the nine input rows
        LDR q10, [x7], 16
        LDR q11, [x8], 16
        LDR q12, [x9], 16
        LDR q13, [x10], 16
        LDR q14, [x11], 16
        LDR q15, [x12], 16
        LDR q16, [x13], 16
        LDR q17, [x14], 16
        LDR q18, [x15], 16

        # vacc = vbias + vk0*vi0 + ... + vk8*vi8
        FMLA v0.4S, v1.4S, v10.4S
        FMLA v0.4S, v2.4S, v11.4S
        FMLA v0.4S, v3.4S, v12.4S
        FMLA v0.4S, v4.4S, v13.4S
        FMLA v0.4S, v5.4S, v14.4S
        FMLA v0.4S, v6.4S, v15.4S
        FMLA v0.4S, v7.4S, v16.4S
        FMLA v0.4S, v8.4S, v17.4S
        FMLA v0.4S, v9.4S, v18.4S

        FMIN v0.4S, v0.4S, v30.4S
        FMAX v0.4S, v0.4S, v31.4S

        # store 2 channels if (c & 2) != 0
        TBZ x16, 1, 5f

        ST1 {v0.2S}, [x4], 8
        # shift the remaining lanes down: v0.lo := v0.hi
        DUP d0, v0.D[1]

5:
        # store 1 channel if (c & 1) != 0
        TBZ x16, 0, 6f

        ST1 {v0.S}[0], [x4], 4

6:
        # output_width -= 1
        SUBS x1, x1, 1
        # output += output_increment
        ADD x4, x4, x6
        # process next pixel if output_width != 0
        B.NE 0b

        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 48]
        LDP d12, d13, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP d8, d9, [sp], 64
        RET

END_FUNCTION xnn_f32_dwconv_ukernel_up4x9__aarch64_neonfma_cortex_a55
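// For reference, the computation above is roughly equivalent to the scalar C
// sketch below. This is an illustration only, not XNNPACK API: the function
// name is invented, the tail handling is simplified, and the packed weight
// layout (tiles of 4 channels, each laid out as bias x4 followed by
// k0 x4 ... k8 x4) is inferred from the loads in this file.
//
//   #include <stddef.h>
//   #include <stdint.h>
//   #include <math.h>
//
//   static void dwconv_up4x9_reference(
//       size_t channels, size_t output_width,
//       const float** input, const float* weights, float* output,
//       size_t input_stride, size_t output_increment,
//       float vmax, float vmin)
//   {
//     do {
//       // Gather the nine input row pointers for this output pixel.
//       const float* i[9];
//       for (size_t k = 0; k < 9; k++) {
//         i[k] = input[k];
//       }
//       input = (const float**) ((uintptr_t) input + input_stride);
//
//       const float* w = weights;
//       for (size_t c = 0; c < channels; c += 4) {
//         const size_t tile = channels - c < 4 ? channels - c : 4;
//         for (size_t l = 0; l < tile; l++) {
//           float acc = w[l];  // bias lane
//           for (size_t k = 0; k < 9; k++) {
//             acc += i[k][c + l] * w[4 * (k + 1) + l];  // nine taps
//           }
//           acc = fminf(acc, vmax);
//           acc = fmaxf(acc, vmin);
//           *output++ = acc;
//         }
//         w += 40;  // next 4-channel tile: 4 biases + 9 * 4 taps
//       }
//       output = (float*) ((uintptr_t) output + output_increment);
//     } while (--output_width != 0);
//   }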
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif