///******************************************************************************
// *
// * Copyright (C) 2018 The Android Open Source Project
// *
// * Licensed under the Apache License, Version 2.0 (the "License");
// * you may not use this file except in compliance with the License.
// * You may obtain a copy of the License at:
// *
// * http://www.apache.org/licenses/LICENSE-2.0
// *
// * Unless required by applicable law or agreed to in writing, software
// * distributed under the License is distributed on an "AS IS" BASIS,
// * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// * See the License for the specific language governing permissions and
// * limitations under the License.
// *
// *****************************************************************************
// * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
//*/

// Target: AArch64, GNU assembler syntax ("//" comments).

// Save the registers this routine preserves for its caller: q8-q15 (full
// 128 bits each) and x21-x24.  Uses 160 bytes of stack, keeping sp 16-byte
// aligned.  Note: x24 is saved/restored but not otherwise used below.
.macro push_v_regs
    stp     q8, q9, [sp, #-32]!
    stp     q10, q11, [sp, #-32]!
    stp     q12, q13, [sp, #-32]!
    stp     q14, q15, [sp, #-32]!
    stp     x21, x22, [sp, #-16]!
    stp     x23, x24, [sp, #-16]!
.endm

// Exact mirror of push_v_regs (restores in reverse order).
.macro pop_v_regs
    ldp     x23, x24, [sp], #16
    ldp     x21, x22, [sp], #16
    ldp     q14, q15, [sp], #32
    ldp     q12, q13, [sp], #32
    ldp     q10, q11, [sp], #32
    ldp     q8, q9, [sp], #32
.endm

// Exchange two 64-bit operands (here always vector D lanes) through x16.
// Clobbers x16.  Deliberately not called "swp": that name is also the
// ARMv8.1-A LSE atomic swap mnemonic.
.macro swap_regs reg1, reg2
    mov     x16, \reg1
    mov     \reg1, \reg2
    mov     \reg2, x16
.endm

.text
.global ixheaacd_post_twiddle_armv8

// -----------------------------------------------------------------------------
// ixheaacd_post_twiddle_armv8
//
// Inputs, as consumed by the code below:
//   x0 : output buffer of 32-bit words; filled forwards from x0 and
//        backwards from x0 + 4*count
//   x1 : input buffer of 32-bit words; read forwards from x1 and
//        backwards from x1 + 4*count
//   x2 : base of a table block; pairs of 16-bit coefficients are read
//        starting 7500 bytes into it, stepping by 4 bytes when
//        count >= 0x400 and by 32 bytes otherwise
//        (presumably the cos/sin twiddle table - confirm against caller)
//   w3 : 32-bit element count
//
// Clobbers: x0-x12, x16, flags, v0-v7, v16-v31.
// Preserved via push_v_regs/pop_v_regs: q8-q15, x21-x24.
//
// Arithmetic building block used throughout the NEON code: a 32x16-bit
// product scaled by 2^-16, formed as
//     (unsigned(lo16(x)) * c) >> 16  +  signed(hi16(x)) * c
// where ld4 {..}.4h has split each 32-bit word into its two 16-bit halves.
// -----------------------------------------------------------------------------
ixheaacd_post_twiddle_armv8:

    push_v_regs

ARM_PROLOGUE:
    // The count is used as a 32-bit value (cmp w3) but x3 feeds 64-bit
    // address arithmetic further down.  AAPCS64 leaves the upper 32 bits of
    // a 32-bit argument unspecified, so zero-extend it first.  This does
    // not touch the flags.
    mov     w3, w3

    cmp     w3, #0x400
    mov     x21, #7500                  // byte offset of the coefficients from x2
    add     x2, x2, x21
    blt     NEXT

    // count >= 0x400
    mov     w4, #50
    mov     w5, #-50
    mov     x6, #4                      // coefficient stride in bytes
    dup     v10.4h, w4
    b       NEXT1

NEXT:
    // count < 0x400
    mov     w4, #0x192
    mov     w5, #0xfe6e
    mov     x6, #32                     // coefficient stride in bytes
    dup     v10.4h, w4

NEXT1:
    // ---- Scalar handling of the first input pair ------------------------
    ldr     w9, [x2]
    lsl     w22, w9, #16                // low 16-bit coefficient, moved to bits 31:16
    and     w21, w9, #0xFFFF0000        // high 16-bit coefficient, kept in bits 31:16

    ldr     w7, [x1], #4
    ldr     w8, [x1], #4

    add     x2, x2, x6

    // Each smull/asr pair yields the high 32 bits of a 32x32 product.
    smull   x11, w8, w21
    asr     x11, x11, #32
    smull   x10, w8, w22
    asr     x10, x10, #32
    smull   x12, w7, w21
    asr     x12, x12, #32
    smull   x23, w7, w22
    asr     x23, x23, #32
    add     w8, w11, w23

    sub     w10, w10, w12

    // w8 = -w8 (two's complement negate)
    mvn     w8, w8
    add     w8, w8, #1

    // Second rotation by the constants chosen above (w4 / w5, in bits 31:16).
    lsl     w21, w5, #16
    lsl     w22, w4, #16
    smull   x23, w10, w21
    asr     x23, x23, #32
    add     w9, w8, w23
    smull   x23, w8, w22
    asr     x23, x23, #32
    add     w11, w10, w23

    // x7 -> last output word (x0 + 4*count - 4); it walks backwards.
    lsl     x7, x3, #2
    add     x7, x0, x7
    sub     x7, x7, #4

    str     w11, [x7], #-4

    str     w9, [x0], #4

    // x5 -> last 32 bytes of the input (x1 was already advanced by 8);
    // it walks backwards.
    lsl     x5, x3, #2
    add     x5, x1, x5
    sub     x5, x5, #40

    // w3 = (count - 1) >> 4
    sub     w3, w3, #1
    asr     w3, w3, #4

    // Point x7 at the start of the 32-byte block its first st2 will write.
    sub     x7, x7, #28

    mov     x8, #-32                    // post-index step for the backward pointers

NEON_PROLOGUE:
    // ---- First pipeline stage: same work as CORE_LOOP, minus the stores --
    ld4     {v0.4h, v1.4h, v2.4h, v3.4h}, [x5], x8     // backward input block
    ld4     {v4.4h, v5.4h, v6.4h, v7.4h}, [x1], #32    // forward input block

    // Gather four coefficient pairs, one per stride step.
    ld2     {v8.h, v9.h}[0], [x2], x6
    ld2     {v8.h, v9.h}[1], [x2], x6
    ld2     {v8.h, v9.h}[2], [x2], x6
    ld2     {v8.h, v9.h}[3], [x2], x6

    // Lane-reversed copies for the backward block.
    rev64   v12.4h, v8.4h
    rev64   v13.4h, v9.4h

    umull   v30.4s, v2.4h, v13.4h
    umull   v28.4s, v0.4h, v13.4h
    umull   v26.4s, v2.4h, v12.4h
    umull   v24.4s, v0.4h, v12.4h

    ushr    v30.4s, v30.4s, #16
    ushr    v28.4s, v28.4s, #16
    ushr    v26.4s, v26.4s, #16
    ushr    v24.4s, v24.4s, #16

    smlal   v30.4s, v3.4h, v13.4h
    smlal   v28.4s, v1.4h, v13.4h
    smlal   v26.4s, v3.4h, v12.4h
    smlal   v24.4s, v1.4h, v12.4h

    umull   v22.4s, v6.4h, v9.4h
    umull   v20.4s, v4.4h, v9.4h

    add     v28.4s, v28.4s, v26.4s
    sub     v30.4s, v30.4s, v24.4s
    neg     v28.4s, v28.4s

    umull   v18.4s, v6.4h, v8.4h
    umull   v16.4s, v4.4h, v8.4h

    // Split v30 / v28 into 16-bit low halves (v26 / v24) and high halves
    // (v27 / v25) for the second multiply by v10.
    mov     v31.8b, v30.8b
    mov     v27.d[0], v30.d[1]
    ushr    v22.4s, v22.4s, #16

    mov     v24.8b, v28.8b
    mov     v25.d[0], v28.d[1]
    ushr    v20.4s, v20.4s, #16

    uzp1    v26.4h, v31.4h, v27.4h
    uzp2    v27.4h, v31.4h, v27.4h
    ushr    v18.4s, v18.4s, #16

    mov     v31.8b, v24.8b
    uzp1    v24.4h, v31.4h, v25.4h
    uzp2    v25.4h, v31.4h, v25.4h
    ushr    v16.4s, v16.4s, #16

    smlal   v22.4s, v7.4h, v9.4h
    smlal   v20.4s, v5.4h, v9.4h
    smlal   v18.4s, v7.4h, v8.4h
    smlal   v16.4s, v5.4h, v8.4h

    // Coefficient loads for the next stage are interleaved with arithmetic.
    ld2     {v8.h, v9.h}[0], [x2], x6
    umull   v0.4s, v26.4h, v10.4h

    ld2     {v8.h, v9.h}[1], [x2], x6
    umull   v2.4s, v24.4h, v10.4h

    ld2     {v8.h, v9.h}[2], [x2], x6
    add     v22.4s, v22.4s, v16.4s

    ld2     {v8.h, v9.h}[3], [x2], x6
    sub     v20.4s, v18.4s, v20.4s

    rev64   v12.4h, v8.4h
    rev64   v13.4h, v9.4h
    neg     v22.4s, v22.4s

    mov     v18.8b, v22.8b
    mov     v19.d[0], v22.d[1]
    ushr    v0.4s, v0.4s, #16

    mov     v16.16b, v20.16b
    mov     v17.d[0], v20.d[1]
    ushr    v2.4s, v2.4s, #16

    mov     v31.8b, v18.8b
    uzp1    v18.4h, v31.4h, v19.4h
    uzp2    v19.4h, v31.4h, v19.4h
    smlal   v0.4s, v27.4h, v10.4h

    mov     v31.8b, v16.8b
    uzp1    v16.4h, v31.4h, v17.4h
    uzp2    v17.4h, v31.4h, v17.4h
    smlal   v2.4s, v25.4h, v10.4h

    umull   v4.4s, v18.4h, v10.4h
    umull   v6.4s, v16.4h, v10.4h

    neg     v0.4s, v0.4s
    add     v14.4s, v30.4s, v2.4s
    add     v26.4s, v28.4s, v0.4s

    // rev64 + swapping the two D halves reverses all four 32-bit lanes.
    rev64   v14.4s, v14.4s
    ushr    v4.4s, v4.4s, #16

    swap_regs v14.d[0], v14.d[1]
    ushr    v6.4s, v6.4s, #16

    smlal   v4.4s, v19.4h, v10.4h
    ld4     {v0.4h, v1.4h, v2.4h, v3.4h}, [x5], x8
    smlal   v6.4s, v17.4h, v10.4h

    // Loop trip count = ((count - 1) >> 4) - 2.
    // NOTE(review): CORE_LOOP tests the counter only after decrementing,
    // so this value must be >= 1 on entry - confirm callers guarantee it.
    sub     x3, x3, #2

    add     v24.4s, v20.4s, v4.4s

    rev64   v24.4s, v24.4s
    neg     v16.4s, v6.4s

    ld4     {v4.4h, v5.4h, v6.4h, v7.4h}, [x1], #32

    swap_regs v24.d[0], v24.d[1]
    add     v16.4s, v22.4s, v16.4s

CORE_LOOP:
    // Store the previous stage's results (interleaved with the first
    // multiplies of this stage), then repeat the stage computation.
    umull   v30.4s, v2.4h, v13.4h
    mov     v25.16b, v24.16b
    st2     {v25.4s, v26.4s}, [x7], x8                  // backward output block
    umull   v28.4s, v0.4h, v13.4h

    umull   v26.4s, v2.4h, v12.4h
    mov     v15.16b, v14.16b
    st2     {v15.4s, v16.4s}, [x0], #32                 // forward output block
    umull   v24.4s, v0.4h, v12.4h

    ushr    v30.4s, v30.4s, #16
    ushr    v28.4s, v28.4s, #16
    ushr    v26.4s, v26.4s, #16
    ushr    v24.4s, v24.4s, #16

    smlal   v30.4s, v3.4h, v13.4h
    smlal   v28.4s, v1.4h, v13.4h
    smlal   v26.4s, v3.4h, v12.4h
    smlal   v24.4s, v1.4h, v12.4h

    umull   v22.4s, v6.4h, v9.4h
    umull   v20.4s, v4.4h, v9.4h

    add     v28.4s, v28.4s, v26.4s
    sub     v30.4s, v30.4s, v24.4s
    neg     v28.4s, v28.4s

    umull   v18.4s, v6.4h, v8.4h
    umull   v16.4s, v4.4h, v8.4h

    mov     v26.8b, v30.8b
    mov     v27.d[0], v30.d[1]
    ushr    v22.4s, v22.4s, #16

    mov     v24.8b, v28.8b
    mov     v25.d[0], v28.d[1]
    ushr    v20.4s, v20.4s, #16

    mov     v31.8b, v26.8b
    uzp1    v26.4h, v31.4h, v27.4h
    uzp2    v27.4h, v31.4h, v27.4h
    ushr    v18.4s, v18.4s, #16

    mov     v31.8b, v24.8b
    uzp1    v24.4h, v31.4h, v25.4h
    uzp2    v25.4h, v31.4h, v25.4h
    ushr    v16.4s, v16.4s, #16

    smlal   v22.4s, v7.4h, v9.4h
    smlal   v20.4s, v5.4h, v9.4h
    smlal   v18.4s, v7.4h, v8.4h
    smlal   v16.4s, v5.4h, v8.4h

    ld2     {v8.h, v9.h}[0], [x2], x6
    umull   v0.4s, v26.4h, v10.4h

    ld2     {v8.h, v9.h}[1], [x2], x6
    umull   v2.4s, v24.4h, v10.4h

    ld2     {v8.h, v9.h}[2], [x2], x6
    add     v22.4s, v22.4s, v16.4s

    ld2     {v8.h, v9.h}[3], [x2], x6
    sub     v20.4s, v18.4s, v20.4s

    rev64   v12.4h, v8.4h
    rev64   v13.4h, v9.4h
    neg     v22.4s, v22.4s

    mov     v18.8b, v22.8b
    mov     v19.d[0], v22.d[1]
    ushr    v0.4s, v0.4s, #16

    mov     v16.8b, v20.8b
    mov     v17.d[0], v20.d[1]
    ushr    v2.4s, v2.4s, #16

    mov     v31.8b, v18.8b
    uzp1    v18.4h, v31.4h, v19.4h
    uzp2    v19.4h, v31.4h, v19.4h
    smlal   v0.4s, v27.4h, v10.4h

    mov     v31.8b, v16.8b
    uzp1    v16.4h, v31.4h, v17.4h
    uzp2    v17.4h, v31.4h, v17.4h
    smlal   v2.4s, v25.4h, v10.4h

    umull   v4.4s, v18.4h, v10.4h
    umull   v6.4s, v16.4h, v10.4h

    neg     v0.4s, v0.4s
    add     v14.4s, v30.4s, v2.4s
    add     v26.4s, v28.4s, v0.4s

    rev64   v14.4s, v14.4s
    ushr    v4.4s, v4.4s, #16

    swap_regs v14.d[0], v14.d[1]
    ushr    v6.4s, v6.4s, #16

    smlal   v4.4s, v19.4h, v10.4h
    ld4     {v0.4h, v1.4h, v2.4h, v3.4h}, [x5], x8
    smlal   v6.4s, v17.4h, v10.4h

    add     v24.4s, v20.4s, v4.4s

    rev64   v24.4s, v24.4s
    neg     v16.4s, v6.4s

    ld4     {v4.4h, v5.4h, v6.4h, v7.4h}, [x1], #32

    swap_regs v24.d[0], v24.d[1]
    add     v16.4s, v22.4s, v16.4s

    subs    x3, x3, #1
    bne     CORE_LOOP

NEON_EPILOGUE:
    // ---- Last full stage: as CORE_LOOP, but without loads for a further
    // stage; its results are stored at the end of the stage. ---------------
    umull   v30.4s, v2.4h, v13.4h
    mov     v25.16b, v24.16b
    st2     {v25.4s, v26.4s}, [x7], x8
    umull   v28.4s, v0.4h, v13.4h

    umull   v26.4s, v2.4h, v12.4h
    mov     v15.16b, v14.16b
    st2     {v15.4s, v16.4s}, [x0], #32
    umull   v24.4s, v0.4h, v12.4h

    ushr    v30.4s, v30.4s, #16
    ushr    v28.4s, v28.4s, #16
    ushr    v26.4s, v26.4s, #16
    ushr    v24.4s, v24.4s, #16

    smlal   v30.4s, v3.4h, v13.4h
    smlal   v28.4s, v1.4h, v13.4h
    smlal   v26.4s, v3.4h, v12.4h
    smlal   v24.4s, v1.4h, v12.4h

    umull   v22.4s, v6.4h, v9.4h
    umull   v20.4s, v4.4h, v9.4h

    add     v28.4s, v28.4s, v26.4s
    sub     v30.4s, v30.4s, v24.4s
    neg     v28.4s, v28.4s

    umull   v18.4s, v6.4h, v8.4h
    umull   v16.4s, v4.4h, v8.4h

    mov     v26.8b, v30.8b
    mov     v27.d[0], v30.d[1]
    ushr    v22.4s, v22.4s, #16

    mov     v24.16b, v28.16b
    mov     v25.d[0], v28.d[1]
    ushr    v20.4s, v20.4s, #16

    mov     v31.8b, v26.8b
    uzp1    v26.4h, v31.4h, v27.4h
    uzp2    v27.4h, v31.4h, v27.4h
    ushr    v18.4s, v18.4s, #16

    mov     v31.8b, v24.8b
    uzp1    v24.4h, v31.4h, v25.4h
    uzp2    v25.4h, v31.4h, v25.4h
    ushr    v16.4s, v16.4s, #16

    smlal   v22.4s, v7.4h, v9.4h
    smlal   v20.4s, v5.4h, v9.4h
    smlal   v18.4s, v7.4h, v8.4h
    smlal   v16.4s, v5.4h, v8.4h

    umull   v0.4s, v26.4h, v10.4h

    umull   v2.4s, v24.4h, v10.4h

    add     v22.4s, v22.4s, v16.4s

    sub     v20.4s, v18.4s, v20.4s

    neg     v22.4s, v22.4s

    mov     v18.16b, v22.16b
    ushr    v0.4s, v0.4s, #16

    mov     v16.16b, v20.16b
    ushr    v2.4s, v2.4s, #16

    mov     v31.16b, v18.16b
    mov     v19.d[0], v31.d[1]
    uzp1    v18.4h, v31.4h, v19.4h
    uzp2    v19.4h, v31.4h, v19.4h
    smlal   v0.4s, v27.4h, v10.4h

    mov     v31.16b, v16.16b
    mov     v17.d[0], v31.d[1]
    uzp1    v16.4h, v31.4h, v17.4h
    uzp2    v17.4h, v31.4h, v17.4h
    smlal   v2.4s, v25.4h, v10.4h

    umull   v4.4s, v18.4h, v10.4h
    umull   v6.4s, v16.4h, v10.4h

    neg     v0.4s, v0.4s
    add     v14.4s, v30.4s, v2.4s
    add     v26.4s, v28.4s, v0.4s

    rev64   v14.4s, v14.4s
    ushr    v4.4s, v4.4s, #16

    swap_regs v14.d[0], v14.d[1]
    ushr    v6.4s, v6.4s, #16

    smlal   v4.4s, v19.4h, v10.4h

    smlal   v6.4s, v17.4h, v10.4h

    add     v24.4s, v20.4s, v4.4s

    rev64   v24.4s, v24.4s
    neg     v16.4s, v6.4s

    swap_regs v24.d[0], v24.d[1]
    add     v16.4s, v22.4s, v16.4s

    mov     v25.16b, v24.16b
    mov     v15.16b, v14.16b
    st2     {v15.4s, v16.4s}, [x0], #32
    st2     {v25.4s, v26.4s}, [x7], x8

    // ---- Tail: the forward side has only 6 words (24 bytes) left --------
    ld4     {v0.4h, v1.4h, v2.4h, v3.4h}, [x5], x8

    // Zero v6/v7 so the lanes not filled by the partial load are defined.
    movi    v6.2s, #0x00000000
    movi    v7.2s, #0x00000000

    ld2     {v4.2s, v5.2s}, [x1], #16
    ld2     {v6.s, v7.s}[0], [x1]

    ld2     {v8.h, v9.h}[0], [x2], x6
    ld2     {v8.h, v9.h}[1], [x2], x6
    ld2     {v8.h, v9.h}[2], [x2], x6
    ld2     {v8.h, v9.h}[3], [x2], x6

    rev64   v12.8h, v8.8h
    rev64   v13.8h, v9.8h
    swap_regs v5.d[0], v6.d[0]

    // Rebuild the ld4-style 16-bit half layout in v4-v7 from the 32-bit
    // loads above.
    mov     v30.8b, v4.8b
    uzp1    v4.4h, v30.4h, v5.4h
    uzp2    v5.4h, v30.4h, v5.4h
    mov     v30.8b, v6.8b
    uzp1    v6.4h, v30.4h, v7.4h
    uzp2    v7.4h, v30.4h, v7.4h
    umull   v30.4s, v2.4h, v13.4h
    umull   v28.4s, v0.4h, v13.4h

    umull   v26.4s, v2.4h, v12.4h
    umull   v24.4s, v0.4h, v12.4h

    ushr    v30.4s, v30.4s, #16
    ushr    v28.4s, v28.4s, #16
    ushr    v26.4s, v26.4s, #16
    ushr    v24.4s, v24.4s, #16

    smlal   v30.4s, v3.4h, v13.4h
    smlal   v28.4s, v1.4h, v13.4h
    smlal   v26.4s, v3.4h, v12.4h
    smlal   v24.4s, v1.4h, v12.4h

    umull   v22.4s, v6.4h, v9.4h
    umull   v20.4s, v4.4h, v9.4h

    add     v28.4s, v28.4s, v26.4s
    sub     v30.4s, v30.4s, v24.4s
    neg     v28.4s, v28.4s

    umull   v18.4s, v6.4h, v8.4h
    umull   v16.4s, v4.4h, v8.4h

    mov     v26.8b, v30.8b
    mov     v27.d[0], v30.d[1]
    ushr    v22.4s, v22.4s, #16

    mov     v24.16b, v28.16b
    mov     v25.d[0], v28.d[1]
    ushr    v20.4s, v20.4s, #16

    mov     v31.8b, v26.8b
    uzp1    v26.4h, v31.4h, v27.4h
    uzp2    v27.4h, v31.4h, v27.4h
    ushr    v18.4s, v18.4s, #16

    mov     v31.8b, v24.8b
    uzp1    v24.4h, v31.4h, v25.4h
    uzp2    v25.4h, v31.4h, v25.4h
    ushr    v16.4s, v16.4s, #16

    smlal   v22.4s, v7.4h, v9.4h
    smlal   v20.4s, v5.4h, v9.4h
    smlal   v18.4s, v7.4h, v8.4h
    smlal   v16.4s, v5.4h, v8.4h

    umull   v0.4s, v26.4h, v10.4h

    umull   v2.4s, v24.4h, v10.4h

    add     v22.4s, v22.4s, v16.4s

    sub     v20.4s, v18.4s, v20.4s

    neg     v22.4s, v22.4s

    mov     v18.8b, v22.8b
    mov     v19.d[0], v22.d[1]
    ushr    v0.4s, v0.4s, #16

    mov     v16.16b, v20.16b
    mov     v17.d[0], v20.d[1]
    ushr    v2.4s, v2.4s, #16

    mov     v31.8b, v18.8b
    uzp1    v18.4h, v31.4h, v19.4h
    uzp2    v19.4h, v31.4h, v19.4h
    smlal   v0.4s, v27.4h, v10.4h

    mov     v31.8b, v16.8b
    uzp1    v16.4h, v31.4h, v17.4h
    uzp2    v17.4h, v31.4h, v17.4h
    smlal   v2.4s, v25.4h, v10.4h

    umull   v4.4s, v18.4h, v10.4h
    umull   v6.4s, v16.4h, v10.4h

    neg     v0.4s, v0.4s
    add     v14.4s, v30.4s, v2.4s
    add     v26.4s, v28.4s, v0.4s

    rev64   v14.4s, v14.4s
    ushr    v4.4s, v4.4s, #16

    swap_regs v14.d[0], v14.d[1]
    ushr    v6.4s, v6.4s, #16

    smlal   v4.4s, v19.4h, v10.4h

    smlal   v6.4s, v17.4h, v10.4h

    add     v24.4s, v20.4s, v4.4s

    rev64   v24.4s, v24.4s
    neg     v16.4s, v6.4s

    swap_regs v24.d[0], v24.d[1]
    add     v16.4s, v22.4s, v16.4s

    // Forward side: store 7 words (two pairs, one pair, one single).
    mov     v15.16b, v14.16b
    st2     {v15.2s, v16.2s}, [x0], #16

    st2     {v15.s, v16.s}[2], [x0], #8

    st1     {v15.s}[3], [x0]

    // Backward side: skip one word, then store 7 words.
    add     x7, x7, #4

    st1     {v26.s}[0], [x7], #4
    mov     v25.16b, v24.16b
    st2     {v25.s, v26.s}[1], [x7], #8
    mov     v27.d[0], v26.d[1]
    mov     v26.d[0], v25.d[1]
    st2     {v26.2s, v27.2s}, [x7]

    pop_v_regs
    ret