///******************************************************************************
// *
// * Copyright (C) 2018 The Android Open Source Project
// *
// * Licensed under the Apache License, Version 2.0 (the "License");
// * you may not use this file except in compliance with the License.
// * You may obtain a copy of the License at:
// *
// * http://www.apache.org/licenses/LICENSE-2.0
// *
// * Unless required by applicable law or agreed to in writing, software
// * distributed under the License is distributed on an "AS IS" BASIS,
// * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// * See the License for the specific language governing permissions and
// * limitations under the License.
// *
// *****************************************************************************
// * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
//*/

// Target: AArch64 (A64 + Advanced SIMD), GNU assembler syntax.

// Byte offset added to the table base pointer (x3) on entry.
.equ TABLE_BYTE_OFFSET, 7500

//------------------------------------------------------------------------------
// push_v_regs / pop_v_regs
// Spill and reload q8-q15, x8-x13, x16-x17 and x20-x23.  Ten pre-indexed
// stores: 4 * 32 + 6 * 16 = 224 bytes, every step a multiple of 16, so sp
// keeps its 16-byte alignment.  pop_v_regs undoes the pushes in exact
// reverse order.
//------------------------------------------------------------------------------
.macro push_v_regs
    stp     q8,  q9,  [sp, #-32]!
    stp     q10, q11, [sp, #-32]!
    stp     q12, q13, [sp, #-32]!
    stp     q14, q15, [sp, #-32]!
    stp     x8,  x9,  [sp, #-16]!
    stp     x10, x11, [sp, #-16]!
    stp     x12, x13, [sp, #-16]!
    stp     x22, x23, [sp, #-16]!
    stp     x16, x17, [sp, #-16]!
    stp     x20, x21, [sp, #-16]!
.endm

.macro pop_v_regs
    ldp     x20, x21, [sp], #16
    ldp     x16, x17, [sp], #16
    ldp     x22, x23, [sp], #16
    ldp     x12, x13, [sp], #16
    ldp     x10, x11, [sp], #16
    ldp     x8,  x9,  [sp], #16
    ldp     q14, q15, [sp], #32
    ldp     q12, q13, [sp], #32
    ldp     q10, q11, [sp], #32
    ldp     q8,  q9,  [sp], #32
.endm

// swp: exchange two general registers through x16 (x16 is clobbered).
// Not invoked anywhere in this file.
.macro swp reg1, reg2
    mov     x16, \reg1
    mov     \reg1, \reg2
    mov     \reg2, x16
.endm

//------------------------------------------------------------------------------
// Assembly-time helpers.  Each one expands to a fixed instruction sequence on
// fixed registers; they only exist to name groups that recur in the routine.
//------------------------------------------------------------------------------

// Lane-load four table entries.  Entry k puts its first halfword in v8.h[k]
// and its second halfword in v9.h[k]; x3 advances by x6 bytes per entry.
.macro tw_load_quad
    ld2     {v8.h, v9.h}[0], [x3], x6
    ld2     {v8.h, v9.h}[1], [x3], x6
    ld2     {v8.h, v9.h}[2], [x3], x6
    ld2     {v8.h, v9.h}[3], [x3], x6
.endm

// Open the first product set: unsigned widening products of the v2 / v4
// lanes with the table halves held in v9 and v8.
.macro tw_mul_first
    umull   v30.4s, v2.4h, v9.4h
    umull   v28.4s, v4.4h, v9.4h
    umull   v26.4s, v2.4h, v8.4h
    umull   v24.4s, v4.4h, v8.4h
.endm

// Same four products as tw_mul_first, interleaved with writing out the
// previous results: {v16, v18} go to [x2] (x2 += 32) and {v20, v22} go to
// [x7] (x7 += x8).  v17 / v21 are the copies ST2 needs as consecutive
// register pairs.
.macro tw_store_and_mul_first
    umull   v30.4s, v2.4h, v9.4h
    mov     v17.16b, v18.16b
    st2     {v16.4s, v17.4s}, [x2]
    add     x2, x2, #32
    umull   v28.4s, v4.4h, v9.4h
    umull   v26.4s, v2.4h, v8.4h
    mov     v21.16b, v22.16b
    st2     {v20.4s, v21.4s}, [x7], x8
    umull   v24.4s, v4.4h, v8.4h
.endm

// Close the first product set: drop the low 16 bits of each unsigned product,
// accumulate the signed products of the v3 / v5 lanes, then combine:
//     v28 = -(v26 + v28)        v30 = v30 - v24
.macro tw_finish_first
    ushr    v30.4s, v30.4s, #16
    ushr    v28.4s, v28.4s, #16
    ushr    v26.4s, v26.4s, #16
    ushr    v24.4s, v24.4s, #16
    smlal   v30.4s, v3.4h, v9.4h
    smlal   v28.4s, v5.4h, v9.4h
    smlal   v26.4s, v3.4h, v8.4h
    smlal   v24.4s, v5.4h, v8.4h
    add     v28.4s, v26.4s, v28.4s
    neg     v28.4s, v28.4s
    sub     v30.4s, v30.4s, v24.4s
.endm

// Second product set with no interleaved loads (uses the lane-reversed table
// halves v10 / v11), followed by the per-lane shift of all four results:
//     v20 = -(v20 + v18) << v14     v22 = (v16 - v22) << v14
//     v18 = v30 << v14              v16 = v28 << v14
// SSHL shifts right when the count in v14 is negative.
.macro tw_second_and_shift
    umull   v22.4s, v0.4h, v11.4h
    umull   v20.4s, v6.4h, v11.4h
    umull   v18.4s, v0.4h, v10.4h
    umull   v16.4s, v6.4h, v10.4h
    ushr    v22.4s, v22.4s, #16
    ushr    v20.4s, v20.4s, #16
    ushr    v18.4s, v18.4s, #16
    ushr    v16.4s, v16.4s, #16
    smlal   v22.4s, v1.4h, v11.4h
    smlal   v20.4s, v7.4h, v11.4h
    smlal   v18.4s, v1.4h, v10.4h
    smlal   v16.4s, v7.4h, v10.4h
    add     v20.4s, v20.4s, v18.4s
    neg     v20.4s, v20.4s
    sub     v22.4s, v16.4s, v22.4s
    sshl    v20.4s, v20.4s, v14.4s
    sshl    v22.4s, v22.4s, v14.4s
    sshl    v18.4s, v30.4s, v14.4s
    sshl    v16.4s, v28.4s, v14.4s
.endm

.text
.global ixheaacd_pretwiddle_compute_armv8

//------------------------------------------------------------------------------
// ixheaacd_pretwiddle_compute_armv8
//
// NOTE(review): presumably the pre-twiddle stage ahead of the decoder's IMDCT,
// judging only by the symbol name -- confirm against the C reference.
//
// Register use as seen in this routine:
//   x0  in   read pointer, walked upwards (32-bit words)
//   x1  in   read pointer, walked downwards (32-bit words)
//   x2  in   write pointer, walked upwards
//   x3  in   table base; TABLE_BYTE_OFFSET is added, then pairs of 16-bit
//            values are read from there
//   x4  in   count; read as a full 64-bit register.  When it equals 0x100 the
//            table is stepped 4 bytes per entry, otherwise 32 bytes
//   w5  in   negated on entry; the negated value is the shift applied to every
//            result (positive = left, otherwise arithmetic right)
//   x6       scratch, then the table stride in bytes
//   x7       second write pointer, starts near x2 + 16 * x4, walked downwards
//   x8       scratch, then the constant -32 used as a post-index step
//   v14      the shift count replicated to four lanes
//
// Returns nothing in registers.  Everything pushed by push_v_regs is restored.
//
// NOTE(review): the vector loop is bottom-tested (subs / b.ne) after the
// counter has been reduced to ((x4 - 1) >> 2) - 2, so it relies on that value
// being at least 1 -- confirm the minimum count callers pass.
//------------------------------------------------------------------------------
ixheaacd_pretwiddle_compute_armv8:

    push_v_regs

    lsl     x7, x4, #4
    add     x7, x2, x7
    sub     x7, x7, #4                  // x7 = x2 + 16 * x4 - 4
    mov     x22, #TABLE_BYTE_OFFSET
    add     x3, x3, x22
    neg     w5, w5                      // w5 = -w5

    // ---- first output pair, scalar ------------------------------------------
.Lscalar_first_pair:
    ldrh    w21, [x3]                   // first halfword of table entry 0
    ldrh    w22, [x3, #2]               // second halfword of table entry 0
    lsl     w22, w22, #16               // place both in the top half of a word
    lsl     w21, w21, #16

    ldr     w8, [x3], #4                // loaded value is never read; x3 += 4
    ldr     w9, [x0], #4

    smull   x12, w9, w21
    asr     x12, x12, #32               // keep the high word of each product
    ldr     w10, [x1], #-4
    smull   x11, w9, w22
    asr     x11, x11, #32
    smull   x23, w10, w22
    asr     x23, x23, #32
    add     w9, w12, w23
    smull   x6, w10, w21
    asr     x6, x6, #32

    neg     w9, w9                      // w9  = -(w12 + w23)
    sub     w11, w11, w6                // w11 = w11 - w6
    cmp     w5, #0
    b.gt    .Lshift_left
    neg     w8, w5                      // non-positive count: shift right by -w5
    asr     w11, w11, w8
    asr     w9, w9, w8
    b       .Lstore_first

.Lshift_left:
    lsl     w11, w11, w5
    lsl     w9, w9, w5

.Lstore_first:
    str     w9, [x2], #4
    str     w11, [x2], #4

    // ---- choose the table stride --------------------------------------------
    cmp     x4, #0x100
    b.ne    .Lstride_32
    mov     x6, #4
    b       .Lstride_done
.Lstride_32:
    mov     x6, #32
    add     x3, x3, #28                 // together with the +4 above: one 32-byte step

.Lstride_done:
    sub     x4, x4, #1
    asr     x4, x4, #2                  // x4 = (x4 - 1) >> 2
    sub     x7, x7, #28

    // ---- vector prologue: first group, plus loads for the second ------------
.Lneon_prologue:
    mov     x8, #-32                    // post-index step for x1 and x7

    dup     v14.4s, w5

    sub     x1, x1, #28

    tw_load_quad

    rev64   v10.4h, v8.4h               // lane-reversed copies of the table halves
    rev64   v11.4h, v9.4h

    ld4     {v0.4h, v1.4h, v2.4h, v3.4h}, [x0], #32
    ld4     {v4.4h, v5.4h, v6.4h, v7.4h}, [x1], x8

    rev64   v0.4h, v0.4h
    rev64   v1.4h, v1.4h
    rev64   v4.4h, v4.4h
    rev64   v5.4h, v5.4h

    tw_mul_first
    tw_finish_first

    umull   v22.4s, v0.4h, v11.4h
    umull   v20.4s, v6.4h, v11.4h
    umull   v18.4s, v0.4h, v10.4h
    umull   v16.4s, v6.4h, v10.4h

    ushr    v22.4s, v22.4s, #16
    ushr    v20.4s, v20.4s, #16
    ushr    v18.4s, v18.4s, #16
    ushr    v16.4s, v16.4s, #16

    // v8 / v9 are free from here on, so the next table entries are fetched
    // between the accumulate steps.
    smlal   v22.4s, v1.4h, v11.4h
    ld2     {v8.h, v9.h}[0], [x3], x6
    smlal   v20.4s, v7.4h, v11.4h
    ld2     {v8.h, v9.h}[1], [x3], x6
    smlal   v18.4s, v1.4h, v10.4h
    ld2     {v8.h, v9.h}[2], [x3], x6
    smlal   v16.4s, v7.4h, v10.4h
    ld2     {v8.h, v9.h}[3], [x3], x6

    add     v20.4s, v20.4s, v18.4s
    neg     v20.4s, v20.4s
    rev64   v10.4h, v8.4h
    rev64   v11.4h, v9.4h
    sub     v22.4s, v16.4s, v22.4s
    ld4     {v0.4h, v1.4h, v2.4h, v3.4h}, [x0], #32

    sshl    v20.4s, v20.4s, v14.4s
    ld4     {v4.4h, v5.4h, v6.4h, v7.4h}, [x1], x8

    rev64   v0.4h, v0.4h
    rev64   v1.4h, v1.4h
    sshl    v22.4s, v22.4s, v14.4s

    rev64   v4.4h, v4.4h
    rev64   v5.4h, v5.4h
    sshl    v18.4s, v30.4s, v14.4s

    sshl    v16.4s, v28.4s, v14.4s

    sub     x4, x4, #2                  // remaining trips through the loop below

    // ---- steady state: store group n-1, compute group n, load group n+1 -----
.Lcore_loop:
    tw_store_and_mul_first
    tw_finish_first

    umull   v22.4s, v0.4h, v11.4h
    ld2     {v8.h, v9.h}[0], [x3], x6
    umull   v20.4s, v6.4h, v11.4h

    umull   v18.4s, v0.4h, v10.4h
    ld2     {v8.h, v9.h}[1], [x3], x6
    umull   v16.4s, v6.4h, v10.4h

    ushr    v22.4s, v22.4s, #16
    ld2     {v8.h, v9.h}[2], [x3], x6
    ushr    v20.4s, v20.4s, #16

    ushr    v18.4s, v18.4s, #16
    ld2     {v8.h, v9.h}[3], [x3], x6
    ushr    v16.4s, v16.4s, #16

    smlal   v22.4s, v1.4h, v11.4h
    smlal   v20.4s, v7.4h, v11.4h
    smlal   v18.4s, v1.4h, v10.4h
    smlal   v16.4s, v7.4h, v10.4h
    ld4     {v0.4h, v1.4h, v2.4h, v3.4h}, [x0], #32
    add     v20.4s, v20.4s, v18.4s

    neg     v20.4s, v20.4s
    rev64   v10.4h, v8.4h
    rev64   v11.4h, v9.4h

    sub     v22.4s, v16.4s, v22.4s
    ld4     {v4.4h, v5.4h, v6.4h, v7.4h}, [x1], x8
    sshl    v20.4s, v20.4s, v14.4s

    sshl    v22.4s, v22.4s, v14.4s

    rev64   v0.4h, v0.4h
    rev64   v1.4h, v1.4h
    sshl    v18.4s, v30.4s, v14.4s

    rev64   v4.4h, v4.4h
    rev64   v5.4h, v5.4h
    sshl    v16.4s, v28.4s, v14.4s

    subs    x4, x4, #1
    b.ne    .Lcore_loop

    // ---- drain the pipeline: store one group, compute and store the last ----
.Lneon_epilogue:
    tw_store_and_mul_first
    tw_finish_first
    tw_second_and_shift

    mov     v17.16b, v18.16b
    st2     {v16.4s, v17.4s}, [x2]
    add     x2, x2, #32
    mov     v21.16b, v22.16b
    st2     {v20.4s, v21.4s}, [x7], x8

    // ---- tail: seven words from each input instead of eight -----------------
.Lresidue:
    mov     x10, #-16                   // NOTE(review): x10 is not read afterwards
    movi    v3.2s, #0                   // lanes that get no load below stay zero
    movi    v4.2s, #0

    // x0 side: four words de-interleaved by LD2, then three single words.
    ld2     {v21.2s, v22.2s}, [x0], #16
    mov     v0.8b, v21.8b
    mov     v2.8b, v22.8b

    ld1     {v1.s}[0], [x0], #4
    ld1     {v3.s}[0], [x0], #4
    ld1     {v1.s}[1], [x0]
    mov     v21.8b, v0.8b

    // Split words into halfword planes, matching the v0..v3 layout LD4 gives.
    uzp1    v0.4h, v21.4h, v1.4h
    uzp2    v1.4h, v21.4h, v1.4h
    mov     v21.8b, v2.8b
    uzp1    v2.4h, v21.4h, v3.4h
    uzp2    v3.4h, v21.4h, v3.4h

    add     x1, x1, #4

    // x1 side: three single words, then four words de-interleaved by LD2.
    ld1     {v6.s}[0], [x1], #4
    ld1     {v4.s}[1], [x1], #4
    ld1     {v6.s}[1], [x1], #4

    ld2     {v21.2s, v22.2s}, [x1], #16
    mov     v5.8b, v21.8b
    mov     v7.8b, v22.8b

    mov     v21.8b, v4.8b
    uzp1    v4.4h, v21.4h, v5.4h
    uzp2    v5.4h, v21.4h, v5.4h
    mov     v21.8b, v6.8b
    uzp1    v6.4h, v21.4h, v7.4h
    uzp2    v7.4h, v21.4h, v7.4h
    rev64   v0.4h, v0.4h
    rev64   v1.4h, v1.4h
    rev64   v4.4h, v4.4h
    rev64   v5.4h, v5.4h

    tw_load_quad

    rev64   v10.4h, v8.4h
    rev64   v11.4h, v9.4h

    tw_mul_first
    tw_finish_first
    tw_second_and_shift

    mov     v21.16b, v22.16b
    st2     {v20.4s, v21.4s}, [x7]
    mov     v17.16b, v18.16b
    st2     {v16.2s, v17.2s}, [x2]      // lanes 0-1 of both results: four words
    add     x2, x2, #16

    st2     {v16.s, v17.s}[2], [x2]     // lane 2 of both results: two more words
    add     x2, x2, #8

.Lreturn:
    pop_v_regs
    ret