/*
 * Copyright (c) 2016 Google Inc.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"
#include "neon.S"

const itxfm4_coeffs, align=4
        .short          11585, 0, 6270, 15137
iadst4_coeffs:
        .short          5283, 15212, 9929, 13377
endconst

const iadst8_coeffs, align=4
        .short          16305, 1606, 14449, 7723, 10394, 12665, 4756, 15679
idct_coeffs:
        .short          11585, 0, 6270, 15137, 3196, 16069, 13623, 9102
        .short          1606, 16305, 12665, 10394, 7723, 14449, 15679, 4756
        .short          804, 16364, 12140, 11003, 7005, 14811, 15426, 5520
        .short          3981, 15893, 14053, 8423, 9760, 13160, 16207, 2404
endconst

const iadst16_coeffs, align=4
        .short          16364, 804, 15893, 3981, 11003, 12140, 8423, 14053
        .short          14811, 7005, 13160, 9760, 5520, 15426, 2404, 16207
endconst

// out1 = ((in1 + in2) * v0[0] + (1 << 13)) >> 14
// out2 = ((in1 - in2) * v0[0] + (1 << 13)) >> 14
// in/out are .8h registers; this can make do with 4 temp registers, but is
// more efficient if 6 temp registers are available.
.macro dmbutterfly0 out1, out2, in1, in2, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, neg=0
.if \neg > 0
        neg             \tmp4\().4h, v0.4h
.endif
        add             \tmp1\().8h, \in1\().8h, \in2\().8h
        sub             \tmp2\().8h, \in1\().8h, \in2\().8h
.if \neg > 0
        smull           \tmp3\().4s, \tmp1\().4h, \tmp4\().h[0]
        smull2          \tmp4\().4s, \tmp1\().8h, \tmp4\().h[0]
.else
        smull           \tmp3\().4s, \tmp1\().4h, v0.h[0]
        smull2          \tmp4\().4s, \tmp1\().8h, v0.h[0]
.endif
.ifb \tmp5
        rshrn           \out1\().4h, \tmp3\().4s, #14
        rshrn2          \out1\().8h, \tmp4\().4s, #14
        smull           \tmp3\().4s, \tmp2\().4h, v0.h[0]
        smull2          \tmp4\().4s, \tmp2\().8h, v0.h[0]
        rshrn           \out2\().4h, \tmp3\().4s, #14
        rshrn2          \out2\().8h, \tmp4\().4s, #14
.else
        smull           \tmp5\().4s, \tmp2\().4h, v0.h[0]
        smull2          \tmp6\().4s, \tmp2\().8h, v0.h[0]
        rshrn           \out1\().4h, \tmp3\().4s, #14
        rshrn2          \out1\().8h, \tmp4\().4s, #14
        rshrn           \out2\().4h, \tmp5\().4s, #14
        rshrn2          \out2\().8h, \tmp6\().4s, #14
.endif
.endm

// Same as dmbutterfly0 above, but treating the input in in2 as zero,
// writing the same output into both out1 and out2.
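// That is, out1 = out2 = (in1 * v0[0] + (1 << 13)) >> 14.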
.macro dmbutterfly0_h out1, out2, in1, in2, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6
        smull           \tmp1\().4s, \in1\().4h, v0.h[0]
        smull2          \tmp2\().4s, \in1\().8h, v0.h[0]
        rshrn           \out1\().4h, \tmp1\().4s, #14
        rshrn2          \out1\().8h, \tmp2\().4s, #14
        rshrn           \out2\().4h, \tmp1\().4s, #14
        rshrn2          \out2\().8h, \tmp2\().4s, #14
.endm

// out1,out2 = in1 * coef1 - in2 * coef2
// out3,out4 = in1 * coef2 + in2 * coef1
// out are 4 x .4s registers, in are 2 x .8h registers
.macro dmbutterfly_l out1, out2, out3, out4, in1, in2, coef1, coef2
        smull           \out1\().4s, \in1\().4h, \coef1
        smull2          \out2\().4s, \in1\().8h, \coef1
        smull           \out3\().4s, \in1\().4h, \coef2
        smull2          \out4\().4s, \in1\().8h, \coef2
        smlsl           \out1\().4s, \in2\().4h, \coef2
        smlsl2          \out2\().4s, \in2\().8h, \coef2
        smlal           \out3\().4s, \in2\().4h, \coef1
        smlal2          \out4\().4s, \in2\().8h, \coef1
.endm

// inout1 = (inout1 * coef1 - inout2 * coef2 + (1 << 13)) >> 14
// inout2 = (inout1 * coef2 + inout2 * coef1 + (1 << 13)) >> 14
// inout are 2 x .8h registers
.macro dmbutterfly inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4, neg=0
        dmbutterfly_l   \tmp1, \tmp2, \tmp3, \tmp4, \inout1, \inout2, \coef1, \coef2
.if \neg > 0
        neg             \tmp3\().4s, \tmp3\().4s
        neg             \tmp4\().4s, \tmp4\().4s
.endif
        rshrn           \inout1\().4h, \tmp1\().4s, #14
        rshrn2          \inout1\().8h, \tmp2\().4s, #14
        rshrn           \inout2\().4h, \tmp3\().4s, #14
        rshrn2          \inout2\().8h, \tmp4\().4s, #14
.endm

// Same as dmbutterfly above, but treating the input in inout2 as zero
.macro dmbutterfly_h1 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4
        smull           \tmp1\().4s, \inout1\().4h, \coef1
        smull2          \tmp2\().4s, \inout1\().8h, \coef1
        smull           \tmp3\().4s, \inout1\().4h, \coef2
        smull2          \tmp4\().4s, \inout1\().8h, \coef2
        rshrn           \inout1\().4h, \tmp1\().4s, #14
        rshrn2          \inout1\().8h, \tmp2\().4s, #14
        rshrn           \inout2\().4h, \tmp3\().4s, #14
        rshrn2          \inout2\().8h, \tmp4\().4s, #14
.endm

// Same as dmbutterfly above, but treating the input in inout1 as zero
.macro dmbutterfly_h2 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4
        smull           \tmp1\().4s, \inout2\().4h, \coef2
        smull2          \tmp2\().4s, \inout2\().8h, \coef2
        smull           \tmp3\().4s, \inout2\().4h, \coef1
        smull2          \tmp4\().4s, \inout2\().8h, \coef1
        neg             \tmp1\().4s, \tmp1\().4s
        neg             \tmp2\().4s, \tmp2\().4s
        rshrn           \inout2\().4h, \tmp3\().4s, #14
        rshrn2          \inout2\().8h, \tmp4\().4s, #14
        rshrn           \inout1\().4h, \tmp1\().4s, #14
        rshrn2          \inout1\().8h, \tmp2\().4s, #14
.endm

.macro dsmull_h out1, out2, in, coef
        smull           \out1\().4s, \in\().4h, \coef
        smull2          \out2\().4s, \in\().8h, \coef
.endm

.macro drshrn_h out, in1, in2, shift
        rshrn           \out\().4h, \in1\().4s, \shift
        rshrn2          \out\().8h, \in2\().4s, \shift
.endm


// out1 = in1 + in2
// out2 = in1 - in2
.macro butterfly_8h out1, out2, in1, in2
        add             \out1\().8h, \in1\().8h, \in2\().8h
        sub             \out2\().8h, \in1\().8h, \in2\().8h
.endm

// out1 = in1 - in2
// out2 = in1 + in2
.macro butterfly_8h_r out1, out2, in1, in2
        sub             \out1\().8h, \in1\().8h, \in2\().8h
        add             \out2\().8h, \in1\().8h, \in2\().8h
.endm

// out1 = (in1,in2 + in3,in4 + (1 << 13)) >> 14
// out2 = (in1,in2 - in3,in4 + (1 << 13)) >> 14
// out are 2 x .8h registers, in are 4 x .4s registers
.macro dbutterfly_n out1, out2, in1, in2, in3, in4, tmp1, tmp2, tmp3, tmp4
        add             \tmp1\().4s, \in1\().4s, \in3\().4s
        add             \tmp2\().4s, \in2\().4s, \in4\().4s
        sub             \tmp3\().4s, \in1\().4s, \in3\().4s
        sub             \tmp4\().4s, \in2\().4s, \in4\().4s
        rshrn           \out1\().4h, \tmp1\().4s, #14
        rshrn2          \out1\().8h, \tmp2\().4s, #14
        rshrn           \out2\().4h, \tmp3\().4s, #14
        rshrn2          \out2\().8h, \tmp4\().4s, #14
.endm

.macro iwht4 c0, c1, c2, c3
        add             \c0\().4h, \c0\().4h, \c1\().4h
        sub             v17.4h, \c2\().4h, \c3\().4h
        sub             v16.4h, \c0\().4h, v17.4h
        sshr            v16.4h, v16.4h, #1
        sub             \c2\().4h, v16.4h, \c1\().4h
        sub             \c1\().4h, v16.4h, \c3\().4h
        add             \c3\().4h, v17.4h, \c2\().4h
        sub             \c0\().4h, \c0\().4h, \c1\().4h
.endm

.macro idct4 c0, c1, c2, c3
        smull           v22.4s, \c1\().4h, v0.h[3]
        smull           v20.4s, \c1\().4h, v0.h[2]
        add             v16.4h, \c0\().4h, \c2\().4h
        sub             v17.4h, \c0\().4h, \c2\().4h
        smlal           v22.4s, \c3\().4h, v0.h[2]
        smull           v18.4s, v16.4h, v0.h[0]
        smull           v19.4s, v17.4h, v0.h[0]
        smlsl           v20.4s, \c3\().4h, v0.h[3]
        rshrn           v22.4h, v22.4s, #14
        rshrn           v18.4h, v18.4s, #14
        rshrn           v19.4h, v19.4s, #14
        rshrn           v20.4h, v20.4s, #14
        add             \c0\().4h, v18.4h, v22.4h
        sub             \c3\().4h, v18.4h, v22.4h
        add             \c1\().4h, v19.4h, v20.4h
        sub             \c2\().4h, v19.4h, v20.4h
.endm

.macro iadst4 c0, c1, c2, c3
        smull           v16.4s, \c0\().4h, v0.h[4]
        smlal           v16.4s, \c2\().4h, v0.h[5]
        smlal           v16.4s, \c3\().4h, v0.h[6]
        smull           v17.4s, \c0\().4h, v0.h[6]
        smlsl           v17.4s, \c2\().4h, v0.h[4]
        sub             \c0\().4h, \c0\().4h, \c2\().4h
        smlsl           v17.4s, \c3\().4h, v0.h[5]
        add             \c0\().4h, \c0\().4h, \c3\().4h
        smull           v19.4s, \c1\().4h, v0.h[7]
        smull           v18.4s, \c0\().4h, v0.h[7]
        add             v20.4s, v16.4s, v19.4s
        add             v21.4s, v17.4s, v19.4s
        rshrn           \c0\().4h, v20.4s, #14
        add             v16.4s, v16.4s, v17.4s
        rshrn           \c1\().4h, v21.4s, #14
        sub             v16.4s, v16.4s, v19.4s
        rshrn           \c2\().4h, v18.4s, #14
        rshrn           \c3\().4h, v16.4s, #14
.endm

// The public functions in this file have the following signature:
// void itxfm_add(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
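// Here eob is the number of non-zero coefficients in block; eob == 1 means
// only the DC coefficient is present, which the idct_idct variants below
// special-case, and the 16x16/32x32 functions additionally use small eob
// thresholds to select reduced "quarter"/"half" transforms.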

.macro itxfm_func4x4 txfm1, txfm2
function ff_vp9_\txfm1\()_\txfm2\()_4x4_add_neon, export=1
.ifc \txfm1,\txfm2
.ifc \txfm1,idct
        movrel          x4, itxfm4_coeffs
        ld1             {v0.4h}, [x4]
.endif
.ifc \txfm1,iadst
        movrel          x4, iadst4_coeffs
        ld1             {v0.d}[1], [x4]
.endif
.else
        movrel          x4, itxfm4_coeffs
        ld1             {v0.8h}, [x4]
.endif

        movi            v31.8h, #0
.ifc \txfm1\()_\txfm2,idct_idct
        cmp             w3, #1
        b.ne            1f
        // DC-only for idct/idct
        ld1             {v2.h}[0], [x2]
        smull           v2.4s, v2.4h, v0.h[0]
        rshrn           v2.4h, v2.4s, #14
        smull           v2.4s, v2.4h, v0.h[0]
        rshrn           v2.4h, v2.4s, #14
        st1             {v31.h}[0], [x2]
        dup             v4.4h, v2.h[0]
        mov             v5.16b, v4.16b
        mov             v6.16b, v4.16b
        mov             v7.16b, v4.16b
        b               2f
.endif

1:
        ld1             {v4.4h,v5.4h,v6.4h,v7.4h}, [x2]
        st1             {v31.8h}, [x2], #16

.ifc \txfm1,iwht
        sshr            v4.4h, v4.4h, #2
        sshr            v5.4h, v5.4h, #2
        sshr            v6.4h, v6.4h, #2
        sshr            v7.4h, v7.4h, #2
.endif

        \txfm1\()4      v4, v5, v6, v7

        st1             {v31.8h}, [x2], #16
        // Transpose 4x4 with 16 bit elements
        transpose_4x4H  v4, v5, v6, v7, v16, v17, v18, v19

        \txfm2\()4      v4, v5, v6, v7
2:
        ld1             {v0.s}[0], [x0], x1
        ld1             {v1.s}[0], [x0], x1
.ifnc \txfm1,iwht
        srshr           v4.4h, v4.4h, #4
        srshr           v5.4h, v5.4h, #4
        srshr           v6.4h, v6.4h, #4
        srshr           v7.4h, v7.4h, #4
.endif
        uaddw           v4.8h, v4.8h, v0.8b
        uaddw           v5.8h, v5.8h, v1.8b
        ld1             {v2.s}[0], [x0], x1
        ld1             {v3.s}[0], [x0], x1
        sqxtun          v0.8b, v4.8h
        sqxtun          v1.8b, v5.8h
        sub             x0, x0, x1, lsl #2

        uaddw           v6.8h, v6.8h, v2.8b
        uaddw           v7.8h, v7.8h, v3.8b
        st1             {v0.s}[0], [x0], x1
        sqxtun          v2.8b, v6.8h
        sqxtun          v3.8b, v7.8h

        st1             {v1.s}[0], [x0], x1
        st1             {v2.s}[0], [x0], x1
        st1             {v3.s}[0], [x0], x1

        ret
endfunc
.endm

itxfm_func4x4 idct, idct
itxfm_func4x4 iadst, idct
itxfm_func4x4 idct, iadst
itxfm_func4x4 iadst, iadst
itxfm_func4x4 iwht, iwht


.macro idct8
        dmbutterfly0    v16, v20, v16, v20, v2, v3, v4, v5, v6, v7 // v16 = t0a, v20 = t1a
        dmbutterfly     v18, v22, v0.h[2], v0.h[3], v2, v3, v4, v5 // v18 = t2a, v22 = t3a
        dmbutterfly     v17, v23, v0.h[4], v0.h[5], v2, v3, v4, v5 // v17 = t4a, v23 = t7a
        dmbutterfly     v21, v19, v0.h[6], v0.h[7], v2, v3, v4, v5 // v21 = t5a, v19 = t6a

        butterfly_8h    v24, v25, v16, v22 // v24 = t0, v25 = t3
        butterfly_8h    v28, v29, v17, v21 // v28 = t4, v29 = t5a
        butterfly_8h    v30, v31, v23, v19 // v30 = t7, v31 = t6a
        butterfly_8h    v26, v27, v20, v18 // v26 = t1, v27 = t2

        dmbutterfly0    v31, v29, v31, v29, v2, v3, v4, v5, v6, v7 // v31 = t6, v29 = t5

        butterfly_8h    v16, v23, v24, v30 // v16 = out[0], v23 = out[7]
        butterfly_8h    v17, v22, v26, v31 // v17 = out[1], v22 = out[6]
        butterfly_8h    v18, v21, v27, v29 // v18 = out[2], v21 = out[5]
        butterfly_8h    v19, v20, v25, v28 // v19 = out[3], v20 = out[4]
.endm

.macro iadst8
        dmbutterfly_l   v24, v25, v26, v27, v23, v16, v1.h[1], v1.h[0] // v24,v25 = t1a, v26,v27 = t0a
        dmbutterfly_l   v28, v29, v30, v31, v21, v18, v1.h[3], v1.h[2] // v28,v29 = t3a, v30,v31 = t2a
        dmbutterfly_l   v2, v3, v4, v5, v19, v20, v1.h[5], v1.h[4] // v2,v3 = t5a, v4,v5 = t4a
        dmbutterfly_l   v16, v18, v21, v23, v17, v22, v1.h[7], v1.h[6] // v16,v18 = t7a, v21,v23 = t6a

        dbutterfly_n    v4, v5, v26, v27, v4, v5, v6, v7, v26, v27 // v4 = t0, v5 = t4
        dbutterfly_n    v2, v3, v24, v25, v2, v3, v6, v7, v26, v27 // v2 = t1, v3 = t5
        dbutterfly_n    v24, v25, v30, v31, v21, v23, v6, v7, v26, v27 // v24 = t2, v25 = t6
        dbutterfly_n    v30, v31, v28, v29, v16, v18, v6, v7, v26, v27 // v30 = t3, v31 = t7

        butterfly_8h    v16, v6, v4, v24 // v16 = out[0], v6 = t2
        butterfly_8h    v23, v7, v2, v30 // v23 = -out[7], v7 = t3
        neg             v23.8h, v23.8h // v23 = out[7]

        dmbutterfly0    v19, v20, v6, v7, v24, v26, v27, v28, v29, v30 // v19 = -out[3], v20 = out[4]
        neg             v19.8h, v19.8h // v19 = out[3]

        dmbutterfly_l   v26, v27, v28, v29, v5, v3, v0.h[2], v0.h[3] // v26,v27 = t5a, v28,v29 = t4a
        dmbutterfly_l   v2, v3, v4, v5, v31, v25, v0.h[3], v0.h[2] // v2,v3 = t6a, v4,v5 = t7a

        dbutterfly_n    v17, v30, v28, v29, v2, v3, v6, v7, v24, v25 // v17 = -out[1], v30 = t6
        dbutterfly_n    v22, v31, v26, v27, v4, v5, v6, v7, v24, v25 // v22 = out[6], v31 = t7
        neg             v17.8h, v17.8h // v17 = out[1]

        dmbutterfly0    v18, v21, v30, v31, v2, v3, v4, v5, v6, v7 // v18 = out[2], v21 = -out[5]
        neg             v21.8h, v21.8h // v21 = out[5]
.endm


.macro itxfm_func8x8 txfm1, txfm2
function ff_vp9_\txfm1\()_\txfm2\()_8x8_add_neon, export=1
        // The iadst also uses a few coefficients from
        // idct, so those always need to be loaded.
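        // (idct_coeffs is laid out directly after iadst8_coeffs above, so
        // the final ld1 below ends up with the first eight idct coefficients
        // in v0 in either case.)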
.ifc \txfm1\()_\txfm2,idct_idct
        movrel          x4, idct_coeffs
.else
        movrel          x4, iadst8_coeffs
        ld1             {v1.8h}, [x4], #16
.endif
        ld1             {v0.8h}, [x4]

        movi            v2.8h, #0
        movi            v3.8h, #0
        movi            v4.8h, #0
        movi            v5.8h, #0

.ifc \txfm1\()_\txfm2,idct_idct
        cmp             w3, #1
        b.ne            1f
        // DC-only for idct/idct
        ld1             {v2.h}[0], [x2]
        smull           v2.4s, v2.4h, v0.h[0]
        rshrn           v2.4h, v2.4s, #14
        smull           v2.4s, v2.4h, v0.h[0]
        rshrn           v2.4h, v2.4s, #14
        st1             {v3.h}[0], [x2]
        dup             v16.8h, v2.h[0]
        mov             v17.16b, v16.16b
        mov             v18.16b, v16.16b
        mov             v19.16b, v16.16b
        mov             v20.16b, v16.16b
        mov             v21.16b, v16.16b
        mov             v22.16b, v16.16b
        mov             v23.16b, v16.16b
        b               2f
.endif
1:
        ld1             {v16.8h,v17.8h,v18.8h,v19.8h}, [x2], #64
        ld1             {v20.8h,v21.8h,v22.8h,v23.8h}, [x2], #64
        sub             x2, x2, #128
        st1             {v2.8h,v3.8h,v4.8h,v5.8h}, [x2], #64
        st1             {v2.8h,v3.8h,v4.8h,v5.8h}, [x2], #64

        \txfm1\()8

        // Transpose 8x8 with 16 bit elements
        transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v24, v25

        \txfm2\()8
2:
        mov             x3, x0
        // Add into the destination
        ld1             {v0.8b}, [x0], x1
        srshr           v16.8h, v16.8h, #5
        ld1             {v1.8b}, [x0], x1
        srshr           v17.8h, v17.8h, #5
        ld1             {v2.8b}, [x0], x1
        srshr           v18.8h, v18.8h, #5
        uaddw           v16.8h, v16.8h, v0.8b
        ld1             {v3.8b}, [x0], x1
        srshr           v19.8h, v19.8h, #5
        uaddw           v17.8h, v17.8h, v1.8b
        ld1             {v4.8b}, [x0], x1
        srshr           v20.8h, v20.8h, #5
        uaddw           v18.8h, v18.8h, v2.8b
        sqxtun          v0.8b, v16.8h
        ld1             {v5.8b}, [x0], x1
        srshr           v21.8h, v21.8h, #5
        uaddw           v19.8h, v19.8h, v3.8b
        sqxtun          v1.8b, v17.8h
        ld1             {v6.8b}, [x0], x1
        srshr           v22.8h, v22.8h, #5
        uaddw           v20.8h, v20.8h, v4.8b
        sqxtun          v2.8b, v18.8h
        ld1             {v7.8b}, [x0], x1
        srshr           v23.8h, v23.8h, #5
        uaddw           v21.8h, v21.8h, v5.8b
        sqxtun          v3.8b, v19.8h

        st1             {v0.8b}, [x3], x1
        uaddw           v22.8h, v22.8h, v6.8b
        st1             {v1.8b}, [x3], x1
        sqxtun          v4.8b, v20.8h
        st1             {v2.8b}, [x3], x1
        uaddw           v23.8h, v23.8h, v7.8b
        st1             {v3.8b}, [x3], x1
        sqxtun          v5.8b, v21.8h
        st1             {v4.8b}, [x3], x1
        sqxtun          v6.8b, v22.8h
        st1             {v5.8b}, [x3], x1
        sqxtun          v7.8b, v23.8h

        st1             {v6.8b}, [x3], x1
        st1             {v7.8b}, [x3], x1

        ret
endfunc
.endm

itxfm_func8x8 idct, idct
itxfm_func8x8 iadst, idct
itxfm_func8x8 idct, iadst
itxfm_func8x8 iadst, iadst


function idct16x16_dc_add_neon
        movrel          x4, idct_coeffs
        ld1             {v0.4h}, [x4]

        movi            v1.4h, #0

        ld1             {v2.h}[0], [x2]
        smull           v2.4s, v2.4h, v0.h[0]
        rshrn           v2.4h, v2.4s, #14
        smull           v2.4s, v2.4h, v0.h[0]
        rshrn           v2.4h, v2.4s, #14
        dup             v2.8h, v2.h[0]
        st1             {v1.h}[0], [x2]

        srshr           v2.8h, v2.8h, #6

        mov             x3, x0
        mov             x4, #16
1:
        // Loop to add the constant from v2 into all 16x16 outputs
        subs            x4, x4, #2
        ld1             {v3.16b}, [x0], x1
        ld1             {v4.16b}, [x0], x1
        uaddw           v16.8h, v2.8h, v3.8b
        uaddw2          v17.8h, v2.8h, v3.16b
        uaddw           v18.8h, v2.8h, v4.8b
        uaddw2          v19.8h, v2.8h, v4.16b
        sqxtun          v3.8b, v16.8h
        sqxtun2         v3.16b, v17.8h
        sqxtun          v4.8b, v18.8h
        sqxtun2         v4.16b, v19.8h
        st1             {v3.16b}, [x3], x1
        st1             {v4.16b}, [x3], x1
        b.ne            1b

        ret
endfunc

.macro idct16_end
        butterfly_8h    v18, v7, v4, v7 // v18 = t0a, v7 = t7a
        butterfly_8h    v19, v22, v5, v22 // v19 = t1a, v22 = t6
        butterfly_8h    v4, v26, v20, v26 // v4 = t2a, v26 = t5
        butterfly_8h    v5, v6, v28, v6 // v5 = t3a, v6 = t4
        butterfly_8h    v20, v28, v16, v24 // v20 = t8a, v28 = t11a
        butterfly_8h    v24, v21, v23, v21 // v24 = t9, v21 = t10
        butterfly_8h    v23, v27, v25, v27 // v23 = t14, v27 = t13
        butterfly_8h    v25, v29, v29, v17 // v25 = t15a, v29 = t12a

        dmbutterfly0    v2, v3, v27, v21, v2, v3, v16, v17, v30, v31 // v2 = t13a, v3 = t10a
        dmbutterfly0    v28, v27, v29, v28, v21, v29, v16, v17, v30, v31 // v28 = t12, v27 = t11

        butterfly_8h    v16, v31, v18, v25 // v16 = out[0], v31 = out[15]
        butterfly_8h    v17, v30, v19, v23 // v17 = out[1], v30 = out[14]
        butterfly_8h_r  v25, v22, v22, v24 // v25 = out[9], v22 = out[6]
        butterfly_8h    v23, v24, v7, v20 // v23 = out[7], v24 = out[8]
        butterfly_8h    v18, v29, v4, v2 // v18 = out[2], v29 = out[13]
        butterfly_8h    v19, v28, v5, v28 // v19 = out[3], v28 = out[12]
        butterfly_8h    v20, v27, v6, v27 // v20 = out[4], v27 = out[11]
        butterfly_8h    v21, v26, v26, v3 // v21 = out[5], v26 = out[10]
        ret
.endm

function idct16
        dmbutterfly0    v16, v24, v16, v24, v2, v3, v4, v5, v6, v7 // v16 = t0a, v24 = t1a
        dmbutterfly     v20, v28, v0.h[2], v0.h[3], v2, v3, v4, v5 // v20 = t2a, v28 = t3a
        dmbutterfly     v18, v30, v0.h[4], v0.h[5], v2, v3, v4, v5 // v18 = t4a, v30 = t7a
        dmbutterfly     v26, v22, v0.h[6], v0.h[7], v2, v3, v4, v5 // v26 = t5a, v22 = t6a
        dmbutterfly     v17, v31, v1.h[0], v1.h[1], v2, v3, v4, v5 // v17 = t8a, v31 = t15a
        dmbutterfly     v25, v23, v1.h[2], v1.h[3], v2, v3, v4, v5 // v25 = t9a, v23 = t14a
        dmbutterfly     v21, v27, v1.h[4], v1.h[5], v2, v3, v4, v5 // v21 = t10a, v27 = t13a
        dmbutterfly     v29, v19, v1.h[6], v1.h[7], v2, v3, v4, v5 // v29 = t11a, v19 = t12a

        butterfly_8h    v4, v28, v16, v28 // v4 = t0, v28 = t3
        butterfly_8h    v5, v20, v24, v20 // v5 = t1, v20 = t2
        butterfly_8h    v6, v26, v18, v26 // v6 = t4, v26 = t5
        butterfly_8h    v7, v22, v30, v22 // v7 = t7, v22 = t6
        butterfly_8h    v16, v25, v17, v25 // v16 = t8, v25 = t9
        butterfly_8h    v24, v21, v29, v21 // v24 = t11, v21 = t10
        butterfly_8h    v17, v27, v19, v27 // v17 = t12, v27 = t13
        butterfly_8h    v29, v23, v31, v23 // v29 = t15, v23 = t14

        dmbutterfly0    v22, v26, v22, v26, v2, v3, v18, v19, v30, v31 // v22 = t6a, v26 = t5a
        dmbutterfly     v23, v25, v0.h[2], v0.h[3], v18, v19, v30, v31 // v23 = t9a, v25 = t14a
        dmbutterfly     v27, v21, v0.h[2], v0.h[3], v18, v19, v30, v31, neg=1 // v27 = t13a, v21 = t10a
        idct16_end
endfunc

function idct16_half
        dmbutterfly0_h  v16, v24, v16, v24, v2, v3, v4, v5, v6, v7 // v16 = t0a, v24 = t1a
        dmbutterfly_h1  v20, v28, v0.h[2], v0.h[3], v2, v3, v4, v5 // v20 = t2a, v28 = t3a
        dmbutterfly_h1  v18, v30, v0.h[4], v0.h[5], v2, v3, v4, v5 // v18 = t4a, v30 = t7a
        dmbutterfly_h2  v26, v22, v0.h[6], v0.h[7], v2, v3, v4, v5 // v26 = t5a, v22 = t6a
        dmbutterfly_h1  v17, v31, v1.h[0], v1.h[1], v2, v3, v4, v5 // v17 = t8a, v31 = t15a
        dmbutterfly_h2  v25, v23, v1.h[2], v1.h[3], v2, v3, v4, v5 // v25 = t9a, v23 = t14a
        dmbutterfly_h1  v21, v27, v1.h[4], v1.h[5], v2, v3, v4, v5 // v21 = t10a, v27 = t13a
        dmbutterfly_h2  v29, v19, v1.h[6], v1.h[7], v2, v3, v4, v5 // v29 = t11a, v19 = t12a

        butterfly_8h    v4, v28, v16, v28 // v4 = t0, v28 = t3
        butterfly_8h    v5, v20, v24, v20 // v5 = t1, v20 = t2
        butterfly_8h    v6, v26, v18, v26 // v6 = t4, v26 = t5
        butterfly_8h    v7, v22, v30, v22 // v7 = t7, v22 = t6
        butterfly_8h    v16, v25, v17, v25 // v16 = t8, v25 = t9
        butterfly_8h    v24, v21, v29, v21 // v24 = t11, v21 = t10
        butterfly_8h    v17, v27, v19, v27 // v17 = t12, v27 = t13
        butterfly_8h    v29, v23, v31, v23 // v29 = t15, v23 = t14

        dmbutterfly0    v22, v26, v22, v26, v2, v3, v18, v19, v30, v31 // v22 = t6a, v26 = t5a
        dmbutterfly     v23, v25, v0.h[2], v0.h[3], v18, v19, v30, v31 // v23 = t9a, v25 = t14a
        dmbutterfly     v27, v21, v0.h[2], v0.h[3], v18, v19, v30, v31, neg=1 // v27 = t13a, v21 = t10a
        idct16_end
endfunc

function idct16_quarter
        dsmull_h        v24, v25, v19, v1.h[7]
        dsmull_h        v4, v5, v17, v1.h[0]
        dsmull_h        v7, v6, v18, v0.h[5]
        dsmull_h        v30, v31, v18, v0.h[4]
        neg             v24.4s, v24.4s
        neg             v25.4s, v25.4s
        dsmull_h        v29, v28, v17, v1.h[1]
        dsmull_h        v26, v27, v19, v1.h[6]
        dsmull_h        v22, v23, v16, v0.h[0]
        drshrn_h        v24, v24, v25, #14
        drshrn_h        v16, v4, v5, #14
        drshrn_h        v7, v7, v6, #14
        drshrn_h        v6, v30, v31, #14
        drshrn_h        v29, v29, v28, #14
        drshrn_h        v17, v26, v27, #14
        drshrn_h        v28, v22, v23, #14

        dmbutterfly_l   v20, v21, v22, v23, v17, v24, v0.h[2], v0.h[3]
        dmbutterfly_l   v18, v19, v30, v31, v29, v16, v0.h[2], v0.h[3]
        neg             v22.4s, v22.4s
        neg             v23.4s, v23.4s
        drshrn_h        v27, v20, v21, #14
        drshrn_h        v21, v22, v23, #14
        drshrn_h        v23, v18, v19, #14
        drshrn_h        v25, v30, v31, #14
        mov             v4.16b, v28.16b
        mov             v5.16b, v28.16b
        dmbutterfly0    v22, v26, v7, v6, v18, v19, v30, v31
        mov             v20.16b, v28.16b
        idct16_end
endfunc

function iadst16
        ld1             {v0.8h,v1.8h}, [x11]

        dmbutterfly_l   v6, v7, v4, v5, v31, v16, v0.h[1], v0.h[0] // v6,v7 = t1, v4,v5 = t0
        dmbutterfly_l   v10, v11, v8, v9, v23, v24, v0.h[5], v0.h[4] // v10,v11 = t9, v8,v9 = t8
        dbutterfly_n    v31, v24, v6, v7, v10, v11, v12, v13, v10, v11 // v31 = t1a, v24 = t9a
        dmbutterfly_l   v14, v15, v12, v13, v29, v18, v0.h[3], v0.h[2] // v14,v15 = t3, v12,v13 = t2
        dbutterfly_n    v16, v23, v4, v5, v8, v9, v6, v7, v8, v9 // v16 = t0a, v23 = t8a

        dmbutterfly_l   v6, v7, v4, v5, v21, v26, v0.h[7], v0.h[6] // v6,v7 = t11, v4,v5 = t10
        dbutterfly_n    v29, v26, v14, v15, v6, v7, v8, v9, v6, v7 // v29 = t3a, v26 = t11a
        dmbutterfly_l   v10, v11, v8, v9, v27, v20, v1.h[1], v1.h[0] // v10,v11 = t5, v8,v9 = t4
        dbutterfly_n    v18, v21, v12, v13, v4, v5, v6, v7, v4, v5 // v18 = t2a, v21 = t10a

        dmbutterfly_l   v14, v15, v12, v13, v19, v28, v1.h[5], v1.h[4] // v14,v15 = t13, v12,v13 = t12
        dbutterfly_n    v20, v28, v10, v11, v14, v15, v4, v5, v14, v15 // v20 = t5a, v28 = t13a
        dmbutterfly_l   v6, v7, v4, v5, v25, v22, v1.h[3], v1.h[2] // v6,v7 = t7, v4,v5 = t6
        dbutterfly_n    v27, v19, v8, v9, v12, v13, v10, v11, v12, v13 // v27 = t4a, v19 = t12a

        dmbutterfly_l   v10, v11, v8, v9, v17, v30, v1.h[7], v1.h[6] // v10,v11 = t15, v8,v9 = t14
        ld1             {v0.8h}, [x10]
        dbutterfly_n    v22, v30, v6, v7, v10, v11, v12, v13, v10, v11 // v22 = t7a, v30 = t15a
        dmbutterfly_l   v14, v15, v12, v13, v23, v24, v0.h[4], v0.h[5] // v14,v15 = t9, v12,v13 = t8
        dbutterfly_n    v25, v17, v4, v5, v8, v9, v6, v7, v8, v9 // v25 = t6a, v17 = t14a

        dmbutterfly_l   v4, v5, v6, v7, v28, v19, v0.h[5], v0.h[4] // v4,v5 = t12, v6,v7 = t13
        dbutterfly_n    v23, v19, v12, v13, v4, v5, v8, v9, v4, v5 // v23 = t8a, v19 = t12a
        dmbutterfly_l   v10, v11, v8, v9, v21, v26, v0.h[6], v0.h[7] // v10,v11 = t11, v8,v9 = t10
        butterfly_8h_r  v4, v27, v16, v27 // v4 = t4, v27 = t0
        dbutterfly_n    v24, v28, v14, v15, v6, v7, v12, v13, v6, v7 // v24 = t9a, v28 = t13a

        dmbutterfly_l   v12, v13, v14, v15, v30, v17, v0.h[7], v0.h[6] // v12,v13 = t14, v14,v15 = t15
        butterfly_8h_r  v5, v20, v31, v20 // v5 = t5, v20 = t1
        dbutterfly_n    v21, v17, v8, v9, v12, v13, v6, v7, v12, v13 // v21 = t10a, v17 = t14a
        dbutterfly_n    v26, v30, v10, v11, v14, v15, v8, v9, v14, v15 // v26 = t11a, v30 = t15a

        butterfly_8h_r  v6, v25, v18, v25 // v6 = t6, v25 = t2
        butterfly_8h_r  v7, v22, v29, v22 // v7 = t7, v22 = t3

        dmbutterfly_l   v10, v11, v8, v9, v19, v28, v0.h[2], v0.h[3] // v10,v11 = t13, v8,v9 = t12
        dmbutterfly_l   v12, v13, v14, v15, v30, v17, v0.h[3], v0.h[2] // v12,v13 = t14, v14,v15 = t15

        dbutterfly_n    v18, v30, v8, v9, v12, v13, v16, v17, v12, v13 // v18 = out[2], v30 = t14a
        dbutterfly_n    v29, v17, v10, v11, v14, v15, v12, v13, v14, v15 // v29 = -out[13], v17 = t15a
        neg             v29.8h, v29.8h // v29 = out[13]

        dmbutterfly_l   v10, v11, v8, v9, v4, v5, v0.h[2], v0.h[3] // v10,v11 = t5a, v8,v9 = t4a
        dmbutterfly_l   v12, v13, v14, v15, v7, v6, v0.h[3], v0.h[2] // v12,v13 = t6a, v14,v15 = t7a

        butterfly_8h    v2, v6, v27, v25 // v2 = out[0], v6 = t2a
        butterfly_8h    v3, v7, v23, v21 // v3 = -out[1], v7 = t10

        dbutterfly_n    v19, v31, v8, v9, v12, v13, v4, v5, v8, v9 // v19 = -out[3], v31 = t6
        neg             v19.8h, v19.8h // v19 = out[3]
        dbutterfly_n    v28, v16, v10, v11, v14, v15, v4, v5, v10, v11 // v28 = out[12], v16 = t7

        butterfly_8h    v5, v8, v20, v22 // v5 = -out[15], v8 = t3a
        butterfly_8h    v4, v9, v24, v26 // v4 = out[14], v9 = t11

        dmbutterfly0    v23, v24, v6, v8, v10, v11, v12, v13, v14, v15, 1 // v23 = out[7], v24 = out[8]
        dmbutterfly0    v21, v26, v30, v17, v10, v11, v12, v13, v14, v15, 1 // v21 = out[5], v26 = out[10]
        dmbutterfly0    v20, v27, v16, v31, v10, v11, v12, v13, v14, v15 // v20 = out[4], v27 = out[11]
        dmbutterfly0    v22, v25, v9, v7, v10, v11, v12, v13, v14, v15 // v22 = out[6], v25 = out[9]

        neg             v31.8h, v5.8h // v31 = out[15]
        neg             v17.8h, v3.8h // v17 = out[1]

        mov             v16.16b, v2.16b
        mov             v30.16b, v4.16b
        ret
endfunc

// Helper macros; we can't use these expressions directly within
// e.g. .irp due to the extra concatenation \(). Therefore wrap
// them in macros to allow using .irp below.
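// As a usage sketch: "load 16, x2, x9" expands to "ld1 {v16.8h}, [x2], x9",
// and load_clear additionally stores the zeroed v2 back to the source to
// clear the coefficients that have just been read.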
.macro load i, src, inc
        ld1             {v\i\().8h}, [\src], \inc
.endm
.macro store i, dst, inc
        st1             {v\i\().8h}, [\dst], \inc
.endm
.macro movi_v i, size, imm
        movi            v\i\()\size, \imm
.endm
.macro load_clear i, src, inc
        ld1             {v\i\().8h}, [\src]
        st1             {v2.8h}, [\src], \inc
.endm

.macro load_add_store coef0, coef1, coef2, coef3, coef4, coef5, coef6, coef7, tmp1, tmp2
        srshr           \coef0, \coef0, #6
        ld1             {v2.8b}, [x0], x1
        srshr           \coef1, \coef1, #6
        ld1             {v3.8b}, [x3], x1
        srshr           \coef2, \coef2, #6
        ld1             {v4.8b}, [x0], x1
        srshr           \coef3, \coef3, #6
        uaddw           \coef0, \coef0, v2.8b
        ld1             {v5.8b}, [x3], x1
        uaddw           \coef1, \coef1, v3.8b
        srshr           \coef4, \coef4, #6
        ld1             {v6.8b}, [x0], x1
        srshr           \coef5, \coef5, #6
        ld1             {v7.8b}, [x3], x1
        sqxtun          v2.8b, \coef0
        srshr           \coef6, \coef6, #6
        sqxtun          v3.8b, \coef1
        srshr           \coef7, \coef7, #6
        uaddw           \coef2, \coef2, v4.8b
        ld1             {\tmp1}, [x0], x1
        uaddw           \coef3, \coef3, v5.8b
        ld1             {\tmp2}, [x3], x1
        sqxtun          v4.8b, \coef2
        sub             x0, x0, x1, lsl #2
        sub             x3, x3, x1, lsl #2
        sqxtun          v5.8b, \coef3
        uaddw           \coef4, \coef4, v6.8b
        st1             {v2.8b}, [x0], x1
        uaddw           \coef5, \coef5, v7.8b
        st1             {v3.8b}, [x3], x1
        sqxtun          v6.8b, \coef4
        st1             {v4.8b}, [x0], x1
        sqxtun          v7.8b, \coef5
        st1             {v5.8b}, [x3], x1
        uaddw           \coef6, \coef6, \tmp1
        st1             {v6.8b}, [x0], x1
        uaddw           \coef7, \coef7, \tmp2
        st1             {v7.8b}, [x3], x1
        sqxtun          \tmp1, \coef6
        sqxtun          \tmp2, \coef7
        st1             {\tmp1}, [x0], x1
        st1             {\tmp2}, [x3], x1
.endm

// Read a vertical 8x16 slice out of a 16x16 matrix, do a transform on it,
// transpose into a horizontal 16x8 slice and store.
// x0 = dst (temp buffer)
// x1 = slice offset
// x2 = src
// x9 = input stride
.macro itxfm16_1d_funcs txfm
function \txfm\()16_1d_8x16_pass1_neon
        mov             x14, x30

        movi            v2.8h, #0
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        load_clear      \i, x2, x9
.endr

        bl              \txfm\()16

        // Do two 8x8 transposes. Originally, v16-v31 contain the
        // 16 rows. Afterwards, v16-v23 and v24-v31 contain the two
        // transposed 8x8 blocks.
        transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
        transpose_8x8H  v24, v25, v26, v27, v28, v29, v30, v31, v2, v3

        // Store the transposed 8x8 blocks horizontally.
        cmp             x1, #8
        b.eq            1f
.irp i, 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31
        store           \i, x0, #16
.endr
        br              x14
1:
        // Special case: For the last input column (x1 == 8),
        // which would be stored as the last row in the temp buffer,
        // don't store the first 8x8 block, but keep it in registers
        // for the first slice of the second pass (where it is the
        // last 8x8 block).
.irp i, 24, 25, 26, 27, 28, 29, 30, 31
        add             x0, x0, #16
        store           \i, x0, #16
.endr
        mov             v24.16b, v16.16b
        mov             v25.16b, v17.16b
        mov             v26.16b, v18.16b
        mov             v27.16b, v19.16b
        mov             v28.16b, v20.16b
        mov             v29.16b, v21.16b
        mov             v30.16b, v22.16b
        mov             v31.16b, v23.16b
        br              x14
endfunc

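// The full 16x16 transform is done in two passes over 8x16 slices: the
// pass1 function above transforms columns and writes the transposed result
// to a temp buffer, and the pass2 function below transforms the rows of
// that buffer and adds the result into the destination (a structural note;
// the driver code is in itxfm_func16x16 further down).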
// Read a vertical 8x16 slice out of a 16x16 matrix, do a transform on it,
// load the destination pixels (from a similar 8x16 slice), add and store back.
// x0 = dst
// x1 = dst stride
// x2 = src (temp buffer)
// x3 = slice offset
// x9 = temp buffer stride
function \txfm\()16_1d_8x16_pass2_neon
        mov             x14, x30
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
        load            \i, x2, x9
.endr
        cbz             x3, 1f
.irp i, 24, 25, 26, 27, 28, 29, 30, 31
        load            \i, x2, x9
.endr
1:

        add             x3, x0, x1
        lsl             x1, x1, #1
        bl              \txfm\()16

        load_add_store  v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v16.8b, v17.8b
        load_add_store  v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h, v16.8b, v17.8b

        br              x14
endfunc
.endm

itxfm16_1d_funcs idct
itxfm16_1d_funcs iadst

.macro itxfm_func16x16 txfm1, txfm2
function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1
.ifc \txfm1\()_\txfm2,idct_idct
        cmp             w3, #1
        b.eq            idct16x16_dc_add_neon
.endif
        mov             x15, x30
        // iadst16 requires clobbering v8-v15, but idct16 doesn't need to.
.ifnc \txfm1\()_\txfm2,idct_idct
        stp             d14, d15, [sp, #-0x10]!
        stp             d12, d13, [sp, #-0x10]!
        stp             d10, d11, [sp, #-0x10]!
        stp             d8, d9, [sp, #-0x10]!
.endif

        sub             sp, sp, #512

        mov             x4, x0
        mov             x5, x1
        mov             x6, x2

        movrel          x10, idct_coeffs
.ifnc \txfm1\()_\txfm2,idct_idct
        movrel          x11, iadst16_coeffs
.endif
.ifc \txfm1,idct
        ld1             {v0.8h,v1.8h}, [x10]
.endif
        mov             x9, #32

.ifc \txfm1\()_\txfm2,idct_idct
        cmp             w3, #10
        b.le            idct16x16_quarter_add_neon
        cmp             w3, #38
        b.le            idct16x16_half_add_neon
.endif

.irp i, 0, 8
        add             x0, sp, #(\i*32)
.ifc \txfm1\()_\txfm2,idct_idct
.if \i == 8
        cmp             w3, #38
        b.le            1f
.endif
.endif
        mov             x1, #\i
        add             x2, x6, #(\i*2)
        bl              \txfm1\()16_1d_8x16_pass1_neon
.endr
.ifc \txfm1\()_\txfm2,iadst_idct
        ld1             {v0.8h,v1.8h}, [x10]
.endif

.ifc \txfm1\()_\txfm2,idct_idct
        b               3f
1:
        // Set v24-v31 to zero, for the in-register passthrough of
        // coefficients to pass 2. Since we only do two slices, this can
        // only ever happen for the second slice. So we only need to store
        // zeros to the temp buffer for the second half of the buffer.
        // Move x0 to the second half, and use x9 == 32 as increment.
        add             x0, x0, #16
.irp i, 24, 25, 26, 27, 28, 29, 30, 31
        movi_v          \i, .16b, #0
        st1             {v24.8h}, [x0], x9
.endr
3:
.endif

.irp i, 0, 8
        add             x0, x4, #(\i)
        mov             x1, x5
        add             x2, sp, #(\i*2)
        mov             x3, #\i
        bl              \txfm2\()16_1d_8x16_pass2_neon
.endr

        add             sp, sp, #512
.ifnc \txfm1\()_\txfm2,idct_idct
        ldp             d8, d9, [sp], 0x10
        ldp             d10, d11, [sp], 0x10
        ldp             d12, d13, [sp], 0x10
        ldp             d14, d15, [sp], 0x10
.endif
        br              x15
endfunc
.endm

itxfm_func16x16 idct, idct
itxfm_func16x16 iadst, idct
itxfm_func16x16 idct, iadst
itxfm_func16x16 iadst, iadst

function idct16_1d_8x16_pass1_quarter_neon
        mov             x14, x30
        movi            v2.8h, #0
.irp i, 16, 17, 18, 19
        load_clear      \i, x2, x9
.endr

        bl              idct16_quarter

        // Do two 8x8 transposes. Originally, v16-v31 contain the
        // 16 rows. Afterwards, v16-v23 and v24-v31 contain the two
        // transposed 8x8 blocks.
        transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
        transpose_8x8H  v24, v25, v26, v27, v28, v29, v30, v31, v2, v3

        // Store the transposed 8x8 blocks horizontally.
        // The first 8x8 block is kept in registers for the second pass,
        // store the rest in the temp buffer.
        // Since only a 4x4 part of the input was nonzero, this means that
        // only 4 rows are nonzero after transposing, and the second pass
        // only reads the topmost 4 rows. Therefore only store the topmost
        // 4 rows.
        add             x0, x0, #16
.irp i, 24, 25, 26, 27
        store           \i, x0, x9
.endr
        br              x14
endfunc

function idct16_1d_8x16_pass2_quarter_neon
        mov             x14, x30
        cbz             x3, 1f
.irp i, 16, 17, 18, 19
        load            \i, x2, x9
.endr
1:

        add             x3, x0, x1
        lsl             x1, x1, #1
        bl              idct16_quarter

        load_add_store  v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v16.8b, v17.8b
        load_add_store  v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h, v16.8b, v17.8b

        br              x14
endfunc

function idct16_1d_8x16_pass1_half_neon
        mov             x14, x30
        movi            v2.8h, #0
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
        load_clear      \i, x2, x9
.endr

        bl              idct16_half

        // Do two 8x8 transposes. Originally, v16-v31 contain the
        // 16 rows. Afterwards, v16-v23 and v24-v31 contain the two
        // transposed 8x8 blocks.
        transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
        transpose_8x8H  v24, v25, v26, v27, v28, v29, v30, v31, v2, v3

        // Store the transposed 8x8 blocks horizontally.
        // The first 8x8 block is kept in registers for the second pass,
        // store the rest in the temp buffer.
        add             x0, x0, #16
.irp i, 24, 25, 26, 27, 28, 29, 30, 31
        store           \i, x0, x9
.endr
        br              x14
endfunc

function idct16_1d_8x16_pass2_half_neon
        mov             x14, x30
        cbz             x3, 1f
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
        load            \i, x2, x9
.endr
1:

        add             x3, x0, x1
        lsl             x1, x1, #1
        bl              idct16_half

        load_add_store  v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v16.8b, v17.8b
        load_add_store  v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h, v16.8b, v17.8b

        br              x14
endfunc

.macro idct16_partial size
function idct16x16_\size\()_add_neon
        add             x0, sp, #(0*32)
        add             x2, x6, #(0*2)
        bl              idct16_1d_8x16_pass1_\size\()_neon
.irp i, 0, 8
        add             x0, x4, #(\i)
        mov             x1, x5
        add             x2, sp, #(\i*2)
        mov             x3, #\i
        bl              idct16_1d_8x16_pass2_\size\()_neon
.endr

        add             sp, sp, #512
        br              x15
endfunc
.endm

idct16_partial quarter
idct16_partial half

function idct32x32_dc_add_neon
        movrel          x4, idct_coeffs
        ld1             {v0.4h}, [x4]

        movi            v1.4h, #0

        ld1             {v2.h}[0], [x2]
        smull           v2.4s, v2.4h, v0.h[0]
        rshrn           v2.4h, v2.4s, #14
        smull           v2.4s, v2.4h, v0.h[0]
        rshrn           v2.4h, v2.4s, #14
        dup             v2.8h, v2.h[0]
        st1             {v1.h}[0], [x2]

        srshr           v0.8h, v2.8h, #6

        mov             x3, x0
        mov             x4, #32
1:
        // Loop to add the constant v0 into all 32x32 outputs
        subs            x4, x4, #2
        ld1             {v1.16b,v2.16b}, [x0], x1
        uaddw           v16.8h, v0.8h, v1.8b
        uaddw2          v17.8h, v0.8h, v1.16b
        ld1             {v3.16b,v4.16b}, [x0], x1
        uaddw           v18.8h, v0.8h, v2.8b
        uaddw2          v19.8h, v0.8h, v2.16b
        uaddw           v20.8h, v0.8h, v3.8b
        uaddw2          v21.8h, v0.8h, v3.16b
        uaddw           v22.8h, v0.8h, v4.8b
        uaddw2          v23.8h, v0.8h, v4.16b
        sqxtun          v1.8b, v16.8h
        sqxtun2         v1.16b, v17.8h
        sqxtun          v2.8b, v18.8h
        sqxtun2         v2.16b, v19.8h
        sqxtun          v3.8b, v20.8h
        sqxtun2         v3.16b, v21.8h
        st1             {v1.16b,v2.16b}, [x3], x1
        sqxtun          v4.8b, v22.8h
        sqxtun2         v4.16b, v23.8h
        st1             {v3.16b,v4.16b}, [x3], x1
        b.ne            1b

        ret
endfunc
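
// As a rough scalar sketch, the DC-only paths (idct16x16_dc_add_neon and
// idct32x32_dc_add_neon above) amount to:
//   dc = (block[0] * 11585 + (1 << 13)) >> 14
//   dc = (dc * 11585 + (1 << 13)) >> 14
//   dst[i] = av_clip_uint8(dst[i] + ((dc + 32) >> 6))   for every output pixel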

.macro idct32_end
        butterfly_8h    v16, v5, v4, v5 // v16 = t16a, v5 = t19a
        butterfly_8h    v17, v20, v23, v20 // v17 = t17, v20 = t18
        butterfly_8h    v18, v6, v7, v6 // v18 = t23a, v6 = t20a
        butterfly_8h    v19, v21, v22, v21 // v19 = t22, v21 = t21
        butterfly_8h    v4, v28, v28, v30 // v4 = t24a, v28 = t27a
        butterfly_8h    v23, v26, v25, v26 // v23 = t25, v26 = t26
        butterfly_8h    v7, v3, v29, v31 // v7 = t31a, v3 = t28a
        butterfly_8h    v22, v27, v24, v27 // v22 = t30, v27 = t29

        dmbutterfly     v27, v20, v0.h[2], v0.h[3], v24, v25, v30, v31 // v27 = t18a, v20 = t29a
        dmbutterfly     v3, v5, v0.h[2], v0.h[3], v24, v25, v30, v31 // v3 = t19, v5 = t28
        dmbutterfly     v28, v6, v0.h[2], v0.h[3], v24, v25, v30, v31, neg=1 // v28 = t27, v6 = t20
        dmbutterfly     v26, v21, v0.h[2], v0.h[3], v24, v25, v30, v31, neg=1 // v26 = t26a, v21 = t21a

        butterfly_8h    v31, v24, v7, v4 // v31 = t31, v24 = t24
        butterfly_8h    v30, v25, v22, v23 // v30 = t30a, v25 = t25a
        butterfly_8h_r  v23, v16, v16, v18 // v23 = t23, v16 = t16
        butterfly_8h_r  v22, v17, v17, v19 // v22 = t22a, v17 = t17a
        butterfly_8h    v18, v21, v27, v21 // v18 = t18, v21 = t21
        butterfly_8h_r  v27, v28, v5, v28 // v27 = t27a, v28 = t28a
        butterfly_8h    v29, v26, v20, v26 // v29 = t29, v26 = t26
        butterfly_8h    v19, v20, v3, v6 // v19 = t19a, v20 = t20

        dmbutterfly0    v27, v20, v27, v20, v2, v3, v4, v5, v6, v7 // v27 = t27, v20 = t20
        dmbutterfly0    v26, v21, v26, v21, v2, v3, v4, v5, v6, v7 // v26 = t26a, v21 = t21a
        dmbutterfly0    v25, v22, v25, v22, v2, v3, v4, v5, v6, v7 // v25 = t25, v22 = t22
        dmbutterfly0    v24, v23, v24, v23, v2, v3, v4, v5, v6, v7 // v24 = t24a, v23 = t23a
        ret
.endm

function idct32_odd
        dmbutterfly     v16, v31, v8.h[0], v8.h[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a
        dmbutterfly     v24, v23, v8.h[2], v8.h[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a
        dmbutterfly     v20, v27, v8.h[4], v8.h[5], v4, v5, v6, v7 // v20 = t18a, v27 = t29a
        dmbutterfly     v28, v19, v8.h[6], v8.h[7], v4, v5, v6, v7 // v28 = t19a, v19 = t28a
        dmbutterfly     v18, v29, v9.h[0], v9.h[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a
        dmbutterfly     v26, v21, v9.h[2], v9.h[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a
        dmbutterfly     v22, v25, v9.h[4], v9.h[5], v4, v5, v6, v7 // v22 = t22a, v25 = t25a
        dmbutterfly     v30, v17, v9.h[6], v9.h[7], v4, v5, v6, v7 // v30 = t23a, v17 = t24a

        butterfly_8h    v4, v24, v16, v24 // v4 = t16, v24 = t17
        butterfly_8h    v5, v20, v28, v20 // v5 = t19, v20 = t18
        butterfly_8h    v6, v26, v18, v26 // v6 = t20, v26 = t21
        butterfly_8h    v7, v22, v30, v22 // v7 = t23, v22 = t22
        butterfly_8h    v28, v25, v17, v25 // v28 = t24, v25 = t25
        butterfly_8h    v30, v21, v29, v21 // v30 = t27, v21 = t26
        butterfly_8h    v29, v23, v31, v23 // v29 = t31, v23 = t30
        butterfly_8h    v31, v27, v19, v27 // v31 = t28, v27 = t29

        dmbutterfly     v23, v24, v0.h[4], v0.h[5], v16, v17, v18, v19 // v23 = t17a, v24 = t30a
        dmbutterfly     v27, v20, v0.h[4], v0.h[5], v16, v17, v18, v19, neg=1 // v27 = t29a, v20 = t18a
        dmbutterfly     v21, v26, v0.h[6], v0.h[7], v16, v17, v18, v19 // v21 = t21a, v26 = t26a
        dmbutterfly     v25, v22, v0.h[6], v0.h[7], v16, v17, v18, v19, neg=1 // v25 = t25a, v22 = t22a
        idct32_end
endfunc

function idct32_odd_half
        dmbutterfly_h1  v16, v31, v8.h[0], v8.h[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a
        dmbutterfly_h2  v24, v23, v8.h[2], v8.h[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a
        dmbutterfly_h1  v20, v27, v8.h[4], v8.h[5], v4, v5, v6, v7 // v20 = t18a, v27 = t29a
        dmbutterfly_h2  v28, v19, v8.h[6], v8.h[7], v4, v5, v6, v7 // v28 = t19a, v19 = t28a
        dmbutterfly_h1  v18, v29, v9.h[0], v9.h[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a
        dmbutterfly_h2  v26, v21, v9.h[2], v9.h[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a
        dmbutterfly_h1  v22, v25, v9.h[4], v9.h[5], v4, v5, v6, v7 // v22 = t22a, v25 = t25a
        dmbutterfly_h2  v30, v17, v9.h[6], v9.h[7], v4, v5, v6, v7 // v30 = t23a, v17 = t24a

        butterfly_8h    v4, v24, v16, v24 // v4 = t16, v24 = t17
        butterfly_8h    v5, v20, v28, v20 // v5 = t19, v20 = t18
        butterfly_8h    v6, v26, v18, v26 // v6 = t20, v26 = t21
        butterfly_8h    v7, v22, v30, v22 // v7 = t23, v22 = t22
        butterfly_8h    v28, v25, v17, v25 // v28 = t24, v25 = t25
        butterfly_8h    v30, v21, v29, v21 // v30 = t27, v21 = t26
        butterfly_8h    v29, v23, v31, v23 // v29 = t31, v23 = t30
        butterfly_8h    v31, v27, v19, v27 // v31 = t28, v27 = t29

        dmbutterfly     v23, v24, v0.h[4], v0.h[5], v16, v17, v18, v19 // v23 = t17a, v24 = t30a
        dmbutterfly     v27, v20, v0.h[4], v0.h[5], v16, v17, v18, v19, neg=1 // v27 = t29a, v20 = t18a
        dmbutterfly     v21, v26, v0.h[6], v0.h[7], v16, v17, v18, v19 // v21 = t21a, v26 = t26a
        dmbutterfly     v25, v22, v0.h[6], v0.h[7], v16, v17, v18, v19, neg=1 // v25 = t25a, v22 = t22a
        idct32_end
endfunc

function idct32_odd_quarter
        dsmull_h        v4, v5, v16, v8.h[0]
        dsmull_h        v28, v29, v19, v8.h[7]
        dsmull_h        v30, v31, v16, v8.h[1]
        dsmull_h        v22, v23, v17, v9.h[6]
        dsmull_h        v7, v6, v17, v9.h[7]
        dsmull_h        v26, v27, v19, v8.h[6]
        dsmull_h        v20, v21, v18, v9.h[0]
        dsmull_h        v24, v25, v18, v9.h[1]

        neg             v28.4s, v28.4s
        neg             v29.4s, v29.4s
        neg             v7.4s, v7.4s
        neg             v6.4s, v6.4s

        drshrn_h        v4, v4, v5, #14
        drshrn_h        v5, v28, v29, #14
        drshrn_h        v29, v30, v31, #14
        drshrn_h        v28, v22, v23, #14
        drshrn_h        v7, v7, v6, #14
        drshrn_h        v31, v26, v27, #14
        drshrn_h        v6, v20, v21, #14
        drshrn_h        v30, v24, v25, #14

        dmbutterfly_l   v16, v17, v18, v19, v29, v4, v0.h[4], v0.h[5]
        dmbutterfly_l   v27, v26, v20, v21, v31, v5, v0.h[4], v0.h[5]
        drshrn_h        v23, v16, v17, #14
        drshrn_h        v24, v18, v19, #14
        neg             v20.4s, v20.4s
        neg             v21.4s, v21.4s
        drshrn_h        v27, v27, v26, #14
        drshrn_h        v20, v20, v21, #14
        dmbutterfly_l   v16, v17, v18, v19, v30, v6, v0.h[6], v0.h[7]
        drshrn_h        v21, v16, v17, #14
        drshrn_h        v26, v18, v19, #14
        dmbutterfly_l   v16, v17, v18, v19, v28, v7, v0.h[6], v0.h[7]
        drshrn_h        v25, v16, v17, #14
        neg             v18.4s, v18.4s
        neg             v19.4s, v19.4s
        drshrn_h        v22, v18, v19, #14

        idct32_end
endfunc

.macro idct32_funcs suffix
// Do a 32-point IDCT of an 8x32 slice out of a 32x32 matrix.
// The 32-point IDCT can be decomposed into two 16-point IDCTs;
// a normal IDCT16 with every other input component (the even ones, with
// each output written twice), followed by a separate 16-point IDCT
// of the odd inputs, added/subtracted onto the outputs of the first idct16.
// x0 = dst (temp buffer)
// x1 = unused
// x2 = src
// x9 = double input stride
function idct32_1d_8x32_pass1\suffix\()_neon
        mov             x14, x30
        movi            v2.8h, #0

        // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
.ifb \suffix
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        load_clear      \i, x2, x9
.endr
.endif
.ifc \suffix,_quarter
.irp i, 16, 17, 18, 19
        load_clear      \i, x2, x9
.endr
.endif
.ifc \suffix,_half
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
        load_clear      \i, x2, x9
.endr
.endif

        bl              idct16\suffix

        // Do two 8x8 transposes. Originally, v16-v31 contain the
        // 16 rows. Afterwards, v16-v23 and v24-v31 contain the
        // two transposed 8x8 blocks.
        transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
        transpose_8x8H  v24, v25, v26, v27, v28, v29, v30, v31, v2, v3

        // Store the registers a, b horizontally, followed by the
        // same registers b, a mirrored.
.macro store_rev a, b
        // There's no rev128 instruction, but we reverse each 64 bit
        // half, and then flip them using an ext with 8 bytes offset.
        rev64           v3.8h, \b
        st1             {\a}, [x0], #16
        rev64           v2.8h, \a
        ext             v3.16b, v3.16b, v3.16b, #8
        st1             {\b}, [x0], #16
        ext             v2.16b, v2.16b, v2.16b, #8
        st1             {v3.8h}, [x0], #16
        st1             {v2.8h}, [x0], #16
.endm
        store_rev       v16.8h, v24.8h
        store_rev       v17.8h, v25.8h
        store_rev       v18.8h, v26.8h
        store_rev       v19.8h, v27.8h
        store_rev       v20.8h, v28.8h
        store_rev       v21.8h, v29.8h
        store_rev       v22.8h, v30.8h
        store_rev       v23.8h, v31.8h
        sub             x0, x0, #512
.purgem store_rev

        // Move x2 back to the start of the input, and move
        // to the first odd row
.ifb \suffix
        sub             x2, x2, x9, lsl #4
.endif
.ifc \suffix,_quarter
        sub             x2, x2, x9, lsl #2
.endif
.ifc \suffix,_half
        sub             x2, x2, x9, lsl #3
.endif
        add             x2, x2, #64

        movi            v2.8h, #0
        // v16 = IN(1), v17 = IN(3) ... v31 = IN(31)
.ifb \suffix
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        load_clear      \i, x2, x9
.endr
.endif
.ifc \suffix,_quarter
.irp i, 16, 17, 18, 19
        load_clear      \i, x2, x9
.endr
.endif
.ifc \suffix,_half
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
        load_clear      \i, x2, x9
.endr
.endif

        bl              idct32_odd\suffix

        transpose_8x8H  v31, v30, v29, v28, v27, v26, v25, v24, v2, v3
        transpose_8x8H  v23, v22, v21, v20, v19, v18, v17, v16, v2, v3

        // Store the registers a, b horizontally,
        // adding into the output first, and the mirrored,
        // subtracted from the output.
.macro store_rev a, b
        ld1             {v4.8h}, [x0]
        rev64           v3.8h, \b
        add             v4.8h, v4.8h, \a
        rev64           v2.8h, \a
        st1             {v4.8h}, [x0], #16
        ext             v3.16b, v3.16b, v3.16b, #8
        ld1             {v5.8h}, [x0]
        ext             v2.16b, v2.16b, v2.16b, #8
        add             v5.8h, v5.8h, \b
        st1             {v5.8h}, [x0], #16
        ld1             {v6.8h}, [x0]
        sub             v6.8h, v6.8h, v3.8h
        st1             {v6.8h}, [x0], #16
        ld1             {v7.8h}, [x0]
        sub             v7.8h, v7.8h, v2.8h
        st1             {v7.8h}, [x0], #16
.endm

        store_rev       v31.8h, v23.8h
        store_rev       v30.8h, v22.8h
        store_rev       v29.8h, v21.8h
        store_rev       v28.8h, v20.8h
        store_rev       v27.8h, v19.8h
        store_rev       v26.8h, v18.8h
        store_rev       v25.8h, v17.8h
        store_rev       v24.8h, v16.8h
.purgem store_rev
        br              x14
endfunc

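// In scalar terms, the even/odd recombination used by both passes is roughly
//   out[i]      = idct16(even)[i] + idct32_odd(odd)[i]
//   out[31 - i] = idct16(even)[i] - idct32_odd(odd)[i]      for i = 0..15,
// which is what store_rev does in pass 1 above and load_acc_store does in
// pass 2 below.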
// This is mostly the same as 8x32_pass1, but without the transpose; it uses
// the source as a temp buffer between the two idct passes, and adds into
// the destination.
// x0 = dst
// x1 = dst stride
// x2 = src (temp buffer)
// x7 = negative double temp buffer stride
// x9 = double temp buffer stride
function idct32_1d_8x32_pass2\suffix\()_neon
        mov             x14, x30
        // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
.ifb \suffix
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        load            \i, x2, x9
.endr
        sub             x2, x2, x9, lsl #4
.endif
.ifc \suffix,_quarter
.irp i, 16, 17, 18, 19
        load            \i, x2, x9
.endr
        sub             x2, x2, x9, lsl #2
.endif
.ifc \suffix,_half
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
        load            \i, x2, x9
.endr
        sub             x2, x2, x9, lsl #3
.endif

        bl              idct16\suffix

.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        store           \i, x2, x9
.endr

        sub             x2, x2, x9, lsl #4
        add             x2, x2, #64

        // v16 = IN(1), v17 = IN(3) ... v31 = IN(31)
.ifb \suffix
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        load            \i, x2, x9
.endr
        sub             x2, x2, x9, lsl #4
.endif
.ifc \suffix,_quarter
.irp i, 16, 17, 18, 19
        load            \i, x2, x9
.endr
        sub             x2, x2, x9, lsl #2
.endif
.ifc \suffix,_half
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
        load            \i, x2, x9
.endr
        sub             x2, x2, x9, lsl #3
.endif
        sub             x2, x2, #64

        bl              idct32_odd\suffix

.macro load_acc_store a, b, c, d, neg=0
.if \neg == 0
        ld1             {v4.8h}, [x2], x9
        ld1             {v5.8h}, [x2], x9
        add             v4.8h, v4.8h, \a
        ld1             {v6.8h}, [x2], x9
        add             v5.8h, v5.8h, \b
        ld1             {v7.8h}, [x2], x9
        add             v6.8h, v6.8h, \c
        add             v7.8h, v7.8h, \d
.else
        ld1             {v4.8h}, [x2], x7
        ld1             {v5.8h}, [x2], x7
        sub             v4.8h, v4.8h, \a
        ld1             {v6.8h}, [x2], x7
        sub             v5.8h, v5.8h, \b
        ld1             {v7.8h}, [x2], x7
        sub             v6.8h, v6.8h, \c
        sub             v7.8h, v7.8h, \d
.endif
        ld1             {v10.8b}, [x0], x1
        ld1             {v11.8b}, [x0], x1
        srshr           v4.8h, v4.8h, #6
        ld1             {v2.8b}, [x0], x1
        srshr           v5.8h, v5.8h, #6
        uaddw           v4.8h, v4.8h, v10.8b
        ld1             {v3.8b}, [x0], x1
        srshr           v6.8h, v6.8h, #6
        uaddw           v5.8h, v5.8h, v11.8b
        srshr           v7.8h, v7.8h, #6
        sub             x0, x0, x1, lsl #2
        uaddw           v6.8h, v6.8h, v2.8b
        sqxtun          v4.8b, v4.8h
        uaddw           v7.8h, v7.8h, v3.8b
        sqxtun          v5.8b, v5.8h
        st1             {v4.8b}, [x0], x1
        sqxtun          v6.8b, v6.8h
        st1             {v5.8b}, [x0], x1
        sqxtun          v7.8b, v7.8h
        st1             {v6.8b}, [x0], x1
        st1             {v7.8b}, [x0], x1
.endm
        load_acc_store  v31.8h, v30.8h, v29.8h, v28.8h
        load_acc_store  v27.8h, v26.8h, v25.8h, v24.8h
        load_acc_store  v23.8h, v22.8h, v21.8h, v20.8h
        load_acc_store  v19.8h, v18.8h, v17.8h, v16.8h
        sub             x2, x2, x9
        load_acc_store  v16.8h, v17.8h, v18.8h, v19.8h, 1
        load_acc_store  v20.8h, v21.8h, v22.8h, v23.8h, 1
        load_acc_store  v24.8h, v25.8h, v26.8h, v27.8h, 1
        load_acc_store  v28.8h, v29.8h, v30.8h, v31.8h, 1
.purgem load_acc_store
        br              x14
endfunc
.endm

idct32_funcs
idct32_funcs _quarter
idct32_funcs _half

const min_eob_idct_idct_32, align=4
        .short          0, 34, 135, 336
endconst

function ff_vp9_idct_idct_32x32_add_neon, export=1
        cmp             w3, #1
        b.eq            idct32x32_dc_add_neon

        movrel          x10, idct_coeffs

        mov             x15, x30

        stp             d10, d11, [sp, #-0x10]!
        stp             d8, d9, [sp, #-0x10]!
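        // (v8-v11 are clobbered below: v8/v9 hold the second half of the
        // idct coefficients for idct32_odd and v10/v11 are used as scratch
        // in load_acc_store, hence the d8-d11 saves above.)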

        sub             sp, sp, #2048

        mov             x4, x0
        mov             x5, x1
        mov             x6, x2

        // Double stride of the input, since we only read every other line
        mov             x9, #128
        neg             x7, x9

        ld1             {v0.8h,v1.8h}, [x10], #32
        ld1             {v8.8h,v9.8h}, [x10]

        cmp             w3, #34
        b.le            idct32x32_quarter_add_neon
        cmp             w3, #135
        b.le            idct32x32_half_add_neon

        movrel          x12, min_eob_idct_idct_32, 2

.irp i, 0, 8, 16, 24
        add             x0, sp, #(\i*64)
.if \i > 0
        ldrh            w1, [x12], #2
        cmp             w3, w1
        mov             x1, #(32 - \i)/4
        b.le            1f
.endif
        add             x2, x6, #(\i*2)
        bl              idct32_1d_8x32_pass1_neon
.endr
        b               3f

1:
        // Write zeros to the temp buffer for pass 2
        movi            v16.8h, #0
        movi            v17.8h, #0
        movi            v18.8h, #0
        movi            v19.8h, #0
2:
        subs            x1, x1, #1
.rept 4
        st1             {v16.8h,v17.8h,v18.8h,v19.8h}, [x0], #64
.endr
        b.ne            2b
3:
.irp i, 0, 8, 16, 24
        add             x0, x4, #(\i)
        mov             x1, x5
        add             x2, sp, #(\i*2)
        bl              idct32_1d_8x32_pass2_neon
.endr

        add             sp, sp, #2048

        ldp             d8, d9, [sp], 0x10
        ldp             d10, d11, [sp], 0x10

        br              x15
endfunc

.macro idct32_partial size
function idct32x32_\size\()_add_neon
        add             x0, sp, #(0*64)
        add             x2, x6, #(0*2)
        bl              idct32_1d_8x32_pass1_\size\()_neon
.ifc \size,half
        add             x0, sp, #(8*64)
        add             x2, x6, #(8*2)
        bl              idct32_1d_8x32_pass1_\size\()_neon
.endif
.irp i, 0, 8, 16, 24
        add             x0, x4, #(\i)
        mov             x1, x5
        add             x2, sp, #(\i*2)
        bl              idct32_1d_8x32_pass2_\size\()_neon
.endr

        add             sp, sp, #2048

        ldp             d8, d9, [sp], 0x10
        ldp             d10, d11, [sp], 0x10

        br              x15
endfunc
.endm

idct32_partial quarter
idct32_partial half