/*
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

/* chroma_mc8(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int h, int x, int y) */
//
// 8x-wide chroma MC, bilinear interpolation with 1/8-pel weights:
//   dst[i] = (A*src[i] + B*src[i+1] + C*src[i+stride] + D*src[i+stride+1] + bias) >> 6
// where A = (8-x)(8-y), B = x(8-y), C = (8-x)y, D = xy.
// \codec selects the rounding: h264 uses round-to-nearest (rshrn, bias 32),
// rv40 loads a per-(x,y) bias from the rv40bias table, vc1 uses bias 28.
// \type avg additionally averages the result with the existing dst pixels.
//
// Registers: x0 = dst, x1 = src, x2 = stride, w3 = h (rows, stepped by 2),
//            w4 = x, w5 = y; x8 = saved dst for the avg read-back.
.macro  h264_chroma_mc8 type, codec=h264
function ff_\type\()_\codec\()_chroma_mc8_neon, export=1
  .ifc \type,avg
        mov             x8,  x0                 // keep original dst to read back for averaging
  .endif
        prfm            pldl1strm, [x1]
        prfm            pldl1strm, [x1, x2]
  .ifc \codec,rv40
        // rv40 bias = rv40bias[(y>>1)*4 + (x>>1)], a row of 4 shorts per y pair;
        // offset in bytes: ((y>>1)<<3) + ((x>>1)<<1), replicated to all 8 lanes.
        movrel          x6,  rv40bias
        lsr             w9,  w5,  #1
        lsr             w10, w4,  #1
        lsl             w9,  w9,  #3
        lsl             w10, w10, #1
        add             w9,  w9,  w10
        add             x6,  x6,  w9, UXTW
        ld1r            {v22.8H}, [x6]
  .endif
  .ifc \codec,vc1
        movi            v22.8H, #28             // vc1 uses a fixed bias of 28 before >>6
  .endif
        // Compute the four bilinear weights:
        //   w7  = D = x*y
        //   w6  = C = 8y - xy = (8-x)*y
        //   w12 = B = 8x - xy = x*(8-y)
        //   w4  = A = xy - 8x - 8y + 64 = (8-x)*(8-y)
        mul             w7,  w4,  w5
        lsl             w14, w5,  #3
        lsl             w13, w4,  #3
        cmp             w7,  #0                 // D == 0 means x==0 or y==0: 1-D cases at 2:
        sub             w6,  w14, w7
        sub             w12, w13, w7
        sub             w4,  w7,  w13
        sub             w4,  w4,  w14
        add             w4,  w4,  #64
        b.eq            2f

        // Full 2-D bilinear path (x != 0 and y != 0), two rows per iteration.
        dup             v0.8B,  w4              // A
        dup             v1.8B,  w12             // B
        ld1             {v4.8B, v5.8B}, [x1], x2
        dup             v2.8B,  w6              // C
        dup             v3.8B,  w7              // D
        ext             v5.8B,  v4.8B,  v5.8B,  #1   // v5 = row shifted left by 1 pixel
1:      ld1             {v6.8B, v7.8B}, [x1], x2
        umull           v16.8H, v4.8B,  v0.8B
        umlal           v16.8H, v5.8B,  v1.8B
        ext             v7.8B,  v6.8B,  v7.8B,  #1
        ld1             {v4.8B, v5.8B}, [x1], x2
        umlal           v16.8H, v6.8B,  v2.8B
        prfm            pldl1strm, [x1]
        ext             v5.8B,  v4.8B,  v5.8B,  #1
        umlal           v16.8H, v7.8B,  v3.8B        // v16 = output row 0 (16-bit sums)
        umull           v17.8H, v6.8B,  v0.8B        // row 1 reuses the middle row v6/v7
        subs            w3,  w3,  #2
        umlal           v17.8H, v7.8B,  v1.8B
        umlal           v17.8H, v4.8B,  v2.8B
        umlal           v17.8H, v5.8B,  v3.8B
        prfm            pldl1strm, [x1, x2]
  .ifc \codec,h264
        rshrn           v16.8B, v16.8H, #6           // h264: round-to-nearest narrowing
        rshrn           v17.8B, v17.8H, #6
  .else
        add             v16.8H, v16.8H, v22.8H       // rv40/vc1: explicit bias, then truncating shift
        add             v17.8H, v17.8H, v22.8H
        shrn            v16.8B, v16.8H, #6
        shrn            v17.8B, v17.8H, #6
  .endif
  .ifc \type,avg
        ld1             {v20.8B}, [x8], x2
        ld1             {v21.8B}, [x8], x2
        urhadd          v16.8B, v16.8B, v20.8B       // avg = (new + old + 1) >> 1
        urhadd          v17.8B, v17.8B, v21.8B
  .endif
        st1             {v16.8B}, [x0], x2
        st1             {v17.8B}, [x0], x2
        b.gt            1b
        ret

        // 1-D dispatch: w12 += w6 is B + C = 8x + 8y here (D == 0).
        // Zero means x == y == 0 -> plain copy at 5:.  Otherwise w6 (= C)
        // zero means y == 0 -> horizontal-only at 4:, else vertical-only at 3:.
2:      adds            w12, w12, w6
        dup             v0.8B,  w4
        b.eq            5f
        tst             w6,  w6
        dup             v1.8B,  w12
        b.eq            4f

        // Vertical-only (x == 0): blend each row with the row below.
        ld1             {v4.8B}, [x1], x2
3:      ld1             {v6.8B}, [x1], x2
        umull           v16.8H, v4.8B,  v0.8B
        umlal           v16.8H, v6.8B,  v1.8B
        ld1             {v4.8B}, [x1], x2
        umull           v17.8H, v6.8B,  v0.8B
        umlal           v17.8H, v4.8B,  v1.8B
        prfm            pldl1strm, [x1]
  .ifc \codec,h264
        rshrn           v16.8B, v16.8H, #6
        rshrn           v17.8B, v17.8H, #6
  .else
        add             v16.8H, v16.8H, v22.8H
        add             v17.8H, v17.8H, v22.8H
        shrn            v16.8B, v16.8H, #6
        shrn            v17.8B, v17.8H, #6
  .endif
        prfm            pldl1strm, [x1, x2]
  .ifc \type,avg
        ld1             {v20.8B}, [x8], x2
        ld1             {v21.8B}, [x8], x2
        urhadd          v16.8B, v16.8B, v20.8B
        urhadd          v17.8B, v17.8B, v21.8B
  .endif
        subs            w3,  w3,  #2
        st1             {v16.8B}, [x0], x2
        st1             {v17.8B}, [x0], x2
        b.gt            3b
        ret

        // Horizontal-only (y == 0): blend each pixel with its right neighbour.
4:      ld1             {v4.8B, v5.8B}, [x1], x2
        ld1             {v6.8B, v7.8B}, [x1], x2
        ext             v5.8B,  v4.8B,  v5.8B,  #1
        ext             v7.8B,  v6.8B,  v7.8B,  #1
        prfm            pldl1strm, [x1]
        subs            w3,  w3,  #2
        umull           v16.8H, v4.8B,  v0.8B
        umlal           v16.8H, v5.8B,  v1.8B
        umull           v17.8H, v6.8B,  v0.8B
        umlal           v17.8H, v7.8B,  v1.8B
        prfm            pldl1strm, [x1, x2]
  .ifc \codec,h264
        rshrn           v16.8B, v16.8H, #6
        rshrn           v17.8B, v17.8H, #6
  .else
        add             v16.8H, v16.8H, v22.8H
        add             v17.8H, v17.8H, v22.8H
        shrn            v16.8B, v16.8H, #6
        shrn            v17.8B, v17.8H, #6
  .endif
  .ifc \type,avg
        ld1             {v20.8B}, [x8], x2
        ld1             {v21.8B}, [x8], x2
        urhadd          v16.8B, v16.8B, v20.8B
        urhadd          v17.8B, v17.8B, v21.8B
  .endif
        st1             {v16.8B}, [x0], x2
        st1             {v17.8B}, [x0], x2
        b.gt            4b
        ret

        // No interpolation (x == y == 0): weight A == 64, i.e. a (biased) copy.
5:      ld1             {v4.8B}, [x1], x2
        ld1             {v5.8B}, [x1], x2
        prfm            pldl1strm, [x1]
        subs            w3,  w3,  #2
        umull           v16.8H, v4.8B,  v0.8B
        umull           v17.8H, v5.8B,  v0.8B
        prfm            pldl1strm, [x1, x2]
  .ifc \codec,h264
        rshrn           v16.8B, v16.8H, #6
        rshrn           v17.8B, v17.8H, #6
  .else
        add             v16.8H, v16.8H, v22.8H
        add             v17.8H, v17.8H, v22.8H
        shrn            v16.8B, v16.8H, #6
        shrn            v17.8B, v17.8H, #6
  .endif
  .ifc \type,avg
        ld1             {v20.8B}, [x8], x2
        ld1             {v21.8B}, [x8], x2
        urhadd          v16.8B, v16.8B, v20.8B
        urhadd          v17.8B, v17.8B, v21.8B
  .endif
        st1             {v16.8B}, [x0], x2
        st1             {v17.8B}, [x0], x2
        b.gt            5b
        ret
endfunc
.endm

/* chroma_mc4(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int h, int x, int y) */
//
// 4x-wide variant of the same bilinear filter.  Two 4-pixel rows are packed
// into one 8-byte vector (trn1 .2S) so the weight pairs A|B and C|D can be
// applied with a single umull/umlal each; the two 2D halves of the 16-bit
// accumulator are then folded together (trn1/trn2 .2D + add).
// Same register contract and codec/type variants as h264_chroma_mc8.
.macro  h264_chroma_mc4 type, codec=h264
function ff_\type\()_\codec\()_chroma_mc4_neon, export=1
  .ifc \type,avg
        mov             x8,  x0                 // keep original dst for the avg read-back
  .endif
        prfm            pldl1strm, [x1]
        prfm            pldl1strm, [x1, x2]
  .ifc \codec,rv40
        // Same rv40bias table lookup as the mc8 variant.
        movrel          x6,  rv40bias
        lsr             w9,  w5,  #1
        lsr             w10, w4,  #1
        lsl             w9,  w9,  #3
        lsl             w10, w10, #1
        add             w9,  w9,  w10
        add             x6,  x6,  w9, UXTW
        ld1r            {v22.8H}, [x6]
  .endif
  .ifc \codec,vc1
        movi            v22.8H, #28
  .endif
        // Weights as in mc8: w4 = A, w12 = B, w6 = C, w7 = D.
        mul             w7,  w4,  w5
        lsl             w14, w5,  #3
        lsl             w13, w4,  #3
        cmp             w7,  #0
        sub             w6,  w14, w7
        sub             w12, w13, w7
        sub             w4,  w7,  w13
        sub             w4,  w4,  w14
        add             w4,  w4,  #64
        b.eq            2f

        // Full 2-D path: v0 = A|B pair, v2 = C|D pair, each 4 bytes wide.
        dup             v24.8B, w4
        dup             v25.8B, w12
        ld1             {v4.8B}, [x1], x2
        dup             v26.8B, w6
        dup             v27.8B, w7
        ext             v5.8B,  v4.8B,  v5.8B,  #1
        trn1            v0.2S,  v24.2S, v25.2S
        trn1            v2.2S,  v26.2S, v27.2S
        trn1            v4.2S,  v4.2S,  v5.2S        // pack row | row-shifted-by-1
1:      ld1             {v6.8B}, [x1], x2
        ext             v7.8B,  v6.8B,  v7.8B,  #1
        trn1            v6.2S,  v6.2S,  v7.2S
        umull           v18.8H, v4.8B,  v0.8B        // A,B terms for output row 0
        umlal           v18.8H, v6.8B,  v2.8B        // C,D terms for output row 0
        ld1             {v4.8B}, [x1], x2
        ext             v5.8B,  v4.8B,  v5.8B,  #1
        trn1            v4.2S,  v4.2S,  v5.2S
        prfm            pldl1strm, [x1]
        umull           v19.8H, v6.8B,  v0.8B        // row 1 reuses middle row v6
        umlal           v19.8H, v4.8B,  v2.8B
        trn1            v30.2D, v18.2D, v19.2D       // fold the two 4-lane halves together
        trn2            v31.2D, v18.2D, v19.2D
        add             v18.8H, v30.8H, v31.8H
  .ifc \codec,h264
        rshrn           v16.8B, v18.8H, #6
  .else
        add             v18.8H, v18.8H, v22.8H
        shrn            v16.8B, v18.8H, #6
  .endif
        subs            w3,  w3,  #2
        prfm            pldl1strm, [x1, x2]
  .ifc \type,avg
        ld1             {v20.S}[0], [x8], x2
        ld1             {v20.S}[1], [x8], x2
        urhadd          v16.8B, v16.8B, v20.8B
  .endif
        st1             {v16.S}[0], [x0], x2
        st1             {v16.S}[1], [x0], x2
        b.gt            1b
        ret

        // 1-D dispatch, same logic as mc8: copy at 5:, horizontal at 4:,
        // vertical at 3:.
2:      adds            w12, w12, w6
        dup             v30.8B, w4
        b.eq            5f
        tst             w6,  w6
        dup             v31.8B, w12
        trn1            v0.2S,  v30.2S, v31.2S
        trn2            v1.2S,  v30.2S, v31.2S
        b.eq            4f

        // Vertical-only: v1 = B|A so one vector holds the weights for two
        // rows packed in v4 (row n in lane [0], row n+1 in lane [1]).
        ext             v1.8B,  v0.8B,  v1.8B,  #4
        ld1             {v4.S}[0], [x1], x2
3:      ld1             {v4.S}[1], [x1], x2
        umull           v18.8H, v4.8B,  v0.8B
        ld1             {v4.S}[0], [x1], x2
        umull           v19.8H, v4.8B,  v1.8B
        trn1            v30.2D, v18.2D, v19.2D
        trn2            v31.2D, v18.2D, v19.2D
        add             v18.8H, v30.8H, v31.8H
        prfm            pldl1strm, [x1]
  .ifc \codec,h264
        rshrn           v16.8B, v18.8H, #6
  .else
        add             v18.8H, v18.8H, v22.8H
        shrn            v16.8B, v18.8H, #6
  .endif
  .ifc \type,avg
        ld1             {v20.S}[0], [x8], x2
        ld1             {v20.S}[1], [x8], x2
        urhadd          v16.8B, v16.8B, v20.8B
  .endif
        subs            w3,  w3,  #2
        prfm            pldl1strm, [x1, x2]
        st1             {v16.S}[0], [x0], x2
        st1             {v16.S}[1], [x0], x2
        b.gt            3b
        ret

        // Horizontal-only: pack row | row-shifted-by-1 and apply A|B.
4:      ld1             {v4.8B}, [x1], x2
        ld1             {v6.8B}, [x1], x2
        ext             v5.8B,  v4.8B,  v5.8B,  #1
        ext             v7.8B,  v6.8B,  v7.8B,  #1
        trn1            v4.2S,  v4.2S,  v5.2S
        trn1            v6.2S,  v6.2S,  v7.2S
        umull           v18.8H, v4.8B,  v0.8B
        umull           v19.8H, v6.8B,  v0.8B
        subs            w3,  w3,  #2
        trn1            v30.2D, v18.2D, v19.2D
        trn2            v31.2D, v18.2D, v19.2D
        add             v18.8H, v30.8H, v31.8H
        prfm            pldl1strm, [x1]
  .ifc \codec,h264
        rshrn           v16.8B, v18.8H, #6
  .else
        add             v18.8H, v18.8H, v22.8H
        shrn            v16.8B, v18.8H, #6
  .endif
  .ifc \type,avg
        ld1             {v20.S}[0], [x8], x2
        ld1             {v20.S}[1], [x8], x2
        urhadd          v16.8B, v16.8B, v20.8B
  .endif
        // NOTE(review): this second prefetch repeats [x1]; the mc8 variant
        // prefetches [x1, x2] at the corresponding point — hint-only either
        // way, but worth confirming the address is the intended one.
        prfm            pldl1strm, [x1]
        st1             {v16.S}[0], [x0], x2
        st1             {v16.S}[1], [x0], x2
        b.gt            4b
        ret

        // No interpolation: weight A == 64, i.e. a (biased) copy.
5:      ld1             {v4.S}[0], [x1], x2
        ld1             {v4.S}[1], [x1], x2
        umull           v18.8H, v4.8B,  v30.8B
        subs            w3,  w3,  #2
        prfm            pldl1strm, [x1]
  .ifc \codec,h264
        rshrn           v16.8B, v18.8H, #6
  .else
        add             v18.8H, v18.8H, v22.8H
        shrn            v16.8B, v18.8H, #6
  .endif
  .ifc \type,avg
        ld1             {v20.S}[0], [x8], x2
        ld1             {v20.S}[1], [x8], x2
        urhadd          v16.8B, v16.8B, v20.8B
  .endif
        // NOTE(review): same repeated [x1] prefetch as at 4: above — verify.
        prfm            pldl1strm, [x1]
        st1             {v16.S}[0], [x0], x2
        st1             {v16.S}[1], [x0], x2
        b.gt            5b
        ret
endfunc
.endm

// 2x-wide variant, h264 rounding only (no codec parameter).  Two output
// rows are computed at once: the second source row is byte-reversed
// (rev64 .2S) so one umull/umlal pair covers both rows, then the two
// halves of the accumulator are summed via a rev64/add fold.
// w4|w5 == 0 (tested up front with orr/cbz) short-circuits to a copy at 2:.
.macro  h264_chroma_mc2 type
function ff_\type\()_h264_chroma_mc2_neon, export=1
        prfm            pldl1strm, [x1]
        prfm            pldl1strm, [x1, x2]
        orr             w7,  w4,  w5
        cbz             w7,  2f

        // Weights as in mc8 (A, B, C, D), then interleaved per 16-bit lane:
        // v0 = A|B repeated, v1 = C|D repeated.
        mul             w7,  w4,  w5
        lsl             w14, w5,  #3
        lsl             w13, w4,  #3
        sub             w6,  w14, w7
        sub             w12, w13, w7
        sub             w4,  w7,  w13
        sub             w4,  w4,  w14
        add             w4,  w4,  #64
        dup             v0.8B,  w4
        dup             v2.8B,  w12
        dup             v1.8B,  w6
        dup             v3.8B,  w7
        trn1            v0.4H,  v0.4H,  v2.4H
        trn1            v1.4H,  v1.4H,  v3.4H
1:
        ld1             {v4.S}[0], [x1], x2          // rows n and n+1 packed in v4
        ld1             {v4.S}[1], [x1], x2
        rev64           v5.2S,  v4.2S                // v5 = rows swapped...
        ld1             {v5.S}[1], [x1]              // ...with row n+2 in the top half
        ext             v6.8B,  v4.8B,  v5.8B,  #1   // right-neighbour variants
        ext             v7.8B,  v5.8B,  v4.8B,  #1
        trn1            v4.4H,  v4.4H,  v6.4H        // interleave pixel | pixel+1 pairs
        trn1            v5.4H,  v5.4H,  v7.4H
        umull           v16.8H, v4.8B,  v0.8B        // top-row terms (A, B)
        umlal           v16.8H, v5.8B,  v1.8B        // bottom-row terms (C, D)
  .ifc \type,avg
        ld1             {v18.H}[0], [x0], x2         // existing dst pixels for averaging
        ld1             {v18.H}[2], [x0]
        sub             x0,  x0,  x2
  .endif
        rev64           v17.4S, v16.4S               // fold the two accumulator halves
        add             v16.8H, v16.8H, v17.8H
        rshrn           v16.8B, v16.8H, #6
  .ifc \type,avg
        urhadd          v16.8B, v16.8B, v18.8B
  .endif
        st1             {v16.H}[0], [x0], x2
        st1             {v16.H}[2], [x0], x2
        subs            w3,  w3,  #2
        b.gt            1b
        ret

        // x == y == 0: plain 2-pixel copy (two rows per iteration).
2:
        ld1             {v16.H}[0], [x1], x2
        ld1             {v16.H}[1], [x1], x2
  .ifc \type,avg
        ld1             {v18.H}[0], [x0], x2
        ld1             {v18.H}[1], [x0]
        sub             x0,  x0,  x2
        urhadd          v16.8B, v16.8B, v18.8B
  .endif
        st1             {v16.H}[0], [x0], x2
        st1             {v16.H}[1], [x0], x2
        subs            w3,  w3,  #2
        b.gt            2b
        ret
endfunc
.endm

        // h264 variants (round-to-nearest).
        h264_chroma_mc8 put
        h264_chroma_mc8 avg
        h264_chroma_mc4 put
        h264_chroma_mc4 avg
        h264_chroma_mc2 put
        h264_chroma_mc2 avg

#if CONFIG_RV40_DECODER
// Per-(x>>1, y>>1) rounding bias for rv40 chroma MC, indexed as
// rv40bias[(y>>1)*4 + (x>>1)] by the macros above.
const   rv40bias
        .short           0, 16, 32, 16
        .short          32, 28, 32, 28
        .short           0, 32, 16, 32
        .short          32, 28, 32, 28
endconst

        h264_chroma_mc8 put, rv40
        h264_chroma_mc8 avg, rv40
        h264_chroma_mc4 put, rv40
        h264_chroma_mc4 avg, rv40
#endif

#if CONFIG_VC1DSP
        h264_chroma_mc8 put, vc1
        h264_chroma_mc8 avg, vc1
        h264_chroma_mc4 put, vc1
        h264_chroma_mc4 avg, vc1
#endif