/*
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config_components.h"

#include "libavutil/arm/asm.S"

/* chroma_mc8(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int h, int x, int y) */
.macro  h264_chroma_mc8 type, codec=h264
function ff_\type\()_\codec\()_chroma_mc8_neon, export=1
        push            {r4-r7, lr}
        ldrd            r4,  r5,  [sp, #20]
  .ifc \type,avg
        mov             lr,  r0
  .endif
        pld             [r1]
        pld             [r1, r2]

  .ifc \codec,rv40
        movrel          r6,  rv40bias
        lsr             r7,  r5,  #1
        add             r6,  r6,  r7,  lsl #3
        lsr             r7,  r4,  #1
        add             r6,  r6,  r7,  lsl #1
        vld1.16         {d22[],d23[]}, [r6,:16]
  .endif
  .ifc \codec,vc1
        vmov.u16        q11, #28
  .endif

        /* bilinear weights: r4 = (8-x)(8-y), r12 = x(8-y), r6 = (8-x)y, r7 = xy */
A       muls            r7,  r4,  r5
T       mul             r7,  r4,  r5
T       cmp             r7,  #0
        rsb             r6,  r7,  r5,  lsl #3
        rsb             r12, r7,  r4,  lsl #3
        sub             r4,  r7,  r4,  lsl #3
        sub             r4,  r4,  r5,  lsl #3
        add             r4,  r4,  #64

        beq             2f                      @ x == 0 || y == 0

        vdup.8          d0,  r4
        vdup.8          d1,  r12
        vld1.8          {d4, d5}, [r1], r2
        vdup.8          d2,  r6
        vdup.8          d3,  r7
        vext.8          d5,  d4,  d5,  #1

1:      vld1.8          {d6, d7}, [r1], r2
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d5,  d1
        vext.8          d7,  d6,  d7,  #1
        vld1.8          {d4, d5}, [r1], r2
        vmlal.u8        q8,  d6,  d2
        pld             [r1]
        vext.8          d5,  d4,  d5,  #1
        vmlal.u8        q8,  d7,  d3
        vmull.u8        q9,  d6,  d0
        subs            r3,  r3,  #2
        vmlal.u8        q9,  d7,  d1
        vmlal.u8        q9,  d4,  d2
        vmlal.u8        q9,  d5,  d3
        pld             [r1, r2]
  .ifc \codec,h264
        vrshrn.u16      d16, q8,  #6
        vrshrn.u16      d17, q9,  #6
  .else
        vadd.u16        q8,  q8,  q11
        vadd.u16        q9,  q9,  q11
        vshrn.u16       d16, q8,  #6
        vshrn.u16       d17, q9,  #6
  .endif
  .ifc \type,avg
        vld1.8          {d20}, [lr,:64], r2
        vld1.8          {d21}, [lr,:64], r2
        vrhadd.u8       q8,  q8,  q10
  .endif
        vst1.8          {d16}, [r0,:64], r2
        vst1.8          {d17}, [r0,:64], r2
        bgt             1b

        pop             {r4-r7, pc}

2:      adds            r12, r12, r6
        vdup.8          d0,  r4
        beq             5f                      @ x == 0 && y == 0: copy only
        tst             r6,  r6
        vdup.8          d1,  r12

        beq             4f                      @ y == 0: horizontal filter only

        vld1.8          {d4}, [r1], r2

3:      vld1.8          {d6}, [r1], r2          @ x == 0: vertical filter only
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d6,  d1
        vld1.8          {d4}, [r1], r2
        vmull.u8        q9,  d6,  d0
        vmlal.u8        q9,  d4,  d1
        pld             [r1]
  .ifc \codec,h264
        vrshrn.u16      d16, q8,  #6
        vrshrn.u16      d17, q9,  #6
  .else
        vadd.u16        q8,  q8,  q11
        vadd.u16        q9,  q9,  q11
        vshrn.u16       d16, q8,  #6
        vshrn.u16       d17, q9,  #6
  .endif
        pld             [r1, r2]
  .ifc \type,avg
        vld1.8          {d20}, [lr,:64], r2
        vld1.8          {d21}, [lr,:64], r2
        vrhadd.u8       q8,  q8,  q10
  .endif
        subs            r3,  r3,  #2
        vst1.8          {d16}, [r0,:64], r2
        vst1.8          {d17}, [r0,:64], r2
        bgt             3b

        pop             {r4-r7, pc}

4:      vld1.8          {d4, d5}, [r1], r2
        vld1.8          {d6, d7}, [r1], r2
        vext.8          d5,  d4,  d5,  #1
        vext.8          d7,  d6,  d7,  #1
        pld             [r1]
        subs            r3,  r3,  #2
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d5,  d1
        vmull.u8        q9,  d6,  d0
        vmlal.u8        q9,  d7,  d1
        pld             [r1, r2]
  .ifc \codec,h264
        vrshrn.u16      d16, q8,  #6
        vrshrn.u16      d17, q9,  #6
  .else
        vadd.u16        q8,  q8,  q11
        vadd.u16        q9,  q9,  q11
        vshrn.u16       d16, q8,  #6
        vshrn.u16       d17, q9,  #6
  .endif
  .ifc \type,avg
        vld1.8          {d20}, [lr,:64], r2
        vld1.8          {d21}, [lr,:64], r2
        vrhadd.u8       q8,  q8,  q10
  .endif
        vst1.8          {d16}, [r0,:64], r2
        vst1.8          {d17}, [r0,:64], r2
        bgt             4b

        pop             {r4-r7, pc}

5:      vld1.8          {d4}, [r1], r2
        vld1.8          {d5}, [r1], r2
        pld             [r1]
        subs            r3,  r3,  #2
        vmull.u8        q8,  d4,  d0
        vmull.u8        q9,  d5,  d0
        pld             [r1, r2]
  .ifc \codec,h264
        vrshrn.u16      d16, q8,  #6
        vrshrn.u16      d17, q9,  #6
  .else
        vadd.u16        q8,  q8,  q11
        vadd.u16        q9,  q9,  q11
        vshrn.u16       d16, q8,  #6
        vshrn.u16       d17, q9,  #6
  .endif
  .ifc \type,avg
        vld1.8          {d20}, [lr,:64], r2
        vld1.8          {d21}, [lr,:64], r2
        vrhadd.u8       q8,  q8,  q10
  .endif
        vst1.8          {d16}, [r0,:64], r2
        vst1.8          {d17}, [r0,:64], r2
        bgt             5b

        pop             {r4-r7, pc}
endfunc
.endm

/* chroma_mc4(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int h, int x, int y) */
.macro  h264_chroma_mc4 type, codec=h264
function ff_\type\()_\codec\()_chroma_mc4_neon, export=1
        push            {r4-r7, lr}
        ldrd            r4,  r5,  [sp, #20]
  .ifc \type,avg
        mov             lr,  r0
  .endif
        pld             [r1]
        pld             [r1, r2]

  .ifc \codec,rv40
        movrel          r6,  rv40bias
        lsr             r7,  r5,  #1
        add             r6,  r6,  r7,  lsl #3
        lsr             r7,  r4,  #1
        add             r6,  r6,  r7,  lsl #1
        vld1.16         {d22[],d23[]}, [r6,:16]
  .endif
  .ifc \codec,vc1
        vmov.u16        q11, #28
  .endif

        /* bilinear weights, as in h264_chroma_mc8 above */
A       muls            r7,  r4,  r5
T       mul             r7,  r4,  r5
T       cmp             r7,  #0
        rsb             r6,  r7,  r5,  lsl #3
        rsb             r12, r7,  r4,  lsl #3
        sub             r4,  r7,  r4,  lsl #3
        sub             r4,  r4,  r5,  lsl #3
        add             r4,  r4,  #64

        beq             2f                      @ x == 0 || y == 0

        vdup.8          d0,  r4
        vdup.8          d1,  r12
        vld1.8          {d4}, [r1], r2
        vdup.8          d2,  r6
        vdup.8          d3,  r7

        vext.8          d5,  d4,  d5,  #1
        vtrn.32         d4,  d5

        vtrn.32         d0,  d1
        vtrn.32         d2,  d3

1:      vld1.8          {d6}, [r1], r2
        vext.8          d7,  d6,  d7,  #1
        vtrn.32         d6,  d7
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d6,  d2
        vld1.8          {d4}, [r1], r2
        vext.8          d5,  d4,  d5,  #1
        vtrn.32         d4,  d5
        pld             [r1]
        vmull.u8        q9,  d6,  d0
        vmlal.u8        q9,  d4,  d2
        vadd.i16        d16, d16, d17
        vadd.i16        d17, d18, d19
  .ifc \codec,h264
        vrshrn.u16      d16, q8,  #6
  .else
        vadd.u16        q8,  q8,  q11
        vshrn.u16       d16, q8,  #6
  .endif
        subs            r3,  r3,  #2
        pld             [r1, r2]
  .ifc \type,avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
  .endif
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             1b

        pop             {r4-r7, pc}

2:      adds            r12, r12, r6
        vdup.8          d0,  r4
        beq             5f                      @ x == 0 && y == 0: copy only
        tst             r6,  r6
        vdup.8          d1,  r12
        vtrn.32         d0,  d1

        beq             4f                      @ y == 0: horizontal filter only

        vext.32         d1,  d0,  d1,  #1
        vld1.32         {d4[0]}, [r1], r2

3:      vld1.32         {d4[1]}, [r1], r2       @ x == 0: vertical filter only
        vmull.u8        q8,  d4,  d0
        vld1.32         {d4[0]}, [r1], r2
        vmull.u8        q9,  d4,  d1
        vadd.i16        d16, d16, d17
        vadd.i16        d17, d18, d19
        pld             [r1]
  .ifc \codec,h264
        vrshrn.u16      d16, q8,  #6
  .else
        vadd.u16        q8,  q8,  q11
        vshrn.u16       d16, q8,  #6
  .endif
  .ifc \type,avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
  .endif
        subs            r3,  r3,  #2
        pld             [r1, r2]
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             3b

        pop             {r4-r7, pc}

4:      vld1.8          {d4}, [r1], r2
        vld1.8          {d6}, [r1], r2
        vext.8          d5,  d4,  d5,  #1
        vext.8          d7,  d6,  d7,  #1
        vtrn.32         d4,  d5
        vtrn.32         d6,  d7
        vmull.u8        q8,  d4,  d0
        vmull.u8        q9,  d6,  d0
        subs            r3,  r3,  #2
        vadd.i16        d16, d16, d17
        vadd.i16        d17, d18, d19
        pld             [r1]
  .ifc \codec,h264
        vrshrn.u16      d16, q8,  #6
  .else
        vadd.u16        q8,  q8,  q11
        vshrn.u16       d16, q8,  #6
  .endif
  .ifc \type,avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
  .endif
        pld             [r1]
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             4b

        pop             {r4-r7, pc}

5:      vld1.32         {d4[0]}, [r1], r2
        vld1.32         {d4[1]}, [r1], r2
        vmull.u8        q8,  d4,  d0
        subs            r3,  r3,  #2
        pld             [r1]
  .ifc \codec,h264
        vrshrn.u16      d16, q8,  #6
  .else
        vadd.u16        q8,  q8,  q11
        vshrn.u16       d16, q8,  #6
  .endif
  .ifc \type,avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
  .endif
        pld             [r1]
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             5b

        pop             {r4-r7, pc}
endfunc
.endm

/* chroma_mc2(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int h, int x, int y) */
.macro  h264_chroma_mc2 type
function ff_\type\()_h264_chroma_mc2_neon, export=1
        push            {r4-r6, lr}
        ldr             r4,  [sp, #16]
        ldr             lr,  [sp, #20]
        pld             [r1]
        pld             [r1, r2]
        orrs            r5,  r4,  lr
        beq             2f                      @ x == 0 && y == 0

        /* bilinear weights: r4 = (8-x)(8-y), r12 = x(8-y), r6 = (8-x)y, r5 = xy */
        mul             r5,  r4,  lr
        rsb             r6,  r5,  lr,  lsl #3
        rsb             r12, r5,  r4,  lsl #3
        sub             r4,  r5,  r4,  lsl #3
        sub             r4,  r4,  lr,  lsl #3
        add             r4,  r4,  #64
        vdup.8          d0,  r4
        vdup.8          d2,  r12
        vdup.8          d1,  r6
        vdup.8          d3,  r5
        vtrn.16         q0,  q1
1:
        vld1.32         {d4[0]}, [r1], r2
        vld1.32         {d4[1]}, [r1], r2
        vrev64.32       d5,  d4
        vld1.32         {d5[1]}, [r1]
        vext.8          q3,  q2,  q2,  #1
        vtrn.16         q2,  q3
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d5,  d1
  .ifc \type,avg
        vld1.16         {d18[0]}, [r0,:16], r2
        vld1.16         {d18[1]}, [r0,:16]
        sub             r0,  r0,  r2
  .endif
        vtrn.32         d16, d17
        vadd.i16        d16, d16, d17
        vrshrn.u16      d16, q8,  #6
  .ifc \type,avg
        vrhadd.u8       d16, d16, d18
  .endif
        vst1.16         {d16[0]}, [r0,:16], r2
        vst1.16         {d16[1]}, [r0,:16], r2
        subs            r3,  r3,  #2
        bgt             1b
        pop             {r4-r6, pc}
2:
  .ifc \type,put
        ldrh_post       r5,  r1,  r2
        strh_post       r5,  r0,  r2
        ldrh_post       r6,  r1,  r2
        strh_post       r6,  r0,  r2
  .else
        vld1.16         {d16[0]}, [r1], r2
        vld1.16         {d16[1]}, [r1], r2
        vld1.16         {d18[0]}, [r0,:16], r2
        vld1.16         {d18[1]}, [r0,:16]
        sub             r0,  r0,  r2
        vrhadd.u8       d16, d16, d18
        vst1.16         {d16[0]}, [r0,:16], r2
        vst1.16         {d16[1]}, [r0,:16], r2
  .endif
        subs            r3,  r3,  #2
        bgt             2b
        pop             {r4-r6, pc}
endfunc
.endm

        h264_chroma_mc8 put
        h264_chroma_mc8 avg
        h264_chroma_mc4 put
        h264_chroma_mc4 avg
        h264_chroma_mc2 put
        h264_chroma_mc2 avg

#if CONFIG_RV40_DECODER
const   rv40bias
        .short          0, 16, 32, 16
        .short          32, 28, 32, 28
        .short          0, 32, 16, 32
        .short          32, 28, 32, 28
endconst

        h264_chroma_mc8 put, rv40
        h264_chroma_mc8 avg, rv40
        h264_chroma_mc4 put, rv40
        h264_chroma_mc4 avg, rv40
#endif

#if CONFIG_VC1DSP
        h264_chroma_mc8 put, vc1
        h264_chroma_mc8 avg, vc1
        h264_chroma_mc4 put, vc1
        h264_chroma_mc4 avg, vc1
#endif
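
/* For reference, a minimal scalar C sketch of the filter the macros above
 * implement (illustration only, not part of the build; the function and
 * parameter names below are made up for the example). x and y are the
 * fractional chroma MV components (0-7), so the four weights sum to 64.
 * The rounding shown is the plain H.264 one; the RV40 variants instead add
 * a bias from the rv40bias table and the VC-1 variants a constant 28, both
 * followed by a truncating shift rather than the rounding vrshrn. The avg
 * variants additionally average the result with the existing dst pixels
 * using a rounded halving add.
 *
 *     static void chroma_mc_ref(uint8_t *dst, const uint8_t *src,
 *                               ptrdiff_t stride, int w, int h, int x, int y)
 *     {
 *         const int A = (8 - x) * (8 - y), B = x * (8 - y);
 *         const int C = (8 - x) * y,       D = x * y;
 *         for (int j = 0; j < h; j++, dst += stride, src += stride)
 *             for (int i = 0; i < w; i++)
 *                 dst[i] = (A * src[i]          + B * src[i + 1] +
 *                           C * src[i + stride] + D * src[i + stride + 1] +
 *                           32) >> 6;
 *     }
 */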