/*
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"

/* chroma_mc8(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int h, int x, int y) */
.macro  h264_chroma_mc8 type, codec=h264
function ff_\type\()_\codec\()_chroma_mc8_neon, export=1
        push            {r4-r7, lr}
        ldrd            r4, r5, [sp, #20]
  .ifc \type,avg
        mov             lr, r0
  .endif
        pld             [r1]
        pld             [r1, r2]

  .ifc \codec,rv40
        movrel          r6, rv40bias
        lsr             r7, r5, #1
        add             r6, r6, r7, lsl #3
        lsr             r7, r4, #1
        add             r6, r6, r7, lsl #1
        vld1.16         {d22[],d23[]}, [r6,:16]
  .endif
  .ifc \codec,vc1
        vmov.u16        q11, #28
  .endif

        @ bilinear weights from x (r4) and y (r5):
        @ r4 <- (8-x)*(8-y), r12 <- x*(8-y), r6 <- (8-x)*y, r7 <- x*y
A       muls            r7, r4, r5
T       mul             r7, r4, r5
T       cmp             r7, #0
        rsb             r6, r7, r5, lsl #3
        rsb             r12, r7, r4, lsl #3
        sub             r4, r7, r4, lsl #3
        sub             r4, r4, r5, lsl #3
        add             r4, r4, #64

        beq             2f

        vdup.8          d0, r4
        vdup.8          d1, r12
        vld1.8          {d4, d5}, [r1], r2
        vdup.8          d2, r6
        vdup.8          d3, r7
        vext.8          d5, d4, d5, #1

        @ 2-D bilinear loop (x != 0 && y != 0), two rows per iteration
1:      vld1.8          {d6, d7}, [r1], r2
        vmull.u8        q8, d4, d0
        vmlal.u8        q8, d5, d1
        vext.8          d7, d6, d7, #1
        vld1.8          {d4, d5}, [r1], r2
        vmlal.u8        q8, d6, d2
        pld             [r1]
        vext.8          d5, d4, d5, #1
        vmlal.u8        q8, d7, d3
        vmull.u8        q9, d6, d0
        subs            r3, r3, #2
        vmlal.u8        q9, d7, d1
        vmlal.u8        q9, d4, d2
        vmlal.u8        q9, d5, d3
        pld             [r1, r2]
  .ifc \codec,h264
        vrshrn.u16      d16, q8, #6
        vrshrn.u16      d17, q9, #6
  .else
        vadd.u16        q8, q8, q11
        vadd.u16        q9, q9, q11
        vshrn.u16       d16, q8, #6
        vshrn.u16       d17, q9, #6
  .endif
  .ifc \type,avg
        vld1.8          {d20}, [lr,:64], r2
        vld1.8          {d21}, [lr,:64], r2
        vrhadd.u8       q8, q8, q10
  .endif
        vst1.8          {d16}, [r0,:64], r2
        vst1.8          {d17}, [r0,:64], r2
        bgt             1b

        pop             {r4-r7, pc}

        @ x == 0 or y == 0: choose a 1-D filter or plain copy path
2:      adds            r12, r12, r6
        vdup.8          d0, r4
        beq             5f
        tst             r6, r6
        vdup.8          d1, r12

        beq             4f

        vld1.8          {d4}, [r1], r2

        @ vertical-only filter (x == 0)
3:      vld1.8          {d6}, [r1], r2
        vmull.u8        q8, d4, d0
        vmlal.u8        q8, d6, d1
        vld1.8          {d4}, [r1], r2
        vmull.u8        q9, d6, d0
        vmlal.u8        q9, d4, d1
        pld             [r1]
  .ifc \codec,h264
        vrshrn.u16      d16, q8, #6
        vrshrn.u16      d17, q9, #6
  .else
        vadd.u16        q8, q8, q11
        vadd.u16        q9, q9, q11
        vshrn.u16       d16, q8, #6
        vshrn.u16       d17, q9, #6
  .endif
        pld             [r1, r2]
  .ifc \type,avg
        vld1.8          {d20}, [lr,:64], r2
        vld1.8          {d21}, [lr,:64], r2
        vrhadd.u8       q8, q8, q10
  .endif
        subs            r3, r3, #2
        vst1.8          {d16}, [r0,:64], r2
        vst1.8          {d17}, [r0,:64], r2
        bgt             3b

        pop             {r4-r7, pc}

        @ horizontal-only filter (y == 0)
4:      vld1.8          {d4, d5}, [r1], r2
        vld1.8          {d6, d7}, [r1], r2
        vext.8          d5, d4, d5, #1
        vext.8          d7, d6, d7, #1
        pld             [r1]
        subs            r3, r3, #2
        vmull.u8        q8, d4, d0
        vmlal.u8        q8, d5, d1
        vmull.u8        q9, d6, d0
        vmlal.u8        q9, d7, d1
        pld             [r1, r2]
  .ifc \codec,h264
        vrshrn.u16      d16, q8, #6
        vrshrn.u16      d17, q9, #6
  .else
        vadd.u16        q8, q8, q11
        vadd.u16        q9, q9, q11
        vshrn.u16       d16, q8, #6
        vshrn.u16       d17, q9, #6
  .endif
  .ifc \type,avg
        vld1.8          {d20}, [lr,:64], r2
        vld1.8          {d21}, [lr,:64], r2
        vrhadd.u8       q8, q8, q10
  .endif
        vst1.8          {d16}, [r0,:64], r2
        vst1.8          {d17}, [r0,:64], r2
        bgt             4b

        pop             {r4-r7, pc}

        @ no filter (x == 0 && y == 0): copy (put) or average (avg) rows
5:      vld1.8          {d4}, [r1], r2
        vld1.8          {d5}, [r1], r2
        pld             [r1]
        subs            r3, r3, #2
        vmull.u8        q8, d4, d0
        vmull.u8        q9, d5, d0
        pld             [r1, r2]
  .ifc \codec,h264
        vrshrn.u16      d16, q8, #6
        vrshrn.u16      d17, q9, #6
  .else
        vadd.u16        q8, q8, q11
        vadd.u16        q9, q9, q11
        vshrn.u16       d16, q8, #6
        vshrn.u16       d17, q9, #6
  .endif
  .ifc \type,avg
        vld1.8          {d20}, [lr,:64], r2
        vld1.8          {d21}, [lr,:64], r2
        vrhadd.u8       q8, q8, q10
  .endif
        vst1.8          {d16}, [r0,:64], r2
        vst1.8          {d17}, [r0,:64], r2
        bgt             5b

        pop             {r4-r7, pc}
endfunc
.endm

/* chroma_mc4(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int h, int x, int y) */
.macro  h264_chroma_mc4 type, codec=h264
function ff_\type\()_\codec\()_chroma_mc4_neon, export=1
        push            {r4-r7, lr}
        ldrd            r4, r5, [sp, #20]
  .ifc \type,avg
        mov             lr, r0
  .endif
        pld             [r1]
        pld             [r1, r2]

  .ifc \codec,rv40
        movrel          r6, rv40bias
        lsr             r7, r5, #1
        add             r6, r6, r7, lsl #3
        lsr             r7, r4, #1
        add             r6, r6, r7, lsl #1
        vld1.16         {d22[],d23[]}, [r6,:16]
  .endif
  .ifc \codec,vc1
        vmov.u16        q11, #28
  .endif

        @ bilinear weights, computed as in the mc8 macro above
A       muls            r7, r4, r5
T       mul             r7, r4, r5
T       cmp             r7, #0
        rsb             r6, r7, r5, lsl #3
        rsb             r12, r7, r4, lsl #3
        sub             r4, r7, r4, lsl #3
        sub             r4, r4, r5, lsl #3
        add             r4, r4, #64

        beq             2f

        vdup.8          d0, r4
        vdup.8          d1, r12
        vld1.8          {d4}, [r1], r2
        vdup.8          d2, r6
        vdup.8          d3, r7

        vext.8          d5, d4, d5, #1
        vtrn.32         d4, d5

        vtrn.32         d0, d1
        vtrn.32         d2, d3

        @ 2-D bilinear loop (x != 0 && y != 0), two rows per iteration
1:      vld1.8          {d6}, [r1], r2
        vext.8          d7, d6, d7, #1
        vtrn.32         d6, d7
        vmull.u8        q8, d4, d0
        vmlal.u8        q8, d6, d2
        vld1.8          {d4}, [r1], r2
        vext.8          d5, d4, d5, #1
        vtrn.32         d4, d5
        pld             [r1]
        vmull.u8        q9, d6, d0
        vmlal.u8        q9, d4, d2
        vadd.i16        d16, d16, d17
        vadd.i16        d17, d18, d19
  .ifc \codec,h264
        vrshrn.u16      d16, q8, #6
  .else
        vadd.u16        q8, q8, q11
        vshrn.u16       d16, q8, #6
  .endif
        subs            r3, r3, #2
        pld             [r1, r2]
  .ifc \type,avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
  .endif
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             1b

        pop             {r4-r7, pc}

        @ x == 0 or y == 0: choose a 1-D filter or plain copy path
2:      adds            r12, r12, r6
        vdup.8          d0, r4
        beq             5f
        tst             r6, r6
        vdup.8          d1, r12
        vtrn.32         d0, d1

        beq             4f

        @ vertical-only filter (x == 0)
        vext.32         d1, d0, d1, #1
        vld1.32         {d4[0]}, [r1], r2

3:      vld1.32         {d4[1]}, [r1], r2
        vmull.u8        q8, d4, d0
        vld1.32         {d4[0]}, [r1], r2
        vmull.u8        q9, d4, d1
        vadd.i16        d16, d16, d17
        vadd.i16        d17, d18, d19
        pld             [r1]
  .ifc \codec,h264
        vrshrn.u16      d16, q8, #6
  .else
        vadd.u16        q8, q8, q11
        vshrn.u16       d16, q8, #6
  .endif
  .ifc \type,avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
  .endif
        subs            r3, r3, #2
        pld             [r1, r2]
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             3b

        pop             {r4-r7, pc}

        @ horizontal-only filter (y == 0)
4:      vld1.8          {d4}, [r1], r2
        vld1.8          {d6}, [r1], r2
        vext.8          d5, d4, d5, #1
        vext.8          d7, d6, d7, #1
        vtrn.32         d4, d5
        vtrn.32         d6, d7
        vmull.u8        q8, d4, d0
        vmull.u8        q9, d6, d0
        subs            r3, r3, #2
        vadd.i16        d16, d16, d17
        vadd.i16        d17, d18, d19
        pld             [r1]
  .ifc \codec,h264
        vrshrn.u16      d16, q8, #6
  .else
        vadd.u16        q8, q8, q11
        vshrn.u16       d16, q8, #6
  .endif
  .ifc \type,avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
  .endif
        pld             [r1]
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             4b

        pop             {r4-r7, pc}

5:      vld1.32         {d4[0]}, [r1], r2
        vld1.32         {d4[1]}, [r1], r2
        vmull.u8        q8, d4, d0
        subs            r3, r3, #2
        pld             [r1]
  .ifc \codec,h264
        vrshrn.u16      d16, q8, #6
  .else
        vadd.u16        q8, q8, q11
        vshrn.u16       d16, q8, #6
  .endif
  .ifc \type,avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
  .endif
        pld             [r1]
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             5b

        pop             {r4-r7, pc}
endfunc
.endm

.macro  h264_chroma_mc2 type
function ff_\type\()_h264_chroma_mc2_neon, export=1
        push            {r4-r6, lr}
        ldr             r4, [sp, #16]
        ldr             lr, [sp, #20]
        pld             [r1]
        pld             [r1, r2]
        orrs            r5, r4, lr
        beq             2f

        mul             r5, r4, lr
        rsb             r6, r5, lr, lsl #3
        rsb             r12, r5, r4, lsl #3
        sub             r4, r5, r4, lsl #3
        sub             r4, r4, lr, lsl #3
        add             r4, r4, #64
        vdup.8          d0, r4
        vdup.8          d2, r12
        vdup.8          d1, r6
        vdup.8          d3, r5
        vtrn.16         q0, q1
1:
        vld1.32         {d4[0]}, [r1], r2
        vld1.32         {d4[1]}, [r1], r2
        vrev64.32       d5, d4
        vld1.32         {d5[1]}, [r1]
        vext.8          q3, q2, q2, #1
        vtrn.16         q2, q3
        vmull.u8        q8, d4, d0
        vmlal.u8        q8, d5, d1
  .ifc \type,avg
        vld1.16         {d18[0]}, [r0,:16], r2
        vld1.16         {d18[1]}, [r0,:16]
        sub             r0, r0, r2
  .endif
        vtrn.32         d16, d17
        vadd.i16        d16, d16, d17
        vrshrn.u16      d16, q8, #6
  .ifc \type,avg
        vrhadd.u8       d16, d16, d18
  .endif
        vst1.16         {d16[0]}, [r0,:16], r2
        vst1.16         {d16[1]}, [r0,:16], r2
        subs            r3, r3, #2
        bgt             1b
        pop             {r4-r6, pc}
2:
  .ifc \type,put
        ldrh_post       r5, r1, r2
        strh_post       r5, r0, r2
        ldrh_post       r6, r1, r2
        strh_post       r6, r0, r2
  .else
        vld1.16         {d16[0]}, [r1], r2
        vld1.16         {d16[1]}, [r1], r2
        vld1.16         {d18[0]}, [r0,:16], r2
        vld1.16         {d18[1]}, [r0,:16]
        sub             r0, r0, r2
        vrhadd.u8       d16, d16, d18
        vst1.16         {d16[0]}, [r0,:16], r2
        vst1.16         {d16[1]}, [r0,:16], r2
  .endif
        subs            r3, r3, #2
        bgt             2b
        pop             {r4-r6, pc}
endfunc
.endm

        h264_chroma_mc8 put
        h264_chroma_mc8 avg
        h264_chroma_mc4 put
        h264_chroma_mc4 avg
        h264_chroma_mc2 put
        h264_chroma_mc2 avg

#if CONFIG_RV40_DECODER
const   rv40bias
        .short           0, 16, 32, 16
        .short          32, 28, 32, 28
        .short           0, 32, 16, 32
        .short          32, 28, 32, 28
endconst

        h264_chroma_mc8 put, rv40
        h264_chroma_mc8 avg, rv40
        h264_chroma_mc4 put, rv40
        h264_chroma_mc4 avg, rv40
#endif

#if CONFIG_VC1DSP
        h264_chroma_mc8 put, vc1
        h264_chroma_mc8 avg, vc1
        h264_chroma_mc4 put, vc1
        h264_chroma_mc4 avg, vc1
#endif
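
/* For reference only (comment, not assembled): a scalar sketch of the
 * bilinear chroma filter the loops above implement, assuming the H.264
 * rounding variant (the rv40/vc1 variants add their codec-specific bias
 * from q11 before a truncating shift instead of rounding).  Variable
 * names A, B, C, D, w, i, j are illustrative; dst, src, stride, h, x, y
 * follow the prototypes documented with the macros.
 *
 *     A = (8 - x) * (8 - y);  B = x * (8 - y);
 *     C = (8 - x) * y;        D = x * y;
 *     for (i = 0; i < h; i++) {
 *         for (j = 0; j < w; j++)              // w = 8, 4 or 2
 *             dst[j] = (A * src[j]          + B * src[j + 1] +
 *                       C * src[j + stride] + D * src[j + stride + 1] +
 *                       32) >> 6;
 *         dst += stride;
 *         src += stride;
 *     }
 */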