/*
 * VP8 NEON optimisations
 *
 * Copyright (c) 2010 Rob Clark <rob@ti.com>
 * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"
#include "neon.S"

function ff_vp8_luma_dc_wht_neon, export=1
        vld1.16         {q0-q1},  [r1,:128]
        vmov.i16        q15, #0

        vadd.i16        d4,  d0,  d3
        vadd.i16        d6,  d1,  d2
        vst1.16         {q15},    [r1,:128]!
        vsub.i16        d7,  d1,  d2
        vsub.i16        d5,  d0,  d3
        vst1.16         {q15},    [r1,:128]
        vadd.i16        q0,  q2,  q3
        vsub.i16        q1,  q2,  q3

        vmov.i16        q8,  #3

        vtrn.32         d0,  d2
        vtrn.32         d1,  d3
        vtrn.16         d0,  d1
        vtrn.16         d2,  d3

        vadd.i16        d0,  d0,  d16

        vadd.i16        d4,  d0,  d3
        vadd.i16        d6,  d1,  d2
        vsub.i16        d7,  d1,  d2
        vsub.i16        d5,  d0,  d3
        vadd.i16        q0,  q2,  q3
        vsub.i16        q1,  q2,  q3

        vshr.s16        q0,  q0,  #3
        vshr.s16        q1,  q1,  #3

        mov             r3,  #32
        vst1.16         {d0[0]},  [r0,:16], r3
        vst1.16         {d1[0]},  [r0,:16], r3
        vst1.16         {d2[0]},  [r0,:16], r3
        vst1.16         {d3[0]},  [r0,:16], r3
        vst1.16         {d0[1]},  [r0,:16], r3
        vst1.16         {d1[1]},  [r0,:16], r3
        vst1.16         {d2[1]},  [r0,:16], r3
        vst1.16         {d3[1]},  [r0,:16], r3
        vst1.16         {d0[2]},  [r0,:16], r3
        vst1.16         {d1[2]},  [r0,:16], r3
        vst1.16         {d2[2]},  [r0,:16], r3
        vst1.16         {d3[2]},  [r0,:16], r3
        vst1.16         {d0[3]},  [r0,:16], r3
        vst1.16         {d1[3]},  [r0,:16], r3
        vst1.16         {d2[3]},  [r0,:16], r3
        vst1.16         {d3[3]},  [r0,:16], r3

        bx              lr
endfunc

function ff_vp8_idct_add_neon, export=1
        vld1.16         {q0-q1},  [r1,:128]
        movw            r3,  #20091
        movt            r3,  #35468/2
        vdup.32         d4,  r3

        vmull.s16       q12, d1,  d4[0]
        vmull.s16       q13, d3,  d4[0]
        vqdmulh.s16     d20, d1,  d4[1]
        vqdmulh.s16     d23, d3,  d4[1]
        vshrn.s32       d21, q12, #16
        vshrn.s32       d22, q13, #16
        vadd.s16        d21, d21, d1
        vadd.s16        d22, d22, d3

        vadd.s16        d16, d0,  d2
        vsub.s16        d17, d0,  d2
        vadd.s16        d18, d21, d23
        vsub.s16        d19, d20, d22
        vadd.s16        q0,  q8,  q9
        vsub.s16        q1,  q8,  q9

        vtrn.32         d0,  d3
        vtrn.32         d1,  d2
        vtrn.16         d0,  d1
        vtrn.16         d3,  d2

        vmov.i16        q15, #0
        vmull.s16       q12, d1,  d4[0]
        vst1.16         {q15},    [r1,:128]!
        vmull.s16       q13, d2,  d4[0]
        vst1.16         {q15},    [r1,:128]
        vqdmulh.s16     d21, d1,  d4[1]
        vqdmulh.s16     d23, d2,  d4[1]
        vshrn.s32       d20, q12, #16
        vshrn.s32       d22, q13, #16
        vadd.i16        d20, d20, d1
        vadd.i16        d22, d22, d2

        vadd.i16        d16, d0,  d3
        vsub.i16        d17, d0,  d3
        vadd.i16        d18, d20, d23
        vld1.32         {d20[]},  [r0,:32], r2
        vsub.i16        d19, d21, d22
        vld1.32         {d22[]},  [r0,:32], r2
        vadd.s16        q0,  q8,  q9
        vld1.32         {d23[]},  [r0,:32], r2
        vsub.s16        q1,  q8,  q9
        vld1.32         {d21[]},  [r0,:32], r2
        vrshr.s16       q0,  q0,  #3
        vtrn.32         q10, q11
        vrshr.s16       q1,  q1,  #3

        sub             r0,  r0,  r2,  lsl #2

        vtrn.32         d0,  d3
        vtrn.32         d1,  d2
        vtrn.16         d0,  d1
        vtrn.16         d3,  d2

        vaddw.u8        q0,  q0,  d20
        vaddw.u8        q1,  q1,  d21
        vqmovun.s16     d0,  q0
        vqmovun.s16     d1,  q1

        vst1.32         {d0[0]},  [r0,:32], r2
        vst1.32         {d0[1]},  [r0,:32], r2
        vst1.32         {d1[1]},  [r0,:32], r2
        vst1.32         {d1[0]},  [r0,:32], r2

        bx              lr
endfunc

function ff_vp8_idct_dc_add_neon, export=1
        mov             r3,  #0
        ldrsh           r12, [r1]
        strh            r3,  [r1]
        vdup.16         q1,  r12
        vrshr.s16       q1,  q1,  #3
        vld1.32         {d0[]},   [r0,:32], r2
        vld1.32         {d1[]},   [r0,:32], r2
        vld1.32         {d0[1]},  [r0,:32], r2
        vld1.32         {d1[1]},  [r0,:32], r2
        vaddw.u8        q2,  q1,  d0
        vaddw.u8        q3,  q1,  d1
        sub             r0,  r0,  r2,  lsl #2
        vqmovun.s16     d0,  q2
        vqmovun.s16     d1,  q3
        vst1.32         {d0[0]},  [r0,:32], r2
        vst1.32         {d1[0]},  [r0,:32], r2
        vst1.32         {d0[1]},  [r0,:32], r2
        vst1.32         {d1[1]},  [r0,:32], r2
        bx              lr
endfunc

function ff_vp8_idct_dc_add4uv_neon, export=1
        vmov.i16        d0,  #0
        mov             r3,  #32
        vld1.16         {d16[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        vld1.16         {d17[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        vld1.16         {d18[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        vld1.16         {d19[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        mov             r3,  r0
        vrshr.s16       q8,  q8,  #3            @ dc >>= 3
        vld1.8          {d0},     [r0,:64], r2
        vrshr.s16       q9,  q9,  #3
        vld1.8          {d1},     [r0,:64], r2
        vaddw.u8        q10, q8,  d0
        vld1.8          {d2},     [r0,:64], r2
        vaddw.u8        q0,  q8,  d1
        vld1.8          {d3},     [r0,:64], r2
        vaddw.u8        q11, q8,  d2
        vld1.8          {d4},     [r0,:64], r2
        vaddw.u8        q1,  q8,  d3
        vld1.8          {d5},     [r0,:64], r2
        vaddw.u8        q12, q9,  d4
        vld1.8          {d6},     [r0,:64], r2
        vaddw.u8        q2,  q9,  d5
        vld1.8          {d7},     [r0,:64], r2
        vaddw.u8        q13, q9,  d6
        vqmovun.s16     d20, q10
        vaddw.u8        q3,  q9,  d7
        vqmovun.s16     d21, q0
        vqmovun.s16     d22, q11
        vst1.8          {d20},    [r3,:64], r2
        vqmovun.s16     d23, q1
        vst1.8          {d21},    [r3,:64], r2
        vqmovun.s16     d24, q12
        vst1.8          {d22},    [r3,:64], r2
        vqmovun.s16     d25, q2
        vst1.8          {d23},    [r3,:64], r2
        vqmovun.s16     d26, q13
        vst1.8          {d24},    [r3,:64], r2
        vqmovun.s16     d27, q3
        vst1.8          {d25},    [r3,:64], r2
        vst1.8          {d26},    [r3,:64], r2
        vst1.8          {d27},    [r3,:64], r2

        bx              lr
endfunc

function ff_vp8_idct_dc_add4y_neon, export=1
        vmov.i16        d0,  #0
        mov             r3,  #32
        vld1.16         {d16[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        vld1.16         {d17[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        vld1.16         {d18[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        vld1.16         {d19[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        vrshr.s16       q8,  q8,  #3            @ dc >>= 3
        vld1.8          {q0},     [r0,:128], r2
        vrshr.s16       q9,  q9,  #3
        vld1.8          {q1},     [r0,:128], r2
        vaddw.u8        q10, q8,  d0
        vld1.8          {q2},     [r0,:128], r2
        vaddw.u8        q0,  q9,  d1
        vld1.8          {q3},     [r0,:128], r2
        vaddw.u8        q11, q8,  d2
        vaddw.u8        q1,  q9,  d3
        vaddw.u8        q12, q8,  d4
        vaddw.u8        q2,  q9,  d5
        vaddw.u8        q13, q8,  d6
        vaddw.u8        q3,  q9,  d7
        sub             r0,  r0,  r2,  lsl #2
        vqmovun.s16     d20, q10
        vqmovun.s16     d21, q0
        vqmovun.s16     d22, q11
        vqmovun.s16     d23, q1
        vqmovun.s16     d24, q12
        vst1.8          {q10},    [r0,:128], r2
        vqmovun.s16     d25, q2
        vst1.8          {q11},    [r0,:128], r2
        vqmovun.s16     d26, q13
        vst1.8          {q12},    [r0,:128], r2
        vqmovun.s16     d27, q3
        vst1.8          {q13},    [r0,:128], r2

        bx              lr
endfunc

@ Register layout:
@   P3..Q3 -> q0..q7
@   flim_E -> q14
@   flim_I -> q15
@   hev_thresh -> r12
@
.macro vp8_loop_filter, inner=0, simple=0
    .if \simple
        vabd.u8         q9,  q3,  q4            @ abs(P0-Q0)
        vabd.u8         q15, q2,  q5            @ abs(P1-Q1)
        vqadd.u8        q9,  q9,  q9            @ abs(P0-Q0) * 2
        vshr.u8         q10, q15, #1            @ abs(P1-Q1) / 2
        vqadd.u8        q11, q9,  q10           @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2)
        vmov.i8         q13, #0x80
        vcle.u8         q8,  q11, q14           @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim
    .else
        @ calculate hev and normal_limit:
        vabd.u8         q12, q2,  q3            @ abs(P1-P0)
        vabd.u8         q13, q5,  q4            @ abs(Q1-Q0)
        vabd.u8         q10, q0,  q1            @ abs(P3-P2)
        vabd.u8         q11, q1,  q2            @ abs(P2-P1)
        vcle.u8         q8,  q12, q15           @ abs(P1-P0) <= flim_I
        vcle.u8         q9,  q13, q15           @ abs(Q1-Q0) <= flim_I
        vcle.u8         q10, q10, q15           @ abs(P3-P2) <= flim_I
        vcle.u8         q11, q11, q15           @ abs(P2-P1) <= flim_I
        vand            q8,  q8,  q9
        vabd.u8         q9,  q7,  q6            @ abs(Q3-Q2)
        vand            q8,  q8,  q11
        vabd.u8         q11, q6,  q5            @ abs(Q2-Q1)
        vand            q8,  q8,  q10
        vcle.u8         q10, q9,  q15           @ abs(Q3-Q2) <= flim_I
        vcle.u8         q11, q11, q15           @ abs(Q2-Q1) <= flim_I
        vabd.u8         q9,  q3,  q4            @ abs(P0-Q0)
        vabd.u8         q15, q2,  q5            @ abs(P1-Q1)
        vand            q8,  q8,  q10
        vqadd.u8        q9,  q9,  q9            @ abs(P0-Q0) * 2
        vand            q8,  q8,  q11
        vshr.u8         q10, q15, #1            @ abs(P1-Q1) / 2
        vdup.8          q15, r12                @ hev_thresh
        vqadd.u8        q11, q9,  q10           @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2)
        vcgt.u8         q12, q12, q15           @ abs(P1-P0) > hev_thresh
        vcle.u8         q11, q11, q14           @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim_E
        vcgt.u8         q14, q13, q15           @ abs(Q1-Q0) > hev_thresh
        vand            q8,  q8,  q11
        vmov.i8         q13, #0x80
        vorr            q9,  q12, q14
    .endif

        @ at this point:
        @   q8: normal_limit
        @   q9: hev

        @ convert to signed value:
        veor            q3,  q3,  q13           @ PS0 = P0 ^ 0x80
        veor            q4,  q4,  q13           @ QS0 = Q0 ^ 0x80

        vmov.i16        q12, #3
        vsubl.s8        q10, d8,  d6            @ QS0 - PS0
        vsubl.s8        q11, d9,  d7            @   (widened to 16 bits)
        veor            q2,  q2,  q13           @ PS1 = P1 ^ 0x80
        veor            q5,  q5,  q13           @ QS1 = Q1 ^ 0x80
        vmul.i16        q10, q10, q12           @ w = 3 * (QS0 - PS0)
        vmul.i16        q11, q11, q12

        vqsub.s8        q12, q2,  q5            @ clamp(PS1-QS1)
        vmov.i8         q14, #4
        vmov.i8         q15, #3
    .if \inner
        vand            q12, q12, q9            @ if(hev) w += clamp(PS1-QS1)
    .endif
        vaddw.s8        q10, q10, d24           @ w += clamp(PS1-QS1)
        vaddw.s8        q11, q11, d25
        vqmovn.s16      d20, q10                @ narrow result back into q10
        vqmovn.s16      d21, q11
    .if !\inner && !\simple
        veor            q1,  q1,  q13           @ PS2 = P2 ^ 0x80
        veor            q6,  q6,  q13           @ QS2 = Q2 ^ 0x80
    .endif
        vand            q10, q10, q8            @ w &= normal_limit

        @ registers used at this point..
        @   q0 -> P3  (don't corrupt)
        @   q1-q6 -> PS2-QS2
        @   q7 -> Q3  (don't corrupt)
        @   q9 -> hev
        @   q10 -> w
        @   q13 -> #0x80
        @   q14 -> #4
        @   q15 -> #3
        @   q8, q11, q12 -> unused

        @ filter_common: is4tap==1
        @   c1 = clamp(w + 4) >> 3;
        @   c2 = clamp(w + 3) >> 3;
        @   Q0 = s2u(QS0 - c1);
        @   P0 = s2u(PS0 + c2);

    .if \simple
        vqadd.s8        q11, q10, q14           @ c1 = clamp((w&hev)+4)
        vqadd.s8        q12, q10, q15           @ c2 = clamp((w&hev)+3)
        vshr.s8         q11, q11, #3            @ c1 >>= 3
        vshr.s8         q12, q12, #3            @ c2 >>= 3
        vqsub.s8        q4,  q4,  q11           @ QS0 = clamp(QS0-c1)
        vqadd.s8        q3,  q3,  q12           @ PS0 = clamp(PS0+c2)
        veor            q4,  q4,  q13           @ Q0 = QS0 ^ 0x80
        veor            q3,  q3,  q13           @ P0 = PS0 ^ 0x80
        veor            q5,  q5,  q13           @ Q1 = QS1 ^ 0x80
        veor            q2,  q2,  q13           @ P1 = PS1 ^ 0x80
    .elseif \inner
        @ the !is4tap case of filter_common, only used for inner blocks
        @   c3 = ((c1&~hev) + 1) >> 1;
        @   Q1 = s2u(QS1 - c3);
        @   P1 = s2u(PS1 + c3);
        vqadd.s8        q11, q10, q14           @ c1 = clamp((w&hev)+4)
        vqadd.s8        q12, q10, q15           @ c2 = clamp((w&hev)+3)
        vshr.s8         q11, q11, #3            @ c1 >>= 3
        vshr.s8         q12, q12, #3            @ c2 >>= 3
        vqsub.s8        q4,  q4,  q11           @ QS0 = clamp(QS0-c1)
        vqadd.s8        q3,  q3,  q12           @ PS0 = clamp(PS0+c2)
        vbic            q11, q11, q9            @ c1 & ~hev
        veor            q4,  q4,  q13           @ Q0 = QS0 ^ 0x80
        vrshr.s8        q11, q11, #1            @ c3 >>= 1
        veor            q3,  q3,  q13           @ P0 = PS0 ^ 0x80
        vqsub.s8        q5,  q5,  q11           @ QS1 = clamp(QS1-c3)
        vqadd.s8        q2,  q2,  q11           @ PS1 = clamp(PS1+c3)
        veor            q5,  q5,  q13           @ Q1 = QS1 ^ 0x80
        veor            q2,  q2,  q13           @ P1 = PS1 ^ 0x80
    .else
        vand            q12, q10, q9            @ w & hev
        vqadd.s8        q11, q12, q14           @ c1 = clamp((w&hev)+4)
        vqadd.s8        q12, q12, q15           @ c2 = clamp((w&hev)+3)
        vshr.s8         q11, q11, #3            @ c1 >>= 3
        vshr.s8         q12, q12, #3            @ c2 >>= 3
        vbic            q10, q10, q9            @ w &= ~hev
        vqsub.s8        q4,  q4,  q11           @ QS0 = clamp(QS0-c1)
        vqadd.s8        q3,  q3,  q12           @ PS0 = clamp(PS0+c2)

        @ filter_mbedge:
        @   a = clamp((27*w + 63) >> 7);
        @   Q0 = s2u(QS0 - a);
        @   P0 = s2u(PS0 + a);
        @   a = clamp((18*w + 63) >> 7);
        @   Q1 = s2u(QS1 - a);
        @   P1 = s2u(PS1 + a);
        @   a = clamp((9*w + 63) >> 7);
        @   Q2 = s2u(QS2 - a);
        @   P2 = s2u(PS2 + a);
        vmov.i16        q9,  #63
        vshll.s8        q14, d20, #3
        vshll.s8        q15, d21, #3
        vaddw.s8        q14, q14, d20
        vaddw.s8        q15, q15, d21
        vadd.s16        q8,  q9,  q14
        vadd.s16        q9,  q9,  q15           @  9*w + 63
        vadd.s16        q11, q8,  q14
        vadd.s16        q12, q9,  q15           @ 18*w + 63
        vadd.s16        q14, q11, q14
        vadd.s16        q15, q12, q15           @ 27*w + 63
        vqshrn.s16      d16, q8,  #7
        vqshrn.s16      d17, q9,  #7            @ clamp(( 9*w + 63)>>7)
        vqshrn.s16      d22, q11, #7
        vqshrn.s16      d23, q12, #7            @ clamp((18*w + 63)>>7)
        vqshrn.s16      d28, q14, #7
        vqshrn.s16      d29, q15, #7            @ clamp((27*w + 63)>>7)
        vqadd.s8        q1,  q1,  q8            @ PS2 = clamp(PS2+a)
        vqsub.s8        q6,  q6,  q8            @ QS2 = clamp(QS2-a)
        vqadd.s8        q2,  q2,  q11           @ PS1 = clamp(PS1+a)
        vqsub.s8        q5,  q5,  q11           @ QS1 = clamp(QS1-a)
        vqadd.s8        q3,  q3,  q14           @ PS0 = clamp(PS0+a)
        vqsub.s8        q4,  q4,  q14           @ QS0 = clamp(QS0-a)
        veor            q3,  q3,  q13           @ P0 = PS0 ^ 0x80
        veor            q4,  q4,  q13           @ Q0 = QS0 ^ 0x80
        veor            q2,  q2,  q13           @ P1 = PS1 ^ 0x80
        veor            q5,  q5,  q13           @ Q1 = QS1 ^ 0x80
        veor            q1,  q1,  q13           @ P2 = PS2 ^ 0x80
        veor            q6,  q6,  q13           @ Q2 = QS2 ^ 0x80
    .endif
.endm

.macro vp8_v_loop_filter16 name, inner=0, simple=0
function ff_vp8_v_loop_filter16\name\()_neon, export=1
        vpush           {q4-q7}
        sub             r0,  r0,  r1,  lsl #1+!\simple

        @ Load pixels:
    .if !\simple
        ldr             r12, [sp, #64]          @ hev_thresh
        vld1.8          {q0},     [r0,:128], r1 @ P3
        vld1.8          {q1},     [r0,:128], r1 @ P2
    .endif
        vld1.8          {q2},     [r0,:128], r1 @ P1
        vld1.8          {q3},     [r0,:128], r1 @ P0
        vld1.8          {q4},     [r0,:128], r1 @ Q0
        vld1.8          {q5},     [r0,:128], r1 @ Q1
    .if !\simple
        vld1.8          {q6},     [r0,:128], r1 @ Q2
        vld1.8          {q7},     [r0,:128]     @ Q3
        vdup.8          q15, r3                 @ flim_I
    .endif
        vdup.8          q14, r2                 @ flim_E

        vp8_loop_filter inner=\inner, simple=\simple

        @ back up to P2: dst -= stride * 6
        sub             r0,  r0,  r1,  lsl #2
    .if !\simple
        sub             r0,  r0,  r1,  lsl #1

        @ Store pixels:
        vst1.8          {q1},     [r0,:128], r1 @ P2
    .endif
        vst1.8          {q2},     [r0,:128], r1 @ P1
        vst1.8          {q3},     [r0,:128], r1 @ P0
        vst1.8          {q4},     [r0,:128], r1 @ Q0
        vst1.8          {q5},     [r0,:128], r1 @ Q1
    .if !\simple
        vst1.8          {q6},     [r0,:128]     @ Q2
    .endif

        vpop            {q4-q7}
        bx              lr
endfunc
.endm

vp8_v_loop_filter16
vp8_v_loop_filter16 _inner,  inner=1
vp8_v_loop_filter16 _simple, simple=1

.macro vp8_v_loop_filter8uv name, inner=0
function ff_vp8_v_loop_filter8uv\name\()_neon, export=1
        vpush           {q4-q7}
        sub             r0,  r0,  r2,  lsl #2
        sub             r1,  r1,  r2,  lsl #2
        ldr             r12, [sp, #64]          @ flim_I

        @ Load pixels:
        vld1.8          {d0},     [r0,:64], r2  @ P3
        vld1.8          {d1},     [r1,:64], r2  @ P3
        vld1.8          {d2},     [r0,:64], r2  @ P2
        vld1.8          {d3},     [r1,:64], r2  @ P2
        vld1.8          {d4},     [r0,:64], r2  @ P1
        vld1.8          {d5},     [r1,:64], r2  @ P1
        vld1.8          {d6},     [r0,:64], r2  @ P0
        vld1.8          {d7},     [r1,:64], r2  @ P0
        vld1.8          {d8},     [r0,:64], r2  @ Q0
        vld1.8          {d9},     [r1,:64], r2  @ Q0
        vld1.8          {d10},    [r0,:64], r2  @ Q1
        vld1.8          {d11},    [r1,:64], r2  @ Q1
        vld1.8          {d12},    [r0,:64], r2  @ Q2
        vld1.8          {d13},    [r1,:64], r2  @ Q2
        vld1.8          {d14},    [r0,:64]      @ Q3
        vld1.8          {d15},    [r1,:64]      @ Q3

        vdup.8          q14, r3                 @ flim_E
        vdup.8          q15, r12                @ flim_I
        ldr             r12, [sp, #68]          @ hev_thresh

        vp8_loop_filter inner=\inner

        @ back up to P2: u,v -= stride * 6
        sub             r0,  r0,  r2,  lsl #2
        sub             r1,  r1,  r2,  lsl #2
        sub             r0,  r0,  r2,  lsl #1
        sub             r1,  r1,  r2,  lsl #1

        @ Store pixels:
        vst1.8          {d2},     [r0,:64], r2  @ P2
        vst1.8          {d3},     [r1,:64], r2  @ P2
        vst1.8          {d4},     [r0,:64], r2  @ P1
        vst1.8          {d5},     [r1,:64], r2  @ P1
        vst1.8          {d6},     [r0,:64], r2  @ P0
        vst1.8          {d7},     [r1,:64], r2  @ P0
        vst1.8          {d8},     [r0,:64], r2  @ Q0
        vst1.8          {d9},     [r1,:64], r2  @ Q0
        vst1.8          {d10},    [r0,:64], r2  @ Q1
        vst1.8          {d11},    [r1,:64], r2  @ Q1
        vst1.8          {d12},    [r0,:64]      @ Q2
        vst1.8          {d13},    [r1,:64]      @ Q2

        vpop            {q4-q7}
        bx              lr
endfunc
.endm

vp8_v_loop_filter8uv
vp8_v_loop_filter8uv _inner, inner=1

.macro vp8_h_loop_filter16 name, inner=0, simple=0
function ff_vp8_h_loop_filter16\name\()_neon, export=1
        vpush           {q4-q7}
        sub             r0,  r0,  #4
    .if !\simple
        ldr             r12, [sp, #64]          @ hev_thresh
    .endif

        @ Load pixels:
        vld1.8          {d0},     [r0], r1      @ load first 8-line src data
        vld1.8          {d2},     [r0], r1
        vld1.8          {d4},     [r0], r1
        vld1.8          {d6},     [r0], r1
        vld1.8          {d8},     [r0], r1
        vld1.8          {d10},    [r0], r1
        vld1.8          {d12},    [r0], r1
        vld1.8          {d14},    [r0], r1
        vld1.8          {d1},     [r0], r1      @ load second 8-line src data
        vld1.8          {d3},     [r0], r1
        vld1.8          {d5},     [r0], r1
        vld1.8          {d7},     [r0], r1
        vld1.8          {d9},     [r0], r1
        vld1.8          {d11},    [r0], r1
        vld1.8          {d13},    [r0], r1
        vld1.8          {d15},    [r0], r1

        transpose_8x8   q0,  q1,  q2,  q3,  q4,  q5,  q6,  q7

        vdup.8          q14, r2                 @ flim_E
    .if !\simple
        vdup.8          q15, r3                 @ flim_I
    .endif

        vp8_loop_filter inner=\inner, simple=\simple

        sub             r0,  r0,  r1,  lsl #4   @ backup 16 rows

        transpose_8x8   q0,  q1,  q2,  q3,  q4,  q5,  q6,  q7

        @ Store pixels:
        vst1.8          {d0},     [r0], r1
        vst1.8          {d2},     [r0], r1
        vst1.8          {d4},     [r0], r1
        vst1.8          {d6},     [r0], r1
        vst1.8          {d8},     [r0], r1
        vst1.8          {d10},    [r0], r1
        vst1.8          {d12},    [r0], r1
        vst1.8          {d14},    [r0], r1
        vst1.8          {d1},     [r0], r1
        vst1.8          {d3},     [r0], r1
        vst1.8          {d5},     [r0], r1
        vst1.8          {d7},     [r0], r1
        vst1.8          {d9},     [r0], r1
        vst1.8          {d11},    [r0], r1
        vst1.8          {d13},    [r0], r1
        vst1.8          {d15},    [r0]

        vpop            {q4-q7}
        bx              lr
endfunc
.endm

vp8_h_loop_filter16
vp8_h_loop_filter16 _inner,  inner=1
vp8_h_loop_filter16 _simple, simple=1

.macro vp8_h_loop_filter8uv name, inner=0
function ff_vp8_h_loop_filter8uv\name\()_neon, export=1
        vpush           {q4-q7}
        sub             r0,  r0,  #4
        sub             r1,  r1,  #4
        ldr             r12, [sp, #64]          @ flim_I

        @ Load pixels:
        vld1.8          {d0},     [r0], r2      @ load u
        vld1.8          {d1},     [r1], r2      @ load v
        vld1.8          {d2},     [r0], r2
        vld1.8          {d3},     [r1], r2
        vld1.8          {d4},     [r0], r2
        vld1.8          {d5},     [r1], r2
        vld1.8          {d6},     [r0], r2
        vld1.8          {d7},     [r1], r2
        vld1.8          {d8},     [r0], r2
        vld1.8          {d9},     [r1], r2
        vld1.8          {d10},    [r0], r2
        vld1.8          {d11},    [r1], r2
        vld1.8          {d12},    [r0], r2
        vld1.8          {d13},    [r1], r2
        vld1.8          {d14},    [r0], r2
        vld1.8          {d15},    [r1], r2

        transpose_8x8   q0,  q1,  q2,  q3,  q4,  q5,  q6,  q7

        vdup.8          q14, r3                 @ flim_E
        vdup.8          q15, r12                @ flim_I
        ldr             r12, [sp, #68]          @ hev_thresh

        vp8_loop_filter inner=\inner

        sub             r0,  r0,  r2,  lsl #3   @ backup u 8 rows
        sub             r1,  r1,  r2,  lsl #3   @ backup v 8 rows

        transpose_8x8   q0,  q1,  q2,  q3,  q4,  q5,  q6,  q7

        @ Store pixels:
        vst1.8          {d0},     [r0], r2
        vst1.8          {d1},     [r1], r2
        vst1.8          {d2},     [r0], r2
        vst1.8          {d3},     [r1], r2
        vst1.8          {d4},     [r0], r2
        vst1.8          {d5},     [r1], r2
        vst1.8          {d6},     [r0], r2
        vst1.8          {d7},     [r1], r2
        vst1.8          {d8},     [r0], r2
        vst1.8          {d9},     [r1], r2
        vst1.8          {d10},    [r0], r2
        vst1.8          {d11},    [r1], r2
        vst1.8          {d12},    [r0], r2
        vst1.8          {d13},    [r1], r2
        vst1.8          {d14},    [r0]
        vst1.8          {d15},    [r1]

        vpop            {q4-q7}
        bx              lr
endfunc
.endm

vp8_h_loop_filter8uv
vp8_h_loop_filter8uv _inner, inner=1

function ff_put_vp8_pixels16_neon, export=1
        ldr             r12, [sp, #0]           @ h
1:
        subs            r12, r12, #4
        vld1.8          {q0},     [r2], r3
        vld1.8          {q1},     [r2], r3
        vld1.8          {q2},     [r2], r3
        vld1.8          {q3},     [r2], r3
        vst1.8          {q0},     [r0,:128], r1
        vst1.8          {q1},     [r0,:128], r1
        vst1.8          {q2},     [r0,:128], r1
        vst1.8          {q3},     [r0,:128], r1
        bgt             1b
        bx              lr
endfunc

function ff_put_vp8_pixels8_neon, export=1
        ldr             r12, [sp, #0]           @ h
1:
        subs            r12, r12, #4
        vld1.8          {d0},     [r2], r3
        vld1.8          {d1},     [r2], r3
        vld1.8          {d2},     [r2], r3
        vld1.8          {d3},     [r2], r3
        vst1.8          {d0},     [r0,:64], r1
        vst1.8          {d1},     [r0,:64], r1
        vst1.8          {d2},     [r0,:64], r1
        vst1.8          {d3},     [r0,:64], r1
        bgt             1b
        bx              lr
endfunc

/* 4/6-tap 8th-pel MC */

.macro vp8_epel8_h6 d, a, b
        vext.8          d27, \a,  \b,  #1
        vmovl.u8        q8,  \a
        vext.8          d28, \a,  \b,  #2
        vmovl.u8        q9,  d27
        vext.8          d29, \a,  \b,  #3
        vmovl.u8        q10, d28
        vext.8          d30, \a,  \b,  #4
        vmovl.u8        q11, d29
        vext.8          d31, \a,  \b,  #5
        vmovl.u8        q12, d30
        vmul.u16        q10, q10, d0[2]
        vmovl.u8        q13, d31
        vmul.u16        q11, q11, d0[3]
        vmls.u16        q10, q9,  d0[1]
        vmls.u16        q11, q12, d1[0]
        vmla.u16        q10, q8,  d0[0]
        vmla.u16        q11, q13, d1[1]
        vqadd.s16       q11, q10, q11
        vqrshrun.s16    \d,  q11, #7
.endm

.macro vp8_epel16_h6 d0, d1, s0, s1, s2, q0, q1
        vext.8          q14, \q0, \q1, #3
        vext.8          q15, \q0, \q1, #4
        vmovl.u8        q11, d28
        vmovl.u8        q14, d29
        vext.8          q3,  \q0, \q1, #2
        vmovl.u8        q12, d30
        vmovl.u8        q15, d31
        vext.8          q8,  \q0, \q1, #1
        vmovl.u8        q10, d6
        vmovl.u8        q3,  d7
        vext.8          q2,  \q0, \q1, #5
        vmovl.u8        q13, d4
        vmovl.u8        q2,  d5
        vmovl.u8        q9,  d16
        vmovl.u8        q8,  d17
        vmul.u16        q11, q11, d0[3]
        vmul.u16        q10, q10, d0[2]
        vmul.u16        q3,  q3,  d0[2]
        vmul.u16        q14, q14, d0[3]
        vmls.u16        q11, q12, d1[0]
        vmovl.u8        q12, \s0
        vmovl.u8        q1,  \s1
        vmls.u16        q10, q9,  d0[1]
        vmls.u16        q3,  q8,  d0[1]
        vmls.u16        q14, q15, d1[0]
        vmla.u16        q10, q12, d0[0]
        vmla.u16        q11, q13, d1[1]
        vmla.u16        q3,  q1,  d0[0]
        vmla.u16        q14, q2,  d1[1]
        vqadd.s16       q11, q10, q11
        vqadd.s16       q14, q3,  q14
        vqrshrun.s16    \d0, q11, #7
        vqrshrun.s16    \d1, q14, #7
.endm

.macro vp8_epel8_v6_y2 d0, d1, s0, s1, s2, s3, s4, s5, s6
        vmovl.u8        q10, \s0
        vmovl.u8        q11, \s3
        vmovl.u8        q14, \s6
        vmovl.u8        q9,  \s1
        vmovl.u8        q12, \s4
        vmovl.u8        q8,  \s2
        vmovl.u8        q13, \s5
        vmul.u16        q10, q10, d0[0]
        vmul.u16        q15, q11, d0[3]
        vmul.u16        q11, q11, d0[2]
        vmul.u16        q14, q14, d1[1]
        vmls.u16        q10, q9,  d0[1]
        vmls.u16        q15, q12, d1[0]
        vmls.u16        q11, q8,  d0[1]
        vmls.u16        q14, q13, d1[0]
        vmla.u16        q10, q8,  d0[2]
        vmla.u16        q15, q13, d1[1]
        vmla.u16        q11, q9,  d0[0]
        vmla.u16        q14, q12, d0[3]
        vqadd.s16       q15, q10, q15
        vqadd.s16       q14, q11, q14
        vqrshrun.s16    \d0, q15, #7
        vqrshrun.s16    \d1, q14, #7
.endm

.macro vp8_epel8_h4 d, a, b
        vext.8          d28, \a,  \b,  #1
        vmovl.u8        q9,  \a
        vext.8          d29, \a,  \b,  #2
        vmovl.u8        q10, d28
        vext.8          d30, \a,  \b,  #3
        vmovl.u8        q11, d29
        vmovl.u8        q12, d30
        vmul.u16        q10, q10, d0[2]
        vmul.u16        q11, q11, d0[3]
        vmls.u16        q10, q9,  d0[1]
        vmls.u16        q11, q12, d1[0]
        vqadd.s16       q11, q10, q11
        vqrshrun.s16    \d,  q11, #7
.endm

.macro vp8_epel8_v4_y2 d0, d1, s0, s1, s2, s3, s4
        vmovl.u8        q9,  \s0
        vmovl.u8        q10, \s1
        vmovl.u8        q11, \s2
        vmovl.u8        q12, \s3
        vmovl.u8        q13, \s4
        vmul.u16        q8,  q10, d0[2]
        vmul.u16        q14, q11, d0[3]
        vmul.u16        q11, q11, d0[2]
        vmul.u16        q15, q12, d0[3]
        vmls.u16        q8,  q9,  d0[1]
        vmls.u16        q14, q12, d1[0]
        vmls.u16        q11, q10, d0[1]
        vmls.u16        q15, q13, d1[0]
        vqadd.s16       q8,  q8,  q14
        vqadd.s16       q11, q11, q15
        vqrshrun.s16    \d0, q8,  #7
        vqrshrun.s16    \d1, q11, #7
.endm

function ff_put_vp8_epel16_v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1
        push            {r4,lr}
        vpush           {d8-d15}

        ldr             r4,  [sp, #80]          @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #72]          @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0},     [r4,:128]
1:
        vld1.8          {d2-d3},  [r2], r3
        vld1.8          {d4-d5},  [r2], r3
        vld1.8          {d6-d7},  [r2], r3
        vld1.8          {d8-d9},  [r2], r3
        vld1.8          {d10-d11},[r2], r3
        vld1.8          {d12-d13},[r2], r3
        vld1.8          {d14-d15},[r2]
        sub             r2,  r2,  r3,  lsl #2

        vp8_epel8_v6_y2 d2, d4, d2, d4, d6, d8, d10, d12, d14
        vp8_epel8_v6_y2 d3, d5, d3, d5, d7, d9, d11, d13, d15

        vst1.8          {d2-d3},  [r0,:128], r1
        vst1.8          {d4-d5},  [r0,:128], r1
        subs            r12, r12, #2
        bne             1b

        vpop            {d8-d15}
        pop             {r4,pc}
endfunc

function ff_put_vp8_epel16_h6_neon, export=1
        sub             r2,  r2,  #2
        push            {r4,lr}

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0},     [r4,:128]
1:
        vld1.8          {d2-d4},  [r2], r3

        vp8_epel16_h6   d2, d3, d2, d3, d4, q1, q2

        vst1.8          {d2-d3},  [r0,:128], r1
        subs            r12, r12, #1
        bne             1b

        pop             {r4,pc}
endfunc

function ff_put_vp8_epel16_h6v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1
        sub             r2,  r2,  #2
        push            {r4,lr}
        vpush           {d8-d15}

        @ first pass (horizontal):
        ldr             r4,  [sp, #64+8+4]      @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #64+8+0]      @ h
        add             r4,  lr,  r4,  lsl #4
        sub             sp,  sp,  #336+16
        vld1.16         {q0},     [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #5
        bic             lr,  lr,  #15
1:
        vld1.8          {d2,d3,d4}, [r2], r3

        vp8_epel16_h6   d2, d3, d2, d3, d4, q1, q2

        vst1.8          {d2-d3},  [lr,:128]!
        subs            r12, r12, #1
        bne             1b

        @ second pass (vertical):
        ldr             r4,  [sp, #336+16+64+8+8]  @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #336+16+64+8+0]  @ h
        add             r4,  lr,  r4,  lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0},     [r4,:128]
        bic             lr,  lr,  #15
2:
        vld1.8          {d2-d5},  [lr,:128]!
        vld1.8          {d6-d9},  [lr,:128]!
        vld1.8          {d10-d13},[lr,:128]!
        vld1.8          {d14-d15},[lr,:128]
        sub             lr,  lr,  #64

        vp8_epel8_v6_y2 d2, d4, d2, d4, d6, d8, d10, d12, d14
        vp8_epel8_v6_y2 d3, d5, d3, d5, d7, d9, d11, d13, d15

        vst1.8          {d2-d3},  [r0,:128], r1
        vst1.8          {d4-d5},  [r0,:128], r1
        subs            r12, r12, #2
        bne             2b

        add             sp,  sp,  #336+16
        vpop            {d8-d15}
        pop             {r4,pc}
endfunc

function ff_put_vp8_epel8_v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1
        push            {r4,lr}

        ldr             r4,  [sp, #16]          @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0},     [r4,:128]
1:
        vld1.8          {d2},     [r2], r3
        vld1.8          {d3},     [r2], r3
        vld1.8          {d4},     [r2], r3
        vld1.8          {d5},     [r2], r3
        vld1.8          {d6},     [r2], r3
        vld1.8          {d7},     [r2], r3
        vld1.8          {d28},    [r2]

        sub             r2,  r2,  r3,  lsl #2

        vp8_epel8_v6_y2 d2, d3, d2, d3, d4, d5, d6, d7, d28

        vst1.8          {d2},     [r0,:64], r1
        vst1.8          {d3},     [r0,:64], r1
        subs            r12, r12, #2
        bne             1b

        pop             {r4,pc}
endfunc

function ff_put_vp8_epel8_h6_neon, export=1
        sub             r2,  r2,  #2
        push            {r4,lr}

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0},     [r4,:128]
1:
        vld1.8          {d2,d3},  [r2], r3

        vp8_epel8_h6    d2, d2, d3

        vst1.8          {d2},     [r0,:64], r1
        subs            r12, r12, #1
        bne             1b

        pop             {r4,pc}
endfunc

function ff_put_vp8_epel8_h6v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1
        sub             r2,  r2,  #2
        push            {r4,lr}

        @ first pass (horizontal):
        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        sub             sp,  sp,  #168+16
        vld1.16         {q0},     [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #5
        bic             lr,  lr,  #15
1:
        vld1.8          {d2,d3},  [r2], r3

        vp8_epel8_h6    d2, d2, d3

        vst1.8          {d2},     [lr,:64]!
        subs            r12, r12, #1
        bne             1b

        @ second pass (vertical):
        ldr             r4,  [sp, #168+16+16]   @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #168+16+8]    @ h
        add             r4,  lr,  r4,  lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0},     [r4,:128]
        bic             lr,  lr,  #15
2:
        vld1.8          {d2-d5},  [lr,:128]!
        vld1.8          {d6-d7},  [lr,:128]!
        vld1.8          {d30},    [lr,:64]
        sub             lr,  lr,  #32

        vp8_epel8_v6_y2 d2, d3, d2, d3, d4, d5, d6, d7, d30

        vst1.8          {d2},     [r0,:64], r1
        vst1.8          {d3},     [r0,:64], r1
        subs            r12, r12, #2
        bne             2b

        add             sp,  sp,  #168+16
        pop             {r4,pc}
endfunc

function ff_put_vp8_epel8_v4_neon, export=1
        sub             r2,  r2,  r3
        push            {r4,lr}

        ldr             r4,  [sp, #16]          @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0},     [r4,:128]
1:
        vld1.8          {d2},     [r2], r3
        vld1.8          {d3},     [r2], r3
        vld1.8          {d4},     [r2], r3
        vld1.8          {d5},     [r2], r3
        vld1.8          {d6},     [r2]
        sub             r2,  r2,  r3,  lsl #1

        vp8_epel8_v4_y2 d2, d3, d2, d3, d4, d5, d6

        vst1.8          {d2},     [r0,:64], r1
        vst1.8          {d3},     [r0,:64], r1
        subs            r12, r12, #2
        bne             1b

        pop             {r4,pc}
endfunc

function ff_put_vp8_epel8_h4_neon, export=1
        sub             r2,  r2,  #1
        push            {r4,lr}

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0},     [r4,:128]
1:
        vld1.8          {d2,d3},  [r2], r3

        vp8_epel8_h4    d2, d2, d3

        vst1.8          {d2},     [r0,:64], r1
        subs            r12, r12, #1
        bne             1b

        pop             {r4,pc}
endfunc

function ff_put_vp8_epel8_h4v4_neon, export=1
        sub             r2,  r2,  r3
        sub             r2,  r2,  #1
        push            {r4,lr}

        @ first pass (horizontal):
        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        sub             sp,  sp,  #168+16
        vld1.16         {q0},     [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #3
        bic             lr,  lr,  #15
1:
        vld1.8          {d2,d3},  [r2], r3

        vp8_epel8_h4    d2, d2, d3

        vst1.8          {d2},     [lr,:64]!
        subs            r12, r12, #1
        bne             1b

        @ second pass (vertical):
        ldr             r4,  [sp, #168+16+16]   @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #168+16+8]    @ h
        add             r4,  lr,  r4,  lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0},     [r4,:128]
        bic             lr,  lr,  #15
2:
        vld1.8          {d2-d5},  [lr,:128]!
        vld1.8          {d6},     [lr,:64]
        sub             lr,  lr,  #16

        vp8_epel8_v4_y2 d2, d3, d2, d3, d4, d5, d6

        vst1.8          {d2},     [r0,:64], r1
        vst1.8          {d3},     [r0,:64], r1
        subs            r12, r12, #2
        bne             2b

        add             sp,  sp,  #168+16
        pop             {r4,pc}
endfunc

function ff_put_vp8_epel8_h6v4_neon, export=1
        sub             r2,  r2,  r3
        sub             r2,  r2,  #2
        push            {r4,lr}

        @ first pass (horizontal):
        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        sub             sp,  sp,  #168+16
        vld1.16         {q0},     [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #3
        bic             lr,  lr,  #15
1:
        vld1.8          {d2,d3},  [r2], r3

        vp8_epel8_h6    d2, d2, d3

        vst1.8          {d2},     [lr,:64]!
        subs            r12, r12, #1
        bne             1b

        @ second pass (vertical):
        ldr             r4,  [sp, #168+16+16]   @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #168+16+8]    @ h
        add             r4,  lr,  r4,  lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0},     [r4,:128]
        bic             lr,  lr,  #15
2:
        vld1.8          {d2-d5},  [lr,:128]!
        vld1.8          {d6},     [lr,:64]
        sub             lr,  lr,  #16

        vp8_epel8_v4_y2 d2, d3, d2, d3, d4, d5, d6

        vst1.8          {d2},     [r0,:64], r1
        vst1.8          {d3},     [r0,:64], r1
        subs            r12, r12, #2
        bne             2b

        add             sp,  sp,  #168+16
        pop             {r4,pc}
endfunc

function ff_put_vp8_epel8_h4v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1
        sub             r2,  r2,  #1
        push            {r4,lr}

        @ first pass (horizontal):
        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        sub             sp,  sp,  #168+16
        vld1.16         {q0},     [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #5
        bic             lr,  lr,  #15
1:
        vld1.8          {d2,d3},  [r2], r3

        vp8_epel8_h4    d2, d2, d3

        vst1.8          {d2},     [lr,:64]!
        subs            r12, r12, #1
        bne             1b

        @ second pass (vertical):
        ldr             r4,  [sp, #168+16+16]   @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #168+16+8]    @ h
        add             r4,  lr,  r4,  lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0},     [r4,:128]
        bic             lr,  lr,  #15
2:
        vld1.8          {d2-d5},  [lr,:128]!
        vld1.8          {d6-d7},  [lr,:128]!
        vld1.8          {d30},    [lr,:64]
        sub             lr,  lr,  #32

        vp8_epel8_v6_y2 d2, d3, d2, d3, d4, d5, d6, d7, d30

        vst1.8          {d2},     [r0,:64], r1
        vst1.8          {d3},     [r0,:64], r1
        subs            r12, r12, #2
        bne             2b

        add             sp,  sp,  #168+16
        pop             {r4,pc}
endfunc

.ltorg

function ff_put_vp8_epel4_v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1
        push            {r4,lr}

        ldr             r4,  [sp, #16]          @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0},     [r4,:128]
1:
        vld1.32         {d2[]},   [r2], r3
        vld1.32         {d3[]},   [r2], r3
        vld1.32         {d4[]},   [r2], r3
        vld1.32         {d5[]},   [r2], r3
        vld1.32         {d6[]},   [r2], r3
        vld1.32         {d7[]},   [r2], r3
        vld1.32         {d28[]},  [r2]
        sub             r2,  r2,  r3,  lsl #2
        vld1.32         {d2[1]},  [r2], r3
        vld1.32         {d3[1]},  [r2], r3
        vld1.32         {d4[1]},  [r2], r3
        vld1.32         {d5[1]},  [r2], r3
        vld1.32         {d6[1]},  [r2], r3
        vld1.32         {d7[1]},  [r2], r3
        vld1.32         {d28[1]}, [r2]
        sub             r2,  r2,  r3,  lsl #2

        vp8_epel8_v6_y2 d2, d3, d2, d3, d4, d5, d6, d7, d28

        vst1.32         {d2[0]},  [r0,:32], r1
        vst1.32         {d3[0]},  [r0,:32], r1
        vst1.32         {d2[1]},  [r0,:32], r1
        vst1.32         {d3[1]},  [r0,:32], r1
        subs            r12, r12, #4
        bne             1b

        pop             {r4,pc}
endfunc

function ff_put_vp8_epel4_h6_neon, export=1
        sub             r2,  r2,  #2
        push            {r4,lr}

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0},     [r4,:128]
1:
        vld1.8          {q1},     [r2], r3
        vp8_epel8_h6    d2, d2, d3
        vst1.32         {d2[0]},  [r0,:32], r1
        subs            r12, r12, #1
        bne             1b

        pop             {r4,pc}
endfunc

function ff_put_vp8_epel4_h6v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1
        sub             r2,  r2,  #2
        push            {r4,lr}

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        sub             sp,  sp,  #52+16
        vld1.16         {q0},     [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #5
        bic             lr,  lr,  #15
1:
        vld1.8          {q1},     [r2], r3
        vp8_epel8_h6    d2, d2, d3
        vst1.32         {d2[0]},  [lr,:32]!
        subs            r12, r12, #1
        bne             1b

        ldr             r4,  [sp, #52+16+16]    @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #52+16+8]     @ h
        add             r4,  lr,  r4,  lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0},     [r4,:128]
        bic             lr,  lr,  #15
2:
        vld1.8          {d2-d3},  [lr,:128]!
        vld1.8          {d6},     [lr,:64]!
        vld1.32         {d28[]},  [lr,:32]
        sub             lr,  lr,  #16
        vld1.8          {d4-d5},  [lr]!
        vld1.8          {d7},     [lr,:64]!
        vld1.32         {d28[1]}, [lr,:32]
        sub             lr,  lr,  #16
        vtrn.32         q1,  q2
        vtrn.32         d6,  d7
        vp8_epel8_v6_y2 d2, d3, d2, d4, d3, d5, d6, d7, d28
        vst1.32         {d2[0]},  [r0,:32], r1
        vst1.32         {d3[0]},  [r0,:32], r1
        vst1.32         {d2[1]},  [r0,:32], r1
        vst1.32         {d3[1]},  [r0,:32], r1
        subs            r12, r12, #4
        bne             2b

        add             sp,  sp,  #52+16
        pop             {r4,pc}
endfunc

function ff_put_vp8_epel4_h4v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1
        sub             r2,  r2,  #1
        push            {r4,lr}

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        sub             sp,  sp,  #52+16
        vld1.16         {q0},     [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #5
        bic             lr,  lr,  #15
1:
        vld1.8          {d2},     [r2], r3
        vp8_epel8_h4    d2, d2, d2
        vst1.32         {d2[0]},  [lr,:32]!
        subs            r12, r12, #1
        bne             1b

        ldr             r4,  [sp, #52+16+16]    @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #52+16+8]     @ h
        add             r4,  lr,  r4,  lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0},     [r4,:128]
        bic             lr,  lr,  #15
2:
        vld1.8          {d2-d3},  [lr,:128]!
        vld1.8          {d6},     [lr,:64]!
        vld1.32         {d28[]},  [lr,:32]
        sub             lr,  lr,  #16
        vld1.8          {d4-d5},  [lr]!
        vld1.8          {d7},     [lr,:64]!
        vld1.32         {d28[1]}, [lr,:32]
        sub             lr,  lr,  #16
        vtrn.32         q1,  q2
        vtrn.32         d6,  d7
        vp8_epel8_v6_y2 d2, d3, d2, d4, d3, d5, d6, d7, d28
        vst1.32         {d2[0]},  [r0,:32], r1
        vst1.32         {d3[0]},  [r0,:32], r1
        vst1.32         {d2[1]},  [r0,:32], r1
        vst1.32         {d3[1]},  [r0,:32], r1
        subs            r12, r12, #4
        bne             2b

        add             sp,  sp,  #52+16
        pop             {r4,pc}
endfunc

function ff_put_vp8_epel4_h6v4_neon, export=1
        sub             r2,  r2,  r3
        sub             r2,  r2,  #2
        push            {r4,lr}

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        sub             sp,  sp,  #44+16
        vld1.16         {q0},     [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #3
        bic             lr,  lr,  #15
1:
        vld1.8          {q1},     [r2], r3
        vp8_epel8_h6    d2, d2, d3
        vst1.32         {d2[0]},  [lr,:32]!
        subs            r12, r12, #1
        bne             1b

        ldr             r4,  [sp, #44+16+16]    @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #44+16+8]     @ h
        add             r4,  lr,  r4,  lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0},     [r4,:128]
        bic             lr,  lr,  #15
2:
        vld1.8          {d2-d3},  [lr,:128]!
        vld1.32         {d6[]},   [lr,:32]
        sub             lr,  lr,  #8
        vld1.8          {d4-d5},  [lr]!
        vld1.32         {d6[1]},  [lr,:32]
        sub             lr,  lr,  #8
        vtrn.32         q1,  q2
        vp8_epel8_v4_y2 d2, d3, d2, d4, d3, d5, d6
        vst1.32         {d2[0]},  [r0,:32], r1
        vst1.32         {d3[0]},  [r0,:32], r1
        vst1.32         {d2[1]},  [r0,:32], r1
        vst1.32         {d3[1]},  [r0,:32], r1
        subs            r12, r12, #4
        bne             2b

        add             sp,  sp,  #44+16
        pop             {r4,pc}
endfunc

function ff_put_vp8_epel4_h4_neon, export=1
        sub             r2,  r2,  #1
        push            {r4,lr}

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0},     [r4,:128]
1:
        vld1.8          {d2},     [r2], r3
        vp8_epel8_h4    d2, d2, d2
        vst1.32         {d2[0]},  [r0,:32], r1
        subs            r12, r12, #1
        bne             1b

        pop             {r4,pc}
endfunc

function ff_put_vp8_epel4_v4_neon, export=1
        sub             r2,  r2,  r3
        push            {r4,lr}

        ldr             r4,  [sp, #16]          @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0},     [r4,:128]
1:
        vld1.32         {d2[]},   [r2], r3
        vld1.32         {d3[]},   [r2], r3
        vld1.32         {d4[]},   [r2], r3
        vld1.32         {d5[]},   [r2], r3
        vld1.32         {d6[]},   [r2]
        sub             r2,  r2,  r3,  lsl #1
        vld1.32         {d2[1]},  [r2], r3
        vld1.32         {d3[1]},  [r2], r3
        vld1.32         {d4[1]},  [r2], r3
        vld1.32         {d5[1]},  [r2], r3
        vld1.32         {d6[1]},  [r2]
        sub             r2,  r2,  r3,  lsl #1

        vp8_epel8_v4_y2 d2, d3, d2, d3, d4, d5, d6

        vst1.32         {d2[0]},  [r0,:32], r1
        vst1.32         {d3[0]},  [r0,:32], r1
        vst1.32         {d2[1]},  [r0,:32], r1
        vst1.32         {d3[1]},  [r0,:32], r1
        subs            r12, r12, #4
        bne             1b

        pop             {r4,pc}
endfunc

function ff_put_vp8_epel4_h4v4_neon, export=1
        sub             r2,  r2,  r3
        sub             r2,  r2,  #1
        push            {r4,lr}

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        sub             sp,  sp,  #44+16
        vld1.16         {q0},     [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #3
        bic             lr,  lr,  #15
1:
        vld1.8          {d2},     [r2], r3
        vp8_epel8_h4    d2, d2, d3
        vst1.32         {d2[0]},  [lr,:32]!
        subs            r12, r12, #1
        bne             1b

        ldr             r4,  [sp, #44+16+16]    @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #44+16+8]     @ h
        add             r4,  lr,  r4,  lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0},     [r4,:128]
        bic             lr,  lr,  #15
2:
        vld1.8          {d2-d3},  [lr,:128]!
        vld1.32         {d6[]},   [lr,:32]
        sub             lr,  lr,  #8
        vld1.8          {d4-d5},  [lr]!
        vld1.32         {d6[1]},  [lr,:32]
        sub             lr,  lr,  #8
        vtrn.32         q1,  q2
        vp8_epel8_v4_y2 d2, d3, d2, d4, d3, d5, d6
        vst1.32         {d2[0]},  [r0,:32], r1
        vst1.32         {d3[0]},  [r0,:32], r1
        vst1.32         {d2[1]},  [r0,:32], r1
        vst1.32         {d3[1]},  [r0,:32], r1
        subs            r12, r12, #4
        bne             2b

        add             sp,  sp,  #44+16
        pop             {r4,pc}
endfunc

@ note: worst case sum of all 6-tap filter values * 255 is 0x7f80 so 16 bit
@ arithmetic can be used to apply filters
const subpel_filters, align=4
        .short     0,   6, 123,  12,   1,   0,   0,   0
        .short     2,  11, 108,  36,   8,   1,   0,   0
        .short     0,   9,  93,  50,   6,   0,   0,   0
        .short     3,  16,  77,  77,  16,   3,   0,   0
        .short     0,   6,  50,  93,   9,   0,   0,   0
        .short     1,   8,  36, 108,  11,   2,   0,   0
        .short     0,   1,  12, 123,   6,   0,   0,   0
endconst

/* Bilinear MC */

function ff_put_vp8_bilin16_h_neon, export=1
        ldr             r12, [sp, #4]           @ mx
        vdup.8          d0,  r12
        rsb             r12, r12, #8
        vdup.8          d1,  r12
        ldr             r12, [sp]               @ h
1:
        subs            r12, r12, #2
        vld1.8          {d2-d4},  [r2], r3
        vext.8          q2,  q1,  q2,  #1
        vmull.u8        q8,  d2,  d1
        vmlal.u8        q8,  d4,  d0
        vld1.8          {d18-d20},[r2], r3
        vmull.u8        q3,  d3,  d1
        vmlal.u8        q3,  d5,  d0
        vext.8          q10, q9,  q10, #1
        vmull.u8        q11, d18, d1
        vmlal.u8        q11, d20, d0
        vmull.u8        q12, d19, d1
        vmlal.u8        q12, d21, d0
        vrshrn.u16      d4,  q8,  #3
        vrshrn.u16      d5,  q3,  #3
        vrshrn.u16      d6,  q11, #3
        vrshrn.u16      d7,  q12, #3
        vst1.8          {q2},     [r0,:128], r1
        vst1.8          {q3},     [r0,:128], r1
        bgt             1b

        bx              lr
endfunc

function ff_put_vp8_bilin16_v_neon, export=1
        ldr             r12, [sp, #8]           @ my
        vdup.8          d0,  r12
        rsb             r12, r12, #8
        vdup.8          d1,  r12
        ldr             r12, [sp]               @ h
        vld1.8          {q1},     [r2], r3
1:
        subs            r12, r12, #2
        vld1.8          {q2},     [r2], r3
        vmull.u8        q3,  d2,  d1
        vmlal.u8        q3,  d4,  d0
        vmull.u8        q8,  d3,  d1
        vmlal.u8        q8,  d5,  d0
        vld1.8          {q1},     [r2], r3
        vmull.u8        q9,  d4,  d1
        vmlal.u8        q9,  d2,  d0
        vmull.u8        q10, d5,  d1
        vmlal.u8        q10, d3,  d0
        vrshrn.u16      d4,  q3,  #3
        vrshrn.u16      d5,  q8,  #3
        vrshrn.u16      d6,  q9,  #3
        vrshrn.u16      d7,  q10, #3
        vst1.8          {q2},     [r0,:128], r1
        vst1.8          {q3},     [r0,:128], r1
        bgt             1b

        bx              lr
endfunc

function ff_put_vp8_bilin16_hv_neon, export=1
        ldr             r12, [sp, #4]           @ mx
        vdup.8          d0,  r12
        rsb             r12, r12, #8
        vdup.8          d1,  r12
        ldr             r12, [sp, #8]           @ my
        vdup.8          d2,  r12
        rsb             r12, r12, #8
        vdup.8          d3,  r12
        ldr             r12, [sp]               @ h

        vld1.8          {d4-d6},  [r2], r3
        vext.8          q3,  q2,  q3,  #1
        vmull.u8        q8,  d4,  d1
        vmlal.u8        q8,  d6,  d0
        vmull.u8        q9,  d5,  d1
        vmlal.u8        q9,  d7,  d0
        vrshrn.u16      d4,  q8,  #3
        vrshrn.u16      d5,  q9,  #3
1:
        subs            r12, r12, #2
        vld1.8          {d18-d20},[r2], r3
        vext.8          q10, q9,  q10, #1
        vmull.u8        q11, d18, d1
        vmlal.u8        q11, d20, d0
        vld1.8          {d26-d28},[r2], r3
        vmull.u8        q12, d19, d1
        vmlal.u8        q12, d21, d0
        vext.8          q14, q13, q14, #1
        vmull.u8        q8,  d26, d1
        vmlal.u8        q8,  d28, d0
        vmull.u8        q9,  d27, d1
        vmlal.u8        q9,  d29, d0
        vrshrn.u16      d6,  q11, #3
        vrshrn.u16      d7,  q12, #3
        vmull.u8        q12, d4,  d3
        vmlal.u8        q12, d6,  d2
        vmull.u8        q15, d5,  d3
        vmlal.u8        q15, d7,  d2
        vrshrn.u16      d4,  q8,  #3
        vrshrn.u16      d5,  q9,  #3
        vmull.u8        q10, d6,  d3
        vmlal.u8        q10, d4,  d2
        vmull.u8        q11, d7,  d3
        vmlal.u8        q11, d5,  d2
        vrshrn.u16      d24, q12, #3
        vrshrn.u16      d25, q15, #3
        vst1.8          {q12},    [r0,:128], r1
        vrshrn.u16      d20, q10, #3
        vrshrn.u16      d21, q11, #3
        vst1.8          {q10},    [r0,:128], r1
        bgt             1b

        bx              lr
endfunc

function ff_put_vp8_bilin8_h_neon, export=1
        ldr             r12, [sp, #4]           @ mx
        vdup.8          d0,  r12
        rsb             r12, r12, #8
        vdup.8          d1,  r12
        ldr             r12, [sp]               @ h
1:
        subs            r12, r12, #2
        vld1.8          {q1},     [r2], r3
        vext.8          d3,  d2,  d3,  #1
        vmull.u8        q2,  d2,  d1
        vmlal.u8        q2,  d3,  d0
        vld1.8          {q3},     [r2], r3
        vext.8          d7,  d6,  d7,  #1
        vmull.u8        q8,  d6,  d1
        vmlal.u8        q8,  d7,  d0
        vrshrn.u16      d4,  q2,  #3
        vrshrn.u16      d16, q8,  #3
        vst1.8          {d4},     [r0,:64], r1
        vst1.8          {d16},    [r0,:64], r1
        bgt             1b

        bx              lr
endfunc

function ff_put_vp8_bilin8_v_neon, export=1
        ldr             r12, [sp, #8]           @ my
        vdup.8          d0,  r12
        rsb             r12, r12, #8
        vdup.8          d1,  r12
        ldr             r12, [sp]               @ h
        vld1.8          {d2},     [r2], r3
1:
        subs            r12, r12, #2
        vld1.8          {d3},     [r2], r3
        vmull.u8        q2,  d2,  d1
        vmlal.u8        q2,  d3,  d0
        vld1.8          {d2},     [r2], r3
        vmull.u8        q3,  d3,  d1
        vmlal.u8        q3,  d2,  d0
        vrshrn.u16      d4,  q2,  #3
        vrshrn.u16      d6,  q3,  #3
        vst1.8          {d4},     [r0,:64], r1
        vst1.8          {d6},     [r0,:64], r1
        bgt             1b

        bx              lr
endfunc

function ff_put_vp8_bilin8_hv_neon, export=1
        ldr             r12, [sp, #4]           @ mx
        vdup.8          d0,  r12
        rsb             r12, r12, #8
        vdup.8          d1,  r12
        ldr             r12, [sp, #8]           @ my
        vdup.8          d2,  r12
        rsb             r12, r12, #8
        vdup.8          d3,  r12
        ldr             r12, [sp]               @ h

        vld1.8          {q2},     [r2], r3
        vext.8          d5,  d4,  d5,  #1
        vmull.u8        q9,  d4,  d1
        vmlal.u8        q9,  d5,  d0
        vrshrn.u16      d22, q9,  #3
1:
        subs            r12, r12, #2
        vld1.8          {q3},     [r2], r3
        vext.8          d7,  d6,  d7,  #1
        vmull.u8        q8,  d6,  d1
        vmlal.u8        q8,  d7,  d0
        vld1.8          {q2},     [r2], r3
        vext.8          d5,  d4,  d5,  #1
        vmull.u8        q9,  d4,  d1
        vmlal.u8        q9,  d5,  d0
        vrshrn.u16      d16, q8,  #3
        vmull.u8        q10, d22, d3
        vmlal.u8        q10, d16, d2
        vrshrn.u16      d22, q9,  #3
        vmull.u8        q12, d16, d3
        vmlal.u8        q12, d22, d2
        vrshrn.u16      d20, q10, #3
        vst1.8          {d20},    [r0,:64], r1
        vrshrn.u16      d23, q12, #3
        vst1.8          {d23},    [r0,:64], r1
        bgt             1b

        bx              lr
endfunc

function ff_put_vp8_bilin4_h_neon, export=1
        ldr             r12, [sp, #4]           @ mx
        vdup.8          d0,  r12
        rsb             r12, r12, #8
        vdup.8          d1,  r12
        ldr             r12, [sp]               @ h
1:
        subs            r12, r12, #2
        vld1.8          {d2},     [r2], r3
        vext.8          d3,  d2,  d3,  #1
        vld1.8          {d6},     [r2], r3
        vext.8          d7,  d6,  d7,  #1
        vtrn.32         q1,  q3
        vmull.u8        q2,  d2,  d1
        vmlal.u8        q2,  d3,  d0
        vrshrn.u16      d4,  q2,  #3
        vst1.32         {d4[0]},  [r0,:32], r1
        vst1.32         {d4[1]},  [r0,:32], r1
        bgt             1b

        bx              lr
endfunc

function ff_put_vp8_bilin4_v_neon, export=1
        ldr             r12, [sp, #8]           @ my
        vdup.8          d0,  r12
        rsb             r12, r12, #8
        vdup.8          d1,  r12
        ldr             r12, [sp]               @ h
        vld1.32         {d2[]},   [r2], r3
1:
        vld1.32         {d3[]},   [r2]
        vld1.32         {d2[1]},  [r2], r3
        vld1.32         {d3[1]},  [r2], r3
        vmull.u8        q2,  d2,  d1
        vmlal.u8        q2,  d3,  d0
        vtrn.32         d3,  d2
        vrshrn.u16      d4,  q2,  #3
        vst1.32         {d4[0]},  [r0,:32], r1
        vst1.32         {d4[1]},  [r0,:32], r1
        subs            r12, r12, #2
        bgt             1b

        bx              lr
endfunc

function ff_put_vp8_bilin4_hv_neon, export=1
        ldr             r12, [sp, #4]           @ mx
        vdup.8          d0,  r12
        rsb             r12, r12, #8
        vdup.8          d1,  r12
        ldr             r12, [sp, #8]           @ my
        vdup.8          d2,  r12
        rsb             r12, r12, #8
        vdup.8          d3,  r12
        ldr             r12, [sp]               @ h

        vld1.8          {d4},     [r2], r3
        vext.8          d5,  d4,  d4,  #1
        vmull.u8        q9,  d4,  d1
        vmlal.u8        q9,  d5,  d0
        vrshrn.u16      d22, q9,  #3
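        @ d22 now holds the horizontally filtered row above the first output
        @ row; each pass of the loop below filters two new rows horizontally,
        @ blends each vertically against the row above it, and leaves the
        @ newest filtered row in d22 for the next iteration.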
1:
        subs            r12, r12, #2
        vld1.8          {d6},     [r2], r3
        vext.8          d7,  d6,  d6,  #1
        vld1.8          {d4},     [r2], r3
        vext.8          d5,  d4,  d4,  #1
        vtrn.32         q3,  q2
        vmull.u8        q8,  d6,  d1
        vmlal.u8        q8,  d7,  d0
        vrshrn.u16      d16, q8,  #3
        vmull.u8        q10, d16, d2
        vtrn.32         d22, d16
        vmlal.u8        q10, d22, d3
        vrev64.32       d22, d16
        vrshrn.u16      d20, q10, #3
        vst1.32         {d20[0]}, [r0,:32], r1
        vst1.32         {d20[1]}, [r0,:32], r1
        bgt             1b

        bx              lr
endfunc