/*
 * VC1 NEON optimisations
 *
 * Copyright (c) 2010 Rob Clark <rob@ti.com>
 * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"
#include "neon.S"

#include "config.h"

@ Transpose rows into columns of a matrix of 16-bit elements. For 4x4, pass
@ double-word registers; for 8x4, pass quad-word registers.
.macro transpose16 r0, r1, r2, r3
        @ At this point:
        @   row[0]  r0
        @   row[1]  r1
        @   row[2]  r2
        @   row[3]  r3

        vtrn.16         \r0, \r1        @ first and second row
        vtrn.16         \r2, \r3        @ third and fourth row
        vtrn.32         \r0, \r2        @ first and third row
        vtrn.32         \r1, \r3        @ second and fourth row

        @ At this point, if registers are quad-word:
        @   column[0]   d0
        @   column[1]   d2
        @   column[2]   d4
        @   column[3]   d6
        @   column[4]   d1
        @   column[5]   d3
        @   column[6]   d5
        @   column[7]   d7

        @ At this point, if registers are double-word:
        @   column[0]   d0
        @   column[1]   d1
        @   column[2]   d2
        @   column[3]   d3
.endm

@ ff_vc1_inv_trans_{4,8}x{4,8}_neon and overflow: The input values in the file
@ are supposed to be in a specific range so as to allow for 16-bit math without
@ causing overflows, but sometimes the input values are just big enough to
@ barely cause overflow in vadd instructions like:
@
@   vadd.i16  q0, q8, q10
@   vshr.s16  q0, q0, #\rshift
@
@ To prevent these borderline cases from overflowing, we just need one more
@ bit of precision, which is accomplished by replacing the sequence above with:
@
@   vhadd.s16 q0, q8, q10
@   vshr.s16  q0, q0, #(\rshift - 1)
@
@ This works because vhadd is a single instruction that adds, then shifts to
@ the right once, all before writing the result to the destination register.
@
@ Even with this workaround, there were still some files that caused overflows
@ in ff_vc1_inv_trans_8x8_neon. See the comments in ff_vc1_inv_trans_8x8_neon
@ for the additional workaround.
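@
@ As a purely illustrative example (the numbers are hypothetical, not taken
@ from any particular file): with q8 = t1 = 20000 and q10 = t3 = 15000, the
@ plain vadd.i16 wraps 35000 around to -30536 before the shift, whereas
@ vhadd.s16 yields (20000 + 15000) >> 1 = 17500 because the 17-bit
@ intermediate sum never touches a 16-bit register; the remaining
@ #(\rshift - 1) shift then produces the same value the non-overflowing
@ sequence would have produced.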

@ Takes 4 columns of 8 values each and operates on it. Modeled after the first
@ for loop in vc1_inv_trans_4x8_c.
@ Input columns:  q0 q1 q2 q3
@ Output columns: q0 q1 q2 q3
@ Trashes: r12 q8 q9 q10 q11 q12 q13
.macro vc1_inv_trans_4x8_helper add rshift
        @ Compute temp1, temp2 and set up the scalars #17, #22, #10
        vadd.i16        q12, q0, q2     @ temp1 = src[0] + src[2]
        movw            r12, #17
        vsub.i16        q13, q0, q2     @ temp2 = src[0] - src[2]
        movt            r12, #22
        vmov.32         d0[0], r12
        movw            r12, #10
        vmov.16         d1[0], r12

        vmov.i16        q8,  #\add      @ t1 will accumulate here
        vmov.i16        q9,  #\add      @ t2 will accumulate here

        vmul.i16        q10, q1, d0[1]  @ t3 = 22 * (src[1])
        vmul.i16        q11, q3, d0[1]  @ t4 = 22 * (src[3])

        vmla.i16        q8,  q12, d0[0] @ t1 = 17 * (temp1) + add
        vmla.i16        q9,  q13, d0[0] @ t2 = 17 * (temp2) + add

        vmla.i16        q10, q3, d1[0]  @ t3 += 10 * src[3]
        vmls.i16        q11, q1, d1[0]  @ t4 -= 10 * src[1]

        vhadd.s16       q0,  q8, q10    @ dst[0] = (t1 + t3) >> 1
        vhsub.s16       q3,  q8, q10    @ dst[3] = (t1 - t3) >> 1
        vhsub.s16       q1,  q9, q11    @ dst[1] = (t2 - t4) >> 1
        vhadd.s16       q2,  q9, q11    @ dst[2] = (t2 + t4) >> 1

        @ Halving add/sub above already did one shift
        vshr.s16        q0,  q0, #(\rshift - 1) @ dst[0] >>= (rshift - 1)
        vshr.s16        q3,  q3, #(\rshift - 1) @ dst[3] >>= (rshift - 1)
        vshr.s16        q1,  q1, #(\rshift - 1) @ dst[1] >>= (rshift - 1)
        vshr.s16        q2,  q2, #(\rshift - 1) @ dst[2] >>= (rshift - 1)
.endm

@ Takes 8 columns of 4 values each and operates on it. Modeled after the second
@ for loop in vc1_inv_trans_4x8_c.
@ Input columns:  d0 d2 d4 d6 d1 d3 d5 d7
@ Output columns: d16 d17 d18 d19 d21 d20 d23 d22
@ Trashes all NEON registers (and r12) except for: q4 q5 q6 q7
.macro vc1_inv_trans_8x4_helper add add1beforeshift rshift
        @ At this point:
        @   src[0]   d0  overwritten later
        @   src[8]   d2
        @   src[16]  d4  overwritten later
        @   src[24]  d6
        @   src[32]  d1  overwritten later
        @   src[40]  d3
        @   src[48]  d5  overwritten later
        @   src[56]  d7

        movw            r12, #12
        vmov.i16        q14, #\add      @ t1|t2 will accumulate here
        movt            r12, #6

        vadd.i16        d20, d0, d1     @ temp1 = src[0] + src[32]
        vsub.i16        d21, d0, d1     @ temp2 = src[0] - src[32]
        vmov.i32        d0[0], r12      @ 16-bit: d0[0] = #12, d0[1] = #6

        vshl.i16        q15, q2, #4     @ t3|t4 = 16 * (src[16]|src[48])
        vswp            d4, d5          @ q2 = src[48]|src[16]
        vmla.i16        q14, q10, d0[0] @ t1|t2 = 12 * (temp1|temp2) + add
        movw            r12, #15
        movt            r12, #9
        vmov.i32        d0[1], r12      @ 16-bit: d0[2] = #15, d0[3] = #9
        vneg.s16        d31, d31        @ t4 = -t4
        vmla.i16        q15, q2, d0[1]  @ t3|t4 += 6 * (src[48]|src[16])

        @ At this point:
        @   d0[2]  #15
        @   d0[3]  #9
        @   q1     src[8]|src[40]
        @   q3     src[24]|src[56]
        @   q14    old t1|t2
        @   q15    old t3|t4

        vshl.i16        q8,  q1, #4     @ t1|t2 = 16 * (src[8]|src[40])
        vswp            d2, d3          @ q1 = src[40]|src[8]
        vshl.i16        q12, q3, #4     @ temp3a|temp4a = 16 * src[24]|src[56]
        vswp            d6, d7          @ q3 = src[56]|src[24]
        vshl.i16        q13, q1, #2     @ temp3b|temp4b = 4 * (src[40]|src[8])
        vshl.i16        q2,  q3, #2     @ temp1|temp2 = 4 * (src[56]|src[24])
        vswp            d3, d6          @ q1 = src[40]|src[56], q3 = src[8]|src[24]
        vsub.i16        q9,  q13, q12   @ t3|t4 = - (temp3a|temp4a) + (temp3b|temp4b)
        vadd.i16        q8,  q8, q2     @ t1|t2 += temp1|temp2
        vmul.i16        q12, q3, d0[3]  @ temp3|temp4 = 9 * src[8]|src[24]
        vmla.i16        q8,  q1, d0[3]  @ t1|t2 += 9 * (src[40]|src[56])
        vswp            d6, d7          @ q3 = src[24]|src[8]
        vswp            d2, d3          @ q1 = src[56]|src[40]

        vsub.i16        q11, q14, q15   @ t8|t7 = old t1|t2 - old t3|t4
        vadd.i16        q10, q14, q15   @ t5|t6 = old t1|t2 + old t3|t4
        .if \add1beforeshift
        vmov.i16        q15, #1
        .endif

        vadd.i16        d18, d18, d24   @ t3 += temp3
        vsub.i16        d19, d19, d25   @ t4 -= temp4

        vswp            d22, d23        @ q11 = t7|t8

        vneg.s16        d17, d17        @ t2 = -t2
        vmla.i16        q9,  q1, d0[2]  @ t3|t4 += 15 * src[56]|src[40]
        vmla.i16        q8,  q3, d0[2]  @ t1|t2 += 15 * src[24]|src[8]

        @ At this point:
        @   t1  d16
        @   t2  d17
        @   t3  d18
        @   t4  d19
        @   t5  d20
        @   t6  d21
        @   t7  d22
        @   t8  d23
        @   #1  q15

        .if \add1beforeshift
        vadd.i16        q3,  q15, q10   @ line[7,6] = t5|t6 + 1
        vadd.i16        q2,  q15, q11   @ line[5,4] = t7|t8 + 1
        .endif

        @ Sometimes this overflows, so to get one additional bit of precision, use
        @ a single instruction that both adds and shifts right (halving).
        vhadd.s16       q1,  q9, q11    @ line[2,3] = (t3|t4 + t7|t8) >> 1
        vhadd.s16       q0,  q8, q10    @ line[0,1] = (t1|t2 + t5|t6) >> 1
        .if \add1beforeshift
        vhsub.s16       q2,  q2, q9     @ line[5,4] = (t7|t8 - t3|t4 + 1) >> 1
        vhsub.s16       q3,  q3, q8     @ line[7,6] = (t5|t6 - t1|t2 + 1) >> 1
        .else
        vhsub.s16       q2,  q11, q9    @ line[5,4] = (t7|t8 - t3|t4) >> 1
        vhsub.s16       q3,  q10, q8    @ line[7,6] = (t5|t6 - t1|t2) >> 1
        .endif

        vshr.s16        q9,  q1, #(\rshift - 1)  @ one shift is already done by vhadd/vhsub above
        vshr.s16        q8,  q0, #(\rshift - 1)
        vshr.s16        q10, q2, #(\rshift - 1)
        vshr.s16        q11, q3, #(\rshift - 1)

        @ At this point:
        @   dst[0]  d16
        @   dst[1]  d17
        @   dst[2]  d18
        @   dst[3]  d19
        @   dst[4]  d21
        @   dst[5]  d20
        @   dst[6]  d23
        @   dst[7]  d22
.endm

@ This is modeled after the first and second for loops in vc1_inv_trans_8x8_c.
@ Input columns:  q8, q9, q10, q11, q12, q13, q14, q15
@ Output columns: q8, q9, q10, q11, q12, q13, q14, q15
@ Trashes all NEON registers (and r12) except for: q4 q5 q6 q7
.macro vc1_inv_trans_8x8_helper add add1beforeshift rshift
        @ This actually computes half of t1, t2, t3, t4, as explained below
        @ near `tNhalf`.
        vmov.i16        q0,  #(6 / 2)   @ q0 = #6/2
        vshl.i16        q1,  q10, #3    @ t3 = 16/2 * src[16]
        vshl.i16        q3,  q14, #3    @ temp4 = 16/2 * src[48]
        vmul.i16        q2,  q10, q0    @ t4 = 6/2 * src[16]
        vmla.i16        q1,  q14, q0    @ t3 += 6/2 * src[48]
        @ unused: q0, q10, q14
        vmov.i16        q0,  #(12 / 2)  @ q0 = #12/2
        vadd.i16        q10, q8, q12    @ temp1 = src[0] + src[32]
        vsub.i16        q14, q8, q12    @ temp2 = src[0] - src[32]
        @ unused: q8, q12
        vmov.i16        q8,  #(\add / 2)        @ t1 will accumulate here
        vmov.i16        q12, #(\add / 2)        @ t2 will accumulate here
        movw            r12, #15
        vsub.i16        q2,  q2, q3     @ t4 = 6/2 * src[16] - 16/2 * src[48]
        movt            r12, #9
        @ unused: q3
        vmla.i16        q8,  q10, q0    @ t1 = 12/2 * temp1 + add
        vmla.i16        q12, q14, q0    @ t2 = 12/2 * temp2 + add
        vmov.i32        d0[0], r12
        @ unused: q3, q10, q14

        @ At this point:
        @   q0   d0 = #15|#9
        @   q1   old t3
        @   q2   old t4
        @   q3
        @   q8   old t1
        @   q9   src[8]
        @   q10
        @   q11  src[24]
        @   q12  old t2
        @   q13  src[40]
        @   q14
        @   q15  src[56]

        @ unused: q3, q10, q14
        movw            r12, #16
        vshl.i16        q3,  q9, #4     @ t1 = 16 * src[8]
        movt            r12, #4
        vshl.i16        q10, q9, #2     @ t4 = 4 * src[8]
        vmov.i32        d1[0], r12
        vmul.i16        q14, q9, d0[0]  @ t2 = 15 * src[8]
        vmul.i16        q9,  q9, d0[1]  @ t3 = 9 * src[8]
        @ unused: none
        vmla.i16        q3,  q11, d0[0] @ t1 += 15 * src[24]
        vmls.i16        q10, q11, d0[1] @ t4 -= 9 * src[24]
        vmls.i16        q14, q11, d1[1] @ t2 -= 4 * src[24]
        vmls.i16        q9,  q11, d1[0] @ t3 -= 16 * src[24]
        @ unused: q11
        vmla.i16        q3,  q13, d0[1] @ t1 += 9 * src[40]
        vmla.i16        q10, q13, d0[0] @ t4 += 15 * src[40]
        vmls.i16        q14, q13, d1[0] @ t2 -= 16 * src[40]
        vmla.i16        q9,  q13, d1[1] @ t3 += 4 * src[40]
        @ unused: q11, q13

        @ Compute t5, t6, t7, t8 from old t1, t2, t3, t4. Actually, it computes
        @ half of t5, t6, t7, t8 since t1, t2, t3, t4 are halved.
        vadd.i16        q11, q8, q1     @ t5 = t1 + t3
        vsub.i16        q1,  q8, q1     @ t8 = t1 - t3
        vadd.i16        q13, q12, q2    @ t6 = t2 + t4
        vsub.i16        q2,  q12, q2    @ t7 = t2 - t4
        @ unused: q8, q12

        .if \add1beforeshift
        vmov.i16        q12, #1
        .endif

        @ unused: q8
        vmla.i16        q3,  q15, d1[1] @ t1 += 4 * src[56]
        vmls.i16        q14, q15, d0[1] @ t2 -= 9 * src[56]
        vmla.i16        q9,  q15, d0[0] @ t3 += 15 * src[56]
        vmls.i16        q10, q15, d1[0] @ t4 -= 16 * src[56]
        @ unused: q0, q8, q15

        @ At this point:
        @   t1      q3
        @   t2      q14
        @   t3      q9
        @   t4      q10
        @   t5half  q11
        @   t6half  q13
        @   t7half  q2
        @   t8half  q1
        @   #1      q12
        @
        @ tNhalf is half of the value of tN (as described in vc1_inv_trans_8x8_c).
        @ This is done because sometimes files have input that causes tN + tM to
        @ overflow. To avoid this overflow, we compute tNhalf, then compute
        @ tNhalf + tM (which doesn't overflow), and then we use vhadd to compute
        @ (tNhalf + (tNhalf + tM)) >> 1 which does not overflow because it is
        @ one instruction.

        @ For each pair of tN and tM, do:
        @   lineA = t5half + t1
        @   if add1beforeshift: t1 -= 1
        @   lineA = (t5half + lineA) >> 1
        @   lineB = t5half - t1
        @   lineB = (t5half + lineB) >> 1
        @   lineA >>= rshift - 1
        @   lineB >>= rshift - 1

        vadd.i16        q8,  q11, q3    @ q8 = t5half + t1
        .if \add1beforeshift
        vsub.i16        q3,  q3, q12    @ q3 = t1 - 1
        .endif

        vadd.i16        q0,  q13, q14   @ q0 = t6half + t2
        .if \add1beforeshift
        vsub.i16        q14, q14, q12   @ q14 = t2 - 1
        .endif

        vadd.i16        q15, q2, q9     @ q15 = t7half + t3
        .if \add1beforeshift
        vsub.i16        q9,  q9, q12    @ q9 = t3 - 1
        .endif
        @ unused: none

        vhadd.s16       q8,  q11, q8    @ q8 = (t5half + t5half + t1) >> 1
        vsub.i16        q3,  q11, q3    @ q3 = t5half - t1 + 1

        vhadd.s16       q0,  q13, q0    @ q0 = (t6half + t6half + t2) >> 1
        vsub.i16        q14, q13, q14   @ q14 = t6half - t2 + 1

        vhadd.s16       q15, q2, q15    @ q15 = (t7half + t7half + t3) >> 1
        vsub.i16        q9,  q2, q9     @ q9 = t7half - t3 + 1

        vhadd.s16       q3,  q11, q3    @ q3 = (t5half + t5half - t1 + 1) >> 1
        @ unused: q11

        vadd.i16        q11, q1, q10    @ q11 = t8half + t4
        .if \add1beforeshift
        vsub.i16        q10, q10, q12   @ q10 = t4 - 1
        .endif
        @ unused: q12

        vhadd.s16       q14, q13, q14   @ q14 = (t6half + t6half - t2 + 1) >> 1
        @ unused: q12, q13
        vhadd.s16       q13, q2, q9     @ q13 = (t7half + t7half - t3 + 1) >> 1
        @ unused: q12, q2, q9

        vsub.i16        q10, q1, q10    @ q10 = t8half - t4 + 1
        vhadd.s16       q11, q1, q11    @ q11 = (t8half + t8half + t4) >> 1

        vshr.s16        q8,  q8, #(\rshift - 1)         @ q8 = line[0]
        vhadd.s16       q12, q1, q10    @ q12 = (t8half + t8half - t4 + 1) >> 1
        vshr.s16        q9,  q0, #(\rshift - 1)         @ q9 = line[1]
        vshr.s16        q10, q15, #(\rshift - 1)        @ q10 = line[2]
        vshr.s16        q11, q11, #(\rshift - 1)        @ q11 = line[3]
        vshr.s16        q12, q12, #(\rshift - 1)        @ q12 = line[4]
        vshr.s16        q13, q13, #(\rshift - 1)        @ q13 = line[5]
        vshr.s16        q14, q14, #(\rshift - 1)        @ q14 = line[6]
        vshr.s16        q15, q3, #(\rshift - 1)         @ q15 = line[7]
.endm

@ (int16_t *block [r0])
function ff_vc1_inv_trans_8x8_neon, export=1
        vld1.64         {q8-q9},   [r0,:128]!
        vld1.64         {q10-q11}, [r0,:128]!
        vld1.64         {q12-q13}, [r0,:128]!
        vld1.64         {q14-q15}, [r0,:128]
        sub             r0, r0, #(16 * 2 * 3)   @ restore r0

        @ At this point:
        @   src[0]   q8
        @   src[8]   q9
        @   src[16]  q10
        @   src[24]  q11
        @   src[32]  q12
        @   src[40]  q13
        @   src[48]  q14
        @   src[56]  q15

        vc1_inv_trans_8x8_helper add=4, add1beforeshift=0, rshift=3

        @ Transpose the 8x8 result matrix
        swap4           d17, d19, d21, d23, d24, d26, d28, d30
        transpose16_4x4 q8,  q9,  q10, q11, q12, q13, q14, q15

        vc1_inv_trans_8x8_helper add=64, add1beforeshift=1, rshift=7

        vst1.64         {q8-q9},   [r0,:128]!
        vst1.64         {q10-q11}, [r0,:128]!
        vst1.64         {q12-q13}, [r0,:128]!
        vst1.64         {q14-q15}, [r0,:128]

        bx              lr
endfunc

@ (uint8_t *dest [r0], ptrdiff_t stride [r1], int16_t *block [r2])
function ff_vc1_inv_trans_8x4_neon, export=1
        vld1.64         {q0-q1}, [r2,:128]!     @ load 8 * 4 * 2 = 64 bytes / 16 bytes per quad = 4 quad registers
        vld1.64         {q2-q3}, [r2,:128]

        transpose16     q0, q1, q2, q3  @ transpose rows to columns

        @ At this point:
        @   src[0]  d0
        @   src[1]  d2
        @   src[2]  d4
        @   src[3]  d6
        @   src[4]  d1
        @   src[5]  d3
        @   src[6]  d5
        @   src[7]  d7

        vc1_inv_trans_8x4_helper add=4, add1beforeshift=0, rshift=3

        @ Move output to more standardized registers
        vmov            d0, d16
        vmov            d2, d17
        vmov            d4, d18
        vmov            d6, d19
        vmov            d1, d21
        vmov            d3, d20
        vmov            d5, d23
        vmov            d7, d22

        @ At this point:
        @   dst[0]  d0
        @   dst[1]  d2
        @   dst[2]  d4
        @   dst[3]  d6
        @   dst[4]  d1
        @   dst[5]  d3
        @   dst[6]  d5
        @   dst[7]  d7

        transpose16     q0, q1, q2, q3  @ turn columns into rows

        @ At this point:
        @   row[0]  q0
        @   row[1]  q1
        @   row[2]  q2
        @   row[3]  q3

        vc1_inv_trans_4x8_helper add=64, rshift=7

        @ At this point:
        @   line[0].l  d0
        @   line[0].h  d1
        @   line[1].l  d2
        @   line[1].h  d3
        @   line[2].l  d4
        @   line[2].h  d5
        @   line[3].l  d6
        @   line[3].h  d7

        @ unused registers: q12, q13, q14, q15

        vld1.64         {d28}, [r0,:64], r1     @ read dest
        vld1.64         {d29}, [r0,:64], r1
        vld1.64         {d30}, [r0,:64], r1
        vld1.64         {d31}, [r0,:64], r1
        sub             r0, r0, r1, lsl #2      @ restore original r0 value

        vaddw.u8        q0, q0, d28     @ line[0] += dest[0]
        vaddw.u8        q1, q1, d29     @ line[1] += dest[1]
        vaddw.u8        q2, q2, d30     @ line[2] += dest[2]
        vaddw.u8        q3, q3, d31     @ line[3] += dest[3]

        vqmovun.s16     d0, q0          @ line[0]
        vqmovun.s16     d1, q1          @ line[1]
        vqmovun.s16     d2, q2          @ line[2]
        vqmovun.s16     d3, q3          @ line[3]

        vst1.64         {d0}, [r0,:64], r1      @ write dest
        vst1.64         {d1}, [r0,:64], r1
        vst1.64         {d2}, [r0,:64], r1
        vst1.64         {d3}, [r0,:64]

        bx              lr
endfunc

@ (uint8_t *dest [r0], ptrdiff_t stride [r1], int16_t *block [r2])
function ff_vc1_inv_trans_4x8_neon, export=1
        mov             r12, #(8 * 2)   @ 8 elements per line, each element 2 bytes
        vld4.16         {d0[],  d2[],  d4[],  d6[]},  [r2,:64], r12    @ read each column into a q register
        vld4.16         {d0[1], d2[1], d4[1], d6[1]}, [r2,:64], r12
        vld4.16         {d0[2], d2[2], d4[2], d6[2]}, [r2,:64], r12
        vld4.16         {d0[3], d2[3], d4[3], d6[3]}, [r2,:64], r12
        vld4.16         {d1[],  d3[],  d5[],  d7[]},  [r2,:64], r12
        vld4.16         {d1[1], d3[1], d5[1], d7[1]}, [r2,:64], r12
        vld4.16         {d1[2], d3[2], d5[2], d7[2]}, [r2,:64], r12
        vld4.16         {d1[3], d3[3], d5[3], d7[3]}, [r2,:64]

        vc1_inv_trans_4x8_helper add=4, rshift=3

        @ At this point:
        @   dst[0] = q0
        @   dst[1] = q1
        @   dst[2] = q2
        @   dst[3] = q3

        transpose16     q0, q1, q2, q3  @ Transpose rows (registers) into columns

        vc1_inv_trans_8x4_helper add=64, add1beforeshift=1, rshift=7

        vld1.32         {d28[]},  [r0,:32], r1  @ read dest
        vld1.32         {d28[1]}, [r0,:32], r1
        vld1.32         {d29[]},  [r0,:32], r1
        vld1.32         {d29[1]}, [r0,:32], r1

        vld1.32         {d30[]},  [r0,:32], r1
        vld1.32         {d30[0]}, [r0,:32], r1
        vld1.32         {d31[]},  [r0,:32], r1
        vld1.32         {d31[0]}, [r0,:32], r1
        sub             r0, r0, r1, lsl #3      @ restore original r0 value

        vaddw.u8        q8,  q8,  d28   @ line[0,1] += dest[0,1]
        vaddw.u8        q9,  q9,  d29   @ line[2,3] += dest[2,3]
        vaddw.u8        q10, q10, d30   @ line[5,4] += dest[5,4]
        vaddw.u8        q11, q11, d31   @ line[7,6] += dest[7,6]

        vqmovun.s16     d16, q8         @ clip(line[0,1])
        vqmovun.s16     d18, q9         @ clip(line[2,3])
        vqmovun.s16     d20, q10        @ clip(line[5,4])
        vqmovun.s16     d22, q11        @ clip(line[7,6])

        vst1.32         {d16[0]}, [r0,:32], r1  @ write dest
        vst1.32         {d16[1]}, [r0,:32], r1
        vst1.32         {d18[0]}, [r0,:32], r1
        vst1.32         {d18[1]}, [r0,:32], r1

        vst1.32         {d20[1]}, [r0,:32], r1
        vst1.32         {d20[0]}, [r0,:32], r1
        vst1.32         {d22[1]}, [r0,:32], r1
        vst1.32         {d22[0]}, [r0,:32]

        bx              lr
endfunc

@ Set up constants in registers which are used by vc1_inv_trans_4x4_helper.
.macro vc1_inv_trans_4x4_helper_setup
        vmov.i16        q13, #17
        vmov.i16        q14, #22
        vmov.i16        d30, #10        @ only need double-word, not quad-word
.endm

@ This is modeled after the first for loop in vc1_inv_trans_4x4_c.
.macro vc1_inv_trans_4x4_helper add rshift
        vmov.i16        q2,  #\add      @ t1|t2 will accumulate here

        vadd.i16        d16, d0, d1     @ temp1 = src[0] + src[2]
        vsub.i16        d17, d0, d1     @ temp2 = src[0] - src[2]
        vmul.i16        q3,  q14, q1    @ t3|t4 = 22 * (src[1]|src[3])
        vmla.i16        q2,  q13, q8    @ t1|t2 = 17 * (temp1|temp2) + add
        vmla.i16        d6,  d30, d3    @ t3 += 10 * src[3]
        vmls.i16        d7,  d30, d2    @ t4 -= 10 * src[1]

        vadd.i16        q0,  q2, q3     @ dst[0,2] = (t1|t2 + t3|t4)
        vsub.i16        q1,  q2, q3     @ dst[3,1] = (t1|t2 - t3|t4)
        vshr.s16        q0,  q0, #\rshift       @ dst[0,2] >>= rshift
        vshr.s16        q1,  q1, #\rshift       @ dst[3,1] >>= rshift
.endm

@ (uint8_t *dest [r0], ptrdiff_t stride [r1], int16_t *block [r2])
function ff_vc1_inv_trans_4x4_neon, export=1
        mov             r12, #(8 * 2)   @ 8 elements per line, each element 2 bytes
        vld4.16         {d0[],  d1[],  d2[],  d3[]},  [r2,:64], r12    @ read each column into a register
        vld4.16         {d0[1], d1[1], d2[1], d3[1]}, [r2,:64], r12
        vld4.16         {d0[2], d1[2], d2[2], d3[2]}, [r2,:64], r12
        vld4.16         {d0[3], d1[3], d2[3], d3[3]}, [r2,:64]

        vswp            d1, d2          @ so that we can later access column 1 and column 3 as a single q1 register

        vc1_inv_trans_4x4_helper_setup

        @ At this point:
        @   src[0] = d0
        @   src[1] = d2
        @   src[2] = d1
        @   src[3] = d3

        vc1_inv_trans_4x4_helper add=4, rshift=3        @ compute t1, t2, t3, t4 and combine them into dst[0-3]

        @ At this point:
        @   dst[0] = d0
        @   dst[1] = d3
        @   dst[2] = d1
        @   dst[3] = d2

        transpose16     d0, d3, d1, d2  @ Transpose rows (registers) into columns

        @ At this point:
        @   src[0]  = d0
        @   src[8]  = d3
        @   src[16] = d1
        @   src[24] = d2

        vswp            d2, d3          @ so that we can later access column 1 and column 3 in order as a single q1 register

        @ At this point:
        @   src[0]  = d0
        @   src[8]  = d2
        @   src[16] = d1
        @   src[24] = d3

        vc1_inv_trans_4x4_helper add=64, rshift=7       @ compute t1, t2, t3, t4 and combine them into dst[0-3]

        @ At this point:
        @   line[0] = d0
        @   line[1] = d3
        @   line[2] = d1
        @   line[3] = d2

        vld1.32         {d18[]},  [r0,:32], r1  @ read dest
        vld1.32         {d19[]},  [r0,:32], r1
        vld1.32         {d18[1]}, [r0,:32], r1
        vld1.32         {d19[0]}, [r0,:32], r1
        sub             r0, r0, r1, lsl #2      @ restore original r0 value

        vaddw.u8        q0, q0, d18     @ line[0,2] += dest[0,2]
        vaddw.u8        q1, q1, d19     @ line[3,1] += dest[3,1]

        vqmovun.s16     d0, q0          @ clip(line[0,2])
        vqmovun.s16     d1, q1          @ clip(line[3,1])

        vst1.32         {d0[0]}, [r0,:32], r1   @ write dest
        vst1.32         {d1[1]}, [r0,:32], r1
        vst1.32         {d0[1]}, [r0,:32], r1
        vst1.32         {d1[0]}, [r0,:32]

        bx              lr
endfunc

@ The absolute value of multiplication constants from vc1_mspel_filter and vc1_mspel_{ver,hor}_filter_16bits.
@ The sign is embedded in the code below that carries out the multiplication (mspel_filter{,.16}).
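@
@ For reference (with the signs applied and the add/shift constants defined
@ further below), the three filters correspond roughly to the C code:
@   mode 1: (-4*x0 + 53*x1 + 18*x2 - 3*x3 + r) >> 6
@   mode 2: ( -x0 +  9*x1 +  9*x2 -   x3 + r) >> 4
@   mode 3: (-3*x0 + 18*x1 + 53*x2 - 4*x3 + r) >> 6
@ where x0..x3 are the four input taps and r is the rounding term derived from rnd.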
#define MSPEL_MODE_1_MUL_CONSTANTS  4, 53, 18, 3
#define MSPEL_MODE_2_MUL_CONSTANTS  1,  9,  9, 1
#define MSPEL_MODE_3_MUL_CONSTANTS  3, 18, 53, 4

@ These constants are from reading the source code of vc1_mspel_mc and determining the value that
@ is added to `rnd` to result in the variable `r`, and the value of the variable `shift`.
#define MSPEL_MODES_11_ADDSHIFT_CONSTANTS   15, 5
#define MSPEL_MODES_12_ADDSHIFT_CONSTANTS    3, 3
#define MSPEL_MODES_13_ADDSHIFT_CONSTANTS   15, 5
#define MSPEL_MODES_21_ADDSHIFT_CONSTANTS   MSPEL_MODES_12_ADDSHIFT_CONSTANTS
#define MSPEL_MODES_22_ADDSHIFT_CONSTANTS    0, 1
#define MSPEL_MODES_23_ADDSHIFT_CONSTANTS    3, 3
#define MSPEL_MODES_31_ADDSHIFT_CONSTANTS   MSPEL_MODES_13_ADDSHIFT_CONSTANTS
#define MSPEL_MODES_32_ADDSHIFT_CONSTANTS   MSPEL_MODES_23_ADDSHIFT_CONSTANTS
#define MSPEL_MODES_33_ADDSHIFT_CONSTANTS   15, 5

@ The addition and shift constants from vc1_mspel_filter.
#define MSPEL_MODE_1_ADDSHIFT_CONSTANTS     32, 6
#define MSPEL_MODE_2_ADDSHIFT_CONSTANTS      8, 4
#define MSPEL_MODE_3_ADDSHIFT_CONSTANTS     32, 6

@ Set up constants in registers for a subsequent use of mspel_filter{,.16}.
.macro mspel_constants typesize reg_a reg_b reg_c reg_d filter_a filter_b filter_c filter_d reg_add filter_add_register
        @ Typesize should be i8 or i16.

        @ Only set a register if its value is not 1 and not a duplicate of another.
        .if \filter_a != 1
        vmov.\typesize  \reg_a, #\filter_a      @ reg_a = filter_a
        .endif
        vmov.\typesize  \reg_b, #\filter_b      @ reg_b = filter_b
        .if \filter_b != \filter_c
        vmov.\typesize  \reg_c, #\filter_c      @ reg_c = filter_c
        .endif
        .if \filter_d != 1
        vmov.\typesize  \reg_d, #\filter_d      @ reg_d = filter_d
        .endif
        @ vdup with an element size double that of typesize
        .ifc \typesize,i8
        vdup.16         \reg_add, \filter_add_register  @ reg_add = filter_add_register
        .else
        vdup.32         \reg_add, \filter_add_register  @ reg_add = filter_add_register
        .endif
.endm

@ After mspel_constants has been used, do the filtering.
.macro mspel_filter acc dest src0 src1 src2 src3 filter_a filter_b filter_c filter_d reg_a reg_b reg_c reg_d reg_add filter_shift narrow=1
        .if \filter_a != 1
        @ If filter_a != 1, then we need a move and subtract instruction
        vmov            \acc, \reg_add          @ acc = reg_add
        vmlsl.u8        \acc, \reg_a, \src0     @ acc -= filter_a * src[-stride]
        .else
        @ If filter_a is 1, then just subtract without an extra move
        vsubw.u8        \acc, \reg_add, \src0   @ acc = reg_add - src[-stride] @ since filter_a == 1
        .endif
        vmlal.u8        \acc, \reg_b, \src1     @ acc += filter_b * src[0]
        .if \filter_b != \filter_c
        vmlal.u8        \acc, \reg_c, \src2     @ acc += filter_c * src[stride]
        .else
        @ If filter_b is the same as filter_c, use the same reg_b register
        vmlal.u8        \acc, \reg_b, \src2     @ acc += filter_c * src[stride] @ where filter_c == filter_b
        .endif
        .if \filter_d != 1
        @ If filter_d != 1, then do a multiply-accumulate
        vmlsl.u8        \acc, \reg_d, \src3     @ acc -= filter_d * src[stride * 2]
        .else
        @ If filter_d is 1, then just do a subtract
        vsubw.u8        \acc, \acc, \src3       @ acc -= src[stride * 2] @ since filter_d == 1
        .endif
        .if \narrow
        vqshrun.s16     \dest, \acc, #\filter_shift     @ dest = clip_uint8(acc >> filter_shift)
        .else
        vshr.s16        \dest, \acc, #\filter_shift     @ dest = acc >> filter_shift
        .endif
.endm
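
@ As an illustrative expansion (the register operands follow the call sites
@ below; the mode-2 constants 1, 9, 9, 1 with filter_shift 4 are just an
@ example), an invocation such as
@   mspel_filter q11, d21, d0, d1, d2, d3, 1, 9, 9, 1, d28, d29, d30, d31, q13, 4
@ assembles to:
@   vsubw.u8    q11, q13, d0
@   vmlal.u8    q11, d29, d1
@   vmlal.u8    q11, d29, d2
@   vsubw.u8    q11, q11, d3
@   vqshrun.s16 d21, q11, #4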

@ This is similar to mspel_filter, but the input is 16-bit instead of 8-bit and narrow=0 is not supported.
.macro mspel_filter.16 acc0 acc1 acc0_0 acc0_1 dest src0 src1 src2 src3 src4 src5 src6 src7 filter_a filter_b filter_c filter_d reg_a reg_b reg_c reg_d reg_add filter_shift
        .if \filter_a != 1
        vmov            \acc0, \reg_add
        vmov            \acc1, \reg_add
        vmlsl.s16       \acc0, \reg_a, \src0
        vmlsl.s16       \acc1, \reg_a, \src1
        .else
        vsubw.s16       \acc0, \reg_add, \src0
        vsubw.s16       \acc1, \reg_add, \src1
        .endif
        vmlal.s16       \acc0, \reg_b, \src2
        vmlal.s16       \acc1, \reg_b, \src3
        .if \filter_b != \filter_c
        vmlal.s16       \acc0, \reg_c, \src4
        vmlal.s16       \acc1, \reg_c, \src5
        .else
        vmlal.s16       \acc0, \reg_b, \src4
        vmlal.s16       \acc1, \reg_b, \src5
        .endif
        .if \filter_d != 1
        vmlsl.s16       \acc0, \reg_d, \src6
        vmlsl.s16       \acc1, \reg_d, \src7
        .else
        vsubw.s16       \acc0, \acc0, \src6
        vsubw.s16       \acc1, \acc1, \src7
        .endif
        @ Use acc0_0 and acc0_1 as temporary space
        vqshrun.s32     \acc0_0, \acc0, #\filter_shift  @ shift and narrow with saturation from s32 to u16
        vqshrun.s32     \acc0_1, \acc1, #\filter_shift
        vqmovn.u16      \dest, \acc0                    @ narrow with saturation from u16 to u8
.endm

@ Register usage for the put_vc1_mspel_mc functions. Registers marked 'hv' are only used in put_vc1_mspel_mc_hv.
@
@   r0        adjusted dst
@   r1        adjusted src
@   r2        stride
@   r3        adjusted rnd
@   r4   [hv] tmp
@   r11  [hv] sp saved
@   r12       loop counter
@   d0        src[-stride]
@   d1        src[0]
@   d2        src[stride]
@   d3        src[stride * 2]
@   q0   [hv] src[-stride]
@   q1   [hv] src[0]
@   q2   [hv] src[stride]
@   q3   [hv] src[stride * 2]
@   d21       often the result from mspel_filter
@   q11       accumulator 0
@   q12  [hv] accumulator 1
@   q13       accumulator initial value
@   d28       filter_a
@   d29       filter_b
@   d30       filter_c
@   d31       filter_d

@ (uint8_t *dst [r0], const uint8_t *src [r1], ptrdiff_t stride [r2], int rnd [r3])
.macro put_vc1_mspel_mc_hv hmode vmode filter_h_a filter_h_b filter_h_c filter_h_d filter_v_a filter_v_b filter_v_c filter_v_d filter_add filter_shift
function ff_put_vc1_mspel_mc\hmode\()\vmode\()_neon, export=1
        push            {r4, r11, lr}
        mov             r11, sp                 @ r11 = stack pointer before realignment
A       bic             sp,  sp,  #15           @ sp = round down to multiple of 16 bytes
T       bic             r4,  r11, #15
T       mov             sp,  r4
        sub             sp,  sp,  #(8*2*16)     @ make space for 8 rows * 2 bytes per element * 16 elements per row (to fit 11 actual elements per row)
        mov             r4,  sp                 @ r4 = int16_t tmp[8 * 16]

        sub             r1,  r1,  #1            @ src -= 1
        .if \filter_add != 0
        add             r3,  r3,  #\filter_add  @ r3 = filter_add + rnd
        .endif
        mov             r12, #8                 @ loop counter
        sub             r1,  r1,  r2            @ r1 = &src[-stride] @ slide back

        @ Do vertical filtering from src into tmp
        mspel_constants i8, d28, d29, d30, d31, \filter_v_a, \filter_v_b, \filter_v_c, \filter_v_d, q13, r3

        vld1.64         {d0,d1}, [r1], r2
        vld1.64         {d2,d3}, [r1], r2
        vld1.64         {d4,d5}, [r1], r2

1:
        subs            r12, r12, #4

        vld1.64         {d6,d7}, [r1], r2
        mspel_filter    q11, q11, d0, d2, d4, d6, \filter_v_a, \filter_v_b, \filter_v_c, \filter_v_d, d28, d29, d30, d31, q13, \filter_shift, narrow=0
        mspel_filter    q12, q12, d1, d3, d5, d7, \filter_v_a, \filter_v_b, \filter_v_c, \filter_v_d, d28, d29, d30, d31, q13, \filter_shift, narrow=0
        vst1.64         {q11,q12}, [r4,:128]!   @ store and increment

        vld1.64         {d0,d1}, [r1], r2
        mspel_filter    q11, q11, d2, d4, d6, d0, \filter_v_a, \filter_v_b, \filter_v_c, \filter_v_d, d28, d29, d30, d31, q13, \filter_shift, narrow=0
        mspel_filter    q12, q12, d3, d5, d7, d1, \filter_v_a, \filter_v_b, \filter_v_c, \filter_v_d, d28, d29, d30, d31, q13, \filter_shift, narrow=0
        vst1.64         {q11,q12}, [r4,:128]!   @ store and increment

        vld1.64         {d2,d3}, [r1], r2
        mspel_filter    q11, q11, d4, d6, d0, d2, \filter_v_a, \filter_v_b, \filter_v_c, \filter_v_d, d28, d29, d30, d31, q13, \filter_shift, narrow=0
        mspel_filter    q12, q12, d5, d7, d1, d3, \filter_v_a, \filter_v_b, \filter_v_c, \filter_v_d, d28, d29, d30, d31, q13, \filter_shift, narrow=0
        vst1.64         {q11,q12}, [r4,:128]!   @ store and increment

        vld1.64         {d4,d5}, [r1], r2
        mspel_filter    q11, q11, d6, d0, d2, d4, \filter_v_a, \filter_v_b, \filter_v_c, \filter_v_d, d28, d29, d30, d31, q13, \filter_shift, narrow=0
        mspel_filter    q12, q12, d7, d1, d3, d5, \filter_v_a, \filter_v_b, \filter_v_c, \filter_v_d, d28, d29, d30, d31, q13, \filter_shift, narrow=0
        vst1.64         {q11,q12}, [r4,:128]!   @ store and increment

        bne             1b

        rsb             r3,  r3,  #(64 + \filter_add)   @ r3 = (64 + filter_add) - r3
        mov             r12, #8                 @ loop counter
        mov             r4,  sp                 @ r4 = tmp

        @ Do horizontal filtering from tmp to dst
        mspel_constants i16, d28, d29, d30, d31, \filter_h_a, \filter_h_b, \filter_h_c, \filter_h_d, q13, r3

2:
        subs            r12, r12, #1

        vld1.64         {q0,q1}, [r4,:128]!     @ read one line of tmp
        vext.16         q2,  q0,  q1,  #2
        vext.16         q3,  q0,  q1,  #3
        vext.16         q1,  q0,  q1,  #1       @ do this last because it writes to q1, which is read by the other vext instructions

        mspel_filter.16 q11, q12, d22, d23, d21, d0, d1, d2, d3, d4, d5, d6, d7, \filter_h_a, \filter_h_b, \filter_h_c, \filter_h_d, d28, d29, d30, d31, q13, 7

        vst1.64         {d21}, [r0,:64], r2     @ store and increment dst

        bne             2b

        mov             sp,  r11
        pop             {r4, r11, pc}
endfunc
.endm

@ Use C preprocessor and assembler macros to expand to functions for combined horizontal and vertical filtering.
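@
@ For example (expansion shown for illustration only), PUT_VC1_MSPEL_MC_HV(1, 2)
@ below becomes
@   put_vc1_mspel_mc_hv 1, 2, 4, 53, 18, 3, 1, 9, 9, 1, 3, 3
@ which defines ff_put_vc1_mspel_mc12_neon (horizontal mode 1, vertical mode 2,
@ filter_add = 3, filter_shift = 3).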
#define PUT_VC1_MSPEL_MC_HV(hmode, vmode) \
    put_vc1_mspel_mc_hv hmode, vmode, \
        MSPEL_MODE_ ## hmode ## _MUL_CONSTANTS, \
        MSPEL_MODE_ ## vmode ## _MUL_CONSTANTS, \
        MSPEL_MODES_ ## hmode ## vmode ## _ADDSHIFT_CONSTANTS

PUT_VC1_MSPEL_MC_HV(1, 1)
PUT_VC1_MSPEL_MC_HV(1, 2)
PUT_VC1_MSPEL_MC_HV(1, 3)
PUT_VC1_MSPEL_MC_HV(2, 1)
PUT_VC1_MSPEL_MC_HV(2, 2)
PUT_VC1_MSPEL_MC_HV(2, 3)
PUT_VC1_MSPEL_MC_HV(3, 1)
PUT_VC1_MSPEL_MC_HV(3, 2)
PUT_VC1_MSPEL_MC_HV(3, 3)

#undef PUT_VC1_MSPEL_MC_HV

.macro put_vc1_mspel_mc_h_only hmode filter_a filter_b filter_c filter_d filter_add filter_shift
function ff_put_vc1_mspel_mc\hmode\()0_neon, export=1
        rsb             r3,  r3,  #\filter_add  @ r3 = filter_add - r = filter_add - rnd
        mov             r12, #8                 @ loop counter
        sub             r1,  r1,  #1            @ slide back, using immediate

        mspel_constants i8, d28, d29, d30, d31, \filter_a, \filter_b, \filter_c, \filter_d, q13, r3

1:
        subs            r12, r12, #1

        vld1.64         {d0,d1}, [r1], r2       @ read 16 bytes even though we only need 11, also src += stride
        vext.8          d2,  d0,  d1,  #2
        vext.8          d3,  d0,  d1,  #3
        vext.8          d1,  d0,  d1,  #1       @ do last because it writes to d1 which is read by the other vext instructions

        mspel_filter    q11, d21, d0, d1, d2, d3, \filter_a, \filter_b, \filter_c, \filter_d, d28, d29, d30, d31, q13, \filter_shift

        vst1.64         {d21}, [r0,:64], r2     @ store and increment dst

        bne             1b

        bx              lr
endfunc
.endm

@ Use C preprocessor and assembler macros to expand to functions for horizontal only filtering.
#define PUT_VC1_MSPEL_MC_H_ONLY(hmode) \
    put_vc1_mspel_mc_h_only hmode, MSPEL_MODE_ ## hmode ## _MUL_CONSTANTS, MSPEL_MODE_ ## hmode ## _ADDSHIFT_CONSTANTS

PUT_VC1_MSPEL_MC_H_ONLY(1)
PUT_VC1_MSPEL_MC_H_ONLY(2)
PUT_VC1_MSPEL_MC_H_ONLY(3)

#undef PUT_VC1_MSPEL_MC_H_ONLY

@ (uint8_t *dst [r0], const uint8_t *src [r1], ptrdiff_t stride [r2], int rnd [r3])
.macro put_vc1_mspel_mc_v_only vmode filter_a filter_b filter_c filter_d filter_add filter_shift
function ff_put_vc1_mspel_mc0\vmode\()_neon, export=1
        add             r3,  r3,  #\filter_add - 1      @ r3 = filter_add - r = filter_add - (1 - rnd) = filter_add - 1 + rnd
        mov             r12, #8                 @ loop counter
        sub             r1,  r1,  r2            @ r1 = &src[-stride] @ slide back

        mspel_constants i8, d28, d29, d30, d31, \filter_a, \filter_b, \filter_c, \filter_d, q13, r3

        vld1.64         {d0}, [r1], r2          @ d0 = src[-stride]
        vld1.64         {d1}, [r1], r2          @ d1 = src[0]
        vld1.64         {d2}, [r1], r2          @ d2 = src[stride]

1:
        subs            r12, r12, #4

        vld1.64         {d3}, [r1], r2          @ d3 = src[stride * 2]
        mspel_filter    q11, d21, d0, d1, d2, d3, \filter_a, \filter_b, \filter_c, \filter_d, d28, d29, d30, d31, q13, \filter_shift
        vst1.64         {d21}, [r0,:64], r2     @ store and increment dst

        vld1.64         {d0}, [r1], r2          @ d0 = next line
        mspel_filter    q11, d21, d1, d2, d3, d0, \filter_a, \filter_b, \filter_c, \filter_d, d28, d29, d30, d31, q13, \filter_shift
        vst1.64         {d21}, [r0,:64], r2     @ store and increment dst

        vld1.64         {d1}, [r1], r2          @ d1 = next line
        mspel_filter    q11, d21, d2, d3, d0, d1, \filter_a, \filter_b, \filter_c, \filter_d, d28, d29, d30, d31, q13, \filter_shift
        vst1.64         {d21}, [r0,:64], r2     @ store and increment dst

        vld1.64         {d2}, [r1], r2          @ d2 = next line
        mspel_filter    q11, d21, d3, d0, d1, d2, \filter_a, \filter_b, \filter_c, \filter_d, d28, d29, d30, d31, q13, \filter_shift
        vst1.64         {d21}, [r0,:64], r2     @ store and increment dst

        bne             1b

        bx              lr
endfunc
.endm

@ Use C preprocessor and assembler macros to expand to functions for vertical only filtering.
#define PUT_VC1_MSPEL_MC_V_ONLY(vmode) \
    put_vc1_mspel_mc_v_only vmode, MSPEL_MODE_ ## vmode ## _MUL_CONSTANTS, MSPEL_MODE_ ## vmode ## _ADDSHIFT_CONSTANTS

PUT_VC1_MSPEL_MC_V_ONLY(1)
PUT_VC1_MSPEL_MC_V_ONLY(2)
PUT_VC1_MSPEL_MC_V_ONLY(3)

#undef PUT_VC1_MSPEL_MC_V_ONLY

function ff_put_pixels8x8_neon, export=1
        vld1.64         {d0}, [r1], r2
        vld1.64         {d1}, [r1], r2
        vld1.64         {d2}, [r1], r2
        vld1.64         {d3}, [r1], r2
        vld1.64         {d4}, [r1], r2
        vld1.64         {d5}, [r1], r2
        vld1.64         {d6}, [r1], r2
        vld1.64         {d7}, [r1]
        vst1.64         {d0}, [r0,:64], r2
        vst1.64         {d1}, [r0,:64], r2
        vst1.64         {d2}, [r0,:64], r2
        vst1.64         {d3}, [r0,:64], r2
        vst1.64         {d4}, [r0,:64], r2
        vst1.64         {d5}, [r0,:64], r2
        vst1.64         {d6}, [r0,:64], r2
        vst1.64         {d7}, [r0,:64]
        bx              lr
endfunc

function ff_vc1_inv_trans_8x8_dc_neon, export=1
        ldrsh           r2, [r2]                @ int dc = block[0];

        vld1.64         {d0},  [r0,:64], r1
        vld1.64         {d1},  [r0,:64], r1
        vld1.64         {d4},  [r0,:64], r1
        vld1.64         {d5},  [r0,:64], r1

        add             r2, r2, r2, lsl #1      @ dc = (3 * dc + 1) >> 1;
        vld1.64         {d6},  [r0,:64], r1
        add             r2, r2, #1
        vld1.64         {d7},  [r0,:64], r1
        vld1.64         {d16}, [r0,:64], r1
        vld1.64         {d17}, [r0,:64], r1
        asr             r2, r2, #1

        sub             r0, r0, r1, lsl #3      @ restore r0 to original value

        add             r2, r2, r2, lsl #1      @ dc = (3 * dc + 16) >> 5;
        add             r2, r2, #16
        asr             r2, r2, #5

        vdup.16         q1, r2                  @ dc

        vaddw.u8        q9,  q1, d0
        vaddw.u8        q10, q1, d1
        vaddw.u8        q11, q1, d4
        vaddw.u8        q12, q1, d5
        vqmovun.s16     d0, q9
        vqmovun.s16     d1, q10
        vqmovun.s16     d4, q11
        vst1.64         {d0},  [r0,:64], r1
        vqmovun.s16     d5, q12
        vst1.64         {d1},  [r0,:64], r1
        vaddw.u8        q13, q1, d6
        vst1.64         {d4},  [r0,:64], r1
        vaddw.u8        q14, q1, d7
        vst1.64         {d5},  [r0,:64], r1
        vaddw.u8        q15, q1, d16
        vaddw.u8        q1,  q1, d17            @ this destroys q1
        vqmovun.s16     d6,  q13
        vqmovun.s16     d7,  q14
        vqmovun.s16     d16, q15
        vqmovun.s16     d17, q1
        vst1.64         {d6},  [r0,:64], r1
        vst1.64         {d7},  [r0,:64], r1
        vst1.64         {d16}, [r0,:64], r1
        vst1.64         {d17}, [r0,:64]
        bx              lr
endfunc

function ff_vc1_inv_trans_8x4_dc_neon, export=1
        ldrsh           r2, [r2]                @ int dc = block[0];

        vld1.64         {d0}, [r0,:64], r1
        vld1.64         {d1}, [r0,:64], r1
        vld1.64         {d4}, [r0,:64], r1
        vld1.64         {d5}, [r0,:64], r1

        add             r2, r2, r2, lsl #1      @ dc = (3 * dc + 1) >> 1;

        sub             r0, r0, r1, lsl #2      @ restore r0 to original value

        add             r2, r2, #1
        asr             r2, r2, #1

        add             r2, r2, r2, lsl #4      @ dc = (17 * dc + 64) >> 7;
        add             r2, r2, #64
        asr             r2, r2, #7

        vdup.16         q1, r2                  @ dc

        vaddw.u8        q3,  q1, d0
        vaddw.u8        q8,  q1, d1
        vaddw.u8        q9,  q1, d4
        vaddw.u8        q10, q1, d5
        vqmovun.s16     d0, q3
        vqmovun.s16     d1, q8
        vqmovun.s16     d4, q9
        vst1.64         {d0}, [r0,:64], r1
        vqmovun.s16     d5, q10
        vst1.64         {d1}, [r0,:64], r1
        vst1.64         {d4}, [r0,:64], r1
        vst1.64         {d5}, [r0,:64]
        bx              lr
endfunc

function ff_vc1_inv_trans_4x8_dc_neon, export=1
        ldrsh           r2, [r2]                @ int dc = block[0];

        vld1.32         {d0[]},  [r0,:32], r1
        vld1.32         {d1[]},  [r0,:32], r1
        vld1.32         {d0[1]}, [r0,:32], r1
        vld1.32         {d1[1]}, [r0,:32], r1

        add             r2, r2, r2, lsl #4      @ dc = (17 * dc + 4) >> 3;
        vld1.32         {d4[]},  [r0,:32], r1
        add             r2, r2, #4
        vld1.32         {d5[]},  [r0,:32], r1
        vld1.32         {d4[1]}, [r0,:32], r1
        asr             r2, r2, #3
        vld1.32         {d5[1]}, [r0,:32], r1

        add             r2, r2, r2, lsl #1      @ dc = (12 * dc + 64) >> 7;

        sub             r0, r0, r1, lsl #3      @ restore r0 to original value

        lsl             r2, r2, #2
        add             r2, r2, #64
        asr             r2, r2, #7

        vdup.16         q1, r2                  @ dc

        vaddw.u8        q3,  q1, d0
        vaddw.u8        q8,  q1, d1
        vaddw.u8        q9,  q1, d4
        vaddw.u8        q10, q1, d5
        vqmovun.s16     d0, q3
        vst1.32         {d0[0]}, [r0,:32], r1
        vqmovun.s16     d1, q8
        vst1.32         {d1[0]}, [r0,:32], r1
        vqmovun.s16     d4, q9
        vst1.32         {d0[1]}, [r0,:32], r1
        vqmovun.s16     d5, q10
        vst1.32         {d1[1]}, [r0,:32], r1
        vst1.32         {d4[0]}, [r0,:32], r1
        vst1.32         {d5[0]}, [r0,:32], r1
        vst1.32         {d4[1]}, [r0,:32], r1
        vst1.32         {d5[1]}, [r0,:32]
        bx              lr
endfunc

function ff_vc1_inv_trans_4x4_dc_neon, export=1
        ldrsh           r2, [r2]                @ int dc = block[0];

        vld1.32         {d0[]},  [r0,:32], r1
        vld1.32         {d1[]},  [r0,:32], r1
        vld1.32         {d0[1]}, [r0,:32], r1
        vld1.32         {d1[1]}, [r0,:32], r1

        add             r2, r2, r2, lsl #4      @ dc = (17 * dc + 4) >> 3;

        sub             r0, r0, r1, lsl #2      @ restore r0 to original value

        add             r2, r2, #4
        asr             r2, r2, #3

        add             r2, r2, r2, lsl #4      @ dc = (17 * dc + 64) >> 7;
        add             r2, r2, #64
        asr             r2, r2, #7

        vdup.16         q1, r2                  @ dc

        vaddw.u8        q2, q1, d0
        vaddw.u8        q3, q1, d1
        vqmovun.s16     d0, q2
        vst1.32         {d0[0]}, [r0,:32], r1
        vqmovun.s16     d1, q3
        vst1.32         {d1[0]}, [r0,:32], r1
        vst1.32         {d0[1]}, [r0,:32], r1
        vst1.32         {d1[1]}, [r0,:32]
        bx              lr
endfunc