;******************************************************************************
;* MMX/SSE2-optimized functions for the VP3 decoder
;* Copyright (c) 2007 Aurelien Jacobs <aurel@gnuage.org>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

; MMX-optimized functions cribbed from the original VP3 source code.

SECTION_RODATA

vp3_idct_data: times 8 dw 64277
               times 8 dw 60547
               times 8 dw 54491
               times 8 dw 46341
               times 8 dw 36410
               times 8 dw 25080
               times 8 dw 12785

pb_7:  times 8 db 0x07
pb_1F: times 8 db 0x1f
pb_81: times 8 db 0x81

cextern pb_1
cextern pb_3
cextern pb_80
cextern pb_FE

cextern pw_8

SECTION .text

; this is off by one or two for some cases when filter_limit is greater than 63
; in: p0 in mm6, p1 in mm4, p2 in mm2, p3 in mm1
; out: p1 in mm4, p2 in mm3
%macro VP3_LOOP_FILTER 0
    movq          m7, m6
    pand          m6, [pb_7]    ; p0&7
    psrlw         m7, 3
    pand          m7, [pb_1F]   ; p0>>3
    movq          m3, m2        ; p2
    pxor          m2, m4
    pand          m2, [pb_1]    ; (p2^p1)&1
    movq          m5, m2
    paddb         m2, m2
    paddb         m2, m5        ; 3*(p2^p1)&1
    paddb         m2, m6        ; extra bits lost in shifts
    pcmpeqb       m0, m0
    pxor          m1, m0        ; 255 - p3
    pavgb         m1, m2        ; (256 - p3 + extrabits) >> 1
    pxor          m0, m4        ; 255 - p1
    pavgb         m0, m3        ; (256 + p2-p1) >> 1
    paddb         m1, [pb_3]
    pavgb         m1, m0        ; 128+2+(   p2-p1 - p3) >> 2
    pavgb         m1, m0        ; 128+1+(3*(p2-p1) - p3) >> 3
    paddusb       m7, m1        ; d+128+1
    movq          m6, [pb_81]
    psubusb       m6, m7
    psubusb       m7, [pb_81]

    movq          m5, [r2+516]  ; flim
    pminub        m6, m5
    pminub        m7, m5
    movq          m0, m6
    movq          m1, m7
    paddb         m6, m6
    paddb         m7, m7
    pminub        m6, m5
    pminub        m7, m5
    psubb         m6, m0
    psubb         m7, m1
    paddusb       m4, m7
    psubusb       m4, m6
    psubusb       m3, m7
    paddusb       m3, m6
%endmacro

%macro STORE_4_WORDS 1
    movd         r2d, %1
    mov  [r0     -1], r2w
    psrlq         %1, 32
    shr           r2, 16
    mov  [r0+r1  -1], r2w
    movd         r2d, %1
    mov  [r0+r1*2-1], r2w
    shr           r2, 16
    mov  [r0+r3  -1], r2w
%endmacro

INIT_MMX mmxext
cglobal vp3_v_loop_filter, 3, 4
    mov           r3, r1
    neg           r1
    movq          m6, [r0+r1*2]
    movq          m4, [r0+r1  ]
    movq          m2, [r0     ]
    movq          m1, [r0+r3  ]

    VP3_LOOP_FILTER

    movq     [r0+r1], m4
    movq     [r0   ], m3
    RET

cglobal vp3_h_loop_filter, 3, 4
    lea           r3, [r1*3]

    movd          m6, [r0     -2]
    movd          m4, [r0+r1  -2]
    movd          m2, [r0+r1*2-2]
    movd          m1, [r0+r3  -2]
    lea           r0, [r0+r1*4  ]
    punpcklbw     m6, [r0     -2]
    punpcklbw     m4, [r0+r1  -2]
    punpcklbw     m2, [r0+r1*2-2]
    punpcklbw     m1, [r0+r3  -2]
    sub           r0, r3
    sub           r0, r1

    TRANSPOSE4x4B  6, 4, 2, 1, 0
    VP3_LOOP_FILTER
    SBUTTERFLY    bw, 4, 3, 5

    STORE_4_WORDS m4
    lea           r0, [r0+r1*4  ]
    STORE_4_WORDS m3
    RET
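
; PAVGB_NO_RND computes the truncating byte average (a + b) >> 1, which
; pavgb cannot do directly since it rounds up.  It uses the identity
;     (a + b) >> 1 == (a & b) + (((a ^ b) & 0xFE) >> 1)
; m6 holds pb_FE, which clears the low bit of each byte so the
; full-register psrlq cannot leak bits between byte lanes.  Inputs are
; m0/m1 and m2/m3; the averages are returned in m4 and m5.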
%macro PAVGB_NO_RND 0
    mova          m4, m0
    mova          m5, m2
    pand          m4, m1
    pand          m5, m3
    pxor          m1, m0
    pxor          m3, m2
    pand          m1, m6
    pand          m3, m6
    psrlq         m1, 1
    psrlq         m3, 1
    paddb         m4, m1
    paddb         m5, m3
%endmacro

INIT_MMX mmx
cglobal put_vp_no_rnd_pixels8_l2, 5, 6, 0, dst, src1, src2, stride, h, stride3
    mova          m6, [pb_FE]
    lea     stride3q, [strideq+strideq*2]
.loop:
    mova          m0, [src1q]
    mova          m1, [src2q]
    mova          m2, [src1q+strideq]
    mova          m3, [src2q+strideq]
    PAVGB_NO_RND
    mova          [dstq], m4
    mova  [dstq+strideq], m5

    mova          m0, [src1q+strideq*2]
    mova          m1, [src2q+strideq*2]
    mova          m2, [src1q+stride3q]
    mova          m3, [src2q+stride3q]
    PAVGB_NO_RND
    mova [dstq+strideq*2], m4
    mova  [dstq+stride3q], m5

    lea        src1q, [src1q+strideq*4]
    lea        src2q, [src2q+strideq*4]
    lea         dstq, [dstq+strideq*4]
    sub           hd, 4
    jnz .loop
    RET

; from original comments: The Macro does IDCT on 4 1-D DCTs
%macro BeginIDCT 0
    movq          m2, I(3)
    movq          m6, C(3)
    movq          m4, m2
    movq          m7, J(5)
    pmulhw        m4, m6        ; r4 = c3*i3 - i3
    movq          m1, C(5)
    pmulhw        m6, m7        ; r6 = c3*i5 - i5
    movq          m5, m1
    pmulhw        m1, m2        ; r1 = c5*i3 - i3
    movq          m3, I(1)
    pmulhw        m5, m7        ; r5 = c5*i5 - i5
    movq          m0, C(1)
    paddw         m4, m2        ; r4 = c3*i3
    paddw         m6, m7        ; r6 = c3*i5
    paddw         m2, m1        ; r2 = c5*i3
    movq          m1, J(7)
    paddw         m7, m5        ; r7 = c5*i5
    movq          m5, m0        ; r5 = c1
    pmulhw        m0, m3        ; r0 = c1*i1 - i1
    paddsw        m4, m7        ; r4 = C = c3*i3 + c5*i5
    pmulhw        m5, m1        ; r5 = c1*i7 - i7
    movq          m7, C(7)
    psubsw        m6, m2        ; r6 = D = c3*i5 - c5*i3
    paddw         m0, m3        ; r0 = c1*i1
    pmulhw        m3, m7        ; r3 = c7*i1
    movq          m2, I(2)
    pmulhw        m7, m1        ; r7 = c7*i7
    paddw         m5, m1        ; r5 = c1*i7
    movq          m1, m2        ; r1 = i2
    pmulhw        m2, C(2)      ; r2 = c2*i2 - i2
    psubsw        m3, m5        ; r3 = B = c7*i1 - c1*i7
    movq          m5, J(6)
    paddsw        m0, m7        ; r0 = A = c1*i1 + c7*i7
    movq          m7, m5        ; r7 = i6
    psubsw        m0, m4        ; r0 = A - C
    pmulhw        m5, C(2)      ; r5 = c2*i6 - i6
    paddw         m2, m1        ; r2 = c2*i2
    pmulhw        m1, C(6)      ; r1 = c6*i2
    paddsw        m4, m4        ; r4 = C + C
    paddsw        m4, m0        ; r4 = C. = A + C
    psubsw        m3, m6        ; r3 = B - D
    paddw         m5, m7        ; r5 = c2*i6
    paddsw        m6, m6        ; r6 = D + D
    pmulhw        m7, C(6)      ; r7 = c6*i6
    paddsw        m6, m3        ; r6 = D. = B + D
    movq        I(1), m4        ; save C. at I(1)
    psubsw        m1, m5        ; r1 = H = c6*i2 - c2*i6
    movq          m4, C(4)
    movq          m5, m3        ; r5 = B - D
    pmulhw        m3, m4        ; r3 = (c4 - 1) * (B - D)
    paddsw        m7, m2        ; r7 = G = c2*i2 + c6*i6
    movq        I(2), m6        ; save D. at I(2)
    movq          m2, m0        ; r2 = A - C
    movq          m6, I(0)
    pmulhw        m0, m4        ; r0 = (c4 - 1) * (A - C)
    paddw         m5, m3        ; r5 = B. = c4 * (B - D)
    movq          m3, J(4)
    psubsw        m5, m1        ; r5 = B.. = B. - H
    paddw         m2, m0        ; r2 = A. = c4 * (A - C)
    psubsw        m6, m3        ; r6 = i0 - i4
    movq          m0, m6
    pmulhw        m6, m4        ; r6 = (c4 - 1) * (i0 - i4)
    paddsw        m3, m3        ; r3 = i4 + i4
    paddsw        m1, m1        ; r1 = H + H
    paddsw        m3, m0        ; r3 = i0 + i4
    paddsw        m1, m5        ; r1 = H. = B + H
    pmulhw        m4, m3        ; r4 = (c4 - 1) * (i0 + i4)
    paddsw        m6, m0        ; r6 = F = c4 * (i0 - i4)
    psubsw        m6, m2        ; r6 = F. = F - A.
    paddsw        m2, m2        ; r2 = A. + A.
    movq          m0, I(1)      ; r0 = C.
    paddsw        m2, m6        ; r2 = A.. = F + A.
    paddw         m4, m3        ; r4 = E = c4 * (i0 + i4)
    psubsw        m2, m1        ; r2 = R2 = A.. - H.
%endmacro
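
; Note on the pmulhw idiom above: the C(x) entries in vp3_idct_data are
; round(cos(x*pi/16) * 65536).  C(1)..C(5) do not fit in a signed word,
; so as pmulhw operands they act as c - 65536 and the multiply yields
; c*x/65536 - x; the paddw that follows each such pmulhw restores the
; missing x (hence the "c3*i3 - i3" / "c3*i3" comment pairs).  C(6) and
; C(7) fit in 15 bits and need no correction.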

; RowIDCT gets ready to transpose
%macro RowIDCT 0
    BeginIDCT
    movq          m3, I(2)      ; r3 = D.
    psubsw        m4, m7        ; r4 = E. = E - G
    paddsw        m1, m1        ; r1 = H. + H.
    paddsw        m7, m7        ; r7 = G + G
    paddsw        m1, m2        ; r1 = R1 = A.. + H.
    paddsw        m7, m4        ; r7 = G. = E + G
    psubsw        m4, m3        ; r4 = R4 = E. - D.
    paddsw        m3, m3
    psubsw        m6, m5        ; r6 = R6 = F. - B..
    paddsw        m5, m5
    paddsw        m3, m4        ; r3 = R3 = E. + D.
    paddsw        m5, m6        ; r5 = R5 = F. + B..
    psubsw        m7, m0        ; r7 = R7 = G. - C.
    paddsw        m0, m0
    movq        I(1), m1        ; save R1
    paddsw        m0, m7        ; r0 = R0 = G. + C.
%endmacro

; Column IDCT normalizes and stores final results
%macro ColumnIDCT 0
    BeginIDCT
    paddsw        m2, OC_8      ; adjust R2 (and R1) for shift
    paddsw        m1, m1        ; r1 = H. + H.
    paddsw        m1, m2        ; r1 = R1 = A.. + H.
    psraw         m2, 4         ; r2 = NR2
    psubsw        m4, m7        ; r4 = E. = E - G
    psraw         m1, 4         ; r1 = NR1
    movq          m3, I(2)      ; r3 = D.
    paddsw        m7, m7        ; r7 = G + G
    movq        I(2), m2        ; store NR2 at I2
    paddsw        m7, m4        ; r7 = G. = E + G
    movq        I(1), m1        ; store NR1 at I1
    psubsw        m4, m3        ; r4 = R4 = E. - D.
    paddsw        m4, OC_8      ; adjust R4 (and R3) for shift
    paddsw        m3, m3        ; r3 = D. + D.
    paddsw        m3, m4        ; r3 = R3 = E. + D.
    psraw         m4, 4         ; r4 = NR4
    psubsw        m6, m5        ; r6 = R6 = F. - B..
    psraw         m3, 4         ; r3 = NR3
    paddsw        m6, OC_8      ; adjust R6 (and R5) for shift
    paddsw        m5, m5        ; r5 = B.. + B..
    paddsw        m5, m6        ; r5 = R5 = F. + B..
    psraw         m6, 4         ; r6 = NR6
    movq        J(4), m4        ; store NR4 at J4
    psraw         m5, 4         ; r5 = NR5
    movq        I(3), m3        ; store NR3 at I3
    psubsw        m7, m0        ; r7 = R7 = G. - C.
    paddsw        m7, OC_8      ; adjust R7 (and R0) for shift
    paddsw        m0, m0        ; r0 = C. + C.
    paddsw        m0, m7        ; r0 = R0 = G. + C.
    psraw         m7, 4         ; r7 = NR7
    movq        J(6), m6        ; store NR6 at J6
    psraw         m0, 4         ; r0 = NR0
    movq        J(5), m5        ; store NR5 at J5
    movq        J(7), m7        ; store NR7 at J7
    movq        I(0), m0        ; store NR0 at I0
%endmacro

; Following macro does two 4x4 transposes in place.
;
; At entry (we assume):
;
;      r0 = a3 a2 a1 a0
;    I(1) = b3 b2 b1 b0
;      r2 = c3 c2 c1 c0
;      r3 = d3 d2 d1 d0
;
;      r4 = e3 e2 e1 e0
;      r5 = f3 f2 f1 f0
;      r6 = g3 g2 g1 g0
;      r7 = h3 h2 h1 h0
;
; At exit, we have:
;
;    I(0) = d0 c0 b0 a0
;    I(1) = d1 c1 b1 a1
;    I(2) = d2 c2 b2 a2
;    I(3) = d3 c3 b3 a3
;
;    J(4) = h0 g0 f0 e0
;    J(5) = h1 g1 f1 e1
;    J(6) = h2 g2 f2 e2
;    J(7) = h3 g3 f3 e3
;
; I(0) I(1) I(2) I(3) is the transpose of r0 I(1) r2 r3.
; J(4) J(5) J(6) J(7) is the transpose of r4 r5 r6 r7.
;
; Since r1 is free at entry, we calculate the Js first.
%macro Transpose 0
    movq          m1, m4        ; r1 = e3 e2 e1 e0
    punpcklwd     m4, m5        ; r4 = f1 e1 f0 e0
    movq        I(0), m0        ; save a3 a2 a1 a0
    punpckhwd     m1, m5        ; r1 = f3 e3 f2 e2
    movq          m0, m6        ; r0 = g3 g2 g1 g0
    punpcklwd     m6, m7        ; r6 = h1 g1 h0 g0
    movq          m5, m4        ; r5 = f1 e1 f0 e0
    punpckldq     m4, m6        ; r4 = h0 g0 f0 e0 = R4
    punpckhdq     m5, m6        ; r5 = h1 g1 f1 e1 = R5
    movq          m6, m1        ; r6 = f3 e3 f2 e2
    movq        J(4), m4
    punpckhwd     m0, m7        ; r0 = h3 g3 h2 g2
    movq        J(5), m5
    punpckhdq     m6, m0        ; r6 = h3 g3 f3 e3 = R7
    movq          m4, I(0)      ; r4 = a3 a2 a1 a0
    punpckldq     m1, m0        ; r1 = h2 g2 f2 e2 = R6
    movq          m5, I(1)      ; r5 = b3 b2 b1 b0
    movq          m0, m4        ; r0 = a3 a2 a1 a0
    movq        J(7), m6
    punpcklwd     m0, m5        ; r0 = b1 a1 b0 a0
    movq        J(6), m1
    punpckhwd     m4, m5        ; r4 = b3 a3 b2 a2
    movq          m5, m2        ; r5 = c3 c2 c1 c0
    punpcklwd     m2, m3        ; r2 = d1 c1 d0 c0
    movq          m1, m0        ; r1 = b1 a1 b0 a0
    punpckldq     m0, m2        ; r0 = d0 c0 b0 a0 = R0
    punpckhdq     m1, m2        ; r1 = d1 c1 b1 a1 = R1
    movq          m2, m4        ; r2 = b3 a3 b2 a2
    movq        I(0), m0
    punpckhwd     m5, m3        ; r5 = d3 c3 d2 c2
    movq        I(1), m1
    punpckhdq     m4, m5        ; r4 = d3 c3 b3 a3 = R3
    punpckldq     m2, m5        ; r2 = d2 c2 b2 a2 = R2
    movq        I(3), m4
    movq        I(2), m2
%endmacro
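
; VP3_1D_IDCT_SSE2 follows the same dataflow as BeginIDCT plus
; RowIDCT/ColumnIDCT above, but handles all eight columns at once in
; xmm registers.  ADD() and SHIFT() are defined by the caller: empty
; for the first (row) pass, and the +8 / >>4 rounding for the final
; (column) pass.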
%macro VP3_1D_IDCT_SSE2 0
    movdqa        m2, I(3)      ; xmm2 = i3
    movdqa        m6, C(3)      ; xmm6 = c3
    movdqa        m4, m2        ; xmm4 = i3
    movdqa        m7, I(5)      ; xmm7 = i5
    pmulhw        m4, m6        ; xmm4 = c3 * i3 - i3
    movdqa        m1, C(5)      ; xmm1 = c5
    pmulhw        m6, m7        ; xmm6 = c3 * i5 - i5
    movdqa        m5, m1        ; xmm5 = c5
    pmulhw        m1, m2        ; xmm1 = c5 * i3 - i3
    movdqa        m3, I(1)      ; xmm3 = i1
    pmulhw        m5, m7        ; xmm5 = c5 * i5 - i5
    movdqa        m0, C(1)      ; xmm0 = c1
    paddw         m4, m2        ; xmm4 = c3 * i3
    paddw         m6, m7        ; xmm6 = c3 * i5
    paddw         m2, m1        ; xmm2 = c5 * i3
    movdqa        m1, I(7)      ; xmm1 = i7
    paddw         m7, m5        ; xmm7 = c5 * i5
    movdqa        m5, m0        ; xmm5 = c1
    pmulhw        m0, m3        ; xmm0 = c1 * i1 - i1
    paddsw        m4, m7        ; xmm4 = c3 * i3 + c5 * i5 = C
    pmulhw        m5, m1        ; xmm5 = c1 * i7 - i7
    movdqa        m7, C(7)      ; xmm7 = c7
    psubsw        m6, m2        ; xmm6 = c3 * i5 - c5 * i3 = D
    paddw         m0, m3        ; xmm0 = c1 * i1
    pmulhw        m3, m7        ; xmm3 = c7 * i1
    movdqa        m2, I(2)      ; xmm2 = i2
    pmulhw        m7, m1        ; xmm7 = c7 * i7
    paddw         m5, m1        ; xmm5 = c1 * i7
    movdqa        m1, m2        ; xmm1 = i2
    pmulhw        m2, C(2)      ; xmm2 = c2 * i2 - i2
    psubsw        m3, m5        ; xmm3 = c7 * i1 - c1 * i7 = B
    movdqa        m5, I(6)      ; xmm5 = i6
    paddsw        m0, m7        ; xmm0 = c1 * i1 + c7 * i7 = A
    movdqa        m7, m5        ; xmm7 = i6
    psubsw        m0, m4        ; xmm0 = A - C
    pmulhw        m5, C(2)      ; xmm5 = c2 * i6 - i6
    paddw         m2, m1        ; xmm2 = c2 * i2
    pmulhw        m1, C(6)      ; xmm1 = c6 * i2
    paddsw        m4, m4        ; xmm4 = C + C
    paddsw        m4, m0        ; xmm4 = A + C = C.
    psubsw        m3, m6        ; xmm3 = B - D
    paddw         m5, m7        ; xmm5 = c2 * i6
    paddsw        m6, m6        ; xmm6 = D + D
    pmulhw        m7, C(6)      ; xmm7 = c6 * i6
    paddsw        m6, m3        ; xmm6 = B + D = D.
    movdqa      I(1), m4        ; save C. at I(1)
    psubsw        m1, m5        ; xmm1 = c6 * i2 - c2 * i6 = H
    movdqa        m4, C(4)      ; xmm4 = c4
    movdqa        m5, m3        ; xmm5 = B - D
    pmulhw        m3, m4        ; xmm3 = (c4 - 1) * (B - D)
    paddsw        m7, m2        ; xmm7 = c2 * i2 + c6 * i6 = G
    movdqa      I(2), m6        ; save D. at I(2)
    movdqa        m2, m0        ; xmm2 = A - C
    movdqa        m6, I(0)      ; xmm6 = i0
    pmulhw        m0, m4        ; xmm0 = (c4 - 1) * (A - C)
    paddw         m5, m3        ; xmm5 = c4 * (B - D) = B.
    movdqa        m3, I(4)      ; xmm3 = i4
    psubsw        m5, m1        ; xmm5 = B. - H = B..
    paddw         m2, m0        ; xmm2 = c4 * (A - C) = A.
    psubsw        m6, m3        ; xmm6 = i0 - i4
    movdqa        m0, m6        ; xmm0 = i0 - i4
    pmulhw        m6, m4        ; xmm6 = (c4 - 1) * (i0 - i4)
    paddsw        m3, m3        ; xmm3 = i4 + i4
    paddsw        m1, m1        ; xmm1 = H + H
    paddsw        m3, m0        ; xmm3 = i0 + i4
    paddsw        m1, m5        ; xmm1 = B. + H = H.
    pmulhw        m4, m3        ; xmm4 = (c4 - 1) * (i0 + i4)
    paddw         m6, m0        ; xmm6 = c4 * (i0 - i4) = F
    psubsw        m6, m2        ; xmm6 = F - A. = F.
    paddsw        m2, m2        ; xmm2 = A. + A.
    movdqa        m0, I(1)      ; load C. from I(1)
    paddsw        m2, m6        ; xmm2 = F + A. = A..
    paddw         m4, m3        ; xmm4 = c4 * (i0 + i4) = E
    psubsw        m2, m1        ; xmm2 = A.. - H. = R2
    ADD(m2)                     ; adjust R2 and R1 before shifting
    paddsw        m1, m1        ; xmm1 = H. + H.
    paddsw        m1, m2        ; xmm1 = A.. + H. = R1
    SHIFT(m2)                   ; xmm2 = op2
    psubsw        m4, m7        ; xmm4 = E - G = E.
    SHIFT(m1)                   ; xmm1 = op1
    movdqa        m3, I(2)      ; load D. from I(2)
    paddsw        m7, m7        ; xmm7 = G + G
    paddsw        m7, m4        ; xmm7 = E + G = G.
    psubsw        m4, m3        ; xmm4 = E. - D. = R4
    ADD(m4)                     ; adjust R4 and R3 before shifting
    paddsw        m3, m3        ; xmm3 = D. + D.
    paddsw        m3, m4        ; xmm3 = E. + D. = R3
    SHIFT(m4)                   ; xmm4 = op4
    psubsw        m6, m5        ; xmm6 = F. - B.. = R6
    SHIFT(m3)                   ; xmm3 = op3
    ADD(m6)                     ; adjust R6 and R5 before shifting
    paddsw        m5, m5        ; xmm5 = B.. + B..
    paddsw        m5, m6        ; xmm5 = F. + B.. = R5
    SHIFT(m6)                   ; xmm6 = op6
    SHIFT(m5)                   ; xmm5 = op5
    psubsw        m7, m0        ; xmm7 = G. - C. = R7
    ADD(m7)                     ; adjust R7 and R0 before shifting
    paddsw        m0, m0        ; xmm0 = C. + C.
    paddsw        m0, m7        ; xmm0 = G. + C. = R0
    SHIFT(m7)                   ; xmm7 = op7
    SHIFT(m0)                   ; xmm0 = op0
%endmacro
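
; PUT_BLOCK stores one transformed 8x8 block of words; VP3_IDCT drives
; the whole 2-D transform.  The SSE2 path runs a row pass, an 8x8 word
; transpose, then a column pass, while the MMX path works per 4x4
; quadrant, running RowIDCT with the in-place Transpose twice and then
; ColumnIDCT twice.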
%macro PUT_BLOCK 8
    movdqa      O(0), m%1
    movdqa      O(1), m%2
    movdqa      O(2), m%3
    movdqa      O(3), m%4
    movdqa      O(4), m%5
    movdqa      O(5), m%6
    movdqa      O(6), m%7
    movdqa      O(7), m%8
%endmacro

%macro VP3_IDCT 1
%if mmsize == 16
%define I(x) [%1+16*x]
%define O(x) [%1+16*x]
%define C(x) [vp3_idct_data+16*(x-1)]
%define SHIFT(x)
%define ADD(x)
    VP3_1D_IDCT_SSE2
%if ARCH_X86_64
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
%else
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [%1], [%1+16]
%endif
    PUT_BLOCK 0, 1, 2, 3, 4, 5, 6, 7

%define SHIFT(x) psraw x, 4
%define ADD(x) paddsw x, [pw_8]
    VP3_1D_IDCT_SSE2
    PUT_BLOCK 0, 1, 2, 3, 4, 5, 6, 7
%else ; mmsize == 8
    ; from original comments:
    ; eax = quantized input
    ; ebx = dequantizer matrix
    ; ecx = IDCT constants
    ;  M(I) = ecx + MaskOffset(0) + I * 8
    ;  C(I) = ecx + CosineOffset(32) + (I-1) * 8
    ; edx = output
    ; r0..r7 = mm0..mm7
%define OC_8 [pw_8]
%define C(x) [vp3_idct_data+16*(x-1)]

    ; at this point, function has completed dequantization + dezigzag +
    ; partial transposition; now do the idct itself
%define I(x) [%1+16*x]
%define J(x) [%1+16*x]
    RowIDCT
    Transpose

%define I(x) [%1+16*x+8]
%define J(x) [%1+16*x+8]
    RowIDCT
    Transpose

%define I(x) [%1+16* x]
%define J(x) [%1+16*(x-4)+8]
    ColumnIDCT

%define I(x) [%1+16* x +64]
%define J(x) [%1+16*(x-4)+72]
    ColumnIDCT
%endif ; mmsize == 16/8
%endmacro
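
; vp3_idct_put emits unsigned pixels by packing the coefficients with
; signed saturation and flipping the sign bit (paddb with pb_80);
; vp3_idct_add adds the residual to the existing pixels with unsigned
; saturation.  Both routines zero the coefficient block when done.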
%macro vp3_idct_funcs 0
cglobal vp3_idct_put, 3, 4, 9
    VP3_IDCT      r2

    mova          m4, [pb_80]
    lea           r3, [r1*3]
%assign %%i 0
%rep 16/mmsize
    mova          m0, [r2+mmsize*0+%%i]
    mova          m1, [r2+mmsize*2+%%i]
    mova          m2, [r2+mmsize*4+%%i]
    mova          m3, [r2+mmsize*6+%%i]
%if mmsize == 8
    packsswb      m0, [r2+mmsize*8+%%i]
    packsswb      m1, [r2+mmsize*10+%%i]
    packsswb      m2, [r2+mmsize*12+%%i]
    packsswb      m3, [r2+mmsize*14+%%i]
%else
    packsswb      m0, [r2+mmsize*1+%%i]
    packsswb      m1, [r2+mmsize*3+%%i]
    packsswb      m2, [r2+mmsize*5+%%i]
    packsswb      m3, [r2+mmsize*7+%%i]
%endif
    paddb         m0, m4
    paddb         m1, m4
    paddb         m2, m4
    paddb         m3, m4
    movq   [r0     ], m0
%if mmsize == 8
    movq   [r0+r1  ], m1
    movq   [r0+r1*2], m2
    movq   [r0+r3  ], m3
%else
    movhps [r0+r1  ], m0
    movq   [r0+r1*2], m1
    movhps [r0+r3  ], m1
%endif
%if %%i == 0
    lea           r0, [r0+r1*4]
%endif
%if mmsize == 16
    movq   [r0     ], m2
    movhps [r0+r1  ], m2
    movq   [r0+r1*2], m3
    movhps [r0+r3  ], m3
%endif
%assign %%i %%i+8
%endrep

    pxor          m0, m0
%assign %%offset 0
%rep 128/mmsize
    mova [r2+%%offset], m0
%assign %%offset %%offset+mmsize
%endrep
    RET

cglobal vp3_idct_add, 3, 4, 9
    VP3_IDCT      r2

    lea           r3, [r1*3]
    pxor          m4, m4
%if mmsize == 16
%assign %%i 0
%rep 2
    movq          m0, [r0]
    movq          m1, [r0+r1]
    movq          m2, [r0+r1*2]
    movq          m3, [r0+r3]
    punpcklbw     m0, m4
    punpcklbw     m1, m4
    punpcklbw     m2, m4
    punpcklbw     m3, m4
    paddsw        m0, [r2+ 0+%%i]
    paddsw        m1, [r2+16+%%i]
    paddsw        m2, [r2+32+%%i]
    paddsw        m3, [r2+48+%%i]
    packuswb      m0, m1
    packuswb      m2, m3
    movq   [r0     ], m0
    movhps [r0+r1  ], m0
    movq   [r0+r1*2], m2
    movhps [r0+r3  ], m2
%if %%i == 0
    lea           r0, [r0+r1*4]
%endif
%assign %%i %%i+64
%endrep
%else
%assign %%i 0
%rep 2
    movq          m0, [r0]
    movq          m1, [r0+r1]
    movq          m2, [r0+r1*2]
    movq          m3, [r0+r3]
    movq          m5, m0
    movq          m6, m1
    movq          m7, m2
    punpcklbw     m0, m4
    punpcklbw     m1, m4
    punpcklbw     m2, m4
    punpckhbw     m5, m4
    punpckhbw     m6, m4
    punpckhbw     m7, m4
    paddsw        m0, [r2+ 0+%%i]
    paddsw        m1, [r2+16+%%i]
    paddsw        m2, [r2+32+%%i]
    paddsw        m5, [r2+64+%%i]
    paddsw        m6, [r2+80+%%i]
    paddsw        m7, [r2+96+%%i]
    packuswb      m0, m5
    movq          m5, m3
    punpcklbw     m3, m4
    punpckhbw     m5, m4
    packuswb      m1, m6
    paddsw        m3, [r2+48+%%i]
    paddsw        m5, [r2+112+%%i]
    packuswb      m2, m7
    packuswb      m3, m5
    movq   [r0     ], m0
    movq   [r0+r1  ], m1
    movq   [r0+r1*2], m2
    movq   [r0+r3  ], m3
%if %%i == 0
    lea           r0, [r0+r1*4]
%endif
%assign %%i %%i+8
%endrep
%endif
%assign %%i 0
%rep 128/mmsize
    mova     [r2+%%i], m4
%assign %%i %%i+mmsize
%endrep
    RET
%endmacro

%if ARCH_X86_32
INIT_MMX mmx
vp3_idct_funcs
%endif

INIT_XMM sse2
vp3_idct_funcs

; in: m0 = dc clamped to [0,255], m1 = -dc clamped to [0,255] (at most
; one of the two is nonzero); adds dc to 4 rows with unsigned saturation
%macro DC_ADD 0
    movq          m2, [r0     ]
    movq          m3, [r0+r1  ]
    paddusb       m2, m0
    movq          m4, [r0+r1*2]
    paddusb       m3, m0
    movq          m5, [r0+r2  ]
    paddusb       m4, m0
    paddusb       m5, m0
    psubusb       m2, m1
    psubusb       m3, m1
    movq   [r0     ], m2
    psubusb       m4, m1
    movq   [r0+r1  ], m3
    psubusb       m5, m1
    movq   [r0+r1*2], m4
    movq   [r0+r2  ], m5
%endmacro

INIT_MMX mmxext
cglobal vp3_idct_dc_add, 3, 4
    movsx         r3, word [r2]
    mov    word [r2], 0
    lea           r2, [r1*3]
    add           r3, 15
    sar           r3, 5         ; dc = (block[0] + 15) >> 5
    movd          m0, r3d
    pshufw        m0, m0, 0x0
    pxor          m1, m1
    psubw         m1, m0
    packuswb      m0, m0
    packuswb      m1, m1
    DC_ADD
    lea           r0, [r0+r1*4]
    DC_ADD
    RET