;
; Simple IDCT MMX
;
; Copyright (c) 2001, 2002 Michael Niedermayer <michaelni@gmx.at>
;
; Conversion from gcc syntax to x264asm syntax with minimal modifications
; by James Darnley <jdarnley@obe.tv>.
;
; This file is part of FFmpeg.
;
; FFmpeg is free software; you can redistribute it and/or
; modify it under the terms of the GNU Lesser General Public
; License as published by the Free Software Foundation; either
; version 2.1 of the License, or (at your option) any later version.
;
; FFmpeg is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
; Lesser General Public License for more details.
;
; You should have received a copy of the GNU Lesser General Public
; License along with FFmpeg; if not, write to the Free Software
; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;/

%include "libavutil/x86/x86util.asm"

%if ARCH_X86_32
SECTION_RODATA

cextern pb_80

; Word mask keeping words 1 and 3 of a qword (the "r4"/"R4" lanes of the
; first input qword); used by DC_COND_IDCT to ignore the two DC words.
wm1010: dw 0, 0xffff, 0, 0xffff
; Bias added (to the low dword only) on the DC-only shortcut of DC_COND_IDCT.
d40000: dd 4 << 16, 0

; 23170.475006
; 22725.260826
; 21406.727617
; 19265.545870
; 16384.000000
; 12872.826198
; 8866.956905
; 4520.335430

%define C0 23170 ; cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
%define C1 22725 ; cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
%define C2 21407 ; cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
%define C3 19266 ; cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
%define C4 16383 ; cos(i*M_PI/16)*sqrt(2)*(1<<14) - 0.5
%define C5 12873 ; cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
%define C6 8867 ; cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
%define C7 4520 ; cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5

; Right shifts applied after the row pass and the column pass respectively.
; NOTE: the DC-only shortcut in DC_COND_IDCT hard-codes "<< 16, >> 13" and
; d40000; those literals are not derived from ROW_SHIFT, so changing
; ROW_SHIFT alone would make the two row paths disagree.
%define ROW_SHIFT 11
%define COL_SHIFT 20 ; 6

; Constant table. The macros below address it by byte offset (coeffs + N);
; the offset of each 8-byte row is noted on the right. Instruction comments
; in the macros list lanes from the most significant word to the least, i.e.
; in the reverse of the memory order written here.
coeffs:
    dw 1 << (ROW_SHIFT - 1), 0      ; +0   row rounder used by Z_COND_IDCT
    dw 1 << (ROW_SHIFT - 1), 0
    dw 1 << (ROW_SHIFT - 1), 1      ; +8   row rounder used by DC_COND_IDCT
    dw 1 << (ROW_SHIFT - 1), 0      ;      (low dword carries an extra 1 << 16)

    dw C4,  C4,  C4,  C4            ; +16
    dw C4, -C4,  C4, -C4            ; +24

    dw C2,  C6,  C2,  C6            ; +32
    dw C6, -C2,  C6, -C2            ; +40

    dw C1,  C3,  C1,  C3            ; +48
    dw C5,  C7,  C5,  C7            ; +56

    dw C3, -C7,  C3, -C7            ; +64
    dw -C1, -C5, -C1, -C5           ; +72

    dw C5, -C1,  C5, -C1            ; +80
    dw C7,  C3,  C7,  C3            ; +88

    dw C7, -C5,  C7, -C5            ; +96
    dw C3, -C1,  C3, -C1            ; +104

SECTION .text

;-----------------------------------------------------------------------------
; ROW_IDCT_CORE r3r1_off, dst, rounder, shift
;
; Shared body of DC_COND_IDCT and Z_COND_IDCT (private helper).
;   %1  byte offset from blockq of the "R3 R1 r3 r1" qword (reloaded midway)
;   %2  destination address expression; 32 bytes are written at %2 + 0..31
;   %3  memory operand added to the two even-part sums before the butterflies
;   %4  arithmetic right shift applied to every result
; In:     mm0 = R4 R0 r4 r0, mm1 = R6 R2 r6 r2,
;         mm2 = R3 R1 r3 r1, mm3 = R7 R5 r7 r5 (already loaded by the caller)
; Clobb:  mm0-mm7
;-----------------------------------------------------------------------------
%macro ROW_IDCT_CORE 4
    movq      mm4, [coeffs + 16]  ; C4 C4 C4 C4
    pmaddwd   mm4, mm0            ; C4R4+C4R0 C4r4+C4r0
    movq      mm5, [coeffs + 24]  ; -C4 C4 -C4 C4
    pmaddwd   mm0, mm5            ; -C4R4+C4R0 -C4r4+C4r0
    movq      mm5, [coeffs + 32]  ; C6 C2 C6 C2
    pmaddwd   mm5, mm1            ; C6R6+C2R2 C6r6+C2r2
    movq      mm6, [coeffs + 40]  ; -C2 C6 -C2 C6
    pmaddwd   mm1, mm6            ; -C2R6+C6R2 -C2r6+C6r2
    movq      mm7, [coeffs + 48]  ; C3 C1 C3 C1
    pmaddwd   mm7, mm2            ; C3R3+C1R1 C3r3+C1r1
    paddd     mm4, %3             ; add rounder
    movq      mm6, mm4            ; C4R4+C4R0 C4r4+C4r0
    paddd     mm4, mm5            ; A0 a0
    psubd     mm6, mm5            ; A3 a3
    movq      mm5, [coeffs + 56]  ; C7 C5 C7 C5
    pmaddwd   mm5, mm3            ; C7R7+C5R5 C7r7+C5r5
    paddd     mm0, %3             ; add rounder
    paddd     mm1, mm0            ; A1 a1
    paddd     mm0, mm0
    psubd     mm0, mm1            ; A2 a2
    pmaddwd   mm2, [coeffs + 64]  ; -C7R3+C3R1 -C7r3+C3r1
    paddd     mm7, mm5            ; B0 b0
    movq      mm5, [coeffs + 72]  ; -C5 -C1 -C5 -C1
    pmaddwd   mm5, mm3            ; -C5R7-C1R5 -C5r7-C1r5
    paddd     mm7, mm4            ; A0+B0 a0+b0
    paddd     mm4, mm4            ; 2A0 2a0
    psubd     mm4, mm7            ; A0-B0 a0-b0
    paddd     mm5, mm2            ; B1 b1
    psrad     mm7, %4
    psrad     mm4, %4
    movq      mm2, mm1            ; A1 a1
    paddd     mm1, mm5            ; A1+B1 a1+b1
    psubd     mm2, mm5            ; A1-B1 a1-b1
    psrad     mm1, %4
    psrad     mm2, %4
    packssdw  mm7, mm1            ; A1+B1 a1+b1 A0+B0 a0+b0
    packssdw  mm2, mm4            ; A0-B0 a0-b0 A1-B1 a1-b1
    movq      [%2], mm7
    movq      mm1, [blockq + %1]  ; R3 R1 r3 r1
    movq      mm4, [coeffs + 80]  ; -C1 C5 -C1 C5
    movq      [24 + %2], mm2
    pmaddwd   mm4, mm1            ; -C1R3+C5R1 -C1r3+C5r1
    movq      mm7, [coeffs + 88]  ; C3 C7 C3 C7
    pmaddwd   mm1, [coeffs + 96]  ; -C5R3+C7R1 -C5r3+C7r1
    pmaddwd   mm7, mm3            ; C3R7+C7R5 C3r7+C7r5
    movq      mm2, mm0            ; A2 a2
    pmaddwd   mm3, [coeffs + 104] ; -C1R7+C3R5 -C1r7+C3r5
    paddd     mm4, mm7            ; B2 b2
    paddd     mm2, mm4            ; A2+B2 a2+b2
    psubd     mm0, mm4            ; A2-B2 a2-b2
    psrad     mm2, %4
    psrad     mm0, %4
    movq      mm4, mm6            ; A3 a3
    paddd     mm3, mm1            ; B3 b3
    paddd     mm6, mm3            ; A3+B3 a3+b3
    psubd     mm4, mm3            ; A3-B3 a3-b3
    psrad     mm6, %4
    packssdw  mm2, mm6            ; A3+B3 a3+b3 A2+B2 a2+b2
    movq      [8 + %2], mm2
    psrad     mm4, %4
    packssdw  mm4, mm0            ; A2-B2 a2-b2 A3-B3 a3-b3
    movq      [16 + %2], mm4
%endmacro

;-----------------------------------------------------------------------------
; DC_COND_IDCT src0, src4, src1, src5, dst, unused, shift
;
; Row pass over the four qwords at blockq + %1..%4, writing 32 bytes at %5.
; If every input word other than the two masked-out DC words (see wm1010) is
; zero, a shortcut is taken: the DC pair is shifted left 16, biased by
; d40000, shifted right 13, packed, and the same qword is stored to all four
; output slots. Otherwise ROW_IDCT_CORE runs with the rounder at coeffs + 8.
;   %6 is an unused placeholder; %7 is the right shift of the full path.
; Clobb:  mm0-mm7, t0d, flags
;-----------------------------------------------------------------------------
%macro DC_COND_IDCT 7
    movq      mm0, [blockq + %1]  ; R4 R0 r4 r0
    movq      mm1, [blockq + %2]  ; R6 R2 r6 r2
    movq      mm2, [blockq + %3]  ; R3 R1 r3 r1
    movq      mm3, [blockq + %4]  ; R7 R5 r7 r5
    movq      mm4, [wm1010]
    pand      mm4, mm0            ; drop the two DC words
    por       mm4, mm1
    por       mm4, mm2
    por       mm4, mm3
    packssdw  mm4, mm4            ; fold both dwords into the low 32 bits
    movd      t0d, mm4
    test      t0d, t0d            ; any non-DC coefficient set?
    jz        %%1
    ROW_IDCT_CORE %3, %5, [coeffs + 8], %7
    jmp       %%2
%%1:
    pslld     mm0, 16
    paddd     mm0, [d40000]
    psrad     mm0, 13
    packssdw  mm0, mm0
    movq      [%5], mm0
    movq      [8 + %5], mm0
    movq      [16 + %5], mm0
    movq      [24 + %5], mm0
%%2:
%endmacro

;-----------------------------------------------------------------------------
; Z_COND_IDCT src0, src4, src1, src5, dst, unused, shift, zero_label
;
; Row pass like DC_COND_IDCT, but if all four input qwords are zero it jumps
; to %8 WITHOUT writing anything to %5 (the column variants selected through
; those labels do not read the skipped rows). Otherwise ROW_IDCT_CORE runs
; with the rounder at coeffs + 0.
; Clobb:  mm0-mm7, t0d, flags
;-----------------------------------------------------------------------------
%macro Z_COND_IDCT 8
    movq      mm0, [blockq + %1]  ; R4 R0 r4 r0
    movq      mm1, [blockq + %2]  ; R6 R2 r6 r2
    movq      mm2, [blockq + %3]  ; R3 R1 r3 r1
    movq      mm3, [blockq + %4]  ; R7 R5 r7 r5
    movq      mm4, mm0
    por       mm4, mm1
    por       mm4, mm2
    por       mm4, mm3
    packssdw  mm4, mm4            ; fold both dwords into the low 32 bits
    movd      t0d, mm4
    test      t0d, t0d            ; whole row group zero?
    jz        %8
    ROW_IDCT_CORE %3, %5, [coeffs], %7
%endmacro

;-----------------------------------------------------------------------------
; IDCT1..IDCT8  in0, in1, in2, in3, dst, shift
;
; Column-pass variants. All share one signature; each reads only the inputs
; listed in its header (the others are known to be zero on the path that
; selects it). Results are shifted right by %6, packed to words and stored
; at %5 + 0, 16, 32, ... 112.
; IDCT1-5 and IDCT7 take %1..%4 as memory operands and store 4 bytes per
; output slot; IDCT6 and IDCT8 take %1/%2 as bare address expressions, also
; read the qword at +8, and store 8 bytes per slot.
; Clobb:  mm0-mm7 (IDCT8: mm0-mm2, mm4, mm5, mm7)
;-----------------------------------------------------------------------------

; IDCT1: general case, reads %1, %2, %3 and %4.
%macro IDCT1 6
    movq      mm0, %1             ; R4 R0 r4 r0
    movq      mm1, %2             ; R6 R2 r6 r2
    movq      mm2, %3             ; R3 R1 r3 r1
    movq      mm3, %4             ; R7 R5 r7 r5
    movq      mm4, [coeffs + 16]  ; C4 C4 C4 C4
    pmaddwd   mm4, mm0            ; C4R4+C4R0 C4r4+C4r0
    movq      mm5, [coeffs + 24]  ; -C4 C4 -C4 C4
    pmaddwd   mm0, mm5            ; -C4R4+C4R0 -C4r4+C4r0
    movq      mm5, [coeffs + 32]  ; C6 C2 C6 C2
    pmaddwd   mm5, mm1            ; C6R6+C2R2 C6r6+C2r2
    movq      mm6, [coeffs + 40]  ; -C2 C6 -C2 C6
    pmaddwd   mm1, mm6            ; -C2R6+C6R2 -C2r6+C6r2
    movq      mm6, mm4            ; C4R4+C4R0 C4r4+C4r0
    movq      mm7, [coeffs + 48]  ; C3 C1 C3 C1
    pmaddwd   mm7, mm2            ; C3R3+C1R1 C3r3+C1r1
    paddd     mm4, mm5            ; A0 a0
    psubd     mm6, mm5            ; A3 a3
    movq      mm5, mm0            ; -C4R4+C4R0 -C4r4+C4r0
    paddd     mm0, mm1            ; A1 a1
    psubd     mm5, mm1            ; A2 a2
    movq      mm1, [coeffs + 56]  ; C7 C5 C7 C5
    pmaddwd   mm1, mm3            ; C7R7+C5R5 C7r7+C5r5
    pmaddwd   mm2, [coeffs + 64]  ; -C7R3+C3R1 -C7r3+C3r1
    paddd     mm7, mm1            ; B0 b0
    movq      mm1, [coeffs + 72]  ; -C5 -C1 -C5 -C1
    pmaddwd   mm1, mm3            ; -C5R7-C1R5 -C5r7-C1r5
    paddd     mm7, mm4            ; A0+B0 a0+b0
    paddd     mm4, mm4            ; 2A0 2a0
    psubd     mm4, mm7            ; A0-B0 a0-b0
    paddd     mm1, mm2            ; B1 b1
    psrad     mm7, %6
    psrad     mm4, %6
    movq      mm2, mm0            ; A1 a1
    paddd     mm0, mm1            ; A1+B1 a1+b1
    psubd     mm2, mm1            ; A1-B1 a1-b1
    psrad     mm0, %6
    psrad     mm2, %6
    packssdw  mm7, mm7            ; A0+B0 a0+b0
    movd      [%5], mm7
    packssdw  mm0, mm0            ; A1+B1 a1+b1
    movd      [16 + %5], mm0
    packssdw  mm2, mm2            ; A1-B1 a1-b1
    movd      [96 + %5], mm2
    packssdw  mm4, mm4            ; A0-B0 a0-b0
    movd      [112 + %5], mm4
    movq      mm0, %3             ; R3 R1 r3 r1
    movq      mm4, [coeffs + 80]  ; -C1 C5 -C1 C5
    pmaddwd   mm4, mm0            ; -C1R3+C5R1 -C1r3+C5r1
    movq      mm7, [coeffs + 88]  ; C3 C7 C3 C7
    pmaddwd   mm0, [coeffs + 96]  ; -C5R3+C7R1 -C5r3+C7r1
    pmaddwd   mm7, mm3            ; C3R7+C7R5 C3r7+C7r5
    movq      mm2, mm5            ; A2 a2
    pmaddwd   mm3, [coeffs + 104] ; -C1R7+C3R5 -C1r7+C3r5
    paddd     mm4, mm7            ; B2 b2
    paddd     mm2, mm4            ; A2+B2 a2+b2
    psubd     mm5, mm4            ; A2-B2 a2-b2
    psrad     mm2, %6
    psrad     mm5, %6
    movq      mm4, mm6            ; A3 a3
    paddd     mm3, mm0            ; B3 b3
    paddd     mm6, mm3            ; A3+B3 a3+b3
    psubd     mm4, mm3            ; A3-B3 a3-b3
    psrad     mm6, %6
    psrad     mm4, %6
    packssdw  mm2, mm2            ; A2+B2 a2+b2
    packssdw  mm6, mm6            ; A3+B3 a3+b3
    movd      [32 + %5], mm2
    packssdw  mm4, mm4            ; A3-B3 a3-b3
    packssdw  mm5, mm5            ; A2-B2 a2-b2
    movd      [48 + %5], mm6
    movd      [64 + %5], mm4
    movd      [80 + %5], mm5
%endmacro

; IDCT2: reads %1, %2 and %4 (%3 is not read).
%macro IDCT2 6
    movq      mm0, %1             ; R4 R0 r4 r0
    movq      mm1, %2             ; R6 R2 r6 r2
    movq      mm3, %4             ; R7 R5 r7 r5
    movq      mm4, [coeffs + 16]  ; C4 C4 C4 C4
    pmaddwd   mm4, mm0            ; C4R4+C4R0 C4r4+C4r0
    movq      mm5, [coeffs + 24]  ; -C4 C4 -C4 C4
    pmaddwd   mm0, mm5            ; -C4R4+C4R0 -C4r4+C4r0
    movq      mm5, [coeffs + 32]  ; C6 C2 C6 C2
    pmaddwd   mm5, mm1            ; C6R6+C2R2 C6r6+C2r2
    movq      mm6, [coeffs + 40]  ; -C2 C6 -C2 C6
    pmaddwd   mm1, mm6            ; -C2R6+C6R2 -C2r6+C6r2
    movq      mm6, mm4            ; C4R4+C4R0 C4r4+C4r0
    paddd     mm4, mm5            ; A0 a0
    psubd     mm6, mm5            ; A3 a3
    movq      mm5, mm0            ; -C4R4+C4R0 -C4r4+C4r0
    paddd     mm0, mm1            ; A1 a1
    psubd     mm5, mm1            ; A2 a2
    movq      mm1, [coeffs + 56]  ; C7 C5 C7 C5
    pmaddwd   mm1, mm3            ; C7R7+C5R5 C7r7+C5r5
    movq      mm7, [coeffs + 72]  ; -C5 -C1 -C5 -C1
    pmaddwd   mm7, mm3            ; -C5R7-C1R5 -C5r7-C1r5
    paddd     mm1, mm4            ; A0+B0 a0+b0
    paddd     mm4, mm4            ; 2A0 2a0
    psubd     mm4, mm1            ; A0-B0 a0-b0
    psrad     mm1, %6
    psrad     mm4, %6
    movq      mm2, mm0            ; A1 a1
    paddd     mm0, mm7            ; A1+B1 a1+b1
    psubd     mm2, mm7            ; A1-B1 a1-b1
    psrad     mm0, %6
    psrad     mm2, %6
    packssdw  mm1, mm1            ; A0+B0 a0+b0
    movd      [%5], mm1
    packssdw  mm0, mm0            ; A1+B1 a1+b1
    movd      [16 + %5], mm0
    packssdw  mm2, mm2            ; A1-B1 a1-b1
    movd      [96 + %5], mm2
    packssdw  mm4, mm4            ; A0-B0 a0-b0
    movd      [112 + %5], mm4
    movq      mm1, [coeffs + 88]  ; C3 C7 C3 C7
    pmaddwd   mm1, mm3            ; C3R7+C7R5 C3r7+C7r5
    movq      mm2, mm5            ; A2 a2
    pmaddwd   mm3, [coeffs + 104] ; -C1R7+C3R5 -C1r7+C3r5
    paddd     mm2, mm1            ; A2+B2 a2+b2
    psubd     mm5, mm1            ; A2-B2 a2-b2
    psrad     mm2, %6
    psrad     mm5, %6
    movq      mm1, mm6            ; A3 a3
    paddd     mm6, mm3            ; A3+B3 a3+b3
    psubd     mm1, mm3            ; A3-B3 a3-b3
    psrad     mm6, %6
    psrad     mm1, %6
    packssdw  mm2, mm2            ; A2+B2 a2+b2
    packssdw  mm6, mm6            ; A3+B3 a3+b3
    movd      [32 + %5], mm2
    packssdw  mm1, mm1            ; A3-B3 a3-b3
    packssdw  mm5, mm5            ; A2-B2 a2-b2
    movd      [48 + %5], mm6
    movd      [64 + %5], mm1
    movd      [80 + %5], mm5
%endmacro

; IDCT3: reads %1 and %4 only.
%macro IDCT3 6
    movq      mm0, %1             ; R4 R0 r4 r0
    movq      mm3, %4             ; R7 R5 r7 r5
    movq      mm4, [coeffs + 16]  ; C4 C4 C4 C4
    pmaddwd   mm4, mm0            ; C4R4+C4R0 C4r4+C4r0
    movq      mm5, [coeffs + 24]  ; -C4 C4 -C4 C4
    pmaddwd   mm0, mm5            ; -C4R4+C4R0 -C4r4+C4r0
    movq      mm6, mm4            ; C4R4+C4R0 C4r4+C4r0
    movq      mm5, mm0            ; -C4R4+C4R0 -C4r4+C4r0
    movq      mm1, [coeffs + 56]  ; C7 C5 C7 C5
    pmaddwd   mm1, mm3            ; C7R7+C5R5 C7r7+C5r5
    movq      mm7, [coeffs + 72]  ; -C5 -C1 -C5 -C1
    pmaddwd   mm7, mm3            ; -C5R7-C1R5 -C5r7-C1r5
    paddd     mm1, mm4            ; A0+B0 a0+b0
    paddd     mm4, mm4            ; 2A0 2a0
    psubd     mm4, mm1            ; A0-B0 a0-b0
    psrad     mm1, %6
    psrad     mm4, %6
    movq      mm2, mm0            ; A1 a1
    paddd     mm0, mm7            ; A1+B1 a1+b1
    psubd     mm2, mm7            ; A1-B1 a1-b1
    psrad     mm0, %6
    psrad     mm2, %6
    packssdw  mm1, mm1            ; A0+B0 a0+b0
    movd      [%5], mm1
    packssdw  mm0, mm0            ; A1+B1 a1+b1
    movd      [16 + %5], mm0
    packssdw  mm2, mm2            ; A1-B1 a1-b1
    movd      [96 + %5], mm2
    packssdw  mm4, mm4            ; A0-B0 a0-b0
    movd      [112 + %5], mm4
    movq      mm1, [coeffs + 88]  ; C3 C7 C3 C7
    pmaddwd   mm1, mm3            ; C3R7+C7R5 C3r7+C7r5
    movq      mm2, mm5            ; A2 a2
    pmaddwd   mm3, [coeffs + 104] ; -C1R7+C3R5 -C1r7+C3r5
    paddd     mm2, mm1            ; A2+B2 a2+b2
    psubd     mm5, mm1            ; A2-B2 a2-b2
    psrad     mm2, %6
    psrad     mm5, %6
    movq      mm1, mm6            ; A3 a3
    paddd     mm6, mm3            ; A3+B3 a3+b3
    psubd     mm1, mm3            ; A3-B3 a3-b3
    psrad     mm6, %6
    psrad     mm1, %6
    packssdw  mm2, mm2            ; A2+B2 a2+b2
    packssdw  mm6, mm6            ; A3+B3 a3+b3
    movd      [32 + %5], mm2
    packssdw  mm1, mm1            ; A3-B3 a3-b3
    packssdw  mm5, mm5            ; A2-B2 a2-b2
    movd      [48 + %5], mm6
    movd      [64 + %5], mm1
    movd      [80 + %5], mm5
%endmacro

; IDCT4: reads %1, %3 and %4 (%2 is not read).
%macro IDCT4 6
    movq      mm0, %1             ; R4 R0 r4 r0
    movq      mm2, %3             ; R3 R1 r3 r1
    movq      mm3, %4             ; R7 R5 r7 r5
    movq      mm4, [coeffs + 16]  ; C4 C4 C4 C4
    pmaddwd   mm4, mm0            ; C4R4+C4R0 C4r4+C4r0
    movq      mm5, [coeffs + 24]  ; -C4 C4 -C4 C4
    pmaddwd   mm0, mm5            ; -C4R4+C4R0 -C4r4+C4r0
    movq      mm6, mm4            ; C4R4+C4R0 C4r4+C4r0
    movq      mm7, [coeffs + 48]  ; C3 C1 C3 C1
    pmaddwd   mm7, mm2            ; C3R3+C1R1 C3r3+C1r1
    movq      mm5, mm0            ; -C4R4+C4R0 -C4r4+C4r0
    movq      mm1, [coeffs + 56]  ; C7 C5 C7 C5
    pmaddwd   mm1, mm3            ; C7R7+C5R5 C7r7+C5r5
    pmaddwd   mm2, [coeffs + 64]  ; -C7R3+C3R1 -C7r3+C3r1
    paddd     mm7, mm1            ; B0 b0
    movq      mm1, [coeffs + 72]  ; -C5 -C1 -C5 -C1
    pmaddwd   mm1, mm3            ; -C5R7-C1R5 -C5r7-C1r5
    paddd     mm7, mm4            ; A0+B0 a0+b0
    paddd     mm4, mm4            ; 2A0 2a0
    psubd     mm4, mm7            ; A0-B0 a0-b0
    paddd     mm1, mm2            ; B1 b1
    psrad     mm7, %6
    psrad     mm4, %6
    movq      mm2, mm0            ; A1 a1
    paddd     mm0, mm1            ; A1+B1 a1+b1
    psubd     mm2, mm1            ; A1-B1 a1-b1
    psrad     mm0, %6
    psrad     mm2, %6
    packssdw  mm7, mm7            ; A0+B0 a0+b0
    movd      [%5], mm7
    packssdw  mm0, mm0            ; A1+B1 a1+b1
    movd      [16 + %5], mm0
    packssdw  mm2, mm2            ; A1-B1 a1-b1
    movd      [96 + %5], mm2
    packssdw  mm4, mm4            ; A0-B0 a0-b0
    movd      [112 + %5], mm4
    movq      mm0, %3             ; R3 R1 r3 r1
    movq      mm4, [coeffs + 80]  ; -C1 C5 -C1 C5
    pmaddwd   mm4, mm0            ; -C1R3+C5R1 -C1r3+C5r1
    movq      mm7, [coeffs + 88]  ; C3 C7 C3 C7
    pmaddwd   mm0, [coeffs + 96]  ; -C5R3+C7R1 -C5r3+C7r1
    pmaddwd   mm7, mm3            ; C3R7+C7R5 C3r7+C7r5
    movq      mm2, mm5            ; A2 a2
    pmaddwd   mm3, [coeffs + 104] ; -C1R7+C3R5 -C1r7+C3r5
    paddd     mm4, mm7            ; B2 b2
    paddd     mm2, mm4            ; A2+B2 a2+b2
    psubd     mm5, mm4            ; A2-B2 a2-b2
    psrad     mm2, %6
    psrad     mm5, %6
    movq      mm4, mm6            ; A3 a3
    paddd     mm3, mm0            ; B3 b3
    paddd     mm6, mm3            ; A3+B3 a3+b3
    psubd     mm4, mm3            ; A3-B3 a3-b3
    psrad     mm6, %6
    psrad     mm4, %6
    packssdw  mm2, mm2            ; A2+B2 a2+b2
    packssdw  mm6, mm6            ; A3+B3 a3+b3
    movd      [32 + %5], mm2
    packssdw  mm4, mm4            ; A3-B3 a3-b3
    packssdw  mm5, mm5            ; A2-B2 a2-b2
    movd      [48 + %5], mm6
    movd      [64 + %5], mm4
    movd      [80 + %5], mm5
%endmacro

; IDCT5: reads %1 and %3 only.
%macro IDCT5 6
    movq      mm0, %1             ; R4 R0 r4 r0
    movq      mm2, %3             ; R3 R1 r3 r1
    movq      mm4, [coeffs + 16]  ; C4 C4 C4 C4
    pmaddwd   mm4, mm0            ; C4R4+C4R0 C4r4+C4r0
    movq      mm5, [coeffs + 24]  ; -C4 C4 -C4 C4
    pmaddwd   mm0, mm5            ; -C4R4+C4R0 -C4r4+C4r0
    movq      mm6, mm4            ; C4R4+C4R0 C4r4+C4r0
    movq      mm7, [coeffs + 48]  ; C3 C1 C3 C1
    pmaddwd   mm7, mm2            ; C3R3+C1R1 C3r3+C1r1
    movq      mm5, mm0            ; -C4R4+C4R0 -C4r4+C4r0
    movq      mm3, [coeffs + 64]
    pmaddwd   mm3, mm2            ; -C7R3+C3R1 -C7r3+C3r1
    paddd     mm7, mm4            ; A0+B0 a0+b0
    paddd     mm4, mm4            ; 2A0 2a0
    psubd     mm4, mm7            ; A0-B0 a0-b0
    psrad     mm7, %6
    psrad     mm4, %6
    movq      mm1, mm0            ; A1 a1
    paddd     mm0, mm3            ; A1+B1 a1+b1
    psubd     mm1, mm3            ; A1-B1 a1-b1
    psrad     mm0, %6
    psrad     mm1, %6
    packssdw  mm7, mm7            ; A0+B0 a0+b0
    movd      [%5], mm7
    packssdw  mm0, mm0            ; A1+B1 a1+b1
    movd      [16 + %5], mm0
    packssdw  mm1, mm1            ; A1-B1 a1-b1
    movd      [96 + %5], mm1
    packssdw  mm4, mm4            ; A0-B0 a0-b0
    movd      [112 + %5], mm4
    movq      mm4, [coeffs + 80]  ; -C1 C5 -C1 C5
    pmaddwd   mm4, mm2            ; -C1R3+C5R1 -C1r3+C5r1
    pmaddwd   mm2, [coeffs + 96]  ; -C5R3+C7R1 -C5r3+C7r1
    movq      mm1, mm5            ; A2 a2
    paddd     mm1, mm4            ; A2+B2 a2+b2
    psubd     mm5, mm4            ; A2-B2 a2-b2
    psrad     mm1, %6
    psrad     mm5, %6
    movq      mm4, mm6            ; A3 a3
    paddd     mm6, mm2            ; A3+B3 a3+b3
    psubd     mm4, mm2            ; A3-B3 a3-b3
    psrad     mm6, %6
    psrad     mm4, %6
    packssdw  mm1, mm1            ; A2+B2 a2+b2
    packssdw  mm6, mm6            ; A3+B3 a3+b3
    movd      [32 + %5], mm1
    packssdw  mm4, mm4            ; A3-B3 a3-b3
    packssdw  mm5, mm5            ; A2-B2 a2-b2
    movd      [48 + %5], mm6
    movd      [64 + %5], mm4
    movd      [80 + %5], mm5
%endmacro

; IDCT6: reads the qwords at [%1], [8 + %1], [%2] and [8 + %2]; %1/%2 are
; address expressions without brackets. No odd-part terms are computed, so
; the same packed result is stored to each mirrored pair of output slots
; (0/112, 16/96, 32/80, 48/64).
%macro IDCT6 6
    movq      mm0, [%1]           ; R4 R0 r4 r0
    movq      mm1, [%2]           ; R6 R2 r6 r2
    movq      mm4, [coeffs + 16]  ; C4 C4 C4 C4
    pmaddwd   mm4, mm0            ; C4R4+C4R0 C4r4+C4r0
    movq      mm5, [coeffs + 24]  ; -C4 C4 -C4 C4
    pmaddwd   mm0, mm5            ; -C4R4+C4R0 -C4r4+C4r0
    movq      mm5, [coeffs + 32]  ; C6 C2 C6 C2
    pmaddwd   mm5, mm1            ; C6R6+C2R2 C6r6+C2r2
    movq      mm6, [coeffs + 40]  ; -C2 C6 -C2 C6
    pmaddwd   mm1, mm6            ; -C2R6+C6R2 -C2r6+C6r2
    movq      mm6, mm4            ; C4R4+C4R0 C4r4+C4r0
    paddd     mm4, mm5            ; A0 a0
    psubd     mm6, mm5            ; A3 a3
    movq      mm5, mm0            ; -C4R4+C4R0 -C4r4+C4r0
    paddd     mm0, mm1            ; A1 a1
    psubd     mm5, mm1            ; A2 a2
    movq      mm2, [8 + %1]       ; R4 R0 r4 r0
    movq      mm3, [8 + %2]       ; R6 R2 r6 r2
    movq      mm1, [coeffs + 16]  ; C4 C4 C4 C4
    pmaddwd   mm1, mm2            ; C4R4+C4R0 C4r4+C4r0
    movq      mm7, [coeffs + 24]  ; -C4 C4 -C4 C4
    pmaddwd   mm2, mm7            ; -C4R4+C4R0 -C4r4+C4r0
    movq      mm7, [coeffs + 32]  ; C6 C2 C6 C2
    pmaddwd   mm7, mm3            ; C6R6+C2R2 C6r6+C2r2
    pmaddwd   mm3, [coeffs + 40]  ; -C2R6+C6R2 -C2r6+C6r2
    paddd     mm7, mm1            ; A0 a0
    paddd     mm1, mm1            ; 2C0 2c0
    psubd     mm1, mm7            ; A3 a3
    paddd     mm3, mm2            ; A1 a1
    paddd     mm2, mm2            ; 2C1 2c1
    psubd     mm2, mm3            ; A2 a2
    psrad     mm4, %6
    psrad     mm7, %6
    psrad     mm3, %6
    packssdw  mm4, mm7            ; A0 a0
    movq      [%5], mm4
    psrad     mm0, %6
    packssdw  mm0, mm3            ; A1 a1
    movq      [16 + %5], mm0
    movq      [96 + %5], mm0
    movq      [112 + %5], mm4
    psrad     mm5, %6
    psrad     mm6, %6
    psrad     mm2, %6
    packssdw  mm5, mm2            ; A2-B2 a2-b2
    movq      [32 + %5], mm5
    psrad     mm1, %6
    packssdw  mm6, mm1            ; A3+B3 a3+b3
    movq      [48 + %5], mm6
    movq      [64 + %5], mm6
    movq      [80 + %5], mm5
%endmacro

; IDCT7: reads %1, %2 and %3 (%4 is not read).
%macro IDCT7 6
    movq      mm0, %1             ; R4 R0 r4 r0
    movq      mm1, %2             ; R6 R2 r6 r2
    movq      mm2, %3             ; R3 R1 r3 r1
    movq      mm4, [coeffs + 16]  ; C4 C4 C4 C4
    pmaddwd   mm4, mm0            ; C4R4+C4R0 C4r4+C4r0
    movq      mm5, [coeffs + 24]  ; -C4 C4 -C4 C4
    pmaddwd   mm0, mm5            ; -C4R4+C4R0 -C4r4+C4r0
    movq      mm5, [coeffs + 32]  ; C6 C2 C6 C2
    pmaddwd   mm5, mm1            ; C6R6+C2R2 C6r6+C2r2
    movq      mm6, [coeffs + 40]  ; -C2 C6 -C2 C6
    pmaddwd   mm1, mm6            ; -C2R6+C6R2 -C2r6+C6r2
    movq      mm6, mm4            ; C4R4+C4R0 C4r4+C4r0
    movq      mm7, [coeffs + 48]  ; C3 C1 C3 C1
    pmaddwd   mm7, mm2            ; C3R3+C1R1 C3r3+C1r1
    paddd     mm4, mm5            ; A0 a0
    psubd     mm6, mm5            ; A3 a3
    movq      mm5, mm0            ; -C4R4+C4R0 -C4r4+C4r0
    paddd     mm0, mm1            ; A1 a1
    psubd     mm5, mm1            ; A2 a2
    movq      mm1, [coeffs + 64]
    pmaddwd   mm1, mm2            ; -C7R3+C3R1 -C7r3+C3r1
    paddd     mm7, mm4            ; A0+B0 a0+b0
    paddd     mm4, mm4            ; 2A0 2a0
    psubd     mm4, mm7            ; A0-B0 a0-b0
    psrad     mm7, %6
    psrad     mm4, %6
    movq      mm3, mm0            ; A1 a1
    paddd     mm0, mm1            ; A1+B1 a1+b1
    psubd     mm3, mm1            ; A1-B1 a1-b1
    psrad     mm0, %6
    psrad     mm3, %6
    packssdw  mm7, mm7            ; A0+B0 a0+b0
    movd      [%5], mm7
    packssdw  mm0, mm0            ; A1+B1 a1+b1
    movd      [16 + %5], mm0
    packssdw  mm3, mm3            ; A1-B1 a1-b1
    movd      [96 + %5], mm3
    packssdw  mm4, mm4            ; A0-B0 a0-b0
    movd      [112 + %5], mm4
    movq      mm4, [coeffs + 80]  ; -C1 C5 -C1 C5
    pmaddwd   mm4, mm2            ; -C1R3+C5R1 -C1r3+C5r1
    pmaddwd   mm2, [coeffs + 96]  ; -C5R3+C7R1 -C5r3+C7r1
    movq      mm3, mm5            ; A2 a2
    paddd     mm3, mm4            ; A2+B2 a2+b2
    psubd     mm5, mm4            ; A2-B2 a2-b2
    psrad     mm3, %6
    psrad     mm5, %6
    movq      mm4, mm6            ; A3 a3
    paddd     mm6, mm2            ; A3+B3 a3+b3
    psubd     mm4, mm2            ; A3-B3 a3-b3
    psrad     mm6, %6
    packssdw  mm3, mm3            ; A2+B2 a2+b2
    movd      [32 + %5], mm3
    psrad     mm4, %6
    packssdw  mm6, mm6            ; A3+B3 a3+b3
    movd      [48 + %5], mm6
    packssdw  mm4, mm4            ; A3-B3 a3-b3
    packssdw  mm5, mm5            ; A2-B2 a2-b2
    movd      [64 + %5], mm4
    movd      [80 + %5], mm5
%endmacro

; IDCT8: reads only the qwords at [%1] and [8 + %1] (%1 is an address
; expression without brackets). Two packed results are produced and each is
; stored to four output slots: mm4 to 0/48/64/112 and mm0 to 16/32/80/96.
; A load of [coeffs + 32] into mm7 that was never consumed has been dropped.
%macro IDCT8 6
    movq      mm0, [%1]           ; R4 R0 r4 r0
    movq      mm4, [coeffs + 16]  ; C4 C4 C4 C4
    pmaddwd   mm4, mm0            ; C4R4+C4R0 C4r4+C4r0
    movq      mm5, [coeffs + 24]  ; -C4 C4 -C4 C4
    pmaddwd   mm0, mm5            ; -C4R4+C4R0 -C4r4+C4r0
    psrad     mm4, %6
    psrad     mm0, %6
    movq      mm2, [8 + %1]       ; R4 R0 r4 r0
    movq      mm1, [coeffs + 16]  ; C4 C4 C4 C4
    pmaddwd   mm1, mm2            ; C4R4+C4R0 C4r4+C4r0
    movq      mm7, [coeffs + 24]  ; -C4 C4 -C4 C4
    pmaddwd   mm2, mm7            ; -C4R4+C4R0 -C4r4+C4r0
    psrad     mm1, %6
    packssdw  mm4, mm1            ; A0 a0
    movq      [%5], mm4
    psrad     mm2, %6
    packssdw  mm0, mm2            ; A1 a1
    movq      [16 + %5], mm0
    movq      [96 + %5], mm0
    movq      [112 + %5], mm4
    movq      [32 + %5], mm0
    movq      [48 + %5], mm4
    movq      [64 + %5], mm4
    movq      [80 + %5], mm0
%endmacro

;-----------------------------------------------------------------------------
; IDCT
;
; Full transform of the 128-byte block at blockq, in place.
;   1. Row pass: four groups of 32 bytes are read from blockq and written to
;      the 128-byte scratch area at rsp + 0..127 (shift = ROW_SHIFT). Groups
;      at offsets 32, 64 and 96 that are entirely zero are skipped and select
;      a specialised column variant through the %%N labels.
;   2. Column pass: one of IDCT1..IDCT8 reads the scratch area and writes the
;      result back to blockq (shift = COL_SHIFT).
;
; Zero groups (block offset) -> column variant:
;   none         IDCT1        32           IDCT2 (%%4)
;   32,64        IDCT3 (%%6)  64           IDCT4 (%%2)
;   64,96        IDCT5 (%%3)  32,96        IDCT6 (%%5)
;   96           IDCT7 (%%1)  32,64,96     IDCT8 (%%7)
;
; Needs: blockq, t0d, and 128 bytes of stack at rsp (see the cglobal lines).
; Clobb: mm0-mm7, t0d, flags
;-----------------------------------------------------------------------------
%macro IDCT 0
    DC_COND_IDCT  0,   8,  16,  24, rsp +  0, null, ROW_SHIFT
    Z_COND_IDCT  32,  40,  48,  56, rsp + 32, null, ROW_SHIFT, %%4
    Z_COND_IDCT  64,  72,  80,  88, rsp + 64, null, ROW_SHIFT, %%2
    Z_COND_IDCT  96, 104, 112, 120, rsp + 96, null, ROW_SHIFT, %%1

    IDCT1 [rsp +  0], [rsp + 64], [rsp + 32], [rsp +  96], blockq +  0, COL_SHIFT
    IDCT1 [rsp +  8], [rsp + 72], [rsp + 40], [rsp + 104], blockq +  4, COL_SHIFT
    IDCT1 [rsp + 16], [rsp + 80], [rsp + 48], [rsp + 112], blockq +  8, COL_SHIFT
    IDCT1 [rsp + 24], [rsp + 88], [rsp + 56], [rsp + 120], blockq + 12, COL_SHIFT
    jmp %%9

    ALIGN 16
    %%4:
    Z_COND_IDCT  64,  72,  80,  88, rsp + 64, null, ROW_SHIFT, %%6
    Z_COND_IDCT  96, 104, 112, 120, rsp + 96, null, ROW_SHIFT, %%5

    IDCT2 [rsp +  0], [rsp + 64], [rsp + 32], [rsp +  96], blockq +  0, COL_SHIFT
    IDCT2 [rsp +  8], [rsp + 72], [rsp + 40], [rsp + 104], blockq +  4, COL_SHIFT
    IDCT2 [rsp + 16], [rsp + 80], [rsp + 48], [rsp + 112], blockq +  8, COL_SHIFT
    IDCT2 [rsp + 24], [rsp + 88], [rsp + 56], [rsp + 120], blockq + 12, COL_SHIFT
    jmp %%9

    ALIGN 16
    %%6:
    Z_COND_IDCT  96, 104, 112, 120, rsp + 96, null, ROW_SHIFT, %%7

    IDCT3 [rsp +  0], [rsp + 64], [rsp + 32], [rsp +  96], blockq +  0, COL_SHIFT
    IDCT3 [rsp +  8], [rsp + 72], [rsp + 40], [rsp + 104], blockq +  4, COL_SHIFT
    IDCT3 [rsp + 16], [rsp + 80], [rsp + 48], [rsp + 112], blockq +  8, COL_SHIFT
    IDCT3 [rsp + 24], [rsp + 88], [rsp + 56], [rsp + 120], blockq + 12, COL_SHIFT
    jmp %%9

    ALIGN 16
    %%2:
    Z_COND_IDCT  96, 104, 112, 120, rsp + 96, null, ROW_SHIFT, %%3

    IDCT4 [rsp +  0], [rsp + 64], [rsp + 32], [rsp +  96], blockq +  0, COL_SHIFT
    IDCT4 [rsp +  8], [rsp + 72], [rsp + 40], [rsp + 104], blockq +  4, COL_SHIFT
    IDCT4 [rsp + 16], [rsp + 80], [rsp + 48], [rsp + 112], blockq +  8, COL_SHIFT
    IDCT4 [rsp + 24], [rsp + 88], [rsp + 56], [rsp + 120], blockq + 12, COL_SHIFT
    jmp %%9

    ALIGN 16
    %%3:

    IDCT5 [rsp +  0], [rsp + 64], [rsp + 32], [rsp +  96], blockq +  0, COL_SHIFT
    IDCT5 [rsp +  8], [rsp + 72], [rsp + 40], [rsp + 104], blockq +  4, COL_SHIFT
    IDCT5 [rsp + 16], [rsp + 80], [rsp + 48], [rsp + 112], blockq +  8, COL_SHIFT
    IDCT5 [rsp + 24], [rsp + 88], [rsp + 56], [rsp + 120], blockq + 12, COL_SHIFT
    jmp %%9

    ALIGN 16
    %%5:

    IDCT6 rsp +  0, rsp + 64, rsp + 32, rsp +  96, blockq + 0, COL_SHIFT
    IDCT6 rsp + 16, rsp + 80, rsp + 48, rsp + 112, blockq + 8, COL_SHIFT
    jmp %%9

    ALIGN 16
    %%1:

    IDCT7 [rsp +  0], [rsp + 64], [rsp + 32], [rsp +  96], blockq +  0, COL_SHIFT
    IDCT7 [rsp +  8], [rsp + 72], [rsp + 40], [rsp + 104], blockq +  4, COL_SHIFT
    IDCT7 [rsp + 16], [rsp + 80], [rsp + 48], [rsp + 112], blockq +  8, COL_SHIFT
    IDCT7 [rsp + 24], [rsp + 88], [rsp + 56], [rsp + 120], blockq + 12, COL_SHIFT
    jmp %%9

    ALIGN 16
    %%7:

    IDCT8 rsp +  0, rsp + 64, rsp + 32, rsp +  96, blockq + 0, COL_SHIFT
    IDCT8 rsp + 16, rsp + 80, rsp + 48, rsp + 112, blockq + 8, COL_SHIFT

    %%9:
%endmacro

;-----------------------------------------------------------------------------
; PUT_PIXELS_CLAMPED_HALF block_off
;
; Packs 64 bytes of words starting at blockq + %1 to unsigned bytes with
; saturation (packuswb) and stores them as four 8-byte rows at pixelsq,
; pixelsq + lsizeq, pixelsq + 2*lsizeq and pixelsq + lsize3q.
; Needs: lsize3q already set by the caller (3 * lsizeq in this file).
; NOTE(review): mova and the packuswb memory operands presumably require
;               blockq to be mmsize-aligned - confirm against callers.
; Clobb: m0, m1 (plus m2, m3 when mmsize == 8)
;-----------------------------------------------------------------------------
%macro PUT_PIXELS_CLAMPED_HALF 1
    mova     m0, [blockq+mmsize*0+%1]
    mova     m1, [blockq+mmsize*2+%1]
%if mmsize == 8
    mova     m2, [blockq+mmsize*4+%1]
    mova     m3, [blockq+mmsize*6+%1]
%endif
    packuswb m0, [blockq+mmsize*1+%1]
    packuswb m1, [blockq+mmsize*3+%1]
%if mmsize == 8
    packuswb m2, [blockq+mmsize*5+%1]
    packuswb m3, [blockq+mmsize*7+%1]
    movq     [pixelsq], m0
    movq     [lsizeq+pixelsq], m1
    movq     [2*lsizeq+pixelsq], m2
    movq     [lsize3q+pixelsq], m3
%else
    movq     [pixelsq], m0              ; low half  = first row
    movhps   [lsizeq+pixelsq], m0       ; high half = second row
    movq     [2*lsizeq+pixelsq], m1
    movhps   [lsize3q+pixelsq], m1
%endif
%endmacro

;-----------------------------------------------------------------------------
; ADD_PIXELS_CLAMPED block_off
;
; Adds the words at blockq + %1 to two 8-byte pixel rows (pixelsq and
; pixelsq + lsizeq): pixels are widened to words by unpacking against m4,
; summed with signed saturation (paddsw), packed back with unsigned
; saturation (packuswb) and stored to the same two rows.
; Needs: m4 == 0 (the caller in this file clears it with pxor beforehand).
; Clobb: m0-m3 (plus m5-m7 when mmsize == 8)
;-----------------------------------------------------------------------------
%macro ADD_PIXELS_CLAMPED 1
    mova      m0, [blockq+mmsize*0+%1]
    mova      m1, [blockq+mmsize*1+%1]
%if mmsize == 8
    mova      m5, [blockq+mmsize*2+%1]
    mova      m6, [blockq+mmsize*3+%1]
%endif
    movq      m2, [pixelsq]
    movq      m3, [pixelsq+lsizeq]
%if mmsize == 8
    mova      m7, m2
    punpcklbw m2, m4
    punpckhbw m7, m4
    paddsw    m0, m2
    paddsw    m1, m7
    mova      m7, m3
    punpcklbw m3, m4
    punpckhbw m7, m4
    paddsw    m5, m3
    paddsw    m6, m7
%else
    punpcklbw m2, m4
    punpcklbw m3, m4
    paddsw    m0, m2
    paddsw    m1, m3
%endif
    packuswb  m0, m1
%if mmsize == 8
    packuswb  m5, m6
    movq      [pixelsq], m0
    movq      [pixelsq+lsizeq], m5
%else
    movq      [pixelsq], m0
    movhps    [pixelsq+lsizeq], m0
%endif
%endmacro

; NOTE(review): all three entry points below run MMX code through IDCT and
; return without emms; presumably callers are responsible for clearing the
; MMX state - confirm against the calling convention used by the project.

INIT_MMX mmx

; simple_idct(block): in-place transform of the 128-byte block.
; cglobal arguments: 1 argument, 2 GPRs (block, t0), 8 vector registers,
; 128 bytes of stack scratch used by the IDCT macro.
cglobal simple_idct, 1, 2, 8, 128, block, t0
    IDCT
RET

INIT_XMM sse2

; simple_idct_put(pixels, lsize, block): transform block in place, then
; store it as eight clamped 8-byte rows at pixels with row stride lsize.
cglobal simple_idct_put, 3, 5, 8, 128, pixels, lsize, block, lsize3, t0
    IDCT
    lea lsize3q, [lsizeq*3]
    PUT_PIXELS_CLAMPED_HALF 0
    lea pixelsq, [pixelsq+lsizeq*4]
    PUT_PIXELS_CLAMPED_HALF 64
RET

; simple_idct_add(pixels, lsize, block): transform block in place, then add
; it with clamping to the eight 8-byte rows at pixels (row stride lsize).
cglobal simple_idct_add, 3, 4, 8, 128, pixels, lsize, block, t0
    IDCT
    pxor m4, m4                         ; zero register for ADD_PIXELS_CLAMPED
    ADD_PIXELS_CLAMPED 0
    lea pixelsq, [pixelsq+lsizeq*2]
    ADD_PIXELS_CLAMPED 32
    lea pixelsq, [pixelsq+lsizeq*2]
    ADD_PIXELS_CLAMPED 64
    lea pixelsq, [pixelsq+lsizeq*2]
    ADD_PIXELS_CLAMPED 96
RET
%endif