1;****************************************************************************** 2;* VP9 inverse transform x86 SIMD optimizations 3;* 4;* Copyright (C) 2015 Ronald S. Bultje <rsbultje gmail com> 5;* 6;* This file is part of FFmpeg. 7;* 8;* FFmpeg is free software; you can redistribute it and/or 9;* modify it under the terms of the GNU Lesser General Public 10;* License as published by the Free Software Foundation; either 11;* version 2.1 of the License, or (at your option) any later version. 12;* 13;* FFmpeg is distributed in the hope that it will be useful, 14;* but WITHOUT ANY WARRANTY; without even the implied warranty of 15;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 16;* Lesser General Public License for more details. 17;* 18;* You should have received a copy of the GNU Lesser General Public 19;* License along with FFmpeg; if not, write to the Free Software 20;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 21;****************************************************************************** 22 23%include "libavutil/x86/x86util.asm" 24%include "vp9itxfm_template.asm" 25 26SECTION_RODATA 27 28cextern pw_8 29cextern pw_1023 30cextern pw_2048 31cextern pw_4095 32cextern pw_m1 33cextern pd_1 34cextern pd_16 35cextern pd_32 36cextern pd_8192 37 38pd_8: times 4 dd 8 39pd_3fff: times 4 dd 0x3fff 40 41cextern pw_11585x2 42 43cextern pw_5283_13377 44cextern pw_9929_13377 45cextern pw_15212_m13377 46cextern pw_15212_9929 47cextern pw_m5283_m15212 48cextern pw_13377x2 49cextern pw_m13377_13377 50cextern pw_13377_0 51 52pw_9929_m5283: times 4 dw 9929, -5283 53 54%macro COEF_PAIR 2-3 55cextern pw_m%1_%2 56cextern pw_%2_%1 57%if %0 == 3 58cextern pw_m%1_m%2 59%if %1 != %2 60cextern pw_m%2_%1 61cextern pw_%1_%2 62%endif 63%endif 64%endmacro 65 66COEF_PAIR 2404, 16207 67COEF_PAIR 3196, 16069, 1 68COEF_PAIR 4756, 15679 69COEF_PAIR 5520, 15426 70COEF_PAIR 6270, 15137, 1 71COEF_PAIR 8423, 14053 72COEF_PAIR 10394, 12665 73COEF_PAIR 11003, 12140 74COEF_PAIR 11585, 11585, 1 75COEF_PAIR 13160, 9760 76COEF_PAIR 13623, 9102, 1 77COEF_PAIR 14449, 7723 78COEF_PAIR 14811, 7005 79COEF_PAIR 15893, 3981 80COEF_PAIR 16305, 1606 81COEF_PAIR 16364, 804 82 83default_8x8: 84times 12 db 1 85times 52 db 2 86row_8x8: 87times 18 db 1 88times 46 db 2 89col_8x8: 90times 6 db 1 91times 58 db 2 92default_16x16: 93times 10 db 1 94times 28 db 2 95times 51 db 3 96times 167 db 4 97row_16x16: 98times 21 db 1 99times 45 db 2 100times 60 db 3 101times 130 db 4 102col_16x16: 103times 5 db 1 104times 12 db 2 105times 25 db 3 106times 214 db 4 107default_32x32: 108times 9 db 1 109times 25 db 2 110times 36 db 3 111times 65 db 4 112times 105 db 5 113times 96 db 6 114times 112 db 7 115times 576 db 8 116 117SECTION .text 118 119%macro VP9_STORE_2X 6-7 dstq ; reg1, reg2, tmp1, tmp2, min, max, dst 120 mova m%3, [%7] 121 mova m%4, [%7+strideq] 122 paddw m%3, m%1 123 paddw m%4, m%2 124 pmaxsw m%3, m%5 125 pmaxsw m%4, m%5 126 pminsw m%3, m%6 127 pminsw m%4, m%6 128 mova [%7], m%3 129 mova [%7+strideq], m%4 130%endmacro 131 132%macro ZERO_BLOCK 4 ; mem, stride, nnzcpl, zero_reg 133%assign %%y 0 134%rep %3 135%assign %%x 0 136%rep %3*4/mmsize 137 mova [%1+%%y+%%x], %4 138%assign %%x (%%x+mmsize) 139%endrep 140%assign %%y (%%y+%2) 141%endrep 142%endmacro 143 144; the input coefficients are scaled up by 2 bit (which we downscale immediately 145; in the iwht), and is otherwise orthonormally increased by 1 bit per iwht_1d. 
146; therefore, a diff of 10-12+sign bit will fit in 12-14+sign bit after scaling, 147; i.e. everything can be done in 15+1bpp words. Since the quant fractional bits 148; add 2 bits, we need to scale before converting to word in 12bpp, since the 149; input will be 16+sign bit which doesn't fit in 15+sign words, but in 10bpp 150; we can scale after converting to words (which is half the instructions), 151; since the input is only 14+sign bit, which fits in 15+sign words directly. 152 153%macro IWHT4_FN 2 ; bpp, max 154cglobal vp9_iwht_iwht_4x4_add_%1, 3, 3, 8, dst, stride, block, eob 155 mova m7, [pw_%2] 156 mova m0, [blockq+0*16+0] 157 mova m1, [blockq+1*16+0] 158%if %1 >= 12 159 mova m4, [blockq+0*16+8] 160 mova m5, [blockq+1*16+8] 161 psrad m0, 2 162 psrad m1, 2 163 psrad m4, 2 164 psrad m5, 2 165 packssdw m0, m4 166 packssdw m1, m5 167%else 168 packssdw m0, [blockq+0*16+8] 169 packssdw m1, [blockq+1*16+8] 170 psraw m0, 2 171 psraw m1, 2 172%endif 173 mova m2, [blockq+2*16+0] 174 mova m3, [blockq+3*16+0] 175%if %1 >= 12 176 mova m4, [blockq+2*16+8] 177 mova m5, [blockq+3*16+8] 178 psrad m2, 2 179 psrad m3, 2 180 psrad m4, 2 181 psrad m5, 2 182 packssdw m2, m4 183 packssdw m3, m5 184%else 185 packssdw m2, [blockq+2*16+8] 186 packssdw m3, [blockq+3*16+8] 187 psraw m2, 2 188 psraw m3, 2 189%endif 190 191 VP9_IWHT4_1D 192 TRANSPOSE4x4W 0, 1, 2, 3, 4 193 VP9_IWHT4_1D 194 195 pxor m6, m6 196 VP9_STORE_2X 0, 1, 4, 5, 6, 7 197 lea dstq, [dstq+strideq*2] 198 VP9_STORE_2X 2, 3, 4, 5, 6, 7 199 ZERO_BLOCK blockq, 16, 4, m6 200 RET 201%endmacro 202 203INIT_MMX mmxext 204IWHT4_FN 10, 1023 205INIT_MMX mmxext 206IWHT4_FN 12, 4095 207 208%macro VP9_IDCT4_WRITEOUT 0 209%if cpuflag(ssse3) 210 mova m5, [pw_2048] 211 pmulhrsw m0, m5 212 pmulhrsw m1, m5 213 pmulhrsw m2, m5 214 pmulhrsw m3, m5 215%else 216 mova m5, [pw_8] 217 paddw m0, m5 218 paddw m1, m5 219 paddw m2, m5 220 paddw m3, m5 221 psraw m0, 4 222 psraw m1, 4 223 psraw m2, 4 224 psraw m3, 4 225%endif 226 mova m5, [pw_1023] 227 VP9_STORE_2X 0, 1, 6, 7, 4, 5 228 lea dstq, [dstq+2*strideq] 229 VP9_STORE_2X 2, 3, 6, 7, 4, 5 230%endmacro 231 232%macro DC_ONLY 2 ; shift, zero 233 mov coefd, dword [blockq] 234 movd [blockq], %2 235 imul coefd, 11585 236 add coefd, 8192 237 sar coefd, 14 238 imul coefd, 11585 239 add coefd, ((1 << (%1 - 1)) << 14) + 8192 240 sar coefd, 14 + %1 241%endmacro 242 243; 4x4 coefficients are 5+depth+sign bits, so for 10bpp, everything still fits 244; in 15+1 words without additional effort, since the coefficients are 15bpp. 
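; For reference, the dc-only shortcut implemented by DC_ONLY above corresponds,
; for the 4x4 case, roughly to the C sketch below. This is an illustrative
; sketch only (the function name, the in-pixels stride and the shift/max
; parameters are made up here) and is not part of the build; it assumes
; arithmetic right shifts on negative values, matching the sar-based asm.
;
;   #include <stddef.h>
;   #include <stdint.h>
;   // shift = 4 for 4x4 (DC_ONLY's %1), max = (1 << bpp) - 1
;   static void idct_dc_only_add_c(uint16_t *dst, ptrdiff_t stride,
;                                  int32_t *block, int shift, int max)
;   {
;       int dc = block[0];
;       block[0] = 0;
;       dc = (dc * 11585 + 8192) >> 14;                       // 1st 1-D DC gain
;       dc = (dc * 11585 + ((1 << (shift - 1)) << 14) + 8192) >> (14 + shift);
;       for (int y = 0; y < 4; y++)                           // add and clip
;           for (int x = 0; x < 4; x++) {
;               int px = dst[y * stride + x] + dc;
;               dst[y * stride + x] = px < 0 ? 0 : px > max ? max : px;
;           }
;   }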
245 246%macro IDCT4_10_FN 0 247cglobal vp9_idct_idct_4x4_add_10, 4, 4, 8, dst, stride, block, eob 248 cmp eobd, 1 249 jg .idctfull 250 251 ; dc-only 252 pxor m4, m4 253%if cpuflag(ssse3) 254 movd m0, [blockq] 255 movd [blockq], m4 256 mova m5, [pw_11585x2] 257 pmulhrsw m0, m5 258 pmulhrsw m0, m5 259%else 260 DEFINE_ARGS dst, stride, block, coef 261 DC_ONLY 4, m4 262 movd m0, coefd 263%endif 264 pshufw m0, m0, 0 265 mova m5, [pw_1023] 266%if cpuflag(ssse3) 267 pmulhrsw m0, [pw_2048] ; (x*2048 + (1<<14))>>15 <=> (x+8)>>4 268%endif 269 VP9_STORE_2X 0, 0, 6, 7, 4, 5 270 lea dstq, [dstq+2*strideq] 271 VP9_STORE_2X 0, 0, 6, 7, 4, 5 272 RET 273 274.idctfull: 275 mova m0, [blockq+0*16+0] 276 mova m1, [blockq+1*16+0] 277 packssdw m0, [blockq+0*16+8] 278 packssdw m1, [blockq+1*16+8] 279 mova m2, [blockq+2*16+0] 280 mova m3, [blockq+3*16+0] 281 packssdw m2, [blockq+2*16+8] 282 packssdw m3, [blockq+3*16+8] 283 284%if cpuflag(ssse3) 285 mova m6, [pw_11585x2] 286%endif 287 mova m7, [pd_8192] ; rounding 288 VP9_IDCT4_1D 289 TRANSPOSE4x4W 0, 1, 2, 3, 4 290 VP9_IDCT4_1D 291 292 pxor m4, m4 293 ZERO_BLOCK blockq, 16, 4, m4 294 VP9_IDCT4_WRITEOUT 295 RET 296%endmacro 297 298INIT_MMX mmxext 299IDCT4_10_FN 300INIT_MMX ssse3 301IDCT4_10_FN 302 303%macro IADST4_FN 4 304cglobal vp9_%1_%3_4x4_add_10, 3, 3, 0, dst, stride, block, eob 305%if WIN64 && notcpuflag(ssse3) 306 WIN64_SPILL_XMM 8 307%endif 308 movdqa xmm5, [pd_8192] 309 mova m0, [blockq+0*16+0] 310 mova m1, [blockq+1*16+0] 311 packssdw m0, [blockq+0*16+8] 312 packssdw m1, [blockq+1*16+8] 313 mova m2, [blockq+2*16+0] 314 mova m3, [blockq+3*16+0] 315 packssdw m2, [blockq+2*16+8] 316 packssdw m3, [blockq+3*16+8] 317 318%if cpuflag(ssse3) 319 mova m6, [pw_11585x2] 320%endif 321%ifnidn %1%3, iadstiadst 322 movdq2q m7, xmm5 323%endif 324 VP9_%2_1D 325 TRANSPOSE4x4W 0, 1, 2, 3, 4 326 VP9_%4_1D 327 328 pxor m4, m4 329 ZERO_BLOCK blockq, 16, 4, m4 330 VP9_IDCT4_WRITEOUT 331 RET 332%endmacro 333 334INIT_MMX sse2 335IADST4_FN idct, IDCT4, iadst, IADST4 336IADST4_FN iadst, IADST4, idct, IDCT4 337IADST4_FN iadst, IADST4, iadst, IADST4 338 339INIT_MMX ssse3 340IADST4_FN idct, IDCT4, iadst, IADST4 341IADST4_FN iadst, IADST4, idct, IDCT4 342IADST4_FN iadst, IADST4, iadst, IADST4 343 344; inputs and outputs are dwords, coefficients are words 345; 346; dst1 = src1 * coef1 + src2 * coef2 + rnd >> 14 347; dst2 = src1 * coef2 - src2 * coef1 + rnd >> 14 348%macro SUMSUB_MUL 6-8 [pd_8192], [pd_3fff] ; src/dst 1-2, tmp1-2, coef1-2, rnd, mask 349 pand m%3, m%1, %8 350 pand m%4, m%2, %8 351 psrad m%1, 14 352 psrad m%2, 14 353 packssdw m%4, m%2 354 packssdw m%3, m%1 355 punpckhwd m%2, m%4, m%3 356 punpcklwd m%4, m%3 357 pmaddwd m%3, m%4, [pw_%6_%5] 358 pmaddwd m%1, m%2, [pw_%6_%5] 359 pmaddwd m%4, [pw_m%5_%6] 360 pmaddwd m%2, [pw_m%5_%6] 361 paddd m%3, %7 362 paddd m%4, %7 363 psrad m%3, 14 364 psrad m%4, 14 365 paddd m%1, m%3 366 paddd m%2, m%4 367%endmacro 368 369%macro IDCT4_12BPP_1D 0-8 [pd_8192], [pd_3fff], 0, 1, 2, 3, 4, 5 ; rnd, mask, in/out0-3, tmp0-1 370 SUMSUB_MUL %3, %5, %7, %8, 11585, 11585, %1, %2 371 SUMSUB_MUL %4, %6, %7, %8, 15137, 6270, %1, %2 372 SUMSUB_BA d, %4, %3, %7 373 SUMSUB_BA d, %6, %5, %7 374 SWAP %4, %6, %3 375%endmacro 376 377%macro STORE_4x4 6 ; tmp1-2, reg1-2, min, max 378 movh m%1, [dstq+strideq*0] 379 movh m%2, [dstq+strideq*2] 380 movhps m%1, [dstq+strideq*1] 381 movhps m%2, [dstq+stride3q ] 382 paddw m%1, m%3 383 paddw m%2, m%4 384 pmaxsw m%1, %5 385 pmaxsw m%2, %5 386 pminsw m%1, %6 387 pminsw m%2, %6 388 movh [dstq+strideq*0], m%1 389 movhps 
[dstq+strideq*1], m%1 390 movh [dstq+strideq*2], m%2 391 movhps [dstq+stride3q ], m%2 392 %endmacro 393 394 %macro ROUND_AND_STORE_4x4 8 ; reg1-4, min, max, rnd, shift 395 paddd m%1, %7 396 paddd m%2, %7 397 paddd m%3, %7 398 paddd m%4, %7 399 psrad m%1, %8 400 psrad m%2, %8 401 psrad m%3, %8 402 psrad m%4, %8 403 packssdw m%1, m%2 404 packssdw m%3, m%4 405 STORE_4x4 %2, %4, %1, %3, %5, %6 406 %endmacro 407 408 INIT_XMM sse2 409 cglobal vp9_idct_idct_4x4_add_12, 4, 4, 8, dst, stride, block, eob 410 cmp eobd, 1 411 jg .idctfull 412 413 ; dc-only - this is special, since for 4x4 12bpp, the max coef size is 414 ; 17+sign bpp. Since the multiply is with 11585, which is 14bpp, the 415 ; result of each multiply is 31+sign bit, i.e. it _exactly_ fits in a 416 ; dword. After the final shift (4), the result is 13+sign bits, so we 417 ; don't need any additional processing to fit it in a word 418 DEFINE_ARGS dst, stride, block, coef 419 pxor m4, m4 420 DC_ONLY 4, m4 421 movd m0, coefd 422 pshuflw m0, m0, q0000 423 punpcklqdq m0, m0 424 mova m5, [pw_4095] 425 DEFINE_ARGS dst, stride, stride3 426 lea stride3q, [strideq*3] 427 STORE_4x4 1, 3, 0, 0, m4, m5 428 RET 429 430 .idctfull: 431 DEFINE_ARGS dst, stride, block, eob 432 mova m0, [blockq+0*16] 433 mova m1, [blockq+1*16] 434 mova m2, [blockq+2*16] 435 mova m3, [blockq+3*16] 436 mova m6, [pd_8192] 437 mova m7, [pd_3fff] 438 439 IDCT4_12BPP_1D m6, m7 440 TRANSPOSE4x4D 0, 1, 2, 3, 4 441 IDCT4_12BPP_1D m6, m7 442 443 pxor m4, m4 444 ZERO_BLOCK blockq, 16, 4, m4 445 446 ; writeout 447 DEFINE_ARGS dst, stride, stride3 448 lea stride3q, [strideq*3] 449 mova m5, [pw_4095] 450 mova m6, [pd_8] 451 ROUND_AND_STORE_4x4 0, 1, 2, 3, m4, m5, m6, 4 452 RET 453 454 %macro SCRATCH 3-4 455 %if ARCH_X86_64 456 SWAP %1, %2 457 %if %0 == 4 458 %define reg_%4 m%2 459 %endif 460 %else 461 mova [%3], m%1 462 %if %0 == 4 463 %define reg_%4 [%3] 464 %endif 465 %endif 466 %endmacro 467 468 %macro UNSCRATCH 3-4 469 %if ARCH_X86_64 470 SWAP %1, %2 471 %else 472 mova m%1, [%3] 473 %endif 474 %if %0 == 4 475 %undef reg_%4 476 %endif 477 %endmacro 478 479 %macro PRELOAD 2-3 480 %if ARCH_X86_64 481 mova m%1, [%2] 482 %if %0 == 3 483 %define reg_%3 m%1 484 %endif 485 %elif %0 == 3 486 %define reg_%3 [%2] 487 %endif 488 %endmacro 489 490 ; out0 = 5283 * in0 + 13377 * in1 + 15212 * in2 + 9929 * in3 + rnd >> 14 491 ; out1 = 9929 * in0 + 13377 * in1 - 5283 * in2 - 15212 * in3 + rnd >> 14 492 ; out2 = 13377 * in0 - 13377 * in2 + 13377 * in3 + rnd >> 14 493 ; out3 = 15212 * in0 - 13377 * in1 + 9929 * in2 - 5283 * in3 + rnd >> 14 494 %macro IADST4_12BPP_1D 0-2 [pd_8192], [pd_3fff] ; rnd, mask 495 pand m4, m0, %2 496 pand m5, m1, %2 497 psrad m0, 14 498 psrad m1, 14 499 packssdw m5, m1 500 packssdw m4, m0 501 punpckhwd m1, m4, m5 502 punpcklwd m4, m5 503 pand m5, m2, %2 504 pand m6, m3, %2 505 psrad m2, 14 506 psrad m3, 14 507 packssdw m6, m3 508 packssdw m5, m2 509 punpckhwd m3, m5, m6 510 punpcklwd m5, m6 511 SCRATCH 1, 8, rsp+0*mmsize, a 512 SCRATCH 5, 9, rsp+1*mmsize, b 513 514 ; m1/3 have the high bits of 0,1,2,3 515 ; m4/5 have the low bits of 0,1,2,3 516 ; m0/2/6/7 are free 517 518 mova m2, [pw_15212_9929] 519 mova m0, [pw_5283_13377] 520 pmaddwd m7, m2, reg_b 521 pmaddwd m6, m4, m0 522 pmaddwd m2, m3 523 pmaddwd m0, reg_a 524 paddd m6, m7 525 paddd m0, m2 526 mova m1, [pw_m13377_13377] 527 mova m5, [pw_13377_0] 528 pmaddwd m7, m1, reg_b 529 pmaddwd m2, m4, m5 530 pmaddwd m1, m3 531 pmaddwd m5, reg_a 532 paddd m2, m7 533 paddd m1, m5 534 paddd m6, %1 535 paddd m2, %1 536 psrad m6, 14 537 psrad m2, 14 538 paddd m0, m6 ; t0
539 paddd m2, m1 ; t2 540 541 mova m7, [pw_m5283_m15212] 542 mova m5, [pw_9929_13377] 543 pmaddwd m1, m7, reg_b 544 pmaddwd m6, m4, m5 545 pmaddwd m7, m3 546 pmaddwd m5, reg_a 547 paddd m6, m1 548 paddd m7, m5 549 UNSCRATCH 5, 9, rsp+1*mmsize, b 550 pmaddwd m5, [pw_9929_m5283] 551 pmaddwd m4, [pw_15212_m13377] 552 pmaddwd m3, [pw_9929_m5283] 553 UNSCRATCH 1, 8, rsp+0*mmsize, a 554 pmaddwd m1, [pw_15212_m13377] 555 paddd m4, m5 556 paddd m3, m1 557 paddd m6, %1 558 paddd m4, %1 559 psrad m6, 14 560 psrad m4, 14 561 paddd m7, m6 ; t1 562 paddd m3, m4 ; t3 563 564 SWAP 1, 7 565%endmacro 566 567%macro IADST4_12BPP_FN 4 568cglobal vp9_%1_%3_4x4_add_12, 3, 3, 12, 2 * ARCH_X86_32 * mmsize, dst, stride, block, eob 569 mova m0, [blockq+0*16] 570 mova m1, [blockq+1*16] 571 mova m2, [blockq+2*16] 572 mova m3, [blockq+3*16] 573 574 PRELOAD 10, pd_8192, rnd 575 PRELOAD 11, pd_3fff, mask 576 %2_12BPP_1D reg_rnd, reg_mask 577 TRANSPOSE4x4D 0, 1, 2, 3, 4 578 %4_12BPP_1D reg_rnd, reg_mask 579 580 pxor m4, m4 581 ZERO_BLOCK blockq, 16, 4, m4 582 583 ; writeout 584 DEFINE_ARGS dst, stride, stride3 585 lea stride3q, [strideq*3] 586 mova m5, [pw_4095] 587 mova m6, [pd_8] 588 ROUND_AND_STORE_4x4 0, 1, 2, 3, m4, m5, m6, 4 589 RET 590%endmacro 591 592INIT_XMM sse2 593IADST4_12BPP_FN idct, IDCT4, iadst, IADST4 594IADST4_12BPP_FN iadst, IADST4, idct, IDCT4 595IADST4_12BPP_FN iadst, IADST4, iadst, IADST4 596 597; the following line has not been executed at the end of this macro: 598; UNSCRATCH 6, 8, rsp+%3*mmsize 599%macro IDCT8_1D 1-5 [pd_8192], [pd_3fff], 2 * mmsize, 17 ; src, rnd, mask, src_stride, stack_offset 600 mova m0, [%1+0*%4] 601 mova m2, [%1+2*%4] 602 mova m4, [%1+4*%4] 603 mova m6, [%1+6*%4] 604 IDCT4_12BPP_1D %2, %3, 0, 2, 4, 6, 1, 3 ; m0/2/4/6 have t0/1/2/3 605 SCRATCH 4, 8, rsp+(%5+0)*mmsize 606 SCRATCH 6, 9, rsp+(%5+1)*mmsize 607 mova m1, [%1+1*%4] 608 mova m3, [%1+3*%4] 609 mova m5, [%1+5*%4] 610 mova m7, [%1+7*%4] 611 SUMSUB_MUL 1, 7, 4, 6, 16069, 3196, %2, %3 ; m1=t7a, m7=t4a 612 SUMSUB_MUL 5, 3, 4, 6, 9102, 13623, %2, %3 ; m5=t6a, m3=t5a 613 SUMSUB_BA d, 3, 7, 4 ; m3=t4, m7=t5a 614 SUMSUB_BA d, 5, 1, 4 ; m5=t7, m1=t6a 615 SUMSUB_MUL 1, 7, 4, 6, 11585, 11585, %2, %3 ; m1=t6, m7=t5 616 SUMSUB_BA d, 5, 0, 4 ; m5=out0, m0=out7 617 SUMSUB_BA d, 1, 2, 4 ; m1=out1, m2=out6 618 UNSCRATCH 4, 8, rsp+(%5+0)*mmsize 619 UNSCRATCH 6, 9, rsp+(%5+1)*mmsize 620 SCRATCH 2, 8, rsp+(%5+0)*mmsize 621 SUMSUB_BA d, 7, 4, 2 ; m7=out2, m4=out5 622 SUMSUB_BA d, 3, 6, 2 ; m3=out3, m6=out4 623 SWAP 0, 5, 4, 6, 2, 7 624%endmacro 625 626%macro STORE_2x8 5-7 dstq, strideq ; tmp1-2, reg, min, max 627 mova m%1, [%6+%7*0] 628 mova m%2, [%6+%7*1] 629 paddw m%1, m%3 630 paddw m%2, m%3 631 pmaxsw m%1, %4 632 pmaxsw m%2, %4 633 pminsw m%1, %5 634 pminsw m%2, %5 635 mova [%6+%7*0], m%1 636 mova [%6+%7*1], m%2 637%endmacro 638 639; FIXME we can use the intermediate storage (rsp[0-15]) on x86-32 for temp 640; storage also instead of allocating two more stack spaces. This doesn't 641; matter much but it's something... 
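; The SUMSUB_MUL butterfly used throughout the 1-D transforms here computes,
; per 32-bit lane, roughly the following C (an illustrative sketch only, with
; a made-up helper name; 64-bit intermediates stand in for the 14-bit split
; that the macro actually uses):
;
;   #include <stdint.h>
;   static void sumsub_mul_c(int32_t *a, int32_t *b, int c1, int c2)
;   {
;       int64_t t1 = (int64_t)*a * c1 + (int64_t)*b * c2;
;       int64_t t2 = (int64_t)*a * c2 - (int64_t)*b * c1;
;       *a = (int32_t)((t1 + 8192) >> 14);
;       *b = (int32_t)((t2 + 8192) >> 14);
;   }
;
; Since SSE2 has no widening 32x16 multiply, the macro splits every input into
; hi = x >> 14 and lo = x & 0x3fff, forms word pairs, and uses pmaddwd; the
; result is then reassembled as hi-products + ((lo-products + 8192) >> 14),
; which matches the wide computation exactly.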
642INIT_XMM sse2 643cglobal vp9_idct_idct_8x8_add_10, 4, 6 + ARCH_X86_64, 14, \ 644 16 * mmsize + 3 * ARCH_X86_32 * mmsize, \ 645 dst, stride, block, eob 646 mova m0, [pw_1023] 647 cmp eobd, 1 648 jg .idctfull 649 650 ; dc-only - the 10bit version can be done entirely in 32bit, since the max 651 ; coef values are 16+sign bit, and the coef is 14bit, so 30+sign easily 652 ; fits in 32bit 653 DEFINE_ARGS dst, stride, block, coef 654 pxor m2, m2 655 DC_ONLY 5, m2 656 movd m1, coefd 657 pshuflw m1, m1, q0000 658 punpcklqdq m1, m1 659 DEFINE_ARGS dst, stride, cnt 660 mov cntd, 4 661.loop_dc: 662 STORE_2x8 3, 4, 1, m2, m0 663 lea dstq, [dstq+strideq*2] 664 dec cntd 665 jg .loop_dc 666 RET 667 668.idctfull: 669 SCRATCH 0, 12, rsp+16*mmsize, max 670 DEFINE_ARGS dst, stride, block, cnt, ptr, skip, dstbak 671%if ARCH_X86_64 672 mov dstbakq, dstq 673 movsxd cntq, cntd 674%endif 675%ifdef PIC 676 lea ptrq, [default_8x8] 677 movzx cntd, byte [ptrq+cntq-1] 678%else 679 movzx cntd, byte [default_8x8+cntq-1] 680%endif 681 mov skipd, 2 682 sub skipd, cntd 683 mov ptrq, rsp 684 PRELOAD 10, pd_8192, rnd 685 PRELOAD 11, pd_3fff, mask 686 PRELOAD 13, pd_16, srnd 687.loop_1: 688 IDCT8_1D blockq, reg_rnd, reg_mask 689 690 TRANSPOSE4x4D 0, 1, 2, 3, 6 691 mova [ptrq+ 0*mmsize], m0 692 mova [ptrq+ 2*mmsize], m1 693 mova [ptrq+ 4*mmsize], m2 694 mova [ptrq+ 6*mmsize], m3 695 UNSCRATCH 6, 8, rsp+17*mmsize 696 TRANSPOSE4x4D 4, 5, 6, 7, 0 697 mova [ptrq+ 1*mmsize], m4 698 mova [ptrq+ 3*mmsize], m5 699 mova [ptrq+ 5*mmsize], m6 700 mova [ptrq+ 7*mmsize], m7 701 add ptrq, 8 * mmsize 702 add blockq, mmsize 703 dec cntd 704 jg .loop_1 705 706 ; zero-pad the remainder (skipped cols) 707 test skipd, skipd 708 jz .end 709 add skipd, skipd 710 lea blockq, [blockq+skipq*(mmsize/2)] 711 pxor m0, m0 712.loop_z: 713 mova [ptrq+mmsize*0], m0 714 mova [ptrq+mmsize*1], m0 715 mova [ptrq+mmsize*2], m0 716 mova [ptrq+mmsize*3], m0 717 add ptrq, 4 * mmsize 718 dec skipd 719 jg .loop_z 720.end: 721 722 DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak 723 lea stride3q, [strideq*3] 724 mov cntd, 2 725 mov ptrq, rsp 726.loop_2: 727 IDCT8_1D ptrq, reg_rnd, reg_mask 728 729 pxor m6, m6 730 ROUND_AND_STORE_4x4 0, 1, 2, 3, m6, reg_max, reg_srnd, 5 731 lea dstq, [dstq+strideq*4] 732 UNSCRATCH 0, 8, rsp+17*mmsize 733 UNSCRATCH 1, 12, rsp+16*mmsize, max 734 UNSCRATCH 2, 13, pd_16, srnd 735 ROUND_AND_STORE_4x4 4, 5, 0, 7, m6, m1, m2, 5 736 add ptrq, 16 737%if ARCH_X86_64 738 lea dstq, [dstbakq+8] 739%else 740 mov dstq, dstm 741 add dstq, 8 742%endif 743 dec cntd 744 jg .loop_2 745 746 ; m6 is still zero 747 ZERO_BLOCK blockq-2*mmsize, 32, 8, m6 748 RET 749 750%macro DC_ONLY_64BIT 2 ; shift, zero 751%if ARCH_X86_64 752 movsxd coefq, dword [blockq] 753 movd [blockq], %2 754 imul coefq, 11585 755 add coefq, 8192 756 sar coefq, 14 757 imul coefq, 11585 758 add coefq, ((1 << (%1 - 1)) << 14) + 8192 759 sar coefq, 14 + %1 760%else 761 mov coefd, dword [blockq] 762 movd [blockq], %2 763 DEFINE_ARGS dst, stride, cnt, coef, coefl 764 mov cntd, 2 765.loop_dc_calc: 766 mov coefld, coefd 767 sar coefd, 14 768 and coefld, 0x3fff 769 imul coefd, 11585 770 imul coefld, 11585 771 add coefld, 8192 772 sar coefld, 14 773 add coefd, coefld 774 dec cntd 775 jg .loop_dc_calc 776 add coefd, 1 << (%1 - 1) 777 sar coefd, %1 778%endif 779%endmacro 780 781INIT_XMM sse2 782cglobal vp9_idct_idct_8x8_add_12, 4, 6 + ARCH_X86_64, 14, \ 783 16 * mmsize + 3 * ARCH_X86_32 * mmsize, \ 784 dst, stride, block, eob 785 mova m0, [pw_4095] 786 cmp eobd, 1 787 jg 
mangle(private_prefix %+ _ %+ vp9_idct_idct_8x8_add_10 %+ SUFFIX).idctfull 788 789 ; dc-only - unfortunately, this one can overflow, since coefs are 18+sign 790 ; bpp, and 18+14+sign does not fit in 32bit, so we do 2-stage multiplies 791 DEFINE_ARGS dst, stride, block, coef, coefl 792 pxor m2, m2 793 DC_ONLY_64BIT 5, m2 794 movd m1, coefd 795 pshuflw m1, m1, q0000 796 punpcklqdq m1, m1 797 DEFINE_ARGS dst, stride, cnt 798 mov cntd, 4 799.loop_dc: 800 STORE_2x8 3, 4, 1, m2, m0 801 lea dstq, [dstq+strideq*2] 802 dec cntd 803 jg .loop_dc 804 RET 805 806; inputs and outputs are dwords, coefficients are words 807; 808; dst1[hi]:dst3[lo] = src1 * coef1 + src2 * coef2 809; dst2[hi]:dst4[lo] = src1 * coef2 - src2 * coef1 810%macro SUMSUB_MUL_D 6-7 [pd_3fff] ; src/dst 1-2, dst3-4, coef1-2, mask 811 pand m%3, m%1, %7 812 pand m%4, m%2, %7 813 psrad m%1, 14 814 psrad m%2, 14 815 packssdw m%4, m%2 816 packssdw m%3, m%1 817 punpckhwd m%2, m%4, m%3 818 punpcklwd m%4, m%3 819 pmaddwd m%3, m%4, [pw_%6_%5] 820 pmaddwd m%1, m%2, [pw_%6_%5] 821 pmaddwd m%4, [pw_m%5_%6] 822 pmaddwd m%2, [pw_m%5_%6] 823%endmacro 824 825; dst1 = src2[hi]:src4[lo] + src1[hi]:src3[lo] + rnd >> 14 826; dst2 = src2[hi]:src4[lo] - src1[hi]:src3[lo] + rnd >> 14 827%macro SUMSUB_PACK_D 5-6 [pd_8192] ; src/dst 1-2, src3-4, tmp, rnd 828 SUMSUB_BA d, %1, %2, %5 829 SUMSUB_BA d, %3, %4, %5 830 paddd m%3, %6 831 paddd m%4, %6 832 psrad m%3, 14 833 psrad m%4, 14 834 paddd m%1, m%3 835 paddd m%2, m%4 836%endmacro 837 838%macro NEGD 1 839%if cpuflag(ssse3) 840 psignd %1, [pw_m1] 841%else 842 pxor %1, [pw_m1] 843 paddd %1, [pd_1] 844%endif 845%endmacro 846 847; the following line has not been executed at the end of this macro: 848; UNSCRATCH 6, 8, rsp+17*mmsize 849%macro IADST8_1D 1-3 [pd_8192], [pd_3fff] ; src, rnd, mask 850 mova m0, [%1+ 0*mmsize] 851 mova m3, [%1+ 6*mmsize] 852 mova m4, [%1+ 8*mmsize] 853 mova m7, [%1+14*mmsize] 854 SUMSUB_MUL_D 7, 0, 1, 2, 16305, 1606, %3 ; m7/1=t0a, m0/2=t1a 855 SUMSUB_MUL_D 3, 4, 5, 6, 10394, 12665, %3 ; m3/5=t4a, m4/6=t5a 856 SCRATCH 0, 8, rsp+17*mmsize 857 SUMSUB_PACK_D 3, 7, 5, 1, 0, %2 ; m3=t0, m7=t4 858 UNSCRATCH 0, 8, rsp+17*mmsize 859 SUMSUB_PACK_D 4, 0, 6, 2, 1, %2 ; m4=t1, m0=t5 860 861 SCRATCH 3, 8, rsp+17*mmsize 862 SCRATCH 4, 9, rsp+18*mmsize 863 SCRATCH 7, 10, rsp+19*mmsize 864 SCRATCH 0, 11, rsp+20*mmsize 865 866 mova m1, [%1+ 2*mmsize] 867 mova m2, [%1+ 4*mmsize] 868 mova m5, [%1+10*mmsize] 869 mova m6, [%1+12*mmsize] 870 SUMSUB_MUL_D 5, 2, 3, 4, 14449, 7723, %3 ; m5/8=t2a, m2/9=t3a 871 SUMSUB_MUL_D 1, 6, 7, 0, 4756, 15679, %3 ; m1/10=t6a, m6/11=t7a 872 SCRATCH 2, 12, rsp+21*mmsize 873 SUMSUB_PACK_D 1, 5, 7, 3, 2, %2 ; m1=t2, m5=t6 874 UNSCRATCH 2, 12, rsp+21*mmsize 875 SUMSUB_PACK_D 6, 2, 0, 4, 3, %2 ; m6=t3, m2=t7 876 877 UNSCRATCH 7, 10, rsp+19*mmsize 878 UNSCRATCH 0, 11, rsp+20*mmsize 879 SCRATCH 1, 10, rsp+19*mmsize 880 SCRATCH 6, 11, rsp+20*mmsize 881 882 SUMSUB_MUL_D 7, 0, 3, 4, 15137, 6270, %3 ; m7/8=t4a, m0/9=t5a 883 SUMSUB_MUL_D 2, 5, 1, 6, 6270, 15137, %3 ; m2/10=t7a, m5/11=t6a 884 SCRATCH 2, 12, rsp+21*mmsize 885 SUMSUB_PACK_D 5, 7, 6, 3, 2, %2 ; m5=-out1, m7=t6 886 UNSCRATCH 2, 12, rsp+21*mmsize 887 NEGD m5 ; m5=out1 888 SUMSUB_PACK_D 2, 0, 1, 4, 3, %2 ; m2=out6, m0=t7 889 SUMSUB_MUL 7, 0, 3, 4, 11585, 11585, %2, %3 ; m7=out2, m0=-out5 890 NEGD m0 ; m0=out5 891 892 UNSCRATCH 3, 8, rsp+17*mmsize 893 UNSCRATCH 4, 9, rsp+18*mmsize 894 UNSCRATCH 1, 10, rsp+19*mmsize 895 UNSCRATCH 6, 11, rsp+20*mmsize 896 SCRATCH 2, 8, rsp+17*mmsize 897 SCRATCH 0, 9, rsp+18*mmsize 898 899 SUMSUB_BA 
d, 1, 3, 2 ; m1=out0, m3=t2 900 SUMSUB_BA d, 6, 4, 2 ; m6=-out7, m4=t3 901 NEGD m6 ; m6=out7 902 SUMSUB_MUL 3, 4, 2, 0, 11585, 11585, %2, %3 ; m3=-out3, m4=out4 903 NEGD m3 ; m3=out3 904 905 UNSCRATCH 0, 9, rsp+18*mmsize 906 907 SWAP 0, 1, 5 908 SWAP 2, 7, 6 909%endmacro 910 911%macro IADST8_FN 5 912cglobal vp9_%1_%3_8x8_add_10, 4, 6 + ARCH_X86_64, 16, \ 913 16 * mmsize + ARCH_X86_32 * 6 * mmsize, \ 914 dst, stride, block, eob 915 mova m0, [pw_1023] 916 917.body: 918 SCRATCH 0, 13, rsp+16*mmsize, max 919 DEFINE_ARGS dst, stride, block, cnt, ptr, skip, dstbak 920%if ARCH_X86_64 921 mov dstbakq, dstq 922 movsxd cntq, cntd 923%endif 924%ifdef PIC 925 lea ptrq, [%5_8x8] 926 movzx cntd, byte [ptrq+cntq-1] 927%else 928 movzx cntd, byte [%5_8x8+cntq-1] 929%endif 930 mov skipd, 2 931 sub skipd, cntd 932 mov ptrq, rsp 933 PRELOAD 14, pd_8192, rnd 934 PRELOAD 15, pd_3fff, mask 935.loop_1: 936 %2_1D blockq, reg_rnd, reg_mask 937 938 TRANSPOSE4x4D 0, 1, 2, 3, 6 939 mova [ptrq+ 0*mmsize], m0 940 mova [ptrq+ 2*mmsize], m1 941 mova [ptrq+ 4*mmsize], m2 942 mova [ptrq+ 6*mmsize], m3 943 UNSCRATCH 6, 8, rsp+17*mmsize 944 TRANSPOSE4x4D 4, 5, 6, 7, 0 945 mova [ptrq+ 1*mmsize], m4 946 mova [ptrq+ 3*mmsize], m5 947 mova [ptrq+ 5*mmsize], m6 948 mova [ptrq+ 7*mmsize], m7 949 add ptrq, 8 * mmsize 950 add blockq, mmsize 951 dec cntd 952 jg .loop_1 953 954 ; zero-pad the remainder (skipped cols) 955 test skipd, skipd 956 jz .end 957 add skipd, skipd 958 lea blockq, [blockq+skipq*(mmsize/2)] 959 pxor m0, m0 960.loop_z: 961 mova [ptrq+mmsize*0], m0 962 mova [ptrq+mmsize*1], m0 963 mova [ptrq+mmsize*2], m0 964 mova [ptrq+mmsize*3], m0 965 add ptrq, 4 * mmsize 966 dec skipd 967 jg .loop_z 968.end: 969 970 DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak 971 lea stride3q, [strideq*3] 972 mov cntd, 2 973 mov ptrq, rsp 974.loop_2: 975 %4_1D ptrq, reg_rnd, reg_mask 976 977 pxor m6, m6 978 PRELOAD 9, pd_16, srnd 979 ROUND_AND_STORE_4x4 0, 1, 2, 3, m6, reg_max, reg_srnd, 5 980 lea dstq, [dstq+strideq*4] 981 UNSCRATCH 0, 8, rsp+17*mmsize 982 UNSCRATCH 1, 13, rsp+16*mmsize, max 983 UNSCRATCH 2, 9, pd_16, srnd 984 ROUND_AND_STORE_4x4 4, 5, 0, 7, m6, m1, m2, 5 985 add ptrq, 16 986%if ARCH_X86_64 987 lea dstq, [dstbakq+8] 988%else 989 mov dstq, dstm 990 add dstq, 8 991%endif 992 dec cntd 993 jg .loop_2 994 995 ; m6 is still zero 996 ZERO_BLOCK blockq-2*mmsize, 32, 8, m6 997 RET 998 999cglobal vp9_%1_%3_8x8_add_12, 4, 6 + ARCH_X86_64, 16, \ 1000 16 * mmsize + ARCH_X86_32 * 6 * mmsize, \ 1001 dst, stride, block, eob 1002 mova m0, [pw_4095] 1003 jmp mangle(private_prefix %+ _ %+ vp9_%1_%3_8x8_add_10 %+ SUFFIX).body 1004%endmacro 1005 1006INIT_XMM sse2 1007IADST8_FN idct, IDCT8, iadst, IADST8, row 1008IADST8_FN iadst, IADST8, idct, IDCT8, col 1009IADST8_FN iadst, IADST8, iadst, IADST8, default 1010 1011%macro IDCT16_1D 1-4 4 * mmsize, 65, 67 ; src, src_stride, stack_offset, mm32bit_stack_offset 1012 IDCT8_1D %1, [pd_8192], [pd_3fff], %2 * 2, %4 ; m0-3=t0-3a, m4-5/m8|r67/m7=t4-7 1013 ; SCRATCH 6, 8, rsp+(%4+0)*mmsize ; t6 1014 SCRATCH 0, 15, rsp+(%4+7)*mmsize ; t0a 1015 SCRATCH 1, 14, rsp+(%4+6)*mmsize ; t1a 1016 SCRATCH 2, 13, rsp+(%4+5)*mmsize ; t2a 1017 SCRATCH 3, 12, rsp+(%4+4)*mmsize ; t3a 1018 SCRATCH 4, 11, rsp+(%4+3)*mmsize ; t4 1019 mova [rsp+(%3+0)*mmsize], m5 ; t5 1020 mova [rsp+(%3+1)*mmsize], m7 ; t7 1021 1022 mova m0, [%1+ 1*%2] ; in1 1023 mova m3, [%1+ 7*%2] ; in7 1024 mova m4, [%1+ 9*%2] ; in9 1025 mova m7, [%1+15*%2] ; in15 1026 1027 SUMSUB_MUL 0, 7, 1, 2, 16305, 1606 ; m0=t15a, m7=t8a 1028 SUMSUB_MUL 4, 3, 
1, 2, 10394, 12665 ; m4=t14a, m3=t9a 1029 SUMSUB_BA d, 3, 7, 1 ; m3=t8, m7=t9 1030 SUMSUB_BA d, 4, 0, 1 ; m4=t15,m0=t14 1031 SUMSUB_MUL 0, 7, 1, 2, 15137, 6270 ; m0=t14a, m7=t9a 1032 1033 mova m1, [%1+ 3*%2] ; in3 1034 mova m2, [%1+ 5*%2] ; in5 1035 mova m5, [%1+11*%2] ; in11 1036 mova m6, [%1+13*%2] ; in13 1037 1038 SCRATCH 0, 9, rsp+(%4+1)*mmsize 1039 SCRATCH 7, 10, rsp+(%4+2)*mmsize 1040 1041 SUMSUB_MUL 2, 5, 0, 7, 14449, 7723 ; m2=t13a, m5=t10a 1042 SUMSUB_MUL 6, 1, 0, 7, 4756, 15679 ; m6=t12a, m1=t11a 1043 SUMSUB_BA d, 5, 1, 0 ; m5=t11,m1=t10 1044 SUMSUB_BA d, 2, 6, 0 ; m2=t12,m6=t13 1045 NEGD m1 ; m1=-t10 1046 SUMSUB_MUL 1, 6, 0, 7, 15137, 6270 ; m1=t13a, m6=t10a 1047 1048 UNSCRATCH 7, 10, rsp+(%4+2)*mmsize 1049 SUMSUB_BA d, 5, 3, 0 ; m5=t8a, m3=t11a 1050 SUMSUB_BA d, 6, 7, 0 ; m6=t9, m7=t10 1051 SUMSUB_BA d, 2, 4, 0 ; m2=t15a,m4=t12a 1052 SCRATCH 5, 10, rsp+(%4+2)*mmsize 1053 SUMSUB_MUL 4, 3, 0, 5, 11585, 11585 ; m4=t12, m3=t11 1054 UNSCRATCH 0, 9, rsp+(%4+1)*mmsize 1055 SUMSUB_BA d, 1, 0, 5 ; m1=t14, m0=t13 1056 SCRATCH 6, 9, rsp+(%4+1)*mmsize 1057 SUMSUB_MUL 0, 7, 6, 5, 11585, 11585 ; m0=t13a,m7=t10a 1058 1059 ; order: 15|r74,14|r73,13|r72,12|r71,11|r70,r65,8|r67,r66,10|r69,9|r68,7,3,4,0,1,2 1060 ; free: 6,5 1061 1062 UNSCRATCH 5, 15, rsp+(%4+7)*mmsize 1063 SUMSUB_BA d, 2, 5, 6 ; m2=out0, m5=out15 1064 SCRATCH 5, 15, rsp+(%4+7)*mmsize 1065 UNSCRATCH 5, 14, rsp+(%4+6)*mmsize 1066 SUMSUB_BA d, 1, 5, 6 ; m1=out1, m5=out14 1067 SCRATCH 5, 14, rsp+(%4+6)*mmsize 1068 UNSCRATCH 5, 13, rsp+(%4+5)*mmsize 1069 SUMSUB_BA d, 0, 5, 6 ; m0=out2, m5=out13 1070 SCRATCH 5, 13, rsp+(%4+5)*mmsize 1071 UNSCRATCH 5, 12, rsp+(%4+4)*mmsize 1072 SUMSUB_BA d, 4, 5, 6 ; m4=out3, m5=out12 1073 SCRATCH 5, 12, rsp+(%4+4)*mmsize 1074 UNSCRATCH 5, 11, rsp+(%4+3)*mmsize 1075 SUMSUB_BA d, 3, 5, 6 ; m3=out4, m5=out11 1076 SCRATCH 4, 11, rsp+(%4+3)*mmsize 1077 mova m4, [rsp+(%3+0)*mmsize] 1078 SUMSUB_BA d, 7, 4, 6 ; m7=out5, m4=out10 1079 mova [rsp+(%3+0)*mmsize], m5 1080 UNSCRATCH 5, 8, rsp+(%4+0)*mmsize 1081 UNSCRATCH 6, 9, rsp+(%4+1)*mmsize 1082 SCRATCH 2, 8, rsp+(%4+0)*mmsize 1083 SCRATCH 1, 9, rsp+(%4+1)*mmsize 1084 UNSCRATCH 1, 10, rsp+(%4+2)*mmsize 1085 SCRATCH 0, 10, rsp+(%4+2)*mmsize 1086 mova m0, [rsp+(%3+1)*mmsize] 1087 SUMSUB_BA d, 6, 5, 2 ; m6=out6, m5=out9 1088 SUMSUB_BA d, 1, 0, 2 ; m1=out7, m0=out8 1089 1090 SWAP 0, 3, 1, 7, 2, 6, 4 1091 1092 ; output order: 8-11|r67-70=out0-3 1093 ; 0-6,r65=out4-11 1094 ; 12-15|r71-74=out12-15 1095%endmacro 1096 1097INIT_XMM sse2 1098cglobal vp9_idct_idct_16x16_add_10, 4, 6 + ARCH_X86_64, 16, \ 1099 67 * mmsize + ARCH_X86_32 * 8 * mmsize, \ 1100 dst, stride, block, eob 1101 mova m0, [pw_1023] 1102 cmp eobd, 1 1103 jg .idctfull 1104 1105 ; dc-only - the 10bit version can be done entirely in 32bit, since the max 1106 ; coef values are 17+sign bit, and the coef is 14bit, so 31+sign easily 1107 ; fits in 32bit 1108 DEFINE_ARGS dst, stride, block, coef 1109 pxor m2, m2 1110 DC_ONLY 6, m2 1111 movd m1, coefd 1112 pshuflw m1, m1, q0000 1113 punpcklqdq m1, m1 1114 DEFINE_ARGS dst, stride, cnt 1115 mov cntd, 8 1116.loop_dc: 1117 STORE_2x8 3, 4, 1, m2, m0, dstq, mmsize 1118 STORE_2x8 3, 4, 1, m2, m0, dstq+strideq, mmsize 1119 lea dstq, [dstq+strideq*2] 1120 dec cntd 1121 jg .loop_dc 1122 RET 1123 1124.idctfull: 1125 mova [rsp+64*mmsize], m0 1126 DEFINE_ARGS dst, stride, block, cnt, ptr, skip, dstbak 1127%if ARCH_X86_64 1128 mov dstbakq, dstq 1129 movsxd cntq, cntd 1130%endif 1131%ifdef PIC 1132 lea ptrq, [default_16x16] 1133 movzx cntd, byte [ptrq+cntq-1] 1134%else 1135 
movzx cntd, byte [default_16x16+cntq-1] 1136%endif 1137 mov skipd, 4 1138 sub skipd, cntd 1139 mov ptrq, rsp 1140.loop_1: 1141 IDCT16_1D blockq 1142 1143 TRANSPOSE4x4D 0, 1, 2, 3, 7 1144 mova [ptrq+ 1*mmsize], m0 1145 mova [ptrq+ 5*mmsize], m1 1146 mova [ptrq+ 9*mmsize], m2 1147 mova [ptrq+13*mmsize], m3 1148 mova m7, [rsp+65*mmsize] 1149 TRANSPOSE4x4D 4, 5, 6, 7, 0 1150 mova [ptrq+ 2*mmsize], m4 1151 mova [ptrq+ 6*mmsize], m5 1152 mova [ptrq+10*mmsize], m6 1153 mova [ptrq+14*mmsize], m7 1154 UNSCRATCH 0, 8, rsp+67*mmsize 1155 UNSCRATCH 1, 9, rsp+68*mmsize 1156 UNSCRATCH 2, 10, rsp+69*mmsize 1157 UNSCRATCH 3, 11, rsp+70*mmsize 1158 TRANSPOSE4x4D 0, 1, 2, 3, 7 1159 mova [ptrq+ 0*mmsize], m0 1160 mova [ptrq+ 4*mmsize], m1 1161 mova [ptrq+ 8*mmsize], m2 1162 mova [ptrq+12*mmsize], m3 1163 UNSCRATCH 4, 12, rsp+71*mmsize 1164 UNSCRATCH 5, 13, rsp+72*mmsize 1165 UNSCRATCH 6, 14, rsp+73*mmsize 1166 UNSCRATCH 7, 15, rsp+74*mmsize 1167 TRANSPOSE4x4D 4, 5, 6, 7, 0 1168 mova [ptrq+ 3*mmsize], m4 1169 mova [ptrq+ 7*mmsize], m5 1170 mova [ptrq+11*mmsize], m6 1171 mova [ptrq+15*mmsize], m7 1172 add ptrq, 16 * mmsize 1173 add blockq, mmsize 1174 dec cntd 1175 jg .loop_1 1176 1177 ; zero-pad the remainder (skipped cols) 1178 test skipd, skipd 1179 jz .end 1180 add skipd, skipd 1181 lea blockq, [blockq+skipq*(mmsize/2)] 1182 pxor m0, m0 1183.loop_z: 1184 mova [ptrq+mmsize*0], m0 1185 mova [ptrq+mmsize*1], m0 1186 mova [ptrq+mmsize*2], m0 1187 mova [ptrq+mmsize*3], m0 1188 mova [ptrq+mmsize*4], m0 1189 mova [ptrq+mmsize*5], m0 1190 mova [ptrq+mmsize*6], m0 1191 mova [ptrq+mmsize*7], m0 1192 add ptrq, 8 * mmsize 1193 dec skipd 1194 jg .loop_z 1195.end: 1196 1197 DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak 1198 lea stride3q, [strideq*3] 1199 mov cntd, 4 1200 mov ptrq, rsp 1201.loop_2: 1202 IDCT16_1D ptrq 1203 1204 pxor m7, m7 1205 lea dstq, [dstq+strideq*4] 1206 ROUND_AND_STORE_4x4 0, 1, 2, 3, m7, [rsp+64*mmsize], [pd_32], 6 1207 lea dstq, [dstq+strideq*4] 1208 mova m0, [rsp+65*mmsize] 1209 mova m1, [rsp+64*mmsize] 1210 mova m2, [pd_32] 1211 ROUND_AND_STORE_4x4 4, 5, 6, 0, m7, m1, m2, 6 1212 1213%if ARCH_X86_64 1214 DEFINE_ARGS dstbak, stride, block, cnt, ptr, stride3, dst 1215%else 1216 mov dstq, dstm 1217%endif 1218 UNSCRATCH 0, 8, rsp+67*mmsize 1219 UNSCRATCH 4, 9, rsp+68*mmsize 1220 UNSCRATCH 5, 10, rsp+69*mmsize 1221 UNSCRATCH 3, 11, rsp+70*mmsize 1222 ROUND_AND_STORE_4x4 0, 4, 5, 3, m7, m1, m2, 6 1223%if ARCH_X86_64 1224 DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak 1225 lea dstq, [dstbakq+stride3q*4] 1226%else 1227 lea dstq, [dstq+stride3q*4] 1228%endif 1229 UNSCRATCH 4, 12, rsp+71*mmsize 1230 UNSCRATCH 5, 13, rsp+72*mmsize 1231 UNSCRATCH 6, 14, rsp+73*mmsize 1232 UNSCRATCH 0, 15, rsp+74*mmsize 1233 ROUND_AND_STORE_4x4 4, 5, 6, 0, m7, m1, m2, 6 1234 1235 add ptrq, mmsize 1236%if ARCH_X86_64 1237 add dstbakq, 8 1238 mov dstq, dstbakq 1239%else 1240 add dword dstm, 8 1241 mov dstq, dstm 1242%endif 1243 dec cntd 1244 jg .loop_2 1245 1246 ; m7 is still zero 1247 ZERO_BLOCK blockq-4*mmsize, 64, 16, m7 1248 RET 1249 1250INIT_XMM sse2 1251cglobal vp9_idct_idct_16x16_add_12, 4, 6 + ARCH_X86_64, 16, \ 1252 67 * mmsize + ARCH_X86_32 * 8 * mmsize, \ 1253 dst, stride, block, eob 1254 mova m0, [pw_4095] 1255 cmp eobd, 1 1256 jg mangle(private_prefix %+ _ %+ vp9_idct_idct_16x16_add_10 %+ SUFFIX).idctfull 1257 1258 ; dc-only - unfortunately, this one can overflow, since coefs are 19+sign 1259 ; bpp, and 19+14+sign does not fit in 32bit, so we do 2-stage multiplies 1260 DEFINE_ARGS dst, 
stride, block, coef, coefl 1261 pxor m2, m2 1262 DC_ONLY_64BIT 6, m2 1263 movd m1, coefd 1264 pshuflw m1, m1, q0000 1265 punpcklqdq m1, m1 1266 DEFINE_ARGS dst, stride, cnt 1267 mov cntd, 8 1268.loop_dc: 1269 STORE_2x8 3, 4, 1, m2, m0, dstq, mmsize 1270 STORE_2x8 3, 4, 1, m2, m0, dstq+strideq, mmsize 1271 lea dstq, [dstq+strideq*2] 1272 dec cntd 1273 jg .loop_dc 1274 RET 1275 1276; r65-69 are available for spills 1277; r70-77 are available on x86-32 only (x86-64 should use m8-15) 1278; output should be in m8-11|r70-73, m0-6,r65 and m12-15|r74-77 1279%macro IADST16_1D 1 ; src 1280 mova m0, [%1+ 0*4*mmsize] ; in0 1281 mova m1, [%1+ 7*4*mmsize] ; in7 1282 mova m2, [%1+ 8*4*mmsize] ; in8 1283 mova m3, [%1+15*4*mmsize] ; in15 1284 SUMSUB_MUL_D 3, 0, 4, 5, 16364, 804 ; m3/4=t0, m0/5=t1 1285 SUMSUB_MUL_D 1, 2, 6, 7, 11003, 12140 ; m1/6=t8, m2/7=t9 1286 SCRATCH 0, 8, rsp+70*mmsize 1287 SUMSUB_PACK_D 1, 3, 6, 4, 0 ; m1=t0a, m3=t8a 1288 UNSCRATCH 0, 8, rsp+70*mmsize 1289 SUMSUB_PACK_D 2, 0, 7, 5, 4 ; m2=t1a, m0=t9a 1290 mova [rsp+67*mmsize], m1 1291 SCRATCH 2, 9, rsp+71*mmsize 1292 SCRATCH 3, 12, rsp+74*mmsize 1293 SCRATCH 0, 13, rsp+75*mmsize 1294 1295 mova m0, [%1+ 3*4*mmsize] ; in3 1296 mova m1, [%1+ 4*4*mmsize] ; in4 1297 mova m2, [%1+11*4*mmsize] ; in11 1298 mova m3, [%1+12*4*mmsize] ; in12 1299 SUMSUB_MUL_D 2, 1, 4, 5, 14811, 7005 ; m2/4=t4, m1/5=t5 1300 SUMSUB_MUL_D 0, 3, 6, 7, 5520, 15426 ; m0/6=t12, m3/7=t13 1301 SCRATCH 1, 10, rsp+72*mmsize 1302 SUMSUB_PACK_D 0, 2, 6, 4, 1 ; m0=t4a, m2=t12a 1303 UNSCRATCH 1, 10, rsp+72*mmsize 1304 SUMSUB_PACK_D 3, 1, 7, 5, 4 ; m3=t5a, m1=t13a 1305 SCRATCH 0, 15, rsp+77*mmsize 1306 SCRATCH 3, 11, rsp+73*mmsize 1307 1308 UNSCRATCH 0, 12, rsp+74*mmsize ; t8a 1309 UNSCRATCH 3, 13, rsp+75*mmsize ; t9a 1310 SUMSUB_MUL_D 0, 3, 4, 5, 16069, 3196 ; m0/4=t8, m3/5=t9 1311 SUMSUB_MUL_D 1, 2, 6, 7, 3196, 16069 ; m1/6=t13, m2/7=t12 1312 SCRATCH 1, 12, rsp+74*mmsize 1313 SUMSUB_PACK_D 2, 0, 7, 4, 1 ; m2=t8a, m0=t12a 1314 UNSCRATCH 1, 12, rsp+74*mmsize 1315 SUMSUB_PACK_D 1, 3, 6, 5, 4 ; m1=t9a, m3=t13a 1316 mova [rsp+65*mmsize], m2 1317 mova [rsp+66*mmsize], m1 1318 SCRATCH 0, 8, rsp+70*mmsize 1319 SCRATCH 3, 12, rsp+74*mmsize 1320 1321 mova m0, [%1+ 2*4*mmsize] ; in2 1322 mova m1, [%1+ 5*4*mmsize] ; in5 1323 mova m2, [%1+10*4*mmsize] ; in10 1324 mova m3, [%1+13*4*mmsize] ; in13 1325 SUMSUB_MUL_D 3, 0, 4, 5, 15893, 3981 ; m3/4=t2, m0/5=t3 1326 SUMSUB_MUL_D 1, 2, 6, 7, 8423, 14053 ; m1/6=t10, m2/7=t11 1327 SCRATCH 0, 10, rsp+72*mmsize 1328 SUMSUB_PACK_D 1, 3, 6, 4, 0 ; m1=t2a, m3=t10a 1329 UNSCRATCH 0, 10, rsp+72*mmsize 1330 SUMSUB_PACK_D 2, 0, 7, 5, 4 ; m2=t3a, m0=t11a 1331 mova [rsp+68*mmsize], m1 1332 mova [rsp+69*mmsize], m2 1333 SCRATCH 3, 13, rsp+75*mmsize 1334 SCRATCH 0, 14, rsp+76*mmsize 1335 1336 mova m0, [%1+ 1*4*mmsize] ; in1 1337 mova m1, [%1+ 6*4*mmsize] ; in6 1338 mova m2, [%1+ 9*4*mmsize] ; in9 1339 mova m3, [%1+14*4*mmsize] ; in14 1340 SUMSUB_MUL_D 2, 1, 4, 5, 13160, 9760 ; m2/4=t6, m1/5=t7 1341 SUMSUB_MUL_D 0, 3, 6, 7, 2404, 16207 ; m0/6=t14, m3/7=t15 1342 SCRATCH 1, 10, rsp+72*mmsize 1343 SUMSUB_PACK_D 0, 2, 6, 4, 1 ; m0=t6a, m2=t14a 1344 UNSCRATCH 1, 10, rsp+72*mmsize 1345 SUMSUB_PACK_D 3, 1, 7, 5, 4 ; m3=t7a, m1=t15a 1346 1347 UNSCRATCH 4, 13, rsp+75*mmsize ; t10a 1348 UNSCRATCH 5, 14, rsp+76*mmsize ; t11a 1349 SCRATCH 0, 13, rsp+75*mmsize 1350 SCRATCH 3, 14, rsp+76*mmsize 1351 SUMSUB_MUL_D 4, 5, 6, 7, 9102, 13623 ; m4/6=t10, m5/7=t11 1352 SUMSUB_MUL_D 1, 2, 0, 3, 13623, 9102 ; m1/0=t15, m2/3=t14 1353 SCRATCH 0, 10, rsp+72*mmsize 1354 SUMSUB_PACK_D 2, 
4, 3, 6, 0 ; m2=t10a, m4=t14a 1355 UNSCRATCH 0, 10, rsp+72*mmsize 1356 SUMSUB_PACK_D 1, 5, 0, 7, 6 ; m1=t11a, m5=t15a 1357 1358 UNSCRATCH 0, 8, rsp+70*mmsize ; t12a 1359 UNSCRATCH 3, 12, rsp+74*mmsize ; t13a 1360 SCRATCH 2, 8, rsp+70*mmsize 1361 SCRATCH 1, 12, rsp+74*mmsize 1362 SUMSUB_MUL_D 0, 3, 1, 2, 15137, 6270 ; m0/1=t12, m3/2=t13 1363 SUMSUB_MUL_D 5, 4, 7, 6, 6270, 15137 ; m5/7=t15, m4/6=t14 1364 SCRATCH 2, 10, rsp+72*mmsize 1365 SUMSUB_PACK_D 4, 0, 6, 1, 2 ; m4=out2, m0=t14a 1366 UNSCRATCH 2, 10, rsp+72*mmsize 1367 SUMSUB_PACK_D 5, 3, 7, 2, 1 ; m5=-out13, m3=t15a 1368 NEGD m5 ; m5=out13 1369 1370 UNSCRATCH 1, 9, rsp+71*mmsize ; t1a 1371 mova m2, [rsp+68*mmsize] ; t2a 1372 UNSCRATCH 6, 13, rsp+75*mmsize ; t6a 1373 UNSCRATCH 7, 14, rsp+76*mmsize ; t7a 1374 SCRATCH 4, 10, rsp+72*mmsize 1375 SCRATCH 5, 13, rsp+75*mmsize 1376 UNSCRATCH 4, 15, rsp+77*mmsize ; t4a 1377 UNSCRATCH 5, 11, rsp+73*mmsize ; t5a 1378 SCRATCH 0, 14, rsp+76*mmsize 1379 SCRATCH 3, 15, rsp+77*mmsize 1380 mova m0, [rsp+67*mmsize] ; t0a 1381 SUMSUB_BA d, 4, 0, 3 ; m4=t0, m0=t4 1382 SUMSUB_BA d, 5, 1, 3 ; m5=t1, m1=t5 1383 SUMSUB_BA d, 6, 2, 3 ; m6=t2, m2=t6 1384 SCRATCH 4, 9, rsp+71*mmsize 1385 mova m3, [rsp+69*mmsize] ; t3a 1386 SUMSUB_BA d, 7, 3, 4 ; m7=t3, m3=t7 1387 1388 mova [rsp+67*mmsize], m5 1389 mova [rsp+68*mmsize], m6 1390 mova [rsp+69*mmsize], m7 1391 SUMSUB_MUL_D 0, 1, 4, 5, 15137, 6270 ; m0/4=t4a, m1/5=t5a 1392 SUMSUB_MUL_D 3, 2, 7, 6, 6270, 15137 ; m3/7=t7a, m2/6=t6a 1393 SCRATCH 1, 11, rsp+73*mmsize 1394 SUMSUB_PACK_D 2, 0, 6, 4, 1 ; m2=-out3, m0=t6 1395 NEGD m2 ; m2=out3 1396 UNSCRATCH 1, 11, rsp+73*mmsize 1397 SUMSUB_PACK_D 3, 1, 7, 5, 4 ; m3=out12, m1=t7 1398 SCRATCH 2, 11, rsp+73*mmsize 1399 UNSCRATCH 2, 12, rsp+74*mmsize ; t11a 1400 SCRATCH 3, 12, rsp+74*mmsize 1401 1402 UNSCRATCH 3, 8, rsp+70*mmsize ; t10a 1403 mova m4, [rsp+65*mmsize] ; t8a 1404 mova m5, [rsp+66*mmsize] ; t9a 1405 SUMSUB_BA d, 3, 4, 6 ; m3=-out1, m4=t10 1406 NEGD m3 ; m3=out1 1407 SUMSUB_BA d, 2, 5, 6 ; m2=out14, m5=t11 1408 UNSCRATCH 6, 9, rsp+71*mmsize ; t0 1409 UNSCRATCH 7, 14, rsp+76*mmsize ; t14a 1410 SCRATCH 3, 9, rsp+71*mmsize 1411 SCRATCH 2, 14, rsp+76*mmsize 1412 1413 SUMSUB_MUL 1, 0, 2, 3, 11585, 11585 ; m1=out4, m0=out11 1414 mova [rsp+65*mmsize], m0 1415 SUMSUB_MUL 5, 4, 2, 3, 11585, 11585 ; m5=out6, m4=out9 1416 UNSCRATCH 0, 15, rsp+77*mmsize ; t15a 1417 SUMSUB_MUL 7, 0, 2, 3, 11585, m11585 ; m7=out10, m0=out5 1418 1419 mova m2, [rsp+68*mmsize] ; t2 1420 SUMSUB_BA d, 2, 6, 3 ; m2=out0, m6=t2a 1421 SCRATCH 2, 8, rsp+70*mmsize 1422 mova m2, [rsp+67*mmsize] ; t1 1423 mova m3, [rsp+69*mmsize] ; t3 1424 mova [rsp+67*mmsize], m7 1425 SUMSUB_BA d, 3, 2, 7 ; m3=-out15, m2=t3a 1426 NEGD m3 ; m3=out15 1427 SCRATCH 3, 15, rsp+77*mmsize 1428 SUMSUB_MUL 6, 2, 7, 3, 11585, m11585 ; m6=out8, m2=out7 1429 mova m7, [rsp+67*mmsize] 1430 1431 SWAP 0, 1 1432 SWAP 2, 5, 4, 6, 7, 3 1433%endmacro 1434 1435%macro IADST16_FN 7 1436cglobal vp9_%1_%4_16x16_add_10, 4, 6 + ARCH_X86_64, 16, \ 1437 70 * mmsize + ARCH_X86_32 * 8 * mmsize, \ 1438 dst, stride, block, eob 1439 mova m0, [pw_1023] 1440 1441.body: 1442 mova [rsp+64*mmsize], m0 1443 DEFINE_ARGS dst, stride, block, cnt, ptr, skip, dstbak 1444%if ARCH_X86_64 1445 mov dstbakq, dstq 1446 movsxd cntq, cntd 1447%endif 1448%ifdef PIC 1449 lea ptrq, [%7_16x16] 1450 movzx cntd, byte [ptrq+cntq-1] 1451%else 1452 movzx cntd, byte [%7_16x16+cntq-1] 1453%endif 1454 mov skipd, 4 1455 sub skipd, cntd 1456 mov ptrq, rsp 1457.loop_1: 1458 %2_1D blockq 1459 1460 TRANSPOSE4x4D 0, 1, 2, 3, 7 1461 mova [ptrq+ 
1*mmsize], m0 1462 mova [ptrq+ 5*mmsize], m1 1463 mova [ptrq+ 9*mmsize], m2 1464 mova [ptrq+13*mmsize], m3 1465 mova m7, [rsp+65*mmsize] 1466 TRANSPOSE4x4D 4, 5, 6, 7, 0 1467 mova [ptrq+ 2*mmsize], m4 1468 mova [ptrq+ 6*mmsize], m5 1469 mova [ptrq+10*mmsize], m6 1470 mova [ptrq+14*mmsize], m7 1471 UNSCRATCH 0, 8, rsp+(%3+0)*mmsize 1472 UNSCRATCH 1, 9, rsp+(%3+1)*mmsize 1473 UNSCRATCH 2, 10, rsp+(%3+2)*mmsize 1474 UNSCRATCH 3, 11, rsp+(%3+3)*mmsize 1475 TRANSPOSE4x4D 0, 1, 2, 3, 7 1476 mova [ptrq+ 0*mmsize], m0 1477 mova [ptrq+ 4*mmsize], m1 1478 mova [ptrq+ 8*mmsize], m2 1479 mova [ptrq+12*mmsize], m3 1480 UNSCRATCH 4, 12, rsp+(%3+4)*mmsize 1481 UNSCRATCH 5, 13, rsp+(%3+5)*mmsize 1482 UNSCRATCH 6, 14, rsp+(%3+6)*mmsize 1483 UNSCRATCH 7, 15, rsp+(%3+7)*mmsize 1484 TRANSPOSE4x4D 4, 5, 6, 7, 0 1485 mova [ptrq+ 3*mmsize], m4 1486 mova [ptrq+ 7*mmsize], m5 1487 mova [ptrq+11*mmsize], m6 1488 mova [ptrq+15*mmsize], m7 1489 add ptrq, 16 * mmsize 1490 add blockq, mmsize 1491 dec cntd 1492 jg .loop_1 1493 1494 ; zero-pad the remainder (skipped cols) 1495 test skipd, skipd 1496 jz .end 1497 add skipd, skipd 1498 lea blockq, [blockq+skipq*(mmsize/2)] 1499 pxor m0, m0 1500.loop_z: 1501 mova [ptrq+mmsize*0], m0 1502 mova [ptrq+mmsize*1], m0 1503 mova [ptrq+mmsize*2], m0 1504 mova [ptrq+mmsize*3], m0 1505 mova [ptrq+mmsize*4], m0 1506 mova [ptrq+mmsize*5], m0 1507 mova [ptrq+mmsize*6], m0 1508 mova [ptrq+mmsize*7], m0 1509 add ptrq, 8 * mmsize 1510 dec skipd 1511 jg .loop_z 1512.end: 1513 1514 DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak 1515 lea stride3q, [strideq*3] 1516 mov cntd, 4 1517 mov ptrq, rsp 1518.loop_2: 1519 %5_1D ptrq 1520 1521 pxor m7, m7 1522 lea dstq, [dstq+strideq*4] 1523 ROUND_AND_STORE_4x4 0, 1, 2, 3, m7, [rsp+64*mmsize], [pd_32], 6 1524 lea dstq, [dstq+strideq*4] 1525 mova m0, [rsp+65*mmsize] 1526 mova m1, [rsp+64*mmsize] 1527 mova m2, [pd_32] 1528 ROUND_AND_STORE_4x4 4, 5, 6, 0, m7, m1, m2, 6 1529 1530%if ARCH_X86_64 1531 DEFINE_ARGS dstbak, stride, block, cnt, ptr, stride3, dst 1532%else 1533 mov dstq, dstm 1534%endif 1535 UNSCRATCH 0, 8, rsp+(%6+0)*mmsize 1536 UNSCRATCH 4, 9, rsp+(%6+1)*mmsize 1537 UNSCRATCH 5, 10, rsp+(%6+2)*mmsize 1538 UNSCRATCH 3, 11, rsp+(%6+3)*mmsize 1539 ROUND_AND_STORE_4x4 0, 4, 5, 3, m7, m1, m2, 6 1540%if ARCH_X86_64 1541 DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak 1542 lea dstq, [dstbakq+stride3q*4] 1543%else 1544 lea dstq, [dstq+stride3q*4] 1545%endif 1546 UNSCRATCH 4, 12, rsp+(%6+4)*mmsize 1547 UNSCRATCH 5, 13, rsp+(%6+5)*mmsize 1548 UNSCRATCH 6, 14, rsp+(%6+6)*mmsize 1549 UNSCRATCH 0, 15, rsp+(%6+7)*mmsize 1550 ROUND_AND_STORE_4x4 4, 5, 6, 0, m7, m1, m2, 6 1551 1552 add ptrq, mmsize 1553%if ARCH_X86_64 1554 add dstbakq, 8 1555 mov dstq, dstbakq 1556%else 1557 add dword dstm, 8 1558 mov dstq, dstm 1559%endif 1560 dec cntd 1561 jg .loop_2 1562 1563 ; m7 is still zero 1564 ZERO_BLOCK blockq-4*mmsize, 64, 16, m7 1565 RET 1566 1567cglobal vp9_%1_%4_16x16_add_12, 4, 6 + ARCH_X86_64, 16, \ 1568 70 * mmsize + ARCH_X86_32 * 8 * mmsize, \ 1569 dst, stride, block, eob 1570 mova m0, [pw_4095] 1571 jmp mangle(private_prefix %+ _ %+ vp9_%1_%4_16x16_add_10 %+ SUFFIX).body 1572%endmacro 1573 1574INIT_XMM sse2 1575IADST16_FN idct, IDCT16, 67, iadst, IADST16, 70, row 1576IADST16_FN iadst, IADST16, 70, idct, IDCT16, 67, col 1577IADST16_FN iadst, IADST16, 70, iadst, IADST16, 70, default 1578 1579%macro IDCT32_1D 2-3 8 * mmsize; pass[1/2], src, src_stride 1580 IDCT16_1D %2, 2 * %3, 272, 257 1581%if ARCH_X86_64 1582 mova [rsp+257*mmsize], m8 1583 
mova [rsp+258*mmsize], m9 1584 mova [rsp+259*mmsize], m10 1585 mova [rsp+260*mmsize], m11 1586 mova [rsp+261*mmsize], m12 1587 mova [rsp+262*mmsize], m13 1588 mova [rsp+263*mmsize], m14 1589 mova [rsp+264*mmsize], m15 1590%endif 1591 mova [rsp+265*mmsize], m0 1592 mova [rsp+266*mmsize], m1 1593 mova [rsp+267*mmsize], m2 1594 mova [rsp+268*mmsize], m3 1595 mova [rsp+269*mmsize], m4 1596 mova [rsp+270*mmsize], m5 1597 mova [rsp+271*mmsize], m6 1598 1599 ; r257-260: t0-3 1600 ; r265-272: t4/5a/6a/7/8/9a/10/11a 1601 ; r261-264: t12a/13/14a/15 1602 ; r273-274 is free as scratch space, and 275-282 mirrors m8-15 on 32bit 1603 1604 mova m0, [%2+ 1*%3] ; in1 1605 mova m1, [%2+15*%3] ; in15 1606 mova m2, [%2+17*%3] ; in17 1607 mova m3, [%2+31*%3] ; in31 1608 SUMSUB_MUL 0, 3, 4, 5, 16364, 804 ; m0=t31a, m3=t16a 1609 SUMSUB_MUL 2, 1, 4, 5, 11003, 12140 ; m2=t30a, m1=t17a 1610 SUMSUB_BA d, 1, 3, 4 ; m1=t16, m3=t17 1611 SUMSUB_BA d, 2, 0, 4 ; m2=t31, m0=t30 1612 SUMSUB_MUL 0, 3, 4, 5, 16069, 3196 ; m0=t30a, m3=t17a 1613 SCRATCH 0, 8, rsp+275*mmsize 1614 SCRATCH 2, 9, rsp+276*mmsize 1615 1616 ; end of stage 1-3 first quart 1617 1618 mova m0, [%2+ 7*%3] ; in7 1619 mova m2, [%2+ 9*%3] ; in9 1620 mova m4, [%2+23*%3] ; in23 1621 mova m5, [%2+25*%3] ; in25 1622 SUMSUB_MUL 2, 4, 6, 7, 14811, 7005 ; m2=t29a, m4=t18a 1623 SUMSUB_MUL 5, 0, 6, 7, 5520, 15426 ; m5=t28a, m0=t19a 1624 SUMSUB_BA d, 4, 0, 6 ; m4=t19, m0=t18 1625 SUMSUB_BA d, 2, 5, 6 ; m2=t28, m5=t29 1626 SUMSUB_MUL 5, 0, 6, 7, 3196, m16069 ; m5=t29a, m0=t18a 1627 1628 ; end of stage 1-3 second quart 1629 1630 SUMSUB_BA d, 4, 1, 6 ; m4=t16a, m1=t19a 1631 SUMSUB_BA d, 0, 3, 6 ; m0=t17, m3=t18 1632 UNSCRATCH 6, 8, rsp+275*mmsize ; t30a 1633 UNSCRATCH 7, 9, rsp+276*mmsize ; t31 1634 mova [rsp+273*mmsize], m4 1635 mova [rsp+274*mmsize], m0 1636 SUMSUB_BA d, 2, 7, 0 ; m2=t31a, m7=t28a 1637 SUMSUB_BA d, 5, 6, 0 ; m5=t30, m6=t29 1638 SUMSUB_MUL 6, 3, 0, 4, 15137, 6270 ; m6=t29a, m3=t18a 1639 SUMSUB_MUL 7, 1, 0, 4, 15137, 6270 ; m7=t28, m1=t19 1640 SCRATCH 3, 10, rsp+277*mmsize 1641 SCRATCH 1, 11, rsp+278*mmsize 1642 SCRATCH 7, 12, rsp+279*mmsize 1643 SCRATCH 6, 13, rsp+280*mmsize 1644 SCRATCH 5, 14, rsp+281*mmsize 1645 SCRATCH 2, 15, rsp+282*mmsize 1646 1647 ; end of stage 4-5 first half 1648 1649 mova m0, [%2+ 5*%3] ; in5 1650 mova m1, [%2+11*%3] ; in11 1651 mova m2, [%2+21*%3] ; in21 1652 mova m3, [%2+27*%3] ; in27 1653 SUMSUB_MUL 0, 3, 4, 5, 15893, 3981 ; m0=t27a, m3=t20a 1654 SUMSUB_MUL 2, 1, 4, 5, 8423, 14053 ; m2=t26a, m1=t21a 1655 SUMSUB_BA d, 1, 3, 4 ; m1=t20, m3=t21 1656 SUMSUB_BA d, 2, 0, 4 ; m2=t27, m0=t26 1657 SUMSUB_MUL 0, 3, 4, 5, 9102, 13623 ; m0=t26a, m3=t21a 1658 SCRATCH 0, 8, rsp+275*mmsize 1659 SCRATCH 2, 9, rsp+276*mmsize 1660 1661 ; end of stage 1-3 third quart 1662 1663 mova m0, [%2+ 3*%3] ; in3 1664 mova m2, [%2+13*%3] ; in13 1665 mova m4, [%2+19*%3] ; in19 1666 mova m5, [%2+29*%3] ; in29 1667 SUMSUB_MUL 2, 4, 6, 7, 13160, 9760 ; m2=t25a, m4=t22a 1668 SUMSUB_MUL 5, 0, 6, 7, 2404, 16207 ; m5=t24a, m0=t23a 1669 SUMSUB_BA d, 4, 0, 6 ; m4=t23, m0=t22 1670 SUMSUB_BA d, 2, 5, 6 ; m2=t24, m5=t25 1671 SUMSUB_MUL 5, 0, 6, 7, 13623, m9102 ; m5=t25a, m0=t22a 1672 1673 ; end of stage 1-3 fourth quart 1674 1675 SUMSUB_BA d, 1, 4, 6 ; m1=t23a, m4=t20a 1676 SUMSUB_BA d, 3, 0, 6 ; m3=t22, m0=t21 1677 UNSCRATCH 6, 8, rsp+275*mmsize ; t26a 1678 UNSCRATCH 7, 9, rsp+276*mmsize ; t27 1679 SCRATCH 3, 8, rsp+275*mmsize 1680 SCRATCH 1, 9, rsp+276*mmsize 1681 SUMSUB_BA d, 7, 2, 1 ; m7=t24a, m2=t27a 1682 SUMSUB_BA d, 6, 5, 1 ; m6=t25, m5=t26 1683 SUMSUB_MUL 2, 4, 
1, 3, 6270, m15137 ; m2=t27, m4=t20 1684 SUMSUB_MUL 5, 0, 1, 3, 6270, m15137 ; m5=t26a, m0=t21a 1685 1686 ; end of stage 4-5 second half 1687 1688 UNSCRATCH 1, 12, rsp+279*mmsize ; t28 1689 UNSCRATCH 3, 13, rsp+280*mmsize ; t29a 1690 SCRATCH 4, 12, rsp+279*mmsize 1691 SCRATCH 0, 13, rsp+280*mmsize 1692 SUMSUB_BA d, 5, 3, 0 ; m5=t29, m3=t26 1693 SUMSUB_BA d, 2, 1, 0 ; m2=t28a, m1=t27a 1694 UNSCRATCH 0, 14, rsp+281*mmsize ; t30 1695 UNSCRATCH 4, 15, rsp+282*mmsize ; t31a 1696 SCRATCH 2, 14, rsp+281*mmsize 1697 SCRATCH 5, 15, rsp+282*mmsize 1698 SUMSUB_BA d, 6, 0, 2 ; m6=t30a, m0=t25a 1699 SUMSUB_BA d, 7, 4, 2 ; m7=t31, m4=t24 1700 1701 mova m2, [rsp+273*mmsize] ; t16a 1702 mova m5, [rsp+274*mmsize] ; t17 1703 mova [rsp+273*mmsize], m6 1704 mova [rsp+274*mmsize], m7 1705 UNSCRATCH 6, 10, rsp+277*mmsize ; t18a 1706 UNSCRATCH 7, 11, rsp+278*mmsize ; t19 1707 SCRATCH 4, 10, rsp+277*mmsize 1708 SCRATCH 0, 11, rsp+278*mmsize 1709 UNSCRATCH 4, 12, rsp+279*mmsize ; t20 1710 UNSCRATCH 0, 13, rsp+280*mmsize ; t21a 1711 SCRATCH 3, 12, rsp+279*mmsize 1712 SCRATCH 1, 13, rsp+280*mmsize 1713 SUMSUB_BA d, 0, 6, 1 ; m0=t18, m6=t21 1714 SUMSUB_BA d, 4, 7, 1 ; m4=t19a, m7=t20a 1715 UNSCRATCH 3, 8, rsp+275*mmsize ; t22 1716 UNSCRATCH 1, 9, rsp+276*mmsize ; t23a 1717 SCRATCH 0, 8, rsp+275*mmsize 1718 SCRATCH 4, 9, rsp+276*mmsize 1719 SUMSUB_BA d, 3, 5, 0 ; m3=t17a, m5=t22a 1720 SUMSUB_BA d, 1, 2, 0 ; m1=t16, m2=t23 1721 1722 ; end of stage 6 1723 1724 UNSCRATCH 0, 10, rsp+277*mmsize ; t24 1725 UNSCRATCH 4, 11, rsp+278*mmsize ; t25a 1726 SCRATCH 1, 10, rsp+277*mmsize 1727 SCRATCH 3, 11, rsp+278*mmsize 1728 SUMSUB_MUL 0, 2, 1, 3, 11585, 11585 ; m0=t24a, m2=t23a 1729 SUMSUB_MUL 4, 5, 1, 3, 11585, 11585 ; m4=t25, m5=t22 1730 UNSCRATCH 1, 12, rsp+279*mmsize ; t26 1731 UNSCRATCH 3, 13, rsp+280*mmsize ; t27a 1732 SCRATCH 0, 12, rsp+279*mmsize 1733 SCRATCH 4, 13, rsp+280*mmsize 1734 SUMSUB_MUL 3, 7, 0, 4, 11585, 11585 ; m3=t27, m7=t20 1735 SUMSUB_MUL 1, 6, 0, 4, 11585, 11585 ; m1=t26a, m6=t21a 1736 1737 ; end of stage 7 1738 1739 mova m0, [rsp+269*mmsize] ; t8 1740 mova m4, [rsp+270*mmsize] ; t9a 1741 mova [rsp+269*mmsize], m1 ; t26a 1742 mova [rsp+270*mmsize], m3 ; t27 1743 mova m3, [rsp+271*mmsize] ; t10 1744 SUMSUB_BA d, 2, 0, 1 ; m2=out8, m0=out23 1745 SUMSUB_BA d, 5, 4, 1 ; m5=out9, m4=out22 1746 SUMSUB_BA d, 6, 3, 1 ; m6=out10, m3=out21 1747 mova m1, [rsp+272*mmsize] ; t11a 1748 mova [rsp+271*mmsize], m0 1749 SUMSUB_BA d, 7, 1, 0 ; m7=out11, m1=out20 1750 1751%if %1 == 1 1752 TRANSPOSE4x4D 2, 5, 6, 7, 0 1753 mova [ptrq+ 2*mmsize], m2 1754 mova [ptrq+10*mmsize], m5 1755 mova [ptrq+18*mmsize], m6 1756 mova [ptrq+26*mmsize], m7 1757%else ; %1 == 2 1758 pxor m0, m0 1759 lea dstq, [dstq+strideq*8] 1760 ROUND_AND_STORE_4x4 2, 5, 6, 7, m0, [rsp+256*mmsize], [pd_32], 6 1761%endif 1762 mova m2, [rsp+271*mmsize] 1763%if %1 == 1 1764 TRANSPOSE4x4D 1, 3, 4, 2, 0 1765 mova [ptrq+ 5*mmsize], m1 1766 mova [ptrq+13*mmsize], m3 1767 mova [ptrq+21*mmsize], m4 1768 mova [ptrq+29*mmsize], m2 1769%else ; %1 == 2 1770 lea dstq, [dstq+stride3q*4] 1771 ROUND_AND_STORE_4x4 1, 3, 4, 2, m0, [rsp+256*mmsize], [pd_32], 6 1772%endif 1773 1774 ; end of last stage + store for out8-11 and out20-23 1775 1776 UNSCRATCH 0, 9, rsp+276*mmsize ; t19a 1777 UNSCRATCH 1, 8, rsp+275*mmsize ; t18 1778 UNSCRATCH 2, 11, rsp+278*mmsize ; t17a 1779 UNSCRATCH 3, 10, rsp+277*mmsize ; t16 1780 mova m7, [rsp+261*mmsize] ; t12a 1781 mova m6, [rsp+262*mmsize] ; t13 1782 mova m5, [rsp+263*mmsize] ; t14a 1783 SUMSUB_BA d, 0, 7, 4 ; m0=out12, m7=out19 1784 SUMSUB_BA d, 
1, 6, 4 ; m1=out13, m6=out18 1785 SUMSUB_BA d, 2, 5, 4 ; m2=out14, m5=out17 1786 mova m4, [rsp+264*mmsize] ; t15 1787 SCRATCH 7, 8, rsp+275*mmsize 1788 SUMSUB_BA d, 3, 4, 7 ; m3=out15, m4=out16 1789 1790%if %1 == 1 1791 TRANSPOSE4x4D 0, 1, 2, 3, 7 1792 mova [ptrq+ 3*mmsize], m0 1793 mova [ptrq+11*mmsize], m1 1794 mova [ptrq+19*mmsize], m2 1795 mova [ptrq+27*mmsize], m3 1796%else ; %1 == 2 1797%if ARCH_X86_64 1798 SWAP 7, 9 1799 lea dstq, [dstbakq+stride3q*4] 1800%else ; x86-32 1801 pxor m7, m7 1802 mov dstq, dstm 1803 lea dstq, [dstq+stride3q*4] 1804%endif 1805 ROUND_AND_STORE_4x4 0, 1, 2, 3, m7, [rsp+256*mmsize], [pd_32], 6 1806%endif 1807 UNSCRATCH 0, 8, rsp+275*mmsize ; out19 1808%if %1 == 1 1809 TRANSPOSE4x4D 4, 5, 6, 0, 7 1810 mova [ptrq+ 4*mmsize], m4 1811 mova [ptrq+12*mmsize], m5 1812 mova [ptrq+20*mmsize], m6 1813 mova [ptrq+28*mmsize], m0 1814%else ; %1 == 2 1815 lea dstq, [dstq+strideq*4] 1816 ROUND_AND_STORE_4x4 4, 5, 6, 0, m7, [rsp+256*mmsize], [pd_32], 6 1817%endif 1818 1819 ; end of last stage + store for out12-19 1820 1821%if ARCH_X86_64 1822 SWAP 7, 8 1823%endif 1824 mova m7, [rsp+257*mmsize] ; t0 1825 mova m6, [rsp+258*mmsize] ; t1 1826 mova m5, [rsp+259*mmsize] ; t2 1827 mova m4, [rsp+260*mmsize] ; t3 1828 mova m0, [rsp+274*mmsize] ; t31 1829 mova m1, [rsp+273*mmsize] ; t30a 1830 UNSCRATCH 2, 15, rsp+282*mmsize ; t29 1831 SUMSUB_BA d, 0, 7, 3 ; m0=out0, m7=out31 1832 SUMSUB_BA d, 1, 6, 3 ; m1=out1, m6=out30 1833 SUMSUB_BA d, 2, 5, 3 ; m2=out2, m5=out29 1834 SCRATCH 0, 9, rsp+276*mmsize 1835 UNSCRATCH 3, 14, rsp+281*mmsize ; t28a 1836 SUMSUB_BA d, 3, 4, 0 ; m3=out3, m4=out28 1837 1838%if %1 == 1 1839 TRANSPOSE4x4D 4, 5, 6, 7, 0 1840 mova [ptrq+ 7*mmsize], m4 1841 mova [ptrq+15*mmsize], m5 1842 mova [ptrq+23*mmsize], m6 1843 mova [ptrq+31*mmsize], m7 1844%else ; %1 == 2 1845%if ARCH_X86_64 1846 SWAP 0, 8 1847%else ; x86-32 1848 pxor m0, m0 1849%endif 1850 lea dstq, [dstq+stride3q*4] 1851 ROUND_AND_STORE_4x4 4, 5, 6, 7, m0, [rsp+256*mmsize], [pd_32], 6 1852%endif 1853 UNSCRATCH 7, 9, rsp+276*mmsize ; out0 1854%if %1 == 1 1855 TRANSPOSE4x4D 7, 1, 2, 3, 0 1856 mova [ptrq+ 0*mmsize], m7 1857 mova [ptrq+ 8*mmsize], m1 1858 mova [ptrq+16*mmsize], m2 1859 mova [ptrq+24*mmsize], m3 1860%else ; %1 == 2 1861%if ARCH_X86_64 1862 DEFINE_ARGS dstbak, stride, block, cnt, ptr, stride3, dst 1863%else ; x86-32 1864 mov dstq, dstm 1865%endif 1866 ROUND_AND_STORE_4x4 7, 1, 2, 3, m0, [rsp+256*mmsize], [pd_32], 6 1867%if ARCH_X86_64 1868 DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak 1869%endif 1870%endif 1871 1872 ; end of last stage + store for out0-3 and out28-31 1873 1874%if ARCH_X86_64 1875 SWAP 0, 8 1876%endif 1877 mova m7, [rsp+265*mmsize] ; t4 1878 mova m6, [rsp+266*mmsize] ; t5a 1879 mova m5, [rsp+267*mmsize] ; t6a 1880 mova m4, [rsp+268*mmsize] ; t7 1881 mova m0, [rsp+270*mmsize] ; t27 1882 mova m1, [rsp+269*mmsize] ; t26a 1883 UNSCRATCH 2, 13, rsp+280*mmsize ; t25 1884 SUMSUB_BA d, 0, 7, 3 ; m0=out4, m7=out27 1885 SUMSUB_BA d, 1, 6, 3 ; m1=out5, m6=out26 1886 SUMSUB_BA d, 2, 5, 3 ; m2=out6, m5=out25 1887 UNSCRATCH 3, 12, rsp+279*mmsize ; t24a 1888 SCRATCH 7, 9, rsp+276*mmsize 1889 SUMSUB_BA d, 3, 4, 7 ; m3=out7, m4=out24 1890 1891%if %1 == 1 1892 TRANSPOSE4x4D 0, 1, 2, 3, 7 1893 mova [ptrq+ 1*mmsize], m0 1894 mova [ptrq+ 9*mmsize], m1 1895 mova [ptrq+17*mmsize], m2 1896 mova [ptrq+25*mmsize], m3 1897%else ; %1 == 2 1898%if ARCH_X86_64 1899 SWAP 7, 8 1900 lea dstq, [dstbakq+strideq*4] 1901%else ; x86-32 1902 pxor m7, m7 1903 lea dstq, [dstq+strideq*4] 1904%endif 1905 
ROUND_AND_STORE_4x4 0, 1, 2, 3, m7, [rsp+256*mmsize], [pd_32], 6 1906%endif 1907 UNSCRATCH 0, 9, rsp+276*mmsize ; out27 1908%if %1 == 1 1909 TRANSPOSE4x4D 4, 5, 6, 0, 7 1910 mova [ptrq+ 6*mmsize], m4 1911 mova [ptrq+14*mmsize], m5 1912 mova [ptrq+22*mmsize], m6 1913 mova [ptrq+30*mmsize], m0 1914%else ; %1 == 2 1915%if ARCH_X86_64 1916 lea dstq, [dstbakq+stride3q*8] 1917%else 1918 mov dstq, dstm 1919 lea dstq, [dstq+stride3q*8] 1920%endif 1921 ROUND_AND_STORE_4x4 4, 5, 6, 0, m7, [rsp+256*mmsize], [pd_32], 6 1922%endif 1923 1924 ; end of last stage + store for out4-7 and out24-27 1925%endmacro 1926 1927INIT_XMM sse2 1928cglobal vp9_idct_idct_32x32_add_10, 4, 6 + ARCH_X86_64, 16, \ 1929 275 * mmsize + ARCH_X86_32 * 8 * mmsize, \ 1930 dst, stride, block, eob 1931 mova m0, [pw_1023] 1932 cmp eobd, 1 1933 jg .idctfull 1934 1935 ; dc-only - the 10bit version can be done entirely in 32bit, since the max 1936 ; coef values are 17+sign bit, and the coef is 14bit, so 31+sign easily 1937 ; fits in 32bit 1938 DEFINE_ARGS dst, stride, block, coef 1939 pxor m2, m2 1940 DC_ONLY 6, m2 1941 movd m1, coefd 1942 pshuflw m1, m1, q0000 1943 punpcklqdq m1, m1 1944 DEFINE_ARGS dst, stride, cnt 1945 mov cntd, 32 1946.loop_dc: 1947 STORE_2x8 3, 4, 1, m2, m0, dstq, mmsize 1948 STORE_2x8 3, 4, 1, m2, m0, dstq+mmsize*2, mmsize 1949 add dstq, strideq 1950 dec cntd 1951 jg .loop_dc 1952 RET 1953 1954.idctfull: 1955 mova [rsp+256*mmsize], m0 1956 DEFINE_ARGS dst, stride, block, cnt, ptr, skip, dstbak 1957%if ARCH_X86_64 1958 mov dstbakq, dstq 1959 movsxd cntq, cntd 1960%endif 1961%ifdef PIC 1962 lea ptrq, [default_32x32] 1963 movzx cntd, byte [ptrq+cntq-1] 1964%else 1965 movzx cntd, byte [default_32x32+cntq-1] 1966%endif 1967 mov skipd, 8 1968 sub skipd, cntd 1969 mov ptrq, rsp 1970.loop_1: 1971 IDCT32_1D 1, blockq 1972 1973 add ptrq, 32 * mmsize 1974 add blockq, mmsize 1975 dec cntd 1976 jg .loop_1 1977 1978 ; zero-pad the remainder (skipped cols) 1979 test skipd, skipd 1980 jz .end 1981 shl skipd, 2 1982 lea blockq, [blockq+skipq*(mmsize/4)] 1983 pxor m0, m0 1984.loop_z: 1985 mova [ptrq+mmsize*0], m0 1986 mova [ptrq+mmsize*1], m0 1987 mova [ptrq+mmsize*2], m0 1988 mova [ptrq+mmsize*3], m0 1989 mova [ptrq+mmsize*4], m0 1990 mova [ptrq+mmsize*5], m0 1991 mova [ptrq+mmsize*6], m0 1992 mova [ptrq+mmsize*7], m0 1993 add ptrq, 8 * mmsize 1994 dec skipd 1995 jg .loop_z 1996.end: 1997 1998 DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak 1999 lea stride3q, [strideq*3] 2000 mov cntd, 8 2001 mov ptrq, rsp 2002.loop_2: 2003 IDCT32_1D 2, ptrq 2004 2005 add ptrq, mmsize 2006%if ARCH_X86_64 2007 add dstbakq, 8 2008 mov dstq, dstbakq 2009%else 2010 add dword dstm, 8 2011 mov dstq, dstm 2012%endif 2013 dec cntd 2014 jg .loop_2 2015 2016 ; m7 is still zero 2017 ZERO_BLOCK blockq-8*mmsize, 128, 32, m7 2018 RET 2019 2020INIT_XMM sse2 2021cglobal vp9_idct_idct_32x32_add_12, 4, 6 + ARCH_X86_64, 16, \ 2022 275 * mmsize + ARCH_X86_32 * 8 * mmsize, \ 2023 dst, stride, block, eob 2024 mova m0, [pw_4095] 2025 cmp eobd, 1 2026 jg mangle(private_prefix %+ _ %+ vp9_idct_idct_32x32_add_10 %+ SUFFIX).idctfull 2027 2028 ; dc-only - unfortunately, this one can overflow, since coefs are 19+sign 2029 ; bpp, and 19+14+sign does not fit in 32bit, so we do 2-stage multiplies 2030 DEFINE_ARGS dst, stride, block, coef, coefl 2031 pxor m2, m2 2032 DC_ONLY_64BIT 6, m2 2033 movd m1, coefd 2034 pshuflw m1, m1, q0000 2035 punpcklqdq m1, m1 2036 DEFINE_ARGS dst, stride, cnt 2037 mov cntd, 32 2038.loop_dc: 2039 STORE_2x8 3, 4, 1, m2, m0, dstq, mmsize 2040 
STORE_2x8 3, 4, 1, m2, m0, dstq+mmsize*2, mmsize 2041 add dstq, strideq 2042 dec cntd 2043 jg .loop_dc 2044 RET 2045
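; For reference, the overflow-safe dc-only computation done by DC_ONLY_64BIT
; above (used for the 12 bpp 8x8/16x16/32x32 cases, where coef * 11585 no
; longer fits in 32 bits) boils down to the C sketch below on x86-32; the
; helper name is made up and this is not part of the build (it assumes
; arithmetic right shift, as sar does). On x86-64 the same value is computed
; directly with 64-bit imul instead.
;
;   // (coef * 11585 + 8192) >> 14, evaluated without 32-bit overflow
;   static int mul_11585_shr14(int coef)
;   {
;       return (coef >> 14) * 11585 + (((coef & 0x3fff) * 11585 + 8192) >> 14);
;   }
;
;   // dc-only output value; shift = 5 for 8x8, 6 for 16x16 and 32x32
;   // int dc = mul_11585_shr14(mul_11585_shr14(block[0]));
;   // dc = (dc + (1 << (shift - 1))) >> shift;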