1;******************************************************************************
2;* VP9 inverse transform x86 SIMD optimizations
3;*
4;* Copyright (C) 2015 Ronald S. Bultje <rsbultje gmail com>
5;*
6;* This file is part of FFmpeg.
7;*
8;* FFmpeg is free software; you can redistribute it and/or
9;* modify it under the terms of the GNU Lesser General Public
10;* License as published by the Free Software Foundation; either
11;* version 2.1 of the License, or (at your option) any later version.
12;*
13;* FFmpeg is distributed in the hope that it will be useful,
14;* but WITHOUT ANY WARRANTY; without even the implied warranty of
15;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16;* Lesser General Public License for more details.
17;*
18;* You should have received a copy of the GNU Lesser General Public
19;* License along with FFmpeg; if not, write to the Free Software
20;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21;******************************************************************************
22
23%include "libavutil/x86/x86util.asm"
24%include "vp9itxfm_template.asm"
25
26SECTION_RODATA
27
28cextern pw_8
29cextern pw_1023
30cextern pw_2048
31cextern pw_4095
32cextern pw_m1
33cextern pd_1
34cextern pd_16
35cextern pd_32
36cextern pd_8192
37
38pd_8: times 4 dd 8
39pd_3fff: times 4 dd 0x3fff
40
41cextern pw_11585x2
42
43cextern pw_5283_13377
44cextern pw_9929_13377
45cextern pw_15212_m13377
46cextern pw_15212_9929
47cextern pw_m5283_m15212
48cextern pw_13377x2
49cextern pw_m13377_13377
50cextern pw_13377_0
51
52pw_9929_m5283: times 4 dw 9929, -5283
53
54%macro COEF_PAIR 2-3
55cextern pw_m%1_%2
56cextern pw_%2_%1
57%if %0 == 3
58cextern pw_m%1_m%2
59%if %1 != %2
60cextern pw_m%2_%1
61cextern pw_%1_%2
62%endif
63%endif
64%endmacro
65
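; The values paired below appear to be the usual 14-bit fixed-point DCT
; twiddles, cospi_N_64 = round(16384 * cos(N*pi/64)), declared as interleaved
; word constants; each COEF_PAIR is a (cos, sin) pair (cospi_N_64 together with
; cospi_(32-N)_64), e.g. 11585 = round(16384 * cos(pi/4)), and 3196/16069 are
; cospi_28_64/cospi_4_64.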
66COEF_PAIR  2404, 16207
67COEF_PAIR  3196, 16069, 1
68COEF_PAIR  4756, 15679
69COEF_PAIR  5520, 15426
70COEF_PAIR  6270, 15137, 1
71COEF_PAIR  8423, 14053
72COEF_PAIR 10394, 12665
73COEF_PAIR 11003, 12140
74COEF_PAIR 11585, 11585, 1
75COEF_PAIR 13160,  9760
76COEF_PAIR 13623,  9102, 1
77COEF_PAIR 14449,  7723
78COEF_PAIR 14811,  7005
79COEF_PAIR 15893,  3981
80COEF_PAIR 16305,  1606
81COEF_PAIR 16364,   804
82
83default_8x8:
84times 12 db 1
85times 52 db 2
86row_8x8:
87times 18 db 1
88times 46 db 2
89col_8x8:
90times 6 db 1
91times 58 db 2
92default_16x16:
93times 10 db 1
94times 28 db 2
95times 51 db 3
96times 167 db 4
97row_16x16:
98times 21 db 1
99times 45 db 2
100times 60 db 3
101times 130 db 4
102col_16x16:
103times 5 db 1
104times 12 db 2
105times 25 db 3
106times 214 db 4
107default_32x32:
108times 9 db 1
109times 25 db 2
110times 36 db 3
111times 65 db 4
112times 105 db 5
113times 96 db 6
114times 112 db 7
115times 576 db 8
116
117SECTION .text
118
119%macro VP9_STORE_2X 6-7 dstq ; reg1, reg2, tmp1, tmp2, min, max, dst
120    mova               m%3, [%7]
121    mova               m%4, [%7+strideq]
122    paddw              m%3, m%1
123    paddw              m%4, m%2
124    pmaxsw             m%3, m%5
125    pmaxsw             m%4, m%5
126    pminsw             m%3, m%6
127    pminsw             m%4, m%6
128    mova              [%7], m%3
129    mova      [%7+strideq], m%4
130%endmacro
131
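; zero an nnzc x nnzc block of dword coefficients: %3 rows of %3*4 bytes each,
; with consecutive rows %2 bytes apart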
132%macro ZERO_BLOCK 4 ; mem, stride, nnzcpl, zero_reg
133%assign %%y 0
134%rep %3
135%assign %%x 0
136%rep %3*4/mmsize
137    mova      [%1+%%y+%%x], %4
138%assign %%x (%%x+mmsize)
139%endrep
140%assign %%y (%%y+%2)
141%endrep
142%endmacro
143
; the input coefficients are scaled up by 2 bits (which we downscale immediately
; in the iwht), and are otherwise orthonormally increased by 1 bit per iwht_1d.
; therefore, a diff of 10-12+sign bits will fit in 12-14+sign bits after scaling,
; i.e. everything can be done in 15+sign bit words. Since the quant fractional
; bits add 2 bits, in 12bpp we need to scale before converting to words, since
; the input will be 16+sign bits, which doesn't fit in 15+sign words; in 10bpp
; we can scale after converting to words (which takes half as many instructions),
; since the input is only 14+sign bits, which fits in 15+sign words directly.
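; (worked bit budget, following the note above: for 12bpp the incoming
; coefficients are roughly 12+2(quant)+2(scale)+sign = 16+sign bits, hence the
; psrad-on-dwords path before packssdw in IWHT4_FN below; for 10bpp they are
; 10+2+2+sign = 14+sign bits, so they can be packed to words first and shifted
; with psraw afterwards)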
152
153%macro IWHT4_FN 2 ; bpp, max
154cglobal vp9_iwht_iwht_4x4_add_%1, 3, 3, 8, dst, stride, block, eob
155    mova                m7, [pw_%2]
156    mova                m0, [blockq+0*16+0]
157    mova                m1, [blockq+1*16+0]
158%if %1 >= 12
159    mova                m4, [blockq+0*16+8]
160    mova                m5, [blockq+1*16+8]
161    psrad               m0, 2
162    psrad               m1, 2
163    psrad               m4, 2
164    psrad               m5, 2
165    packssdw            m0, m4
166    packssdw            m1, m5
167%else
168    packssdw            m0, [blockq+0*16+8]
169    packssdw            m1, [blockq+1*16+8]
170    psraw               m0, 2
171    psraw               m1, 2
172%endif
173    mova                m2, [blockq+2*16+0]
174    mova                m3, [blockq+3*16+0]
175%if %1 >= 12
176    mova                m4, [blockq+2*16+8]
177    mova                m5, [blockq+3*16+8]
178    psrad               m2, 2
179    psrad               m3, 2
180    psrad               m4, 2
181    psrad               m5, 2
182    packssdw            m2, m4
183    packssdw            m3, m5
184%else
185    packssdw            m2, [blockq+2*16+8]
186    packssdw            m3, [blockq+3*16+8]
187    psraw               m2, 2
188    psraw               m3, 2
189%endif
190
191    VP9_IWHT4_1D
192    TRANSPOSE4x4W        0, 1, 2, 3, 4
193    VP9_IWHT4_1D
194
195    pxor                m6, m6
196    VP9_STORE_2X         0, 1, 4, 5, 6, 7
197    lea               dstq, [dstq+strideq*2]
198    VP9_STORE_2X         2, 3, 4, 5, 6, 7
199    ZERO_BLOCK      blockq, 16, 4, m6
200    RET
201%endmacro
202
203INIT_MMX mmxext
204IWHT4_FN 10, 1023
205INIT_MMX mmxext
206IWHT4_FN 12, 4095
207
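; round the second-pass idct output with (x + 8) >> 4 before storing; the SSSE3
; path does this as pmulhrsw with pw_2048, since (x*2048 + (1<<14)) >> 15 is
; equivalent to (x + 8) >> 4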
208%macro VP9_IDCT4_WRITEOUT 0
209%if cpuflag(ssse3)
210    mova                m5, [pw_2048]
211    pmulhrsw            m0, m5
212    pmulhrsw            m1, m5
213    pmulhrsw            m2, m5
214    pmulhrsw            m3, m5
215%else
216    mova                m5, [pw_8]
217    paddw               m0, m5
218    paddw               m1, m5
219    paddw               m2, m5
220    paddw               m3, m5
221    psraw               m0, 4
222    psraw               m1, 4
223    psraw               m2, 4
224    psraw               m3, 4
225%endif
226    mova                m5, [pw_1023]
227    VP9_STORE_2X         0,  1,  6,  7,  4,  5
228    lea               dstq, [dstq+2*strideq]
229    VP9_STORE_2X         2,  3,  6,  7,  4,  5
230%endmacro
231
232%macro DC_ONLY 2 ; shift, zero
233    mov              coefd, dword [blockq]
234    movd          [blockq], %2
235    imul             coefd, 11585
236    add              coefd, 8192
237    sar              coefd, 14
238    imul             coefd, 11585
239    add              coefd, ((1 << (%1 - 1)) << 14) + 8192
240    sar              coefd, 14 + %1
241%endmacro
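; For reference, the arithmetic of the macro above in rough C (an illustrative
; sketch only; dc_only() is a hypothetical helper, and the clearing of the DC
; coefficient in blockq is omitted):
;
;   static int dc_only(int coef, int shift)
;   {
;       coef = (coef * 11585 + 8192) >> 14;   // dc *= cospi_16_64, rounded
;       // second multiply; the final (x + (1 << (shift - 1))) >> shift rounding
;       // is folded into the same shift by pre-scaling its bias by << 14
;       return (coef * 11585 + ((1 << (shift - 1)) << 14) + 8192) >> (14 + shift);
;   }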
242
; 4x4 coefficients are 5+depth+sign bits, so for 10bpp everything still fits
; in 15+sign bit words without additional effort, since the coefficients are
; at most 5+10 = 15 bits plus sign.
245
246%macro IDCT4_10_FN 0
247cglobal vp9_idct_idct_4x4_add_10, 4, 4, 8, dst, stride, block, eob
248    cmp               eobd, 1
249    jg .idctfull
250
251    ; dc-only
252    pxor                m4, m4
253%if cpuflag(ssse3)
254    movd                m0, [blockq]
255    movd          [blockq], m4
256    mova                m5, [pw_11585x2]
257    pmulhrsw            m0, m5
258    pmulhrsw            m0, m5
259%else
260    DEFINE_ARGS dst, stride, block, coef
261    DC_ONLY              4, m4
262    movd                m0, coefd
263%endif
264    pshufw              m0, m0, 0
265    mova                m5, [pw_1023]
266%if cpuflag(ssse3)
267    pmulhrsw            m0, [pw_2048]       ; (x*2048 + (1<<14))>>15 <=> (x+8)>>4
268%endif
269    VP9_STORE_2X         0,  0,  6,  7,  4,  5
270    lea               dstq, [dstq+2*strideq]
271    VP9_STORE_2X         0,  0,  6,  7,  4,  5
272    RET
273
274.idctfull:
275    mova                m0, [blockq+0*16+0]
276    mova                m1, [blockq+1*16+0]
277    packssdw            m0, [blockq+0*16+8]
278    packssdw            m1, [blockq+1*16+8]
279    mova                m2, [blockq+2*16+0]
280    mova                m3, [blockq+3*16+0]
281    packssdw            m2, [blockq+2*16+8]
282    packssdw            m3, [blockq+3*16+8]
283
284%if cpuflag(ssse3)
285    mova                m6, [pw_11585x2]
286%endif
287    mova                m7, [pd_8192]       ; rounding
288    VP9_IDCT4_1D
289    TRANSPOSE4x4W  0, 1, 2, 3, 4
290    VP9_IDCT4_1D
291
292    pxor                m4, m4
293    ZERO_BLOCK      blockq, 16, 4, m4
294    VP9_IDCT4_WRITEOUT
295    RET
296%endmacro
297
298INIT_MMX mmxext
299IDCT4_10_FN
300INIT_MMX ssse3
301IDCT4_10_FN
302
303%macro IADST4_FN 4
304cglobal vp9_%1_%3_4x4_add_10, 3, 3, 0, dst, stride, block, eob
305%if WIN64 && notcpuflag(ssse3)
306    WIN64_SPILL_XMM 8
307%endif
308    movdqa            xmm5, [pd_8192]
309    mova                m0, [blockq+0*16+0]
310    mova                m1, [blockq+1*16+0]
311    packssdw            m0, [blockq+0*16+8]
312    packssdw            m1, [blockq+1*16+8]
313    mova                m2, [blockq+2*16+0]
314    mova                m3, [blockq+3*16+0]
315    packssdw            m2, [blockq+2*16+8]
316    packssdw            m3, [blockq+3*16+8]
317
318%if cpuflag(ssse3)
319    mova                m6, [pw_11585x2]
320%endif
321%ifnidn %1%3, iadstiadst
322    movdq2q             m7, xmm5
323%endif
324    VP9_%2_1D
325    TRANSPOSE4x4W  0, 1, 2, 3, 4
326    VP9_%4_1D
327
328    pxor                m4, m4
329    ZERO_BLOCK      blockq, 16, 4, m4
330    VP9_IDCT4_WRITEOUT
331    RET
332%endmacro
333
334INIT_MMX sse2
335IADST4_FN idct,  IDCT4,  iadst, IADST4
336IADST4_FN iadst, IADST4, idct,  IDCT4
337IADST4_FN iadst, IADST4, iadst, IADST4
338
339INIT_MMX ssse3
340IADST4_FN idct,  IDCT4,  iadst, IADST4
341IADST4_FN iadst, IADST4, idct,  IDCT4
342IADST4_FN iadst, IADST4, iadst, IADST4
343
; inputs and outputs are dwords, coefficients are words
;
; dst1 = (src1 * coef1 + src2 * coef2 + rnd) >> 14
; dst2 = (src1 * coef2 - src2 * coef1 + rnd) >> 14
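; The dword inputs are split into their low 14 bits (masked with pd_3fff) and
; the remaining high bits (psrad 14); each half fits in a signed word, so both
; halves can be multiplied with pmaddwd and recombined, giving the effect of a
; full 32-bit multiply using only 16-bit multipliers: the low-half products are
; rounded and shifted down by 14, then added to the high-half products.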
348%macro SUMSUB_MUL 6-8 [pd_8192], [pd_3fff] ; src/dst 1-2, tmp1-2, coef1-2, rnd, mask
349    pand               m%3, m%1, %8
350    pand               m%4, m%2, %8
351    psrad              m%1, 14
352    psrad              m%2, 14
353    packssdw           m%4, m%2
354    packssdw           m%3, m%1
355    punpckhwd          m%2, m%4, m%3
356    punpcklwd          m%4, m%3
357    pmaddwd            m%3, m%4, [pw_%6_%5]
358    pmaddwd            m%1, m%2, [pw_%6_%5]
359    pmaddwd            m%4, [pw_m%5_%6]
360    pmaddwd            m%2, [pw_m%5_%6]
361    paddd              m%3, %7
362    paddd              m%4, %7
363    psrad              m%3, 14
364    psrad              m%4, 14
365    paddd              m%1, m%3
366    paddd              m%2, m%4
367%endmacro
368
369%macro IDCT4_12BPP_1D 0-8 [pd_8192], [pd_3fff], 0, 1, 2, 3, 4, 5 ; rnd, mask, in/out0-3, tmp0-1
370    SUMSUB_MUL          %3, %5, %7, %8, 11585, 11585, %1, %2
371    SUMSUB_MUL          %4, %6, %7, %8, 15137,  6270, %1, %2
372    SUMSUB_BA        d, %4, %3, %7
373    SUMSUB_BA        d, %6, %5, %7
374    SWAP                %4, %6, %3
375%endmacro
376
377%macro STORE_4x4 6 ; tmp1-2, reg1-2, min, max
378    movh               m%1, [dstq+strideq*0]
379    movh               m%2, [dstq+strideq*2]
380    movhps             m%1, [dstq+strideq*1]
381    movhps             m%2, [dstq+stride3q ]
382    paddw              m%1, m%3
383    paddw              m%2, m%4
384    pmaxsw             m%1, %5
385    pmaxsw             m%2, %5
386    pminsw             m%1, %6
387    pminsw             m%2, %6
388    movh   [dstq+strideq*0], m%1
389    movhps [dstq+strideq*1], m%1
390    movh   [dstq+strideq*2], m%2
391    movhps [dstq+stride3q ], m%2
392%endmacro
393
394%macro ROUND_AND_STORE_4x4 8 ; reg1-4, min, max, rnd, shift
395    paddd              m%1, %7
396    paddd              m%2, %7
397    paddd              m%3, %7
398    paddd              m%4, %7
399    psrad              m%1, %8
400    psrad              m%2, %8
401    psrad              m%3, %8
402    psrad              m%4, %8
403    packssdw           m%1, m%2
404    packssdw           m%3, m%4
405    STORE_4x4           %2, %4, %1, %3, %5, %6
406%endmacro
407
408INIT_XMM sse2
409cglobal vp9_idct_idct_4x4_add_12, 4, 4, 8, dst, stride, block, eob
410    cmp               eobd, 1
411    jg .idctfull
412
    ; dc-only - this is special, since for 4x4 12bpp, the max coef size is
    ; 17+sign bits. Since the multiplier is 11585, which is 14 bits, the
    ; result of each multiply is 31+sign bits, i.e. it _exactly_ fits in a
    ; dword. After the final shift (4), the result is 13+sign bits, so we
    ; don't need any additional processing to fit it in a word.
418    DEFINE_ARGS dst, stride, block, coef
419    pxor                m4, m4
420    DC_ONLY              4, m4
421    movd                m0, coefd
422    pshuflw             m0, m0, q0000
423    punpcklqdq          m0, m0
424    mova                m5, [pw_4095]
425    DEFINE_ARGS dst, stride, stride3
426    lea           stride3q, [strideq*3]
427    STORE_4x4            1, 3, 0, 0, m4, m5
428    RET
429
430.idctfull:
431    DEFINE_ARGS dst, stride, block, eob
432    mova                m0, [blockq+0*16]
433    mova                m1, [blockq+1*16]
434    mova                m2, [blockq+2*16]
435    mova                m3, [blockq+3*16]
436    mova                m6, [pd_8192]
437    mova                m7, [pd_3fff]
438
439    IDCT4_12BPP_1D      m6, m7
440    TRANSPOSE4x4D        0, 1, 2, 3, 4
441    IDCT4_12BPP_1D      m6, m7
442
443    pxor                m4, m4
444    ZERO_BLOCK      blockq, 16, 4, m4
445
446    ; writeout
447    DEFINE_ARGS dst, stride, stride3
448    lea           stride3q, [strideq*3]
449    mova                m5, [pw_4095]
450    mova                m6, [pd_8]
451    ROUND_AND_STORE_4x4  0, 1, 2, 3, m4, m5, m6, 4
452    RET
453
454%macro SCRATCH 3-4
455%if ARCH_X86_64
456    SWAP                %1, %2
457%if %0 == 4
458%define reg_%4 m%2
459%endif
460%else
461    mova              [%3], m%1
462%if %0 == 4
463%define reg_%4 [%3]
464%endif
465%endif
466%endmacro
467
468%macro UNSCRATCH 3-4
469%if ARCH_X86_64
470    SWAP                %1, %2
471%else
472    mova               m%1, [%3]
473%endif
474%if %0 == 4
475%undef reg_%4
476%endif
477%endmacro
478
479%macro PRELOAD 2-3
480%if ARCH_X86_64
481    mova               m%1, [%2]
482%if %0 == 3
483%define reg_%3 m%1
484%endif
485%elif %0 == 3
486%define reg_%3 [%2]
487%endif
488%endmacro
489
; out0 = ( 5283 * in0 + 13377 * in1 + 15212 * in2 +  9929 * in3 + rnd) >> 14
; out1 = ( 9929 * in0 + 13377 * in1 -  5283 * in2 - 15212 * in3 + rnd) >> 14
; out2 = (13377 * in0               - 13377 * in2 + 13377 * in3 + rnd) >> 14
; out3 = (15212 * in0 - 13377 * in1 +  9929 * in2 -  5283 * in3 + rnd) >> 14
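; (the 5283/9929/13377/15212 factors appear to be the VP9 iadst4 sinpi
;  constants, i.e. round(16384 * sqrt(2) * 2/3 * sin(k*pi/9)) for k = 1..4)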
494%macro IADST4_12BPP_1D 0-2 [pd_8192], [pd_3fff] ; rnd, mask
495    pand                m4, m0, %2
496    pand                m5, m1, %2
497    psrad               m0, 14
498    psrad               m1, 14
499    packssdw            m5, m1
500    packssdw            m4, m0
501    punpckhwd           m1, m4, m5
502    punpcklwd           m4, m5
503    pand                m5, m2, %2
504    pand                m6, m3, %2
505    psrad               m2, 14
506    psrad               m3, 14
507    packssdw            m6, m3
508    packssdw            m5, m2
509    punpckhwd           m3, m5, m6
510    punpcklwd           m5, m6
511    SCRATCH              1,  8, rsp+0*mmsize, a
512    SCRATCH              5,  9, rsp+1*mmsize, b
513
514    ; m1/3 have the high bits of 0,1,2,3
515    ; m4/5 have the low bits of 0,1,2,3
516    ; m0/2/6/7 are free
517
518    mova                m2, [pw_15212_9929]
519    mova                m0, [pw_5283_13377]
520    pmaddwd             m7, m2, reg_b
521    pmaddwd             m6, m4, m0
522    pmaddwd             m2, m3
523    pmaddwd             m0, reg_a
524    paddd               m6, m7
525    paddd               m0, m2
526    mova                m1, [pw_m13377_13377]
527    mova                m5, [pw_13377_0]
528    pmaddwd             m7, m1, reg_b
529    pmaddwd             m2, m4, m5
530    pmaddwd             m1, m3
531    pmaddwd             m5, reg_a
532    paddd               m2, m7
533    paddd               m1, m5
534    paddd               m6, %1
535    paddd               m2, %1
536    psrad               m6, 14
537    psrad               m2, 14
538    paddd               m0, m6                      ; t0
539    paddd               m2, m1                      ; t2
540
541    mova                m7, [pw_m5283_m15212]
542    mova                m5, [pw_9929_13377]
543    pmaddwd             m1, m7, reg_b
544    pmaddwd             m6, m4, m5
545    pmaddwd             m7, m3
546    pmaddwd             m5, reg_a
547    paddd               m6, m1
548    paddd               m7, m5
549    UNSCRATCH            5,  9, rsp+1*mmsize, b
550    pmaddwd             m5, [pw_9929_m5283]
551    pmaddwd             m4, [pw_15212_m13377]
552    pmaddwd             m3, [pw_9929_m5283]
553    UNSCRATCH            1,  8, rsp+0*mmsize, a
554    pmaddwd             m1, [pw_15212_m13377]
555    paddd               m4, m5
556    paddd               m3, m1
557    paddd               m6, %1
558    paddd               m4, %1
559    psrad               m6, 14
560    psrad               m4, 14
561    paddd               m7, m6                      ; t1
562    paddd               m3, m4                      ; t3
563
564    SWAP                 1, 7
565%endmacro
566
567%macro IADST4_12BPP_FN 4
568cglobal vp9_%1_%3_4x4_add_12, 3, 3, 12, 2 * ARCH_X86_32 * mmsize, dst, stride, block, eob
569    mova                m0, [blockq+0*16]
570    mova                m1, [blockq+1*16]
571    mova                m2, [blockq+2*16]
572    mova                m3, [blockq+3*16]
573
574    PRELOAD             10, pd_8192, rnd
575    PRELOAD             11, pd_3fff, mask
576    %2_12BPP_1D    reg_rnd, reg_mask
577    TRANSPOSE4x4D        0, 1, 2, 3, 4
578    %4_12BPP_1D    reg_rnd, reg_mask
579
580    pxor                m4, m4
581    ZERO_BLOCK      blockq, 16, 4, m4
582
583    ; writeout
584    DEFINE_ARGS dst, stride, stride3
585    lea           stride3q, [strideq*3]
586    mova                m5, [pw_4095]
587    mova                m6, [pd_8]
588    ROUND_AND_STORE_4x4  0, 1, 2, 3, m4, m5, m6, 4
589    RET
590%endmacro
591
592INIT_XMM sse2
593IADST4_12BPP_FN idct,  IDCT4,  iadst, IADST4
594IADST4_12BPP_FN iadst, IADST4, idct,  IDCT4
595IADST4_12BPP_FN iadst, IADST4, iadst, IADST4
596
; note that the following line is deliberately not executed at the end of this
; macro; callers restore the spilled register themselves when they need it:
; UNSCRATCH            6, 8, rsp+(%5+0)*mmsize
599%macro IDCT8_1D 1-5 [pd_8192], [pd_3fff], 2 * mmsize, 17 ; src, rnd, mask, src_stride, stack_offset
600    mova                m0, [%1+0*%4]
601    mova                m2, [%1+2*%4]
602    mova                m4, [%1+4*%4]
603    mova                m6, [%1+6*%4]
604    IDCT4_12BPP_1D      %2, %3, 0, 2, 4, 6, 1, 3            ; m0/2/4/6 have t0/1/2/3
605    SCRATCH              4, 8, rsp+(%5+0)*mmsize
606    SCRATCH              6, 9, rsp+(%5+1)*mmsize
607    mova                m1, [%1+1*%4]
608    mova                m3, [%1+3*%4]
609    mova                m5, [%1+5*%4]
610    mova                m7, [%1+7*%4]
611    SUMSUB_MUL           1, 7, 4, 6, 16069,  3196, %2, %3   ; m1=t7a, m7=t4a
612    SUMSUB_MUL           5, 3, 4, 6,  9102, 13623, %2, %3   ; m5=t6a, m3=t5a
613    SUMSUB_BA         d, 3, 7, 4                            ; m3=t4, m7=t5a
614    SUMSUB_BA         d, 5, 1, 4                            ; m5=t7, m1=t6a
615    SUMSUB_MUL           1, 7, 4, 6, 11585, 11585, %2, %3   ; m1=t6, m7=t5
616    SUMSUB_BA         d, 5, 0, 4                            ; m5=out0, m0=out7
617    SUMSUB_BA         d, 1, 2, 4                            ; m1=out1, m2=out6
618    UNSCRATCH            4, 8, rsp+(%5+0)*mmsize
619    UNSCRATCH            6, 9, rsp+(%5+1)*mmsize
620    SCRATCH              2, 8, rsp+(%5+0)*mmsize
621    SUMSUB_BA         d, 7, 4, 2                            ; m7=out2, m4=out5
622    SUMSUB_BA         d, 3, 6, 2                            ; m3=out3, m6=out4
623    SWAP                 0, 5, 4, 6, 2, 7
624%endmacro
625
626%macro STORE_2x8 5-7 dstq, strideq ; tmp1-2, reg, min, max
627    mova               m%1, [%6+%7*0]
628    mova               m%2, [%6+%7*1]
629    paddw              m%1, m%3
630    paddw              m%2, m%3
631    pmaxsw             m%1, %4
632    pmaxsw             m%2, %4
633    pminsw             m%1, %5
634    pminsw             m%2, %5
635    mova         [%6+%7*0], m%1
636    mova         [%6+%7*1], m%2
637%endmacro
638
; FIXME on x86-32 we could also use the intermediate storage (rsp[0-15]) for
; temporary storage instead of allocating two more stack slots. This doesn't
; matter much, but it's something...
642INIT_XMM sse2
643cglobal vp9_idct_idct_8x8_add_10, 4, 6 + ARCH_X86_64, 14, \
644                                  16 * mmsize + 3 * ARCH_X86_32 * mmsize, \
645                                  dst, stride, block, eob
646    mova                m0, [pw_1023]
647    cmp               eobd, 1
648    jg .idctfull
649
    ; dc-only - the 10bit version can be done entirely in 32bit, since the max
    ; coef values are 16+sign bits and the multiplier (11585) is 14 bits, so
    ; the product is 30+sign bits, which easily fits in 32 bits
653    DEFINE_ARGS dst, stride, block, coef
654    pxor                m2, m2
655    DC_ONLY              5, m2
656    movd                m1, coefd
657    pshuflw             m1, m1, q0000
658    punpcklqdq          m1, m1
659    DEFINE_ARGS dst, stride, cnt
660    mov               cntd, 4
661.loop_dc:
662    STORE_2x8            3, 4, 1, m2, m0
663    lea               dstq, [dstq+strideq*2]
664    dec               cntd
665    jg .loop_dc
666    RET
667
668.idctfull:
669    SCRATCH              0, 12, rsp+16*mmsize, max
670    DEFINE_ARGS dst, stride, block, cnt, ptr, skip, dstbak
671%if ARCH_X86_64
672    mov            dstbakq, dstq
673    movsxd            cntq, cntd
674%endif
675%ifdef PIC
676    lea               ptrq, [default_8x8]
677    movzx             cntd, byte [ptrq+cntq-1]
678%else
679    movzx             cntd, byte [default_8x8+cntq-1]
680%endif
681    mov              skipd, 2
682    sub              skipd, cntd
683    mov               ptrq, rsp
684    PRELOAD             10, pd_8192, rnd
685    PRELOAD             11, pd_3fff, mask
686    PRELOAD             13, pd_16, srnd
687.loop_1:
688    IDCT8_1D        blockq, reg_rnd, reg_mask
689
690    TRANSPOSE4x4D        0, 1, 2, 3, 6
691    mova  [ptrq+ 0*mmsize], m0
692    mova  [ptrq+ 2*mmsize], m1
693    mova  [ptrq+ 4*mmsize], m2
694    mova  [ptrq+ 6*mmsize], m3
695    UNSCRATCH            6, 8, rsp+17*mmsize
696    TRANSPOSE4x4D        4, 5, 6, 7, 0
697    mova  [ptrq+ 1*mmsize], m4
698    mova  [ptrq+ 3*mmsize], m5
699    mova  [ptrq+ 5*mmsize], m6
700    mova  [ptrq+ 7*mmsize], m7
701    add               ptrq, 8 * mmsize
702    add             blockq, mmsize
703    dec               cntd
704    jg .loop_1
705
706    ; zero-pad the remainder (skipped cols)
707    test             skipd, skipd
708    jz .end
709    add              skipd, skipd
710    lea             blockq, [blockq+skipq*(mmsize/2)]
711    pxor                m0, m0
712.loop_z:
713    mova   [ptrq+mmsize*0], m0
714    mova   [ptrq+mmsize*1], m0
715    mova   [ptrq+mmsize*2], m0
716    mova   [ptrq+mmsize*3], m0
717    add               ptrq, 4 * mmsize
718    dec              skipd
719    jg .loop_z
720.end:
721
722    DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
723    lea           stride3q, [strideq*3]
724    mov               cntd, 2
725    mov               ptrq, rsp
726.loop_2:
727    IDCT8_1D          ptrq, reg_rnd, reg_mask
728
729    pxor                m6, m6
730    ROUND_AND_STORE_4x4  0, 1, 2, 3, m6, reg_max, reg_srnd, 5
731    lea               dstq, [dstq+strideq*4]
732    UNSCRATCH            0, 8, rsp+17*mmsize
733    UNSCRATCH            1, 12, rsp+16*mmsize, max
734    UNSCRATCH            2, 13, pd_16, srnd
735    ROUND_AND_STORE_4x4  4, 5, 0, 7, m6, m1, m2, 5
736    add               ptrq, 16
737%if ARCH_X86_64
738    lea               dstq, [dstbakq+8]
739%else
740    mov               dstq, dstm
741    add               dstq, 8
742%endif
743    dec               cntd
744    jg .loop_2
745
746    ; m6 is still zero
747    ZERO_BLOCK blockq-2*mmsize, 32, 8, m6
748    RET
749
750%macro DC_ONLY_64BIT 2 ; shift, zero
751%if ARCH_X86_64
752    movsxd           coefq, dword [blockq]
753    movd          [blockq], %2
754    imul             coefq, 11585
755    add              coefq, 8192
756    sar              coefq, 14
757    imul             coefq, 11585
758    add              coefq, ((1 << (%1 - 1)) << 14) + 8192
759    sar              coefq, 14 + %1
760%else
761    mov              coefd, dword [blockq]
762    movd          [blockq], %2
763    DEFINE_ARGS dst, stride, cnt, coef, coefl
764    mov               cntd, 2
765.loop_dc_calc:
766    mov             coefld, coefd
767    sar              coefd, 14
768    and             coefld, 0x3fff
769    imul             coefd, 11585
770    imul            coefld, 11585
771    add             coefld, 8192
772    sar             coefld, 14
773    add              coefd, coefld
774    dec               cntd
775    jg .loop_dc_calc
776    add              coefd, 1 << (%1 - 1)
777    sar              coefd, %1
778%endif
779%endmacro
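; Roughly equivalent C for the x86-32 fallback above (illustrative sketch only;
; mul11585_shift14() is a hypothetical helper, not defined anywhere else):
;
;   // (x * 11585 + 8192) >> 14 without a 64-bit intermediate: split x into its
;   // high part and its low 14 bits so each partial product fits in 32 bits
;   static int mul11585_shift14(int x)
;   {
;       return (x >> 14) * 11585 + (((x & 0x3fff) * 11585 + 8192) >> 14);
;   }
;
; The loop applies this twice, then rounds and shifts by %1 at the end.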
780
781INIT_XMM sse2
782cglobal vp9_idct_idct_8x8_add_12, 4, 6 + ARCH_X86_64, 14, \
783                                  16 * mmsize + 3 * ARCH_X86_32 * mmsize, \
784                                  dst, stride, block, eob
785    mova                m0, [pw_4095]
786    cmp               eobd, 1
787    jg mangle(private_prefix %+ _ %+ vp9_idct_idct_8x8_add_10 %+ SUFFIX).idctfull
788
    ; dc-only - unfortunately, this one can overflow, since coefs are 18+sign
    ; bits, and 18+14+sign does not fit in 32 bits, so we do 2-stage multiplies
791    DEFINE_ARGS dst, stride, block, coef, coefl
792    pxor                m2, m2
793    DC_ONLY_64BIT        5, m2
794    movd                m1, coefd
795    pshuflw             m1, m1, q0000
796    punpcklqdq          m1, m1
797    DEFINE_ARGS dst, stride, cnt
798    mov               cntd, 4
799.loop_dc:
800    STORE_2x8            3, 4, 1, m2, m0
801    lea               dstq, [dstq+strideq*2]
802    dec               cntd
803    jg .loop_dc
804    RET
805
806; inputs and outputs are dwords, coefficients are words
807;
808; dst1[hi]:dst3[lo] = src1 * coef1 + src2 * coef2
809; dst2[hi]:dst4[lo] = src1 * coef2 - src2 * coef1
810%macro SUMSUB_MUL_D 6-7 [pd_3fff] ; src/dst 1-2, dst3-4, coef1-2, mask
811    pand               m%3, m%1, %7
812    pand               m%4, m%2, %7
813    psrad              m%1, 14
814    psrad              m%2, 14
815    packssdw           m%4, m%2
816    packssdw           m%3, m%1
817    punpckhwd          m%2, m%4, m%3
818    punpcklwd          m%4, m%3
819    pmaddwd            m%3, m%4, [pw_%6_%5]
820    pmaddwd            m%1, m%2, [pw_%6_%5]
821    pmaddwd            m%4, [pw_m%5_%6]
822    pmaddwd            m%2, [pw_m%5_%6]
823%endmacro
824
; dst1 = (src2[hi]:src4[lo] + src1[hi]:src3[lo] + rnd) >> 14
; dst2 = (src2[hi]:src4[lo] - src1[hi]:src3[lo] + rnd) >> 14
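; (notation: a[hi]:b[lo] is a value whose high-part partial products sit in a
;  and whose low-14-bit partial products sit in b, as produced by SUMSUB_MUL_D;
;  the combined value is a + ((b + rnd) >> 14), which is what this macro forms
;  while summing/differencing the two inputs)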
827%macro SUMSUB_PACK_D 5-6 [pd_8192] ; src/dst 1-2, src3-4, tmp, rnd
828    SUMSUB_BA        d, %1, %2, %5
829    SUMSUB_BA        d, %3, %4, %5
830    paddd              m%3, %6
831    paddd              m%4, %6
832    psrad              m%3, 14
833    psrad              m%4, 14
834    paddd              m%1, m%3
835    paddd              m%2, m%4
836%endmacro
837
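; negate each dword in %1: with SSSE3, psignd by all-ones (-1 per lane) flips
; the sign directly; otherwise fall back to two's complement (invert, add 1)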
838%macro NEGD 1
839%if cpuflag(ssse3)
840    psignd              %1, [pw_m1]
841%else
842    pxor                %1, [pw_m1]
843    paddd               %1, [pd_1]
844%endif
845%endmacro
846
; note that the following line is deliberately not executed at the end of this
; macro; callers restore the spilled register themselves when they need it:
; UNSCRATCH            6, 8, rsp+17*mmsize
849%macro IADST8_1D 1-3 [pd_8192], [pd_3fff] ; src, rnd, mask
850    mova                m0, [%1+ 0*mmsize]
851    mova                m3, [%1+ 6*mmsize]
852    mova                m4, [%1+ 8*mmsize]
853    mova                m7, [%1+14*mmsize]
854    SUMSUB_MUL_D         7, 0, 1, 2, 16305,  1606, %3   ; m7/1=t0a, m0/2=t1a
855    SUMSUB_MUL_D         3, 4, 5, 6, 10394, 12665, %3   ; m3/5=t4a, m4/6=t5a
856    SCRATCH              0, 8, rsp+17*mmsize
857    SUMSUB_PACK_D        3, 7, 5, 1, 0, %2              ; m3=t0, m7=t4
858    UNSCRATCH            0, 8, rsp+17*mmsize
859    SUMSUB_PACK_D        4, 0, 6, 2, 1, %2              ; m4=t1, m0=t5
860
861    SCRATCH              3, 8, rsp+17*mmsize
862    SCRATCH              4, 9, rsp+18*mmsize
863    SCRATCH              7, 10, rsp+19*mmsize
864    SCRATCH              0, 11, rsp+20*mmsize
865
866    mova                m1, [%1+ 2*mmsize]
867    mova                m2, [%1+ 4*mmsize]
868    mova                m5, [%1+10*mmsize]
869    mova                m6, [%1+12*mmsize]
870    SUMSUB_MUL_D         5, 2, 3, 4, 14449,  7723, %3   ; m5/8=t2a, m2/9=t3a
871    SUMSUB_MUL_D         1, 6, 7, 0,  4756, 15679, %3   ; m1/10=t6a, m6/11=t7a
872    SCRATCH              2, 12, rsp+21*mmsize
873    SUMSUB_PACK_D        1, 5, 7, 3, 2, %2              ; m1=t2, m5=t6
874    UNSCRATCH            2, 12, rsp+21*mmsize
875    SUMSUB_PACK_D        6, 2, 0, 4, 3, %2              ; m6=t3, m2=t7
876
877    UNSCRATCH            7, 10, rsp+19*mmsize
878    UNSCRATCH            0, 11, rsp+20*mmsize
879    SCRATCH              1, 10, rsp+19*mmsize
880    SCRATCH              6, 11, rsp+20*mmsize
881
882    SUMSUB_MUL_D         7, 0, 3, 4, 15137,  6270, %3   ; m7/8=t4a, m0/9=t5a
883    SUMSUB_MUL_D         2, 5, 1, 6,  6270, 15137, %3   ; m2/10=t7a, m5/11=t6a
884    SCRATCH              2, 12, rsp+21*mmsize
885    SUMSUB_PACK_D        5, 7, 6, 3, 2, %2              ; m5=-out1, m7=t6
886    UNSCRATCH            2, 12, rsp+21*mmsize
887    NEGD                m5                              ; m5=out1
888    SUMSUB_PACK_D        2, 0, 1, 4, 3, %2              ; m2=out6, m0=t7
889    SUMSUB_MUL           7, 0, 3, 4, 11585, 11585, %2, %3   ; m7=out2, m0=-out5
890    NEGD                m0                              ; m0=out5
891
892    UNSCRATCH            3, 8, rsp+17*mmsize
893    UNSCRATCH            4, 9, rsp+18*mmsize
894    UNSCRATCH            1, 10, rsp+19*mmsize
895    UNSCRATCH            6, 11, rsp+20*mmsize
896    SCRATCH              2, 8, rsp+17*mmsize
897    SCRATCH              0, 9, rsp+18*mmsize
898
899    SUMSUB_BA         d, 1, 3,  2                       ; m1=out0, m3=t2
900    SUMSUB_BA         d, 6, 4,  2                       ; m6=-out7, m4=t3
901    NEGD                m6                              ; m6=out7
902    SUMSUB_MUL           3, 4,  2,  0, 11585, 11585, %2, %3 ; m3=-out3, m4=out4
903    NEGD                m3                              ; m3=out3
904
905    UNSCRATCH            0, 9, rsp+18*mmsize
906
907    SWAP                 0, 1, 5
908    SWAP                 2, 7, 6
909%endmacro
910
911%macro IADST8_FN 5
912cglobal vp9_%1_%3_8x8_add_10, 4, 6 + ARCH_X86_64, 16, \
913                              16 * mmsize + ARCH_X86_32 * 6 * mmsize, \
914                              dst, stride, block, eob
915    mova                m0, [pw_1023]
916
917.body:
918    SCRATCH              0, 13, rsp+16*mmsize, max
919    DEFINE_ARGS dst, stride, block, cnt, ptr, skip, dstbak
920%if ARCH_X86_64
921    mov            dstbakq, dstq
922    movsxd            cntq, cntd
923%endif
924%ifdef PIC
925    lea               ptrq, [%5_8x8]
926    movzx             cntd, byte [ptrq+cntq-1]
927%else
928    movzx             cntd, byte [%5_8x8+cntq-1]
929%endif
930    mov              skipd, 2
931    sub              skipd, cntd
932    mov               ptrq, rsp
933    PRELOAD             14, pd_8192, rnd
934    PRELOAD             15, pd_3fff, mask
935.loop_1:
936    %2_1D           blockq, reg_rnd, reg_mask
937
938    TRANSPOSE4x4D        0, 1, 2, 3, 6
939    mova  [ptrq+ 0*mmsize], m0
940    mova  [ptrq+ 2*mmsize], m1
941    mova  [ptrq+ 4*mmsize], m2
942    mova  [ptrq+ 6*mmsize], m3
943    UNSCRATCH            6, 8, rsp+17*mmsize
944    TRANSPOSE4x4D        4, 5, 6, 7, 0
945    mova  [ptrq+ 1*mmsize], m4
946    mova  [ptrq+ 3*mmsize], m5
947    mova  [ptrq+ 5*mmsize], m6
948    mova  [ptrq+ 7*mmsize], m7
949    add               ptrq, 8 * mmsize
950    add             blockq, mmsize
951    dec               cntd
952    jg .loop_1
953
954    ; zero-pad the remainder (skipped cols)
955    test             skipd, skipd
956    jz .end
957    add              skipd, skipd
958    lea             blockq, [blockq+skipq*(mmsize/2)]
959    pxor                m0, m0
960.loop_z:
961    mova   [ptrq+mmsize*0], m0
962    mova   [ptrq+mmsize*1], m0
963    mova   [ptrq+mmsize*2], m0
964    mova   [ptrq+mmsize*3], m0
965    add               ptrq, 4 * mmsize
966    dec              skipd
967    jg .loop_z
968.end:
969
970    DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
971    lea           stride3q, [strideq*3]
972    mov               cntd, 2
973    mov               ptrq, rsp
974.loop_2:
975    %4_1D             ptrq, reg_rnd, reg_mask
976
977    pxor                m6, m6
978    PRELOAD              9, pd_16, srnd
979    ROUND_AND_STORE_4x4  0, 1, 2, 3, m6, reg_max, reg_srnd, 5
980    lea               dstq, [dstq+strideq*4]
981    UNSCRATCH            0, 8, rsp+17*mmsize
982    UNSCRATCH            1, 13, rsp+16*mmsize, max
983    UNSCRATCH            2, 9, pd_16, srnd
984    ROUND_AND_STORE_4x4  4, 5, 0, 7, m6, m1, m2, 5
985    add               ptrq, 16
986%if ARCH_X86_64
987    lea               dstq, [dstbakq+8]
988%else
989    mov               dstq, dstm
990    add               dstq, 8
991%endif
992    dec               cntd
993    jg .loop_2
994
995    ; m6 is still zero
996    ZERO_BLOCK blockq-2*mmsize, 32, 8, m6
997    RET
998
999cglobal vp9_%1_%3_8x8_add_12, 4, 6 + ARCH_X86_64, 16, \
1000                              16 * mmsize + ARCH_X86_32 * 6 * mmsize, \
1001                              dst, stride, block, eob
1002    mova                m0, [pw_4095]
1003    jmp mangle(private_prefix %+ _ %+ vp9_%1_%3_8x8_add_10 %+ SUFFIX).body
1004%endmacro
1005
1006INIT_XMM sse2
1007IADST8_FN idct,  IDCT8,  iadst, IADST8, row
1008IADST8_FN iadst, IADST8, idct,  IDCT8,  col
1009IADST8_FN iadst, IADST8, iadst, IADST8, default
1010
1011%macro IDCT16_1D 1-4 4 * mmsize, 65, 67 ; src, src_stride, stack_offset, mm32bit_stack_offset
1012    IDCT8_1D            %1, [pd_8192], [pd_3fff], %2 * 2, %4    ; m0-3=t0-3a, m4-5/m8|r67/m7=t4-7
1013    ; SCRATCH            6, 8, rsp+(%4+0)*mmsize    ; t6
1014    SCRATCH              0, 15, rsp+(%4+7)*mmsize   ; t0a
1015    SCRATCH              1, 14, rsp+(%4+6)*mmsize   ; t1a
1016    SCRATCH              2, 13, rsp+(%4+5)*mmsize   ; t2a
1017    SCRATCH              3, 12, rsp+(%4+4)*mmsize   ; t3a
1018    SCRATCH              4, 11, rsp+(%4+3)*mmsize   ; t4
1019    mova [rsp+(%3+0)*mmsize], m5                    ; t5
1020    mova [rsp+(%3+1)*mmsize], m7                    ; t7
1021
1022    mova                m0, [%1+ 1*%2]              ; in1
1023    mova                m3, [%1+ 7*%2]              ; in7
1024    mova                m4, [%1+ 9*%2]              ; in9
1025    mova                m7, [%1+15*%2]              ; in15
1026
1027    SUMSUB_MUL           0, 7, 1, 2, 16305,  1606   ; m0=t15a, m7=t8a
1028    SUMSUB_MUL           4, 3, 1, 2, 10394, 12665   ; m4=t14a, m3=t9a
1029    SUMSUB_BA         d, 3, 7, 1                    ; m3=t8, m7=t9
1030    SUMSUB_BA         d, 4, 0, 1                    ; m4=t15,m0=t14
1031    SUMSUB_MUL           0, 7, 1, 2, 15137,  6270   ; m0=t14a, m7=t9a
1032
1033    mova                m1, [%1+ 3*%2]              ; in3
1034    mova                m2, [%1+ 5*%2]              ; in5
1035    mova                m5, [%1+11*%2]              ; in11
1036    mova                m6, [%1+13*%2]              ; in13
1037
1038    SCRATCH              0,  9, rsp+(%4+1)*mmsize
1039    SCRATCH              7, 10, rsp+(%4+2)*mmsize
1040
1041    SUMSUB_MUL           2, 5, 0, 7, 14449,  7723   ; m2=t13a, m5=t10a
1042    SUMSUB_MUL           6, 1, 0, 7,  4756, 15679   ; m6=t12a, m1=t11a
1043    SUMSUB_BA         d, 5, 1, 0                    ; m5=t11,m1=t10
1044    SUMSUB_BA         d, 2, 6, 0                    ; m2=t12,m6=t13
1045    NEGD                m1                          ; m1=-t10
1046    SUMSUB_MUL           1, 6, 0, 7, 15137,  6270   ; m1=t13a, m6=t10a
1047
1048    UNSCRATCH            7, 10, rsp+(%4+2)*mmsize
1049    SUMSUB_BA         d, 5, 3, 0                    ; m5=t8a, m3=t11a
1050    SUMSUB_BA         d, 6, 7, 0                    ; m6=t9,  m7=t10
1051    SUMSUB_BA         d, 2, 4, 0                    ; m2=t15a,m4=t12a
1052    SCRATCH              5, 10, rsp+(%4+2)*mmsize
1053    SUMSUB_MUL           4, 3, 0, 5, 11585, 11585   ; m4=t12, m3=t11
1054    UNSCRATCH            0, 9, rsp+(%4+1)*mmsize
1055    SUMSUB_BA         d, 1, 0, 5                    ; m1=t14, m0=t13
1056    SCRATCH              6, 9, rsp+(%4+1)*mmsize
1057    SUMSUB_MUL           0, 7, 6, 5, 11585, 11585   ; m0=t13a,m7=t10a
1058
1059    ; order: 15|r74,14|r73,13|r72,12|r71,11|r70,r65,8|r67,r66,10|r69,9|r68,7,3,4,0,1,2
1060    ; free: 6,5
1061
1062    UNSCRATCH            5, 15, rsp+(%4+7)*mmsize
1063    SUMSUB_BA         d, 2, 5, 6                    ; m2=out0, m5=out15
1064    SCRATCH              5, 15, rsp+(%4+7)*mmsize
1065    UNSCRATCH            5, 14, rsp+(%4+6)*mmsize
1066    SUMSUB_BA         d, 1, 5, 6                    ; m1=out1, m5=out14
1067    SCRATCH              5, 14, rsp+(%4+6)*mmsize
1068    UNSCRATCH            5, 13, rsp+(%4+5)*mmsize
1069    SUMSUB_BA         d, 0, 5, 6                    ; m0=out2, m5=out13
1070    SCRATCH              5, 13, rsp+(%4+5)*mmsize
1071    UNSCRATCH            5, 12, rsp+(%4+4)*mmsize
1072    SUMSUB_BA         d, 4, 5, 6                    ; m4=out3, m5=out12
1073    SCRATCH              5, 12, rsp+(%4+4)*mmsize
1074    UNSCRATCH            5, 11, rsp+(%4+3)*mmsize
1075    SUMSUB_BA         d, 3, 5, 6                    ; m3=out4, m5=out11
1076    SCRATCH              4, 11, rsp+(%4+3)*mmsize
1077    mova                m4, [rsp+(%3+0)*mmsize]
1078    SUMSUB_BA         d, 7, 4, 6                    ; m7=out5, m4=out10
1079    mova [rsp+(%3+0)*mmsize], m5
1080    UNSCRATCH            5, 8, rsp+(%4+0)*mmsize
1081    UNSCRATCH            6, 9, rsp+(%4+1)*mmsize
1082    SCRATCH              2, 8, rsp+(%4+0)*mmsize
1083    SCRATCH              1, 9, rsp+(%4+1)*mmsize
1084    UNSCRATCH            1, 10, rsp+(%4+2)*mmsize
1085    SCRATCH              0, 10, rsp+(%4+2)*mmsize
1086    mova                m0, [rsp+(%3+1)*mmsize]
1087    SUMSUB_BA         d, 6, 5, 2                    ; m6=out6, m5=out9
1088    SUMSUB_BA         d, 1, 0, 2                    ; m1=out7, m0=out8
1089
1090    SWAP                 0, 3, 1, 7, 2, 6, 4
1091
1092    ; output order: 8-11|r67-70=out0-3
1093    ;               0-6,r65=out4-11
1094    ;               12-15|r71-74=out12-15
1095%endmacro
1096
1097INIT_XMM sse2
1098cglobal vp9_idct_idct_16x16_add_10, 4, 6 + ARCH_X86_64, 16, \
1099                                    67 * mmsize + ARCH_X86_32 * 8 * mmsize, \
1100                                    dst, stride, block, eob
1101    mova                m0, [pw_1023]
1102    cmp               eobd, 1
1103    jg .idctfull
1104
    ; dc-only - the 10bit version can be done entirely in 32bit, since the max
    ; coef values are 17+sign bits and the multiplier (11585) is 14 bits, so
    ; the product is 31+sign bits, which just fits in 32 bits
1108    DEFINE_ARGS dst, stride, block, coef
1109    pxor                m2, m2
1110    DC_ONLY              6, m2
1111    movd                m1, coefd
1112    pshuflw             m1, m1, q0000
1113    punpcklqdq          m1, m1
1114    DEFINE_ARGS dst, stride, cnt
1115    mov               cntd, 8
1116.loop_dc:
1117    STORE_2x8            3, 4, 1, m2, m0, dstq,         mmsize
1118    STORE_2x8            3, 4, 1, m2, m0, dstq+strideq, mmsize
1119    lea               dstq, [dstq+strideq*2]
1120    dec               cntd
1121    jg .loop_dc
1122    RET
1123
1124.idctfull:
1125    mova   [rsp+64*mmsize], m0
1126    DEFINE_ARGS dst, stride, block, cnt, ptr, skip, dstbak
1127%if ARCH_X86_64
1128    mov            dstbakq, dstq
1129    movsxd            cntq, cntd
1130%endif
1131%ifdef PIC
1132    lea               ptrq, [default_16x16]
1133    movzx             cntd, byte [ptrq+cntq-1]
1134%else
1135    movzx             cntd, byte [default_16x16+cntq-1]
1136%endif
1137    mov              skipd, 4
1138    sub              skipd, cntd
1139    mov               ptrq, rsp
1140.loop_1:
1141    IDCT16_1D       blockq
1142
1143    TRANSPOSE4x4D        0, 1, 2, 3, 7
1144    mova  [ptrq+ 1*mmsize], m0
1145    mova  [ptrq+ 5*mmsize], m1
1146    mova  [ptrq+ 9*mmsize], m2
1147    mova  [ptrq+13*mmsize], m3
1148    mova                m7, [rsp+65*mmsize]
1149    TRANSPOSE4x4D        4, 5, 6, 7, 0
1150    mova  [ptrq+ 2*mmsize], m4
1151    mova  [ptrq+ 6*mmsize], m5
1152    mova  [ptrq+10*mmsize], m6
1153    mova  [ptrq+14*mmsize], m7
1154    UNSCRATCH               0, 8, rsp+67*mmsize
1155    UNSCRATCH               1, 9, rsp+68*mmsize
1156    UNSCRATCH               2, 10, rsp+69*mmsize
1157    UNSCRATCH               3, 11, rsp+70*mmsize
1158    TRANSPOSE4x4D        0, 1, 2, 3, 7
1159    mova  [ptrq+ 0*mmsize], m0
1160    mova  [ptrq+ 4*mmsize], m1
1161    mova  [ptrq+ 8*mmsize], m2
1162    mova  [ptrq+12*mmsize], m3
1163    UNSCRATCH               4, 12, rsp+71*mmsize
1164    UNSCRATCH               5, 13, rsp+72*mmsize
1165    UNSCRATCH               6, 14, rsp+73*mmsize
1166    UNSCRATCH               7, 15, rsp+74*mmsize
1167    TRANSPOSE4x4D        4, 5, 6, 7, 0
1168    mova  [ptrq+ 3*mmsize], m4
1169    mova  [ptrq+ 7*mmsize], m5
1170    mova  [ptrq+11*mmsize], m6
1171    mova  [ptrq+15*mmsize], m7
1172    add               ptrq, 16 * mmsize
1173    add             blockq, mmsize
1174    dec               cntd
1175    jg .loop_1
1176
1177    ; zero-pad the remainder (skipped cols)
1178    test             skipd, skipd
1179    jz .end
1180    add              skipd, skipd
1181    lea             blockq, [blockq+skipq*(mmsize/2)]
1182    pxor                m0, m0
1183.loop_z:
1184    mova   [ptrq+mmsize*0], m0
1185    mova   [ptrq+mmsize*1], m0
1186    mova   [ptrq+mmsize*2], m0
1187    mova   [ptrq+mmsize*3], m0
1188    mova   [ptrq+mmsize*4], m0
1189    mova   [ptrq+mmsize*5], m0
1190    mova   [ptrq+mmsize*6], m0
1191    mova   [ptrq+mmsize*7], m0
1192    add               ptrq, 8 * mmsize
1193    dec              skipd
1194    jg .loop_z
1195.end:
1196
1197    DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
1198    lea           stride3q, [strideq*3]
1199    mov               cntd, 4
1200    mov               ptrq, rsp
1201.loop_2:
1202    IDCT16_1D         ptrq
1203
1204    pxor               m7, m7
1205    lea               dstq, [dstq+strideq*4]
1206    ROUND_AND_STORE_4x4  0, 1, 2, 3, m7, [rsp+64*mmsize], [pd_32], 6
1207    lea               dstq, [dstq+strideq*4]
1208    mova                m0, [rsp+65*mmsize]
1209    mova                m1, [rsp+64*mmsize]
1210    mova                m2, [pd_32]
1211    ROUND_AND_STORE_4x4  4, 5, 6, 0, m7, m1, m2, 6
1212
1213%if ARCH_X86_64
1214    DEFINE_ARGS dstbak, stride, block, cnt, ptr, stride3, dst
1215%else
1216    mov               dstq, dstm
1217%endif
1218    UNSCRATCH               0, 8, rsp+67*mmsize
1219    UNSCRATCH               4, 9, rsp+68*mmsize
1220    UNSCRATCH               5, 10, rsp+69*mmsize
1221    UNSCRATCH               3, 11, rsp+70*mmsize
1222    ROUND_AND_STORE_4x4  0, 4, 5, 3, m7, m1, m2, 6
1223%if ARCH_X86_64
1224    DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
1225    lea               dstq, [dstbakq+stride3q*4]
1226%else
1227    lea               dstq, [dstq+stride3q*4]
1228%endif
1229    UNSCRATCH               4, 12, rsp+71*mmsize
1230    UNSCRATCH               5, 13, rsp+72*mmsize
1231    UNSCRATCH               6, 14, rsp+73*mmsize
1232    UNSCRATCH               0, 15, rsp+74*mmsize
1233    ROUND_AND_STORE_4x4  4, 5, 6, 0, m7, m1, m2, 6
1234
1235    add               ptrq, mmsize
1236%if ARCH_X86_64
1237    add            dstbakq, 8
1238    mov               dstq, dstbakq
1239%else
1240    add         dword dstm, 8
1241    mov               dstq, dstm
1242%endif
1243    dec               cntd
1244    jg .loop_2
1245
1246    ; m7 is still zero
1247    ZERO_BLOCK blockq-4*mmsize, 64, 16, m7
1248    RET
1249
1250INIT_XMM sse2
1251cglobal vp9_idct_idct_16x16_add_12, 4, 6 + ARCH_X86_64, 16, \
1252                                    67 * mmsize + ARCH_X86_32 * 8 * mmsize, \
1253                                    dst, stride, block, eob
1254    mova                m0, [pw_4095]
1255    cmp               eobd, 1
1256    jg mangle(private_prefix %+ _ %+ vp9_idct_idct_16x16_add_10 %+ SUFFIX).idctfull
1257
    ; dc-only - unfortunately, this one can overflow, since coefs are 19+sign
    ; bits, and 19+14+sign does not fit in 32 bits, so we do 2-stage multiplies
1260    DEFINE_ARGS dst, stride, block, coef, coefl
1261    pxor                m2, m2
1262    DC_ONLY_64BIT        6, m2
1263    movd                m1, coefd
1264    pshuflw             m1, m1, q0000
1265    punpcklqdq          m1, m1
1266    DEFINE_ARGS dst, stride, cnt
1267    mov               cntd, 8
1268.loop_dc:
1269    STORE_2x8            3, 4, 1, m2, m0, dstq,         mmsize
1270    STORE_2x8            3, 4, 1, m2, m0, dstq+strideq, mmsize
1271    lea               dstq, [dstq+strideq*2]
1272    dec               cntd
1273    jg .loop_dc
1274    RET
1275
1276; r65-69 are available for spills
1277; r70-77 are available on x86-32 only (x86-64 should use m8-15)
1278; output should be in m8-11|r70-73, m0-6,r65 and m12-15|r74-77
1279%macro IADST16_1D 1 ; src
1280    mova                m0, [%1+ 0*4*mmsize]        ; in0
1281    mova                m1, [%1+ 7*4*mmsize]        ; in7
1282    mova                m2, [%1+ 8*4*mmsize]        ; in8
1283    mova                m3, [%1+15*4*mmsize]        ; in15
1284    SUMSUB_MUL_D         3, 0, 4, 5, 16364,  804    ; m3/4=t0, m0/5=t1
1285    SUMSUB_MUL_D         1, 2, 6, 7, 11003, 12140   ; m1/6=t8, m2/7=t9
1286    SCRATCH              0, 8, rsp+70*mmsize
1287    SUMSUB_PACK_D        1, 3, 6, 4, 0              ; m1=t0a, m3=t8a
1288    UNSCRATCH            0, 8, rsp+70*mmsize
1289    SUMSUB_PACK_D        2, 0, 7, 5, 4              ; m2=t1a, m0=t9a
1290    mova   [rsp+67*mmsize], m1
1291    SCRATCH              2, 9, rsp+71*mmsize
1292    SCRATCH              3, 12, rsp+74*mmsize
1293    SCRATCH              0, 13, rsp+75*mmsize
1294
1295    mova                m0, [%1+ 3*4*mmsize]        ; in3
1296    mova                m1, [%1+ 4*4*mmsize]        ; in4
1297    mova                m2, [%1+11*4*mmsize]        ; in11
1298    mova                m3, [%1+12*4*mmsize]        ; in12
1299    SUMSUB_MUL_D         2, 1, 4, 5, 14811,  7005   ; m2/4=t4, m1/5=t5
1300    SUMSUB_MUL_D         0, 3, 6, 7,  5520, 15426   ; m0/6=t12, m3/7=t13
1301    SCRATCH              1, 10, rsp+72*mmsize
1302    SUMSUB_PACK_D        0, 2, 6, 4, 1              ; m0=t4a, m2=t12a
1303    UNSCRATCH            1, 10, rsp+72*mmsize
1304    SUMSUB_PACK_D        3, 1, 7, 5, 4              ; m3=t5a, m1=t13a
1305    SCRATCH              0, 15, rsp+77*mmsize
1306    SCRATCH              3, 11, rsp+73*mmsize
1307
1308    UNSCRATCH            0, 12, rsp+74*mmsize       ; t8a
1309    UNSCRATCH            3, 13, rsp+75*mmsize       ; t9a
1310    SUMSUB_MUL_D         0, 3, 4, 5, 16069,  3196   ; m0/4=t8, m3/5=t9
1311    SUMSUB_MUL_D         1, 2, 6, 7,  3196, 16069   ; m1/6=t13, m2/7=t12
1312    SCRATCH              1, 12, rsp+74*mmsize
1313    SUMSUB_PACK_D        2, 0, 7, 4, 1              ; m2=t8a, m0=t12a
1314    UNSCRATCH            1, 12, rsp+74*mmsize
1315    SUMSUB_PACK_D        1, 3, 6, 5, 4              ; m1=t9a, m3=t13a
1316    mova   [rsp+65*mmsize], m2
1317    mova   [rsp+66*mmsize], m1
1318    SCRATCH              0, 8, rsp+70*mmsize
1319    SCRATCH              3, 12, rsp+74*mmsize
1320
1321    mova                m0, [%1+ 2*4*mmsize]        ; in2
1322    mova                m1, [%1+ 5*4*mmsize]        ; in5
1323    mova                m2, [%1+10*4*mmsize]        ; in10
1324    mova                m3, [%1+13*4*mmsize]        ; in13
1325    SUMSUB_MUL_D         3, 0, 4, 5, 15893,  3981   ; m3/4=t2, m0/5=t3
1326    SUMSUB_MUL_D         1, 2, 6, 7,  8423, 14053   ; m1/6=t10, m2/7=t11
1327    SCRATCH              0, 10, rsp+72*mmsize
1328    SUMSUB_PACK_D        1, 3, 6, 4, 0              ; m1=t2a, m3=t10a
1329    UNSCRATCH            0, 10, rsp+72*mmsize
1330    SUMSUB_PACK_D        2, 0, 7, 5, 4              ; m2=t3a, m0=t11a
1331    mova   [rsp+68*mmsize], m1
1332    mova   [rsp+69*mmsize], m2
1333    SCRATCH              3, 13, rsp+75*mmsize
1334    SCRATCH              0, 14, rsp+76*mmsize
1335
1336    mova                m0, [%1+ 1*4*mmsize]        ; in1
1337    mova                m1, [%1+ 6*4*mmsize]        ; in6
1338    mova                m2, [%1+ 9*4*mmsize]        ; in9
1339    mova                m3, [%1+14*4*mmsize]        ; in14
1340    SUMSUB_MUL_D         2, 1, 4, 5, 13160,  9760   ; m2/4=t6, m1/5=t7
1341    SUMSUB_MUL_D         0, 3, 6, 7,  2404, 16207   ; m0/6=t14, m3/7=t15
1342    SCRATCH              1, 10, rsp+72*mmsize
1343    SUMSUB_PACK_D        0, 2, 6, 4, 1              ; m0=t6a, m2=t14a
1344    UNSCRATCH            1, 10, rsp+72*mmsize
1345    SUMSUB_PACK_D        3, 1, 7, 5, 4              ; m3=t7a, m1=t15a
1346
1347    UNSCRATCH            4, 13, rsp+75*mmsize       ; t10a
1348    UNSCRATCH            5, 14, rsp+76*mmsize       ; t11a
1349    SCRATCH              0, 13, rsp+75*mmsize
1350    SCRATCH              3, 14, rsp+76*mmsize
1351    SUMSUB_MUL_D         4, 5, 6, 7,  9102, 13623   ; m4/6=t10, m5/7=t11
1352    SUMSUB_MUL_D         1, 2, 0, 3, 13623,  9102   ; m1/0=t15, m2/3=t14
1353    SCRATCH              0, 10, rsp+72*mmsize
1354    SUMSUB_PACK_D        2, 4, 3, 6, 0              ; m2=t10a, m4=t14a
1355    UNSCRATCH            0, 10, rsp+72*mmsize
1356    SUMSUB_PACK_D        1, 5, 0, 7, 6              ; m1=t11a, m5=t15a
1357
1358    UNSCRATCH            0, 8, rsp+70*mmsize        ; t12a
1359    UNSCRATCH            3, 12, rsp+74*mmsize       ; t13a
1360    SCRATCH              2, 8, rsp+70*mmsize
1361    SCRATCH              1, 12, rsp+74*mmsize
1362    SUMSUB_MUL_D         0, 3, 1, 2, 15137,  6270   ; m0/1=t12, m3/2=t13
1363    SUMSUB_MUL_D         5, 4, 7, 6,  6270, 15137   ; m5/7=t15, m4/6=t14
1364    SCRATCH              2, 10, rsp+72*mmsize
1365    SUMSUB_PACK_D        4, 0, 6, 1, 2              ; m4=out2, m0=t14a
1366    UNSCRATCH            2, 10, rsp+72*mmsize
1367    SUMSUB_PACK_D        5, 3, 7, 2, 1              ; m5=-out13, m3=t15a
1368    NEGD                m5                          ; m5=out13
1369
1370    UNSCRATCH            1, 9, rsp+71*mmsize        ; t1a
1371    mova                m2, [rsp+68*mmsize]         ; t2a
1372    UNSCRATCH            6, 13, rsp+75*mmsize       ; t6a
1373    UNSCRATCH            7, 14, rsp+76*mmsize       ; t7a
1374    SCRATCH              4, 10, rsp+72*mmsize
1375    SCRATCH              5, 13, rsp+75*mmsize
1376    UNSCRATCH            4, 15, rsp+77*mmsize       ; t4a
1377    UNSCRATCH            5, 11, rsp+73*mmsize       ; t5a
1378    SCRATCH              0, 14, rsp+76*mmsize
1379    SCRATCH              3, 15, rsp+77*mmsize
1380    mova                m0, [rsp+67*mmsize]         ; t0a
1381    SUMSUB_BA         d, 4, 0, 3                    ; m4=t0, m0=t4
1382    SUMSUB_BA         d, 5, 1, 3                    ; m5=t1, m1=t5
1383    SUMSUB_BA         d, 6, 2, 3                    ; m6=t2, m2=t6
1384    SCRATCH              4, 9, rsp+71*mmsize
1385    mova                m3, [rsp+69*mmsize]         ; t3a
1386    SUMSUB_BA         d, 7, 3, 4                    ; m7=t3, m3=t7
1387
1388    mova   [rsp+67*mmsize], m5
1389    mova   [rsp+68*mmsize], m6
1390    mova   [rsp+69*mmsize], m7
1391    SUMSUB_MUL_D         0, 1, 4, 5, 15137,  6270   ; m0/4=t4a, m1/5=t5a
1392    SUMSUB_MUL_D         3, 2, 7, 6,  6270, 15137   ; m3/7=t7a, m2/6=t6a
1393    SCRATCH              1, 11, rsp+73*mmsize
1394    SUMSUB_PACK_D        2, 0, 6, 4, 1              ; m2=-out3, m0=t6
1395    NEGD                m2                          ; m2=out3
1396    UNSCRATCH            1, 11, rsp+73*mmsize
1397    SUMSUB_PACK_D        3, 1, 7, 5, 4              ; m3=out12, m1=t7
1398    SCRATCH              2, 11, rsp+73*mmsize
1399    UNSCRATCH            2, 12, rsp+74*mmsize       ; t11a
1400    SCRATCH              3, 12, rsp+74*mmsize
1401
1402    UNSCRATCH            3, 8, rsp+70*mmsize        ; t10a
1403    mova                m4, [rsp+65*mmsize]         ; t8a
1404    mova                m5, [rsp+66*mmsize]         ; t9a
1405    SUMSUB_BA         d, 3, 4, 6                    ; m3=-out1, m4=t10
1406    NEGD                m3                          ; m3=out1
1407    SUMSUB_BA         d, 2, 5, 6                    ; m2=out14, m5=t11
1408    UNSCRATCH            6, 9, rsp+71*mmsize        ; t0
1409    UNSCRATCH            7, 14, rsp+76*mmsize       ; t14a
1410    SCRATCH              3, 9, rsp+71*mmsize
1411    SCRATCH              2, 14, rsp+76*mmsize
1412
1413    SUMSUB_MUL           1, 0, 2, 3, 11585, 11585   ; m1=out4, m0=out11
1414    mova   [rsp+65*mmsize], m0
1415    SUMSUB_MUL           5, 4, 2, 3, 11585, 11585   ; m5=out6, m4=out9
1416    UNSCRATCH            0, 15, rsp+77*mmsize       ; t15a
1417    SUMSUB_MUL           7, 0, 2, 3, 11585, m11585  ; m7=out10, m0=out5
1418
1419    mova                m2, [rsp+68*mmsize]         ; t2
1420    SUMSUB_BA         d, 2, 6, 3                    ; m2=out0, m6=t2a
1421    SCRATCH              2, 8, rsp+70*mmsize
1422    mova                m2, [rsp+67*mmsize]         ; t1
1423    mova                m3, [rsp+69*mmsize]         ; t3
1424    mova   [rsp+67*mmsize], m7
1425    SUMSUB_BA         d, 3, 2, 7                    ; m3=-out15, m2=t3a
1426    NEGD                m3                          ; m3=out15
1427    SCRATCH              3, 15, rsp+77*mmsize
1428    SUMSUB_MUL           6, 2, 7, 3, 11585, m11585  ; m6=out8, m2=out7
1429    mova                m7, [rsp+67*mmsize]
1430
1431    SWAP                 0, 1
1432    SWAP                 2, 5, 4, 6, 7, 3
1433%endmacro
1434
1435%macro IADST16_FN 7
cglobal vp9_%1_%4_16x16_add_10, 4, 6 + ARCH_X86_64, 16, \
                                70 * mmsize + ARCH_X86_32 * 8 * mmsize, \
                                dst, stride, block, eob
    mova                m0, [pw_1023]

.body:
    mova   [rsp+64*mmsize], m0
    DEFINE_ARGS dst, stride, block, cnt, ptr, skip, dstbak
%if ARCH_X86_64
    mov            dstbakq, dstq
    movsxd            cntq, cntd
%endif
%ifdef PIC
    lea               ptrq, [%7_16x16]
    movzx             cntd, byte [ptrq+cntq-1]
%else
    movzx             cntd, byte [%7_16x16+cntq-1]
%endif
    mov              skipd, 4
    sub              skipd, cntd
    mov               ptrq, rsp
.loop_1:
    %2_1D           blockq

    TRANSPOSE4x4D        0, 1, 2, 3, 7
    mova  [ptrq+ 1*mmsize], m0
    mova  [ptrq+ 5*mmsize], m1
    mova  [ptrq+ 9*mmsize], m2
    mova  [ptrq+13*mmsize], m3
    mova                m7, [rsp+65*mmsize]
    TRANSPOSE4x4D        4, 5, 6, 7, 0
    mova  [ptrq+ 2*mmsize], m4
    mova  [ptrq+ 6*mmsize], m5
    mova  [ptrq+10*mmsize], m6
    mova  [ptrq+14*mmsize], m7
    UNSCRATCH               0, 8, rsp+(%3+0)*mmsize
    UNSCRATCH               1, 9, rsp+(%3+1)*mmsize
    UNSCRATCH               2, 10, rsp+(%3+2)*mmsize
    UNSCRATCH               3, 11, rsp+(%3+3)*mmsize
    TRANSPOSE4x4D        0, 1, 2, 3, 7
    mova  [ptrq+ 0*mmsize], m0
    mova  [ptrq+ 4*mmsize], m1
    mova  [ptrq+ 8*mmsize], m2
    mova  [ptrq+12*mmsize], m3
    UNSCRATCH               4, 12, rsp+(%3+4)*mmsize
    UNSCRATCH               5, 13, rsp+(%3+5)*mmsize
    UNSCRATCH               6, 14, rsp+(%3+6)*mmsize
    UNSCRATCH               7, 15, rsp+(%3+7)*mmsize
    TRANSPOSE4x4D        4, 5, 6, 7, 0
    mova  [ptrq+ 3*mmsize], m4
    mova  [ptrq+ 7*mmsize], m5
    mova  [ptrq+11*mmsize], m6
    mova  [ptrq+15*mmsize], m7
    add               ptrq, 16 * mmsize
    add             blockq, mmsize
    dec               cntd
    jg .loop_1

    ; zero-pad the remainder (skipped cols)
    test             skipd, skipd
    jz .end
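    ; each skipped group of 4 columns covers 16*mmsize of the pass-1 buffer,
    ; i.e. two iterations of the 8*mmsize zeroing loop below, hence skipd is
    ; doubled; blockq advances by one mmsize per skipped group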
    add              skipd, skipd
    lea             blockq, [blockq+skipq*(mmsize/2)]
    pxor                m0, m0
.loop_z:
    mova   [ptrq+mmsize*0], m0
    mova   [ptrq+mmsize*1], m0
    mova   [ptrq+mmsize*2], m0
    mova   [ptrq+mmsize*3], m0
    mova   [ptrq+mmsize*4], m0
    mova   [ptrq+mmsize*5], m0
    mova   [ptrq+mmsize*6], m0
    mova   [ptrq+mmsize*7], m0
    add               ptrq, 8 * mmsize
    dec              skipd
    jg .loop_z
.end:

    DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
    lea           stride3q, [strideq*3]
    mov               cntd, 4
    mov               ptrq, rsp
.loop_2:
    %5_1D             ptrq

    pxor                m7, m7
    lea               dstq, [dstq+strideq*4]
    ROUND_AND_STORE_4x4  0, 1, 2, 3, m7, [rsp+64*mmsize], [pd_32], 6
    lea               dstq, [dstq+strideq*4]
    mova                m0, [rsp+65*mmsize]
    mova                m1, [rsp+64*mmsize]
    mova                m2, [pd_32]
    ROUND_AND_STORE_4x4  4, 5, 6, 0, m7, m1, m2, 6

%if ARCH_X86_64
    DEFINE_ARGS dstbak, stride, block, cnt, ptr, stride3, dst
%else
    mov               dstq, dstm
%endif
    UNSCRATCH               0, 8, rsp+(%6+0)*mmsize
    UNSCRATCH               4, 9, rsp+(%6+1)*mmsize
    UNSCRATCH               5, 10, rsp+(%6+2)*mmsize
    UNSCRATCH               3, 11, rsp+(%6+3)*mmsize
    ROUND_AND_STORE_4x4  0, 4, 5, 3, m7, m1, m2, 6
%if ARCH_X86_64
    DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
    lea               dstq, [dstbakq+stride3q*4]
%else
    lea               dstq, [dstq+stride3q*4]
%endif
    UNSCRATCH               4, 12, rsp+(%6+4)*mmsize
    UNSCRATCH               5, 13, rsp+(%6+5)*mmsize
    UNSCRATCH               6, 14, rsp+(%6+6)*mmsize
    UNSCRATCH               0, 15, rsp+(%6+7)*mmsize
    ROUND_AND_STORE_4x4  4, 5, 6, 0, m7, m1, m2, 6

    add               ptrq, mmsize
%if ARCH_X86_64
    add            dstbakq, 8
    mov               dstq, dstbakq
%else
    add         dword dstm, 8
    mov               dstq, dstm
%endif
    dec               cntd
    jg .loop_2

    ; m7 is still zero
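    ; blockq was advanced by 4*mmsize in total (processed + skipped column
    ; groups), so rewind it; the 64-byte stride is one row of 16 dword coefs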
    ZERO_BLOCK blockq-4*mmsize, 64, 16, m7
    RET

cglobal vp9_%1_%4_16x16_add_12, 4, 6 + ARCH_X86_64, 16, \
                                70 * mmsize + ARCH_X86_32 * 8 * mmsize, \
                                dst, stride, block, eob
    mova                m0, [pw_4095]
    jmp mangle(private_prefix %+ _ %+ vp9_%1_%4_16x16_add_10 %+ SUFFIX).body
%endmacro

INIT_XMM sse2
IADST16_FN idct,  IDCT16,  67, iadst, IADST16, 70, row
IADST16_FN iadst, IADST16, 70, idct,  IDCT16,  67, col
IADST16_FN iadst, IADST16, 70, iadst, IADST16, 70, default

%macro IDCT32_1D 2-3 8 * mmsize; pass[1/2], src, src_stride
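; in pass 1 (%1 == 1) the 32 outputs of this 4-column slice are transposed in
; 4x4 blocks and stored to ptrq for the second pass; in pass 2 (%1 == 2) they
; are rounded ([pd_32], >> 6), clipped to the pixel maximum saved at
; rsp+256*mmsize and accumulated into dst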
    IDCT16_1D %2, 2 * %3, 272, 257
%if ARCH_X86_64
    mova  [rsp+257*mmsize], m8
    mova  [rsp+258*mmsize], m9
    mova  [rsp+259*mmsize], m10
    mova  [rsp+260*mmsize], m11
    mova  [rsp+261*mmsize], m12
    mova  [rsp+262*mmsize], m13
    mova  [rsp+263*mmsize], m14
    mova  [rsp+264*mmsize], m15
%endif
    mova  [rsp+265*mmsize], m0
    mova  [rsp+266*mmsize], m1
    mova  [rsp+267*mmsize], m2
    mova  [rsp+268*mmsize], m3
    mova  [rsp+269*mmsize], m4
    mova  [rsp+270*mmsize], m5
    mova  [rsp+271*mmsize], m6

    ; r257-260: t0-3
    ; r265-272: t4/5a/6a/7/8/9a/10/11a
    ; r261-264: t12a/13/14a/15
    ; r273-274 is free as scratch space, and 275-282 mirrors m8-15 on 32bit

    mova                m0, [%2+ 1*%3]              ; in1
    mova                m1, [%2+15*%3]              ; in15
    mova                m2, [%2+17*%3]              ; in17
    mova                m3, [%2+31*%3]              ; in31
    SUMSUB_MUL           0, 3, 4, 5, 16364,  804    ; m0=t31a, m3=t16a
    SUMSUB_MUL           2, 1, 4, 5, 11003, 12140   ; m2=t30a, m1=t17a
    SUMSUB_BA         d, 1, 3, 4                    ; m1=t16, m3=t17
    SUMSUB_BA         d, 2, 0, 4                    ; m2=t31, m0=t30
    SUMSUB_MUL           0, 3, 4, 5, 16069,  3196   ; m0=t30a, m3=t17a
    SCRATCH              0, 8, rsp+275*mmsize
    SCRATCH              2, 9, rsp+276*mmsize

    ; end of stage 1-3 first quart

    mova                m0, [%2+ 7*%3]              ; in7
    mova                m2, [%2+ 9*%3]              ; in9
    mova                m4, [%2+23*%3]              ; in23
    mova                m5, [%2+25*%3]              ; in25
    SUMSUB_MUL           2, 4, 6, 7, 14811,  7005   ; m2=t29a, m4=t18a
    SUMSUB_MUL           5, 0, 6, 7,  5520, 15426   ; m5=t28a, m0=t19a
    SUMSUB_BA         d, 4, 0, 6                    ; m4=t19, m0=t18
    SUMSUB_BA         d, 2, 5, 6                    ; m2=t28, m5=t29
    SUMSUB_MUL           5, 0, 6, 7,  3196, m16069  ; m5=t29a, m0=t18a

    ; end of stage 1-3 second quart

    SUMSUB_BA         d, 4, 1, 6                    ; m4=t16a, m1=t19a
    SUMSUB_BA         d, 0, 3, 6                    ; m0=t17, m3=t18
    UNSCRATCH            6, 8, rsp+275*mmsize       ; t30a
    UNSCRATCH            7, 9, rsp+276*mmsize       ; t31
    mova  [rsp+273*mmsize], m4
    mova  [rsp+274*mmsize], m0
    SUMSUB_BA         d, 2, 7, 0                    ; m2=t31a, m7=t28a
    SUMSUB_BA         d, 5, 6, 0                    ; m5=t30, m6=t29
    SUMSUB_MUL           6, 3, 0, 4, 15137,  6270   ; m6=t29a, m3=t18a
    SUMSUB_MUL           7, 1, 0, 4, 15137,  6270   ; m7=t28, m1=t19
    SCRATCH              3, 10, rsp+277*mmsize
    SCRATCH              1, 11, rsp+278*mmsize
    SCRATCH              7, 12, rsp+279*mmsize
    SCRATCH              6, 13, rsp+280*mmsize
    SCRATCH              5, 14, rsp+281*mmsize
    SCRATCH              2, 15, rsp+282*mmsize

    ; end of stage 4-5 first half

    mova                m0, [%2+ 5*%3]              ; in5
    mova                m1, [%2+11*%3]              ; in11
    mova                m2, [%2+21*%3]              ; in21
    mova                m3, [%2+27*%3]              ; in27
    SUMSUB_MUL           0, 3, 4, 5, 15893,  3981   ; m0=t27a, m3=t20a
    SUMSUB_MUL           2, 1, 4, 5,  8423, 14053   ; m2=t26a, m1=t21a
    SUMSUB_BA         d, 1, 3, 4                    ; m1=t20, m3=t21
    SUMSUB_BA         d, 2, 0, 4                    ; m2=t27, m0=t26
    SUMSUB_MUL           0, 3, 4, 5,  9102, 13623   ; m0=t26a, m3=t21a
    SCRATCH              0, 8, rsp+275*mmsize
    SCRATCH              2, 9, rsp+276*mmsize

    ; end of stage 1-3 third quart

    mova                m0, [%2+ 3*%3]              ; in3
    mova                m2, [%2+13*%3]              ; in13
    mova                m4, [%2+19*%3]              ; in19
    mova                m5, [%2+29*%3]              ; in29
    SUMSUB_MUL           2, 4, 6, 7, 13160,  9760   ; m2=t25a, m4=t22a
    SUMSUB_MUL           5, 0, 6, 7,  2404, 16207   ; m5=t24a, m0=t23a
    SUMSUB_BA         d, 4, 0, 6                    ; m4=t23, m0=t22
    SUMSUB_BA         d, 2, 5, 6                    ; m2=t24, m5=t25
    SUMSUB_MUL           5, 0, 6, 7, 13623, m9102   ; m5=t25a, m0=t22a

    ; end of stage 1-3 fourth quart

    SUMSUB_BA         d, 1, 4, 6                    ; m1=t23a, m4=t20a
    SUMSUB_BA         d, 3, 0, 6                    ; m3=t22, m0=t21
    UNSCRATCH            6, 8, rsp+275*mmsize       ; t26a
    UNSCRATCH            7, 9, rsp+276*mmsize       ; t27
    SCRATCH              3, 8, rsp+275*mmsize
    SCRATCH              1, 9, rsp+276*mmsize
    SUMSUB_BA         d, 7, 2, 1                    ; m7=t24a, m2=t27a
    SUMSUB_BA         d, 6, 5, 1                    ; m6=t25, m5=t26
    SUMSUB_MUL           2, 4, 1, 3,  6270, m15137  ; m2=t27, m4=t20
    SUMSUB_MUL           5, 0, 1, 3,  6270, m15137  ; m5=t26a, m0=t21a

    ; end of stage 4-5 second half

    UNSCRATCH            1, 12, rsp+279*mmsize      ; t28
    UNSCRATCH            3, 13, rsp+280*mmsize      ; t29a
    SCRATCH              4, 12, rsp+279*mmsize
    SCRATCH              0, 13, rsp+280*mmsize
    SUMSUB_BA         d, 5, 3, 0                    ; m5=t29, m3=t26
    SUMSUB_BA         d, 2, 1, 0                    ; m2=t28a, m1=t27a
    UNSCRATCH            0, 14, rsp+281*mmsize      ; t30
    UNSCRATCH            4, 15, rsp+282*mmsize      ; t31a
    SCRATCH              2, 14, rsp+281*mmsize
    SCRATCH              5, 15, rsp+282*mmsize
    SUMSUB_BA         d, 6, 0, 2                    ; m6=t30a, m0=t25a
    SUMSUB_BA         d, 7, 4, 2                    ; m7=t31, m4=t24

    mova                m2, [rsp+273*mmsize]        ; t16a
    mova                m5, [rsp+274*mmsize]        ; t17
    mova  [rsp+273*mmsize], m6
    mova  [rsp+274*mmsize], m7
    UNSCRATCH            6, 10, rsp+277*mmsize      ; t18a
    UNSCRATCH            7, 11, rsp+278*mmsize      ; t19
    SCRATCH              4, 10, rsp+277*mmsize
    SCRATCH              0, 11, rsp+278*mmsize
    UNSCRATCH            4, 12, rsp+279*mmsize      ; t20
    UNSCRATCH            0, 13, rsp+280*mmsize      ; t21a
    SCRATCH              3, 12, rsp+279*mmsize
    SCRATCH              1, 13, rsp+280*mmsize
    SUMSUB_BA         d, 0, 6, 1                    ; m0=t18, m6=t21
    SUMSUB_BA         d, 4, 7, 1                    ; m4=t19a, m7=t20a
    UNSCRATCH            3, 8, rsp+275*mmsize       ; t22
    UNSCRATCH            1, 9, rsp+276*mmsize       ; t23a
    SCRATCH              0, 8, rsp+275*mmsize
    SCRATCH              4, 9, rsp+276*mmsize
    SUMSUB_BA         d, 3, 5, 0                    ; m3=t17a, m5=t22a
    SUMSUB_BA         d, 1, 2, 0                    ; m1=t16, m2=t23

    ; end of stage 6

    UNSCRATCH            0, 10, rsp+277*mmsize      ; t24
    UNSCRATCH            4, 11, rsp+278*mmsize      ; t25a
    SCRATCH              1, 10, rsp+277*mmsize
    SCRATCH              3, 11, rsp+278*mmsize
    SUMSUB_MUL           0, 2, 1, 3, 11585, 11585   ; m0=t24a, m2=t23a
    SUMSUB_MUL           4, 5, 1, 3, 11585, 11585   ; m4=t25, m5=t22
    UNSCRATCH            1, 12, rsp+279*mmsize      ; t26
    UNSCRATCH            3, 13, rsp+280*mmsize      ; t27a
    SCRATCH              0, 12, rsp+279*mmsize
    SCRATCH              4, 13, rsp+280*mmsize
    SUMSUB_MUL           3, 7, 0, 4, 11585, 11585   ; m3=t27, m7=t20
    SUMSUB_MUL           1, 6, 0, 4, 11585, 11585   ; m1=t26a, m6=t21a

    ; end of stage 7

    mova                m0, [rsp+269*mmsize]        ; t8
    mova                m4, [rsp+270*mmsize]        ; t9a
    mova  [rsp+269*mmsize], m1                      ; t26a
    mova  [rsp+270*mmsize], m3                      ; t27
    mova                m3, [rsp+271*mmsize]        ; t10
    SUMSUB_BA         d, 2, 0, 1                    ; m2=out8, m0=out23
    SUMSUB_BA         d, 5, 4, 1                    ; m5=out9, m4=out22
    SUMSUB_BA         d, 6, 3, 1                    ; m6=out10, m3=out21
    mova                m1, [rsp+272*mmsize]        ; t11a
    mova  [rsp+271*mmsize], m0
    SUMSUB_BA         d, 7, 1, 0                    ; m7=out11, m1=out20

%if %1 == 1
    TRANSPOSE4x4D        2, 5, 6, 7, 0
    mova  [ptrq+ 2*mmsize], m2
    mova  [ptrq+10*mmsize], m5
    mova  [ptrq+18*mmsize], m6
    mova  [ptrq+26*mmsize], m7
%else ; %1 == 2
    pxor                m0, m0
    lea               dstq, [dstq+strideq*8]
    ROUND_AND_STORE_4x4  2, 5, 6, 7, m0, [rsp+256*mmsize], [pd_32], 6
%endif
    mova                m2, [rsp+271*mmsize]
%if %1 == 1
    TRANSPOSE4x4D        1, 3, 4, 2, 0
    mova  [ptrq+ 5*mmsize], m1
    mova  [ptrq+13*mmsize], m3
    mova  [ptrq+21*mmsize], m4
    mova  [ptrq+29*mmsize], m2
%else ; %1 == 2
    lea               dstq, [dstq+stride3q*4]
    ROUND_AND_STORE_4x4  1, 3, 4, 2, m0, [rsp+256*mmsize], [pd_32], 6
%endif

    ; end of last stage + store for out8-11 and out20-23

    UNSCRATCH            0, 9, rsp+276*mmsize       ; t19a
    UNSCRATCH            1, 8, rsp+275*mmsize       ; t18
    UNSCRATCH            2, 11, rsp+278*mmsize      ; t17a
    UNSCRATCH            3, 10, rsp+277*mmsize      ; t16
    mova                m7, [rsp+261*mmsize]        ; t12a
    mova                m6, [rsp+262*mmsize]        ; t13
    mova                m5, [rsp+263*mmsize]        ; t14a
    SUMSUB_BA         d, 0, 7, 4                    ; m0=out12, m7=out19
    SUMSUB_BA         d, 1, 6, 4                    ; m1=out13, m6=out18
    SUMSUB_BA         d, 2, 5, 4                    ; m2=out14, m5=out17
    mova                m4, [rsp+264*mmsize]        ; t15
    SCRATCH              7, 8, rsp+275*mmsize
    SUMSUB_BA         d, 3, 4, 7                    ; m3=out15, m4=out16

%if %1 == 1
    TRANSPOSE4x4D        0, 1, 2, 3, 7
    mova  [ptrq+ 3*mmsize], m0
    mova  [ptrq+11*mmsize], m1
    mova  [ptrq+19*mmsize], m2
    mova  [ptrq+27*mmsize], m3
%else ; %1 == 2
%if ARCH_X86_64
    SWAP                 7, 9
    lea               dstq, [dstbakq+stride3q*4]
%else ; x86-32
    pxor                m7, m7
    mov               dstq, dstm
    lea               dstq, [dstq+stride3q*4]
%endif
    ROUND_AND_STORE_4x4  0, 1, 2, 3, m7, [rsp+256*mmsize], [pd_32], 6
%endif
    UNSCRATCH            0, 8, rsp+275*mmsize       ; out19
%if %1 == 1
    TRANSPOSE4x4D        4, 5, 6, 0, 7
    mova  [ptrq+ 4*mmsize], m4
    mova  [ptrq+12*mmsize], m5
    mova  [ptrq+20*mmsize], m6
    mova  [ptrq+28*mmsize], m0
%else ; %1 == 2
    lea               dstq, [dstq+strideq*4]
    ROUND_AND_STORE_4x4  4, 5, 6, 0, m7, [rsp+256*mmsize], [pd_32], 6
%endif

    ; end of last stage + store for out12-19

%if ARCH_X86_64
    SWAP                 7, 8
%endif
    mova                m7, [rsp+257*mmsize]        ; t0
    mova                m6, [rsp+258*mmsize]        ; t1
    mova                m5, [rsp+259*mmsize]        ; t2
    mova                m4, [rsp+260*mmsize]        ; t3
    mova                m0, [rsp+274*mmsize]        ; t31
    mova                m1, [rsp+273*mmsize]        ; t30a
    UNSCRATCH            2, 15, rsp+282*mmsize      ; t29
    SUMSUB_BA         d, 0, 7, 3                    ; m0=out0, m7=out31
    SUMSUB_BA         d, 1, 6, 3                    ; m1=out1, m6=out30
    SUMSUB_BA         d, 2, 5, 3                    ; m2=out2, m5=out29
    SCRATCH              0, 9, rsp+276*mmsize
    UNSCRATCH            3, 14, rsp+281*mmsize      ; t28a
    SUMSUB_BA         d, 3, 4, 0                    ; m3=out3, m4=out28

%if %1 == 1
    TRANSPOSE4x4D        4, 5, 6, 7, 0
    mova  [ptrq+ 7*mmsize], m4
    mova  [ptrq+15*mmsize], m5
    mova  [ptrq+23*mmsize], m6
    mova  [ptrq+31*mmsize], m7
%else ; %1 == 2
%if ARCH_X86_64
    SWAP                 0, 8
%else ; x86-32
    pxor                m0, m0
%endif
    lea               dstq, [dstq+stride3q*4]
    ROUND_AND_STORE_4x4  4, 5, 6, 7, m0, [rsp+256*mmsize], [pd_32], 6
%endif
    UNSCRATCH            7, 9, rsp+276*mmsize       ; out0
%if %1 == 1
    TRANSPOSE4x4D        7, 1, 2, 3, 0
    mova  [ptrq+ 0*mmsize], m7
    mova  [ptrq+ 8*mmsize], m1
    mova  [ptrq+16*mmsize], m2
    mova  [ptrq+24*mmsize], m3
%else ; %1 == 2
%if ARCH_X86_64
    DEFINE_ARGS dstbak, stride, block, cnt, ptr, stride3, dst
%else ; x86-32
    mov               dstq, dstm
%endif
    ROUND_AND_STORE_4x4  7, 1, 2, 3, m0, [rsp+256*mmsize], [pd_32], 6
%if ARCH_X86_64
    DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
%endif
%endif

    ; end of last stage + store for out0-3 and out28-31

%if ARCH_X86_64
    SWAP                 0, 8
%endif
    mova                m7, [rsp+265*mmsize]        ; t4
    mova                m6, [rsp+266*mmsize]        ; t5a
    mova                m5, [rsp+267*mmsize]        ; t6a
    mova                m4, [rsp+268*mmsize]        ; t7
    mova                m0, [rsp+270*mmsize]        ; t27
    mova                m1, [rsp+269*mmsize]        ; t26a
    UNSCRATCH            2, 13, rsp+280*mmsize      ; t25
    SUMSUB_BA         d, 0, 7, 3                    ; m0=out4, m7=out27
    SUMSUB_BA         d, 1, 6, 3                    ; m1=out5, m6=out26
    SUMSUB_BA         d, 2, 5, 3                    ; m2=out6, m5=out25
    UNSCRATCH            3, 12, rsp+279*mmsize      ; t24a
    SCRATCH              7, 9, rsp+276*mmsize
    SUMSUB_BA         d, 3, 4, 7                    ; m3=out7, m4=out24

%if %1 == 1
    TRANSPOSE4x4D        0, 1, 2, 3, 7
    mova  [ptrq+ 1*mmsize], m0
    mova  [ptrq+ 9*mmsize], m1
    mova  [ptrq+17*mmsize], m2
    mova  [ptrq+25*mmsize], m3
%else ; %1 == 2
%if ARCH_X86_64
    SWAP                 7, 8
    lea               dstq, [dstbakq+strideq*4]
%else ; x86-32
    pxor                m7, m7
    lea               dstq, [dstq+strideq*4]
%endif
    ROUND_AND_STORE_4x4  0, 1, 2, 3, m7, [rsp+256*mmsize], [pd_32], 6
%endif
    UNSCRATCH            0, 9, rsp+276*mmsize       ; out27
%if %1 == 1
    TRANSPOSE4x4D        4, 5, 6, 0, 7
    mova  [ptrq+ 6*mmsize], m4
    mova  [ptrq+14*mmsize], m5
    mova  [ptrq+22*mmsize], m6
    mova  [ptrq+30*mmsize], m0
%else ; %1 == 2
%if ARCH_X86_64
    lea               dstq, [dstbakq+stride3q*8]
%else
    mov               dstq, dstm
    lea               dstq, [dstq+stride3q*8]
%endif
    ROUND_AND_STORE_4x4  4, 5, 6, 0, m7, [rsp+256*mmsize], [pd_32], 6
%endif

    ; end of last stage + store for out4-7 and out24-27
%endmacro

INIT_XMM sse2
cglobal vp9_idct_idct_32x32_add_10, 4, 6 + ARCH_X86_64, 16, \
                                    275 * mmsize + ARCH_X86_32 * 8 * mmsize, \
                                    dst, stride, block, eob
    mova                m0, [pw_1023]
    cmp               eobd, 1
    jg .idctfull

    ; dc-only - the 10bit version can be done entirely in 32bit, since the max
    ; coef value is 17 bits + sign and the multiplier constant is 14 bits, so
    ; the 31-bit (+ sign) product easily fits in 32bit
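    ; (bound check: |dc| < 2^17 and a 14-bit constant such as 11585 is below
    ; 2^14, so |dc * const| < 2^31, i.e. within a signed 32-bit intermediate)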
    DEFINE_ARGS dst, stride, block, coef
    pxor                m2, m2
    DC_ONLY              6, m2
    movd                m1, coefd
    pshuflw             m1, m1, q0000
    punpcklqdq          m1, m1
    DEFINE_ARGS dst, stride, cnt
    mov               cntd, 32
.loop_dc:
    STORE_2x8            3, 4, 1, m2, m0, dstq,          mmsize
    STORE_2x8            3, 4, 1, m2, m0, dstq+mmsize*2, mmsize
    add               dstq, strideq
    dec               cntd
    jg .loop_dc
    RET

.idctfull:
    mova  [rsp+256*mmsize], m0
    DEFINE_ARGS dst, stride, block, cnt, ptr, skip, dstbak
%if ARCH_X86_64
    mov            dstbakq, dstq
    movsxd            cntq, cntd
%endif
%ifdef PIC
    lea               ptrq, [default_32x32]
    movzx             cntd, byte [ptrq+cntq-1]
%else
    movzx             cntd, byte [default_32x32+cntq-1]
%endif
    mov              skipd, 8
    sub              skipd, cntd
    mov               ptrq, rsp
.loop_1:
    IDCT32_1D            1, blockq

    add               ptrq, 32 * mmsize
    add             blockq, mmsize
    dec               cntd
    jg .loop_1

    ; zero-pad the remainder (skipped cols)
    test             skipd, skipd
    jz .end
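    ; each skipped group of 4 columns covers 32*mmsize of the pass-1 buffer,
    ; i.e. four iterations of the 8*mmsize zeroing loop below, hence skipd is
    ; scaled by 4; blockq advances by one mmsize per skipped group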
    shl              skipd, 2
    lea             blockq, [blockq+skipq*(mmsize/4)]
    pxor                m0, m0
.loop_z:
    mova   [ptrq+mmsize*0], m0
    mova   [ptrq+mmsize*1], m0
    mova   [ptrq+mmsize*2], m0
    mova   [ptrq+mmsize*3], m0
    mova   [ptrq+mmsize*4], m0
    mova   [ptrq+mmsize*5], m0
    mova   [ptrq+mmsize*6], m0
    mova   [ptrq+mmsize*7], m0
    add               ptrq, 8 * mmsize
    dec              skipd
    jg .loop_z
.end:

    DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
    lea           stride3q, [strideq*3]
    mov               cntd, 8
    mov               ptrq, rsp
.loop_2:
    IDCT32_1D            2, ptrq

    add               ptrq, mmsize
%if ARCH_X86_64
    add            dstbakq, 8
    mov               dstq, dstbakq
%else
    add         dword dstm, 8
    mov               dstq, dstm
%endif
    dec               cntd
    jg .loop_2

    ; m7 is still zero
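    ; blockq was advanced by 8*mmsize in total (processed + skipped column
    ; groups), so rewind it; the 128-byte stride is one row of 32 dword coefs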
    ZERO_BLOCK blockq-8*mmsize, 128, 32, m7
    RET

INIT_XMM sse2
cglobal vp9_idct_idct_32x32_add_12, 4, 6 + ARCH_X86_64, 16, \
                                    275 * mmsize + ARCH_X86_32 * 8 * mmsize, \
                                    dst, stride, block, eob
    mova                m0, [pw_4095]
    cmp               eobd, 1
    jg mangle(private_prefix %+ _ %+ vp9_idct_idct_32x32_add_10 %+ SUFFIX).idctfull

    ; dc-only - unfortunately, this one can overflow, since coefs are 19 bits
    ; + sign at 12 bpp, and 19+14+sign does not fit in 32bit, so we do 2-stage
    ; multiplies
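    ; (i.e. |dc| can approach 2^19 and 2^19 * 2^14 = 2^33, which exceeds the
    ; signed 32-bit range, hence the two-stage DC_ONLY_64BIT below)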
    DEFINE_ARGS dst, stride, block, coef, coefl
    pxor                m2, m2
    DC_ONLY_64BIT        6, m2
    movd                m1, coefd
    pshuflw             m1, m1, q0000
    punpcklqdq          m1, m1
    DEFINE_ARGS dst, stride, cnt
    mov               cntd, 32
.loop_dc:
    STORE_2x8            3, 4, 1, m2, m0, dstq,          mmsize
    STORE_2x8            3, 4, 1, m2, m0, dstq+mmsize*2, mmsize
    add               dstq, strideq
    dec               cntd
    jg .loop_dc
    RET
