;*!
;* \copy
;*     Copyright (c)  2009-2013, Cisco Systems
;*     All rights reserved.
;*
;*     Redistribution and use in source and binary forms, with or without
;*     modification, are permitted provided that the following conditions
;*     are met:
;*
;*        * Redistributions of source code must retain the above copyright
;*          notice, this list of conditions and the following disclaimer.
;*
;*        * Redistributions in binary form must reproduce the above copyright
;*          notice, this list of conditions and the following disclaimer in
;*          the documentation and/or other materials provided with the
;*          distribution.
;*
;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
;*     POSSIBILITY OF SUCH DAMAGE.
;*
;*
;*  dct.asm
;*
;*  History
;*      8/4/2009 Created
;*
;*
;*************************************************************************/
40%include "asm_inc.asm"
41
42%macro LOAD_3_PARA_TO_5_PARA_IDCT 0
43%ifdef X86_32
44    push r3
45    push r4
46    %assign push_num push_num+2
47    mov r0, [esp + push_num*4 + 4]
48    mov r1, [esp + push_num*4 + 8]
49    mov r4, [esp + push_num*4 + 12]
50%else
51    mov r4, r2
52%endif
53    mov r2, r0
54    mov r3, r1
55%endmacro
56
57%ifdef PREFIX
58    %define prefixed(a) _ %+ a
59%else
60    %define prefixed(a) a
61%endif
62
63%ifdef X86_32_PICASM
64SECTION .text align=32
65%else
66SECTION .rodata align=32
67%endif
68
69;***********************************************************************
70; Constant
71;***********************************************************************
72
73align 32
74wels_shufb0312_movzxw_128:
75    db 0, 80h, 3, 80h, 1, 80h, 2, 80h, 4, 80h, 7, 80h, 5, 80h, 6, 80h
76wels_shufb2301_128:
77    db 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13
78wels_shufb0231_128:
79    db 0, 2, 3, 1, 4, 6, 7, 5, 8, 10, 11, 9, 12, 14, 15, 13
80wels_dw32_128:
81    times 8 dw 32
82wels_p1m1p1m1w_256:
83    times 8 dw 1, -1
84wels_p1p2m1m2w_256:
85    times 4 dw 1, 2, -1, -2
86wels_p1p1m1m1w_256:
87    times 4 dw 1, 1, -1, -1
88wels_8xp1w_8xm1w:
89    times 8 dw  1
90    times 8 dw -1
91wels_4xp1w_4xm1w_256:
92    times 4 dw  1
93    times 4 dw -1
94    times 4 dw  1
95    times 4 dw -1
96wels_4xp1w_4xp2w_4xm1w_4xm2w:
97    times 4 dw  1
98    times 4 dw  2
99    times 4 dw -1
100    times 4 dw -2
101
102align 16
103wels_p1m1p1m1w_128:
104    times 4 dw 1, -1
105wels_p1p2p1p2w_128:
106    times 4 dw 1, 2
107wels_p1m1m1p1w_128:
108    times 2 dw 1, -1, -1, 1
109wels_p0m8000p0m8000w_128:
110    times 4 dw 0, -8000h
111wels_p1p1m1m1w_128:
112    times 2 dw 1, 1, -1, -1
113wels_4xp1w_4xp2w:
114    times 4 dw 1
115    times 4 dw 2
116wels_4xp0w_4xm8000w:
117    times 4 dw 0
118    times 4 dw -8000h
119
120SECTION .text
121
122;***********************************************************************
123; MMX functions
124;***********************************************************************
125
126%macro MMX_LoadDiff4P 5
127    movd        %1, [%3]
128    movd        %2, [%4]
129    punpcklbw   %1, %5
130    punpcklbw   %2, %5
131    psubw       %1, %2
132%endmacro
133
134%macro MMX_LoadDiff4x4P 10 ;d0, d1, d2, d3, pix1address, pix1stride, pix2address, pix2stride, tmp(mm), 0(mm)
135    MMX_LoadDiff4P %1, %9, %5,    %7,    %10
136    MMX_LoadDiff4P %2, %9, %5+%6, %7+%8, %10
137    lea  %5, [%5+2*%6]
138    lea  %7, [%7+2*%8]
139    MMX_LoadDiff4P %3, %9, %5,    %7,    %10
140    MMX_LoadDiff4P %4, %9, %5+%6, %7+%8, %10
141%endmacro
142
143%macro MMX_SumSubMul2 3
144    movq    %3, %1
145    psllw   %1, $01
146    paddw   %1, %2
147    psllw   %2, $01
148    psubw   %3, %2
149%endmacro
150
151%macro MMX_SumSubDiv2 3
152    movq    %3, %2
153    psraw   %3, $01
154    paddw   %3, %1
155    psraw   %1, $01
156    psubw   %1, %2
157%endmacro
158
159%macro MMX_SumSub 3
160    movq    %3, %2
161    psubw   %2, %1
162    paddw   %1, %3
163%endmacro
164
165%macro MMX_DCT 6
166    MMX_SumSub      %4, %1, %6
167    MMX_SumSub      %3, %2, %6
168    MMX_SumSub      %3, %4, %6
169    MMX_SumSubMul2  %1, %2, %5
170%endmacro
171
172%macro MMX_IDCT 6
173    MMX_SumSub      %4, %5, %6
174    MMX_SumSubDiv2  %3, %2, %1
175    MMX_SumSub      %1, %4, %6
176    MMX_SumSub      %3, %5, %6
177%endmacro
178
179%macro MMX_StoreDiff4P 6
180    movd       %2, %6
181    punpcklbw  %2, %4
182    paddw      %1, %3
183    psraw      %1, $06
184    paddsw     %1, %2
185    packuswb   %1, %2
186    movd       %5, %1
187%endmacro
188
189;***********************************************************************
190;   void WelsDctT4_mmx( int16_t *pDct[4], uint8_t *pix1, int32_t i_pix1, uint8_t *pix2, int32_t i_pix2 )
191;***********************************************************************
192WELS_EXTERN WelsDctT4_mmx
193    %assign push_num 0
194    LOAD_5_PARA
195    SIGN_EXTENSION r2, r2d
196    SIGN_EXTENSION r4, r4d
197    WELS_Zero    mm7
198
199    MMX_LoadDiff4x4P mm1, mm2, mm3, mm4, r1, r2, r3, r4, mm0, mm7
200
201    MMX_DCT         mm1, mm2, mm3 ,mm4, mm5, mm6
202    MMX_Trans4x4W   mm3, mm1, mm4, mm5, mm2
203
204    MMX_DCT         mm3, mm5, mm2 ,mm4, mm1, mm6
205    MMX_Trans4x4W   mm2, mm3, mm4, mm1, mm5
206
207    movq    [r0+ 0],   mm2
208    movq    [r0+ 8],   mm1
209    movq    [r0+16],   mm5
210    movq    [r0+24],   mm4
211    WELSEMMS
212    LOAD_5_PARA_POP
213    ret
214
215;***********************************************************************
216; void IdctResAddPred_mmx(uint8_t* pPred, int32_t iStride, int16_t* pDct);
217;***********************************************************************
218WELS_EXTERN IdctResAddPred_mmx
219    %assign push_num 0
220    LOAD_3_PARA_TO_5_PARA_IDCT
221    jmp prefixed(WelsIDctT4Rec_mmx.begin)
222
223;***********************************************************************
224;   void WelsIDctT4Rec_mmx(uint8_t *rec, int32_t stride, uint8_t *pred, int32_t pred_stride, int16_t *rs)
225;***********************************************************************
226WELS_EXTERN WelsIDctT4Rec_mmx
227    %assign push_num 0
228    LOAD_5_PARA
229.begin:
230    SIGN_EXTENSION r1, r1d
231    SIGN_EXTENSION r3, r3d
232    movq    mm0, [r4+ 0]
233    movq    mm1, [r4+ 8]
234    movq    mm2, [r4+16]
235    movq    mm3, [r4+24]
236
237    MMX_Trans4x4W       mm0, mm1, mm2, mm3, mm4
238    MMX_IDCT            mm1, mm2, mm3, mm4, mm0, mm6
239    MMX_Trans4x4W       mm1, mm3, mm0, mm4, mm2
240    MMX_IDCT            mm3, mm0, mm4, mm2, mm1, mm6
241
242    WELS_Zero           mm7
243    WELS_DW32           mm6
244
245    MMX_StoreDiff4P     mm3, mm0, mm6, mm7, [r0], [r2]
246    MMX_StoreDiff4P     mm4, mm0, mm6, mm7, [r0+r1], [r2+r3]
247    lea     r0, [r0+2*r1]
248    lea     r2, [r2+2*r3]
249    MMX_StoreDiff4P     mm1, mm0, mm6, mm7, [r0], [r2]
250    MMX_StoreDiff4P     mm2, mm0, mm6, mm7, [r0+r1], [r2+r3]
251
252    WELSEMMS
253    LOAD_5_PARA_POP
254    ret
255
256
257;***********************************************************************
258; SSE2 functions
259;***********************************************************************
260
261%macro SSE2_Store4x8p 6
262    movlps   [%1+0x00], %2
263    movhps   [%1+0x20], %2
264    movlps   [%1+0x08], %3
265    movhps   [%1+0x28], %3
266    movlps   [%1+0x10], %4
267    movhps   [%1+0x30], %4
268    movlps   [%1+0x18], %5
269    movhps   [%1+0x38], %5
270%endmacro
271
272%macro SSE2_Load4x8p 6
273    MOVDQ    %2,    [%1+0x00]
274    MOVDQ    %4,    [%1+0x10]
275    MOVDQ    %6,    [%1+0x20]
276    MOVDQ    %3,    [%1+0x30]
277    SSE2_XSawp qdq, %4, %3, %5
278    SSE2_XSawp qdq, %2, %6, %3
279%endmacro
280
281%macro SSE2_SumSubMul2 3
282    movdqa  %3, %1
283    psllw   %1, 1
284    paddw   %1, %2
285    psllw   %2, 1
286    psubw   %3, %2
287%endmacro
288
289%macro SSE2_SumSubDiv2 4
290    movdqa  %4, %1
291    movdqa  %3, %2
292    psraw   %2, $01
293    psraw   %4, $01
294    paddw   %1, %2
295    psubw   %4, %3
296%endmacro
297
298%macro SSE2_StoreDiff16p 9
299    paddw       %1, %4
300    psraw       %1, $06
301    movq        %3, %7
302    punpcklbw   %3, %5
303    paddsw      %1, %3
304    paddw       %2, %4
305    psraw       %2, $06
306    movq        %3, %9
307    punpcklbw   %3, %5
308    paddsw      %2, %3
309    packuswb    %1, %2
310    movlps      %6, %1
311    movhps      %8, %1
312%endmacro
313
314%macro SSE2_StoreDiff8p 5
315    movq        %2, %5
316    punpcklbw   %2, %3
317    paddsw      %2, %1
318    packuswb    %2, %2
319    movq        %4, %2
320%endmacro
321
322%macro SSE2_Load2x4P 2
323    MOVDQ       %1, [%2]
324%endmacro
325
326%macro SSE2_Store2x4P 2
327    MOVDQ       [%1], %2
328%endmacro
329
330; out=%1 pPixel1Line1=%2 pPixel1Line2=%3 pPixel2Line1=%4 pPixel2Line2=%5 zero=%6 clobber=%7,%8
331%macro SSE2_LoadDiff2x4P 8
332    movd        %1, [%2]
333    movd        %7, [%3]
334    punpckldq   %1, %7
335    punpcklbw   %1, %6
336    movd        %7, [%4]
337    movd        %8, [%5]
338    punpckldq   %7, %8
339    punpcklbw   %7, %6
340    psubw       %1, %7
341%endmacro
342
343; pRec1=%1 pRec2=%2 data=%3 pPred1=%4 pPred2=%5 dw32=%6 zero=%7 clobber=%8,%9
344%macro SSE2_StoreDiff2x4P 9
345    paddw       %3, %6
346    psraw       %3, 6
347    movd        %8, [%4]
348    movd        %9, [%5]
349    punpckldq   %8, %9
350    punpcklbw   %8, %7
351    paddsw      %3, %8
352    packuswb    %3, %3
353    movd        [%1], %3
354    psrlq       %3, 32
355    movd        [%2], %3
356%endmacro
357
358%macro SSE2_Load8DC 6
359    movdqa      %1,     %6      ; %1 = dc0 dc1
360    paddw       %1,     %5
361    psraw       %1,     $06     ; (dc + 32) >> 6
362
363    movdqa      %2,     %1
364    psrldq      %2,     4
365    punpcklwd   %2,     %2
366    punpckldq   %2,     %2      ; %2 = dc2 dc2 dc2 dc2 dc3 dc3 dc3 dc3
367
368    movdqa      %3,     %1
369    psrldq      %3,     8
370    punpcklwd   %3,     %3
371    punpckldq   %3,     %3      ; %3 = dc4 dc4 dc4 dc4 dc5 dc5 dc5 dc5
372
373    movdqa      %4,     %1
374    psrldq      %4,     12
375    punpcklwd   %4,     %4
376    punpckldq   %4,     %4      ; %4 = dc6 dc6 dc6 dc6 dc7 dc7 dc7 dc7
377
378    punpcklwd   %1,     %1
379    punpckldq   %1,     %1      ; %1 = dc0 dc0 dc0 dc0 dc1 dc1 dc1 dc1
380%endmacro
381
382%macro SSE2_DCT 6
383    SSE2_SumSub     %6, %3, %5
384    SSE2_SumSub     %1, %2, %5
385    SSE2_SumSub     %3, %2, %5
386    SSE2_SumSubMul2     %6, %1, %4
387%endmacro
388
389%macro SSE2_IDCT 7
390    SSE2_SumSub       %7, %2, %6
391    SSE2_SumSubDiv2     %1, %3, %5, %4
392    SSE2_SumSub      %2, %1, %5
393    SSE2_SumSub      %7, %4, %5
394%endmacro
395
396; Do 2 horizontal 4-pt DCTs in parallel packed as 8 words in an xmm register.
397; out=%1 in=%1 clobber=%2
398%macro SSE2_DCT_HORIZONTAL 2
399    pshuflw       %2, %1, 1bh                    ; [x[3],x[2],x[1],x[0]] low qw
400    pmullw        %1, [pic(wels_p1m1p1m1w_128)]  ; [x[0],-x[1],x[2],-x[3], ...]
401    pshufhw       %2, %2, 1bh                    ; [x[3],x[2],x[1],x[0]] high qw
402    paddw         %1, %2                         ; s = [x[0]+x[3],-x[1]+x[2],x[2]+x[1],-x[3]+x[0], ...]
403    pshufd        %2, %1, 0b1h                   ; [s[2],s[3],s[0],s[1], ...]
404    pmullw        %1, [pic(wels_p1m1m1p1w_128)]  ; [s[0],-s[1],-s[2],s[3], ...]
405    pmullw        %2, [pic(wels_p1p2p1p2w_128)]  ; [s[2],2*s[3],s[0],2*s[1], ...]]
406    paddw         %1, %2                         ; y = [s[0]+s[2],-s[1]+2*s[3],-s[2]+s[0],s[3]+2*s[1], ...]
407%endmacro
408
409; Do 2 horizontal 4-pt IDCTs in parallel packed as 8 words in an xmm register.
410;
411; Use a multiply by reciprocal to get -x>>1, and x+=-x>>1 to get x>>1, which
412; avoids a cumbersome blend with SSE2 to get a vector with right-shifted odd
413; elements.
414;
415; out=%1 in=%1 wels_p1m1m1p1w_128=%2 clobber=%3,%4
416%macro SSE2_IDCT_HORIZONTAL 4
417    movdqa        %3, [pic(wels_p0m8000p0m8000w_128)]
418    pmulhw        %3, %1                    ; x[0:7] * [0,-8000h,0,-8000h, ...] >> 16
419    pshufd        %4, %1, 0b1h              ; [x[2],x[3],x[0],x[1], ...]
420    pmullw        %4, %2                    ; [x[2],-x[3],-x[0],x[1], ...]
421    paddw         %1, %3                    ; [x[0]+0,x[1]+(-x[1]>>1),x[2]+0,x[3]+(-x[3]>>1), ...]
422    paddw         %1, %4                    ; s = [x[0]+x[2],(x[1]>>1)-x[3],x[2]-x[0],(x[3]>>1)+x[1], ...]
423    pshuflw       %3, %1, 1bh               ; [s[3],s[2],s[1],s[0]] low qw
424    pmullw        %1, [pic(wels_p1p1m1m1w_128)]  ; [s[0],s[1],-s[2],-s[3], ...]
425    pshufhw       %3, %3, 1bh               ; [s[3],s[2],s[1],s[0]] high qw
426    pmullw        %3, %2                    ; [s[3],-s[2],-s[1],s[0], ...]
427    paddw         %1, %3                    ; y = [s[0]+s[3],s[1]-s[2],-s[2]-s[1],-s[3]+s[0], ...]
428%endmacro
429
430; Do 4 vertical 4-pt DCTs in parallel packed as 16 words in 2 xmm registers.
431; Uses scrambled input to save a negation.
432; [y0,y1]=%1 [y2,y3]=%2 [x1,x0]=%1 [x2,x3]=%2 clobber=%3
433%macro SSE2_DCT_4x4P 3
434    movdqa        %3, %1
435    psubw         %1, %2                    ; [x1-x2,x0-x3]
436    paddw         %2, %3                    ; [x1+x2,x0+x3]
437    movdqa        %3, %2
438    punpckhqdq    %2, %1                    ; s03 = [x0+x3,x0-x3]
439    punpcklqdq    %3, %1                    ; s12 = [x1+x2,x1-x2]
440    movdqa        %1, %2
441    pmullw        %1, [pic(wels_4xp1w_4xp2w)] ; [s03[0],2*s03[1]]
442    paddw         %1, %3                      ; [y0,y1] = [s03[0]+s12[0],2*s03[1]+s12[1]]
443    pmullw        %3, [pic(wels_4xp1w_4xp2w)] ; [s12[0],2*s12[1]]
444    psubw         %2, %3                    ; [y2,y3] = [s03[0]-s12[0],s03[1]-2*s12[1]]
445%endmacro
446
447; Do 4 vertical 4-pt IDCTs in parallel packed as 16 words in 2 xmm registers.
448; Output is scrambled to save a negation.
449; [y1,y0]=%1 [y2,y3]=%2 [x0,x1]=%1 [x2,x3]=%2 clobber=%3,%4
450%macro SSE2_IDCT_4x4P 4
451    movdqa        %4, [pic(wels_4xp0w_4xm8000w)]
452    movdqa        %3, %1
453    pmulhw        %3, %4                    ; x[0:1] * [0,-8000h] >> 16
454    pmulhw        %4, %2                    ; x[2:3] * [0,-8000h] >> 16
455    paddw         %3, %1                    ; [x[0],x[1]>>1]
456    paddw         %4, %2                    ; [x[2],x[3]>>1]
457    psubw         %3, %2                    ; [x[0]-x[2],(x[1]>>1)-x[3]]
458    paddw         %1, %4                    ; [x[2]+x[0],(x[3]>>1)+x[1]]
459    movdqa        %2, %3
460    punpckhqdq    %3, %1                    ; s13 = [(x[1]>>1)-x[3],(x[3]>>1)+x[1]]
461    punpcklqdq    %2, %1                    ; s02 = [x[0]-x[2], x[2]+x[0]]
462    movdqa        %1, %2
463    paddw         %1, %3                    ; [y1,y0] = [s02[0]+s13[0],s02[1]+s13[1]]
464    psubw         %2, %3                    ; [y2,y3] = [s02[0]-s13[0],s02[1]-s13[1]]
465%endmacro
466
467;***********************************************************************
468; void WelsDctFourT4_sse2(int16_t *pDct, uint8_t *pix1, int32_t i_pix1, uint8_t *pix2, int32_t i_pix2 )
469;***********************************************************************
470WELS_EXTERN WelsDctFourT4_sse2
471    %assign push_num 0
472    INIT_X86_32_PIC r5
473    LOAD_5_PARA
474    PUSH_XMM 8
475    SIGN_EXTENSION r2, r2d
476    SIGN_EXTENSION r4, r4d
477    pxor    xmm7, xmm7
478    ;Load 4x8
479    SSE2_LoadDiff8P    xmm0, xmm6, xmm7, [r1], [r3]
480    SSE2_LoadDiff8P    xmm1, xmm6, xmm7, [r1+r2], [r3+r4]
481    lea     r1, [r1 + 2 * r2]
482    lea     r3, [r3 + 2 * r4]
483    SSE2_LoadDiff8P    xmm2, xmm6, xmm7, [r1], [r3]
484    SSE2_LoadDiff8P    xmm3, xmm6, xmm7, [r1+r2], [r3+r4]
485
486    SSE2_DCT            xmm1, xmm2, xmm3, xmm4, xmm5, xmm0
487    SSE2_DCT_HORIZONTAL xmm2, xmm5
488    SSE2_DCT_HORIZONTAL xmm0, xmm5
489    SSE2_DCT_HORIZONTAL xmm3, xmm5
490    SSE2_DCT_HORIZONTAL xmm4, xmm5
491
492    SSE2_Store4x8p r0, xmm2, xmm0, xmm3, xmm4, xmm1
493
494    lea     r1, [r1 + 2 * r2]
495    lea     r3, [r3 + 2 * r4]
496
497    ;Load 4x8
498    SSE2_LoadDiff8P    xmm0, xmm6, xmm7, [r1      ], [r3    ]
499    SSE2_LoadDiff8P    xmm1, xmm6, xmm7, [r1+r2  ], [r3+r4]
500    lea     r1, [r1 + 2 * r2]
501    lea     r3, [r3 + 2 * r4]
502    SSE2_LoadDiff8P    xmm2, xmm6, xmm7, [r1], [r3]
503    SSE2_LoadDiff8P    xmm3, xmm6, xmm7, [r1+r2], [r3+r4]
504
505    SSE2_DCT            xmm1, xmm2, xmm3, xmm4, xmm5, xmm0
506    SSE2_DCT_HORIZONTAL xmm2, xmm5
507    SSE2_DCT_HORIZONTAL xmm0, xmm5
508    SSE2_DCT_HORIZONTAL xmm3, xmm5
509    SSE2_DCT_HORIZONTAL xmm4, xmm5
510
511    SSE2_Store4x8p r0+64, xmm2, xmm0, xmm3, xmm4, xmm1
512
513    POP_XMM
514    LOAD_5_PARA_POP
515    DEINIT_X86_32_PIC
516    ret
517
518;***********************************************************************
519; void WelsIDctFourT4Rec_sse2(uint8_t *rec, int32_t stride, uint8_t *pred, int32_t pred_stride, int16_t *rs);
520;***********************************************************************
521WELS_EXTERN WelsIDctFourT4Rec_sse2
522    %assign push_num 0
523    INIT_X86_32_PIC r5
524    LOAD_5_PARA
525    PUSH_XMM 8
526    SIGN_EXTENSION r1, r1d
527    SIGN_EXTENSION r3, r3d
528    ;Load 4x8
529    SSE2_Load4x8p  r4, xmm0, xmm1, xmm4, xmm2, xmm5
530
531    movdqa xmm7, [pic(wels_p1m1m1p1w_128)]
532    SSE2_IDCT_HORIZONTAL xmm0, xmm7, xmm5, xmm6
533    SSE2_IDCT_HORIZONTAL xmm1, xmm7, xmm5, xmm6
534    SSE2_IDCT_HORIZONTAL xmm4, xmm7, xmm5, xmm6
535    SSE2_IDCT_HORIZONTAL xmm2, xmm7, xmm5, xmm6
536    SSE2_IDCT xmm1, xmm4, xmm2, xmm3, xmm5, xmm6, xmm0
537
538    WELS_Zero           xmm7
539    WELS_DW32           xmm6
540
541    SSE2_StoreDiff16p xmm1, xmm3, xmm5, xmm6, xmm7, [r0], [r2], [r0 + r1], [r2 + r3]
542    lea     r0, [r0 + 2 * r1]
543    lea     r2, [r2 + 2 * r3]
544    SSE2_StoreDiff16p xmm0, xmm4, xmm5, xmm6, xmm7, [r0], [r2], [r0 + r1], [r2 + r3]
545
546    lea     r0, [r0 + 2 * r1]
547    lea     r2, [r2 + 2 * r3]
548    SSE2_Load4x8p  r4+64, xmm0, xmm1, xmm4, xmm2, xmm5
549
550    movdqa xmm7, [pic(wels_p1m1m1p1w_128)]
551    SSE2_IDCT_HORIZONTAL xmm0, xmm7, xmm5, xmm6
552    SSE2_IDCT_HORIZONTAL xmm1, xmm7, xmm5, xmm6
553    SSE2_IDCT_HORIZONTAL xmm4, xmm7, xmm5, xmm6
554    SSE2_IDCT_HORIZONTAL xmm2, xmm7, xmm5, xmm6
555    SSE2_IDCT xmm1, xmm4, xmm2, xmm3, xmm5, xmm6, xmm0
556
557    WELS_Zero           xmm7
558    WELS_DW32           xmm6
559
560    SSE2_StoreDiff16p xmm1, xmm3, xmm5, xmm6, xmm7, [r0], [r2], [r0 + r1], [r2 + r3]
561    lea     r0, [r0 + 2 * r1]
562    lea     r2, [r2 + 2 * r3]
563    SSE2_StoreDiff16p xmm0, xmm4, xmm5, xmm6, xmm7, [r0], [r2], [r0 + r1], [r2 + r3]
564    POP_XMM
565    LOAD_5_PARA_POP
566    DEINIT_X86_32_PIC
567    ret
568
569;***********************************************************************
570; void WelsDctT4_sse2(int16_t* pDct, uint8_t* pPixel1, int32_t iStride1, uint8_t* pPixel2, int32_t iStride2)
571;***********************************************************************
572WELS_EXTERN WelsDctT4_sse2
573    %assign push_num 0
574    INIT_X86_32_PIC r5
575    LOAD_5_PARA
576    PUSH_XMM 5
577    SIGN_EXTENSION r2, r2d
578    SIGN_EXTENSION r4, r4d
579
580    WELS_Zero xmm2
581    SSE2_LoadDiff2x4P xmm0, r1+r2, r1, r3+r4, r3, xmm2, xmm3, xmm4
582    add r1, r2
583    add r3, r4
584    SSE2_LoadDiff2x4P xmm1, r1+r2, r1+2*r2, r3+r4, r3+2*r4, xmm2, xmm3, xmm4
585    SSE2_DCT_HORIZONTAL xmm0, xmm3
586    SSE2_DCT_HORIZONTAL xmm1, xmm3
587    SSE2_DCT_4x4P xmm0, xmm1, xmm3
588    SSE2_Store2x4P r0,    xmm0
589    SSE2_Store2x4P r0+16, xmm1
590
591    POP_XMM
592    LOAD_5_PARA_POP
593    DEINIT_X86_32_PIC
594    ret
595
596;***********************************************************************
597; void IdctResAddPred_sse2(uint8_t* pPred, int32_t iStride, int16_t* pDct);
598;***********************************************************************
599WELS_EXTERN IdctResAddPred_sse2
600    %assign push_num 0
601    LOAD_3_PARA_TO_5_PARA_IDCT
602    jmp prefixed(WelsIDctT4Rec_sse2.begin)
603
604;***********************************************************************
605; void WelsIDctT4Rec_sse2(uint8_t* pRec, int32_t iStride, uint8_t* pPred, int32_t iPredStride, int16_t* pDct);
606;***********************************************************************
607WELS_EXTERN WelsIDctT4Rec_sse2
608    %assign push_num 0
609    LOAD_5_PARA
610.begin:
611    INIT_X86_32_PIC r5
612    PUSH_XMM 6
613    SIGN_EXTENSION r1, r1d
614    SIGN_EXTENSION r3, r3d
615
616    SSE2_Load2x4P xmm0, r4
617    SSE2_Load2x4P xmm1, r4+16
618    movdqa xmm4, [pic(wels_p1m1m1p1w_128)]
619    SSE2_IDCT_HORIZONTAL xmm0, xmm4, xmm2, xmm3
620    SSE2_IDCT_HORIZONTAL xmm1, xmm4, xmm2, xmm3
621    SSE2_IDCT_4x4P xmm0, xmm1, xmm2, xmm3
622    WELS_Zero xmm4
623    WELS_DW32 xmm5
624    SSE2_StoreDiff2x4P r0+r1, r0, xmm0, r2+r3, r2, xmm5, xmm4, xmm2, xmm3
625    add r0, r1
626    add r2, r3
627    SSE2_StoreDiff2x4P r0+r1, r0+2*r1, xmm1, r2+r3, r2+2*r3, xmm5, xmm4, xmm2, xmm3
628
629    POP_XMM
630    DEINIT_X86_32_PIC
631    LOAD_5_PARA_POP
632    ret
633
634%macro SSE2_StoreDiff4x8p 8
635    SSE2_StoreDiff8p    %1, %3, %4, [%5],           [%6]
636    SSE2_StoreDiff8p    %1, %3, %4, [%5 + %7],      [%6 + %8]
637    SSE2_StoreDiff8p    %2, %3, %4, [%5 + 8],       [%6 + 8]
638    SSE2_StoreDiff8p    %2, %3, %4, [%5 + %7 + 8],  [%6 + %8 + 8]
639%endmacro
640
641 ;***********************************************************************
642; void WelsIDctRecI16x16Dc_sse2(uint8_t *rec, int32_t stride, uint8_t *pred, int32_t pred_stride, int16_t *dct_dc)
643;***********************************************************************
644WELS_EXTERN WelsIDctRecI16x16Dc_sse2
645    %assign push_num 0
646    LOAD_5_PARA
647    PUSH_XMM 8
648    SIGN_EXTENSION r1, r1d
649    SIGN_EXTENSION r3, r3d
650    pxor        xmm7,       xmm7
651    WELS_DW32   xmm6
652
653    SSE2_Load8DC            xmm0, xmm1, xmm2, xmm3, xmm6, [r4]
654    SSE2_StoreDiff4x8p      xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3
655
656    lea         r0,     [r0 + 2 * r1]
657    lea         r2,     [r2 + 2 * r3]
658    SSE2_StoreDiff4x8p      xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3
659
660    lea         r0,     [r0 + 2 * r1]
661    lea         r2,     [r2 + 2 * r3]
662    SSE2_StoreDiff4x8p      xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3
663
664    lea         r0,     [r0 + 2 * r1]
665    lea         r2,     [r2 + 2 * r3]
666    SSE2_StoreDiff4x8p      xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3
667
668    SSE2_Load8DC            xmm0, xmm1, xmm2, xmm3, xmm6, [r4 + 16]
669    lea         r0,     [r0 + 2 * r1]
670    lea         r2,     [r2 + 2 * r3]
671    SSE2_StoreDiff4x8p      xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3
672
673    lea         r0,     [r0 + 2 * r1]
674    lea         r2,     [r2 + 2 * r3]
675    SSE2_StoreDiff4x8p      xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3
676
677    lea         r0,     [r0 + 2 * r1]
678    lea         r2,     [r2 + 2 * r3]
679    SSE2_StoreDiff4x8p      xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3
680
681    lea         r0,     [r0 + 2 * r1]
682    lea         r2,     [r2 + 2 * r3]
683    SSE2_StoreDiff4x8p      xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3
684    POP_XMM
685    LOAD_5_PARA_POP
686    ret
687
688
689;***********************************************************************
690; AVX2 functions
691;***********************************************************************
692
693%ifdef HAVE_AVX2
694; out=%1 pPixel1=%2 iStride1=%3 pPixel2=%4 iStride2=%5 wels_shufb0312_movzxw=%6 clobber=%7,%8
695%macro AVX2_LoadDiff16P 8
696    vmovq         x%1, [%2         ]
697    vpbroadcastq  y%7, [%2 + 4 * %3]
698    vpblendd      y%1, y%1, y%7, 11110000b
699    vpshufb       y%1, y%1, y%6
700    vmovq         x%7, [%4         ]
701    vpbroadcastq  y%8, [%4 + 4 * %5]
702    vpblendd      y%7, y%7, y%8, 11110000b
703    vpshufb       y%7, y%7, y%6
704    vpsubw        y%1, y%1, y%7
705%endmacro
706
707; pRec=%1 iStride=%2 data=%3,%4 pPred=%5 iPredStride=%6 dw32=%7 wels_shufb0312_movzxw=%8 clobber=%9,%10
708%macro AVX2_StoreDiff32P 10
709    vpaddw        y%3, y%3, y%7
710    vpsraw        y%3, y%3, 6
711    vmovq         x%9,  [%5         ]
712    vpbroadcastq  y%10, [%5 + 4 * %6]
713    add           %5, %6
714    vpblendd      y%9, y%9, y%10, 11110000b
715    vpshufb       y%9, y%9, y%8
716    vpaddsw       y%3, y%3, y%9
717    vpaddw        y%4, y%4, y%7
718    vpsraw        y%4, y%4, 6
719    vmovq         x%9,  [%5         ]
720    vpbroadcastq  y%10, [%5 + 4 * %6]
721    vpblendd      y%9, y%9, y%10, 11110000b
722    vpshufb       y%9, y%9, y%8
723    vpaddsw       y%4, y%4, y%9
724    vpackuswb     y%3, y%3, y%4
725    vbroadcasti128 y%4, [pic(wels_shufb0231_128)]
726    vpshufb       y%3, y%3, y%4
727    vextracti128  x%4, y%3, 1
728    vmovlps       [%1         ], x%3
729    vmovlps       [%1 + 4 * %2], x%4
730    add           %1, %2
731    vmovhps       [%1         ], x%3
732    vmovhps       [%1 + 4 * %2], x%4
733%endmacro
734
735; out=%1,%2,%3,%4 pDct=%5 clobber=%6
736%macro AVX2_Load4x16P 6
737    vmovdqa       x%2,      [%5+0x00]
738    vinserti128   y%2, y%2, [%5+0x40], 1
739    vmovdqa       x%6,      [%5+0x20]
740    vinserti128   y%6, y%6, [%5+0x60], 1
741    vpunpcklqdq   y%1, y%2, y%6
742    vpunpckhqdq   y%2, y%2, y%6
743    vmovdqa       x%4,      [%5+0x10]
744    vinserti128   y%4, y%4, [%5+0x50], 1
745    vmovdqa       x%6,      [%5+0x30]
746    vinserti128   y%6, y%6, [%5+0x70], 1
747    vpunpcklqdq   y%3, y%4, y%6
748    vpunpckhqdq   y%4, y%4, y%6
749%endmacro
750
751; pDct=%1 data=%1,%2,%3,%4 clobber=%5
752%macro AVX2_Store4x16P 6
753    vpunpcklqdq   y%6, y%2,  y%3
754    vmovdqa       [%1+0x00], x%6
755    vextracti128  [%1+0x40], y%6, 1
756    vpunpckhqdq   y%6, y%2,  y%3
757    vmovdqa       [%1+0x20], x%6
758    vextracti128  [%1+0x60], y%6, 1
759    vpunpcklqdq   y%6, y%4,  y%5
760    vmovdqa       [%1+0x10], x%6
761    vextracti128  [%1+0x50], y%6, 1
762    vpunpckhqdq   y%6, y%4,  y%5
763    vmovdqa       [%1+0x30], x%6
764    vextracti128  [%1+0x70], y%6, 1
765%endmacro
766
767%macro AVX2_Load4x4P 2
768    vmovdqu       y%1, [%2]
769%endmacro
770
771%macro AVX2_Store4x4P 2
772    vmovdqu       [%1], y%2
773%endmacro
774
775; Load 4 lines of 4 pixels, shuffle and zero extend to 16-bit.
776; out=%1 pPixel=%2 iStride=%3 [wels_shufb0312_movzxw]=%4 clobber=%5,%6
777%macro AVX2_Loadzx4x4P 6
778    vmovd         x%1, [%2         ]
779    add           %2, %3
780    vpbroadcastd  x%5, [%2 + 2 * %3]
781    vpblendd      x%1, x%1, x%5, 1010b
782    vpbroadcastd  y%5, [%2         ]
783    vpbroadcastd  y%6, [%2 +     %3]
784    vpblendd      y%5, y%5, y%6, 10101010b
785    vpblendd      y%1, y%1, y%5, 11110000b
786    vpshufb       y%1, y%1, %4
787%endmacro
788
789; out=%1 pPixel1=%2 iStride1=%3 pPixel2=%4 iStride2=%5 wels_shufb0312_movzxw=%6 clobber=%7,%8,%9
790%macro AVX2_LoadDiff4x4P 9
791    AVX2_Loadzx4x4P %1, %2, %3, y%6, %7, %8
792    AVX2_Loadzx4x4P %7, %4, %5, y%6, %8, %9
793    vpsubw        y%1, y%1, y%7
794%endmacro
795
796; pRec=%1 iStride=%2 data=%3 pPred=%4 iPredStride=%5 dw32=%6 wels_shufb0312_movzxw=%7 clobber=%8,%9,%10
797%macro AVX2_StoreDiff4x4P 10
798    vpaddw         y%3, y%3, y%6
799    vpsraw         y%3, y%3, 6
800    AVX2_Loadzx4x4P %8, %4, %5, y%7, %9, %10
801    vpaddsw        y%3, y%3, y%8
802    vpackuswb      y%3, y%3, y%3
803    vbroadcasti128 y%8, [pic(wels_shufb0231_128)]
804    vpshufb        y%3, y%3, y%8
805    vextracti128   x%8, y%3, 1
806    vmovd          [%1         ], x%3
807    add            %1, %2
808    vmovd          [%1         ], x%8
809    vpsrlq         x%8, x%8, 32
810    vmovd          [%1     + %2], x%8
811    vpsrlq         x%3, x%3, 32
812    vmovd          [%1 + 2 * %2], x%3
813%endmacro
814
815; 4-pt DCT
816; out=%1,%2,%3,%4 in=%1,%2,%3,%4 clobber=%5
817%macro AVX2_DCT 5
818    vpsubw        %5, %1, %4  ; s3 = x0 - x3
819    vpaddw        %1, %1, %4  ; s0 = x0 + x3
820    vpsubw        %4, %2, %3  ; s2 = x1 - x2
821    vpaddw        %2, %2, %3  ; s1 = x1 + x2
822    vpsubw        %3, %1, %2  ; y2 = s0 - s1
823    vpaddw        %1, %1, %2  ; y0 = s0 + s1
824    vpsllw        %2, %5, 1
825    vpaddw        %2, %2, %4  ; y1 = 2 * s3 + s2
826    vpsllw        %4, %4, 1
827    vpsubw        %4, %5, %4  ; y3 = s3 - 2 * s2
828%endmacro
829
830; 4-pt IDCT
831; out=%1,%2,%3,%4 in=%1,%2,%3,%4 clobber=%5
832%macro AVX2_IDCT 5
833    vpsraw        %5, %2, 1
834    vpsubw        %5, %5, %4  ; t3 = (x1 >> 1) - x3
835    vpsraw        %4, %4, 1
836    vpaddw        %4, %2, %4  ; t2 = x1 + (x3 >> 1)
837    vpaddw        %2, %1, %3  ; t0 = x0 + x2
838    vpsubw        %3, %1, %3  ; t1 = x0 - x2
839    vpaddw        %1, %2, %4  ; y0 = t0 + t2
840    vpsubw        %4, %2, %4  ; y3 = t0 - t2
841    vpaddw        %2, %3, %5  ; y1 = t1 + t3
842    vpsubw        %3, %3, %5  ; y2 = t1 - t3
843%endmacro
844
845; Do 4 horizontal 4-pt DCTs in parallel packed as 16 words in a ymm register.
846; Uses scrambled input to save a negation.
847; [y0,y1,y2,y3]=%1 [x0,x3,x1,x2]=%1 wels_shufb2301=%2 clobber=%3
848%macro AVX2_DCT_HORIZONTAL 3
849    vpsignw       %3, %1, [pic(wels_p1m1p1m1w_256)]  ; [x0,-x3,x1,-x2]
850    vpshufb       %1, %1, %2                    ; [x3,x0,x2,x1]
851    vpaddw        %1, %1, %3                    ; s = [x0+x3,-x3+x0,x1+x2,-x2+x1]
852    vpmullw       %3, %1, [pic(wels_p1p2m1m2w_256)]  ; [s[0],2*s[1],-s[2],-2*s[3], ...]
853    vpshufd       %1, %1, 0b1h                  ; [s[2],s[3],s[0],s[1], ...]
854    vpaddw        %1, %1, %3                    ; [y0,y1,y2,y3] = [s[0]+s[2],2*s[1]+s[3],-s[2]+s[0],-2*s[3]+s[1], ...]
855%endmacro
856
857; Do 4 horizontal 4-pt IDCTs in parallel packed as 16 words in a ymm register.
858; Output is scrambled to save a negation.
859; [y0,y3,y1,y2]=%1 [x0,x1,x2,x3]=%1 wels_shufb2301=%2 clobber=%3
860%macro AVX2_IDCT_HORIZONTAL 3
861    vpsraw        %3, %1, 1                     ; [x0>>1,x1>>1,x2>>1,x3>>1]
862    vpblendw      %3, %1, %3, 10101010b         ; [x0,x1>>1,x2,x3>>1]
863    vpsignw       %1, %1, [pic(wels_p1p1m1m1w_256)]  ; [x0,x1,-x2,-x3]
864    vpshufd       %3, %3, 0b1h                  ; [x2,x3>>1,x0,x1>>1]
865    vpaddw        %1, %3, %1                    ; s = [x2+x0,(x3>>1)+x1,x0-x2,(x1>>1)-x3]
866    vpshufb       %3, %1, %2                    ; [s[1],s[0],s[3],s[2], ...]
867    vpsignw       %1, %1, [pic(wels_p1m1p1m1w_256)]  ; [s[0],-s[1],s[2],-s[3], ...]
868    vpaddw        %1, %1, %3                    ; [y0,y3,y1,y2] = [s[0]+s[1],-s[1]+s[0],s[2]+s[3],-s[3]+s[2], ...]
869%endmacro
870
871; Do 4 vertical 4-pt DCTs in parallel packed as 16 words in a ymm register.
872; Uses scrambled input to save a negation.
873; [y0,y1,y2,y3]=%1 [x0,x3,x1,x2]=%1 clobber=%2
; %1: in/out data register; input rows arrive pre-scrambled as [x0,x3,x1,x2]
;     (the order the matching horizontal stage produces), output is in
;     natural [y0,y1,y2,y3] order.
; %2: scratch, fully clobbered.
; vpshufd 4eh rotates 4-word rows within each 128-bit lane; vpermq 4eh swaps
; the two 128-bit lane halves — together they realign rows for the butterfly.
; Reads wels_4xp1w_4xm1w_256 and wels_4xp1w_4xp2w_4xm1w_4xm2w via pic();
; the PIC base register must be initialized on X86_32 PIC builds.
874%macro AVX2_DCT_4x4P 2
875    vpsignw       %2, %1, [pic(wels_4xp1w_4xm1w_256)]    ; [x0,-x3,x1,-x2]
876    vpshufd       %1, %1, 4eh                            ; [x3,x0,x2,x1]
877    vpaddw        %1, %1, %2                             ; s = [x0+x3,-x3+x0,x1+x2,-x2+x1]
878    vpmullw       %2, %1, [pic(wels_4xp1w_4xp2w_4xm1w_4xm2w)] ; [s[0],2*s[1],-s[2],-2*s[3]]
879    vpermq        %1, %1, 4eh                            ; [s[2],s[3],s[0],s[1]]
880    vpaddw        %1, %1, %2                             ; [y0,y1,y2,y3] = [s[0]+s[2],2*s[1]+s[3],-s[2]+s[0],-2*s[3]+s[1]]
881%endmacro
882
883; Do 4 vertical 4-pt IDCTs in parallel packed as 16 words in a ymm register.
884; Output is scrambled to save a negation.
885; [y0,y3,y1,y2]=%1 [x0,x1,x2,x3]=%1 clobber=%2
; %1: in/out data register; input rows in natural order, output rows leave in
;     the scrambled [y0,y3,y1,y2] order consumed by the following store stage.
; %2: scratch, fully clobbered.
; Same butterfly as AVX2_IDCT_HORIZONTAL but across rows: vpermq 4eh swaps
; the 128-bit halves (whole row pairs) and vpshufd 4eh swaps 4-word rows
; within a lane. Reads wels_8xp1w_8xm1w and wels_4xp1w_4xm1w_256 via pic(),
; so the X86_32 PIC base register must be live here.
886%macro AVX2_IDCT_4x4P 2
887    vpsraw        %2, %1, 1                              ; [x0>>1,x1>>1,x2>>1,x3>>1]
888    vpblendw      %2, %1, %2, 11110000b                  ; [x0,x1>>1,x2,x3>>1]
889    vpsignw       %1, %1, [pic(wels_8xp1w_8xm1w)]        ; [x0,x1,-x2,-x3]
890    vpermq        %2, %2, 4eh                            ; [x2,x3>>1,x0,x1>>1]
891    vpaddw        %1, %2, %1                             ; s = [x2+x0,(x3>>1)+x1,x0-x2,(x1>>1)-x3]
892    vpshufd       %2, %1, 4eh                            ; [s[1],s[0],s[3],s[2]]
893    vpmullw       %1, %1, [pic(wels_4xp1w_4xm1w_256)]    ; [s[0],-s[1],s[2],-s[3], ...]
894    vpaddw        %1, %1, %2                             ; [y0,y3,y1,y2] = [s[0]+s[1],-s[1]+s[0],s[2]+s[3],-s[3]+s[2]]
895%endmacro
896
897;***********************************************************************
; Forward 4x4 transform of the difference pPixel1 - pPixel2, four blocks at
; a time, coefficients written to pDct.
; After LOAD_5_PARA: r0=pDct, r1=pPixel1, r2=iStride1, r3=pPixel2,
; r4=iStride2 (strides sign-extended from 32-bit below).
; Clobbers ymm0-ymm6 (saved/restored as needed by PUSH_XMM/POP_XMM on ABIs
; with callee-saved xmm regs); vzeroupper is issued before returning to
; avoid AVX->SSE transition penalties in the caller.
; The bare mmN macro arguments appear to be completed to xmmN/ymmN inside
; the helper macros (ymm0-ymm3 are used directly afterwards) — see
; asm_inc.asm to confirm.
898; void WelsDctFourT4_avx2(int16_t* pDct, uint8_t* pPixel1, int32_t iStride1, uint8_t* pPixel2, int32_t iStride2)
899;***********************************************************************
900WELS_EXTERN WelsDctFourT4_avx2
901    %assign push_num 0
902    INIT_X86_32_PIC r5                  ; X86_32 PIC base in r5; no-op elsewhere
903    LOAD_5_PARA
904    PUSH_XMM 7
905    SIGN_EXTENSION r2, r2d
906    SIGN_EXTENSION r4, r4d
907
    ; ymm6 = byte-shuffle constant used by the diff-load macro below.
908    vbroadcasti128 ymm6, [pic(wels_shufb0312_movzxw_128)]
909
910    ;Load 4x16: each AVX2_LoadDiff16P fills one register with pixel1-pixel2
    ; differences; the add pairs advance both source pointers by one stride.
911    AVX2_LoadDiff16P mm0, r1, r2, r3, r4, mm6, mm4, mm5
912    add r1, r2
913    add r3, r4
914    AVX2_LoadDiff16P mm1, r1, r2, r3, r4, mm6, mm4, mm5
915    add r1, r2
916    add r3, r4
917    AVX2_LoadDiff16P mm2, r1, r2, r3, r4, mm6, mm4, mm5
918    add r1, r2
919    add r3, r4
920    AVX2_LoadDiff16P mm3, r1, r2, r3, r4, mm6, mm4, mm5
921
    ; Vertical pass, then horizontal pass on each register.
922    AVX2_DCT ymm0, ymm1, ymm2, ymm3, ymm5
923    vbroadcasti128 ymm6, [pic(wels_shufb2301_128)]  ; shuffle const for horizontal pass
924    AVX2_DCT_HORIZONTAL ymm0, ymm6, ymm5
925    AVX2_DCT_HORIZONTAL ymm1, ymm6, ymm5
926    AVX2_DCT_HORIZONTAL ymm2, ymm6, ymm5
927    AVX2_DCT_HORIZONTAL ymm3, ymm6, ymm5
928
929    AVX2_Store4x16P r0, mm0, mm1, mm2, mm3, mm5     ; write coefficients to pDct
930    vzeroupper
931
    ; Teardown in reverse order of setup.
932    POP_XMM
933    LOAD_5_PARA_POP
934    DEINIT_X86_32_PIC
935    ret
936
937;***********************************************************************
; Thin 3-argument wrapper: rearranges (pPred, iStride, pDct) into the
; 5-argument (pRec, iStride, pPred, iPredStride, pDct) form expected by
; WelsIDctFourT4Rec_avx2 and tail-jumps past its LOAD_5_PARA to .begin.
; Presumably pPred serves as both source prediction and destination
; (in-place reconstruction) — the macro body is in asm_inc.asm; confirm there.
; NOTE(review): pNzc is never loaded (only 3 params are read), so it appears
; unused by this implementation.
; LOAD_3_PARA_TO_5_PARA_IDCT adjusts push_num/stack on X86_32; the jmp (not
; call) keeps that frame intact and lets the target's epilogue unwind it.
938; void IdctFourResAddPred_avx2(uint8_t* pPred, int32_t iStride, const int16_t* pDct, const int8_t* pNzc);
939;***********************************************************************
940WELS_EXTERN IdctFourResAddPred_avx2
941    %assign push_num 0
942    LOAD_3_PARA_TO_5_PARA_IDCT
943    jmp prefixed(WelsIDctFourT4Rec_avx2.begin)
944
945;***********************************************************************
; Inverse 4x4 transform of four blocks from pDct, added to the prediction
; pPred and written to pRec.
; After LOAD_5_PARA: r0=pRec, r1=iStride, r2=pPred, r3=iPredStride, r4=pDct.
; .begin is a secondary entry point used by IdctFourResAddPred_avx2, which
; performs its own parameter setup before jumping here — everything from
; .begin down must therefore not assume LOAD_5_PARA ran.
; Clobbers ymm0-ymm7; vzeroupper before return.
946; void WelsIDctFourT4Rec_avx2(uint8_t* pRec, int32_t iStride, uint8_t* pPred, int32_t iPredStride, int16_t* pDct);
947;***********************************************************************
948WELS_EXTERN WelsIDctFourT4Rec_avx2
949    %assign push_num 0
950    LOAD_5_PARA
951.begin:
952    INIT_X86_32_PIC r5
953    PUSH_XMM 8
954    SIGN_EXTENSION r1, r1d
955    SIGN_EXTENSION r3, r3d
956
    ; Load all coefficients, then horizontal pass per register, then the
    ; combined vertical pass. (The IDCT_HORIZONTAL output scrambling is
    ; compensated inside the later stages — order of passes matters.)
957    AVX2_Load4x16P mm0, mm1, mm2, mm3, r4, mm5
958    vbroadcasti128 ymm6, [pic(wels_shufb2301_128)]
959    AVX2_IDCT_HORIZONTAL ymm0, ymm6, ymm5
960    AVX2_IDCT_HORIZONTAL ymm1, ymm6, ymm5
961    AVX2_IDCT_HORIZONTAL ymm2, ymm6, ymm5
962    AVX2_IDCT_HORIZONTAL ymm3, ymm6, ymm5
963    AVX2_IDCT ymm0, ymm1, ymm2, ymm3, ymm5
964
    ; ymm7 = 32 per word: rounding term for the final (x+32)>>6 style scaling
    ; presumably applied inside AVX2_StoreDiff32P — TODO confirm in asm_inc.asm.
965    vbroadcasti128 ymm6, [pic(wels_shufb0312_movzxw_128)]
966    vbroadcasti128 ymm7, [pic(wels_dw32_128)]
967    AVX2_StoreDiff32P r0, r1, mm0, mm1, r2, r3, mm7, mm6, mm5, mm4
968    add r2, r3
969    add r0, r1
970    AVX2_StoreDiff32P r0, r1, mm2, mm3, r2, r3, mm7, mm6, mm5, mm4
971    vzeroupper
972
    ; Teardown mirrors setup LIFO order: PIC init sits after LOAD_5_PARA here
    ; (because .begin lies between them), so DEINIT precedes LOAD_5_PARA_POP.
973    POP_XMM
974    DEINIT_X86_32_PIC
975    LOAD_5_PARA_POP
976    ret
977
978;***********************************************************************
; Forward 4x4 transform of a single block of differences pPixel1 - pPixel2,
; coefficients written to pDct. Single-block variant of WelsDctFourT4_avx2:
; the whole 4x4 block fits in one ymm register (16 words), so the vertical
; pass is the packed AVX2_DCT_4x4P.
; After LOAD_5_PARA: r0=pDct, r1=pPixel1, r2=iStride1, r3=pPixel2,
; r4=iStride2. Clobbers ymm0-ymm4; vzeroupper before return.
979; void WelsDctT4_avx2(int16_t* pDct, uint8_t* pPixel1, int32_t iStride1, uint8_t* pPixel2, int32_t iStride2)
980;***********************************************************************
981WELS_EXTERN WelsDctT4_avx2
982    %assign push_num 0
983    INIT_X86_32_PIC r5
984    LOAD_5_PARA
985    PUSH_XMM 5
986    SIGN_EXTENSION r2, r2d
987    SIGN_EXTENSION r4, r4d
988
    ; ymm0 = 4x4 difference block; vertical then horizontal DCT pass; note
    ; AVX2_DCT_HORIZONTAL consumes/produces the row scrambling that
    ; AVX2_DCT_4x4P's packed layout expects.
989    vbroadcasti128 ymm1, [pic(wels_shufb0312_movzxw_128)]
990    AVX2_LoadDiff4x4P mm0, r1, r2, r3, r4, mm1, mm2, mm3, mm4
991    AVX2_DCT_4x4P ymm0, ymm2
992    vbroadcasti128 ymm1, [pic(wels_shufb2301_128)]
993    AVX2_DCT_HORIZONTAL ymm0, ymm1, ymm2
994    AVX2_Store4x4P r0, mm0
995    vzeroupper
996
997    POP_XMM
998    LOAD_5_PARA_POP
999    DEINIT_X86_32_PIC
1000    ret
1001
1002;***********************************************************************
; Thin 3-argument wrapper: rearranges (pPred, iStride, pDct) into the
; 5-argument (pRec, iStride, pPred, iPredStride, pDct) form and tail-jumps
; past WelsIDctT4Rec_avx2's LOAD_5_PARA to its .begin label. Presumably
; pPred doubles as source prediction and destination (in-place add) —
; the macro body lives in asm_inc.asm; confirm there.
; The jmp (not call) leaves the stack exactly as LOAD_3_PARA_TO_5_PARA_IDCT
; arranged it; the target's epilogue performs the matching unwinding.
1003; void IdctResAddPred_avx2(uint8_t* pPred, int32_t iStride, int16_t* pDct);
1004;***********************************************************************
1005WELS_EXTERN IdctResAddPred_avx2
1006    %assign push_num 0
1007    LOAD_3_PARA_TO_5_PARA_IDCT
1008    jmp prefixed(WelsIDctT4Rec_avx2.begin)
1009
1010;***********************************************************************
; Inverse 4x4 transform of a single block from pDct, added to prediction
; pPred and written to pRec. Single-block variant of WelsIDctFourT4Rec_avx2:
; the whole block is held in one ymm register, vertical pass via the packed
; AVX2_IDCT_4x4P.
; After LOAD_5_PARA: r0=pRec, r1=iStride, r2=pPred, r3=iPredStride, r4=pDct.
; .begin is a secondary entry point used by IdctResAddPred_avx2, which does
; its own parameter setup before jumping here.
; Clobbers ymm0-ymm5; vzeroupper before return.
1011; void WelsIDctT4Rec_avx2(uint8_t* pRec, int32_t iStride, uint8_t* pPred, int32_t iPredStride, int16_t* pDct);
1012;***********************************************************************
1013WELS_EXTERN WelsIDctT4Rec_avx2
1014    %assign push_num 0
1015    LOAD_5_PARA
1016.begin:
1017    INIT_X86_32_PIC r5
1018    PUSH_XMM 6
1019    SIGN_EXTENSION r1, r1d
1020    SIGN_EXTENSION r3, r3d
1021
    ; Horizontal pass first, then vertical; AVX2_IDCT_4x4P consumes the
    ; scrambled row order that AVX2_IDCT_HORIZONTAL outputs.
1022    AVX2_Load4x4P mm0, r4
1023    vbroadcasti128 ymm4, [pic(wels_shufb2301_128)]
1024    AVX2_IDCT_HORIZONTAL ymm0, ymm4, ymm1
1025    AVX2_IDCT_4x4P ymm0, ymm1
    ; ymm5 = 32 per word: rounding constant presumably used by the store
    ; macro's final scaling — TODO confirm in asm_inc.asm.
1026    vbroadcasti128 ymm4, [pic(wels_shufb0312_movzxw_128)]
1027    vbroadcasti128 ymm5, [pic(wels_dw32_128)]
1028    AVX2_StoreDiff4x4P r0, r1, mm0, r2, r3, mm5, mm4, mm1, mm2, mm3
1029    vzeroupper
1030
    ; Teardown mirrors setup LIFO order (PIC init happened after LOAD_5_PARA).
1031    POP_XMM
1032    DEINIT_X86_32_PIC
1033    LOAD_5_PARA_POP
1034    ret
1035%endif
1036
1037