• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1;*!
2;* \copy
3;*     Copyright (c)  2009-2013, Cisco Systems
4;*     All rights reserved.
5;*
6;*     Redistribution and use in source and binary forms, with or without
7;*     modification, are permitted provided that the following conditions
8;*     are met:
9;*
10;*        * Redistributions of source code must retain the above copyright
11;*          notice, this list of conditions and the following disclaimer.
12;*
13;*        * Redistributions in binary form must reproduce the above copyright
14;*          notice, this list of conditions and the following disclaimer in
15;*          the documentation and/or other materials provided with the
16;*          distribution.
17;*
18;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
21;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
22;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
23;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
24;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
28;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29;*     POSSIBILITY OF SUCH DAMAGE.
30;*
31;*
32;*  mc_luma.asm
33;*
34;*  Abstract
35;*      sse2 motion compensation
36;*
37;*  History
38;*      17/08/2009 Created
39;*
40;*
41;*************************************************************************/
42%include "asm_inc.asm"
43
;*******************************************************************************
; Local Data (Read Only)
;*******************************************************************************
; The tables are emitted into .text when X86_32_PICASM is defined and into
; .rodata otherwise; in both cases the section is 32-byte aligned.
%ifdef X86_32_PICASM
SECTION .text align=32
%else
SECTION .rodata align=32
%endif

;*******************************************************************************
; Constant tables: signed byte coefficient pairs, byte shuffle masks and
; word/dword rounding values.
;*******************************************************************************

%ifdef HAVE_AVX2
; 256-bit tables. Every entry is exactly 32 bytes long, so each label keeps the
; 32-byte alignment established by the ALIGN below.
ALIGN 32
dwm32768_256:                               ; 16 words of -32768
    times 16 dw -32768
maddubsw_m2p10_m40m40_p10m2_p0p0_256:       ; 4 x { -2,10, -40,-40, 10,-2, 0,0 }
    times 4 db -2, 10, -40, -40, 10, -2, 0, 0
dwm1024_256:                                ; 16 words of -1024
    times 16 dw -1024
dd32768_256:                                ; 8 dwords of 32768
    times 8 dd 32768
maddubsw_p1m5_256:                          ; 16 byte pairs { 1, -5 }
    times 16 db 1, -5
maddubsw_m5p1_256:                          ; 16 byte pairs { -5, 1 }
    times 16 db -5, 1
db20_256:                                   ; 32 bytes of 20
    times 32 db 20
maddubsw_m5p20_256:                         ; 16 byte pairs { -5, 20 }
    times 16 db -5, 20
maddubsw_p20m5_256:                         ; 16 byte pairs { 20, -5 }
    times 16 db 20, -5
h264_w0x10_256:                             ; 16 words of 16
    times 16 dw 16
dw32_256:                                   ; 16 words of 32
    times 16 dw 32
%endif ; HAVE_AVX2

; 128-bit tables. Every entry is exactly 16 bytes long, so each label keeps the
; 16-byte alignment established by the ALIGN below.
ALIGN 16
shufb_32435465768798A9:                     ; byte indices, as spelled in the name
    db 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9
shufb_011267784556ABBC:                     ; byte indices, as spelled in the name
    db 0, 1, 1, 2, 6, 7, 7, 8, 4, 5, 5, 6, 10, 11, 11, 12
maddubsw_p1m5_p1m5_m5p1_m5p1_128:           ; 2 x { 1,-5, 1,-5, -5,1, -5,1 }
    times 2 db 1, -5, 1, -5, -5, 1, -5, 1
maddubsw_m2p10_m40m40_p10m2_p0p0_128:       ; 2 x { -2,10, -40,-40, 10,-2, 0,0 }
    times 2 db -2, 10, -40, -40, 10, -2, 0, 0
dwm1024_128:                                ; 8 words of -1024
    times 8 dw -1024
dd32768_128:                                ; 4 dwords of 32768
    times 4 dd 32768
maddubsw_p1m5_128:                          ; 8 byte pairs { 1, -5 }
    times 8 db 1, -5
maddubsw_m5p1_128:                          ; 8 byte pairs { -5, 1 }
    times 8 db -5, 1
db20_128:                                   ; 16 bytes of 20
    times 16 db 20
maddubsw_m5p20_128:                         ; 8 byte pairs { -5, 20 }
    times 8 db -5, 20
maddubsw_p20m5_128:                         ; 8 byte pairs { 20, -5 }
    times 8 db 20, -5
h264_w0x10_1:                               ; 8 words of 16: rounding term added before ">> 5"
    times 8 dw 16
ALIGN 16
h264_mc_hc_32:                              ; 8 words of 32: rounding term added before ">> 6"
    times 8 dw 32
111
112
;*******************************************************************************
; Code
;*******************************************************************************

SECTION .text

%ifdef X86_32_PICASM

; MOVEIMM_DW16 reg
;   Sets every 16-bit lane of %1 to 16 without a memory operand:
;   compare-equal with itself gives all ones (0xFFFF), a logical right shift
;   by 15 leaves 1, and a left shift by 4 turns that into 16.
;   Only defined for X86_32_PICASM builds.
%macro MOVEIMM_DW16 1
    pcmpeqw     %1, %1          ; 0xFFFF in every word
    psrlw       %1, 15          ; -> 1
    psllw       %1, 4           ; -> 16
%endmacro

%endif
128
;*******************************************************************************
; void McHorVer20WidthEq4_mmx( const uint8_t *pSrc,
;                              int iSrcStride,
;                              uint8_t *pDst,
;                              int iDstStride,
;                              int iHeight )
;
; Horizontal 6-tap filter producing 4 output pixels per row:
;   dst[x] = sat_u8( (a - 5*b + 20*c + 20*d - 5*e + f + 16) >> 5 )
; where a..f are src[x-2] .. src[x+3].
;
; Register roles:
;   r0 = current source row (already moved back by 2 pixels)
;   r1 = source stride, r2 = current destination row, r3 = destination stride
;   r4 = rows remaining (tested only after a row is written, so it must be >= 1)
;   mm7 = zero, mm6 = 16 in every word
;*******************************************************************************
WELS_EXTERN McHorVer20WidthEq4_mmx
    %assign  push_num 0
    LOAD_5_PARA
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r3, r3d
    SIGN_EXTENSION  r4, r4d

    sub         r0, 2                   ; leftmost tap is two pixels left of the output
    WELS_Zero   mm7
%ifdef X86_32_PICASM
    MOVEIMM_DW16 mm6                    ; build the constant in-register (no data reference)
%else
    movq        mm6, [h264_w0x10_1]
%endif

.row_loop:
    ; Fetch the six shifted 4-byte windows in tap order.
    movd        mm0, [r0]               ; a
    movd        mm2, [r0+1]             ; b
    movd        mm4, [r0+2]             ; c
    movd        mm5, [r0+3]             ; d
    movd        mm3, [r0+4]             ; e
    movd        mm1, [r0+5]             ; f

    ; Widen bytes to words.
    punpcklbw   mm0, mm7
    punpcklbw   mm2, mm7
    punpcklbw   mm4, mm7
    punpcklbw   mm5, mm7
    punpcklbw   mm3, mm7
    punpcklbw   mm1, mm7

    paddw       mm0, mm1                ; a + f
    paddw       mm0, mm6                ; a + f + 16
    paddw       mm4, mm5                ; c + d
    paddw       mm2, mm3                ; b + e
    psllw       mm4, 2                  ; 4(c+d)
    psubw       mm4, mm2                ; t = 4(c+d) - (b+e)
    paddw       mm0, mm4                ; + t
    psllw       mm4, 2                  ; 4t
    paddw       mm0, mm4                ; + 4t  => a + f + 16 + 20(c+d) - 5(b+e)
    psraw       mm0, 5
    packuswb    mm0, mm7                ; saturate to unsigned bytes
    movd        [r2], mm0               ; store 4 pixels

    lea         r0, [r0+r1]             ; next source row
    lea         r2, [r2+r3]             ; next destination row
    dec         r4
    jnz         .row_loop

    WELSEMMS
    LOAD_5_PARA_POP
    ret
185
;*******************************************************************************
; Macros and other preprocessor constants
;*******************************************************************************


; SSE_LOAD_8P dst, zero, mem
;   Loads 8 bytes from %3 into the low half of %1 and interleaves them with the
;   low bytes of %2. With %2 holding zero this widens the 8 bytes to 8 words.
;   Clobbers: %1 only.
%macro SSE_LOAD_8P 3
    movq        %1, %3
    punpcklbw   %1, %2
%endmacro
195
; FILTER_HV_CORE a, b, c, d, e, f, tmp0, tmp1
;   Shared arithmetic of FILTER_HV_W8 / FILTER_HV_W4. %1..%6 hold six rows of
;   pixels widened to words (taps a..f). Computes, per word lane,
;       (a + f + 16 + 20*(c + d) - 5*(b + e)) >> 5
;   and packs the result to unsigned-saturated bytes in the low 8 bytes of %1.
;   The rounding constant is fetched through pic(), so the caller must have
;   set that up where it is required.
;   On exit: %1 = packed result, %7 = b + e (scratch), %8 = 0.
;   %2..%6 are left untouched. Callers depend on %8 being zero afterwards
;   (it is reused as the zero operand of the next SSE_LOAD_8P).
%macro FILTER_HV_CORE 8
    paddw       %1, %6                      ; a + f
    paddw       %1, [pic(h264_w0x10_1)]     ; + 16 (rounding)
    movdqa      %8, %3
    movdqa      %7, %2
    paddw       %8, %4                      ; c + d
    paddw       %7, %5                      ; b + e
    psllw       %8, 2                       ; 4(c+d)
    psubw       %8, %7                      ; t = 4(c+d) - (b+e)
    paddw       %1, %8                      ; + t
    psllw       %8, 2                       ; 4t
    paddw       %1, %8                      ; + 4t
    psraw       %1, 5
    WELS_Zero   %8
    packuswb    %1, %8                      ; saturate to unsigned bytes
%endmacro

; FILTER_HV_W8 a, b, c, d, e, f, tmp0, tmp1, dst
;   Vertical 6-tap filter over 8 columns; stores 8 result bytes to %9.
;   Register effects are those of FILTER_HV_CORE.
%macro FILTER_HV_W8 9
    FILTER_HV_CORE %1, %2, %3, %4, %5, %6, %7, %8
    movq        %9, %1
%endmacro


; FILTER_HV_W4 a, b, c, d, e, f, tmp0, tmp1, dst
;   Same computation as FILTER_HV_W8 but only the first 4 result bytes are
;   stored to %9. Register effects are those of FILTER_HV_CORE.
%macro FILTER_HV_W4 9
    FILTER_HV_CORE %1, %2, %3, %4, %5, %6, %7, %8
    movd        %9, %1
%endmacro
232
233
;*******************************************************************************
; Code
;*******************************************************************************

SECTION .text

;***********************************************************************
; void McHorVer22Width8HorFirst_sse2(const uint8_t *pSrc,
;                       int32_t iSrcStride,
;                       uint8_t *pDst,
;                       int32_t iDstStride,
;                       int32_t iHeight
;                       )
;
; Horizontal pass of the two-dimensional (horizontal, then vertical) filter,
; 8 columns wide. For every row it reads source BYTES and writes eight
; unrounded, unclipped 16-bit sums
;       a - 5*b + 20*c + 20*d - 5*e + f        (a..f = six consecutive pixels)
; to pDst, i.e. 16 bytes per row. iDstStride is added to pDst as a byte count.
;
; Notes:
;   * Processing starts two rows above pSrc (r0 -= 2 * iSrcStride). The
;     original comment says "need more 5 lines", so iHeight presumably already
;     includes the 5 extra rows required by the vertical pass - confirm
;     against the caller.
;   * No horizontal offset is applied here; the first tap is read at pSrc[0].
;   * The store uses movdqa, so pDst and iDstStride must keep every row
;     16-byte aligned.
;   * iHeight must be >= 1 (the counter is tested after the first row).
;***********************************************************************
WELS_EXTERN McHorVer22Width8HorFirst_sse2
    %assign  push_num 0
    LOAD_5_PARA
    PUSH_XMM 8
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r3, r3d
    SIGN_EXTENSION  r4, r4d
    pxor        xmm7, xmm7              ; zero, used to widen bytes to words

    sub         r0, r1                  ; start two rows above pSrc
    sub         r0, r1

.yloop_width_8:
    movq        xmm0, [r0]              ; a
    punpcklbw   xmm0, xmm7
    movq        xmm1, [r0+5]            ; f
    punpcklbw   xmm1, xmm7
    movq        xmm2, [r0+1]            ; b
    punpcklbw   xmm2, xmm7
    movq        xmm3, [r0+4]            ; e
    punpcklbw   xmm3, xmm7
    movq        xmm4, [r0+2]            ; c
    punpcklbw   xmm4, xmm7
    movq        xmm5, [r0+3]            ; d
    punpcklbw   xmm5, xmm7

    paddw       xmm2, xmm3              ; b + e
    paddw       xmm4, xmm5              ; c + d
    psllw       xmm4, 2                 ; 4(c+d)
    psubw       xmm4, xmm2              ; t = 4(c+d) - (b+e)
    paddw       xmm0, xmm1              ; a + f
    paddw       xmm0, xmm4              ; + t
    psllw       xmm4, 2                 ; 4t
    paddw       xmm0, xmm4              ; + 4t => a + f + 20(c+d) - 5(b+e)
    movdqa      [r2], xmm0              ; 8 words, aligned store

    add         r0, r1
    add         r2, r3
    dec         r4
    jnz         .yloop_width_8
    POP_XMM
    LOAD_5_PARA_POP
    ret
291
;*******************************************************************************
; void McHorVer20WidthEq8_sse2( const uint8_t *pSrc,
;                               int iSrcStride,
;                               uint8_t *pDst,
;                               int iDstStride,
;                               int iHeight );
;
; Horizontal 6-tap filter producing 8 output pixels per row:
;   dst[x] = sat_u8( (a - 5*b + 20*c + 20*d - 5*e + f + 16) >> 5 )
; where a..f are src[x-2] .. src[x+3].
;
; Register roles:
;   r0 = source row (moved back by 2 pixels), r1 = source stride
;   r2 = destination row, r3 = destination stride
;   r4 = rows remaining (must be >= 1)
;   xmm7 = zero, xmm6 = 16 in every word
;*******************************************************************************
WELS_EXTERN McHorVer20WidthEq8_sse2
    %assign  push_num 0
    LOAD_5_PARA
    PUSH_XMM 8
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r3, r3d
    SIGN_EXTENSION  r4, r4d

    sub         r0, 2                   ; pSrc -= 2
    pxor        xmm7, xmm7
%ifdef X86_32_PICASM
    MOVEIMM_DW16 xmm6                   ; build the constant in-register (no data reference)
%else
    movdqa      xmm6, [h264_w0x10_1]
%endif

.row_loop:
    ; Six shifted 8-byte windows, fetched in tap order.
    movq        xmm0, [r0]              ; a
    movq        xmm2, [r0+1]            ; b
    movq        xmm4, [r0+2]            ; c
    movq        xmm5, [r0+3]            ; d
    movq        xmm3, [r0+4]            ; e
    movq        xmm1, [r0+5]            ; f

    ; Widen bytes to words.
    punpcklbw   xmm0, xmm7
    punpcklbw   xmm2, xmm7
    punpcklbw   xmm4, xmm7
    punpcklbw   xmm5, xmm7
    punpcklbw   xmm3, xmm7
    punpcklbw   xmm1, xmm7

    paddw       xmm0, xmm1              ; a + f
    paddw       xmm0, xmm6              ; + 16
    paddw       xmm4, xmm5              ; c + d
    paddw       xmm2, xmm3              ; b + e
    psllw       xmm4, 2                 ; 4(c+d)
    psubw       xmm4, xmm2              ; t = 4(c+d) - (b+e)
    paddw       xmm0, xmm4              ; + t
    psllw       xmm4, 2                 ; 4t
    paddw       xmm0, xmm4              ; + 4t
    psraw       xmm0, 5

    packuswb    xmm0, xmm7              ; saturate to unsigned bytes
    movq        [r2], xmm0              ; store 8 pixels

    add         r2, r3                  ; next destination row
    add         r0, r1                  ; next source row
    dec         r4
    jnz near    .row_loop

    POP_XMM
    LOAD_5_PARA_POP
    ret
351
;*******************************************************************************
; void McHorVer20WidthEq16_sse2( const uint8_t *pSrc,
;                                int iSrcStride,
;                                uint8_t *pDst,
;                                int iDstStride,
;                                int iHeight );
;
; Horizontal 6-tap filter producing 16 output pixels per row, computed as two
; independent 8-pixel halves (column offsets 0 and 8):
;   dst[x] = sat_u8( (a - 5*b + 20*c + 20*d - 5*e + f + 16) >> 5 )
; where a..f are src[x-2] .. src[x+3].
;
; Register roles:
;   r0 = source row (moved back by 2 pixels), r1 = source stride
;   r2 = destination row, r3 = destination stride
;   r4 = rows remaining (must be >= 1)
;   xmm7 = zero, xmm6 = 16 in every word
;*******************************************************************************
WELS_EXTERN McHorVer20WidthEq16_sse2
    %assign  push_num 0
    LOAD_5_PARA
    PUSH_XMM 8
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r3, r3d
    SIGN_EXTENSION  r4, r4d
    lea         r0, [r0-2]              ; pSrc -= 2

    pxor        xmm7, xmm7
%ifdef X86_32_PICASM
    MOVEIMM_DW16 xmm6
%else
    movdqa      xmm6, [h264_w0x10_1]
%endif

.row_loop:
    ; The block below is assembled twice: once for columns 0..7 and once for
    ; columns 8..15. MC20W16_COL is an assembly-time constant, not a register.
%assign MC20W16_COL 0
%rep 2
    movq        xmm0, [r0+MC20W16_COL]          ; a
    punpcklbw   xmm0, xmm7
    movq        xmm1, [r0+MC20W16_COL+5]        ; f
    punpcklbw   xmm1, xmm7
    movq        xmm2, [r0+MC20W16_COL+1]        ; b
    punpcklbw   xmm2, xmm7
    movq        xmm3, [r0+MC20W16_COL+4]        ; e
    punpcklbw   xmm3, xmm7
    movq        xmm4, [r0+MC20W16_COL+2]        ; c
    punpcklbw   xmm4, xmm7
    movq        xmm5, [r0+MC20W16_COL+3]        ; d
    punpcklbw   xmm5, xmm7

    paddw       xmm2, xmm3                      ; b + e
    paddw       xmm4, xmm5                      ; c + d
    psllw       xmm4, 2                         ; 4(c+d)
    psubw       xmm4, xmm2                      ; t = 4(c+d) - (b+e)
    paddw       xmm0, xmm1                      ; a + f
    paddw       xmm0, xmm4                      ; + t
    psllw       xmm4, 2                         ; 4t
    paddw       xmm0, xmm4                      ; + 4t
    paddw       xmm0, xmm6                      ; + 16
    psraw       xmm0, 5
    packuswb    xmm0, xmm7                      ; saturate to unsigned bytes
    movq        [r2+MC20W16_COL], xmm0          ; store 8 pixels of this half

%assign MC20W16_COL MC20W16_COL+8
%endrep
%undef MC20W16_COL

    lea         r2, [r2+r3]             ; next destination row
    lea         r0, [r0+r1]             ; next source row
    dec         r4
    jnz near    .row_loop

    POP_XMM
    LOAD_5_PARA_POP
    ret
437
438
;*******************************************************************************
; void McHorVer02WidthEq8_sse2( const uint8_t *pSrc,
;                               int iSrcStride,
;                               uint8_t *pDst,
;                               int iDstStride,
;                               int iHeight )
;
; Vertical 6-tap filter, 8 columns wide. Each output row is computed by
; FILTER_HV_W8 from six consecutive source rows (two above .. three below).
;
; The six live rows are kept widened to words in xmm registers. Instead of
; shuffling registers after every row, the loop body is unrolled 8 times and
; the role of each register rotates by one per step; the register freed by a
; step is reloaded with the next source row. FILTER_HV_W8 leaves its 8th
; argument zeroed, which is why that register can serve as the "zero" operand
; of the following SSE_LOAD_8P.
;
; Register roles:
;   r0 = source pointer (advanced two rows at a time, odd rows via [r0+r1])
;   r1 = source stride, r2 = destination pointer, r3 = destination stride
;   r4 = rows remaining (must be >= 1), r5 = PIC register on X86_32 PIC builds
;*******************************************************************************
WELS_EXTERN McHorVer02WidthEq8_sse2
    %assign  push_num 0
    INIT_X86_32_PIC r5
    LOAD_5_PARA
    PUSH_XMM 8
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r3, r3d
    SIGN_EXTENSION  r4, r4d
    sub         r0, r1                  ; start two rows above pSrc
    sub         r0, r1

    WELS_Zero   xmm7

    ; Prime the pipeline with the first six source rows.
    SSE_LOAD_8P xmm0, xmm7, [r0]
    SSE_LOAD_8P xmm1, xmm7, [r0+r1]
    lea         r0, [r0+2*r1]
    SSE_LOAD_8P xmm2, xmm7, [r0]
    SSE_LOAD_8P xmm3, xmm7, [r0+r1]
    lea         r0, [r0+2*r1]
    SSE_LOAD_8P xmm4, xmm7, [r0]
    SSE_LOAD_8P xmm5, xmm7, [r0+r1]

.rotate:
    ; step 0: rows in xmm0..xmm5
    FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
    dec         r4
    jz near     .done

    ; step 1: rows in xmm1..xmm6
    lea         r0, [r0+2*r1]
    SSE_LOAD_8P xmm6, xmm7, [r0]
    FILTER_HV_W8 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r2+r3]
    dec         r4
    jz near     .done

    ; step 2: rows in xmm2..xmm7
    lea         r2, [r2+2*r3]
    SSE_LOAD_8P xmm7, xmm0, [r0+r1]
    FILTER_HV_W8 xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
    dec         r4
    jz near     .done

    ; step 3: rows in xmm3..xmm7, xmm0
    lea         r0, [r0+2*r1]
    SSE_LOAD_8P xmm0, xmm1, [r0]
    FILTER_HV_W8 xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, [r2+r3]
    dec         r4
    jz near     .done

    ; step 4: rows in xmm4..xmm7, xmm0, xmm1
    lea         r2, [r2+2*r3]
    SSE_LOAD_8P xmm1, xmm2, [r0+r1]
    FILTER_HV_W8 xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, [r2]
    dec         r4
    jz near     .done

    ; step 5: rows in xmm5..xmm7, xmm0..xmm2
    lea         r0, [r0+2*r1]
    SSE_LOAD_8P xmm2, xmm3, [r0]
    FILTER_HV_W8 xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, [r2+r3]
    dec         r4
    jz near     .done

    ; step 6: rows in xmm6, xmm7, xmm0..xmm3
    lea         r2, [r2+2*r3]
    SSE_LOAD_8P xmm3, xmm4, [r0+r1]
    FILTER_HV_W8 xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, [r2]
    dec         r4
    jz near     .done

    ; step 7: rows in xmm7, xmm0..xmm4
    lea         r0, [r0+2*r1]
    SSE_LOAD_8P xmm4, xmm5, [r0]
    FILTER_HV_W8 xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, [r2+r3]
    dec         r4
    jz near     .done

    ; Back to the step-0 register assignment: rows in xmm0..xmm5.
    lea         r2, [r2+2*r3]
    SSE_LOAD_8P xmm5, xmm6, [r0+r1]
    jmp near    .rotate

.done:
    POP_XMM
    LOAD_5_PARA_POP
    DEINIT_X86_32_PIC
    ret
524
;***********************************************************************
; Code
;***********************************************************************

SECTION .text



;***********************************************************************
; void McHorVer02Height9Or17_sse2( const uint8_t *pSrc,
;                                  int32_t iSrcStride,
;                                  uint8_t *pDst,
;                                  int32_t iDstStride,
;                                  int32_t iWidth,
;                                  int32_t iHeight )
;
; Vertical 6-tap filter (FILTER_HV_W8) processed in strips of 8 columns;
; iWidth >> 3 strips are handled, each iHeight rows tall.
;
; Per strip: six source rows are loaded, the first output row is produced
; outside the unrolled loop, the row registers are shifted down by one, and
; the remaining rows run through an 8-step register-rotating loop.
;
; Register roles:
;   r0 = source pointer, r1 = source stride
;   r2 = destination pointer, r3 = destination stride
;   r4 = strips remaining, r5 = rows remaining in the current strip
;   r6 = PIC register on X86_32 PIC builds
;   r12/r13/r14 (non-X86_32 only) = saved pSrc / pDst / iHeight
;
; NOTE(review): every new strip is positioned at "saved base + 8", so only the
; second strip is placed correctly; the routine relies on iWidth >> 3 being
; 1 or 2. The row counter is decremented once before it is first tested, so
; iHeight must be at least 2.
;***********************************************************************
WELS_EXTERN McHorVer02Height9Or17_sse2
    %assign  push_num 0
    INIT_X86_32_PIC r6
    LOAD_6_PARA
    PUSH_XMM 8
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r3, r3d
    SIGN_EXTENSION  r4, r4d
    SIGN_EXTENSION  r5, r5d

%ifndef X86_32
    ; Keep the original arguments for the start of the next strip.
    push        r12
    push        r13
    push        r14
    mov         r12, r0
    mov         r13, r2
    mov         r14, r5
%endif

    shr         r4, 3                   ; number of 8-column strips
    sub         r0, r1                  ; start two rows above pSrc
    sub         r0, r1

.strip_loop:
    WELS_Zero   xmm7
    SSE_LOAD_8P xmm0, xmm7, [r0]
    SSE_LOAD_8P xmm1, xmm7, [r0+r1]
    lea         r0, [r0+2*r1]
    SSE_LOAD_8P xmm2, xmm7, [r0]
    SSE_LOAD_8P xmm3, xmm7, [r0+r1]
    lea         r0, [r0+2*r1]
    SSE_LOAD_8P xmm4, xmm7, [r0]
    SSE_LOAD_8P xmm5, xmm7, [r0+r1]

    ; First output row of the strip.
    FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
    dec         r5

    ; Fetch the 7th source row, then slide the six-row window down by one row.
    SSE_LOAD_8P xmm6, xmm7, [r0+2*r1]
    add         r0, r1
    movdqa      xmm0, xmm1
    movdqa      xmm1, xmm2
    movdqa      xmm2, xmm3
    movdqa      xmm3, xmm4
    movdqa      xmm4, xmm5
    movdqa      xmm5, xmm6
    add         r2, r3

.rotate:
    ; step 0: rows in xmm0..xmm5
    FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
    dec         r5
    jz near     .next_strip

    ; step 1
    lea         r0, [r0+2*r1]
    SSE_LOAD_8P xmm6, xmm7, [r0]
    FILTER_HV_W8 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r2+r3]
    dec         r5
    jz near     .next_strip

    ; step 2
    lea         r2, [r2+2*r3]
    SSE_LOAD_8P xmm7, xmm0, [r0+r1]
    FILTER_HV_W8 xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
    dec         r5
    jz near     .next_strip

    ; step 3
    lea         r0, [r0+2*r1]
    SSE_LOAD_8P xmm0, xmm1, [r0]
    FILTER_HV_W8 xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, [r2+r3]
    dec         r5
    jz near     .next_strip

    ; step 4
    lea         r2, [r2+2*r3]
    SSE_LOAD_8P xmm1, xmm2, [r0+r1]
    FILTER_HV_W8 xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, [r2]
    dec         r5
    jz near     .next_strip

    ; step 5
    lea         r0, [r0+2*r1]
    SSE_LOAD_8P xmm2, xmm3, [r0]
    FILTER_HV_W8 xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, [r2+r3]
    dec         r5
    jz near     .next_strip

    ; step 6
    lea         r2, [r2+2*r3]
    SSE_LOAD_8P xmm3, xmm4, [r0+r1]
    FILTER_HV_W8 xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, [r2]
    dec         r5
    jz near     .next_strip

    ; step 7
    lea         r0, [r0+2*r1]
    SSE_LOAD_8P xmm4, xmm5, [r0]
    FILTER_HV_W8 xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, [r2+r3]
    dec         r5
    jz near     .next_strip

    ; Back to the step-0 register assignment.
    lea         r2, [r2+2*r3]
    SSE_LOAD_8P xmm5, xmm6, [r0+r1]
    jmp near    .rotate

.next_strip:
    dec         r4
    jz near     .done
    ; Reload the original source/destination/height for the next strip.
%ifdef X86_32
    mov         r0, arg1
    mov         r2, arg3
    mov         r5, arg6
%else
    mov         r0, r12
    mov         r2, r13
    mov         r5, r14
%endif
    sub         r0, r1                  ; two rows above pSrc again
    sub         r0, r1
    add         r0, 8                   ; 8 columns to the right
    add         r2, 8
    jmp near    .strip_loop

.done:
%ifndef X86_32
    pop         r14
    pop         r13
    pop         r12
%endif
    POP_XMM
    LOAD_6_PARA_POP
    DEINIT_X86_32_PIC
    ret
667
668
;***********************************************************************
; void McHorVer02Height5_sse2( const uint8_t *pSrc,
;                              int32_t iSrcStride,
;                              uint8_t *pDst,
;                              int32_t iDstStride,
;                              int32_t iWidth,
;                              int32_t iHeight )
;
; Vertical 6-tap filter (FILTER_HV_W4) processed in strips of 4 columns;
; iWidth >> 2 strips are handled, each iHeight rows tall. Source rows are
; still loaded 8 bytes at a time, but only 4 result bytes are stored per row.
;
; Per strip: six source rows are loaded, the first output row is produced
; outside the unrolled loop, the row registers are shifted down by one, and
; the remaining rows run through an 8-step register-rotating loop.
;
; Register roles:
;   r0 = source pointer, r1 = source stride
;   r2 = destination pointer, r3 = destination stride
;   r4 = strips remaining, r5 = rows remaining in the current strip
;   r6 = PIC register on X86_32 PIC builds
;   r12/r13/r14 (non-X86_32 only) = saved pSrc / pDst / iHeight
;
; NOTE(review): every new strip is positioned at "saved base + 4", so only the
; second strip is placed correctly; the routine relies on iWidth >> 2 being
; 1 or 2. The row counter is decremented once before it is first tested, so
; iHeight must be at least 2.
;***********************************************************************
WELS_EXTERN McHorVer02Height5_sse2
    %assign  push_num 0
    INIT_X86_32_PIC r6
    LOAD_6_PARA
    PUSH_XMM 8
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r3, r3d
    SIGN_EXTENSION  r4, r4d
    SIGN_EXTENSION  r5, r5d

%ifndef X86_32
    ; Keep the original arguments for the start of the next strip.
    push        r12
    push        r13
    push        r14
    mov         r12, r0
    mov         r13, r2
    mov         r14, r5
%endif

    shr         r4, 2                   ; number of 4-column strips
    sub         r0, r1                  ; start two rows above pSrc
    sub         r0, r1

.strip_loop:
    WELS_Zero   xmm7
    SSE_LOAD_8P xmm0, xmm7, [r0]
    SSE_LOAD_8P xmm1, xmm7, [r0+r1]
    lea         r0, [r0+2*r1]
    SSE_LOAD_8P xmm2, xmm7, [r0]
    SSE_LOAD_8P xmm3, xmm7, [r0+r1]
    lea         r0, [r0+2*r1]
    SSE_LOAD_8P xmm4, xmm7, [r0]
    SSE_LOAD_8P xmm5, xmm7, [r0+r1]

    ; First output row of the strip.
    FILTER_HV_W4 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
    dec         r5

    ; Fetch the 7th source row, then slide the six-row window down by one row.
    SSE_LOAD_8P xmm6, xmm7, [r0+2*r1]
    add         r0, r1
    movdqa      xmm0, xmm1
    movdqa      xmm1, xmm2
    movdqa      xmm2, xmm3
    movdqa      xmm3, xmm4
    movdqa      xmm4, xmm5
    movdqa      xmm5, xmm6
    add         r2, r3

.rotate:
    ; step 0: rows in xmm0..xmm5
    FILTER_HV_W4 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
    dec         r5
    jz near     .next_strip

    ; step 1
    lea         r0, [r0+2*r1]
    SSE_LOAD_8P xmm6, xmm7, [r0]
    FILTER_HV_W4 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r2+r3]
    dec         r5
    jz near     .next_strip

    ; step 2
    lea         r2, [r2+2*r3]
    SSE_LOAD_8P xmm7, xmm0, [r0+r1]
    FILTER_HV_W4 xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
    dec         r5
    jz near     .next_strip

    ; step 3
    lea         r0, [r0+2*r1]
    SSE_LOAD_8P xmm0, xmm1, [r0]
    FILTER_HV_W4 xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, [r2+r3]
    dec         r5
    jz near     .next_strip

    ; step 4
    lea         r2, [r2+2*r3]
    SSE_LOAD_8P xmm1, xmm2, [r0+r1]
    FILTER_HV_W4 xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, [r2]
    dec         r5
    jz near     .next_strip

    ; step 5
    lea         r0, [r0+2*r1]
    SSE_LOAD_8P xmm2, xmm3, [r0]
    FILTER_HV_W4 xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, [r2+r3]
    dec         r5
    jz near     .next_strip

    ; step 6
    lea         r2, [r2+2*r3]
    SSE_LOAD_8P xmm3, xmm4, [r0+r1]
    FILTER_HV_W4 xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, [r2]
    dec         r5
    jz near     .next_strip

    ; step 7
    lea         r0, [r0+2*r1]
    SSE_LOAD_8P xmm4, xmm5, [r0]
    FILTER_HV_W4 xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, [r2+r3]
    dec         r5
    jz near     .next_strip

    ; Back to the step-0 register assignment.
    lea         r2, [r2+2*r3]
    SSE_LOAD_8P xmm5, xmm6, [r0+r1]
    jmp near    .rotate

.next_strip:
    dec         r4
    jz near     .done
    ; Reload the original source/destination/height for the next strip.
%ifdef X86_32
    mov         r0, arg1
    mov         r2, arg3
    mov         r5, arg6
%else
    mov         r0, r12
    mov         r2, r13
    mov         r5, r14
%endif
    sub         r0, r1                  ; two rows above pSrc again
    sub         r0, r1
    add         r0, 4                   ; 4 columns to the right
    add         r2, 4
    jmp near    .strip_loop

.done:
%ifndef X86_32
    pop         r14
    pop         r13
    pop         r12
%endif
    POP_XMM
    LOAD_6_PARA_POP
    DEINIT_X86_32_PIC
    ret
803
804
;***********************************************************************
; void McHorVer20Width9Or17_sse2( const uint8_t *pSrc,
;                                 int32_t iSrcStride,
;                                 uint8_t *pDst,
;                                 int32_t iDstStride,
;                                 int32_t iWidth,
;                                 int32_t iHeight );
;
; Horizontal 6-tap filter for rows of 9 or 17 pixels:
;   dst[x] = sat_u8( (a - 5*b + 20*c + 20*d - 5*e + f + 16) >> 5 )
; where a..f are src[x-2] .. src[x+3].
;
; iWidth == 9 selects the 9-pixel path; any other value takes the 17-pixel
; path. The odd width is covered with two overlapping stores: a 4-byte store
; at the block start and an 8-byte store one pixel further right. The second
; result reuses the already widened taps and needs only one extra load.
;
; Register roles:
;   r0 = source row (moved back by 2 pixels), r1 = source stride
;   r2 = destination row, r3 = destination stride
;   r4 = iWidth (only compared against 9), r5 = rows remaining (must be >= 1)
;   r6 = PIC register on X86_32 PIC builds
;   xmm7 = zero at the top of every row (used as scratch in between)
;***********************************************************************
WELS_EXTERN McHorVer20Width9Or17_sse2
    %assign  push_num 0
    INIT_X86_32_PIC r6
    LOAD_6_PARA
    PUSH_XMM 8
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r3, r3d
    SIGN_EXTENSION  r4, r4d
    SIGN_EXTENSION  r5, r5d
    sub         r0, 2
    pxor        xmm7, xmm7

    cmp         r4, 9
    jne near    .width_17

.w9_row:
    ; Taps for outputs 0..7, widened to words.
    movq        xmm0, [r0]              ; a
    movq        xmm2, [r0+1]            ; b
    movq        xmm4, [r0+2]            ; c
    movq        xmm5, [r0+3]            ; d
    movq        xmm3, [r0+4]            ; e
    movq        xmm1, [r0+5]            ; f
    punpcklbw   xmm0, xmm7
    punpcklbw   xmm2, xmm7
    punpcklbw   xmm4, xmm7
    punpcklbw   xmm5, xmm7
    punpcklbw   xmm3, xmm7
    punpcklbw   xmm1, xmm7

    ; Outputs 0..3. xmm1..xmm5 must survive, so the sums go to xmm6/xmm7.
    movdqa      xmm7, xmm2
    paddw       xmm7, xmm3              ; b + e
    movdqa      xmm6, xmm4
    paddw       xmm6, xmm5              ; c + d
    psllw       xmm6, 2                 ; 4(c+d)
    psubw       xmm6, xmm7              ; t = 4(c+d) - (b+e)
    paddw       xmm0, xmm1              ; a + f
    paddw       xmm0, xmm6              ; + t
    psllw       xmm6, 2                 ; 4t
    paddw       xmm0, xmm6              ; + 4t
    paddw       xmm0, [pic(h264_w0x10_1)]
    psraw       xmm0, 5
    packuswb    xmm0, xmm0
    movd        [r2], xmm0              ; 4 pixels at column 0

    ; Outputs 1..8: every tap moves one pixel right, so
    ; a'=b, b'=c, c'=d, d'=e, e'=f and only f' has to be loaded.
    pxor        xmm7, xmm7              ; restore the zero register
    movq        xmm0, [r0+6]            ; f'
    punpcklbw   xmm0, xmm7

    paddw       xmm4, xmm1              ; b' + e'
    paddw       xmm5, xmm3              ; c' + d'
    psllw       xmm5, 2
    psubw       xmm5, xmm4              ; t
    paddw       xmm2, xmm0              ; a' + f'
    paddw       xmm2, xmm5
    psllw       xmm5, 2
    paddw       xmm2, xmm5
    paddw       xmm2, [pic(h264_w0x10_1)]
    psraw       xmm2, 5
    packuswb    xmm2, xmm2
    movq        [r2+1], xmm2            ; 8 pixels at column 1

    add         r0, r1
    add         r2, r3
    dec         r5
    jnz         .w9_row
    POP_XMM
    LOAD_6_PARA_POP
    DEINIT_X86_32_PIC_KEEPDEF
    ret


.width_17:
    ; Outputs 0..7: plain 8-pixel filter.
    movq        xmm0, [r0]              ; a
    movq        xmm2, [r0+1]            ; b
    movq        xmm4, [r0+2]            ; c
    movq        xmm5, [r0+3]            ; d
    movq        xmm3, [r0+4]            ; e
    movq        xmm1, [r0+5]            ; f
    punpcklbw   xmm0, xmm7
    punpcklbw   xmm2, xmm7
    punpcklbw   xmm4, xmm7
    punpcklbw   xmm5, xmm7
    punpcklbw   xmm3, xmm7
    punpcklbw   xmm1, xmm7

    paddw       xmm2, xmm3              ; b + e
    paddw       xmm4, xmm5              ; c + d
    psllw       xmm4, 2
    psubw       xmm4, xmm2              ; t
    paddw       xmm0, xmm1              ; a + f
    paddw       xmm0, xmm4
    psllw       xmm4, 2
    paddw       xmm0, xmm4
    paddw       xmm0, [pic(h264_w0x10_1)]
    psraw       xmm0, 5
    packuswb    xmm0, xmm0
    movq        [r2], xmm0              ; 8 pixels at column 0

    ; Taps for outputs 8..15, widened to words.
    movq        xmm0, [r0+8]            ; a
    movq        xmm2, [r0+1+8]          ; b
    movq        xmm4, [r0+2+8]          ; c
    movq        xmm5, [r0+3+8]          ; d
    movq        xmm3, [r0+4+8]          ; e
    movq        xmm1, [r0+5+8]          ; f
    punpcklbw   xmm0, xmm7
    punpcklbw   xmm2, xmm7
    punpcklbw   xmm4, xmm7
    punpcklbw   xmm5, xmm7
    punpcklbw   xmm3, xmm7
    punpcklbw   xmm1, xmm7

    ; Outputs 8..11. xmm1..xmm5 must survive, so the sums go to xmm6/xmm7.
    movdqa      xmm7, xmm2
    paddw       xmm7, xmm3              ; b + e
    movdqa      xmm6, xmm4
    paddw       xmm6, xmm5              ; c + d
    psllw       xmm6, 2
    psubw       xmm6, xmm7              ; t
    paddw       xmm0, xmm1              ; a + f
    paddw       xmm0, xmm6
    psllw       xmm6, 2
    paddw       xmm0, xmm6
    paddw       xmm0, [pic(h264_w0x10_1)]
    psraw       xmm0, 5
    packuswb    xmm0, xmm0
    movd        [r2+8], xmm0            ; 4 pixels at column 8


    ; Outputs 9..16: taps shifted right by one pixel, only f' is loaded.
    pxor        xmm7, xmm7              ; restore the zero register
    movq        xmm0, [r0+6+8]          ; f'
    punpcklbw   xmm0, xmm7

    paddw       xmm4, xmm1              ; b' + e'
    paddw       xmm5, xmm3              ; c' + d'
    psllw       xmm5, 2
    psubw       xmm5, xmm4              ; t
    paddw       xmm2, xmm0              ; a' + f'
    paddw       xmm2, xmm5
    psllw       xmm5, 2
    paddw       xmm2, xmm5
    paddw       xmm2, [pic(h264_w0x10_1)]
    psraw       xmm2, 5
    packuswb    xmm2, xmm2
    movq        [r2+9], xmm2            ; 8 pixels at column 9

    add         r0, r1
    add         r2, r3
    dec         r5
    jnz         .width_17
    POP_XMM
    LOAD_6_PARA_POP
    DEINIT_X86_32_PIC
    ret
966
967
;***********************************************************************
; void McHorVer20Width5_sse2( const uint8_t *pSrc,
;                             int32_t iSrcStride,
;                             uint8_t *pDst,
;                             int32_t iDstStride,
;                             int32_t iWidth,
;                             int32_t iHeight );
;
; Horizontal 6-tap filter for rows of 5 pixels:
;   dst[x] = sat_u8( (a - 5*b + 20*c + 20*d - 5*e + f + 16) >> 5 )
; where a..f are src[x-2] .. src[x+3].
;
; The 5 pixels are written with two overlapping 4-byte stores (columns 0..3
; and 1..4). The second result reuses the already widened taps and needs
; only one extra load. iWidth is loaded but not otherwise used.
;
; Register roles:
;   r0 = source row (moved back by 2 pixels), r1 = source stride
;   r2 = destination row, r3 = destination stride
;   r5 = rows remaining (must be >= 1)
;   r6 = PIC register on X86_32 PIC builds
;   xmm7 = zero at the top of every row (used as scratch in between)
;***********************************************************************
WELS_EXTERN McHorVer20Width5_sse2
    %assign  push_num 0
    INIT_X86_32_PIC r6
    LOAD_6_PARA
    PUSH_XMM 8
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r3, r3d
    SIGN_EXTENSION  r4, r4d
    SIGN_EXTENSION  r5, r5d
    sub         r0, 2
    pxor        xmm7, xmm7

.row_loop:
    ; Taps for the first store, widened to words.
    movq        xmm0, [r0]              ; a
    movq        xmm2, [r0+1]            ; b
    movq        xmm4, [r0+2]            ; c
    movq        xmm5, [r0+3]            ; d
    movq        xmm3, [r0+4]            ; e
    movq        xmm1, [r0+5]            ; f
    punpcklbw   xmm0, xmm7
    punpcklbw   xmm2, xmm7
    punpcklbw   xmm4, xmm7
    punpcklbw   xmm5, xmm7
    punpcklbw   xmm3, xmm7
    punpcklbw   xmm1, xmm7

    ; Outputs 0..3. xmm1..xmm5 must survive, so the sums go to xmm6/xmm7.
    movdqa      xmm7, xmm2
    paddw       xmm7, xmm3              ; b + e
    movdqa      xmm6, xmm4
    paddw       xmm6, xmm5              ; c + d
    psllw       xmm6, 2                 ; 4(c+d)
    psubw       xmm6, xmm7              ; t = 4(c+d) - (b+e)
    paddw       xmm0, xmm1              ; a + f
    paddw       xmm0, xmm6              ; + t
    psllw       xmm6, 2                 ; 4t
    paddw       xmm0, xmm6              ; + 4t
    paddw       xmm0, [pic(h264_w0x10_1)]
    psraw       xmm0, 5
    packuswb    xmm0, xmm0
    movd        [r2], xmm0              ; 4 pixels at column 0

    ; Outputs 1..4: every tap moves one pixel right, so
    ; a'=b, b'=c, c'=d, d'=e, e'=f and only f' has to be loaded.
    pxor        xmm7, xmm7              ; restore the zero register
    movq        xmm0, [r0+6]            ; f'
    punpcklbw   xmm0, xmm7

    paddw       xmm4, xmm1              ; b' + e'
    paddw       xmm5, xmm3              ; c' + d'
    psllw       xmm5, 2
    psubw       xmm5, xmm4              ; t
    paddw       xmm2, xmm0              ; a' + f'
    paddw       xmm2, xmm5
    psllw       xmm5, 2
    paddw       xmm2, xmm5
    paddw       xmm2, [pic(h264_w0x10_1)]
    psraw       xmm2, 5
    packuswb    xmm2, xmm2
    movd        [r2+1], xmm2            ; 4 pixels at column 1

    add         r0, r1
    add         r2, r3
    dec         r5
    jnz         .row_loop
    POP_XMM
    LOAD_6_PARA_POP
    DEINIT_X86_32_PIC
    ret
1043
1044
;***********************************************************************
; void McHorVer22HorFirst_sse2( const uint8_t *pSrc,
;                               int32_t iSrcStride,
;                               uint8_t *pTap,
;                               int32_t iTapStride,
;                               int32_t iWidth, int32_t iHeight );
;
; Horizontal pass of the two-dimensional filter for rows of 9 or 17 samples.
; For every row it reads source bytes and stores the unrounded, unclipped
; 16-bit sums
;       a - 5*b + 20*c + 20*d - 5*e + f        (a..f = six consecutive pixels)
; to pTap (2 bytes per sample). iTapStride is added to pTap as a byte count.
;
; iWidth == 9 selects the 9-sample path; any other value takes the 17-sample
; path. The odd sample count is covered by overlapping stores, with the last
; eight sums computed from taps shifted right by one pixel.
;
; Notes:
;   * Processing starts two rows above pSrc (r0 -= 2 * iSrcStride); no
;     horizontal offset is applied here.
;   * The 17-sample path stores its first 8 sums with movdqa, so pTap and
;     iTapStride must keep every row 16-byte aligned on that path.
;   * r5 (iHeight) is tested after the first row, so it must be >= 1.
;   * xmm7 is zero at the top of every row and used as scratch in between.
;***********************************************************************
WELS_EXTERN McHorVer22HorFirst_sse2
    %assign  push_num 0
    LOAD_6_PARA
    PUSH_XMM 8
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r3, r3d
    SIGN_EXTENSION  r4, r4d
    SIGN_EXTENSION  r5, r5d
    pxor        xmm7, xmm7
    sub         r0, r1                  ; start two rows above pSrc
    sub         r0, r1

    cmp         r4, 9
    jne near    .width_17

.w9_row:
    ; Taps for sums 0..7, widened to words.
    movq        xmm0, [r0]              ; a
    movq        xmm2, [r0+1]            ; b
    movq        xmm4, [r0+2]            ; c
    movq        xmm5, [r0+3]            ; d
    movq        xmm3, [r0+4]            ; e
    movq        xmm1, [r0+5]            ; f
    punpcklbw   xmm0, xmm7
    punpcklbw   xmm2, xmm7
    punpcklbw   xmm4, xmm7
    punpcklbw   xmm5, xmm7
    punpcklbw   xmm3, xmm7
    punpcklbw   xmm1, xmm7

    ; xmm1..xmm5 must survive, so the partial sums go to xmm6/xmm7.
    movdqa      xmm7, xmm2
    paddw       xmm7, xmm3              ; b + e
    movdqa      xmm6, xmm4
    paddw       xmm6, xmm5              ; c + d
    psllw       xmm6, 2                 ; 4(c+d)
    psubw       xmm6, xmm7              ; t = 4(c+d) - (b+e)
    paddw       xmm0, xmm1              ; a + f
    paddw       xmm0, xmm6              ; + t
    psllw       xmm6, 2                 ; 4t
    paddw       xmm0, xmm6              ; + 4t
    movd        [r2], xmm0              ; 4 bytes = sums 0 and 1

    ; Sums 1..8: taps shifted right by one pixel, only f' is loaded.
    pxor        xmm7, xmm7              ; restore the zero register
    movq        xmm0, [r0+6]            ; f'
    punpcklbw   xmm0, xmm7

    paddw       xmm4, xmm1              ; b' + e'
    paddw       xmm5, xmm3              ; c' + d'
    psllw       xmm5, 2
    psubw       xmm5, xmm4              ; t
    paddw       xmm2, xmm0              ; a' + f'
    paddw       xmm2, xmm5
    psllw       xmm5, 2
    paddw       xmm2, xmm5
    movq        [r2+2], xmm2            ; low four words  (sums 1..4)
    movhps      [r2+2+8], xmm2          ; high four words (sums 5..8)

    add         r0, r1
    add         r2, r3
    dec         r5
    jnz         .w9_row
    POP_XMM
    LOAD_6_PARA_POP
    ret


.width_17:
    ; Sums 0..7.
    movq        xmm0, [r0]              ; a
    movq        xmm2, [r0+1]            ; b
    movq        xmm4, [r0+2]            ; c
    movq        xmm5, [r0+3]            ; d
    movq        xmm3, [r0+4]            ; e
    movq        xmm1, [r0+5]            ; f
    punpcklbw   xmm0, xmm7
    punpcklbw   xmm2, xmm7
    punpcklbw   xmm4, xmm7
    punpcklbw   xmm5, xmm7
    punpcklbw   xmm3, xmm7
    punpcklbw   xmm1, xmm7

    paddw       xmm2, xmm3              ; b + e
    paddw       xmm4, xmm5              ; c + d
    psllw       xmm4, 2
    psubw       xmm4, xmm2              ; t
    paddw       xmm0, xmm1              ; a + f
    paddw       xmm0, xmm4
    psllw       xmm4, 2
    paddw       xmm0, xmm4
    movdqa      [r2], xmm0              ; 8 words, aligned store

    ; Taps for sums 8..15, widened to words.
    movq        xmm0, [r0+8]            ; a
    movq        xmm2, [r0+1+8]          ; b
    movq        xmm4, [r0+2+8]          ; c
    movq        xmm5, [r0+3+8]          ; d
    movq        xmm3, [r0+4+8]          ; e
    movq        xmm1, [r0+5+8]          ; f
    punpcklbw   xmm0, xmm7
    punpcklbw   xmm2, xmm7
    punpcklbw   xmm4, xmm7
    punpcklbw   xmm5, xmm7
    punpcklbw   xmm3, xmm7
    punpcklbw   xmm1, xmm7

    ; xmm1..xmm5 must survive, so the partial sums go to xmm6/xmm7.
    movdqa      xmm7, xmm2
    paddw       xmm7, xmm3              ; b + e
    movdqa      xmm6, xmm4
    paddw       xmm6, xmm5              ; c + d
    psllw       xmm6, 2
    psubw       xmm6, xmm7              ; t
    paddw       xmm0, xmm1              ; a + f
    paddw       xmm0, xmm6
    psllw       xmm6, 2
    paddw       xmm0, xmm6
    movd        [r2+16], xmm0           ; 4 bytes = sums 8 and 9


    ; Sums 9..16: taps shifted right by one pixel, only f' is loaded.
    pxor        xmm7, xmm7              ; restore the zero register
    movq        xmm0, [r0+6+8]          ; f'
    punpcklbw   xmm0, xmm7

    paddw       xmm4, xmm1              ; b' + e'
    paddw       xmm5, xmm3              ; c' + d'
    psllw       xmm5, 2
    psubw       xmm5, xmm4              ; t
    paddw       xmm2, xmm0              ; a' + f'
    paddw       xmm2, xmm5
    psllw       xmm5, 2
    paddw       xmm2, xmm5
    movq        [r2+18], xmm2           ; low four words  (sums 9..12)
    movhps      [r2+18+8], xmm2         ; high four words (sums 13..16)

    add         r0, r1
    add         r2, r3
    dec         r5
    jnz         .width_17
    POP_XMM
    LOAD_6_PARA_POP
    ret
1191
1192
; FILTER_VER a, b, c, d, e, f, tmp0, tmp1, dst
;   Vertical stage operating on rows of 16-bit intermediate sums (%1..%6 are
;   taps a..f). Per word lane it evaluates, with arithmetic shifts,
;       t   = ((a + f) - (b + e)) >> 2
;       t   = (t + (c + d) - (b + e)) >> 2
;       out = ((c + d) + t + 32) >> 6
;   packs the result to unsigned-saturated bytes and stores 8 bytes to %9.
;   The rounding constant is fetched through pic().
;   On exit: %1 = t (clobbered), %7 = b + e, %8 = packed result (NOT zero).
;   %2..%6 are left untouched.
%macro FILTER_VER 9
    movdqa      %7, %2
    paddw       %7, %5                      ; b + e
    movdqa      %8, %3
    paddw       %8, %4                      ; c + d
    paddw       %1, %6                      ; a + f

    psubw       %1, %7                      ; (a+f) - (b+e)
    psraw       %1, 2
    paddw       %1, %8                      ; + (c+d)
    psubw       %1, %7                      ; - (b+e)
    psraw       %1, 2
    paddw       %8, %1                      ; (c+d) + t
    paddw       %8, [pic(h264_mc_hc_32)]    ; + 32 (rounding)
    psraw       %8, 6
    packuswb    %8, %8                      ; saturate to unsigned bytes
    movq        %9, %8
%endmacro
;***********************************************************************
;void McHorVer22Width8VerLastAlign_sse2(
;                                           const uint8_t *pTap,
;                                           int32_t iTapStride,
;                                           uint8_t * pDst,
;                                           int32_t iDstStride,
;                                           int32_t iWidth,
;                                           int32_t iHeight);
;
; Vertical pass over a buffer of 16-bit tap values. The area is processed
; in columns of eight pixels (iWidth >> 3 columns). For every output row
; FILTER_VER combines six consecutive tap rows and stores eight bytes to
; pDst. Tap rows are fetched with movdqa, so every address formed from
; pTap, iTapStride and the 16-byte column step must be 16-byte aligned.
;
; Register roles after the prologue:
;   r0 = tap read cursor          r1 = iTapStride (bytes)
;   r2 = destination cursor       r3 = iDstStride (bytes)
;   r4 = columns still to do      r5 = rows still to do in this column
;   x86-64 only: r12 / r13 = base of the current column in pTap / pDst,
;                r14 = iHeight
; x86-32 keeps the column bases in the stack argument slots instead.
;
; iHeight must be at least 2: the first row of each column is emitted
; before the row counter is ever tested for zero.
;***********************************************************************

WELS_EXTERN McHorVer22Width8VerLastAlign_sse2
    %assign  push_num 0
    INIT_X86_32_PIC r6
    LOAD_6_PARA
    PUSH_XMM 8
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r3, r3d
    SIGN_EXTENSION  r4, r4d
    SIGN_EXTENSION  r5, r5d
%ifndef X86_32
    push r12
    push r13
    push r14
    mov  r12, r0                ; base of current tap column
    mov  r13, r2                ; base of current destination column
    mov  r14, r5                ; iHeight, reloaded for every column
%endif

    shr r4, 3                   ; r4 = number of 8-pixel columns

.width_loop:
    ; prime the six-row window with tap rows 0..5 of this column
    movdqa xmm0, [r0]
    movdqa xmm1, [r0+r1]
    lea r0, [r0+2*r1]
    movdqa xmm2, [r0]
    movdqa xmm3, [r0+r1]
    lea r0, [r0+2*r1]
    movdqa xmm4, [r0]
    movdqa xmm5, [r0+r1]

    ; output row 0 (no zero test on r5 here)
    FILTER_VER xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
    dec r5
    lea r0, [r0+2*r1]
    movdqa xmm6, [r0]           ; tap row 6

    ; slide the window down one row: xmm0..xmm5 = tap rows 1..6
    movdqa xmm0, xmm1
    movdqa xmm1, xmm2
    movdqa xmm2, xmm3
    movdqa xmm3, xmm4
    movdqa xmm4, xmm5
    movdqa xmm5, xmm6

    add r2, r3
    sub r0, r1                  ; r0 -> tap row 5, so the loop below can use
                                ; [r0+2*r1] and [r0+r1] alternately

    ; Eight-way unrolled row loop. Instead of copying registers, each step
    ; rotates the roles of xmm0..xmm7: one new row is loaded into the
    ; register freed two steps earlier and the two oldest ones act as scratch.
.start:
    FILTER_VER xmm0,xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
    dec r5
    jz near .x_loop_dec

    lea r0, [r0+2*r1]
    movdqa xmm6, [r0]
    FILTER_VER xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,[r2+r3]
    dec r5
    jz near .x_loop_dec

    lea r2, [r2+2*r3]
    movdqa xmm7, [r0+r1]
    FILTER_VER  xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
    dec r5
    jz near .x_loop_dec

    lea r0, [r0+2*r1]
    movdqa xmm0, [r0]
    FILTER_VER  xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2,[r2+r3]
    dec r5
    jz near .x_loop_dec

    lea r2, [r2+2*r3]
    movdqa xmm1, [r0+r1]
    FILTER_VER  xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,[r2]
    dec r5
    jz near .x_loop_dec

    lea r0, [r0+2*r1]
    movdqa xmm2, [r0]
    FILTER_VER  xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,[r2+r3]
    dec r5
    jz near .x_loop_dec

    lea r2, [r2+2*r3]
    movdqa xmm3, [r0+r1]
    FILTER_VER  xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,[r2]
    dec r5
    jz near .x_loop_dec

    lea r0, [r0+2*r1]
    movdqa xmm4, [r0]
    FILTER_VER  xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,xmm6, [r2+r3]
    dec r5
    jz near .x_loop_dec

    lea r2, [r2+2*r3]
    movdqa xmm5, [r0+r1]        ; window is back in xmm0..xmm5
    jmp near .start

.x_loop_dec:
    dec r4
    jz near .exit
    ; Move on to the next column: 8 taps * 2 bytes in pTap, 8 bytes in pDst.
    ; The step is applied to the SAVED bases so that it accumulates; adding
    ; it to the original pointers each time would revisit the second column
    ; whenever there are more than two columns.
%ifdef X86_32
    add dword arg1, 16          ; stack argument slots double as column bases
    add dword arg3, 8
    mov r0, arg1
    mov r2, arg3
    mov r5, arg6
%else
    add r12, 16
    add r13, 8
    mov r0, r12
    mov r2, r13
    mov r5, r14
%endif
    jmp .width_loop

.exit:
%ifndef X86_32
    pop r14
    pop r13
    pop r12
%endif
    POP_XMM
    LOAD_6_PARA_POP
    DEINIT_X86_32_PIC
    ret
1345
;***********************************************************************
;void McHorVer22Width8VerLastUnAlign_sse2(
;                                           const uint8_t *pTap,
;                                           int32_t iTapStride,
;                                           uint8_t * pDst,
;                                           int32_t iDstStride,
;                                           int32_t iWidth,
;                                           int32_t iHeight);
;
; Vertical pass over a buffer of 16-bit tap values, processed in columns
; of eight pixels (iWidth >> 3 columns). For every output row FILTER_VER
; combines six consecutive tap rows and stores eight bytes to pDst.
; Tap rows are fetched with movdqu, so pTap needs no particular alignment.
;
; Register roles after the prologue:
;   r0 = tap read cursor          r1 = iTapStride (bytes)
;   r2 = destination cursor       r3 = iDstStride (bytes)
;   r4 = columns still to do      r5 = rows still to do in this column
;   x86-64 only: r12 / r13 = base of the current column in pTap / pDst,
;                r14 = iHeight
; x86-32 keeps the column bases in the stack argument slots instead.
;
; iHeight must be at least 2: the first row of each column is emitted
; before the row counter is ever tested for zero.
;***********************************************************************

WELS_EXTERN McHorVer22Width8VerLastUnAlign_sse2
    %assign  push_num 0
    INIT_X86_32_PIC r6
    LOAD_6_PARA
    PUSH_XMM 8
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r3, r3d
    SIGN_EXTENSION  r4, r4d
    SIGN_EXTENSION  r5, r5d
%ifndef X86_32
    push r12
    push r13
    push r14
    mov  r12, r0                ; base of current tap column
    mov  r13, r2                ; base of current destination column
    mov  r14, r5                ; iHeight, reloaded for every column
%endif
    shr r4, 3                   ; r4 = number of 8-pixel columns

.width_loop:
    ; prime the six-row window with tap rows 0..5 of this column
    movdqu xmm0, [r0]
    movdqu xmm1, [r0+r1]
    lea r0, [r0+2*r1]
    movdqu xmm2, [r0]
    movdqu xmm3, [r0+r1]
    lea r0, [r0+2*r1]
    movdqu xmm4, [r0]
    movdqu xmm5, [r0+r1]

    ; output row 0 (no zero test on r5 here)
    FILTER_VER xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
    dec r5
    lea r0, [r0+2*r1]
    movdqu xmm6, [r0]           ; tap row 6

    ; slide the window down one row: xmm0..xmm5 = tap rows 1..6
    movdqa xmm0, xmm1
    movdqa xmm1, xmm2
    movdqa xmm2, xmm3
    movdqa xmm3, xmm4
    movdqa xmm4, xmm5
    movdqa xmm5, xmm6

    add r2, r3
    sub r0, r1                  ; r0 -> tap row 5, so the loop below can use
                                ; [r0+2*r1] and [r0+r1] alternately

    ; Eight-way unrolled row loop. Instead of copying registers, each step
    ; rotates the roles of xmm0..xmm7: one new row is loaded into the
    ; register freed two steps earlier and the two oldest ones act as scratch.
.start:
    FILTER_VER xmm0,xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
    dec r5
    jz near .x_loop_dec

    lea r0, [r0+2*r1]
    movdqu xmm6, [r0]
    FILTER_VER xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,[r2+r3]
    dec r5
    jz near .x_loop_dec

    lea r2, [r2+2*r3]
    movdqu xmm7, [r0+r1]
    FILTER_VER  xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
    dec r5
    jz near .x_loop_dec

    lea r0, [r0+2*r1]
    movdqu xmm0, [r0]
    FILTER_VER  xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2,[r2+r3]
    dec r5
    jz near .x_loop_dec

    lea r2, [r2+2*r3]
    movdqu xmm1, [r0+r1]
    FILTER_VER  xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,[r2]
    dec r5
    jz near .x_loop_dec

    lea r0, [r0+2*r1]
    movdqu xmm2, [r0]
    FILTER_VER  xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,[r2+r3]
    dec r5
    jz near .x_loop_dec

    lea r2, [r2+2*r3]
    movdqu xmm3, [r0+r1]
    FILTER_VER  xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,[r2]
    dec r5
    jz near .x_loop_dec

    lea r0, [r0+2*r1]
    movdqu xmm4, [r0]
    FILTER_VER  xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,xmm6, [r2+r3]
    dec r5
    jz near .x_loop_dec

    lea r2, [r2+2*r3]
    movdqu xmm5, [r0+r1]        ; window is back in xmm0..xmm5
    jmp near .start

.x_loop_dec:
    dec r4
    jz near .exit
    ; Move on to the next column: 8 taps * 2 bytes in pTap, 8 bytes in pDst.
    ; The step is applied to the SAVED bases so that it accumulates; adding
    ; it to the original pointers each time would revisit the second column
    ; whenever there are more than two columns.
%ifdef X86_32
    add dword arg1, 16          ; stack argument slots double as column bases
    add dword arg3, 8
    mov r0, arg1
    mov r2, arg3
    mov r5, arg6
%else
    add r12, 16
    add r13, 8
    mov r0, r12
    mov r2, r13
    mov r5, r14
%endif
    jmp .width_loop

.exit:
%ifndef X86_32
    pop r14
    pop r13
    pop r12
%endif
    POP_XMM
    LOAD_6_PARA_POP
    DEINIT_X86_32_PIC
    ret
1477
1478
;***********************************************************************
;void McHorVer22Width5HorFirst_sse2
;                           (const uint8_t *pSrc,
;                           int32_t iSrcStride,
;                           uint8_t * pTap,
;                           int32_t iTapStride,
;                           int32_t iWidth,int32_t iHeight);
;
; Horizontal pass that produces unscaled 16-bit tap sums. For a row with
; source bytes s[0..], output word x is
;     s[x] + s[x+5] - 5*(s[x+1] + s[x+4]) + 20*(s[x+2] + s[x+3])
; with s[0] located at the incoming row pointer (no horizontal offset is
; applied here). Each row stores words 0..1 at [pTap] and words 1..8 at
; [pTap+2], i.e. 18 bytes per row.
; The row pointer is first moved up by two source rows and iHeight rows
; are then processed. iWidth is loaded but not used by this routine.
;   r0 = source cursor   r1 = iSrcStride   r2 = tap cursor
;   r3 = iTapStride      r5 = rows left    xmm7 = zero while unpacking
;***********************************************************************
WELS_EXTERN McHorVer22Width5HorFirst_sse2
    %assign     push_num 0
    LOAD_6_PARA
    PUSH_XMM    8
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r3, r3d
    SIGN_EXTENSION  r4, r4d
    SIGN_EXTENSION  r5, r5d
    pxor        xmm7, xmm7
    sub         r0, r1                  ; start two rows above pSrc; the original
    sub         r0, r1                  ; note here reads "need more 5 lines"

.next_row:
    ; six overlapping 8-byte windows of the source row
    movq        xmm0, [r0]              ; a = s[x]
    movq        xmm1, [r0+5]            ; f = s[x+5]
    movq        xmm2, [r0+1]            ; b = s[x+1]
    movq        xmm3, [r0+4]            ; e = s[x+4]
    movq        xmm4, [r0+2]            ; c = s[x+2]
    movq        xmm5, [r0+3]            ; d = s[x+3]
    punpcklbw   xmm0, xmm7              ; widen all six to 16-bit lanes
    punpcklbw   xmm1, xmm7
    punpcklbw   xmm2, xmm7
    punpcklbw   xmm3, xmm7
    punpcklbw   xmm4, xmm7
    punpcklbw   xmm5, xmm7

    ; xmm0 = (a + f) + 5 * (4*(c + d) - (b + e))
    movdqa      xmm6, xmm4
    paddw       xmm6, xmm5              ; c + d
    psllw       xmm6, 2
    movdqa      xmm7, xmm2              ; xmm7 doubles as scratch from here on
    paddw       xmm7, xmm3              ; b + e
    psubw       xmm6, xmm7              ; k = 4*(c + d) - (b + e)
    paddw       xmm0, xmm1
    paddw       xmm0, xmm6              ; + k
    psllw       xmm6, 2
    paddw       xmm0, xmm6              ; + 4k
    movd        [r2], xmm0              ; words 0..1

    ; same filter shifted right by one sample: taps s[x+1] .. s[x+6]
    pxor        xmm7, xmm7              ; restore the zero register
    movq        xmm0, [r0+6]
    punpcklbw   xmm0, xmm7

    paddw       xmm4, xmm1              ; s[x+2] + s[x+5]
    paddw       xmm5, xmm3              ; s[x+3] + s[x+4]
    psllw       xmm5, 2
    psubw       xmm5, xmm4
    paddw       xmm2, xmm0              ; s[x+1] + s[x+6]
    paddw       xmm2, xmm5
    psllw       xmm5, 2
    paddw       xmm2, xmm5
    movq        [r2+2], xmm2            ; words 1..4
    movhps      [r2+2+8], xmm2          ; words 5..8

    add         r0, r1
    add         r2, r3
    dec         r5
    jnz         .next_row
    POP_XMM
    LOAD_6_PARA_POP
    ret
1547
1548
; FILTER_VER_4  a, b, c, d, e, f, tmp0, tmp1, dst
;   %1..%6 : six vertically adjacent rows of 16-bit lanes
;   %7, %8 : scratch registers
;   %9     : 4-byte memory destination
; Per lane, using truncating arithmetic shifts on 16-bit words:
;     t   = ((a + f) - (b + e)) >> 2
;     t   = (t + (c + d) - (b + e)) >> 2
;     out = ((c + d) + t + h264_mc_hc_32) >> 6
; The results are packed to bytes with unsigned saturation; only the low
; four bytes are written (movd), which is the one difference from FILTER_VER.
; Clobbers %1, %7 and %8; %2..%6 are only read.
; NOTE(review): the register operands are expected to be pairwise distinct,
; as they are at the call sites visible in this part of the file.
%macro FILTER_VER_4 9
    movdqa      %7, %2
    paddw       %7, %5                      ; %7 = b + e
    movdqa      %8, %3
    paddw       %8, %4                      ; %8 = c + d

    paddw       %1, %6                      ; %1 = a + f
    psubw       %1, %7
    psraw       %1, 2
    paddw       %1, %8
    psubw       %1, %7
    psraw       %1, 2

    paddw       %8, %1
    paddw       %8, [pic(h264_mc_hc_32)]    ; constant defined elsewhere in the file
    psraw       %8, 6
    packuswb    %8, %8                      ; words -> bytes, unsigned saturation
    movd        %9, %8                      ; store 4 output pixels
%endmacro
1569
1570
;***********************************************************************
;void McHorVer22Width4VerLastAlign_sse2(
;                                           const uint8_t *pTap,
;                                           int32_t iTapStride,
;                                           uint8_t * pDst,
;                                           int32_t iDstStride,
;                                           int32_t iWidth,
;                                           int32_t iHeight);
;
; Vertical pass over a buffer of 16-bit tap values, four pixels wide per
; column (iWidth >> 2 columns). For every output row FILTER_VER_4 combines
; six consecutive tap rows and stores four bytes to pDst. Each tap row is
; read as a full 16 bytes with movdqa, so the addresses must be 16-byte
; aligned.
;
;   r0 = tap read cursor          r1 = iTapStride (bytes)
;   r2 = destination cursor       r3 = iDstStride (bytes)
;   r4 = columns still to do      r5 = rows still to do in this column
;   x86-64 only: r12 / r13 / r14 hold the original pTap / pDst / iHeight
;
; iHeight must be at least 2: the first row of each column is emitted
; before the row counter is ever tested for zero.
;
; NOTE(review): columns are 8 bytes apart in pTap yet loaded with movdqa,
; so with an aligned pTap a second column cannot be read without faulting;
; in addition every later column is rebuilt as "original base + one step".
; This variant therefore looks usable for a single column (iWidth == 4)
; only - confirm against the callers.
;***********************************************************************

WELS_EXTERN McHorVer22Width4VerLastAlign_sse2
    %assign     push_num 0
    INIT_X86_32_PIC r6
    LOAD_6_PARA
    PUSH_XMM    8
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r3, r3d
    SIGN_EXTENSION  r4, r4d
    SIGN_EXTENSION  r5, r5d
%ifndef X86_32
    push        r12
    push        r13
    push        r14
    mov         r12, r0                 ; original pTap
    mov         r13, r2                 ; original pDst
    mov         r14, r5                 ; iHeight
%endif

    shr         r4, 2                   ; r4 = number of 4-pixel columns

.column_begin:
    ; prime the six-row window with tap rows 0..5
    movdqa      xmm0, [r0]
    movdqa      xmm1, [r0+r1]
    lea         r0, [r0+2*r1]
    movdqa      xmm2, [r0]
    movdqa      xmm3, [r0+r1]
    lea         r0, [r0+2*r1]
    movdqa      xmm4, [r0]
    movdqa      xmm5, [r0+r1]

    ; output row 0 (row counter is decremented but not tested)
    FILTER_VER_4 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
    dec         r5
    lea         r0, [r0+2*r1]
    movdqa      xmm6, [r0]              ; tap row 6

    ; shift the window: xmm0..xmm5 = tap rows 1..6
    movdqa      xmm0, xmm1
    movdqa      xmm1, xmm2
    movdqa      xmm2, xmm3
    movdqa      xmm3, xmm4
    movdqa      xmm4, xmm5
    movdqa      xmm5, xmm6

    sub         r0, r1                  ; r0 -> tap row 5
    add         r2, r3                  ; r2 -> output row 1

    ; eight-way unrolled row loop; the xmm registers rotate roles each step
.row_loop:
    FILTER_VER_4 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
    dec         r5
    jz near     .column_end

    lea         r0, [r0+2*r1]
    movdqa      xmm6, [r0]
    FILTER_VER_4 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r2+r3]
    dec         r5
    jz near     .column_end

    lea         r2, [r2+2*r3]
    movdqa      xmm7, [r0+r1]
    FILTER_VER_4 xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
    dec         r5
    jz near     .column_end

    lea         r0, [r0+2*r1]
    movdqa      xmm0, [r0]
    FILTER_VER_4 xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, [r2+r3]
    dec         r5
    jz near     .column_end

    lea         r2, [r2+2*r3]
    movdqa      xmm1, [r0+r1]
    FILTER_VER_4 xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, [r2]
    dec         r5
    jz near     .column_end

    lea         r0, [r0+2*r1]
    movdqa      xmm2, [r0]
    FILTER_VER_4 xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, [r2+r3]
    dec         r5
    jz near     .column_end

    lea         r2, [r2+2*r3]
    movdqa      xmm3, [r0+r1]
    FILTER_VER_4 xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, [r2]
    dec         r5
    jz near     .column_end

    lea         r0, [r0+2*r1]
    movdqa      xmm4, [r0]
    FILTER_VER_4 xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, [r2+r3]
    dec         r5
    jz near     .column_end

    lea         r2, [r2+2*r3]
    movdqa      xmm5, [r0+r1]           ; window is back in xmm0..xmm5
    jmp near    .row_loop

.column_end:
    dec         r4
    jz near     .done
    ; next column = original base + 8 tap bytes / 4 destination bytes
%ifdef X86_32
    mov         r0, arg1
    add         r0, 8
    mov         r2, arg3
    add         r2, 4
    mov         r5, arg6
%else
    mov         r0, r12
    add         r0, 8
    mov         r2, r13
    add         r2, 4
    mov         r5, r14
%endif
    jmp         .column_begin

.done:
%ifndef X86_32
    pop         r14
    pop         r13
    pop         r12
%endif
    POP_XMM
    LOAD_6_PARA_POP
    DEINIT_X86_32_PIC
    ret
1703
1704
;***********************************************************************
;void McHorVer22Width4VerLastUnAlign_sse2(
;                                           const uint8_t *pTap,
;                                           int32_t iTapStride,
;                                           uint8_t * pDst,
;                                           int32_t iDstStride,
;                                           int32_t iWidth,
;                                           int32_t iHeight);
;
; Vertical pass over a buffer of 16-bit tap values, four pixels wide per
; column (iWidth >> 2 columns). For every output row FILTER_VER_4 combines
; six consecutive tap rows and stores four bytes to pDst. Each tap row is
; read as a full 16 bytes with movdqu (no alignment requirement), although
; only the low four results are stored.
;
; Register roles after the prologue:
;   r0 = tap read cursor          r1 = iTapStride (bytes)
;   r2 = destination cursor       r3 = iDstStride (bytes)
;   r4 = columns still to do      r5 = rows still to do in this column
;   x86-64 only: r12 / r13 = base of the current column in pTap / pDst,
;                r14 = iHeight
; x86-32 keeps the column bases in the stack argument slots instead.
;
; iHeight must be at least 2: the first row of each column is emitted
; before the row counter is ever tested for zero.
;***********************************************************************

WELS_EXTERN McHorVer22Width4VerLastUnAlign_sse2
    %assign  push_num 0
    INIT_X86_32_PIC r6
    LOAD_6_PARA
    PUSH_XMM 8
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r3, r3d
    SIGN_EXTENSION  r4, r4d
    SIGN_EXTENSION  r5, r5d
%ifndef X86_32
    push r12
    push r13
    push r14
    mov  r12, r0                ; base of current tap column
    mov  r13, r2                ; base of current destination column
    mov  r14, r5                ; iHeight, reloaded for every column
%endif
    shr r4, 2                   ; r4 = number of 4-pixel columns

.width_loop:
    ; prime the six-row window with tap rows 0..5 of this column
    movdqu xmm0, [r0]
    movdqu xmm1, [r0+r1]
    lea r0, [r0+2*r1]
    movdqu xmm2, [r0]
    movdqu xmm3, [r0+r1]
    lea r0, [r0+2*r1]
    movdqu xmm4, [r0]
    movdqu xmm5, [r0+r1]

    ; output row 0 (no zero test on r5 here)
    FILTER_VER_4 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
    dec r5
    lea r0, [r0+2*r1]
    movdqu xmm6, [r0]           ; tap row 6

    ; slide the window down one row: xmm0..xmm5 = tap rows 1..6
    movdqa xmm0, xmm1
    movdqa xmm1, xmm2
    movdqa xmm2, xmm3
    movdqa xmm3, xmm4
    movdqa xmm4, xmm5
    movdqa xmm5, xmm6

    add r2, r3
    sub r0, r1                  ; r0 -> tap row 5, so the loop below can use
                                ; [r0+2*r1] and [r0+r1] alternately

    ; Eight-way unrolled row loop. Instead of copying registers, each step
    ; rotates the roles of xmm0..xmm7: one new row is loaded into the
    ; register freed two steps earlier and the two oldest ones act as scratch.
.start:
    FILTER_VER_4 xmm0,xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
    dec r5
    jz near .x_loop_dec

    lea r0, [r0+2*r1]
    movdqu xmm6, [r0]
    FILTER_VER_4 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,[r2+r3]
    dec r5
    jz near .x_loop_dec

    lea r2, [r2+2*r3]
    movdqu xmm7, [r0+r1]
    FILTER_VER_4  xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
    dec r5
    jz near .x_loop_dec

    lea r0, [r0+2*r1]
    movdqu xmm0, [r0]
    FILTER_VER_4  xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2,[r2+r3]
    dec r5
    jz near .x_loop_dec

    lea r2, [r2+2*r3]
    movdqu xmm1, [r0+r1]
    FILTER_VER_4  xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,[r2]
    dec r5
    jz near .x_loop_dec

    lea r0, [r0+2*r1]
    movdqu xmm2, [r0]
    FILTER_VER_4  xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,[r2+r3]
    dec r5
    jz near .x_loop_dec

    lea r2, [r2+2*r3]
    movdqu xmm3, [r0+r1]
    FILTER_VER_4  xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,[r2]
    dec r5
    jz near .x_loop_dec

    lea r0, [r0+2*r1]
    movdqu xmm4, [r0]
    FILTER_VER_4  xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,xmm6, [r2+r3]
    dec r5
    jz near .x_loop_dec

    lea r2, [r2+2*r3]
    movdqu xmm5, [r0+r1]        ; window is back in xmm0..xmm5
    jmp near .start

.x_loop_dec:
    dec r4
    jz near .exit
    ; Move on to the next column: 4 taps * 2 bytes in pTap, 4 bytes in pDst.
    ; The step is applied to the SAVED bases so that it accumulates; adding
    ; it to the original pointers each time would revisit the second column
    ; whenever there are more than two columns.
%ifdef X86_32
    add dword arg1, 8           ; stack argument slots double as column bases
    add dword arg3, 4
    mov r0, arg1
    mov r2, arg3
    mov r5, arg6
%else
    add r12, 8
    add r13, 4
    mov r0, r12
    mov r2, r13
    mov r5, r14
%endif
    jmp .width_loop

.exit:
%ifndef X86_32
    pop r14
    pop r13
    pop r12
%endif
    POP_XMM
    LOAD_6_PARA_POP
    DEINIT_X86_32_PIC
    ret
1836
1837
; SSSE3_FilterVertical_8px  px_ab, px_cd, px_ef, k_ab, k_cd, k_ef, tmp
;   %1 px_ab, %2 px_cd, %3 px_ef : pixel-pair operands, one per tap pair
;   %4 maddubsw_ab, %5 maddubsw_cd, %6 maddubsw_ef : pmaddubsw multipliers
;   %7 tmp : scratch register
; Each pixel operand is multiplied by its coefficient operand with pmaddubsw
; (unsigned byte * signed byte, adjacent products summed into 16-bit lanes).
; The three products are accumulated in %1, h264_w0x10_1 is added and the
; sum is shifted right arithmetically by 5.
; Result: eight 16-bit values in %1. Clobbers %7; %2 and %3 are only read.
%macro SSSE3_FilterVertical_8px 7
    pmaddubsw   %1, %4                      ; ab pair * k_ab
    movdqa      %7, %2
    pmaddubsw   %7, %5                      ; cd pair * k_cd
    paddw       %1, %7
    movdqa      %7, %3
    pmaddubsw   %7, %6                      ; ef pair * k_ef
    paddw       %1, %7
    paddw       %1, [pic(h264_w0x10_1)]     ; constant defined elsewhere in the file
    psraw       %1, 5
%endmacro
1850
; SSSE3_FilterVertical2_8px  px_a, px_f, px_bc, px_de, k_bc, k_de, tmp0, tmp1
;   %1 px_a, %2 px_f   : outer taps as plain bytes (low 8 bytes are used)
;   %3 px_bc, %4 px_de : pixel-pair operands for the inner taps
;   %5 maddubsw_bc, %6 maddubsw_de : pmaddubsw multipliers
;   %7, %8 tmp         : scratch registers
; The outer taps are zero-extended to words and added; the inner pairs are
; weighted with pmaddubsw and accumulated on top. h264_w0x10_1 is then added
; and the sum shifted right arithmetically by 5.
; Result: eight 16-bit values in %1. %2 is copied before widening and is
; left intact; %7 and %8 are clobbered.
%macro SSSE3_FilterVertical2_8px 8
    movdqa      %8, %2                      ; work on a copy of px_f
    pxor        %7, %7                      ; zero for byte -> word widening
    punpcklbw   %1, %7
    punpcklbw   %8, %7
    paddw       %1, %8                      ; a + f
    movdqa      %7, %3
    pmaddubsw   %7, %5                      ; bc pair * k_bc
    paddw       %1, %7
    movdqa      %7, %4
    pmaddubsw   %7, %6                      ; de pair * k_de
    paddw       %1, %7
    paddw       %1, [pic(h264_w0x10_1)]     ; constant defined elsewhere in the file
    psraw       %1, 5
%endmacro
1867
; SSSE3_FilterHorizontalbw_8px  pixels, shuf_a, shuf_b, k, tmp0, tmp1
;   %1 pixels : 16 source bytes; replaced by the result
;   %2 shufb_32435465768798A9 : pshufb mask applied to %1
;   %3 shufb_011267784556ABBC : pshufb mask applied to a copy of the pixels
;   %4 maddubsw_p1m5_p1m5_m5p1_m5p1 : pmaddubsw multiplier for both tmps
;   %5, %6 tmp : scratch registers
; Builds three byte-pair arrangements of the input (two by pshufb, the third
; by swapping adjacent dwords of the first), weights them with pmaddubsw
; (db20_128 for the first, %4 for the other two) and adds the three products.
; Result: eight 16-bit sums in %1 with no rounding or shift applied here.
%macro SSSE3_FilterHorizontalbw_8px 6
    movdqa      %5, %1
    pshufb      %1, %2
    pshufb      %5, %3
    pshufd      %6, %1, 0xB1                ; swap adjacent dwords (10110001b)
    pmaddubsw   %1, [pic(db20_128)]
    pmaddubsw   %5, %4
    pmaddubsw   %6, %4
    paddw       %1, %5
    paddw       %1, %6
%endmacro
1880
; SSSE3_FilterHorizontal_8px  pixels, shuf_a, shuf_b, k, tmp0, tmp1
;   %1 pixels : 16 source bytes; replaced by the result
;   %2 shufb_32435465768798A9, %3 shufb_011267784556ABBC : pshufb masks
;   %4 maddubsw_p1m5_p1m5_m5p1_m5p1 : pmaddubsw multiplier
;   %5, %6 tmp : scratch registers
; Runs SSSE3_FilterHorizontalbw_8px with the same operands, then adds
; h264_w0x10_1 and shifts right arithmetically by 5.
; Result: eight 16-bit values in %1.
%macro SSSE3_FilterHorizontal_8px 6
    SSSE3_FilterHorizontalbw_8px %1, %2, %3, %4, %5, %6
    paddw       %1, [pic(h264_w0x10_1)]     ; constant defined elsewhere in the file
    psraw       %1, 5
%endmacro
1887
; SSSE3_FilterHorizontalbw_2x4px  px0, px1, shuf_a, shuf_b, k, tmp0, tmp1
;   %1 px0, %2 px1 : source bytes of two rows; %1 is replaced by the result
;   %3 shufb_32435465768798A9, %4 shufb_011267784556ABBC : pshufb masks
;   %5 maddubsw_p1m5_p1m5_m5p1_m5p1 : pmaddubsw multiplier
;   %6, %7 tmp : scratch registers
; Two-row form of the 8-pixel horizontal filter: both rows are shuffled with
; each mask and the low qwords of the two shuffled rows are joined with
; punpcklqdq, so the px0 data sits in the low half and the px1 data in the
; high half. The joined arrangements are weighted with pmaddubsw and summed.
; Result: 16-bit sums in %1 (four per row), no rounding or shift applied.
; %2 is overwritten by its shuffled form.
%macro SSSE3_FilterHorizontalbw_2x4px 7
    movdqa      %6, %1
    movdqa      %7, %2
    pshufb      %1, %3
    pshufb      %2, %3
    punpcklqdq  %1, %2                      ; first arrangement, both rows
    pshufb      %6, %4
    pshufb      %7, %4
    punpcklqdq  %6, %7                      ; second arrangement, both rows
    pshufd      %7, %1, 0xB1                ; swap adjacent dwords (10110001b)
    pmaddubsw   %1, [pic(db20_128)]
    pmaddubsw   %6, %5
    pmaddubsw   %7, %5
    paddw       %1, %6
    paddw       %1, %7
%endmacro
1905
; SSSE3_FilterHorizontal_2x4px  px0, px1, shuf_a, shuf_b, k, tmp0, tmp1
;   %1 px0, %2 px1 : source bytes of two rows; %1 is replaced by the result
;   %3 shufb_32435465768798A9, %4 shufb_011267784556ABBC : pshufb masks
;   %5 maddubsw_p1m5_p1m5_m5p1_m5p1 : pmaddubsw multiplier
;   %6, %7 tmp : scratch registers
; Runs SSSE3_FilterHorizontalbw_2x4px with the same operands, then adds
; h264_w0x10_1 and shifts right arithmetically by 5.
; Result: 16-bit values in %1, px0 results in the low half and px1 results
; in the high half.
%macro SSSE3_FilterHorizontal_2x4px 7
    SSSE3_FilterHorizontalbw_2x4px %1, %2, %3, %4, %5, %6, %7
    paddw       %1, [pic(h264_w0x10_1)]     ; constant defined elsewhere in the file
    psraw       %1, 5
%endmacro
1912
; SSSE3_FilterHorizontalbw_2px  pixels, scale, tmp
;   %1 pixels : source bytes; replaced by the result
;   %2 scale  : pmaddwd multiplier (described upstream as "-32768>>scale")
;   %3 tmp    : scratch register
; Weights the bytes with the maddubsw_m2p10_m40m40_p10m2_p0p0_128 table
; (pmaddubsw), multiplies the 16-bit results by %2 and sums pairs into
; dwords (pmaddwd), then adds each dword to its neighbour (dword swap +
; paddd). Result: 32-bit sums in %1.
%macro SSSE3_FilterHorizontalbw_2px 3
    pmaddubsw   %1, [pic(maddubsw_m2p10_m40m40_p10m2_p0p0_128)]
    pmaddwd     %1, %2
    pshufd      %3, %1, 0xB1                ; swap adjacent dwords (10110001b)
    paddd       %1, %3
%endmacro
1920
; SSSE3_FilterHorizontal_2px  pixels, tmp
;   %1 pixels : source bytes; replaced by the result
;   %2 tmp    : scratch register
; Runs SSSE3_FilterHorizontalbw_2px with dwm1024_128 as the pmaddwd
; multiplier and then adds dd32768_128 to every dword of the result.
%macro SSSE3_FilterHorizontal_2px 2
    SSSE3_FilterHorizontalbw_2px %1, [pic(dwm1024_128)], %2
    paddd       %1, [pic(dd32768_128)]      ; constant defined elsewhere in the file
%endmacro
1926
; SSE2_FilterVerticalw_8px  px0, px1, px2, px3, px4, px5, tmp
;   %1..%6 px0..px5 : six vertically adjacent rows of 16-bit lanes
;   %7 tmp          : scratch register
; Per lane, using truncating arithmetic shifts on 16-bit words:
;     t   = ((px0 + px5) - (px1 + px4)) >> 2
;     t   = (t - (px1 + px4) + (px2 + px3)) >> 2
;     out = (t + (px2 + px3) + h264_mc_hc_32) >> 6
; Result: eight 16-bit values in %1 (not packed to bytes here).
; Clobbers %7; %2..%6 are only read.
%macro SSE2_FilterVerticalw_8px 7
    paddw       %1, %6                      ; px0 + px5
    movdqa      %7, %2
    paddw       %7, %5                      ; px1 + px4
    psubw       %1, %7
    psraw       %1, 2
    psubw       %1, %7
    movdqa      %7, %3
    paddw       %7, %4                      ; px2 + px3
    paddw       %1, %7
    psraw       %1, 2
    paddw       %7, [pic(h264_mc_hc_32)]    ; constant defined elsewhere in the file
    paddw       %1, %7
    psraw       %1, 6
%endmacro
1943
;***********************************************************************
; void McHorVer02_ssse3(const uint8_t *pSrc,
;                       int32_t iSrcStride,
;                       uint8_t *pDst,
;                       int32_t iDstStride,
;                       int32_t iWidth,
;                       int32_t iHeight)
;
; Vertical 6-tap filter on bytes. The source cursor is first moved two
; rows above pSrc, so "row k" below means pSrc + (k - 2) * iSrcStride and
; output row n is built from rows n .. n+5.
;
; iWidth <= 4 : straight-line code. Rows are loaded 4 bytes at a time and
;               interleaved pairwise; two row pairs share one register so
;               each SSSE3_FilterVertical_8px call yields two output rows.
;               Writes 4 rows when iHeight < 5, 5 rows when iHeight == 5,
;               8 rows when 5 < iHeight < 9 and 9 rows otherwise.
; iWidth  > 4 : columns of 8 pixels. Per column an optional leading row is
;               produced when iHeight is odd, then rows are emitted in
;               groups of four (two groups per loop pass).
;
; On x86-32 r6 is saved by hand because it serves as i_srcstride3. With
; X86_32_PICASM, r4 is handed to the PIC setup and i_width is accessed in
; its stack slot instead.
;***********************************************************************

WELS_EXTERN McHorVer02_ssse3
%define p_src         r0
%define i_srcstride   r1
%define p_dst         r2
%define i_dststride   r3
%ifdef X86_32_PICASM
%define i_width       dword arg5
%else
%define i_width       r4
%endif
%define i_height      r5
%define i_srcstride3  r6
    %assign push_num 0
%ifdef X86_32
    push        r6
    %assign push_num 1
%endif
    LOAD_6_PARA
    PUSH_XMM    8
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r3, r3d
    SIGN_EXTENSION  r4, r4d
    SIGN_EXTENSION  r5, r5d
    INIT_X86_32_PIC_NOPRESERVE r4
    sub         p_src, i_srcstride
    sub         p_src, i_srcstride          ; p_src -> row 0 (two rows above pSrc)
    lea         i_srcstride3, [3 * i_srcstride]
    %assign push_num_begin push_num
    cmp         i_width, 4
    jg          .w8_or_w16

    ; ---------------- width 4 ----------------
    ; xmm0 = rows 0|1 and 1|2, xmm1 = rows 2|3 and 3|4, xmm2 = rows 4|5 and 5|6
    movd        xmm0, [p_src]
    movd        xmm4, [p_src + i_srcstride]
    punpcklbw   xmm0, xmm4
    movd        xmm1, [p_src + 2 * i_srcstride]
    punpcklbw   xmm4, xmm1
    punpcklqdq  xmm0, xmm4
    movd        xmm4, [p_src + i_srcstride3]
    lea         p_src, [p_src + 4 * i_srcstride]
    punpcklbw   xmm1, xmm4
    movd        xmm2, [p_src]
    punpcklbw   xmm4, xmm2
    punpcklqdq  xmm1, xmm4
    movd        xmm4, [p_src + i_srcstride]
    lea         p_src, [p_src + 2 * i_srcstride]    ; p_src -> row 6
    punpcklbw   xmm2, xmm4
    movd        xmm3, [p_src]
    punpcklbw   xmm4, xmm3
    punpcklqdq  xmm2, xmm4
    movdqa      xmm5, [pic(db20_128)]               ; kept in a register for all calls
    SSSE3_FilterVertical_8px xmm0, xmm1, xmm2, [pic(maddubsw_p1m5_128)], xmm5, [pic(maddubsw_m5p1_128)], xmm4
    packuswb    xmm0, xmm0
    movd        [p_dst], xmm0                       ; output row 0
    psrlq       xmm0, 32
    movd        [p_dst + i_dststride], xmm0         ; output row 1
    lea         p_dst, [p_dst + 2 * i_dststride]
    movd        xmm4, [p_src + i_srcstride]
    punpcklbw   xmm3, xmm4
    movd        xmm0, [p_src + 2 * i_srcstride]
    punpcklbw   xmm4, xmm0
    punpcklqdq  xmm3, xmm4                          ; xmm3 = rows 6|7 and 7|8
    SSSE3_FilterVertical_8px xmm1, xmm2, xmm3, [pic(maddubsw_p1m5_128)], xmm5, [pic(maddubsw_m5p1_128)], xmm4
    packuswb    xmm1, xmm1
    movd        [p_dst], xmm1                       ; output row 2
    psrlq       xmm1, 32
    movd        [p_dst + i_dststride], xmm1         ; output row 3
    cmp         i_height, 5
    jl          .w4_done_short                      ; iHeight < 5: four rows written
    lea         p_dst, [p_dst + 2 * i_dststride]
    movd        xmm4, [p_src + i_srcstride3]
    punpcklbw   xmm0, xmm4                          ; rows 8|9
    jg          .w4_tall                            ; flags still from the cmp above
    ; iHeight == 5: one more row, only the low four bytes are stored
    SSSE3_FilterVertical_8px xmm2, xmm3, xmm0, [pic(maddubsw_p1m5_128)], xmm5, [pic(maddubsw_m5p1_128)], xmm4
    packuswb    xmm2, xmm2
    movd        [p_dst], xmm2                       ; output row 4
.w4_done_short:
    DEINIT_X86_32_PIC_KEEPDEF
    POP_XMM
    LOAD_6_PARA_POP
%ifdef X86_32
    pop         r6
%endif
    ret
.w4_tall:
    lea         p_src, [p_src + 4 * i_srcstride]    ; p_src -> row 10
    movd        xmm1, [p_src]
    punpcklbw   xmm4, xmm1
    punpcklqdq  xmm0, xmm4                          ; xmm0 = rows 8|9 and 9|10
    SSSE3_FilterVertical_8px xmm2, xmm3, xmm0, [pic(maddubsw_p1m5_128)], xmm5, [pic(maddubsw_m5p1_128)], xmm4
    packuswb    xmm2, xmm2
    movd        [p_dst], xmm2                       ; output row 4
    psrlq       xmm2, 32
    movd        [p_dst + i_dststride], xmm2         ; output row 5
    lea         p_dst, [p_dst + 2 * i_dststride]
    movd        xmm4, [p_src + i_srcstride]
    punpcklbw   xmm1, xmm4
    movd        xmm2, [p_src + 2 * i_srcstride]
    punpcklbw   xmm4, xmm2
    punpcklqdq  xmm1, xmm4                          ; xmm1 = rows 10|11 and 11|12
    SSSE3_FilterVertical_8px xmm3, xmm0, xmm1, [pic(maddubsw_p1m5_128)], xmm5, [pic(maddubsw_m5p1_128)], xmm4
    packuswb    xmm3, xmm3
    movd        [p_dst], xmm3                       ; output row 6
    psrlq       xmm3, 32
    movd        [p_dst + i_dststride], xmm3         ; output row 7
    cmp         i_height, 9
    jl          .w4_done_tall                       ; iHeight < 9: eight rows written
    lea         p_dst, [p_dst + 2 * i_dststride]
    movd        xmm4, [p_src + i_srcstride3]
    punpcklbw   xmm2, xmm4                          ; rows 12|13
    SSSE3_FilterVertical_8px xmm0, xmm1, xmm2, [pic(maddubsw_p1m5_128)], xmm5, [pic(maddubsw_m5p1_128)], xmm4
    packuswb    xmm0, xmm0
    movd        [p_dst], xmm0                       ; output row 8
.w4_done_tall:
    DEINIT_X86_32_PIC_KEEPDEF
    POP_XMM
    LOAD_6_PARA_POP
%ifdef X86_32
    pop         r6
%endif
    ret

    ; ---------------- width 8 / 16 ----------------
.w8_or_w16:
    %assign push_num push_num_begin
    sub         i_height, 1
    push        i_height                            ; keep iHeight - 1 on the stack
    %assign push_num push_num + 1
%xdefine i_ycnt i_height
%define i_height [r7]
    ; from here on: i_ycnt = row counter register, i_height = saved iHeight - 1
.col_loop:
    push        p_src
    push        p_dst
    %assign push_num push_num + 2
    test        i_ycnt, 1
    jnz         .prime_even_height                  ; iHeight - 1 odd <=> iHeight even
    ; odd iHeight: emit output row 0 on its own from rows 0..5
    movq        xmm0, [p_src]
    movq        xmm1, [p_src + i_srcstride]
    punpcklbw   xmm0, xmm1
    movq        xmm2, [p_src + 2 * i_srcstride]
    movq        xmm3, [p_src + i_srcstride3]
    lea         p_src, [p_src + 4 * i_srcstride]
    punpcklbw   xmm2, xmm3
    movq        xmm4, [p_src]
    movq        xmm5, [p_src + i_srcstride]
    lea         p_src, [p_src + 2 * i_srcstride]
    punpcklbw   xmm4, xmm5
    SSSE3_FilterVertical_8px xmm0, xmm2, xmm4, [pic(maddubsw_p1m5_128)], [pic(db20_128)], [pic(maddubsw_m5p1_128)], xmm7
    packuswb    xmm0, xmm0
    movlps      [p_dst], xmm0
    add         p_dst, i_dststride
    jmp         .row_loop
.prime_even_height:
    ; even iHeight: only prime the registers, nothing is written yet
    movq        xmm1, [p_src]
    movq        xmm2, [p_src + i_srcstride]
    movq        xmm3, [p_src + 2 * i_srcstride]
    add         p_src, i_srcstride3
    punpcklbw   xmm2, xmm3
    movq        xmm4, [p_src]
    movq        xmm5, [p_src + i_srcstride]
    lea         p_src, [p_src + 2 * i_srcstride]
    punpcklbw   xmm4, xmm5
    ; Entry state of .row_loop (both paths): xmm1 = plain oldest row,
    ; xmm2 / xmm4 = the two following interleaved row pairs, xmm3 / xmm5 =
    ; plain copies of the second row of those pairs, p_src -> next row to load.
.row_loop:
    movq        xmm6, [p_src]
    SSSE3_FilterVertical2_8px xmm1, xmm6, xmm2, xmm4, [pic(maddubsw_m5p20_128)], [pic(maddubsw_p20m5_128)], xmm0, xmm7
    movq        xmm7, [p_src + i_srcstride]
    punpcklbw   xmm6, xmm7
    SSSE3_FilterVertical_8px xmm2, xmm4, xmm6, [pic(maddubsw_p1m5_128)], [pic(db20_128)], [pic(maddubsw_m5p1_128)], xmm0
    packuswb    xmm1, xmm2
    movlps      [p_dst], xmm1                       ; rows n, n+1
    movhps      [p_dst + i_dststride], xmm1
    lea         p_dst, [p_dst + 2 * i_dststride]
    movq        xmm0, [p_src + 2 * i_srcstride]
    SSSE3_FilterVertical2_8px xmm3, xmm0, xmm4, xmm6, [pic(maddubsw_m5p20_128)], [pic(maddubsw_p20m5_128)], xmm2, xmm1
    movq        xmm1, [p_src + i_srcstride3]
    lea         p_src, [p_src + 4 * i_srcstride]
    punpcklbw   xmm0, xmm1
    SSSE3_FilterVertical_8px xmm4, xmm6, xmm0, [pic(maddubsw_p1m5_128)], [pic(db20_128)], [pic(maddubsw_m5p1_128)], xmm2
    packuswb    xmm3, xmm4
    movlps      [p_dst], xmm3                       ; rows n+2, n+3
    movhps      [p_dst + i_dststride], xmm3
    cmp         i_ycnt, 4
    jle         .row_loop_end                       ; at most four rows were left
    lea         p_dst, [p_dst + 2 * i_dststride]
    movq        xmm2, [p_src]
    SSSE3_FilterVertical2_8px xmm5, xmm2, xmm6, xmm0, [pic(maddubsw_m5p20_128)], [pic(maddubsw_p20m5_128)], xmm4, xmm3
    movq        xmm3, [p_src + i_srcstride]
    punpcklbw   xmm2, xmm3
    SSSE3_FilterVertical_8px xmm6, xmm0, xmm2, [pic(maddubsw_p1m5_128)], [pic(db20_128)], [pic(maddubsw_m5p1_128)], xmm4
    packuswb    xmm5, xmm6
    movlps      [p_dst], xmm5                       ; rows n+4, n+5
    movhps      [p_dst + i_dststride], xmm5
    lea         p_dst, [p_dst + 2 * i_dststride]
    movq        xmm4, [p_src + 2 * i_srcstride]
    SSSE3_FilterVertical2_8px xmm7, xmm4, xmm0, xmm2, [pic(maddubsw_m5p20_128)], [pic(maddubsw_p20m5_128)], xmm6, xmm5
    movq        xmm5, [p_src + i_srcstride3]
    lea         p_src, [p_src + 4 * i_srcstride]
    punpcklbw   xmm4, xmm5
    SSSE3_FilterVertical_8px xmm0, xmm2, xmm4, [pic(maddubsw_p1m5_128)], [pic(db20_128)], [pic(maddubsw_m5p1_128)], xmm6
    packuswb    xmm7, xmm0
    movlps      [p_dst], xmm7                       ; rows n+6, n+7
    movhps      [p_dst + i_dststride], xmm7
    lea         p_dst, [p_dst + 2 * i_dststride]
    sub         i_ycnt, 8
    jg          .row_loop
.row_loop_end:
    pop         p_dst                               ; back to the top of this column
    pop         p_src
    %assign push_num push_num - 2
    sub         i_width, 8
    jle         .w8_or_w16_done
    add         p_src, 8                            ; next 8-pixel column
    add         p_dst, 8
    mov         i_ycnt, i_height                    ; reload iHeight - 1 from the stack
    jmp         .col_loop
.w8_or_w16_done:
    pop         i_ycnt                              ; drop the saved iHeight - 1
    %assign push_num push_num - 1
    DEINIT_X86_32_PIC
    POP_XMM
    LOAD_6_PARA_POP
%ifdef X86_32
    pop         r6
%endif
    ret
%undef p_src
%undef i_srcstride
%undef i_srcstride3
%undef p_dst
%undef i_dststride
%undef i_width
%undef i_height
%undef i_ycnt
2184
2185
;*******************************************************************************
; void McHorVer20_ssse3(const uint8_t *pSrc,
;                       int iSrcStride,
;                       uint8_t *pDst,
;                       int iDstStride,
;                       int iWidth,
;                       int iHeight);
;
; Horizontal filter on bytes; every row is read starting at pSrc - 2.
; Three loops are selected by iWidth:
;   iWidth <  8 : two rows per pass, 4 bytes stored per row
;   iWidth == 8 : two rows per pass, 8 bytes stored per row
;   iWidth >  8 : one row per pass, 16 bytes stored (two 8-pixel halves
;                 read from pSrc - 2 and pSrc + 6)
; The two-row loops subtract 2 from iHeight per pass and run while the
; remainder is positive.
; xmm4 / xmm5 hold the two pshufb masks and xmm6 the pmaddubsw multiplier
; for the whole call.
;*******************************************************************************

WELS_EXTERN McHorVer20_ssse3
%define p_src        r0
%define i_srcstride  r1
%define p_dst        r2
%define i_dststride  r3
%define i_width      r4
%define i_height     r5
    %assign  push_num 0
    INIT_X86_32_PIC r6
    LOAD_6_PARA
    PUSH_XMM    7
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r3, r3d
    SIGN_EXTENSION  r4, r4d
    SIGN_EXTENSION  r5, r5d
    movdqa      xmm4, [pic(shufb_32435465768798A9)]
    movdqa      xmm5, [pic(shufb_011267784556ABBC)]
    movdqa      xmm6, [pic(maddubsw_p1m5_p1m5_m5p1_m5p1_128)]
    cmp         i_width, 8
    je          .w8_rows
    jg          .w16_rows

.w4_rows:
    movdqu      xmm0, [p_src - 2]
    movdqu      xmm1, [p_src + i_srcstride - 2]
    SSSE3_FilterHorizontal_2x4px xmm0, xmm1, xmm4, xmm5, xmm6, xmm2, xmm3
    packuswb    xmm0, xmm0
    movd        [p_dst], xmm0                       ; first row of the pair
    psrlq       xmm0, 32
    movd        [p_dst + i_dststride], xmm0         ; second row of the pair
    lea         p_src, [p_src + 2 * i_srcstride]
    lea         p_dst, [p_dst + 2 * i_dststride]
    sub         i_height, 2
    jg          .w4_rows
    POP_XMM
    LOAD_6_PARA_POP
    DEINIT_X86_32_PIC_KEEPDEF
    ret

.w8_rows:
    movdqu      xmm0, [p_src - 2]
    movdqu      xmm1, [p_src + i_srcstride - 2]
    SSSE3_FilterHorizontal_8px xmm0, xmm4, xmm5, xmm6, xmm2, xmm3
    SSSE3_FilterHorizontal_8px xmm1, xmm4, xmm5, xmm6, xmm2, xmm3
    packuswb    xmm0, xmm1
    movlps      [p_dst], xmm0                       ; first row of the pair
    movhps      [p_dst + i_dststride], xmm0         ; second row of the pair
    lea         p_src, [p_src + 2 * i_srcstride]
    lea         p_dst, [p_dst + 2 * i_dststride]
    sub         i_height, 2
    jg          .w8_rows
    POP_XMM
    LOAD_6_PARA_POP
    DEINIT_X86_32_PIC_KEEPDEF
    ret

.w16_rows:
    movdqu      xmm0, [p_src - 2]                   ; feeds output pixels 0..7
    movdqu      xmm1, [p_src + 6]                   ; feeds output pixels 8..15
    SSSE3_FilterHorizontal_8px xmm0, xmm4, xmm5, xmm6, xmm2, xmm3
    SSSE3_FilterHorizontal_8px xmm1, xmm4, xmm5, xmm6, xmm2, xmm3
    packuswb    xmm0, xmm1
    MOVDQ       [p_dst], xmm0
    add         p_src, i_srcstride
    add         p_dst, i_dststride
    sub         i_height, 1
    jg          .w16_rows
    POP_XMM
    LOAD_6_PARA_POP
    DEINIT_X86_32_PIC
    ret
%undef p_src
%undef i_srcstride
%undef p_dst
%undef i_dststride
%undef i_width
%undef i_height
2269
2270
2271;***********************************************************************
2272; void McHorVer20Width5Or9Or17_ssse3(const uint8_t *pSrc,
2273;                                    int32_t iSrcStride,
2274;                                    uint8_t *pDst,
2275;                                    int32_t iDstStride,
2276;                                    int32_t iWidth,
2277;                                    int32_t iHeight);
2278;***********************************************************************
2279
2280WELS_EXTERN McHorVer20Width5Or9Or17_ssse3
; Horizontal filter entry point for output rows that are 5, 9 or 17 bytes wide.
; The %defines below name r0..r5 after the six C parameters. xmm5/xmm6 are
; loaded from the two shufb_* tables and xmm7 from the maddubsw_* table; all
; three are handed unchanged to every SSSE3_FilterHorizontal_8px invocation.
; Dispatch on i_width: == 9 -> .width9_yloop, > 9 -> .width17_yloop,
; anything else falls through into .width5_yloop.
; NOTE(review): SSSE3_FilterHorizontal_8px/_2px, MOVDQ and the INIT/LOAD/PUSH
; helper macros are defined outside this section; remarks about what they
; produce are assumptions to confirm against their definitions.
2281%define p_src        r0
2282%define i_srcstride  r1
2283%define p_dst        r2
2284%define i_dststride  r3
2285%define i_width      r4
2286%define i_height     r5
2287    %assign  push_num 0
2288    INIT_X86_32_PIC r6
2289    LOAD_6_PARA
2290    PUSH_XMM 8
2291    SIGN_EXTENSION  r1, r1d
2292    SIGN_EXTENSION  r3, r3d
2293    SIGN_EXTENSION  r4, r4d
2294    SIGN_EXTENSION  r5, r5d
2295    movdqa          xmm5, [pic(shufb_32435465768798A9)]
2296    movdqa          xmm6, [pic(shufb_011267784556ABBC)]
2297    movdqa          xmm7, [pic(maddubsw_p1m5_p1m5_m5p1_m5p1_128)]
2298    cmp             i_width, 9
2299    je              .width9_yloop
2300    jg              .width17_yloop
    ; Width 5: one row per iteration, source read starting at p_src - 2.
2301.width5_yloop:
2302    movdqu          xmm0, [p_src - 2]
2303    add             p_src, i_srcstride
2304    SSSE3_FilterHorizontal_8px xmm0, xmm5, xmm6, xmm7, xmm1, xmm2
2305    packuswb        xmm0, xmm0
    ; xmm1 = packed row shifted right by one byte. The two overlapping 4-byte
    ; stores at +0 and +1 together write bytes 0..4 of the row.
2306    movdqa          xmm1, xmm0
2307    psrlq           xmm1, 8
2308    movd            [p_dst], xmm0
2309    movd            [p_dst + 1], xmm1
2310    add             p_dst, i_dststride
2311    sub             i_height, 1
2312    jg              .width5_yloop
2313    POP_XMM
2314    LOAD_6_PARA_POP
2315    DEINIT_X86_32_PIC_KEEPDEF
2316    ret
    ; Width 9: two rows per iteration (i_height decreases by 2).
2317.width9_yloop:
2318    movdqu          xmm0, [p_src - 2]
2319    movdqu          xmm4, [p_src + i_srcstride - 2]
2320    lea             p_src, [p_src + 2 * i_srcstride]
    ; xmm3 = high qword of the row-0 load (low half) : high qword of the row-1
    ; load (high half); this is the input of the 2px macro for both rows.
2321    movdqa          xmm3, xmm0
2322    punpckhqdq      xmm3, xmm4
2323    SSSE3_FilterHorizontal_2px xmm3, xmm2
2324    SSSE3_FilterHorizontal_8px xmm0, xmm5, xmm6, xmm7, xmm1, xmm2
    ; After the pack: low qword of xmm3 = packed 2px result, high qword =
    ; packed 8 pixels of row 0.
2325    packuswb        xmm3, xmm0
    ; The 4-byte store at +5 covers bytes 5..8; the 8-byte store that follows
    ; rewrites bytes 0..7, so only byte 8 of the first store remains.
2326    movd            [p_dst + 5], xmm3
2327    movhps          [p_dst], xmm3
2328    add             p_dst, i_dststride
2329    SSSE3_FilterHorizontal_8px xmm4, xmm5, xmm6, xmm7, xmm1, xmm2
2330    packuswb        xmm4, xmm4
    ; Shift the next dword of the packed 2px result down for the second row.
2331    psrldq          xmm3, 4
2332    movd            [p_dst + 5], xmm3
2333    movlps          [p_dst], xmm4
2334    add             p_dst, i_dststride
2335    sub             i_height, 2
2336    jg              .width9_yloop
2337    POP_XMM
2338    LOAD_6_PARA_POP
2339    DEINIT_X86_32_PIC_KEEPDEF
2340    ret
    ; Width 17: two rows per iteration, each row read as two loads at
    ; p_src - 2 and p_src + 6.
2341.width17_yloop:
2342    movdqu          xmm0, [p_src - 2]
2343    movdqu          xmm3, [p_src + 6]
2344    add             p_src, i_srcstride
    ; Keep an unfiltered copy of the row-0 second load for the 2px step.
2345    movdqa          xmm4, xmm3
2346    SSSE3_FilterHorizontal_8px xmm0, xmm5, xmm6, xmm7, xmm1, xmm2
2347    SSSE3_FilterHorizontal_8px xmm3, xmm5, xmm6, xmm7, xmm1, xmm2
2348    packuswb        xmm0, xmm3
2349    movdqu          xmm1, [p_src - 2]
2350    movdqu          xmm3, [p_src + 6]
2351    add             p_src, i_srcstride
    ; xmm4 = high qword of row-0 second load : high qword of row-1 second load.
2352    punpckhqdq      xmm4, xmm3
2353    SSSE3_FilterHorizontal_2px xmm4, xmm2
2354    packuswb        xmm4, xmm4
    ; The 4-byte store at +13 covers bytes 13..16; the MOVDQ store of xmm0 at
    ; +0 follows it (presumably a full 16-byte store, leaving byte 16 intact).
2355    movd            [p_dst + 13], xmm4
2356    MOVDQ           [p_dst], xmm0
2357    add             p_dst, i_dststride
2358    psrldq          xmm4, 4
2359    movd            [p_dst + 13], xmm4
2360    SSSE3_FilterHorizontal_8px xmm1, xmm5, xmm6, xmm7, xmm0, xmm2
2361    SSSE3_FilterHorizontal_8px xmm3, xmm5, xmm6, xmm7, xmm0, xmm2
2362    packuswb        xmm1, xmm3
2363    MOVDQ           [p_dst], xmm1
2364    add             p_dst, i_dststride
2365    sub             i_height, 2
2366    jg              .width17_yloop
2367    POP_XMM
2368    LOAD_6_PARA_POP
    ; Last exit of the routine: uses DEINIT_X86_32_PIC, whereas the earlier
    ; exits use the _KEEPDEF variant.
2369    DEINIT_X86_32_PIC
2370    ret
2371%undef p_src
2372%undef i_srcstride
2373%undef p_dst
2374%undef i_dststride
2375%undef i_width
2376%undef i_height
2377
2378
2379;*******************************************************************************
2380; void McHorVer20Width4U8ToS16_ssse3(const uint8_t *pSrc,
2381;                                    int iSrcStride,
2382;                                    int16_t *pDst,
2383;                                    int iHeight);
2384;*******************************************************************************
2385
2386WELS_EXTERN McHorVer20Width4U8ToS16_ssse3
; Horizontal filter producing 16-bit output for 4-pixel-wide rows.
; The %defines below name r0..r3 after the four C parameters. Output is written
; contiguously: 16 bytes per pair of rows, then 8 bytes for the final row.
; NOTE(review): the SSSE3_FilterHorizontalbw_* macros are defined outside this
; section; their exact result layout should be confirmed there.
2387%define p_src        r0
2388%define i_srcstride  r1
2389%define p_dst        r2
2390%define i_height     r3
2391    %assign  push_num 0
2392    INIT_X86_32_PIC r4
2393    LOAD_4_PARA
2394    PUSH_XMM 7
2395    SIGN_EXTENSION  r1, r1d
2396    SIGN_EXTENSION  r3, r3d
    ; Start two rows above the pointer that was passed in.
2397    sub             p_src, i_srcstride
2398    sub             p_src, i_srcstride
2399    movdqa          xmm4, [pic(shufb_32435465768798A9)]
2400    movdqa          xmm5, [pic(shufb_011267784556ABBC)]
2401    movdqa          xmm6, [pic(maddubsw_p1m5_p1m5_m5p1_m5p1_128)]
    ; Reserve one row for the tail after the loop; the loop consumes rows in
    ; pairs. The tail below runs unconditionally.
    ; NOTE(review): this looks like it expects an odd iHeight - confirm
    ; against callers.
2402    sub             i_height, 1
2403.yloop:
2404    movdqu          xmm0, [p_src - 2]
2405    movdqu          xmm1, [p_src + i_srcstride - 2]
2406    lea             p_src, [p_src + 2 * i_srcstride]
2407    SSSE3_FilterHorizontalbw_2x4px xmm0, xmm1, xmm4, xmm5, xmm6, xmm2, xmm3
    ; Aligned 16-byte store: p_dst must be 16-byte aligned here.
2408    movdqa          [p_dst], xmm0
2409    add             p_dst, 16
2410    sub             i_height, 2
2411    jg              .yloop
2412    ; Height % 2 remainder.
2413    movdqu          xmm0, [p_src - 2]
2414    SSSE3_FilterHorizontalbw_8px xmm0, xmm4, xmm5, xmm6, xmm2, xmm3
    ; Only the low 8 bytes of the result are stored for the last row.
2415    movlps          [p_dst], xmm0
2416    POP_XMM
2417    LOAD_4_PARA_POP
2418    DEINIT_X86_32_PIC
2419    ret
2420%undef p_src
2421%undef i_srcstride
2422%undef p_dst
2423%undef i_height
2424
2425
2426;***********************************************************************
2427; void McHorVer02Width4S16ToU8_ssse3(const int16_t *pSrc,
2428;                                    uint8_t *pDst,
2429;                                    int32_t iDstStride,
2430;                                    int32_t iHeight);
2431;***********************************************************************
2432
2433WELS_EXTERN McHorVer02Width4S16ToU8_ssse3
; Vertical filter from 16-bit input to 8-bit output for 4-pixel-wide rows.
; The source row pitch is the compile-time constant i_srcstride = 8 bytes, so
; each 16-byte load spans two consecutive source rows. Loads at even multiples
; of 8 use movdqa and loads at odd multiples use movdqu.
; Each filter step is packed to bytes and written as two 4-byte stores on
; consecutive destination rows. Two steps (4 output rows) always run; two more
; run only when i_height > 4.
; NOTE(review): SSE2_FilterVerticalw_8px is defined outside this section; its
; last argument is a scratch register according to how it is used here.
2434%define p_src        r0
2435%define p_dst        r1
2436%define i_dststride  r2
2437%define i_height     r3
2438%define i_srcstride  8
2439    %assign  push_num 0
2440    INIT_X86_32_PIC r4
2441    LOAD_4_PARA
2442    PUSH_XMM 8
2443    SIGN_EXTENSION  r2, r2d
2444    SIGN_EXTENSION  r3, r3d
2445    movdqa          xmm0, [p_src +  0 * i_srcstride]
2446    movdqu          xmm1, [p_src +  1 * i_srcstride]
2447    movdqa          xmm2, [p_src +  2 * i_srcstride]
2448    movdqu          xmm3, [p_src +  3 * i_srcstride]
2449    movdqa          xmm4, [p_src +  4 * i_srcstride]
2450    movdqu          xmm5, [p_src +  5 * i_srcstride]
2451    movdqa          xmm6, [p_src +  6 * i_srcstride]
2452    SSE2_FilterVerticalw_8px xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm7
2453    packuswb        xmm0, xmm0
    ; Low dword -> first destination row, bits 32..63 -> next destination row.
2454    movd            [p_dst], xmm0
2455    psrlq           xmm0, 32
2456    movd            [p_dst + i_dststride], xmm0
2457    lea             p_dst, [p_dst + 2 * i_dststride]
2458    movdqu          xmm7, [p_src +  7 * i_srcstride]
2459    movdqa          xmm0, [p_src +  8 * i_srcstride]
2460    SSE2_FilterVerticalw_8px xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm1
2461    packuswb        xmm2, xmm2
2462    movd            [p_dst], xmm2
2463    psrlq           xmm2, 32
2464    movd            [p_dst + i_dststride], xmm2
    ; Four output rows are done; continue only when more were requested.
2465    cmp             i_height, 4
2466    jle             .done
2467    lea             p_dst, [p_dst + 2 * i_dststride]
2468    movdqu          xmm1, [p_src +  9 * i_srcstride]
2469    movdqa          xmm2, [p_src + 10 * i_srcstride]
2470    SSE2_FilterVerticalw_8px xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm3
2471    packuswb        xmm4, xmm4
2472    movd            [p_dst], xmm4
2473    psrlq           xmm4, 32
2474    movd            [p_dst + i_dststride], xmm4
2475    lea             p_dst, [p_dst + 2 * i_dststride]
2476    movdqu          xmm3, [p_src + 11 * i_srcstride]
2477    SSE2_FilterVerticalw_8px xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm5
2478    packuswb        xmm6, xmm6
2479    movd            [p_dst], xmm6
2480    psrlq           xmm6, 32
2481    movd            [p_dst + i_dststride], xmm6
2482.done:
2483    POP_XMM
2484    LOAD_4_PARA_POP
2485    DEINIT_X86_32_PIC
2486    ret
2487%undef p_src
2488%undef p_dst
2489%undef i_dststride
2490%undef i_height
2491%undef i_srcstride
2492
2493
2494;***********************************************************************
2495; void McHorVer20Width8U8ToS16_ssse3(const uint8_t *pSrc,
2496;                                    int32_t iSrcStride,
2497;                                    int16_t *pDst,
2498;                                    int32_t iDstStride,
2499;                                    int32_t iHeight);
2500;***********************************************************************
2501
2502WELS_EXTERN McHorVer20Width8U8ToS16_ssse3
; Horizontal filter producing 16-bit output for 8-pixel-wide rows.
; The %defines below name r0..r4 after the five C parameters. r1 (the source
; stride) is sign-extended from its 32-bit form before use.
; Rows are processed in pairs; a single trailing row is handled after the loop
; only when the row count left over is exactly one.
; NOTE(review): SSSE3_FilterHorizontalbw_8px and MOVDQ are defined outside this
; section.
2503%define p_src        r0
2504%define i_srcstride  r1
2505%define p_dst        r2
2506%define i_dststride  r3
2507%define i_height     r4
2508    %assign  push_num 0
2509    INIT_X86_32_PIC r5
2510    LOAD_5_PARA
2511    PUSH_XMM 7
2512    SIGN_EXTENSION  r1, r1d
2513    SIGN_EXTENSION  r3, r3d
2514    SIGN_EXTENSION  r4, r4d
    ; Start two rows above the pointer that was passed in.
2515    sub             p_src, i_srcstride
2516    sub             p_src, i_srcstride
2517    movdqa          xmm4, [pic(shufb_32435465768798A9)]
2518    movdqa          xmm5, [pic(shufb_011267784556ABBC)]
2519    movdqa          xmm6, [pic(maddubsw_p1m5_p1m5_m5p1_m5p1_128)]
    ; Bias the counter by one so that, after the pairwise loop, zero means
    ; "one row left" and negative means "nothing left".
2520    sub             i_height, 1
2521.yloop:
2522    movdqu          xmm0, [p_src - 2]
2523    movdqu          xmm1, [p_src + i_srcstride - 2]
2524    lea             p_src, [p_src + 2 * i_srcstride]
2525    SSSE3_FilterHorizontalbw_8px xmm0, xmm4, xmm5, xmm6, xmm2, xmm3
2526    MOVDQ           [p_dst], xmm0
2527    add             p_dst, i_dststride
2528    SSSE3_FilterHorizontalbw_8px xmm1, xmm4, xmm5, xmm6, xmm2, xmm3
2529    MOVDQ           [p_dst], xmm1
2530    add             p_dst, i_dststride
2531    sub             i_height, 2
2532    jg              .yloop
    ; Counter below zero: even row count, no tail row.
2533    jl              .done
    ; Counter exactly zero: filter and store the one remaining row.
2534    movdqu          xmm0, [p_src - 2]
2535    SSSE3_FilterHorizontalbw_8px xmm0, xmm4, xmm5, xmm6, xmm2, xmm3
2536    MOVDQ           [p_dst], xmm0
2537.done:
2538    POP_XMM
2539    LOAD_5_PARA_POP
2540    DEINIT_X86_32_PIC
2541    ret
2542%undef p_src
2543%undef i_srcstride
2544%undef p_dst
2545%undef i_dststride
2546%undef i_height
2547
2548
2549;***********************************************************************
2550; void McHorVer02Width5S16ToU8_ssse3(const int16_t *pSrc,
2551;                                    int32_t iTapStride,
2552;                                    uint8_t *pDst,
2553;                                    int32_t iDstStride,
2554;                                    int32_t iHeight);
2555;***********************************************************************
2556
2557WELS_EXTERN McHorVer02Width5S16ToU8_ssse3
; Vertical filter from 16-bit input to 8-bit output for 5-pixel-wide rows.
; The %defines below name r0..r4 after the five C parameters; r5 is used as a
; local holding 3 * i_srcstride and is therefore saved/restored on X86_32.
; The routine is fully unrolled: xmm0..xmm7 form a rolling window of source
; rows, one new row is loaded per step, and each step emits one output row.
; Five rows are produced unconditionally and four more when i_height > 5.
; Every output row is written with two overlapping 4-byte stores (at +1 from a
; copy shifted right by one byte, then at +0), covering bytes 0..4.
; NOTE(review): SSE2_FilterVerticalw_8px is defined outside this section; its
; last argument is used here as a scratch register.
2558%define p_src        r0
2559%define i_srcstride  r1
2560%define p_dst        r2
2561%define i_dststride  r3
2562%define i_height     r4
2563%define i_srcstride3 r5
2564    %assign  push_num 0
2565%ifdef X86_32
2566    push            r5
2567    %assign  push_num 1
2568%endif
2569    INIT_X86_32_PIC r6
2570    LOAD_5_PARA
2571    PUSH_XMM 8
2572    SIGN_EXTENSION  r1, r1d
2573    SIGN_EXTENSION  r3, r3d
2574    SIGN_EXTENSION  r4, r4d
2575    lea             i_srcstride3, [3 * i_srcstride]
    ; Prime the window with source rows 0..5 (aligned loads).
2576    movdqa          xmm0, [p_src]
2577    movdqa          xmm1, [p_src + i_srcstride]
2578    movdqa          xmm2, [p_src + 2 * i_srcstride]
2579    movdqa          xmm3, [p_src + i_srcstride3]
2580    lea             p_src, [p_src + 4 * i_srcstride]
2581    movdqa          xmm4, [p_src]
2582    movdqa          xmm5, [p_src + i_srcstride]
    ; Output row 0.
2583    SSE2_FilterVerticalw_8px xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
2584    movdqa          xmm6, [p_src + 2 * i_srcstride]
2585    packuswb        xmm0, xmm0
2586    movdqa          xmm7, xmm0
2587    psrlq           xmm7, 8
2588    movd            [p_dst + 1], xmm7
2589    movd            [p_dst], xmm0
2590    add             p_dst, i_dststride
    ; Output row 1.
2591    SSE2_FilterVerticalw_8px xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
2592    movdqa          xmm7, [p_src + i_srcstride3]
2593    lea             p_src, [p_src + 4 * i_srcstride]
2594    packuswb        xmm1, xmm1
2595    movdqa          xmm0, xmm1
2596    psrlq           xmm0, 8
2597    movd            [p_dst + 1], xmm0
2598    movd            [p_dst], xmm1
2599    add             p_dst, i_dststride
    ; Output row 2.
2600    SSE2_FilterVerticalw_8px xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0
2601    movdqa          xmm0, [p_src]
2602    packuswb        xmm2, xmm2
2603    movdqa          xmm1, xmm2
2604    psrlq           xmm1, 8
2605    movd            [p_dst + 1], xmm1
2606    movd            [p_dst], xmm2
2607    add             p_dst, i_dststride
    ; Output row 3.
2608    SSE2_FilterVerticalw_8px xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1
2609    packuswb        xmm3, xmm3
2610    movdqa          xmm2, xmm3
2611    psrlq           xmm2, 8
2612    movd            [p_dst + 1], xmm2
2613    movd            [p_dst], xmm3
2614    add             p_dst, i_dststride
    ; Output row 4.
2615    movdqa          xmm1, [p_src + i_srcstride]
2616    SSE2_FilterVerticalw_8px xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2
2617    packuswb        xmm4, xmm4
2618    movdqa          xmm3, xmm4
2619    psrlq           xmm3, 8
2620    movd            [p_dst + 1], xmm3
2621    movd            [p_dst], xmm4
    ; Five rows written; stop here unless more were requested.
2622    cmp             i_height, 5
2623    jle             .done
2624    add             p_dst, i_dststride
    ; Output row 5.
2625    movdqa          xmm2, [p_src + 2 * i_srcstride]
2626    SSE2_FilterVerticalw_8px xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3
2627    movdqa          xmm3, [p_src + i_srcstride3]
2628    lea             p_src, [p_src + 4 * i_srcstride]
2629    packuswb        xmm5, xmm5
2630    movdqa          xmm4, xmm5
2631    psrlq           xmm4, 8
2632    movd            [p_dst + 1], xmm4
2633    movd            [p_dst], xmm5
2634    add             p_dst, i_dststride
    ; Output row 6.
2635    SSE2_FilterVerticalw_8px xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4
2636    movdqa          xmm4, [p_src]
2637    packuswb        xmm6, xmm6
2638    movdqa          xmm5, xmm6
2639    psrlq           xmm5, 8
2640    movd            [p_dst + 1], xmm5
2641    movd            [p_dst], xmm6
2642    add             p_dst, i_dststride
    ; Output row 7.
2643    SSE2_FilterVerticalw_8px xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
2644    packuswb        xmm7, xmm7
2645    movdqa          xmm6, xmm7
2646    psrlq           xmm6, 8
2647    movd            [p_dst + 1], xmm6
2648    movd            [p_dst], xmm7
2649    add             p_dst, i_dststride
    ; Output row 8 (last).
2650    movdqa          xmm5, [p_src + i_srcstride]
2651    SSE2_FilterVerticalw_8px xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
2652    packuswb        xmm0, xmm0
2653    movdqa          xmm7, xmm0
2654    psrlq           xmm7, 8
2655    movd            [p_dst + 1], xmm7
2656    movd            [p_dst], xmm0
2657.done:
2658    POP_XMM
2659    LOAD_5_PARA_POP
2660    DEINIT_X86_32_PIC
    ; Restore the r5 saved at entry (X86_32 only).
2661%ifdef X86_32
2662    pop             r5
2663%endif
2664    ret
2665%undef p_src
2666%undef i_srcstride
2667%undef p_dst
2668%undef i_dststride
2669%undef i_height
2670%undef i_srcstride3
2671
2672
2673;***********************************************************************
2674; void McHorVer20Width9Or17U8ToS16_ssse3(const uint8_t *pSrc,
2675;                                        int32_t iSrcStride,
2676;                                        int16_t *pDst,
2677;                                        int32_t iDstStride,
2678;                                        int32_t iWidth,
2679;                                        int32_t iHeight);
2680;***********************************************************************
2681
2682WELS_EXTERN McHorVer20Width9Or17U8ToS16_ssse3
; Horizontal filter producing 16-bit output for rows of 9 or 17 samples.
; The %defines below name r0..r5 after the six C parameters. xmm4 is set to
; 0x8000 in every word and passed to SSSE3_FilterHorizontalbw_2px; xmm5..xmm7
; hold the tables passed to SSSE3_FilterHorizontalbw_8px.
; i_width == 9 runs .width9_yloop; any other value runs .width17_yloop.
; Both loops handle two rows per iteration.
; NOTE(review): the SSSE3_FilterHorizontalbw_* macros and MOVDQ are defined
; outside this section; MOVDQ is presumably a full 16-byte store.
2683%define p_src       r0
2684%define i_srcstride r1
2685%define p_dst       r2
2686%define i_dststride r3
2687%define i_width     r4
2688%define i_height    r5
2689    %assign  push_num 0
2690    INIT_X86_32_PIC r6
2691    LOAD_6_PARA
2692    PUSH_XMM 8
2693    SIGN_EXTENSION  r1, r1d
2694    SIGN_EXTENSION  r3, r3d
2695    SIGN_EXTENSION  r4, r4d
2696    SIGN_EXTENSION  r5, r5d
    ; Start two rows above the pointer that was passed in.
2697    sub             p_src, i_srcstride
2698    sub             p_src, i_srcstride
2699    pcmpeqw         xmm4, xmm4
2700    psllw           xmm4, 15                                ; dw -32768
2701    movdqa          xmm5, [pic(shufb_32435465768798A9)]
2702    movdqa          xmm6, [pic(shufb_011267784556ABBC)]
2703    movdqa          xmm7, [pic(maddubsw_p1m5_p1m5_m5p1_m5p1_128)]
2704    cmp             i_width, 9
2705    jne             .width17_yloop
2706
2707.width9_yloop:
2708    movdqu          xmm0, [p_src - 2]
    ; Keep an unfiltered copy of row 0 for the 2px step.
2709    movdqa          xmm3, xmm0
2710    SSSE3_FilterHorizontalbw_8px xmm0, xmm5, xmm6, xmm7, xmm1, xmm2
2711    movdqu          xmm2, [p_src + i_srcstride - 2]
2712    lea             p_src, [p_src + 2 * i_srcstride]
    ; xmm3 = high qword of row 0 : high qword of row 1.
2713    punpckhqdq      xmm3, xmm2
2714    SSSE3_FilterHorizontalbw_2px xmm3, xmm4, xmm1
    ; The 8-byte store at +10 covers bytes 10..17; the store of xmm0 at +0
    ; follows it, so bytes 16..17 are what remains from the first store.
2715    movlps          [p_dst + 10], xmm3
2716    MOVDQ           [p_dst], xmm0
2717    add             p_dst, i_dststride
    ; Second row: high qword of the 2px result first, then the 8px result.
2718    movhps          [p_dst + 10], xmm3
2719    SSSE3_FilterHorizontalbw_8px xmm2, xmm5, xmm6, xmm7, xmm1, xmm0
2720    MOVDQ           [p_dst], xmm2
2721    add             p_dst, i_dststride
2722    sub             i_height, 2
2723    jg              .width9_yloop
2724    POP_XMM
2725    LOAD_6_PARA_POP
2726    DEINIT_X86_32_PIC_KEEPDEF
2727    ret
2728
2729.width17_yloop:
    ; Each row is read as two loads, at p_src - 2 and p_src + 6.
2730    movdqu          xmm0, [p_src - 2]
2731    movdqu          xmm3, [p_src + 6]
2732    add             p_src, i_srcstride
2733    SSSE3_FilterHorizontalbw_8px xmm0, xmm5, xmm6, xmm7, xmm1, xmm2
2734    MOVDQ           [p_dst], xmm0
    ; Keep an unfiltered copy of the row-0 second load for the 2px step.
2735    movdqa          xmm0, xmm3
2736    SSSE3_FilterHorizontalbw_8px xmm3, xmm5, xmm6, xmm7, xmm1, xmm2
    ; p_src already points at row 1 here.
2737    movdqu          xmm2, [p_src + 6]
2738    punpckhqdq      xmm0, xmm2
2739    SSSE3_FilterHorizontalbw_2px xmm0, xmm4, xmm1
2740    movdqu          xmm1, [p_src - 2]
2741    add             p_src, i_srcstride
    ; The 8-byte store at +26 covers bytes 26..33; the store at +16 follows
    ; it, so bytes 32..33 are what remains from the first store.
2742    movlps          [p_dst + 26], xmm0
2743    MOVDQ           [p_dst + 16], xmm3
2744    add             p_dst, i_dststride
2745    movhps          [p_dst + 26], xmm0
2746    SSSE3_FilterHorizontalbw_8px xmm1, xmm5, xmm6, xmm7, xmm0, xmm3
2747    MOVDQ           [p_dst], xmm1
2748    SSSE3_FilterHorizontalbw_8px xmm2, xmm5, xmm6, xmm7, xmm0, xmm3
2749    MOVDQ           [p_dst + 16], xmm2
2750    add             p_dst, i_dststride
2751    sub             i_height, 2
2752    jg              .width17_yloop
2753    POP_XMM
2754    LOAD_6_PARA_POP
2755    DEINIT_X86_32_PIC
2756    ret
2757%undef p_src
2758%undef i_srcstride
2759%undef p_dst
2760%undef i_dststride
2761%undef i_width
2762%undef i_height
2763
2764
2765;***********************************************************************
2766; void McHorVer02WidthGe8S16ToU8_ssse3(const int16_t *pSrc,
2767;                                      int32_t iSrcStride,
2768;                                      uint8_t *pDst,
2769;                                      int32_t iDstStride,
2770;                                      int32_t iWidth,
2771;                                      int32_t iHeight);
2772;***********************************************************************
2773
2774WELS_EXTERN McHorVer02WidthGe8S16ToU8_ssse3
; Vertical filter from 16-bit input to 8-bit output for widths of 8 or more.
; Structure:
;   1. If i_width is odd, the last column is filtered on its own
;      (.height_loop_unalign / .filter5_unalign) and i_width is decremented.
;   2. .width_loop then walks the remaining width in strips of 8 columns
;      (p_src += 16 bytes, p_dst += 8 bytes per strip), filtering each strip
;      top to bottom in .height_loop.
; i_height - 1 is pushed once and reloaded from [r7] before each pass.
; With X86_32_PICASM, i_width is the stack argument (dword arg5) instead of r4,
; and r4 is handed to INIT_X86_32_PIC_NOPRESERVE.
; NOTE(review): r7 is presumably the stack-pointer alias from asm_inc.asm, and
; SSE2_FilterVerticalw_8px is defined outside this section - confirm both.
2775%define p_src        r0
2776%define i_srcstride  r1
2777%define p_dst        r2
2778%define i_dststride  r3
2779%ifdef X86_32_PICASM
2780%define i_width      dword arg5
2781%else
2782%define i_width      r4
2783%endif
2784%define i_height     r5
2785%define i_srcstride3 r6
2786    %assign  push_num 0
2787%ifdef X86_32
2788    push            r6
2789    %assign  push_num 1
2790%endif
2791    LOAD_6_PARA
2792    PUSH_XMM 8
2793    SIGN_EXTENSION  r1, r1d
2794    SIGN_EXTENSION  r3, r3d
2795    SIGN_EXTENSION  r4, r4d
2796    SIGN_EXTENSION  r5, r5d
2797    INIT_X86_32_PIC_NOPRESERVE r4
    ; Save (i_height - 1) on the stack; it is the per-pass row counter seed.
2798    sub             i_height, 1
2799    push            i_height
2800    %assign push_num push_num + 1
2801    lea             i_srcstride3, [3 * i_srcstride]
    ; Even width: go straight to the 8-column strips.
2802    test            i_width, 1
2803    jz              .width_loop
    ; Odd width: handle the last column first, with p_src/p_dst saved so the
    ; strip loop can restart from the left edge afterwards.
2804    push            p_src
2805    push            p_dst
2806    %assign push_num push_num + 2
    ; p_src -> last 16-bit column (p_src + 2 * i_width - 2); the PICASM form
    ; computes the same address with i_width as a memory operand.
2807%ifdef X86_32_PICASM
2808    add             p_src, i_width
2809    add             p_src, i_width
2810    sub             p_src, 2
2811%else
2812    lea             p_src, [p_src + 2 * i_width - 2]
2813%endif
    ; p_dst -> one byte past the last output column; stores below end at
    ; p_dst - 1.
2814    add             p_dst, i_width
    ; Gather the last-column word of consecutive source rows into xmm0.
2815    movd            xmm0, [p_src]
2816    punpcklwd       xmm0, [p_src + i_srcstride]
2817    movd            xmm1, [p_src + 2 * i_srcstride]
2818    add             p_src, i_srcstride3
2819    punpcklwd       xmm1, [p_src]
2820    punpckldq       xmm0, xmm1
2821    movd            xmm1, [p_src + i_srcstride]
    ; i_height holds (iHeight - 1) here, so this selects the 5-row case.
2822    cmp             i_height, 4
2823    je              .filter5_unalign
2824    punpcklwd       xmm1, [p_src + 2 * i_srcstride]
2825    movd            xmm2, [p_src + i_srcstride3]
2826    lea             p_src, [p_src + 4 * i_srcstride]
2827    punpcklwd       xmm2, [p_src]
2828    punpckldq       xmm1, xmm2
2829    punpcklqdq      xmm0, xmm1
    ; Each iteration builds xmm1..xmm5 as xmm0 slid down by one more row each
    ; (movd of the next row + palignr by 2 bytes), then filters 8 rows at once.
2830.height_loop_unalign:
2831    movd            xmm1, [p_src + i_srcstride]
2832    palignr         xmm1, xmm0, 2
2833    movd            xmm2, [p_src + 2 * i_srcstride]
2834    palignr         xmm2, xmm1, 2
2835    movd            xmm3, [p_src + i_srcstride3]
2836    palignr         xmm3, xmm2, 2
2837    lea             p_src, [p_src + 4 * i_srcstride]
2838    movd            xmm4, [p_src]
2839    palignr         xmm4, xmm3, 2
2840    movd            xmm5, [p_src + i_srcstride]
2841    palignr         xmm5, xmm4, 2
2842    SSE2_FilterVerticalw_8px xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm7
2843    packuswb        xmm0, xmm0
    ; Bytes 0..7 of xmm0 are 8 consecutive output rows of the one column.
    ; Each pslld moves the wanted byte to the top of its dword so that a store
    ; ending at p_dst - 1 puts it in the last column. The bytes written to the
    ; left of that column are expected to be rewritten by the strip loop.
2844    movdqa          xmm6, xmm0
2845    pslld           xmm6, 24
2846    movd            [p_dst - 4], xmm6
2847    movlps          [p_dst + 4 * i_dststride - 8], xmm6
2848    add             p_dst, i_dststride
2849    movdqa          xmm6, xmm0
2850    pslld           xmm6, 16
2851    movd            [p_dst - 4], xmm6
2852    movlps          [p_dst + 4 * i_dststride - 8], xmm6
2853    add             p_dst, i_dststride
2854    movdqa          xmm6, xmm0
2855    pslld           xmm6, 8
2856    movd            [p_dst - 4], xmm6
2857    movd            [p_dst + i_dststride - 4], xmm0
2858    lea             p_dst, [p_dst + 4 * i_dststride]
2859    movlps          [p_dst - 8], xmm6
2860    movlps          [p_dst + i_dststride - 8], xmm0
2861    lea             p_dst, [p_dst + 2 * i_dststride]
2862    sub             i_height, 8
2863    jle             .height_loop_unalign_exit
    ; More rows remain: rebuild xmm0 for the next 8 rows and loop.
2864    movd            xmm1, [p_src + 2 * i_srcstride]
2865    palignr         xmm1, xmm5, 2
2866    movd            xmm0, [p_src + i_srcstride3]
2867    lea             p_src, [p_src + 4 * i_srcstride]
2868    punpcklwd       xmm0, [p_src]
2869    palignr         xmm0, xmm1, 4
2870    jmp             .height_loop_unalign
    ; Final row of the odd column: one more filter step, stored so that its
    ; last byte lands at p_dst - 1.
2871.height_loop_unalign_exit:
2872    movddup         xmm6, [p_src + 2 * i_srcstride - 6]
2873    SSE2_FilterVerticalw_8px xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
2874    packuswb        xmm1, xmm1
2875    movlps          [p_dst - 8], xmm1
2876    jmp             .unalign_done
    ; 5-row case for the odd column. The gathered rows are moved to the high
    ; half of xmm0 so the sliding window leaves the results in the upper
    ; words; after packing, bytes 3..7 hold the five output rows.
2877.filter5_unalign:
2878    pslldq          xmm0, 8
2879    palignr         xmm1, xmm0, 2
2880    movd            xmm2, [p_src + 2 * i_srcstride]
2881    palignr         xmm2, xmm1, 2
2882    movd            xmm3, [p_src + i_srcstride3]
2883    lea             p_src, [p_src + 4 * i_srcstride]
2884    palignr         xmm3, xmm2, 2
2885    movd            xmm4, [p_src]
2886    palignr         xmm4, xmm3, 2
2887    movd            xmm5, [p_src + i_srcstride]
2888    palignr         xmm5, xmm4, 2
2889    movd            xmm6, [p_src + 2 * i_srcstride]
2890    palignr         xmm6, xmm5, 2
2891    SSE2_FilterVerticalw_8px xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
2892    packuswb        xmm1, xmm1
    ; Shifted copies bring rows 1..3 into byte 3 for the 4-byte stores; the
    ; final 8-byte store uses byte 7 of the unshifted copy for row 4.
2893    movdqa          xmm0, xmm1
2894    psrlq           xmm1,  8
2895    movdqa          xmm2, xmm0
2896    psrlq           xmm2, 16
2897    movdqa          xmm3, xmm0
2898    psrlq           xmm3, 24
2899    movd            [p_dst - 4], xmm0
2900    movd            [p_dst + i_dststride - 4], xmm1
2901    lea             p_dst, [p_dst + 2 * i_dststride]
2902    movd            [p_dst - 4], xmm2
2903    movd            [p_dst + i_dststride - 4], xmm3
2904    movlps          [p_dst + 2 * i_dststride - 8], xmm0
    ; Odd column finished: restore the pointers, reload the row counter from
    ; the stack and make the remaining width even.
2905.unalign_done:
2906    pop             p_dst
2907    pop             p_src
2908    %assign push_num push_num - 2
2909    mov             i_height, [r7]
2910    sub             i_width, 1
    ; One 8-column strip per pass; p_src/p_dst are saved so the next strip can
    ; start from the top again.
2911.width_loop:
2912    push            p_src
2913    push            p_dst
2914    %assign push_num push_num + 2
2915    movdqa          xmm0, [p_src]
2916    movdqa          xmm1, [p_src + i_srcstride]
2917    movdqa          xmm2, [p_src + 2 * i_srcstride]
2918    movdqa          xmm3, [p_src + i_srcstride3]
2919    lea             p_src, [p_src + 4 * i_srcstride]
2920    movdqa          xmm4, [p_src]
    ; Up to 8 output rows per iteration, using xmm0..xmm7 as a rolling window
    ; of source rows. Results are packed in pairs: the low qword goes to one
    ; destination row and the high qword to the next.
2921.height_loop:
2922    movdqa          xmm5, [p_src + i_srcstride]
2923    SSE2_FilterVerticalw_8px xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
2924    movdqa          xmm6, [p_src + 2 * i_srcstride]
2925    SSE2_FilterVerticalw_8px xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
2926    movdqa          xmm7, [p_src + i_srcstride3]
2927    lea             p_src, [p_src + 4 * i_srcstride]
2928    packuswb        xmm0, xmm1
2929    movlps          [p_dst], xmm0
2930    movhps          [p_dst + i_dststride], xmm0
2931    lea             p_dst, [p_dst + 2 * i_dststride]
2932    SSE2_FilterVerticalw_8px xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0
2933    movdqa          xmm0, [p_src]
2934    SSE2_FilterVerticalw_8px xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1
2935    packuswb        xmm2, xmm3
2936    movlps          [p_dst], xmm2
2937    movhps          [p_dst + i_dststride], xmm2
    ; Fewer than 4 left in the counter: this strip is complete.
2938    cmp             i_height, 4
2939    jl              .x_loop_dec
2940    lea             p_dst, [p_dst + 2 * i_dststride]
2941    movdqa          xmm1, [p_src + i_srcstride]
2942    SSE2_FilterVerticalw_8px xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2
    ; This je still tests the "cmp i_height, 4" above; the lea, movdqa and the
    ; filter macro in between are expected to leave EFLAGS untouched
    ; (NOTE(review): macro body not visible here - confirm).
2943    je              .store_xmm4_exit
2944    movdqa          xmm2, [p_src + 2 * i_srcstride]
2945    SSE2_FilterVerticalw_8px xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3
2946    movdqa          xmm3, [p_src + i_srcstride3]
2947    lea             p_src, [p_src + 4 * i_srcstride]
2948    packuswb        xmm4, xmm5
2949    movlps          [p_dst], xmm4
2950    movhps          [p_dst + i_dststride], xmm4
2951    lea             p_dst, [p_dst + 2 * i_dststride]
2952    SSE2_FilterVerticalw_8px xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4
2953    movdqa          xmm4, [p_src]
2954    SSE2_FilterVerticalw_8px xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
2955    packuswb        xmm6, xmm7
2956    movlps          [p_dst], xmm6
2957    movhps          [p_dst + i_dststride], xmm6
2958    lea             p_dst, [p_dst + 2 * i_dststride]
    ; > 0: another 8-row batch; < 0: strip done; == 0: one final row remains.
2959    sub             i_height, 8
2960    jg              .height_loop
2961    jl              .x_loop_dec
2962    movdqa          xmm5, [p_src + i_srcstride]
2963    SSE2_FilterVerticalw_8px xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
2964    packuswb        xmm0, xmm0
2965    movlps          [p_dst], xmm0
    ; Strip finished: restore the pointers, and if any width is left advance
    ; by 8 columns (16 source bytes, 8 destination bytes) with a fresh row
    ; counter.
2966.x_loop_dec:
2967    pop             p_dst
2968    pop             p_src
2969    %assign push_num push_num - 2
2970    sub             i_width, 8
2971    jle             .done
2972    mov             i_height, [r7]
2973    add             p_src, 16
2974    add             p_dst, 8
2975    jmp             .width_loop
    ; Counter was exactly 4: store the fifth row and leave.
    ; NOTE(review): this path falls into .done without visiting further column
    ; strips - presumably only reached when a single strip remains; confirm
    ; against callers.
2976.store_xmm4_exit:
2977    packuswb        xmm4, xmm4
2978    movlps          [p_dst], xmm4
2979    pop             p_dst
2980    pop             p_src
2981.done:
    ; Drop the saved (i_height - 1) slot, then unwind in reverse of the
    ; prologue.
2982    pop             i_height
2983    %assign push_num push_num - 1
2984    DEINIT_X86_32_PIC
2985    POP_XMM
2986    LOAD_6_PARA_POP
2987%ifdef X86_32
2988    pop             r6
2989%endif
2990    ret
2991%undef p_src
2992%undef i_srcstride
2993%undef p_dst
2994%undef i_dststride
2995%undef i_width
2996%undef i_height
2997%undef i_srcstride3
2998
2999
3000%ifdef HAVE_AVX2
3001
3002; pixels=%1 shufb_32435465768798A9=%2 shufb_011267784556ABBC=%3 maddubsw_p1m5_p1m5_m5p1_m5p1=%4 tmp=%5,%6
3003%macro AVX2_FilterHorizontalbw_16px 6
    ; Leaves the unrounded, unshifted 16-bit filter sums in %1; %5 and %6 are
    ; clobbered as temporaries. Reads the db20_256 constant via pic().
    ; %5 = pixels shuffled by mask %3, %1 = pixels shuffled by mask %2.
3004    vpshufb         %5, %1, %3
3005    vpshufb         %1, %1, %2
    ; %6 = %1 with the two dwords of every qword swapped.
3006    vpshufd         %6, %1, 10110001b
    ; Three byte-pair multiply-adds, then sum the word results into %1.
3007    vpmaddubsw      %1, %1, [pic(db20_256)]
3008    vpmaddubsw      %5, %5, %4
3009    vpmaddubsw      %6, %6, %4
3010    vpaddw          %1, %1, %5
3011    vpaddw          %1, %1, %6
3012%endmacro
3013
3014; pixels=%1 shufb_32435465768798A9=%2 shufb_011267784556ABBC=%3 maddubsw_p1m5_p1m5_m5p1_m5p1=%4 tmp=%5,%6
3015%macro AVX2_FilterHorizontal_16px 6
    ; Same as AVX2_FilterHorizontalbw_16px (all six arguments are forwarded
    ; unchanged), followed by adding the h264_w0x10_256 constant and an
    ; arithmetic right shift of every word by 5. Result is in %1.
3016    AVX2_FilterHorizontalbw_16px %1, %2, %3, %4, %5, %6
3017    vpaddw          %1, %1, [pic(h264_w0x10_256)]
3018    vpsraw          %1, %1, 5
3019%endmacro
3020
3021; px0=%1 px1=%2 shufb_32435465768798A9=%3 shufb_011267784556ABBC=%4 maddubsw_p1m5_p1m5_m5p1_m5p1=%5 tmp=%6,%7
3022%macro AVX2_FilterHorizontalbw_4x4px 7
    ; Two-input variant: both inputs are shuffled with both masks, then the
    ; low qwords of each 128-bit lane are combined so one filter pass covers
    ; both. Leaves the unrounded 16-bit sums in %1; %2, %6 and %7 are
    ; clobbered.
3023    vpshufb         %6, %1, %4
3024    vpshufb         %7, %2, %4
3025    vpshufb         %1, %1, %3
3026    vpshufb         %2, %2, %3
    ; %1 = low qwords of (%1, %2); %6 = low qwords of (%6, %7), per lane.
3027    vpunpcklqdq     %1, %1, %2
3028    vpunpcklqdq     %6, %6, %7
    ; %7 = %1 with the two dwords of every qword swapped.
3029    vpshufd         %7, %1, 10110001b
3030    vpmaddubsw      %1, %1, [pic(db20_256)]
3031    vpmaddubsw      %6, %6, %5
3032    vpmaddubsw      %7, %7, %5
3033    vpaddw          %1, %1, %6
3034    vpaddw          %1, %1, %7
3035%endmacro
3036
3037; px0=%1 px1=%2 shufb_32435465768798A9=%3 shufb_011267784556ABBC=%4 maddubsw_p1m5_p1m5_m5p1_m5p1=%5 tmp=%6,%7
3038%macro AVX2_FilterHorizontal_4x4px 7
    ; Same as AVX2_FilterHorizontalbw_4x4px (all seven arguments are forwarded
    ; unchanged), followed by adding the h264_w0x10_256 constant and an
    ; arithmetic right shift of every word by 5. Result is in %1.
3039    AVX2_FilterHorizontalbw_4x4px %1, %2, %3, %4, %5, %6, %7
3040    vpaddw          %1, %1, [pic(h264_w0x10_256)]
3041    vpsraw          %1, %1, 5
3042%endmacro
3043
3044; pixels=%1 -32768>>scale=%2 tmp=%3
3045%macro AVX2_FilterHorizontalbw_4px 3
    ; Byte-pair multiply-add of %1 with the
    ; maddubsw_m2p10_m40m40_p10m2_p0p0_256 table, then a word-pair
    ; multiply-add with %2 (register or memory operand).
3046    vpmaddubsw      %1, %1, [pic(maddubsw_m2p10_m40m40_p10m2_p0p0_256)]
3047    vpmaddwd        %1, %1, %2
    ; Add each dword to its neighbour within the qword; both dwords of every
    ; qword end up holding the same sum. %3 is clobbered.
3048    vpshufd         %3, %1, 10110001b
3049    vpaddd          %1, %1, %3
3050%endmacro
3051
3052; pixels=%1 tmp=%2
3053%macro AVX2_FilterHorizontal_4px 2
    ; AVX2_FilterHorizontalbw_4px with the dwm1024_256 table as its second
    ; operand, followed by a dword add of the dd32768_256 table. Result is in
    ; %1; %2 is the temporary.
3054    AVX2_FilterHorizontalbw_4px %1, [pic(dwm1024_256)], %2
3055    vpaddd          %1, %1, [pic(dd32768_256)]
3056%endmacro
3057
3058; px_ab=%1 px_cd=%2 px_ef=%3 maddubsw_ab=%4 maddubsw_cd=%5 maddubsw_ef=%6 tmp=%7
3059%macro AVX2_FilterVertical_16px 7
    ; %1 = (maddubs(%1,%4) + maddubs(%2,%5) + maddubs(%3,%6)
    ;       + h264_w0x10_256) >> 5   (word adds, arithmetic shift).
    ; %2 and %3 are read only; %7 is clobbered as the temporary.
3060    vpmaddubsw      %1, %1, %4
3061    vpmaddubsw      %7, %2, %5
3062    vpaddw          %1, %1, %7
3063    vpmaddubsw      %7, %3, %6
3064    vpaddw          %1, %1, %7
3065    vpaddw          %1, %1, [pic(h264_w0x10_256)]
3066    vpsraw          %1, %1, 5
3067%endmacro
3068
3069; px_a=%1 px_f=%2 px_bc=%3 px_de=%4 maddubsw_bc=%5 maddubsw_de=%6 tmp=%7,%8
3070%macro AVX2_FilterVertical2_16px 8
    ; Variant where the two outer taps (%1, %2) arrive as separate byte
    ; vectors and the inner taps arrive interleaved in pairs (%3, %4).
    ; Result is in %1; %7 and %8 are clobbered, %2..%4 are read only.
    ; Zero-extend the low bytes of each lane of %1 and %2 to words and add.
3071    vpxor           %7, %7, %7
3072    vpunpcklbw      %1, %1, %7
3073    vpunpcklbw      %8, %2, %7
3074    vpaddw          %1, %1, %8
    ; Add the byte-pair multiply-adds of the inner taps.
3075    vpmaddubsw      %7, %3, %5
3076    vpaddw          %1, %1, %7
3077    vpmaddubsw      %7, %4, %6
3078    vpaddw          %1, %1, %7
    ; Add h264_w0x10_256 and shift each word right arithmetically by 5.
3079    vpaddw          %1, %1, [pic(h264_w0x10_256)]
3080    vpsraw          %1, %1, 5
3081%endmacro
3082
3083; px0=%1 px1=%2 px2=%3 px3=%4 px4=%5 px5=%6 tmp=%7
3084%macro AVX2_FilterVerticalw_16px 7
    ; Six-row vertical filter on 16-bit words, evaluated in stages with
    ; intermediate arithmetic shifts. Result is in %1; %7 is clobbered;
    ; %2..%6 are read only.
    ; %1 = px0 + px5, %7 = px1 + px4.
3085    vpaddw          %1, %1, %6
3086    vpaddw          %7, %2, %5
    ; %1 = (((px0 + px5) - (px1 + px4)) >> 2) - (px1 + px4)
3087    vpsubw          %1, %1, %7
3088    vpsraw          %1, %1, 2
3089    vpsubw          %1, %1, %7
    ; %7 = px2 + px3; %1 = (%1 + %7) >> 2
3090    vpaddw          %7, %3, %4
3091    vpaddw          %1, %1, %7
3092    vpsraw          %1, %1, 2
    ; %1 = (%1 + (px2 + px3) + dw32_256) >> 6
3093    vpaddw          %7, %7, [pic(dw32_256)]
3094    vpaddw          %1, %1, %7
3095    vpsraw          %1, %1, 6
3096%endmacro
3097
3098;***********************************************************************
3099; void McHorVer02_avx2(const uint8_t *pSrc,
3100;                      int32_t iSrcStride,
3101;                      uint8_t *pDst,
3102;                      int32_t iDstStride,
3103;                      int32_t iWidth,
3104;                      int32_t iHeight)
3105;***********************************************************************
; Vertical filter over a block of iWidth x iHeight bytes.
; pSrc is moved up by two rows on entry, so output row y is computed from the
; six source rows y-2 .. y+3 through AVX2_FilterVertical_16px /
; AVX2_FilterVertical2_16px.
; NOTE(review): presumably the H.264 vertical half-sample luma filter; the
; coefficient tables (maddubsw_*, db20_*) live outside this excerpt -- confirm.
;
; Width dispatch: iWidth == 8 -> .width8, iWidth > 8 -> .width16, anything
; else falls through to the 4-pixel path.
; Height handling differs per path (see the comments at each path).
; The .width16 path stores with vmovdqa, which requires a 16-byte aligned
; address; the other paths use 4- or 8-byte stores.
; LOAD_6_PARA / PUSH_XMM / SIGN_EXTENSION / *_X86_32_PIC* are prologue and
; epilogue helpers defined outside this excerpt (presumably asm_inc.asm).
3106
3107WELS_EXTERN McHorVer02_avx2
3108%define p_src         r0
3109%define i_srcstride   r1
3110%define p_dst         r2
3111%define i_dststride   r3
3112%ifdef X86_32_PICASM
3113%define i_width       dword arg5
3114%else
3115%define i_width       r4
3116%endif
3117%define i_height      r5
3118%define i_srcstride3  r6
3119    %assign push_num 0
3120%ifdef X86_32
3121    push            r6
3122    %assign push_num 1
3123%endif
3124    LOAD_6_PARA
3125    PUSH_XMM 8
3126    SIGN_EXTENSION  r1, r1d
3127    SIGN_EXTENSION  r3, r3d
3128    SIGN_EXTENSION  r4, r4d
3129    SIGN_EXTENSION  r5, r5d
3130    INIT_X86_32_PIC_NOPRESERVE r4
    ; p_src -= 2 * stride: the filter window starts two rows above the output.
3131    sub             p_src, i_srcstride
3132    sub             p_src, i_srcstride
3133    lea             i_srcstride3, [3 * i_srcstride]
3134    cmp             i_width, 8
3135    je              .width8
3136    jg              .width16
    ; ---- 4-pixel path -------------------------------------------------
    ; Adjacent rows are byte-interleaved (4 px -> 8 bytes) and four such
    ; pairs are packed into the four 64-bit quarters of one ymm register:
    ;   ymm0 = [r0r1 | r1r2 | r2r3 | r3r4]
    ;   ymm1 = [r2r3 | r3r4 | r4r5 | r5r6]
    ;   ymm2 = [r4r5 | r5r6 | r6r7 | r7r8]
    ; so a single filter invocation yields four output rows, one per quarter.
    ; Rows are loaded with vpbroadcastd so the same pair is available in
    ; every quarter before the blends pick the wanted one.
3137; .width4:
3138    vmovd           xmm0, [p_src]
3139    vpbroadcastd    xmm5, [p_src + i_srcstride]
3140    vpunpcklbw      xmm0, xmm0, xmm5
3141    vpbroadcastd    ymm1, [p_src + 2 * i_srcstride]
3142    vpunpcklbw      xmm5, xmm5, xmm1
3143    vpblendd        xmm0, xmm0, xmm5, 1100b
3144    vpbroadcastd    ymm5, [p_src + i_srcstride3]
3145    lea             p_src, [p_src + 4 * i_srcstride]
3146    vpunpcklbw      ymm1, ymm1, ymm5
3147    vpbroadcastd    ymm2, [p_src]
3148    vpunpcklbw      ymm5, ymm5, ymm2
3149    vpblendd        ymm1, ymm1, ymm5, 11001100b
3150    vpblendd        ymm0, ymm0, ymm1, 11110000b
3151    vpbroadcastd    ymm5, [p_src + i_srcstride]
3152    lea             p_src, [p_src + 2 * i_srcstride]
3153    vpunpcklbw      ymm2, ymm2, ymm5
3154    vpbroadcastd    ymm3, [p_src]
3155    vpunpcklbw      ymm5, ymm5, ymm3
3156    vpblendd        ymm2, ymm2, ymm5, 11001100b
3157    vpblendd        ymm1, ymm1, ymm2, 11110000b
3158    vpbroadcastd    ymm5, [p_src + i_srcstride]
3159    vpunpcklbw      ymm3, ymm3, ymm5
3160    vpbroadcastd    ymm4, [p_src + 2 * i_srcstride]
3161    vpunpcklbw      ymm5, ymm5, ymm4
3162    vpblendd        ymm3, ymm3, ymm5, 11001100b
3163    vpblendd        ymm2, ymm2, ymm3, 11110000b
    ; ymm6 keeps the middle-pair coefficients for every filter call on this path.
3164    vbroadcasti128  ymm6, [pic(db20_128)]
3165    AVX2_FilterVertical_16px ymm0, ymm1, ymm2, [pic(maddubsw_p1m5_256)], ymm6, [pic(maddubsw_m5p1_256)], ymm5
    ; Pack to bytes and store output rows 0..3, four bytes each.
3166    vpackuswb       ymm0, ymm0, ymm0
3167    vmovd           [p_dst], xmm0
3168    vpsrlq          xmm5, xmm0, 32
3169    vmovd           [p_dst + i_dststride], xmm5
3170    lea             p_dst, [p_dst + 2 * i_dststride]
3171    vextracti128    xmm0, ymm0, 1
3172    vmovd           [p_dst], xmm0
3173    vpsrlq          xmm5, xmm0, 32
3174    vmovd           [p_dst + i_dststride], xmm5
    ; iHeight < 5: finished. iHeight == 5: one extra row below.
    ; iHeight > 5: continue at .width4_height_ge8.
    ; lea and the vector instructions do not modify EFLAGS, so the jg further
    ; down still tests this cmp.
3175    cmp             i_height, 5
3176    jl              .width4_done
3177    lea             p_dst, [p_dst + 2 * i_dststride]
3178    vpbroadcastd    ymm5, [p_src + i_srcstride3]
3179    vpunpcklbw      ymm4, ymm4, ymm5
3180    jg              .width4_height_ge8
    ; Single remaining row (output row 4); only the low quarter is stored.
3181    AVX2_FilterVertical_16px xmm2, xmm3, xmm4, [pic(maddubsw_p1m5_256)], xmm6, [pic(maddubsw_m5p1_256)], xmm5
3182    vpackuswb       xmm2, xmm2, xmm2
3183    vmovd           [p_dst], xmm2
3184    jmp             .width4_done
3185.width4_height_ge8:
    ; Extend ymm3 / ymm4 with the next row pairs and produce output rows 4..7.
3186    lea             p_src, [p_src + 4 * i_srcstride]
3187    vpbroadcastd    ymm1, [p_src]
3188    vpunpcklbw      ymm5, ymm5, ymm1
3189    vpblendd        ymm4, ymm4, ymm5, 11001100b
3190    vpblendd        ymm3, ymm3, ymm4, 11110000b
3191    vpbroadcastd    ymm5, [p_src + i_srcstride]
    ; Two-operand form: the destination ymm1 is also the first source.
3192    vpunpcklbw      ymm1, ymm5
3193    vpbroadcastd    ymm0, [p_src + 2 * i_srcstride]
3194    vpunpcklbw      ymm5, ymm5, ymm0
3195    vpblendd        ymm1, ymm1, ymm5, 11001100b
3196    vpblendd        ymm4, ymm4, ymm1, 11110000b
3197    AVX2_FilterVertical_16px ymm2, ymm3, ymm4, [pic(maddubsw_p1m5_256)], ymm6, [pic(maddubsw_m5p1_256)], ymm5
3198    vpackuswb       ymm2, ymm2, ymm2
3199    vmovd           [p_dst], xmm2
3200    vpsrlq          xmm5, xmm2, 32
3201    vmovd           [p_dst + i_dststride], xmm5
3202    lea             p_dst, [p_dst + 2 * i_dststride]
3203    vextracti128    xmm2, ymm2, 1
3204    vmovd           [p_dst], xmm2
3205    vpsrlq          xmm5, xmm2, 32
3206    vmovd           [p_dst + i_dststride], xmm5
    ; iHeight < 9: finished; otherwise emit one more row (output row 8).
3207    cmp             i_height, 9
3208    jl              .width4_done
3209    lea             p_dst, [p_dst + 2 * i_dststride]
3210    vmovd           xmm5, [p_src + i_srcstride3]
3211    vpunpcklbw      xmm0, xmm0, xmm5
3212    AVX2_FilterVertical_16px xmm4, xmm1, xmm0, [pic(maddubsw_p1m5_256)], xmm6, [pic(maddubsw_m5p1_256)], xmm5
3213    vpackuswb       xmm4, xmm4, xmm4
3214    vmovd           [p_dst], xmm4
3215.width4_done:
3216    vzeroupper
3217    DEINIT_X86_32_PIC_KEEPDEF
3218    POP_XMM
3219    LOAD_6_PARA_POP
3220%ifdef X86_32
3221    pop             r6
3222%endif
3223    ret
3224
    ; ---- 8-pixel path -------------------------------------------------
    ; Each ymm holds two interleaved row pairs, one per 128-bit lane:
    ;   ymm0 = [r0r1 | r1r2], ymm1 = [r2r3 | r3r4], xmm6 = last row loaded.
    ; i_height is decremented once up front so the comparisons below can
    ; tell "exactly one row left" (== 0 / == 4) from "nothing left" (< 0 / < 4).
3225.width8:
3226    sub             i_height, 1
3227    vmovq           xmm0, [p_src]
3228    vmovq           xmm4, [p_src + i_srcstride]
3229    vpunpcklbw      xmm0, xmm0, xmm4
3230    vmovq           xmm1, [p_src + 2 * i_srcstride]
3231    vpunpcklbw      xmm4, xmm4, xmm1
3232    vinserti128     ymm0, ymm0, xmm4, 1
3233    vmovq           xmm4, [p_src + i_srcstride3]
3234    lea             p_src, [p_src + 4 * i_srcstride]
3235    vpunpcklbw      xmm1, xmm1, xmm4
3236    vmovq           xmm6, [p_src]
3237    vpunpcklbw      xmm4, xmm4, xmm6
3238    vinserti128     ymm1, ymm1, xmm4, 1
    ; One loop iteration emits up to eight output rows (two groups of four).
3239.width8_yloop:
3240    vmovq           xmm4, [p_src + i_srcstride]
3241    vpunpcklbw      xmm2, xmm6, xmm4
3242    vmovq           xmm3, [p_src + 2 * i_srcstride]
3243    vpunpcklbw      xmm4, xmm4, xmm3
3244    vinserti128     ymm2, ymm2, xmm4, 1
3245    vbroadcasti128  ymm5, [pic(db20_128)]
3246    AVX2_FilterVertical_16px ymm0, ymm1, ymm2, [pic(maddubsw_p1m5_256)], ymm5, [pic(maddubsw_m5p1_256)], ymm4
3247    vmovq           xmm4, [p_src + i_srcstride3]
3248    lea             p_src, [p_src + 4 * i_srcstride]
3249    vpunpcklbw      xmm3, xmm3, xmm4
3250    vmovq           xmm6, [p_src]
3251    vpunpcklbw      xmm4, xmm4, xmm6
3252    vinserti128     ymm3, ymm3, xmm4, 1
3253    AVX2_FilterVertical_16px ymm1, ymm2, ymm3, [pic(maddubsw_p1m5_256)], ymm5, [pic(maddubsw_m5p1_256)], ymm4
    ; After the pack: lane 0 = [row0 | row2], lane 1 = [row1 | row3].
3254    vpackuswb       ymm0, ymm0, ymm1
3255    vmovlps         [p_dst], xmm0
3256    vextracti128    xmm1, ymm0, 1
3257    vmovlps         [p_dst + i_dststride], xmm1
3258    lea             p_dst, [p_dst + 2 * i_dststride]
3259    vmovhps         [p_dst], xmm0
3260    vmovhps         [p_dst + i_dststride], xmm1
    ; i_height here is (rows remaining before this group) - 1.
    ; < 4: done; == 4: exactly one more row; > 4: another group of four.
    ; The jg below relies on the flags of this cmp (nothing in between
    ; modifies EFLAGS).
3261    cmp             i_height, 4
3262    jl              .width8_done
3263    lea             p_dst, [p_dst + 2 * i_dststride]
3264    vmovq           xmm4, [p_src + i_srcstride]
3265    vpunpcklbw      xmm0, xmm6, xmm4
3266    jg              .width8_height_ge8
3267    AVX2_FilterVertical_16px xmm2, xmm3, xmm0, [pic(maddubsw_p1m5_256)], xmm5, [pic(maddubsw_m5p1_256)], xmm4
3268    vpackuswb       xmm2, xmm2, xmm2
3269    vmovlps         [p_dst], xmm2
3270    jmp             .width8_done
3271.width8_height_ge8:
3272    vmovq           xmm1, [p_src + 2 * i_srcstride]
3273    vpunpcklbw      xmm4, xmm4, xmm1
3274    vinserti128     ymm0, ymm0, xmm4, 1
3275    AVX2_FilterVertical_16px ymm2, ymm3, ymm0, [pic(maddubsw_p1m5_256)], ymm5, [pic(maddubsw_m5p1_256)], ymm4
3276    vmovq           xmm4, [p_src + i_srcstride3]
3277    lea             p_src, [p_src + 4 * i_srcstride]
3278    vpunpcklbw      xmm1, xmm1, xmm4
3279    vmovq           xmm6, [p_src]
3280    vpunpcklbw      xmm4, xmm4, xmm6
3281    vinserti128     ymm1, ymm1, xmm4, 1
3282    AVX2_FilterVertical_16px ymm3, ymm0, ymm1, [pic(maddubsw_p1m5_256)], ymm5, [pic(maddubsw_m5p1_256)], ymm4
3283    vpackuswb       ymm2, ymm2, ymm3
3284    vmovlps         [p_dst], xmm2
3285    vextracti128    xmm3, ymm2, 1
3286    vmovlps         [p_dst + i_dststride], xmm3
3287    lea             p_dst, [p_dst + 2 * i_dststride]
3288    vmovhps         [p_dst], xmm2
3289    vmovhps         [p_dst + i_dststride], xmm3
3290    lea             p_dst, [p_dst + 2 * i_dststride]
    ; Eight rows consumed. > 0: loop again; < 0: done; == 0: one last row.
    ; At this point ymm0 / ymm1 / xmm6 hold the state the loop head expects.
3291    sub             i_height, 8
3292    jg              .width8_yloop
3293    jl              .width8_done
3294    vmovq           xmm4, [p_src + i_srcstride]
3295    vpunpcklbw      xmm2, xmm6, xmm4
3296    AVX2_FilterVertical_16px xmm0, xmm1, xmm2, [pic(maddubsw_p1m5_256)], xmm5, [pic(maddubsw_m5p1_256)], xmm4
3297    vpackuswb       xmm0, xmm0, xmm0
3298    vmovlps         [p_dst], xmm0
3299.width8_done:
3300    vzeroupper
3301    DEINIT_X86_32_PIC_KEEPDEF
3302    POP_XMM
3303    LOAD_6_PARA_POP
3304%ifdef X86_32
3305    pop             r6
3306%endif
3307    ret
3308
    ; ---- 16-pixel path ------------------------------------------------
    ; A row is split across lanes: lane 0 gets pixels 0..7 (vmovq), lane 1
    ; gets pixels 8..15 (vpbroadcastq + blend of the upper four dwords).
    ; The main loop emits eight rows per iteration. i_height is decremented
    ; once; if the result is even (odd iHeight) a single leading row is
    ; produced first, otherwise control goes to .width16_yloop_begin_even.
3309.width16:
3310    sub             i_height, 1
3311    test            i_height, 1
3312    jnz             .width16_yloop_begin_even
3313    vmovq           xmm0, [p_src]
3314    vpbroadcastq    ymm1, [p_src + 8]
3315    vpblendd        ymm0, ymm0, ymm1, 11110000b
3316    vmovq           xmm1, [p_src + i_srcstride]
3317    vpbroadcastq    ymm2, [p_src + i_srcstride + 8]
3318    vpblendd        ymm1, ymm1, ymm2, 11110000b
3319    vpunpcklbw      ymm0, ymm0, ymm1
3320    vmovq           xmm2, [p_src + 2 * i_srcstride]
3321    vpbroadcastq    ymm3, [p_src + 2 * i_srcstride + 8]
3322    vpblendd        ymm2, ymm2, ymm3, 11110000b
3323    vmovq           xmm3, [p_src + i_srcstride3]
3324    vpbroadcastq    ymm4, [p_src + i_srcstride3 + 8]
3325    lea             p_src, [p_src + 4 * i_srcstride]
3326    vpblendd        ymm3, ymm3, ymm4, 11110000b
3327    vpunpcklbw      ymm2, ymm2, ymm3
3328    vmovq           xmm4, [p_src]
3329    vpbroadcastq    ymm5, [p_src + 8]
3330    vpblendd        ymm4, ymm4, ymm5, 11110000b
3331    vmovq           xmm5, [p_src + i_srcstride]
3332    vpbroadcastq    ymm6, [p_src + i_srcstride + 8]
3333    lea             p_src, [p_src + 2 * i_srcstride]
3334    vpblendd        ymm5, ymm5, ymm6, 11110000b
3335    vpunpcklbw      ymm4, ymm4, ymm5
3336    AVX2_FilterVertical_16px ymm0, ymm2, ymm4, [pic(maddubsw_p1m5_256)], [pic(db20_256)], [pic(maddubsw_m5p1_256)], ymm7
3337    vpackuswb       ymm0, ymm0, ymm0
    ; Gather qword 0 and qword 2 (the two 8-pixel halves) into the low xmm.
3338    vpermq          ymm0, ymm0, 1000b
3339    vmovdqa         [p_dst], xmm0
3340    add             p_dst, i_dststride
    ; Loop entry state: ymm1 = plain row, ymm2 and ymm4 = interleaved pairs
    ; of the following four rows, p_src at the next unread row.
3341    jmp             .width16_yloop
3342.width16_yloop_begin_even:
    ; Same entry state as above, built without emitting a leading row.
3343    vmovq           xmm1, [p_src]
3344    vpbroadcastq    ymm2, [p_src + 8]
3345    vpblendd        ymm1, ymm1, ymm2, 11110000b
3346    vmovq           xmm2, [p_src + i_srcstride]
3347    vpbroadcastq    ymm3, [p_src + i_srcstride + 8]
3348    vpblendd        ymm2, ymm2, ymm3, 11110000b
3349    vmovq           xmm3, [p_src + 2 * i_srcstride]
3350    vpbroadcastq    ymm4, [p_src + 2 * i_srcstride + 8]
3351    add             p_src, i_srcstride3
3352    vpblendd        ymm3, ymm3, ymm4, 11110000b
3353    vpunpcklbw      ymm2, ymm2, ymm3
3354    vmovq           xmm4, [p_src]
3355    vpbroadcastq    ymm5, [p_src + 8]
3356    vpblendd        ymm4, ymm4, ymm5, 11110000b
3357    vmovq           xmm5, [p_src + i_srcstride]
3358    vpbroadcastq    ymm6, [p_src + i_srcstride + 8]
3359    lea             p_src, [p_src + 2 * i_srcstride]
3360    vpblendd        ymm5, ymm5, ymm6, 11110000b
3361    vpunpcklbw      ymm4, ymm4, ymm5
    ; Each of the four stanzas below emits two output rows: the first via
    ; AVX2_FilterVertical2_16px (outer rows as plain bytes), the second via
    ; AVX2_FilterVertical_16px (all rows as interleaved pairs). Registers
    ; rotate between stanzas so that the loop tail matches the loop head.
3362.width16_yloop:
3363    vmovq           xmm6, [p_src]
3364    vpbroadcastq    ymm7, [p_src + 8]
3365    vpblendd        ymm6, ymm6, ymm7, 11110000b
3366    AVX2_FilterVertical2_16px ymm1, ymm6, ymm2, ymm4, [pic(maddubsw_m5p20_256)], [pic(maddubsw_p20m5_256)], ymm0, ymm7
3367    vmovq           xmm7, [p_src + i_srcstride]
3368    vpbroadcastq    ymm0, [p_src + i_srcstride + 8]
3369    vpblendd        ymm7, ymm7, ymm0, 11110000b
3370    vpunpcklbw      ymm6, ymm6, ymm7
3371    AVX2_FilterVertical_16px ymm2, ymm4, ymm6, [pic(maddubsw_p1m5_256)], [pic(db20_256)], [pic(maddubsw_m5p1_256)], ymm0
3372    vpackuswb       ymm1, ymm1, ymm2
    ; Reorder qwords 0,2,1,3 so the low xmm is one full row and the high
    ; xmm is the next one.
3373    vpermq          ymm1, ymm1, 11011000b
3374    vmovdqa         [p_dst], xmm1
3375    vextracti128    [p_dst + i_dststride], ymm1, 1
3376    lea             p_dst, [p_dst + 2 * i_dststride]
3377    vmovq           xmm0, [p_src + 2 * i_srcstride]
3378    vpbroadcastq    ymm1, [p_src + 2 * i_srcstride + 8]
3379    vpblendd        ymm0, ymm0, ymm1, 11110000b
3380    AVX2_FilterVertical2_16px ymm3, ymm0, ymm4, ymm6, [pic(maddubsw_m5p20_256)], [pic(maddubsw_p20m5_256)], ymm2, ymm1
3381    vmovq           xmm1, [p_src + i_srcstride3]
3382    vpbroadcastq    ymm2, [p_src + i_srcstride3 + 8]
3383    lea             p_src, [p_src + 4 * i_srcstride]
3384    vpblendd        ymm1, ymm1, ymm2, 11110000b
3385    vpunpcklbw      ymm0, ymm0, ymm1
3386    AVX2_FilterVertical_16px ymm4, ymm6, ymm0, [pic(maddubsw_p1m5_256)], [pic(db20_256)], [pic(maddubsw_m5p1_256)], ymm2
3387    vpackuswb       ymm3, ymm3, ymm4
3388    vpermq          ymm3, ymm3, 11011000b
3389    vmovdqa         [p_dst], xmm3
3390    vextracti128    [p_dst + i_dststride], ymm3, 1
3391    lea             p_dst, [p_dst + 2 * i_dststride]
3392    vmovq           xmm2, [p_src]
3393    vpbroadcastq    ymm3, [p_src + 8]
3394    vpblendd        ymm2, ymm2, ymm3, 11110000b
3395    AVX2_FilterVertical2_16px ymm5, ymm2, ymm6, ymm0, [pic(maddubsw_m5p20_256)], [pic(maddubsw_p20m5_256)], ymm4, ymm3
3396    vmovq           xmm3, [p_src + i_srcstride]
3397    vpbroadcastq    ymm4, [p_src + i_srcstride + 8]
3398    vpblendd        ymm3, ymm3, ymm4, 11110000b
3399    vpunpcklbw      ymm2, ymm2, ymm3
3400    AVX2_FilterVertical_16px ymm6, ymm0, ymm2, [pic(maddubsw_p1m5_256)], [pic(db20_256)], [pic(maddubsw_m5p1_256)], ymm4
3401    vpackuswb       ymm5, ymm5, ymm6
3402    vpermq          ymm5, ymm5, 11011000b
3403    vmovdqa         [p_dst], xmm5
3404    vextracti128    [p_dst + i_dststride], ymm5, 1
3405    lea             p_dst, [p_dst + 2 * i_dststride]
3406    vmovq           xmm4, [p_src + 2 * i_srcstride]
3407    vpbroadcastq    ymm5, [p_src + 2 * i_srcstride + 8]
3408    vpblendd        ymm4, ymm4, ymm5, 11110000b
3409    AVX2_FilterVertical2_16px ymm7, ymm4, ymm0, ymm2, [pic(maddubsw_m5p20_256)], [pic(maddubsw_p20m5_256)], ymm6, ymm5
3410    vmovq           xmm5, [p_src + i_srcstride3]
3411    vpbroadcastq    ymm6, [p_src + i_srcstride3 + 8]
3412    lea             p_src, [p_src + 4 * i_srcstride]
3413    vpblendd        ymm5, ymm5, ymm6, 11110000b
3414    vpunpcklbw      ymm4, ymm4, ymm5
3415    AVX2_FilterVertical_16px ymm0, ymm2, ymm4, [pic(maddubsw_p1m5_256)], [pic(db20_256)], [pic(maddubsw_m5p1_256)], ymm6
3416    vpackuswb       ymm7, ymm7, ymm0
3417    vpermq          ymm7, ymm7, 11011000b
3418    vmovdqa         [p_dst], xmm7
3419    vextracti128    [p_dst + i_dststride], ymm7, 1
3420    lea             p_dst, [p_dst + 2 * i_dststride]
    ; Eight rows written per iteration.
3421    sub             i_height, 8
3422    jg              .width16_yloop
3423    vzeroupper
3424    DEINIT_X86_32_PIC
3425    POP_XMM
3426    LOAD_6_PARA_POP
3427%ifdef X86_32
3428    pop             r6
3429%endif
3430    ret
3431%undef p_src
3432%undef i_srcstride
3433%undef i_srcstride3
3434%undef p_dst
3435%undef i_dststride
3436%undef i_width
3437%undef i_height
    ; NOTE(review): i_ycnt is never defined inside this function; this %undef
    ; looks like a leftover -- confirm it is not cleaning up an earlier define.
3438%undef i_ycnt
3439
3440
3441;*******************************************************************************
3442; void McHorVer20_avx2(const uint8_t *pSrc,
3443;                      int iSrcStride,
3444;                      uint8_t *pDst,
3445;                      int iDstStride,
3446;                      int iWidth,
3447;                      int iHeight);
3448;*******************************************************************************
; Horizontal filter over a block of iWidth x iHeight bytes. Every source row
; is read starting two bytes to the left of the output position (p_src - 2).
; The per-row arithmetic is done by AVX2_FilterHorizontal_4x4px and
; AVX2_FilterHorizontal_16px, which are defined outside this excerpt.
; NOTE(review): presumably the H.264 horizontal half-sample luma filter --
; confirm against those macros.
;
; Width dispatch: iWidth == 8 -> .width8, iWidth > 8 -> .width16_yloop,
; otherwise the 4-pixel loop. Rows per iteration: 4 (width 4), 4 (width 8),
; 2 (width 16); each loop runs while the remaining height is > 0 after
; subtracting that amount.
; The 16-pixel loop stores with vmovdqa, which requires a 16-byte aligned
; address; the loads are unaligned 16-byte loads on every path.
3449
3450WELS_EXTERN McHorVer20_avx2
3451%define p_src        r0
3452%define i_srcstride  r1
3453%define p_dst        r2
3454%define i_dststride  r3
3455%define i_width      r4
3456%define i_height     r5
3457    %assign  push_num 0
3458    INIT_X86_32_PIC r6
3459    LOAD_6_PARA
3460    PUSH_XMM 7
3461    SIGN_EXTENSION  r1, r1d
3462    SIGN_EXTENSION  r3, r3d
3463    SIGN_EXTENSION  r4, r4d
3464    SIGN_EXTENSION  r5, r5d
    ; ymm4..ymm6 hold the two shuffle masks and the coefficient vector that
    ; are passed unchanged to every filter macro invocation below.
3465    vbroadcasti128  ymm4, [pic(shufb_32435465768798A9)]
3466    vbroadcasti128  ymm5, [pic(shufb_011267784556ABBC)]
3467    vbroadcasti128  ymm6, [pic(maddubsw_p1m5_p1m5_m5p1_m5p1_128)]
3468    cmp             i_width, 8
3469    je              .width8
3470    jg              .width16_yloop
    ; i_width is not needed after the dispatch, so its register is renamed to
    ; i_srcstride3. This is a preprocessor-level rename; it also applies to the
    ; .width8 code further down, independent of the run-time path taken.
3471%xdefine i_srcstride3 i_width
3472%undef i_width
3473    lea             i_srcstride3, [3 * i_srcstride]
    ; Width 4: rows 0/2 go to the two lanes of ymm0, rows 1/3 to ymm1.
3474.width4_yloop:
3475    vmovdqu         xmm0, [p_src - 2]
3476    vmovdqu         xmm1, [p_src + i_srcstride - 2]
3477    vinserti128     ymm0, ymm0, [p_src + 2 * i_srcstride - 2], 1
3478    vinserti128     ymm1, ymm1, [p_src + i_srcstride3 - 2], 1
3479    lea             p_src, [p_src + 4 * i_srcstride]
3480    AVX2_FilterHorizontal_4x4px ymm0, ymm1, ymm4, ymm5, ymm6, ymm2, ymm3
3481    vpackuswb       ymm0, ymm0, ymm0
    ; Four 4-byte stores, one per output row.
3482    vmovd           [p_dst], xmm0
3483    vpsrlq          xmm1, xmm0, 32
3484    vmovd           [p_dst + i_dststride], xmm1
3485    lea             p_dst, [p_dst + 2 * i_dststride]
3486    vextracti128    xmm0, ymm0, 1
3487    vmovd           [p_dst], xmm0
3488    vpsrlq          xmm1, xmm0, 32
3489    vmovd           [p_dst + i_dststride], xmm1
3490    lea             p_dst, [p_dst + 2 * i_dststride]
3491    sub             i_height, 4
3492    jg              .width4_yloop
3493    vzeroupper
3494    POP_XMM
3495    LOAD_6_PARA_POP
3496    DEINIT_X86_32_PIC_KEEPDEF
3497    ret
3498.width8:
3499    lea             i_srcstride3, [3 * i_srcstride]
    ; Width 8: same row-to-lane layout as width 4, filtered with the 16px macro.
3500.width8_yloop:
3501    vmovdqu         xmm0, [p_src - 2]
3502    vmovdqu         xmm1, [p_src + i_srcstride - 2]
3503    vinserti128     ymm0, ymm0, [p_src + 2 * i_srcstride - 2], 1
3504    vinserti128     ymm1, ymm1, [p_src + i_srcstride3 - 2], 1
3505    lea             p_src, [p_src + 4 * i_srcstride]
3506    AVX2_FilterHorizontal_16px ymm0, ymm4, ymm5, ymm6, ymm2, ymm3
3507    AVX2_FilterHorizontal_16px ymm1, ymm4, ymm5, ymm6, ymm2, ymm3
    ; After the pack each lane holds two rows: low half from ymm0 (rows 0/2),
    ; high half from ymm1 (rows 1/3). Four 8-byte stores follow.
3508    vpackuswb       ymm0, ymm0, ymm1
3509    vmovlps         [p_dst], xmm0
3510    vmovhps         [p_dst + i_dststride], xmm0
3511    lea             p_dst, [p_dst + 2 * i_dststride]
3512    vextracti128    xmm0, ymm0, 1
3513    vmovlps         [p_dst], xmm0
3514    vmovhps         [p_dst + i_dststride], xmm0
3515    lea             p_dst, [p_dst + 2 * i_dststride]
3516    sub             i_height, 4
3517    jg              .width8_yloop
3518    vzeroupper
3519    POP_XMM
3520    LOAD_6_PARA_POP
3521    DEINIT_X86_32_PIC_KEEPDEF
3522    ret
3523%undef i_srcstride3
    ; Width 16: ymm0 takes the bytes from p_src - 2 and ymm1 those from
    ; p_src + 6, each with row 0 in lane 0 and row 1 in lane 1.
3524.width16_yloop:
3525    vmovdqu         xmm0, [p_src - 2]
3526    vmovdqu         xmm1, [p_src + 6]
3527    vinserti128     ymm0, ymm0, [p_src + i_srcstride - 2], 1
3528    vinserti128     ymm1, ymm1, [p_src + i_srcstride + 6], 1
3529    lea             p_src, [p_src + 2 * i_srcstride]
3530    AVX2_FilterHorizontal_16px ymm0, ymm4, ymm5, ymm6, ymm2, ymm3
3531    AVX2_FilterHorizontal_16px ymm1, ymm4, ymm5, ymm6, ymm2, ymm3
3532    vpackuswb       ymm0, ymm0, ymm1
    ; Lane 0 is row 0, lane 1 is row 1; 16 bytes stored per row.
3533    vmovdqa         [p_dst], xmm0
3534    vextracti128    [p_dst + i_dststride], ymm0, 1
3535    lea             p_dst, [p_dst + 2 * i_dststride]
3536    sub             i_height, 2
3537    jg              .width16_yloop
3538    vzeroupper
3539    POP_XMM
3540    LOAD_6_PARA_POP
3541    DEINIT_X86_32_PIC
3542    ret
3543%undef p_src
3544%undef i_srcstride
3545%undef p_dst
3546%undef i_dststride
3547%undef i_width
3548%undef i_height
3549
3550
3551;***********************************************************************
3552; void McHorVer20Width5Or9Or17_avx2(const uint8_t *pSrc,
3553;                                   int32_t iSrcStride,
3554;                                   uint8_t *pDst,
3555;                                   int32_t iDstStride,
3556;                                   int32_t iWidth,
3557;                                   int32_t iHeight);
3558;***********************************************************************
; Horizontal filter for the odd widths named in the function name. Source
; rows are read starting at p_src - 2.
; Width dispatch: iWidth == 9 -> .width9, iWidth > 9 -> .width17, otherwise
; the width-5 loop. Rows per iteration: 2 (width 5), 4 (width 9), 4 (width 17);
; each loop runs while the remaining height is > 0 after subtracting that
; amount.
; Each output row is written with two overlapping stores: a 4-byte store that
; covers the last pixel is issued first, then a store starting at p_dst that
; overwrites all but the final byte of the first one.
; The width-17 path stores with vmovdqa (16-byte aligned address required).
; AVX2_FilterHorizontal_16px is defined outside this excerpt.
3559
3560WELS_EXTERN McHorVer20Width5Or9Or17_avx2
3561%define p_src        r0
3562%define i_srcstride  r1
3563%define p_dst        r2
3564%define i_dststride  r3
3565%define i_width      r4
3566%define i_height     r5
3567    %assign  push_num 0
3568    INIT_X86_32_PIC r6
3569    LOAD_6_PARA
3570    PUSH_XMM 8
3571    SIGN_EXTENSION  r1, r1d
3572    SIGN_EXTENSION  r3, r3d
3573    SIGN_EXTENSION  r4, r4d
3574    SIGN_EXTENSION  r5, r5d
    ; ymm5..ymm7: shuffle masks and coefficient vector handed to every
    ; AVX2_FilterHorizontal_16px invocation below.
3575    vbroadcasti128  ymm5, [pic(shufb_32435465768798A9)]
3576    vbroadcasti128  ymm6, [pic(shufb_011267784556ABBC)]
3577    vbroadcasti128  ymm7, [pic(maddubsw_p1m5_p1m5_m5p1_m5p1_128)]
3578    cmp             i_width, 9
3579    je              .width9
3580    jg              .width17
    ; Width 5: row 0 in lane 0, row 1 in lane 1.
3581.width5_yloop:
3582    vmovdqu         xmm0, [p_src - 2]
3583    vinserti128     ymm0, ymm0, [p_src + i_srcstride - 2], 1
3584    lea             p_src, [p_src + 2 * i_srcstride]
3585    AVX2_FilterHorizontal_16px ymm0, ymm5, ymm6, ymm7, ymm1, ymm2
3586    vpackuswb       ymm0, ymm0, ymm0
    ; Bytes 1..4 are stored at p_dst + 1, then bytes 0..3 at p_dst:
    ; five bytes written in total.
3587    vpsrlq          xmm1, xmm0, 8
3588    vmovd           [p_dst + 1], xmm1
3589    vmovd           [p_dst], xmm0
3590    add             p_dst, i_dststride
3591    vextracti128    xmm0, ymm0, 1
3592    vpsrlq          xmm1, xmm0, 8
3593    vmovd           [p_dst + 1], xmm1
3594    vmovd           [p_dst], xmm0
3595    add             p_dst, i_dststride
3596    sub             i_height, 2
3597    jg              .width5_yloop
3598    vzeroupper
3599    POP_XMM
3600    LOAD_6_PARA_POP
3601    DEINIT_X86_32_PIC_KEEPDEF
3602    ret
3603.width9:
    ; i_width's register is reused as i_srcstride3 from here on (preprocessor
    ; rename; it also applies to the .width17 code below).
3604%xdefine i_srcstride3 i_width
3605%undef i_width
3606    lea             i_srcstride3, [3 * i_srcstride]
    ; Width 9: ymm0 = rows 0 (lane 0) / 2 (lane 1), ymm4 = rows 1 / 3.
3607.width9_yloop:
3608    vmovdqu         xmm0, [p_src - 2]
3609    vmovdqu         xmm4, [p_src + i_srcstride - 2]
3610    vinserti128     ymm0, ymm0, [p_src + 2 * i_srcstride - 2], 1
3611    vinserti128     ymm4, ymm4, [p_src + i_srcstride3 - 2], 1
3612    lea             p_src, [p_src + 4 * i_srcstride]
    ; ymm3 collects the upper 8 source bytes of both row registers per lane;
    ; it feeds the extra (ninth) pixel via AVX2_FilterHorizontal_4px.
3613    vpunpckhqdq     ymm3, ymm0, ymm4
3614    AVX2_FilterHorizontal_4px ymm3, ymm2
3615    AVX2_FilterHorizontal_16px ymm0, ymm5, ymm6, ymm7, ymm1, ymm2
    ; Per lane after the pack: low 8 bytes from ymm3, high 8 bytes from ymm0.
3616    vpackuswb       ymm3, ymm3, ymm0
    ; Row 0: 4 bytes at +5 first, then 8 bytes at +0 (only byte 8 of the first
    ; store survives).
3617    vmovd           [p_dst + 5], xmm3
3618    vmovhps         [p_dst], xmm3
3619    add             p_dst, i_dststride
3620    AVX2_FilterHorizontal_16px ymm4, ymm5, ymm6, ymm7, ymm1, ymm2
3621    vpackuswb       ymm4, ymm4, ymm4
    ; Row 1.
3622    vpsrlq          xmm2, xmm3, 32
3623    vmovd           [p_dst + 5], xmm2
3624    vmovlps         [p_dst], xmm4
3625    add             p_dst, i_dststride
    ; Rows 2 and 3 repeat the same pattern from the upper lanes.
3626    vextracti128    xmm3, ymm3, 1
3627    vextracti128    xmm4, ymm4, 1
3628    vmovd           [p_dst + 5], xmm3
3629    vmovhps         [p_dst], xmm3
3630    add             p_dst, i_dststride
3631    vpsrlq          xmm2, xmm3, 32
3632    vmovd           [p_dst + 5], xmm2
3633    vmovlps         [p_dst], xmm4
3634    add             p_dst, i_dststride
3635    sub             i_height, 4
3636    jg              .width9_yloop
3637    vzeroupper
3638    POP_XMM
3639    LOAD_6_PARA_POP
3640    DEINIT_X86_32_PIC_KEEPDEF
3641    ret
3642.width17:
3643    lea             i_srcstride3, [3 * i_srcstride]
    ; Width 17: rows 0/1 are filtered first (ymm0 from p_src - 2, ymm3 from
    ; p_src + 6), rows 2/3 second (ymm1, ymm3 reloaded). ymm4 keeps a copy of
    ; the rows 0/1 right-hand load for the extra (17th) pixel.
3644.width17_yloop:
3645    vmovdqu         xmm0, [p_src - 2]
3646    vmovdqu         xmm3, [p_src + 6]
3647    vinserti128     ymm0, ymm0, [p_src + i_srcstride - 2], 1
3648    vinserti128     ymm3, ymm3, [p_src + i_srcstride + 6], 1
3649    vmovdqa         ymm4, ymm3
3650    AVX2_FilterHorizontal_16px ymm0, ymm5, ymm6, ymm7, ymm1, ymm2
3651    AVX2_FilterHorizontal_16px ymm3, ymm5, ymm6, ymm7, ymm1, ymm2
3652    vpackuswb       ymm0, ymm0, ymm3
3653    vmovdqu         xmm1, [p_src + 2 * i_srcstride - 2]
3654    vmovdqu         xmm3, [p_src + 2 * i_srcstride + 6]
3655    vinserti128     ymm1, ymm1, [p_src + i_srcstride3 - 2], 1
3656    vinserti128     ymm3, ymm3, [p_src + i_srcstride3 + 6], 1
3657    lea             p_src, [p_src + 4 * i_srcstride]
    ; Combine the upper source bytes of rows 0/1 (ymm4) with those of rows 2/3
    ; (ymm3) so one 4px filter call covers the extra pixel of all four rows.
3658    vpunpckhqdq     ymm4, ymm4, ymm3
3659    AVX2_FilterHorizontal_4px ymm4, ymm2
3660    vpackuswb       ymm4, ymm4, ymm4
    ; Row 0: 4 bytes at +13, then 16 bytes at +0 (only byte 16 survives from
    ; the first store).
3661    vmovd           [p_dst + 13], xmm4
3662    vmovdqa         [p_dst], xmm0
3663    add             p_dst, i_dststride
    ; Row 1.
3664    vextracti128    xmm2, ymm4, 1
3665    vmovd           [p_dst + 13], xmm2
3666    vextracti128    [p_dst], ymm0, 1
3667    add             p_dst, i_dststride
    ; Row 2: the tail is stored before rows 2/3 are filtered because ymm4 is
    ; used as macro scratch right after.
3668    vpsrlq          xmm4, xmm4, 32
3669    vmovd           [p_dst + 13], xmm4
3670    AVX2_FilterHorizontal_16px ymm1, ymm5, ymm6, ymm7, ymm0, ymm4
3671    AVX2_FilterHorizontal_16px ymm3, ymm5, ymm6, ymm7, ymm0, ymm4
3672    vpackuswb       ymm1, ymm1, ymm3
3673    vmovdqa         [p_dst], xmm1
3674    add             p_dst, i_dststride
    ; Row 3.
3675    vpsrlq          xmm2, xmm2, 32
3676    vmovd           [p_dst + 13], xmm2
3677    vextracti128    [p_dst], ymm1, 1
3678    add             p_dst, i_dststride
3679    sub             i_height, 4
3680    jg              .width17_yloop
3681    vzeroupper
3682    POP_XMM
3683    LOAD_6_PARA_POP
3684    DEINIT_X86_32_PIC
3685    ret
3686%undef i_srcstride3
3687%undef p_src
3688%undef i_srcstride
3689%undef p_dst
3690%undef i_dststride
3691%undef i_width
3692%undef i_height
3693
3694
3695;*******************************************************************************
3696; void McHorVer20Width4U8ToS16_avx2(const uint8_t *pSrc,
3697;                                   int iSrcStride,
3698;                                   int16_t *pDst,
3699;                                   int iHeight);
3700;*******************************************************************************
; Horizontal filter of a 4-pixel-wide column from bytes to 16-bit values
; (no packing back to bytes). pSrc is moved up two rows on entry and each row
; is read starting at p_src - 2.
; Output rows are laid out contiguously, 8 bytes (i_dststride) apart.
; Row count: i_height is reduced by 3, the loop emits four rows per iteration
; while the remainder is > 0, and one further single row is always emitted
; afterwards.
; The loop store is vmovdqa with a ymm register, which requires p_dst to be
; 32-byte aligned (p_dst advances by 32 per iteration, so alignment is kept).
; AVX2_FilterHorizontalbw_4x4px / AVX2_FilterHorizontalbw_16px are defined
; outside this excerpt.
3701
3702WELS_EXTERN McHorVer20Width4U8ToS16_avx2
3703%define p_src        r0
3704%define i_srcstride  r1
3705%define p_dst        r2
3706%define i_height     r3
3707%define i_srcstride3 r4
3708%define i_dststride   8
3709    %assign  push_num 0
3710%ifdef X86_32
3711    push            r4
3712    %assign  push_num 1
3713%endif
3714    INIT_X86_32_PIC r5
3715    LOAD_4_PARA
3716    PUSH_XMM 7
3717    SIGN_EXTENSION  r1, r1d
3718    SIGN_EXTENSION  r3, r3d
    ; Start two rows above pSrc.
3719    sub             p_src, i_srcstride
3720    sub             p_src, i_srcstride
3721    lea             i_srcstride3, [3 * i_srcstride]
3722    vbroadcasti128  ymm4, [pic(shufb_32435465768798A9)]
3723    vbroadcasti128  ymm5, [pic(shufb_011267784556ABBC)]
3724    vbroadcasti128  ymm6, [pic(maddubsw_p1m5_p1m5_m5p1_m5p1_128)]
3725    sub             i_height, 3
    ; Four rows per iteration: rows 0/2 in ymm0, rows 1/3 in ymm1.
3726.yloop:
3727    vmovdqu         xmm0, [p_src - 2]
3728    vmovdqu         xmm1, [p_src + i_srcstride - 2]
3729    vinserti128     ymm0, ymm0, [p_src + 2 * i_srcstride - 2], 1
3730    vinserti128     ymm1, ymm1, [p_src + i_srcstride3 - 2], 1
3731    lea             p_src, [p_src + 4 * i_srcstride]
3732    AVX2_FilterHorizontalbw_4x4px ymm0, ymm1, ymm4, ymm5, ymm6, ymm2, ymm3
3733    vmovdqa         [p_dst], ymm0
3734    add             p_dst, 4 * i_dststride
3735    sub             i_height, 4
3736    jg              .yloop
3737    ; Height % 4 remaining single.
    ; Only the low 8 bytes (one 4-pixel row of words) are stored.
3738    vmovdqu         xmm0, [p_src - 2]
3739    AVX2_FilterHorizontalbw_16px xmm0, xmm4, xmm5, xmm6, xmm2, xmm3
3740    vmovlps         [p_dst], xmm0
3741    vzeroupper
3742    POP_XMM
3743    LOAD_4_PARA_POP
3744    DEINIT_X86_32_PIC
3745%ifdef X86_32
3746    pop             r4
3747%endif
3748    ret
3749%undef p_src
3750%undef i_srcstride
3751%undef p_dst
3752%undef i_height
3753%undef i_srcstride3
3754%undef i_dststride
3755
3756
3757;***********************************************************************
3758; void McHorVer02Width4S16ToU8_avx2(const int16_t *pSrc,
3759;                                   uint8_t *pDst,
3760;                                   int32_t iDstStride,
3761;                                   int32_t iHeight);
3762;***********************************************************************
; Vertical filter over 16-bit rows of four values each, packed back to bytes.
; Source rows are contiguous, 8 bytes (i_srcstride) apart, so one 32-byte
; load holds four consecutive rows and a load at row offset k holds rows
; k .. k+3. Feeding the loads at offsets 0..5 to AVX2_FilterVerticalw_16px
; therefore yields output rows 0..3 in one call (row j from source rows
; j .. j+5).
; Four output rows are always written; if iHeight > 4 a second group of four
; is written as well. No other heights are handled.
; NOTE(review): the load at offset 6 (bytes 48..79) is issued before the
; height check, and the second group reads up to byte 103 -- the caller's
; buffer must cover that; confirm against the callers.
3763
3764WELS_EXTERN McHorVer02Width4S16ToU8_avx2
3765%define p_src        r0
3766%define p_dst        r1
3767%define i_dststride  r2
3768%define i_height     r3
3769%define i_dststride3 r4
3770%define i_srcstride  8
3771    %assign  push_num 0
3772%ifdef X86_32
3773    push            r4
3774    %assign  push_num 1
3775%endif
3776    INIT_X86_32_PIC r5
3777    LOAD_4_PARA
3778    PUSH_XMM 8
3779    SIGN_EXTENSION  r2, r2d
3780    SIGN_EXTENSION  r3, r3d
3781    lea             i_dststride3, [3 * i_dststride]
3782    vmovdqu         ymm0, [p_src +  0 * i_srcstride]
3783    vmovdqu         ymm1, [p_src +  1 * i_srcstride]
3784    vmovdqu         ymm2, [p_src +  2 * i_srcstride]
3785    vmovdqu         ymm3, [p_src +  3 * i_srcstride]
3786    vmovdqu         ymm4, [p_src +  4 * i_srcstride]
3787    vmovdqu         ymm5, [p_src +  5 * i_srcstride]
    ; ymm6 is not used by the first filter call; it is kept for the second group.
3788    vmovdqu         ymm6, [p_src +  6 * i_srcstride]
3789    AVX2_FilterVerticalw_16px ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm7
    ; Pack to bytes; each 64-bit half of a lane's low qword is one row.
3790    vpackuswb       ymm0, ymm0, ymm0
3791    vmovd           [p_dst], xmm0
3792    vpsrlq          xmm7, xmm0, 32
3793    vmovd           [p_dst + i_dststride], xmm7
3794    vextracti128    xmm0, ymm0, 1
3795    vmovd           [p_dst + 2 * i_dststride], xmm0
3796    vpsrlq          xmm7, xmm0, 32
3797    vmovd           [p_dst + i_dststride3], xmm7
3798    cmp             i_height, 4
3799    jle             .done
    ; Second group: output rows 4..7 from the loads at row offsets 4..9.
3800    lea             p_dst, [p_dst + 4 * i_dststride]
3801    vmovdqu         ymm7, [p_src +  7 * i_srcstride]
3802    vmovdqu         ymm0, [p_src +  8 * i_srcstride]
3803    vmovdqu         ymm1, [p_src +  9 * i_srcstride]
3804    AVX2_FilterVerticalw_16px ymm4, ymm5, ymm6, ymm7, ymm0, ymm1, ymm3
3805    vpackuswb       ymm4, ymm4, ymm4
3806    vmovd           [p_dst], xmm4
3807    vpsrlq          xmm3, xmm4, 32
3808    vmovd           [p_dst + i_dststride], xmm3
3809    vextracti128    xmm4, ymm4, 1
3810    vmovd           [p_dst + 2 * i_dststride], xmm4
3811    vpsrlq          xmm3, xmm4, 32
3812    vmovd           [p_dst + i_dststride3], xmm3
3813.done:
3814    vzeroupper
3815    POP_XMM
3816    LOAD_4_PARA_POP
3817    DEINIT_X86_32_PIC
3818%ifdef X86_32
3819    pop             r4
3820%endif
3821    ret
3822%undef p_src
3823%undef p_dst
3824%undef i_dststride
3825%undef i_height
3826%undef i_srcstride
3827%undef i_dststride3
3828
3829
3830;*******************************************************************************
3831; void McHorVer20Width8U8ToS16_avx2(const uint8_t *pSrc,
3832;                                   int iSrcStride,
3833;                                   int16_t *pDst,
3834;                                   int iHeight);
3835;*******************************************************************************
; Horizontal filter of an 8-pixel-wide column from bytes to 16-bit values
; (no packing back to bytes). pSrc is moved up two rows on entry and each row
; is read starting at p_src - 2.
; Output rows are laid out contiguously, 16 bytes (i_dststride) apart.
; Row count: i_height is reduced by 1, the loop emits two rows per iteration
; while the remainder is > 0; if the remainder ends at exactly 0 one more
; single row is emitted, if it ends below 0 nothing more is written.
; The loop stores with vmovdqu; the final single-row store is vmovdqa, which
; requires a 16-byte aligned address.
; AVX2_FilterHorizontalbw_16px is defined outside this excerpt.
3836
3837WELS_EXTERN McHorVer20Width8U8ToS16_avx2
3838%define p_src        r0
3839%define i_srcstride  r1
3840%define p_dst        r2
3841%define i_height     r3
3842%define i_dststride  16
3843    %assign  push_num 0
3844    INIT_X86_32_PIC r4
3845    LOAD_4_PARA
3846    PUSH_XMM 6
3847    SIGN_EXTENSION  r1, r1d
3848    SIGN_EXTENSION  r3, r3d
    ; Start two rows above pSrc.
3849    sub             p_src, i_srcstride
3850    sub             p_src, i_srcstride
3851    vbroadcasti128  ymm3, [pic(shufb_32435465768798A9)]
3852    vbroadcasti128  ymm4, [pic(shufb_011267784556ABBC)]
3853    vbroadcasti128  ymm5, [pic(maddubsw_p1m5_p1m5_m5p1_m5p1_128)]
3854    sub             i_height, 1
    ; Two rows per iteration: row 0 in lane 0, row 1 in lane 1.
3855.yloop:
3856    vmovdqu         xmm0, [p_src - 2]
3857    vinserti128     ymm0, ymm0, [p_src + i_srcstride - 2], 1
3858    lea             p_src, [p_src + 2 * i_srcstride]
3859    AVX2_FilterHorizontalbw_16px ymm0, ymm3, ymm4, ymm5, ymm1, ymm2
3860    vmovdqu         [p_dst], ymm0
3861    add             p_dst, 2 * i_dststride
3862    sub             i_height, 2
3863    jg              .yloop
    ; Flags still come from the sub above: negative -> no row left.
3864    jl              .done
    ; Exactly one row left.
3865    vmovdqu         xmm0, [p_src - 2]
3866    AVX2_FilterHorizontalbw_16px xmm0, xmm3, xmm4, xmm5, xmm1, xmm2
3867    vmovdqa         [p_dst], xmm0
3868.done:
3869    vzeroupper
3870    POP_XMM
3871    LOAD_4_PARA_POP
3872    DEINIT_X86_32_PIC
3873    ret
3874%undef p_src
3875%undef i_srcstride
3876%undef p_dst
3877%undef i_height
3878%undef i_dststride
3879
3880
3881;***********************************************************************
3882; void McHorVer02Width5S16ToU8_avx2(const int16_t *pSrc,
3883;                                   uint8_t *pDst,
3884;                                   int32_t iDstStride,
3885;                                   int32_t iHeight);
3886;***********************************************************************
3887
3888WELS_EXTERN McHorVer02Width5S16ToU8_avx2
; Register roles: p_src = r0, p_dst = r1, i_dststride = r2, i_height = r3.
; i_srcstride is a compile-time constant: 16 bytes per source row, so every
; 32-byte load below covers two consecutive source rows (one per lane).
; The code is fully unrolled: it writes 5 destination rows when i_height <= 5
; and 9 destination rows otherwise; each row store covers 5 bytes.
; NOTE(review): AVX2_FilterVerticalw_16px is defined outside this section; it
; takes six row registers plus a scratch register and leaves its result in the
; first argument - presumably the 6-tap vertical filter; confirm in the macro.
3889%define p_src        r0
3890%define p_dst        r1
3891%define i_dststride  r2
3892%define i_height     r3
3893%define i_srcstride  16
3894    %assign  push_num 0
3895    INIT_X86_32_PIC r4
3896    LOAD_4_PARA
3897    PUSH_XMM 8
3898    SIGN_EXTENSION  r2, r2d
3899    SIGN_EXTENSION  r3, r3d
; ymm0/2/4/6 = row pairs (0,1) (2,3) (4,5) (6,7).
3900    vmovdqu         ymm0, [p_src +  0 * i_srcstride]
3901    vmovdqu         ymm2, [p_src +  2 * i_srcstride]
3902    vmovdqu         ymm4, [p_src +  4 * i_srcstride]
3903    vmovdqu         ymm6, [p_src +  6 * i_srcstride]
; imm 00100001b: low lane <- high lane of the first source, high lane <- low
; lane of the second source. This builds the odd pairs (1,2) (3,4) (5,6).
3904    vperm2i128      ymm1, ymm0, ymm2, 00100001b
3905    vperm2i128      ymm3, ymm2, ymm4, 00100001b
3906    vperm2i128      ymm5, ymm4, ymm6, 00100001b
; Output rows 0 (low lane) and 1 (high lane).
3907    AVX2_FilterVerticalw_16px ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm7
3908    vpackuswb       ymm0, ymm0, ymm0
; 5-byte row store built from two overlapping 4-byte stores:
; bytes 1..4 first (data shifted right by one byte), then bytes 0..3.
3909    vpsrlq          xmm7, xmm0, 8
3910    vmovd           [p_dst + 1], xmm7
3911    vmovd           [p_dst], xmm0
3912    add             p_dst, i_dststride
; Same store pattern for the row held in the high lane.
3913    vextracti128    xmm0, ymm0, 1
3914    vpsrlq          xmm7, xmm0, 8
3915    vmovd           [p_dst + 1], xmm7
3916    vmovd           [p_dst], xmm0
3917    add             p_dst, i_dststride
; Output rows 2 and 3; ymm7 = rows (7,8), ymm0 is reloaded with rows (8,9).
3918    vmovdqu         ymm7, [p_src +  7 * i_srcstride]
3919    vmovdqu         ymm0, [p_src +  8 * i_srcstride]
3920    AVX2_FilterVerticalw_16px ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, ymm1
3921    vpackuswb       ymm2, ymm2, ymm2
3922    vpsrlq          xmm1, xmm2, 8
3923    vmovd           [p_dst + 1], xmm1
3924    vmovd           [p_dst], xmm2
3925    add             p_dst, i_dststride
3926    vextracti128    xmm2, ymm2, 1
3927    vpsrlq          xmm1, xmm2, 8
3928    vmovd           [p_dst + 1], xmm1
3929    vmovd           [p_dst], xmm2
3930    add             p_dst, i_dststride
; Output rows 4 and 5; only row 4 is stored before the height check.
3931    vmovdqu         ymm1, [p_src +  9 * i_srcstride]
3932    vmovdqu         ymm2, [p_src + 10 * i_srcstride]
3933    AVX2_FilterVerticalw_16px ymm4, ymm5, ymm6, ymm7, ymm0, ymm1, ymm3
3934    vpackuswb       ymm4, ymm4, ymm4
3935    vpsrlq          xmm3, xmm4, 8
3936    vmovd           [p_dst + 1], xmm3
3937    vmovd           [p_dst], xmm4
; Five rows have been written; stop here unless more than 5 were requested.
3938    cmp             i_height, 5
3939    jle             .done
3940    add             p_dst, i_dststride
3941    vextracti128    xmm4, ymm4, 1
3942    vpsrlq          xmm3, xmm4, 8
3943    vmovd           [p_dst + 1], xmm3
3944    vmovd           [p_dst], xmm4
3945    add             p_dst, i_dststride
; Output rows 6 and 7. Only 16 bytes (a single row) are loaded into xmm4; it
; is consumed by the final 128-bit filter step below.
3946    vmovdqu         ymm3, [p_src + 11 * i_srcstride]
3947    vmovdqu         xmm4, [p_src + 12 * i_srcstride]
3948    AVX2_FilterVerticalw_16px ymm6, ymm7, ymm0, ymm1, ymm2, ymm3, ymm5
3949    vpackuswb       ymm6, ymm6, ymm6
3950    vpsrlq          xmm5, xmm6, 8
3951    vmovd           [p_dst + 1], xmm5
3952    vmovd           [p_dst], xmm6
3953    add             p_dst, i_dststride
3954    vextracti128    xmm6, ymm6, 1
3955    vpsrlq          xmm5, xmm6, 8
3956    vmovd           [p_dst + 1], xmm5
3957    vmovd           [p_dst], xmm6
3958    add             p_dst, i_dststride
; Output row 8 (the ninth row): 128-bit form of the filter on the low lanes.
3959    vmovdqu         xmm5, [p_src + 13 * i_srcstride]
3960    AVX2_FilterVerticalw_16px xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm7
3961    vpackuswb       xmm0, xmm0, xmm0
3962    vpsrlq          xmm7, xmm0, 8
3963    vmovd           [p_dst + 1], xmm7
3964    vmovd           [p_dst], xmm0
3965.done:
; Clear the upper ymm halves before returning to the caller.
3966    vzeroupper
3967    POP_XMM
3968    LOAD_4_PARA_POP
3969    DEINIT_X86_32_PIC
3970    ret
3971%undef p_src
3972%undef p_dst
3973%undef i_dststride
3974%undef i_height
3975%undef i_srcstride
3976
3977
3978;***********************************************************************
3979; void McHorVer02Width8S16ToU8_avx2(const int16_t *pSrc,
3980;                                   uint8_t *pDst,
3981;                                   int32_t iDstStride,
3982;                                   int32_t iHeight);
3983;***********************************************************************
3984
3985WELS_EXTERN McHorVer02Width8S16ToU8_avx2
; Register roles: p_src = r0, p_dst = r1, i_dststride = r2, i_height = r3,
; i_dststride3 = r4 (holds 3 * i_dststride).
; i_srcstride is a compile-time constant: 16 bytes per source row, so every
; 32-byte load covers two consecutive source rows (one per lane).
; Each loop iteration writes 8 destination rows of 8 bytes, with an early
; exit after the first 4 rows when i_height <= 4.
; The even-row loads use vmovdqa, so p_src must be 32-byte aligned.
; NOTE(review): AVX2_FilterVerticalw_16px is defined outside this section;
; presumably the 6-tap vertical filter with its result in the first argument.
3986%define p_src        r0
3987%define p_dst        r1
3988%define i_dststride  r2
3989%define i_height     r3
3990%define i_dststride3 r4
3991%define i_srcstride  16
3992    %assign  push_num 0
; r4 is saved by hand on 32-bit builds and restored just before ret;
; push_num records the extra stack slot for the macros that follow.
3993%ifdef X86_32
3994    push            r4
3995    %assign  push_num 1
3996%endif
3997    INIT_X86_32_PIC r5
3998    LOAD_4_PARA
3999    PUSH_XMM 8
4000    SIGN_EXTENSION  r2, r2d
4001    SIGN_EXTENSION  r3, r3d
4002    lea             i_dststride3, [3 * i_dststride]
; Prime the pipeline: ymm0/2/4 = row pairs (0,1) (2,3) (4,5);
; ymm1/3 = odd pairs (1,2) (3,4) built from lane halves of their neighbours.
4003    vmovdqa         ymm0, [p_src +  0 * i_srcstride]
4004    vmovdqa         ymm2, [p_src +  2 * i_srcstride]
4005    vmovdqa         ymm4, [p_src +  4 * i_srcstride]
4006    vperm2i128      ymm1, ymm0, ymm2, 00100001b
4007    vperm2i128      ymm3, ymm2, ymm4, 00100001b
4008.yloop:
4009    vmovdqa         ymm6, [p_src +  6 * i_srcstride]
4010    vperm2i128      ymm5, ymm4, ymm6, 00100001b
; ymm0 -> output rows 0/1 (low/high lane), ymm2 -> output rows 2/3.
4011    AVX2_FilterVerticalw_16px ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm7
4012    vmovdqu         ymm7, [p_src +  7 * i_srcstride]
4013    AVX2_FilterVerticalw_16px ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, ymm1
; After the pack, xmm1 = {row 0 | row 2} and the extracted high lane
; xmm2 = {row 1 | row 3}, each as 8 bytes in the low/high qword.
4014    vpackuswb       ymm1, ymm0, ymm2
4015    vmovdqa         ymm0, [p_src +  8 * i_srcstride]
4016    vextracti128    xmm2, ymm1, 1
4017    vmovlps         [p_dst], xmm1
4018    vmovlps         [p_dst + i_dststride], xmm2
4019    vmovhps         [p_dst + 2 * i_dststride], xmm1
4020    vmovhps         [p_dst + i_dststride3], xmm2
; Four rows are done; stop here if no more than four remain.
4021    cmp             i_height, 4
4022    jle             .done
4023    lea             p_dst, [p_dst + 4 * i_dststride]
; Second half of the iteration: output rows 4..7, same scheme with the
; register roles rotated.
4024    vmovdqu         ymm1, [p_src +  9 * i_srcstride]
4025    vmovdqa         ymm2, [p_src + 10 * i_srcstride]
4026    AVX2_FilterVerticalw_16px ymm4, ymm5, ymm6, ymm7, ymm0, ymm1, ymm3
4027    vmovdqu         ymm3, [p_src + 11 * i_srcstride]
4028    AVX2_FilterVerticalw_16px ymm6, ymm7, ymm0, ymm1, ymm2, ymm3, ymm5
4029    vpackuswb       ymm5, ymm4, ymm6
; Load the row pair that becomes (4,5) relative to the advanced p_src, so
; ymm0..ymm4 again hold the primed state expected at .yloop.
4030    vmovdqa         ymm4, [p_src + 12 * i_srcstride]
4031    add             p_src, 8 * i_srcstride
4032    vextracti128    xmm6, ymm5, 1
4033    vmovlps         [p_dst], xmm5
4034    vmovlps         [p_dst + i_dststride], xmm6
4035    vmovhps         [p_dst + 2 * i_dststride], xmm5
4036    vmovhps         [p_dst + i_dststride3], xmm6
4037    lea             p_dst, [p_dst + 4 * i_dststride]
4038    sub             i_height, 8
4039    jg              .yloop
4040.done:
; Clear the upper ymm halves before returning to the caller.
4041    vzeroupper
4042    POP_XMM
4043    LOAD_4_PARA_POP
4044    DEINIT_X86_32_PIC
4045%ifdef X86_32
4046    pop             r4
4047%endif
4048    ret
4049%undef p_src
4050%undef p_dst
4051%undef i_dststride
4052%undef i_height
4053%undef i_dststride3
4054%undef i_srcstride
4055
4056
4057;*******************************************************************************
4058; void McHorVer20Width16U8ToS16_avx2(const uint8_t *pSrc,
4059;                                    int32_t iSrcStride,
4060;                                    int16_t *pDst,
4061;                                    int32_t iHeight);
4062;*******************************************************************************
4063
4064WELS_EXTERN McHorVer20Width16U8ToS16_avx2
; Register roles: p_src = r0, i_srcstride = r1, p_dst = r2, i_height = r3.
; i_dststride is a compile-time constant: 32 bytes per destination row.
; Each source row is processed as one ymm register: the low lane holds the 16
; bytes at p_src - 2 and the high lane the 16 bytes at p_src + 6. The loop
; handles two rows per iteration and a tail handles one leftover row.
; All destination stores use vmovdqa, so p_dst must be 32-byte aligned.
; NOTE(review): AVX2_FilterHorizontalbw_16px is defined outside this section;
; from the routine name it presumably turns 8-bit pixels into 16-bit
; horizontally filtered values - confirm against the macro.
4065%define p_src        r0
4066%define i_srcstride  r1
4067%define p_dst        r2
4068%define i_height     r3
4069%define i_dststride  32
4070    %assign  push_num 0
4071    INIT_X86_32_PIC r4
4072    LOAD_4_PARA
4073    PUSH_XMM 7
4074    SIGN_EXTENSION  r1, r1d
4075    SIGN_EXTENSION  r3, r3d
; Step the source pointer back by two rows before the first load.
4076    sub             p_src, i_srcstride
4077    sub             p_src, i_srcstride
; Broadcast the three 128-bit constants into both lanes of ymm4/ymm5/ymm6.
4078    vbroadcasti128  ymm4, [pic(shufb_32435465768798A9)]
4079    vbroadcasti128  ymm5, [pic(shufb_011267784556ABBC)]
4080    vbroadcasti128  ymm6, [pic(maddubsw_p1m5_p1m5_m5p1_m5p1_128)]
; Bias the counter by one so the "sub 2" at the loop end yields:
;   > 0 : at least two rows left, = 0 : exactly one row left, < 0 : finished.
4081    sub             i_height, 1
4082.yloop:
; ymm0 = current row, ymm1 = next row (two overlapping 16-byte windows each).
4083    vmovdqu         xmm0, [p_src - 2]
4084    vinserti128     ymm0, ymm0, [p_src + 6], 1
4085    vmovdqu         xmm1, [p_src + i_srcstride - 2]
4086    vinserti128     ymm1, ymm1, [p_src + i_srcstride + 6], 1
4087    lea             p_src, [p_src + 2 * i_srcstride]
4088    AVX2_FilterHorizontalbw_16px ymm0, ymm4, ymm5, ymm6, ymm2, ymm3
4089    vmovdqa         [p_dst], ymm0
4090    AVX2_FilterHorizontalbw_16px ymm1, ymm4, ymm5, ymm6, ymm2, ymm3
4091    vmovdqa         [p_dst + i_dststride], ymm1
4092    add             p_dst, 2 * i_dststride
4093    sub             i_height, 2
4094    jg              .yloop
4095    jl              .done
; Exactly one row remains.
4096    vmovdqu         xmm0, [p_src - 2]
4097    vinserti128     ymm0, ymm0, [p_src + 6], 1
4098    AVX2_FilterHorizontalbw_16px ymm0, ymm4, ymm5, ymm6, ymm1, ymm2
4099    vmovdqa         [p_dst], ymm0
4100.done:
; Clear the upper ymm halves before returning to the caller.
4101    vzeroupper
4102    POP_XMM
4103    LOAD_4_PARA_POP
4104    DEINIT_X86_32_PIC
4105    ret
4106%undef p_src
4107%undef i_srcstride
4108%undef p_dst
4109%undef i_height
4110%undef i_dststride
4111
4112
4113;***********************************************************************
4114; void McHorVer02Width9S16ToU8_avx2(const int16_t *pSrc,
4115;                                   uint8_t *pDst,
4116;                                   int32_t iDstStride,
4117;                                   int32_t iHeight);
4118;***********************************************************************
4119
4120WELS_EXTERN McHorVer02Width9S16ToU8_avx2
; Register roles: p_src = r0, p_dst = r1, i_dststride = r2, i_height = r3.
; i_srcstride is a compile-time constant: 32 bytes per source row, so one ymm
; register holds exactly one source row. All source loads use vmovdqa, so
; p_src must be 32-byte aligned.
; Each loop iteration writes 8 destination rows of 9 bytes, with an early
; exit after 5 rows; one extra row is written after the loop falls through.
; NOTE(review): AVX2_FilterVerticalw_16px is defined outside this section;
; presumably the 6-tap vertical filter with its result in the first argument.
4121%define p_src        r0
4122%define p_dst        r1
4123%define i_dststride  r2
4124%define i_height     r3
4125%define i_srcstride  32
4126    %assign  push_num 0
4127    INIT_X86_32_PIC r4
4128    LOAD_4_PARA
4129    PUSH_XMM 8
4130    SIGN_EXTENSION  r2, r2d
4131    SIGN_EXTENSION  r3, r3d
; Prime the pipeline with source rows 0..4.
4132    vmovdqa         ymm0, [p_src + 0 * i_srcstride]
4133    vmovdqa         ymm1, [p_src + 1 * i_srcstride]
4134    vmovdqa         ymm2, [p_src + 2 * i_srcstride]
4135    vmovdqa         ymm3, [p_src + 3 * i_srcstride]
4136    vmovdqa         ymm4, [p_src + 4 * i_srcstride]
; Bias the counter by one; the checks below ("cmp 4" and "sub 8 / jg")
; operate on this biased value.
4137    sub             i_height, 1
4138.height_loop:
; Output rows 0 and 1.
4139    vmovdqa         ymm5, [p_src + 5 * i_srcstride]
4140    AVX2_FilterVerticalw_16px ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6
4141    vmovdqa         ymm6, [p_src + 6 * i_srcstride]
4142    AVX2_FilterVerticalw_16px ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7
4143    vmovdqa         ymm7, [p_src + 7 * i_srcstride]
; After the pack, each lane holds {first row | second row} as 8 bytes each;
; the low lane carries pixels 0..7 and the high lane pixels 8..15.
4144    vpackuswb       ymm0, ymm0, ymm1
4145    vextracti128    xmm1, ymm0, 1
; 9-byte row store: shift pixel 8 into the top byte of each qword, write that
; qword at p_dst + 1 (so pixel 8 lands at p_dst + 8), then overwrite bytes
; 0..7 with pixels 0..7. The order of the two stores matters.
4146    vpsllq          xmm1, xmm1, 56
4147    vmovlps         [p_dst + 1], xmm1
4148    vmovlps         [p_dst], xmm0
4149    add             p_dst, i_dststride
4150    vmovhps         [p_dst + 1], xmm1
4151    vmovhps         [p_dst], xmm0
4152    add             p_dst, i_dststride
; Output rows 2 and 3.
4153    AVX2_FilterVerticalw_16px ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, ymm0
4154    vmovdqa         ymm0, [p_src + 8 * i_srcstride]
4155    AVX2_FilterVerticalw_16px ymm3, ymm4, ymm5, ymm6, ymm7, ymm0, ymm1
4156    vpackuswb       ymm2, ymm2, ymm3
4157    vextracti128    xmm3, ymm2, 1
4158    vpsllq          xmm3, xmm3, 56
4159    vmovlps         [p_dst + 1], xmm3
4160    vmovlps         [p_dst], xmm2
4161    add             p_dst, i_dststride
4162    vmovhps         [p_dst + 1], xmm3
4163    vmovhps         [p_dst], xmm2
4164    add             p_dst, i_dststride
; Output rows 4 and 5; only row 4 is stored before the height check.
4165    vmovdqa         ymm1, [p_src + 9 * i_srcstride]
4166    AVX2_FilterVerticalw_16px ymm4, ymm5, ymm6, ymm7, ymm0, ymm1, ymm2
4167    vmovdqa         ymm2, [p_src + 10 * i_srcstride]
4168    AVX2_FilterVerticalw_16px ymm5, ymm6, ymm7, ymm0, ymm1, ymm2, ymm3
4169    vmovdqa         ymm3, [p_src + 11 * i_srcstride]
4170    vpackuswb       ymm4, ymm4, ymm5
4171    vextracti128    xmm5, ymm4, 1
4172    vpsllq          xmm5, xmm5, 56
4173    vmovlps         [p_dst + 1], xmm5
4174    vmovlps         [p_dst], xmm4
; Five rows of this pass are written; leave if the biased count is <= 4.
4175    cmp             i_height, 4
4176    jle             .done
4177    add             p_dst, i_dststride
4178    vmovhps         [p_dst + 1], xmm5
4179    vmovhps         [p_dst], xmm4
4180    add             p_dst, i_dststride
; Output rows 6 and 7. p_src is advanced by 8 rows in between, and ymm0..ymm4
; end up holding rows 0..4 relative to the new p_src for the next iteration.
4181    AVX2_FilterVerticalw_16px ymm6, ymm7, ymm0, ymm1, ymm2, ymm3, ymm4
4182    vmovdqa         ymm4, [p_src + 12 * i_srcstride]
4183    add             p_src, 8 * i_srcstride
4184    AVX2_FilterVerticalw_16px ymm7, ymm0, ymm1, ymm2, ymm3, ymm4, ymm5
4185    vpackuswb       ymm6, ymm6, ymm7
4186    vextracti128    xmm7, ymm6, 1
4187    vpsllq          xmm7, xmm7, 56
4188    vmovlps         [p_dst + 1], xmm7
4189    vmovlps         [p_dst], xmm6
4190    add             p_dst, i_dststride
4191    vmovhps         [p_dst + 1], xmm7
4192    vmovhps         [p_dst], xmm6
4193    add             p_dst, i_dststride
4194    sub             i_height, 8
4195    jg              .height_loop
; Loop fell through: write one final row using the same 9-byte store pattern.
4196    vmovdqa         ymm5, [p_src + 5 * i_srcstride]
4197    AVX2_FilterVerticalw_16px ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6
4198    vpackuswb       ymm0, ymm0, ymm0
4199    vextracti128    xmm1, ymm0, 1
4200    vpsllq          xmm1, xmm1, 56
4201    vmovlps         [p_dst + 1], xmm1
4202    vmovlps         [p_dst], xmm0
4203.done:
; Clear the upper ymm halves before returning to the caller.
4204    vzeroupper
4205    POP_XMM
4206    LOAD_4_PARA_POP
4207    DEINIT_X86_32_PIC
4208    ret
4209%undef p_src
4210%undef i_srcstride
4211%undef p_dst
4212%undef i_dststride
4213%undef i_height
4214
4215
4216;*******************************************************************************
4217; void McHorVer20Width17U8ToS16_avx2(const uint8_t *pSrc,
4218;                                    int32_t iSrcStride,
4219;                                    int16_t *pDst,
4220;                                    int32_t iHeight);
4221;*******************************************************************************
4222
4223WELS_EXTERN McHorVer20Width17U8ToS16_avx2
; Register roles: p_src = r0, i_srcstride = r1, p_dst = r2, i_height = r3,
; i_srcstride3 = r4 (holds 3 * i_srcstride).
; i_dststride is a compile-time constant: 64 bytes per destination row.
; The loop handles four source rows per iteration; after the loop exits, two
; more rows are processed unconditionally (there is no branch around the
; tail), so callers presumably pass i_height = 4*k + 2 - TODO confirm.
; Per row, 16 outputs (32 bytes) come from AVX2_FilterHorizontalbw_16px and the
; 17th output, stored at byte offset 32, comes from AVX2_FilterHorizontalbw_4px.
; The vmovdqa stores require p_dst to be suitably aligned (32 bytes for the
; ymm stores in the tail).
; NOTE(review): both filter macros are defined outside this section; their
; exact arithmetic is not visible here.
4224%define p_src        r0
4225%define i_srcstride  r1
4226%define p_dst        r2
4227%define i_height     r3
4228%define i_srcstride3 r4
4229%define i_dststride  64
4230    %assign  push_num 0
; r4 is saved by hand on 32-bit builds and restored just before ret;
; push_num records the extra stack slot for the macros that follow.
4231%ifdef X86_32
4232    push            r4
4233    %assign  push_num 1
4234%endif
4235    INIT_X86_32_PIC r5
4236    LOAD_4_PARA
4237    PUSH_XMM 8
4238    SIGN_EXTENSION  r1, r1d
4239    SIGN_EXTENSION  r3, r3d
; Step the source pointer back by two rows before the first load.
4240    sub             p_src, i_srcstride
4241    sub             p_src, i_srcstride
4242    lea             i_srcstride3, [3 * i_srcstride]
; Broadcast the three 128-bit constants into both lanes of ymm5/ymm6/ymm7.
4243    vbroadcasti128  ymm5, [pic(shufb_32435465768798A9)]
4244    vbroadcasti128  ymm6, [pic(shufb_011267784556ABBC)]
4245    vbroadcasti128  ymm7, [pic(maddubsw_p1m5_p1m5_m5p1_m5p1_128)]
; Bias the counter so the loop stops once "i_height - 3 - 4*iterations" is
; no longer positive; the two-row tail then finishes the job.
4246    sub             i_height, 3
4247.yloop:
; Rows 0 and 1 share registers: low lane = row 0, high lane = row 1.
; ymm0 holds the windows starting at p_src - 2, ymm3 those at p_src + 6.
4248    vmovdqu         xmm0, [p_src - 2]
4249    vmovdqu         xmm3, [p_src + 6]
4250    vinserti128     ymm0, ymm0, [p_src + i_srcstride - 2], 1
4251    vinserti128     ymm3, ymm3, [p_src + i_srcstride + 6], 1
; Keep an unfiltered copy of the right-hand windows for the 17th output.
4252    vmovdqa         ymm4, ymm3
4253    AVX2_FilterHorizontalbw_16px ymm0, ymm5, ymm6, ymm7, ymm1, ymm2
4254    vmovdqa         [p_dst], xmm0
4255    vextracti128    [p_dst + i_dststride], ymm0, 1
4256    AVX2_FilterHorizontalbw_16px ymm3, ymm5, ymm6, ymm7, ymm1, ymm2
; Same layout for rows 2 and 3: ymm1 = left windows, ymm0 = right windows.
4257    vmovdqu         xmm1, [p_src + 2 * i_srcstride - 2]
4258    vmovdqu         xmm0, [p_src + 2 * i_srcstride + 6]
4259    vinserti128     ymm1, ymm1, [p_src + i_srcstride3 - 2], 1
4260    vinserti128     ymm0, ymm0, [p_src + i_srcstride3 + 6], 1
4261    lea             p_src, [p_src + 4 * i_srcstride]
; Combine the high qwords of the right-hand windows of all four rows and
; filter them in one go to obtain the extra (17th) column.
4262    vpunpckhqdq     ymm4, ymm4, ymm0
4263    AVX2_FilterHorizontalbw_4px ymm4, [pic(dwm32768_256)], ymm2
; The 8-byte stores at offset 26 span bytes 26..33 of a row. Each is issued
; before the 16-byte store at offset 16 of the same row, which rewrites
; bytes 16..31, so only bytes 32..33 keep the value written here.
4264    vmovlps         [p_dst + 26], xmm4
4265    vmovdqa         [p_dst + 16], xmm3
4266    vextracti128    xmm2, ymm4, 1
4267    vmovlps         [p_dst + i_dststride + 26], xmm2
4268    vextracti128    [p_dst + i_dststride + 16], ymm3, 1
4269    vmovhps         [p_dst + 2 * i_dststride + 26], xmm4
4270    AVX2_FilterHorizontalbw_16px ymm1, ymm5, ymm6, ymm7, ymm3, ymm4
4271    vmovdqa         [p_dst + 2 * i_dststride], xmm1
4272    AVX2_FilterHorizontalbw_16px ymm0, ymm5, ymm6, ymm7, ymm3, ymm4
4273    vmovdqa         [p_dst + 2 * i_dststride + 16], xmm0
4274    vextracti128    [p_dst + 3 * i_dststride], ymm1, 1
4275    vmovhps         [p_dst + 3 * i_dststride + 26], xmm2
4276    vextracti128    [p_dst + 3 * i_dststride + 16], ymm0, 1
4277    add             p_dst, 4 * i_dststride
4278    sub             i_height, 4
4279    jg              .yloop
4280    ; Handle remaining 2 lines after 4x unrolled loop.
; In the tail each ymm register holds ONE row: low lane = window at p_src - 2,
; high lane = window at p_src + 6 (ymm0 = first row, ymm3 = second row).
4281    vmovdqu         xmm0, [p_src - 2]
4282    vinserti128     ymm0, ymm0, [p_src + 6], 1
4283    vmovdqu         xmm3, [p_src + i_srcstride - 2]
4284    vinserti128     ymm3, ymm3, [p_src + i_srcstride + 6], 1
; Build the input for the extra column from the unfiltered rows before the
; 16-pixel filter overwrites ymm0/ymm3.
4285    vpunpckhqdq     ymm4, ymm0, ymm3
4286    AVX2_FilterHorizontalbw_4px ymm4, [pic(dwm32768_256)], ymm2
4287    AVX2_FilterHorizontalbw_16px ymm0, ymm5, ymm6, ymm7, ymm1, ymm2
4288    AVX2_FilterHorizontalbw_16px ymm3, ymm5, ymm6, ymm7, ymm1, ymm2
; Only the high lane of the 4px result is stored here.
4289    vextracti128    xmm4, ymm4, 1
; As in the loop, the offset-26 store precedes the 32-byte row store that
; rewrites bytes 0..31, leaving bytes 32..33 for the extra column.
4290    vmovlps         [p_dst + 26], xmm4
4291    vmovdqa         [p_dst], ymm0
4292    vmovhps         [p_dst + i_dststride + 26], xmm4
4293    vmovdqa         [p_dst + i_dststride], ymm3
; Clear the upper ymm halves before returning to the caller.
4294    vzeroupper
4295    POP_XMM
4296    LOAD_4_PARA_POP
4297    DEINIT_X86_32_PIC
4298%ifdef X86_32
4299    pop             r4
4300%endif
4301    ret
4302%undef p_src
4303%undef i_srcstride
4304%undef p_dst
4305%undef i_dststride
4306%undef i_height
4307%undef i_srcstride3
4308
4309
4310;***********************************************************************
4311; void McHorVer02Width16Or17S16ToU8_avx2(const int16_t *pSrc,
4312;                                        int32_t iSrcStride,
4313;                                        uint8_t *pDst,
4314;                                        int32_t iDstStride,
4315;                                        int32_t iWidth,
4316;                                        int32_t iHeight);
4317;***********************************************************************
4318
4319WELS_EXTERN McHorVer02Width16Or17S16ToU8_avx2
; Register roles: p_src = r0, i_srcstride = r1, p_dst = r2, i_dststride = r3,
; i_width = r4 (or the stack argument "dword arg5" when X86_32_PICASM is
; defined, because r4 is then handed to INIT_X86_32_PIC_NOPRESERVE),
; i_height = r5, i_srcstride3 = r6 (holds 3 * i_srcstride).
; Structure:
;   1. If i_width is odd, the last column (index i_width - 1) is filtered on
;      its own, 8 rows per iteration, with one 16-bit sample per row packed
;      into a single xmm register.
;   2. The 16-pixel-wide part is then filtered with one ymm register per
;      source row, 8 rows per iteration, plus one optional extra row.
; The 16-wide part uses vmovdqa for loads and stores, so p_src rows must be
; 32-byte aligned and p_dst rows 16-byte aligned.
; NOTE(review): AVX2_FilterVerticalw_16px is defined outside this section;
; presumably the 6-tap vertical filter with its result in the first argument.
4320%define p_src        r0
4321%define i_srcstride  r1
4322%define p_dst        r2
4323%define i_dststride  r3
4324%ifdef X86_32_PICASM
4325%define i_width      dword arg5
4326%else
4327%define i_width      r4
4328%endif
4329%define i_height     r5
4330%define i_srcstride3 r6
4331    %assign  push_num 0
; r6 is saved by hand on 32-bit builds and restored just before ret.
4332%ifdef X86_32
4333    push            r6
4334    %assign  push_num 1
4335%endif
4336    LOAD_6_PARA
4337    PUSH_XMM 8
4338    SIGN_EXTENSION  r1, r1d
4339    SIGN_EXTENSION  r3, r3d
4340    SIGN_EXTENSION  r4, r4d
4341    SIGN_EXTENSION  r5, r5d
4342    INIT_X86_32_PIC_NOPRESERVE r4
; Bias the counter by one; the "sub 8" checks below work on this value.
4343    sub             i_height, 1
4344    lea             i_srcstride3, [3 * i_srcstride]
; Even width: skip the single-column pass entirely.
4345    test            i_width, 1
4346    jz              .align_begin
; The column pass modifies i_height, p_src and p_dst; keep the originals on
; the stack so the 16-wide pass can start from them again.
4347    push            i_height
4348    push            p_src
4349    push            p_dst
4350    %assign push_num push_num + 3
; p_src -> last 16-bit sample of the row (byte offset 2 * i_width - 2);
; p_dst -> one byte past the last output pixel of the row.
4351%ifdef X86_32_PICASM
4352    add             p_src, i_width
4353    add             p_src, i_width
4354    sub             p_src, 2
4355%else
4356    lea             p_src, [p_src + 2 * i_width - 2]
4357%endif
4358    add             p_dst, i_width
; Gather the column sample of source rows 0..7 into xmm0, one word per row
; (rows are interleaved pairwise by the unpack sequence). p_src ends up
; pointing at row 7.
4359    vmovd           xmm0, [p_src]
4360    vpunpcklwd      xmm0, xmm0, [p_src + i_srcstride]
4361    vmovd           xmm1, [p_src + 2 * i_srcstride]
4362    add             p_src, i_srcstride3
4363    vpunpcklwd      xmm1, xmm1, [p_src]
4364    vpunpckldq      xmm0, xmm0, xmm1
4365    vmovd           xmm1, [p_src + i_srcstride]
4366    vpunpcklwd      xmm1, xmm1, [p_src + 2 * i_srcstride]
4367    vmovd           xmm2, [p_src + i_srcstride3]
4368    lea             p_src, [p_src + 4 * i_srcstride]
4369    vpunpcklwd      xmm2, xmm2, [p_src]
4370    vpunpckldq      xmm1, xmm1, xmm2
4371    vpunpcklqdq     xmm0, xmm0, xmm1
4372.height_loop_unalign:
; Build five more windows, each the previous one advanced by one row: the
; oldest word is dropped and the next row's sample is appended at the top.
4373    vmovd           xmm1, [p_src + i_srcstride]
4374    vpalignr        xmm1, xmm1, xmm0, 2
4375    vmovd           xmm2, [p_src + 2 * i_srcstride]
4376    vpalignr        xmm2, xmm2, xmm1, 2
4377    vmovd           xmm3, [p_src + i_srcstride3]
4378    vpalignr        xmm3, xmm3, xmm2, 2
4379    lea             p_src, [p_src + 4 * i_srcstride]
4380    vmovd           xmm4, [p_src]
4381    vpalignr        xmm4, xmm4, xmm3, 2
4382    vmovd           xmm5, [p_src + i_srcstride]
4383    vpalignr        xmm5, xmm5, xmm4, 2
; One filter call yields the column output for 8 consecutive rows.
4384    AVX2_FilterVerticalw_16px xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm7
4385    vpackuswb       xmm0, xmm0, xmm0
; Scatter the 8 result bytes, one per destination row. Each byte is moved to
; the top of a dword/qword so that a 4- or 8-byte store ending at p_dst - 1
; places it in the last column. The lower bytes of those stores fall on the
; columns to the left.
; NOTE(review): those left-hand bytes are presumably rewritten by the 16-wide
; pass below (true when i_width is 17) - confirm for other odd widths.
4386    vpslld          xmm6, xmm0, 24
4387    vmovd           [p_dst - 4], xmm6
4388    vmovlps         [p_dst + 4 * i_dststride - 8], xmm6
4389    add             p_dst, i_dststride
4390    vpslld          xmm6, xmm0, 16
4391    vmovd           [p_dst - 4], xmm6
4392    vmovlps         [p_dst + 4 * i_dststride - 8], xmm6
4393    add             p_dst, i_dststride
4394    vpslld          xmm6, xmm0, 8
4395    vmovd           [p_dst - 4], xmm6
4396    vmovd           [p_dst + i_dststride - 4], xmm0
4397    lea             p_dst, [p_dst + 4 * i_dststride]
4398    vmovlps         [p_dst - 8], xmm6
4399    vmovlps         [p_dst + i_dststride - 8], xmm0
4400    lea             p_dst, [p_dst + 2 * i_dststride]
4401    sub             i_height, 8
4402    jle             .height_loop_unalign_exit
; More rows to go: rebuild xmm0 as the window for the next 8 rows from xmm5
; plus three newly loaded samples, then iterate.
4403    vmovd           xmm1, [p_src + 2 * i_srcstride]
4404    vpalignr        xmm1, xmm1, xmm5, 2
4405    vmovd           xmm0, [p_src + i_srcstride3]
4406    lea             p_src, [p_src + 4 * i_srcstride]
4407    vpunpcklwd      xmm0, xmm0, [p_src]
4408    vpalignr        xmm0, xmm0, xmm1, 4
4409    jmp             .height_loop_unalign
4410.height_loop_unalign_exit:
; One more column output is produced after the loop and written with an
; 8-byte store ending at p_dst - 1.
4411    vpbroadcastq    xmm6, [p_src + 2 * i_srcstride - 6]
4412    AVX2_FilterVerticalw_16px xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
4413    vpackuswb       xmm1, xmm1, xmm1
4414    vmovlps         [p_dst - 8], xmm1
; Restore the original pointers and biased height for the 16-wide pass.
4415    pop             p_dst
4416    pop             p_src
4417    pop             i_height
4418    %assign push_num push_num - 3
4419.align_begin:
; Prime the pipeline with source rows 0..4; p_src ends up at row 4.
4420    vmovdqa         ymm0, [p_src]
4421    vmovdqa         ymm1, [p_src + i_srcstride]
4422    vmovdqa         ymm2, [p_src + 2 * i_srcstride]
4423    vmovdqa         ymm3, [p_src + i_srcstride3]
4424    lea             p_src, [p_src + 4 * i_srcstride]
4425    vmovdqa         ymm4, [p_src]
4426.height_loop:
; Output rows 0 and 1.
4427    vmovdqa         ymm5, [p_src + i_srcstride]
4428    AVX2_FilterVerticalw_16px ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6
4429    vmovdqa         ymm6, [p_src + 2 * i_srcstride]
4430    AVX2_FilterVerticalw_16px ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7
4431    vmovdqa         ymm7, [p_src + i_srcstride3]
4432    lea             p_src, [p_src + 4 * i_srcstride]
; vpackuswb packs per lane, interleaving the two rows by qword; vpermq with
; 11011000b reorders the qwords to (0,2,1,3) so the low 128 bits hold the
; first row and the high 128 bits the second row.
4433    vpackuswb       ymm0, ymm0, ymm1
4434    vpermq          ymm0, ymm0, 11011000b
4435    vmovdqa         [p_dst], xmm0
4436    vextracti128    [p_dst + i_dststride], ymm0, 1
4437    lea             p_dst, [p_dst + 2 * i_dststride]
; Output rows 2 and 3.
4438    AVX2_FilterVerticalw_16px ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, ymm0
4439    vmovdqa         ymm0, [p_src]
4440    AVX2_FilterVerticalw_16px ymm3, ymm4, ymm5, ymm6, ymm7, ymm0, ymm1
4441    vpackuswb       ymm2, ymm2, ymm3
4442    vpermq          ymm2, ymm2, 11011000b
4443    vmovdqa         [p_dst], xmm2
4444    vextracti128    [p_dst + i_dststride], ymm2, 1
4445    lea             p_dst, [p_dst + 2 * i_dststride]
; Output rows 4 and 5.
4446    vmovdqa         ymm1, [p_src + i_srcstride]
4447    AVX2_FilterVerticalw_16px ymm4, ymm5, ymm6, ymm7, ymm0, ymm1, ymm2
4448    vmovdqa         ymm2, [p_src + 2 * i_srcstride]
4449    AVX2_FilterVerticalw_16px ymm5, ymm6, ymm7, ymm0, ymm1, ymm2, ymm3
4450    vmovdqa         ymm3, [p_src + i_srcstride3]
4451    lea             p_src, [p_src + 4 * i_srcstride]
4452    vpackuswb       ymm4, ymm4, ymm5
4453    vpermq          ymm4, ymm4, 11011000b
4454    vmovdqa        [p_dst], xmm4
4455    vextracti128   [p_dst + i_dststride], ymm4, 1
4456    lea             p_dst, [p_dst + 2 * i_dststride]
; Output rows 6 and 7; afterwards ymm0..ymm4 again hold the five most recent
; source rows, matching the state expected at .height_loop.
4457    AVX2_FilterVerticalw_16px ymm6, ymm7, ymm0, ymm1, ymm2, ymm3, ymm4
4458    vmovdqa         ymm4, [p_src]
4459    AVX2_FilterVerticalw_16px ymm7, ymm0, ymm1, ymm2, ymm3, ymm4, ymm5
4460    vpackuswb       ymm6, ymm6, ymm7
4461    vpermq          ymm6, ymm6, 11011000b
4462    vmovdqa         [p_dst], xmm6
4463    vextracti128    [p_dst + i_dststride], ymm6, 1
4464    lea             p_dst, [p_dst + 2 * i_dststride]
; Biased count: > 0 -> another 8 rows, < 0 -> finished, = 0 -> one row left.
4465    sub             i_height, 8
4466    jg              .height_loop
4467    jl              .done
; Exactly one row remains.
4468    vmovdqa         ymm5, [p_src + i_srcstride]
4469    AVX2_FilterVerticalw_16px ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6
4470    vpackuswb       ymm0, ymm0, ymm0
4471    vpermq          ymm0, ymm0, 11011000b
4472    vmovdqa         [p_dst], xmm0
4473.done:
; Clear the upper ymm halves before returning to the caller.
4474    vzeroupper
4475    DEINIT_X86_32_PIC
4476    POP_XMM
4477    LOAD_6_PARA_POP
4478%ifdef X86_32
4479    pop             r6
4480%endif
4481    ret
4482%undef p_src
4483%undef i_srcstride
4484%undef p_dst
4485%undef i_dststride
4486%undef i_width
4487%undef i_height
4488%undef i_srcstride3
4489
4490%endif ; HAVE_AVX2
4491