;*!
;* \copy
;*     Copyright (c)  2009-2013, Cisco Systems
;*     All rights reserved.
;*
;*     Redistribution and use in source and binary forms, with or without
;*     modification, are permitted provided that the following conditions
;*     are met:
;*
;*        * Redistributions of source code must retain the above copyright
;*          notice, this list of conditions and the following disclaimer.
;*
;*        * Redistributions in binary form must reproduce the above copyright
;*          notice, this list of conditions and the following disclaimer in
;*          the documentation and/or other materials provided with the
;*          distribution.
;*
;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
;*     POSSIBILITY OF SUCH DAMAGE.
;*
;*
;*  upsampling.asm
;*
;*  Abstract
;*      SIMD for pixel domain down sampling
;*      (NOTE(review): file name says "upsampling" but all visible routines
;*       downsample - confirm against build system before renaming)
;*
;*  History
;*      10/22/2009  Created
;*
;*************************************************************************/
; Shared project macros: WELS_EXTERN, LOAD_6_PARA, SIGN_EXTENSION,
; PUSH_XMM/POP_XMM, WELS_DB1, WELS_Zero, WELSEMMS, register aliases r0-r6.
%include "asm_inc.asm"

%ifdef __NASM_VER__
    %use smartalign          ; NASM only: pad ALIGN with efficient multi-byte NOPs
%endif
;***********************************************************************
; Macros and other preprocessor constants
;***********************************************************************


;***********************************************************************
; Some constants
;***********************************************************************

;***********************************************************************
; Local Data (Read Only)
;***********************************************************************

; For 32-bit PIC builds the constants live in .text so they can be reached
; without a separate data segment reference; otherwise plain .rodata.
%ifdef X86_32_PICASM
SECTION .text align=32
%else
SECTION .rodata align=32
%endif

;***********************************************************************
; Various memory constants (trigonometric values or rounding values)
;***********************************************************************

; In pshufb masks below, byte 80h has the high bit set, which makes pshufb
; write zero into that destination lane.
; NOTE(review): the three constants inside this %ifndef block are not
; referenced in this chunk - presumably used by other routines in the file.
ALIGN 32
%ifndef X86_32_PICASM
db80h_256:
    times 32 db 80h
shufb_0000000088888888:
    times 8 db 0
    times 8 db 8
shufb_000044448888CCCC:
    times 4 db 0
    times 4 db 4
    times 4 db 8
    times 4 db 12
%endif
shufb_mask_low:                 ; even source bytes -> low byte of each word, high byte zeroed
    db 00h, 80h, 02h, 80h, 04h, 80h, 06h, 80h, 08h, 80h, 0ah, 80h, 0ch, 80h, 0eh, 80h
shufb_mask_high:                ; odd source bytes -> low byte of each word, high byte zeroed
    db 01h, 80h, 03h, 80h, 05h, 80h, 07h, 80h, 09h, 80h, 0bh, 80h, 0dh, 80h, 0fh, 80h
add_extra_half:
    dd 16384,0,0,0

shufb_mask_quarter:             ; keep every 4th byte (two phases), zero the rest
db 00h, 04h, 08h, 0ch, 80h, 80h, 80h, 80h, 01h, 05h, 09h, 0dh, 80h, 80h, 80h, 80h

; 1/3 decimation masks: each trio selects every 3rd byte into a distinct
; 16-byte region so three shuffled registers can be OR-combined.
shufb_mask_onethird_low_1:
db 00h, 03h, 06h, 09h, 0ch, 0fh, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h
shufb_mask_onethird_low_2:
db 80h, 80h, 80h, 80h, 80h, 80h, 02h, 05h, 08h, 0bh, 0eh, 80h, 80h, 80h, 80h, 80h
shufb_mask_onethird_low_3:
db 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 01h, 04h, 07h, 0ah, 0dh

shufb_mask_onethird_high_1:
db 01h, 04h, 07h, 0ah, 0dh, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h
shufb_mask_onethird_high_2:
db 80h, 80h, 80h, 80h, 80h, 00h, 03h, 06h, 09h, 0ch, 0fh, 80h, 80h, 80h, 80h, 80h
shufb_mask_onethird_high_3:
db 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 02h, 05h, 08h, 0bh, 0eh

;***********************************************************************
; Code
;***********************************************************************

SECTION .text

;***********************************************************************
;   void DyadicBilinearDownsamplerWidthx32_sse( unsigned char* pDst, const int iDstStride,
;                   unsigned char* pSrc, const int iSrcStride,
;                   const int iSrcWidth, const int iSrcHeight );
;
;   2:1 dyadic downsample: each output pixel is the rounded average of a
;   2x2 source block (pavgb twice => (a+b+1)>>1 then again with the other
;   row pair). Processes 32 source bytes (16 output bytes) per inner loop,
;   so iSrcWidth is assumed to be a multiple of 32 - TODO confirm caller
;   guarantees this. MMX implementation; WELSEMMS clears MMX state on exit.
;   After LOAD_6_PARA: r0=pDst r1=iDstStride r2=pSrc r3=iSrcStride
;                      r4=iSrcWidth r5=iSrcHeight
;***********************************************************************
WELS_EXTERN DyadicBilinearDownsamplerWidthx32_sse
%ifdef X86_32
    push r6                ; r6 is callee-saved on x86_32, used as scratch below
    %assign push_num 1
%else
    %assign push_num 0
%endif
    LOAD_6_PARA
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r4, r4d
    SIGN_EXTENSION r5, r5d

%ifndef X86_32
    push r12
    mov r12, r4            ; preserve iSrcWidth across rows (r4 is reused as loop counter)
%endif
    sar r5, $01            ; iSrcHeight >> 1 = number of output rows

.yloops1:
%ifdef X86_32
    mov r4, arg5           ; reload iSrcWidth from the stack each row
%else
    mov r4, r12
%endif
    sar r4, $01            ; iSrcWidth >> 1 = iDstWidth
    mov r6, r4             ; r6 = iDstWidth (kept to rewind pointers after the row)
    sar r4, $04            ; (iSrcWidth >> 1) / 16     ; loop count = num_of_mb
    neg r6                 ; - iDstWidth
    ; each loop = source bandwidth: 32 bytes
.xloops1:
    ; 1st part horizontal loop: x16 bytes
    ;               mem  hi<-       ->lo
    ;1st Line Src:  mm0: d D c C b B a A    mm1: h H g G f F e E
    ;2nd Line Src:  mm2: l L k K j J i I    mm3: p P o O n N m M
    ;=> target:
    ;: H G F E D C B A, P O N M L K J I
    ;: h g f e d c b a, p o n m l k j i
    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    movq mm0, [r2]         ; 1st pSrc line
    movq mm1, [r2+8]       ; 1st pSrc line + 8
    movq mm2, [r2+r3]      ; 2nd pSrc line
    movq mm3, [r2+r3+8]    ; 2nd pSrc line + 8

    ; de-interleave even/odd pixels of mm0..mm3 (word shuffles + byte unpack)
    pshufw mm4, mm0, 0d8h   ; d D b B c C a A ; 11011000 B
    pshufw mm5, mm4, 04eh   ; c C a A d D b B ; 01001110 B
    punpcklbw mm4, mm5      ; d c D C b a B A
    pshufw mm4, mm4, 0d8h   ; d c b a D C B A ; 11011000 B: mm4

    pshufw mm5, mm1, 0d8h   ; h H f F g G e E ; 11011000 B
    pshufw mm6, mm5, 04eh   ; g G e E h H f F ; 01001110 B
    punpcklbw mm5, mm6      ; h g H G f e F E
    pshufw mm5, mm5, 0d8h   ; h g f e H G F E ; 11011000 B: mm5

    pshufw mm6, mm2, 0d8h   ; l L j J k K i I ; 11011000 B
    pshufw mm7, mm6, 04eh   ; k K i I l L j J ; 01001110 B
    punpcklbw mm6, mm7      ; l k L K j i J I
    pshufw mm6, mm6, 0d8h   ; l k j i L K J I ; 11011000 B: mm6

    pshufw mm7, mm3, 0d8h   ; p P n N o O m M ; 11011000 B
    pshufw mm0, mm7, 04eh   ; o O m M p P n N ; 01001110 B
    punpcklbw mm7, mm0      ; p o P O n m N M
    pshufw mm7, mm7, 0d8h   ; p o n m P O N M ; 11011000 B: mm7

    ; gather even pixels vs odd pixels of both rows
    movq mm0, mm4       ;
    punpckldq mm0, mm5  ; H G F E D C B A
    punpckhdq mm4, mm5  ; h g f e d c b a

    movq mm1, mm6
    punpckldq mm1, mm7  ; P O N M L K J I
    punpckhdq mm6, mm7  ; p o n m l k j i

    ; avg within MB horizontal width (16 x 2 lines)
    pavgb mm0, mm4      ; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1
    pavgb mm1, mm6      ; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2
    pavgb mm0, mm1      ; (temp_row1+temp_row2+1)>>1, pending here until the 2nd half is done so both halves are written together

    ; 2nd part horizontal loop: x16 bytes (same scheme on bytes 16..31)
    ;               mem  hi<-       ->lo
    ;1st Line Src:  mm0: d D c C b B a A    mm1: h H g G f F e E
    ;2nd Line Src:  mm2: l L k K j J i I    mm3: p P o O n N m M
    ;=> target:
    ;: H G F E D C B A, P O N M L K J I
    ;: h g f e d c b a, p o n m l k j i
    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    movq mm1, [r2+16]      ; 1st pSrc line + 16
    movq mm2, [r2+24]      ; 1st pSrc line + 24
    movq mm3, [r2+r3+16]   ; 2nd pSrc line + 16
    movq mm4, [r2+r3+24]   ; 2nd pSrc line + 24

    ; de-interleave even/odd pixels of mm1..mm4
    pshufw mm5, mm1, 0d8h   ; d D b B c C a A ; 11011000 B
    pshufw mm6, mm5, 04eh   ; c C a A d D b B ; 01001110 B
    punpcklbw mm5, mm6      ; d c D C b a B A
    pshufw mm5, mm5, 0d8h   ; d c b a D C B A ; 11011000 B: mm5

    pshufw mm6, mm2, 0d8h   ; h H f F g G e E ; 11011000 B
    pshufw mm7, mm6, 04eh   ; g G e E h H f F ; 01001110 B
    punpcklbw mm6, mm7      ; h g H G f e F E
    pshufw mm6, mm6, 0d8h   ; h g f e H G F E ; 11011000 B: mm6

    pshufw mm7, mm3, 0d8h   ; l L j J k K i I ; 11011000 B
    pshufw mm1, mm7, 04eh   ; k K i I l L j J ; 01001110 B
    punpcklbw mm7, mm1      ; l k L K j i J I
    pshufw mm7, mm7, 0d8h   ; l k j i L K J I ; 11011000 B: mm7

    pshufw mm1, mm4, 0d8h   ; p P n N o O m M ; 11011000 B
    pshufw mm2, mm1, 04eh   ; o O m M p P n N ; 01001110 B
    punpcklbw mm1, mm2      ; p o P O n m N M
    pshufw mm1, mm1, 0d8h   ; p o n m P O N M ; 11011000 B: mm1

    ; gather even pixels vs odd pixels of both rows
    movq mm2, mm5
    punpckldq mm2, mm6  ; H G F E D C B A
    punpckhdq mm5, mm6  ; h g f e d c b a

    movq mm3, mm7
    punpckldq mm3, mm1  ; P O N M L K J I
    punpckhdq mm7, mm1  ; p o n m l k j i

    ; avg within MB horizontal width (16 x 2 lines)
    pavgb mm2, mm5      ; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1
    pavgb mm3, mm7      ; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2
    pavgb mm2, mm3      ; (temp_row1+temp_row2+1)>>1, done in another 2nd horizontal part

    movq [r0  ], mm0    ; write 16 output bytes for this 32-byte source span
    movq [r0+8], mm2

    ; next SMB
    lea r2, [r2+32]
    lea r0, [r0+16]

    dec r4
    jg near .xloops1

    ; next line
    lea r2, [r2+2*r3]    ; next end of lines
    lea r2, [r2+2*r6]    ; reset to base 0 [- 2 * iDstWidth]
    lea r0, [r0+r1]
    lea r0, [r0+r6]      ; reset to base 0 [- iDstWidth]

    dec r5
    jg near .yloops1

    WELSEMMS             ; leave MMX state clean for subsequent x87/FPU code
%ifndef X86_32
    pop r12
%endif
    LOAD_6_PARA_POP
%ifdef X86_32
    pop r6
%endif
    ret

;***********************************************************************
;   void DyadicBilinearDownsamplerWidthx16_sse( unsigned char* pDst, const int iDstStride,
;                     unsigned char* pSrc, const int iSrcStride,
;                     const int iSrcWidth, const int iSrcHeight );
;
;   Same 2:1 dyadic downsample as the Widthx32 variant, but consumes only
;   16 source bytes (8 output bytes) per inner iteration, so iSrcWidth is
;   assumed to be a multiple of 16 - TODO confirm caller guarantees this.
;   MMX implementation; WELSEMMS clears MMX state on exit.
;   After LOAD_6_PARA: r0=pDst r1=iDstStride r2=pSrc r3=iSrcStride
;                      r4=iSrcWidth r5=iSrcHeight
;***********************************************************************
WELS_EXTERN DyadicBilinearDownsamplerWidthx16_sse
%ifdef X86_32
    push r6
    %assign push_num 1
%else
    %assign push_num 0
%endif
    LOAD_6_PARA
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r4, r4d
    SIGN_EXTENSION r5, r5d

%ifndef X86_32
    push r12
    mov r12, r4            ; preserve iSrcWidth across rows
%endif
    sar r5, $01            ; iSrcHeight >> 1 = number of output rows

.yloops2:
%ifdef X86_32
    mov r4, arg5
%else
    mov r4, r12
%endif
    sar r4, $01            ; iSrcWidth >> 1 = iDstWidth
    mov r6, r4             ; r6 = iDstWidth (kept to rewind pointers after the row)
    sar r4, $03            ; (iSrcWidth >> 1) / 8     ; loop count = num_of_mb
    neg r6                 ; - iDstWidth
    ; each loop = source bandwidth: 16 bytes
.xloops2:
    ; 1st part horizontal loop: x16 bytes
    ;               mem  hi<-       ->lo
    ;1st Line Src:  mm0: d D c C b B a A    mm1: h H g G f F e E
    ;2nd Line Src:  mm2: l L k K j J i I    mm3: p P o O n N m M
    ;=> target:
    ;: H G F E D C B A, P O N M L K J I
    ;: h g f e d c b a, p o n m l k j i
    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    movq mm0, [r2]         ; 1st pSrc line
    movq mm1, [r2+8]       ; 1st pSrc line + 8
    movq mm2, [r2+r3]      ; 2nd pSrc line
    movq mm3, [r2+r3+8]    ; 2nd pSrc line + 8

    ; de-interleave even/odd pixels of mm0..mm3
    pshufw mm4, mm0, 0d8h   ; d D b B c C a A ; 11011000 B
    pshufw mm5, mm4, 04eh   ; c C a A d D b B ; 01001110 B
    punpcklbw mm4, mm5      ; d c D C b a B A
    pshufw mm4, mm4, 0d8h   ; d c b a D C B A ; 11011000 B: mm4

    pshufw mm5, mm1, 0d8h   ; h H f F g G e E ; 11011000 B
    pshufw mm6, mm5, 04eh   ; g G e E h H f F ; 01001110 B
    punpcklbw mm5, mm6      ; h g H G f e F E
    pshufw mm5, mm5, 0d8h   ; h g f e H G F E ; 11011000 B: mm5

    pshufw mm6, mm2, 0d8h   ; l L j J k K i I ; 11011000 B
    pshufw mm7, mm6, 04eh   ; k K i I l L j J ; 01001110 B
    punpcklbw mm6, mm7      ; l k L K j i J I
    pshufw mm6, mm6, 0d8h   ; l k j i L K J I ; 11011000 B: mm6

    pshufw mm7, mm3, 0d8h   ; p P n N o O m M ; 11011000 B
    pshufw mm0, mm7, 04eh   ; o O m M p P n N ; 01001110 B
    punpcklbw mm7, mm0      ; p o P O n m N M
    pshufw mm7, mm7, 0d8h   ; p o n m P O N M ; 11011000 B: mm7

    ; gather even pixels vs odd pixels of both rows
    movq mm0, mm4       ;
    punpckldq mm0, mm5  ; H G F E D C B A
    punpckhdq mm4, mm5  ; h g f e d c b a

    movq mm1, mm6
    punpckldq mm1, mm7  ; P O N M L K J I
    punpckhdq mm6, mm7  ; p o n m l k j i

    ; avg within MB horizontal width (16 x 2 lines)
    pavgb mm0, mm4      ; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1
    pavgb mm1, mm6      ; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2
    pavgb mm0, mm1      ; (temp_row1+temp_row2+1)>>1

    movq [r0  ], mm0    ; write 8 output bytes

    ; next SMB
    lea r2, [r2+16]
    lea r0, [r0+8]

    dec r4
    jg near .xloops2

    ; next line
    lea r2, [r2+2*r3]    ; next end of lines
    lea r2, [r2+2*r6]    ; reset to base 0 [- 2 * iDstWidth]
    lea r0, [r0+r1]
    lea r0, [r0+r6]      ; reset to base 0 [- iDstWidth]

    dec r5
    jg near .yloops2

    WELSEMMS             ; leave MMX state clean
%ifndef X86_32
    pop r12
%endif
    LOAD_6_PARA_POP
%ifdef X86_32
    pop r6
%endif
    ret

;***********************************************************************
;   void DyadicBilinearDownsamplerWidthx8_sse( unsigned char* pDst, const int iDstStride,
;                     unsigned char* pSrc, const int iSrcStride,
;                     const int iSrcWidth, const int iSrcHeight );
;
;   Same 2:1 dyadic downsample, smallest granularity: 8 source bytes
;   (4 output bytes) per inner iteration; iSrcWidth assumed to be a
;   multiple of 8 - TODO confirm caller guarantees this.
;   MMX implementation; WELSEMMS clears MMX state on exit.
;   After LOAD_6_PARA: r0=pDst r1=iDstStride r2=pSrc r3=iSrcStride
;                      r4=iSrcWidth r5=iSrcHeight
;***********************************************************************
WELS_EXTERN DyadicBilinearDownsamplerWidthx8_sse
%ifdef X86_32
    push r6
    %assign push_num 1
%else
    %assign push_num 0
%endif
    LOAD_6_PARA
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r4, r4d
    SIGN_EXTENSION r5, r5d

%ifndef X86_32
    push r12
    mov r12, r4            ; preserve iSrcWidth across rows
%endif
    sar r5, $01            ; iSrcHeight >> 1 = number of output rows

.yloops3:
%ifdef X86_32
    mov r4, arg5
%else
    mov r4, r12
%endif
    sar r4, $01            ; iSrcWidth >> 1 = iDstWidth
    mov r6, r4             ; r6 = iDstWidth (kept to rewind pointers after the row)
    sar r4, $02            ; (iSrcWidth >> 1) / 4     ; loop count = num_of_mb
    neg r6                 ; - iDstWidth
    ; each loop = source bandwidth: 8 bytes
.xloops3:
    ; 1st part horizontal loop: x8 bytes
    ;               mem  hi<-       ->lo
    ;1st Line Src:  mm0: d D c C b B a A
    ;2nd Line Src:  mm1: h H g G f F e E
    ;=> target:
    ;: H G F E D C B A
    ;: h g f e d c b a
    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    movq mm0, [r2]         ; 1st pSrc line
    movq mm1, [r2+r3]      ; 2nd pSrc line

    ; de-interleave even/odd pixels of mm0, mm1
    pshufw mm2, mm0, 0d8h   ; d D b B c C a A ; 11011000 B
    pshufw mm3, mm2, 04eh   ; c C a A d D b B ; 01001110 B
    punpcklbw mm2, mm3      ; d c D C b a B A
    pshufw mm2, mm2, 0d8h   ; d c b a D C B A ; 11011000 B: mm2

    pshufw mm4, mm1, 0d8h   ; h H f F g G e E ; 11011000 B
    pshufw mm5, mm4, 04eh   ; g G e E h H f F ; 01001110 B
    punpcklbw mm4, mm5      ; h g H G f e F E
    pshufw mm4, mm4, 0d8h   ; h g f e H G F E ; 11011000 B: mm4

    ; gather even pixels vs odd pixels
    movq mm0, mm2       ;
    punpckldq mm0, mm4  ; H G F E D C B A
    punpckhdq mm2, mm4  ; h g f e d c b a

    ; avg within MB horizontal width (8 x 2 lines)
    pavgb mm0, mm2          ; (H+h+1)>>1, .., (A+a+1)>>1: horizontal pairs of both rows
    pshufw mm1, mm0, 04eh   ; 01001110 B: swap halves so row1 avgs align with row2 avgs
    pavgb mm0, mm1          ; (temp_row1+temp_row2+1)>>1; low dword holds the result

    movd [r0], mm0          ; write 4 output bytes

    ; next unit
    lea r2, [r2+8]
    lea r0, [r0+4]

    dec r4
    jg near .xloops3

    ; next line
    lea r2, [r2+2*r3]    ; next end of lines
    lea r2, [r2+2*r6]    ; reset to base 0 [- 2 * iDstWidth]
    lea r0, [r0+r1]
    lea r0, [r0+r6]      ; reset to base 0 [- iDstWidth]

    dec r5
    jg near .yloops3

    WELSEMMS             ; leave MMX state clean
%ifndef X86_32
    pop r12
%endif
    LOAD_6_PARA_POP
%ifdef X86_32
    pop r6
%endif
    ret



;***********************************************************************
;   void DyadicBilinearDownsamplerWidthx32_ssse3(   unsigned char* pDst, const int iDstStride,
;                   unsigned char* pSrc, const int iSrcStride,
;                   const int iSrcWidth, const int iSrcHeight );
;
;   SSSE3 variant of the 2:1 dyadic downsample: vertical pavgb, then
;   pmaddubsw with an all-ones vector (WELS_DB1) sums horizontal pairs,
;   and pavgw against zero gives the rounded >>1. 32 source bytes per
;   iteration; movdqa loads assume 16-byte-aligned pSrc and iSrcStride -
;   TODO confirm caller alignment. Destination is written back-to-front
;   via a negative index (r0 points to end of row, r4 runs from -width to 0).
;   After LOAD_6_PARA: r0=pDst r1=iDstStride r2=pSrc r3=iSrcStride
;                      r4=iSrcWidth r5=iSrcHeight
;***********************************************************************
WELS_EXTERN DyadicBilinearDownsamplerWidthx32_ssse3
%ifdef X86_32
    push r6
    %assign push_num 1
%else
    %assign push_num 0
%endif
    LOAD_6_PARA
    PUSH_XMM 4
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r4, r4d
    SIGN_EXTENSION r5, r5d

%ifndef X86_32
    push r12
    mov r12, r4            ; preserve iSrcWidth across rows
%endif
    sar r5, $01            ; iSrcHeight >> 1 = number of output rows

    WELS_DB1 xmm3          ; xmm3 = 16 x 0x01 (pmaddubsw multiplier => pair sums)
    WELS_Zero xmm2         ; xmm2 = 0 (pavgw rounding operand)
    sar r4, $01            ; iSrcWidth >> 1
    add r0, r4             ; pDst += iDstWidth: r0 now points past the row end

.yloops4:
%ifdef X86_32
    mov r4, arg5
%else
    mov r4, r12
%endif
    sar r4, $01            ; iSrcWidth >> 1
    neg r4                 ; -(iSrcWidth >> 1): negative running dst index
    mov r6, r4             ; keep -iDstWidth to rewind pSrc after the row
    align 16
    ; each loop = source bandwidth: 32 bytes
.xloops4:
    movdqa xmm0, [r2+r3]
    movdqa xmm1, [r2+r3+16]
    pavgb  xmm0, [r2]          ; avg vertical pixels 0-15
    pavgb  xmm1, [r2+16]       ; avg vertical pixels 16-31
    add r2, 32                 ; pSrc += 32
    pmaddubsw xmm0, xmm3       ; pairwise horizontal sum neighboring pixels 0-15
    pmaddubsw xmm1, xmm3       ; pairwise horizontal sum neighboring pixels 16-31
    pavgw xmm0, xmm2           ; (sum + 1) >> 1
    pavgw xmm1, xmm2           ; (sum + 1) >> 1
    packuswb xmm0, xmm1        ; pack words to bytes
    movdqa [r0+r4], xmm0       ; store results
    add r4, 16
    jl .xloops4

    ; next line
    lea r2, [r2+2*r3]    ; next end of lines
    lea r2, [r2+2*r6]    ; reset to base 0 [- 2 * iDstWidth]
    lea r0, [r0+r1]

    sub r5, 1
    jg .yloops4

%ifndef X86_32
    pop r12
%endif

    POP_XMM
    LOAD_6_PARA_POP
%ifdef X86_32
    pop r6
%endif
    ret

;***********************************************************************
;   void DyadicBilinearDownsamplerWidthx16_ssse3( unsigned char* pDst, const int iDstStride,
;                     unsigned char* pSrc, const int iSrcStride,
;                     const int iSrcWidth, const int iSrcHeight );
;
;   SSSE3 2:1 dyadic downsample, 16 source bytes / 8 output bytes per
;   iteration. Both pSrc and pDst are advanced to their row ends up front
;   and indexed with a negative offset r4 (src index doubles via 2*r4).
;   movdqa loads assume 16-byte-aligned pSrc/iSrcStride - TODO confirm.
;   After LOAD_6_PARA: r0=pDst r1=iDstStride r2=pSrc r3=iSrcStride
;                      r4=iSrcWidth r5=iSrcHeight
;***********************************************************************
WELS_EXTERN DyadicBilinearDownsamplerWidthx16_ssse3
%ifdef X86_32
    push r6
    %assign push_num 1
%else
    %assign push_num 0
%endif
    LOAD_6_PARA
    PUSH_XMM 4
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r4, r4d
    SIGN_EXTENSION r5, r5d

%ifndef X86_32
    push r12
    mov r12, r4            ; preserve iSrcWidth across rows
%endif
    sar r5, $01            ; iSrcHeight >> 1 = number of output rows
    WELS_DB1 xmm3          ; xmm3 = 16 x 0x01 (pmaddubsw multiplier => pair sums)
    WELS_Zero xmm2         ; xmm2 = 0 (pavgw rounding operand)
    add r2, r4             ; pSrc += iSrcWidth: point past row end
    sar r4, $01            ; iSrcWidth >> 1
    add r0, r4             ; pDst += iDstWidth: point past row end

.yloops5:
%ifdef X86_32
    mov r4, arg5
%else
    mov r4, r12
%endif
    sar r4, $01            ; iSrcWidth >> 1
    neg r4                 ; -(iSrcWidth >> 1): negative running dst index
    lea r6, [r2+r3]        ; r6 = second source line (pSrc + iSrcStride)
    align 16
    ; each loop = source bandwidth: 16 bytes
.xloops5:
    movdqa xmm0, [r2+2*r4]
    pavgb  xmm0, [r6+2*r4]     ; avg vertical pixels
    pmaddubsw xmm0, xmm3       ; pairwise horizontal sum neighboring pixels
    pavgw xmm0, xmm2           ; (sum + 1) >> 1
    packuswb xmm0, xmm0        ; pack words to bytes (low 8 bytes valid)
    movlps [r0+r4], xmm0       ; store results
    add r4, 8
    jl .xloops5

    ; next line
    lea r2, [r2+2*r3]    ; skip the two consumed source lines
    lea r0, [r0+r1]

    sub r5, 1
    jg .yloops5

%ifndef X86_32
    pop r12
%endif

    POP_XMM
    LOAD_6_PARA_POP
%ifdef X86_32
    pop r6
%endif
    ret


; x86_32-only section below. NOTE(review): the matching %endif is past the
; end of this chunk.
%ifdef X86_32
;**************************************************************************************************************
;int GeneralBilinearAccurateDownsampler_sse2(   unsigned char* pDst, const int iDstStride, const int iDstWidth, const int iDstHeight,
;                           unsigned char* pSrc, const int iSrcStride,
;                           unsigned int uiScaleX, unsigned int uiScaleY );
;{
;   Arbitrary-ratio bilinear downsampler, cdecl x86_32.
;   uiScaleX/uiScaleY are fixed-point source-step increments; the low 15
;   bits are the fractional weight (values masked with 32767, positions
;   advanced and taken with >>15, i.e. 17.15 fixed point).
;   The last column and last row are handled separately by nearest
;   (point) sampling since they have no right/bottom neighbor.
;   Stack locals: tmpHeight, yInverse, xInverse, dstStep.
;**************************************************************************************************************

WELS_EXTERN GeneralBilinearAccurateDownsampler_sse2
    push    ebp
    push    esi
    push    edi
    push    ebx
%define     pushsize    16
%define     localsize   16
%define     pDstData        esp + pushsize + localsize + 4
%define     dwDstStride     esp + pushsize + localsize + 8
%define     dwDstWidth      esp + pushsize + localsize + 12
%define     dwDstHeight     esp + pushsize + localsize + 16
%define     pSrcData        esp + pushsize + localsize + 20
%define     dwSrcStride     esp + pushsize + localsize + 24
%define     uiScaleX            esp + pushsize + localsize + 28
%define     uiScaleY            esp + pushsize + localsize + 32
%define     tmpHeight       esp + 0
%define     yInverse        esp + 4
%define     xInverse        esp + 8
%define     dstStep         esp + 12
    sub     esp,            localsize

    ; Build per-pixel weight increment vectors from the scale fractions.
    pxor    xmm0,   xmm0
    mov     eax,    [uiScaleX]
    and     eax,    32767                   ; u fraction (15 bits)
    mov     ebx,    eax
    neg     ebx
    and     ebx,    32767                   ; (32768 - u) mod 32768
    movd    xmm1,       eax                     ; uinc(uiScaleX mod 32767)
    movd    xmm2,       ebx                     ; -uinc
    psllq   xmm1,       32
    por     xmm1,       xmm2                    ; 0 0  uinc  -uinc   (dword)
    pshufd  xmm7,       xmm1,   01000100b       ; xmm7: uinc -uinc uinc -uinc

    mov     eax,    [uiScaleY]
    and     eax,    32767
    mov     ebx,    eax
    neg     ebx
    and     ebx,    32767
    movd    xmm6,       eax                     ; vinc(uiScaleY mod 32767)
    movd    xmm2,       ebx                     ; -vinc
    psllq   xmm6,       32
    por     xmm6,       xmm2                    ; 0 0 vinc -vinc (dword)
    pshufd  xmm6,       xmm6,   01010000b       ; xmm6: vinc vinc -vinc -vinc

    mov     edx,        40003fffh               ; packed words 16384 | 16383
    movd    xmm5,       edx
    punpcklwd   xmm5,   xmm0                    ; 16384 16383
    pshufd  xmm5,       xmm5,   01000100b       ; xmm5: 16384 16383 16384 16383


DOWNSAMPLE:

    mov     eax,            [dwDstHeight]
    mov     edi,            [pDstData]
    mov     edx,            [dwDstStride]
    mov     ecx,            [dwDstWidth]
    sub     edx,            ecx
    mov     [dstStep],  edx             ; stride - width: dst advance at end of row
    dec     eax                         ; last row is done by LAST_ROW, not HEIGHT
    mov     [tmpHeight],    eax
    mov     eax,            16384
    mov     [yInverse],     eax         ; y source position, 17.15 fixed point

    pshufd  xmm4,       xmm5,   01010000b   ; initial v to 16384 16384 16383 16383

HEIGHT:
    mov     eax,    [yInverse]
    mov     esi,    [pSrcData]
    shr     eax,    15                  ; integer part of y
    mul     dword [dwSrcStride]
    add     esi,    eax                 ; get current row address
    mov     ebp,    esi
    add     ebp,    [dwSrcStride]       ; ebp = next source row

    mov     eax,        16384
    mov     [xInverse],     eax         ; x source position, 17.15 fixed point
    mov     ecx,            [dwDstWidth]
    dec     ecx                         ; last column done by WIDTH_END

    movdqa  xmm3,       xmm5            ; initial u to 16384 16383 16384 16383

WIDTH:
    mov     eax,        [xInverse]
    shr     eax,        15              ; integer part of x

    ; Load the 2x2 neighborhood a,b (row0) / c,d (row1) and widen to dwords.
    movd    xmm1,       [esi+eax]       ; xxxxxxba
    movd    xmm2,       [ebp+eax]       ; xxxxxxdc
    pxor    xmm0,       xmm0
    punpcklwd   xmm1,   xmm2            ; xxxxdcba
    punpcklbw   xmm1,   xmm0            ; 0d0c0b0a
    punpcklwd   xmm1,   xmm0            ; 000d000c000b000a

    ; 64-bit accurate weighted sum: weights v*(u,1-u) per pixel.
    movdqa  xmm2,   xmm4    ; xmm2:  vv(1-v)(1-v)  tmpv
    pmaddwd xmm2,   xmm3    ; mul u(1-u)u(1-u) on xmm2
    movdqa  xmm0,   xmm2
    pmuludq xmm2,   xmm1
    psrlq   xmm0,   32
    psrlq   xmm1,   32
    pmuludq xmm0,   xmm1
    paddq   xmm2,   xmm0
    pshufd  xmm1,   xmm2,   00001110b
    paddq   xmm2,   xmm1
    psrlq   xmm2,   29

    movd    eax,    xmm2
    inc     eax                         ; +1 then >>1: round to nearest
    shr     eax,    1
    mov     [edi],  al
    inc     edi

    mov     eax,        [uiScaleX]
    add     [xInverse], eax             ; advance source x position

    paddw   xmm3,       xmm7            ; inc u
    psllw   xmm3,       1               ; clear bit 15 so weights stay 15-bit
    psrlw   xmm3,       1

    loop    WIDTH

WIDTH_END:
    ; Last column: no right neighbor, copy nearest source pixel.
    mov     eax,        [xInverse]
    shr     eax,        15
    mov     cl,         [esi+eax]
    mov     [edi],      cl
    inc     edi

    mov     eax,        [uiScaleY]
    add     [yInverse], eax             ; advance source y position
    add     edi,        [dstStep]

    paddw   xmm4,   xmm6                ; inc v
    psllw   xmm4,   1                   ; clear bit 15 so weights stay 15-bit
    psrlw   xmm4,   1

    dec     dword [tmpHeight]
    jg      HEIGHT


LAST_ROW:
    ; Last row: no bottom neighbor, copy nearest source pixels.
    mov     eax,    [yInverse]
    mov     esi,    [pSrcData]
    shr     eax,    15
    mul     dword [dwSrcStride]
    add     esi,    eax                 ; get current row address

    mov     eax,        16384
    mov     [xInverse],     eax
    mov     ecx,            [dwDstWidth]

LAST_ROW_WIDTH:
    mov     eax,        [xInverse]
    shr     eax,        15

    mov     al,         [esi+eax]
    mov     [edi],  al
    inc     edi

    mov     eax,        [uiScaleX]
    add     [xInverse], eax

    loop    LAST_ROW_WIDTH

LAST_ROW_END:

    add     esp,            localsize
    pop     ebx
    pop     edi
    pop     esi
    pop     ebp
; NOTE(review): this %undef list does not exactly mirror the %define list
; above (e.g. dwSrcWidth/dwSrcHeight were never defined, dwDstStride is
; not undefined here). Harmless, but worth tidying upstream.
%undef      pushsize
%undef      localsize
%undef      pSrcData
%undef      dwSrcWidth
%undef      dwSrcHeight
%undef      dwSrcStride
%undef      pDstData
%undef      dwDstWidth
%undef      dwDstHeight
%undef      dwDstStride
%undef      uiScaleX
%undef      uiScaleY
%undef      tmpHeight
%undef      yInverse
%undef      xInverse
%undef      dstStep
    ret




;**************************************************************************************************************
;int GeneralBilinearFastDownsampler_sse2(   unsigned char* pDst, const int iDstStride, const int iDstWidth, const int iDstHeight,
;               unsigned char* pSrc, const int iSrcStride,
;               unsigned int uiScaleX, unsigned int uiScaleY );
;{
;   Fast (lower-precision) arbitrary-ratio bilinear downsampler, cdecl
;   x86_32. Unlike the accurate variant it keeps horizontal weights in
;   16-bit (uiScaleX fraction mod 65536, positions taken with >>16) and
;   uses pmulhuw/pmaddwd instead of 64-bit accumulation. Vertical stepping
;   still uses 15-bit fractions (>>15). Last column/row are handled by
;   nearest sampling.
;   NOTE(review): this function continues past the end of this chunk
;   (the closing ret and the %endif of the surrounding %ifdef X86_32 are
;   not visible here).
;**************************************************************************************************************

WELS_EXTERN GeneralBilinearFastDownsampler_sse2
    push    ebp
    push    esi
    push    edi
    push    ebx
%define     pushsize    16
%define     localsize   16
%define     pDstData        esp + pushsize + localsize + 4
%define     dwDstStride     esp + pushsize + localsize + 8
%define     dwDstWidth      esp + pushsize + localsize + 12
%define     dwDstHeight     esp + pushsize + localsize + 16
%define     pSrcData        esp + pushsize + localsize + 20
%define     dwSrcStride     esp + pushsize + localsize + 24
%define     uiScaleX            esp + pushsize + localsize + 28
%define     uiScaleY            esp + pushsize + localsize + 32
%define     tmpHeight       esp + 0
%define     yInverse        esp + 4
%define     xInverse        esp + 8
%define     dstStep         esp + 12
    sub     esp,            localsize

    ; Build per-pixel weight increment vectors from the scale fractions.
    pxor    xmm0,   xmm0
    mov     edx,    65535
    mov     eax,    [uiScaleX]
    and     eax,    edx                     ; u fraction (16 bits)
    mov     ebx,    eax
    neg     ebx
    and     ebx,    65535                   ; (65536 - u) mod 65536
    movd    xmm1,       eax                     ; uinc(uiScaleX mod 65536)
    movd    xmm2,       ebx                     ; -uinc
    psllq   xmm1,       32
    por     xmm1,       xmm2                    ; 0 uinc 0 -uinc
    pshuflw xmm7,       xmm1,   10001000b       ; xmm7: uinc -uinc uinc -uinc

    mov     eax,    [uiScaleY]
    and     eax,    32767
    mov     ebx,    eax
    neg     ebx
    and     ebx,    32767
    movd    xmm6,       eax                     ; vinc(uiScaleY mod 32767)
    movd    xmm2,       ebx                     ; -vinc
    psllq   xmm6,       32
    por     xmm6,       xmm2                    ; 0 vinc 0 -vinc
    pshuflw xmm6,       xmm6,   10100000b       ; xmm6: vinc vinc -vinc -vinc

    mov     edx,        80007fffh               ; packed words 32768 | 32767
    movd    xmm5,       edx
    pshuflw xmm5,       xmm5,       01000100b   ; 32768 32767 32768 32767
    mov     ebx,        16384                   ; rounding bias added before >>15


FAST_DOWNSAMPLE:

    mov     eax,            [dwDstHeight]
    mov     edi,            [pDstData]
    mov     edx,            [dwDstStride]
    mov     ecx,            [dwDstWidth]
    sub     edx,            ecx
    mov     [dstStep],  edx             ; stride - width: dst advance at end of row
    dec     eax                         ; last row is done by FAST_LAST_ROW
    mov     [tmpHeight],    eax
    mov     eax,        16384
    mov     [yInverse],     eax         ; y source position, 17.15 fixed point

    pshuflw xmm4,       xmm5,   01010000b
    psrlw   xmm4,       1               ; initial v to 16384 16384 16383 16383

FAST_HEIGHT:
    mov     eax,    [yInverse]
    mov     esi,    [pSrcData]
    shr     eax,    15                  ; integer part of y
    mul     dword [dwSrcStride]
    add     esi,    eax                 ; get current row address
    mov     ebp,    esi
    add     ebp,    [dwSrcStride]       ; ebp = next source row

    mov     eax,        32768
    mov     [xInverse],     eax         ; x source position, 16.16 fixed point
    mov     ecx,            [dwDstWidth]
    dec     ecx                         ; last column done by FAST_WIDTH_END

    movdqa  xmm3,       xmm5            ; initial u to 32768 32767 32768 32767

FAST_WIDTH:
    mov     eax,        [xInverse]
    shr     eax,        16              ; integer part of x

    ; Load the 2x2 neighborhood a,b (row0) / c,d (row1) and widen to words.
    movd    xmm1,       [esi+eax]       ; xxxxxxba
    movd    xmm2,       [ebp+eax]       ; xxxxxxdc
    punpcklwd   xmm1,   xmm2            ; xxxxdcba
    punpcklbw   xmm1,   xmm0            ; 0d0c0b0a

    movdqa  xmm2,   xmm4    ; xmm2:  vv(1-v)(1-v)  tmpv
    pmulhuw xmm2,   xmm3    ; mul u(1-u)u(1-u) on xmm2 (high 16 bits of product)
    pmaddwd     xmm2,   xmm1
    pshufd  xmm1,   xmm2,   00000001b
    paddd   xmm2,   xmm1    ; sum the two partial dword results
    movd    xmm1,   ebx
    paddd   xmm2,   xmm1    ; + 16384 rounding bias
    psrld   xmm2,   15

    packuswb    xmm2,   xmm0
    movd    eax,    xmm2
    mov     [edi],  al
    inc     edi

    mov     eax,        [uiScaleX]
    add     [xInverse], eax             ; advance source x position

    paddw   xmm3,       xmm7            ; inc u

    loop    FAST_WIDTH

FAST_WIDTH_END:
    ; Last column: no right neighbor, copy nearest source pixel.
    mov     eax,        [xInverse]
    shr     eax,        16
    mov     cl,         [esi+eax]
    mov     [edi],      cl
    inc     edi

    mov     eax,        [uiScaleY]
    add     [yInverse], eax             ; advance source y position
    add     edi,        [dstStep]

    paddw   xmm4,   xmm6                ; inc v
    psllw   xmm4,   1                   ; clear bit 15 so weights stay 15-bit
    psrlw   xmm4,   1

    dec     dword [tmpHeight]
    jg      FAST_HEIGHT


FAST_LAST_ROW:
    ; Last row: no bottom neighbor, copy nearest source pixels.
    mov     eax,    [yInverse]
    mov     esi,    [pSrcData]
    shr     eax,    15
    mul     dword [dwSrcStride]
    add     esi,    eax                 ; get current row address

    mov     eax,        32768
    mov     [xInverse],     eax
    mov     ecx,            [dwDstWidth]

FAST_LAST_ROW_WIDTH:
    mov     eax,        [xInverse]
    shr     eax,        16

    mov     al,         [esi+eax]
    mov     [edi],  al
    inc     edi

    mov     eax,        [uiScaleX]
    add     [xInverse], eax

    loop    FAST_LAST_ROW_WIDTH

FAST_LAST_ROW_END:

    add     esp,            localsize
    pop     ebx
    pop     edi
    pop     esi
    pop     ebp
; NOTE(review): %undef list does not exactly mirror the %define list above
; (dwSrcWidth/dwSrcHeight never defined; dwDstWidth/dwDstHeight not
; undefined here). Harmless, but worth tidying upstream.
%undef      pushsize
%undef      localsize
%undef      pSrcData
%undef      dwSrcWidth
%undef      dwSrcHeight
%undef      dwSrcStride
%undef      pDstData
%undef      dwDstStride
%undef      uiScaleX
%undef      uiScaleY
%undef      tmpHeight
%undef      yInverse
%undef      xInverse
%undef      dstStep
1011    ret
1012
1013%elifdef  WIN64
1014
1015;**************************************************************************************************************
1016;int GeneralBilinearAccurateDownsampler_sse2(   unsigned char* pDst, const int iDstStride, const int iDstWidth, const int iDstHeight,
1017;                           unsigned char* pSrc, const int iSrcStride,
1018;                           unsigned int uiScaleX, unsigned int uiScaleY );
1019;{
1020;**************************************************************************************************************
1021
;-----------------------------------------------------------------------
; Win64 variant of the accurate bilinear downscaler.
; Per LOAD_7_PARA: r0=pDst, r1=iDstStride, r2=iDstWidth, r3=iDstHeight,
; r4=pSrc, r5=iSrcStride, r6=uiScaleX; arg8 (stack) = uiScaleY.
; uiScaleX/uiScaleY are Q15 source-position steps per output pixel.
; Each output pixel blends a 2x2 source neighbourhood with Q15 u/v
; weights (Q30 products, rounded); the last output column and the last
; output row are point-sampled instead.
; rsi/rdi/rbx/rbp and xmm6+ are callee-saved on Win64, hence the pushes
; and PUSH_XMM 8.
;-----------------------------------------------------------------------
WELS_EXTERN GeneralBilinearAccurateDownsampler_sse2
    push    r12
    push    r13
    push    r14
    push    r15
    push    rsi
    push    rdi
    push    rbx
    push    rbp
    %assign push_num 8
    LOAD_7_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r2, r2d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r5, r5d
    SIGN_EXTENSION r6, r6d

    pxor    xmm0,   xmm0
    mov     r12d,   r6d
    and     r12d,   32767
    mov     r13d,   r12d
    neg     r13d
    and     r13d,   32767
    movd    xmm1,   r12d                     ; uinc = uiScaleX & 32767 (Q15 u step)
    movd    xmm2,   r13d                     ; -uinc (mod 32768)
    psllq   xmm1,   32
    por     xmm1,   xmm2                    ; 0 0  uinc  -uinc   (dword)
    pshufd  xmm7,   xmm1,   01000100b       ; xmm7: uinc -uinc uinc -uinc

    mov     r12,    arg8
    SIGN_EXTENSION r12, r12d
    mov     rbp,    r12                      ; rbp = uiScaleY (full Q15 y step)
    and     r12d,   32767
    mov     r13d,   r12d
    neg     r13d
    and     r13d,   32767
    movd    xmm6,       r12d                     ; vinc = uiScaleY & 32767 (Q15 v step)
    movd    xmm2,       r13d                     ; -vinc (mod 32768)
    psllq   xmm6,       32
    por     xmm6,       xmm2                    ; 0 0 vinc -vinc (dword)
    pshufd  xmm6,       xmm6,   01010000b       ; xmm6: vinc vinc -vinc -vinc

    mov     r12d,        40003fffh
    movd    xmm5,       r12d
    punpcklwd   xmm5,   xmm0                    ; 16384 16383 (one Q15 weight per dword, low word)
    pshufd  xmm5,       xmm5,   01000100b       ; xmm5: 16384 16383 16384 16383

DOWNSAMPLE:
    sub     r1, r2                   ; r1 = stride - width (dst row padding)
    dec     r3                       ; last row handled separately below
    mov     r14,16384                ; r14 = y accumulator, Q15, start at 0.5
    pshufd  xmm4,       xmm5,   01010000b   ; initial v to 16384 16384 16383 16383

HEIGHT:
    ;mov     r12, r4
    mov     r12, r14
    shr     r12,    15
    imul    r12,    r5
    add     r12,    r4                 ; r12 -> current source row
    mov     r13,    r12
    add     r13,    r5                 ; r13 -> next source row

    mov     r15, 16384                 ; r15 = x accumulator, Q15, start at 0.5
    mov     rsi, r2
    dec     rsi                        ; interpolate width-1 pixels; last column copied below
    movdqa  xmm3,       xmm5            ; initial u to 16384 16383 16384 16383

WIDTH:
    mov     rdi,        r15
    shr     rdi,        15             ; integer part of x accumulator

    movd    xmm1,       [r12+rdi]       ; xxxxxxba   (a,b: adjacent pixels of row0)
    movd    xmm2,       [r13+rdi]       ; xxxxxxdc   (c,d: adjacent pixels of row1)
    pxor    xmm0,       xmm0
    punpcklwd   xmm1,   xmm2            ; xxxxdcba
    punpcklbw   xmm1,   xmm0            ; 0d0c0b0a
    punpcklwd   xmm1,   xmm0            ; 000d000c000b000a (one pixel per dword)

    movdqa  xmm2,   xmm4    ; xmm2: v v (1-v) (1-v) weights (Q15, low word of each dword)
    pmaddwd xmm2,   xmm3    ; per-dword weight = u*v (Q15*Q15 = Q30; high words are 0)
    movdqa  xmm0,   xmm2
    pmuludq xmm2,   xmm1    ; 64-bit pixel*weight for dwords 0 and 2
    psrlq   xmm0,   32
    psrlq   xmm1,   32
    pmuludq xmm0,   xmm1    ; 64-bit pixel*weight for dwords 1 and 3
    paddq   xmm2,   xmm0
    pshufd  xmm1,   xmm2,   00001110b
    paddq   xmm2,   xmm1    ; fold the two qword partial sums
    psrlq   xmm2,   29      ; sum >> 29; +1 >> 1 below == round(sum / 2^30)

    movd    ebx,    xmm2
    inc     ebx
    shr     ebx,    1
    mov     [r0],   bl
    inc     r0

    add      r15, r6                    ; x += uiScaleX
    paddw   xmm3,       xmm7            ; inc u
    psllw   xmm3,       1
    psrlw   xmm3,       1               ; keep u weights in 15 bits (mod 32768)

    dec     rsi
    jg      WIDTH

WIDTH_END:
    shr     r15, 15                     ; last column of the row: point-sample
    mov     bl,  [r12+r15]
    mov     [r0],bl
    inc     r0
    add     r14, rbp                    ; y += uiScaleY
    add     r0,  r1                     ; skip dst row padding

    paddw   xmm4,   xmm6                ; inc v
    psllw   xmm4,   1
    psrlw   xmm4,   1                   ; keep v weights in 15 bits (mod 32768)

    dec     r3
    jg      HEIGHT

LAST_ROW:
    ; final output row: point-sample from the source row selected by the
    ; y accumulator.
    shr     r14, 15
    imul    r14, r5
    add     r4, r14
    mov     r15, 16384

LAST_ROW_WIDTH:
    mov     rdi, r15
    shr     rdi, 15
    mov     bl,  [r4+rdi]
    mov     [r0],bl
    inc     r0

    add     r15, r6                     ; x += uiScaleX
    dec     r2
    jg    LAST_ROW_WIDTH

LAST_ROW_END:

    POP_XMM
    pop     rbp
    pop     rbx
    pop     rdi
    pop     rsi
    pop     r15
    pop     r14
    pop     r13
    pop     r12
    ret
1171
1172;**************************************************************************************************************
1173;int GeneralBilinearFastDownsampler_sse2(   unsigned char* pDst, const int iDstStride, const int iDstWidth, const int iDstHeight,
1174;               unsigned char* pSrc, const int iSrcStride,
1175;               unsigned int uiScaleX, unsigned int uiScaleY );
1176;{
1177;**************************************************************************************************************
1178
;-----------------------------------------------------------------------
; Win64 variant of the fast (lower-precision) bilinear downscaler.
; Per LOAD_7_PARA: r0=pDst, r1=iDstStride, r2=iDstWidth, r3=iDstHeight,
; r4=pSrc, r5=iSrcStride, r6=uiScaleX; arg8 (stack) = uiScaleY.
; The x position advances in Q16 (uiScaleX), the y position in Q15
; (uiScaleY).  u weights are Q16 words, v weights Q15 words; pmulhuw
; folds them into a single Q15 weight per tap, and pmaddwd accumulates
; the weighted pixels.  Last column and last row are point-sampled.
; rsi/rdi/rbx/rbp and xmm6+ are callee-saved on Win64, hence the pushes
; and PUSH_XMM 8.
;-----------------------------------------------------------------------
WELS_EXTERN GeneralBilinearFastDownsampler_sse2
    push    r12
    push    r13
    push    r14
    push    r15
    push    rsi
    push    rdi
    push    rbx
    push    rbp
    %assign push_num 8
    LOAD_7_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r2, r2d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r5, r5d
    SIGN_EXTENSION r6, r6d

    pxor    xmm0,   xmm0
    mov     r12d,   r6d
    and     r12d,   65535
    mov     r13d,   r12d
    neg     r13d
    and     r13d,   65535
    movd    xmm1,   r12d                     ; uinc = uiScaleX & 65535 (Q16 u step)
    movd    xmm2,   r13d                     ; -uinc (mod 65536)
    psllq   xmm1,   32
    por     xmm1,   xmm2                    ; 0 uinc 0 -uinc
    pshuflw xmm7,   xmm1,   10001000b       ; xmm7: uinc -uinc uinc -uinc

    mov     r12,    arg8
    SIGN_EXTENSION r12, r12d
    mov     rbp,    r12                      ; rbp = uiScaleY (full Q15 y step)
    and     r12d,   32767
    mov     r13d,   r12d
    neg     r13d
    and     r13d,   32767
    movd    xmm6,       r12d                     ; vinc = uiScaleY & 32767 (Q15 v step)
    movd    xmm2,       r13d                     ; -vinc (mod 32768)
    psllq   xmm6,       32
    por     xmm6,       xmm2                    ; 0 vinc 0 -vinc
    pshuflw xmm6,       xmm6,   10100000b       ; xmm6: vinc vinc -vinc -vinc

    mov     r12d,       80007fffh               ; 32768 32767
    movd    xmm5,       r12d
    pshuflw xmm5,       xmm5,       01000100b   ; 32768 32767 32768 32767

FAST_DOWNSAMPLE:
    sub     r1, r2                   ; r1 = stride - width (dst row padding)
    dec     r3                       ; last row handled separately below
    mov     r14,16384                ; r14 = y accumulator, Q15, start at 0.5

    pshuflw xmm4,       xmm5,   01010000b
    psrlw   xmm4,       1               ; initial v to 16384 16384 16383 16383

FAST_HEIGHT:
    mov     r12, r14
    shr     r12,    15
    imul    r12,    r5
    add     r12,    r4                 ; r12 -> current source row
    mov     r13,    r12
    add     r13,    r5                 ; r13 -> next source row

    mov     r15, 32768                 ; r15 = x accumulator, Q16, start at 0.5
    mov     rsi, r2
    dec     rsi                        ; interpolate width-1 pixels; last column copied below

    movdqa  xmm3,       xmm5            ; initial u to 32768 32767 32768 32767

FAST_WIDTH:
    mov     rdi,        r15
    shr     rdi,        16              ; integer part of Q16 x accumulator

    movd    xmm1,       [r12+rdi]       ; xxxxxxba   (a,b: adjacent pixels of row0)
    movd    xmm2,       [r13+rdi]       ; xxxxxxdc   (c,d: adjacent pixels of row1)
    punpcklwd   xmm1,   xmm2            ; xxxxdcba
    punpcklbw   xmm1,   xmm0            ; 0d0c0b0a   (one pixel per word)

    movdqa  xmm2,   xmm4    ; xmm2: v v (1-v) (1-v) weights (Q15 words)
    pmulhuw xmm2,   xmm3    ; combined weight = (u*v) >> 16  (Q16*Q15 -> Q15 per word)
    pmaddwd     xmm2,   xmm1            ; dword0 = row0 pair, dword1 = row1 pair
    pshufd  xmm1,   xmm2,   00000001b
    paddd   xmm2,   xmm1                ; add the two row sums
    movdqa  xmm1,   [add_extra_half]    ; rounding bias before >>15 (constant defined
                                        ; elsewhere in this file; the IA32 path adds 16384)
    paddd   xmm2,   xmm1
    psrld   xmm2,   15

    packuswb    xmm2,   xmm0
    movd    ebx,    xmm2
    mov     [r0],  bl
    inc     r0

    add     r15, r6                     ; x += uiScaleX

    paddw   xmm3,       xmm7            ; inc u (wraps mod 2^16 naturally)
    dec     rsi
    jg      FAST_WIDTH

FAST_WIDTH_END:
    shr     r15, 16                     ; last column of the row: point-sample
    mov     bl,  [r12+r15]
    mov     [r0],bl
    inc     r0
    add     r14, rbp                    ; y += uiScaleY
    add     r0,  r1                     ; skip dst row padding

    paddw   xmm4,   xmm6                ; inc v
    psllw   xmm4,   1
    psrlw   xmm4,   1                   ; keep v weights in 15 bits (mod 32768)

    dec     r3
    jg      FAST_HEIGHT


FAST_LAST_ROW:
    ; final output row: point-sample from the source row selected by the
    ; y accumulator.
    shr     r14, 15
    imul    r14, r5
    add     r4, r14
    mov     r15, 32768

FAST_LAST_ROW_WIDTH:
    mov     rdi, r15
    shr     rdi, 16
    mov     bl,  [r4+rdi]
    mov     [r0],bl
    inc     r0

    add     r15, r6                     ; x += uiScaleX
    dec     r2
    jg      FAST_LAST_ROW_WIDTH

FAST_LAST_ROW_END:

    POP_XMM
    pop     rbp
    pop     rbx
    pop     rdi
    pop     rsi
    pop     r15
    pop     r14
    pop     r13
    pop     r12
    ret
1322
1323%elifdef  UNIX64
1324
1325;**************************************************************************************************************
1326;int GeneralBilinearAccurateDownsampler_sse2(   unsigned char* pDst, const int iDstStride, const int iDstWidth, const int iDstHeight,
1327;                           unsigned char* pSrc, const int iSrcStride,
1328;                           unsigned int uiScaleX, unsigned int uiScaleY );
1329;{
1330;**************************************************************************************************************
1331
;-----------------------------------------------------------------------
; UNIX64 (SysV) variant of the accurate bilinear downscaler.
; Per LOAD_7_PARA: r0=pDst, r1=iDstStride, r2=iDstWidth, r3=iDstHeight,
; r4=pSrc, r5=iSrcStride, r6=uiScaleX; arg8 (stack) = uiScaleY.
; Same algorithm as the WIN64 variant: Q15 x/y accumulators, Q15 u/v
; weights, Q30 products rounded back to 8 bits; last column/row are
; point-sampled.  Scratch uses rax/r11 (volatile in SysV); only
; rbx/rbp/r12-r15 need saving, and no PUSH_XMM since all xmm registers
; are caller-saved in the SysV ABI.
;-----------------------------------------------------------------------
WELS_EXTERN GeneralBilinearAccurateDownsampler_sse2
    push    r12
    push    r13
    push    r14
    push    r15
    push    rbx
    push    rbp
    %assign push_num 6
    LOAD_7_PARA
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r2, r2d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r5, r5d
    SIGN_EXTENSION r6, r6d

    pxor    xmm0,   xmm0
    mov     r12d,   r6d
    and     r12d,   32767
    mov     r13d,   r12d
    neg     r13d
    and     r13d,   32767
    movd    xmm1,   r12d                     ; uinc = uiScaleX & 32767 (Q15 u step)
    movd    xmm2,   r13d                     ; -uinc (mod 32768)
    psllq   xmm1,   32
    por     xmm1,   xmm2                    ; 0 0  uinc  -uinc   (dword)
    pshufd  xmm7,   xmm1,   01000100b       ; xmm7: uinc -uinc uinc -uinc

    mov     r12,    arg8
    SIGN_EXTENSION r12, r12d
    mov     rbp,    r12                      ; rbp = uiScaleY (full Q15 y step)
    and     r12d,   32767
    mov     r13d,   r12d
    neg     r13d
    and     r13d,   32767
    movd    xmm6,       r12d                     ; vinc = uiScaleY & 32767 (Q15 v step)
    movd    xmm2,       r13d                     ; -vinc (mod 32768)
    psllq   xmm6,       32
    por     xmm6,       xmm2                    ; 0 0 vinc -vinc (dword)
    pshufd  xmm6,       xmm6,   01010000b       ; xmm6: vinc vinc -vinc -vinc

    mov     r12d,        40003fffh
    movd    xmm5,       r12d
    punpcklwd   xmm5,   xmm0                    ; 16384 16383 (one Q15 weight per dword, low word)
    pshufd  xmm5,       xmm5,   01000100b       ; xmm5: 16384 16383 16384 16383

DOWNSAMPLE:
    sub     r1, r2                   ; r1 = stride - width (dst row padding)
    dec     r3                       ; last row handled separately below
    mov     r14,16384                ; r14 = y accumulator, Q15, start at 0.5
    pshufd  xmm4,       xmm5,   01010000b   ; initial v to 16384 16384 16383 16383

HEIGHT:
    ;mov     r12, r4
    mov     r12, r14
    shr     r12,    15
    imul    r12,    r5
    add     r12,    r4                 ; r12 -> current source row
    mov     r13,    r12
    add     r13,    r5                 ; r13 -> next source row

    mov     r15, 16384                 ; r15 = x accumulator, Q15, start at 0.5
    mov     rax, r2
    dec     rax                        ; interpolate width-1 pixels; last column copied below
    movdqa  xmm3,       xmm5            ; initial u to 16384 16383 16384 16383

WIDTH:
    mov     r11,        r15
    shr     r11,        15             ; integer part of x accumulator

    movd    xmm1,       [r12+r11]       ; xxxxxxba   (a,b: adjacent pixels of row0)
    movd    xmm2,       [r13+r11]       ; xxxxxxdc   (c,d: adjacent pixels of row1)
    pxor    xmm0,       xmm0
    punpcklwd   xmm1,   xmm2            ; xxxxdcba
    punpcklbw   xmm1,   xmm0            ; 0d0c0b0a
    punpcklwd   xmm1,   xmm0            ; 000d000c000b000a (one pixel per dword)

    movdqa  xmm2,   xmm4    ; xmm2: v v (1-v) (1-v) weights (Q15, low word of each dword)
    pmaddwd xmm2,   xmm3    ; per-dword weight = u*v (Q15*Q15 = Q30; high words are 0)
    movdqa  xmm0,   xmm2
    pmuludq xmm2,   xmm1    ; 64-bit pixel*weight for dwords 0 and 2
    psrlq   xmm0,   32
    psrlq   xmm1,   32
    pmuludq xmm0,   xmm1    ; 64-bit pixel*weight for dwords 1 and 3
    paddq   xmm2,   xmm0
    pshufd  xmm1,   xmm2,   00001110b
    paddq   xmm2,   xmm1    ; fold the two qword partial sums
    psrlq   xmm2,   29      ; sum >> 29; +1 >> 1 below == round(sum / 2^30)

    movd    ebx,    xmm2
    inc     ebx
    shr     ebx,    1
    mov     [r0],   bl
    inc     r0

    add      r15, r6                    ; x += uiScaleX
    paddw   xmm3,       xmm7            ; inc u
    psllw   xmm3,       1
    psrlw   xmm3,       1               ; keep u weights in 15 bits (mod 32768)

    dec     rax
    jg      WIDTH

WIDTH_END:
    shr     r15, 15                     ; last column of the row: point-sample
    mov     bl,  [r12+r15]
    mov     [r0],bl
    inc     r0
    add     r14, rbp                    ; y += uiScaleY
    add     r0,  r1                     ; skip dst row padding

    paddw   xmm4,   xmm6                ; inc v
    psllw   xmm4,   1
    psrlw   xmm4,   1                   ; keep v weights in 15 bits (mod 32768)

    dec     r3
    jg      HEIGHT

LAST_ROW:
    ; final output row: point-sample from the source row selected by the
    ; y accumulator.
    shr     r14, 15
    imul    r14, r5
    add     r4, r14
    mov     r15, 16384

LAST_ROW_WIDTH:
    mov     r11, r15
    shr     r11, 15
    mov     bl,  [r4+r11]
    mov     [r0],bl
    inc     r0

    add     r15, r6                     ; x += uiScaleX
    dec     r2
    jg    LAST_ROW_WIDTH

LAST_ROW_END:

    pop     rbp
    pop     rbx
    pop     r15
    pop     r14
    pop     r13
    pop     r12
    ret
1475
1476;**************************************************************************************************************
1477;int GeneralBilinearFastDownsampler_sse2(   unsigned char* pDst, const int iDstStride, const int iDstWidth, const int iDstHeight,
1478;               unsigned char* pSrc, const int iSrcStride,
1479;               unsigned int uiScaleX, unsigned int uiScaleY );
1480;{
1481;**************************************************************************************************************
1482
;-----------------------------------------------------------------------
; UNIX64 (SysV) variant of the fast (lower-precision) bilinear
; downscaler.
; Per LOAD_7_PARA: r0=pDst, r1=iDstStride, r2=iDstWidth, r3=iDstHeight,
; r4=pSrc, r5=iSrcStride, r6=uiScaleX; arg8 (stack) = uiScaleY.
; x advances in Q16 (uiScaleX), y in Q15 (uiScaleY); u weights are Q16
; words, v weights Q15 words, combined per tap by pmulhuw.  Last column
; and last row are point-sampled.  Scratch uses rax/r11 (volatile in
; SysV); no PUSH_XMM since all xmm registers are caller-saved.
;-----------------------------------------------------------------------
WELS_EXTERN GeneralBilinearFastDownsampler_sse2
    push    r12
    push    r13
    push    r14
    push    r15
    push    rbx
    push    rbp
    %assign push_num 6
    LOAD_7_PARA
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r2, r2d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r5, r5d
    SIGN_EXTENSION r6, r6d

    pxor    xmm0,   xmm0
    mov     r12d,   r6d
    and     r12d,   65535
    mov     r13d,   r12d
    neg     r13d
    and     r13d,   65535
    movd    xmm1,   r12d                     ; uinc = uiScaleX & 65535 (Q16 u step)
    movd    xmm2,   r13d                     ; -uinc (mod 65536)
    psllq   xmm1,   32
    por     xmm1,   xmm2                    ; 0 uinc 0 -uinc
    pshuflw xmm7,   xmm1,   10001000b       ; xmm7: uinc -uinc uinc -uinc

    mov     r12,    arg8
    SIGN_EXTENSION r12, r12d
    mov     rbp,    r12                      ; rbp = uiScaleY (full Q15 y step)
    and     r12d,   32767
    mov     r13d,   r12d
    neg     r13d
    and     r13d,   32767
    movd    xmm6,       r12d                     ; vinc = uiScaleY & 32767 (Q15 v step)
    movd    xmm2,       r13d                     ; -vinc (mod 32768)
    psllq   xmm6,       32
    por     xmm6,       xmm2                    ; 0 vinc 0 -vinc
    pshuflw xmm6,       xmm6,   10100000b       ; xmm6: vinc vinc -vinc -vinc

    mov     r12d,       80007fffh               ; 32768 32767
    movd    xmm5,       r12d
    pshuflw xmm5,       xmm5,       01000100b   ; 32768 32767 32768 32767

FAST_DOWNSAMPLE:
    sub     r1, r2                   ; r1 = stride - width (dst row padding)
    dec     r3                       ; last row handled separately below
    mov     r14,16384                ; r14 = y accumulator, Q15, start at 0.5

    pshuflw xmm4,       xmm5,   01010000b
    psrlw   xmm4,       1               ; initial v to 16384 16384 16383 16383

FAST_HEIGHT:
    mov     r12, r14
    shr     r12,    15
    imul    r12,    r5
    add     r12,    r4                 ; r12 -> current source row
    mov     r13,    r12
    add     r13,    r5                 ; r13 -> next source row

    mov     r15, 32768                 ; r15 = x accumulator, Q16, start at 0.5
    mov     rax, r2
    dec     rax                        ; interpolate width-1 pixels; last column copied below

    movdqa  xmm3,       xmm5            ; initial u to 32768 32767 32768 32767

FAST_WIDTH:
    mov     r11,        r15
    shr     r11,        16              ; integer part of Q16 x accumulator

    movd    xmm1,       [r12+r11]       ; xxxxxxba   (a,b: adjacent pixels of row0)
    movd    xmm2,       [r13+r11]       ; xxxxxxdc   (c,d: adjacent pixels of row1)
    punpcklwd   xmm1,   xmm2            ; xxxxdcba
    punpcklbw   xmm1,   xmm0            ; 0d0c0b0a   (one pixel per word)

    movdqa  xmm2,   xmm4    ; xmm2: v v (1-v) (1-v) weights (Q15 words)
    pmulhuw xmm2,   xmm3    ; combined weight = (u*v) >> 16  (Q16*Q15 -> Q15 per word)
    pmaddwd     xmm2,   xmm1            ; dword0 = row0 pair, dword1 = row1 pair
    pshufd  xmm1,   xmm2,   00000001b
    paddd   xmm2,   xmm1                ; add the two row sums
    movdqa  xmm1,   [add_extra_half]    ; rounding bias before >>15 (constant defined
                                        ; elsewhere in this file; the IA32 path adds 16384)
    paddd   xmm2,   xmm1
    psrld   xmm2,   15

    packuswb    xmm2,   xmm0
    movd    ebx,    xmm2
    mov     [r0],  bl
    inc     r0

    add     r15, r6                     ; x += uiScaleX

    paddw   xmm3,       xmm7            ; inc u (wraps mod 2^16 naturally)
    dec     rax
    jg      FAST_WIDTH

FAST_WIDTH_END:
    shr     r15, 16                     ; last column of the row: point-sample
    mov     bl,  [r12+r15]
    mov     [r0],bl
    inc     r0
    add     r14, rbp                    ; y += uiScaleY
    add     r0,  r1                     ; skip dst row padding

    paddw   xmm4,   xmm6                ; inc v
    psllw   xmm4,   1
    psrlw   xmm4,   1                   ; keep v weights in 15 bits (mod 32768)

    dec     r3
    jg      FAST_HEIGHT


FAST_LAST_ROW:
    ; final output row: point-sample from the source row selected by the
    ; y accumulator.
    shr     r14, 15
    imul    r14, r5
    add     r4, r14
    mov     r15, 32768

FAST_LAST_ROW_WIDTH:
    mov     r11, r15
    shr     r11, 16
    mov     bl,  [r4+r11]
    mov     [r0],bl
    inc     r0

    add     r15, r6                     ; x += uiScaleX
    dec     r2
    jg      FAST_LAST_ROW_WIDTH

FAST_LAST_ROW_END:

    pop     rbp
    pop     rbx
    pop     r15
    pop     r14
    pop     r13
    pop     r12
    ret
1620%endif
1621
1622;***********************************************************************
1623;   void DyadicBilinearOneThirdDownsampler_ssse3(    unsigned char* pDst, const int iDstStride,
1624;                   unsigned char* pSrc, const int iSrcStride,
1625;                   const int iSrcWidth, const int iSrcHeight );
1626;***********************************************************************
;-----------------------------------------------------------------------
; 3:1 downsampler, SSSE3.
; Per LOAD_6_PARA: r0=pDst, r1=iDstStride, r2=pSrc, r3=iSrcStride,
; r4=iSrcWidth, r5=iSrcHeight (kept as memory operand arg6 on
; X86_32_PICASM builds, where r5 is taken by the PIC base).
; Each x-iteration consumes 48 source bytes from two adjacent source
; rows and produces 16 output bytes: pshufb gathers two pixels of every
; 3-pixel group, paddusb/pavgb averages them within each row, then the
; two rows are averaged.  Source advances 3 rows per output row.
; NOTE(review): i_height is decremented once per output row, so the
; caller appears to pass the *destination* height here — confirm at the
; call sites.
;-----------------------------------------------------------------------
WELS_EXTERN DyadicBilinearOneThirdDownsampler_ssse3
%ifdef X86_32
    push r6
    %assign push_num 1
%else
    %assign push_num 0
%endif
    LOAD_6_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r4, r4d
    SIGN_EXTENSION r5, r5d
%ifdef X86_32_PICASM
    %define i_height dword arg6
%else
    %define i_height r5
%endif
    INIT_X86_32_PIC_NOPRESERVE r5

%ifndef X86_32
    push r12
    mov r12, r4            ;keep iSrcWidth; r4 is consumed as the x counter
%endif

    mov r6, r1             ;save the 16 bytes at pDst + iDstStride*i_height:
    imul r6, i_height      ;the full-width 16-byte stores below can overrun
    add r6, r0             ;the unassigned tail; restored after the loops
    movdqa xmm7, [r6]

.yloops_onethird_sse3:
%ifdef X86_32
    mov r4, arg5
%else
    mov r4, r12
%endif

    mov r6, r0        ;save dst row base address
    ; each loop = source bandwidth: 48 bytes -> 16 dst bytes
.xloops_onethird_sse3:
    ; horizontal loop: 48 bytes per iteration
    ;               mem  hi<-       ->lo
    ;1st Line Src:  xmm0: F * e E * d D * c C * b B * a A
    ;               xmm2: k K * j J * i I * h H * g G * f
    ;               xmm2: * p P * o O * n N * m M * l L *
    ;
    ;2nd Line Src:  xmm2: F' *  e' E' *  d' D' *  c' C' *  b' B' *  a' A'
    ;               xmm1: k' K' *  j' J' *  i' I' *  h' H' *  g' G' *  f'
    ;               xmm1: *  p' P' *  o' O' *  n' N' *  m' M' *  l' L' *
    ;=> target:
    ;: P O N M L K J I H G F E D C B A
    ;: p o n m l k j i h g f e d c b a
    ;: P' ..                          A'
    ;: p' ..                          a'

    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    ;1st line
    movdqa xmm0, [r2]                         ;F * e E * d D * c C * b B * a A
    movdqa xmm1, xmm0
    movdqa xmm5, [pic(shufb_mask_onethird_low_1)]
    movdqa xmm6, [pic(shufb_mask_onethird_high_1)]
    pshufb xmm0, xmm5                           ;0 0 0 0 0 0 0 0 0 0 F E D C B A -> xmm0
    pshufb xmm1, xmm6                           ;0 0 0 0 0 0 0 0 0 0 0 e d c b a -> xmm1

    movdqa xmm2, [r2+16]                      ;k K * j J * i I * h H * g G * f
    movdqa xmm3, xmm2
    movdqa xmm5, [pic(shufb_mask_onethird_low_2)]
    movdqa xmm6, [pic(shufb_mask_onethird_high_2)]
    pshufb xmm2, xmm5                           ;0 0 0 0 0 K J I H G 0 0 0 0 0 0 -> xmm2
    pshufb xmm3, xmm6                           ;0 0 0 0 0 k j i h g f 0 0 0 0 0 -> xmm3

    paddusb xmm0, xmm2                          ;0 0 0 0 0 K J I H G F E D C B A -> xmm0
    paddusb xmm1, xmm3                          ;0 0 0 0 0 k j i h g f e d c b a -> xmm1

    movdqa xmm2, [r2+32]                      ;* p P * o O * n N * m M * l L *
    movdqa xmm3, xmm2
    movdqa xmm5, [pic(shufb_mask_onethird_low_3)]
    movdqa xmm6, [pic(shufb_mask_onethird_high_3)]
    pshufb xmm2, xmm5                           ;P O N M L 0 0 0 0 0 0 0 0 0 0 0 -> xmm2
    pshufb xmm3, xmm6                           ;p o n m l 0 0 0 0 0 0 0 0 0 0 0 -> xmm3

    paddusb xmm0, xmm2                          ;P O N M L K J I H G F E D C B A -> xmm0
    paddusb xmm1, xmm3                          ;p o n m l k j i h g f e d c b a -> xmm1
    pavgb xmm0, xmm1                            ;1st line average                -> xmm0

    ;2nd line
    movdqa xmm2, [r2+r3]                      ;F' *  e' E' *  d' D' *  c' C' *  b' B' *  a' A'
    movdqa xmm3, xmm2
    movdqa xmm5, [pic(shufb_mask_onethird_low_1)]
    movdqa xmm6, [pic(shufb_mask_onethird_high_1)]
    pshufb xmm2, xmm5                           ;0 0 0 0 0 0 0 0 0 0 F' E' D' C' B' A' -> xmm2
    pshufb xmm3, xmm6                           ;0 0 0 0 0 0 0 0 0 0 0  e' d' c' b' a' -> xmm3

    movdqa xmm1, [r2+r3+16]                   ;k' K' *  j' J' *  i' I' *  h' H' *  g' G' *  f'
    movdqa xmm4, xmm1
    movdqa xmm5, [pic(shufb_mask_onethird_low_2)]
    movdqa xmm6, [pic(shufb_mask_onethird_high_2)]
    pshufb xmm1, xmm5                           ;0 0 0 0 0 K' J' I' H' G' 0  0 0 0 0 0 -> xmm1
    pshufb xmm4, xmm6                           ;0 0 0 0 0 k' j' i' h' g' f' 0 0 0 0 0 -> xmm4

    paddusb xmm2, xmm1                          ;0 0 0 0 0 K' J' I' H' G' F' E' D' C' B' A' -> xmm2
    paddusb xmm3, xmm4                          ;0 0 0 0 0 k' j' i' h' g' f' e' d' c' b' a' -> xmm3

    movdqa xmm1, [r2+r3+32]                   ; *  p' P' *  o' O' *  n' N' *  m' M' *  l' L' *
    movdqa xmm4, xmm1
    movdqa xmm5, [pic(shufb_mask_onethird_low_3)]
    movdqa xmm6, [pic(shufb_mask_onethird_high_3)]
    pshufb xmm1, xmm5                           ;P' O' N' M' L' 0 0 0 0 0 0 0 0 0 0 0 -> xmm1
    pshufb xmm4, xmm6                           ;p' o' n' m' l' 0 0 0 0 0 0 0 0 0 0 0 -> xmm4

    paddusb xmm2, xmm1                          ;P' O' N' M' L' K' J' I' H' G' F' E' D' C' B' A' -> xmm2
    paddusb xmm3, xmm4                          ;p' o' n' m' l' k' j' i' h' g' f' e' d' c' b' a' -> xmm3
    pavgb xmm2, xmm3                            ;2nd line average                                -> xmm2

    pavgb xmm0, xmm2                            ; bytes-average(1st line , 2nd line )

    ; write pDst
    movdqa [r0], xmm0                           ;write result in dst

    ; next SMB
    lea r2, [r2+48]                             ;current src address
    lea r0, [r0+16]                             ;current dst address

    sub r4, 48                                  ;xloops counter
    cmp r4, 0
    jg near .xloops_onethird_sse3

    sub r6, r0                                  ;offset = row base address - current address
    lea r2, [r2+2*r3]                           ;pSrc += 3 * iSrcStride
    lea r2, [r2+r3]                             ;
    lea r2, [r2+2*r6]                           ;pSrc += 3*offset: rewind the horizontal
    lea r2, [r2+r6]                             ;advance (48 src bytes per 16 dst bytes)
    lea r0, [r0+r1]
    lea r0, [r0+r6]                             ;pDst -> start of next dst line

    dec i_height
    jg near .yloops_onethird_sse3

    movdqa [r0], xmm7                           ;restore the tail bytes saved above

%ifndef X86_32
    pop r12
%endif

    DEINIT_X86_32_PIC
    POP_XMM
    LOAD_6_PARA_POP
%ifdef X86_32
    pop r6
%endif
    ret
%undef i_height
1779
1780;***********************************************************************
1781;   void DyadicBilinearOneThirdDownsampler_sse4(    unsigned char* pDst, const int iDstStride,
1782;                   unsigned char* pSrc, const int iSrcStride,
1783;                   const int iSrcWidth, const int iSrcHeight );
1784;***********************************************************************
1785WELS_EXTERN DyadicBilinearOneThirdDownsampler_sse4
1786%ifdef X86_32
1787    push r6
1788    %assign push_num 1
1789%else
1790    %assign push_num 0
1791%endif
1792    LOAD_6_PARA
1793    PUSH_XMM 8
1794    SIGN_EXTENSION r1, r1d
1795    SIGN_EXTENSION r3, r3d
1796    SIGN_EXTENSION r4, r4d
1797    SIGN_EXTENSION r5, r5d
1798%ifdef X86_32_PICASM
1799    %define i_height dword arg6
1800%else
1801    %define i_height r5
1802%endif
1803    INIT_X86_32_PIC_NOPRESERVE r5
1804
1805%ifndef X86_32
1806    push r12
1807    mov r12, r4
1808%endif
1809
1810    mov r6, r1             ;Save the tailer for the unasigned size
1811    imul r6, i_height
1812    add r6, r0
1813    movdqa xmm7, [r6]
1814
1815.yloops_onethird_sse4:
1816%ifdef X86_32
1817    mov r4, arg5
1818%else
1819    mov r4, r12
1820%endif
1821
1822    mov r6, r0        ;save base address
1823    ; each loop = source bandwidth: 48 bytes
1824.xloops_onethird_sse4:
1825    ; 1st part horizonal loop: x48 bytes
1826    ;               mem  hi<-       ->lo
1827    ;1st Line Src:  xmm0: F * e E * d D * c C * b B * a A
1828    ;               xmm2: k K * j J * i I * h H * g G * f
1829    ;               xmm2: * p P * o O * n N * m M * l L *
1830    ;
1831    ;2nd Line Src:  xmm2: F' *  e' E' *  d' D' *  c' C' *  b' B' *  a' A'
1832    ;               xmm1: k' K' *  j' J' *  i' I' *  h' H' *  g' G' *  f'
1833    ;               xmm1: *  p' P' *  o' O' *  n' N' *  m' M' *  l' L' *
1834    ;=> target:
1835    ;: P O N M L K J I H G F E D C B A
1836    ;: p o n m l k j i h g f e d c b a
1837    ;: P' ..                          A'
1838    ;: p' ..                          a'
1839
1840    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1841    ;1st line
1842    movntdqa xmm0, [r2]                         ;F * e E * d D * c C * b B * a A
1843    movdqa xmm1, xmm0
1844    movdqa xmm5, [pic(shufb_mask_onethird_low_1)]
1845    movdqa xmm6, [pic(shufb_mask_onethird_high_1)]
1846    pshufb xmm0, xmm5                           ;0 0 0 0 0 0 0 0 0 0 F E D C B A -> xmm0
1847    pshufb xmm1, xmm6                           ;0 0 0 0 0 0 0 0 0 0 0 e d c b a -> xmm1
1848
1849    movntdqa xmm2, [r2+16]                      ;k K * j J * i I * h H * g G * f
1850    movdqa xmm3, xmm2
1851    movdqa xmm5, [pic(shufb_mask_onethird_low_2)]
1852    movdqa xmm6, [pic(shufb_mask_onethird_high_2)]
1853    pshufb xmm2, xmm5                           ;0 0 0 0 0 K J I H G 0 0 0 0 0 0 -> xmm2
1854    pshufb xmm3, xmm6                           ;0 0 0 0 0 k j i h g f 0 0 0 0 0 -> xmm3
1855
1856    paddusb xmm0, xmm2                          ;0 0 0 0 0 K J I H G F E D C B A -> xmm0
1857    paddusb xmm1, xmm3                          ;0 0 0 0 0 k j i h g f e d c b a -> xmm1
1858
1859    movntdqa xmm2, [r2+32]                      ;* p P * o O * n N * m M * l L *
1860    movdqa xmm3, xmm2
1861    movdqa xmm5, [pic(shufb_mask_onethird_low_3)]
1862    movdqa xmm6, [pic(shufb_mask_onethird_high_3)]
1863    pshufb xmm2, xmm5                           ;P O N M L 0 0 0 0 0 0 0 0 0 0 0 -> xmm2
1864    pshufb xmm3, xmm6                           ;p o n m l 0 0 0 0 0 0 0 0 0 0 0 -> xmm3
1865
1866    paddusb xmm0, xmm2                          ;P O N M L K J I H G F E D C B A -> xmm0
1867    paddusb xmm1, xmm3                          ;p o n m l k j i h g f e d c b a -> xmm1
1868    pavgb xmm0, xmm1                            ;1st line average                -> xmm0
1869
1870    ;2nd line
1871    movntdqa xmm2, [r2+r3]                      ;F' *  e' E' *  d' D' *  c' C' *  b' B' *  a' A'
1872    movdqa xmm3, xmm2
1873    movdqa xmm5, [pic(shufb_mask_onethird_low_1)]
1874    movdqa xmm6, [pic(shufb_mask_onethird_high_1)]
1875    pshufb xmm2, xmm5                           ;0 0 0 0 0 0 0 0 0 0 F' E' D' C' B' A' -> xmm2
1876    pshufb xmm3, xmm6                           ;0 0 0 0 0 0 0 0 0 0 0  e' d' c' b' a' -> xmm3
1877
1878    movntdqa xmm1, [r2+r3+16]                   ;k' K' *  j' J' *  i' I' *  h' H' *  g' G' *  f'
1879    movdqa xmm4, xmm1
1880    movdqa xmm5, [pic(shufb_mask_onethird_low_2)]
1881    movdqa xmm6, [pic(shufb_mask_onethird_high_2)]
1882    pshufb xmm1, xmm5                           ;0 0 0 0 0 K' J' I' H' G' 0  0 0 0 0 0 -> xmm1
1883    pshufb xmm4, xmm6                           ;0 0 0 0 0 k' j' i' h' g' f' 0 0 0 0 0 -> xmm4
1884
1885    paddusb xmm2, xmm1                          ;0 0 0 0 0 K' J' I' H' G' F' E' D' C' B' A' -> xmm2
1886    paddusb xmm3, xmm4                          ;0 0 0 0 0 k' j' i' h' g' f' e' d' c' b' a' -> xmm3
1887
1888    movntdqa xmm1, [r2+r3+32]                   ; *  p' P' *  o' O' *  n' N' *  m' M' *  l' L' *
1889    movdqa xmm4, xmm1
1890    movdqa xmm5, [pic(shufb_mask_onethird_low_3)]
1891    movdqa xmm6, [pic(shufb_mask_onethird_high_3)]
1892    pshufb xmm1, xmm5                           ;P' O' N' M' L' 0 0 0 0 0 0 0 0 0 0 0 -> xmm1
1893    pshufb xmm4, xmm6                           ;p' o' n' m' l' 0 0 0 0 0 0 0 0 0 0 0 -> xmm4
1894
1895    paddusb xmm2, xmm1                          ;P' O' N' M' L' K' J' I' H' G' F' E' D' C' B' A' -> xmm2
1896    paddusb xmm3, xmm4                          ;p' o' n' m' l' k' j' i' h' g' f' e' d' c' b' a' -> xmm3
1897    pavgb xmm2, xmm3                            ;2nd line average                                -> xmm2
1898
1899    pavgb xmm0, xmm2                            ; bytes-average(1st line , 2nd line )
1900
1901    ; write pDst
1902    movdqa [r0], xmm0                           ;write result in dst
1903
1904    ; next SMB
1905    lea r2, [r2+48]                             ;current src address
1906    lea r0, [r0+16]                             ;current dst address
1907
1908    sub r4, 48                                  ;xloops counter
1909    cmp r4, 0
1910    jg near .xloops_onethird_sse4
1911
1912    sub r6, r0                                  ;offset = base address - current address
1913    lea r2, [r2+2*r3]                           ;
1914    lea r2, [r2+r3]                             ;
1915    lea r2, [r2+2*r6]                           ;current line + 3 lines
1916    lea r2, [r2+r6]
1917    lea r0, [r0+r1]
1918    lea r0, [r0+r6]                             ;current dst lien + 1 line
1919
1920    dec i_height
1921    jg near .yloops_onethird_sse4
1922
1923    movdqa [r0], xmm7                           ;restore the tailer for the unasigned size
1924
1925%ifndef X86_32
1926    pop r12
1927%endif
1928
1929    DEINIT_X86_32_PIC
1930    POP_XMM
1931    LOAD_6_PARA_POP
1932%ifdef X86_32
1933    pop r6
1934%endif
1935    ret
1936%undef i_height
1937
;***********************************************************************
;   void DyadicBilinearQuarterDownsampler_sse( unsigned char* pDst, const int iDstStride,
;                   unsigned char* pSrc, const int iSrcStride,
;                   const int iSrcWidth, const int iSrcHeight );
;
;   1:4 downsample in each dimension (MMX version).  Each dst pixel is the
;   pavgb average of a 2x2 pixel quad taken from the top-left corner of the
;   corresponding 4x4 src block.  Each x iteration consumes 32 src bytes and
;   emits 8 dst bytes; each y iteration advances src by 4 lines, dst by 1.
;
;   Fix: the function uses xmm7 but did not save it; xmm6-xmm15 are
;   callee-saved in the Microsoft x64 ABI, so PUSH_XMM/POP_XMM are added
;   to match the ssse3/sse4 siblings.
;***********************************************************************
WELS_EXTERN DyadicBilinearQuarterDownsampler_sse
%ifdef X86_32
    push r6
    %assign push_num 1
%else
    %assign push_num 0
%endif
    LOAD_6_PARA
    PUSH_XMM 8             ;xmm7 is clobbered below; callee-saved on Win64 (as in the ssse3/sse4 versions)
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r4, r4d
    SIGN_EXTENSION r5, r5d

%ifndef X86_32
    push r12
    mov r12, r4            ;keep iSrcWidth live across the y loop
%endif
    sar r5, $02            ;iSrcHeight >> 2 = dst height (y loop count)

    mov r6, r1             ;save the 8 bytes just past the last dst line ("tail"):
    imul r6, r5            ;the final 8-byte store of the loop may overrun when
    add r6, r0             ;iSrcWidth is not a multiple of 32
    movq xmm7, [r6]

.yloops_quarter_sse:
%ifdef X86_32
    mov r4, arg5           ;reload iSrcWidth (x loop counter)
%else
    mov r4, r12
%endif

    mov r6, r0        ;save dst row base address
    ; each loop = source bandwidth: 32 bytes
.xloops_quarter_sse:
    ; 1st part horizontal loop: x16 bytes
    ;               mem  hi<-       ->lo
    ;1st Line Src:  mm0: d D c C b B a A    mm1: h H g G f F e E
    ;2nd Line Src:  mm2: l L k K j J i I    mm3: p P o O n N m M
    ;
    ;=> target:
    ;: G E C A,
    ;:
    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    movq mm0, [r2]         ; 1st pSrc line
    movq mm1, [r2+8]       ; 1st pSrc line + 8
    movq mm2, [r2+r3]     ; 2nd pSrc line
    movq mm3, [r2+r3+8]   ; 2nd pSrc line + 8

    pshufw mm0, mm0, 0d8h    ; x X x X c C a A
    pshufw mm1, mm1, 0d8h    ; x X x X g G e E
    pshufw mm2, mm2, 0d8h    ; x X x X k K i I
    pshufw mm3, mm3, 0d8h    ; x X x X o O m M

    punpckldq mm0, mm1       ; g G e E c C a A
    punpckldq mm2, mm3       ; o O m M k K i I

    ; to handle mm0, mm2
    pshufw mm4, mm0, 0d8h       ;g G c C e E a A
    pshufw mm5, mm4, 04eh       ;e E a A g G c C
    punpcklbw mm4, mm5          ;g e G E c a C A  -> mm4
    pshufw mm4, mm4, 0d8h       ;g e c a G E C A  -> mm4

    pshufw mm5, mm2, 0d8h       ;o O k K m M i I
    pshufw mm6, mm5, 04eh       ;m M i I o O k K
    punpcklbw mm5, mm6          ;o m O M k i K I
    pshufw mm5, mm5, 0d8h       ;o m k i O M K I  -> mm5

    ; split low/high dwords (high halves are don't-care, filled from mm6)
    movq mm0, mm4
    punpckldq mm0, mm6          ;x x x x G E C A
    punpckhdq mm4, mm6          ;x x x x g e c a

    movq mm1, mm5
    punpckldq mm1, mm6          ;x x x x O M K I
    punpckhdq mm5, mm6          ;x x x x o m k i

    ; avg within MB horizontal width (8 x 2 lines)
    pavgb mm0, mm4      ; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1
    pavgb mm1, mm5      ; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2
    pavgb mm0, mm1      ; (temp_row1+temp_row2+1)>>1, pending here; written together with the 2nd half below

    ; 2nd part horizontal loop: x16 bytes
    movq mm1, [r2+16]      ; 1st pSrc line + 16
    movq mm2, [r2+24]      ; 1st pSrc line + 24
    movq mm3, [r2+r3+16]  ; 2nd pSrc line + 16
    movq mm4, [r2+r3+24]  ; 2nd pSrc line + 24

    pshufw mm1, mm1, 0d8h
    pshufw mm2, mm2, 0d8h
    pshufw mm3, mm3, 0d8h
    pshufw mm4, mm4, 0d8h

    punpckldq mm1, mm2
    punpckldq mm3, mm4

    ; to handle mm1, mm3 (same shuffle pattern as above)
    pshufw mm4, mm1, 0d8h
    pshufw mm5, mm4, 04eh
    punpcklbw mm4, mm5
    pshufw mm4, mm4, 0d8h

    pshufw mm5, mm3, 0d8h
    pshufw mm6, mm5, 04eh
    punpcklbw mm5, mm6
    pshufw mm5, mm5, 0d8h

    ; to handle mm4, mm5
    movq mm2, mm4
    punpckldq mm2, mm6
    punpckhdq mm4, mm6

    movq mm3, mm5
    punpckldq mm3, mm6
    punpckhdq mm5, mm6

    ; avg within MB horizontal width (8 x 2 lines)
    pavgb mm2, mm4      ; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1
    pavgb mm3, mm5      ; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2
    pavgb mm2, mm3      ; (temp_row1+temp_row2+1)>>1, done in another 2nd horizontal part

    movd [r0  ], mm0    ;write both 4-byte halves of the dst row segment
    movd [r0+4], mm2

    ; next SMB
    lea r2, [r2+32]
    lea r0, [r0+8]

    sub r4, 32          ;xloops counter
    cmp r4, 0
    jg near .xloops_quarter_sse

    sub  r6, r0          ;r6 = -(written dst width)
    ; next line
    lea r2, [r2+4*r3]    ; next 4 src lines
    lea r2, [r2+4*r6]    ; reset to line start [- 4 * iDstWidth]
    lea r0, [r0+r1]
    lea r0, [r0+r6]      ; reset to line start [- iDstWidth], then + 1 dst line

    dec r5
    jg near .yloops_quarter_sse

    movq [r0], xmm7      ;restore the 8 tail bytes possibly clobbered by the last store

    WELSEMMS             ;emms: leave MMX state
%ifndef X86_32
    pop r12
%endif
    POP_XMM
    LOAD_6_PARA_POP
%ifdef X86_32
    pop r6
%endif
    ret
2095
;***********************************************************************
;   void DyadicBilinearQuarterDownsampler_ssse3(   unsigned char* pDst, const int iDstStride,
;                   unsigned char* pSrc, const int iSrcStride,
;                   const int iSrcWidth, const int iSrcHeight );
;
;   1:4 downsample in each dimension (SSSE3 version).  Each dst pixel is the
;   pavgb average of a 2x2 pixel quad from the top-left of the corresponding
;   4x4 src block.  32 src bytes -> 8 dst bytes per x iteration; src advances
;   4 lines and dst 1 line per y iteration.
;   NOTE(review): the movdqa loads assume pSrc and iSrcStride are 16-byte
;   aligned - confirm with callers.
;***********************************************************************
WELS_EXTERN DyadicBilinearQuarterDownsampler_ssse3
    ; legacy ia32 reference of the parameter layout, kept for documentation:
    ;push ebx
    ;push edx
    ;push esi
    ;push edi
    ;push ebp

    ;mov edi, [esp+24]   ; pDst
    ;mov edx, [esp+28]   ; iDstStride
    ;mov esi, [esp+32]   ; pSrc
    ;mov ecx, [esp+36]   ; iSrcStride
    ;mov ebp, [esp+44]   ; iSrcHeight
%ifdef X86_32
    push r6
    %assign push_num 1
%else
    %assign push_num 0
%endif
    LOAD_6_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r4, r4d
    SIGN_EXTENSION r5, r5d

%ifndef X86_32
    push r12
    mov r12, r4            ;keep iSrcWidth live across the y loop
%endif
    sar r5, $02            ; iSrcHeight >> 2 = dst height (y loop count)

    mov r6, r1             ;save the 8 bytes just past the last dst line ("tail"):
    imul r6, r5            ;the final 8-byte store of the loop may overrun when
    add r6, r0             ;iSrcWidth is not a multiple of 32
    movq xmm7, [r6]

    INIT_X86_32_PIC_NOPRESERVE r4
    movdqa xmm6, [pic(shufb_mask_quarter)]  ;pshufb mask gathering the wanted byte pairs
    DEINIT_X86_32_PIC

.yloops_quarter_sse3:
    ; legacy ia32 reference, kept for documentation:
    ;mov eax, [esp+40]   ; iSrcWidth
    ;sar eax, $02            ; iSrcWidth >> 2
    ;mov ebx, eax        ; iDstWidth restored at ebx
    ;sar eax, $04            ; (iSrcWidth >> 2) / 16     ; loop count = num_of_mb
    ;neg ebx             ; - (iSrcWidth >> 2)
%ifdef X86_32
    mov r4, arg5           ;reload iSrcWidth (x loop counter)
%else
    mov r4, r12
%endif

    mov r6, r0             ;save dst row base address
    ; each loop = source bandwidth: 32 bytes
.xloops_quarter_sse3:
    ; 1st part horizontal loop: x32 bytes
    ;               mem  hi<-       ->lo
    ;1st Line Src:  xmm0: h H g G f F e E d D c C b B a A
    ;               xmm1: p P o O n N m M l L k K j J i I
    ;2nd Line Src:  xmm2: h H g G f F e E d D c C b B a A
    ;               xmm3: p P o O n N m M l L k K j J i I

    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    movdqa xmm0, [r2]          ; 1st_src_line
    movdqa xmm1, [r2+16]       ; 1st_src_line + 16
    movdqa xmm2, [r2+r3]       ; 2nd_src_line
    movdqa xmm3, [r2+r3+16]    ; 2nd_src_line + 16

    pshufb xmm0, xmm6           ;1st line: 0 0 0 0 g e c a 0 0 0 0 G E C A
    pshufb xmm1, xmm6           ;1st line: 0 0 0 0 o m k i 0 0 0 0 O M K I
    pshufb xmm2, xmm6           ;2nd line: 0 0 0 0 g e c a 0 0 0 0 G E C A
    pshufb xmm3, xmm6           ;2nd line: 0 0 0 0 o m k i 0 0 0 0 O M K I

    movdqa xmm4, xmm0
    movdqa xmm5, xmm2
    punpckldq xmm0, xmm1        ;1st line: 0 0 0 0 0 0 0 0 O M K I G E C A -> xmm0
    punpckhdq xmm4, xmm1        ;1st line: 0 0 0 0 0 0 0 0 o m k i g e c a -> xmm4
    punpckldq xmm2, xmm3        ;2nd line: 0 0 0 0 0 0 0 0 O M K I G E C A -> xmm2
    punpckhdq xmm5, xmm3        ;2nd line: 0 0 0 0 0 0 0 0 o m k i g e c a -> xmm5

    pavgb xmm0, xmm4            ;horizontal pair average, 1st line
    pavgb xmm2, xmm5            ;horizontal pair average, 2nd line
    pavgb xmm0, xmm2            ;vertical average of the two lines

    ; write pDst
    movq [r0], xmm0

    ; next SMB
    lea r2, [r2+32]
    lea r0, [r0+8]

    sub r4, 32
    cmp r4, 0
    jg near .xloops_quarter_sse3

    sub r6, r0           ;r6 = -(written dst width)
    ; next line
    lea r2, [r2+4*r3]    ; next 4 src lines
    lea r2, [r2+4*r6]    ; reset to line start [- 4 * iDstWidth]
    lea r0, [r0+r1]
    lea r0, [r0+r6]      ; reset to line start [- iDstWidth], then + 1 dst line

    dec r5
    jg near .yloops_quarter_sse3

    movq [r0], xmm7      ;restore the 8 tail bytes possibly clobbered by the last store

%ifndef X86_32
    pop r12
%endif

    POP_XMM
    LOAD_6_PARA_POP
%ifdef X86_32
    pop r6
%endif
    ret
2218
;***********************************************************************
;   void DyadicBilinearQuarterDownsampler_sse4(    unsigned char* pDst, const int iDstStride,
;                   unsigned char* pSrc, const int iSrcStride,
;                   const int iSrcWidth, const int iSrcHeight );
;
;   1:4 downsample in each dimension.  Identical algorithm to the ssse3
;   version; the only difference is movntdqa (SSE4.1 streaming load) for the
;   source reads.  NOTE(review): movntdqa requires 16-byte-aligned addresses,
;   so pSrc and iSrcStride are assumed 16-byte aligned - confirm with callers.
;***********************************************************************
WELS_EXTERN DyadicBilinearQuarterDownsampler_sse4
%ifdef X86_32
    push r6
    %assign push_num 1
%else
    %assign push_num 0
%endif
    LOAD_6_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r4, r4d
    SIGN_EXTENSION r5, r5d

%ifndef X86_32
    push r12
    mov r12, r4            ;keep iSrcWidth live across the y loop
%endif
    sar r5, $02            ; iSrcHeight >> 2 = dst height (y loop count)

    mov r6, r1             ;save the 8 bytes just past the last dst line ("tail"):
    imul r6, r5            ;the final 8-byte store of the loop may overrun when
    add r6, r0             ;iSrcWidth is not a multiple of 32
    movq xmm7, [r6]

    INIT_X86_32_PIC_NOPRESERVE r4
    movdqa xmm6, [pic(shufb_mask_quarter)]    ;pshufb gather mask
    DEINIT_X86_32_PIC

.yloops_quarter_sse4:
%ifdef X86_32
    mov r4, arg5           ;reload iSrcWidth (x loop counter)
%else
    mov r4, r12
%endif

    mov r6, r0             ;save dst row base address
    ; each loop = source bandwidth: 32 bytes
.xloops_quarter_sse4:
    ; 1st part horizontal loop: x16 bytes
    ;               mem  hi<-       ->lo
    ;1st Line Src:  xmm0: h H g G f F e E d D c C b B a A
    ;               xmm1: p P o O n N m M l L k K j J i I
    ;2nd Line Src:  xmm2: h H g G f F e E d D c C b B a A
    ;               xmm3: p P o O n N m M l L k K j J i I

    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    movntdqa xmm0, [r2]            ; 1st_src_line
    movntdqa xmm1, [r2+16]         ; 1st_src_line + 16
    movntdqa xmm2, [r2+r3]         ; 2nd_src_line
    movntdqa xmm3, [r2+r3+16]      ; 2nd_src_line + 16

    pshufb xmm0, xmm6               ;1st line: 0 0 0 0 g e c a 0 0 0 0 G E C A
    pshufb xmm1, xmm6               ;1st line: 0 0 0 0 o m k i 0 0 0 0 O M K I
    pshufb xmm2, xmm6               ;2nd line: 0 0 0 0 g e c a 0 0 0 0 G E C A
    pshufb xmm3, xmm6               ;2nd line: 0 0 0 0 o m k i 0 0 0 0 O M K I

    movdqa xmm4, xmm0
    movdqa xmm5, xmm2
    punpckldq xmm0, xmm1            ;1st line: 0 0 0 0 0 0 0 0 O M K I G E C A -> xmm0
    punpckhdq xmm4, xmm1            ;1st line: 0 0 0 0 0 0 0 0 o m k i g e c a -> xmm4
    punpckldq xmm2, xmm3            ;2nd line: 0 0 0 0 0 0 0 0 O M K I G E C A -> xmm2
    punpckhdq xmm5, xmm3            ;2nd line: 0 0 0 0 0 0 0 0 o m k i g e c a -> xmm5

    pavgb xmm0, xmm4                ;horizontal pair average, 1st line
    pavgb xmm2, xmm5                ;horizontal pair average, 2nd line
    pavgb xmm0, xmm2                ;vertical average of the two lines

    ; write pDst
    movq [r0], xmm0

    ; next SMB
    lea r2, [r2+32]
    lea r0, [r0+8]

    sub r4, 32
    cmp r4, 0
    jg near .xloops_quarter_sse4

    sub r6, r0           ;r6 = -(written dst width)
    lea r2, [r2+4*r3]    ; next 4 src lines
    lea r2, [r2+4*r6]    ; reset to line start [- 4 * iDstWidth]
    lea r0, [r0+r1]
    lea r0, [r0+r6]      ; reset to line start [- iDstWidth], then + 1 dst line

    dec r5
    jg near .yloops_quarter_sse4

    movq [r0], xmm7      ;restore the 8 tail bytes possibly clobbered by the last store

%ifndef X86_32
    pop r12
%endif

    POP_XMM
    LOAD_6_PARA_POP
%ifdef X86_32
    pop r6
%endif
    ret
2324
; Advance the packed x positions by one 8-pixel step (unsigned-word variant).
; Fractions are full 16-bit words; the carry out of each fractional add is
; detected by comparing a wrapping add with a saturating add of the same
; operands (they differ exactly when the add carried).
; xpos_int=%1 xpos_frac=%2 inc_int+1=%3 inc_frac=%4 tmp=%5
%macro SSE2_BilinearIncXposuw 5
    movdqa          %5, %2
    paddw           %2, %4  ; frac += inc_frac (wraps mod 2^16)
    paddusw         %5, %4  ; same add, saturating
    pcmpeqw         %5, %2  ; FFFFh (= -1 per byte) where the add did NOT carry
    paddb           %1, %3  ; int += inc_int + 1
    paddb           %1, %5  ; subtract 1 if no carry
%endmacro
2334
; Expand 16-bit x fractions into interleaved (~f, f) word pairs ready for
; pmaddwd: each output pixel gets weights (10000h-1-f) for the left sample
; and f for the right sample.
; outl=%1 outh=%2 in=%3
%macro SSE2_UnpckXFracuw 3
    pcmpeqw         %1, %1  ; all ones
    pxor            %1, %3  ; %1 = ~frac
    movdqa          %2, %1
    punpcklwd       %1, %3  ; low  half: (~f, f) pairs
    punpckhwd       %2, %3  ; high half: (~f, f) pairs
%endmacro
2343
; Fold the vertical fractions into the x weights: produces the combined
; x*y weights for the top row (%1) and bottom row (%2).  pmulhuw keeps the
; high 16 bits of the unsigned 16x16 product.
; [in:xfrac out:xyfrac0]=%1 [out:xyfrac1]=%2 yfrac0=%3 yfrac1=%4
%macro SSE2_BilinearFastCalcXYFrac 4
    movdqa          %2, %1
    pmulhuw         %1, %3  ; xfrac * yfrac0 >> 16
    pmulhuw         %2, %4  ; xfrac * yfrac1 >> 16
%endmacro
2350
; Reduce 8 accumulated dwords (in %1/%2) to 8 rounded bytes in %1.
; The net scaling is a rounded >>15: shift by 14, then pavgw with zero
; performs (v+1)>>1.
; [in:dwordsl out:bytes] dwordsh=%2 zero=%3
%macro SSE2_BilinearFastPackDwordsToBytes 3
    psrld           %1, 14
    psrld           %2, 14
    packssdw        %1, %2  ; dwords -> words (signed saturation)
    pavgw           %1, %3  ; round: (v+1)>>1
    packuswb        %1, %1  ; words -> bytes (unsigned saturation)
%endmacro
2359
; Emit 8 output pixels of one row for horizontal downscale ratios <= 2x
; (fast / 16-bit precision path).  At <=2x, one 16-byte load per source row
; covers all 8 outputs; pshufb gathers each output's left/right byte pair.
; All xmm_*/p_*/i_*/r_tmp0 names are %defines set up by the calling function.
%macro SSSE3_BilinearFastDownsample2xOrLess_8px 0
    movdqa          xmm_tmp0, xmm_xpos_int
    pshufb          xmm_tmp0, xmm_0                 ; broadcast first gather index
    psubb           xmm_xpos_int, xmm_tmp0          ; indices relative to the load base
    SSE2_UnpckXFracuw xmm_tmp0, xmm_tmp1, xmm_xpos_frac  ; (~f, f) weight pairs
    mov             r_tmp0, i_xpos
    lea             i_xpos, [i_xpos + 8 * i_scalex] ; advance x for the next 8 outputs
    shr             r_tmp0, 16                      ; integer src x (xpos is Q16.16)
    lddqu           xmm_tmp4, [p_src_row0 + r_tmp0]
    pshufb          xmm_tmp4, xmm_xpos_int          ; gather 8 (left,right) byte pairs, row0
    movdqa          xmm_tmp5, xmm_tmp4
    punpcklbw       xmm_tmp4, xmm_0                 ; zero-extend bytes to words
    punpckhbw       xmm_tmp5, xmm_0
    SSE2_BilinearFastCalcXYFrac xmm_tmp0, xmm_tmp2, xmm_yfrac0, xmm_yfrac1
    SSE2_BilinearFastCalcXYFrac xmm_tmp1, xmm_tmp3, xmm_yfrac0, xmm_yfrac1
    pmaddwd         xmm_tmp0, xmm_tmp4              ; weighted horizontal blend, row0
    pmaddwd         xmm_tmp1, xmm_tmp5
    lddqu           xmm_tmp4, [p_src_row1 + r_tmp0]
    pshufb          xmm_tmp4, xmm_xpos_int          ; gather the same pairs, row1
    movdqa          xmm_tmp5, xmm_tmp4
    punpcklbw       xmm_tmp4, xmm_0
    punpckhbw       xmm_tmp5, xmm_0
    pmaddwd         xmm_tmp2, xmm_tmp4              ; weighted horizontal blend, row1
    pmaddwd         xmm_tmp3, xmm_tmp5
    paddd           xmm_tmp0, xmm_tmp2              ; vertical sum
    paddd           xmm_tmp1, xmm_tmp3
    SSE2_BilinearFastPackDwordsToBytes xmm_tmp0, xmm_tmp1, xmm_0
    movlps          [p_dst], xmm_tmp0
    add             p_dst, 8
    SSE2_BilinearIncXposuw xmm_xpos_int, xmm_xpos_frac, xmm_xpos_int_inc, xmm_xpos_frac_inc, xmm_tmp0
%endmacro
2391
; Emit 8 output pixels of one row for horizontal downscale ratios <= 4x
; (fast path).  Two 16-byte loads per row cover 4 outputs each.  Interleaving
; the gather indices with xmm_db80h (bytes with bit 7 set, which pshufb turns
; into zero) makes the gather also zero-extend each byte to a word.
%macro SSSE3_BilinearFastDownsample4xOrLess_8px 0
    movdqa          xmm_tmp0, xmm_xpos_int
    pshufb          xmm_tmp0, xmm_shufb_0000000088888888 ; broadcast base index of each 4-output half
    psubb           xmm_xpos_int, xmm_tmp0              ; indices relative to each half's load base
    SSE2_UnpckXFracuw xmm_tmp0, xmm_tmp1, xmm_xpos_frac
    mov             r_tmp0, i_xpos
    shr             r_tmp0, 16                          ; integer src x (Q16.16)
    lddqu           xmm_tmp3, [p_src_row0 + r_tmp0]
    lddqu           xmm_tmp4, [p_src_row1 + r_tmp0]
    movdqa          xmm_tmp2, xmm_xpos_int
    punpcklbw       xmm_tmp2, xmm_db80h                 ; gather+zero-extend map, outputs 0-3
    pshufb          xmm_tmp3, xmm_tmp2
    pshufb          xmm_tmp4, xmm_tmp2
    SSE2_BilinearFastCalcXYFrac xmm_tmp0, xmm_tmp2, xmm_yfrac0, xmm_yfrac1
    pmaddwd         xmm_tmp0, xmm_tmp3                  ; blend row0
    pmaddwd         xmm_tmp2, xmm_tmp4                  ; blend row1
    paddd           xmm_tmp0, xmm_tmp2                  ; vertical sum, outputs 0-3
    lea             r_tmp0, [i_xpos + 4 * i_scalex]     ; src x of the second half
    lea             i_xpos, [i_xpos + 8 * i_scalex]     ; advance x for the next 8 outputs
    shr             r_tmp0, 16
    lddqu           xmm_tmp3, [p_src_row0 + r_tmp0]
    lddqu           xmm_tmp4, [p_src_row1 + r_tmp0]
    movdqa          xmm_tmp2, xmm_xpos_int
    punpckhbw       xmm_tmp2, xmm_db80h                 ; gather+zero-extend map, outputs 4-7
    pshufb          xmm_tmp3, xmm_tmp2
    pshufb          xmm_tmp4, xmm_tmp2
    SSE2_BilinearFastCalcXYFrac xmm_tmp1, xmm_tmp2, xmm_yfrac0, xmm_yfrac1
    pmaddwd         xmm_tmp1, xmm_tmp3
    pmaddwd         xmm_tmp2, xmm_tmp4
    paddd           xmm_tmp1, xmm_tmp2                  ; vertical sum, outputs 4-7
    SSE2_BilinearFastPackDwordsToBytes xmm_tmp0, xmm_tmp1, xmm_0
    movlps          [p_dst], xmm_tmp0
    add             p_dst, 8
    SSE2_BilinearIncXposuw xmm_xpos_int, xmm_xpos_frac, xmm_xpos_int_inc, xmm_xpos_frac_inc, xmm_tmp0
%endmacro
2427
; Emit 8 output pixels for an arbitrary horizontal scale (fast path).
; Pixel pairs are gathered one at a time with movd/pinsrw (no pshufb), so any
; i_scalex works.  Two groups of 4; note that each group's 4th position is
; read back as i_xpos - i_scalex AFTER i_xpos was advanced by 4*i_scalex,
; i.e. at 3*i_scalex from the group start.
%macro SSE2_GeneralBilinearFastDownsample_8px 0
    mov             r_tmp0, i_xpos
    shr             r_tmp0, 16                          ; integer src x (Q16.16)
    movd            xmm_tmp3, [p_src_row0 + r_tmp0]     ; pair 0, row0 (low 2 bytes used)
    movd            xmm_tmp4, [p_src_row1 + r_tmp0]     ; pair 0, row1
    lea             r_tmp0, [i_xpos + i_scalex]
    shr             r_tmp0, 16
    pinsrw          xmm_tmp3, [p_src_row0 + r_tmp0], 1  ; pair 1
    pinsrw          xmm_tmp4, [p_src_row1 + r_tmp0], 1
    lea             r_tmp0, [i_xpos + 2 * i_scalex]
    lea             i_xpos, [i_xpos + 4 * i_scalex]     ; advance x by 4 outputs
    shr             r_tmp0, 16
    pinsrw          xmm_tmp3, [p_src_row0 + r_tmp0], 2  ; pair 2
    pinsrw          xmm_tmp4, [p_src_row1 + r_tmp0], 2
    mov             r_tmp0, i_xpos
    sub             r_tmp0, i_scalex                    ; pair 3 = 3*i_scalex from group start
    shr             r_tmp0, 16
    pinsrw          xmm_tmp3, [p_src_row0 + r_tmp0], 3
    pinsrw          xmm_tmp4, [p_src_row1 + r_tmp0], 3
    punpcklbw       xmm_tmp3, xmm_0                     ; zero-extend bytes to words
    punpcklbw       xmm_tmp4, xmm_0
    movdqa          xmm_tmp0, xmm_xfrac0
    SSE2_BilinearFastCalcXYFrac xmm_tmp0, xmm_tmp2, xmm_yfrac0, xmm_yfrac1
    pmaddwd         xmm_tmp0, xmm_tmp3                  ; blend row0
    pmaddwd         xmm_tmp2, xmm_tmp4                  ; blend row1
    paddd           xmm_tmp0, xmm_tmp2                  ; vertical sum, outputs 0-3
    ; second group of 4 outputs
    mov             r_tmp0, i_xpos
    shr             r_tmp0, 16
    movd            xmm_tmp3, [p_src_row0 + r_tmp0]
    movd            xmm_tmp4, [p_src_row1 + r_tmp0]
    lea             r_tmp0, [i_xpos + i_scalex]
    shr             r_tmp0, 16
    pinsrw          xmm_tmp3, [p_src_row0 + r_tmp0], 1
    pinsrw          xmm_tmp4, [p_src_row1 + r_tmp0], 1
    lea             r_tmp0, [i_xpos + 2 * i_scalex]
    lea             i_xpos, [i_xpos + 4 * i_scalex]
    shr             r_tmp0, 16
    pinsrw          xmm_tmp3, [p_src_row0 + r_tmp0], 2
    pinsrw          xmm_tmp4, [p_src_row1 + r_tmp0], 2
    mov             r_tmp0, i_xpos
    sub             r_tmp0, i_scalex
    shr             r_tmp0, 16
    pinsrw          xmm_tmp3, [p_src_row0 + r_tmp0], 3
    pinsrw          xmm_tmp4, [p_src_row1 + r_tmp0], 3
    punpcklbw       xmm_tmp3, xmm_0
    punpcklbw       xmm_tmp4, xmm_0
    movdqa          xmm_tmp1, xmm_xfrac1
    SSE2_BilinearFastCalcXYFrac xmm_tmp1, xmm_tmp2, xmm_yfrac0, xmm_yfrac1
    pmaddwd         xmm_tmp1, xmm_tmp3
    pmaddwd         xmm_tmp2, xmm_tmp4
    paddd           xmm_tmp1, xmm_tmp2                  ; vertical sum, outputs 4-7
    SSE2_BilinearFastPackDwordsToBytes xmm_tmp0, xmm_tmp1, xmm_0
    movlps          [p_dst], xmm_tmp0
    add             p_dst, 8
    paddw           xmm_xfrac0, xmm_xfrac_inc           ; advance per-pixel x weights
    paddw           xmm_xfrac1, xmm_xfrac_inc
%endmacro
2485
; Advance the packed x positions by one 8-pixel step (signed-word variant).
; Fractions are kept to 15 bits; the carry is detected from bit 15 going high
; (the word turns "negative") and then masked off.
; xpos_int=%1 xpos_frac=%2 inc_int=%3 inc_frac=%4 7FFFh=%5 tmp=%6
%macro SSE2_BilinearIncXposw 6
    pxor            %6, %6
    paddw           %2, %4  ; frac += inc_frac (bit 15 = carry out of 15 bits)
    pcmpgtw         %6, %2  ; FFFFh where 0 > frac, i.e. where the add carried
    paddb           %1, %3  ; int += inc_int
    psubb           %1, %6  ; add carry
    pand            %2, %5  ; frac &= 7FFFh
%endmacro
2495
; Expand 15-bit x fractions into interleaved (7FFFh-f, f) word pairs ready
; for pmaddwd (signed-safe because both weights fit in 15 bits).
; outl=%1 outh=%2 in=%3 7FFFh=%4
%macro SSE2_UnpckXFracw 4
    movdqa          %1, %3
    pxor            %1, %4  ; %1 = 7FFFh - frac (frac <= 7FFFh)
    movdqa          %2, %1
    punpcklwd       %1, %3  ; low  half: (7FFFh-f, f) pairs
    punpckhwd       %2, %3  ; high half: (7FFFh-f, f) pairs
%endmacro
2504
; Per-dword-lane vertical blend with full 32-bit precision:
;   %1 = (data0 * frac0 + data1 * frac1) >> 29
; pmuludq only multiplies the even dword lanes, so odd lanes are first moved
; into even positions with pshufd; the two result sets are merged via
; shift-alignment (<<3 puts sum>>29 in the high dword) and blendps.
; res>>29=%1 data0=%2 data1=%3 frac0=%4 frac1=%5 tmp=%6
%macro SSE41_LinearAccurateInterpolateVerticalDwords 6
    pshufd          %1, %2, 10110001b   ; odd lanes of data0 -> even positions
    pshufd          %6, %3, 10110001b   ; odd lanes of data1 -> even positions
    pmuludq         %1, %4              ; 64-bit products, odd-lane data
    pmuludq         %6, %5
    paddq           %1, %6              ; odd-lane sums
    pmuludq         %2, %4              ; 64-bit products, even-lane data
    pmuludq         %3, %5
    paddq           %2, %3              ; even-lane sums
    psllq           %1,  3              ; odd result = sum>>29, landing in the high dword
    psrlq           %2, 29              ; even result = sum>>29, in the low dword
    blendps         %1, %2, 0101b       ; merge even results into lanes 0 and 2
%endmacro
2519
; Emit 8 output pixels for horizontal downscale ratios <= 2x (accurate path:
; 15-bit x weights via pmaddwd, full 32-bit vertical blend).  One 16-byte
; load per source row covers all 8 outputs; pshufb gathers the byte pairs.
%macro SSE41_BilinearAccurateDownsample2xOrLess_8px 0
    movdqa          xmm_tmp0, xmm_xpos_int
    pshufb          xmm_tmp0, xmm_0                 ; broadcast first gather index
    psubb           xmm_xpos_int, xmm_tmp0          ; indices relative to the load base
    SSE2_UnpckXFracw xmm_tmp0, xmm_tmp1, xmm_xpos_frac, xmm_7fff
    mov             r_tmp0, i_xpos
    lea             i_xpos, [i_xpos + 8 * i_scalex] ; advance x for the next 8 outputs
    shr             r_tmp0, 16                      ; integer src x (Q16.16)
    lddqu           xmm_tmp4, [p_src_row0 + r_tmp0]
    pshufb          xmm_tmp4, xmm_xpos_int          ; gather 8 (left,right) pairs, row0
    movdqa          xmm_tmp5, xmm_tmp4
    punpcklbw       xmm_tmp4, xmm_0                 ; zero-extend bytes to words
    punpckhbw       xmm_tmp5, xmm_0
    pmaddwd         xmm_tmp4, xmm_tmp0              ; horizontal blend, row0
    pmaddwd         xmm_tmp5, xmm_tmp1
    lddqu           xmm_tmp2, [p_src_row1 + r_tmp0]
    pshufb          xmm_tmp2, xmm_xpos_int          ; gather the same pairs, row1
    movdqa          xmm_tmp3, xmm_tmp2
    punpcklbw       xmm_tmp2, xmm_0
    punpckhbw       xmm_tmp3, xmm_0
    pmaddwd         xmm_tmp2, xmm_tmp0              ; horizontal blend, row1
    pmaddwd         xmm_tmp3, xmm_tmp1
    SSE41_LinearAccurateInterpolateVerticalDwords xmm_tmp0, xmm_tmp4, xmm_tmp2, xmm_yfrac0, xmm_yfrac1, xmm_tmp1
    SSE41_LinearAccurateInterpolateVerticalDwords xmm_tmp1, xmm_tmp5, xmm_tmp3, xmm_yfrac0, xmm_yfrac1, xmm_tmp2
    packssdw        xmm_tmp0, xmm_tmp1
    pavgw           xmm_tmp0, xmm_0                 ; final rounding: (v+1)>>1
    packuswb        xmm_tmp0, xmm_tmp0
    movlps          [p_dst], xmm_tmp0
    add             p_dst, 8
    SSE2_BilinearIncXposw xmm_xpos_int, xmm_xpos_frac, xmm_xpos_int_inc, xmm_xpos_frac_inc, xmm_7fff, xmm_tmp0
%endmacro
2551
; Emit 8 output pixels for horizontal downscale ratios <= 4x (accurate path).
; Two 16-byte loads per row cover 4 outputs each; interleaving the gather
; indices with xmm_db80h (bit 7 set -> pshufb zeroes the byte) makes the
; gather also zero-extend each byte to a word.
%macro SSE41_BilinearAccurateDownsample4xOrLess_8px 0
    movdqa          xmm_tmp0, xmm_xpos_int
    pshufb          xmm_tmp0, xmm_shufb_0000000088888888 ; broadcast base index of each half
    psubb           xmm_xpos_int, xmm_tmp0              ; indices relative to each half's load base
    SSE2_UnpckXFracw xmm_tmp0, xmm_tmp1, xmm_xpos_frac, xmm_7fff
    mov             r_tmp0, i_xpos
    shr             r_tmp0, 16                          ; integer src x (Q16.16)
    movdqa          xmm_tmp3, xmm_xpos_int
    punpcklbw       xmm_tmp3, xmm_db80h                 ; gather+zero-extend map, outputs 0-3
    lddqu           xmm_tmp4, [p_src_row0 + r_tmp0]
    lddqu           xmm_tmp2, [p_src_row1 + r_tmp0]
    lea             r_tmp0, [i_xpos + 4 * i_scalex]     ; src x of the second half
    lea             i_xpos, [i_xpos + 8 * i_scalex]     ; advance x for the next 8 outputs
    shr             r_tmp0, 16
    pshufb          xmm_tmp4, xmm_tmp3
    pshufb          xmm_tmp2, xmm_tmp3
    pmaddwd         xmm_tmp4, xmm_tmp0                  ; horizontal blend, row0
    pmaddwd         xmm_tmp2, xmm_tmp0                  ; horizontal blend, row1
    SSE41_LinearAccurateInterpolateVerticalDwords xmm_tmp0, xmm_tmp4, xmm_tmp2, xmm_yfrac0, xmm_yfrac1, xmm_tmp3
    movdqa          xmm_tmp2, xmm_xpos_int
    punpckhbw       xmm_tmp2, xmm_db80h                 ; gather+zero-extend map, outputs 4-7
    lddqu           xmm_tmp4, [p_src_row0 + r_tmp0]
    lddqu           xmm_tmp3, [p_src_row1 + r_tmp0]
    pshufb          xmm_tmp4, xmm_tmp2
    pshufb          xmm_tmp3, xmm_tmp2
    pmaddwd         xmm_tmp4, xmm_tmp1
    pmaddwd         xmm_tmp3, xmm_tmp1
    SSE41_LinearAccurateInterpolateVerticalDwords xmm_tmp1, xmm_tmp4, xmm_tmp3, xmm_yfrac0, xmm_yfrac1, xmm_tmp2
    packssdw        xmm_tmp0, xmm_tmp1
    pavgw           xmm_tmp0, xmm_0                     ; final rounding: (v+1)>>1
    packuswb        xmm_tmp0, xmm_tmp0
    movlps          [p_dst], xmm_tmp0
    add             p_dst, 8
    SSE2_BilinearIncXposw xmm_xpos_int, xmm_xpos_frac, xmm_xpos_int_inc, xmm_xpos_frac_inc, xmm_7fff, xmm_tmp0
%endmacro
2587
; Emit 8 output pixels at an arbitrary horizontal scale (accurate path:
; scalar movd/pinsrw gathers, 15-bit x weights, 32-bit vertical blend).
; Two groups of 4; each group's 4th position is read as i_xpos - i_scalex
; AFTER i_xpos was advanced by 4*i_scalex, i.e. 3*i_scalex from group start.
%macro SSE41_GeneralBilinearAccurateDownsample_8px 0
    mov             r_tmp0, i_xpos
    shr             r_tmp0, 16                          ; integer src x (Q16.16)
    movd            xmm_tmp4, [p_src_row0 + r_tmp0]     ; pair 0, row0 (low 2 bytes used)
    movd            xmm_tmp2, [p_src_row1 + r_tmp0]     ; pair 0, row1
    lea             r_tmp0, [i_xpos + 1 * i_scalex]
    shr             r_tmp0, 16
    pinsrw          xmm_tmp4, [p_src_row0 + r_tmp0], 1  ; pair 1
    pinsrw          xmm_tmp2, [p_src_row1 + r_tmp0], 1
    lea             r_tmp0, [i_xpos + 2 * i_scalex]
    lea             i_xpos, [i_xpos + 4 * i_scalex]     ; advance x by 4 outputs
    shr             r_tmp0, 16
    pinsrw          xmm_tmp4, [p_src_row0 + r_tmp0], 2  ; pair 2
    pinsrw          xmm_tmp2, [p_src_row1 + r_tmp0], 2
    mov             r_tmp0, i_xpos
    sub             r_tmp0, i_scalex                    ; pair 3 = 3*i_scalex from group start
    shr             r_tmp0, 16
    pinsrw          xmm_tmp4, [p_src_row0 + r_tmp0], 3
    pinsrw          xmm_tmp2, [p_src_row1 + r_tmp0], 3
    punpcklbw       xmm_tmp4, xmm_0                     ; zero-extend bytes to words
    punpcklbw       xmm_tmp2, xmm_0
    pmaddwd         xmm_tmp4, xmm_xfrac0                ; horizontal blend, row0
    pmaddwd         xmm_tmp2, xmm_xfrac0                ; horizontal blend, row1
    SSE41_LinearAccurateInterpolateVerticalDwords xmm_tmp0, xmm_tmp4, xmm_tmp2, xmm_yfrac0, xmm_yfrac1, xmm_tmp3
    ; second group of 4 outputs
    mov             r_tmp0, i_xpos
    shr             r_tmp0, 16
    movd            xmm_tmp4, [p_src_row0 + r_tmp0]
    movd            xmm_tmp3, [p_src_row1 + r_tmp0]
    lea             r_tmp0, [i_xpos + 1 * i_scalex]
    shr             r_tmp0, 16
    pinsrw          xmm_tmp4, [p_src_row0 + r_tmp0], 1
    pinsrw          xmm_tmp3, [p_src_row1 + r_tmp0], 1
    lea             r_tmp0, [i_xpos + 2 * i_scalex]
    lea             i_xpos, [i_xpos + 4 * i_scalex]
    shr             r_tmp0, 16
    pinsrw          xmm_tmp4, [p_src_row0 + r_tmp0], 2
    pinsrw          xmm_tmp3, [p_src_row1 + r_tmp0], 2
    mov             r_tmp0, i_xpos
    sub             r_tmp0, i_scalex
    shr             r_tmp0, 16
    pinsrw          xmm_tmp4, [p_src_row0 + r_tmp0], 3
    pinsrw          xmm_tmp3, [p_src_row1 + r_tmp0], 3
    punpcklbw       xmm_tmp4, xmm_0
    punpcklbw       xmm_tmp3, xmm_0
    pmaddwd         xmm_tmp4, xmm_xfrac1
    pmaddwd         xmm_tmp3, xmm_xfrac1
    SSE41_LinearAccurateInterpolateVerticalDwords xmm_tmp1, xmm_tmp4, xmm_tmp3, xmm_yfrac0, xmm_yfrac1, xmm_tmp2
    packssdw        xmm_tmp0, xmm_tmp1
    pavgw           xmm_tmp0, xmm_0                     ; final rounding: (v+1)>>1
    packuswb        xmm_tmp0, xmm_tmp0
    movlps          [p_dst], xmm_tmp0
    add             p_dst, 8
    paddw           xmm_xfrac0, xmm_xfrac_inc           ; advance per-pixel x weights
    paddw           xmm_xfrac1, xmm_xfrac_inc
    pand            xmm_xfrac0, xmm_7fff                ; keep weights to 15 bits
    pand            xmm_xfrac1, xmm_7fff
%endmacro
2645
; Shared row/column driver for the general bilinear downsamplers.
; Per dst row: derive the two source row pointers from i_ypos (Q15), build the
; vertical weight vectors, run the 8-pixel macro across the row, then handle
; the final pixel with a nearest sample and step y/dst forward.
; downsample_8px_macro=%1 b_fast=%2
; b_fast selects the 16-bit-word weight layout; otherwise the 15-bit
; fractions are kept in dword lanes for the accurate path.
%macro SSE2_GeneralBilinearDownsampler_loop 2
%%height:
    mov             p_src_row0, i_ypos
    shr             p_src_row0, 15                      ; integer src y (ypos is Q15)
    imul            p_src_row0, i_src_stride
    add             p_src_row0, p_src                   ; row above
    mov             p_src_row1, p_src_row0
    add             p_src_row1, i_src_stride            ; row below
    movd            xmm_tmp1, i_yposd
%if %2
    pshuflw         xmm_tmp1, xmm_tmp1, 0
    psllw           xmm_tmp1, 1
    psrlw           xmm_tmp1, 1                         ; yfrac1 = ypos & 7FFFh, per word
%else
    pslld           xmm_tmp1, 17
    psrld           xmm_tmp1, 17                        ; yfrac1 = ypos & 7FFFh, per dword
%endif
%ifdef X86_32
    pshufd          xmm_tmp1, xmm_tmp1, 0
    pcmpeqw         xmm_tmp0, xmm_tmp0
%if %2
    psrlw           xmm_tmp0, 1                         ; 7FFFh per word
%else
    psrld           xmm_tmp0, 17                        ; 7FFFh per dword
%endif
    pxor            xmm_tmp0, xmm_tmp1                  ; yfrac0 = 7FFFh - yfrac1
    movdqa          xmm_yfrac0, xmm_tmp0
    movdqa          xmm_yfrac1, xmm_tmp1
%else
    pshufd          xmm_yfrac1, xmm_tmp1, 0
    pcmpeqw         xmm_yfrac0, xmm_yfrac0
%if %2
    psrlw           xmm_yfrac0, 1
%else
    psrld           xmm_yfrac0, 17
%endif
    pxor            xmm_yfrac0, xmm_yfrac1              ; yfrac0 = 7FFFh - yfrac1
%endif

    mov             i_xpos, 1 << 15                     ; start x at 0.5 (Q16.16 rounding bias)
    mov             i_width_cnt, i_dst_width
    sub             i_width_cnt, 1                      ; last pixel handled separately below

%ifdef xmm_xpos_int
    movdqa          xmm_xpos_int, xmm_xpos_int_begin    ; reset per-row x gather state
    movdqa          xmm_xpos_frac, xmm_xpos_frac_begin
%else
    movdqa          xmm_xfrac0, xmm_xfrac0_begin        ; reset per-row x weights
    movdqa          xmm_xfrac1, xmm_xfrac1_begin
%endif

%%width:
    %1
    sub             i_width_cnt, 8
    jg              %%width

    ; the 8px macro may overshoot; i_width_cnt <= 0 here.  Step p_dst back to
    ; the exact row end and rewind i_xpos by the matching source distance,
    ; then write the final pixel as a nearest sample from the top row.
    lea             p_dst, [p_dst + i_width_cnt + 1]
    imul            i_width_cnt, i_scalex
    add             i_xpos, i_width_cnt
    shr             i_xpos, 16
    movzx           r_tmp0, byte [p_src_row0 + i_xpos]
    mov             [p_dst - 1], r_tmp0b
%ifdef X86_32
    mov             r_tmp0, i_scaleyd                   ; no reg-mem add for this operand mix on ia32
    add             i_yposd, r_tmp0
%else
    add             i_yposd, i_scaleyd                  ; ypos += scaley
%endif
    add             p_dst, i_dst_stride_less_width      ; advance to the next dst row
    sub             i_dst_height, 1
    jg              %%height
%endmacro
2719
2720;**************************************************************************************************************
2721;void GeneralBilinearFastDownsampler_ssse3 (uint8_t* pDst, int32_t iDstStride, int32_t iDstWidth,
2722;    int32_t iDstHeight, uint8_t* pSrc, int32_t iSrcStride, uint32_t uiScaleX,
2723;    uint32_t uiScaleY);
2724;
2725;**************************************************************************************************************
2726
WELS_EXTERN GeneralBilinearFastDownsampler_ssse3
;; Bilinear down-scaler, "fast" (16-bit-fraction) arithmetic, SSSE3.
;; C signature (cdecl via LOAD_7_PARA + arg8):
;;   void GeneralBilinearFastDownsampler_ssse3 (uint8_t* pDst, int32_t iDstStride,
;;       int32_t iDstWidth, int32_t iDstHeight, uint8_t* pSrc, int32_t iSrcStride,
;;       uint32_t uiScaleX, uint32_t uiScaleY);
;; uiScaleX/uiScaleY are fixed-point source steps per destination pixel/row.
;; The bulk of the work is done by SSE2_GeneralBilinearDownsampler_loop with a
;; per-ratio 8-pixel kernel chosen below; the last row is point-sampled.
    %assign push_num 0
%ifndef X86_32
    push            r12                     ; callee-saved regs used as accumulators
    push            r13
    push            rbx
    push            rbp
    %assign push_num 4
%ifdef WIN64
    push            rdi                     ; Win64: rdi/rsi are callee-saved too
    push            rsi
    %assign push_num push_num + 2
%endif
%endif
    LOAD_7_PARA
    PUSH_XMM 16
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r2, r2d
    SIGN_EXTENSION  r3, r3d
    SIGN_EXTENSION  r5, r5d
    ZERO_EXTENSION  r6d
    sub             r1, r2                                            ; dst_stride - dst_width
%ifdef X86_32
    ; 32-bit: only 8 GPRs / 8 xmm regs, so most state lives in aligned stack
    ; slots. Stash uiScaleY (arg8 is esp-relative) and the original esp before
    ; realigning the stack for movdqa-able spill slots.
    movd            xmm0, arg8
    movd            xmm1, esp
    and             esp, -16
%ifdef X86_32_PICASM
    sub             esp, 8 * 4 + 9 * 16     ; two extra slots for on-stack constants
%else
    sub             esp, 8 * 4 + 7 * 16
%endif
    movd            [esp], xmm1             ; [esp] = saved esp, restored at .done
    ; Stack-slot / register map for the X86_32 build.
    %define p_dst                   r0
    %define i_dst_stride_less_width [esp + 1 * 4]
    %define i_dst_width             [esp + 2 * 4]
    %define i_dst_height            dword [esp + 3 * 4]
    %define p_src                   [esp + 4 * 4]
    %define i_src_stride            [esp + 5 * 4]
    %define i_scalex                r6
    %define i_scalexd               r6d
    %define i_scaleyd               [esp + 6 * 4]
    %define i_xpos                  r2
    %define i_ypos                  dword [esp + 7 * 4]
    %define i_yposd                 dword [esp + 7 * 4]
    %define p_src_row0              r3
    %define p_src_row1              r4
    %define i_width_cnt             r5
    %define r_tmp0                  r1
    %define r_tmp0b                 r1b
    %define xmm_xpos_frac           xmm1
    %define xmm_xpos_frac_inc       [esp + 8 * 4]
    %define xmm_xpos_int            xmm3
    %define xmm_xpos_int_inc        [esp + 8 * 4 + 1 * 16]
    %define xmm_yfrac0              [esp + 8 * 4 + 2 * 16]
    %define xmm_yfrac1              [esp + 8 * 4 + 3 * 16]
    %define xmm_tmp0                xmm7
    %define xmm_tmp1                xmm0
    %define xmm_tmp2                xmm2
    %define xmm_tmp3                xmm4
    %define xmm_tmp4                xmm5
    %define xmm_tmp5                xmm6
    %define xmm_0                   [esp + 8 * 4 + 4 * 16]
    %define xmm_xpos_int_begin      [esp + 8 * 4 + 5 * 16]
    %define xmm_xpos_frac_begin     [esp + 8 * 4 + 6 * 16]
%ifdef X86_32_PICASM
    ; Position-independent 32-bit build: synthesize the two constants on the
    ; stack instead of referencing absolute .rodata addresses.
    %define xmm_db80h                  [esp + 8 * 4 + 7 * 16]
    %define xmm_shufb_0000000088888888 [esp + 8 * 4 + 8 * 16]
    pxor            xmm_tmp4, xmm_tmp4
    pcmpeqb         xmm_tmp5, xmm_tmp5
    psubb           xmm_tmp4, xmm_tmp5      ; tmp4 = 01h per byte
    movdqa          xmm_tmp3, xmm_tmp4
    psllw           xmm_tmp3, 3             ; 08h per byte
    pslldq          xmm_tmp3, 8             ; low 8 bytes 0, high 8 bytes 08h
    movdqa          xmm_shufb_0000000088888888, xmm_tmp3
    psllw           xmm_tmp4, 7             ; 80h per byte
    movdqa          xmm_db80h, xmm_tmp4
%else
    %define xmm_db80h               [db80h_256]
    %define xmm_shufb_0000000088888888 [shufb_0000000088888888]
%endif
    ; Spill the arguments that were sign/zero-extended above into their slots.
    mov             i_dst_stride_less_width, r1
    mov             i_dst_width, r2
    mov             i_dst_height, r3
    mov             p_src, r4
    mov             i_src_stride, r5
    movd            i_scaleyd, xmm0
    pxor            xmm_tmp0, xmm_tmp0
    movdqa          xmm_0, xmm_tmp0
%else
    ; 64-bit: everything fits in registers (xmm13 unused in this routine).
    %define p_dst                   r0
    %define i_dst_stride_less_width r1
    %define i_dst_width             r2
    %define i_dst_height            r3
    %define p_src                   r4
    %define i_src_stride            r5
    %define i_scalex                r6
    %define i_scalexd               r6d
    %define i_scaleyd               dword arg8d
    %define i_xpos                  r12
    %define i_ypos                  r13
    %define i_yposd                 r13d
    %define p_src_row0              rbp
%ifdef WIN64
    %define p_src_row1              rsi
    %define i_width_cnt             rdi
%else
    %define p_src_row1              r11
    %define i_width_cnt             rax
%endif
    %define r_tmp0                  rbx
    %define r_tmp0b                 bl
    %define xmm_0                   xmm0
    %define xmm_xpos_frac           xmm1
    %define xmm_xpos_frac_inc       xmm8
    %define xmm_xpos_int            xmm3
    %define xmm_xpos_int_inc        xmm10
    %define xmm_yfrac0              xmm11
    %define xmm_yfrac1              xmm12
    %define xmm_tmp0                xmm7
    %define xmm_tmp1                xmm2
    %define xmm_tmp2                xmm9
    %define xmm_tmp3                xmm4
    %define xmm_tmp4                xmm5
    %define xmm_tmp5                xmm6
    %define xmm_xpos_int_begin      xmm14
    %define xmm_xpos_frac_begin     xmm15
    %define xmm_db80h               [db80h_256]
    %define xmm_shufb_0000000088888888 [shufb_0000000088888888]
    pxor            xmm_0, xmm_0
%endif

    sub             i_dst_height, 1         ; all rows but the last use bilinear filtering
    je              .final_row
    jl              .done

    ; ---- build per-pixel horizontal positions for the first 8 output pixels ----
    ; After this sequence, lane i of (xpos_int:xpos_frac) holds
    ; i * uiScaleX + (1 << 15) (rounding bias) split into integer byte indices
    ; (interleaved with index+1 for pshufb pair gathering) and 16-bit fractions.
    mov             i_ypos, 1 << 14         ; vertical accumulator bias (row selected via >> 15, see .final_row)
    movd            xmm_xpos_frac, i_scalexd
    pshufd          xmm_xpos_frac, xmm_xpos_frac, 0
    movdqa          xmm_tmp0, xmm_xpos_frac
    pslld           xmm_tmp0, 2             ; tmp0 = 4 * scalex per lane
    pslldq          xmm_xpos_frac, 4        ; prefix-sum: lane i accumulates i * scalex
    paddd           xmm_tmp0, xmm_xpos_frac
    movdqa          xmm_tmp1, xmm_xpos_frac
    pslldq          xmm_tmp1, 4
    paddd           xmm_xpos_frac, xmm_tmp1
    paddd           xmm_tmp0, xmm_tmp1
    pslldq          xmm_tmp1, 4
    paddd           xmm_xpos_frac, xmm_tmp1 ; lanes 0..3: i * scalex
    paddd           xmm_tmp0, xmm_tmp1      ; lanes 4..7 (in tmp0): i * scalex
    pcmpeqw         xmm_tmp1, xmm_tmp1
    psrld           xmm_tmp1, 31
    pslld           xmm_tmp1, 15            ; 1 << 15 rounding bias per lane
    paddd           xmm_xpos_frac, xmm_tmp1
    paddd           xmm_tmp0, xmm_tmp1
    movdqa          xmm_xpos_int, xmm_xpos_frac
    movdqa          xmm_tmp1, xmm_tmp0
    psrld           xmm_xpos_int, 16        ; integer part of each position
    psrld           xmm_tmp1, 16
    packssdw        xmm_xpos_int, xmm_tmp1
    packuswb        xmm_xpos_int, xmm_xpos_int
    movdqa          xmm_tmp1, xmm_xpos_int
    pcmpeqw         xmm_tmp2, xmm_tmp2
    psubb           xmm_tmp1, xmm_tmp2      ; tmp1 = int + 1 (tmp2 is all ones)
    punpcklbw       xmm_xpos_int, xmm_tmp1  ; interleave (idx, idx+1) byte pairs for pshufb
    pslld           xmm_xpos_frac, 16
    pslld           xmm_tmp0, 16
    psrad           xmm_xpos_frac, 16
    psrad           xmm_tmp0, 16
    packssdw        xmm_xpos_frac, xmm_tmp0 ; 8 x 16-bit fractional parts
    ; Per-iteration increments: 8 * scalex, fraction and integer parts.
    movd            xmm_tmp0, i_scalexd
    pslld           xmm_tmp0, 3
    movdqa          xmm_tmp1, xmm_tmp0
    punpcklwd       xmm_tmp0, xmm_tmp0
    pshufd          xmm_tmp0, xmm_tmp0, 0
    movdqa          xmm_xpos_frac_inc, xmm_tmp0
    psrld           xmm_tmp1, 16
    psubw           xmm_tmp1, xmm_tmp2      ; int increment + 1; the increment macro
                                            ; corrects by -1 when the fraction does not carry
    pxor            xmm_tmp2, xmm_tmp2
    pshufb          xmm_tmp1, xmm_tmp2      ; broadcast low byte to all lanes
    movdqa          xmm_xpos_int_inc, xmm_tmp1
    movdqa          xmm_xpos_int_begin, xmm_xpos_int    ; saved for reload at each row start
    movdqa          xmm_xpos_frac_begin, xmm_xpos_frac

    ; Dispatch on the horizontal ratio: tighter kernels can gather 8 pixel
    ; pairs with fewer loads when positions span fewer source bytes.
    cmp             i_scalex, 4 << 16
    ja              .scalex_above4
    cmp             i_scalex, 2 << 16
    ja              .scalex_above2_beloweq4
    SSE2_GeneralBilinearDownsampler_loop SSSE3_BilinearFastDownsample2xOrLess_8px, 1
    jmp             .final_row
%ifdef X86_32
    ; From here on the 32-bit build keeps yfrac0 in a register (xmm6, the old
    ; xmm_tmp5) instead of a stack slot. Preprocessor-only: emits no code.
    %undef xmm_yfrac0
    %xdefine xmm_yfrac0 xmm_tmp5
    %undef xmm_tmp5
%endif
.scalex_above2_beloweq4:
    SSE2_GeneralBilinearDownsampler_loop SSSE3_BilinearFastDownsample4xOrLess_8px, 1
    jmp             .final_row
.scalex_above4:
    ; Generic path: xpos vectors are repurposed as pre-unpacked x-weight pairs.
    %xdefine xmm_xfrac0 xmm_xpos_frac
    %xdefine xmm_xfrac1 xmm_xpos_int
    %xdefine xmm_xfrac0_begin xmm_xpos_int_begin
    %xdefine xmm_xfrac1_begin xmm_xpos_frac_begin
    %xdefine xmm_xfrac_inc xmm_xpos_frac_inc
%undef xmm_xpos_int
%undef xmm_xpos_frac
%undef xmm_xpos_int_begin
%undef xmm_xpos_frac_begin
%undef xmm_xpos_int_inc
%undef xmm_xpos_frac_inc
    SSE2_UnpckXFracuw xmm_tmp0, xmm_xfrac1, xmm_xfrac0
    movdqa          xmm_xfrac0, xmm_tmp0
    movdqa          xmm_xfrac0_begin, xmm_xfrac0
    movdqa          xmm_xfrac1_begin, xmm_xfrac1
    pcmpeqw         xmm_tmp0, xmm_tmp0
    pmullw          xmm_tmp0, xmm_xfrac_inc ; -inc per word
    punpcklwd       xmm_tmp0, xmm_xfrac_inc ; (-inc, +inc) pairs to step the weight pairs
    movdqa          xmm_xfrac_inc, xmm_tmp0
    SSE2_GeneralBilinearDownsampler_loop SSE2_GeneralBilinearFastDownsample_8px, 1

.final_row:
    ; Last destination row: no row below to blend with, so point-sample the
    ; selected source row (avoids reading past the bottom of the source).
    mov             p_src_row0, i_ypos
    shr             p_src_row0, 15
    imul            p_src_row0, i_src_stride
    add             p_src_row0, p_src
    mov             i_xpos, 1 << 15         ; horizontal accumulator, half-pixel bias
    mov             i_width_cnt, i_dst_width

.final_row_width:
    mov             r_tmp0, i_xpos
    shr             r_tmp0, 16              ; integer source column
    movzx           r_tmp0, byte [p_src_row0 + r_tmp0]
    mov             [p_dst], r_tmp0b
    add             p_dst, 1
    add             i_xpos, i_scalex
    sub             i_width_cnt, 1
    jg              .final_row_width

.done:
%ifdef X86_32
    mov             esp, [esp]              ; restore pre-alignment esp saved at entry
%endif
    POP_XMM
    LOAD_7_PARA_POP
%ifndef X86_32
%ifdef WIN64
    pop             rsi
    pop             rdi
%endif
    pop             rbp
    pop             rbx
    pop             r13
    pop             r12
%endif
    ret
; Clean up all routine-local aliases so the next function can redefine them.
%undef p_dst
%undef i_dst_stride_less_width
%undef i_dst_width
%undef i_dst_height
%undef p_src
%undef i_src_stride
%undef i_scalex
%undef i_scalexd
%undef i_scaleyd
%undef i_xpos
%undef i_ypos
%undef i_yposd
%undef p_src_row0
%undef p_src_row1
%undef i_width_cnt
%undef r_tmp0
%undef r_tmp0b
%undef xmm_0
%undef xmm_xpos_frac
%undef xmm_xpos_frac_inc
%undef xmm_xpos_int
%undef xmm_xpos_int_inc
%undef xmm_yfrac0
%undef xmm_yfrac1
%undef xmm_tmp0
%undef xmm_tmp1
%undef xmm_tmp2
%undef xmm_tmp3
%undef xmm_tmp4
%undef xmm_tmp5
%undef xmm_xpos_int_begin
%undef xmm_xpos_frac_begin
%undef xmm_xfrac0
%undef xmm_xfrac1
%undef xmm_xfrac0_begin
%undef xmm_xfrac1_begin
%undef xmm_xfrac_inc
%undef xmm_db80h
%undef xmm_shufb_0000000088888888
3020
3021;**************************************************************************************************************
3022;void GeneralBilinearAccurateDownsampler_sse41 (uint8_t* pDst, int32_t iDstStride, int32_t iDstWidth,
3023;    int32_t iDstHeight, uint8_t* pSrc, int32_t iSrcStride, uint32_t uiScaleX,
3024;    uint32_t uiScaleY);
3025;
3026;**************************************************************************************************************
3027
WELS_EXTERN GeneralBilinearAccurateDownsampler_sse41
;; Bilinear down-scaler, "accurate" (15-bit-fraction) arithmetic, SSE4.1.
;; C signature (cdecl via LOAD_7_PARA + arg8):
;;   void GeneralBilinearAccurateDownsampler_sse41 (uint8_t* pDst, int32_t iDstStride,
;;       int32_t iDstWidth, int32_t iDstHeight, uint8_t* pSrc, int32_t iSrcStride,
;;       uint32_t uiScaleX, uint32_t uiScaleY);
;; Same structure as GeneralBilinearFastDownsampler_ssse3 above, but the
;; horizontal step is doubled up front and fractions are kept to 15 bits
;; (masked with xmm_7fff) for the higher-precision SSE4.1 kernels.
    %assign push_num 0
%ifndef X86_32
    push            r12                     ; callee-saved regs used as accumulators
    push            r13
    push            rbx
    push            rbp
    %assign push_num 4
%ifdef WIN64
    push            rdi                     ; Win64: rdi/rsi are callee-saved too
    push            rsi
    %assign push_num push_num + 2
%endif
%endif
    LOAD_7_PARA
    PUSH_XMM 16
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r2, r2d
    SIGN_EXTENSION  r3, r3d
    SIGN_EXTENSION  r5, r5d
    ZERO_EXTENSION  r6d
    sub             r1, r2                                            ; dst_stride - dst_width
    add             r6, r6                                            ; 2 * scalex
%ifdef X86_32
    ; 32-bit: only 8 GPRs / 8 xmm regs, so most state lives in aligned stack
    ; slots. Stash uiScaleY (arg8 is esp-relative) and the original esp before
    ; realigning the stack for movdqa-able spill slots.
    movd            xmm0, arg8
    movd            xmm1, esp
    and             esp, -16
%ifdef X86_32_PICASM
    sub             esp, 8 * 4 + 10 * 16    ; two extra slots for on-stack constants
%else
    sub             esp, 8 * 4 + 8 * 16
%endif
    movd            [esp], xmm1             ; [esp] = saved esp, restored at .done
    ; Stack-slot / register map for the X86_32 build.
    %define p_dst                   r0
    %define i_dst_stride_less_width [esp + 1 * 4]
    %define i_dst_width             [esp + 2 * 4]
    %define i_dst_height            dword [esp + 3 * 4]
    %define p_src                   [esp + 4 * 4]
    %define i_src_stride            [esp + 5 * 4]
    %define i_scalex                r6
    %define i_scalexd               r6d
    %define i_scaleyd               [esp + 6 * 4]
    %define i_xpos                  r2
    %define i_ypos                  dword [esp + 7 * 4]
    %define i_yposd                 dword [esp + 7 * 4]
    %define p_src_row0              r3
    %define p_src_row1              r4
    %define i_width_cnt             r5
    %define r_tmp0                  r1
    %define r_tmp0b                 r1b
    %define xmm_xpos_frac           xmm1
    %define xmm_xpos_frac_inc       [esp + 8 * 4]
    %define xmm_xpos_int            xmm3
    %define xmm_xpos_int_inc        [esp + 8 * 4 + 1 * 16]
    %define xmm_yfrac0              [esp + 8 * 4 + 2 * 16]
    %define xmm_yfrac1              [esp + 8 * 4 + 3 * 16]
    %define xmm_tmp0                xmm7
    %define xmm_tmp1                xmm0
    %define xmm_tmp2                xmm2
    %define xmm_tmp3                xmm4
    %define xmm_tmp4                xmm5
    %define xmm_tmp5                xmm6
    %define xmm_0                   [esp + 8 * 4 + 4 * 16]
    %define xmm_7fff                [esp + 8 * 4 + 5 * 16]
    %define xmm_xpos_int_begin      [esp + 8 * 4 + 6 * 16]
    %define xmm_xpos_frac_begin     [esp + 8 * 4 + 7 * 16]
%ifdef X86_32_PICASM
    ; Position-independent 32-bit build: synthesize the two constants on the
    ; stack instead of referencing absolute .rodata addresses.
    %define xmm_db80h                  [esp + 8 * 4 + 8 * 16]
    %define xmm_shufb_0000000088888888 [esp + 8 * 4 + 9 * 16]
    pxor            xmm_tmp4, xmm_tmp4
    pcmpeqb         xmm_tmp5, xmm_tmp5
    psubb           xmm_tmp4, xmm_tmp5      ; tmp4 = 01h per byte
    movdqa          xmm_tmp3, xmm_tmp4
    psllw           xmm_tmp3, 3             ; 08h per byte
    pslldq          xmm_tmp3, 8             ; low 8 bytes 0, high 8 bytes 08h
    movdqa          xmm_shufb_0000000088888888, xmm_tmp3
    psllw           xmm_tmp4, 7             ; 80h per byte
    movdqa          xmm_db80h, xmm_tmp4
%else
    %define xmm_db80h               [db80h_256]
    %define xmm_shufb_0000000088888888 [shufb_0000000088888888]
%endif
    ; Spill the arguments that were sign/zero-extended above into their slots.
    mov             i_dst_stride_less_width, r1
    mov             i_dst_width, r2
    mov             i_dst_height, r3
    mov             p_src, r4
    mov             i_src_stride, r5
    movd            i_scaleyd, xmm0
    pxor            xmm_tmp5, xmm_tmp5
    movdqa          xmm_0, xmm_tmp5
    pcmpeqw         xmm_tmp5, xmm_tmp5
    psrlw           xmm_tmp5, 1             ; 7FFFh per word: 15-bit fraction mask
    movdqa          xmm_7fff, xmm_tmp5
%else
    ; 64-bit: everything fits in registers.
    %define p_dst                   r0
    %define i_dst_stride_less_width r1
    %define i_dst_width             r2
    %define i_dst_height            r3
    %define p_src                   r4
    %define i_src_stride            r5
    %define i_scalex                r6
    %define i_scalexd               r6d
    %define i_scaleyd               dword arg8d
    %define i_xpos                  r12
    %define i_ypos                  r13
    %define i_yposd                 r13d
    %define p_src_row0              rbp
%ifdef WIN64
    %define p_src_row1              rsi
    %define i_width_cnt             rdi
%else
    %define p_src_row1              r11
    %define i_width_cnt             rax
%endif
    %define r_tmp0                  rbx
    %define r_tmp0b                 bl
    %define xmm_0                   xmm0
    %define xmm_xpos_frac           xmm1
    %define xmm_xpos_frac_inc       xmm8
    %define xmm_xpos_int            xmm3
    %define xmm_xpos_int_inc        xmm10
    %define xmm_yfrac0              xmm11
    %define xmm_yfrac1              xmm12
    %define xmm_tmp0                xmm7
    %define xmm_tmp1                xmm2
    %define xmm_tmp2                xmm9
    %define xmm_tmp3                xmm4
    %define xmm_tmp4                xmm5
    %define xmm_tmp5                xmm6
    %define xmm_7fff                xmm13
    %define xmm_xpos_int_begin      xmm14
    %define xmm_xpos_frac_begin     xmm15
    %define xmm_db80h               [db80h_256]
    %define xmm_shufb_0000000088888888 [shufb_0000000088888888]
    pxor            xmm_0, xmm_0
    pcmpeqw         xmm_7fff, xmm_7fff
    psrlw           xmm_7fff, 1             ; 7FFFh per word: 15-bit fraction mask
%endif

    sub             i_dst_height, 1         ; all rows but the last use bilinear filtering
    je              .final_row
    jl              .done

    ; ---- build per-pixel horizontal positions for the first 8 output pixels ----
    ; Same prefix-sum construction as the fast variant: lane i gets
    ; i * i_scalex + (1 << 15), split into interleaved (idx, idx+1) byte
    ; indices and fractional parts; fractions are then reduced to 15 bits.
    mov             i_ypos, 1 << 14         ; vertical accumulator bias (row selected via >> 15, see .final_row)
    movd            xmm_xpos_frac, i_scalexd
    pshufd          xmm_xpos_frac, xmm_xpos_frac, 0
    movdqa          xmm_tmp0, xmm_xpos_frac
    pslld           xmm_tmp0, 2             ; tmp0 = 4 * scalex per lane
    pslldq          xmm_xpos_frac, 4        ; prefix-sum: lane i accumulates i * scalex
    paddd           xmm_tmp0, xmm_xpos_frac
    movdqa          xmm_tmp1, xmm_xpos_frac
    pslldq          xmm_tmp1, 4
    paddd           xmm_xpos_frac, xmm_tmp1
    paddd           xmm_tmp0, xmm_tmp1
    pslldq          xmm_tmp1, 4
    paddd           xmm_xpos_frac, xmm_tmp1 ; lanes 0..3: i * scalex
    paddd           xmm_tmp0, xmm_tmp1      ; lanes 4..7 (in tmp0): i * scalex
    pcmpeqw         xmm_tmp1, xmm_tmp1
    psrld           xmm_tmp1, 31
    pslld           xmm_tmp1, 15            ; 1 << 15 rounding bias per lane
    paddd           xmm_xpos_frac, xmm_tmp1
    paddd           xmm_tmp0, xmm_tmp1
    movdqa          xmm_xpos_int, xmm_xpos_frac
    movdqa          xmm_tmp1, xmm_tmp0
    psrld           xmm_xpos_int, 16        ; integer part of each position
    psrld           xmm_tmp1, 16
    packssdw        xmm_xpos_int, xmm_tmp1
    packuswb        xmm_xpos_int, xmm_xpos_int
    movdqa          xmm_tmp1, xmm_xpos_int
    pcmpeqw         xmm_tmp2, xmm_tmp2
    psubb           xmm_tmp1, xmm_tmp2      ; tmp1 = int + 1 (tmp2 is all ones)
    punpcklbw       xmm_xpos_int, xmm_tmp1  ; interleave (idx, idx+1) byte pairs for pshufb
    pslld           xmm_xpos_frac, 16
    pslld           xmm_tmp0, 16
    psrad           xmm_xpos_frac, 16
    psrad           xmm_tmp0, 16
    packssdw        xmm_xpos_frac, xmm_tmp0
    psrlw           xmm_xpos_frac, 1        ; 15-bit fractions for the accurate kernels
    ; Per-iteration increments: 8 * i_scalex, fraction and integer parts.
    movd            xmm_tmp0, i_scalexd
    pslld           xmm_tmp0, 3
    movdqa          xmm_tmp1, xmm_tmp0
    punpcklwd       xmm_tmp0, xmm_tmp0
    pshufd          xmm_tmp0, xmm_tmp0, 0
    psrlw           xmm_tmp0, 1             ; fraction increment also kept to 15 bits
    movdqa          xmm_xpos_frac_inc, xmm_tmp0
    psrld           xmm_tmp1, 16
    pxor            xmm_tmp2, xmm_tmp2
    pshufb          xmm_tmp1, xmm_tmp2      ; broadcast low byte to all lanes
    movdqa          xmm_xpos_int_inc, xmm_tmp1
    movdqa          xmm_xpos_int_begin, xmm_xpos_int    ; saved for reload at each row start
    movdqa          xmm_xpos_frac_begin, xmm_xpos_frac

    ; Dispatch on the horizontal ratio (i_scalex is already doubled here).
    cmp             i_scalex, 4 << 16
    ja              .scalex_above4
    cmp             i_scalex, 2 << 16
    ja              .scalex_above2_beloweq4
    SSE2_GeneralBilinearDownsampler_loop SSE41_BilinearAccurateDownsample2xOrLess_8px, 0
    jmp             .final_row
%ifdef X86_32
    ; From here on the 32-bit build keeps yfrac0 in a register (xmm6, the old
    ; xmm_tmp5) instead of a stack slot. Preprocessor-only: emits no code.
    %undef xmm_yfrac0
    %xdefine xmm_yfrac0 xmm_tmp5
    %undef xmm_tmp5
%endif
.scalex_above2_beloweq4:
    SSE2_GeneralBilinearDownsampler_loop SSE41_BilinearAccurateDownsample4xOrLess_8px, 0
    jmp             .final_row
.scalex_above4:
    ; Generic path: xpos vectors are repurposed as pre-unpacked x-weight pairs.
    %xdefine xmm_xfrac0 xmm_xpos_frac
    %xdefine xmm_xfrac1 xmm_xpos_int
    %xdefine xmm_xfrac0_begin xmm_xpos_int_begin
    %xdefine xmm_xfrac1_begin xmm_xpos_frac_begin
    %xdefine xmm_xfrac_inc xmm_xpos_frac_inc
%undef xmm_xpos_int
%undef xmm_xpos_frac
%undef xmm_xpos_int_begin
%undef xmm_xpos_frac_begin
%undef xmm_xpos_int_inc
%undef xmm_xpos_frac_inc
    SSE2_UnpckXFracw xmm_tmp0, xmm_xfrac1, xmm_xfrac0, xmm_7fff
    movdqa          xmm_xfrac0, xmm_tmp0
    movdqa          xmm_xfrac0_begin, xmm_xfrac0
    movdqa          xmm_xfrac1_begin, xmm_xfrac1
    pcmpeqw         xmm_tmp0, xmm_tmp0
    pmullw          xmm_tmp0, xmm_xfrac_inc ; -inc per word
    punpcklwd       xmm_tmp0, xmm_xfrac_inc ; (-inc, +inc) pairs to step the weight pairs
    movdqa          xmm_xfrac_inc, xmm_tmp0
    SSE2_GeneralBilinearDownsampler_loop SSE41_GeneralBilinearAccurateDownsample_8px, 0

.final_row:
    ; Last destination row: no row below to blend with, so point-sample the
    ; selected source row (avoids reading past the bottom of the source).
    mov             p_src_row0, i_ypos
    shr             p_src_row0, 15
    imul            p_src_row0, i_src_stride
    add             p_src_row0, p_src
    mov             i_xpos, 1 << 15         ; horizontal accumulator, half-pixel bias
    mov             i_width_cnt, i_dst_width

.final_row_width:
    mov             r_tmp0, i_xpos
    shr             r_tmp0, 16              ; integer source column
    movzx           r_tmp0, byte [p_src_row0 + r_tmp0]
    mov             [p_dst], r_tmp0b
    add             p_dst, 1
    add             i_xpos, i_scalex
    sub             i_width_cnt, 1
    jg              .final_row_width

.done:
%ifdef X86_32
    mov             esp, [esp]              ; restore pre-alignment esp saved at entry
%endif
    POP_XMM
    LOAD_7_PARA_POP
%ifndef X86_32
%ifdef WIN64
    pop             rsi
    pop             rdi
%endif
    pop             rbp
    pop             rbx
    pop             r13
    pop             r12
%endif
    ret
; Clean up all routine-local aliases so the next function can redefine them.
%undef p_dst
%undef i_dst_stride_less_width
%undef i_dst_width
%undef i_dst_height
%undef p_src
%undef i_src_stride
%undef i_scalex
%undef i_scalexd
%undef i_scaleyd
%undef i_xpos
%undef i_ypos
%undef i_yposd
%undef p_src_row0
%undef p_src_row1
%undef i_width_cnt
%undef r_tmp0
%undef r_tmp0b
%undef xmm_0
%undef xmm_xpos_frac
%undef xmm_xpos_frac_inc
%undef xmm_xpos_int
%undef xmm_xpos_int_inc
%undef xmm_yfrac0
%undef xmm_yfrac1
%undef xmm_tmp0
%undef xmm_tmp1
%undef xmm_tmp2
%undef xmm_tmp3
%undef xmm_tmp4
%undef xmm_tmp5
%undef xmm_7fff
%undef xmm_xpos_int_begin
%undef xmm_xpos_frac_begin
%undef xmm_xfrac0
%undef xmm_xfrac1
%undef xmm_xfrac0_begin
%undef xmm_xfrac1_begin
%undef xmm_xfrac_inc
%undef xmm_db80h
%undef xmm_shufb_0000000088888888
3331
3332%ifdef HAVE_AVX2
; Advance the per-pixel x positions by one iteration's step.
; xpos_int=%1 xpos_frac=%2 inc_int+1=%3 inc_frac=%4 tmp=%5
; Carry trick: a saturating add (%5) and a wrapping add (%2) of the fraction
; agree exactly in the lanes where no 16-bit carry occurred; vpcmpeqw then
; yields FFFFh (-1 per byte) in those lanes, cancelling the pre-added +1
; that %3 carries in every byte.
%macro AVX2_BilinearIncXposuw 5
    vpaddusw        %5, %2, %4              ; saturating fraction add
    vpaddw          %2, %2, %4              ; wrapping fraction add (the real new fraction)
    vpcmpeqw        %5, %5, %2              ; FFFFh where the add did not carry
    vpaddb          %1, %1, %3              ; int += (increment + 1)
    vpaddb          %1, %1, %5  ; subtract 1 if no carry
%endmacro
3341
; Expand packed x fractions into interleaved (complement, fraction) word
; pairs — the weight layout consumed by vpmaddwd / the FastCalc macros.
; outl=%1 outh=%2 in=%3 FFFFh/7FFFh=%4
; %4 is the all-fraction-bits mask; xor with it gives (mask - frac), i.e. the
; complementary weight, for either 16-bit (FFFFh) or 15-bit (7FFFh) fractions.
%macro AVX2_UnpckXFrac 4
    vpxor           %1, %3, %4              ; complement of each fraction
    vpunpckhwd      %2, %1, %3              ; high 4 lanes: (1-frac, frac) pairs
    vpunpcklwd      %1, %1, %3              ; low 4 lanes:  (1-frac, frac) pairs
%endmacro
3348
; Combine x weights with the two row (y) weights: unsigned high-half multiply
; produces the per-tap 2-D weights for row0 (%1) and row1 (%2).
; out0=%1 out1=%2 xfrac=%3 yfrac0=%4 yfrac1=%5
; Note the row1 product is computed first so %1 may alias %3.
%macro AVX2_BilinearFastCalcXYFrac 5
    vpmulhuw        %2, %3, %5              ; xfrac * yfrac1 >> 16
    vpmulhuw        %1, %3, %4              ; xfrac * yfrac0 >> 16
%endmacro
3354
; Round and narrow two dword accumulators down to packed bytes.
; [in:dwordsl out:bytes] dwordsh=%2 zero=%3
; >>14 then vpavgw against zero (adds 1 and shifts right once more) rounds the
; fixed-point sums to nearest; final vpackuswb saturates to unsigned bytes.
%macro AVX2_BilinearFastPackDwordsToBytes 3
    vpsrld          %1, %1, 14
    vpsrld          %2, %2, 14
    vpackssdw       %1, %1, %2
    vpavgw          %1, %1, %3              ; (x + 1) >> 1: round to nearest
    vpackuswb       %1, %1, %1
%endmacro
3363
; Produce 16 output pixels for horizontal ratios <= 2x: 16 consecutive output
; pixels then span at most 32 source bytes, so two 16-byte loads per row
; (one per ymm half) cover all taps and vpshufb gathers the (idx, idx+1)
; pairs selected by ymm_xpos_int.
%macro AVX2_BilinearFastDownsample2xOrLess_16px 0
    ; Rebase the byte indices so they address within the loaded 16-byte window.
    vpshufb         ymm_tmp0, ymm_xpos_int, ymm_0       ; broadcast first index of each half
    vpsubb          ymm_xpos_int, ymm_xpos_int, ymm_tmp0
    AVX2_UnpckXFrac ymm_tmp0, ymm_tmp1, ymm_xpos_frac, ymm_ffff
    mov             r_tmp0, i_xpos
    shr             r_tmp0, 16                          ; source offset of pixel 0
    vmovdqu         xmm_tmp4, [p_src_row0 + r_tmp0]
    vmovdqu         xmm_tmp5, [p_src_row1 + r_tmp0]
    lea             r_tmp0, [i_xpos + 4 * i_scalex2]    ; source offset of pixel 8
    lea             i_xpos, [i_xpos + 8 * i_scalex2]    ; step x by 16 pixels (scalex2 = 2*scalex)
    shr             r_tmp0, 16
    vinserti128     ymm_tmp4, ymm_tmp4, [p_src_row0 + r_tmp0], 1
    vinserti128     ymm_tmp5, ymm_tmp5, [p_src_row1 + r_tmp0], 1
    vpshufb         ymm_tmp4, ymm_tmp4, ymm_xpos_int    ; gather (idx, idx+1) byte pairs, row0
    vpshufb         ymm_tmp5, ymm_tmp5, ymm_xpos_int    ; gather pairs, row1
    ; Weighted sum: low 8 pixels ...
    AVX2_BilinearFastCalcXYFrac ymm_tmp0, ymm_tmp2, ymm_tmp0, ymm_yfrac0, ymm_yfrac1
    vpunpcklbw      ymm_tmp3, ymm_tmp4, ymm_0
    vpmaddwd        ymm_tmp0, ymm_tmp0, ymm_tmp3
    vpunpcklbw      ymm_tmp3, ymm_tmp5, ymm_0
    vpmaddwd        ymm_tmp2, ymm_tmp2, ymm_tmp3
    vpaddd          ymm_tmp0, ymm_tmp0, ymm_tmp2
    ; ... and high 8 pixels.
    AVX2_BilinearFastCalcXYFrac ymm_tmp1, ymm_tmp3, ymm_tmp1, ymm_yfrac0, ymm_yfrac1
    vpunpckhbw      ymm_tmp2, ymm_tmp4, ymm_0
    vpmaddwd        ymm_tmp1, ymm_tmp1, ymm_tmp2
    vpunpckhbw      ymm_tmp2, ymm_tmp5, ymm_0
    vpmaddwd        ymm_tmp3, ymm_tmp3, ymm_tmp2
    vpaddd          ymm_tmp1, ymm_tmp1, ymm_tmp3
    AVX2_BilinearFastPackDwordsToBytes ymm_tmp0, ymm_tmp1, ymm_0
    vmovlps         [p_dst], xmm_tmp0                   ; bytes 0..7
    vextracti128    [p_dst + 8], ymm_tmp0, 1            ; bytes 8..15 (from the high lane)
    add             p_dst, 16
    AVX2_BilinearIncXposuw ymm_xpos_int, ymm_xpos_frac, ymm_xpos_int_inc, ymm_xpos_frac_inc, ymm_tmp0
%endmacro
3397
; Produce 16 output pixels for horizontal ratios <= 4x: each 16-byte load now
; covers only 4 output pixels, so the 16 pixels are gathered from four load
; windows (two ymm loads of two halves each), rebased per 4-pixel group via
; the 0000000088888888 shuffle constant.
%macro AVX2_BilinearFastDownsample4xOrLess_16px 0
    ; Rebase byte indices within each 8-pixel group's load window.
    vbroadcasti128  ymm_tmp0, xmm_shufb_0000000088888888
    vpshufb         ymm_tmp0, ymm_xpos_int, ymm_tmp0
    vpsubb          ymm_xpos_int, ymm_xpos_int, ymm_tmp0
    AVX2_UnpckXFrac ymm_tmp0, ymm_tmp1, ymm_xpos_frac, ymm_ffff
    mov             r_tmp0, i_xpos
    shr             r_tmp0, 16                          ; window for pixels 0..3
    vmovdqu         xmm_tmp4, [p_src_row0 + r_tmp0]
    vmovdqu         xmm_tmp3, [p_src_row1 + r_tmp0]
    lea             r_tmp0, [i_xpos + 4 * i_scalex2]    ; window for pixels 8..11
    shr             r_tmp0, 16
    vinserti128     ymm_tmp4, ymm_tmp4, [p_src_row0 + r_tmp0], 1
    vinserti128     ymm_tmp3, ymm_tmp3, [p_src_row1 + r_tmp0], 1
    lea             r_tmp0, [i_xpos + 2 * i_scalex2]    ; window for pixels 4..7
    lea             i_xpos, [r_tmp0 + 4 * i_scalex2]    ; i_xpos -> pixel 12 position
    shr             r_tmp0, 16
    ; Even groups: gather pairs with the low-half indices and accumulate.
    vpunpcklbw      ymm_tmp2, ymm_xpos_int, ymm_ffff    ; FFh fill -> vpshufb writes zero bytes
    vpshufb         ymm_tmp4, ymm_tmp4, ymm_tmp2
    vpshufb         ymm_tmp3, ymm_tmp3, ymm_tmp2
    AVX2_BilinearFastCalcXYFrac ymm_tmp0, ymm_tmp2, ymm_tmp0, ymm_yfrac0, ymm_yfrac1
    vpmaddwd        ymm_tmp0, ymm_tmp0, ymm_tmp4
    vpmaddwd        ymm_tmp2, ymm_tmp2, ymm_tmp3
    vpaddd          ymm_tmp0, ymm_tmp0, ymm_tmp2
    ; Odd groups: pixels 4..7 and 12..15.
    vmovdqu         xmm_tmp4, [p_src_row0 + r_tmp0]
    vmovdqu         xmm_tmp3, [p_src_row1 + r_tmp0]
    mov             r_tmp0, i_xpos                      ; window for pixels 12..15
    lea             i_xpos, [i_xpos + 2 * i_scalex2]    ; step to next 16-pixel batch
    shr             r_tmp0, 16
    vinserti128     ymm_tmp4, ymm_tmp4, [p_src_row0 + r_tmp0], 1
    vinserti128     ymm_tmp3, ymm_tmp3, [p_src_row1 + r_tmp0], 1
    vpunpckhbw      ymm_tmp2, ymm_xpos_int, ymm_ffff
    vpshufb         ymm_tmp4, ymm_tmp4, ymm_tmp2
    vpshufb         ymm_tmp3, ymm_tmp3, ymm_tmp2
    AVX2_BilinearFastCalcXYFrac ymm_tmp1, ymm_tmp2, ymm_tmp1, ymm_yfrac0, ymm_yfrac1
    vpmaddwd        ymm_tmp1, ymm_tmp1, ymm_tmp4
    vpmaddwd        ymm_tmp2, ymm_tmp2, ymm_tmp3
    vpaddd          ymm_tmp1, ymm_tmp1, ymm_tmp2
    AVX2_BilinearFastPackDwordsToBytes ymm_tmp0, ymm_tmp1, ymm_0
    vmovlps         [p_dst], xmm_tmp0                   ; bytes 0..7
    vextracti128    [p_dst + 8], ymm_tmp0, 1            ; bytes 8..15
    add             p_dst, 16
    AVX2_BilinearIncXposuw ymm_xpos_int, ymm_xpos_frac, ymm_xpos_int_inc, ymm_xpos_frac_inc, ymm_tmp0
%endmacro
3441
; Produce 16 output pixels for horizontal ratios <= 8x: each 16-byte load
; covers only 2 output pixels, so eight load windows are needed; results are
; merged with vpblendd, using the 000044448888CCCC constant to rebase indices
; per 2-pixel group.
%macro AVX2_BilinearFastDownsample8xOrLess_16px 0
    ; Rebase byte indices within each 4-pixel group's pair of load windows.
    vbroadcasti128  ymm_tmp0, xmm_shufb_000044448888CCCC
    vpshufb         ymm_tmp0, ymm_xpos_int, ymm_tmp0
    vpsubb          ymm_xpos_int, ymm_xpos_int, ymm_tmp0
    ; Load the first four windows (pixels 0/1, 8/9 in tmp4/5; 2/3, 10/11 in tmp0/1).
    mov             r_tmp0, i_xpos
    shr             r_tmp0, 16
    vmovdqu         xmm_tmp4, [p_src_row0 + r_tmp0]
    vmovdqu         xmm_tmp5, [p_src_row1 + r_tmp0]
    lea             r_tmp0, [i_xpos + 4 * i_scalex2]
    add             i_xpos, i_scalex2                   ; advance by 2 output pixels
    shr             r_tmp0, 16
    vinserti128     ymm_tmp4, ymm_tmp4, [p_src_row0 + r_tmp0], 1
    vinserti128     ymm_tmp5, ymm_tmp5, [p_src_row1 + r_tmp0], 1
    mov             r_tmp0, i_xpos
    shr             r_tmp0, 16
    vmovdqu         xmm_tmp0, [p_src_row0 + r_tmp0]
    vmovdqu         xmm_tmp1, [p_src_row1 + r_tmp0]
    lea             r_tmp0, [i_xpos + 4 * i_scalex2]
    add             i_xpos, i_scalex2
    shr             r_tmp0, 16
    vinserti128     ymm_tmp0, ymm_tmp0, [p_src_row0 + r_tmp0], 1
    vinserti128     ymm_tmp1, ymm_tmp1, [p_src_row1 + r_tmp0], 1
    ; Gather pairs for the even 8 pixels and blend the two window sets.
    vpunpcklbw      ymm_tmp3, ymm_xpos_int, ymm_ffff    ; FFh fill -> vpshufb writes zero bytes
    vpshufb         ymm_tmp4, ymm_tmp4, ymm_tmp3
    vpshufb         ymm_tmp5, ymm_tmp5, ymm_tmp3
    vpshufb         ymm_tmp0, ymm_tmp0, ymm_tmp3
    vpshufb         ymm_tmp1, ymm_tmp1, ymm_tmp3
    vpblendd        ymm_tmp4, ymm_tmp4, ymm_tmp0, 11001100b
    vpblendd        ymm_tmp5, ymm_tmp5, ymm_tmp1, 11001100b
    AVX2_UnpckXFrac ymm_tmp0, ymm_tmp1, ymm_xpos_frac, ymm_ffff
    AVX2_BilinearFastCalcXYFrac ymm_tmp0, ymm_tmp2, ymm_tmp0, ymm_yfrac0, ymm_yfrac1
    vpmaddwd        ymm_tmp0, ymm_tmp0, ymm_tmp4
    vpmaddwd        ymm_tmp2, ymm_tmp2, ymm_tmp5
    vpaddd          ymm_tmp0, ymm_tmp0, ymm_tmp2
    ; Load the remaining four windows (pixels 4..7 and 12..15).
    mov             r_tmp0, i_xpos
    shr             r_tmp0, 16
    vmovdqu         xmm_tmp4, [p_src_row0 + r_tmp0]
    vmovdqu         xmm_tmp5, [p_src_row1 + r_tmp0]
    lea             r_tmp0, [i_xpos + 4 * i_scalex2]
    add             i_xpos, i_scalex2
    shr             r_tmp0, 16
    vinserti128     ymm_tmp4, ymm_tmp4, [p_src_row0 + r_tmp0], 1
    vinserti128     ymm_tmp5, ymm_tmp5, [p_src_row1 + r_tmp0], 1
    mov             r_tmp0, i_xpos
    lea             i_xpos, [i_xpos + 4 * i_scalex2]
    shr             r_tmp0, 16
    vmovdqu         xmm_tmp2, [p_src_row0 + r_tmp0]
    vmovdqu         xmm_tmp3, [p_src_row1 + r_tmp0]
    mov             r_tmp0, i_xpos
    add             i_xpos, i_scalex2                   ; i_xpos now points at the next batch
    shr             r_tmp0, 16
    vinserti128     ymm_tmp2, ymm_tmp2, [p_src_row0 + r_tmp0], 1
    vinserti128     ymm_tmp3, ymm_tmp3, [p_src_row1 + r_tmp0], 1
    ; Gather pairs for the odd 8 pixels (high-half indices) and accumulate.
    vpshufb         ymm_tmp4, ymm_tmp4, ymm_xpos_int
    vpshufb         ymm_tmp5, ymm_tmp5, ymm_xpos_int
    vpshufb         ymm_tmp2, ymm_tmp2, ymm_xpos_int
    vpshufb         ymm_tmp3, ymm_tmp3, ymm_xpos_int
    vpblendd        ymm_tmp4, ymm_tmp4, ymm_tmp2, 10001000b
    vpblendd        ymm_tmp5, ymm_tmp5, ymm_tmp3, 10001000b
    vpunpckhbw      ymm_tmp4, ymm_tmp4, ymm_0
    vpunpckhbw      ymm_tmp5, ymm_tmp5, ymm_0
    AVX2_BilinearFastCalcXYFrac ymm_tmp1, ymm_tmp3, ymm_tmp1, ymm_yfrac0, ymm_yfrac1
    vpmaddwd        ymm_tmp1, ymm_tmp1, ymm_tmp4
    vpmaddwd        ymm_tmp3, ymm_tmp3, ymm_tmp5
    vpaddd          ymm_tmp1, ymm_tmp1, ymm_tmp3
    AVX2_BilinearFastPackDwordsToBytes ymm_tmp0, ymm_tmp1, ymm_0
    vmovlps         [p_dst], xmm_tmp0                   ; bytes 0..7
    vextracti128    [p_dst + 8], ymm_tmp0, 1            ; bytes 8..15
    add             p_dst, 16
    AVX2_BilinearIncXposuw ymm_xpos_int, ymm_xpos_frac, ymm_xpos_int_inc, ymm_xpos_frac_inc, ymm_tmp0
%endmacro
3513
; Generic-scale "fast" bilinear downsample: emits 16 output pixels per call.
; i_xpos is a 16.16 fixed-point source x coordinate stepped by i_scalex per
; output pixel.  For each output pixel the two neighbouring source bytes are
; gathered individually from p_src_row0/p_src_row1, weighted with the
; combined x/y fractional weights (via the AVX2_BilinearFastCalcXYFrac helper
; defined earlier in this file) using pmaddwd, packed back to bytes, stored.
; Clobbers: r_tmp0, ymm_tmp0-ymm_tmp5; advances i_xpos by 16*i_scalex and
; p_dst by 16; steps ymm_xfrac0/ymm_xfrac1 by ymm_xfrac_inc for the next call.
%macro AVX2_GeneralBilinearFastDownsample_16px 0
    ; --- gather group 1: 4 output pixels' byte pairs into words 0-3 of each
    ;     lane of ymm_tmp4 (row0) / ymm_tmp5 (row1) ---
    mov             r_tmp0, i_xpos
    shr             r_tmp0, 16                      ; integer part of source x
    vpbroadcastd    ymm_tmp4, [p_src_row0 + r_tmp0]
    vpbroadcastd    ymm_tmp5, [p_src_row1 + r_tmp0]
    lea             r_tmp0, [i_xpos + 1 * i_scalex]
    shr             r_tmp0, 16
    vpbroadcastd    ymm_tmp0, [p_src_row0 + r_tmp0]
    vpunpcklwd      ymm_tmp4, ymm_tmp4, ymm_tmp0    ; pair 0 -> word 0, pair 1 -> word 1
    vpbroadcastd    ymm_tmp0, [p_src_row1 + r_tmp0]
    vpunpcklwd      ymm_tmp5, ymm_tmp5, ymm_tmp0
    lea             r_tmp0, [i_xpos + 2 * i_scalex]
    lea             i_xpos, [i_xpos + 4 * i_scalex] ; advance x by 4 output pixels
    shr             r_tmp0, 16
    vpbroadcastd    ymm_tmp0, [p_src_row0 + r_tmp0]
    vpblendd        ymm_tmp4, ymm_tmp4, ymm_tmp0, 00100010b ; pair 2 -> word 2 (dword 1)
    vpbroadcastd    ymm_tmp0, [p_src_row1 + r_tmp0]
    vpblendd        ymm_tmp5, ymm_tmp5, ymm_tmp0, 00100010b
    ; pair 3: load 2 bytes lower so the wanted pair sits in the upper word of
    ; the broadcast dword, then blend it into word 3 of each lane
    mov             r_tmp0, i_xpos
    sub             r_tmp0, i_scalex
    shr             r_tmp0, 16
    vpbroadcastd    ymm_tmp0, [p_src_row0 + r_tmp0 - 2]
    vpblendw        ymm_tmp4, ymm_tmp4, ymm_tmp0, 1000b
    vpbroadcastd    ymm_tmp0, [p_src_row1 + r_tmp0 - 2]
    vpblendw        ymm_tmp5, ymm_tmp5, ymm_tmp0, 1000b
    ; --- gather group 2: next 4 output pixels into ymm_tmp2/ymm_tmp3, same
    ;     broadcast/unpack/blend scheme as group 1 ---
    mov             r_tmp0, i_xpos
    shr             r_tmp0, 16
    vpbroadcastd    ymm_tmp2, [p_src_row0 + r_tmp0]
    vpbroadcastd    ymm_tmp3, [p_src_row1 + r_tmp0]
    lea             r_tmp0, [i_xpos + 1 * i_scalex]
    shr             r_tmp0, 16
    vpbroadcastd    ymm_tmp0, [p_src_row0 + r_tmp0]
    vpunpcklwd      ymm_tmp2, ymm_tmp2, ymm_tmp0
    vpbroadcastd    ymm_tmp0, [p_src_row1 + r_tmp0]
    vpunpcklwd      ymm_tmp3, ymm_tmp3, ymm_tmp0
    lea             r_tmp0, [i_xpos + 2 * i_scalex]
    lea             i_xpos, [i_xpos + 4 * i_scalex]
    shr             r_tmp0, 16
    vpbroadcastd    ymm_tmp0, [p_src_row0 + r_tmp0]
    vpblendd        ymm_tmp2, ymm_tmp2, ymm_tmp0, 00100010b
    vpbroadcastd    ymm_tmp0, [p_src_row1 + r_tmp0]
    vpblendd        ymm_tmp3, ymm_tmp3, ymm_tmp0, 00100010b
    mov             r_tmp0, i_xpos
    sub             r_tmp0, i_scalex
    shr             r_tmp0, 16
    vpbroadcastd    ymm_tmp0, [p_src_row0 + r_tmp0 - 2]
    vpblendw        ymm_tmp2, ymm_tmp2, ymm_tmp0, 1000b
    vpbroadcastd    ymm_tmp0, [p_src_row1 + r_tmp0 - 2]
    vpblendw        ymm_tmp3, ymm_tmp3, ymm_tmp0, 1000b
    ; --- gather group 3: 4 more pixel pairs built in xmm_tmp0/xmm_tmp1 with
    ;     vmovd + vpinsrw, then merged into the low 128-bit lane of tmp4/tmp5 ---
    mov             r_tmp0, i_xpos
    shr             r_tmp0, 16
    vmovd           xmm_tmp0, [p_src_row0 + r_tmp0]
    vmovd           xmm_tmp1, [p_src_row1 + r_tmp0]
    lea             r_tmp0, [i_xpos + i_scalex]
    shr             r_tmp0, 16
    vpinsrw         xmm_tmp0, [p_src_row0 + r_tmp0], 1
    vpinsrw         xmm_tmp1, [p_src_row1 + r_tmp0], 1
    lea             r_tmp0, [i_xpos + 2 * i_scalex]
    lea             i_xpos, [i_xpos + 4 * i_scalex]
    shr             r_tmp0, 16
    vpinsrw         xmm_tmp0, [p_src_row0 + r_tmp0], 2
    vpinsrw         xmm_tmp1, [p_src_row1 + r_tmp0], 2
    mov             r_tmp0, i_xpos
    sub             r_tmp0, i_scalex
    shr             r_tmp0, 16
    vpinsrw         xmm_tmp0, [p_src_row0 + r_tmp0], 3
    vpinsrw         xmm_tmp1, [p_src_row1 + r_tmp0], 3
    vpblendd        ymm_tmp4, ymm_tmp4, ymm_tmp0, 00001111b ; group 3 -> low lane
    vpblendd        ymm_tmp5, ymm_tmp5, ymm_tmp1, 00001111b
    ; --- gather group 4: merged into the low lane of tmp2/tmp3 ---
    mov             r_tmp0, i_xpos
    shr             r_tmp0, 16
    vmovd           xmm_tmp0, [p_src_row0 + r_tmp0]
    vmovd           xmm_tmp1, [p_src_row1 + r_tmp0]
    lea             r_tmp0, [i_xpos + i_scalex]
    shr             r_tmp0, 16
    vpinsrw         xmm_tmp0, [p_src_row0 + r_tmp0], 1
    vpinsrw         xmm_tmp1, [p_src_row1 + r_tmp0], 1
    lea             r_tmp0, [i_xpos + 2 * i_scalex]
    lea             i_xpos, [i_xpos + 4 * i_scalex]
    shr             r_tmp0, 16
    vpinsrw         xmm_tmp0, [p_src_row0 + r_tmp0], 2
    vpinsrw         xmm_tmp1, [p_src_row1 + r_tmp0], 2
    mov             r_tmp0, i_xpos
    sub             r_tmp0, i_scalex
    shr             r_tmp0, 16
    vpinsrw         xmm_tmp0, [p_src_row0 + r_tmp0], 3
    vpinsrw         xmm_tmp1, [p_src_row1 + r_tmp0], 3
    vpblendd        ymm_tmp2, ymm_tmp2, ymm_tmp0, 00001111b
    vpblendd        ymm_tmp3, ymm_tmp3, ymm_tmp1, 00001111b
    ; --- horizontal + vertical weighting: zero-extend pairs to words, build
    ;     combined x*y weights, accumulate with pmaddwd ---
    vpunpcklbw      ymm_tmp4, ymm_tmp4, ymm_0
    vpunpcklbw      ymm_tmp5, ymm_tmp5, ymm_0
    AVX2_BilinearFastCalcXYFrac ymm_tmp0, ymm_tmp1, ymm_xfrac0, ymm_yfrac0, ymm_yfrac1
    vpmaddwd        ymm_tmp0, ymm_tmp0, ymm_tmp4
    vpmaddwd        ymm_tmp1, ymm_tmp1, ymm_tmp5
    vpaddd          ymm_tmp0, ymm_tmp0, ymm_tmp1   ; row0 + row1 contributions
    vpunpcklbw      ymm_tmp4, ymm_tmp2, ymm_0
    vpunpcklbw      ymm_tmp5, ymm_tmp3, ymm_0
    AVX2_BilinearFastCalcXYFrac ymm_tmp1, ymm_tmp2, ymm_xfrac1, ymm_yfrac0, ymm_yfrac1
    vpmaddwd        ymm_tmp1, ymm_tmp1, ymm_tmp4
    vpmaddwd        ymm_tmp2, ymm_tmp2, ymm_tmp5
    vpaddd          ymm_tmp1, ymm_tmp1, ymm_tmp2
    ; pack dword results down to 16 bytes; vpermq restores linear pixel order
    ; across the two gather lanes before the single 16-byte store
    AVX2_BilinearFastPackDwordsToBytes ymm_tmp0, ymm_tmp1, ymm_0
    vpermq          ymm_tmp0, ymm_tmp0, 0010b
    vmovdqu         [p_dst], xmm_tmp0
    add             p_dst, 16
    ; step the per-pixel x fractions for the next 16 output pixels
    vpaddw          ymm_xfrac0, ymm_xfrac0, ymm_xfrac_inc
    vpaddw          ymm_xfrac1, ymm_xfrac1, ymm_xfrac_inc
%endmacro
3622
; Advance the vectorized x positions by one 16-pixel step (word variant).
; The fractional words are kept to 15 bits; when a fraction sum spills into
; bit 15, one is carried into the corresponding integer byte offsets.
; xpos_int=%1 xpos_frac=%2 inc_int=%3 inc_frac=%4 7FFFh=%5 tmp=%6,%7
%macro AVX2_BilinearIncXposw 7
    vpaddw          %6, %2, %4  ; raw frac sums; overflow shows up in bit 15
    vpcmpgtw        %7, %2, %6  ; signed gt: all-ones word where a sum wrapped
    vpaddb          %1, %1, %3  ; step the integer byte offsets
    vpsubb          %1, %1, %7  ; add carry (subtract -1) where frac overflowed
    vpand           %2, %6, %5  ; mask fractions back to 15 bits
%endmacro
3631
; Per-dword vertical blend with full 32-bit precision:
;   %1[i] = (data0[i]*frac0 + data1[i]*frac1) >> 29
; vpmuludq only multiplies the even dwords of each qword, so the odd dwords
; are first swapped into even positions (vpshufd 10110001b) and processed as
; a second stream; the two result streams are re-interleaved at the end.
; res>>29=%1 data0=%2 data1=%3 frac0=%4 frac1=%5 tmp=%6
%macro AVX2_LinearAccurateInterpolateVerticalDwords 6
    vpshufd         %1, %2, 10110001b     ; odd dwords of data0 -> even slots
    vpshufd         %6, %3, 10110001b     ; odd dwords of data1 -> even slots
    vpmuludq        %2, %2, %4            ; even stream: data0*frac0 (64-bit)
    vpmuludq        %3, %3, %5            ; even stream: data1*frac1
    vpaddq          %2, %2, %3
    vpmuludq        %1, %1, %4            ; odd stream: data0*frac0
    vpmuludq        %6, %6, %5            ; odd stream: data1*frac1
    vpaddq          %1, %1, %6
    vpsrlq          %2, %2, 29            ; even results land in low dwords
    vpsllq          %1, %1,  3            ; odd results: sum>>29 lands in high dwords
    vpblendd        %1, %1, %2, 01010101b ; interleave even/odd dword results
%endmacro
3646
; Accurate bilinear downsample for scale ratios <= 2x: 16 output pixels per
; call.  At this ratio the 8 output pixels of each 128-bit lane draw from at
; most 16 consecutive source bytes, so one unaligned 16-byte load per lane
; suffices and vpshufb extracts all byte pairs using the per-pixel integer
; offsets in ymm_xpos_int.  i_scalex2 holds twice the per-pixel x step
; (doubled by the caller).  Clobbers r_tmp0, ymm_tmp0-ymm_tmp5; advances
; i_xpos by 8*i_scalex2 and p_dst by 16.
%macro AVX2_BilinearAccurateDownsample2xOrLess_16px 0
    ; rebase the integer offsets: replicate each lane's first offset (ymm_0
    ; selects byte 0 per lane) and subtract, making offsets load-relative
    vpshufb         ymm_tmp0, ymm_xpos_int, ymm_0
    vpsubb          ymm_xpos_int, ymm_xpos_int, ymm_tmp0
    ; expand x fractions into (inv_frac, frac) word pairs for pmaddwd
    AVX2_UnpckXFrac ymm_tmp0, ymm_tmp1, ymm_xpos_frac, ymm_7fff
    mov             r_tmp0, i_xpos
    shr             r_tmp0, 16                      ; integer part of source x
    vmovdqu         xmm_tmp4, [p_src_row0 + r_tmp0]
    vmovdqu         xmm_tmp5, [p_src_row1 + r_tmp0]
    lea             r_tmp0, [i_xpos + 4 * i_scalex2] ; start of pixels 8-15
    lea             i_xpos, [i_xpos + 8 * i_scalex2] ; consume all 16 pixels
    shr             r_tmp0, 16
    vinserti128     ymm_tmp4, ymm_tmp4, [p_src_row0 + r_tmp0], 1
    vinserti128     ymm_tmp5, ymm_tmp5, [p_src_row1 + r_tmp0], 1
    ; gather each output pixel's byte pair in-register
    vpshufb         ymm_tmp4, ymm_tmp4, ymm_xpos_int
    vpshufb         ymm_tmp5, ymm_tmp5, ymm_xpos_int
    ; zero-extend pairs to words, split low/high halves per lane
    vpunpcklbw      ymm_tmp2, ymm_tmp4, ymm_0
    vpunpcklbw      ymm_tmp3, ymm_tmp5, ymm_0
    vpunpckhbw      ymm_tmp4, ymm_tmp4, ymm_0
    vpunpckhbw      ymm_tmp5, ymm_tmp5, ymm_0
    ; horizontal filter: pair * (inv_frac, frac) summed per dword
    vpmaddwd        ymm_tmp2, ymm_tmp2, ymm_tmp0
    vpmaddwd        ymm_tmp3, ymm_tmp3, ymm_tmp0
    vpmaddwd        ymm_tmp4, ymm_tmp4, ymm_tmp1
    vpmaddwd        ymm_tmp5, ymm_tmp5, ymm_tmp1
    ; vertical filter with full 32-bit precision (results pre-shifted >> 29)
    AVX2_LinearAccurateInterpolateVerticalDwords ymm_tmp0, ymm_tmp2, ymm_tmp3, ymm_yfrac0, ymm_yfrac1, ymm_tmp1
    AVX2_LinearAccurateInterpolateVerticalDwords ymm_tmp1, ymm_tmp4, ymm_tmp5, ymm_yfrac0, ymm_yfrac1, ymm_tmp2
    vpackssdw       ymm_tmp0, ymm_tmp0, ymm_tmp1
    vpavgw          ymm_tmp0, ymm_tmp0, ymm_0      ; rounding half: (v + 1) >> 1
    vpackuswb       ymm_tmp0, ymm_tmp0, ymm_tmp0
    vmovlps         [p_dst], xmm_tmp0              ; low lane -> pixels 0-7
    vextracti128    [p_dst + 8], ymm_tmp0, 1       ; high lane -> pixels 8-15
    add             p_dst, 16
    ; step per-pixel x positions (15-bit frac with carry into int bytes)
    AVX2_BilinearIncXposw ymm_xpos_int, ymm_xpos_frac, ymm_xpos_int_inc, ymm_xpos_frac_inc, ymm_7fff, ymm_tmp0, ymm_tmp1
%endmacro
3680
; Accurate bilinear downsample for scale ratios <= 4x: 16 output pixels per
; call.  Each 16-byte load now covers only 4 output pixels, so four loads are
; issued (two per lane via vinserti128) and the integer offsets are rebased
; per 4-pixel group (shuffle control 0000000088888888 replicates bytes 0 and
; 8 of each lane).  vpunpck*bw with ymm_db80h (0x80 bytes) turns the offsets
; into vpshufb controls that fetch a byte and zero-fill the partner byte,
; yielding zero-extended words directly.  Clobbers r_tmp0, ymm_tmp0-ymm_tmp4;
; advances i_xpos by 8*i_scalex2 and p_dst by 16.
%macro AVX2_BilinearAccurateDownsample4xOrLess_16px 0
    ; rebase integer offsets relative to their 4-pixel group's load base
    vbroadcasti128  ymm_tmp0, xmm_shufb_0000000088888888
    vpshufb         ymm_tmp0, ymm_xpos_int, ymm_tmp0
    vpsubb          ymm_xpos_int, ymm_xpos_int, ymm_tmp0
    ; expand x fractions into (inv_frac, frac) word pairs for pmaddwd
    AVX2_UnpckXFrac ymm_tmp0, ymm_tmp1, ymm_xpos_frac, ymm_7fff
    ; loads for pixel groups 0-3 (low lane) and 8-11 (high lane)
    mov             r_tmp0, i_xpos
    shr             r_tmp0, 16
    vmovdqu         xmm_tmp4, [p_src_row0 + r_tmp0]
    vmovdqu         xmm_tmp2, [p_src_row1 + r_tmp0]
    lea             r_tmp0, [i_xpos + 4 * i_scalex2]
    shr             r_tmp0, 16
    vinserti128     ymm_tmp4, ymm_tmp4, [p_src_row0 + r_tmp0], 1
    vinserti128     ymm_tmp2, ymm_tmp2, [p_src_row1 + r_tmp0], 1
    lea             r_tmp0, [i_xpos + 2 * i_scalex2] ; start of pixels 4-7
    lea             i_xpos, [r_tmp0 + 4 * i_scalex2] ; = start of pixels 12-15
    shr             r_tmp0, 16
    ; low-half offsets -> shufb controls with zero-fill high bytes
    vpunpcklbw      ymm_tmp3, ymm_xpos_int, ymm_db80h
    vpshufb         ymm_tmp4, ymm_tmp4, ymm_tmp3
    vpshufb         ymm_tmp2, ymm_tmp2, ymm_tmp3
    vpmaddwd        ymm_tmp4, ymm_tmp4, ymm_tmp0    ; horizontal filter
    vpmaddwd        ymm_tmp2, ymm_tmp2, ymm_tmp0
    AVX2_LinearAccurateInterpolateVerticalDwords ymm_tmp0, ymm_tmp4, ymm_tmp2, ymm_yfrac0, ymm_yfrac1, ymm_tmp3
    ; loads for pixel groups 4-7 (low lane) and 12-15 (high lane)
    vmovdqu         xmm_tmp4, [p_src_row0 + r_tmp0]
    vmovdqu         xmm_tmp2, [p_src_row1 + r_tmp0]
    mov             r_tmp0, i_xpos
    lea             i_xpos, [i_xpos + 2 * i_scalex2] ; consume remaining pixels
    shr             r_tmp0, 16
    vinserti128     ymm_tmp4, ymm_tmp4, [p_src_row0 + r_tmp0], 1
    vinserti128     ymm_tmp2, ymm_tmp2, [p_src_row1 + r_tmp0], 1
    ; high-half offsets -> shufb controls
    vpunpckhbw      ymm_tmp3, ymm_xpos_int, ymm_db80h
    vpshufb         ymm_tmp4, ymm_tmp4, ymm_tmp3
    vpshufb         ymm_tmp2, ymm_tmp2, ymm_tmp3
    vpmaddwd        ymm_tmp4, ymm_tmp4, ymm_tmp1
    vpmaddwd        ymm_tmp2, ymm_tmp2, ymm_tmp1
    AVX2_LinearAccurateInterpolateVerticalDwords ymm_tmp1, ymm_tmp4, ymm_tmp2, ymm_yfrac0, ymm_yfrac1, ymm_tmp3
    ; pack to bytes with rounding and store the 16 results
    vpackssdw       ymm_tmp0, ymm_tmp0, ymm_tmp1
    vpavgw          ymm_tmp0, ymm_tmp0, ymm_0      ; rounding half: (v + 1) >> 1
    vpackuswb       ymm_tmp0, ymm_tmp0, ymm_tmp0
    vmovlps         [p_dst], xmm_tmp0
    vextracti128    [p_dst + 8], ymm_tmp0, 1
    add             p_dst, 16
    AVX2_BilinearIncXposw ymm_xpos_int, ymm_xpos_frac, ymm_xpos_int_inc, ymm_xpos_frac_inc, ymm_7fff, ymm_tmp0, ymm_tmp1
%endmacro
3724
; Accurate bilinear downsample for scale ratios <= 8x: 16 output pixels per
; call.  Each 16-byte load now covers only 2 output pixels, so eight loads
; are issued in four load/insert groups; integer offsets are rebased per
; 2-pixel group (shuffle control 000044448888CCCC replicates bytes 0/4/8/12
; of each lane).  Clobbers r_tmp0, ymm_tmp0-ymm_tmp5; advances i_xpos by
; 8*i_scalex2 (i_scalex2 = twice the per-pixel step) and p_dst by 16.
%macro AVX2_BilinearAccurateDownsample8xOrLess_16px 0
    ; rebase integer offsets relative to their 2-pixel group's load base
    vbroadcasti128  ymm_tmp0, xmm_shufb_000044448888CCCC
    vpshufb         ymm_tmp0, ymm_xpos_int, ymm_tmp0
    vpsubb          ymm_xpos_int, ymm_xpos_int, ymm_tmp0
    ; --- first pair of load groups -> ymm_tmp4/5 (row0/row1) blended with
    ;     ymm_tmp0/1 per dword ---
    mov             r_tmp0, i_xpos
    shr             r_tmp0, 16
    vmovdqu         xmm_tmp4, [p_src_row0 + r_tmp0]
    vmovdqu         xmm_tmp5, [p_src_row1 + r_tmp0]
    lea             r_tmp0, [i_xpos + 4 * i_scalex2] ; base for the high lane
    add             i_xpos, i_scalex2
    shr             r_tmp0, 16
    vinserti128     ymm_tmp4, ymm_tmp4, [p_src_row0 + r_tmp0], 1
    vinserti128     ymm_tmp5, ymm_tmp5, [p_src_row1 + r_tmp0], 1
    mov             r_tmp0, i_xpos
    shr             r_tmp0, 16
    vmovdqu         xmm_tmp0, [p_src_row0 + r_tmp0]
    vmovdqu         xmm_tmp1, [p_src_row1 + r_tmp0]
    lea             r_tmp0, [i_xpos + 4 * i_scalex2]
    add             i_xpos, i_scalex2
    shr             r_tmp0, 16
    vinserti128     ymm_tmp0, ymm_tmp0, [p_src_row0 + r_tmp0], 1
    vinserti128     ymm_tmp1, ymm_tmp1, [p_src_row1 + r_tmp0], 1
    ; offsets -> shufb controls that fetch a byte and zero-fill its partner
    ; (ymm_db80h supplies the 0x80 "write zero" selector bytes)
    vpunpcklbw      ymm_tmp3, ymm_xpos_int, ymm_db80h
    vpshufb         ymm_tmp4, ymm_tmp4, ymm_tmp3
    vpshufb         ymm_tmp5, ymm_tmp5, ymm_tmp3
    vpshufb         ymm_tmp0, ymm_tmp0, ymm_tmp3
    vpshufb         ymm_tmp1, ymm_tmp1, ymm_tmp3
    vpblendd        ymm_tmp4, ymm_tmp4, ymm_tmp0, 11001100b ; merge the two groups
    vpblendd        ymm_tmp5, ymm_tmp5, ymm_tmp1, 11001100b
    ; expand x fractions into (inv_frac, frac) word pairs for pmaddwd
    AVX2_UnpckXFrac ymm_tmp0, ymm_tmp1, ymm_xpos_frac, ymm_7fff
    vpmaddwd        ymm_tmp4, ymm_tmp4, ymm_tmp0    ; horizontal filter
    vpmaddwd        ymm_tmp5, ymm_tmp5, ymm_tmp0
    AVX2_LinearAccurateInterpolateVerticalDwords ymm_tmp0, ymm_tmp4, ymm_tmp5, ymm_yfrac0, ymm_yfrac1, ymm_tmp3
    ; --- second pair of load groups -> ymm_tmp4/5 and ymm_tmp2/3 ---
    mov             r_tmp0, i_xpos
    shr             r_tmp0, 16
    vmovdqu         xmm_tmp4, [p_src_row0 + r_tmp0]
    vmovdqu         xmm_tmp5, [p_src_row1 + r_tmp0]
    lea             r_tmp0, [i_xpos + 4 * i_scalex2]
    add             i_xpos, i_scalex2
    shr             r_tmp0, 16
    vinserti128     ymm_tmp4, ymm_tmp4, [p_src_row0 + r_tmp0], 1
    vinserti128     ymm_tmp5, ymm_tmp5, [p_src_row1 + r_tmp0], 1
    mov             r_tmp0, i_xpos
    lea             i_xpos, [i_xpos + 4 * i_scalex2]
    shr             r_tmp0, 16
    vmovdqu         xmm_tmp2, [p_src_row0 + r_tmp0]
    vmovdqu         xmm_tmp3, [p_src_row1 + r_tmp0]
    mov             r_tmp0, i_xpos
    add             i_xpos, i_scalex2
    shr             r_tmp0, 16
    vinserti128     ymm_tmp2, ymm_tmp2, [p_src_row0 + r_tmp0], 1
    vinserti128     ymm_tmp3, ymm_tmp3, [p_src_row1 + r_tmp0], 1
    ; here the plain rebased offsets select the byte pairs directly
    vpshufb         ymm_tmp4, ymm_tmp4, ymm_xpos_int
    vpshufb         ymm_tmp5, ymm_tmp5, ymm_xpos_int
    vpshufb         ymm_tmp2, ymm_tmp2, ymm_xpos_int
    vpshufb         ymm_tmp3, ymm_tmp3, ymm_xpos_int
    vpblendd        ymm_tmp4, ymm_tmp4, ymm_tmp2, 10001000b
    vpblendd        ymm_tmp5, ymm_tmp5, ymm_tmp3, 10001000b
    vpunpckhbw      ymm_tmp4, ymm_tmp4, ymm_0      ; zero-extend to words
    vpunpckhbw      ymm_tmp5, ymm_tmp5, ymm_0
    vpmaddwd        ymm_tmp4, ymm_tmp4, ymm_tmp1
    vpmaddwd        ymm_tmp5, ymm_tmp5, ymm_tmp1
    AVX2_LinearAccurateInterpolateVerticalDwords ymm_tmp1, ymm_tmp4, ymm_tmp5, ymm_yfrac0, ymm_yfrac1, ymm_tmp3
    ; pack to bytes with rounding and store the 16 results
    vpackssdw       ymm_tmp0, ymm_tmp0, ymm_tmp1
    vpavgw          ymm_tmp0, ymm_tmp0, ymm_0      ; rounding half: (v + 1) >> 1
    vpackuswb       ymm_tmp0, ymm_tmp0, ymm_tmp0
    vmovlps         [p_dst], xmm_tmp0
    vextracti128    [p_dst + 8], ymm_tmp0, 1
    add             p_dst, 16
    AVX2_BilinearIncXposw ymm_xpos_int, ymm_xpos_frac, ymm_xpos_int_inc, ymm_xpos_frac_inc, ymm_7fff, ymm_tmp0, ymm_tmp1
%endmacro
3796
; Generic-scale accurate bilinear downsample: 16 output pixels per call.
; Same per-pixel gather scheme as the fast variant above (broadcast/unpack/
; blend for two 4-pixel groups, vmovd/vpinsrw for the other two), but the
; vertical blend is done with full 32-bit precision via
; AVX2_LinearAccurateInterpolateVerticalDwords, and the 15-bit x fractions
; are re-masked after each step.  Clobbers r_tmp0, ymm_tmp0-ymm_tmp5;
; advances i_xpos by 16*i_scalex and p_dst by 16.
%macro AVX2_GeneralBilinearAccurateDownsample_16px 0
    ; --- gather group 1: 4 pixel pairs into words 0-3 of each lane of
    ;     ymm_tmp4 (row0) / ymm_tmp5 (row1) ---
    mov             r_tmp0, i_xpos
    shr             r_tmp0, 16                      ; integer part of source x
    vpbroadcastd    ymm_tmp4, [p_src_row0 + r_tmp0]
    vpbroadcastd    ymm_tmp5, [p_src_row1 + r_tmp0]
    lea             r_tmp0, [i_xpos + 1 * i_scalex]
    shr             r_tmp0, 16
    vpbroadcastd    ymm_tmp0, [p_src_row0 + r_tmp0]
    vpunpcklwd      ymm_tmp4, ymm_tmp4, ymm_tmp0    ; pairs 0/1 -> words 0/1
    vpbroadcastd    ymm_tmp0, [p_src_row1 + r_tmp0]
    vpunpcklwd      ymm_tmp5, ymm_tmp5, ymm_tmp0
    lea             r_tmp0, [i_xpos + 2 * i_scalex]
    lea             i_xpos, [i_xpos + 4 * i_scalex] ; advance x by 4 output pixels
    shr             r_tmp0, 16
    vpbroadcastd    ymm_tmp0, [p_src_row0 + r_tmp0]
    vpblendd        ymm_tmp4, ymm_tmp4, ymm_tmp0, 00100010b ; pair 2 -> word 2
    vpbroadcastd    ymm_tmp0, [p_src_row1 + r_tmp0]
    vpblendd        ymm_tmp5, ymm_tmp5, ymm_tmp0, 00100010b
    ; pair 3: load 2 bytes lower so the pair lands in the upper word of the
    ; broadcast dword, then blend into word 3
    mov             r_tmp0, i_xpos
    sub             r_tmp0, i_scalex
    shr             r_tmp0, 16
    vpbroadcastd    ymm_tmp0, [p_src_row0 + r_tmp0 - 2]
    vpblendw        ymm_tmp4, ymm_tmp4, ymm_tmp0, 1000b
    vpbroadcastd    ymm_tmp0, [p_src_row1 + r_tmp0 - 2]
    vpblendw        ymm_tmp5, ymm_tmp5, ymm_tmp0, 1000b
    ; --- gather group 2 into ymm_tmp2/ymm_tmp3, same scheme ---
    mov             r_tmp0, i_xpos
    shr             r_tmp0, 16
    vpbroadcastd    ymm_tmp2, [p_src_row0 + r_tmp0]
    vpbroadcastd    ymm_tmp3, [p_src_row1 + r_tmp0]
    lea             r_tmp0, [i_xpos + 1 * i_scalex]
    shr             r_tmp0, 16
    vpbroadcastd    ymm_tmp0, [p_src_row0 + r_tmp0]
    vpunpcklwd      ymm_tmp2, ymm_tmp2, ymm_tmp0
    vpbroadcastd    ymm_tmp0, [p_src_row1 + r_tmp0]
    vpunpcklwd      ymm_tmp3, ymm_tmp3, ymm_tmp0
    lea             r_tmp0, [i_xpos + 2 * i_scalex]
    lea             i_xpos, [i_xpos + 4 * i_scalex]
    shr             r_tmp0, 16
    vpbroadcastd    ymm_tmp0, [p_src_row0 + r_tmp0]
    vpblendd        ymm_tmp2, ymm_tmp2, ymm_tmp0, 00100010b
    vpbroadcastd    ymm_tmp0, [p_src_row1 + r_tmp0]
    vpblendd        ymm_tmp3, ymm_tmp3, ymm_tmp0, 00100010b
    mov             r_tmp0, i_xpos
    sub             r_tmp0, i_scalex
    shr             r_tmp0, 16
    vpbroadcastd    ymm_tmp0, [p_src_row0 + r_tmp0 - 2]
    vpblendw        ymm_tmp2, ymm_tmp2, ymm_tmp0, 1000b
    vpbroadcastd    ymm_tmp0, [p_src_row1 + r_tmp0 - 2]
    vpblendw        ymm_tmp3, ymm_tmp3, ymm_tmp0, 1000b
    ; --- gather group 3 via vmovd/vpinsrw, merged into low lane of tmp4/5 ---
    mov             r_tmp0, i_xpos
    shr             r_tmp0, 16
    vmovd           xmm_tmp0, [p_src_row0 + r_tmp0]
    vmovd           xmm_tmp1, [p_src_row1 + r_tmp0]
    lea             r_tmp0, [i_xpos + i_scalex]
    shr             r_tmp0, 16
    vpinsrw         xmm_tmp0, [p_src_row0 + r_tmp0], 1
    vpinsrw         xmm_tmp1, [p_src_row1 + r_tmp0], 1
    lea             r_tmp0, [i_xpos + 2 * i_scalex]
    lea             i_xpos, [i_xpos + 4 * i_scalex]
    shr             r_tmp0, 16
    vpinsrw         xmm_tmp0, [p_src_row0 + r_tmp0], 2
    vpinsrw         xmm_tmp1, [p_src_row1 + r_tmp0], 2
    mov             r_tmp0, i_xpos
    sub             r_tmp0, i_scalex
    shr             r_tmp0, 16
    vpinsrw         xmm_tmp0, [p_src_row0 + r_tmp0], 3
    vpinsrw         xmm_tmp1, [p_src_row1 + r_tmp0], 3
    vpblendd        ymm_tmp4, ymm_tmp4, ymm_tmp0, 00001111b ; group 3 -> low lane
    vpblendd        ymm_tmp5, ymm_tmp5, ymm_tmp1, 00001111b
    ; --- gather group 4, merged into low lane of tmp2/3 ---
    mov             r_tmp0, i_xpos
    shr             r_tmp0, 16
    vmovd           xmm_tmp0, [p_src_row0 + r_tmp0]
    vmovd           xmm_tmp1, [p_src_row1 + r_tmp0]
    lea             r_tmp0, [i_xpos + i_scalex]
    shr             r_tmp0, 16
    vpinsrw         xmm_tmp0, [p_src_row0 + r_tmp0], 1
    vpinsrw         xmm_tmp1, [p_src_row1 + r_tmp0], 1
    lea             r_tmp0, [i_xpos + 2 * i_scalex]
    lea             i_xpos, [i_xpos + 4 * i_scalex]
    shr             r_tmp0, 16
    vpinsrw         xmm_tmp0, [p_src_row0 + r_tmp0], 2
    vpinsrw         xmm_tmp1, [p_src_row1 + r_tmp0], 2
    mov             r_tmp0, i_xpos
    sub             r_tmp0, i_scalex
    shr             r_tmp0, 16
    vpinsrw         xmm_tmp0, [p_src_row0 + r_tmp0], 3
    vpinsrw         xmm_tmp1, [p_src_row1 + r_tmp0], 3
    vpblendd        ymm_tmp2, ymm_tmp2, ymm_tmp0, 00001111b
    vpblendd        ymm_tmp3, ymm_tmp3, ymm_tmp1, 00001111b
    ; --- horizontal filter (pmaddwd with (inv_frac, frac) word pairs), then
    ;     full-precision vertical blend ---
    vpunpcklbw      ymm_tmp4, ymm_tmp4, ymm_0      ; zero-extend pairs to words
    vpunpcklbw      ymm_tmp5, ymm_tmp5, ymm_0
    vpmaddwd        ymm_tmp4, ymm_tmp4, ymm_xfrac0
    vpmaddwd        ymm_tmp5, ymm_tmp5, ymm_xfrac0
    AVX2_LinearAccurateInterpolateVerticalDwords ymm_tmp0, ymm_tmp4, ymm_tmp5, ymm_yfrac0, ymm_yfrac1, ymm_tmp1
    vpunpcklbw      ymm_tmp4, ymm_tmp2, ymm_0
    vpunpcklbw      ymm_tmp5, ymm_tmp3, ymm_0
    vpmaddwd        ymm_tmp4, ymm_tmp4, ymm_xfrac1
    vpmaddwd        ymm_tmp5, ymm_tmp5, ymm_xfrac1
    AVX2_LinearAccurateInterpolateVerticalDwords ymm_tmp1, ymm_tmp4, ymm_tmp5, ymm_yfrac0, ymm_yfrac1, ymm_tmp2
    ; pack to bytes with rounding; the split store restores linear pixel
    ; order across the two gather lanes
    vpackssdw       ymm_tmp0, ymm_tmp0, ymm_tmp1
    vpavgw          ymm_tmp0, ymm_tmp0, ymm_0      ; rounding half: (v + 1) >> 1
    vpackuswb       ymm_tmp0, ymm_tmp0, ymm_tmp0
    vextracti128    [p_dst], ymm_tmp0, 1
    vmovlps         [p_dst + 8], xmm_tmp0
    add             p_dst, 16
    ; step x fractions and keep them within 15 bits for the next iteration
    vpaddw          ymm_xfrac0, ymm_xfrac0, ymm_xfrac_inc
    vpaddw          ymm_xfrac1, ymm_xfrac1, ymm_xfrac_inc
    vpand           ymm_xfrac0, ymm_xfrac0, ymm_7fff
    vpand           ymm_xfrac1, ymm_xfrac1, ymm_7fff
%endmacro
3907
; Outer row/column loop shared by every scale variant.
; downsample_16px_macro=%1 b_fast=%2
; %1 emits 16 output pixels per inner iteration; %2 selects the weight
; layout: 1 = fast path (15-bit word fractions), 0 = accurate path (dword
; fractions).  i_ypos is fixed-point with a 15-bit fraction (row = ypos>>15).
%macro AVX2_GeneralBilinearDownsampler_loop 2
%%height:
    ; p_src_row0 = p_src + (ypos >> 15) * stride; row1 is the next row
    mov             p_src_row0, i_ypos
    shr             p_src_row0, 15
    imul            p_src_row0, i_src_stride
    add             p_src_row0, p_src
    mov             p_src_row1, p_src_row0
    add             p_src_row1, i_src_stride
%ifdef X86_32
    ; x86-32 keeps i_ypos in memory and spills yfrac0/yfrac1 to stack slots
%if %2
    ; fast: yfrac1 = ypos & 7FFFh per word; yfrac0 = 7FFFh ^ yfrac1
    ; (equals 7FFFh - yfrac1 since yfrac1 fits in 15 bits)
    vpbroadcastw    ymm_tmp1, i_ypos
    vpsllw          ymm_tmp1, ymm_tmp1, 1
    vpsrlw          ymm_tmp1, ymm_tmp1, 1
    vpcmpeqw        ymm_tmp0, ymm_tmp0, ymm_tmp0
    vpsrlw          ymm_tmp0, ymm_tmp0, 1
%else
    ; accurate: same complement trick on dwords
    vpbroadcastd    ymm_tmp1, i_ypos
    vpslld          ymm_tmp1, ymm_tmp1, 17
    vpsrld          ymm_tmp1, ymm_tmp1, 17
    vpcmpeqw        ymm_tmp0, ymm_tmp0, ymm_tmp0
    vpsrld          ymm_tmp0, ymm_tmp0, 17
%endif
    vpxor           ymm_tmp0, ymm_tmp0, ymm_tmp1
    vmovdqa         ymm_yfrac0, ymm_tmp0
    vmovdqa         ymm_yfrac1, ymm_tmp1
%else
    ; x86-64: broadcast ypos and build the fraction pair in registers
    vmovd           xmm_tmp0, i_yposd
    vpbroadcastw    ymm_yfrac1, xmm_tmp0
%if %2
    vpsllw          ymm_yfrac1, ymm_yfrac1, 1      ; clear bit 15: 15-bit frac
    vpsrlw          ymm_yfrac1, ymm_yfrac1, 1
    vpcmpeqw        ymm_yfrac0, ymm_yfrac0, ymm_yfrac0
    vpsrlw          ymm_yfrac0, ymm_yfrac0, 1      ; 7FFFh per word
%else
    vpslld          ymm_yfrac1, ymm_yfrac1, 17
    vpsrld          ymm_yfrac1, ymm_yfrac1, 17
    vpcmpeqw        ymm_yfrac0, ymm_yfrac0, ymm_yfrac0
    vpsrld          ymm_yfrac0, ymm_yfrac0, 17
%endif
    vpxor           ymm_yfrac0, ymm_yfrac0, ymm_yfrac1 ; yfrac0 = 7FFFh - yfrac1
%endif

    mov             i_xpos, 1 << 15                ; start x at half a source pixel
    mov             i_width_cnt, i_dst_width
    sub             i_width_cnt, 1                 ; last column handled separately

    ; reload the per-pixel x state saved at setup (which set of registers
    ; exists depends on the variant's %define map)
%ifdef ymm_xpos_int
    vmovdqa         ymm_xpos_int, ymm_xpos_int_begin
    vmovdqa         ymm_xpos_frac, ymm_xpos_frac_begin
%else
    vmovdqa         ymm_xfrac0, ymm_xfrac0_begin
    vmovdqa         ymm_xfrac1, ymm_xfrac1_begin
%endif

%%width:
    %1                                             ; 16 output pixels per pass
    sub             i_width_cnt, 16
    jg              %%width

    ; i_width_cnt is now <= 0 (overshoot); rewind p_dst and i_xpos to the
    ; true last column
    lea             p_dst, [p_dst + i_width_cnt + 1]
%ifdef i_scalex2
    mov             r_tmp0, i_scalex2              ; i_scalex2 = 2 * scalex
    shr             r_tmp0, 1
    imul            i_width_cnt, r_tmp0
%else
    imul            i_width_cnt, i_scalex
%endif
    add             i_xpos, i_width_cnt
    shr             i_xpos, 16
    ; rightmost pixel: plain copy of the nearest row0 sample (no filtering),
    ; presumably to avoid reading past the end of the source row
    movzx           r_tmp0, byte [p_src_row0 + i_xpos]
    mov             [p_dst - 1], r_tmp0b
%ifdef X86_32
    mov             r_tmp0, i_scaleyd              ; i_scaleyd is a stack slot here
    add             i_yposd, r_tmp0
%else
    add             i_yposd, i_scaleyd
%endif
    add             p_dst, i_dst_stride_less_width
    sub             i_dst_height, 1
    jg              %%height
%endmacro
3990
3991;**************************************************************************************************************
3992;void GeneralBilinearFastDownsampler_avx2 (uint8_t* pDst, int32_t iDstStride, int32_t iDstWidth,
3993;    int32_t iDstHeight, uint8_t* pSrc, int32_t iSrcStride, uint32_t uiScaleX,
3994;    uint32_t uiScaleY);
3995;
3996;**************************************************************************************************************
3997
3998WELS_EXTERN GeneralBilinearFastDownsampler_avx2
3999    %assign push_num 0
4000%ifndef X86_32
4001    push            r12
4002    push            r13
4003    push            rbx
4004    push            rbp
4005    %assign push_num 4
4006%ifdef WIN64
4007    push            rdi
4008    push            rsi
4009    %assign push_num push_num + 2
4010%endif
4011%endif
4012    LOAD_7_PARA
4013    PUSH_XMM 16
4014    SIGN_EXTENSION  r1, r1d
4015    SIGN_EXTENSION  r2, r2d
4016    SIGN_EXTENSION  r3, r3d
4017    SIGN_EXTENSION  r5, r5d
4018    ZERO_EXTENSION  r6d
4019    sub             r1, r2                                            ; dst_stride - dst_width
4020%ifdef X86_32
4021    vmovd           xmm0, arg8
4022    vmovd           xmm1, esp
4023    and             esp, -32
4024%ifdef X86_32_PICASM
4025    sub             esp, 8 * 4 + 9 * 32
4026%else
4027    sub             esp, 8 * 4 + 8 * 32
4028%endif
4029    vmovd           [esp], xmm1
4030    %define p_dst                   r0
4031    %define i_dst_stride_less_width [esp + 1 * 4]
4032    %define i_dst_width             [esp + 2 * 4]
4033    %define i_dst_height            dword [esp + 3 * 4]
4034    %define p_src                   [esp + 4 * 4]
4035    %define i_src_stride            [esp + 5 * 4]
4036    %define i_scalex                r6
4037    %define i_scalexd               r6d
4038    %define i_scaleyd               [esp + 6 * 4]
4039    %define i_xpos                  r2
4040    %define i_ypos                  [esp + 7 * 4]
4041    %define i_yposd                 dword [esp + 7 * 4]
4042    %define p_src_row0              r3
4043    %define p_src_row1              r4
4044    %define i_width_cnt             r5
4045    %define r_tmp0                  r1
4046    %define r_tmp0b                 r1b
4047    %define ymm_xpos_frac           ymm1
4048    %define ymm_xpos_frac_inc       [esp + 8 * 4]
4049    %define ymm_xpos_int            ymm3
4050    %define ymm_xpos_int_inc        [esp + 8 * 4 + 1 * 32]
4051    %define ymm_yfrac0              [esp + 8 * 4 + 2 * 32]
4052    %define ymm_yfrac1              [esp + 8 * 4 + 3 * 32]
4053    %define xmm_tmp0                xmm7
4054    %define ymm_tmp0                ymm7
4055    %define xmm_tmp1                xmm0
4056    %define ymm_tmp1                ymm0
4057    %define xmm_tmp2                xmm2
4058    %define ymm_tmp2                ymm2
4059    %define xmm_tmp3                xmm4
4060    %define ymm_tmp3                ymm4
4061    %define xmm_tmp4                xmm5
4062    %define ymm_tmp4                ymm5
4063    %define xmm_tmp5                xmm6
4064    %define ymm_tmp5                ymm6
4065    %define ymm_0                   [esp + 8 * 4 + 4 * 32]
4066    %define ymm_ffff                [esp + 8 * 4 + 5 * 32]
4067    %define ymm_xpos_int_begin      [esp + 8 * 4 + 6 * 32]
4068    %define ymm_xpos_frac_begin     [esp + 8 * 4 + 7 * 32]
4069%ifdef X86_32_PICASM
4070    %define xmm_shufb_0000000088888888 [esp + 8 * 4 + 8 * 32]
4071    %define xmm_shufb_000044448888CCCC [esp + 8 * 4 + 8 * 32 + 16]
4072    vpxor           ymm_tmp4, ymm_tmp4, ymm_tmp4
4073    vpcmpeqb        ymm_tmp5, ymm_tmp5, ymm_tmp5
4074    vpsubb          ymm_tmp4, ymm_tmp4, ymm_tmp5
4075    vpsllw          ymm_tmp3, ymm_tmp4, 3
4076    vpslldq         ymm_tmp3, ymm_tmp3, 8
4077    vmovdqa         xmm_shufb_0000000088888888, xmm_tmp3
4078    vpsllq          ymm_tmp5, ymm_tmp4, 34
4079    vpaddb          ymm_tmp5, ymm_tmp5, ymm_tmp3
4080    vmovdqa         xmm_shufb_000044448888CCCC, xmm_tmp5
4081%else
4082    %define xmm_shufb_0000000088888888 [shufb_0000000088888888]
4083    %define xmm_shufb_000044448888CCCC [shufb_000044448888CCCC]
4084%endif
4085    mov             i_dst_stride_less_width, r1
4086    mov             i_dst_width, r2
4087    mov             i_dst_height, r3
4088    mov             p_src, r4
4089    mov             i_src_stride, r5
4090    vmovd           i_scaleyd, xmm0
4091    vpxor           xmm0, xmm0, xmm0
4092    vmovdqa         ymm_0, ymm0
4093    vpcmpeqw        ymm_tmp0, ymm_tmp0, ymm_tmp0
4094    vmovdqa         ymm_ffff, ymm_tmp0
4095%else
4096    %define p_dst                   r0
4097    %define i_dst_stride_less_width r1
4098    %define i_dst_width             r2
4099    %define i_dst_height            r3
4100    %define p_src                   r4
4101    %define i_src_stride            r5
4102    %define i_scalex                r6
4103    %define i_scalexd               r6d
4104    %define i_scaleyd               dword arg8d
4105    %define i_xpos                  r12
4106    %define i_ypos                  r13
4107    %define i_yposd                 r13d
4108    %define p_src_row0              rbp
4109%ifdef WIN64
4110    %define p_src_row1              rsi
4111    %define i_width_cnt             rdi
4112%else
4113    %define p_src_row1              r11
4114    %define i_width_cnt             rax
4115%endif
4116    %define r_tmp0                  rbx
4117    %define r_tmp0b                 bl
4118    %define ymm_0                   ymm0
4119    %define ymm_xpos_frac           ymm1
4120    %define ymm_xpos_frac_inc       ymm2
4121    %define ymm_xpos_int            ymm3
4122    %define ymm_xpos_int_inc        ymm4
4123    %define ymm_yfrac0              ymm5
4124    %define ymm_yfrac1              ymm6
4125    %define xmm_tmp0                xmm7
4126    %define ymm_tmp0                ymm7
4127    %define xmm_tmp1                xmm8
4128    %define ymm_tmp1                ymm8
4129    %define xmm_tmp2                xmm9
4130    %define ymm_tmp2                ymm9
4131    %define xmm_tmp3                xmm10
4132    %define ymm_tmp3                ymm10
4133    %define xmm_tmp4                xmm11
4134    %define ymm_tmp4                ymm11
4135    %define xmm_tmp5                xmm12
4136    %define ymm_tmp5                ymm12
4137    %define ymm_ffff                ymm13
4138    %define ymm_xpos_int_begin      ymm14
4139    %define ymm_xpos_frac_begin     ymm15
4140    %define xmm_shufb_0000000088888888 [shufb_0000000088888888]
4141    %define xmm_shufb_000044448888CCCC [shufb_000044448888CCCC]
4142    vpxor           ymm_0, ymm_0, ymm_0
4143    vpcmpeqw        ymm_ffff, ymm_ffff, ymm_ffff
4144%endif
4145
4146    sub             i_dst_height, 1
4147    je              .final_row
4148    jl              .done
4149
4150    mov             i_yposd, 1 << 14
4151    vmovd           xmm_tmp0, i_scalexd
4152    vpbroadcastd    ymm_tmp0, xmm_tmp0
4153    vpslld          ymm_tmp1, ymm_tmp0, 2
4154    vpslld          ymm_tmp2, ymm_tmp0, 3
4155    vpaddd          ymm_tmp3, ymm_tmp1, ymm_tmp2
4156    vpxor           ymm_tmp4, ymm_tmp4, ymm_tmp4
4157    vpblendd        ymm_tmp1, ymm_tmp4, ymm_tmp1, 11110000b
4158    vpblendd        ymm_tmp2, ymm_tmp2, ymm_tmp3, 11110000b
4159    vpaddd          ymm_tmp3, ymm_tmp0, ymm_tmp0
4160    vpblendd        ymm_tmp3, ymm_tmp4, ymm_tmp3, 11001100b
4161    vpblendd        ymm_tmp0, ymm_tmp4, ymm_tmp0, 10101010b
4162    vpaddd          ymm_tmp0, ymm_tmp3, ymm_tmp0
4163    vpaddd          ymm_tmp1, ymm_tmp1, ymm_tmp0
4164    vpaddd          ymm_tmp2, ymm_tmp2, ymm_tmp0
4165    vpcmpeqw        ymm_tmp3, ymm_tmp3, ymm_tmp3
4166    vpsrld          ymm_tmp3, ymm_tmp3, 31
4167    vpslld          ymm_tmp3, ymm_tmp3, 15
4168    vpaddd          ymm_tmp1, ymm_tmp1, ymm_tmp3
4169    vpaddd          ymm_tmp2, ymm_tmp2, ymm_tmp3
4170    vpsrld          ymm_xpos_int, ymm_tmp1, 16
4171    vpsrld          ymm_tmp0, ymm_tmp2, 16
4172    vpackssdw       ymm_xpos_int, ymm_xpos_int, ymm_tmp0
4173    vpermq          ymm_xpos_int, ymm_xpos_int, 11011000b
4174    vpackuswb       ymm_xpos_int, ymm_xpos_int, ymm_xpos_int
4175    vpcmpeqw        ymm_tmp3, ymm_tmp3, ymm_tmp3
4176    vpsubb          ymm_tmp0, ymm_xpos_int, ymm_tmp3
4177    vpunpcklbw      ymm_xpos_int, ymm_xpos_int, ymm_tmp0
4178    vpslld          ymm_tmp1, ymm_tmp1, 16
4179    vpsrld          ymm_tmp1, ymm_tmp1, 16
4180    vpslld          ymm_tmp2, ymm_tmp2, 16
4181    vpsrld          ymm_tmp2, ymm_tmp2, 16
4182    vpackusdw       ymm_xpos_frac, ymm_tmp1, ymm_tmp2
4183    vpermq          ymm_xpos_frac, ymm_xpos_frac, 11011000b
4184    vmovd           xmm_tmp0, i_scalexd
4185    vpslld          xmm_tmp0, xmm_tmp0, 4
4186    vpbroadcastw    ymm_tmp1, xmm_tmp0
4187    vmovdqa         ymm_xpos_frac_inc, ymm_tmp1
4188    vpsrld          xmm_tmp0, xmm_tmp0, 16
4189    vpsubw          ymm_tmp0, ymm_tmp0, ymm_tmp3
4190    vpbroadcastb    ymm_tmp0, xmm_tmp0
4191    vmovdqa         ymm_xpos_int_inc, ymm_tmp0
4192    vmovdqa         ymm_xpos_int_begin, ymm_xpos_int
4193    vmovdqa         ymm_xpos_frac_begin, ymm_xpos_frac
4194
4195    cmp             i_scalex, 4 << 16
4196    ja              .scalex_above4
4197    cmp             i_scalex, 2 << 16
4198    ja              .scalex_above2_beloweq4
4199    add             i_scalex, i_scalex
4200%xdefine i_scalex2 i_scalex
4201%undef i_scalex
4202    AVX2_GeneralBilinearDownsampler_loop AVX2_BilinearFastDownsample2xOrLess_16px, 1
4203    shr             i_scalex2, 1
4204%xdefine i_scalex i_scalex2
4205%undef i_scalex2
4206    jmp             .final_row
4207.scalex_above2_beloweq4:
4208    add             i_scalex, i_scalex
4209%xdefine i_scalex2 i_scalex
4210%undef i_scalex
4211    AVX2_GeneralBilinearDownsampler_loop AVX2_BilinearFastDownsample4xOrLess_16px, 1
4212    shr             i_scalex2, 1
4213%xdefine i_scalex i_scalex2
4214%undef i_scalex2
4215    jmp             .final_row
4216.scalex_above4:
4217    cmp             i_scalex, 8 << 16
4218    ja              .scalex_above8
4219    add             i_scalex, i_scalex
4220%xdefine i_scalex2 i_scalex
4221%undef i_scalex
4222    AVX2_GeneralBilinearDownsampler_loop AVX2_BilinearFastDownsample8xOrLess_16px, 1
4223    shr             i_scalex2, 1
4224%xdefine i_scalex i_scalex2
4225%undef i_scalex2
4226    jmp             .final_row
4227.scalex_above8:
4228%xdefine ymm_xfrac0 ymm_xpos_frac
4229%xdefine ymm_xfrac1 ymm_xpos_int
4230%xdefine ymm_xfrac0_begin ymm_xpos_int_begin
4231%xdefine ymm_xfrac1_begin ymm_xpos_frac_begin
4232%xdefine ymm_xfrac_inc ymm_xpos_frac_inc
4233%undef ymm_xpos_int
4234%undef ymm_xpos_frac
4235%undef ymm_xpos_int_begin
4236%undef ymm_xpos_frac_begin
4237%undef ymm_xpos_int_inc
4238%undef ymm_xpos_frac_inc
4239    AVX2_UnpckXFrac ymm_tmp0, ymm_xfrac1, ymm_xfrac0, ymm_ffff
4240    vpermq          ymm_xfrac0, ymm_tmp0,   01001110b
4241    vpermq          ymm_xfrac1, ymm_xfrac1, 01001110b
4242    vmovdqa         ymm_xfrac0_begin, ymm_xfrac0
4243    vmovdqa         ymm_xfrac1_begin, ymm_xfrac1
4244    vpcmpeqw        ymm_tmp0, ymm_tmp0, ymm_tmp0
4245    vpmullw         ymm_tmp0, ymm_tmp0, ymm_xfrac_inc
4246    vpunpcklwd      ymm_tmp0, ymm_tmp0, ymm_xfrac_inc
4247    vmovdqa         ymm_xfrac_inc, ymm_tmp0
4248    AVX2_GeneralBilinearDownsampler_loop AVX2_GeneralBilinearFastDownsample_16px, 1
4249
4250.final_row:
4251    mov             p_src_row0, i_ypos
4252    shr             p_src_row0, 15
4253    imul            p_src_row0, i_src_stride
4254    add             p_src_row0, p_src
4255    mov             i_xpos, 1 << 15
4256    mov             i_width_cnt, i_dst_width
4257
4258.final_row_width:
4259    mov             r_tmp0, i_xpos
4260    shr             r_tmp0, 16
4261    movzx           r_tmp0, byte [p_src_row0 + r_tmp0]
4262    mov             [p_dst], r_tmp0b
4263    add             p_dst, 1
4264    add             i_xpos, i_scalex
4265    sub             i_width_cnt, 1
4266    jg              .final_row_width
4267
4268.done:
4269    vzeroupper
4270%ifdef X86_32
4271    mov             esp, [esp]
4272%endif
4273    POP_XMM
4274    LOAD_7_PARA_POP
4275%ifndef X86_32
4276%ifdef WIN64
4277    pop             rsi
4278    pop             rdi
4279%endif
4280    pop             rbp
4281    pop             rbx
4282    pop             r13
4283    pop             r12
4284%endif
4285    ret
4286%undef p_dst
4287%undef i_dst_stride_less_width
4288%undef i_dst_width
4289%undef i_dst_height
4290%undef p_src
4291%undef i_src_stride
4292%undef i_scalex
4293%undef i_scalexd
4294%undef i_scaleyd
4295%undef i_xpos
4296%undef i_ypos
4297%undef i_yposd
4298%undef p_src_row0
4299%undef p_src_row1
4300%undef i_width_cnt
4301%undef r_tmp0
4302%undef r_tmp0b
4303%undef ymm_xpos_frac
4304%undef ymm_xpos_frac_inc
4305%undef ymm_xpos_int
4306%undef ymm_xpos_int_inc
4307%undef ymm_yfrac0
4308%undef ymm_yfrac1
4309%undef xmm_tmp0
4310%undef ymm_tmp0
4311%undef xmm_tmp1
4312%undef ymm_tmp1
4313%undef xmm_tmp2
4314%undef ymm_tmp2
4315%undef xmm_tmp3
4316%undef ymm_tmp3
4317%undef xmm_tmp4
4318%undef ymm_tmp4
4319%undef xmm_tmp5
4320%undef ymm_tmp5
4321%undef ymm_ffff
4322%undef ymm_0
4323%undef ymm_xpos_int_begin
4324%undef ymm_xpos_frac_begin
4325%undef ymm_xfrac0
4326%undef ymm_xfrac1
4327%undef ymm_xfrac0_begin
4328%undef ymm_xfrac1_begin
4329%undef ymm_xfrac_inc
4330%undef xmm_shufb_0000000088888888
4331%undef xmm_shufb_000044448888CCCC
4332
4333;**************************************************************************************************************
4334;void GeneralBilinearAccurateDownsampler_avx2 (uint8_t* pDst, int32_t iDstStride, int32_t iDstWidth,
4335;    int32_t iDstHeight, uint8_t* pSrc, int32_t iSrcStride, uint32_t uiScaleX,
4336;    uint32_t uiScaleY);
4337;
4338;**************************************************************************************************************
4339
WELS_EXTERN GeneralBilinearAccurateDownsampler_avx2
    ; Accurate (15-bit-fraction) bilinear downsampler, AVX2.
    ; Args (see C prototype above): pDst, iDstStride, iDstWidth, iDstHeight,
    ; pSrc, iSrcStride, uiScaleX, uiScaleY; uiScaleX/uiScaleY are 16.16
    ; fixed-point src/dst ratios.
    ; Save callee-saved GPRs used below (64-bit builds only; WIN64 also
    ; requires rdi/rsi to be preserved).
    %assign push_num 0
%ifndef X86_32
    push            r12
    push            r13
    push            rbx
    push            rbp
    %assign push_num 4
%ifdef WIN64
    push            rdi
    push            rsi
    %assign push_num push_num + 2
%endif
%endif
    ; LOAD_7_PARA / PUSH_XMM are asm_inc.asm helpers: fetch the first seven
    ; arguments into r0..r6 and preserve xmm6-xmm15 where the ABI demands it.
    LOAD_7_PARA
    PUSH_XMM 16
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r2, r2d
    SIGN_EXTENSION  r3, r3d
    SIGN_EXTENSION  r5, r5d
    ZERO_EXTENSION  r6d
    sub             r1, r2                                            ; dst_stride - dst_width
    add             r6, r6                                            ; 2 * scalex
%ifdef X86_32
    ; 32-bit build: not enough registers, so most state is spilled to a
    ; 32-byte-aligned scratch frame carved below esp.  The caller's esp is
    ; stashed at [esp] and restored at .done.  arg8 (uiScaleY) is grabbed
    ; through xmm0 before esp moves.
    vmovd           xmm0, arg8
    vmovd           xmm1, esp
    and             esp, -32
%ifdef X86_32_PICASM
    sub             esp, 8 * 4 + 10 * 32                              ; two extra 32B slots for PIC constants
%else
    sub             esp, 8 * 4 + 8 * 32
%endif
    vmovd           [esp], xmm1                                       ; saved caller esp
    %define p_dst                   r0
    %define i_dst_stride_less_width [esp + 1 * 4]
    %define i_dst_width             [esp + 2 * 4]
    %define i_dst_height            dword [esp + 3 * 4]
    %define p_src                   [esp + 4 * 4]
    %define i_src_stride            [esp + 5 * 4]
    %define i_scalex                r6
    %define i_scalexd               r6d
    %define i_scaleyd               [esp + 6 * 4]
    %define i_xpos                  r2
    %define i_ypos                  [esp + 7 * 4]
    %define i_yposd                 dword [esp + 7 * 4]
    %define p_src_row0              r3
    %define p_src_row1              r4
    %define i_width_cnt             r5
    %define r_tmp0                  r1
    %define r_tmp0b                 r1b
    %define ymm_xpos_frac           ymm1
    %define ymm_xpos_frac_inc       [esp + 8 * 4]
    %define ymm_xpos_int            ymm3
    %define ymm_xpos_int_inc        [esp + 8 * 4 + 1 * 32]
    %define ymm_yfrac0              [esp + 8 * 4 + 2 * 32]
    %define ymm_yfrac1              [esp + 8 * 4 + 3 * 32]
    %define xmm_tmp0                xmm7
    %define ymm_tmp0                ymm7
    %define xmm_tmp1                xmm0
    %define ymm_tmp1                ymm0
    %define xmm_tmp2                xmm2
    %define ymm_tmp2                ymm2
    %define xmm_tmp3                xmm4
    %define ymm_tmp3                ymm4
    %define xmm_tmp4                xmm5
    %define ymm_tmp4                ymm5
    %define xmm_tmp5                xmm6
    %define ymm_tmp5                ymm6
    %define ymm_0                   [esp + 8 * 4 + 4 * 32]
    %define ymm_7fff                [esp + 8 * 4 + 5 * 32]
    %define ymm_xpos_int_begin      [esp + 8 * 4 + 6 * 32]
    %define ymm_xpos_frac_begin     [esp + 8 * 4 + 7 * 32]
%ifdef X86_32_PICASM
    ; Position-independent 32-bit build: materialize the shuffle/bias
    ; constants on the stack instead of referencing absolute data labels.
    %define ymm_db80h                  [esp + 8 * 4 + 8 * 32]
    %define xmm_shufb_0000000088888888 [esp + 8 * 4 + 9 * 32]
    %define xmm_shufb_000044448888CCCC [esp + 8 * 4 + 9 * 32 + 16]
    vpxor           ymm_tmp4, ymm_tmp4, ymm_tmp4
    vpcmpeqb        ymm_tmp5, ymm_tmp5, ymm_tmp5
    vpsubb          ymm_tmp4, ymm_tmp4, ymm_tmp5                      ; tmp4 = 01h in every byte (0 - (-1))
    vpsllw          ymm_tmp3, ymm_tmp4, 3                             ; tmp3 = 08h bytes
    vpslldq         ymm_tmp3, ymm_tmp3, 8                             ; low 8 bytes 00h, high 8 bytes 08h
    vmovdqa         xmm_shufb_0000000088888888, xmm_tmp3
    vpsllq          ymm_tmp5, ymm_tmp4, 34                            ; per qword: bytes 0,0,0,0,4,4,4,4
    vpaddb          ymm_tmp5, ymm_tmp5, ymm_tmp3                      ; -> 00 00 00 00 44 44 44 44 88.. CC..
    vmovdqa         xmm_shufb_000044448888CCCC, xmm_tmp5
    vpsllw          ymm_tmp4, ymm_tmp4, 7                             ; 80h in every byte
    vmovdqa         ymm_db80h, ymm_tmp4
%else
    %define ymm_db80h               [db80h_256]
    %define xmm_shufb_0000000088888888 [shufb_0000000088888888]
    %define xmm_shufb_000044448888CCCC [shufb_000044448888CCCC]
%endif
    ; Spill the integer arguments into the scratch frame.
    mov             i_dst_stride_less_width, r1
    mov             i_dst_width, r2
    mov             i_dst_height, r3
    mov             p_src, r4
    mov             i_src_stride, r5
    vmovd           i_scaleyd, xmm0
    ; ymm_0 = all zero; ymm_7fff = 7fffh in every word (fractions are kept
    ; with 15 bits in this accurate variant -- see the vpsrlw ..., 1 below).
    vpxor           xmm0, xmm0, xmm0
    vmovdqa         ymm_0, ymm0
    vpcmpeqw        ymm0, ymm0, ymm0
    vpsrlw          ymm0, ymm0, 1
    vmovdqa         ymm_7fff, ymm0
%else
    ; 64-bit build: all state stays in registers.
    %define p_dst                   r0
    %define i_dst_stride_less_width r1
    %define i_dst_width             r2
    %define i_dst_height            r3
    %define p_src                   r4
    %define i_src_stride            r5
    %define i_scalex                r6
    %define i_scalexd               r6d
    %define i_scaleyd               dword arg8d
    %define i_xpos                  r12
    %define i_ypos                  r13
    %define i_yposd                 r13d
    %define p_src_row0              rbp
%ifdef WIN64
    %define p_src_row1              rsi
    %define i_width_cnt             rdi
%else
    %define p_src_row1              r11
    %define i_width_cnt             rax
%endif
    %define r_tmp0                  rbx
    %define r_tmp0b                 bl
    %define ymm_0                   ymm0
    %define ymm_xpos_frac           ymm1
    %define ymm_xpos_int            ymm3
    %define ymm_xpos_frac_inc       ymm2
    %define ymm_xpos_int_inc        ymm4
    %define ymm_yfrac0              ymm5
    %define ymm_yfrac1              ymm6
    %define xmm_tmp0                xmm7
    %define ymm_tmp0                ymm7
    %define xmm_tmp1                xmm8
    %define ymm_tmp1                ymm8
    %define xmm_tmp2                xmm9
    %define ymm_tmp2                ymm9
    %define xmm_tmp3                xmm10
    %define ymm_tmp3                ymm10
    %define xmm_tmp4                xmm11
    %define ymm_tmp4                ymm11
    %define xmm_tmp5                xmm12
    %define ymm_tmp5                ymm12
    %define ymm_7fff                ymm13
    %define ymm_xpos_int_begin      ymm14
    %define ymm_xpos_frac_begin     ymm15
    %define ymm_db80h               [db80h_256]
    %define xmm_shufb_0000000088888888 [shufb_0000000088888888]
    %define xmm_shufb_000044448888CCCC [shufb_000044448888CCCC]
    ; ymm_0 = all zero; ymm_7fff = 7fffh words (15-bit fraction domain).
    vpxor           ymm_0, ymm_0, ymm_0
    vpcmpeqw        ymm_7fff, ymm_7fff, ymm_7fff
    vpsrlw          ymm_7fff, ymm_7fff, 1
%endif

    ; The last output row is special-cased (.final_row handles it without a
    ; second source row); bail out immediately for height <= 0.
    sub             i_dst_height, 1
    je              .final_row
    jl              .done

    ; y position starts at 1 << 14 (0.5 in Q15; .final_row converts it to a
    ; row index with shr 15).
    mov             i_yposd, 1 << 14
    ; Build the 16 initial per-pixel x positions k * scalex (k = 0..15) as
    ; two dword vectors: tmp1 = {0..7}*scalex, tmp2 = {8..15}*scalex,
    ; assembled from scalex*4, scalex*8, scalex*2 and scalex via vpblendd.
    vmovd           xmm_tmp0, i_scalexd
    vpbroadcastd    ymm_tmp0, xmm_tmp0
    vpslld          ymm_tmp1, ymm_tmp0, 2
    vpslld          ymm_tmp2, ymm_tmp0, 3
    vpaddd          ymm_tmp3, ymm_tmp1, ymm_tmp2
    vpxor           ymm_tmp4, ymm_tmp4, ymm_tmp4
    vpblendd        ymm_tmp1, ymm_tmp4, ymm_tmp1, 11110000b
    vpblendd        ymm_tmp2, ymm_tmp2, ymm_tmp3, 11110000b
    vpaddd          ymm_tmp3, ymm_tmp0, ymm_tmp0
    vpblendd        ymm_tmp3, ymm_tmp4, ymm_tmp3, 11001100b
    vpblendd        ymm_tmp0, ymm_tmp4, ymm_tmp0, 10101010b
    vpaddd          ymm_tmp0, ymm_tmp3, ymm_tmp0
    vpaddd          ymm_tmp1, ymm_tmp1, ymm_tmp0
    vpaddd          ymm_tmp2, ymm_tmp2, ymm_tmp0
    ; Add the half-pixel rounding bias 1 << 15 to every position.
    vpcmpeqw        ymm_tmp3, ymm_tmp3, ymm_tmp3
    vpsrld          ymm_tmp3, ymm_tmp3, 31
    vpslld          ymm_tmp3, ymm_tmp3, 15
    vpaddd          ymm_tmp1, ymm_tmp1, ymm_tmp3
    vpaddd          ymm_tmp2, ymm_tmp2, ymm_tmp3
    ; xpos_int: integer parts (pos >> 16) packed to bytes, then interleaved
    ; with int+1 (vpsubb of -1) so each pixel carries both source columns.
    vpsrld          ymm_xpos_int, ymm_tmp1, 16
    vpsrld          ymm_tmp0, ymm_tmp2, 16
    vpackssdw       ymm_xpos_int, ymm_xpos_int, ymm_tmp0
    vpermq          ymm_xpos_int, ymm_xpos_int, 11011000b
    vpackuswb       ymm_xpos_int, ymm_xpos_int, ymm_xpos_int
    vpcmpeqw        ymm_tmp3, ymm_tmp3, ymm_tmp3
    vpsubb          ymm_tmp0, ymm_xpos_int, ymm_tmp3
    vpunpcklbw      ymm_xpos_int, ymm_xpos_int, ymm_tmp0
    ; xpos_frac: low 16 bits of each position, packed to words and reduced
    ; to 15 bits (accurate variant works in Q15 weights).
    vpslld          ymm_tmp1, ymm_tmp1, 16
    vpsrld          ymm_tmp1, ymm_tmp1, 16
    vpslld          ymm_tmp2, ymm_tmp2, 16
    vpsrld          ymm_tmp2, ymm_tmp2, 16
    vpackusdw       ymm_xpos_frac, ymm_tmp1, ymm_tmp2
    vpermq          ymm_xpos_frac, ymm_xpos_frac, 11011000b
    vpsrlw          ymm_xpos_frac, ymm_xpos_frac, 1
    ; Per-16-pixel step vectors: 16 * scalex as Q15 word fraction increment
    ; and its integer part (plus one) as byte increment for xpos_int.
    vmovd           xmm_tmp0, i_scalexd
    vpslld          xmm_tmp0, xmm_tmp0, 4
    vpbroadcastw    ymm_tmp1, xmm_tmp0
    vpsrlw          ymm_tmp1, ymm_tmp1, 1
    vmovdqa         ymm_xpos_frac_inc, ymm_tmp1
    vpsrld          xmm_tmp0, xmm_tmp0, 16
    vpsubw          ymm_tmp0, ymm_tmp0, ymm_tmp3
    vpbroadcastb    ymm_tmp0, xmm_tmp0
    vmovdqa         ymm_xpos_int_inc, ymm_tmp0
    ; Keep the row-start x vectors so each new row can be rewound to them.
    vmovdqa         ymm_xpos_int_begin, ymm_xpos_int
    vmovdqa         ymm_xpos_frac_begin, ymm_xpos_frac

    ; Dispatch on the (already doubled) scale to a specialized 16-pixel row
    ; kernel.  Around each loop macro i_scalex is doubled again and renamed
    ; i_scalex2, then halved and renamed back so .final_row sees the
    ; original step.  NOTE(review): the exact format each kernel expects is
    ; defined by the loop macro elsewhere in this file -- confirm there.
    cmp             i_scalex, 4 << 16
    ja              .scalex_above4
    cmp             i_scalex, 2 << 16
    ja              .scalex_above2_beloweq4
    add             i_scalex, i_scalex
%xdefine i_scalex2 i_scalex
%undef i_scalex
    AVX2_GeneralBilinearDownsampler_loop AVX2_BilinearAccurateDownsample2xOrLess_16px, 0
    shr             i_scalex2, 1
%xdefine i_scalex i_scalex2
%undef i_scalex2
    jmp             .final_row
.scalex_above2_beloweq4:
    add             i_scalex, i_scalex
%xdefine i_scalex2 i_scalex
%undef i_scalex
    AVX2_GeneralBilinearDownsampler_loop AVX2_BilinearAccurateDownsample4xOrLess_16px, 0
    shr             i_scalex2, 1
%xdefine i_scalex i_scalex2
%undef i_scalex2
    jmp             .final_row
.scalex_above4:
    cmp             i_scalex, 8 << 16
    ja              .scalex_above8
    add             i_scalex, i_scalex
%xdefine i_scalex2 i_scalex
%undef i_scalex
    AVX2_GeneralBilinearDownsampler_loop AVX2_BilinearAccurateDownsample8xOrLess_16px, 0
    shr             i_scalex2, 1
%xdefine i_scalex i_scalex2
%undef i_scalex2
    jmp             .final_row
.scalex_above8:
    ; Generic kernel for larger ratios: the xpos int/frac registers are
    ; repurposed as interleaved x-weight pairs (AVX2_UnpckXFrac expands each
    ; fraction against ymm_7fff into complement/fraction word pairs).
%xdefine ymm_xfrac0 ymm_xpos_frac
%xdefine ymm_xfrac1 ymm_xpos_int
%xdefine ymm_xfrac0_begin ymm_xpos_int_begin
%xdefine ymm_xfrac1_begin ymm_xpos_frac_begin
%xdefine ymm_xfrac_inc ymm_xpos_frac_inc
%undef ymm_xpos_int
%undef ymm_xpos_frac
%undef ymm_xpos_int_begin
%undef ymm_xpos_frac_begin
%undef ymm_xpos_int_inc
%undef ymm_xpos_frac_inc
    AVX2_UnpckXFrac ymm_tmp0, ymm_xfrac1, ymm_xfrac0, ymm_7fff
    vpermq          ymm_xfrac0, ymm_tmp0,   01001110b
    vpermq          ymm_xfrac1, ymm_xfrac1, 01001110b
    vmovdqa         ymm_xfrac0_begin, ymm_xfrac0
    vmovdqa         ymm_xfrac1_begin, ymm_xfrac1
    ; Step vector for the weight pairs: (-inc, +inc) interleaved words
    ; (vpmullw by all-ones words negates the increment).
    vpcmpeqw        ymm_tmp0, ymm_tmp0, ymm_tmp0
    vpmullw         ymm_tmp0, ymm_tmp0, ymm_xfrac_inc
    vpunpcklwd      ymm_tmp0, ymm_tmp0, ymm_xfrac_inc
    vmovdqa         ymm_xfrac_inc, ymm_tmp0
    AVX2_GeneralBilinearDownsampler_loop AVX2_GeneralBilinearAccurateDownsample_16px, 0

.final_row:
    ; Last destination row: sampled from the single source row i_ypos >> 15
    ; (no vertical blend), nearest-neighbor in x via the integer part of
    ; the 16.16 x position.
    mov             p_src_row0, i_ypos
    shr             p_src_row0, 15
    imul            p_src_row0, i_src_stride
    add             p_src_row0, p_src
    mov             i_xpos, 1 << 15                                   ; start at x = 0.5
    mov             i_width_cnt, i_dst_width

.final_row_width:
    mov             r_tmp0, i_xpos
    shr             r_tmp0, 16                                        ; integer source column
    movzx           r_tmp0, byte [p_src_row0 + r_tmp0]
    mov             [p_dst], r_tmp0b
    add             p_dst, 1
    add             i_xpos, i_scalex
    sub             i_width_cnt, 1
    jg              .final_row_width

.done:
    vzeroupper                                                        ; avoid AVX->SSE transition penalties in callers
%ifdef X86_32
    mov             esp, [esp]                                        ; restore caller's esp saved at frame base
%endif
    POP_XMM
    LOAD_7_PARA_POP
%ifndef X86_32
%ifdef WIN64
    pop             rsi
    pop             rdi
%endif
    pop             rbp
    pop             rbx
    pop             r13
    pop             r12
%endif
    ret
; Release every function-local alias so later functions can redefine them.
%undef p_dst
%undef i_dst_stride_less_width
%undef i_dst_width
%undef i_dst_height
%undef p_src
%undef i_src_stride
%undef i_scalex
%undef i_scalexd
%undef i_scaleyd
%undef i_xpos
%undef i_ypos
%undef i_yposd
%undef p_src_row0
%undef p_src_row1
%undef i_width_cnt
%undef r_tmp0
%undef r_tmp0b
%undef ymm_xpos_frac
%undef ymm_xpos_frac_inc
%undef ymm_xpos_int
%undef ymm_xpos_int_inc
%undef ymm_yfrac0
%undef ymm_yfrac1
%undef xmm_tmp0
%undef ymm_tmp0
%undef xmm_tmp1
%undef ymm_tmp1
%undef xmm_tmp2
%undef ymm_tmp2
%undef xmm_tmp3
%undef ymm_tmp3
%undef xmm_tmp4
%undef ymm_tmp4
%undef xmm_tmp5
%undef ymm_tmp5
%undef ymm_0
%undef ymm_7fff
%undef ymm_xpos_int_begin
%undef ymm_xpos_frac_begin
%undef ymm_xfrac0
%undef ymm_xfrac1
%undef ymm_xfrac0_begin
%undef ymm_xfrac1_begin
%undef ymm_xfrac_inc
%undef ymm_db80h
%undef xmm_shufb_0000000088888888
%undef xmm_shufb_000044448888CCCC
4685
4686%endif
4687