;*!
;* \copy
;*     Copyright (c) 2010-2013, Cisco Systems
;*     All rights reserved.
;*
;*     Redistribution and use in source and binary forms, with or without
;*     modification, are permitted provided that the following conditions
;*     are met:
;*
;*        * Redistributions of source code must retain the above copyright
;*          notice, this list of conditions and the following disclaimer.
;*
;*        * Redistributions in binary form must reproduce the above copyright
;*          notice, this list of conditions and the following disclaimer in
;*          the documentation and/or other materials provided with the
;*          distribution.
;*
;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
;*     POSSIBILITY OF SUCH DAMAGE.
;*
;*
;*      vaa.asm
;*
;*      Abstract
;*      SSE2 implementations of the pVaa routines
;*
;*  History
;*      04/14/2010      Created
;*      06/07/2010      Added AnalysisVaaInfoIntra_sse2 (plus an SSSE3 variant)
;*      06/10/2010      Tuned rc_sad_frame_sse2 for roughly a 40% improvement
;*      08/11/2010      Added abs_difference_mbrow_sse2 & sum_sqrsum_mbrow_sse2
;*
;*************************************************************************/
%include "asm_inc.asm"


;***********************************************************************
; Macros and other preprocessor constants
;***********************************************************************
%macro SUM_SQR_SSE2     3       ; dst, pSrc, zero
    movdqa %1, %2
    punpcklbw %1, %3
    punpckhbw %2, %3
    pmaddwd %1, %1
    pmaddwd %2, %2
    paddd %1, %2
    pshufd %2, %1, 04Eh   ; 01001110 B
    paddd %1, %2
    pshufd %2, %1, 0B1h   ; 10110001 B
    paddd %1, %2
%endmacro       ; END OF SUM_SQR_SSE2
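; Note: SUM_SQR_SSE2 sums the squares of the 16 bytes in %2 (unpacked to
; words against the zero register %3), then folds the four dword partials
; with two pshufd/paddd pairs so the total is replicated in every dword
; of %1. %2 is clobbered; callers extract the result with a single movd.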

%macro WELS_SAD_16x2_SSE2  3 ;esi:%1 edi:%2 ebx:%3
    movdqa        xmm1,   [%1]
    movdqa        xmm2,   [%2]
    movdqa        xmm3,   [%1+%3]
    movdqa        xmm4,   [%2+%3]
    psadbw        xmm1,   xmm2
    psadbw        xmm3,   xmm4
    paddd         xmm6,   xmm1
    paddd         xmm6,   xmm3
    lea           %1,     [%1+%3*2]
    lea           %2,     [%2+%3*2]
%endmacro
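; Note: WELS_SAD_16x2_SSE2 accumulates the SAD of two 16-byte rows of %1
; (cur) against %2 (ref) into xmm6 and advances both pointers by two
; strides. psadbw leaves one per-8-byte sum in the low dword of each
; qword lane, so xmm6 keeps separate left/right 8-pixel-column totals.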

; benchmarking shows this outperforms the phaddw (SSSE3) sequence
%macro SUM_WORD_8x2_SSE2        2       ; dst(pSrc), tmp
    ; @sum_8x2 begin
    pshufd %2, %1, 04Eh   ; 01001110 B
    paddw %1, %2
    pshuflw %2, %1, 04Eh  ; 01001110 B
    paddw %1, %2
    pshuflw %2, %1, 0B1h  ; 10110001 B
    paddw %1, %2
    ; end of @sum_8x2
%endmacro       ; END of SUM_WORD_8x2_SSE2
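; Note: SUM_WORD_8x2_SSE2 is a horizontal add of the eight words in %1;
; the running total (modulo 2^16) lands in the low word, which is why
; callers mask the movd result with 0ffffh.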

%macro WELS_SAD_SUM_SQSUM_16x1_SSE2 3 ;esi:%1,edi:%2,ebx:%3
    movdqa        xmm1,   [%1]
    movdqa        xmm2,   [%2]
    movdqa        xmm3,   xmm1
    psadbw        xmm3,   xmm2
    paddd         xmm6,   xmm3

    movdqa        xmm3,   xmm1
    psadbw        xmm3,   xmm0
    paddd         xmm5,   xmm3

    movdqa        xmm2,   xmm1
    punpcklbw     xmm1,   xmm0
    punpckhbw     xmm2,   xmm0
    pmaddwd       xmm1,   xmm1
    pmaddwd       xmm2,   xmm2
    paddd         xmm4,   xmm1
    paddd         xmm4,   xmm2

    add           %1,     %3
    add           %2,     %3
%endmacro
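; Note: per 16-byte row this keeps three accumulators, assuming xmm0 is
; zero: xmm6 += SAD(cur, ref), xmm5 += sum of cur bytes (psadbw against
; zero), xmm4 += sum of squared cur bytes as four dword partials.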

%macro WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 3 ;esi:%1 edi:%2 ebx:%3
    movdqa        xmm1,   [%1]
    movdqa        xmm2,   [%2]
    movdqa        xmm3,   xmm1
    psadbw        xmm3,   xmm2
    paddd         xmm7,   xmm3    ; sad

    movdqa        xmm3,   xmm1
    pmaxub        xmm3,   xmm2
    pminub        xmm2,   xmm1
    psubb         xmm3,   xmm2    ; diff

    movdqa        xmm2,   xmm1
    psadbw        xmm2,   xmm0
    paddd         xmm6,   xmm2    ; sum

    movdqa        xmm2,   xmm1
    punpcklbw     xmm1,   xmm0
    punpckhbw     xmm2,   xmm0
    pmaddwd       xmm1,   xmm1
    pmaddwd       xmm2,   xmm2
    paddd         xmm5,   xmm1
    paddd         xmm5,   xmm2    ; sqsum

    movdqa        xmm1,   xmm3
    punpcklbw     xmm1,   xmm0
    punpckhbw     xmm3,   xmm0
    pmaddwd       xmm1,   xmm1
    pmaddwd       xmm3,   xmm3
    paddd         xmm4,   xmm1
    paddd         xmm4,   xmm3    ; sqdiff

    add           %1,     %3
    add           %2,     %3
%endmacro
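; Note: extends the macro above with an absolute-difference term built
; from pmaxub/pminub/psubb, so one pass over a row accumulates sad
; (xmm7), sum (xmm6), sqsum (xmm5) and squared differences sqdiff (xmm4).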

%macro WELS_SAD_SD_MAD_16x1_SSE2       7 ;esi:%5 edi:%6 ebx:%7
%define sad_reg                 %1
%define sum_cur_reg             %2
%define sum_ref_reg             %3
%define mad_reg                 %4
    movdqa        xmm1,           [%5]
    movdqa        xmm2,           [%6]
    movdqa        xmm3,           xmm1
    psadbw        xmm3,           xmm0
    paddd         sum_cur_reg,    xmm3    ; sum_cur
    movdqa        xmm3,           xmm2
    psadbw        xmm3,           xmm0
    paddd         sum_ref_reg,    xmm3    ; sum_ref

    movdqa        xmm3,           xmm1
    pmaxub        xmm3,           xmm2
    pminub        xmm2,           xmm1
    psubb         xmm3,           xmm2    ; abs diff
    pmaxub        mad_reg,        xmm3    ; max abs diff

    psadbw        xmm3,           xmm0
    paddd         sad_reg,        xmm3    ; sad

    add           %5,             %7
    add           %6,             %7
%endmacro
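; Note: one row's worth of background-detection stats: byte sums of cur
; and ref, the SAD, and a running per-byte maximum of |cur - ref| in
; mad_reg (reduced later by WELS_MAX_REG_SSE2).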


%macro WELS_MAX_REG_SSE2       1       ; xmm1, xmm2, xmm3 can be used
%define max_reg  %1
    movdqa        xmm1,           max_reg
    psrldq        xmm1,           4
    pmaxub        max_reg,        xmm1
    movdqa        xmm1,           max_reg
    psrldq        xmm1,           2
    pmaxub        max_reg,        xmm1
    movdqa        xmm1,           max_reg
    psrldq        xmm1,           1
    pmaxub        max_reg,        xmm1
%endmacro
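; Note: the shift-by-4/2/1 pmaxub ladder reduces each 8-byte half of
; max_reg independently: byte 0 ends up holding the max of bytes 0..7
; and byte 8 the max of bytes 8..15, i.e. one MAD per 8x8 block.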

%macro WELS_SAD_BGD_SQDIFF_16x1_SSE2   7 ;esi:%5 edi:%6 ebx:%7
%define sad_reg         %1
%define sum_reg         %2
%define mad_reg         %3
%define sqdiff_reg      %4
    movdqa        xmm1,           [%5]
    movdqa        xmm2,           xmm1
    movdqa        xmm3,           xmm1
    punpcklbw     xmm2,           xmm0
    punpckhbw     xmm3,           xmm0
    pmaddwd       xmm2,           xmm2
    pmaddwd       xmm3,           xmm3
    paddd         xmm2,           xmm3
    movdqa        xmm3,           xmm2
    psllq         xmm2,           32
    psrlq         xmm3,           32
    psllq         xmm3,           32
    paddd         xmm2,           xmm3
    paddd         sad_reg,        xmm2    ; sqsum

    movdqa        xmm2,           [%6]
    movdqa        xmm3,           xmm1
    psadbw        xmm3,           xmm0
    paddd         sum_reg,        xmm3    ; sum_cur
    movdqa        xmm3,           xmm2
    psadbw        xmm3,           xmm0
    pslldq        xmm3,           4
    paddd         sum_reg,        xmm3    ; sum_ref

    movdqa        xmm3,           xmm1
    pmaxub        xmm3,           xmm2
    pminub        xmm2,           xmm1
    psubb         xmm3,           xmm2    ; abs diff
    pmaxub        mad_reg,        xmm3    ; max abs diff

    movdqa        xmm1,           xmm3
    psadbw        xmm3,           xmm0
    paddd         sad_reg,        xmm3    ; sad

    movdqa        xmm3,   xmm1
    punpcklbw     xmm1,   xmm0
    punpckhbw     xmm3,   xmm0
    pmaddwd       xmm1,   xmm1
    pmaddwd       xmm3,   xmm3
    paddd         sqdiff_reg,     xmm1
    paddd         sqdiff_reg,     xmm3    ; sqdiff

    add           %5,     %7
    add           %6,     %7
%endmacro
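; Note: sad_reg does double duty here: the qword shifts park the sqsum
; partials in the odd dwords while psadbw writes the SAD into the even
; dwords, giving the sqsum1/sad1/sqsum0/sad0 interleave the BGD callers
; unpack. sum_reg likewise interleaves cur and ref byte sums via pslldq.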


;***********************************************************************
; Code
;***********************************************************************

SECTION .text

%ifdef X86_32

;***********************************************************************
;   void SampleVariance16x16_sse2( uint8_t * y_ref, int32_t y_ref_stride, uint8_t * y_src, int32_t y_src_stride, SMotionTextureUnit* pMotionTexture );
;***********************************************************************
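; Note: for one 16x16 block this accumulates SUM/SQR of |y_src - y_ref|
; and SUM_CUR/SQR_CUR of y_src, then stores
;   uiMotionIndex  = (SQR >> 8)     - (SUM >> 8) * (SUM >> 8)
;   uiTextureIndex = (SQR_CUR >> 8) - (SUM_CUR >> 8) * (SUM_CUR >> 8)
; as 16-bit values into pMotionTexture, i.e. variance-style measures of
; the frame difference and of the source, scaled by the 256-pixel block.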
WELS_EXTERN SampleVariance16x16_sse2
    push esi
    push edi
    push ebx

    sub esp, 16
    %define SUM                   [esp]
    %define SUM_CUR               [esp+4]
    %define SQR                   [esp+8]
    %define SQR_CUR               [esp+12]
    %define PUSH_SIZE     28      ; 12 + 16

    mov edi, [esp+PUSH_SIZE+4]    ; y_ref
    mov edx, [esp+PUSH_SIZE+8]    ; y_ref_stride
    mov esi, [esp+PUSH_SIZE+12]   ; y_src
    mov eax, [esp+PUSH_SIZE+16]   ; y_src_stride
    mov ecx, 010h                 ; height = 16

    pxor xmm7, xmm7
    movdqu SUM, xmm7

.hloops:
    movdqa xmm0, [edi]            ; y_ref
    movdqa xmm1, [esi]            ; y_src
    movdqa xmm2, xmm0             ; store first for future process
    movdqa xmm3, xmm1
    ; sum += diff;
    movdqa xmm4, xmm0
    psadbw xmm4, xmm1             ; 2 parts, [0,..,15], [64,..,79]
    ; to be continued for sum
    pshufd xmm5, xmm4, 0C6h       ; 11000110 B
    paddw xmm4, xmm5
    movd ebx, xmm4
    add SUM, ebx

    ; sqr += diff * diff;
    pmaxub xmm0, xmm1
    pminub xmm1, xmm2
    psubb xmm0, xmm1              ; diff
    SUM_SQR_SSE2 xmm1, xmm0, xmm7 ; dst, pSrc, zero
    movd ebx, xmm1
    add SQR, ebx

    ; sum_cur += y_src[x];
    movdqa xmm0, xmm3             ; cur_orig
    movdqa xmm1, xmm0
    punpcklbw xmm0, xmm7
    punpckhbw xmm1, xmm7
    paddw xmm0, xmm1              ; 8x2
    SUM_WORD_8x2_SSE2 xmm0, xmm1
    movd ebx, xmm0
    and ebx, 0ffffh
    add SUM_CUR, ebx

    ; sqr_cur += y_src[x] * y_src[x];
    SUM_SQR_SSE2 xmm0, xmm3, xmm7 ; dst, pSrc, zero
    movd ebx, xmm0
    add SQR_CUR, ebx

    lea edi, [edi+edx]
    lea esi, [esi+eax]
    dec ecx
    jnz near .hloops

    mov ebx, 0
    mov bx, word SUM
    sar ebx, 8
    imul ebx, ebx
    mov ecx, SQR
    sar ecx, 8
    sub ecx, ebx
    mov edi, [esp+PUSH_SIZE+20]   ; pMotionTexture
    mov [edi], cx                 ; to store uiMotionIndex
    mov ebx, 0
    mov bx, word SUM_CUR
    sar ebx, 8
    imul ebx, ebx
    mov ecx, SQR_CUR
    sar ecx, 8
    sub ecx, ebx
    mov [edi+2], cx               ; to store uiTextureIndex

    %undef SUM
    %undef SUM_CUR
    %undef SQR
    %undef SQR_CUR
    %undef PUSH_SIZE

    add esp, 16
    pop ebx
    pop edi
    pop esi

    ret



;*************************************************************************************************************
;void VAACalcSad_sse2( const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
;                      int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8 )
;*************************************************************************************************************
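; Note: the frame is walked in 16x16 blocks. For each block the four
; 8x8 SADs are written to psad8x8 (four dwords per block: top-left,
; top-right, bottom-left, bottom-right) while xmm7 collects the frame
; total for *psadframe.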


WELS_EXTERN VAACalcSad_sse2
%define         cur_data                        esp + pushsize + 4
%define         ref_data                        esp + pushsize + 8
%define         iPicWidth                       esp + pushsize + 12
%define         iPicHeight                      esp + pushsize + 16
%define         iPicStride                      esp + pushsize + 20
%define         psadframe                       esp + pushsize + 24
%define         psad8x8                         esp + pushsize + 28
%define         pushsize        12
    push  esi
    push  edi
    push  ebx
    mov           esi,    [cur_data]
    mov           edi,    [ref_data]
    mov           ebx,    [iPicStride]
    mov           edx,    [psad8x8]
    mov           eax,    ebx

    shr           dword [iPicWidth],      4     ; iPicWidth/16
    shr           dword [iPicHeight],     4     ; iPicHeight/16
    shl           eax,    4                     ; iPicStride*16
    pxor  xmm0,   xmm0
    pxor  xmm7,   xmm7            ; iFrameSad
height_loop:
    mov           ecx,    dword [iPicWidth]
    push  esi
    push  edi
width_loop:
    pxor  xmm6,   xmm6
    WELS_SAD_16x2_SSE2 esi,edi,ebx
    WELS_SAD_16x2_SSE2 esi,edi,ebx
    WELS_SAD_16x2_SSE2 esi,edi,ebx
    WELS_SAD_16x2_SSE2 esi,edi,ebx
    paddd xmm7,           xmm6
    movd  [edx],          xmm6
    psrldq        xmm6,           8
    movd  [edx+4],        xmm6

    pxor  xmm6,   xmm6
    WELS_SAD_16x2_SSE2 esi,edi,ebx
    WELS_SAD_16x2_SSE2 esi,edi,ebx
    WELS_SAD_16x2_SSE2 esi,edi,ebx
    WELS_SAD_16x2_SSE2 esi,edi,ebx
    paddd xmm7,           xmm6
    movd  [edx+8],        xmm6
    psrldq        xmm6,           8
    movd  [edx+12],       xmm6

    add           edx,    16
    sub           esi,    eax
    sub           edi,    eax
    add           esi,    16
    add           edi,    16

    dec           ecx
    jnz           width_loop

    pop           edi
    pop           esi
    add           esi,    eax
    add           edi,    eax

    dec   dword [iPicHeight]
    jnz           height_loop

    mov           edx,    [psadframe]
    movdqa        xmm5,   xmm7
    psrldq        xmm7,   8
    paddd xmm7,   xmm5
    movd  [edx],  xmm7

%undef          cur_data
%undef          ref_data
%undef          iPicWidth
%undef          iPicHeight
%undef          iPicStride
%undef          psadframe
%undef          psad8x8
%undef          pushsize
    pop           ebx
    pop           edi
    pop           esi
    ret

%else  ;64-bit

;***********************************************************************
;   void SampleVariance16x16_sse2( uint8_t * y_ref, int32_t y_ref_stride, uint8_t * y_src, int32_t y_src_stride, SMotionTextureUnit* pMotionTexture );
;***********************************************************************
WELS_EXTERN SampleVariance16x16_sse2
    %define SUM                   r10   ;[esp]
    %define SUM_CUR               r11   ;[esp+4]
    %define SQR                   r13   ;[esp+8]
    %define SQR_CUR               r15   ;[esp+12]

    push r12
    push r13
    push r14
    push r15
    %assign push_num 4
    LOAD_5_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1,r1d
    SIGN_EXTENSION r3,r3d

    mov r12,010h
    pxor xmm7, xmm7
    movq SUM, xmm7
    movq SUM_CUR,xmm7
    movq SQR,xmm7
    movq SQR_CUR,xmm7

.hloops:
    mov r14,0
    movdqa xmm0, [r0]             ; y_ref
    movdqa xmm1, [r2]             ; y_src
    movdqa xmm2, xmm0             ; store first for future process
    movdqa xmm3, xmm1
    ; sum += diff;
    movdqa xmm4, xmm0
    psadbw xmm4, xmm1             ; 2 parts, [0,..,15], [64,..,79]
    ; to be continued for sum
    pshufd xmm5, xmm4, 0C6h       ; 11000110 B
    paddw xmm4, xmm5
    movd r14d, xmm4
    add SUM, r14

    ; sqr += diff * diff;
    pmaxub xmm0, xmm1
    pminub xmm1, xmm2
    psubb xmm0, xmm1              ; diff
    SUM_SQR_SSE2 xmm1, xmm0, xmm7 ; dst, pSrc, zero
    movd r14d, xmm1
    add SQR, r14

    ; sum_cur += y_src[x];
    movdqa xmm0, xmm3             ; cur_orig
    movdqa xmm1, xmm0
    punpcklbw xmm0, xmm7
    punpckhbw xmm1, xmm7
    paddw xmm0, xmm1              ; 8x2
    SUM_WORD_8x2_SSE2 xmm0, xmm1
    movd r14d, xmm0
    and r14, 0ffffh
    add SUM_CUR, r14

    ; sqr_cur += y_src[x] * y_src[x];
    SUM_SQR_SSE2 xmm0, xmm3, xmm7 ; dst, pSrc, zero
    movd r14d, xmm0
    add SQR_CUR, r14

    lea r0, [r0+r1]
    lea r2, [r2+r3]
    dec r12
    jnz near .hloops

    mov r0, SUM
    sar r0, 8
    imul r0, r0
    mov r1, SQR
    sar r1, 8
    sub r1, r0
    mov [r4], r1w                 ; to store uiMotionIndex
    mov r0, SUM_CUR
    sar r0, 8
    imul r0, r0
    mov r1, SQR_CUR
    sar r1, 8
    sub r1, r0
    mov [r4+2], r1w               ; to store uiTextureIndex

    POP_XMM
    LOAD_5_PARA_POP
    pop r15
    pop r14
    pop r13
    pop r12

    %assign push_num 0

    ret


;*************************************************************************************************************
;void VAACalcSad_sse2( const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
;                      int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8 )
;*************************************************************************************************************


WELS_EXTERN VAACalcSad_sse2
%define         cur_data                        r0
%define         ref_data                        r1
%define         iPicWidth                       r2
%define         iPicHeight                      r3
%define         iPicStride                      r4
%define         psadframe                       r5
%define         psad8x8                         r6

    push r12
    push r13
    %assign push_num 2
    LOAD_7_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r2,r2d
    SIGN_EXTENSION r3,r3d
    SIGN_EXTENSION r4,r4d

    mov   r12,r4
    shr           r2,     4             ; iPicWidth/16
    shr           r3,     4             ; iPicHeight/16

    shl           r12,    4             ; iPicStride*16
    pxor  xmm0,   xmm0
    pxor  xmm7,   xmm7            ; iFrameSad
height_loop:
    mov           r13,    r2
    push  r0
    push  r1
width_loop:
    pxor  xmm6,   xmm6
    WELS_SAD_16x2_SSE2 r0,r1,r4
    WELS_SAD_16x2_SSE2 r0,r1,r4
    WELS_SAD_16x2_SSE2 r0,r1,r4
    WELS_SAD_16x2_SSE2 r0,r1,r4
    paddd xmm7,           xmm6
    movd  [r6],           xmm6
    psrldq        xmm6,           8
    movd  [r6+4],         xmm6

    pxor  xmm6,   xmm6
    WELS_SAD_16x2_SSE2 r0,r1,r4
    WELS_SAD_16x2_SSE2 r0,r1,r4
    WELS_SAD_16x2_SSE2 r0,r1,r4
    WELS_SAD_16x2_SSE2 r0,r1,r4
    paddd xmm7,           xmm6
    movd  [r6+8],         xmm6
    psrldq        xmm6,           8
    movd  [r6+12],        xmm6

    add           r6,     16
    sub           r0,     r12
    sub           r1,     r12
    add           r0,     16
    add           r1,     16

    dec           r13
    jnz           width_loop

    pop           r1
    pop           r0
    add           r0,     r12
    add           r1,     r12

    dec   r3
    jnz           height_loop

    ;mov          r13,    [psadframe]
    movdqa        xmm5,   xmm7
    psrldq        xmm7,   8
    paddd xmm7,   xmm5
    movd  [psadframe],    xmm7

%undef          cur_data
%undef          ref_data
%undef          iPicWidth
%undef          iPicHeight
%undef          iPicStride
%undef          psadframe
%undef          psad8x8
%undef          pushsize
    POP_XMM
    LOAD_7_PARA_POP
    pop r13
    pop r12
    %assign push_num 0
    ret

%endif


%ifdef X86_32
;*************************************************************************************************************
;void VAACalcSadVar_sse2( const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
;                         int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16 )
;*************************************************************************************************************
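; Note: same 16x16 walk as VAACalcSad_sse2, additionally emitting one
; dword per block into psum16x16 (sum of cur pixels) and psqsum16x16
; (sum of squared cur pixels) for the variance computation.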


WELS_EXTERN VAACalcSadVar_sse2
%define         localsize               8
%define         cur_data                        esp + pushsize + localsize + 4
%define         ref_data                        esp + pushsize + localsize + 8
%define         iPicWidth                       esp + pushsize + localsize + 12
%define         iPicHeight                      esp + pushsize + localsize + 16
%define         iPicStride                      esp + pushsize + localsize + 20
%define         psadframe                       esp + pushsize + localsize + 24
%define         psad8x8                         esp + pushsize + localsize + 28
%define         psum16x16                       esp + pushsize + localsize + 32
%define         psqsum16x16                     esp + pushsize + localsize + 36
%define         tmp_esi                         esp + 0
%define         tmp_edi                         esp + 4
%define         pushsize                16
    push  ebp
    push  esi
    push  edi
    push  ebx
    sub           esp,    localsize
    mov           esi,    [cur_data]
    mov           edi,    [ref_data]
    mov           ebx,    [iPicStride]
    mov           edx,    [psad8x8]
    mov           eax,    ebx

    shr           dword [iPicWidth],      4     ; iPicWidth/16
    shr           dword [iPicHeight],     4     ; iPicHeight/16
    shl           eax,    4                     ; iPicStride*16
    pxor  xmm0,   xmm0
    pxor  xmm7,   xmm7            ; iFrameSad
var_height_loop:
    mov           ecx,    dword [iPicWidth]
    mov           [tmp_esi],      esi
    mov           [tmp_edi],      edi
var_width_loop:
    pxor  xmm6,   xmm6            ; hiQuad_loQuad pSad8x8
    pxor  xmm5,   xmm5            ; pSum16x16
    pxor  xmm4,   xmm4            ; sqsum_16x16
    WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
    WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
    WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
    WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
    WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
    WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
    WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
    WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
    paddd xmm7,           xmm6
    movd  [edx],          xmm6
    psrldq        xmm6,           8
    movd  [edx+4],        xmm6

    pxor  xmm6,   xmm6
    WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
    WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
    WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
    WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
    WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
    WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
    WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
    WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
    paddd xmm7,           xmm6
    movd  [edx+8],        xmm6
    psrldq        xmm6,           8
    movd  [edx+12],       xmm6

    mov           ebp,    [psum16x16]
    movdqa        xmm1,   xmm5
    psrldq        xmm1,   8
    paddd xmm5,   xmm1
    movd  [ebp],  xmm5
    add           dword [psum16x16], 4

    movdqa        xmm5,   xmm4
    psrldq        xmm5,   8
    paddd xmm4,   xmm5
    movdqa        xmm3,   xmm4
    psrldq        xmm3,   4
    paddd xmm4,   xmm3

    mov           ebp,    [psqsum16x16]
    movd  [ebp],  xmm4
    add           dword [psqsum16x16], 4

    add           edx,    16
    sub           esi,    eax
    sub           edi,    eax
    add           esi,    16
    add           edi,    16

    dec           ecx
    jnz           var_width_loop

    mov           esi,    [tmp_esi]
    mov           edi,    [tmp_edi]
    add           esi,    eax
    add           edi,    eax

    dec   dword [iPicHeight]
    jnz           var_height_loop

    mov           edx,    [psadframe]
    movdqa        xmm5,   xmm7
    psrldq        xmm7,   8
    paddd xmm7,   xmm5
    movd  [edx],  xmm7

    add           esp,    localsize
    pop           ebx
    pop           edi
    pop           esi
    pop           ebp
%undef          cur_data
%undef          ref_data
%undef          iPicWidth
%undef          iPicHeight
%undef          iPicStride
%undef          psadframe
%undef          psad8x8
%undef          psum16x16
%undef          psqsum16x16
%undef          tmp_esi
%undef          tmp_edi
%undef          pushsize
%undef          localsize
    ret

%else  ;64-bit

;*************************************************************************************************************
;void VAACalcSadVar_sse2( const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
;                         int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16 )
;*************************************************************************************************************


WELS_EXTERN VAACalcSadVar_sse2
%define         cur_data                        arg1 ;r0
%define         ref_data                        arg2 ;r1
%define         iPicWidth                       arg3 ;r2
%define         iPicHeight                      arg4 ;r3
%define         iPicStride                      arg5
%define         psadframe                       arg6
%define         psad8x8                         arg7
%define         psum16x16                       arg8
%define         psqsum16x16                     arg9

    push r12
    push r13
    push r14
    push r15
    %assign push_num 4
    PUSH_XMM 8

%ifdef WIN64
    mov r4, arg5  ;iPicStride
    mov r5, arg6  ;psadframe
%endif
    mov r14,arg7
    SIGN_EXTENSION r2,r2d
    SIGN_EXTENSION r3,r3d
    SIGN_EXTENSION r4,r4d

    mov   r13,r4
    shr   r2,4
    shr   r3,4

    shl   r13,4   ; iPicStride*16
    pxor  xmm0,   xmm0
    pxor  xmm7,   xmm7            ; iFrameSad
var_height_loop:
    push    r2
    %assign push_num push_num+1
    mov           r11,    r0
    mov           r12,    r1
var_width_loop:
    pxor  xmm6,   xmm6            ; hiQuad_loQuad pSad8x8
    pxor  xmm5,   xmm5            ; pSum16x16
    pxor  xmm4,   xmm4            ; sqsum_16x16
    WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
    WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
    WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
    WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
    WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
    WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
    WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
    WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
    paddd xmm7,           xmm6
    movd  [r14],          xmm6
    psrldq        xmm6,           8
    movd  [r14+4],        xmm6

    pxor  xmm6,   xmm6
    WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
    WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
    WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
    WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
    WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
    WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
    WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
    WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
    paddd   xmm7,           xmm6
    movd    [r14+8],        xmm6
    psrldq  xmm6,           8
    movd    [r14+12],       xmm6

    mov             r15,    psum16x16
    movdqa  xmm1,   xmm5
    psrldq  xmm1,   8
    paddd   xmm5,   xmm1
    movd    [r15],  xmm5
    add             dword psum16x16, 4

    movdqa  xmm5,   xmm4
    psrldq  xmm5,   8
    paddd   xmm4,   xmm5
    movdqa  xmm3,   xmm4
    psrldq  xmm3,   4
    paddd   xmm4,   xmm3

    mov             r15,    psqsum16x16
    movd    [r15],  xmm4
    add             dword psqsum16x16, 4

    add             r14,16
    sub             r0,     r13
    sub             r1,     r13
    add             r0,     16
    add             r1,     16

    dec             r2
    jnz             var_width_loop

    pop     r2
    %assign push_num push_num-1
    mov             r0,     r11
    mov             r1,     r12
    add             r0,     r13
    add             r1,     r13
    dec     r3
    jnz             var_height_loop

    mov             r15,    psadframe
    movdqa  xmm5,   xmm7
    psrldq  xmm7,   8
    paddd   xmm7,   xmm5
    movd    [r15],  xmm7

    POP_XMM
    pop r15
    pop r14
    pop r13
    pop r12
%assign push_num 0
%undef          cur_data
%undef          ref_data
%undef          iPicWidth
%undef          iPicHeight
%undef          iPicStride
%undef          psadframe
%undef          psad8x8
%undef          psum16x16
%undef          psqsum16x16
%undef          tmp_esi
%undef          tmp_edi
%undef          pushsize
%undef          localsize
    ret

%endif

%ifdef X86_32

;*************************************************************************************************************
;void VAACalcSadSsd_sse2( const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
;                         int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16, int32_t *psqdiff16x16 )
;*************************************************************************************************************
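; Note: extends VAACalcSadVar_sse2 with one more per-16x16 output,
; psqdiff16x16, the sum of squared differences between cur and ref
; (accumulated by the SQDIFF variant of the row macro above).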


WELS_EXTERN VAACalcSadSsd_sse2
%define         localsize               12
%define         cur_data                        esp + pushsize + localsize + 4
%define         ref_data                        esp + pushsize + localsize + 8
%define         iPicWidth                       esp + pushsize + localsize + 12
%define         iPicHeight                      esp + pushsize + localsize + 16
%define         iPicStride                      esp + pushsize + localsize + 20
%define         psadframe                       esp + pushsize + localsize + 24
%define         psad8x8                         esp + pushsize + localsize + 28
%define         psum16x16                       esp + pushsize + localsize + 32
%define         psqsum16x16                     esp + pushsize + localsize + 36
%define         psqdiff16x16                    esp + pushsize + localsize + 40
%define         tmp_esi                         esp + 0
%define         tmp_edi                         esp + 4
%define         tmp_sadframe                    esp + 8
%define         pushsize                16
    push    ebp
    push    esi
    push    edi
    push    ebx
    sub             esp,    localsize

    mov             ecx,    [iPicWidth]
    mov             ecx,    [iPicHeight]
    mov             esi,    [cur_data]
    mov             edi,    [ref_data]
    mov             ebx,    [iPicStride]
    mov             edx,    [psad8x8]
    mov             eax,    ebx

    shr             dword [iPicWidth],      4   ; iPicWidth/16
    shr             dword [iPicHeight],     4   ; iPicHeight/16
    shl             eax,    4                   ; iPicStride*16
    mov             ecx,    [iPicWidth]
    mov             ecx,    [iPicHeight]
    pxor    xmm0,   xmm0
    movd    [tmp_sadframe], xmm0
sqdiff_height_loop:
    mov             ecx,    dword [iPicWidth]
    mov             [tmp_esi],      esi
    mov             [tmp_edi],      edi
sqdiff_width_loop:
    pxor    xmm7,   xmm7            ; hiQuad_loQuad pSad8x8
    pxor    xmm6,   xmm6            ; pSum16x16
    pxor    xmm5,   xmm5            ; sqsum_16x16  four dword
    pxor    xmm4,   xmm4            ; sqdiff_16x16  four dword
    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
    movdqa  xmm1,           xmm7
    movd    [edx],          xmm7
    psrldq  xmm7,           8
    paddd   xmm1,           xmm7
    movd    [edx+4],        xmm7
    movd    ebp,            xmm1
    add             [tmp_sadframe], ebp

    pxor    xmm7,   xmm7
    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
    movdqa  xmm1,           xmm7
    movd    [edx+8],        xmm7
    psrldq  xmm7,           8
    paddd   xmm1,           xmm7
    movd    [edx+12],       xmm7
    movd    ebp,            xmm1
    add             [tmp_sadframe], ebp

    mov             ebp,    [psum16x16]
    movdqa  xmm1,   xmm6
    psrldq  xmm1,   8
    paddd   xmm6,   xmm1
    movd    [ebp],  xmm6
    add             dword [psum16x16], 4

    mov             ebp,    [psqsum16x16]
    pshufd  xmm6,   xmm5,   14 ; 00001110 B
    paddd   xmm6,   xmm5
    pshufd  xmm5,   xmm6,   1  ; 00000001 B
    paddd   xmm5,   xmm6
    movd    [ebp],  xmm5
    add             dword [psqsum16x16], 4

    mov             ebp,    [psqdiff16x16]
    pshufd  xmm5,   xmm4,   14      ; 00001110 B
    paddd   xmm5,   xmm4
    pshufd  xmm4,   xmm5,   1       ; 00000001 B
    paddd   xmm4,   xmm5
    movd    [ebp],  xmm4
    add             dword   [psqdiff16x16], 4

    add             edx,    16
    sub             esi,    eax
    sub             edi,    eax
    add             esi,    16
    add             edi,    16

    dec             ecx
    jnz             sqdiff_width_loop

    mov             esi,    [tmp_esi]
    mov             edi,    [tmp_edi]
    add             esi,    eax
    add             edi,    eax

    dec     dword [iPicHeight]
    jnz             sqdiff_height_loop

    mov             ebx,    [tmp_sadframe]
    mov             eax,    [psadframe]
    mov             [eax],  ebx

    add             esp,    localsize
    pop             ebx
    pop             edi
    pop             esi
    pop             ebp
%undef          cur_data
%undef          ref_data
%undef          iPicWidth
%undef          iPicHeight
%undef          iPicStride
%undef          psadframe
%undef          psad8x8
%undef          psum16x16
%undef          psqsum16x16
%undef          psqdiff16x16
%undef          tmp_esi
%undef          tmp_edi
%undef          tmp_sadframe
%undef          pushsize
%undef          localsize
    ret

%else


;*************************************************************************************************************
;void VAACalcSadSsd_sse2( const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
;                         int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16, int32_t *psqdiff16x16 )
;*************************************************************************************************************


WELS_EXTERN VAACalcSadSsd_sse2
%define         localsize               12
%define         cur_data                        arg1 ;r0
%define         ref_data                        arg2 ;r1
%define         iPicWidth                       arg3 ;r2
%define         iPicHeight                      arg4 ;r3
%define         iPicStride                      arg5
%define         psadframe                       arg6
%define         psad8x8                         arg7
%define         psum16x16                       arg8
%define         psqsum16x16                     arg9
%define         psqdiff16x16                    arg10

    push r12
    push r13
    push r14
    push r15
    %assign push_num 4
    PUSH_XMM 10

%ifdef WIN64
    mov r4,arg5
%endif
    mov r14,arg7
    SIGN_EXTENSION r2,r2d
    SIGN_EXTENSION r3,r3d
    SIGN_EXTENSION r4,r4d

    mov     r13,r4
    shr     r2,4    ; iPicWidth/16
    shr     r3,4    ; iPicHeight/16
    shl     r13,4   ; iPicStride*16
    pxor    xmm0,   xmm0
    pxor    xmm8,   xmm8  ;framesad
    pxor    xmm9,   xmm9
sqdiff_height_loop:
    ;mov            ecx,    dword [iPicWidth]
    ;mov            r14,r2
    push r2
    %assign push_num push_num +1
    mov             r10,    r0
    mov             r11,    r1
sqdiff_width_loop:
    pxor    xmm7,   xmm7            ; hiQuad_loQuad pSad8x8
    pxor    xmm6,   xmm6            ; pSum16x16
    pxor    xmm5,   xmm5            ; sqsum_16x16  four dword
    pxor    xmm4,   xmm4            ; sqdiff_16x16  four dword
    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
    movdqa  xmm1,           xmm7
    movd    [r14],          xmm7
    psrldq  xmm7,           8
    paddd   xmm1,           xmm7
    movd    [r14+4],        xmm7
    movd    r15d,           xmm1
    movd    xmm9, r15d
    paddd   xmm8,xmm9


    pxor    xmm7,   xmm7
    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
    movdqa  xmm1,           xmm7
    movd    [r14+8],        xmm7
    psrldq  xmm7,           8
    paddd   xmm1,           xmm7
    movd    [r14+12],       xmm7
    movd    r15d,           xmm1
    movd    xmm9, r15d
    paddd   xmm8,xmm9

    mov             r15,    psum16x16
    movdqa  xmm1,   xmm6
    psrldq  xmm1,   8
    paddd   xmm6,   xmm1
    movd    [r15],  xmm6
    add             dword psum16x16, 4

    mov             r15,    psqsum16x16
    pshufd  xmm6,   xmm5,   14 ; 00001110 B
    paddd   xmm6,   xmm5
    pshufd  xmm5,   xmm6,   1  ; 00000001 B
    paddd   xmm5,   xmm6
    movd    [r15],  xmm5
    add             dword psqsum16x16, 4

    mov             r15,    psqdiff16x16
    pshufd  xmm5,   xmm4,   14      ; 00001110 B
    paddd   xmm5,   xmm4
    pshufd  xmm4,   xmm5,   1       ; 00000001 B
    paddd   xmm4,   xmm5
    movd    [r15],  xmm4
    add             dword   psqdiff16x16,   4

    add             r14,16
    sub             r0,     r13
    sub             r1,     r13
    add             r0,     16
    add             r1,     16

    dec             r2
    jnz             sqdiff_width_loop

    pop r2
    %assign push_num push_num -1

    mov             r0,     r10
    mov             r1,     r11
    add             r0,     r13
    add             r1,     r13

    dec     r3
    jnz             sqdiff_height_loop

    mov             r13,    psadframe
    movd    [r13],  xmm8

    POP_XMM
    pop r15
    pop r14
    pop r13
    pop r12
    %assign push_num 0

%undef          cur_data
%undef          ref_data
%undef          iPicWidth
%undef          iPicHeight
%undef          iPicStride
%undef          psadframe
%undef          psad8x8
%undef          psum16x16
%undef          psqsum16x16
%undef          psqdiff16x16
%undef          tmp_esi
%undef          tmp_edi
%undef          tmp_sadframe
%undef          pushsize
%undef          localsize
    ret



%endif

%ifdef X86_32
;*************************************************************************************************************
;void VAACalcSadBgd_sse2( const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
;                         int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *p_sd8x8, uint8_t *p_mad8x8 )
;*************************************************************************************************************
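; Note: background-detection variant. Per 8x8 block it emits the SAD
; (psad8x8), the signed sum difference sum(cur) - sum(ref) (p_sd8x8),
; and the maximum absolute pixel difference, MAD (p_mad8x8, one byte
; per block); psadframe again receives the whole-frame SAD.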


WELS_EXTERN VAACalcSadBgd_sse2
%define         localsize               12
%define         cur_data                        esp + pushsize + localsize + 4
%define         ref_data                        esp + pushsize + localsize + 8
%define         iPicWidth                       esp + pushsize + localsize + 12
%define         iPicHeight                      esp + pushsize + localsize + 16
%define         iPicStride                      esp + pushsize + localsize + 20
%define         psadframe                       esp + pushsize + localsize + 24
%define         psad8x8                         esp + pushsize + localsize + 28
%define         p_sd8x8                         esp + pushsize + localsize + 32
%define         p_mad8x8                        esp + pushsize + localsize + 36
%define         tmp_esi                         esp + 0
%define         tmp_edi                         esp + 4
%define         tmp_ecx                         esp + 8
%define         pushsize                16
    push    ebp
    push    esi
    push    edi
    push    ebx
    sub             esp,    localsize
    mov             esi,    [cur_data]
    mov             edi,    [ref_data]
    mov             ebx,    [iPicStride]
    mov             eax,    ebx

    shr             dword [iPicWidth],      4   ; iPicWidth/16
    shr             dword [iPicHeight],     4   ; iPicHeight/16
    shl             eax,    4                   ; iPicStride*16
    xor             ebp,    ebp
    pxor    xmm0,   xmm0
bgd_height_loop:
    mov             ecx,    dword [iPicWidth]
    mov             [tmp_esi],      esi
    mov             [tmp_edi],      edi
bgd_width_loop:
    pxor    xmm7,   xmm7            ; pSad8x8
    pxor    xmm6,   xmm6            ; sum_cur_8x8
    pxor    xmm5,   xmm5            ; sum_ref_8x8
    pxor    xmm4,   xmm4            ; pMad8x8
    WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4, esi, edi, ebx
    WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4, esi, edi, ebx
    WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4, esi, edi, ebx
    WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4, esi, edi, ebx
    WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4, esi, edi, ebx
    WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4, esi, edi, ebx
    WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4, esi, edi, ebx
    WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4, esi, edi, ebx


    mov                     edx,            [p_mad8x8]
    WELS_MAX_REG_SSE2       xmm4

    ;movdqa         xmm1,   xmm4
    ;punpcklbw      xmm1,   xmm0
    ;punpcklwd      xmm1,   xmm0
    ;movd           [edx],  xmm1
    ;punpckhbw      xmm4,   xmm0
    ;punpcklwd      xmm4,   xmm0
    ;movd           [edx+4],        xmm4
    ;add            edx,            8
    ;mov            [p_mad8x8],     edx
    mov                     [tmp_ecx],      ecx
    movhlps         xmm1,   xmm4
    movd            ecx,    xmm4
    mov                     [edx],  cl
    movd            ecx,    xmm1
    mov                     [edx+1],cl
    add                     edx,    2
    mov                     [p_mad8x8],     edx


    pslldq          xmm7,   4
    pslldq          xmm6,   4
    pslldq          xmm5,   4


    pxor    xmm4,   xmm4            ; pMad8x8
    WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4, esi, edi, ebx
    WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4, esi, edi, ebx
    WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4, esi, edi, ebx
    WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4, esi, edi, ebx
    WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4, esi, edi, ebx
    WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4, esi, edi, ebx
    WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4, esi, edi, ebx
    WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4, esi, edi, ebx

    mov                     edx,            [p_mad8x8]
    WELS_MAX_REG_SSE2       xmm4

    ;movdqa         xmm1,   xmm4
    ;punpcklbw      xmm1,   xmm0
    ;punpcklwd      xmm1,   xmm0
    ;movd           [edx],  xmm1
    ;punpckhbw      xmm4,   xmm0
    ;punpcklwd      xmm4,   xmm0
    ;movd           [edx+4],        xmm4
    ;add            edx,            8
    ;mov            [p_mad8x8],     edx
    movhlps         xmm1,   xmm4
    movd            ecx,    xmm4
    mov                     [edx],  cl
    movd            ecx,    xmm1
    mov                     [edx+1],cl
    add                     edx,    2
    mov                     [p_mad8x8],     edx

    ; data in xmm7, xmm6, xmm5:  D1 D3 D0 D2

    mov             edx,    [psad8x8]
    pshufd  xmm1,   xmm7,   10001101b               ; D3 D2 D1 D0
    movdqa  [edx],  xmm1
    add             edx,    16
    mov             [psad8x8],      edx                     ; sad8x8

    paddd   xmm1,   xmm7                                    ; D1+3 D3+2 D0+1 D2+0
    pshufd  xmm2,   xmm1,   00000011b
    paddd   xmm1,   xmm2
    movd    edx,    xmm1
    add             ebp,    edx                             ; sad frame

    mov             edx,    [p_sd8x8]
    psubd   xmm6,   xmm5
    pshufd  xmm1,   xmm6,   10001101b
    movdqa  [edx],  xmm1
    add             edx,    16
    mov             [p_sd8x8],      edx


    add             edx,    16
    sub             esi,    eax
    sub             edi,    eax
    add             esi,    16
    add             edi,    16

    mov             ecx,    [tmp_ecx]
    dec             ecx
    jnz             bgd_width_loop

    mov             esi,    [tmp_esi]
    mov             edi,    [tmp_edi]
    add             esi,    eax
    add             edi,    eax

    dec             dword [iPicHeight]
    jnz             bgd_height_loop

    mov             edx,    [psadframe]
    mov             [edx],  ebp

    add             esp,    localsize
    pop             ebx
    pop             edi
    pop             esi
    pop             ebp
%undef          cur_data
%undef          ref_data
%undef          iPicWidth
%undef          iPicHeight
%undef          iPicStride
%undef          psadframe
%undef          psad8x8
%undef          p_sd8x8
%undef          p_mad8x8
%undef          tmp_esi
%undef          tmp_edi
%undef          pushsize
%undef          localsize
    ret



;*************************************************************************************************************
;void VAACalcSadSsdBgd_sse2( const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
;                            int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16,
;                            int32_t *psqdiff16x16, int32_t *p_sd8x8, uint8_t *p_mad8x8 )
;*************************************************************************************************************
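; Note: the all-in-one variant, combining the SSD outputs (psum16x16,
; psqsum16x16, psqdiff16x16) with the BGD outputs (p_sd8x8, p_mad8x8)
; in a single pass over the frame, using the interleaved accumulators
; produced by WELS_SAD_BGD_SQDIFF_16x1_SSE2.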


WELS_EXTERN VAACalcSadSsdBgd_sse2
%define         localsize               16
%define         cur_data                        esp + pushsize + localsize + 4
%define         ref_data                        esp + pushsize + localsize + 8
%define         iPicWidth                       esp + pushsize + localsize + 12
%define         iPicHeight                      esp + pushsize + localsize + 16
%define         iPicStride                      esp + pushsize + localsize + 20
%define         psadframe                       esp + pushsize + localsize + 24
%define         psad8x8                         esp + pushsize + localsize + 28
%define         psum16x16                       esp + pushsize + localsize + 32
%define         psqsum16x16                     esp + pushsize + localsize + 36
%define         psqdiff16x16                    esp + pushsize + localsize + 40
%define         p_sd8x8                         esp + pushsize + localsize + 44
%define         p_mad8x8                        esp + pushsize + localsize + 48
%define         tmp_esi                         esp + 0
%define         tmp_edi                         esp + 4
%define         tmp_sadframe                    esp + 8
%define         tmp_ecx                         esp + 12
%define         pushsize                16
    push    ebp
    push    esi
    push    edi
    push    ebx
    sub             esp,    localsize
    mov             esi,    [cur_data]
    mov             edi,    [ref_data]
    mov             ebx,    [iPicStride]
    mov             eax,    ebx

    shr             dword [iPicWidth],      4   ; iPicWidth/16
    shr             dword [iPicHeight],     4   ; iPicHeight/16
    shl             eax,    4                   ; iPicStride*16
    pxor    xmm0,   xmm0
    movd    [tmp_sadframe], xmm0
sqdiff_bgd_height_loop:
    mov             ecx,    dword [iPicWidth]
    mov             [tmp_esi],      esi
    mov             [tmp_edi],      edi
sqdiff_bgd_width_loop:
    pxor    xmm7,   xmm7            ; pSad8x8 interleaves sqsum16x16:  sqsum1 sad1 sqsum0 sad0
    pxor    xmm6,   xmm6            ; sum_8x8 interleaves cur and pRef in Dword,  Sref1 Scur1 Sref0 Scur0
    pxor    xmm5,   xmm5            ; pMad8x8
    pxor    xmm4,   xmm4            ; sqdiff_16x16  four Dword
    WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi, edi, ebx
    WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi, edi, ebx
    WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi, edi, ebx
    WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi, edi, ebx
    WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi, edi, ebx
    WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi, edi, ebx
    WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi, edi, ebx
    WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi, edi, ebx

    mov             edx,            [psad8x8]
    movdqa  xmm2,           xmm7
    pshufd  xmm1,           xmm2,           00001110b
    movd    [edx],          xmm2
    movd    [edx+4],        xmm1
    add             edx,            8
    mov             [psad8x8],      edx                     ; sad8x8

    paddd   xmm1,                           xmm2
    movd    edx,                            xmm1
    add             [tmp_sadframe],         edx             ; iFrameSad

    mov             edx,            [psum16x16]
    movdqa  xmm1,           xmm6
    pshufd  xmm2,           xmm1,           00001110b
    paddd   xmm1,           xmm2
    movd    [edx],          xmm1                            ; sum

    mov             edx,            [p_sd8x8]
    pshufd  xmm1,           xmm6,           11110101b       ; Sref1 Sref1 Sref0 Sref0
    psubd   xmm6,           xmm1            ; 00 diff1 00 diff0
    pshufd  xmm1,           xmm6,           00001000b       ; xx xx diff1 diff0
    movq    [edx],          xmm1
    add             edx,            8
    mov             [p_sd8x8],      edx

    mov                     edx,            [p_mad8x8]
    WELS_MAX_REG_SSE2       xmm5
    ;movdqa         xmm1,   xmm5
    ;punpcklbw      xmm1,   xmm0
    ;punpcklwd      xmm1,   xmm0
    ;movd           [edx],  xmm1
    ;punpckhbw      xmm5,   xmm0
    ;punpcklwd      xmm5,   xmm0
    ;movd           [edx+4],        xmm5
    ;add            edx,            8
    ;mov            [p_mad8x8],     edx
    mov                     [tmp_ecx],      ecx
    movhlps         xmm1,   xmm5
    movd            ecx,    xmm5
    mov                     [edx],  cl
    movd            ecx,    xmm1
    mov                     [edx+1],cl
    add                     edx,    2
    mov                     [p_mad8x8],     edx

    psrlq   xmm7,   32
    psllq   xmm7,   32                      ; clear sad
    pxor    xmm6,   xmm6            ; sum_8x8 interleaves cur and pRef in Dword,  Sref1 Scur1 Sref0 Scur0
    pxor    xmm5,   xmm5            ; pMad8x8
    WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi, edi, ebx
    WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi, edi, ebx
    WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi, edi, ebx
    WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi, edi, ebx
    WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi, edi, ebx
    WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi, edi, ebx
    WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi, edi, ebx
    WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi, edi, ebx

    mov             edx,            [psad8x8]
    movdqa  xmm2,           xmm7
    pshufd  xmm1,           xmm2,           00001110b
    movd    [edx],          xmm2
    movd    [edx+4],        xmm1
    add             edx,            8
    mov             [psad8x8],      edx                     ; sad8x8

    paddd   xmm1,                           xmm2
    movd    edx,                            xmm1
    add             [tmp_sadframe],         edx             ; iFrameSad

    mov             edx,                    [psum16x16]
    movdqa  xmm1,                   xmm6
    pshufd  xmm2,                   xmm1,           00001110b
    paddd   xmm1,                   xmm2
    movd    ebp,                    xmm1                    ; sum
    add             [edx],                  ebp
    add             edx,                    4
    mov             [psum16x16],    edx

    mov             edx,                    [psqsum16x16]
    psrlq   xmm7,                   32
    pshufd  xmm2,                   xmm7,           00001110b
    paddd   xmm2,                   xmm7
    movd    [edx],                  xmm2                    ; sqsum
    add             edx,                    4
    mov             [psqsum16x16],  edx

    mov             edx,            [p_sd8x8]
    pshufd  xmm1,           xmm6,           11110101b       ; Sref1 Sref1 Sref0 Sref0
    psubd   xmm6,           xmm1            ; 00 diff1 00 diff0
    pshufd  xmm1,           xmm6,           00001000b       ; xx xx diff1 diff0
    movq    [edx],          xmm1
    add             edx,            8
    mov             [p_sd8x8],      edx

    mov             edx,            [p_mad8x8]
    WELS_MAX_REG_SSE2       xmm5
    ;movdqa         xmm1,   xmm5
    ;punpcklbw      xmm1,   xmm0
    ;punpcklwd      xmm1,   xmm0
    ;movd           [edx],  xmm1
    ;punpckhbw      xmm5,   xmm0
    ;punpcklwd      xmm5,   xmm0
    ;movd           [edx+4],        xmm5
    ;add            edx,            8
    ;mov            [p_mad8x8],     edx
    movhlps         xmm1,   xmm5
    movd            ecx,    xmm5
    mov                     [edx],  cl
    movd            ecx,    xmm1
    mov                     [edx+1],cl
    add                     edx,    2
    mov                     [p_mad8x8],     edx

    mov             edx,            [psqdiff16x16]
    pshufd  xmm1,           xmm4,           00001110b
    paddd   xmm4,           xmm1
    pshufd  xmm1,           xmm4,           00000001b
    paddd   xmm4,           xmm1
    movd    [edx],          xmm4
    add             edx,            4
    mov             [psqdiff16x16], edx

    add             edx,    16
    sub             esi,    eax
    sub             edi,    eax
    add             esi,    16
    add             edi,    16

    mov             ecx,    [tmp_ecx]
    dec             ecx
    jnz             sqdiff_bgd_width_loop
1600
1601    mov             esi,    [tmp_esi]
1602    mov             edi,    [tmp_edi]
1603    add             esi,    eax
1604    add             edi,    eax
1605
1606    dec     dword [iPicHeight]
1607    jnz             sqdiff_bgd_height_loop
1608
1609    mov             edx,    [psadframe]
1610    mov             ebp,    [tmp_sadframe]
1611    mov             [edx],  ebp
1612
1613    add             esp,    localsize
1614    pop             ebx
1615    pop             edi
1616    pop             esi
1617    pop             ebp
1618%undef          cur_data
1619%undef          ref_data
1620%undef          iPicWidth
1621%undef          iPicHeight
1622%undef          iPicStride
1623%undef          psadframe
1624%undef          psad8x8
1625%undef          psum16x16
1626%undef          psqsum16x16
1627%undef          psqdiff16x16
1628%undef          p_sd8x8
1629%undef          p_mad8x8
1630%undef          tmp_esi
1631%undef          tmp_edi
1632%undef          pushsize
1633%undef          localsize
1634    ret
1635%else
1636
1637;*************************************************************************************************************
1638;void VAACalcSadBgd_sse2(const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
1639;                               int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *p_sd8x8, uint8_t *p_mad8x8)
1640;*************************************************************************************************************
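;
; Reference model (commentary only, not assembled): a plain-C sketch of what
; this routine computes, inferred from the parameter names and the store
; pattern below.  The 8x8 raster traversal/output order shown here is an
; assumption; the codec's C fallback is the authoritative definition.
;
;   #include <stdint.h>
;   #include <stdlib.h>
;   static void VAACalcSadBgd_ref(const uint8_t *cur_data, const uint8_t *ref_data,
;                                 int32_t iPicWidth, int32_t iPicHeight, int32_t iPicStride,
;                                 int32_t *psadframe, int32_t *psad8x8,
;                                 int32_t *p_sd8x8, uint8_t *p_mad8x8) {
;       int32_t frame_sad = 0, idx = 0;
;       for (int32_t y = 0; y < iPicHeight; y += 8) {
;           for (int32_t x = 0; x < iPicWidth; x += 8, ++idx) {
;               int32_t sad = 0, sum_cur = 0, sum_ref = 0, mad = 0;
;               for (int32_t j = 0; j < 8; ++j) {
;                   const uint8_t *c = &cur_data[(y + j) * iPicStride + x];
;                   const uint8_t *r = &ref_data[(y + j) * iPicStride + x];
;                   for (int32_t i = 0; i < 8; ++i) {
;                       int32_t d = c[i] - r[i];
;                       sad += abs(d);                  // -> psad8x8 / psadframe
;                       sum_cur += c[i];                // -> p_sd8x8 (cur side)
;                       sum_ref += r[i];                // -> p_sd8x8 (ref side)
;                       if (abs(d) > mad) mad = abs(d); // -> p_mad8x8
;                   }
;               }
;               psad8x8[idx] = sad;
;               p_sd8x8[idx] = sum_cur - sum_ref;
;               p_mad8x8[idx] = (uint8_t)mad;
;               frame_sad += sad;
;           }
;       }
;       *psadframe = frame_sad;
;   }
;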


WELS_EXTERN VAACalcSadBgd_sse2
%define         cur_data                        arg1;
%define         ref_data                        arg2;
%define         iPicWidth                       arg3;
%define         iPicHeight                      arg4;
%define         iPicStride                      arg5;
%define         psadframe                       arg6;
%define         psad8x8                         arg7;
%define         p_sd8x8                         arg8;
%define         p_mad8x8                        arg9;

    push r12
    push r13
    push r14
    push r15
%assign push_num 4
    PUSH_XMM 10
%ifdef WIN64
    mov r4,arg5
    ;  mov r5,arg6
%endif
    mov r14,arg7
    SIGN_EXTENSION r2,r2d
    SIGN_EXTENSION r3,r3d
    SIGN_EXTENSION r4,r4d


    mov     r13,r4
    mov     r15,r0
    shr     r2,4
    shr     r3,4
    shl     r13,4
    pxor    xmm0,   xmm0
    pxor    xmm8,   xmm8
    pxor    xmm9,   xmm9
bgd_height_loop:
    ;mov            ecx,    dword [iPicWidth]
    push r2
    %assign push_num push_num+1
    mov             r10,    r15
    mov             r11,    r1
bgd_width_loop:
    pxor    xmm7,   xmm7            ; pSad8x8
    pxor    xmm6,   xmm6            ; sum_cur_8x8
    pxor    xmm5,   xmm5            ; sum_ref_8x8
    pxor    xmm4,   xmm4            ; pMad8x8
    WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
    WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
    WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
    WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
    WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
    WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
    WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
    WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4


    mov                     r14,            p_mad8x8
    WELS_MAX_REG_SSE2       xmm4

    ;mov                    [tmp_ecx],      ecx
    movhlps         xmm1,   xmm4
    movd            r0d,    xmm4


    mov                     [r14],  r0b
    movd            r0d,    xmm1
    mov                     [r14+1],r0b
    add                     r14,    2
    ;mov                     p_mad8x8,       r14


    pslldq          xmm7,   4
    pslldq          xmm6,   4
    pslldq          xmm5,   4


    pxor    xmm4,   xmm4            ; pMad8x8
    WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
    WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
    WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
    WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
    WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
    WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
    WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
    WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4

    ;mov                     r14,            [p_mad8x8]
    WELS_MAX_REG_SSE2       xmm4

    movhlps         xmm1,   xmm4
    movd            r0d,    xmm4
    mov                     [r14],  r0b
    movd            r0d,    xmm1
    mov                     [r14+1],r0b
    add                     r14,    2
    mov                     p_mad8x8,       r14

    ; data in xmm7, xmm6, xmm5:  D1 D3 D0 D2

    mov             r14,    psad8x8
    pshufd  xmm1,   xmm7,   10001101b               ; D3 D2 D1 D0
    movdqa  [r14],  xmm1
    add             r14,    16
    mov             psad8x8,        r14                                     ; sad8x8

    paddd   xmm1,   xmm7                                    ; D1+3 D3+2 D0+1 D2+0
    pshufd  xmm2,   xmm1,   00000011b
    paddd   xmm1,   xmm2
    movd    r14d,   xmm1
    movd    xmm9, r14d
    paddd   xmm8,   xmm9                                            ; sad frame

    mov             r14,    p_sd8x8
    psubd   xmm6,   xmm5
    pshufd  xmm1,   xmm6,   10001101b
    movdqa  [r14],  xmm1
    add             r14,    16
    mov             p_sd8x8,        r14


    ;add            edx,    16
    sub             r15,    r13
    sub             r1,     r13
    add             r15,    16
    add             r1,     16


    dec             r2
    jnz             bgd_width_loop
    pop     r2
%assign push_num push_num-1
    mov             r15,    r10
    mov             r1,     r11
    add             r15,    r13
    add             r1,     r13

    dec             r3
    jnz             bgd_height_loop

    mov             r13,    psadframe
    movd    [r13],  xmm8

    POP_XMM
    pop r15
    pop r14
    pop r13
    pop r12
%assign push_num 0
%undef          cur_data
%undef          ref_data
%undef          iPicWidth
%undef          iPicHeight
%undef          iPicStride
%undef          psadframe
%undef          psad8x8
%undef          p_sd8x8
%undef          p_mad8x8
%undef          tmp_esi
%undef          tmp_edi
%undef          pushsize
%undef          localsize
    ret



;*************************************************************************************************************
;void VAACalcSadSsdBgd_sse2(const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
;                int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16,
;                       int32_t *psqdiff16x16, int32_t *p_sd8x8, uint8_t *p_mad8x8)
;*************************************************************************************************************
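;
; Commentary-only C sketch of the additional per-16x16 outputs of the Ssd
; variant (sum, squared sum, squared difference); the SAD/SD/MAD outputs
; follow the Bgd sketch above.  Semantics inferred from the stores below.
;
;   #include <stdint.h>
;   static void Mb16x16Stats_ref(const uint8_t *cur, const uint8_t *ref, int32_t stride,
;                                int32_t *sum, int32_t *sqsum, int32_t *sqdiff) {
;       int32_t s = 0, ss = 0, sd = 0;
;       for (int32_t j = 0; j < 16; ++j, cur += stride, ref += stride) {
;           for (int32_t i = 0; i < 16; ++i) {
;               int32_t d = cur[i] - ref[i];
;               s  += cur[i];          // -> psum16x16
;               ss += cur[i] * cur[i]; // -> psqsum16x16
;               sd += d * d;           // -> psqdiff16x16
;           }
;       }
;       *sum = s; *sqsum = ss; *sqdiff = sd;
;   }
;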


WELS_EXTERN VAACalcSadSsdBgd_sse2
%define         cur_data                        arg1;
%define         ref_data                        arg2;
%define         iPicWidth                       arg3;
%define         iPicHeight                      arg4;
%define         iPicStride                      arg5;
%define         psadframe                       arg6;
%define         psad8x8                         arg7;
%define         psum16x16                       arg8;
%define         psqsum16x16                     arg9;
%define         psqdiff16x16                    arg10;
%define         p_sd8x8                         arg11
%define         p_mad8x8                        arg12

    push r12
    push r13
    push r14
    push r15
%assign push_num 4
    PUSH_XMM 10
%ifdef WIN64
    mov r4,arg5
    ;mov r5,arg6
%endif
    SIGN_EXTENSION r2,r2d
    SIGN_EXTENSION r3,r3d
    SIGN_EXTENSION r4,r4d

    mov     r13,r4
    shr             r2,     4                                       ; iPicWidth/16
    shr             r3,     4                                       ; iPicHeight/16
    shl             r13,    4                                                       ; iPicStride*16
    pxor    xmm0,   xmm0
    pxor    xmm8,   xmm8
    pxor    xmm9,   xmm9


sqdiff_bgd_height_loop:
    mov             r10,    r0
    mov             r11,    r1
    push r2
%assign push_num push_num+1
sqdiff_bgd_width_loop:

    pxor    xmm7,   xmm7            ; pSad8x8 interleaves sqsum16x16:  sqsum1 sad1 sqsum0 sad0
    pxor    xmm6,   xmm6            ; sum_8x8 interleaves cur and pRef in Dword,  Sref1 Scur1 Sref0 Scur0
    pxor    xmm5,   xmm5            ; pMad8x8
    pxor    xmm4,   xmm4            ; sqdiff_16x16  four Dword
    WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
    WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
    WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
    WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
    WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
    WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
    WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
    WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4

    mov             r14,            psad8x8
    movdqa  xmm2,           xmm7
    pshufd  xmm1,           xmm2,           00001110b
    movd    [r14],          xmm2
    movd    [r14+4],        xmm1
    add             r14,            8
    mov             psad8x8,        r14                     ; sad8x8

    paddd   xmm1,                           xmm2
    movd    r14d,                           xmm1
    movd    xmm9,r14d
    paddd           xmm8,           xmm9                    ; iFrameSad

    mov             r14,            psum16x16
    movdqa  xmm1,           xmm6
    pshufd  xmm2,           xmm1,           00001110b
    paddd   xmm1,           xmm2
    movd    [r14],          xmm1                            ; sum

    mov             r14,            p_sd8x8
    pshufd  xmm1,           xmm6,           11110101b                       ; Sref1 Sref1 Sref0 Sref0
    psubd   xmm6,           xmm1            ; 00 diff1 00 diff0
    pshufd  xmm1,           xmm6,           00001000b                       ;  xx xx diff1 diff0
    movq    [r14],          xmm1
    add             r14,            8
    mov             p_sd8x8,        r14

    mov                     r14,            p_mad8x8
    WELS_MAX_REG_SSE2       xmm5

    movhlps         xmm1,   xmm5
    push r0
    movd            r0d,    xmm5
    mov                     [r14],  r0b
    movd            r0d,    xmm1
    mov                     [r14+1],r0b
    pop r0
    add                     r14,    2
    mov                     p_mad8x8,       r14

    psrlq   xmm7,   32
    psllq   xmm7,   32                      ; clear sad
    pxor    xmm6,   xmm6            ; sum_8x8 interleaves cur and pRef in Dword,  Sref1 Scur1 Sref0 Scur0
    pxor    xmm5,   xmm5            ; pMad8x8
    WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
    WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
    WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
    WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
    WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
    WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
    WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
    WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4

    mov             r14,            psad8x8
    movdqa  xmm2,           xmm7
    pshufd  xmm1,           xmm2,           00001110b
    movd    [r14],          xmm2
    movd    [r14+4],        xmm1
    add             r14,            8
    mov             psad8x8,        r14                     ; sad8x8

    paddd   xmm1,                           xmm2
    movd    r14d,                           xmm1
    movd    xmm9, r14d
    paddd   xmm8,           xmm9            ; iFrameSad

    mov             r14,                    psum16x16
    movdqa  xmm1,                   xmm6
    pshufd  xmm2,                   xmm1,           00001110b
    paddd   xmm1,                   xmm2
    movd    r15d,                   xmm1                            ; sum
    add             [r14],                  r15d
    add             r14,                    4
    mov             psum16x16,      r14

    mov             r14,                    psqsum16x16
    psrlq   xmm7,                   32
    pshufd  xmm2,                   xmm7,           00001110b
    paddd   xmm2,                   xmm7
    movd    [r14],                  xmm2                            ; sqsum
    add             r14,                    4
    mov             psqsum16x16,    r14

    mov             r14,            p_sd8x8
    pshufd  xmm1,           xmm6,           11110101b                       ; Sref1 Sref1 Sref0 Sref0
    psubd   xmm6,           xmm1            ; 00 diff1 00 diff0
    pshufd  xmm1,           xmm6,           00001000b                       ;  xx xx diff1 diff0
    movq    [r14],          xmm1
    add             r14,            8
    mov             p_sd8x8,        r14

    mov             r14,            p_mad8x8
    WELS_MAX_REG_SSE2       xmm5


    movhlps         xmm1,   xmm5
    push r0
    movd            r0d,    xmm5
    mov                     [r14],  r0b
    movd            r0d,    xmm1
    mov                     [r14+1],r0b
    pop r0
    add                     r14,    2
    mov                     p_mad8x8,       r14

    mov             r14,            psqdiff16x16
    pshufd  xmm1,           xmm4,           00001110b
    paddd   xmm4,           xmm1
    pshufd  xmm1,           xmm4,           00000001b
    paddd   xmm4,           xmm1
    movd    [r14],          xmm4
    add             r14,            4
    mov             psqdiff16x16,   r14

    add             r14,    16
    sub             r0,     r13
    sub             r1,     r13
    add             r0,     16
    add             r1,     16

    dec             r2
    jnz             sqdiff_bgd_width_loop
    pop r2
    %assign push_num push_num-1
    mov             r0,     r10
    mov             r1,     r11
    add             r0,     r13
    add             r1,     r13

    dec     r3
    jnz             sqdiff_bgd_height_loop

    mov             r14,    psadframe
    movd    [r14],  xmm8

    POP_XMM
    pop r15
    pop r14
    pop r13
    pop r12
%assign push_num 0
%undef          cur_data
%undef          ref_data
%undef          iPicWidth
%undef          iPicHeight
%undef          iPicStride
%undef          psadframe
%undef          psad8x8
%undef          psum16x16
%undef          psqsum16x16
%undef          psqdiff16x16
%undef          p_sd8x8
%undef          p_mad8x8
%undef          tmp_esi
%undef          tmp_edi
%undef          pushsize
%undef          localsize
    ret
%endif

%ifdef X86_32
%define ptrword dword
%else
%define ptrword qword
%endif

%define xmm_width 16
%define ymm_width 32

%macro PUSHM 1-*
    %rep %0
        push           %1
        %rotate 1
    %endrep
    %assign push_num push_num + %0
%endmacro

%macro POPM 1-*
    %rep %0
        %rotate -1
        pop            %1
    %endrep
    %assign push_num push_num - %0
%endmacro

%ifdef X86_32
%define stack_alloc_min 4
%else
%define stack_alloc_min 8
%endif

; Allocate aligned stack space.
; address_out=%1 size=%2 alignment=%3
%macro STACK_ALLOC 3
%if (%3) & ((%3) - 1)
    %error non-power-of-2 alignment requested.
%endif
%if (%3) > 0
    %assign stack_alloc_align ((%3) + stack_alloc_min - 1) / stack_alloc_min
%else
    %assign stack_alloc_align 1
%endif
    %assign stack_alloc_num ((%2) + stack_alloc_min - 1) / stack_alloc_min + stack_alloc_align - 1
    %assign push_num push_num + stack_alloc_num
    sub            r7, stack_alloc_min * stack_alloc_num
%if stack_alloc_align == 1
    mov            %1, r7
%else
    lea            %1, [r7 + stack_alloc_min * (stack_alloc_align - 1)]
    and            %1, -(stack_alloc_min * stack_alloc_align)
%endif
%endmacro

; Deallocate stack space allocated with STACK_ALLOC.
%macro STACK_DEALLOC 0
    add            r7, stack_alloc_min * stack_alloc_num
    %assign push_num push_num - stack_alloc_num
%endmacro
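;
; STACK_ALLOC over-allocates by one alignment's worth of slots and then rounds
; the start address up; the lea/and pair is the usual align-up idiom.
; Equivalent C (illustrative only):
;
;   #include <stdint.h>
;   // align must be a power of two; -align == ~(align - 1) in two's complement.
;   static inline uintptr_t align_up(uintptr_t addr, uintptr_t align) {
;       return (addr + (align - 1)) & -align;
;   }
;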

%ifdef HAVE_AVX2
; Max unsigned byte per quadword
; out=%1 in=%2 tmp=%3
%macro AVX2_Maxubq 3
    vpsrlq         %3, %2, 32
    vpmaxub        %1, %2, %3
    vpsrlq         %3, %1, 16
    vpmaxub        %1, %1, %3
    vpsrlq         %3, %1,  8
    vpmaxub        %1, %1, %3
%endmacro
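;
; Scalar model of AVX2_Maxubq (illustrative): the shift/vpmaxub cascade above
; reduces every 8-byte lane to the maximum of its bytes, leaving the result in
; the lane's least significant byte.
;
;   #include <stdint.h>
;   static uint8_t max_of_8_bytes(uint64_t q) {
;       uint8_t m = 0;
;       for (int i = 0; i < 64; i += 8) {
;           uint8_t b = (uint8_t)(q >> i);
;           if (b > m) m = b;
;       }
;       return m;
;   }
;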

; Max unsigned byte per quadword. 2 register input.
; Results interleaved as least significant byte of even/odd doublewords.
; out=%1 in_a=%2 in_b=%3 tmp=%4
%macro AVX2_Maxubq2 4
    vpblendd       %4, %2, %3, 10101010b
    vpshufd        %4, %4, 10110001b
    vpblendd       %1, %2, %3, 01010101b
    vpmaxub        %1, %4, %1
    vpsrld         %4, %1, 16
    vpmaxub        %1, %1, %4
    vpsrld         %4, %1,  8
    vpmaxub        %1, %1, %4
%endmacro

; res=%1 src=%2 zero=%3 tmp=%4 add_to_res=%5
%macro AVX2_Sqsumbdw 5
    vpunpcklbw     %4, %2, %3
%if %5
    vpmaddwd       %4, %4, %4
    vpaddd         %1, %1, %4
%else
    vpmaddwd       %1, %4, %4
%endif
    vpunpckhbw     %4, %2, %3
    vpmaddwd       %4, %4, %4
    vpaddd         %1, %1, %4
%endmacro

; res=%1 src=%2 zero=%3 tmp=%4 add_to_res=%5
%macro AVX2_Sumbdw 5
%if %5
    vpsadbw        %4, %2, %3
    vpaddd         %1, %1, %4
%else
    vpsadbw        %1, %2, %3
%endif
%endmacro

; res=%1 a=%2 b=%3 a=%4 tmp=%5
; (a is passed twice; the second copy may be a memory reload of the same data)
%macro AVX2_AbsDiffub 5
    vpsubusb       %5, %2, %3
    vpsubusb       %1, %3, %4
    vpor           %1, %5, %1
%endmacro
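;
; AVX2 has no unsigned-byte absolute-difference instruction, so the macro
; above builds it from two saturating subtractions: one of them saturates to
; zero, the other holds |a - b|, and their OR is the result.  Scalar model
; (illustrative only):
;
;   #include <stdint.h>
;   static uint8_t absdiff_u8(uint8_t a, uint8_t b) {
;       uint8_t d0 = (uint8_t)(a > b ? a - b : 0); // vpsubusb a, b
;       uint8_t d1 = (uint8_t)(b > a ? b - a : 0); // vpsubusb b, a
;       return (uint8_t)(d0 | d1);                 // vpor
;   }
;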

; sad=%1 cur_data=%2 ref_data=%3 tmp=%4 accumulate_results=%5
%macro AVX2_Sadbdw 5
%if %5
    vpsadbw        %4, %2, %3
    vpaddd         %1, %1, %4
%else
    vpsadbw        %1, %2, %3
%endif
%endmacro

; sad=%1 sum_cur=%2 sqsum_cur=%3 cur_data=%4 ref_data=%5 zero=%6 tmp=%7 accumulate_results=%8
%macro AVX2_SadSumSqsumbdw 8
    AVX2_Sadbdw    %1, %4, %5, %7, %8
    AVX2_Sumbdw    %2, %4, %6, %7, %8
    AVX2_Sqsumbdw  %3, %4, %6, %7, %8
%endmacro

; sad=%1 pCur=%2 pRef=%3 tmp=%4 accumulate_results=%5
%macro AVX2_Sad 5
    vmovdqu        %4, [%2]
    AVX2_Sadbdw    %1, %4, [%3], %4, %5
%endmacro

; sad=%1 sum_cur=%2 sqsum_cur=%3 pCur=%4 pRef=%5 zero=%6 tmp=%7,%8 accumulate_results=%9
%macro AVX2_SadSumSqsum 9
    vmovdqu        %7, [%4]
    AVX2_SadSumSqsumbdw %1, %2, %3, %7, [%5], %6, %8, %9
%endmacro

; sad=%1 sum_cur=%2 sqsum_cur=%3 sqdiff=%4 pCur=%5 pRef=%6 zero=%7 tmp=%8,%9,%10 accumulate_results=%11
%macro AVX2_SadSumSqsumSqdiff 11
    vmovdqu        %8,  [%5]
    vmovdqu        %9,  [%6]
    AVX2_SadSumSqsumbdw %1, %2, %3, %8, %9, %7, %10, %11
    AVX2_AbsDiffub %9,  %8,  %9,  %8,  %10
    AVX2_Sqsumbdw  %4,  %9,  %7,  %10, %11
%endmacro

; sad=%1 sum_cur=%2 sum_ref=%3 mad=%4 pCur=%5 pRef=%6 zero=%7 tmp=%8,%9,%10 accumulate_results=%11
%macro AVX2_SadSdMad 11
    vmovdqu        %8,  [%5]
    vmovdqu        %9,  [%6]
    AVX2_Sumbdw    %2,  %8,  %7,  %10, %11
    AVX2_Sumbdw    %3,  %9,  %7,  %10, %11
    AVX2_Sadbdw    %1,  %8,  %9,  %10, %11
%if %11
    AVX2_AbsDiffub %9,  %8,  %9,  %8, %10
    vpmaxub        %4,  %4,  %9
%else
    AVX2_AbsDiffub %4,  %8,  %9,  %8, %10
%endif
%endmacro

; sad=%1 sum_cur=%2 sum_ref=%3 mad=%4 sqdiff=%5 sqsum_cur=%6 pCur=%7 pRef=%8 zero=%9 tmp=%10,%11,%12 accumulate_results=%13
%macro AVX2_SadBgdSqdiff 13
%ifidn %12, 0
    vmovdqu        %10, [%7]
    AVX2_Sumbdw    %2,  %10, %9,  %11, %13
    AVX2_Sqsumbdw  %6,  %10, %9,  %11, %13
    vmovdqu        %11, [%8]
    AVX2_Sadbdw    %1,  %10, %11, %10, %13
    AVX2_Sumbdw    %3,  %11, %9,  %10, %13
    vmovdqu        %10, [%7]
%if %13
    AVX2_AbsDiffub %11, %10, %11, [%7], %10
    vpmaxub        %4,  %4,  %11
    AVX2_Sqsumbdw  %5,  %11, %9,  %10, %13
%else
    AVX2_AbsDiffub %4,  %10, %11, [%7], %10
    AVX2_Sqsumbdw  %5,  %4,  %9,  %10, %13
%endif
%else
    vmovdqu        %10, [%7]
    vmovdqu        %11, [%8]
    AVX2_Sadbdw    %1,  %10, %11, %12, %13
    AVX2_Sumbdw    %2,  %10, %9,  %12, %13
    AVX2_Sumbdw    %3,  %11, %9,  %12, %13
    AVX2_Sqsumbdw  %6,  %10, %9,  %12, %13
%if %13
    AVX2_AbsDiffub %11, %10, %11, %10, %12
    vpmaxub        %4,  %4,  %11
    AVX2_Sqsumbdw  %5,  %11, %9,  %10, %13
%else
    AVX2_AbsDiffub %4,  %10, %11, %10, %12
    AVX2_Sqsumbdw  %5,  %4,  %9,  %10, %13
%endif
%endif
%endmacro

; p_dst=%1 mmreg_prefix=%2 data=%3 tmp=%4 second_blocks=%5
%macro AVX2_Store8x8Accdw 5
    vpshufd        %2%4, %2%3, 1000b
%ifidni %2, x
    vmovlps        [%1 + 8 * %5], x%4
%elif %5 == 0
    vmovdqu        [%1], %2%4
%else
    vmovlps        [%1 +  8], x%4
    vextracti128   x%4, %2%4, 1
    vmovlps        [%1 + 24], x%4
%endif
%endmacro

; p_dst=%1 mmreg_prefix=%2 data=%3 tmp=%4 second_blocks=%5
%macro AVX2_Store8x8Accb 5
    vpunpckhqdq    %2%4, %2%3, %2%3
    vpunpcklbw     %2%4, %2%3, %2%4
%if %5 == 0
    vmovd          [%1 + 0], x%4
%ifidni %2, y
    vextracti128   x%4, %2%4, 1
    vmovd          [%1 + 4], x%4
%endif
%else
    vpextrw        [%1 + 2], x%4, 0
%ifidni %2, y
    vextracti128   x%4, %2%4, 1
    vpextrw        [%1 + 6], x%4, 0
%endif
%endif
%endmacro

; p_dst=%1 data=%2 tmp=%3,%4 second_blocks=%5
%macro AVX2_Store2x8x8Accb 5
    vpunpckhqdq    y%3, y%2, y%2
    vpunpcklbw     y%3, y%2, y%3
    vextracti128   x%4, y%3, 1
    vpsllq         x%4, x%4, 32
    vpblendd       x%4, x%3, x%4, 1010b
%if %5
    vpslld         x%4, x%4, 16
    vpblendw       x%4, x%4, [%1], 01010101b
%endif
    vmovdqu        [%1], x%4
%endmacro

; p_dst=%1 mmreg_prefix=%2 data=%3 tmp=%4 add_to_dst=%5
%macro AVX2_Store16x16Accdw 5
%ifidni %2, x
%if %5
    vmovd          x%4, [%1 + 0]
    vpaddd         x%3, x%4, x%3
%endif
    vmovd          [%1 + 0], x%3
%elif %5 == 0
    vmovd          [%1 + 0], x%3
    vextracti128   x%3, %2%3, 1
    vmovd          [%1 + 4], x%3
%else
    vextracti128   x%4, %2%3, 1
    vpunpckldq     x%4, x%3, x%4
    vmovq          x%3, [%1 + 0]
    vpaddd         x%3, x%3, x%4
    vmovlps        [%1 + 0], x%3
%endif
%endmacro

; p_dst1=%1 p_dst2=%2 i_dst_offset=%3 gpr_tmp=%4 mmreg_prefix=%5 data=%6 mm_tmp=%7 add_to_dst=%8
%macro AVX2_Store2x16x16Accdw 8
%ifidni %5, x
    mov            %4, %1
%if %8 == 0
    vmovd          [%4 + %3], x%6
    mov            %4, %2
    vpextrd        [%4 + %3], x%6, 2
%else
    vmovd          x%7, [%4 + %3]
    vpaddd         x%7, x%7, x%6
    vmovd          [%4 + %3], x%7
    mov            %4, %2
    vpbroadcastd   x%7, [%4 + %3]
    vpaddd         x%7, x%7, x%6
    vpextrd        [%4 + %3], x%7, 2
%endif
%else
    vextracti128   x%7, %5%6, 1
    vpblendd       x%6, x%6, x%7, 1010b
    mov            %4, %1
%if %8 == 0
    vmovlps        [%4 + %3], x%6
    mov            %4, %2
    vmovhps        [%4 + %3], x%6
%else
    vmovq          x%7, [%4 + %3]
    vpaddd         x%7, x%7, x%6
    vmovlps        [%4 + %3], x%7
    mov            %4, %2
    vpbroadcastq   x%7, [%4 + %3]
    vpaddd         x%7, x%7, x%6
    vmovhps        [%4 + %3], x%7
%endif
%endif
%endmacro


; x/y-mm_prefix=%1 mm_clobber=%2,%3,%4,%5,%6 b_second_blocks=%7
%macro AVX2_CalcSad_8Lines 7
%define mm_tmp0    %2
%define mm_sad     %3
%define mm_sad2    %4
%define mm_sad3    %5
%define mm_sad4    %6
%define b_second_blocks %7
%ifdef i_stride5
    %define i_stride5_ i_stride5
%else
    lea            r_tmp, [5 * i_stride]
    %define i_stride5_ r_tmp
%endif
    ; Use multiple accumulators to shorten dependency chains and enable more parallelism.
    AVX2_Sad       %1 %+ mm_sad,  p_cur,                  p_ref,                  %1 %+ mm_tmp0, 0
    AVX2_Sad       %1 %+ mm_sad2, p_cur + 1 * i_stride,   p_ref + 1 * i_stride,   %1 %+ mm_tmp0, 0
    AVX2_Sad       %1 %+ mm_sad3, p_cur + 2 * i_stride,   p_ref + 2 * i_stride,   %1 %+ mm_tmp0, 0
    AVX2_Sad       %1 %+ mm_sad4, p_cur + 1 * i_stride3,  p_ref + 1 * i_stride3,  %1 %+ mm_tmp0, 0
    AVX2_Sad       %1 %+ mm_sad,  p_cur + 4 * i_stride,   p_ref + 4 * i_stride,   %1 %+ mm_tmp0, 1
    AVX2_Sad       %1 %+ mm_sad2, p_cur + 1 * i_stride5_, p_ref + 1 * i_stride5_, %1 %+ mm_tmp0, 1
%ifdef i_stride7
    %define i_stride7_ i_stride7
%else
    lea            r_tmp, [i_stride + 2 * i_stride3]
    %define i_stride7_ r_tmp
%endif
    AVX2_Sad       %1 %+ mm_sad3, p_cur + 2 * i_stride3,  p_ref + 2 * i_stride3,  %1 %+ mm_tmp0, 1
    AVX2_Sad       %1 %+ mm_sad4, p_cur + 1 * i_stride7_, p_ref + 1 * i_stride7_, %1 %+ mm_tmp0, 1
%undef i_stride5_
%undef i_stride7_
    ; Increment addresses for the next iteration. Doing this early is beneficial on Haswell.
    add            p_cur, %1 %+ mm_width
    add            p_ref, %1 %+ mm_width
    ; Collapse accumulators.
    vpaddd         %1 %+ mm_sad,  %1 %+ mm_sad,  %1 %+ mm_sad2
    vpaddd         %1 %+ mm_sad3, %1 %+ mm_sad3, %1 %+ mm_sad4
    vpaddd         %1 %+ mm_sad,  %1 %+ mm_sad,  %1 %+ mm_sad3
    AVX2_Store8x8Accdw p_sad8x8 + xcnt_unit * i_xcnt, %1, mm_sad, mm_tmp0, b_second_blocks
    vpaddd         y %+ mm_sadframe, y %+ mm_sadframe, y %+ mm_sad
%undef mm_tmp0
%undef mm_sad
%undef mm_sad2
%undef mm_sad3
%undef mm_sad4
%undef b_second_blocks
%endmacro

;*************************************************************************************************************
;void VAACalcSad_avx2( const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
;                                                               int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8)
;*************************************************************************************************************
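;
; Commentary-only C reference for the computation (inferred from the prototype
; above; the raster order of psad8x8 is an assumption):
;
;   #include <stdint.h>
;   #include <stdlib.h>
;   static void VAACalcSad_ref(const uint8_t *cur_data, const uint8_t *ref_data,
;                              int32_t iPicWidth, int32_t iPicHeight,
;                              int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8) {
;       int32_t total = 0, idx = 0;
;       for (int32_t y = 0; y < iPicHeight; y += 8) {
;           for (int32_t x = 0; x < iPicWidth; x += 8, ++idx) {
;               int32_t sad = 0;
;               for (int32_t j = 0; j < 8; ++j)
;                   for (int32_t i = 0; i < 8; ++i)
;                       sad += abs(cur_data[(y + j) * iPicStride + x + i] -
;                                  ref_data[(y + j) * iPicStride + x + i]);
;               psad8x8[idx] = sad;
;               total += sad;
;           }
;       }
;       *psadframe = total;
;   }
;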

WELS_EXTERN VAACalcSad_avx2
%define          p_sadframe                    ptrword arg6
%define          p_sad8x8                      ptrword arg7
%ifdef X86_32
%define          saveregs                      r5, r6
%else
%define          saveregs                      rbx, rbp, r12
%endif

%assign push_num 0
    LOAD_5_PARA
    PUSH_XMM 7
    SIGN_EXTENSION r2, r2d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r4, r4d
    PUSHM          saveregs

%define mm_zero mm0
%define mm_sadframe mm6
    vpxor          x %+ mm_zero, x %+ mm_zero, x %+ mm_zero
    vmovdqa        y %+ mm_sadframe, y %+ mm_zero

    and            r2, -16                     ; iPicWidth &= -16
    jle            .done                       ; bail if iPicWidth < 16
    sar            r3, 4                       ; iPicHeight / 16
    jle            .done                       ; bail if iPicHeight < 16
    shr            r2, 2                       ; iPicWidth / 4

%define p_cur     r0
%define p_ref     r1
%define i_xcnt    r2
%define i_ycnt    ptrword arg4
%define i_stride  r4
%define xcnt_unit 4
%ifdef X86_32
    mov            i_ycnt, r3
    mov            r5, p_sad8x8
    %define i_stride3 r3
    %undef  p_sad8x8
    %define p_sad8x8  r5
    %define r_tmp     r6
    lea            i_stride3, [3 * i_stride]
%else
    mov            rbp, p_sad8x8
    %define i_stride3 rbx
    %define i_stride5 r12
    %define i_stride7 r6
    %undef  p_sad8x8
    %define p_sad8x8  rbp
    lea            i_stride3, [3 * i_stride]
    lea            i_stride5, [5 * i_stride]
    lea            i_stride7, [i_stride + 2 * i_stride3]
%endif

    ; offset pointer so as to compensate for the i_xcnt offset below.
    sub            p_sad8x8, 4 * 16 / xcnt_unit

    push           i_xcnt
%assign push_num push_num + 1
%define i_xcnt_load ptrword [r7]

.height_loop:
    ; use end-of-line pointers so as to enable use of a negative counter as index.
    lea            p_sad8x8, [p_sad8x8 + xcnt_unit * i_xcnt]
    ; use a negative loop counter so as to enable counting toward zero and
    ; indexing with the same counter (see the sketch after this function).
    neg            i_xcnt
    add            i_xcnt, 16 / xcnt_unit
    jz             .width_loop_upper8_remaining16
.width_loop_upper8:
    AVX2_CalcSad_8Lines y, mm1, mm2, mm3, mm4, mm5, 0
    add            i_xcnt, 32 / xcnt_unit
    jl             .width_loop_upper8
    jg             .width_loop_upper8_end
.width_loop_upper8_remaining16:
    AVX2_CalcSad_8Lines x, mm1, mm2, mm3, mm4, mm5, 0
.width_loop_upper8_end:
    lea            p_cur, [p_cur + 8 * i_stride]
    lea            p_ref, [p_ref + 8 * i_stride]
    xor            i_xcnt, i_xcnt
    sub            i_xcnt, i_xcnt_load
    lea            p_cur, [p_cur + xcnt_unit * i_xcnt]
    lea            p_ref, [p_ref + xcnt_unit * i_xcnt]
    add            i_xcnt, 16 / xcnt_unit
    jz             .width_loop_lower8_remaining16
.width_loop_lower8:
    AVX2_CalcSad_8Lines y, mm1, mm2, mm3, mm4, mm5, 1
    add            i_xcnt, 32 / xcnt_unit
    jl             .width_loop_lower8
    jg             .width_loop_lower8_end
.width_loop_lower8_remaining16:
    AVX2_CalcSad_8Lines x, mm1, mm2, mm3, mm4, mm5, 1
.width_loop_lower8_end:
    lea            p_cur, [p_cur + 8 * i_stride]
    lea            p_ref, [p_ref + 8 * i_stride]
    xor            i_xcnt, i_xcnt
    sub            i_xcnt, i_xcnt_load
    lea            p_cur, [p_cur + xcnt_unit * i_xcnt]
    lea            p_ref, [p_ref + xcnt_unit * i_xcnt]
    neg            i_xcnt
    sub            i_ycnt, 1
    jnz            .height_loop

    pop            i_xcnt
%assign push_num push_num - 1
%undef i_xcnt_load

.done:
    mov            r6, p_sadframe
    vextracti128   xmm2, y %+ mm_sadframe, 1
    vpaddd         xmm2, x %+ mm_sadframe, xmm2
    vpunpckhqdq    xmm1, xmm2, xmm2
    vpaddd         xmm2, xmm2, xmm1
    vmovd          [r6], xmm2
    vzeroupper

    POPM           saveregs
    POP_XMM
    LOAD_5_PARA_POP
%undef           p_cur
%undef           p_ref
%undef           i_xcnt
%undef           i_ycnt
%undef           i_stride
%undef           r_tmp
%undef           xcnt_unit
%undef           i_stride3
%undef           i_stride5
%undef           i_stride7
%undef           mm_sadframe
%undef           mm_zero
%undef           saveregs
%undef           p_sadframe
%undef           p_sad8x8
    ret
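;
; The width loops above run a negated element count up toward zero, so the
; same register serves as both the loop counter and the (negative) index off
; end-of-line output pointers.  Illustrative C model of the pattern:
;
;   #include <stdint.h>
;   // dst_end points one past the end of the row; i runs from -n up to 0.
;   static void store_row(int32_t *dst_end, const int32_t *src, int n) {
;       for (int i = -n; i != 0; ++i)
;           dst_end[i] = src[i + n];
;   }
;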


; x/y-mm_prefix=%1 mm_clobber=%2,%3,%4,%5,%6 b_second_blocks=%7
%macro AVX2_CalcSadVar_8Lines 7
%define mm_tmp0    %2
%define mm_tmp1    %3
%define mm_sad     %4
%define mm_sum     %5
%define mm_sqsum   %6
%define b_second_blocks %7
    ; Unroll for better performance on Haswell.
    ; Avoid unrolling for the 16 px case so as to reduce the code footprint.
%ifidni %1, y
    lea            r_tmp, [5 * i_stride]
    AVX2_SadSumSqsum %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, p_cur,                 p_ref,                 %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, 0
    AVX2_SadSumSqsum %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, p_cur + 1 * i_stride,  p_ref + 1 * i_stride,  %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, 1
    AVX2_SadSumSqsum %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, p_cur + 2 * i_stride,  p_ref + 2 * i_stride,  %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, 1
    AVX2_SadSumSqsum %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, p_cur + 1 * i_stride3, p_ref + 1 * i_stride3, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, 1
    AVX2_SadSumSqsum %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, p_cur + 4 * i_stride,  p_ref + 4 * i_stride,  %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, 1
    AVX2_SadSumSqsum %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, p_cur + r_tmp,         p_ref + r_tmp,         %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, 1
    lea            r_tmp, [i_stride + 2 * i_stride3]
    AVX2_SadSumSqsum %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, p_cur + 2 * i_stride3, p_ref + 2 * i_stride3, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, 1
    AVX2_SadSumSqsum %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, p_cur + r_tmp,         p_ref + r_tmp,         %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, 1
    ; Increment addresses for the next iteration. Doing this early is beneficial on Haswell.
    add            p_cur, %1 %+ mm_width
    add            p_ref, %1 %+ mm_width
%else
    vpxor          x %+ mm_sad, x %+ mm_sad, x %+ mm_sad
    vpxor          x %+ mm_sum, x %+ mm_sum, x %+ mm_sum
    vpxor          x %+ mm_sqsum, x %+ mm_sqsum, x %+ mm_sqsum
    lea            r_tmp, [8 * i_stride]
    add            p_cur, r_tmp
    add            p_ref, r_tmp
    neg            r_tmp
%%loop:
    AVX2_SadSumSqsum %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, p_cur + r_tmp, p_ref + r_tmp, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, 1
    add            r_tmp, i_stride
    jl             %%loop
    ; Increment addresses for the next iteration. Doing this early is beneficial on Haswell.
    lea            r_tmp, [8 * i_stride - %1 %+ mm_width]
    sub            p_cur, r_tmp
    sub            p_ref, r_tmp
%endif
    AVX2_Store8x8Accdw p_sad8x8 + 4 * i_xcnt, %1, mm_sad, mm_tmp1, b_second_blocks
    vpaddd         y %+ mm_sadframe, y %+ mm_sadframe, y %+ mm_sad
    vpunpcklqdq    %1 %+ mm_tmp0, %1 %+ mm_sum, %1 %+ mm_sqsum
    vpunpckhqdq    %1 %+ mm_tmp1, %1 %+ mm_sum, %1 %+ mm_sqsum
    vpaddd         %1 %+ mm_tmp0, %1 %+ mm_tmp0, %1 %+ mm_tmp1
    vpshufd        %1 %+ mm_tmp1, %1 %+ mm_tmp0, 10110001b
    vpaddd         %1 %+ mm_tmp0, %1 %+ mm_tmp0, %1 %+ mm_tmp1
    AVX2_Store2x16x16Accdw p_sum16x16, p_sqsum16x16, i_xcnt, r_tmp, %1, mm_tmp0, mm_tmp1, b_second_blocks
%undef mm_tmp0
%undef mm_tmp1
%undef mm_sad
%undef mm_sum
%undef mm_sqsum
%undef b_second_blocks
%endmacro

;*************************************************************************************************************
;void VAACalcSadVar_avx2( const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
;               int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16)
;*************************************************************************************************************
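;
; Commentary-only sketch of the Var variant's extra per-16x16 outputs (sum and
; squared sum of the current block); the SAD outputs match VAACalcSad above.
; Semantics inferred from the parameter names.
;
;   #include <stdint.h>
;   static void Mb16x16SumSqsum_ref(const uint8_t *cur, int32_t stride,
;                                   int32_t *sum, int32_t *sqsum) {
;       int32_t s = 0, ss = 0;
;       for (int32_t j = 0; j < 16; ++j, cur += stride)
;           for (int32_t i = 0; i < 16; ++i) {
;               s  += cur[i];          // -> psum16x16
;               ss += cur[i] * cur[i]; // -> psqsum16x16
;           }
;       *sum = s; *sqsum = ss;
;   }
;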

WELS_EXTERN VAACalcSadVar_avx2
%define          p_sadframe                    ptrword arg6
%define          p_sad8x8                      ptrword arg7
%define          p_sum16x16                    ptrword arg8
%define          p_sqsum16x16                  ptrword arg9
%ifdef X86_32
%define          saveregs                      r5, r6
%else
%define          saveregs                      rbx, rbp, r12, r13
%endif

%assign push_num 0
    LOAD_5_PARA
    PUSH_XMM 7
    SIGN_EXTENSION r2, r2d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r4, r4d
    PUSHM          saveregs

%define mm_zero mm0
%define mm_sadframe mm6
    vpxor          x %+ mm_zero, x %+ mm_zero, x %+ mm_zero
    vmovdqa        y %+ mm_sadframe, y %+ mm_zero

    and            r2, -16                     ; iPicWidth &= -16
    jle            .done                       ; bail if iPicWidth < 16
    sar            r3, 4                       ; iPicHeight / 16
    jle            .done                       ; bail if iPicHeight < 16
    shr            r2, 2                       ; iPicWidth / 4

%define p_cur     r0
%define p_ref     r1
%define i_xcnt    r2
%define i_ycnt    ptrword arg4
%define i_stride  r4
%define r_tmp     r6
%define xcnt_unit 4
%ifdef X86_32
    mov            i_ycnt, r3
    mov            r3, p_sad8x8
    %undef  p_sad8x8
    %define p_sad8x8 r3
    %define i_stride3 r5
%else
    mov            rbp, p_sad8x8
    mov            r12, p_sum16x16
    mov            r13, p_sqsum16x16
    %undef  p_sad8x8
    %undef  p_sum16x16
    %undef  p_sqsum16x16
    %define p_sad8x8 rbp
    %define p_sum16x16 r12
    %define p_sqsum16x16 r13
    %define i_stride3 rbx
%endif
    lea            i_stride3, [3 * i_stride]

    ; offset pointers so as to compensate for the i_xcnt offset below.
    sub            p_sad8x8,      4 * 16 / xcnt_unit
    sub            p_sum16x16,    1 * 16 / xcnt_unit
    sub            p_sqsum16x16,  1 * 16 / xcnt_unit

    ; use a negative loop counter so as to enable counting toward zero and indexing with the same counter.
    neg            i_xcnt

.height_loop:
    push           i_xcnt
%assign push_num push_num + 1
%define i_xcnt_load ptrword [r7]
    ; use end-of-line pointers so as to enable use of a negative counter as index.
    lea            r_tmp, [xcnt_unit * i_xcnt]
    sub            p_sad8x8, r_tmp
    sub            p_sum16x16, i_xcnt
    sub            p_sqsum16x16, i_xcnt
    add            i_xcnt, 16 / xcnt_unit
    jz             .width_loop_upper8_remaining16
.width_loop_upper8:
    AVX2_CalcSadVar_8Lines y, mm1, mm2, mm3, mm4, mm5, 0
    add            i_xcnt, 32 / xcnt_unit
    jl             .width_loop_upper8
    jg             .width_loop_upper8_end
.width_loop_upper8_remaining16:
    AVX2_CalcSadVar_8Lines x, mm1, mm2, mm3, mm4, mm5, 0
.width_loop_upper8_end:
    lea            p_cur, [p_cur + 8 * i_stride]
    lea            p_ref, [p_ref + 8 * i_stride]
    mov            i_xcnt, i_xcnt_load
    lea            p_cur, [p_cur + xcnt_unit * i_xcnt]
    lea            p_ref, [p_ref + xcnt_unit * i_xcnt]
    add            i_xcnt, 16 / xcnt_unit
    jz             .width_loop_lower8_remaining16
.width_loop_lower8:
    AVX2_CalcSadVar_8Lines y, mm1, mm2, mm3, mm4, mm5, 1
    add            i_xcnt, 32 / xcnt_unit
    jl             .width_loop_lower8
    jg             .width_loop_lower8_end
.width_loop_lower8_remaining16:
    AVX2_CalcSadVar_8Lines x, mm1, mm2, mm3, mm4, mm5, 1
.width_loop_lower8_end:
    lea            p_cur, [p_cur + 8 * i_stride]
    lea            p_ref, [p_ref + 8 * i_stride]
%undef i_xcnt_load
    pop            i_xcnt
    %assign push_num push_num - 1
    lea            p_cur, [p_cur + xcnt_unit * i_xcnt]
    lea            p_ref, [p_ref + xcnt_unit * i_xcnt]
    sub            i_ycnt, 1
    jnz            .height_loop

.done:
    mov            r_tmp, p_sadframe
    vextracti128   xmm2, y %+ mm_sadframe, 1
    vpaddd         xmm2, x %+ mm_sadframe, xmm2
    vpunpckhqdq    xmm1, xmm2, xmm2
    vpaddd         xmm2, xmm2, xmm1
    vmovd          [r_tmp], xmm2
    vzeroupper

    POPM           saveregs
    POP_XMM
    LOAD_5_PARA_POP
%undef           p_cur
%undef           p_ref
%undef           i_xcnt
%undef           i_ycnt
%undef           i_stride
%undef           i_stride3
%undef           r_tmp
%undef           xcnt_unit
%undef           mm_sadframe
%undef           mm_zero
%undef           saveregs
%undef           p_sadframe
%undef           p_sad8x8
%undef           p_sum16x16
%undef           p_sqsum16x16
    ret


; x/y-mm_prefix=%1 mm_clobber=%2,%3,%4,%5,%6,%7,%8 b_second_blocks=%9
%macro AVX2_CalcSadSsd_8Lines 9
%define mm_tmp0    %2
%define mm_tmp1    %3
%define mm_tmp2    %4
%define mm_sad     %5
%define mm_sum     %6
%define mm_sqsum   %7
%define mm_sqdiff  %8
%define b_second_blocks %9
    ; Unroll for better performance on Haswell.
    ; Avoid unrolling for the 16 px case so as to reduce the code footprint.
%ifidni %1, y
%ifdef i_stride5
    lea            r_tmp, [i_stride + 2 * i_stride3]
    %define i_stride5_ i_stride5
%else
    lea            r_tmp, [5 * i_stride]
    %define i_stride5_ r_tmp
%endif
    AVX2_SadSumSqsumSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, %1 %+ mm_sqdiff, p_cur,                  p_ref,                  %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 0
    AVX2_SadSumSqsumSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, %1 %+ mm_sqdiff, p_cur + 1 * i_stride,   p_ref + 1 * i_stride,   %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1
    AVX2_SadSumSqsumSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, %1 %+ mm_sqdiff, p_cur + 2 * i_stride,   p_ref + 2 * i_stride,   %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1
    AVX2_SadSumSqsumSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, %1 %+ mm_sqdiff, p_cur + 1 * i_stride3,  p_ref + 1 * i_stride3,  %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1
    AVX2_SadSumSqsumSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, %1 %+ mm_sqdiff, p_cur + 4 * i_stride,   p_ref + 4 * i_stride,   %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1
    AVX2_SadSumSqsumSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, %1 %+ mm_sqdiff, p_cur + 1 * i_stride5_, p_ref + 1 * i_stride5_, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1
%ifndef i_stride5
    lea            r_tmp, [i_stride + 2 * i_stride3]
%endif
%undef i_stride5_
    AVX2_SadSumSqsumSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, %1 %+ mm_sqdiff, p_cur + 2 * i_stride3,  p_ref + 2 * i_stride3,  %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1
    AVX2_SadSumSqsumSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, %1 %+ mm_sqdiff, p_cur + r_tmp,          p_ref + r_tmp,          %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1
    ; Increment addresses for the next iteration. Doing this early is beneficial on Haswell.
    add            p_cur, %1 %+ mm_width
    add            p_ref, %1 %+ mm_width
%else
    vpxor          x %+ mm_sad, x %+ mm_sad, x %+ mm_sad
    vpxor          x %+ mm_sum, x %+ mm_sum, x %+ mm_sum
    vpxor          x %+ mm_sqsum, x %+ mm_sqsum, x %+ mm_sqsum
    vpxor          x %+ mm_sqdiff, x %+ mm_sqdiff, x %+ mm_sqdiff
    lea            r_tmp, [8 * i_stride]
    add            p_cur, r_tmp
    add            p_ref, r_tmp
    neg            r_tmp
%%loop:
    AVX2_SadSumSqsumSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, %1 %+ mm_sqdiff, p_cur + r_tmp, p_ref + r_tmp, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1
    add            r_tmp, i_stride
    jl             %%loop
    ; Increment addresses for the next iteration. Doing this early is beneficial on Haswell.
    lea            r_tmp, [8 * i_stride - %1 %+ mm_width]
    sub            p_cur, r_tmp
    sub            p_ref, r_tmp
%endif
    mov            r_tmp, p_sad8x8
    AVX2_Store8x8Accdw r_tmp + 4 * i_xcnt, %1, mm_sad, mm_tmp1, b_second_blocks
%ifdef X86_32
    vpaddd         y %+ mm_tmp1, y %+ mm_sad, sadframe_acc
    vmovdqa        sadframe_acc, y %+ mm_tmp1
%else
    vpaddd         sadframe_acc, sadframe_acc, y %+ mm_sad
%endif
    mov            r_tmp, i_xcnt
    add            r_tmp, p_sum16x16
    vpunpckhqdq    %1 %+ mm_tmp1, %1 %+ mm_sum, %1 %+ mm_sum
    vpaddd         %1 %+ mm_tmp0, %1 %+ mm_sum, %1 %+ mm_tmp1
    AVX2_Store16x16Accdw r_tmp, %1, mm_tmp0, mm_tmp1, b_second_blocks
    vpunpcklqdq    %1 %+ mm_tmp0, %1 %+ mm_sqsum, %1 %+ mm_sqdiff
    vpunpckhqdq    %1 %+ mm_tmp1, %1 %+ mm_sqsum, %1 %+ mm_sqdiff
    vpaddd         %1 %+ mm_tmp0, %1 %+ mm_tmp0, %1 %+ mm_tmp1
    vpshufd        %1 %+ mm_tmp1, %1 %+ mm_tmp0, 10110001b
    vpaddd         %1 %+ mm_tmp0, %1 %+ mm_tmp0, %1 %+ mm_tmp1
    AVX2_Store2x16x16Accdw p_sqsum16x16, p_sqdiff16x16, i_xcnt, r_tmp, %1, mm_tmp0, mm_tmp1, b_second_blocks
%undef mm_tmp0
%undef mm_tmp1
%undef mm_tmp2
%undef mm_sad
%undef mm_sum
%undef mm_sqsum
%undef mm_sqdiff
%undef b_second_blocks
%endmacro

;*************************************************************************************************************
;void VAACalcSadSsd_avx2(const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
;       int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16, int32_t *psqdiff16x16)
;*************************************************************************************************************
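;
; Commentary-only sketch of the Ssd variant's additional per-16x16 output, the
; squared sum of differences against the reference (the other outputs are as
; in the Var sketch above).  Inferred from the parameter names.
;
;   #include <stdint.h>
;   static int32_t Mb16x16Sqdiff_ref(const uint8_t *cur, const uint8_t *ref,
;                                    int32_t stride) {
;       int32_t sd = 0;
;       for (int32_t j = 0; j < 16; ++j, cur += stride, ref += stride)
;           for (int32_t i = 0; i < 16; ++i) {
;               int32_t d = cur[i] - ref[i];
;               sd += d * d;           // -> psqdiff16x16
;           }
;       return sd;
;   }
;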
2819
2820WELS_EXTERN VAACalcSadSsd_avx2
2821%define          p_sadframe                    ptrword arg6
2822%define          p_sad8x8                      ptrword arg7
2823%define          p_sum16x16                    ptrword arg8
2824%define          p_sqsum16x16                  ptrword arg9
2825%define          p_sqdiff16x16                 ptrword arg10
2826%ifdef X86_32
2827%define          saveregs                      r5, r6
2828%else
2829%define          saveregs                      rbx, rbp, r12, r13, r14, r15
2830%endif
2831
2832%assign push_num 0
2833    LOAD_5_PARA
2834    PUSH_XMM 9
2835    SIGN_EXTENSION r2, r2d
2836    SIGN_EXTENSION r3, r3d
2837    SIGN_EXTENSION r4, r4d
2838    PUSHM          saveregs
2839
2840%define mm_zero mm0
2841    vpxor          x %+ mm_zero, x %+ mm_zero, x %+ mm_zero
2842
2843%ifdef X86_32
2844    STACK_ALLOC    r5, ymm_width, ymm_width
2845    %define sadframe_acc_addr r5
2846    %define sadframe_acc [sadframe_acc_addr]
2847%else
2848    %define sadframe_acc ymm8
2849    %define xsadframe_acc xmm8
2850%endif
2851    vmovdqa        sadframe_acc, y %+ mm_zero
2852
2853    and            r2, -16                     ; iPicWidth &= -16
2854    jle            .done                       ; bail if iPicWidth < 16
2855    sar            r3, 4                       ; iPicHeight / 16
2856    jle            .done                       ; bail if iPicHeight < 16
2857    shr            r2, 2                       ; iPicWidth / 4
2858
2859%define p_cur     r0
2860%define p_ref     r1
2861%define i_xcnt    r2
2862%define i_ycnt    ptrword arg4
2863%define i_stride  r4
2864%define r_tmp     r6
2865%define xcnt_unit 4
2866%ifdef X86_32
2867    mov            i_ycnt, r3
2868    %define i_stride3 r3
2869%else
2870    mov            r12, p_sad8x8
2871    mov            r13, p_sum16x16
2872    mov            r14, p_sqsum16x16
2873    mov            r15, p_sqdiff16x16
2874    %undef  p_sad8x8
2875    %undef  p_sum16x16
2876    %undef  p_sqsum16x16
2877    %undef  p_sqdiff16x16
2878    %define p_sad8x8 r12
2879    %define p_sum16x16 r13
2880    %define p_sqsum16x16 r14
2881    %define p_sqdiff16x16 r15
2882    %define i_stride3 rbx
2883    %define i_stride5 rbp
2884    lea            i_stride5, [5 * i_stride]
2885%endif
2886    lea            i_stride3, [3 * i_stride]
2887
2888    ; offset pointers so as to compensate for i_xcnt offset below.
2889    sub            p_sad8x8,      4 * 16 / xcnt_unit
2890    sub            p_sum16x16,    1 * 16 / xcnt_unit
2891    sub            p_sqsum16x16,  1 * 16 / xcnt_unit
2892    sub            p_sqdiff16x16, 1 * 16 / xcnt_unit
2893
2894    ; use a negative loop counter so as to enable counting toward zero and indexing with the same counter.
2895    neg            i_xcnt

.height_loop:
    push           i_xcnt
%assign push_num push_num + 1
%define i_xcnt_load ptrword [r7]
    ; use end-of-line pointers so as to enable use of a negative counter as index.
    lea            r_tmp, [xcnt_unit * i_xcnt]
    sub            p_sad8x8, r_tmp
    sub            p_sum16x16, i_xcnt
    sub            p_sqsum16x16, i_xcnt
    sub            p_sqdiff16x16, i_xcnt
    add            i_xcnt, 16 / xcnt_unit
    jz             .width_loop_upper8_remaining16
.width_loop_upper8:
    AVX2_CalcSadSsd_8Lines y, mm1, mm2, mm3, mm4, mm5, mm6, mm7, 0
    add            i_xcnt, 32 / xcnt_unit
    jl             .width_loop_upper8
    jg             .width_loop_upper8_end
.width_loop_upper8_remaining16:
    AVX2_CalcSadSsd_8Lines x, mm1, mm2, mm3, mm4, mm5, mm6, mm7, 0
.width_loop_upper8_end:
    lea            p_cur, [p_cur + 8 * i_stride]
    lea            p_ref, [p_ref + 8 * i_stride]
    mov            i_xcnt, i_xcnt_load
    lea            p_cur, [p_cur + xcnt_unit * i_xcnt]
    lea            p_ref, [p_ref + xcnt_unit * i_xcnt]
    add            i_xcnt, 16 / xcnt_unit
    jz             .width_loop_lower8_remaining16
.width_loop_lower8:
    AVX2_CalcSadSsd_8Lines y, mm1, mm2, mm3, mm4, mm5, mm6, mm7, 1
    add            i_xcnt, 32 / xcnt_unit
    jl             .width_loop_lower8
    jg             .width_loop_lower8_end
.width_loop_lower8_remaining16:
    AVX2_CalcSadSsd_8Lines x, mm1, mm2, mm3, mm4, mm5, mm6, mm7, 1
.width_loop_lower8_end:
    lea            p_cur, [p_cur + 8 * i_stride]
    lea            p_ref, [p_ref + 8 * i_stride]
%undef i_xcnt_load
    pop            i_xcnt
%assign push_num push_num - 1
    lea            p_cur, [p_cur + xcnt_unit * i_xcnt]
    lea            p_ref, [p_ref + xcnt_unit * i_xcnt]
    sub            i_ycnt, 1
    jnz            .height_loop

.done:
    mov            r_tmp, p_sadframe
%ifdef X86_32
    vmovdqa        xmm2, sadframe_acc
    vpaddd         xmm2, xmm2, [sadframe_acc_addr + xmm_width]
%else
    vextracti128   xmm2, sadframe_acc, 1
    vpaddd         xmm2, xsadframe_acc, xmm2
%endif
    vpunpckhqdq    xmm1, xmm2, xmm2
    vpaddd         xmm2, xmm2, xmm1
    vmovd          [r_tmp], xmm2
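    ; Note: vpsadbw-based accumulators keep their sums in dwords 0 and 2 of
    ; each 128-bit lane (dwords 1 and 3 stay zero), so after folding the two
    ; lanes above, adding dword 2 into dword 0 yields the frame total.
    ; C-like sketch (for exposition):
    ;   uint32_t acc[4];                  // acc[1] == acc[3] == 0
    ;   *p_sadframe = acc[0] + acc[2];    // vpunpckhqdq + vpaddd + vmovd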
    vzeroupper
%ifdef X86_32
    STACK_DEALLOC
%endif
    POPM           saveregs
    POP_XMM
    LOAD_5_PARA_POP
%undef           p_cur
%undef           p_ref
%undef           i_xcnt
%undef           i_ycnt
%undef           i_stride
%undef           i_stride3
%undef           i_stride5
%undef           r_tmp
%undef           xcnt_unit
%undef           sadframe_acc
%undef           sadframe_acc_addr
%undef           xsadframe_acc
%undef           mm_zero
%undef           saveregs
%undef           p_sadframe
%undef           p_sad8x8
%undef           p_sum16x16
%undef           p_sqsum16x16
%undef           p_sqdiff16x16
    ret


; x/y-mm_prefix=%1 mm_clobber=%2,%3,%4,%5,%6,%7,%8 b_second_blocks=%9
%macro AVX2_CalcSadBgd_8Lines 9
%define mm_tmp0    %2
%define mm_tmp1    %3
%define mm_tmp2    %8
%define mm_mad     %4
%define mm_sumcur  %5
%define mm_sumref  %6
%define mm_sad     %7
%define b_second_blocks %9
    ; Unroll for better performance on Haswell.
    ; Avoid unrolling for the 16 px case so as to reduce the code footprint.
%ifidni %1, y
    lea            r_tmp, [5 * i_stride]
    AVX2_SadSdMad  %1 %+ mm_sad, %1 %+ mm_sumcur, %1 %+ mm_sumref, %1 %+ mm_mad, p_cur,                 p_ref,                 %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 0
    AVX2_SadSdMad  %1 %+ mm_sad, %1 %+ mm_sumcur, %1 %+ mm_sumref, %1 %+ mm_mad, p_cur + 1 * i_stride,  p_ref + 1 * i_stride,  %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1
    AVX2_SadSdMad  %1 %+ mm_sad, %1 %+ mm_sumcur, %1 %+ mm_sumref, %1 %+ mm_mad, p_cur + 2 * i_stride,  p_ref + 2 * i_stride,  %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1
    AVX2_SadSdMad  %1 %+ mm_sad, %1 %+ mm_sumcur, %1 %+ mm_sumref, %1 %+ mm_mad, p_cur + 1 * i_stride3, p_ref + 1 * i_stride3, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1
    AVX2_SadSdMad  %1 %+ mm_sad, %1 %+ mm_sumcur, %1 %+ mm_sumref, %1 %+ mm_mad, p_cur + 4 * i_stride,  p_ref + 4 * i_stride,  %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1
    AVX2_SadSdMad  %1 %+ mm_sad, %1 %+ mm_sumcur, %1 %+ mm_sumref, %1 %+ mm_mad, p_cur + r_tmp,         p_ref + r_tmp,         %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1
    lea            r_tmp, [i_stride + 2 * i_stride3]
    AVX2_SadSdMad  %1 %+ mm_sad, %1 %+ mm_sumcur, %1 %+ mm_sumref, %1 %+ mm_mad, p_cur + 2 * i_stride3, p_ref + 2 * i_stride3, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1
    AVX2_SadSdMad  %1 %+ mm_sad, %1 %+ mm_sumcur, %1 %+ mm_sumref, %1 %+ mm_mad, p_cur + r_tmp,         p_ref + r_tmp,         %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1
    ; Increment addresses for the next iteration. Doing this early is beneficial on Haswell.
    add            p_cur, %1 %+ mm_width
    add            p_ref, %1 %+ mm_width
%else
    vpxor          x %+ mm_sad, x %+ mm_sad, x %+ mm_sad
    vpxor          x %+ mm_sumcur, x %+ mm_sumcur, x %+ mm_sumcur
    vpxor          x %+ mm_sumref, x %+ mm_sumref, x %+ mm_sumref
    vpxor          x %+ mm_mad, x %+ mm_mad, x %+ mm_mad
    lea            r_tmp, [8 * i_stride]
    add            p_cur, r_tmp
    add            p_ref, r_tmp
    neg            r_tmp
%%loop:
    AVX2_SadSdMad  %1 %+ mm_sad, %1 %+ mm_sumcur, %1 %+ mm_sumref, %1 %+ mm_mad, p_cur + r_tmp, p_ref + r_tmp, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1
    add            r_tmp, i_stride
    jl             %%loop
    ; Increment addresses for the next iteration. Doing this early is beneficial on Haswell.
    lea            r_tmp, [8 * i_stride - %1 %+ mm_width]
    sub            p_cur, r_tmp
    sub            p_ref, r_tmp
%endif
    mov            r_tmp, p_sad8x8
    AVX2_Store8x8Accdw r_tmp + 4 * i_xcnt, %1, mm_sad, mm_tmp1, b_second_blocks
%ifdef X86_32
    vpaddd         y %+ mm_tmp1, y %+ mm_sad, sadframe_acc
    vmovdqa        sadframe_acc, y %+ mm_tmp1
%else
    vpaddd         sadframe_acc, sadframe_acc, y %+ mm_sad
%endif
    mov            r_tmp, p_sd8x8
    vpsubd         %1 %+ mm_tmp0, %1 %+ mm_sumcur, %1 %+ mm_sumref
    AVX2_Store8x8Accdw r_tmp + 4 * i_xcnt, %1, mm_tmp0, mm_tmp1, b_second_blocks
    ; Coalesce store and horizontal reduction of MAD accumulator for even and
    ; odd iterations so as to enable more parallelism.
%ifidni %1, y
    test           i_xcnt, 32 / xcnt_unit
    jz             %%preserve_mad
    mov            r_tmp, p_mad8x8
    AVX2_Maxubq2   y %+ mm_mad, y %+ mm_mad, prev_mad, y %+ mm_tmp0
    AVX2_Store2x8x8Accb r_tmp + i_xcnt - 8, mm_mad, mm_tmp0, mm_tmp1, b_second_blocks
%%preserve_mad:
    vmovdqa        prev_mad, y %+ mm_mad
%else
    mov            r_tmp, p_mad8x8
    AVX2_Maxubq    %1 %+ mm_mad, %1 %+ mm_mad, %1 %+ mm_tmp0
    AVX2_Store8x8Accb r_tmp + i_xcnt, %1, mm_mad, mm_tmp0, b_second_blocks
%endif
%undef mm_tmp0
%undef mm_tmp1
%undef mm_tmp2
%undef mm_mad
%undef mm_sumcur
%undef mm_sumref
%undef mm_sad
%undef b_second_blocks
%endmacro
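
; What one invocation of the macro above accumulates, in C-like form (a sketch
; for exposition only; the per-line work is done by the AVX2_SadSdMad macro
; defined earlier in this file):
;   for (y = 0; y < 8; y++)
;       for (x = 0; x < W; x++) {             // W = 32 (ymm) or 16 (xmm) px
;           int d   = cur[y][x] - ref[y][x];
;           sad    += abs(d);                 // stored to p_sad8x8
;           sumcur += cur[y][x];              // sumcur - sumref -> p_sd8x8
;           sumref += ref[y][x];
;           mad     = max(mad, abs(d));       // stored to p_mad8x8
;       }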

; Store the remaining MAD accumulator for widths where (iPicWidth & 32) != 0.
; width/xcnt_unit=%1 mm_tmp=%2,%3 b_second_blocks=%4
%macro AVX2_StoreRemainingSingleMad 4
    test           %1, 32 / xcnt_unit
    jz             %%skip
    mov            r_tmp, p_mad8x8
    vmovdqa        y%2, prev_mad
    AVX2_Maxubq    y%2, y%2, y%3
    AVX2_Store8x8Accb r_tmp + i_xcnt - 8, y, %2, %3, %4
%%skip:
%endmacro
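
; Why a single MAD row can remain: the ymm path above pairs MAD stores across
; two consecutive 32-px iterations (even/odd). With an odd number of 32-px
; iterations, e.g. iPicWidth == 96 (three ymm iterations), the last MAD
; accumulator has no partner and is flushed here after the width loop ends.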

;*************************************************************************************************************
;void VAACalcSadBgd_avx2(const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
;                        int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *p_sd8x8, uint8_t *p_mad8x8)
;*************************************************************************************************************
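;
; Reference semantics in C-like form (an illustrative sketch; the C fallback in
; this project is authoritative). Outputs are grouped as four 8x8 entries per
; 16x16 macroblock, macroblocks in raster order. For each 8x8 block:
;   int d = cur[i] - ref[i];                  // over the 64 pixels
;   psad8x8[4 * mb + k]  = sum of abs(d);
;   p_sd8x8[4 * mb + k]  = sum of cur[i] - sum of ref[i];
;   p_mad8x8[4 * mb + k] = max of abs(d);
;   *psadframe           = sum of all psad8x8 entries;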

WELS_EXTERN VAACalcSadBgd_avx2
%define          p_sadframe                    arg6
%define          p_sad8x8                      arg7
%define          p_sd8x8                       arg8
%define          p_mad8x8                      arg9
%ifdef X86_32
%define          saveregs                      r5, r6
%else
%define          saveregs                      rbx, rbp, r12, r13
%endif

%assign push_num 0
    LOAD_5_PARA
    PUSH_XMM 10
    SIGN_EXTENSION r2, r2d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r4, r4d
    PUSHM          saveregs

%define mm_zero mm0
    vpxor          x %+ mm_zero, x %+ mm_zero, x %+ mm_zero

%ifdef X86_32
    STACK_ALLOC    r5, 2 * ymm_width, ymm_width
    %define sadframe_acc_addr r5
    %define sadframe_acc [sadframe_acc_addr]
    %define prev_mad [r5 + ymm_width]
%else
    %define sadframe_acc ymm8
    %define xsadframe_acc xmm8
    %define prev_mad ymm9
%endif
    vmovdqa        sadframe_acc, y %+ mm_zero

    and            r2, -16                     ; iPicWidth &= -16
    jle            .done                       ; bail if iPicWidth < 16
    sar            r3, 4                       ; iPicHeight / 16
    jle            .done                       ; bail if iPicHeight < 16
    shr            r2, 2                       ; iPicWidth / 4

%define p_cur     r0
%define p_ref     r1
%define i_xcnt    r2
%define i_ycnt    ptrword arg4
%define i_stride  r4
%define r_tmp     r6
%define xcnt_unit 4
%ifdef X86_32
    mov            i_ycnt, r3
    %define i_stride3 r3
%else
    mov            rbp, p_sad8x8
    mov            r12, p_sd8x8
    mov            r13, p_mad8x8
    %undef  p_sad8x8
    %undef  p_sd8x8
    %undef  p_mad8x8
    %define p_sad8x8 rbp
    %define p_sd8x8 r12
    %define p_mad8x8 r13
    %define i_stride3 rbx
%endif
    lea            i_stride3, [3 * i_stride]

    ; offset pointers so as to compensate for the i_xcnt offset below.
    mov            r_tmp, i_xcnt
    and            r_tmp, 64 / xcnt_unit - 1
    sub            p_mad8x8, r_tmp
    shl            r_tmp, 2
    sub            p_sad8x8, r_tmp
    sub            p_sd8x8, r_tmp
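    ; Worked example of the masking above (for exposition): for iPicWidth ==
    ; 48, i_xcnt == 12, so r_tmp == 12 & 15 == 12; p_mad8x8 is pulled back 12
    ; bytes and p_sad8x8/p_sd8x8 by 48 bytes, so the end-of-line indexing in
    ; the loops below lands on the right elements when the width is not a
    ; multiple of 64 px.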

.height_loop:
    push           i_xcnt
%assign push_num push_num + 1
%define i_xcnt_load ptrword [r7]
    ; use end-of-line pointers so as to enable use of a negative counter as index.
    lea            r_tmp, [xcnt_unit * i_xcnt]
    add            p_sad8x8, r_tmp
    add            p_sd8x8, r_tmp
    add            p_mad8x8, i_xcnt
    and            i_xcnt, -(64 / xcnt_unit)
    jz             .width_loop_upper8_64x_end
    ; use a negative loop counter to enable counting toward zero and indexing with the same counter.
    neg            i_xcnt
.width_loop_upper8:
    AVX2_CalcSadBgd_8Lines y, mm1, mm2, mm3, mm4, mm5, mm6, mm7, 0
    add            i_xcnt, 32 / xcnt_unit
    jl             .width_loop_upper8
    jg             .width_loop_upper8_32x_end
.width_loop_upper8_64x_end:
    test           i_xcnt_load, 32 / xcnt_unit
    jnz            .width_loop_upper8
.width_loop_upper8_32x_end:
    AVX2_StoreRemainingSingleMad i_xcnt_load, mm1, mm2, 0
    test           i_xcnt_load, 16 / xcnt_unit
    jz             .width_loop_upper8_end
    ; remaining 16.
    AVX2_CalcSadBgd_8Lines x, mm1, mm2, mm3, mm4, mm5, mm6, mm7, 0
.width_loop_upper8_end:
    lea            p_cur, [p_cur + 8 * i_stride]
    lea            p_ref, [p_ref + 8 * i_stride]
    mov            i_xcnt, i_xcnt_load
    lea            r_tmp, [xcnt_unit * i_xcnt]
    sub            p_cur, r_tmp
    sub            p_ref, r_tmp
    and            i_xcnt, -(64 / xcnt_unit)
    jz             .width_loop_lower8_64x_end
    neg            i_xcnt
.width_loop_lower8:
    AVX2_CalcSadBgd_8Lines y, mm1, mm2, mm3, mm4, mm5, mm6, mm7, 1
    add            i_xcnt, 32 / xcnt_unit
    jl             .width_loop_lower8
    jg             .width_loop_lower8_32x_end
.width_loop_lower8_64x_end:
    test           i_xcnt_load, 32 / xcnt_unit
    jnz            .width_loop_lower8
.width_loop_lower8_32x_end:
    AVX2_StoreRemainingSingleMad i_xcnt_load, mm1, mm2, 1
    test           i_xcnt_load, 16 / xcnt_unit
    jz             .width_loop_lower8_end
    ; remaining 16.
    AVX2_CalcSadBgd_8Lines x, mm1, mm2, mm3, mm4, mm5, mm6, mm7, 1
.width_loop_lower8_end:
    lea            p_cur, [p_cur + 8 * i_stride]
    lea            p_ref, [p_ref + 8 * i_stride]
    pop            i_xcnt
%undef i_xcnt_load
%assign push_num push_num - 1
    lea            r_tmp, [xcnt_unit * i_xcnt]
    sub            p_cur, r_tmp
    sub            p_ref, r_tmp
    sub            i_ycnt, 1
    jnz            .height_loop

.done:
    mov            r_tmp, p_sadframe
%ifdef X86_32
    vmovdqa        xmm2, sadframe_acc
    vpaddd         xmm2, xmm2, [sadframe_acc_addr + xmm_width]
%else
    vextracti128   xmm2, sadframe_acc, 1
    vpaddd         xmm2, xsadframe_acc, xmm2
%endif
    vpunpckhqdq    xmm1, xmm2, xmm2
    vpaddd         xmm2, xmm2, xmm1
    vmovd          [r_tmp], xmm2
    vzeroupper
%ifdef X86_32
    STACK_DEALLOC
%endif
    POPM           saveregs
    POP_XMM
    LOAD_5_PARA_POP
%undef           p_cur
%undef           p_ref
%undef           i_xcnt
%undef           i_ycnt
%undef           i_stride
%undef           i_stride3
%undef           r_tmp
%undef           xcnt_unit
%undef           sadframe_acc
%undef           sadframe_acc_addr
%undef           xsadframe_acc
%undef           prev_mad
%undef           mm_zero
%undef           saveregs
%undef           p_sadframe
%undef           p_sad8x8
%undef           p_sd8x8
%undef           p_mad8x8
    ret
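
; Example buffer sizing for a call (a C-like sketch with invented names;
; assumes dimensions are multiples of 16):
;   int mbw = iPicWidth / 16, mbh = iPicHeight / 16;
;   int32_t *sad8x8 = malloc(mbw * mbh * 4 * sizeof(int32_t)); // 4 per MB
;   int32_t *sd8x8  = malloc(mbw * mbh * 4 * sizeof(int32_t));
;   uint8_t *mad8x8 = malloc(mbw * mbh * 4);
;   int32_t sadframe;
;   VAACalcSadBgd_avx2(cur, ref, iPicWidth, iPicHeight, iPicStride,
;                      &sadframe, sad8x8, sd8x8, mad8x8);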


; x/y-mm_prefix=%1 mm_clobber=%2,%3,%4,%5,%6,%7,%8,%9,%10 b_second_blocks=%11
%macro AVX2_CalcSadSsdBgd_8Lines 11
%define mm_tmp0    %2
%define mm_tmp1    %3
%define mm_sad     %4
%define mm_sum     %5
%define mm_sumref  %6
%define mm_mad     %7
%define mm_sqsum   %8
%define mm_sqdiff  %9
%ifidn %10, 0
%define tmp2       0
%else
%define tmp2       %1 %+ %10
%endif
%define b_second_blocks %11
    ; Unroll for better performance on Haswell.
    ; Avoid unrolling for the 16 px case so as to reduce the code footprint.
%ifidni %1, y
    lea            r_tmp, [5 * i_stride]
    AVX2_SadBgdSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sumref, %1 %+ mm_mad, %1 %+ mm_sqdiff, %1 %+ mm_sqsum, p_cur,                 p_ref,                 %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, tmp2, 0
    AVX2_SadBgdSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sumref, %1 %+ mm_mad, %1 %+ mm_sqdiff, %1 %+ mm_sqsum, p_cur + 1 * i_stride,  p_ref + 1 * i_stride,  %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, tmp2, 1
    AVX2_SadBgdSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sumref, %1 %+ mm_mad, %1 %+ mm_sqdiff, %1 %+ mm_sqsum, p_cur + 2 * i_stride,  p_ref + 2 * i_stride,  %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, tmp2, 1
    AVX2_SadBgdSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sumref, %1 %+ mm_mad, %1 %+ mm_sqdiff, %1 %+ mm_sqsum, p_cur + 1 * i_stride3, p_ref + 1 * i_stride3, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, tmp2, 1
    AVX2_SadBgdSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sumref, %1 %+ mm_mad, %1 %+ mm_sqdiff, %1 %+ mm_sqsum, p_cur + 4 * i_stride,  p_ref + 4 * i_stride,  %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, tmp2, 1
    AVX2_SadBgdSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sumref, %1 %+ mm_mad, %1 %+ mm_sqdiff, %1 %+ mm_sqsum, p_cur + r_tmp,         p_ref + r_tmp,         %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, tmp2, 1
    lea            r_tmp, [i_stride + 2 * i_stride3]
    AVX2_SadBgdSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sumref, %1 %+ mm_mad, %1 %+ mm_sqdiff, %1 %+ mm_sqsum, p_cur + 2 * i_stride3, p_ref + 2 * i_stride3, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, tmp2, 1
    AVX2_SadBgdSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sumref, %1 %+ mm_mad, %1 %+ mm_sqdiff, %1 %+ mm_sqsum, p_cur + r_tmp,         p_ref + r_tmp,         %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, tmp2, 1
    ; Increment addresses for the next iteration. Doing this early is beneficial on Haswell.
    add            p_cur, %1 %+ mm_width
    add            p_ref, %1 %+ mm_width
%else
    vpxor          x %+ mm_sad, x %+ mm_sad, x %+ mm_sad
    vpxor          x %+ mm_sum, x %+ mm_sum, x %+ mm_sum
    vpxor          x %+ mm_sumref, x %+ mm_sumref, x %+ mm_sumref
    vpxor          x %+ mm_mad, x %+ mm_mad, x %+ mm_mad
    vpxor          x %+ mm_sqsum, x %+ mm_sqsum, x %+ mm_sqsum
    vpxor          x %+ mm_sqdiff, x %+ mm_sqdiff, x %+ mm_sqdiff
    lea            r_tmp, [8 * i_stride]
    add            p_cur, r_tmp
    add            p_ref, r_tmp
    neg            r_tmp
%%loop:
    AVX2_SadBgdSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sumref, %1 %+ mm_mad, %1 %+ mm_sqdiff, %1 %+ mm_sqsum, p_cur + r_tmp, p_ref + r_tmp, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, tmp2, 1
    add            r_tmp, i_stride
    jl             %%loop
    ; Increment addresses for the next iteration. Doing this early is beneficial on Haswell.
    lea            r_tmp, [8 * i_stride - %1 %+ mm_width]
    sub            p_cur, r_tmp
    sub            p_ref, r_tmp
%endif
    mov            r_tmp, p_sad8x8
    AVX2_Store8x8Accdw r_tmp + 4 * i_xcnt, %1, mm_sad, mm_tmp1, b_second_blocks
%ifdef X86_32
    vpaddd         y %+ mm_tmp1, y %+ mm_sad, sadframe_acc
    vmovdqa        sadframe_acc, y %+ mm_tmp1
%else
    vpaddd         sadframe_acc, sadframe_acc, y %+ mm_sad
%endif
    mov            r_tmp, i_xcnt
    add            r_tmp, p_sum16x16
    vpunpckhqdq    %1 %+ mm_tmp1, %1 %+ mm_sum, %1 %+ mm_sum
    vpaddd         %1 %+ mm_tmp0, %1 %+ mm_sum, %1 %+ mm_tmp1
    AVX2_Store16x16Accdw r_tmp, %1, mm_tmp0, mm_tmp1, b_second_blocks
    mov            r_tmp, p_sd8x8
    vpsubd         %1 %+ mm_sum,  %1 %+ mm_sum, %1 %+ mm_sumref
    AVX2_Store8x8Accdw r_tmp + 4 * i_xcnt, %1, mm_sum, mm_tmp0, b_second_blocks
    ; Coalesce store and horizontal reduction of MAD accumulator for even and
    ; odd iterations so as to enable more parallelism.
%ifidni %1, y
    test           i_xcnt, 32 / xcnt_unit
    jz             %%preserve_mad
    mov            r_tmp, p_mad8x8
    AVX2_Maxubq2   y %+ mm_mad, y %+ mm_mad, prev_mad, y %+ mm_tmp0
    AVX2_Store2x8x8Accb r_tmp + i_xcnt - 8, mm_mad, mm_tmp0, mm_tmp1, b_second_blocks
%%preserve_mad:
    vmovdqa        prev_mad, y %+ mm_mad
%else
    mov            r_tmp, p_mad8x8
    AVX2_Maxubq    %1 %+ mm_mad, %1 %+ mm_mad, %1 %+ mm_tmp0
    AVX2_Store8x8Accb r_tmp + i_xcnt, %1, mm_mad, mm_tmp0, b_second_blocks
%endif
    vpunpcklqdq    %1 %+ mm_tmp0, %1 %+ mm_sqsum, %1 %+ mm_sqdiff
    vpunpckhqdq    %1 %+ mm_tmp1, %1 %+ mm_sqsum, %1 %+ mm_sqdiff
    vpaddd         %1 %+ mm_tmp0, %1 %+ mm_tmp0,  %1 %+ mm_tmp1
    vpshufd        %1 %+ mm_tmp1, %1 %+ mm_tmp0,  10110001b
    vpaddd         %1 %+ mm_tmp0, %1 %+ mm_tmp0,  %1 %+ mm_tmp1
    AVX2_Store2x16x16Accdw p_sqsum16x16, p_sqdiff16x16, i_xcnt, r_tmp, %1, mm_tmp0, mm_tmp1, b_second_blocks
%undef mm_tmp0
%undef mm_tmp1
%undef mm_sqsum
%undef mm_sqdiff
%undef mm_mad
%undef mm_sum
%undef mm_sumref
%undef mm_sad
%undef tmp2
%undef b_second_blocks
%endmacro
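
; In addition to SAD/SD/MAD, this variant accumulates per-16x16 sums. C-like
; sketch (for exposition only):
;   psum16x16[mb]    = sum of cur[i];
;   psqsum16x16[mb]  = sum of cur[i] * cur[i];
;   psqdiff16x16[mb] = sum of (cur[i] - ref[i]) * (cur[i] - ref[i]);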

;*************************************************************************************************************
;void VAACalcSadSsdBgd_avx2(const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
;                int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16,
;                       int32_t *psqdiff16x16, int32_t *p_sd8x8, uint8_t *p_mad8x8)
;*************************************************************************************************************
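;
; Indexing sketch (illustrative C; mb = mby * (iPicWidth / 16) + mbx):
;   psad8x8[4 * mb + k]   // k: 0 = top-left, 1 = top-right,
;   p_sd8x8[4 * mb + k]   //    2 = bottom-left, 3 = bottom-right
;   p_mad8x8[4 * mb + k]
;   psum16x16[mb], psqsum16x16[mb], psqdiff16x16[mb]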

WELS_EXTERN VAACalcSadSsdBgd_avx2
%define         p_sadframe                      arg6
%define         p_sad8x8                        arg7
%define         p_sum16x16                      arg8
%define         p_sqsum16x16                    arg9
%define         p_sqdiff16x16                   arg10
%define         p_sd8x8                         arg11
%define         p_mad8x8                        arg12
%ifdef X86_32
%define         saveregs                        r5, r6
%else
%define         saveregs                        rbx, rbp, r12, r13, r14, r15
%endif

%assign push_num 0
    LOAD_5_PARA
    PUSH_XMM 12
    SIGN_EXTENSION r2, r2d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r4, r4d
    PUSHM          saveregs

%ifdef X86_32
    STACK_ALLOC    r5, 3 * ymm_width, ymm_width
    %define mm8 0
    %define sadframe_acc_addr r5
    %define sadframe_acc [sadframe_acc_addr]
    %define prev_mad [r5 + ymm_width]
    %define ymm_zero [r5 + 2 * ymm_width]
    %define xmm_zero ymm_zero
    vpxor          xmm0, xmm0, xmm0
    vmovdqa        sadframe_acc, ymm0
    vmovdqa        ymm_zero, ymm0
%else
    %define sadframe_acc ymm9
    %define xsadframe_acc xmm9
    %define prev_mad ymm10
    %define ymm_zero ymm11
    %define xmm_zero xmm11
    vpxor          xmm_zero, xmm_zero, xmm_zero
    vpxor          xsadframe_acc, xsadframe_acc, xsadframe_acc
%endif

    and            r2, -16                     ; iPicWidth &= -16
    jle            .done                       ; bail if iPicWidth < 16
    sar            r3, 4                       ; iPicHeight / 16
    jle            .done                       ; bail if iPicHeight < 16
    shr            r2, 2                       ; iPicWidth / 4

%define p_cur     r0
%define p_ref     r1
%define i_xcnt    r2
%define i_ycnt    ptrword arg4
%define i_stride  r4
%define r_tmp     r6
%define xcnt_unit 4
%ifdef X86_32
    mov            i_ycnt, r3
    %define i_stride3 r3
%else
    mov            rbp, p_sad8x8
    mov            r12, p_sum16x16
    mov            r13, p_sqsum16x16
    mov            r14, p_sqdiff16x16
    mov            r15, p_sd8x8
    %undef p_sad8x8
    %undef p_sum16x16
    %undef p_sqsum16x16
    %undef p_sqdiff16x16
    %undef p_sd8x8
    %define p_sad8x8 rbp
    %define p_sum16x16 r12
    %define p_sqsum16x16 r13
    %define p_sqdiff16x16 r14
    %define p_sd8x8 r15
    %define i_stride3 rbx
%endif
    lea            i_stride3, [3 * i_stride]

    ; offset pointers so as to compensate for the i_xcnt offset below.
    mov            r_tmp, i_xcnt
    and            r_tmp, 64 / xcnt_unit - 1
    sub            p_sum16x16, r_tmp
    sub            p_sqsum16x16, r_tmp
    sub            p_sqdiff16x16, r_tmp
    sub            p_mad8x8, r_tmp
    shl            r_tmp, 2
    sub            p_sad8x8, r_tmp
    sub            p_sd8x8, r_tmp

.height_loop:
    push           i_xcnt
%assign push_num push_num + 1
%define i_xcnt_load ptrword [r7]
    ; use end-of-line pointers so as to enable use of a negative counter as index.
    lea            r_tmp, [xcnt_unit * i_xcnt]
    add            p_sad8x8, r_tmp
    add            p_sum16x16, i_xcnt
    add            p_sqsum16x16, i_xcnt
    add            p_sqdiff16x16, i_xcnt
    add            p_sd8x8, r_tmp
    add            p_mad8x8, i_xcnt
    and            i_xcnt, -(64 / xcnt_unit)
    jz             .width_loop_upper8_64x_end
    ; use a negative loop counter to enable counting toward zero and indexing with the same counter.
    neg            i_xcnt
.width_loop_upper8:
    AVX2_CalcSadSsdBgd_8Lines y, mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7, mm8, 0
    add            i_xcnt, 32 / xcnt_unit
    jl             .width_loop_upper8
    jg             .width_loop_upper8_32x_end
.width_loop_upper8_64x_end:
    test           i_xcnt_load, 32 / xcnt_unit
    jnz            .width_loop_upper8
.width_loop_upper8_32x_end:
    AVX2_StoreRemainingSingleMad i_xcnt_load, mm1, mm2, 0
    test           i_xcnt_load, 16 / xcnt_unit
    jz             .width_loop_upper8_end
    ; remaining 16.
    AVX2_CalcSadSsdBgd_8Lines x, mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7, mm8, 0
.width_loop_upper8_end:
    lea            p_cur, [p_cur + 8 * i_stride]
    lea            p_ref, [p_ref + 8 * i_stride]
    mov            i_xcnt, i_xcnt_load
    lea            r_tmp, [xcnt_unit * i_xcnt]
    sub            p_cur, r_tmp
    sub            p_ref, r_tmp
    and            i_xcnt, -(64 / xcnt_unit)
    jz             .width_loop_lower8_64x_end
    neg            i_xcnt
.width_loop_lower8:
    AVX2_CalcSadSsdBgd_8Lines y, mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7, mm8, 1
    add            i_xcnt, 32 / xcnt_unit
    jl             .width_loop_lower8
    jg             .width_loop_lower8_32x_end
.width_loop_lower8_64x_end:
    test           i_xcnt_load, 32 / xcnt_unit
    jnz            .width_loop_lower8
.width_loop_lower8_32x_end:
    AVX2_StoreRemainingSingleMad i_xcnt_load, mm1, mm2, 1
    test           i_xcnt_load, 16 / xcnt_unit
    jz             .width_loop_lower8_end
    ; remaining 16.
    AVX2_CalcSadSsdBgd_8Lines x, mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7, mm8, 1
.width_loop_lower8_end:
    lea            p_cur, [p_cur + 8 * i_stride]
    lea            p_ref, [p_ref + 8 * i_stride]
    pop            i_xcnt
%undef i_xcnt_load
%assign push_num push_num - 1
    lea            r_tmp, [xcnt_unit * i_xcnt]
    sub            p_cur, r_tmp
    sub            p_ref, r_tmp
    sub            i_ycnt, 1
    jnz            .height_loop

.done:
    mov            r_tmp, p_sadframe
%ifdef X86_32
    vmovdqa        xmm2, sadframe_acc
    vpaddd         xmm2, xmm2, [sadframe_acc_addr + xmm_width]
%else
    vextracti128   xmm2, sadframe_acc, 1
    vpaddd         xmm2, xsadframe_acc, xmm2
%endif
    vpunpckhqdq    xmm1, xmm2, xmm2
    vpaddd         xmm2, xmm2, xmm1
    vmovd          [r_tmp], xmm2
    vzeroupper
%ifdef X86_32
    STACK_DEALLOC
%endif
    POPM           saveregs
    POP_XMM
    LOAD_5_PARA_POP
%undef           p_cur
%undef           p_ref
%undef           i_xcnt
%undef           i_ycnt
%undef           i_stride
%undef           i_stride3
%undef           r_tmp
%undef           xcnt_unit
%undef           mm8
%undef           sadframe_acc
%undef           sadframe_acc_addr
%undef           xsadframe_acc
%undef           prev_mad
%undef           ymm_zero
%undef           xmm_zero
%undef           saveregs
%undef           p_sadframe
%undef           p_sad8x8
%undef           p_sum16x16
%undef           p_sqsum16x16
%undef           p_sqdiff16x16
%undef           p_sd8x8
%undef           p_mad8x8
    ret

%endif