• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1;*!
2;* \copy
3;*     Copyright (c)  2009-2013, Cisco Systems
4;*     All rights reserved.
5;*
6;*     Redistribution and use in source and binary forms, with or without
7;*     modification, are permitted provided that the following conditions
8;*     are met:
9;*
10;*        * Redistributions of source code must retain the above copyright
11;*          notice, this list of conditions and the following disclaimer.
12;*
13;*        * Redistributions in binary form must reproduce the above copyright
14;*          notice, this list of conditions and the following disclaimer in
15;*          the documentation and/or other materials provided with the
16;*          distribution.
17;*
18;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
21;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
22;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
23;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
24;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
28;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29;*     POSSIBILITY OF SUCH DAMAGE.
30;*
31;*
32;*  satd_sad.asm
33;*
34;*  Abstract
35;*      WelsSampleSatd4x4_sse2
36;*      WelsSampleSatd8x8_sse2
37;*      WelsSampleSatd16x8_sse2
38;*      WelsSampleSatd8x16_sse2
39;*      WelsSampleSatd16x16_sse2
40;*
41;*      WelsSampleSad16x8_sse2
42;*      WelsSampleSad16x16_sse2
43;*
44;*  History
45;*      8/5/2009 Created
46;*     24/9/2009 modified
47;*
48;*
49;*************************************************************************/
50
51%include "asm_inc.asm"
52
53;***********************************************************************
54; Data
55;***********************************************************************
; Constant tables. They are emitted into .text when X86_32_PICASM is defined and
; into .rodata otherwise; every table is 16-byte aligned so it can be read with movdqa.
56%ifdef X86_32_PICASM
57SECTION .text align=16
58%else
59SECTION .rodata align=16
60%endif
61
62align 16
; 16 signed bytes: eight +1 followed by four (+1,-1) pairs. Loaded into xmm5 and used
; as the pmaddubsw multiplier by the SSE41_*Get8WSumSub macros.
63HSumSubDB1:   db 1,1,1,1,1,1,1,1,1,-1,1,-1,1,-1,1,-1
64align 16
; 8 words of alternating +1/-1. Loaded into xmm6 and used as a pmaddwd multiplier.
65HSumSubDW1:   dw 1,-1,1,-1,1,-1,1,-1
66align 16
; 8 words of +1. Loaded into xmm7 and used as a pmaddwd multiplier (pairwise word sums).
67PDW1:  dw 1,1,1,1,1,1,1,1
68align 16
; Value 2 in the low word of each 64-bit half; added as the rounding term of the
; "(sum+2)>>2" chroma DC computation.
69PDQ2:  dw 2,0,0,0,2,0,0,0
70align 16
; Two copies of the byte pattern 1,1,1,1,1,-1,1,-1. Not referenced in the part of the
; file visible here.
71HSwapSumSubDB1:   times 2 db 1, 1, 1, 1, 1, -1, 1, -1
72
73;***********************************************************************
74; Code
75;***********************************************************************
76SECTION .text
77
78;***********************************************************************
79;
80;Pixel_satd_wxh_sse2 BEGIN
81;
82;***********************************************************************
; MMX_DW_1_2REG dst, tmp
; Sets every 16-bit lane of dst to 1. tmp is clobbered (left as all ones).
83%macro MMX_DW_1_2REG 2
84    pxor %1, %1                 ; dst = 0
85    pcmpeqw %2, %2              ; tmp = 0xFFFF (-1) in every word
86    psubw %1, %2                ; dst = 0 - (-1) = 1 in every word
87%endmacro
88
; SSE2_SumWHorizon1 acc, tmp
; Horizontal sum of the eight unsigned words of acc using saturating adds.
; The total ends up in the low word of acc; the remaining words hold partial sums,
; which is why callers mask the result with 0xffff. tmp is clobbered.
89%macro SSE2_SumWHorizon1 2
90    movdqa      %2, %1
91    psrldq      %2, 8           ; bring words 4..7 down onto words 0..3
92    paddusw     %1, %2
93    movdqa      %2, %1
94    psrldq      %2, 4           ; bring words 2..3 down onto words 0..1
95    paddusw     %1, %2
96    movdqa      %2, %1
97    psrldq      %2, 2           ; bring word 1 down onto word 0
98    paddusw     %1, %2
99%endmacro
100
; SSE2_HDMTwo4x4 a, b, c, d, tmp
; Applies four SSE2_SumSub steps to the register pairs (a,b), (c,d), (b,d), (a,c),
; with tmp passed as the fifth operand of each step.
; NOTE(review): SSE2_SumSub is defined outside this file; the output register order
; quoted on the macro line below relies on its semantics - confirm in asm_inc.asm.
101%macro SSE2_HDMTwo4x4 5 ;in: xmm1,xmm2,xmm3,xmm4  pOut: xmm4,xmm2,xmm1,xmm3
102    SSE2_SumSub %1, %2, %5
103    SSE2_SumSub %3, %4, %5
104    SSE2_SumSub %2, %4, %5
105    SSE2_SumSub %1, %3, %5
106%endmacro
107
; SSE2_SumAbs4 v0, v1, tmpA, v2, v3, tmpB, acc
; Runs WELS_AbsW on v0..v3 (tmpA / tmpB are the second operand of WELS_AbsW,
; presumably scratch - WELS_AbsW is defined outside this file) and then adds the four
; registers into acc with unsigned saturating word adds. v0 and v2 are overwritten
; with the pair sums.
108%macro SSE2_SumAbs4 7
109    WELS_AbsW %1, %3
110    WELS_AbsW %2, %3
111    WELS_AbsW %4, %6
112    WELS_AbsW %5, %6
113    paddusw       %1, %2        ; v0 += v1
114    paddusw       %4, %5        ; v2 += v3
115    paddusw       %7, %1        ; acc += v0 + v1
116    paddusw       %7, %4        ; acc += v2 + v3
117%endmacro
118
; SSE2_SumWHorizon acc, tmp, zero
; Sums the eight words d0..d7 of acc into the low dword of acc, widening to 32 bits
; on the way so the total cannot wrap at 16 bits.
; zero must hold 0: it is interleaved with the word sums to zero-extend them.
; tmp is clobbered; the upper dwords of acc are left undefined.
119%macro SSE2_SumWHorizon 3
120    movhlps     %2, %1          ; x2 = xx xx xx xx d7 d6 d5 d4
121    paddw       %1, %2          ; x1 = xx xx xx xx d37 d26 d15 d04
122    punpcklwd   %1, %3          ; x1 =  d37  d26 d15 d04
123    movhlps     %2, %1          ; x2 = xxxx xxxx d37 d26
124    paddd       %1, %2          ; x1 = xxxx xxxx d1357 d0246
125    pshuflw     %2, %1, 0x4e    ; x2 = xxxx xxxx d0246 d1357
126    paddd       %1, %2          ; x1 = xxxx xxxx xxxx  d01234567
127%endmacro
128
; SSE2_GetSatd8x8
; Accumulates into xmm6 the absolute transformed differences of one 8-wide, 8-row
; block, processed as two groups of four rows.
; In:  r0/r1 = first block pointer / stride, r2/r3 = second block pointer / stride
;      xmm6  = running accumulator (callers clear it before the first use)
;      xmm7  = passed as third operand to SSE2_LoadDiff8P; callers clear it beforehand
;              (presumably the zero register for unpacking - macro defined elsewhere)
; Out: xmm6 updated; r0 and r2 are advanced by 6 rows (three "lea +2 rows"), so a
;      caller that continues downwards adds 2 more rows itself.
; Clobbers xmm0-xmm5.
129%macro SSE2_GetSatd8x8 0
    ; rows 0..3: load the per-pixel differences of four rows
130    SSE2_LoadDiff8P    xmm0,xmm4,xmm7,[r0],[r2]
131    SSE2_LoadDiff8P    xmm1,xmm5,xmm7,[r0+r1],[r2+r3]
132    lea                 r0, [r0+2*r1]
133    lea                 r2, [r2+2*r3]
134    SSE2_LoadDiff8P    xmm2,xmm4,xmm7,[r0],[r2]
135    SSE2_LoadDiff8P    xmm3,xmm5,xmm7,[r0+r1],[r2+r3]
136
    ; butterfly pass, transpose, second butterfly pass, then accumulate |coeff| into xmm6
137    SSE2_HDMTwo4x4       xmm0,xmm1,xmm2,xmm3,xmm4
138    SSE2_TransTwo4x4W     xmm3,xmm1,xmm0,xmm2,xmm4
139    SSE2_HDMTwo4x4       xmm3,xmm1,xmm2,xmm4,xmm5
140    SSE2_SumAbs4         xmm4,xmm1,xmm0,xmm2,xmm3,xmm5,xmm6
141
    ; rows 4..7: same sequence on the next four rows
142    lea                 r0,    [r0+2*r1]
143    lea                 r2,    [r2+2*r3]
144    SSE2_LoadDiff8P    xmm0,xmm4,xmm7,[r0],[r2]
145    SSE2_LoadDiff8P    xmm1,xmm5,xmm7,[r0+r1],[r2+r3]
146    lea                 r0, [r0+2*r1]
147    lea                 r2, [r2+2*r3]
148    SSE2_LoadDiff8P    xmm2,xmm4,xmm7,[r0],[r2]
149    SSE2_LoadDiff8P    xmm3,xmm5,xmm7,[r0+r1],[r2+r3]
150
151    SSE2_HDMTwo4x4       xmm0,xmm1,xmm2,xmm3,xmm4
152    SSE2_TransTwo4x4W     xmm3,xmm1,xmm0,xmm2,xmm4
153    SSE2_HDMTwo4x4       xmm3,xmm1,xmm2,xmm4,xmm5
154    SSE2_SumAbs4         xmm4,xmm1,xmm0,xmm2,xmm3,xmm5,xmm6
155%endmacro
156
157;***********************************************************************
158;
159;int32_t WelsSampleSatd4x4_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
160;
161;***********************************************************************
; WelsSampleSatd4x4_sse2: SATD of one 4x4 block pair.
; After LOAD_4_PARA the code uses r0/r1 as first pointer/stride and r2/r3 as second
; pointer/stride (the argument macros are defined outside this file).
; Returns in retrd: (sum of absolute transformed differences, truncated to 16 bits) >> 1.
; Uses xmm0-xmm7, bracketed by PUSH_XMM 8 / POP_XMM.
162WELS_EXTERN WelsSampleSatd4x4_sse2
163    %assign  push_num 0
164    LOAD_4_PARA
165    PUSH_XMM 8
166    SIGN_EXTENSION r1, r1d
167    SIGN_EXTENSION r3, r3d
    ; first block: rows 0,2 packed into xmm0 and rows 1,3 into xmm1 (4 bytes per row)
168    movd      xmm0, [r0]
169    movd      xmm1, [r0+r1]
170    lea       r0 , [r0+2*r1]
171    movd      xmm2, [r0]
172    movd      xmm3, [r0+r1]
173    punpckldq xmm0, xmm2
174    punpckldq xmm1, xmm3
175
    ; second block: rows 0,2 packed into xmm4 and rows 1,3 into xmm5
176    movd      xmm4, [r2]
177    movd      xmm5, [r2+r3]
178    lea       r2 , [r2+2*r3]
179    movd      xmm6, [r2]
180    movd      xmm7, [r2+r3]
181    punpckldq xmm4, xmm6
182    punpckldq xmm5, xmm7
183
    ; widen bytes to words; xmm6 stays zero and later serves as the accumulator
184    pxor      xmm6, xmm6
185    punpcklbw xmm0, xmm6
186    punpcklbw xmm1, xmm6
187    punpcklbw xmm4, xmm6
188    punpcklbw xmm5, xmm6
189
    ; per-pixel differences
190    psubw     xmm0, xmm4
191    psubw     xmm1, xmm5
192
    ; sum/difference stages interleaved with SSE2_XSawp shuffles (macro defined elsewhere)
193    movdqa    xmm2, xmm0
194    paddw     xmm0, xmm1
195    psubw     xmm2, xmm1
196    SSE2_XSawp qdq, xmm0, xmm2, xmm3
197
198    movdqa     xmm4, xmm0
199    paddw      xmm0, xmm3
200    psubw      xmm4, xmm3
201
202    movdqa         xmm2, xmm0
203    punpcklwd      xmm0, xmm4
204    punpckhwd      xmm4, xmm2
205
206    SSE2_XSawp     dq,  xmm0, xmm4, xmm3
207    SSE2_XSawp     qdq, xmm0, xmm3, xmm5
208
209    movdqa         xmm7, xmm0
210    paddw          xmm0, xmm5
211    psubw          xmm7, xmm5
212
213    SSE2_XSawp     qdq,  xmm0, xmm7, xmm1
214
215    movdqa         xmm2, xmm0
216    paddw          xmm0, xmm1
217    psubw          xmm2, xmm1
218
    ; accumulate absolute values of the 16 coefficients (held in xmm0 and xmm2) into xmm6
219    WELS_AbsW  xmm0, xmm3
220    paddusw        xmm6, xmm0
221    WELS_AbsW  xmm2, xmm4
222    paddusw        xmm6, xmm2
223    SSE2_SumWHorizon1  xmm6, xmm4
224    movd           retrd,  xmm6
225    and            retrd,  0xffff   ; only the low word holds the complete sum
226    shr            retrd,  1        ; halve the sum
227    POP_XMM
228    LOAD_4_PARA_POP
229    ret
230
231 ;***********************************************************************
232 ;
233 ;int32_t WelsSampleSatd8x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
234 ;
235 ;***********************************************************************
; WelsSampleSatd8x8_sse2: SATD of one 8x8 block pair.
; r0/r1 and r2/r3 are used as pointer/stride pairs by SSE2_GetSatd8x8.
; Returns in retrd the sum of the eight word accumulators of xmm6, each halved first.
236WELS_EXTERN WelsSampleSatd8x8_sse2
237    %assign  push_num 0
238    LOAD_4_PARA
239    PUSH_XMM 8
240    SIGN_EXTENSION r1, r1d
241    SIGN_EXTENSION r3, r3d
242    pxor   xmm6,   xmm6         ; xmm6 = per-column accumulator
243    pxor   xmm7,   xmm7         ; xmm7 = 0, needed by SSE2_SumWHorizon (and passed to the loads)
244    SSE2_GetSatd8x8
245    psrlw   xmm6,  1            ; halve each column sum before the horizontal add
246    SSE2_SumWHorizon   xmm6,xmm4,xmm7
247    movd    retrd,   xmm6
248    POP_XMM
249    LOAD_4_PARA_POP
250    ret
251
252 ;***********************************************************************
253 ;
254 ;int32_t WelsSampleSatd8x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
255 ;
256 ;***********************************************************************
; WelsSampleSatd8x16_sse2: SATD of an 8-wide, 16-row block pair, computed as two
; vertically stacked 8x8 passes accumulated in xmm6.
; Returns in retrd the sum of the halved column accumulators.
257WELS_EXTERN WelsSampleSatd8x16_sse2
258    %assign  push_num 0
259    LOAD_4_PARA
260    PUSH_XMM 8
261    SIGN_EXTENSION r1, r1d
262    SIGN_EXTENSION r3, r3d
263    pxor   xmm6,   xmm6         ; accumulator
264    pxor   xmm7,   xmm7         ; zero register
265
266    SSE2_GetSatd8x8
    ; the macro leaves r0/r2 on row 6; step two more rows to reach row 8
267    lea    r0,    [r0+2*r1]
268    lea    r2,    [r2+2*r3]
269    SSE2_GetSatd8x8
270
271    psrlw   xmm6,  1            ; halve each column sum before the horizontal add
272    SSE2_SumWHorizon   xmm6,xmm4,xmm7
273    movd    retrd,   xmm6
274    POP_XMM
275    LOAD_4_PARA_POP
276    ret
277
278;***********************************************************************
279;
280;int32_t WelsSampleSatd16x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
281;
282;***********************************************************************
; WelsSampleSatd16x8_sse2: SATD of a 16-wide, 8-row block pair, computed as two
; horizontally adjacent 8x8 passes accumulated in xmm6.
; Returns in retrd the sum of the halved column accumulators.
283WELS_EXTERN WelsSampleSatd16x8_sse2
284    %assign  push_num 0
285    LOAD_4_PARA
286    PUSH_XMM 8
287    SIGN_EXTENSION r1, r1d
288    SIGN_EXTENSION r3, r3d
    ; save both base pointers: SSE2_GetSatd8x8 advances r0 and r2
289    push r0
290    push r2
291    pxor   xmm6,   xmm6         ; accumulator
292    pxor   xmm7,   xmm7         ; zero register
293
294    SSE2_GetSatd8x8             ; left 8 columns
295
    ; restore the base pointers and move 8 pixels to the right
296    pop r2
297    pop r0
298    add    r0,    8
299    add    r2,    8
300    SSE2_GetSatd8x8             ; right 8 columns
301
302    psrlw   xmm6,  1            ; halve each column sum before the horizontal add
303    SSE2_SumWHorizon   xmm6,xmm4,xmm7
304    movd    retrd,   xmm6
305    POP_XMM
306    LOAD_4_PARA_POP
307    ret
308
309;***********************************************************************
310;
311;int32_t WelsSampleSatd16x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
312;
313;***********************************************************************
; WelsSampleSatd16x16_sse2: SATD of a 16x16 block pair, computed as four 8x8 passes
; (left column top/bottom, then right column top/bottom) accumulated in xmm6.
; Returns in retrd the sum of the halved column accumulators.
314WELS_EXTERN WelsSampleSatd16x16_sse2
315    %assign  push_num 0
316    LOAD_4_PARA
317    PUSH_XMM 8
318    SIGN_EXTENSION r1, r1d
319    SIGN_EXTENSION r3, r3d
    ; save both base pointers: SSE2_GetSatd8x8 advances r0 and r2
320    push r0
321    push r2
322    pxor   xmm6,   xmm6         ; accumulator
323    pxor   xmm7,   xmm7         ; zero register
324
    ; left 8 columns: rows 0..7, then step from row 6 to row 8 and do rows 8..15
325    SSE2_GetSatd8x8
326    lea    r0,    [r0+2*r1]
327    lea    r2,    [r2+2*r3]
328    SSE2_GetSatd8x8
329
    ; restore the base pointers and move 8 pixels to the right
330    pop r2
331    pop r0
332    add    r0,    8
333    add    r2,    8
334
    ; right 8 columns: rows 0..7 and rows 8..15
335    SSE2_GetSatd8x8
336    lea    r0,    [r0+2*r1]
337    lea    r2,    [r2+2*r3]
338    SSE2_GetSatd8x8
339
340 ; each column sum of SATD is necessarily even, so we don't lose any precision by shifting first.
341    psrlw   xmm6,  1
342    SSE2_SumWHorizon   xmm6,xmm4,xmm7
343    movd    retrd,   xmm6
344    POP_XMM
345    LOAD_4_PARA_POP
346    ret
347
348;***********************************************************************
349;
350;Pixel_satd_wxh_sse2 END
351;
352;***********************************************************************
353
354;***********************************************************************
355;
356;Pixel_satd_intra_sse2 BEGIN
357;
358;***********************************************************************
359
360
; SSE_DB_1_2REG dst, tmp
; Sets every byte of dst to 1 (0x01010101 per dword). tmp is clobbered (all ones).
; Used as a pmuludq multiplier to replicate one byte value across a dword.
361%macro SSE_DB_1_2REG 2
362    pxor %1, %1                 ; dst = 0
363    pcmpeqw %2, %2              ; tmp = all bits set (every byte = -1)
364    psubb %1, %2                ; dst = 0 - (-1) = 1 in every byte
365%endmacro
366
367;***********************************************************************
368;
369;int32_t WelsSampleSatdThree4x4_sse2( uint8_t *pDec, int32_t iLineSizeDec, uint8_t *pEnc, int32_t iLinesizeEnc,
370;                             uint8_t* pRed, int32_t* pBestMode, int32_t, int32_t, int32_t);
371;
372;***********************************************************************
; WelsSampleSatdThree4x4_sse2: evaluates the V, H and DC 4x4 intra candidates by SATD,
; writes the winning prediction and mode, and returns the winning cost.
;   arg1/arg2 (pDec, stride): reconstructed picture; only the row above and the column
;                             to the left of the block are read
;   arg3/arg4 (pEnc, stride): source 4x4 block
;   arg5 (pRed):      16 bytes of prediction are stored here with movdqa, so the
;                     address must be 16-byte aligned
;   arg6 (pBestMode): receives 0 (V), 1 (H) or 2 (DC)
;   arg7/arg8/arg9:   values added to the DC / H / V SATD before comparison
; The SATD of each candidate is obtained in the transform domain: the source block is
; transformed once, and the transform of each prediction (built from four boundary
; samples with scalar butterflies) is subtracted from it.
; Returns in retrd the selected cost (SATD/2 plus the matching addend).
373WELS_EXTERN WelsSampleSatdThree4x4_sse2
374
375%ifdef X86_32
376    push r3
377    push r4
378    push r5
379    push r6
380    %assign  push_num 4
381%else
382    %assign  push_num 0
383%endif
384    PUSH_XMM 8
385
386    mov  r2, arg3
387    mov  r3, arg4
388    SIGN_EXTENSION r3, r3d
389
390    ; load source 4x4 samples and Hadamard transform
391    movd      xmm0, [r2]
392    movd      xmm1, [r2+r3]
393    lea       r2 , [r2+2*r3]
394    movd      xmm2, [r2]
395    movd      xmm3, [r2+r3]
396    punpckldq xmm0, xmm2
397    punpckldq xmm1, xmm3
398
399    pxor      xmm6, xmm6
400    punpcklbw xmm0, xmm6
401    punpcklbw xmm1, xmm6
402
403    movdqa    xmm2, xmm0
404    paddw     xmm0, xmm1
405    psubw     xmm2, xmm1
406    SSE2_XSawp  qdq, xmm0, xmm2, xmm3
407
408    movdqa    xmm4, xmm0
409    paddw     xmm0, xmm3
410    psubw     xmm4, xmm3
411
412    movdqa    xmm2, xmm0
413    punpcklwd xmm0, xmm4
414    punpckhwd xmm4, xmm2
415
416    SSE2_XSawp  dq,  xmm0, xmm4, xmm3
417    SSE2_XSawp  qdq, xmm0, xmm3, xmm5
418
419    movdqa    xmm7, xmm0
420    paddw     xmm0, xmm5
421    psubw     xmm7, xmm5
422
423    SSE2_XSawp  qdq,  xmm0, xmm7, xmm1
424
425    ; Hadamard transform results are saved in xmm0 and xmm2
426    movdqa    xmm2, xmm0
427    paddw     xmm0, xmm1
428    psubw     xmm2, xmm1
429
430    ;load top boundary samples: [a b c d] (the row just above the block: r0 = pDec - stride)
431    mov r0, arg1
432    mov r1, arg2
433    SIGN_EXTENSION r1, r1d
434    sub r0, r1
    ; r4/r5 are used as scratch below; on UNIX64 they are saved here and restored
    ; before arg5/arg6 are read again
435%ifdef UNIX64
436    push r4
437    push r5
438%endif
439
440    movzx     r2d,  byte [r0]
441    movzx     r3d,  byte [r0+1]
442    movzx     r4d,  byte [r0+2]
443    movzx     r5d,  byte [r0+3]
444
445    ; get the transform results of top boundary samples: [a b c d]
446    add       r3d, r2d ; r3d = a + b
447    add       r5d, r4d ; r5d = c + d
448    add       r2d, r2d ; r2d = a + a
449    add       r4d, r4d ; r4d = c + c
450    sub       r2d, r3d ; r2d = a + a - a - b = a - b
451    sub       r4d, r5d ; r4d = c + c - c - d = c - d
452    add       r5d, r3d ; r5d = (a + b) + (c + d)
453    add       r3d, r3d
454    sub       r3d, r5d ; r3d = (a + b) - (c + d)
455    add       r4d, r2d ; r4d = (a - b) + (c - d)
456    add       r2d, r2d
457    sub       r2d, r4d ; r2d = (a - b) - (c - d) ; [r5d r3d r2d r4d]
458
    ; keep an untouched copy of the source transform in xmm6/xmm7 for the H and DC passes
459    movdqa    xmm6, xmm0
460    movdqa    xmm7, xmm2
461    movd      xmm5, r5d ; keep the top-row sum (r5d) for DC mode
462    pxor      xmm3, xmm3
463    pxor      xmm4, xmm4
464    pinsrw    xmm3, r5d, 0
465    pinsrw    xmm3, r4d, 4
466    psllw     xmm3, 2       ; x4: the top-row pattern repeats over the 4 rows of the prediction
467    pinsrw    xmm4, r3d, 0
468    pinsrw    xmm4, r2d, 4
469    psllw     xmm4, 2
470
471    ; get the satd of V (prediction built from the top boundary)
472    psubw     xmm0, xmm3
473    psubw     xmm2, xmm4
474
475    WELS_AbsW  xmm0, xmm1
476    WELS_AbsW  xmm2, xmm1
477    paddusw        xmm0, xmm2
478    SSE2_SumWHorizon1  xmm0, xmm1 ; satd of V is stored in xmm0
479
480    ;load left boundary samples: [a b c d]'
481    add r0, r1
482
483    movzx     r2d,  byte [r0-1]
484    movzx     r3d,  byte [r0+r1-1]
485    lea       r0 , [r0+2*r1]
486    movzx     r4d,  byte [r0-1]
487    movzx     r5d,  byte [r0+r1-1]
488
489    ; get the transform results of left boundary samples: [a b c d]'
490    add       r3d, r2d ; r3d = a + b
491    add       r5d, r4d ; r5d = c + d
492    add       r2d, r2d ; r2d = a + a
493    add       r4d, r4d ; r4d = c + c
494    sub       r2d, r3d ; r2d = a + a - a - b = a - b
495    sub       r4d, r5d ; r4d = c + c - c - d = c - d
496    add       r5d, r3d ; r5d = (a + b) + (c + d)
497    add       r3d, r3d
498    sub       r3d, r5d ; r3d = (a + b) - (c + d)
499    add       r4d, r2d ; r4d = (a - b) + (c - d)
500    add       r2d, r2d
501    sub       r2d, r4d ; r2d = (a - b) - (c - d) ; [r5d r3d r2d r4d]
502
503    ; store the transform results in xmm3
504    movd      xmm3, r5d
505    pinsrw    xmm3, r3d, 1
506    pinsrw    xmm3, r2d, 2
507    pinsrw    xmm3, r4d, 3
508    psllw     xmm3, 2
509
510    ; get the satd of H (prediction built from the left boundary)
511    movdqa    xmm2, xmm6
512    movdqa    xmm4, xmm7
513    psubw     xmm2, xmm3
514    WELS_AbsW  xmm2, xmm1
515    WELS_AbsW  xmm4, xmm1
516    paddusw        xmm2, xmm4
517    SSE2_SumWHorizon1  xmm2, xmm1 ; satd of H is stored in xmm2
518
519    ; DC result is stored in xmm1: dc = (left sum + top sum + 4) >> 3, kept in xmm5;
    ; xmm1 = dc * 16 is the single non-zero transform coefficient of the flat prediction
520    add       r5d, 4
521    movd      xmm1, r5d
522    paddw     xmm1, xmm5
523    psrlw     xmm1, 3
524    movdqa    xmm5, xmm1
525    psllw     xmm1, 4
526
527    ; get the satd of DC
528    psubw          xmm6, xmm1
529    WELS_AbsW  xmm6, xmm1
530    WELS_AbsW  xmm7, xmm1
531    paddusw        xmm6, xmm7
532    SSE2_SumWHorizon1  xmm6, xmm1 ; satd of DC is stored in xmm6
533%ifdef UNIX64
534    pop r5
535    pop r4
536%endif
537    ; comparing order: DC H V
538
539    mov  r4, arg5
540    movd      r2d, xmm6     ; r2d = DC cost
541    movd      r3d, xmm2     ; r3d = H cost
542    movd      r6d, xmm0     ; r6d = V cost
543
    ; only the low word of each horizontal sum is valid; halve it, then add the
    ; per-mode addend supplied by the caller
544    and       r2d, 0xffff
545    shr       r2d, 1
546    and       r3d, 0xffff
547    shr       r3d, 1
548    and       r6d, 0xffff
549    shr       r6d, 1
550    add       r2d, dword arg7
551    add       r3d, dword arg8
552    add       r6d, dword arg9
    ; DC is chosen when DC <= H and DC <= V (signed 16-bit compares)
553    cmp       r2w, r3w
554    jg near   not_dc
555    cmp       r2w, r6w
556    jg near   not_dc_h
557
558    ; for DC mode: replicate the dc byte into all 16 bytes of the prediction
559    movd      r3d, xmm5
560    imul      r3d, 0x01010101
561    movd      xmm5, r3d
562    pshufd    xmm5, xmm5, 0
563    movdqa    [r4], xmm5
564    mov r5, arg6
565    mov       dword [r5], 0x02
566    mov retrd, r2d
567    POP_XMM
568%ifdef X86_32
569    pop r6
570    pop r5
571    pop r4
572    pop r3
573%endif
574    ret
575
576not_dc:
    ; H is chosen when H <= V
577    cmp       r3w, r6w
578    jg near   not_dc_h
579
580    ; for H mode: each prediction row is its left neighbour replicated four times
581    SSE_DB_1_2REG  xmm6, xmm7
    ; r0 was left two rows below the block start by the left-boundary loads; rewind it
582    sub        r0, r1
583    sub        r0, r1
584    movzx      r6d,  byte [r0-1]
585    movd       xmm0, r6d
586    pmuludq    xmm0, xmm6       ; byte * 0x01010101 -> byte replicated across the dword
587
588    movzx     r6d,  byte [r0+r1-1]
589    movd      xmm1, r6d
590    pmuludq   xmm1, xmm6
591    punpckldq xmm0, xmm1
592
593    lea       r0,   [r0+r1*2]
594    movzx     r6d,  byte [r0-1]
595    movd      xmm2, r6d
596    pmuludq   xmm2, xmm6
597
598    movzx     r6d,  byte [r0+r1-1]
599    movd      xmm3, r6d
600    pmuludq   xmm3, xmm6
601    punpckldq  xmm2, xmm3
602    punpcklqdq xmm0, xmm2
603
604    movdqa    [r4],xmm0
605
606    mov       retrd, r3d
607    mov r5, arg6
608    mov       dword [r5], 0x01
609    POP_XMM
610%ifdef X86_32
611    pop r6
612    pop r5
613    pop r4
614    pop r3
615%endif
616    ret
617not_dc_h:
    ; V mode: step r0 back three rows to the row above the block and replicate its
    ; four bytes into all four prediction rows
618    sub        r0, r1
619    sub        r0, r1
620    sub        r0, r1
621    movd      xmm0, [r0]
622    pshufd    xmm0, xmm0, 0
623    movdqa    [r4],xmm0
624    mov       retrd, r6d
625    mov r5, arg6
626    mov       dword [r5], 0x00
627    POP_XMM
628%ifdef X86_32
629    pop r6
630    pop r5
631    pop r4
632    pop r3
633%endif
634    ret
635
636
; SSE41_I16x16Get8WSumSub io, tmp1, tmp2
; Transforms boundary samples held as bytes in io, using the multiplier tables
; preloaded in xmm5 (HSumSubDB1), xmm6 (HSumSubDW1) and xmm7 (PDW1).
; Out: io = the resulting dwords packed to signed words and multiplied by 4.
;      xmm4 is incremented by both dword halves before packing (its low dword is
;      later read as the DC sum by the caller).
; tmp1 and tmp2 are clobbered.
637%macro SSE41_I16x16Get8WSumSub 3 ;xmm5 HSumSubDB1, xmm6 HSumSubDW1, xmm7 PDW1 : in %1, pOut %1, %3
638    pmaddubsw    %1, xmm5       ; first stage: signed byte-pair combinations -> words
639    movdqa       %2, %1
640    pmaddwd      %1, xmm7       ; word-pair sums -> dwords
641    pmaddwd      %2, xmm6       ; word-pair differences -> dwords
642    movdqa       %3, %1
643    punpckldq    %1, %2
644    punpckhdq    %2, %3
645    movdqa       %3, %1
646    punpcklqdq   %1, %2
647    punpckhqdq   %3, %2
648    paddd        xmm4, %1 ;for dc
649    paddd        xmm4, %3 ;for dc
650    packssdw     %1, %3
651    psllw        %1, 2
652%endmacro
; SSE41_ChromaGet8WSumSub io, tmp1, tmp2, sums
; Chroma variant of SSE41_I16x16Get8WSumSub: same transform of the boundary samples in
; io (multipliers in xmm5/xmm6/xmm7), but instead of accumulating into xmm4 it returns
; the low qwords of the two dword halves in 'sums' for the caller's DC computation.
; Out: io = transformed values packed to signed words and multiplied by 4.
; tmp1 and tmp2 are clobbered.
653%macro SSE41_ChromaGet8WSumSub 4 ;xmm5 HSumSubDB1, xmm6 HSumSubDW1, xmm7 PDW1 : in %1, pOut %1, %3 : %4 tempsse2
654    pmaddubsw    %1, xmm5
655    movdqa       %2, %1
656    pmaddwd      %1, xmm7
657    pmaddwd      %2, xmm6
658    movdqa       %3, %1
659    punpckldq    %1, %2
660    punpckhdq    %2, %3
661    movdqa       %3, %1
662    punpcklqdq   %1, %2
663    punpckhqdq   %3, %2
664;    paddd        xmm4, %1 ;for dc
665;    paddd        xmm4, %3 ;for dc
666    movdqa       %4, %1
667    punpcklqdq   %4, %3         ; sums = low qword of %1 : low qword of %3
668    packssdw     %1, %3
669    psllw        %1, 2
670%endmacro
671
; SSE41_GetX38x4SatdDec
; Loads an 8-wide, 4-row strip of bytes from [r2] (stride r3), widens it to words and
; runs the butterfly / transpose / butterfly sequence on it.
; Out: transformed rows in xmm7, xmm1, xmm3, xmm2 (order as noted on the last
;      SSE2_HDMTwo4x4 line); r2 is advanced by 4 rows. xmm0 is clobbered.
672%macro SSE41_GetX38x4SatdDec 0
673    pxor        xmm7,   xmm7
674    movq        xmm0,   [r2]
675    movq        xmm1,   [r2+r3]
676    lea         r2,    [r2+2*r3]
677    movq        xmm2,   [r2]
678    movq        xmm3,   [r2+r3]
679    lea         r2,    [r2+2*r3]
680    punpcklbw   xmm0,   xmm7
681    punpcklbw   xmm1,   xmm7
682    punpcklbw   xmm2,   xmm7
683    punpcklbw   xmm3,   xmm7
684    SSE2_HDMTwo4x4       xmm0,xmm1,xmm2,xmm3,xmm7
685    SSE2_TransTwo4x4W     xmm3,xmm1,xmm0,xmm2,xmm7
686    SSE2_HDMTwo4x4       xmm3,xmm1,xmm2,xmm7,xmm0 ;pOut xmm7,xmm1,xmm3,xmm2
687    ;doesn't need another transpose
688%endmacro
689
; SSE41_GetX38x4SatdV unused, off
; Accumulates into xmm4 the V-candidate cost for the strip currently held in
; xmm7/xmm1/xmm3/xmm2. For each of the four registers, two words from the buffer at
; [r6+off] are placed in lanes 0 and 4 of a zeroed xmm0, the strip register is
; subtracted (saturating), and the absolute values are added to xmm4.
; %1 is accepted but not referenced. xmm0 is clobbered; the strip registers are preserved.
690%macro SSE41_GetX38x4SatdV 2
691    pxor        xmm0,   xmm0
692    pinsrw      xmm0,   word[r6+%2],   0
693    pinsrw      xmm0,   word[r6+%2+8], 4
694    psubsw      xmm0,   xmm7
695    pabsw       xmm0,   xmm0
696    paddw       xmm4,   xmm0
697    pxor        xmm0,   xmm0
698    pinsrw      xmm0,   word[r6+%2+2],  0
699    pinsrw      xmm0,   word[r6+%2+10], 4
700    psubsw      xmm0,   xmm1
701    pabsw       xmm0,   xmm0
702    paddw       xmm4,   xmm0
703    pxor        xmm0,   xmm0
704    pinsrw      xmm0,   word[r6+%2+4],  0
705    pinsrw      xmm0,   word[r6+%2+12], 4
706    psubsw      xmm0,   xmm3
707    pabsw       xmm0,   xmm0
708    paddw       xmm4,   xmm0
709    pxor        xmm0,   xmm0
710    pinsrw      xmm0,   word[r6+%2+6],  0
711    pinsrw      xmm0,   word[r6+%2+14], 4
712    psubsw      xmm0,   xmm2
713    pabsw       xmm0,   xmm0
714    paddw       xmm4,   xmm0
715%endmacro
; SSE41_GetX38x4SatdH row, unused, base
; Accumulates into xmm5 the H-candidate cost for the current strip: four words from
; [r6+base+8*row] are duplicated into both halves of xmm0 and compared with xmm7, and
; the absolute values of xmm1, xmm2, xmm3 are added as they are.
; %2 is accepted but not referenced.
; Side effects: xmm1, xmm2, xmm3 are replaced by absolute values, and xmm2 ends up as
; |xmm1|+|xmm2|+|xmm3|, which the DC macros that follow add to xmm6. This macro must
; therefore run after SSE41_GetX38x4SatdV, which needs the signed values.
716%macro SSE41_GetX38x4SatdH  3
717    movq        xmm0,   [r6+%3+8*%1]
718    punpcklqdq  xmm0,   xmm0
719    psubsw      xmm0,   xmm7
720    pabsw       xmm0,   xmm0
721    paddw       xmm5,   xmm0
722    pabsw       xmm1,   xmm1
723    pabsw       xmm2,   xmm2
724    pabsw       xmm3,   xmm3
725    paddw       xmm2,   xmm1;for DC
726    paddw       xmm2,   xmm3;for DC
727    paddw       xmm5,   xmm2
728%endmacro
; SSE41_I16X16GetX38x4SatdDC
; Accumulates into xmm6 the DC-candidate cost for the current strip: the value held in
; mm4 (stored there by the caller) is duplicated into both halves of xmm0 and compared
; with xmm7; xmm2 (the sum of the other absolute coefficients, as left by
; SSE41_GetX38x4SatdH) is added unchanged. xmm0 is clobbered.
729%macro SSE41_I16X16GetX38x4SatdDC 0
730    pxor        xmm0,   xmm0
731    movq2dq     xmm0,   mm4
732    punpcklqdq  xmm0,   xmm0
733    psubsw      xmm0,   xmm7
734    pabsw       xmm0,   xmm0
735    paddw       xmm6,   xmm0
736    paddw       xmm6,   xmm2
737%endmacro
; SSE41_ChromaGetX38x4SatdDC idx
; Accumulates into xmm6 the DC-candidate cost for the current strip, using the 16-byte
; entry at [r6+32+16*idx] as the reference for xmm7; xmm2 is added unchanged.
; WARNING: idx is a register and is modified in place (shifted left by 4); the caller's
; index is therefore destroyed. xmm0 is clobbered.
738%macro SSE41_ChromaGetX38x4SatdDC 1
739    shl         %1,     4       ; idx *= 16 (byte offset of the DC entry); clobbers idx
740    movdqa      xmm0,   [r6+32+%1]
741    psubsw      xmm0,   xmm7
742    pabsw       xmm0,   xmm0
743    paddw       xmm6,   xmm0
744    paddw       xmm6,   xmm2
745%endmacro
; SSE41_I16x16GetX38x4Satd row, off
; Processes one 8x4 strip for the 16x16 luma search: transforms the strip, then adds
; its V, H and DC costs to xmm4, xmm5 and xmm6. 'off' selects the V reference in the
; buffer at r6; 'row' selects the H reference at [r6+32+8*row].
; The V -> H -> DC order is required (H rewrites xmm1-xmm3, DC consumes the resulting xmm2).
746%macro SSE41_I16x16GetX38x4Satd 2
747    SSE41_GetX38x4SatdDec
748    SSE41_GetX38x4SatdV   %1, %2
749    SSE41_GetX38x4SatdH   %1, %2, 32
750    SSE41_I16X16GetX38x4SatdDC
751%endmacro
; SSE41_ChromaGetX38x4Satd row, off
; Processes one 8x4 strip for the chroma search: transforms the strip, then adds its
; V, H and DC costs to xmm4, xmm5 and xmm6. The H reference is read from
; [r6+16+8*row] and the DC reference from [r6+32+16*row].
; NOTE: 'row' must be a register and is left multiplied by 16 on exit (see
; SSE41_ChromaGetX38x4SatdDC).
752%macro SSE41_ChromaGetX38x4Satd 2
753    SSE41_GetX38x4SatdDec
754    SSE41_GetX38x4SatdV   %1, %2
755    SSE41_GetX38x4SatdH   %1, %2, 16
756    SSE41_ChromaGetX38x4SatdDC %1
757%endmacro
; SSE41_HSum8W acc, ones, tmp
; Horizontal sum of the eight words of acc into its low dword.
; 'ones' is the pmaddwd multiplier (callers pass a register holding 1 in every word,
; which turns pmaddwd into a pairwise add that widens to dwords). tmp is clobbered.
758%macro SSE41_HSum8W 3
759    pmaddwd     %1, %2          ; 8 words -> 4 dword pair sums
760    movhlps     %3, %1
761    paddd       %1, %3          ; 4 dwords -> 2 dwords
762    pshuflw     %3, %1,0Eh      ; move dword 1 into the dword 0 position
763    paddd       %1, %3          ; low dword = total
764%endmacro
765
; WelsIntra16x16Combined3Satd_sse41: evaluates the V, H and DC 16x16 intra candidates
; by SATD and returns the best cost, storing the chosen mode through arg5.
; Register use after LOAD_7_PARA, as far as this routine shows:
;   r0/r1 = reconstructed block pointer / stride (only the row above and the column to
;           the left are read)
;   r2/r3 = source block pointer / stride
;   arg5  = pointer receiving the mode: 0 = V, 1 = H, 2 = DC
;   r5    = value that is doubled and added to the H and DC costs (not to V)
;   r6    = scratch buffer, accessed with movdqa at offsets 0..48 (so at least 64 bytes,
;           16-byte aligned): [r6..r6+31] V reference, [r6+32..r6+63] H reference
; mm4 carries the scaled DC value through the loop; WELSEMMS is issued before returning.
766WELS_EXTERN WelsIntra16x16Combined3Satd_sse41
767    %assign  push_num 0
768    LOAD_7_PARA
769    PUSH_XMM 8
770    SIGN_EXTENSION r1, r1d
771    SIGN_EXTENSION r3, r3d
772    SIGN_EXTENSION r5, r5d
773
    ; outside 32-bit builds, keep the source pointer in r12 so it can be restored for
    ; the second column pass (the 32-bit build re-reads arg3 instead)
774%ifndef X86_32
775    push r12
776    mov  r12, r2
777%endif
778
779    INIT_X86_32_PIC r2
780    pxor        xmm4,   xmm4    ; xmm4 accumulates the boundary sums for DC
781    movdqa      xmm5,   [pic(HSumSubDB1)]
782    movdqa      xmm6,   [pic(HSumSubDW1)]
783    movdqa      xmm7,   [pic(PDW1)]
784    DEINIT_X86_32_PIC
    ; transform the 16 samples of the row above the block, 8 at a time
785    sub         r0,    r1
786    movdqu      xmm0,   [r0]
787    movhlps     xmm1,   xmm0
788    punpcklqdq  xmm0,   xmm0
789    punpcklqdq  xmm1,   xmm1
790    SSE41_I16x16Get8WSumSub xmm0, xmm2, xmm3
791    SSE41_I16x16Get8WSumSub xmm1, xmm2, xmm3
792    movdqa      [r6],  xmm0 ;V
793    movdqa      [r6+16], xmm1
    ; gather the 16 samples of the column left of the block, one byte per row
794    add         r0,    r1
795    pinsrb      xmm0,   byte[r0-1], 0
796    pinsrb      xmm0,   byte[r0+r1-1], 1
797    lea         r0,    [r0+2*r1]
798    pinsrb      xmm0,   byte[r0-1],     2
799    pinsrb      xmm0,   byte[r0+r1-1], 3
800    lea         r0,    [r0+2*r1]
801    pinsrb      xmm0,   byte[r0-1],     4
802    pinsrb      xmm0,   byte[r0+r1-1], 5
803    lea         r0,    [r0+2*r1]
804    pinsrb      xmm0,   byte[r0-1],     6
805    pinsrb      xmm0,   byte[r0+r1-1], 7
806    lea         r0,    [r0+2*r1]
807    pinsrb      xmm0,   byte[r0-1],     8
808    pinsrb      xmm0,   byte[r0+r1-1], 9
809    lea         r0,    [r0+2*r1]
810    pinsrb      xmm0,   byte[r0-1],     10
811    pinsrb      xmm0,   byte[r0+r1-1], 11
812    lea         r0,    [r0+2*r1]
813    pinsrb      xmm0,   byte[r0-1],     12
814    pinsrb      xmm0,   byte[r0+r1-1], 13
815    lea         r0,    [r0+2*r1]
816    pinsrb      xmm0,   byte[r0-1],     14
817    pinsrb      xmm0,   byte[r0+r1-1], 15
818    movhlps     xmm1,   xmm0
819    punpcklqdq  xmm0,   xmm0
820    punpcklqdq  xmm1,   xmm1
821    SSE41_I16x16Get8WSumSub xmm0, xmm2, xmm3
822    SSE41_I16x16Get8WSumSub xmm1, xmm2, xmm3
823    movdqa      [r6+32], xmm0 ;H
824    movdqa      [r6+48], xmm1
825    movd        r0d,    xmm4 ;dc
826    add         r0d,    16   ;(sum+16)
827    shr         r0d,    5    ;((sum+16)>>5)
828    shl         r0d,    4    ;
829    movd        mm4,    r0d  ; mm4 copy DC
830    pxor        xmm4,   xmm4 ;V
831    pxor        xmm5,   xmm5 ;H
832    pxor        xmm6,   xmm6 ;DC
    ; r4 serves as the column byte offset below; on UNIX64 it is saved and restored
    ; around the loop
833%ifdef UNIX64
834    push r4
835%endif
836    mov         r0,    0        ; r0 = strip index 0..3 (4 rows each)
837    mov         r4,    0        ; r4 = V-reference offset: 0 for left half, 16 for right half
838
    ; two passes (left then right 8 columns), each made of four 8x4 strips
839.loop16x16_get_satd:
840.loopStart1:
841    SSE41_I16x16GetX38x4Satd r0, r4
842    inc          r0
843    cmp         r0, 4
844    jl          .loopStart1
845    cmp         r4, 16
846    je          .loop16x16_get_satd_end
    ; restart at the top of the source block, 8 pixels to the right
847%ifdef X86_32
848    mov r2, arg3
849%else
850    mov r2, r12
851%endif
852    add         r2, 8
853    mov         r0, 0
854    add         r4, 16
855    jmp         .loop16x16_get_satd
856 .loop16x16_get_satd_end:
857    MMX_DW_1_2REG    xmm0, xmm1     ; xmm0 = 1 per word, multiplier for SSE41_HSum8W
858    psrlw       xmm4, 1 ;/2
859    psrlw       xmm5, 1 ;/2
860    psrlw       xmm6, 1 ;/2
861    SSE41_HSum8W     xmm4, xmm0, xmm1
862    SSE41_HSum8W     xmm5, xmm0, xmm1
863    SSE41_HSum8W     xmm6, xmm0, xmm1
864
865%ifdef UNIX64
866    pop r4
867%endif
868    ; comparing order: DC H V
869    movd      r3d, xmm6 ;DC
870    movd      r1d, xmm5 ;H
871    movd      r0d, xmm4 ;V
872%ifndef X86_32
873    pop r12
874%endif
    ; add 2*r5 to the H and DC costs; the V cost is left as is
875    shl       r5d, 1
876    add       r1d, r5d
877    add       r3d, r5d
878    mov       r4, arg5
    ; DC wins only if strictly smaller than both H and V; H wins only if strictly
    ; smaller than V; otherwise V
879    cmp       r3d, r1d
880    jge near   not_dc_16x16
881    cmp        r3d, r0d
882    jge near   not_dc_h_16x16
883
884    ; for DC mode
885    mov       dword[r4], 2;I16_PRED_DC
886    mov       retrd, r3d
887    jmp near return_satd_intra_16x16_x3
888not_dc_16x16:
889    ; for H mode
890    cmp       r1d, r0d
891    jge near   not_dc_h_16x16
892    mov       dword[r4], 1;I16_PRED_H
893    mov       retrd, r1d
894    jmp near return_satd_intra_16x16_x3
895not_dc_h_16x16:
896    ; for V mode
897    mov       dword[r4], 0;I16_PRED_V
898    mov       retrd, r0d
899return_satd_intra_16x16_x3:
900    WELSEMMS
901    POP_XMM
902    LOAD_7_PARA_POP
903ret
904
; SSE41_ChromaGetX38x8Satd
; Computes the V, H and DC candidate costs of one 8x8 chroma block.
; In:  r0/r1 = reconstructed block pointer / stride, r2/r3 = source pointer / stride,
;      r6 = scratch buffer (64 bytes, 16-byte aligned: V at +0, H at +16, DC entries at
;      +32 and +48). Must be expanded between INIT_X86_32_PIC / DEINIT_X86_32_PIC
;      because it uses pic(...).
; Out: xmm4 = V, xmm5 = H, xmm6 = DC word accumulators (not yet halved or summed).
; Clobbers xmm0-xmm3, xmm7, r0 (ends as 16), and advances r2 by 8 rows.
905%macro SSE41_ChromaGetX38x8Satd 0
906    movdqa      xmm5,   [pic(HSumSubDB1)]
907    movdqa      xmm6,   [pic(HSumSubDW1)]
908    movdqa      xmm7,   [pic(PDW1)]
    ; transform the 8 samples of the row above the block; xmm4 receives their sums
909    sub         r0,    r1
910    movq        xmm0,   [r0]
911    punpcklqdq  xmm0,   xmm0
912    SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm4
913    movdqa      [r6],  xmm0 ;V
    ; gather and transform the 8 samples of the column left of the block; xmm1 receives their sums
914    add         r0,    r1
915    pinsrb      xmm0,   byte[r0-1], 0
916    pinsrb      xmm0,   byte[r0+r1-1], 1
917    lea         r0,    [r0+2*r1]
918    pinsrb      xmm0,   byte[r0-1],     2
919    pinsrb      xmm0,   byte[r0+r1-1], 3
920    lea         r0,    [r0+2*r1]
921    pinsrb      xmm0,   byte[r0-1],     4
922    pinsrb      xmm0,   byte[r0+r1-1], 5
923    lea         r0,    [r0+2*r1]
924    pinsrb      xmm0,   byte[r0-1],     6
925    pinsrb      xmm0,   byte[r0+r1-1], 7
926    punpcklqdq  xmm0,   xmm0
927    SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm1
928    movdqa      [r6+16], xmm0 ;H
929;(sum+2)>>2
930    movdqa      xmm6,   [pic(PDQ2)]
931    movdqa      xmm5,   xmm4
932    punpckhqdq  xmm5,   xmm1
933    paddd       xmm5,   xmm6
934    psrld       xmm5,   2
935;(sum1+sum2+4)>>3
936    paddd       xmm6,   xmm6
937    paddd       xmm4,   xmm1
938    paddd       xmm4,   xmm6
939    psrld       xmm4,   3
940;satd *16
941    pslld       xmm5,   4
942    pslld       xmm4,   4
943;temp satd
944    movdqa      xmm6,   xmm4
945    punpcklqdq  xmm4,   xmm5
946    psllq       xmm4,   32      ; shift pair clears the upper dword of each qword
947    psrlq       xmm4,   32
948    movdqa      [r6+32], xmm4   ; DC entry used by the first 8x4 strip
949    punpckhqdq  xmm5,   xmm6
950    psllq       xmm5,   32
951    psrlq       xmm5,   32
952    movdqa      [r6+48], xmm5   ; DC entry used by the second 8x4 strip
953
954    pxor        xmm4,   xmm4 ;V
955    pxor        xmm5,   xmm5 ;H
956    pxor        xmm6,   xmm6 ;DC
    ; two 8x4 strips; r0 is the strip index (the strip macro scales it by 16 in place,
    ; so it is 0 after the first strip and 16 after the second)
957    mov         r0,    0
958    SSE41_ChromaGetX38x4Satd r0, 0
959    inc             r0
960    SSE41_ChromaGetX38x4Satd r0, 0
961%endmacro
962
; SSEReg2MMX src, lo, hi
; Copies the low 64 bits of the XMM register src into MMX register lo and the high
; 64 bits into hi. src is modified: its low half is overwritten with its high half.
963%macro SSEReg2MMX 3
964    movdq2q     %2, %1
965    movhlps     %1, %1
966    movdq2q     %3, %1
967%endmacro
; MMXReg2SSE dst, tmp, lo, hi
; Rebuilds a 128-bit value in dst from two MMX registers: lo becomes the low 64 bits
; and hi the high 64 bits. tmp is clobbered.
968%macro MMXReg2SSE 4
969    movq2dq     %1, %3
970    movq2dq     %2, %4
971    punpcklqdq  %1, %2
972%endmacro
973; The two macros above park XMM accumulators in MMX registers, which reduces the code size of WelsIntraChroma8x8Combined3Satd_sse41
974
; WelsIntraChroma8x8Combined3Satd_sse41: evaluates the DC, H and V 8x8 chroma intra
; candidates by SATD over both chroma planes and returns the best combined cost.
; Register use after LOAD_7_PARA, as far as this routine shows:
;   r0/r1 = first-plane reconstructed pointer / stride, r2/r3 = first-plane source
;           pointer / stride; arg8 / arg9 supply the second plane's reconstructed and
;           source pointers (the same strides are reused)
;   r4    = pointer receiving the mode: 0 = DC, 1 = H, 2 = V
;           (presumably arg5, loaded by LOAD_7_PARA - confirm in asm_inc.asm)
;   r5    = value that is doubled and added to the H and V costs (not to DC)
;   r6    = scratch buffer for SSE41_ChromaGetX38x8Satd
; MMX registers mm0-mm3, mm5, mm6 hold the first plane's accumulators while the second
; plane is processed; WELSEMMS is issued before returning.
975WELS_EXTERN WelsIntraChroma8x8Combined3Satd_sse41
976    %assign  push_num 0
977    LOAD_7_PARA
978    PUSH_XMM 8
979    SIGN_EXTENSION r1, r1d
980    SIGN_EXTENSION r3, r3d
981    SIGN_EXTENSION r5, r5d
982loop_chroma_satdx3:
983    INIT_X86_32_PIC r4
    ; first plane, then park its V/H/DC accumulators in MMX registers
984    SSE41_ChromaGetX38x8Satd
985    SSEReg2MMX  xmm4, mm0,mm1
986    SSEReg2MMX  xmm5, mm2,mm3
987    SSEReg2MMX  xmm6, mm5,mm6
    ; second plane
988    mov r0,     arg8
989    mov r2,     arg9
990
991    SSE41_ChromaGetX38x8Satd
992    DEINIT_X86_32_PIC
993
    ; bring the first plane's accumulators back and add both planes together
994    MMXReg2SSE  xmm0, xmm3, mm0, mm1
995    MMXReg2SSE  xmm1, xmm3, mm2, mm3
996    MMXReg2SSE  xmm2, xmm3, mm5, mm6
997
998    paddw       xmm4, xmm0
999    paddw       xmm5, xmm1
1000    paddw       xmm6, xmm2
1001
1002    MMX_DW_1_2REG    xmm0, xmm1     ; xmm0 = 1 per word, multiplier for SSE41_HSum8W
1003    psrlw       xmm4, 1 ;/2
1004    psrlw       xmm5, 1 ;/2
1005    psrlw       xmm6, 1 ;/2
1006    SSE41_HSum8W     xmm4, xmm0, xmm1
1007    SSE41_HSum8W     xmm5, xmm0, xmm1
1008    SSE41_HSum8W     xmm6, xmm0, xmm1
1009    ; comparing order: DC H V
1010    movd      r3d, xmm6 ;DC
1011    movd      r1d, xmm5 ;H
1012    movd      r0d, xmm4 ;V
1013
1014
    ; add 2*r5 to the H and V costs; the DC cost is left as is
1015    shl       r5d, 1
1016    add       r1d, r5d
1017    add       r0d, r5d
    ; DC wins only if strictly smaller than both H and V; H wins only if strictly
    ; smaller than V; otherwise V
1018    cmp       r3d, r1d
1019    jge near   not_dc_8x8
1020    cmp        r3d, r0d
1021    jge near   not_dc_h_8x8
1022
1023    ; for DC mode
1024    mov       dword[r4], 0;I8_PRED_DC
1025    mov       retrd, r3d
1026    jmp near return_satd_intra_8x8_x3
1027not_dc_8x8:
1028    ; for H mode
1029    cmp       r1d, r0d
1030    jge near   not_dc_h_8x8
1031    mov       dword[r4], 1;I8_PRED_H
1032    mov       retrd, r1d
1033    jmp near return_satd_intra_8x8_x3
1034not_dc_h_8x8:
1035    ; for V mode
1036    mov       dword[r4], 2;I8_PRED_V
1037    mov       retrd, r0d
1038return_satd_intra_8x8_x3:
1039    WELSEMMS
1040    POP_XMM
1041    LOAD_7_PARA_POP
1042ret
1043
1044
1045;***********************************************************************
1046;
1047;Pixel_satd_intra_sse2 END
1048;
1049;***********************************************************************
; SSSE3_Get16BSadHVDC predRow, srcRow
; Handles one 16-byte row for the three-way SAD search. Both operands are memory
; operands; both are accessed with movdqa and must therefore be 16-byte aligned.
;   predRow: on entry its first byte holds the left-neighbour sample of this row; on
;            exit it holds that byte replicated 16 times (the H prediction row)
;   srcRow : source pixels of this row
; Expects xmm1 = 0 (pshufb mask that broadcasts byte 0), xmm7 = first reference row
; and xmm5 = second reference row. In the visible caller xmm7 is the broadcast DC
; value and xmm5 the row above the block.
; Accumulates psadbw results: xmm4 += SAD(src, xmm7), xmm2 += SAD(src, xmm5),
; xmm3 += SAD(src, H row). Clobbers xmm0 and xmm6.
1050%macro SSSE3_Get16BSadHVDC 2
1051    movd        xmm6,%1
1052    pshufb      xmm6,xmm1       ; broadcast byte 0 to all 16 bytes
1053    movdqa      %1,  xmm6       ; write the H prediction row back
1054    movdqa      xmm0,%2
1055    psadbw      xmm0,xmm7
1056    paddw       xmm4,xmm0
1057    movdqa      xmm0,%2
1058    psadbw      xmm0,xmm5
1059    paddw       xmm2,xmm0
1060    psadbw      xmm6,%2
1061    paddw       xmm3,xmm6
1062%endmacro
; WelsAddDCValue srcByte, tmp32, dst, acc32
; Loads one byte from the memory operand srcByte (zero-extended into tmp32), stores
; tmp32 to the memory operand dst, and adds it to the accumulator register acc32.
1063%macro WelsAddDCValue 4
1064    movzx   %2, byte %1
1065    mov    %3, %2
1066    add     %4, %2
1067%endmacro
1068
1069;***********************************************************************
1070;
1071;Pixel_sad_intra_ssse3 BEGIN
1072;
1073;***********************************************************************
; WelsIntra16x16Combined3Sad_ssse3
; Evaluates three 16x16 intra predictors (DC, H, V) by SAD against a source block,
; writes the chosen mode and returns its cost.
; Register roles after LOAD_7_PARA, as used by the code below (C-level parameter
; names are not visible in this chunk - TODO confirm against the prototype):
;   r0 / r1 : pointer / stride of the block whose neighbours are used; the row at
;             r0-r1 is read with movdqa (16-byte aligned) and column r0-1 is read
;   r2 / r3 : pointer / stride of the source block; rows are read via movdqa/psadbw
;   r4      : pointer to a dword that receives the mode (2 = DC, 1 = H, 0 = V)
;   r5      : lambda; 2*lambda is added to the DC and H costs, not to the V cost
;   r6      : 16 rows x 16 bytes buffer (16-byte aligned); on return it holds the
;             prediction of the selected mode
;   retrd   : cost of the selected mode
; Selection order: DC if DC < H and DC < V; otherwise H if H < V; otherwise V.
1074WELS_EXTERN WelsIntra16x16Combined3Sad_ssse3
1075    %assign  push_num 0
1076    LOAD_7_PARA
1077    PUSH_XMM 8
1078    SIGN_EXTENSION r1, r1d
1079    SIGN_EXTENSION r3, r3d
1080    SIGN_EXTENSION r5, r5d
1081
    ; r3, r4 and r5 are reused as scratch below; keep their argument values on the stack
1082    push  r5
1083    push  r4
1084    push  r3
1085
    ; xmm5 = the 16 pixels of the row above (V predictor); r5d = their sum
1086    sub    r0,    r1
1087    movdqa      xmm5,[r0]
1088    pxor        xmm0,xmm0
1089    psadbw      xmm0,xmm5
1090    movhlps     xmm1,xmm0
1091    paddw       xmm0,xmm1
1092    movd        r5d, xmm0
1093
    ; walk the left column: store each left pixel as a dword at [r6 + 16*row]
    ; and add it to the running sum in r5d (r4d is scratch)
1094    add         r0,r1
1095    lea         r3,[r1+2*r1]    ;ebx r3
1096    WelsAddDCValue [r0-1     ], r4d, [r6   ], r5d    ; esi r4d, eax r5d
1097    WelsAddDCValue [r0-1+r1  ], r4d, [r6+16], r5d
1098    WelsAddDCValue [r0-1+r1*2], r4d, [r6+32], r5d
1099    WelsAddDCValue [r0-1+r3  ], r4d, [r6+48], r5d
1100    lea         r0, [r0+4*r1]
1101    add         r6, 64
1102    WelsAddDCValue [r0-1     ], r4d, [r6   ], r5d
1103    WelsAddDCValue [r0-1+r1  ], r4d, [r6+16], r5d
1104    WelsAddDCValue [r0-1+r1*2], r4d, [r6+32], r5d
1105    WelsAddDCValue [r0-1+r3  ], r4d, [r6+48], r5d
1106    lea         r0, [r0+4*r1]
1107    add         r6, 64
1108    WelsAddDCValue [r0-1     ], r4d, [r6   ], r5d
1109    WelsAddDCValue [r0-1+r1  ], r4d, [r6+16], r5d
1110    WelsAddDCValue [r0-1+r1*2], r4d, [r6+32], r5d
1111    WelsAddDCValue [r0-1+r3  ], r4d, [r6+48], r5d
1112    lea         r0, [r0+4*r1]
1113    add         r6, 64
1114    WelsAddDCValue [r0-1     ], r4d, [r6   ], r5d
1115    WelsAddDCValue [r0-1+r1  ], r4d, [r6+16], r5d
1116    WelsAddDCValue [r0-1+r1*2], r4d, [r6+32], r5d
1117    WelsAddDCValue [r0-1+r3  ], r4d, [r6+48], r5d
    ; rewind r6 to the start of the buffer; DC = (top sum + left sum + 16) >> 5
1118    sub         r6, 192
1119    add         r5d,10h
1120    shr         r5d,5
    ; xmm7 = DC byte broadcast to all 16 lanes; xmm1 stays zero as the pshufb mask
    ; used by SSSE3_Get16BSadHVDC
1121    movd        xmm7,r5d
1122    pxor        xmm1,xmm1
1123    pshufb      xmm7,xmm1
    ; SAD accumulators: xmm4 = DC, xmm3 = H, xmm2 = V
1124    pxor        xmm4,xmm4
1125    pxor        xmm3,xmm3
1126    pxor        xmm2,xmm2
1127    ;sad begin
    ; restore the source stride into r3 and build r4 = 3 * stride
1128    pop   r3
1129    lea         r4, [r3+2*r3] ;esi r4
    ; 16 rows: each call expands the stored left pixel into a full H row in the
    ; buffer and accumulates the DC / V / H SADs of that source row
1130    SSSE3_Get16BSadHVDC [r6], [r2]
1131    SSSE3_Get16BSadHVDC [r6+16], [r2+r3]
1132    SSSE3_Get16BSadHVDC [r6+32], [r2+2*r3]
1133    SSSE3_Get16BSadHVDC [r6+48], [r2+r4]
1134    add         r6, 64
1135    lea         r2, [r2+4*r3]
1136    SSSE3_Get16BSadHVDC [r6], [r2]
1137    SSSE3_Get16BSadHVDC [r6+16], [r2+r3]
1138    SSSE3_Get16BSadHVDC [r6+32], [r2+2*r3]
1139    SSSE3_Get16BSadHVDC [r6+48], [r2+r4]
1140    add         r6, 64
1141    lea         r2, [r2+4*r3]
1142    SSSE3_Get16BSadHVDC [r6], [r2]
1143    SSSE3_Get16BSadHVDC [r6+16], [r2+r3]
1144    SSSE3_Get16BSadHVDC [r6+32], [r2+2*r3]
1145    SSSE3_Get16BSadHVDC [r6+48], [r2+r4]
1146    add         r6, 64
1147    lea         r2, [r2+4*r3]
1148    SSSE3_Get16BSadHVDC [r6], [r2]
1149    SSSE3_Get16BSadHVDC [r6+16], [r2+r3]
1150    SSSE3_Get16BSadHVDC [r6+32], [r2+2*r3]
1151    SSSE3_Get16BSadHVDC [r6+48], [r2+r4]
1152
    ; restore the mode pointer (r4) and lambda (r5) pushed at entry
1153    pop r4
1154    pop r5
    ; pack H into the odd dwords and V into the even dwords of xmm3, then fold the
    ; high qword onto the low one: dword0 = V total, dword1 = H total
1155    pslldq      xmm3,4
1156    por         xmm3,xmm2
1157    movhlps     xmm1,xmm3
1158    paddw       xmm3,xmm1
1159    movhlps     xmm0,xmm4
1160    paddw       xmm4,xmm0
1161    ; comparing order: DC H V
1162    movd        r1d, xmm4 ;DC   ;ebx r1d
1163    movd        r0d, xmm3 ;V    ;ecx r0d
1164    psrldq      xmm3, 4
1165    movd        r2d, xmm3 ;H    ;esi r2d
1166
1167    ;mov         eax, [esp+36] ;lambda ;eax r5
    ; bias DC and H by 2*lambda; V is left unbiased
1168    shl         r5d, 1
1169    add         r2d, r5d
1170    add         r1d, r5d
1171    ;mov         edx, [esp+32]  ;edx r4
1172    cmp         r1d, r2d
1173    jge near   not_dc_16x16_sad
1174    cmp        r1d, r0d
1175    jge near   not_dc_h_16x16_sad
1176    ; for DC mode
1177    mov       dword[r4], 2;I16_PRED_DC
1178    mov       retrd, r1d
    ; r6 was advanced by 192 during the SAD pass; rewind and fill all 16 rows with DC
1179    sub        r6, 192
1180%assign x 0
1181%rep 16
1182    movdqa    [r6+16*x], xmm7
1183%assign x x+1
1184%endrep
1185    jmp near return_sad_intra_16x16_x3
1186not_dc_16x16_sad:
1187    ; for H mode
    ; the buffer already holds the H rows written by SSSE3_Get16BSadHVDC
1188    cmp       r2d, r0d
1189    jge near   not_dc_h_16x16_sad
1190    mov       dword[r4], 1;I16_PRED_H
1191    mov       retrd, r2d
1192    jmp near return_sad_intra_16x16_x3
1193not_dc_h_16x16_sad:
1194    ; for V mode
1195    mov       dword[r4], 0;I16_PRED_V
1196    mov       retrd, r0d
    ; rewind r6 and fill all 16 rows with the top row
1197    sub       r6, 192
1198%assign x 0
1199%rep 16
1200    movdqa    [r6+16*x], xmm5
1201%assign x x+1
1202%endrep
1203return_sad_intra_16x16_x3:
1204    POP_XMM
1205    LOAD_7_PARA_POP
1206    ret
1207
1208;***********************************************************************
1209;
1210;Pixel_sad_intra_ssse3 END
1211;
1212;***********************************************************************
1213;***********************************************************************
1214;
1215;Pixel_satd_wxh_sse41 BEGIN
1216;
1217;***********************************************************************
1218
1219;SSE4.1
; SSE41_GetSatd8x4 (takes no macro arguments; operates on fixed registers)
;   in : r0 / r1 = first block pointer / stride,  r4 = 3*r1 (set up by the callers)
;        r2 / r3 = second block pointer / stride, r5 = 3*r3 (set up by the callers)
;        xmm7    = pmaddubsw multiplier (the callers load HSumSubDB1)
;   acc: xmm6   += per-word partial results for this 8-pixel-wide, 4-row strip
;   clobbers xmm0-xmm5; r0 and r2 are not advanced
; Each 8-byte row is duplicated into both qwords before pmaddubsw, presumably so the
; two halves of xmm7 can weight the same pixels differently - verify against the
; HSumSubDB1 definition. SSE2_HDMTwo4x4 is defined outside this chunk.
1220%macro SSE41_GetSatd8x4 0
    ; rows 0 and 1 of both blocks -> xmm0 / xmm1 hold the row differences
1221    movq             xmm0, [r0]
1222    punpcklqdq       xmm0, xmm0
1223    pmaddubsw        xmm0, xmm7
1224    movq             xmm1, [r0+r1]
1225    punpcklqdq       xmm1, xmm1
1226    pmaddubsw        xmm1, xmm7
1227    movq             xmm2, [r2]
1228    punpcklqdq       xmm2, xmm2
1229    pmaddubsw        xmm2, xmm7
1230    movq             xmm3, [r2+r3]
1231    punpcklqdq       xmm3, xmm3
1232    pmaddubsw        xmm3, xmm7
1233    psubsw           xmm0, xmm2
1234    psubsw           xmm1, xmm3
    ; rows 2 and 3 of both blocks -> xmm2 / xmm3 hold the row differences
1235    movq             xmm2, [r0+2*r1]
1236    punpcklqdq       xmm2, xmm2
1237    pmaddubsw        xmm2, xmm7
1238    movq             xmm3, [r0+r4]
1239    punpcklqdq       xmm3, xmm3
1240    pmaddubsw        xmm3, xmm7
1241    movq             xmm4, [r2+2*r3]
1242    punpcklqdq       xmm4, xmm4
1243    pmaddubsw        xmm4, xmm7
1244    movq             xmm5, [r2+r5]
1245    punpcklqdq       xmm5, xmm5
1246    pmaddubsw        xmm5, xmm7
1247    psubsw           xmm2, xmm4
1248    psubsw           xmm3, xmm5
1249    SSE2_HDMTwo4x4   xmm0, xmm1, xmm2, xmm3, xmm4
1250    pabsw            xmm0, xmm0
1251    pabsw            xmm2, xmm2
1252    pabsw            xmm1, xmm1
1253    pabsw            xmm3, xmm3
    ; pair up alternating words of (xmm1, xmm3) and keep the per-word maximum; this is
    ; the same blend/shift/max sequence that AVX2_SatdFour4x4 in this file annotates
    ; with |a + b| + |a - b| = 2 * max(|a|, |b|)
1254    movdqa           xmm4, xmm3
1255    pblendw          xmm3, xmm1, 0xAA
1256    pslld            xmm1, 16
1257    psrld            xmm4, 16
1258    por              xmm1, xmm4
1259    pmaxuw           xmm1, xmm3
1260    paddw            xmm6, xmm1
    ; same for (xmm0, xmm2)
1261    movdqa           xmm4, xmm0
1262    pblendw          xmm0, xmm2, 0xAA
1263    pslld            xmm2, 16
1264    psrld            xmm4, 16
1265    por              xmm2, xmm4
1266    pmaxuw           xmm0, xmm2
1267    paddw            xmm6, xmm0
1268%endmacro
1269
; SSSE3_SumWHorizon  %1 = 32-bit destination register, %2 = xmm holding words to sum,
;                    %3, %4 = xmm scratch registers
; Produces in %1 the horizontal sum of %2, assuming MMX_DW_1_2REG (defined outside this
; chunk) fills %3 with words of 1 - TODO confirm. pmaddwd treats the words as signed.
; Clobbers %2, %3 and %4.
1270%macro SSSE3_SumWHorizon 4 ;eax, srcSSE, tempSSE, tempSSE
1271    MMX_DW_1_2REG    %3, %4
    ; adjacent word pairs -> 4 dwords
1272    pmaddwd     %2, %3
    ; fold high qword onto low qword
1273    movhlps     %4, %2
1274    paddd       %2, %4
    ; 0Eh moves words 2,3 (dword 1) into dword 0, then the last add gives the total
1275    pshuflw     %4, %2,0Eh
1276    paddd       %2, %4
1277    movd        %1, %2
1278%endmacro
1279;***********************************************************************
1280;
1281;int32_t WelsSampleSatd4x4_sse41( uint8_t *, int32_t, uint8_t *, int32_t );
1282;
1283;***********************************************************************
; WelsSampleSatd4x4_sse41
; Register roles after LOAD_4_PARA, as used below:
;   r0 / r1 : first block pointer / stride  (4 rows of 4 bytes are read)
;   r2 / r3 : second block pointer / stride (4 rows of 4 bytes are read)
;   retrd   : result produced by SSSE3_SumWHorizon
; r5 is handed to INIT_X86_32_PIC as the PIC base register (macro defined elsewhere).
; HSwapSumSubDB1 is a constant defined outside this chunk.
1284WELS_EXTERN WelsSampleSatd4x4_sse41
1285    %assign  push_num 0
1286    INIT_X86_32_PIC r5
1287    LOAD_4_PARA
1288    PUSH_XMM 8
1289    SIGN_EXTENSION r1, r1d
1290    SIGN_EXTENSION r3, r3d
1291    movdqa      xmm4,[pic(HSwapSumSubDB1)]
    ; shufps with immediate 0 yields dwords {rowA, rowA, rowB, rowB}:
    ; xmm2 = second block rows 0,1   xmm3 = second block rows 2,3
1292    movd        xmm2,[r2]
1293    movd        xmm5,[r2+r3]
1294    shufps      xmm2,xmm5,0
1295    movd        xmm3,[r2+r3*2]
1296    lea         r2, [r3*2+r2]
1297    movd        xmm5,[r2+r3]
1298    shufps      xmm3,xmm5,0
    ; xmm0 = first block rows 0,1    xmm1 = first block rows 2,3
1299    movd        xmm0,[r0]
1300    movd        xmm5,[r0+r1]
1301    shufps      xmm0,xmm5,0
1302    movd        xmm1,[r0+r1*2]
1303    lea         r0, [r1*2+r0]
1304    movd        xmm5,[r0+r1]
1305    shufps      xmm1,xmm5,0
1306    pmaddubsw   xmm0,xmm4
1307    pmaddubsw   xmm1,xmm4
1308    pmaddubsw   xmm2,xmm4
1309    pmaddubsw   xmm3,xmm4
    ; difference of the two blocks
1310    psubw       xmm0,xmm2
1311    psubw       xmm1,xmm3
    ; butterfly between register pair (rows 0,1) and (rows 2,3)
1312    movdqa      xmm2,xmm0
1313    paddw       xmm0,xmm1
1314    psubw       xmm1,xmm2
    ; regroup the qwords, then a second sum/difference butterfly
1315    movdqa      xmm2,xmm0
1316    punpcklqdq  xmm0,xmm1
1317    punpckhqdq  xmm2,xmm1
1318    movdqa      xmm1,xmm0
1319    paddw       xmm0,xmm2
1320    psubw       xmm2,xmm1
    ; pair alternating words of xmm0 / xmm2, take absolute values and keep the
    ; per-word maximum (signed max is used here, on values already made non-negative)
1321    movdqa      xmm1,xmm0
1322    pblendw     xmm0,xmm2,0AAh
1323    pslld       xmm2,16
1324    psrld       xmm1,16
1325    por         xmm2,xmm1
1326    pabsw       xmm0,xmm0
1327    pabsw       xmm2,xmm2
1328    pmaxsw      xmm0,xmm2
1329    SSSE3_SumWHorizon retrd, xmm0, xmm5, xmm7
1330    POP_XMM
1331    LOAD_4_PARA_POP
1332    DEINIT_X86_32_PIC
1333    ret
1334
1335;***********************************************************************
1336;
1337;int32_t WelsSampleSatd8x8_sse41( uint8_t *, int32_t, uint8_t *, int32_t );
1338;
1339;***********************************************************************
; WelsSampleSatd8x8_sse41
; Register roles after LOAD_4_PARA, as used below:
;   r0 / r1 : first block pointer / stride
;   r2 / r3 : second block pointer / stride
;   retrd   : horizontal sum of the xmm6 accumulator
; Runs SSE41_GetSatd8x4 on two consecutive 4-row strips.
; On X86_32 r4 and r5 are saved here because they are used as 3*stride scratch;
; r6 is handed to INIT_X86_32_PIC as the PIC base register.
1340WELS_EXTERN WelsSampleSatd8x8_sse41
1341%ifdef X86_32
1342    push  r4
1343    push  r5
1344%endif
1345    %assign  push_num 2
1346    INIT_X86_32_PIC r6
1347    LOAD_4_PARA
1348    PUSH_XMM 8
1349    SIGN_EXTENSION r1, r1d
1350    SIGN_EXTENSION r3, r3d
1351
1352    movdqa      xmm7, [pic(HSumSubDB1)]
    ; r4 = 3 * stride of first block, r5 = 3 * stride of second block
1353    lea         r4,  [r1+r1*2]
1354    lea         r5,  [r3+r3*2]
    ; xmm6 = accumulator
1355    pxor        xmm6, xmm6
1356    SSE41_GetSatd8x4
1357    lea         r0,  [r0+4*r1]
1358    lea         r2,  [r2+4*r3]
1359    SSE41_GetSatd8x4
1360    SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
1361    POP_XMM
1362    LOAD_4_PARA_POP
1363    DEINIT_X86_32_PIC
1364%ifdef X86_32
1365    pop  r5
1366    pop  r4
1367%endif
1368    ret
1369
1370;***********************************************************************
1371;
1372;int32_t WelsSampleSatd8x16_sse41( uint8_t *, int32_t, uint8_t *, int32_t );
1373;
1374;***********************************************************************
; WelsSampleSatd8x16_sse41
; Register roles after LOAD_4_PARA, as used below:
;   r0 / r1 : first block pointer / stride
;   r2 / r3 : second block pointer / stride
;   retrd   : horizontal sum of the xmm6 accumulator
; Loops SSE41_GetSatd8x4 over four consecutive 4-row strips (16 rows); r6 is the
; loop counter.
1375WELS_EXTERN WelsSampleSatd8x16_sse41
1376%ifdef X86_32
1377    push  r4
1378    push  r5
1379    push  r6
1380%endif
1381    %assign  push_num 3
1382    LOAD_4_PARA
1383    PUSH_XMM 8
1384    SIGN_EXTENSION r1, r1d
1385    SIGN_EXTENSION r3, r3d
1386
    ; r4 serves as the PIC base only for this one load; it is overwritten right after
1387    INIT_X86_32_PIC_NOPRESERVE r4
1388    movdqa      xmm7, [pic(HSumSubDB1)]
1389    DEINIT_X86_32_PIC
    ; r4 = 3 * stride of first block, r5 = 3 * stride of second block
1390    lea         r4,  [r1+r1*2]
1391    lea         r5,  [r3+r3*2]
1392    pxor        xmm6, xmm6
1393    mov         r6,    0
1394loop_get_satd_8x16:
1395    SSE41_GetSatd8x4
1396    lea         r0,  [r0+4*r1]
1397    lea         r2,  [r2+4*r3]
1398    inc         r6
1399    cmp         r6,  4
1400    jl          loop_get_satd_8x16
1401    SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
1402    POP_XMM
1403    LOAD_4_PARA_POP
1404%ifdef X86_32
1405    pop  r6
1406    pop  r5
1407    pop  r4
1408%endif
1409    ret
1410
1411;***********************************************************************
1412;
1413;int32_t WelsSampleSatd16x8_sse41( uint8_t *, int32_t, uint8_t *, int32_t );
1414;
1415;***********************************************************************
; WelsSampleSatd16x8_sse41
; Register roles after LOAD_4_PARA, as used below:
;   r0 / r1 : first block pointer / stride
;   r2 / r3 : second block pointer / stride
;   retrd   : horizontal sum of the xmm6 accumulator
; Processes the left 8 columns (two 4-row strips), then restores the pointers,
; steps 8 bytes to the right and processes the right 8 columns the same way.
1416WELS_EXTERN WelsSampleSatd16x8_sse41
1417%ifdef X86_32
1418    push  r4
1419    push  r5
1420%endif
1421    %assign  push_num 2
1422    INIT_X86_32_PIC r6
1423    LOAD_4_PARA
1424    PUSH_XMM 8
1425    SIGN_EXTENSION r1, r1d
1426    SIGN_EXTENSION r3, r3d
    ; remember the block origins for the right-hand half
1427    push  r0
1428    push  r2
1429
1430    movdqa      xmm7, [pic(HSumSubDB1)]
    ; r4 = 3 * stride of first block, r5 = 3 * stride of second block
1431    lea         r4,  [r1+r1*2]
1432    lea         r5,  [r3+r3*2]
1433    pxor        xmm6,   xmm6
    ; left 8 columns, rows 0-3 then rows 4-7
1434    SSE41_GetSatd8x4
1435    lea         r0,  [r0+4*r1]
1436    lea         r2,  [r2+4*r3]
1437    SSE41_GetSatd8x4
1438
    ; right 8 columns, rows 0-3 then rows 4-7
1439    pop  r2
1440    pop  r0
1441    add         r0,    8
1442    add         r2,    8
1443    SSE41_GetSatd8x4
1444    lea         r0,  [r0+4*r1]
1445    lea         r2,  [r2+4*r3]
1446    SSE41_GetSatd8x4
1447    SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
1448    POP_XMM
1449    LOAD_4_PARA_POP
1450    DEINIT_X86_32_PIC
1451%ifdef X86_32
1452    pop  r5
1453    pop  r4
1454%endif
1455    ret
1456
1457;***********************************************************************
1458;
1459;int32_t WelsSampleSatd16x16_sse41( uint8_t *, int32_t, uint8_t *, int32_t );
1460;
1461;***********************************************************************
1462
; WelsSampleSatd16x16_sse41
; Register roles after LOAD_4_PARA, as used below:
;   r0 / r1 : first block pointer / stride
;   r2 / r3 : second block pointer / stride
;   retrd   : horizontal sum of the xmm6 accumulator
; Two passes of four SSE41_GetSatd8x4 strips each: first the left 8 columns, then
; (after restoring the pointers and adding 8) the right 8 columns. r6 is the loop
; counter of both passes.
1463WELS_EXTERN WelsSampleSatd16x16_sse41
1464%ifdef X86_32
1465    push  r4
1466    push  r5
1467    push  r6
1468%endif
1469    %assign  push_num 3
1470    LOAD_4_PARA
1471    PUSH_XMM 8
1472    SIGN_EXTENSION r1, r1d
1473    SIGN_EXTENSION r3, r3d
1474
    ; remember the block origins for the right-hand pass
1475    push  r0
1476    push  r2
1477
    ; r4 serves as the PIC base only for this one load; it is overwritten right after
1478    INIT_X86_32_PIC_NOPRESERVE r4
1479    movdqa      xmm7, [pic(HSumSubDB1)]
1480    DEINIT_X86_32_PIC
    ; r4 = 3 * stride of first block, r5 = 3 * stride of second block
1481    lea         r4,  [r1+r1*2]
1482    lea         r5,  [r3+r3*2]
1483    pxor        xmm6,   xmm6
1484    mov         r6,    0
1485loop_get_satd_16x16_left:
1486    SSE41_GetSatd8x4
1487    lea         r0,  [r0+4*r1]
1488    lea         r2,  [r2+4*r3]
1489    inc         r6
1490    cmp         r6,  4
1491    jl          loop_get_satd_16x16_left
1492
1493    pop  r2
1494    pop  r0
1495    add         r0,    8
1496    add         r2,    8
1497    mov         r6,    0
1498loop_get_satd_16x16_right:
1499    SSE41_GetSatd8x4
1500    lea         r0,  [r0+4*r1]
1501    lea         r2,  [r2+4*r3]
1502    inc         r6
1503    cmp         r6,  4
1504    jl          loop_get_satd_16x16_right
1505    SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
1506    POP_XMM
1507    LOAD_4_PARA_POP
1508%ifdef X86_32
1509    pop  r6
1510    pop  r5
1511    pop  r4
1512%endif
1513    ret
1514
1515;***********************************************************************
1516;
1517;Pixel_satd_wxh_sse41 END
1518;
1519;***********************************************************************
1520
1521;***********************************************************************
1522;
1523;Pixel_satd_wxh_avx2 BEGIN
1524;
1525;***********************************************************************
1526
1527%ifdef HAVE_AVX2
; out=%1 pSrcA=%2 pSrcB=%3 HSumSubDB1_256=%4 ymm_clobber=%5
; Loads one 16-byte row from each source, duplicated into both 128-bit lanes,
; multiplies each by %4 with vpmaddubsw and leaves (row A - row B) in %1.
; %2 and %3 are address expressions (the macro adds the brackets).
1529%macro AVX2_LoadDiffSatd16x1 5
1530    vbroadcasti128   %1, [%2]
1531    vpmaddubsw       %1, %1, %4             ; hadamard neighboring horizontal sums and differences
1532    vbroadcasti128   %5, [%3]
1533    vpmaddubsw       %5, %5, %4             ; hadamard neighboring horizontal sums and differences
1534    vpsubw           %1, %1, %5             ; diff srcA srcB
1535%endmacro
1536
; out=%1 pSrcA=%2 pSrcA+4*iStride=%3 pSrcB=%4 pSrcB+4*iStride=%5 HSumSubDB1_128x2=%6 ymm_clobber=%7,%8
; Loads two 8-byte rows per source: the row at %2 / %4 fills the low 128-bit lane
; (duplicated into both qwords) and the row at %3 / %5 fills the high lane, as selected
; by the 11110000b dword blend mask. Leaves (rows A - rows B) after vpmaddubsw in %1.
; %2..%5 are address expressions (the macro adds the brackets).
1538%macro AVX2_LoadDiffSatd8x2 8
1539    vpbroadcastq     %1, [%2]
1540    vpbroadcastq     %7, [%3]
1541    vpblendd         %1, %1, %7, 11110000b
1542    vpmaddubsw       %1, %1, %6             ; hadamard neighboring horizontal sums and differences
1543    vpbroadcastq     %7, [%4]
1544    vpbroadcastq     %8, [%5]
1545    vpblendd         %7, %7, %8, 11110000b
1546    vpmaddubsw       %7, %7, %6             ; hadamard neighboring horizontal sums and differences
1547    vpsubw           %1, %1, %7             ; diff srcA srcB
1548%endmacro
1549
; in/out=%1,%2,%3,%4 clobber=%5
; Four-input word butterfly across registers. With inputs x0..x3 in %1..%4 the
; outputs are: %1 = y0, %2 = y1, %3 = y2, %4 = y3, as named in the line comments.
1551%macro AVX2_HDMFour4x4 5
1552    vpsubw           %5, %1, %4             ; s3 = x0 - x3
1553    vpaddw           %1, %1, %4             ; s0 = x0 + x3
1554    vpsubw           %4, %2, %3             ; s2 = x1 - x2
1555    vpaddw           %2, %2, %3             ; s1 = x1 + x2
1556    vpsubw           %3, %1, %2             ; y2 = s0 - s1
1557    vpaddw           %1, %1, %2             ; y0 = s0 + s1
1558    vpaddw           %2, %5, %4             ; y1 = s3 + s2
1559    vpsubw           %4, %5, %4             ; y3 = s3 - s2
1560%endmacro
1561
; out=%1 in=%1,%2,%3,%4 clobber=%5
; Runs AVX2_HDMFour4x4 on the four inputs, takes absolute values, and reduces them
; to one register of word partial sums in %1. %2, %3, %4 and %5 are all overwritten.
1563%macro AVX2_SatdFour4x4 5
1564    AVX2_HDMFour4x4  %1, %2, %3, %4, %5
1565    vpabsw           %1, %1
1566    vpabsw           %2, %2
1567    vpabsw           %3, %3
1568    vpabsw           %4, %4
1569    ; second stage of horizontal hadamard.
1570    ; utilizes that |a + b| + |a - b| = 2 * max(|a|, |b|)
    ; %5 = even words of %1 with odd words of %2; %2 = the complementary words of
    ; %1 / %2, swapped within each dword so they line up with %5
1571    vpblendw         %5, %1, %2, 10101010b
1572    vpslld           %2, %2, 16
1573    vpsrld           %1, %1, 16
1574    vpor             %2, %2, %1
1575    vpmaxuw          %2, %2, %5
    ; same pairing for %3 / %4
1576    vpblendw         %5, %3, %4, 10101010b
1577    vpslld           %4, %4, 16
1578    vpsrld           %3, %3, 16
1579    vpor             %4, %4, %3
1580    vpmaxuw          %3, %5, %4
1581    vpaddw           %1, %2, %3
1582%endmacro
1583
; out=%1 pSrcA=%2 iStrideA=%3 3*iStrideA=%4 pSrcB=%5 iStrideB=%6 3*iStrideB=%7 HSumSubDB1_256=%8 ymm_clobber=%9,%10,%11,%12
; Loads the differences of rows 0..3 (row 3 is addressed through the 3*stride
; operands) into %1, %9, %10, %11 and reduces them with AVX2_SatdFour4x4 into %1.
; The pointer registers %2 and %5 are not modified.
1585%macro AVX2_GetSatd16x4 12
1586    AVX2_LoadDiffSatd16x1  %1, %2 + 0 * %3, %5 + 0 * %6, %8, %12
1587    AVX2_LoadDiffSatd16x1  %9, %2 + 1 * %3, %5 + 1 * %6, %8, %12
1588    AVX2_LoadDiffSatd16x1 %10, %2 + 2 * %3, %5 + 2 * %6, %8, %12
1589    AVX2_LoadDiffSatd16x1 %11, %2 + 1 * %4, %5 + 1 * %7, %8, %12
1590    AVX2_SatdFour4x4 %1, %9, %10, %11, %12
1591%endmacro
1592
; out=%1 pSrcA=%2 iStrideA=%3 3*iStrideA=%4 pSrcB=%5 iStrideB=%6 3*iStrideB=%7 HSumSubDB1_128x2=%8 ymm_clobber=%9,%10,%11,%12,%13
; Row pairing per register (low lane / high lane): %1 = rows 0/4, %10 = rows 2/6,
; then, after stepping both pointers down one row, %9 = rows 1/5, %11 = rows 3/7.
; Side effect: %2 and %5 are left advanced by one stride each.
1594%macro AVX2_GetSatd8x8 13
1595    AVX2_LoadDiffSatd8x2  %1, %2 + 0 * %3, %2 + 4 * %3, %5 + 0 * %6, %5 + 4 * %6, %8, %12, %13
1596    AVX2_LoadDiffSatd8x2 %10, %2 + 2 * %3, %2 + 2 * %4, %5 + 2 * %6, %5 + 2 * %7, %8, %12, %13
1597    add              %2, %3
1598    add              %5, %6
1599    AVX2_LoadDiffSatd8x2  %9, %2 + 0 * %3, %2 + 4 * %3, %5 + 0 * %6, %5 + 4 * %6, %8, %12, %13
1600    AVX2_LoadDiffSatd8x2 %11, %2 + 2 * %3, %2 + 2 * %4, %5 + 2 * %6, %5 + 2 * %7, %8, %12, %13
1601    AVX2_SatdFour4x4 %1, %9, %10, %11, %12
1602%endmacro
1603
; d_out=%1 mm_in=%2 mm_clobber=%3
; %2 and %3 are passed as "mmN" suffixes; the macro prepends x / y to address the
; 128-bit / 256-bit view of the same register.
; Produces in %1 the horizontal sum of the 16 words of y%2, assuming WELS_DW1_VEX
; (defined outside this chunk) fills y%3 with words of 1 - TODO confirm.
1605%macro AVX2_SumWHorizon 3
1606    WELS_DW1_VEX     y%3
    ; adjacent word pairs -> 8 dwords
1607    vpmaddwd         y%2, y%2, y%3
    ; fold upper 128-bit lane onto the lower one
1608    vextracti128     x%3, y%2, 1
1609    vpaddd           x%2, x%2, x%3
    ; fold high qword onto low qword
1610    vpunpckhqdq      x%3, x%2, x%2
1611    vpaddd           x%2, x%2, x%3
    ; fold dword 1 onto dword 0
1612    vpsrldq          x%3, x%2, 4
1613    vpaddd           x%2, x%2, x%3
1614    vmovd            %1, x%2
1615%endmacro
1616
1617;***********************************************************************
1618;
1619;int32_t WelsSampleSatd8x16_avx2( uint8_t *, int32_t, uint8_t *, int32_t );
1620;
1621;***********************************************************************
1622
; WelsSampleSatd8x16_avx2
; Entry stub: sets the loop count r4 = 2 (two 8x8 blocks) and jumps into the shared
; body WelsSampleSatd8x8N_avx2. On X86_32, r4 is pushed here and popped by the shared
; epilogue.
1623WELS_EXTERN WelsSampleSatd8x16_avx2
1624    %assign push_num 0
1625%ifdef X86_32
1626    push r4
1627    %assign push_num 1
1628%endif
1629    mov r4, 2                      ; loop cnt
1630    jmp WelsSampleSatd8x8N_avx2
1631
1632;***********************************************************************
1633;
1634;int32_t WelsSampleSatd8x8_avx2( uint8_t *, int32_t, uint8_t *, int32_t );
1635;
1636;***********************************************************************
1637
; WelsSampleSatd8x8_avx2
; Sets the loop count r4 = 1 and falls through into WelsSampleSatd8x8N_avx2.
;
; WelsSampleSatd8x8N_avx2 (shared body, also entered from WelsSampleSatd8x16_avx2)
;   in : r4 = number of vertically stacked 8x8 blocks to process (>= 1)
;   Register roles after LOAD_4_PARA, as used below:
;     r0 / r1 : first block pointer / stride
;     r2 / r3 : second block pointer / stride
;   out: retrd = horizontal sum of the ymm6 accumulator
;   On X86_32 the entry stubs have already pushed r4; r5 and r6 are pushed here and
;   all three are popped in the epilogue.
1638WELS_EXTERN WelsSampleSatd8x8_avx2
1639    %assign push_num 0
1640%ifdef X86_32
1641    push           r4
1642    %assign push_num 1
1643%endif
1644    mov            r4, 1           ; loop cnt
1645                                   ; fall through
1646WelsSampleSatd8x8N_avx2:
1647%ifdef X86_32
1648    push           r5
1649    push           r6
1650    %assign push_num push_num+2
1651%endif
1652    LOAD_4_PARA
1653    PUSH_XMM 8
1654    SIGN_EXTENSION r1, r1d
1655    SIGN_EXTENSION r3, r3d
1656
    ; r5 serves as the PIC base only for this one load; it is overwritten right after
1657    INIT_X86_32_PIC_NOPRESERVE r5
1658    vbroadcasti128 ymm7, [pic(HSumSubDB1)]
1659    DEINIT_X86_32_PIC
    ; r5 = 3 * stride of first block, r6 = 3 * stride of second block
1660    lea            r5, [3 * r1]
1661    lea            r6, [3 * r3]
    ; ymm6 = accumulator
1662    vpxor          ymm6, ymm6, ymm6
1663.loop:
1664    AVX2_GetSatd8x8 ymm0, r0, r1, r5, r2, r3, r6, ymm7, ymm1, ymm2, ymm3, ymm4, ymm5
1665    vpaddw         ymm6, ymm6, ymm0
1666    sub            r4, 1
1667    jbe            .loop_end
    ; AVX2_GetSatd8x8 already advanced r0 / r2 by one row; 3 + 4 more rows reach
    ; the next 8x8 block
1668    add            r0, r5
1669    add            r2, r6
1670    lea            r0, [r0 + 4 * r1]
1671    lea            r2, [r2 + 4 * r3]
1672    jmp            .loop
1673.loop_end:
1674    AVX2_SumWHorizon retrd, mm6, mm5
1675    vzeroupper
1676    POP_XMM
1677    LOAD_4_PARA_POP
1678%ifdef X86_32
1679    pop            r6
1680    pop            r5
1681    pop            r4
1682%endif
1683    ret
1684
1685;***********************************************************************
1686;
1687;int32_t WelsSampleSatd16x16_avx2( uint8_t *, int32_t, uint8_t *, int32_t );
1688;
1689;***********************************************************************
1690
; WelsSampleSatd16x16_avx2
; Entry stub: sets the loop count r4 = 4 (four 16x4 strips) and jumps into the shared
; body WelsSampleSatd16x4N_avx2. On X86_32, r4 is pushed here and popped by the shared
; epilogue.
1691WELS_EXTERN WelsSampleSatd16x16_avx2
1692    %assign push_num 0
1693%ifdef X86_32
1694    push r4
1695    %assign push_num 1
1696%endif
1697    mov r4, 4                      ; loop cnt
1698    jmp WelsSampleSatd16x4N_avx2
1699
1700;***********************************************************************
1701;
1702;int32_t WelsSampleSatd16x8_avx2( uint8_t *, int32_t, uint8_t *, int32_t );
1703;
1704;***********************************************************************
1705
; WelsSampleSatd16x8_avx2
; Sets the loop count r4 = 2 and falls through into WelsSampleSatd16x4N_avx2.
;
; WelsSampleSatd16x4N_avx2 (shared body, also entered from WelsSampleSatd16x16_avx2)
;   in : r4 = number of vertically stacked 16x4 strips to process (>= 1)
;   Register roles after LOAD_4_PARA, as used below:
;     r0 / r1 : first block pointer / stride
;     r2 / r3 : second block pointer / stride
;   out: retrd = horizontal sum of the ymm5 accumulator
;   On X86_32 the entry stubs have already pushed r4; r5 and r6 are pushed here and
;   all three are popped in the epilogue.
1706WELS_EXTERN WelsSampleSatd16x8_avx2
1707    %assign push_num 0
1708%ifdef X86_32
1709    push r4
1710    %assign push_num 1
1711%endif
1712    mov r4, 2                      ; loop cnt
1713                                   ; fall through
1714WelsSampleSatd16x4N_avx2:
1715%ifdef X86_32
1716    push r5
1717    push r6
1718    %assign push_num push_num+2
1719%endif
1720    LOAD_4_PARA
1721    PUSH_XMM 7
1722    SIGN_EXTENSION r1, r1d
1723    SIGN_EXTENSION r3, r3d
1724
    ; ymm6: low lane = first 8 bytes of HSumSubDB1 repeated, high lane = second 8 bytes
    ; repeated. r5 serves as the PIC base only for these loads.
1725    INIT_X86_32_PIC_NOPRESERVE r5
1726    vpbroadcastq xmm0, [pic(HSumSubDB1)]
1727    vpbroadcastq ymm6, [pic(HSumSubDB1 + 8)]
1728    vpblendd     ymm6, ymm0, ymm6, 11110000b
1729    DEINIT_X86_32_PIC
    ; r5 = 3 * stride of first block, r6 = 3 * stride of second block
1730    lea          r5, [3 * r1]
1731    lea          r6, [3 * r3]
    ; ymm5 = accumulator
1732    vpxor        ymm5, ymm5, ymm5
1733.loop:
1734    AVX2_GetSatd16x4 ymm0, r0, r1, r5, r2, r3, r6, ymm6, ymm1, ymm2, ymm3, ymm4
1735    vpaddw       ymm5, ymm5, ymm0
1736    lea          r0, [r0 + 4 * r1]
1737    lea          r2, [r2 + 4 * r3]
1738    sub          r4, 1
1739    ja           .loop
1740    AVX2_SumWHorizon retrd, mm5, mm0
1741    vzeroupper
1742    POP_XMM
1743    LOAD_4_PARA_POP
1744%ifdef X86_32
1745    pop r6
1746    pop r5
1747    pop r4
1748%endif
1749    ret
1750
1751%endif
1752
1753;***********************************************************************
1754;
1755;Pixel_satd_wxh_avx2 END
1756;
1757;***********************************************************************
1758
1759;***********************************************************************
1760;
1761;Pixel_sad_wxh_sse2 BEGIN
1762;
1763;***********************************************************************
1764
; SSE2_GetSad2x16 (no macro arguments; operates on fixed registers)
;   in/out: r0 / r2 are FIRST advanced by two rows (2*r1 / 2*r3), then two 16-byte
;           rows are read from each
;   acc   : xmm0 += psadbw of both row pairs
;   clobbers xmm1, xmm2
; Rows at r2 are read with movdqu; rows at r0 go through the MOVDQ macro (defined
; outside this chunk).
1765%macro SSE2_GetSad2x16 0
1766    lea    r0,    [r0+2*r1]
1767    lea    r2,    [r2+2*r3]
1768    movdqu xmm1,   [r2]
1769    MOVDQ  xmm2,   [r0];[eax] must aligned 16
1770    psadbw xmm1,   xmm2
1771    paddw  xmm0,   xmm1
1772    movdqu xmm1,   [r2+r3]
1773    MOVDQ  xmm2,   [r0+r1]
1774    psadbw xmm1,   xmm2
1775    paddw  xmm0,   xmm1
1776%endmacro
1777
1778
; SSE2_GetSad4x16 (no macro arguments; operates on fixed registers)
;   in : r0 / r1 = first block pointer / stride,  r4 = 3*r1 (set up by the caller)
;        r2 / r3 = second block pointer / stride, r5 = 3*r3 (set up by the caller)
;   acc: xmm7 += psadbw of four 16-byte row pairs
;   clobbers xmm0, xmm1, xmm2; r0 and r2 are not advanced
; Rows at r2 are read with movdqu; rows at r0 go through the MOVDQ macro (defined
; outside this chunk).
1779%macro SSE2_GetSad4x16 0
1780    movdqu xmm0,   [r2]
1781    MOVDQ  xmm2,   [r0]
1782    psadbw xmm0,   xmm2
1783    paddw  xmm7,   xmm0
1784    movdqu xmm1,   [r2+r3]
1785    MOVDQ  xmm2,   [r0+r1]
1786    psadbw xmm1,   xmm2
1787    paddw  xmm7,   xmm1
1788    movdqu xmm1,   [r2+2*r3]
1789    MOVDQ  xmm2,   [r0+2*r1];[eax] must aligned 16
1790    psadbw xmm1,   xmm2
1791    paddw  xmm7,   xmm1
1792    movdqu xmm1,   [r2+r5]
1793    MOVDQ  xmm2,   [r0+r4]
1794    psadbw xmm1,   xmm2
1795    paddw  xmm7,   xmm1
1796%endmacro
1797
1798
; SSE2_GetSad8x4 (no macro arguments; operates on fixed registers)
;   in : r0 / r1 = first block pointer / stride, r2 / r3 = second block pointer / stride
;   acc: xmm6 += psadbw of four 8-byte row pairs (rows 0,2 packed in one register,
;        rows 1,3 in another)
;   clobbers xmm0-xmm3
; Side effect: r0 and r2 are left advanced by two rows, so callers add another two
; rows to reach the next 4-row strip.
1799%macro SSE2_GetSad8x4 0
1800    movq   xmm0,   [r0]
1801    movq   xmm1,   [r0+r1]
1802    lea    r0,     [r0+2*r1]
1803    movhps xmm0,   [r0]
1804    movhps xmm1,   [r0+r1]
1805
1806    movq   xmm2,   [r2]
1807    movq   xmm3,   [r2+r3]
1808    lea    r2,     [r2+2*r3]
1809    movhps xmm2,   [r2]
1810    movhps xmm3,   [r2+r3]
1811    psadbw xmm0,   xmm2
1812    psadbw xmm1,   xmm3
1813    paddw  xmm6,   xmm0
1814    paddw  xmm6,   xmm1
1815%endmacro
1816
1817;***********************************************************************
1818;
1819;int32_t WelsSampleSad16x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
1820;The first pointer parameter may be 16-byte aligned;
1821;in Wels, the third (pointer) parameter cannot be assumed to be 16-byte aligned.
1822;
1823;***********************************************************************
; WelsSampleSad16x16_sse2
; Register roles after LOAD_4_PARA, as used below:
;   r0 / r1 : first block pointer / stride  (rows loaded through MOVDQ)
;   r2 / r3 : second block pointer / stride (rows loaded with movdqu)
;   retrd   : sum of absolute differences over 16 rows of 16 bytes
; On X86_32 r4 and r5 are saved here because they are used as 3*stride scratch.
1824WELS_EXTERN WelsSampleSad16x16_sse2
1825%ifdef X86_32
1826    push  r4
1827    push  r5
1828%endif
1829
1830    %assign  push_num 2
1831    LOAD_4_PARA
1832    PUSH_XMM 8
1833    SIGN_EXTENSION r1, r1d
1834    SIGN_EXTENSION r3, r3d
    ; r4 = 3 * stride of first block, r5 = 3 * stride of second block
1835    lea r4, [3*r1]
1836    lea r5, [3*r3]
1837
    ; xmm7 = accumulator; four strips of four rows
1838    pxor   xmm7,   xmm7
1839    SSE2_GetSad4x16
1840    lea    r0,  [r0+4*r1]
1841    lea    r2,  [r2+4*r3]
1842    SSE2_GetSad4x16
1843    lea    r0,  [r0+4*r1]
1844    lea    r2,  [r2+4*r3]
1845    SSE2_GetSad4x16
1846    lea    r0,  [r0+4*r1]
1847    lea    r2,  [r2+4*r3]
1848    SSE2_GetSad4x16
    ; add the two 64-bit psadbw partial sums; only the low dword is returned
1849    movhlps xmm0, xmm7
1850    paddw xmm0, xmm7
1851    movd retrd, xmm0
1852    POP_XMM
1853    LOAD_4_PARA_POP
1854%ifdef X86_32
1855    pop  r5
1856    pop  r4
1857%endif
1858    ret
1859
1860;***********************************************************************
1861;
1862;int32_t WelsSampleSad16x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
1863;The first pointer parameter may be 16-byte aligned;
1864;in Wels, the third (pointer) parameter cannot be assumed to be 16-byte aligned.
1865;
1866;***********************************************************************
; WelsSampleSad16x8_sse2
; Register roles after LOAD_4_PARA, as used below:
;   r0 / r1 : first block pointer / stride  (rows loaded through MOVDQ)
;   r2 / r3 : second block pointer / stride (rows loaded with movdqu)
;   retrd   : sum of absolute differences over 8 rows of 16 bytes
; Only xmm0-xmm2 are used, and no PUSH_XMM / POP_XMM pair is issued.
1867WELS_EXTERN WelsSampleSad16x8_sse2
1868    %assign  push_num 0
1869    LOAD_4_PARA
1870    SIGN_EXTENSION r1, r1d
1871    SIGN_EXTENSION r3, r3d
    ; rows 0 and 1 are handled inline and initialise the accumulator xmm0
1872    movdqu xmm0,   [r2]
1873    MOVDQ  xmm2,   [r0]
1874    psadbw xmm0,   xmm2
1875    movdqu xmm1,   [r2+r3]
1876    MOVDQ  xmm2,   [r0+r1]
1877    psadbw xmm1,   xmm2
1878    paddw  xmm0,   xmm1
1879
    ; each call advances both pointers by two rows first: rows 2-3, 4-5, 6-7
1880    SSE2_GetSad2x16
1881    SSE2_GetSad2x16
1882    SSE2_GetSad2x16
1883
    ; add the two 64-bit psadbw partial sums
1884    movhlps     xmm1, xmm0
1885    paddw       xmm0, xmm1
1886    movd        retrd,  xmm0
1887    LOAD_4_PARA_POP
1888    ret
1889
1890
1891
; WelsSampleSad8x16_sse2
; Register roles after LOAD_4_PARA, as used below:
;   r0 / r1 : first block pointer / stride
;   r2 / r3 : second block pointer / stride
;   retrd   : sum of absolute differences over 16 rows of 8 bytes
; All rows are read with movq / movhps, so no 16-byte alignment is required.
1892WELS_EXTERN WelsSampleSad8x16_sse2
1893    %assign  push_num 0
1894    LOAD_4_PARA
1895    PUSH_XMM 7
1896    SIGN_EXTENSION r1, r1d
1897    SIGN_EXTENSION r3, r3d
    ; xmm6 = accumulator
1898    pxor   xmm6,   xmm6
1899
    ; SSE2_GetSad8x4 leaves r0 / r2 two rows ahead; the lea pairs add the other two
1900    SSE2_GetSad8x4
1901    lea    r0,    [r0+2*r1]
1902    lea    r2,    [r2+2*r3]
1903    SSE2_GetSad8x4
1904    lea    r0,    [r0+2*r1]
1905    lea    r2,    [r2+2*r3]
1906    SSE2_GetSad8x4
1907    lea    r0,    [r0+2*r1]
1908    lea    r2,    [r2+2*r3]
1909    SSE2_GetSad8x4
1910
    ; add the two 64-bit psadbw partial sums
1911    movhlps    xmm0, xmm6
1912    paddw      xmm0, xmm6
1913    movd       retrd,  xmm0
1914    POP_XMM
1915    LOAD_4_PARA_POP
1916    ret
1917
1918
; CACHE_SPLIT_CHECK  %1 = register holding an address (DESTROYED), %2 = access width
;                    in bytes, %3 = cache line size in bytes
; Masks %1 and compares it against a threshold; only the flags are the result.
; For the (8, 64) use in this file the mask is 0x3f and the threshold is 56, so a
; following "jle" is taken when the 8-byte access stays within one 64-byte line.
1919%macro CACHE_SPLIT_CHECK 3 ; address, width, cacheline
1920and    %1,  0x1f|(%3>>1)
1921cmp    %1,  (32-%2)|(%3>>1)
1922%endmacro
1923
; WelsSampleSad8x8_sse21
; 8x8 sum of absolute differences with a special path for reference rows that
; straddle a 64-byte boundary.
; Arguments (arg1..arg4 / LOAD_4_PARA, as used below):
;   arg1 -> r0 : first block pointer        arg2 -> r1 : its stride
;   arg3 -> r2 : second block pointer       arg4 -> r3 : its stride
;   retrd      : sum of absolute differences over 8 rows of 8 bytes
; Path selection: CACHE_SPLIT_CHECK r2, 8, 64 tests (r2 & 0x3f) against 56.
;   <= 56 : .pixel_sad_8x8_nsplit - plain SSE2_GetSad8x4 on two 4-row strips.
;   >  56 : split path - every row of the second block is rebuilt from the two
;           8-byte-aligned qwords around it (r2 rounded down to 8, r5 = r2 + 8),
;           shifted by xmm5 / xmm6 and OR-ed together, so that no load crosses the
;           boundary.
; Each path ends in its own ret.
1924WELS_EXTERN WelsSampleSad8x8_sse21
1925    %assign  push_num 0
    ; CACHE_SPLIT_CHECK destroys its address register, so keep a copy on the stack
1926    mov     r2,  arg3
1927    push    r2
1928    CACHE_SPLIT_CHECK r2, 8, 64
1929    jle    near   .pixel_sad_8x8_nsplit
1930    pop     r2
1931%ifdef X86_32
1932    push    r3
1933    push    r4
1934    push    r5
1935%endif
1936    %assign  push_num 3
1937    PUSH_XMM 8
1938    mov     r0,  arg1
1939    mov     r1,  arg2
1940    SIGN_EXTENSION r1, r1d
    ; xmm7 = accumulator
1941    pxor   xmm7,   xmm7
1942
1943    ;ecx r2, edx r4, edi r5
1944
    ; r5 = misalignment of r2 within 8 bytes, r2 = address rounded down to 8,
    ; r4 = 8 - misalignment
1945    mov    r5,    r2
1946    and    r5,    0x07
1947    sub    r2,    r5
1948    mov    r4,    8
1949    sub    r4,    r5
1950
    ; convert both byte counts to bit counts: xmm5 = right shift, xmm6 = left shift
1951    shl    r5,    3
1952    shl    r4,    3
1953    movd   xmm5,   r5d
1954    movd   xmm6,   r4d
    ; r5 = address of the following aligned qword
1955    mov    r5,    8
1956    add    r5,    r2
1957    mov    r3,    arg4
1958    SIGN_EXTENSION r3, r3d
    ; rows 0-1: xmm0 = two rows of the first block; xmm1 = the matching two rows of
    ; the second block rebuilt as (low qword >> xmm5) | (high qword << xmm6)
1959    movq   xmm0,   [r0]
1960    movhps xmm0,   [r0+r1]
1961
1962    movq   xmm1,   [r2]
1963    movq   xmm2,   [r5]
1964    movhps xmm1,   [r2+r3]
1965    movhps xmm2,   [r5+r3]
1966    psrlq  xmm1,   xmm5
1967    psllq  xmm2,   xmm6
1968    por    xmm1,   xmm2
1969
1970    psadbw xmm0,   xmm1
1971    paddw  xmm7,   xmm0
1972
1973    lea    r0,    [r0+2*r1]
1974    lea    r2,    [r2+2*r3]
1975    lea    r5,    [r5+2*r3]
1976
    ; rows 2-3
1977    movq   xmm0,   [r0]
1978    movhps xmm0,   [r0+r1]
1979
1980    movq   xmm1,   [r2]
1981    movq   xmm2,   [r5]
1982    movhps xmm1,   [r2+r3]
1983    movhps xmm2,   [r5+r3]
1984    psrlq  xmm1,   xmm5
1985    psllq  xmm2,   xmm6
1986    por    xmm1,   xmm2
1987
1988    psadbw xmm0,   xmm1
1989    paddw  xmm7,   xmm0
1990
1991    lea    r0,    [r0+2*r1]
1992    lea    r2,    [r2+2*r3]
1993    lea    r5,    [r5+2*r3]
1994
    ; rows 4-5
1995    movq   xmm0,   [r0]
1996    movhps xmm0,   [r0+r1]
1997
1998    movq   xmm1,   [r2]
1999    movq   xmm2,   [r5]
2000    movhps xmm1,   [r2+r3]
2001    movhps xmm2,   [r5+r3]
2002    psrlq  xmm1,   xmm5
2003    psllq  xmm2,   xmm6
2004    por    xmm1,   xmm2
2005
2006    psadbw xmm0,   xmm1
2007    paddw  xmm7,   xmm0
2008
2009    lea    r0,    [r0+2*r1]
2010    lea    r2,    [r2+2*r3]
2011    lea    r5,    [r5+2*r3]
2012
    ; rows 6-7
2013    movq   xmm0,   [r0]
2014    movhps xmm0,   [r0+r1]
2015
2016    movq   xmm1,   [r2]
2017    movq   xmm2,   [r5]
2018    movhps xmm1,   [r2+r3]
2019    movhps xmm2,   [r5+r3]
2020    psrlq  xmm1,   xmm5
2021    psllq  xmm2,   xmm6
2022    por    xmm1,   xmm2
2023
2024    psadbw xmm0,   xmm1
2025    paddw  xmm7,   xmm0
2026
    ; add the two 64-bit psadbw partial sums
2027    movhlps    xmm0, xmm7
2028    paddw      xmm0, xmm7
2029    movd       retrd,  xmm0
2030    POP_XMM
2031%ifdef X86_32
2032    pop  r5
2033    pop  r4
2034    pop  r3
2035%endif
    ; return directly; nothing else remains to be done on this path
2036    ret
2037
2038.pixel_sad_8x8_nsplit:
2039
    ; drop the saved copy of arg3 so the stack matches push_num 0 for LOAD_4_PARA
2040    pop r2
2041    %assign  push_num 0
2042    LOAD_4_PARA
2043    PUSH_XMM 7
2044    SIGN_EXTENSION r1, r1d
2045    SIGN_EXTENSION r3, r3d
    ; xmm6 = accumulator; SSE2_GetSad8x4 leaves r0 / r2 two rows ahead
2046    pxor   xmm6,   xmm6
2047    SSE2_GetSad8x4
2048    lea    r0,    [r0+2*r1]
2049    lea    r2,    [r2+2*r3]
2050    SSE2_GetSad8x4
2051    movhlps    xmm0, xmm6
2052    paddw      xmm0, xmm6
2053    movd       retrd,  xmm0
2054    POP_XMM
2055    LOAD_4_PARA_POP
2057    ret
2058
2059
2060;***********************************************************************
2061;
2062;Pixel_sad_wxh_sse2 END
2063;
2064;***********************************************************************
2065
2066
2067;***********************************************************************
2068;
2069;Pixel_sad_4_wxh_sse2 BEGIN
2070;
2071;***********************************************************************
2072
2073
; SSE2_Get4LW16Sad  %1 = previous source row, %2 = current source row,
;                   %3 = next source row, %4 = reference row already loaded from [%5],
;                   %5 = address expression of that reference row
; One reference row contributes to four running SADs:
;   xmm5 += SAD(%1, ref row)        (the accumulator the caller labels pRefMb+i_stride_ref)
;   xmm4 += SAD(ref row, %3)        (the accumulator the caller labels pRefMb-i_stride_ref)
;   xmm6 += SAD([%5-1], %2)         (pRefMb-1)
;   xmm7 += SAD([%5+1], %2)         (pRefMb+1)
; Clobbers %1 and %4; %2 and %3 are preserved.
2074%macro SSE2_Get4LW16Sad 5 ;s-1l, s, s+1l, d, address
2075    psadbw %1,   %4
2076    paddw  xmm5, %1
2077    psadbw %4,   %3
2078    paddw  xmm4, %4
2079    movdqu %4,   [%5-1]
2080    psadbw %4,   %2
2081    paddw  xmm6, %4
2082    movdqu %4,   [%5+1]
2083    psadbw %4,   %2
2084    paddw  xmm7, %4
2085%endmacro
; WelsSampleSadFour16x16_sse2
; Computes four 16x16 SADs of one source block against a reference block displaced by
; one row up, one row down, one pixel left and one pixel right.
; Register roles after LOAD_5_PARA, as used below:
;   r0 / r1 : source block pointer / stride; rows are read with movdqa, so they
;             must be 16-byte aligned
;   r2 / r3 : reference block pointer / stride; rows are read with movdqu
;   r4      : output, four dwords written with movdqa (16-byte aligned) in the
;             order { ref-stride, ref+stride, ref-1, ref+1 }
; The code does not write retrd.
; xmm0 / xmm1 / xmm2 rotate through the roles "previous / current / next source
; row" so that each source row is loaded only once.
2086WELS_EXTERN WelsSampleSadFour16x16_sse2
2087    %assign  push_num 0
2088    LOAD_5_PARA
2089    PUSH_XMM 8
2090    SIGN_EXTENSION r1, r1d
2091    SIGN_EXTENSION r3, r3d
2092    pxor   xmm4,   xmm4    ;sad pRefMb-i_stride_ref
2093    pxor   xmm5,   xmm5    ;sad pRefMb+i_stride_ref
2094    pxor   xmm6,   xmm6    ;sad pRefMb-1
2095    pxor   xmm7,   xmm7    ;sad pRefMb+1
    ; prologue: source row 0 against reference row -1 (up)
2096    movdqa xmm0,   [r0]
2097    sub    r2,    r3
2098    movdqu xmm3,   [r2]
2099    psadbw xmm3,   xmm0
2100    paddw  xmm4,   xmm3
2101
    ; source row 1 against reference row 0 (up)
2102    movdqa xmm1,   [r0+r1]
2103    movdqu xmm3,   [r2+r3]
2104    psadbw xmm3,   xmm1
2105    paddw  xmm4,   xmm3
2106
    ; source row 0 against reference row 0 shifted left / right
2107    movdqu xmm2,   [r2+r3-1]
2108    psadbw xmm2,   xmm0
2109    paddw  xmm6,   xmm2
2110
2111    movdqu xmm3,   [r2+r3+1]
2112    psadbw xmm3,   xmm0
2113    paddw  xmm7,   xmm3
2114
    ; steady state: 14 macro invocations, two per 2-row step, each consuming one
    ; reference row and one newly loaded source row
2115    lea    r0,    [r0+2*r1]
2116    lea    r2,    [r2+2*r3]
2117    movdqa xmm2,   [r0]
2118    movdqu xmm3,   [r2]
2119    SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
2120    movdqa xmm0,   [r0+r1]
2121    movdqu xmm3,   [r2+r3]
2122    SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
2123    lea    r0,    [r0+2*r1]
2124    lea    r2,    [r2+2*r3]
2125    movdqa xmm1,   [r0]
2126    movdqu xmm3,   [r2]
2127    SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2
2128    movdqa xmm2,   [r0+r1]
2129    movdqu xmm3,   [r2+r3]
2130    SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2+r3
2131    lea    r0,    [r0+2*r1]
2132    lea    r2,    [r2+2*r3]
2133    movdqa xmm0,   [r0]
2134    movdqu xmm3,   [r2]
2135    SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2
2136    movdqa xmm1,   [r0+r1]
2137    movdqu xmm3,   [r2+r3]
2138    SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2+r3
2139    lea    r0,    [r0+2*r1]
2140    lea    r2,    [r2+2*r3]
2141    movdqa xmm2,   [r0]
2142    movdqu xmm3,   [r2]
2143    SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
2144    movdqa xmm0,   [r0+r1]
2145    movdqu xmm3,   [r2+r3]
2146    SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
2147    lea    r0,    [r0+2*r1]
2148    lea    r2,    [r2+2*r3]
2149    movdqa xmm1,   [r0]
2150    movdqu xmm3,   [r2]
2151    SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2
2152    movdqa xmm2,   [r0+r1]
2153    movdqu xmm3,   [r2+r3]
2154    SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2+r3
2155    lea    r0,    [r0+2*r1]
2156    lea    r2,    [r2+2*r3]
2157    movdqa xmm0,   [r0]
2158    movdqu xmm3,   [r2]
2159    SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2
2160    movdqa xmm1,   [r0+r1]
2161    movdqu xmm3,   [r2+r3]
2162    SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2+r3
2163    lea    r0,    [r0+2*r1]
2164    lea    r2,    [r2+2*r3]
2165    movdqa xmm2,   [r0]
2166    movdqu xmm3,   [r2]
2167    SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
2168    movdqa xmm0,   [r0+r1]
2169    movdqu xmm3,   [r2+r3]
2170    SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
    ; epilogue: xmm2 = source row 14, xmm0 = source row 15; finish the remaining
    ; down / left / right terms using reference rows 15 and 16
2171    lea    r2,    [r2+2*r3]
2172    movdqu xmm3,   [r2]
2173    psadbw xmm2,   xmm3
2174    paddw xmm5,   xmm2
2175
2176    movdqu xmm2,   [r2-1]
2177    psadbw xmm2,   xmm0
2178    paddw xmm6,   xmm2
2179
2180    movdqu xmm3,   [r2+1]
2181    psadbw xmm3,   xmm0
2182    paddw xmm7,   xmm3
2183
2184    movdqu xmm3,   [r2+r3]
2185    psadbw xmm0,   xmm3
2186    paddw xmm5,   xmm0
2187
    ; fold the two 64-bit psadbw partial sums of every accumulator
2188    movhlps    xmm0, xmm4
2189    paddw      xmm4, xmm0
2190    movhlps    xmm0, xmm5
2191    paddw      xmm5, xmm0
2192    movhlps    xmm0, xmm6
2193    paddw      xmm6, xmm0
2194    movhlps    xmm0, xmm7
2195    paddw      xmm7, xmm0
    ; interleave the four totals into one register and store them
2196    punpckldq  xmm4, xmm5
2197    punpckldq  xmm6, xmm7
2198    punpcklqdq xmm4, xmm6
2199    movdqa     [r4],xmm4
2200    POP_XMM
2201    LOAD_5_PARA_POP
2202    ret
2203
2204
;***********************************************************************
;   WelsSampleSadFour16x8_sse2
;
;   Accumulates four SADs of a 16-byte x 8-row source block against the
;   reference block displaced one row up, one row down, one byte left
;   and one byte right.
;
;   In:   r0 = source block   (rows read with movdqa: 16-byte aligned)
;         r1 = source stride  (32-bit value, sign-extended below)
;         r2 = reference block (rows read with movdqu)
;         r3 = reference stride (32-bit value, sign-extended below)
;         r4 = result buffer  (written with movdqa: 16-byte aligned)
;   Out:  [r4+0]  SAD vs. reference - stride
;         [r4+4]  SAD vs. reference + stride
;         [r4+8]  SAD vs. reference - 1
;         [r4+12] SAD vs. reference + 1
;
;   SSE2_Get4LW16Sad prev, cur, next, ref, addr  (as used below):
;         xmm5 += SAD(prev, ref)          xmm4 += SAD(next, ref)
;         xmm6 += SAD(cur, [addr-1])      xmm7 += SAD(cur, [addr+1])
;         prev and ref are clobbered.
;***********************************************************************
WELS_EXTERN WelsSampleSadFour16x8_sse2
    %assign  push_num 0
    LOAD_5_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d

    ; ---- accumulators ------------------------------------------------
    pxor        xmm4, xmm4                  ; SAD vs. reference - stride
    pxor        xmm5, xmm5                  ; SAD vs. reference + stride
    pxor        xmm6, xmm6                  ; SAD vs. reference - 1
    pxor        xmm7, xmm7                  ; SAD vs. reference + 1

    ; ---- source rows 0 and 1 -----------------------------------------
    sub         r2, r3                      ; r2 -> reference row -1
    movdqa      xmm0, [r0]                  ; xmm0 = source row 0
    movdqu      xmm3, [r2]
    psadbw      xmm3, xmm0                  ; row 0 vs. reference row -1
    paddw       xmm4, xmm3

    movdqa      xmm1, [r0+r1]               ; xmm1 = source row 1
    movdqu      xmm3, [r2+r3]
    psadbw      xmm3, xmm1                  ; row 1 vs. reference row 0
    paddw       xmm4, xmm3

    movdqu      xmm2, [r2+r3-1]
    psadbw      xmm2, xmm0                  ; row 0 vs. reference row 0, left
    paddw       xmm6, xmm2

    movdqu      xmm3, [r2+r3+1]
    psadbw      xmm3, xmm0                  ; row 0 vs. reference row 0, right
    paddw       xmm7, xmm3

    ; ---- source rows 2 and 3 -----------------------------------------
    lea         r0, [r0+2*r1]
    lea         r2, [r2+2*r3]
    movdqa      xmm2, [r0]
    movdqu      xmm3, [r2]
    SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
    movdqa      xmm0, [r0+r1]
    movdqu      xmm3, [r2+r3]
    SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3

    ; ---- source rows 4 and 5 -----------------------------------------
    lea         r0, [r0+2*r1]
    lea         r2, [r2+2*r3]
    movdqa      xmm1, [r0]
    movdqu      xmm3, [r2]
    SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2
    movdqa      xmm2, [r0+r1]
    movdqu      xmm3, [r2+r3]
    SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2+r3

    ; ---- source rows 6 and 7 -----------------------------------------
    lea         r0, [r0+2*r1]
    lea         r2, [r2+2*r3]
    movdqa      xmm0, [r0]                  ; xmm0 = source row 6
    movdqu      xmm3, [r2]
    SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2
    movdqa      xmm1, [r0+r1]               ; xmm1 = source row 7
    movdqu      xmm3, [r2+r3]
    SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2+r3

    ; ---- terms the macro has not yet covered --------------------------
    lea         r2, [r2+2*r3]               ; r2 -> reference row 7
    movdqu      xmm3, [r2]
    psadbw      xmm0, xmm3                  ; row 6 vs. reference row 7
    paddw       xmm5, xmm0

    movdqu      xmm0, [r2-1]
    psadbw      xmm0, xmm1                  ; row 7 vs. reference row 7, left
    paddw       xmm6, xmm0

    movdqu      xmm3, [r2+1]
    psadbw      xmm3, xmm1                  ; row 7 vs. reference row 7, right
    paddw       xmm7, xmm3

    movdqu      xmm3, [r2+r3]
    psadbw      xmm1, xmm3                  ; row 7 vs. reference row 8
    paddw       xmm5, xmm1

    ; ---- reduce and store --------------------------------------------
    ; psadbw leaves one partial sum per 64-bit half: fold the high half
    ; into the low half of each accumulator, then pack the four low
    ; dwords as { up, down, left, right }.
    movhlps     xmm0, xmm4
    paddw       xmm4, xmm0
    movhlps     xmm0, xmm5
    paddw       xmm5, xmm0
    movhlps     xmm0, xmm6
    paddw       xmm6, xmm0
    movhlps     xmm0, xmm7
    paddw       xmm7, xmm0
    punpckldq   xmm4, xmm5
    punpckldq   xmm6, xmm7
    punpcklqdq  xmm4, xmm6
    movdqa      [r4], xmm4
    POP_XMM
    LOAD_5_PARA_POP
    ret
2290
;***********************************************************************
;   WelsSampleSadFour8x16_sse2
;
;   Accumulates four SADs of an 8-byte x 16-row source block against the
;   reference block displaced one row up, one row down, one byte left
;   and one byte right.  Two 8-byte rows are packed into each XMM
;   register (movq = low half, movhps = high half).
;
;   In:   r0 = source block
;         r1 = source stride  (32-bit value, sign-extended below)
;         r2 = reference block
;         r3 = reference stride (32-bit value, sign-extended below)
;         r4 = result buffer  (written with movdqa: 16-byte aligned)
;   Out:  [r4+0]  SAD vs. reference - stride
;         [r4+4]  SAD vs. reference + stride
;         [r4+8]  SAD vs. reference - 1
;         [r4+12] SAD vs. reference + 1
;***********************************************************************
WELS_EXTERN WelsSampleSadFour8x16_sse2
    %assign  push_num 0
    LOAD_5_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d

    ; ---- accumulators ------------------------------------------------
    pxor        xmm4, xmm4                  ; SAD vs. reference - stride
    pxor        xmm5, xmm5                  ; SAD vs. reference + stride
    pxor        xmm6, xmm6                  ; SAD vs. reference - 1
    pxor        xmm7, xmm7                  ; SAD vs. reference + 1

    ; ---- first source pair and its "up" term --------------------------
    movq        xmm0, [r0]                  ; xmm0 = source rows 0 | 1
    movhps      xmm0, [r0+r1]
    sub         r2, r3                      ; r2 -> reference row -1
    movq        xmm3, [r2]                  ; xmm3 = reference rows -1 | 0
    movhps      xmm3, [r2+r3]
    psadbw      xmm3, xmm0
    paddw       xmm4, xmm3

    ; ---- eight row-pair steps ------------------------------------------
    ; On entry to each step xmm0 holds the current source pair and r2
    ; points one reference row above it.
%assign sad4_pair 0
%rep 8
    ; left / right neighbours of the current pair
    movq        xmm1, [r2+r3-1]
    movq        xmm3, [r2+r3+1]
    lea         r0, [r0+2*r1]               ; r0 -> next source pair
    lea         r2, [r2+2*r3]               ; r2 -> second reference row of this pair
    movhps      xmm1, [r2-1]
    movhps      xmm3, [r2+1]
    psadbw      xmm1, xmm0
    paddw       xmm6, xmm1
    psadbw      xmm3, xmm0
    paddw       xmm7, xmm3

    ; reference rows one below the current pair ("down" term)
    movq        xmm3, [r2]
    movhps      xmm3, [r2+r3]
    psadbw      xmm0, xmm3
    paddw       xmm5, xmm0
%if sad4_pair < 7
    ; the same reference rows sit one above the next source pair, so
    ; xmm3 is reused for that pair's "up" term
    movq        xmm0, [r0]
    movhps      xmm0, [r0+r1]
    psadbw      xmm3, xmm0
    paddw       xmm4, xmm3
%endif
%assign sad4_pair sad4_pair+1
%endrep

    ; ---- reduce and store --------------------------------------------
    ; psadbw leaves one partial sum per 64-bit half: fold the high half
    ; into the low half of each accumulator, then pack the four low
    ; dwords as { up, down, left, right }.
    movhlps     xmm0, xmm4
    paddw       xmm4, xmm0
    movhlps     xmm0, xmm5
    paddw       xmm5, xmm0
    movhlps     xmm0, xmm6
    paddw       xmm6, xmm0
    movhlps     xmm0, xmm7
    paddw       xmm7, xmm0
    punpckldq   xmm4, xmm5
    punpckldq   xmm6, xmm7
    punpcklqdq  xmm4, xmm6
    movdqa      [r4], xmm4
    POP_XMM
    LOAD_5_PARA_POP
    ret
2502
2503
;***********************************************************************
;   WelsSampleSadFour8x8_sse2
;
;   Accumulates four SADs of an 8-byte x 8-row source block against the
;   reference block displaced one row up, one row down, one byte left
;   and one byte right.  Two 8-byte rows are packed into each XMM
;   register (movq = low half, movhps = high half).
;
;   In:   r0 = source block
;         r1 = source stride  (32-bit value, sign-extended below)
;         r2 = reference block
;         r3 = reference stride (32-bit value, sign-extended below)
;         r4 = result buffer  (written with movdqa: 16-byte aligned)
;   Out:  [r4+0]  SAD vs. reference - stride
;         [r4+4]  SAD vs. reference + stride
;         [r4+8]  SAD vs. reference - 1
;         [r4+12] SAD vs. reference + 1
;***********************************************************************
WELS_EXTERN WelsSampleSadFour8x8_sse2
    %assign  push_num 0
    LOAD_5_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d

    ; ---- accumulators ------------------------------------------------
    pxor        xmm4, xmm4                  ; SAD vs. reference - stride
    pxor        xmm5, xmm5                  ; SAD vs. reference + stride
    pxor        xmm6, xmm6                  ; SAD vs. reference - 1
    pxor        xmm7, xmm7                  ; SAD vs. reference + 1

    ; ---- first source pair and its "up" term --------------------------
    movq        xmm0, [r0]                  ; xmm0 = source rows 0 | 1
    movhps      xmm0, [r0+r1]
    sub         r2, r3                      ; r2 -> reference row -1
    movq        xmm3, [r2]                  ; xmm3 = reference rows -1 | 0
    movhps      xmm3, [r2+r3]
    psadbw      xmm3, xmm0
    paddw       xmm4, xmm3

    ; ---- four row-pair steps -------------------------------------------
    ; On entry to each step xmm0 holds the current source pair and r2
    ; points one reference row above it.
%assign sad4_pair 0
%rep 4
    ; left / right neighbours of the current pair
    movq        xmm1, [r2+r3-1]
    movq        xmm3, [r2+r3+1]
    lea         r0, [r0+2*r1]               ; r0 -> next source pair
    lea         r2, [r2+2*r3]               ; r2 -> second reference row of this pair
    movhps      xmm1, [r2-1]
    movhps      xmm3, [r2+1]
    psadbw      xmm1, xmm0
    paddw       xmm6, xmm1
    psadbw      xmm3, xmm0
    paddw       xmm7, xmm3

    ; reference rows one below the current pair ("down" term)
    movq        xmm3, [r2]
    movhps      xmm3, [r2+r3]
    psadbw      xmm0, xmm3
    paddw       xmm5, xmm0
%if sad4_pair < 3
    ; the same reference rows sit one above the next source pair, so
    ; xmm3 is reused for that pair's "up" term
    movq        xmm0, [r0]
    movhps      xmm0, [r0+r1]
    psadbw      xmm3, xmm0
    paddw       xmm4, xmm3
%endif
%assign sad4_pair sad4_pair+1
%endrep

    ; ---- reduce and store --------------------------------------------
    ; psadbw leaves one partial sum per 64-bit half: fold the high half
    ; into the low half of each accumulator, then pack the four low
    ; dwords as { up, down, left, right }.
    movhlps     xmm0, xmm4
    paddw       xmm4, xmm0
    movhlps     xmm0, xmm5
    paddw       xmm5, xmm0
    movhlps     xmm0, xmm6
    paddw       xmm6, xmm0
    movhlps     xmm0, xmm7
    paddw       xmm7, xmm0
    punpckldq   xmm4, xmm5
    punpckldq   xmm6, xmm7
    punpcklqdq  xmm4, xmm6
    movdqa      [r4], xmm4
    POP_XMM
    LOAD_5_PARA_POP
    ret
2624
;***********************************************************************
;   WelsSampleSadFour4x4_sse2
;
;   Computes four SADs of a 4-byte x 4-row source block against the
;   reference block displaced one row up, one row down, one byte left
;   and one byte right.  Each 4x4 block is gathered into a single XMM
;   register (one row per dword) so one psadbw covers a whole block.
;
;   In:   r0 = source block
;         r1 = source stride  (32-bit value, sign-extended below)
;         r2 = reference block
;         r3 = reference stride (32-bit value, sign-extended below)
;         r4 = result buffer  (written with movdqa: 16-byte aligned)
;   Out:  [r4+0]  SAD vs. reference - stride
;         [r4+4]  SAD vs. reference + stride
;         [r4+8]  SAD vs. reference - 1
;         [r4+12] SAD vs. reference + 1
;***********************************************************************
WELS_EXTERN WelsSampleSadFour4x4_sse2
    %assign  push_num 0
    LOAD_5_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d

    ; ---- xmm0 = source rows 0 | 1 | 2 | 3 -----------------------------
    movd        xmm0, [r0]
    movd        xmm1, [r0+r1]
    punpckldq   xmm0, xmm1
    lea         r0, [r0+2*r1]
    movd        xmm2, [r0]
    movd        xmm3, [r0+r1]
    punpckldq   xmm2, xmm3
    punpcklqdq  xmm0, xmm2

    ; ---- reference rows -1 and 0 --------------------------------------
    sub         r2, r3                      ; r2 -> reference row -1
    movd        xmm1, [r2]
    movd        xmm2, [r2+r3]
    punpckldq   xmm1, xmm2                  ; xmm1 = rows -1 | 0
    movd        xmm2, [r2+r3-1]             ; xmm2 = row 0, left
    movd        xmm3, [r2+r3+1]             ; xmm3 = row 0, right

    ; ---- reference rows 1 and 2 ---------------------------------------
    lea         r2, [r2+2*r3]               ; r2 -> reference row 1
    movd        xmm4, [r2]
    movd        xmm5, [r2-1]
    punpckldq   xmm2, xmm5                  ; xmm2 = left of rows 0 | 1
    movd        xmm5, [r2+1]
    punpckldq   xmm3, xmm5                  ; xmm3 = right of rows 0 | 1
    movd        xmm5, [r2+r3]
    punpckldq   xmm4, xmm5                  ; xmm4 = rows 1 | 2
    punpcklqdq  xmm1, xmm4                  ; xmm1 = rows -1 | 0 | 1 | 2   (up)
    movd        xmm5, [r2+r3-1]             ; xmm5 = row 2, left
    movd        xmm6, [r2+r3+1]             ; xmm6 = row 2, right

    ; ---- reference rows 3 and 4 ---------------------------------------
    lea         r2, [r2+2*r3]               ; r2 -> reference row 3
    movd        xmm7, [r2-1]
    punpckldq   xmm5, xmm7
    punpcklqdq  xmm2, xmm5                  ; xmm2 = left of rows 0..3
    movd        xmm7, [r2+1]
    punpckldq   xmm6, xmm7
    punpcklqdq  xmm3, xmm6                  ; xmm3 = right of rows 0..3
    movd        xmm6, [r2]
    movd        xmm7, [r2+r3]
    punpckldq   xmm6, xmm7
    punpcklqdq  xmm4, xmm6                  ; xmm4 = rows 1 | 2 | 3 | 4    (down)

    ; ---- four SADs ------------------------------------------------------
    ; All psadbw must run before the reduction: movhlps overwrites xmm0.
    psadbw      xmm1, xmm0
    psadbw      xmm2, xmm0
    psadbw      xmm3, xmm0
    psadbw      xmm4, xmm0

    ; ---- reduce and store --------------------------------------------
    ; Fold the high-half partial sum into the low half of each result,
    ; then pack the four low dwords as { up, down, left, right }.
    movhlps     xmm0, xmm1
    paddw       xmm1, xmm0
    movhlps     xmm0, xmm2
    paddw       xmm2, xmm0
    movhlps     xmm0, xmm3
    paddw       xmm3, xmm0
    movhlps     xmm0, xmm4
    paddw       xmm4, xmm0
    punpckldq   xmm1, xmm4
    punpckldq   xmm2, xmm3
    punpcklqdq  xmm1, xmm2
    movdqa      [r4], xmm1
    POP_XMM
    LOAD_5_PARA_POP
    ret
2693
2694;***********************************************************************
2695;
2696;Pixel_sad_4_wxh_sse2 END
2697;
2698;***********************************************************************
2699
;***********************************************************************
;   int32_t WelsSampleSad4x4_mmx (uint8_t *, int32_t, uint8_t *, int32_t )
;
;   SAD of a 4-byte x 4-row block.  Rows are packed two per MMX
;   register (one row per dword), so two psadbw cover the block.
;
;   In:   r0 = first block,  r1 = its stride  (sign-extended below)
;         r2 = second block, r3 = its stride  (sign-extended below)
;   Out:  retrd = sum of absolute differences
;***********************************************************************
WELS_EXTERN WelsSampleSad4x4_mmx
    %assign  push_num 0
    LOAD_4_PARA
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d

    ; ---- rows 0 and 1 --------------------------------------------------
    movd        mm3, [r2]
    movd        mm4, [r2+r3]
    punpckldq   mm3, mm4                    ; mm3 = second block rows 0 | 1
    movd        mm0, [r0]
    movd        mm1, [r0+r1]
    punpckldq   mm0, mm1                    ; mm0 = first block rows 0 | 1
    psadbw      mm0, mm3

    ; ---- rows 2 and 3 --------------------------------------------------
    lea         r2, [r2+2*r3]
    lea         r0, [r0+2*r1]
    movd        mm3, [r2]
    movd        mm4, [r2+r3]
    punpckldq   mm3, mm4                    ; mm3 = second block rows 2 | 3
    movd        mm1, [r0]
    movd        mm2, [r0+r1]
    punpckldq   mm1, mm2                    ; mm1 = first block rows 2 | 3
    psadbw      mm1, mm3

    ; ---- total ---------------------------------------------------------
    paddw       mm0, mm1
    movd        retrd, mm0

    WELSEMMS
    LOAD_4_PARA_POP
    ret
2735