• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1;*!
2;* \copy
3;*     Copyright (c)  2009-2013, Cisco Systems
4;*     All rights reserved.
5;*
6;*     Redistribution and use in source and binary forms, with or without
7;*     modification, are permitted provided that the following conditions
8;*     are met:
9;*
10;*        * Redistributions of source code must retain the above copyright
11;*          notice, this list of conditions and the following disclaimer.
12;*
13;*        * Redistributions in binary form must reproduce the above copyright
14;*          notice, this list of conditions and the following disclaimer in
15;*          the documentation and/or other materials provided with the
16;*          distribution.
17;*
18;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
21;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
22;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
23;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
24;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
28;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29;*     POSSIBILITY OF SUCH DAMAGE.
30;*
31;*************************************************************************/
32%include "asm_inc.asm"
33
34;***********************************************************************
35; Local Data (Read Only)
36;***********************************************************************
37%ifdef X86_32_PICASM
38SECTION .text align=16
39%else
40SECTION .rodata align=16
41%endif
42
43ALIGN 16
44mv_x_inc_x4     dw  0x10, 0x10, 0x10, 0x10
45mv_y_inc_x4     dw  0x04, 0x04, 0x04, 0x04
46mx_x_offset_x4  dw  0x00, 0x04, 0x08, 0x0C
47
48SECTION .text
49%ifdef X86_32
;**********************************************************************************************************************
; void SumOf8x8BlockOfFrame_sse2(uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,
;                                const int32_t kiRefStride, uint16_t* pFeatureOfBlock,
;                                uint32_t pTimesOfFeatureValue[]);
;
; x86-32 variant, all arguments are read from the stack.
;
; For every block position (x, y) with 0 <= x < kiWidth, 0 <= y < kiHeight this routine
;   * stores the sum of the 8x8 bytes starting at pRefPicture + y*kiRefStride + x as a 16-bit
;     value in pFeatureOfBlock[y*kiWidth + x], and
;   * increments pTimesOfFeatureValue[sum].
; The first output row is summed directly from eight 8-byte rows.  Each later row is derived from
; the entry directly above it: add the 8-byte row that enters the window, subtract the row leaving.
;
; Contract visible from the code:
;   * both loops are bottom-tested, so one column and one row after the first are always processed;
;   * the stack slot of kiHeight is used as the row counter and is modified;
;   * xmm0-xmm5 and eax, ecx, edx are clobbered; ebx, ebp, esi, edi are saved and restored.
;
; Register roles:
;   esi = read cursor in pRefPicture        edi = write cursor in pFeatureOfBlock
;   edx = pTimesOfFeatureValue              ebx = kiRefStride
;   ecx = 3*kiRefStride (first row only), afterwards scratch
;   ebp = esi + 4*kiRefStride (first row), afterwards kiWidth
;   xmm0 = 0 (psadbw against zero yields plain byte sums)
;**********************************************************************************************************************
WELS_EXTERN SumOf8x8BlockOfFrame_sse2
%define     pushsize        16
%define     localsize       4
%define     ref             esp + pushsize + localsize + 4
%define     sum_ref         esp + pushsize + localsize + 20
%define     times_of_sum    esp + pushsize + localsize + 24
%define     width           esp + pushsize + localsize + 8
%define     height          esp + pushsize + localsize + 12
%define     linesize        esp + pushsize + localsize + 16
%define     tmp_width       esp + 0
    push    ebx
    push    ebp
    push    esi
    push    edi
    sub     esp,    localsize

    pxor    xmm0,   xmm0
    mov     esi,    [ref]
    mov     edi,    [sum_ref]
    mov     edx,    [times_of_sum]
    mov     ebx,    [linesize]
    mov     eax,    [width]
    lea     ecx,    [ebx+ebx*2] ; 3*linesize

    mov     [tmp_width],    eax
    lea     ebp,    [esi+ebx*4] ; rows 4..7 of the block
FIRST_ROW:
    ; rows 0..3: pack two 8-byte rows per register, psadbw gives one sum per 64-bit half
    movq    xmm1,   [esi]
    movq    xmm2,   [esi+ebx]
    movq    xmm3,   [esi+ebx*2]
    movq    xmm4,   [esi+ecx]

    shufps  xmm1,   xmm2,   01000100b
    shufps  xmm3,   xmm4,   01000100b
    psadbw  xmm1,   xmm0
    psadbw  xmm3,   xmm0
    paddd   xmm1,   xmm3

    ; rows 4..7
    movq    xmm2,   [ebp]
    movq    xmm3,   [ebp+ebx]
    movq    xmm4,   [ebp+ebx*2]
    movq    xmm5,   [ebp+ecx]

    shufps  xmm2,   xmm3,   01000100b
    shufps  xmm4,   xmm5,   01000100b
    psadbw  xmm2,   xmm0
    psadbw  xmm4,   xmm0
    paddd   xmm2,   xmm4

    paddd   xmm1,   xmm2
    pshufd  xmm2,   xmm1,   00001110b   ; bring the upper-half sum down
    paddd   xmm1,   xmm2
    movd    eax,    xmm1                ; eax = sum of the 8x8 block
    mov     [edi],  ax
    inc     dword [edx+eax*4]           ; histogram of feature values

    inc     esi
    inc     ebp
    add     edi,    2

    dec     dword [tmp_width]
    jg      FIRST_ROW

    mov     esi,    [ref]
    mov     edi,    [sum_ref]
    mov     ebp,    [width]
    dec     dword [height]              ; first row is done
HEIGHT_LOOP:
    mov     [tmp_width],    ebp
WIDTH_LOOP:
    movq    xmm1,   [esi+ebx*8]         ; row entering the window
    movq    xmm2,   [esi]               ; row leaving the window
    psadbw  xmm1,   xmm0
    psadbw  xmm2,   xmm0
    psubd   xmm1,   xmm2
    movd    eax,    xmm1                ; eax = incoming - outgoing (signed)
    ; Zero-extend the sum of the block above.  A plain "mov cx, [edi]" would keep the
    ; upper half of ecx (still 3*linesize) and corrupt the index for large strides.
    movzx   ecx,    word [edi]
    add     eax,    ecx

    mov     [edi+ebp*2],    ax          ; same column, next output row
    inc     dword [edx+eax*4]

    inc     esi
    add     edi,    2

    dec     dword [tmp_width]
    jg      WIDTH_LOOP

    add     esi,    ebx                 ; advance to the start of the next picture row
    sub     esi,    ebp

    dec     dword [height]
    jg      HEIGHT_LOOP

    add     esp,    localsize
    pop     edi
    pop     esi
    pop     ebp
    pop     ebx
%undef      pushsize
%undef      localsize
%undef      ref
%undef      sum_ref
%undef      times_of_sum
%undef      width
%undef      height
%undef      linesize
%undef      tmp_width
    ret
163
164
;-----------------------------------------------------------------------------
; COUNT_SUM xmm, gpr32, shift                               (x86-32 variant)
;   %1  XMM register whose low dword holds a feature value
;   %2  32-bit scratch register; receives that dword
;   %3  when 1, %1 is shifted right by 4 bytes afterwards so that the next
;       dword becomes the low one
; Increments the dword counter at [edx + value*4]; edx must already hold the
; base of the counter table.
; The names xmm_reg / tmp_reg remain defined after the macro is expanded.
;-----------------------------------------------------------------------------
%macro COUNT_SUM 3
    %define xmm_reg %1
    %define tmp_reg %2
    movd    tmp_reg,    xmm_reg             ; value -> scratch
    inc     dword [edx + tmp_reg*4]         ; ++counter[value]
    %if %3 == 1
    psrldq  xmm_reg,    4                   ; expose the next dword
    %endif
%endmacro
174
175
;-----------------------------------------------------------------------------
; void SumOf8x8BlockOfFrame_sse4(uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,
;                                const int32_t kiRefStride, uint16_t* pFeatureOfBlock,
;                                uint32_t pTimesOfFeatureValue[]);
;
; x86-32 variant, arguments on the stack.  Requires: width % 8 == 0 && height > 1.
;
; Produces the same outputs as the SSE2 routine, eight neighbouring block positions per
; iteration: mpsadbw against an all-zero register turns 16 loaded bytes into eight sliding
; sums of 8 consecutive bytes.
;
; Every load fetches 16 bytes per picture row, i.e. more bytes than the eight positions
; strictly need, and results are written to pFeatureOfBlock in 16-byte groups.
; xmm0-xmm7, eax, ecx, edx are clobbered; the kiHeight stack slot is used as row counter.
;-----------------------------------------------------------------------------

; COLSUM8_SSE41 acc, tmp
;   acc (in)  : 16 bytes of one picture row
;   acc (out) : eight words, word i = sum of bytes i .. i+7
;   tmp       : scratch XMM register
;   expects xmm0 == 0
%macro COLSUM8_SSE41 2
    movdqa  %2,     %1
    mpsadbw %1,     xmm0,   000b        ; bytes i   .. i+3
    mpsadbw %2,     xmm0,   100b        ; bytes i+4 .. i+7
    paddw   %1,     %2
%endmacro

WELS_EXTERN SumOf8x8BlockOfFrame_sse4
%define     pushsize        16
%define     localsize       4
%define     ref             esp + pushsize + localsize + 4
%define     sum_ref         esp + pushsize + localsize + 20
%define     times_of_sum    esp + pushsize + localsize + 24
%define     width           esp + pushsize + localsize + 8
%define     height          esp + pushsize + localsize + 12
%define     linesize        esp + pushsize + localsize + 16
%define     tmp_width       esp + 0
    push    ebx
    push    ebp
    push    esi
    push    edi
    sub     esp,    localsize

    pxor    xmm0,   xmm0                ; zero reference for mpsadbw / unpack
    mov     esi,    [ref]
    mov     edi,    [sum_ref]
    mov     edx,    [times_of_sum]
    mov     ebx,    [linesize]
    mov     eax,    [width]
    lea     ecx,    [ebx+ebx*2]         ; 3*linesize

    mov     [tmp_width],    eax
    lea     ebp,    [esi+ebx*4]         ; rows 4..7
sum8x8_sse4_first_row:
    ; ---- rows 0..3 -------------------------------------------------------
    movdqu  xmm1,   [esi]
    movdqu  xmm3,   [esi+ebx]
    movdqu  xmm5,   [esi+ebx*2]
    movdqu  xmm7,   [esi+ecx]

    COLSUM8_SSE41   xmm1,   xmm2        ; 8 sums of line 1
    COLSUM8_SSE41   xmm3,   xmm4        ; 8 sums of line 2
    COLSUM8_SSE41   xmm5,   xmm2        ; 8 sums of line 3
    COLSUM8_SSE41   xmm7,   xmm4        ; 8 sums of line 4

    paddw   xmm1,   xmm3
    paddw   xmm5,   xmm7
    paddw   xmm1,   xmm5                ; upper four lines

    ; ---- rows 4..7 -------------------------------------------------------
    movdqu  xmm2,   [ebp]
    movdqu  xmm3,   [ebp+ebx]
    movdqu  xmm4,   [ebp+ebx*2]
    movdqu  xmm5,   [ebp+ecx]

    COLSUM8_SSE41   xmm2,   xmm6
    COLSUM8_SSE41   xmm3,   xmm7
    COLSUM8_SSE41   xmm4,   xmm6
    COLSUM8_SSE41   xmm5,   xmm7

    paddw   xmm2,   xmm3
    paddw   xmm4,   xmm5
    paddw   xmm1,   xmm2
    paddw   xmm1,   xmm4                ; lines 1..8 combined

    movdqu  [edi],  xmm1                ; eight 16-bit block sums

    ; widen to dwords so each sum can index the counter table
    movdqa  xmm2,   xmm1
    punpcklwd   xmm1,   xmm0
    punpckhwd   xmm2,   xmm0

%rep 3
    COUNT_SUM   xmm1,   eax,    1
%endrep
    COUNT_SUM   xmm1,   eax,    0
%rep 3
    COUNT_SUM   xmm2,   eax,    1
%endrep
    COUNT_SUM   xmm2,   eax,    0

    lea     esi,    [esi+8]
    lea     ebp,    [ebp+8]
    lea     edi,    [edi+16]            ; 8 entries of 2 bytes

    sub     dword [tmp_width], 8
    jg      near sum8x8_sse4_first_row

    ; ---- remaining rows: previous row + entering line - leaving line -----
    mov     esi,    [ref]
    mov     edi,    [sum_ref]
    mov     ebp,    [width]
    dec     dword [height]
sum8x8_sse4_next_row:
    mov     ecx,    ebp                 ; columns left in this row
sum8x8_sse4_next_cols:
    movdqu  xmm1,   [esi+ebx*8]         ; line entering the window
    movdqu  xmm2,   [esi]               ; line leaving the window
    movdqu  xmm7,   [edi]               ; sums of the row above

    COLSUM8_SSE41   xmm1,   xmm3
    COLSUM8_SSE41   xmm2,   xmm4

    paddw   xmm7,   xmm1
    psubw   xmm7,   xmm2
    movdqu  [edi+ebp*2], xmm7           ; same columns, next output row

    movdqa  xmm6,   xmm7
    punpcklwd   xmm7,   xmm0
    punpckhwd   xmm6,   xmm0

%rep 3
    COUNT_SUM   xmm7,   eax,    1
%endrep
    COUNT_SUM   xmm7,   eax,    0
%rep 3
    COUNT_SUM   xmm6,   eax,    1
%endrep
    COUNT_SUM   xmm6,   eax,    0

    lea     esi,    [esi+8]
    lea     edi,    [edi+16]

    sub     ecx,    8
    jg      near sum8x8_sse4_next_cols

    lea     esi,    [esi+ebx]           ; start of the next picture row
    sub     esi,    ebp

    dec     dword [height]
    jg      near sum8x8_sse4_next_row

    add     esp,    localsize
    pop     edi
    pop     esi
    pop     ebp
    pop     ebx
%undef      pushsize
%undef      localsize
%undef      ref
%undef      sum_ref
%undef      times_of_sum
%undef      width
%undef      height
%undef      linesize
%undef      tmp_width
    ret
357
358
;****************************************************************************************************************************************************
; void SumOf16x16BlockOfFrame_sse2(uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,
;                                  const int32_t kiRefStride, uint16_t* pFeatureOfBlock,
;                                  uint32_t pTimesOfFeatureValue[]);
;
; x86-32 variant, arguments on the stack.
;
; Writes the 16-bit sum of each 16x16 byte block to pFeatureOfBlock (kiWidth entries per row)
; and increments pTimesOfFeatureValue[sum].  The first output row sums sixteen 16-byte lines
; per position; later rows add the entering line and subtract the leaving one.
;
; psadbw against zero leaves one partial sum per 64-bit half; punpckhwd + paddw folds the
; upper half into word 0 before the result is moved to eax.
; Both loops are bottom-tested.  xmm0-xmm5, eax, ecx, edx are clobbered; the kiHeight stack
; slot is used as row counter.
;****************************************************************************************************************************************************

; SAD_4ROWS_X16_SSE2 base, acc, t1, t2, t3
;   Sums four consecutive 16-byte lines starting at [base] (stride ebx, ecx = 3*ebx).
;   acc receives the two partial sums; t1-t3 are scratch.  Expects xmm0 == 0.
%macro SAD_4ROWS_X16_SSE2 5
    movdqu  %2,     [%1]
    movdqu  %3,     [%1+ebx]
    movdqu  %4,     [%1+ebx*2]
    movdqu  %5,     [%1+ecx]

    psadbw  %2,     xmm0
    psadbw  %3,     xmm0
    psadbw  %4,     xmm0
    psadbw  %5,     xmm0
    paddw   %2,     %3
    paddw   %4,     %5
    paddw   %2,     %4
%endmacro

WELS_EXTERN SumOf16x16BlockOfFrame_sse2
%define     pushsize        16
%define     localsize       4
%define     ref             esp + pushsize + localsize + 4
%define     sum_ref         esp + pushsize + localsize + 20
%define     times_of_sum    esp + pushsize + localsize + 24
%define     width           esp + pushsize + localsize + 8
%define     height          esp + pushsize + localsize + 12
%define     linesize        esp + pushsize + localsize + 16
%define     tmp_width       esp
    push    ebx
    push    ebp
    push    esi
    push    edi
    sub     esp,    localsize

    pxor    xmm0,   xmm0
    mov     esi,    [ref]
    mov     edi,    [sum_ref]
    mov     edx,    [times_of_sum]
    mov     ebx,    [linesize]
    mov     eax,    [width]

    lea     ecx,    [ebx+ebx*2]         ; 3*linesize
    mov     [tmp_width],    eax
sum16x16_sse2_first_row:
    SAD_4ROWS_X16_SSE2  esi, xmm1, xmm2, xmm3, xmm4     ; lines 0..3

    lea     ebp,    [esi+ebx*4]
    SAD_4ROWS_X16_SSE2  ebp, xmm2, xmm3, xmm4, xmm5     ; lines 4..7
    paddw   xmm1,   xmm2

    lea     ebp,    [ebp+ebx*4]
    SAD_4ROWS_X16_SSE2  ebp, xmm2, xmm3, xmm4, xmm5     ; lines 8..11
    paddw   xmm1,   xmm2

    lea     ebp,    [ebp+ebx*4]
    SAD_4ROWS_X16_SSE2  ebp, xmm2, xmm3, xmm4, xmm5     ; lines 12..15
    paddw   xmm1,   xmm2

    ; fold the upper-half partial sum into word 0
    movdqa  xmm2,   xmm1
    punpckhwd xmm2, xmm0
    paddw xmm1, xmm2
    movd    eax,    xmm1
    mov     [edi],  ax
    inc     dword [edx+eax*4]

    inc     esi
    lea     edi,    [edi+2]

    dec     dword [tmp_width]
    jg      near sum16x16_sse2_first_row

    mov     esi,    [ref]
    mov     edi,    [sum_ref]
    mov     ebp,    [width]
    dec     dword [height]

    mov     ecx,    ebx
    sal     ecx,    4                   ; 16*linesize: offset of the entering line
sum16x16_sse2_row_loop:
    mov     [tmp_width],    ebp
sum16x16_sse2_col_loop:
    movdqu  xmm1,   [esi+ecx]           ; entering line
    movdqu  xmm2,   [esi]               ; leaving line
    psadbw  xmm1,   xmm0
    psadbw  xmm2,   xmm0
    psubw   xmm1,   xmm2
    movdqa  xmm2,   xmm1
    punpckhwd xmm2, xmm0
    paddw   xmm1,   xmm2
    movd    eax,    xmm1                ; 16-bit delta, upper half of eax is zero
    add     ax, word [edi]              ; + sum of the row above
    mov     [edi+ebp*2],    ax
    inc     dword [edx+eax*4]

    inc     esi
    add     edi,    2

    dec     dword [tmp_width]
    jg      near sum16x16_sse2_col_loop

    add     esi,    ebx                 ; start of the next picture row
    sub     esi,    ebp

    dec     dword [height]
    jg      near sum16x16_sse2_row_loop

    add     esp,    localsize
    pop     edi
    pop     esi
    pop     ebp
    pop     ebx
%undef      pushsize
%undef      localsize
%undef      ref
%undef      sum_ref
%undef      times_of_sum
%undef      width
%undef      height
%undef      linesize
%undef      tmp_width
    ret
512
513; requires:  width % 16 == 0 && height > 1
514;-----------------------------------------------------------------------------------------------------------------------------
515;void SumOf16x16BlockOfFrame_sse4(uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight, const int32_t kiRefStride,
516;                             uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]);
517;-----------------------------------------------------------------------------------------------------------------------------
518; try 8 mv via offset
;-----------------------------------------------------------------------------
; SUM_LINE_X16_SSE41 ref, dst0, dst1, tmp0, tmp1
;   ref   address expression of one picture line
;   dst0  result: eight words, word i = sum of the 16 bytes ref[i] .. ref[i+15]
;   dst1, tmp0, tmp1  scratch XMM registers (all four XMM operands must differ)
; Expects xmm0 == 0.  Loads 16 bytes at [ref] and 16 bytes at [ref+8].
;-----------------------------------------------------------------------------
%macro SUM_LINE_X16_SSE41  5    ; ref, dst0, dst1, tmp0, tmp1
    movdqu  %2, [%1]            ; bytes 0..15
    movdqa  %4, %2
    movdqu  %3, [%1+8h]         ; bytes 8..23
    movdqa  %5, %3

    mpsadbw %2, xmm0,   0       ; 000 B : bytes i    .. i+3
    mpsadbw %3, xmm0,   2       ; 010 B : bytes i+8  .. i+11
    mpsadbw %4, xmm0,   5       ; 101 B : bytes i+4  .. i+7
    mpsadbw %5, xmm0,   7       ; 111 B : bytes i+12 .. i+15
    paddw   %2, %4
    paddw   %3, %5
    paddw   %2, %3              ; full 16-byte sums
%endmacro   ; end of SUM_LINE_X16_SSE41
533
;-----------------------------------------------------------------------------------------------------------------------------
; void SumOf16x16BlockOfFrame_sse4(uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,
;                                  const int32_t kiRefStride, uint16_t* pFeatureOfBlock,
;                                  uint32_t pTimesOfFeatureValue[]);
;
; x86-32 variant, arguments on the stack.
;
; Writes the 16-bit sum of each 16x16 byte block to pFeatureOfBlock (kiWidth entries per row)
; and increments pTimesOfFeatureValue[sum], eight block positions per iteration.
; The first output row sums sixteen picture lines; each later row is "row above + entering
; line - leaving line".
;
; Contract visible from the code:
;   * columns are consumed eight at a time and both loops are bottom-tested;
;   * SUM_LINE_X16_SSE41 reads 24 bytes of every picture line it touches;
;   * pFeatureOfBlock is accessed with unaligned moves, so no 16-byte alignment is required
;     (matches the 8x8 SSE4 routine);
;   * xmm0-xmm7, eax, ecx, edx are clobbered; the kiHeight stack slot is used as row counter.
;-----------------------------------------------------------------------------------------------------------------------------
WELS_EXTERN SumOf16x16BlockOfFrame_sse4
%define     pushsize        16
%define     localsize       4
%define     ref             esp + pushsize + localsize + 4
%define     sum_ref         esp + pushsize + localsize + 20
%define     times_of_sum    esp + pushsize + localsize + 24
%define     width           esp + pushsize + localsize + 8
%define     height          esp + pushsize + localsize + 12
%define     linesize        esp + pushsize + localsize + 16
%define     tmp_width       esp
    push    ebx
    push    ebp
    push    esi
    push    edi
    sub     esp,    localsize

    pxor    xmm0,   xmm0
    mov     esi,    [ref]
    mov     edi,    [sum_ref]
    mov     edx,    [times_of_sum]
    mov     ebx,    [linesize]
    mov     eax,    [width]

    lea     ecx,    [ebx+ebx*2]         ; 3*linesize
    mov     [tmp_width],    eax
FIRST_ROW_X16_SSE4:
    ; lines 0..3 (each invocation leaves its result in the first XMM operand)
    SUM_LINE_X16_SSE41  esi,        xmm1, xmm2, xmm3, xmm4
    SUM_LINE_X16_SSE41  esi+ebx,    xmm2, xmm3, xmm4, xmm5
    SUM_LINE_X16_SSE41  esi+ebx*2,  xmm3, xmm4, xmm5, xmm6
    SUM_LINE_X16_SSE41  esi+ecx,    xmm4, xmm5, xmm6, xmm7
    paddw   xmm1, xmm2
    paddw   xmm3, xmm4
    paddw   xmm1, xmm3

    ; lines 4..7
    lea     ebp,    [esi+ebx*4]
    SUM_LINE_X16_SSE41  ebp,        xmm2, xmm3, xmm4, xmm5
    paddw   xmm1, xmm2
    SUM_LINE_X16_SSE41  ebp+ebx,    xmm2, xmm3, xmm4, xmm5
    paddw   xmm1, xmm2
    SUM_LINE_X16_SSE41  ebp+ebx*2,  xmm2, xmm3, xmm4, xmm5
    paddw   xmm1, xmm2
    SUM_LINE_X16_SSE41  ebp+ecx,    xmm2, xmm3, xmm4, xmm5
    paddw   xmm1, xmm2

    ; lines 8..11
    lea     ebp,    [ebp+ebx*4]
    SUM_LINE_X16_SSE41  ebp,        xmm2, xmm3, xmm4, xmm5
    paddw   xmm1, xmm2
    SUM_LINE_X16_SSE41  ebp+ebx,    xmm2, xmm3, xmm4, xmm5
    paddw   xmm1, xmm2
    SUM_LINE_X16_SSE41  ebp+ebx*2,  xmm2, xmm3, xmm4, xmm5
    paddw   xmm1, xmm2
    SUM_LINE_X16_SSE41  ebp+ecx,    xmm2, xmm3, xmm4, xmm5
    paddw   xmm1, xmm2

    ; lines 12..15
    lea     ebp,    [ebp+ebx*4]
    SUM_LINE_X16_SSE41  ebp,        xmm2, xmm3, xmm4, xmm5
    paddw   xmm1, xmm2
    SUM_LINE_X16_SSE41  ebp+ebx,    xmm2, xmm3, xmm4, xmm5
    paddw   xmm1, xmm2
    SUM_LINE_X16_SSE41  ebp+ebx*2,  xmm2, xmm3, xmm4, xmm5
    paddw   xmm1, xmm2
    SUM_LINE_X16_SSE41  ebp+ecx,    xmm2, xmm3, xmm4, xmm5
    paddw   xmm1, xmm2

    ; movdqu: pFeatureOfBlock is not guaranteed to be 16-byte aligned here
    movdqu  [edi],  xmm1
    movdqa  xmm2,   xmm1
    punpcklwd   xmm1,   xmm0            ; widen sums to dwords for table indexing
    punpckhwd   xmm2,   xmm0

    COUNT_SUM   xmm1,   eax,    1
    COUNT_SUM   xmm1,   eax,    1
    COUNT_SUM   xmm1,   eax,    1
    COUNT_SUM   xmm1,   eax,    0
    COUNT_SUM   xmm2,   eax,    1
    COUNT_SUM   xmm2,   eax,    1
    COUNT_SUM   xmm2,   eax,    1
    COUNT_SUM   xmm2,   eax,    0

    lea     esi,    [esi+8]
    lea     edi,    [edi+16]    ; element size is 2

    sub     dword [tmp_width], 8
    jg      near FIRST_ROW_X16_SSE4

    mov     esi,    [ref]
    mov     edi,    [sum_ref]
    mov     ebp,    [width]
    dec     dword [height]

    mov     ecx,    ebx
    sal     ecx,    4       ; 16*linesize: offset of the line entering the window

HEIGHT_LOOP_X16_SSE4:
    mov     [tmp_width],    ebp
WIDTH_LOOP_X16_SSE4:
    movdqu  xmm7,   [edi]               ; sums of the row above (unaligned-safe)
    SUM_LINE_X16_SSE41  esi+ecx, xmm1, xmm2, xmm3, xmm4     ; entering line
    SUM_LINE_X16_SSE41  esi, xmm2, xmm3, xmm4, xmm5         ; leaving line

    paddw   xmm7,   xmm1
    psubw   xmm7,   xmm2
    movdqu  [edi+ebp*2], xmm7           ; same columns, next output row (unaligned-safe)

    movdqa  xmm6,   xmm7
    punpcklwd   xmm7,   xmm0
    punpckhwd   xmm6,   xmm0

    COUNT_SUM   xmm7,   eax,    1
    COUNT_SUM   xmm7,   eax,    1
    COUNT_SUM   xmm7,   eax,    1
    COUNT_SUM   xmm7,   eax,    0
    COUNT_SUM   xmm6,   eax,    1
    COUNT_SUM   xmm6,   eax,    1
    COUNT_SUM   xmm6,   eax,    1
    COUNT_SUM   xmm6,   eax,    0

    lea     esi,    [esi+8]
    lea     edi,    [edi+16]

    sub     dword [tmp_width], 8
    jg      near WIDTH_LOOP_X16_SSE4

    add     esi,    ebx                 ; start of the next picture row
    sub     esi,    ebp

    dec     dword [height]
    jg      near HEIGHT_LOOP_X16_SSE4

    add     esp,    localsize
    pop     edi
    pop     esi
    pop     ebp
    pop     ebx
%undef      pushsize
%undef      localsize
%undef      ref
%undef      sum_ref
%undef      times_of_sum
%undef      width
%undef      height
%undef      linesize
%undef      tmp_width
    ret
677
678
;-----------------------------------------------------------------------------------------------------------------------------
; void FillQpelLocationByFeatureValue_sse2(uint16_t* pFeatureOfBlock, const int32_t kiWidth,
;                                          const int32_t kiHeight, uint16_t** pFeatureValuePointerList)
;
; x86-32 variant, arguments on the stack.
;
; Walks pFeatureOfBlock row by row.  For every feature value v it writes one packed dword
; (low word = x in quarter-pel units, high word = y in quarter-pel units) through the pointer
; stored in pFeatureValuePointerList[v] and then advances that pointer by 4 bytes.
;
; Four values are handled per inner iteration and both loops end on "jnz", so kiWidth must
; be a non-zero multiple of 4 and kiHeight must be non-zero.
; xmm0-xmm7, eax, ecx, edx are clobbered.
;-----------------------------------------------------------------------------------------------------------------------------

; FILL_ONE_LOCATION_SSE2
;   in : low dword of xmm0 = feature value, low dword of xmm1 = packed (x, y),
;        edi = pFeatureValuePointerList
;   clobbers eax, ebx, edx
%macro FILL_ONE_LOCATION_SSE2 0
    movd    edx,    xmm0
    lea     ebx,    [edi+edx*4]     ; &list[value]
    mov     eax,    [ebx]           ; current write position for this value
    movd    [eax],  xmm1
    mov     edx,    [eax+4] ; explicitly touch eax+4: cache misses were observed here with VTune
    lea     eax,    [eax+4]
    mov     [ebx],  eax             ; store the advanced write position
%endmacro

WELS_EXTERN FillQpelLocationByFeatureValue_sse2
    push    esi
    push    edi
    push    ebx
    push    ebp

    %define _ps         16              ; push size
    %define _ls         4               ; local size
    %define sum_ref     esp+_ps+_ls+4
    %define pos_list    esp+_ps+_ls+16
    %define width       esp+_ps+_ls+8
    %define height      esp+_ps+_ls+12
    %define i_height    esp
    sub     esp,    _ls

    mov     esi,    [sum_ref]
    mov     edi,    [pos_list]
    mov     ebp,    [width]
    mov     ebx,    [height]
    mov     [i_height], ebx             ; row counter lives in the local slot

    %assign push_num 5
    INIT_X86_32_PIC_NOPRESERVE ecx
    movq    xmm7,   [pic(mv_x_inc_x4)]     ; x step per group of four columns
    movq    xmm6,   [pic(mv_y_inc_x4)]     ; y step per row
    movq    xmm5,   [pic(mx_x_offset_x4)]  ; x of the first four columns
    DEINIT_X86_32_PIC
    pxor    xmm4,   xmm4                ; zero, used to widen words
    pxor    xmm3,   xmm3                ; running y vector
fill_qpel_sse2_row_loop:
    movdqa  xmm2,   xmm5                ; running x vector restarts every row
    mov     ecx,    ebp
fill_qpel_sse2_col_loop:
    movq    xmm0,   [esi]               ; four 16-bit feature values
    punpcklwd   xmm0,   xmm4            ; -> four dwords
    movdqa      xmm1,   xmm2
    punpcklwd   xmm1,   xmm3            ; -> four packed (x, y) dwords
%rep    3
    FILL_ONE_LOCATION_SSE2
    psrldq  xmm1,   4
    psrldq  xmm0,   4
%endrep
    FILL_ONE_LOCATION_SSE2

    paddw   xmm2,   xmm7
    lea     esi,    [esi+8]
    sub     ecx,    4
    jnz near fill_qpel_sse2_col_loop
    paddw   xmm3,   xmm6
    dec dword [i_height]
    jnz near fill_qpel_sse2_row_loop

    add     esp,    _ls
    %undef  _ps
    %undef  _ls
    %undef  sum_ref
    %undef  pos_list
    %undef  width
    %undef  height
    %undef  i_height
    pop     ebp
    pop     ebx
    pop     edi
    pop     esi
    ret
759
;---------------------------------------------------------------------------------------------------------------------------------------------------
; void InitializeHashforFeature_sse2( uint32_t* pTimesOfFeatureValue, uint16_t* pBuf, const int32_t kiListSize,
;                        uint16_t** pLocationOfFeature, uint16_t** pFeatureValuePointerList )
;
; x86-32 variant, arguments on the stack.
;
; For i = 0 .. kiListSize-1:
;     pLocationOfFeature[i] = pFeatureValuePointerList[i] = pBuf;
;     pBuf += 4 * pTimesOfFeatureValue[i]      (bytes)
;
; Entries are handled four at a time; when all four counts are zero the same pointer is
; broadcast with two 16-byte stores.  The remaining kiListSize % 4 entries are handled one
; by one.  A kiListSize below 4 skips the four-at-a-time loop, and a non-positive kiListSize
; returns without touching memory.
;
; The vector loop uses movdqa on all three tables, so they must be 16-byte aligned.
; xmm0-xmm2, xmm7, eax, ecx, edx are clobbered.
;---------------------------------------------------------------------------------------------------------------------------------------------------
WELS_EXTERN InitializeHashforFeature_sse2
    push    ebx
    push    esi
    push    edi
    push    ebp
    %define _ps 16  ; push size
    mov     edi,    [esp+_ps+16]    ; pPositionOfSum
    mov     ebp,    [esp+_ps+20]    ; sum_idx_list
    mov     esi,    [esp+_ps+4]     ; pTimesOfSum
    mov     ebx,    [esp+_ps+8]     ; pBuf
    mov     edx,    [esp+_ps+12]    ; list_sz
    xor     ecx,    ecx             ; byte offset into the three tables
    pxor    xmm7,   xmm7
    sar     edx,    2               ; number of complete groups of four
    test    edx,    edx             ; flags after a multi-bit sar are not fully defined
    jle     near hash_assign_rem_check_sse2 ; the loop below is bottom-tested: skip it if empty
hash_assign_loop_x4_sse2:
    movdqa  xmm0,   [esi+ecx]
    pslld   xmm0,   2               ; counts -> byte sizes

    movdqa  xmm1,   xmm0
    pcmpeqd xmm1,   xmm7
    movmskps    eax,    xmm1
    cmp eax, 0x0f
    je  near hash_assign_with_copy_sse2 ; all four counts are zero

%assign x   0
%rep 4
    lea     eax,    [edi+ecx+x]
    mov     [eax],  ebx
    lea     eax,    [ebp+ecx+x]
    mov     [eax],  ebx
    movd    eax,    xmm0
    add     ebx,    eax             ; advance pBuf by this entry's size
    psrldq  xmm0,   4
%assign x   x+4
%endrep
    jmp near assign_next_sse2

hash_assign_with_copy_sse2:
    movd    xmm1,   ebx
    pshufd  xmm2,   xmm1,   0       ; broadcast the unchanged pointer
    movdqa  [edi+ecx], xmm2
    movdqa  [ebp+ecx], xmm2

assign_next_sse2:
    add     ecx,    16
    dec     edx
    jnz     near hash_assign_loop_x4_sse2

hash_assign_rem_check_sse2:
    mov     edx,    [esp+_ps+12]    ; list_sz
    test    edx,    edx
    jle     near hash_assign_no_rem_sse2    ; nothing to do for a non-positive size
    and     edx,    3
    jz      near hash_assign_no_rem_sse2
hash_assign_loop_x4_rem_sse2:
    lea     eax,    [edi+ecx]
    mov     [eax],  ebx
    lea     eax,    [ebp+ecx]
    mov     [eax],  ebx
    mov     eax,    [esi+ecx]
    sal     eax,    2
    add     ebx,    eax
    add     ecx,    4
    dec     edx
    jnz     near hash_assign_loop_x4_rem_sse2

hash_assign_no_rem_sse2:
    %undef  _ps
    pop     ebp
    pop     edi
    pop     esi
    pop     ebx
    ret
834%else
835
;**********************************************************************************************************************
; void SumOf8x8BlockOfFrame_sse2(uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,
;                                const int32_t kiRefStride, uint16_t* pFeatureOfBlock,
;                                uint32_t pTimesOfFeatureValue[]);
;
; Non-X86_32 variant.  As used below: r0 = pRefPicture, r1 = kiWidth, r2 = kiHeight,
; r3 = kiRefStride, r4 = pFeatureOfBlock, r5 = pTimesOfFeatureValue (set up by LOAD_6_PARA).
;
; Stores the 16-bit sum of every 8x8 byte block in pFeatureOfBlock (kiWidth entries per row)
; and increments pTimesOfFeatureValue[sum].  The first output row is summed directly; later
; rows are "entry above + entering 8-byte line - leaving 8-byte line".
;
; Both loops are bottom-tested, so one column and one row after the first are always processed.
; r0, r2 and r4 are saved on the stack during the first row because that pass advances r0/r4
; and uses r2 as scratch.  r12 (column counter) and r13 (row pointer, later row counter) are
; saved and restored; r6 is scratch.
;**********************************************************************************************************************
WELS_EXTERN SumOf8x8BlockOfFrame_sse2
    %assign  push_num 0
    LOAD_6_PARA
    PUSH_XMM 6
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r2, r2d
    SIGN_EXTENSION  r3, r3d
    push r12
    push r13
    push r0
    push r2
    push r4

    pxor    xmm0,   xmm0
    lea     r6, [r3+r3*2]           ; 3*stride (first row only)

    mov     r12,    r1              ;r12:tmp_width
    lea     r13,    [r0+r3*4]       ;r13 plays the role of ebp in the 32-bit code: rows 4..7
FIRST_ROW:
    ; rows 0..3: two 8-byte lines per register, psadbw gives one sum per 64-bit half
    movq    xmm1,   [r0]
    movq    xmm2,   [r0+r3]
    movq    xmm3,   [r0+r3*2]
    movq    xmm4,   [r0+r6]

    shufps  xmm1,   xmm2,   01000100b
    shufps  xmm3,   xmm4,   01000100b
    psadbw  xmm1,   xmm0
    psadbw  xmm3,   xmm0
    paddd   xmm1,   xmm3

    ; rows 4..7
    movq    xmm2,   [r13]
    movq    xmm3,   [r13+r3]
    movq    xmm4,   [r13+r3*2]
    movq    xmm5,   [r13+r6]

    shufps  xmm2,   xmm3,   01000100b
    shufps  xmm4,   xmm5,   01000100b
    psadbw  xmm2,   xmm0
    psadbw  xmm4,   xmm0
    paddd   xmm2,   xmm4

    paddd   xmm1,   xmm2
    pshufd  xmm2,   xmm1,   00001110b   ; bring the upper-half sum down
    paddd   xmm1,   xmm2
    movd    r2d,    xmm1                ; block sum, upper half of r2 is cleared
    mov     [r4],   r2w
    inc     dword [r5+r2*4]

    inc     r0
    inc     r13
    add     r4, 2

    dec     r12
    jg      FIRST_ROW

    pop r4
    pop r2
    pop r0
    mov r13, r2                         ; r13 = rows still to produce
    dec r13
HEIGHT_LOOP:
    mov     r12,    r1
WIDTH_LOOP:
    movq    xmm1,   [r0+r3*8]           ; line entering the window
    movq    xmm2,   [r0]                ; line leaving the window
    psadbw  xmm1,   xmm0
    psadbw  xmm2,   xmm0
    psubd   xmm1,   xmm2
    movd    r2d,    xmm1                ; incoming - outgoing (signed)
    ; Zero-extend the sum of the block above.  A 16-bit "mov r6w, [r4]" would keep the
    ; upper bits of r6 (still 3*stride) and corrupt the index for large strides.
    movzx   r6d,    word [r4]
    add     r2d,    r6d
    mov     [r4+r1*2],  r2w             ; same column, next output row
    inc     dword [r5+r2*4]

    inc     r0
    add     r4, 2

    dec     r12
    jg      WIDTH_LOOP

    add     r0, r3                      ; start of the next picture row
    sub     r0, r1


    dec     r13
    jg      HEIGHT_LOOP

    pop     r13
    pop     r12
    POP_XMM
    LOAD_6_PARA_POP
    ret
932
933
;-----------------------------------------------------------------------------
; COUNT_SUM xmm, gpr32, gpr64, shift                     (non-X86_32 variant)
;   %1  XMM register whose low dword holds a feature value
;   %2  32-bit scratch register; receives that dword
;   %3  register used as the index in the address; callers in this file pass
;       the 64-bit name of %2
;   %4  when 1, %1 is shifted right by 4 bytes afterwards so that the next
;       dword becomes the low one
; Increments the dword counter at [r5 + index*4]; r5 must hold the table base.
; The names xmm_reg / tmp_dreg / tmp_qreg remain defined after expansion.
;-----------------------------------------------------------------------------
%macro COUNT_SUM 4
    %define xmm_reg %1
    %define tmp_dreg %2
    %define tmp_qreg %3
    movd    tmp_dreg,   xmm_reg             ; value -> scratch
    inc     dword [r5 + tmp_qreg*4]         ; ++counter[value]
    %if %4 == 1
    psrldq  xmm_reg,    4                   ; expose the next dword
    %endif
%endmacro
944
945
946;-----------------------------------------------------------------------------
947; requires:  width % 8 == 0 && height > 1
948;-----------------------------------------------------------------------------
949;void SumOf8x8BlockOfFrame_sse4(uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight, const int32_t kiRefStride,
950;                             uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]);
951;-----------------------------------------------------------------------------
952; read extra (16 - (width % 8) ) mod 16 bytes of every line
953; write extra (16 - (width % 8)*2 ) mod 16 bytes in the end of sum_ref
; SumOf8x8BlockOfFrame_sse4: sliding 8x8 pixel sums, eight columns per step.
; Register roles as used below (r0..r5 are presumably the six C arguments in
; declaration order, loaded by LOAD_6_PARA -- confirm against asm_inc.asm):
;   r0   source pixels                  r1   width  (columns to produce)
;   r2   height (rows to produce)       r3   source stride in bytes
;   r4   uint16 sum output              r5   not referenced directly here;
;        presumably the table that COUNT_SUM (defined earlier in this file)
;        updates -- TODO confirm
;   r6   3 * stride                     r12  columns left in the current row
;   r13  pointer to line 4 in the first pass, rows left afterwards
;   xmm0 stays zero, so mpsadbw against it yields plain sums of 4 bytes.
; Both r0 and r4 advance eight columns per step while the row fix-up subtracts
; r1, so width is expected to be a multiple of 8.
WELS_EXTERN SumOf8x8BlockOfFrame_sse4
    %assign  push_num 0
    LOAD_6_PARA
    PUSH_XMM 8
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r2, r2d
    SIGN_EXTENSION  r3, r3d
    push    r12
    push    r13
    push    r0                      ; r0, r2 and r4 are restored after the first pass
    push    r2
    push    r4

    pxor    xmm0,   xmm0
    lea     r6,     [r3+r3*2]       ; r6 = 3 * stride

    mov     r12,    r1              ; r12 = columns left in the first output row
    lea     r13,    [r0+r3*4]       ; r13 = line 4 of the 8-line window

    ; ---- first output row: add up all eight source lines from scratch ----
FIRST_ROW_SSE4:
    movdqu  xmm1,   [r0]
    movdqu  xmm3,   [r0+r3]
    movdqu  xmm5,   [r0+r3*2]
    movdqu  xmm7,   [r0+r6]

    movdqa  xmm2,   xmm1
    mpsadbw xmm1,   xmm0,   000b    ; sums of bytes [i .. i+3],   i = 0..7
    mpsadbw xmm2,   xmm0,   100b    ; sums of bytes [i+4 .. i+7], i = 0..7
    paddw   xmm1,   xmm2            ; eight 8-byte sums of line 0

    movdqa  xmm4,   xmm3
    mpsadbw xmm3,   xmm0,   000b
    mpsadbw xmm4,   xmm0,   100b
    paddw   xmm3,   xmm4            ; eight 8-byte sums of line 1

    movdqa  xmm2,   xmm5
    mpsadbw xmm5,   xmm0,   000b
    mpsadbw xmm2,   xmm0,   100b
    paddw   xmm5,   xmm2            ; eight 8-byte sums of line 2

    movdqa  xmm4,   xmm7
    mpsadbw xmm7,   xmm0,   000b
    mpsadbw xmm4,   xmm0,   100b
    paddw   xmm7,   xmm4            ; eight 8-byte sums of line 3

    paddw   xmm1,   xmm3
    paddw   xmm5,   xmm7
    paddw   xmm1,   xmm5            ; lines 0-3 combined

    movdqu  xmm2,   [r13]
    movdqu  xmm3,   [r13+r3]
    movdqu  xmm4,   [r13+r3*2]
    movdqu  xmm5,   [r13+r6]

    movdqa  xmm6,   xmm2
    mpsadbw xmm2,   xmm0,   000b
    mpsadbw xmm6,   xmm0,   100b
    paddw   xmm2,   xmm6            ; line 4

    movdqa  xmm7,   xmm3
    mpsadbw xmm3,   xmm0,   000b
    mpsadbw xmm7,   xmm0,   100b
    paddw   xmm3,   xmm7            ; line 5

    movdqa  xmm6,   xmm4
    mpsadbw xmm4,   xmm0,   000b
    mpsadbw xmm6,   xmm0,   100b
    paddw   xmm4,   xmm6            ; line 6

    movdqa  xmm7,   xmm5
    mpsadbw xmm5,   xmm0,   000b
    mpsadbw xmm7,   xmm0,   100b
    paddw   xmm5,   xmm7            ; line 7

    paddw   xmm2,   xmm3
    paddw   xmm4,   xmm5
    paddw   xmm1,   xmm2
    paddw   xmm1,   xmm4            ; eight 8x8 sums (lines 0-7)

    movdqu  [r4],   xmm1

    ; widen the eight words to dwords so each can be handed to COUNT_SUM
    movdqa  xmm2,   xmm1
    punpcklwd   xmm1,   xmm0
    punpckhwd   xmm2,   xmm0

    COUNT_SUM   xmm1,   r2d, r2, 1
    COUNT_SUM   xmm1,   r2d, r2, 1
    COUNT_SUM   xmm1,   r2d, r2, 1
    COUNT_SUM   xmm1,   r2d, r2, 0
    COUNT_SUM   xmm2,   r2d, r2, 1
    COUNT_SUM   xmm2,   r2d, r2, 1
    COUNT_SUM   xmm2,   r2d, r2, 1
    COUNT_SUM   xmm2,   r2d, r2, 0

    lea     r0,     [r0+8]
    lea     r13,    [r13+8]
    lea     r4,     [r4+16]         ; eight uint16 results

    sub     r12,    8
    jg      near FIRST_ROW_SSE4

    ; ---- remaining rows: previous row + entering line - leaving line ----
    pop     r4
    pop     r2
    pop     r0
    mov     r13,    r2
    dec     r13                     ; r13 = rows still to produce (height - 1)
    jle     near HEIGHT_DONE_SSE4   ; height <= 1: the first pass was everything
HEIGHT_LOOP_SSE4:
    mov     r12,    r1
WIDTH_LOOP_SSE4:
    movdqu  xmm1,   [r0+r3*8]       ; line entering the window
    movdqu  xmm2,   [r0]            ; line leaving the window
    movdqu  xmm7,   [r4]            ; sums of the row above

    movdqa  xmm3,   xmm1
    mpsadbw xmm1,   xmm0,   000b
    mpsadbw xmm3,   xmm0,   100b
    paddw   xmm1,   xmm3

    movdqa  xmm4,   xmm2
    mpsadbw xmm2,   xmm0,   000b
    mpsadbw xmm4,   xmm0,   100b
    paddw   xmm2,   xmm4

    paddw   xmm7,   xmm1
    psubw   xmm7,   xmm2
    movdqu  [r4+r1*2], xmm7         ; same columns, next output row

    movdqa  xmm6,   xmm7
    punpcklwd   xmm7,   xmm0
    punpckhwd   xmm6,   xmm0

    COUNT_SUM   xmm7,   r2d, r2, 1
    COUNT_SUM   xmm7,   r2d, r2, 1
    COUNT_SUM   xmm7,   r2d, r2, 1
    COUNT_SUM   xmm7,   r2d, r2, 0
    COUNT_SUM   xmm6,   r2d, r2, 1
    COUNT_SUM   xmm6,   r2d, r2, 1
    COUNT_SUM   xmm6,   r2d, r2, 1
    COUNT_SUM   xmm6,   r2d, r2, 0

    lea     r0,     [r0+8]
    lea     r4,     [r4+16]

    sub     r12,    8
    jg      near WIDTH_LOOP_SSE4

    lea     r0,     [r0+r3]         ; start of the next source line:
    sub     r0,     r1              ;   r0 += stride - width

    dec     r13
    jg      near HEIGHT_LOOP_SSE4

HEIGHT_DONE_SSE4:
    pop     r13
    pop     r12
    POP_XMM
    LOAD_6_PARA_POP
    ret
1110
1111
;****************************************************************************************************************************************************
;void SumOf16x16BlockOfFrame_sse2(uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight, const int32_t kiRefStride,
;                             uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]);
;
; Writes one 16x16 pixel sum per position into pFeatureOfBlock and increments
; pTimesOfFeatureValue[sum] for every sum produced.
; Register roles as used below (r0..r5 are presumably the arguments in
; declaration order, loaded by LOAD_6_PARA -- confirm against asm_inc.asm):
;   r0 source pixels   r1 width   r2 height, reused as scratch for each sum
;   r3 stride          r4 uint16 output   r5 uint32 counter table
;   r6 3*stride in the first pass, 16*stride afterwards
;   r12 columns left   r13 line pointer in the first pass, rows left afterwards
;   xmm0 stays zero: psadbw against it leaves the sum of bytes 0-7 in word 0
;   and the sum of bytes 8-15 in word 4.
;****************************************************************************************************************************************************
WELS_EXTERN SumOf16x16BlockOfFrame_sse2
    %assign  push_num 0
    LOAD_6_PARA
    PUSH_XMM 6
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r2, r2d
    SIGN_EXTENSION  r3, r3d
    push    r12
    push    r13
    push    r0                      ; r0, r2 and r4 are restored after the first pass
    push    r2
    push    r4

    pxor    xmm0,   xmm0
    lea     r6,     [r3+r3*2]       ; r6 = 3 * stride

    mov     r12,    r1              ; r12 = columns left in the first output row

    ; ---- first output row: one full 16-line sum per column ----
FIRST_ROW_X16H:
    movdqu  xmm1,   [r0]
    movdqu  xmm2,   [r0+r3]
    movdqu  xmm3,   [r0+r3*2]
    movdqu  xmm4,   [r0+r6]

    psadbw  xmm1,   xmm0
    psadbw  xmm2,   xmm0
    psadbw  xmm3,   xmm0
    psadbw  xmm4,   xmm0
    paddw   xmm1,   xmm2
    paddw   xmm3,   xmm4
    paddw   xmm1,   xmm3            ; lines 0-3

    lea     r13,    [r0+r3*4]       ; r13 walks down four lines at a time
    movdqu  xmm2,   [r13]
    movdqu  xmm3,   [r13+r3]
    movdqu  xmm4,   [r13+r3*2]
    movdqu  xmm5,   [r13+r6]

    psadbw  xmm2,   xmm0
    psadbw  xmm3,   xmm0
    psadbw  xmm4,   xmm0
    psadbw  xmm5,   xmm0
    paddw   xmm2,   xmm3
    paddw   xmm4,   xmm5
    paddw   xmm2,   xmm4

    paddw   xmm1,   xmm2            ; lines 0-7

    lea     r13,    [r13+r3*4]
    movdqu  xmm2,   [r13]
    movdqu  xmm3,   [r13+r3]
    movdqu  xmm4,   [r13+r3*2]
    movdqu  xmm5,   [r13+r6]

    psadbw  xmm2,   xmm0
    psadbw  xmm3,   xmm0
    psadbw  xmm4,   xmm0
    psadbw  xmm5,   xmm0
    paddw   xmm2,   xmm3
    paddw   xmm4,   xmm5
    paddw   xmm2,   xmm4

    paddw   xmm1,   xmm2            ; lines 0-11

    lea     r13,    [r13+r3*4]
    movdqu  xmm2,   [r13]
    movdqu  xmm3,   [r13+r3]
    movdqu  xmm4,   [r13+r3*2]
    movdqu  xmm5,   [r13+r6]

    psadbw  xmm2,   xmm0
    psadbw  xmm3,   xmm0
    psadbw  xmm4,   xmm0
    psadbw  xmm5,   xmm0
    paddw   xmm2,   xmm3
    paddw   xmm4,   xmm5
    paddw   xmm2,   xmm4

    paddw   xmm1,   xmm2            ; lines 0-15, still split into two halves
    movdqa  xmm2,   xmm1
    punpckhwd xmm2, xmm0            ; bring the upper-half sum down to word 0
    paddw   xmm1,   xmm2            ; word 0 = complete 16x16 sum
    movd    r2d,    xmm1
    mov     [r4],   r2w
    inc     dword [r5+r2*4]         ; count this sum value

    inc     r0
    lea     r4,     [r4+2]

    dec     r12
    jg      near FIRST_ROW_X16H

    ; ---- remaining rows: previous row + entering line - leaving line ----
    pop     r4
    pop     r2
    pop     r0
    mov     r13,    r2
    dec     r13                     ; r13 = rows still to produce (height - 1)
    jle     near HEIGHT_DONE_X16    ; height <= 1: the first pass was everything
    mov     r6,     r3
    sal     r6,     4               ; r6 = 16 * stride (line entering the window)
HEIGHT_LOOP_X16:
    mov     r12,    r1
WIDTH_LOOP_X16:
    movdqu  xmm1,   [r0+r6]         ; line entering the window
    movdqu  xmm2,   [r0]            ; line leaving the window
    psadbw  xmm1,   xmm0
    psadbw  xmm2,   xmm0
    psubw   xmm1,   xmm2            ; per-half difference (16-bit wrap-around)
    movdqa  xmm2,   xmm1
    punpckhwd xmm2, xmm0
    paddw   xmm1,   xmm2            ; word 0 = total difference
    movd    r2d,    xmm1
    add     r2w,    word [r4]       ; plus the sum of the row above
    mov     [r4+r1*2],  r2w         ; same column, next output row
    inc     dword [r5+r2*4]

    inc     r0
    add     r4,     2

    dec     r12
    jg      near WIDTH_LOOP_X16

    add     r0,     r3              ; start of the next source line:
    sub     r0,     r1              ;   r0 += stride - width

    dec     r13
    jg      near HEIGHT_LOOP_X16

HEIGHT_DONE_X16:
    pop     r13
    pop     r12
    POP_XMM
    LOAD_6_PARA_POP
    ret
1247
; requires:  width % 16 == 0 && height > 1
;-----------------------------------------------------------------------------------------------------------------------------
;void SumOf16x16BlockOfFrame_sse4(uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight, const int32_t kiRefStride,
;                             uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]);
;-----------------------------------------------------------------------------------------------------------------------------
; SUM_LINE_X16_SSE41 ref, dst0, dst1, tmp0, tmp1
;   Leaves in dst0 eight word sums: word i = sum of the 16 bytes ref[i .. i+15].
;   Reads 24 bytes starting at ref (two overlapping 16-byte loads).
;   Expects xmm0 == 0, so each mpsadbw is a plain sum of 4 bytes; with a zero
;   second operand only bit 2 of the immediate (the 4-byte offset into the
;   first operand) affects the result.
;   Clobbers dst1, tmp0 and tmp1.
%macro SUM_LINE_X16_SSE41  5    ; ref, dst0, dst1, tmp0, tmp1
    movdqu  %2, [%1]
    movdqu  %3, [%1+8h]
    movdqa  %4, %2
    movdqa  %5, %3

    mpsadbw %2, xmm0,   0   ; 000 B : bytes [i    .. i+3]
    mpsadbw %4, xmm0,   5   ; 101 B : bytes [i+4  .. i+7]
    mpsadbw %3, xmm0,   2   ; 010 B : bytes [i+8  .. i+11]
    mpsadbw %5, xmm0,   7   ; 111 B : bytes [i+12 .. i+15]
    paddw   %2, %4
    paddw   %3, %5
    paddw   %2, %3  ; dst0 = sums over all 16 bytes
%endmacro   ; end of SUM_LINE_X16_SSE41
1268
; SumOf16x16BlockOfFrame_sse4: sliding 16x16 pixel sums, eight columns per step.
; Register roles as used below (r0..r5 are presumably the six C arguments in
; declaration order, loaded by LOAD_6_PARA -- confirm against asm_inc.asm):
;   r0 source pixels   r1 width   r2 height   r3 stride   r4 uint16 output
;   r5 not referenced directly here; presumably the table that COUNT_SUM
;      (defined earlier in this file) updates -- TODO confirm
;   r6 3*stride in the first pass, 16*stride afterwards
;   r12 columns left   r13 line pointer in the first pass, rows left afterwards
;   xmm0 stays zero, as SUM_LINE_X16_SSE41 expects.
; The output is accessed with movdqa, which faults on addresses that are not
; 16-byte aligned: both r4 and r4 + 2*width must be 16-byte aligned.
WELS_EXTERN SumOf16x16BlockOfFrame_sse4
    %assign  push_num 0
    LOAD_6_PARA
    PUSH_XMM 8
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r2, r2d
    SIGN_EXTENSION  r3, r3d
    push    r12
    push    r13
    push    r0                      ; r0, r2 and r4 are restored after the first pass
    push    r2
    push    r4

    pxor    xmm0,   xmm0
    lea     r6,     [r3+r3*2]       ; r6 = 3 * stride

    mov     r12,    r1              ; r12 = columns left in the first output row

    ; ---- first output row: add up all sixteen source lines from scratch ----
FIRST_ROW_X16_SSE4:
    SUM_LINE_X16_SSE41  r0,     xmm1, xmm2, xmm3, xmm4
    SUM_LINE_X16_SSE41  r0+r3,  xmm2, xmm3, xmm4, xmm5
    SUM_LINE_X16_SSE41  r0+r3*2,xmm3, xmm4, xmm5, xmm6
    SUM_LINE_X16_SSE41  r0+r6,  xmm4, xmm5, xmm6, xmm7
    paddw   xmm1, xmm2
    paddw   xmm3, xmm4
    paddw   xmm1, xmm3              ; lines 0-3

    lea     r13,    [r0+r3*4]
    SUM_LINE_X16_SSE41  r13,        xmm2, xmm3, xmm4, xmm5
    paddw   xmm1, xmm2
    SUM_LINE_X16_SSE41  r13+r3,     xmm2, xmm3, xmm4, xmm5
    paddw   xmm1, xmm2
    SUM_LINE_X16_SSE41  r13+r3*2,   xmm2, xmm3, xmm4, xmm5
    paddw   xmm1, xmm2
    SUM_LINE_X16_SSE41  r13+r6,     xmm2, xmm3, xmm4, xmm5
    paddw   xmm1, xmm2              ; lines 0-7

    lea     r13,    [r13+r3*4]
    SUM_LINE_X16_SSE41  r13,        xmm2, xmm3, xmm4, xmm5
    paddw   xmm1, xmm2
    SUM_LINE_X16_SSE41  r13+r3,     xmm2, xmm3, xmm4, xmm5
    paddw   xmm1, xmm2
    SUM_LINE_X16_SSE41  r13+r3*2,   xmm2, xmm3, xmm4, xmm5
    paddw   xmm1, xmm2
    SUM_LINE_X16_SSE41  r13+r6,     xmm2, xmm3, xmm4, xmm5
    paddw   xmm1, xmm2              ; lines 0-11

    lea     r13,    [r13+r3*4]
    SUM_LINE_X16_SSE41  r13,        xmm2, xmm3, xmm4, xmm5
    paddw   xmm1, xmm2
    SUM_LINE_X16_SSE41  r13+r3,     xmm2, xmm3, xmm4, xmm5
    paddw   xmm1, xmm2
    SUM_LINE_X16_SSE41  r13+r3*2,   xmm2, xmm3, xmm4, xmm5
    paddw   xmm1, xmm2
    SUM_LINE_X16_SSE41  r13+r6,     xmm2, xmm3, xmm4, xmm5
    paddw   xmm1, xmm2              ; eight 16x16 sums (lines 0-15)

    movdqa  [r4],   xmm1
    ; widen the eight words to dwords so each can be handed to COUNT_SUM
    movdqa  xmm2,   xmm1
    punpcklwd   xmm1,   xmm0
    punpckhwd   xmm2,   xmm0

    COUNT_SUM   xmm1,   r2d, r2, 1
    COUNT_SUM   xmm1,   r2d, r2, 1
    COUNT_SUM   xmm1,   r2d, r2, 1
    COUNT_SUM   xmm1,   r2d, r2, 0
    COUNT_SUM   xmm2,   r2d, r2, 1
    COUNT_SUM   xmm2,   r2d, r2, 1
    COUNT_SUM   xmm2,   r2d, r2, 1
    COUNT_SUM   xmm2,   r2d, r2, 0

    lea     r0,     [r0+8]
    lea     r4,     [r4+16]         ; eight uint16 results

    sub     r12,    8
    jg      near FIRST_ROW_X16_SSE4

    ; ---- remaining rows: previous row + entering line - leaving line ----
    pop     r4
    pop     r2
    pop     r0
    mov     r13,    r2
    dec     r13                         ; r13 = rows still to produce (height - 1)
    jle     near HEIGHT_DONE_X16_SSE4   ; height <= 1: the first pass was everything
    mov     r6,     r3
    sal     r6,     4                   ; r6 = 16 * stride (line entering the window)

HEIGHT_LOOP_X16_SSE4:
    mov     r12,    r1
WIDTH_LOOP_X16_SSE4:
    movdqa  xmm7,   [r4]                            ; sums of the row above
    SUM_LINE_X16_SSE41  r0+r6, xmm1, xmm2, xmm3, xmm4   ; line entering the window
    SUM_LINE_X16_SSE41  r0, xmm2, xmm3, xmm4, xmm5      ; line leaving the window

    paddw   xmm7,   xmm1
    psubw   xmm7,   xmm2
    movdqa  [r4+r1*2], xmm7             ; same columns, next output row

    movdqa  xmm6,   xmm7
    punpcklwd   xmm7,   xmm0
    punpckhwd   xmm6,   xmm0

    COUNT_SUM   xmm7,   r2d, r2, 1
    COUNT_SUM   xmm7,   r2d, r2, 1
    COUNT_SUM   xmm7,   r2d, r2, 1
    COUNT_SUM   xmm7,   r2d, r2, 0
    COUNT_SUM   xmm6,   r2d, r2, 1
    COUNT_SUM   xmm6,   r2d, r2, 1
    COUNT_SUM   xmm6,   r2d, r2, 1
    COUNT_SUM   xmm6,   r2d, r2, 0

    lea     r0,     [r0+8]
    lea     r4,     [r4+16]

    sub     r12,    8
    jg      near WIDTH_LOOP_X16_SSE4

    add     r0,     r3                  ; start of the next source line:
    sub     r0,     r1                  ;   r0 += stride - width

    dec     r13
    jg      near HEIGHT_LOOP_X16_SSE4

HEIGHT_DONE_X16_SSE4:
    pop     r13
    pop     r12
    POP_XMM
    LOAD_6_PARA_POP
    ret
1394
;-----------------------------------------------------------------------------------------------------------------------------
; void FillQpelLocationByFeatureValue_sse2(uint16_t* pFeatureOfBlock, const int32_t kiWidth, const int32_t kiHeight, uint16_t** pFeatureValuePointerList)
;
; For every feature value (handled four at a time) stores the packed
; (x word, y word) pair of the current position through the write cursor
; pFeatureValuePointerList[value], then advances that cursor by 4 bytes.
; Register roles as used below (r0..r3 are presumably the arguments in
; declaration order, loaded by LOAD_4_PARA -- confirm against asm_inc.asm):
;   r0 feature values   r1 width   r2 height, then scratch for a feature value
;   r3 cursor table (8-byte entries)   r4 columns left   r12 rows left
;   r5 address of the selected cursor  r6 the cursor itself
;   r13 only receives the cache-warming load
;   xmm2 / xmm3 current x / y words, xmm7 / xmm6 their per-step increments
; The column counter is tested with jnz after "sub r4, 4", so kiWidth has to
; be a positive multiple of 4.
; NOTE(review): there is no LOAD_4_PARA_POP before ret, unlike the LOAD_6_PARA
; functions in this file -- presumably harmless because r12/r13 restrict this
; code to 64-bit builds; confirm against asm_inc.asm.
;-----------------------------------------------------------------------------------------------------------------------------
WELS_EXTERN FillQpelLocationByFeatureValue_sse2
    %assign  push_num 0
    LOAD_4_PARA
    PUSH_XMM 8
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r2, r2d
    push    r12
    push    r13
    mov     r12,    r2                  ; r12 = rows left (r2 becomes scratch)

    movq    xmm7,   [mv_x_inc_x4]       ; x_qpel inc
    movq    xmm6,   [mv_y_inc_x4]       ; y_qpel inc
    movq    xmm5,   [mx_x_offset_x4]    ; x_qpel vector at the start of a row
    pxor    xmm4,   xmm4                ; zero, for widening words to dwords
    pxor    xmm3,   xmm3                ; y_qpel vector
HASH_HEIGHT_LOOP_SSE2:
    movdqa  xmm2,   xmm5                ; restart x at the row origin
    mov     r4,     r1
HASH_WIDTH_LOOP_SSE2:
    movq    xmm0,   [r0]                ; four uint16 feature values
    punpcklwd   xmm0,   xmm4            ; widened to four dwords
    movdqa      xmm1,   xmm2
    punpcklwd   xmm1,   xmm3            ; four dwords, each = (x word, y word)
%rep    3
    movd    r2d,    xmm0                ; r2 = feature value
    lea     r5,     [r3+r2*8]           ; r5 = &cursor table[value]
    mov     r6,     [r5]                ; r6 = current write cursor
    movd    [r6],   xmm1                ; store this position
    mov     r13,    [r6+4]  ; explicitly load from cursor+4 due to cache misses observed in VTune
    lea     r6,     [r6+4]
    mov     [r5],   r6                  ; cursor advances by one 4-byte entry
    psrldq  xmm1,   4
    psrldq  xmm0,   4
%endrep
    ; fourth value: same steps, without the trailing shifts
    movd    r2d,    xmm0
    lea     r5,     [r3+r2*8]
    mov     r6,     [r5]
    movd    [r6],   xmm1
    mov     r13,    [r6+4]  ; explicitly load from cursor+4 due to cache misses observed in VTune
    lea     r6,     [r6+4]
    mov     [r5],   r6

    paddw   xmm2,   xmm7                ; step x for the next four columns
    lea     r0,     [r0+8]
    sub     r4,     4
    jnz near HASH_WIDTH_LOOP_SSE2
    paddw   xmm3,   xmm6                ; step y for the next row
    dec     r12
    jnz near HASH_HEIGHT_LOOP_SSE2

    pop     r13
    pop     r12
    POP_XMM
    ret
1452
;---------------------------------------------------------------------------------------------------------------------------------------------------
; void InitializeHashforFeature_sse2( uint32_t* pTimesOfFeatureValue, uint16_t* pBuf, const int32_t kiListSize,
;                                 uint16_t** pLocationOfFeature, uint16_t** pFeatureValuePointerList);
;
; Walks the count table and hands every entry its slice of pBuf: both pointer
; lists receive the running buffer address for entry i, and that address then
; advances by 4 * pTimesOfFeatureValue[i] bytes.
; Register roles as used below (r0..r4 are presumably the arguments in
; declaration order, loaded by LOAD_5_PARA -- confirm against asm_inc.asm):
;   r0 count table (uint32)    r1 running buffer address    r2 groups of 4 left
;   r3, r4 the two pointer lists (8-byte entries)
;   r5 byte offset into the count table (entry index * 4)
;   r6 scratch    r12 list size, later the remainder (size & 3)    r13 scratch
; The vector loads and stores use movdqa, so r0, r3 and r4 must be 16-byte
; aligned.
; A list size <= 0 returns immediately and a size below 4 goes straight to the
; scalar tail; the vector loop is entered only with at least one full group.
;---------------------------------------------------------------------------------------------------------------------------------------------------
WELS_EXTERN InitializeHashforFeature_sse2
    %assign  push_num 0
    LOAD_5_PARA
    SIGN_EXTENSION  r2, r2d
    push    r12
    push    r13
    mov     r12,    r2              ; keep the full size for the tail
    test    r2,     r2
    jle     near hash_assign_no_rem_sse2    ; empty (or negative) list: nothing to do
    sar     r2,     2               ; r2 = number of full groups of 4
    xor     r5,     r5              ; r5 = byte offset into the count table
    xor     r6,     r6
    pxor    xmm3,   xmm3
    test    r2,     r2
    jz      near hash_assign_tail_sse2      ; fewer than 4 entries: scalar tail only
hash_assign_loop_x4_sse2:
    movdqa  xmm0,   [r0+r5]         ; four counts
    pslld   xmm0,   2               ; -> four slice sizes in bytes

    movdqa  xmm1,   xmm0
    pcmpeqd xmm1,   xmm3
    movmskps    r6, xmm1
    cmp     r6,     0x0f            ; all four counts zero?
    jz  near hash_assign_with_copy_sse2

    ; general case: assign and advance one entry at a time
%assign x   0
%rep 4
    lea     r13,    [r3+r5*2+x]
    mov     [r13],  r1
    lea     r13,    [r4+r5*2+x]
    mov     [r13],  r1
    movd    r6d,    xmm0
    add     r1,     r6
    psrldq  xmm0,   4
%assign x   x+8
%endrep
    jmp near assign_next_sse2

    ; all four slices are empty: the same address goes into all four entries
hash_assign_with_copy_sse2:
    movq    xmm1,   r1
    pshufd  xmm2,   xmm1,   01000100b   ; duplicate the 64-bit address
    movdqa  [r3+r5*2], xmm2
    movdqa  [r4+r5*2], xmm2
    movdqa  [r3+r5*2+16], xmm2
    movdqa  [r4+r5*2+16], xmm2

assign_next_sse2:
    add     r5,     16
    dec     r2
    jnz     near hash_assign_loop_x4_sse2

hash_assign_tail_sse2:
    and     r12,    3               ; entries left over after the groups of 4
    jz      near hash_assign_no_rem_sse2
hash_assign_loop_x4_rem_sse2:
    lea     r13,    [r3+r5*2]
    mov     [r13],  r1
    lea     r13,    [r4+r5*2]
    mov     [r13],  r1
    mov     r6d,    [r0+r5]
    sal     r6,     2
    add     r1,     r6
    add     r5,     4
    dec     r12
    jnz     near hash_assign_loop_x4_rem_sse2

hash_assign_no_rem_sse2:
    pop     r13
    pop     r12
    ret
1523
1524%endif
1525
;**********************************************************************************************************************************
;   int32_t SumOf8x8SingleBlock_sse2(uint8_t* ref0, int32_t linesize)
;
;   Returns the sum of the 64 bytes of one 8x8 block.
;   As used below: r0 = block address, r1 = distance between lines in bytes
;   (presumably the two arguments in order, loaded by LOAD_2_PARA -- confirm
;   against asm_inc.asm). r0 is advanced while reading; xmm0-xmm4 are used.
;**********************************************************************************************************************************
WELS_EXTERN SumOf8x8SingleBlock_sse2
    %assign  push_num 0
    LOAD_2_PARA
    SIGN_EXTENSION  r1, r1d

    pxor    xmm0,   xmm0

    ; pack two 8-byte lines into each register
    movq    xmm1,   [r0]
    movhps  xmm1,   [r0+r1]             ; lines 0, 1
    lea     r0,     [r0+2*r1]
    movq    xmm2,   [r0]
    movhps  xmm2,   [r0+r1]             ; lines 2, 3
    lea     r0,     [r0+2*r1]
    movq    xmm3,   [r0]
    movhps  xmm3,   [r0+r1]             ; lines 4, 5
    lea     r0,     [r0+2*r1]
    movq    xmm4,   [r0]
    movhps  xmm4,   [r0+r1]             ; lines 6, 7

    ; psadbw against zero: word 0 = sum of the low line, word 4 = sum of the high line
    psadbw  xmm1,   xmm0
    psadbw  xmm2,   xmm0
    psadbw  xmm3,   xmm0
    psadbw  xmm4,   xmm0
    paddw   xmm1,   xmm2
    paddw   xmm3,   xmm4
    paddw   xmm1,   xmm3

    ; fold the upper-half sum into word 0
    movdqa  xmm2,   xmm1
    punpckhwd xmm2, xmm0
    paddw   xmm1,   xmm2

    movd    retrd,  xmm1
    ret
1561
;**********************************************************************************************************************************
;   int32_t SumOf16x16SingleBlock_sse2(uint8_t* ref0, int32_t linesize)
;
;   Returns the sum of the 256 bytes of one 16x16 block.
;   As used below: r0 = block address, r1 = distance between lines in bytes
;   (presumably the two arguments in order, loaded by LOAD_2_PARA -- confirm
;   against asm_inc.asm). r0 is advanced while reading.
;   Every line is loaded with movdqa, which faults on addresses that are not
;   16-byte aligned: ref0 must be 16-byte aligned and linesize a multiple of 16.
;**********************************************************************************************************************************
WELS_EXTERN SumOf16x16SingleBlock_sse2
    %assign  push_num 0
    LOAD_2_PARA
    PUSH_XMM 6
    SIGN_EXTENSION  r1, r1d

    pxor    xmm0,   xmm0                ; psadbw against zero = plain byte sums

    ; lines 0-3
    movdqa  xmm1,   [r0]
    movdqa  xmm2,   [r0+r1]
    lea     r0,     [r0+2*r1]
    movdqa  xmm3,   [r0]
    movdqa  xmm4,   [r0+r1]
    psadbw  xmm1,   xmm0
    psadbw  xmm2,   xmm0
    psadbw  xmm3,   xmm0
    psadbw  xmm4,   xmm0
    paddw   xmm1,   xmm2
    paddw   xmm3,   xmm4
    paddw   xmm1,   xmm3

    ; lines 4-7
    lea     r0,     [r0+2*r1]
    movdqa  xmm2,   [r0]
    movdqa  xmm3,   [r0+r1]
    lea     r0,     [r0+2*r1]
    movdqa  xmm4,   [r0]
    movdqa  xmm5,   [r0+r1]
    psadbw  xmm2,   xmm0
    psadbw  xmm3,   xmm0
    psadbw  xmm4,   xmm0
    psadbw  xmm5,   xmm0
    paddw   xmm2,   xmm3
    paddw   xmm4,   xmm5
    paddw   xmm2,   xmm4

    paddw   xmm1,   xmm2

    ; lines 8-11
    lea     r0,     [r0+2*r1]
    movdqa  xmm2,   [r0]
    movdqa  xmm3,   [r0+r1]
    lea     r0,     [r0+2*r1]
    movdqa  xmm4,   [r0]
    movdqa  xmm5,   [r0+r1]
    psadbw  xmm2,   xmm0
    psadbw  xmm3,   xmm0
    psadbw  xmm4,   xmm0
    psadbw  xmm5,   xmm0
    paddw   xmm2,   xmm3
    paddw   xmm4,   xmm5
    paddw   xmm2,   xmm4

    paddw   xmm1,   xmm2

    ; lines 12-15
    lea     r0,     [r0+2*r1]
    movdqa  xmm2,   [r0]
    movdqa  xmm3,   [r0+r1]
    lea     r0,     [r0+2*r1]
    movdqa  xmm4,   [r0]
    movdqa  xmm5,   [r0+r1]
    psadbw  xmm2,   xmm0
    psadbw  xmm3,   xmm0
    psadbw  xmm4,   xmm0
    psadbw  xmm5,   xmm0
    paddw   xmm2,   xmm3
    paddw   xmm4,   xmm5
    paddw   xmm2,   xmm4

    paddw   xmm1,   xmm2                ; word 0 = left 8 columns, word 4 = right 8 columns

    ; fold the right-half sum into word 0
    movdqa  xmm2,   xmm1
    punpckhwd xmm2, xmm0
    paddw   xmm1,   xmm2

    movd    retrd,  xmm1
    POP_XMM
    ret
1640
;**********************************************************************************************************************************
;
;   uint32_t SampleSad16x16Hor8_sse41( uint8_t *src, int32_t stride_src, uint8_t *ref, int32_t stride_ref, uint16_t base_cost[8], int32_t *index_min_cost )
;
;   \note:
;       src must be 16-byte aligned; alignment of ref is optional
;   \return value:
;       returns the minimal SAD cost; the corresponding index is stored through index_min_cost
;**********************************************************************************************************************************
; SAD_16x16_LINE_SSE41 src, ref, stride_src, stride_ref
;   One 16-pixel line of an 8-candidate horizontal search: adds to word i of
;   xmm7 the SAD between src[0..15] and ref[i .. i+15], i = 0..7, then steps
;   both pointers to the next line.
;   src is loaded with movdqa (must be 16-byte aligned); 24 bytes are read
;   from ref. Clobbers xmm0-xmm4; %1 and %2 are modified.
%macro SAD_16x16_LINE_SSE41  4  ; src, ref, stride_src, stride_ref
    movdqa      xmm0, [%1]
    movdqu      xmm1, [%2]
    movdqu      xmm2, [%2+8h]
    movdqa      xmm3, xmm1
    movdqa      xmm4, xmm2

    mpsadbw     xmm1, xmm0, 0   ; 000 B : src[0..3]   vs ref[i    .. i+3]
    paddw       xmm7, xmm1      ; accumulate cost

    mpsadbw     xmm3, xmm0, 5   ; 101 B : src[4..7]   vs ref[i+4  .. i+7]
    paddw       xmm7, xmm3      ; accumulate cost

    mpsadbw     xmm2, xmm0, 2   ; 010 B : src[8..11]  vs ref[i+8  .. i+11]
    paddw       xmm7, xmm2      ; accumulate cost

    mpsadbw     xmm4, xmm0, 7   ; 111 B : src[12..15] vs ref[i+12 .. i+15]
    paddw       xmm7, xmm4      ; accumulate cost

    add         %1, %3
    add         %2, %4
%endmacro   ; end of SAD_16x16_LINE_SSE41
; SAD_16x16_LINE_SSE41E src, ref, stride_src, stride_ref
;   Same accumulation into xmm7 as SAD_16x16_LINE_SSE41, but for the last
;   line: the pointers are left untouched (%3 and %4 are accepted only to keep
;   the same call shape). Clobbers xmm0-xmm4.
%macro SAD_16x16_LINE_SSE41E  4 ; src, ref, stride_src, stride_ref
    movdqa      xmm0, [%1]
    movdqu      xmm1, [%2]
    movdqu      xmm2, [%2+8h]
    movdqa      xmm3, xmm1
    movdqa      xmm4, xmm2

    mpsadbw     xmm1, xmm0, 0   ; 000 B : src[0..3]   vs ref[i    .. i+3]
    paddw       xmm7, xmm1      ; accumulate cost

    mpsadbw     xmm3, xmm0, 5   ; 101 B : src[4..7]   vs ref[i+4  .. i+7]
    paddw       xmm7, xmm3      ; accumulate cost

    mpsadbw     xmm2, xmm0, 2   ; 010 B : src[8..11]  vs ref[i+8  .. i+11]
    paddw       xmm7, xmm2      ; accumulate cost

    mpsadbw     xmm4, xmm0, 7   ; 111 B : src[12..15] vs ref[i+12 .. i+15]
    paddw       xmm7, xmm4      ; accumulate cost
%endmacro   ; end of SAD_16x16_LINE_SSE41E
1693
; SampleSad16x16Hor8_sse41: 16x16 SAD against eight horizontally adjacent
; reference positions, plus a per-position base cost; returns the smallest
; total and stores its index.
; Register roles as used below (r0..r5 are presumably the six C arguments in
; declaration order, loaded by LOAD_6_PARA -- confirm against asm_inc.asm):
;   r0 src    r1 stride_src    r2 ref    r3 stride_ref
;   r4 base_cost (loaded with movdqa: must be 16-byte aligned)
;   r5 index_min_cost
;   r1 and r2 are reused as scratch once the sixteen lines are done.
WELS_EXTERN SampleSad16x16Hor8_sse41
    %assign  push_num 0
    LOAD_6_PARA
    PUSH_XMM 8
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r3, r3d
    pxor    xmm7,   xmm7            ; xmm7 = eight 16-bit SAD accumulators

    SAD_16x16_LINE_SSE41    r0, r2, r1, r3
    SAD_16x16_LINE_SSE41    r0, r2, r1, r3
    SAD_16x16_LINE_SSE41    r0, r2, r1, r3
    SAD_16x16_LINE_SSE41    r0, r2, r1, r3

    SAD_16x16_LINE_SSE41    r0, r2, r1, r3
    SAD_16x16_LINE_SSE41    r0, r2, r1, r3
    SAD_16x16_LINE_SSE41    r0, r2, r1, r3
    SAD_16x16_LINE_SSE41    r0, r2, r1, r3

    SAD_16x16_LINE_SSE41    r0, r2, r1, r3
    SAD_16x16_LINE_SSE41    r0, r2, r1, r3
    SAD_16x16_LINE_SSE41    r0, r2, r1, r3
    SAD_16x16_LINE_SSE41    r0, r2, r1, r3

    SAD_16x16_LINE_SSE41    r0, r2, r1, r3
    SAD_16x16_LINE_SSE41    r0, r2, r1, r3
    SAD_16x16_LINE_SSE41    r0, r2, r1, r3
    SAD_16x16_LINE_SSE41E   r0, r2, r1, r3  ; last line: no pointer update

    ; widen SADs and base costs to 32 bits before adding them
    pxor    xmm0,   xmm0
    movdqa  xmm6,   xmm7
    punpcklwd   xmm6,   xmm0        ; SAD of positions 0-3
    punpckhwd   xmm7,   xmm0        ; SAD of positions 4-7

    movdqa  xmm5,   [r4]
    movdqa  xmm4,   xmm5
    punpcklwd   xmm4,   xmm0        ; base cost of positions 0-3
    punpckhwd   xmm5,   xmm0        ; base cost of positions 4-7

    paddd   xmm4,   xmm6            ; total cost, positions 0-3
    paddd   xmm5,   xmm7            ; total cost, positions 4-7

    ; reduce to the overall minimum, replicated into all four lanes of xmm2
    movdqa  xmm3,   xmm4
    pminud  xmm3,   xmm5
    pshufd  xmm2,   xmm3,   01001110B   ; swap the two 64-bit halves
    pminud  xmm2,   xmm3
    pshufd  xmm3,   xmm2,   10110001B   ; swap neighbouring dwords
    pminud  xmm2,   xmm3
    movd    retrd,  xmm2

    ; index of the first position whose cost equals the minimum
    pcmpeqd xmm4,   xmm2
    movmskps    r2d, xmm4
    bsf     r1d,    r2d             ; ZF is set when no lane of 0-3 matched
    jnz near WRITE_INDEX

    pcmpeqd xmm5,   xmm2
    movmskps    r2d, xmm5
    bsf     r1d,    r2d
    add     r1d,    4               ; lanes of xmm5 are positions 4-7

WRITE_INDEX:
    mov     [r5],   r1d
    POP_XMM
    LOAD_6_PARA_POP
    ret
1763
;**********************************************************************************************************************************
;
;   uint32_t SampleSad8x8Hor8_sse41( uint8_t *src, int32_t stride_src, uint8_t *ref, int32_t stride_ref, uint16_t base_cost[8], int32_t *index_min_cost )
;
;   \note:
;       16-byte alignment of src and ref is optional because this handles inter 8x8 blocks
;   \return value:
;       returns the minimal SAD cost; the corresponding index is stored through index_min_cost
;
;**********************************************************************************************************************************
; SAD_8x8_LINE_SSE41 src, ref, stride_src, stride_ref
;   One 8-pixel line of an 8-candidate horizontal search: adds to word i of
;   xmm7 the SAD between src[0..7] and ref[i .. i+7], i = 0..7, then steps
;   both pointers to the next line.
;   Both loads are unaligned and 16 bytes wide, although only the first
;   8 bytes of src take part in the result. Clobbers xmm0-xmm2; %1 and %2 are
;   modified.
%macro SAD_8x8_LINE_SSE41  4    ; src, ref, stride_src, stride_ref
    movdqu      xmm0, [%1]
    movdqu      xmm1, [%2]
    movdqa      xmm2, xmm1

    mpsadbw     xmm1, xmm0, 0   ; 000 B : src[0..3] vs ref[i   .. i+3]
    paddw       xmm7, xmm1      ; accumulate cost

    mpsadbw     xmm2, xmm0, 5   ; 101 B : src[4..7] vs ref[i+4 .. i+7]
    paddw       xmm7, xmm2      ; accumulate cost

    add         %1, %3
    add         %2, %4
%endmacro   ; end of SAD_8x8_LINE_SSE41
; SAD_8x8_LINE_SSE41E src, ref, stride_src, stride_ref
;   Same accumulation into xmm7 as SAD_8x8_LINE_SSE41, but for the last line:
;   the pointers are left untouched (%3 and %4 are accepted only to keep the
;   same call shape). Clobbers xmm0-xmm2.
%macro SAD_8x8_LINE_SSE41E  4   ; src, ref, stride_src, stride_ref
    movdqu      xmm0, [%1]
    movdqu      xmm1, [%2]
    movdqa      xmm2, xmm1

    mpsadbw     xmm1, xmm0, 0   ; 000 B : src[0..3] vs ref[i   .. i+3]
    paddw       xmm7, xmm1      ; accumulate cost

    mpsadbw     xmm2, xmm0, 5   ; 101 B : src[4..7] vs ref[i+4 .. i+7]
    paddw       xmm7, xmm2      ; accumulate cost
%endmacro   ; end of SAD_8x8_LINE_SSE41E
1801
; SampleSad8x8Hor8_sse41: 8x8 SAD against eight horizontally adjacent
; reference positions, accumulated on top of the per-position base cost;
; returns the smallest total and stores its index.
; Register roles as used below (r0..r5 are presumably the six C arguments in
; declaration order, loaded by LOAD_6_PARA -- confirm against asm_inc.asm):
;   r0 src    r1 stride_src, later scratch for the index    r2 ref
;   r3 stride_ref    r4 base_cost (loaded with movdqa: must be 16-byte aligned)
;   r5 index_min_cost
; Costs stay in 16-bit lanes throughout (paddw wraps on overflow).
WELS_EXTERN SampleSad8x8Hor8_sse41
    %assign  push_num 0
    LOAD_6_PARA
    PUSH_XMM 8
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r3, r3d
    movdqa  xmm7,   [r4]        ; start the accumulators from the base cost list

    SAD_8x8_LINE_SSE41  r0, r2, r1, r3
    SAD_8x8_LINE_SSE41  r0, r2, r1, r3
    SAD_8x8_LINE_SSE41  r0, r2, r1, r3
    SAD_8x8_LINE_SSE41  r0, r2, r1, r3

    SAD_8x8_LINE_SSE41  r0, r2, r1, r3
    SAD_8x8_LINE_SSE41  r0, r2, r1, r3
    SAD_8x8_LINE_SSE41  r0, r2, r1, r3
    SAD_8x8_LINE_SSE41E r0, r2, r1, r3  ; last line: no pointer update

    phminposuw  xmm0,   xmm7    ; horizontal search for the minimal cost and its index
    movd    retrd,  xmm0        ; bits 15:0 = minimum, bits 31:16 = its index
    mov     r1d,    retrd
    and     retrd,  0xFFFF      ; return value: the minimal cost
    sar     r1d,    16          ; r1d = index of that cost
    mov     [r5],   r1d

    POP_XMM
    LOAD_6_PARA_POP
    ret
1830