• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;
10
11
12%include "vpx_ports/x86_abi_support.asm"
13
14%define xmm_filter_shift            7
15
;unsigned int vp8_get_mb_ss_sse2
;(
;    short *src_ptr
;)
;
; Returns the sum of the squares of the 256 signed 16-bit values starting
; at src_ptr (8 loop iterations x 64 bytes per iteration).
;
; src_ptr is read with movdqa, so it must be 16-byte aligned.
; The total is returned in the low 32 bits of rax.
; Uses rax, rcx and xmm0-xmm4.
global sym(vp8_get_mb_ss_sse2) PRIVATE
sym(vp8_get_mb_ss_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 1
    GET_GOT     rbx
    push rsi
    push rdi
    sub         rsp, 16
    ; end prolog


        mov         rax, arg(0) ;[src_ptr]
        mov         rcx, 8                      ; 8 groups of 32 coefficients
        pxor        xmm4, xmm4                  ; xmm4 = four running dword sums

.NEXTROW:
        movdqa      xmm0, [rax]
        movdqa      xmm1, [rax+16]
        movdqa      xmm2, [rax+32]
        movdqa      xmm3, [rax+48]
        pmaddwd     xmm0, xmm0                  ; square words, add adjacent pairs
        pmaddwd     xmm1, xmm1
        pmaddwd     xmm2, xmm2
        pmaddwd     xmm3, xmm3

        paddd       xmm0, xmm1
        paddd       xmm2, xmm3
        paddd       xmm4, xmm0
        paddd       xmm4, xmm2

        add         rax, 0x40
        ; Loop on ZF only. "dec / ja" also tested CF, which dec does not
        ; write, so the branch depended on the carry left by the add above.
        sub         rcx, 1
        jnz         .NEXTROW

        ; horizontal add of the four dword sums into the low dword
        movdqa      xmm3,xmm4
        psrldq      xmm4,8
        paddd       xmm4,xmm3
        movdqa      xmm3,xmm4
        psrldq      xmm4,4
        paddd       xmm4,xmm3
        movq        rax,xmm4                    ; low dword holds the total


    ; begin epilog
    add rsp, 16
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
72
73
;unsigned int vp8_get16x16var_sse2
;(
;    unsigned char   *  src_ptr,
;    int             source_stride,
;    unsigned char   *  ref_ptr,
;    int             recon_stride,
;    unsigned int    *  SSE,
;    int             *  Sum
;)
;
; Walks 16 rows of 16 bytes from src_ptr and ref_ptr and writes
;   *Sum = sum of (src - ref)
;   *SSE = sum of (src - ref)^2
; Rows are loaded with movdqu, so no alignment is required.
;
; Register roles inside the loop:
;   rsi / rdi  current source / reference row
;   rax / rdx  source / reference stride (sign-extended)
;   rcx        rows remaining
;   xmm0       zero, used for byte->word unpacking
;   xmm6       four dword partial sums of squared differences
;   xmm7       eight word partial sums of differences
global sym(vp8_get16x16var_sse2) PRIVATE
sym(vp8_get16x16var_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push rbx
    push rsi
    push rdi
    ; end prolog

        mov         rsi,            arg(0)              ; src_ptr
        mov         rdi,            arg(2)              ; ref_ptr

        movsxd      rax,            DWORD PTR arg(1)    ; source_stride
        movsxd      rdx,            DWORD PTR arg(3)    ; recon_stride

        ; Warm the cache with rows 0..7 of the source block ...
        lea             rcx,    [rax+rax*2]             ; rcx = 3 * source_stride
        prefetcht0      [rsi]
        prefetcht0      [rsi+rax]
        prefetcht0      [rsi+rax*2]
        prefetcht0      [rsi+rcx]
        lea             rbx,    [rsi+rax*4]             ; rbx = row 4 of source
        prefetcht0      [rbx]
        prefetcht0      [rbx+rax]
        prefetcht0      [rbx+rax*2]
        prefetcht0      [rbx+rcx]

        ; ... and rows 0..7 of the reference block.
        lea             rcx,    [rdx+rdx*2]             ; rcx = 3 * recon_stride
        prefetcht0      [rdi]
        prefetcht0      [rdi+rdx]
        prefetcht0      [rdi+rdx*2]
        prefetcht0      [rdi+rcx]
        lea             rbx,    [rdi+rdx*4]             ; rbx = row 4 of reference
        prefetcht0      [rbx]
        prefetcht0      [rbx+rdx]
        prefetcht0      [rbx+rdx*2]
        prefetcht0      [rbx+rcx]

        mov         rcx,            16                  ; row counter
        pxor        xmm0,           xmm0                ; zero for unpacking
        pxor        xmm6,           xmm6                ; SSE accumulator
        pxor        xmm7,           xmm7                ; difference accumulator

.next_row:
        movdqu      xmm1,           XMMWORD PTR [rsi]
        movdqu      xmm2,           XMMWORD PTR [rdi]

        prefetcht0      [rsi+rax*8]                     ; 8 rows ahead
        prefetcht0      [rdi+rdx*8]

        movdqa      xmm3,           xmm1                ; copies for the high halves
        movdqa      xmm4,           xmm2

        ; low 8 pixels: widen to words and subtract
        punpcklbw   xmm1,           xmm0
        punpcklbw   xmm2,           xmm0
        psubw       xmm1,           xmm2

        ; high 8 pixels: widen to words and subtract
        punpckhbw   xmm3,           xmm0
        punpckhbw   xmm4,           xmm0
        psubw       xmm3,           xmm4

        ; accumulate differences, then their squares
        paddw       xmm7,           xmm1
        paddw       xmm7,           xmm3

        pmaddwd     xmm1,           xmm1
        pmaddwd     xmm3,           xmm3

        paddd       xmm6,           xmm1
        paddd       xmm6,           xmm3

        add         rsi,            rax
        add         rdi,            rdx

        sub         rcx,            1
        jnz         .next_row


        movdqa      xmm1,           xmm6                ; xmm1 = SSE partial sums

        ; --- Sum: sign-extend the 8 word lanes to dwords and add them ---
        pxor        xmm6,           xmm6
        pxor        xmm5,           xmm5
        punpcklwd   xmm6,           xmm7                ; word -> high half of dword
        punpckhwd   xmm5,           xmm7
        psrad       xmm6,           16                  ; arithmetic shift = sign extend
        psrad       xmm5,           16
        paddd       xmm6,           xmm5                ; 4 dword partial sums

        movdqa      xmm7,           xmm6
        punpckldq   xmm6,           xmm0
        punpckhdq   xmm7,           xmm0
        paddd       xmm6,           xmm7                ; partial sums in dwords 0 and 2

        movdqa      xmm7,           xmm6
        psrldq      xmm6,           8
        paddd       xmm7,           xmm6                ; low dword of xmm7 = Sum

        ; --- SSE: add the 4 dword lanes ---
        movdqa      xmm2,           xmm1
        punpckldq   xmm1,           xmm0
        punpckhdq   xmm2,           xmm0
        paddd       xmm1,           xmm2                ; partial sums in dwords 0 and 2

        movdqa      xmm2,           xmm1
        psrldq      xmm1,           8
        paddd       xmm1,           xmm2                ; low dword of xmm1 = SSE

        mov         rax,            arg(5)              ; Sum
        mov         rdi,            arg(4)              ; SSE

        movd DWORD PTR [rax],       xmm7
        movd DWORD PTR [rdi],       xmm1


    ; begin epilog
    pop rdi
    pop rsi
    pop rbx
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
214
215
216
217
;unsigned int vp8_get8x8var_sse2
;(
;    unsigned char   *  src_ptr,
;    int             source_stride,
;    unsigned char   *  ref_ptr,
;    int             recon_stride,
;    unsigned int    *  SSE,
;    int             *  Sum
;)
;
; Reads 8 rows of 8 bytes from src_ptr and ref_ptr and writes
;   *Sum = sum of (src - ref)
;   *SSE = sum of (src - ref)^2
; The row loop is fully unrolled.
;
; Register roles:
;   rsi / rdi  source / reference pointer, advanced two rows at a time
;   rax / rdx  source / reference stride (sign-extended)
;   xmm0       zero, used for unpacking
;   xmm1       four dword partial sums of squared differences
;   xmm7       eight word partial sums of differences

; Fold one 8-pixel row into the accumulators.
;   %1 = memory operand of the source row
;   %2 = memory operand of the reference row
; Expects xmm0 = 0; updates xmm1 and xmm7; overwrites xmm2 and xmm3.
%macro VAR8X8_ADD_ROW 2
        movq        xmm2,           QWORD PTR %1
        movq        xmm3,           QWORD PTR %2

        punpcklbw   xmm2,           xmm0
        punpcklbw   xmm3,           xmm0

        psubsw      xmm2,           xmm3
        paddw       xmm7,           xmm2

        pmaddwd     xmm2,           xmm2
        paddd       xmm1,           xmm2
%endmacro

global sym(vp8_get8x8var_sse2) PRIVATE
sym(vp8_get8x8var_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push rsi
    push rdi
    sub         rsp, 16
    ; end prolog

        mov         rsi,            arg(0)              ; src_ptr
        mov         rdi,            arg(2)              ; ref_ptr

        movsxd      rax,            DWORD PTR arg(1)    ; source_stride
        movsxd      rdx,            DWORD PTR arg(3)    ; recon_stride

        pxor        xmm0,           xmm0                ; zero for unpacking
        pxor        xmm7,           xmm7                ; difference accumulator

        ; Row 0 seeds the SSE accumulator in xmm1 directly.
        movq        xmm1,           QWORD PTR [rsi]
        movq        xmm2,           QWORD PTR [rdi]

        punpcklbw   xmm1,           xmm0
        punpcklbw   xmm2,           xmm0

        psubsw      xmm1,           xmm2
        paddw       xmm7,           xmm1

        pmaddwd     xmm1,           xmm1

        ; Rows 1 and 2
        VAR8X8_ADD_ROW [rsi + rax], [rdi + rdx]
        VAR8X8_ADD_ROW [rsi + rax * 2], [rdi + rdx * 2]

        ; Rows 3 and 4
        lea         rsi,            [rsi + rax * 2]
        lea         rdi,            [rdi + rdx * 2]
        VAR8X8_ADD_ROW [rsi + rax], [rdi + rdx]
        VAR8X8_ADD_ROW [rsi + rax * 2], [rdi + rdx * 2]

        ; Rows 5 and 6
        lea         rsi,            [rsi + rax * 2]
        lea         rdi,            [rdi + rdx * 2]
        VAR8X8_ADD_ROW [rsi + rax], [rdi + rdx]
        VAR8X8_ADD_ROW [rsi + rax * 2], [rdi + rdx * 2]

        ; Row 7
        lea         rsi,            [rsi + rax * 2]
        lea         rdi,            [rdi + rdx * 2]
        VAR8X8_ADD_ROW [rsi + rax], [rdi + rdx]

        ; --- Sum: fold the 8 word lanes of xmm7 into its low word ---
        movdqa      xmm6,           xmm7
        punpcklwd   xmm6,           xmm0                ; w0,0,w1,0,w2,0,w3,0
        punpckhwd   xmm7,           xmm0                ; w4,0,w5,0,w6,0,w7,0
        paddw       xmm6,           xmm7

        movdqa      xmm7,           xmm6
        punpckldq   xmm6,           xmm0
        punpckhdq   xmm7,           xmm0
        paddw       xmm6,           xmm7

        movdqa      xmm7,           xmm6
        psrldq      xmm6,           8
        paddw       xmm7,           xmm6                ; low word of xmm7 = Sum (16-bit)

        ; --- SSE: fold the 4 dword lanes of xmm1 into its low dword ---
        movdqa      xmm2,           xmm1
        punpckldq   xmm1,           xmm0
        punpckhdq   xmm2,           xmm0
        paddd       xmm1,           xmm2

        movdqa      xmm2,           xmm1
        psrldq      xmm1,           8
        paddd       xmm1,           xmm2                ; low dword of xmm1 = SSE

        mov         rax,            arg(5)              ; Sum
        mov         rdi,            arg(4)              ; SSE

        movq        rdx,            xmm7
        movsx       rcx,            dx                  ; sign-extend the 16-bit total

        mov  dword ptr [rax],       ecx
        movd DWORD PTR [rdi],       xmm1

    ; begin epilog
    add rsp, 16
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
402
;void vp8_filter_block2d_bil_var_sse2
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int  xoffset,
;    int  yoffset,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; For an 8-pixel-wide, Height-row block: bilinearly filters ref_ptr
; (horizontal taps selected by xoffset, vertical taps by yoffset), takes
; the difference against src_ptr and writes
;   *sum        = sum of differences
;   *sumsquared = sum of squared differences
;
; A zero offset skips the corresponding filter pass, giving four paths:
;   both passes, vertical only (.sp_only), horizontal only (.fp_only),
;   and no filtering (.full_pixel).
; The horizontal pass reads [row] and [row+1] (9 bytes per row); the
; vertical pass reads Height+1 reference rows.
;
; Accumulators shared by all paths:
;   xmm6  eight word partial sums of differences
;   xmm7  four dword partial sums of squared differences
;   xmm4  rounding constant loaded from xmm_bi_rd
;   xmm0  zero, used for unpacking
global sym(vp8_filter_block2d_bil_var_sse2) PRIVATE
sym(vp8_filter_block2d_bil_var_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 9
    SAVE_XMM 7
    GET_GOT     rbx
    push rsi
    push rdi
    push rbx
    ; end prolog

        pxor            xmm6,           xmm6                 ; difference accumulator
        pxor            xmm7,           xmm7                 ; squared-difference accumulator

        lea             rsi,            [GLOBAL(xmm_bi_rd)]  ; rounding
        movdqa          xmm4,           XMMWORD PTR [rsi]

        lea             rcx,            [GLOBAL(vp8_bilinear_filters_sse2)]
        movsxd          rax,            dword ptr arg(5)     ; xoffset

        test            rax,            rax                  ; skip first_pass filter if xoffset=0
        je              .sp_only

        shl             rax,            5                    ; 32 bytes of taps per offset
        lea             rax,            [rax + rcx]          ; HFilter

        movsxd          rdx,            dword ptr arg(6)     ; yoffset

        test            rdx,            rdx                  ; skip second_pass filter if yoffset=0
        je              .fp_only

        shl             rdx,            5
        lea             rdx,            [rdx + rcx]          ; VFilter

        mov             rsi,            arg(0)               ;ref_ptr
        mov             rdi,            arg(2)               ;src_ptr
        movsxd          rcx,            dword ptr arg(4)     ;Height

        ; Horizontally filter row 0; it becomes the "previous row" (xmm5)
        ; for the first vertical filter step.
        pxor            xmm0,           xmm0
        movq            xmm1,           QWORD PTR [rsi]
        movq            xmm3,           QWORD PTR [rsi+1]

        punpcklbw       xmm1,           xmm0
        pmullw          xmm1,           [rax]
        punpcklbw       xmm3,           xmm0
        pmullw          xmm3,           [rax+16]

        paddw           xmm1,           xmm3
        paddw           xmm1,           xmm4                 ; round
        psraw           xmm1,           xmm_filter_shift
        movdqa          xmm5,           xmm1

        movsxd          rbx,            dword ptr arg(1) ;ref_pixels_per_line
        lea             rsi,            [rsi + rbx]
%if ABI_IS_32BIT=0
        movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line
%endif

.both_pass_loop:
        ; horizontal pass on the current reference row
        movq            xmm1,           QWORD PTR [rsi]
        movq            xmm3,           QWORD PTR [rsi+1]

        punpcklbw       xmm1,           xmm0
        pmullw          xmm1,           [rax]
        punpcklbw       xmm3,           xmm0
        pmullw          xmm3,           [rax+16]

        paddw           xmm1,           xmm3
        paddw           xmm1,           xmm4
        psraw           xmm1,           xmm_filter_shift

        movdqa          xmm3,           xmm5                 ; previous filtered row
        movdqa          xmm5,           xmm1                 ; keep current row for next iteration

        ; vertical pass between previous and current filtered rows
        pmullw          xmm3,           [rdx]
        pmullw          xmm1,           [rdx+16]
        paddw           xmm1,           xmm3
        paddw           xmm1,           xmm4
        psraw           xmm1,           xmm_filter_shift

        movq            xmm3,           QWORD PTR [rdi]
        punpcklbw       xmm3,           xmm0

        psubw           xmm1,           xmm3                 ; filtered ref - src
        paddw           xmm6,           xmm1

        pmaddwd         xmm1,           xmm1
        paddd           xmm7,           xmm1

        lea             rsi,            [rsi + rbx]          ;ref_pixels_per_line
%if ABI_IS_32BIT
        add             rdi,            dword ptr arg(3)     ;src_pixels_per_line
%else
        lea             rdi,            [rdi + r9]
%endif

        sub             rcx,            1
        jnz             .both_pass_loop

        jmp             .variance

.sp_only:
        ; xoffset == 0: vertical filter only (rcx still holds the filter table)
        movsxd          rdx,            dword ptr arg(6)     ; yoffset

        test            rdx,            rdx                  ; skip all if both xoffset=0 and yoffset=0
        je              .full_pixel

        shl             rdx,            5
        lea             rdx,            [rdx + rcx]          ; VFilter

        mov             rsi,            arg(0)               ;ref_ptr
        mov             rdi,            arg(2)               ;src_ptr
        movsxd          rcx,            dword ptr arg(4)     ;Height
        movsxd          rax,            dword ptr arg(1)     ;ref_pixels_per_line

        pxor            xmm0,           xmm0
        movq            xmm1,           QWORD PTR [rsi]      ; row 0 = first "previous row"
        punpcklbw       xmm1,           xmm0

        movsxd          rbx,            dword ptr arg(3)     ;src_pixels_per_line
        lea             rsi,            [rsi + rax]

.sp_only_loop:
        movq            xmm3,           QWORD PTR [rsi]
        punpcklbw       xmm3,           xmm0
        movdqa          xmm5,           xmm3                 ; keep current row for next iteration

        pmullw          xmm1,           [rdx]
        pmullw          xmm3,           [rdx+16]
        paddw           xmm1,           xmm3
        paddw           xmm1,           xmm4
        psraw           xmm1,           xmm_filter_shift

        movq            xmm3,           QWORD PTR [rdi]
        punpcklbw       xmm3,           xmm0

        psubw           xmm1,           xmm3
        paddw           xmm6,           xmm1

        pmaddwd         xmm1,           xmm1
        paddd           xmm7,           xmm1

        movdqa          xmm1,           xmm5
        lea             rsi,            [rsi + rax]          ;ref_pixels_per_line
        lea             rdi,            [rdi + rbx]          ;src_pixels_per_line

        sub             rcx,            1
        jnz             .sp_only_loop

        jmp             .variance

.full_pixel:
        ; xoffset == 0 and yoffset == 0: plain difference, no filtering
        mov             rsi,            arg(0)               ;ref_ptr
        mov             rdi,            arg(2)               ;src_ptr
        movsxd          rcx,            dword ptr arg(4)     ;Height
        movsxd          rax,            dword ptr arg(1)     ;ref_pixels_per_line
        movsxd          rbx,            dword ptr arg(3)     ;src_pixels_per_line
        pxor            xmm0,           xmm0

.full_pixel_loop:
        movq            xmm1,           QWORD PTR [rsi]
        punpcklbw       xmm1,           xmm0

        movq            xmm2,           QWORD PTR [rdi]
        punpcklbw       xmm2,           xmm0

        psubw           xmm1,           xmm2
        paddw           xmm6,           xmm1

        pmaddwd         xmm1,           xmm1
        paddd           xmm7,           xmm1

        lea             rsi,            [rsi + rax]          ;ref_pixels_per_line
        lea             rdi,            [rdi + rbx]          ;src_pixels_per_line

        sub             rcx,            1
        jnz             .full_pixel_loop

        jmp             .variance

.fp_only:
        ; yoffset == 0: horizontal filter only (rax = HFilter)
        mov             rsi,            arg(0)               ;ref_ptr
        mov             rdi,            arg(2)               ;src_ptr
        movsxd          rcx,            dword ptr arg(4)     ;Height
        movsxd          rdx,            dword ptr arg(1)     ;ref_pixels_per_line

        pxor            xmm0,           xmm0
        movsxd          rbx,            dword ptr arg(3)     ;src_pixels_per_line

.fp_only_loop:
        movq            xmm1,           QWORD PTR [rsi]
        movq            xmm3,           QWORD PTR [rsi+1]

        punpcklbw       xmm1,           xmm0
        pmullw          xmm1,           [rax]
        punpcklbw       xmm3,           xmm0
        pmullw          xmm3,           [rax+16]

        paddw           xmm1,           xmm3
        paddw           xmm1,           xmm4
        psraw           xmm1,           xmm_filter_shift

        movq            xmm3,           QWORD PTR [rdi]
        punpcklbw       xmm3,           xmm0

        psubw           xmm1,           xmm3
        paddw           xmm6,           xmm1

        pmaddwd         xmm1,           xmm1
        paddd           xmm7,           xmm1
        lea             rsi,            [rsi + rdx]          ;ref_pixels_per_line
        lea             rdi,            [rdi + rbx]          ;src_pixels_per_line

        sub             rcx,            1
        jnz             .fp_only_loop
        ; falls through to .variance

.variance:
        ; Horizontal reduction using SSE2 registers only. MMX registers are
        ; deliberately avoided: touching them marks the x87 register stack
        ; as in use, and this function never executes EMMS.

        ; sum: sign-extend the 8 word lanes of xmm6 to dwords, then add
        pxor            xmm0,           xmm0
        pxor            xmm1,           xmm1
        punpcklwd       xmm0,           xmm6                 ; word -> high half of dword
        punpckhwd       xmm1,           xmm6
        psrad           xmm0,           16                   ; arithmetic shift = sign extend
        psrad           xmm1,           16
        paddd           xmm0,           xmm1                 ; 4 dword partial sums

        movdqa          xmm1,           xmm0
        psrldq          xmm1,           8
        paddd           xmm0,           xmm1
        movdqa          xmm1,           xmm0
        psrldq          xmm1,           4
        paddd           xmm0,           xmm1                 ; low dword = xsum

        ; sum of squares: add the 4 dword lanes of xmm7
        movdqa          xmm1,           xmm7
        psrldq          xmm1,           8
        paddd           xmm7,           xmm1
        movdqa          xmm1,           xmm7
        psrldq          xmm1,           4
        paddd           xmm7,           xmm1                 ; low dword = xxsum

        mov             rsi,            arg(7) ; sum
        mov             rdi,            arg(8) ; sumsquared

        movd            [rsi],          xmm0   ; xsum
        movd            [rdi],          xmm7   ; xxsum

    ; begin epilog
    pop rbx
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
681
682
;void vp8_half_horiz_vert_variance8x_h_sse2
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; For an 8-pixel-wide, Height-row block: averages each reference pixel
; with its right neighbour (pavgb), averages that result with the same
; average from the row below, takes the difference against src_ptr and
; writes
;   *sum        = sum of differences
;   *sumsquared = sum of squared differences
; Reads 9 bytes per reference row and Height+1 reference rows.
;
; Register roles inside the loop:
;   rsi / rdi  current reference / source row
;   rax / rdx  reference / source stride (sign-extended)
;   rcx        rows remaining
;   xmm0       zero, used for unpacking
;   xmm5       horizontal average of the previous reference row
;   xmm6       eight word partial sums of differences
;   xmm7       four dword partial sums of squared differences
global sym(vp8_half_horiz_vert_variance8x_h_sse2) PRIVATE
sym(vp8_half_horiz_vert_variance8x_h_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push rsi
    push rdi
    ; end prolog

        pxor            xmm6,           xmm6                ;  error accumulator
        pxor            xmm7,           xmm7                ;  sse accumulator
        mov             rsi,            arg(0) ;ref_ptr

        mov             rdi,            arg(2) ;src_ptr
        movsxd          rcx,            dword ptr arg(4) ;Height
        movsxd          rax,            dword ptr arg(1) ;ref_pixels_per_line
        movsxd          rdx,            dword ptr arg(3) ;src_pixels_per_line

        pxor            xmm0,           xmm0

        movq            xmm5,           QWORD PTR [rsi]     ;  xmm5 = s0,s1,s2..s7
        movq            xmm3,           QWORD PTR [rsi+1]   ;  xmm3 = s1,s2,s3..s8
        pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm5,xmm3) horizontal line 0

        lea             rsi,            [rsi + rax]         ;  next reference row

.next_row:
        movq            xmm1,           QWORD PTR [rsi]
        movq            xmm2,           QWORD PTR [rsi+1]
        pavgb           xmm1,           xmm2                ;  xmm1 = avg(xmm1,xmm2) horizontal line i+1

        pavgb           xmm5,           xmm1                ;  xmm5 = vertical average of lines i and i+1
        punpcklbw       xmm5,           xmm0                ;  xmm5 = words of above

        movq            xmm3,           QWORD PTR [rdi]     ;  xmm3 = d0,d1,d2..d7
        punpcklbw       xmm3,           xmm0                ;  xmm3 = words of above

        psubw           xmm5,           xmm3                ;  xmm5 -= xmm3
        paddw           xmm6,           xmm5                ;  accumulate column differences
        pmaddwd         xmm5,           xmm5                ;  square and pair-add
        paddd           xmm7,           xmm5                ;  accumulate squared differences

        movdqa          xmm5,           xmm1                ;  keep this row's horizontal average for the next row

        lea             rsi,            [rsi + rax]
        lea             rdi,            [rdi + rdx]

        sub             rcx,            1
        jnz             .next_row

        ; Horizontal reduction using SSE2 registers only. MMX registers are
        ; deliberately avoided: touching them marks the x87 register stack
        ; as in use, and this function never executes EMMS.

        ; sum: sign-extend the 8 word lanes of xmm6 to dwords, then add
        pxor            xmm0,           xmm0
        pxor            xmm1,           xmm1
        punpcklwd       xmm0,           xmm6                ; word -> high half of dword
        punpckhwd       xmm1,           xmm6
        psrad           xmm0,           16                  ; arithmetic shift = sign extend
        psrad           xmm1,           16
        paddd           xmm0,           xmm1                ; 4 dword partial sums

        movdqa          xmm1,           xmm0
        psrldq          xmm1,           8
        paddd           xmm0,           xmm1
        movdqa          xmm1,           xmm0
        psrldq          xmm1,           4
        paddd           xmm0,           xmm1                ; low dword = sum

        ; sum of squares: add the 4 dword lanes of xmm7
        movdqa          xmm1,           xmm7
        psrldq          xmm1,           8
        paddd           xmm7,           xmm1
        movdqa          xmm1,           xmm7
        psrldq          xmm1,           4
        paddd           xmm7,           xmm1                ; low dword = sumsquared

        mov             rsi,            arg(5) ; sum
        mov             rdi,            arg(6) ; sumsquared

        movd            [rsi],          xmm0
        movd            [rdi],          xmm7


    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
804
;void vp8_half_horiz_vert_variance16x_h_sse2
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; 16-pixel-wide half-pel (horizontal + vertical) variance helper.
; Each reference row is averaged with itself shifted by one byte, then
; with the same average of the row below; the result is differenced
; against src_ptr and
;   *sum        receives the sum of differences
;   *sumsquared receives the sum of squared differences
; Reads 17 bytes per reference row and Height+1 reference rows.
;
; Register roles inside the loop:
;   rsi / rdi  current reference / source row
;   rax / rdx  reference / source stride (sign-extended)
;   rcx        rows remaining
;   xmm0       zero, used for unpacking
;   xmm5       horizontal average of the previous reference row
;   xmm6       eight word partial sums of differences
;   xmm7       four dword partial sums of squared differences
global sym(vp8_half_horiz_vert_variance16x_h_sse2) PRIVATE
sym(vp8_half_horiz_vert_variance16x_h_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push rsi
    push rdi
    ; end prolog

        mov             rsi,            arg(0)              ; ref_ptr
        mov             rdi,            arg(2)              ; src_ptr
        movsxd          rax,            dword ptr arg(1)    ; ref_pixels_per_line
        movsxd          rdx,            dword ptr arg(3)    ; src_pixels_per_line
        movsxd          rcx,            dword ptr arg(4)    ; Height

        pxor            xmm0,           xmm0                ; zero for unpacking
        pxor            xmm6,           xmm6                ; difference accumulator
        pxor            xmm7,           xmm7                ; squared-difference accumulator

        ; horizontal average of reference row 0
        movdqu          xmm5,           XMMWORD PTR [rsi]
        movdqu          xmm3,           XMMWORD PTR [rsi+1]
        pavgb           xmm5,           xmm3

        lea             rsi,            [rsi + rax]

.next_row:
        ; horizontal average of the next reference row
        movdqu          xmm1,           XMMWORD PTR [rsi]
        movdqu          xmm2,           XMMWORD PTR [rsi+1]
        pavgb           xmm1,           xmm2

        pavgb           xmm5,           xmm1                ; vertical average with the previous row
        movdqa          xmm4,           xmm5                ; copy for the upper 8 pixels

        ; lower 8 pixels
        punpcklbw       xmm5,           xmm0
        movq            xmm3,           QWORD PTR [rdi]
        punpcklbw       xmm3,           xmm0
        psubw           xmm5,           xmm3
        paddw           xmm6,           xmm5
        pmaddwd         xmm5,           xmm5
        paddd           xmm7,           xmm5

        ; upper 8 pixels
        punpckhbw       xmm4,           xmm0
        movq            xmm3,           QWORD PTR [rdi+8]
        punpcklbw       xmm3,           xmm0
        psubw           xmm4,           xmm3
        paddw           xmm6,           xmm4
        pmaddwd         xmm4,           xmm4
        paddd           xmm7,           xmm4

        movdqa          xmm5,           xmm1                ; this row's average feeds the next iteration

        lea             rsi,            [rsi + rax]
        lea             rdi,            [rdi + rdx]

        sub             rcx,            1
        jnz             .next_row

        ; --- sum: sign-extend the 8 word lanes of xmm6 and add them ---
        ; xmm0 is still zero here, so the punpck places each word in the
        ; high half of a dword and psrad sign-extends it.
        pxor        xmm1,           xmm1
        pxor        xmm5,           xmm5

        punpcklwd   xmm0,           xmm6
        punpckhwd   xmm1,           xmm6
        psrad       xmm0,           16
        psrad       xmm1,           16
        paddd       xmm0,           xmm1                    ; 4 dword partial sums

        movdqa      xmm1,           xmm0
        punpckldq   xmm0,           xmm5
        punpckhdq   xmm1,           xmm5
        paddd       xmm0,           xmm1                    ; partial sums in dwords 0 and 2

        movdqa      xmm1,           xmm0
        psrldq      xmm1,           8
        paddd       xmm0,           xmm1                    ; low dword = sum

        ; --- sum of squares: add the 4 dword lanes of xmm7 ---
        movdqa      xmm6,           xmm7
        punpckldq   xmm6,           xmm5
        punpckhdq   xmm7,           xmm5
        paddd       xmm6,           xmm7                    ; partial sums in dwords 0 and 2

        movdqa      xmm7,           xmm6
        psrldq      xmm7,           8
        paddd       xmm6,           xmm7                    ; low dword = sumsquared

        mov         rsi,            arg(5) ;[Sum]
        mov         rdi,            arg(6) ;[SSE]

        movd        [rsi],       xmm0
        movd        [rdi],       xmm6

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
919
920
;void vp8_half_vert_variance8x_h_sse2
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; For an 8-pixel-wide block of Height rows, forms a prediction by averaging
; each ref row with the ref row directly below it (pavgb, i.e. rounding up),
; subtracts the matching src row and stores
;     *sum        = sum of (prediction - src)
;     *sumsquared = sum of (prediction - src)^2
; Rows 0..Height of ref are read (one more than Height).
; Height must be non-zero: the row counter is only tested after the decrement.
; Column sums are kept as eight signed 16-bit lanes while looping and are
; widened to 32 bits before the final horizontal add.
; The whole routine stays in XMM registers; no MMX state is touched, so no
; emms is needed afterwards.
global sym(vp8_half_vert_variance8x_h_sse2) PRIVATE
sym(vp8_half_vert_variance8x_h_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push rsi
    push rdi
    ; end prolog

        pxor            xmm6,           xmm6                ;  8 x int16 column sums of differences
        pxor            xmm7,           xmm7                ;  4 x int32 partial sums of squared differences
        mov             rsi,            arg(0)              ;ref_ptr
        mov             rdi,            arg(2)              ;src_ptr
        movsxd          rcx,            dword ptr arg(4)    ;Height (row counter)
        movsxd          rax,            dword ptr arg(1)    ;ref_pixels_per_line
        movsxd          rdx,            dword ptr arg(3)    ;src_pixels_per_line

        pxor            xmm0,           xmm0                ;  zero, used to widen bytes to words

.next_row:
        movq            xmm5,           QWORD PTR [rsi]     ;  ref row i,   8 pixels
        movq            xmm3,           QWORD PTR [rsi+rax] ;  ref row i+1, 8 pixels

        pavgb           xmm5,           xmm3                ;  vertical half-pel prediction
        punpcklbw       xmm5,           xmm0                ;  prediction as words

        movq            xmm3,           QWORD PTR [rdi]     ;  src row i, 8 pixels
        punpcklbw       xmm3,           xmm0                ;  src as words

        psubw           xmm5,           xmm3                ;  diff = prediction - src
        paddw           xmm6,           xmm5                ;  accumulate column differences
        pmaddwd         xmm5,           xmm5                ;  pairwise sums of diff^2
        paddd           xmm7,           xmm5                ;  accumulate squared differences

        lea             rsi,            [rsi + rax]         ;  next ref row
        lea             rdi,            [rdi + rdx]         ;  next src row

        sub             rcx,            1
        jnz             .next_row

        ; Sign-extend the eight word sums to dwords. xmm0 is still zero, so
        ; after the unpack each word sits in the upper half of a dword and the
        ; arithmetic shift brings it back down with its sign.
        pxor            xmm1,           xmm1
        punpcklwd       xmm0,           xmm6
        punpckhwd       xmm1,           xmm6
        psrad           xmm0,           16
        psrad           xmm1,           16
        paddd           xmm0,           xmm1                ;  4 x int32 partial sums

        ; Fold both accumulators 4 dwords -> 2 -> 1.
        movdqa          xmm1,           xmm0
        movdqa          xmm6,           xmm7
        psrldq          xmm1,           8
        psrldq          xmm6,           8
        paddd           xmm0,           xmm1
        paddd           xmm7,           xmm6

        movdqa          xmm1,           xmm0
        movdqa          xmm6,           xmm7
        psrldq          xmm1,           4
        psrldq          xmm6,           4
        paddd           xmm0,           xmm1                ;  low dword = sum
        paddd           xmm7,           xmm6                ;  low dword = sum of squares

        mov             rsi,            arg(5) ; sum
        mov             rdi,            arg(6) ; sumsquared

        movd            [rsi],          xmm0
        movd            [rdi],          xmm7

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
1027
;void vp8_half_vert_variance16x_h_sse2
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; 16-pixel-wide vertical half-pel variance helper. Each ref row is averaged
; (pavgb) with the ref row below it, the src row is subtracted, and the sum of
; the differences / sum of the squared differences over Height rows are
; written to *sum / *sumsquared. Rows 0..Height of ref are read.
; The row counter is tested after the decrement, so Height must be non-zero.
global sym(vp8_half_vert_variance16x_h_sse2) PRIVATE
sym(vp8_half_vert_variance16x_h_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push rsi
    push rdi
    ; end prolog

        mov             rsi,            arg(0)              ;ref_ptr
        mov             rdi,            arg(2)              ;src_ptr
        movsxd          rax,            dword ptr arg(1)    ;ref_pixels_per_line
        movsxd          rdx,            dword ptr arg(3)    ;src_pixels_per_line
        movsxd          rcx,            dword ptr arg(4)    ;Height (row counter)

        pxor            xmm0,           xmm0                ;  zero, used to widen bytes to words
        pxor            xmm6,           xmm6                ;  8 x int16 sums of differences
        pxor            xmm7,           xmm7                ;  4 x int32 sums of squared differences

        movdqu          xmm5,           XMMWORD PTR [rsi]   ;  ref row 0, carried into the loop
        add             rsi,            rax                 ;  rsi -> ref row 1

.next_row:
        movdqu          xmm3,           XMMWORD PTR [rsi]   ;  ref row i+1
        movq            xmm1,           QWORD PTR [rdi]     ;  src pixels 0..7
        movq            xmm2,           QWORD PTR [rdi+8]   ;  src pixels 8..15

        pavgb           xmm5,           xmm3                ;  prediction = avg(row i, row i+1)
        punpcklbw       xmm1,           xmm0                ;  src low half as words
        punpcklbw       xmm2,           xmm0                ;  src high half as words

        movdqa          xmm4,           xmm5
        punpcklbw       xmm5,           xmm0                ;  prediction low half as words
        punpckhbw       xmm4,           xmm0                ;  prediction high half as words

        psubw           xmm5,           xmm1                ;  differences, pixels 0..7
        psubw           xmm4,           xmm2                ;  differences, pixels 8..15

        paddw           xmm6,           xmm5                ;  accumulate differences
        paddw           xmm6,           xmm4
        pmaddwd         xmm5,           xmm5                ;  pairwise sums of squares
        pmaddwd         xmm4,           xmm4
        paddd           xmm7,           xmm5                ;  accumulate squared differences
        paddd           xmm7,           xmm4

        movdqa          xmm5,           xmm3                ;  row i+1 becomes row i for the next pass

        add             rsi,            rax
        add             rdi,            rdx

        dec             rcx
        jnz             .next_row

        ; Widen the word sums to dwords: xmm0 is still zero, so each word is
        ; unpacked into the top half of a dword and shifted back down with sign.
        pxor            xmm1,           xmm1
        punpcklwd       xmm0,           xmm6
        punpckhwd       xmm1,           xmm6
        psrad           xmm0,           16
        psrad           xmm1,           16
        paddd           xmm0,           xmm1                ;  4 x int32 partial sums

        ; Fold both accumulators 4 dwords -> 2 -> 1.
        movdqa          xmm1,           xmm0
        movdqa          xmm6,           xmm7
        psrldq          xmm1,           8
        psrldq          xmm6,           8
        paddd           xmm0,           xmm1
        paddd           xmm7,           xmm6

        movdqa          xmm1,           xmm0
        movdqa          xmm6,           xmm7
        psrldq          xmm1,           4
        psrldq          xmm6,           4
        paddd           xmm0,           xmm1                ;  low dword = Sum
        paddd           xmm7,           xmm6                ;  low dword = SSE

        mov             rsi,            arg(5) ;[Sum]
        mov             rdi,            arg(6) ;[SSE]

        movd            [rsi],          xmm0
        movd            [rdi],          xmm7

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
1134
1135
;void vp8_half_horiz_variance8x_h_sse2
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; For an 8-pixel-wide block of Height rows, forms a prediction by averaging
; each ref pixel with its right-hand neighbour (pavgb, i.e. rounding up),
; subtracts the matching src row and stores
;     *sum        = sum of (prediction - src)
;     *sumsquared = sum of (prediction - src)^2
; Nine ref bytes are read per row (offsets 0..8).
; Height must be non-zero: the row counter is only tested after the decrement.
; Column sums are kept as eight signed 16-bit lanes while looping and are
; widened to 32 bits before the final horizontal add.
; The whole routine stays in XMM registers; no MMX state is touched, so no
; emms is needed afterwards.
global sym(vp8_half_horiz_variance8x_h_sse2) PRIVATE
sym(vp8_half_horiz_variance8x_h_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push rsi
    push rdi
    ; end prolog

        pxor            xmm6,           xmm6                ;  8 x int16 column sums of differences
        pxor            xmm7,           xmm7                ;  4 x int32 partial sums of squared differences
        mov             rsi,            arg(0)              ;ref_ptr
        mov             rdi,            arg(2)              ;src_ptr
        movsxd          rcx,            dword ptr arg(4)    ;Height (row counter)
        movsxd          rax,            dword ptr arg(1)    ;ref_pixels_per_line
        movsxd          rdx,            dword ptr arg(3)    ;src_pixels_per_line

        pxor            xmm0,           xmm0                ;  zero, used to widen bytes to words

.next_row:
        movq            xmm5,           QWORD PTR [rsi]     ;  ref pixels 0..7
        movq            xmm3,           QWORD PTR [rsi+1]   ;  ref pixels 1..8

        pavgb           xmm5,           xmm3                ;  horizontal half-pel prediction
        punpcklbw       xmm5,           xmm0                ;  prediction as words

        movq            xmm3,           QWORD PTR [rdi]     ;  src pixels 0..7
        punpcklbw       xmm3,           xmm0                ;  src as words

        psubw           xmm5,           xmm3                ;  diff = prediction - src
        paddw           xmm6,           xmm5                ;  accumulate column differences
        pmaddwd         xmm5,           xmm5                ;  pairwise sums of diff^2
        paddd           xmm7,           xmm5                ;  accumulate squared differences

        lea             rsi,            [rsi + rax]         ;  next ref row
        lea             rdi,            [rdi + rdx]         ;  next src row

        sub             rcx,            1
        jnz             .next_row

        ; Sign-extend the eight word sums to dwords. xmm0 is still zero, so
        ; after the unpack each word sits in the upper half of a dword and the
        ; arithmetic shift brings it back down with its sign.
        pxor            xmm1,           xmm1
        punpcklwd       xmm0,           xmm6
        punpckhwd       xmm1,           xmm6
        psrad           xmm0,           16
        psrad           xmm1,           16
        paddd           xmm0,           xmm1                ;  4 x int32 partial sums

        ; Fold both accumulators 4 dwords -> 2 -> 1.
        movdqa          xmm1,           xmm0
        movdqa          xmm6,           xmm7
        psrldq          xmm1,           8
        psrldq          xmm6,           8
        paddd           xmm0,           xmm1
        paddd           xmm7,           xmm6

        movdqa          xmm1,           xmm0
        movdqa          xmm6,           xmm7
        psrldq          xmm1,           4
        psrldq          xmm6,           4
        paddd           xmm0,           xmm1                ;  low dword = sum
        paddd           xmm7,           xmm6                ;  low dword = sum of squares

        mov             rsi,            arg(5) ; sum
        mov             rdi,            arg(6) ; sumsquared

        movd            [rsi],          xmm0
        movd            [rdi],          xmm7

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
1240
;void vp8_half_horiz_variance16x_h_sse2
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; 16-pixel-wide horizontal half-pel variance helper. Each ref pixel is
; averaged (pavgb) with its right-hand neighbour, the src row is subtracted,
; and the sum of the differences / sum of the squared differences over Height
; rows are written to *sum / *sumsquared. Seventeen ref bytes are read per row.
; The row counter is tested after the decrement, so Height must be non-zero.
global sym(vp8_half_horiz_variance16x_h_sse2) PRIVATE
sym(vp8_half_horiz_variance16x_h_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push rsi
    push rdi
    ; end prolog

        mov             rsi,            arg(0)              ;ref_ptr
        mov             rdi,            arg(2)              ;src_ptr
        movsxd          rax,            dword ptr arg(1)    ;ref_pixels_per_line
        movsxd          rdx,            dword ptr arg(3)    ;src_pixels_per_line
        movsxd          rcx,            dword ptr arg(4)    ;Height (row counter)

        pxor            xmm0,           xmm0                ;  zero, used to widen bytes to words
        pxor            xmm6,           xmm6                ;  8 x int16 sums of differences
        pxor            xmm7,           xmm7                ;  4 x int32 sums of squared differences

.next_row:
        movdqu          xmm1,           XMMWORD PTR [rsi]   ;  ref pixels 0..15
        movdqu          xmm2,           XMMWORD PTR [rsi+1] ;  ref pixels 1..16
        movq            xmm3,           QWORD PTR [rdi]     ;  src pixels 0..7
        movq            xmm5,           QWORD PTR [rdi+8]   ;  src pixels 8..15

        pavgb           xmm1,           xmm2                ;  prediction = avg(pixel, right neighbour)
        punpcklbw       xmm3,           xmm0                ;  src low half as words
        punpcklbw       xmm5,           xmm0                ;  src high half as words

        movdqa          xmm4,           xmm1
        punpcklbw       xmm1,           xmm0                ;  prediction low half as words
        punpckhbw       xmm4,           xmm0                ;  prediction high half as words

        psubw           xmm1,           xmm3                ;  differences, pixels 0..7
        psubw           xmm4,           xmm5                ;  differences, pixels 8..15

        paddw           xmm6,           xmm1                ;  accumulate differences
        paddw           xmm6,           xmm4
        pmaddwd         xmm1,           xmm1                ;  pairwise sums of squares
        pmaddwd         xmm4,           xmm4
        paddd           xmm7,           xmm1                ;  accumulate squared differences
        paddd           xmm7,           xmm4

        add             rsi,            rax                 ;  next ref row
        add             rdi,            rdx                 ;  next src row

        dec             rcx
        jnz             .next_row

        ; Widen the word sums to dwords: xmm0 is still zero, so each word is
        ; unpacked into the top half of a dword and shifted back down with sign.
        pxor            xmm1,           xmm1
        punpcklwd       xmm0,           xmm6
        punpckhwd       xmm1,           xmm6
        psrad           xmm0,           16
        psrad           xmm1,           16
        paddd           xmm0,           xmm1                ;  4 x int32 partial sums

        ; Fold both accumulators 4 dwords -> 2 -> 1.
        movdqa          xmm1,           xmm0
        movdqa          xmm6,           xmm7
        psrldq          xmm1,           8
        psrldq          xmm6,           8
        paddd           xmm0,           xmm1
        paddd           xmm7,           xmm6

        movdqa          xmm1,           xmm0
        movdqa          xmm6,           xmm7
        psrldq          xmm1,           4
        psrldq          xmm6,           4
        paddd           xmm0,           xmm1                ;  low dword = Sum
        paddd           xmm7,           xmm6                ;  low dword = SSE

        mov             rsi,            arg(5) ;[Sum]
        mov             rdi,            arg(6) ;[SSE]

        movd            [rsi],          xmm0
        movd            [rdi],          xmm7

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
1344
SECTION_RODATA
; xmm_bi_rd: eight words of 64, i.e.
;    short xmm_bi_rd[8] = { 64, 64, 64, 64, 64, 64, 64, 64 };
; NOTE(review): 64 is half of 128 (1 << xmm_filter_shift), so this is
; presumably the rounding term for the bilinear filters - confirm at the users.
align 16
xmm_bi_rd:
    times 8 dw 64

; vp8_bilinear_filters_sse2: eight 32-byte rows. Row i holds the first tap
; (128 - 16*i) replicated into eight words, followed by the second tap (16*i)
; replicated into eight words; the two taps of every row add up to 128.
align 16
vp8_bilinear_filters_sse2:
    times 8 dw 128
    times 8 dw   0

    times 8 dw 112
    times 8 dw  16

    times 8 dw  96
    times 8 dw  32

    times 8 dw  80
    times 8 dw  48

    times 8 dw  64
    times 8 dw  64

    times 8 dw  48
    times 8 dw  80

    times 8 dw  32
    times 8 dw  96

    times 8 dw  16
    times 8 dw 112
1360