• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1;
2;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3;
4;  Use of this source code is governed by a BSD-style license
5;  that can be found in the LICENSE file in the root of the source
6;  tree. An additional intellectual property rights grant can be found
7;  in the file PATENTS.  All contributing project authors may
8;  be found in the AUTHORS file in the root of the source tree.
9;
10
11
12%include "vpx_ports/x86_abi_support.asm"
13
;------------------------------------------------------------------------------
; void vp8_copy_mem16x16_sse2(
;     unsigned char *src,
;     int            src_stride,
;     unsigned char *dst,
;     int            dst_stride
;     )
;
; Copies 16 rows of 16 bytes from src to dst.
; Source rows are fetched with unaligned loads (movdqu); destination rows are
; written with aligned stores (movdqa), so every destination row address must
; be 16-byte aligned.
; Scratch: rax, rcx, xmm0-xmm2, flags.  rsi/rdi are saved and restored.
;------------------------------------------------------------------------------
global sym(vp8_copy_mem16x16_sse2) PRIVATE
sym(vp8_copy_mem16x16_sse2):
    push    rbp
    mov     rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    push    rsi
    push    rdi
    ; end prolog

    mov     rsi, arg(0)                     ; rsi = src
    movsxd  rax, dword ptr arg(1)           ; rax = src_stride (sign-extended)
    mov     rdi, arg(2)                     ; rdi = dst
    movsxd  rcx, dword ptr arg(3)           ; rcx = dst_stride (sign-extended)

    ; Rows 0..14: five passes, each loading three rows and then storing them.
%rep 5
    movdqu  xmm0, [rsi]
    movdqu  xmm1, [rsi+rax]
    movdqu  xmm2, [rsi+rax*2]
    lea     rsi, [rsi+rax*2]
    add     rsi, rax                        ; src += 3 * src_stride

    movdqa  [rdi], xmm0
    movdqa  [rdi+rcx], xmm1
    movdqa  [rdi+rcx*2], xmm2
    lea     rdi, [rdi+rcx*2]
    add     rdi, rcx                        ; dst += 3 * dst_stride
%endrep

    ; Row 15: both pointers already sit on the last row.
    movdqu  xmm0, [rsi]
    movdqa  [rdi], xmm0

    ; begin epilog
    pop     rdi
    pop     rsi
    UNSHADOW_ARGS
    pop     rbp
    ret
117
118
;------------------------------------------------------------------------------
; void vp8_intra_pred_uv_dc_mmx2(
;     unsigned char *dst,
;     int            dst_stride,
;     unsigned char *above,
;     unsigned char *left,
;     int            left_stride
;     )
;
; DC prediction for an 8x8 block.  Sums the 8 bytes at 'above' and 8 bytes of
; the 'left' column (one byte every left_stride), computes (sum + 8) >> 4 and
; fills 8 rows of 8 bytes at dst with the result.
; Uses mm0/mm1; no emms is executed in this routine.
;------------------------------------------------------------------------------
global sym(vp8_intra_pred_uv_dc_mmx2) PRIVATE
sym(vp8_intra_pred_uv_dc_mmx2):
    push    rbp
    mov     rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push    rsi
    push    rdi
    ; end prolog

    mov     rdi, arg(2)                     ; rdi = above
    mov     rsi, arg(3)                     ; rsi = left
    movsxd  rax, dword ptr arg(4)           ; rax = left_stride

    ; Top row: psadbw against zero leaves the sum of the 8 bytes in mm1.
    pxor    mm0, mm0
    movq    mm1, [rdi]
    psadbw  mm1, mm0

    ; Left column: ecx accumulates 8 bytes, 4 rows per base pointer.
    lea     rdi, [rax*3]                    ; rdi = 3 * left_stride
    movzx   ecx, byte [rsi]
    movzx   edx, byte [rsi+rax*1]
    add     ecx, edx
    movzx   edx, byte [rsi+rax*2]
    add     ecx, edx
    movzx   edx, byte [rsi+rdi]
    add     ecx, edx
    lea     rsi, [rsi+rax*4]                ; move on to rows 4..7
    movzx   edx, byte [rsi]
    add     ecx, edx
    movzx   edx, byte [rsi+rax]
    add     ecx, edx
    movzx   edx, byte [rsi+rax*2]
    add     ecx, edx
    movzx   edx, byte [rsi+rdi]
    add     ecx, edx

    ; dc = (top_sum + left_sum + 8) >> 4, replicated into 8 bytes.
    pextrw  edx, mm1, 0x0
    lea     edx, [edx+ecx+8]
    sar     edx, 4
    movd    mm1, edx
    pshufw  mm1, mm1, 0x0                   ; word 0 -> all four words
    packuswb mm1, mm1                       ; words -> 8 identical bytes

    ; Write out 8 rows: rdi covers rows 0..3, rdx covers rows 4..7.
    mov     rdi, arg(0)                     ; rdi = dst
    movsxd  rcx, dword ptr arg(1)           ; rcx = dst_stride
    lea     rax, [rcx*3]                    ; rax = 3 * dst_stride
    lea     rdx, [rdi+rcx*4]

    movq    [rdi], mm1
    movq    [rdi+rcx], mm1
    movq    [rdi+rcx*2], mm1
    movq    [rdi+rax], mm1
    movq    [rdx], mm1
    movq    [rdx+rcx], mm1
    movq    [rdx+rcx*2], mm1
    movq    [rdx+rax], mm1

    ; begin epilog
    pop     rdi
    pop     rsi
    UNSHADOW_ARGS
    pop     rbp
    ret
191
;------------------------------------------------------------------------------
; void vp8_intra_pred_uv_dctop_mmx2(
;     unsigned char *dst,
;     int            dst_stride,
;     unsigned char *above,
;     unsigned char *left,
;     int            left_stride
;     )
;
; DC prediction for an 8x8 block using only the row above: computes
; (sum of 8 bytes at 'above' + 4) >> 3 and fills 8 rows of 8 bytes at dst.
; 'left' and 'left_stride' are never read.
; Uses mm0/mm1; no emms is executed in this routine.
;------------------------------------------------------------------------------
global sym(vp8_intra_pred_uv_dctop_mmx2) PRIVATE
sym(vp8_intra_pred_uv_dctop_mmx2):
    push    rbp
    mov     rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT rbx
    push    rsi
    push    rdi
    ; end prolog

    ; Top row: psadbw against zero yields the byte sum in mm1.
    mov     rsi, arg(2)                     ; rsi = above
    pxor    mm0, mm0
    movq    mm1, [rsi]
    psadbw  mm1, mm0

    ; dc = (sum + 4) >> 3, replicated into 8 bytes.
    paddw   mm1, [GLOBAL(dc_4)]
    psraw   mm1, 3
    pshufw  mm1, mm1, 0x0
    packuswb mm1, mm1

    ; Write out 8 rows: rdi covers rows 0..3, rdx covers rows 4..7.
    mov     rdi, arg(0)                     ; rdi = dst
    movsxd  rcx, dword ptr arg(1)           ; rcx = dst_stride
    lea     rax, [rcx*3]                    ; rax = 3 * dst_stride
    lea     rdx, [rdi+rcx*4]

    movq    [rdi], mm1
    movq    [rdi+rcx], mm1
    movq    [rdi+rcx*2], mm1
    movq    [rdi+rax], mm1
    movq    [rdx], mm1
    movq    [rdx+rcx], mm1
    movq    [rdx+rcx*2], mm1
    movq    [rdx+rax], mm1

    ; begin epilog
    pop     rdi
    pop     rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop     rbp
    ret
245
;------------------------------------------------------------------------------
; void vp8_intra_pred_uv_dcleft_mmx2(
;     unsigned char *dst,
;     int            dst_stride,
;     unsigned char *above,
;     unsigned char *left,
;     int            left_stride
;     )
;
; DC prediction for an 8x8 block using only the left column: computes
; (sum of 8 bytes of 'left', one per left_stride, + 4) >> 3 and fills 8 rows
; of 8 bytes at dst.  'above' is never read.
; Uses mm1; no emms is executed in this routine.
;------------------------------------------------------------------------------
global sym(vp8_intra_pred_uv_dcleft_mmx2) PRIVATE
sym(vp8_intra_pred_uv_dcleft_mmx2):
    push    rbp
    mov     rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push    rsi
    push    rdi
    ; end prolog

    ; Left column: ecx accumulates the first 7 bytes, 4 rows per base pointer.
    mov     rsi, arg(3)                     ; rsi = left
    movsxd  rax, dword ptr arg(4)           ; rax = left_stride
    lea     rdi, [rax*3]                    ; rdi = 3 * left_stride

    movzx   ecx, byte [rsi]
    movzx   edx, byte [rsi+rax]
    add     ecx, edx
    movzx   edx, byte [rsi+rax*2]
    add     ecx, edx
    movzx   edx, byte [rsi+rdi]
    add     ecx, edx

    lea     rsi, [rsi+rax*4]                ; move on to rows 4..7
    movzx   edx, byte [rsi]
    add     ecx, edx
    movzx   edx, byte [rsi+rax]
    add     ecx, edx
    movzx   edx, byte [rsi+rax*2]
    add     ecx, edx
    movzx   edx, byte [rsi+rdi]

    ; The final add is folded together with the rounding term:
    ; edx = sum + 4, then dc = edx >> 3.
    lea     edx, [ecx+edx+4]
    shr     edx, 3
    movd    mm1, edx
    pshufw  mm1, mm1, 0x0                   ; word 0 -> all four words
    packuswb mm1, mm1                       ; words -> 8 identical bytes

    ; Write out 8 rows: rdi covers rows 0..3, rdx covers rows 4..7.
    mov     rdi, arg(0)                     ; rdi = dst
    movsxd  rcx, dword ptr arg(1)           ; rcx = dst_stride
    lea     rax, [rcx*3]                    ; rax = 3 * dst_stride
    lea     rdx, [rdi+rcx*4]

    movq    [rdi], mm1
    movq    [rdi+rcx], mm1
    movq    [rdi+rcx*2], mm1
    movq    [rdi+rax], mm1
    movq    [rdx], mm1
    movq    [rdx+rcx], mm1
    movq    [rdx+rcx*2], mm1
    movq    [rdx+rax], mm1

    ; begin epilog
    pop     rdi
    pop     rsi
    UNSHADOW_ARGS
    pop     rbp
    ret
312
;------------------------------------------------------------------------------
; void vp8_intra_pred_uv_dc128_mmx(
;     unsigned char *dst,
;     int            dst_stride,
;     unsigned char *above,
;     unsigned char *left,
;     int            left_stride
;     )
;
; Fills 8 rows of 8 bytes at dst with the constant 128 (first 8 bytes of the
; dc_128 table).  'above', 'left' and 'left_stride' are never read.
; Uses mm1; no emms is executed in this routine.
;------------------------------------------------------------------------------
global sym(vp8_intra_pred_uv_dc128_mmx) PRIVATE
sym(vp8_intra_pred_uv_dc128_mmx):
    push    rbp
    mov     rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT rbx
    ; end prolog

    mov     rax, arg(0)                     ; rax = dst
    movsxd  rdx, dword ptr arg(1)           ; rdx = dst_stride
    lea     rcx, [rdx*3]                    ; rcx = 3 * dst_stride
    movq    mm1, [GLOBAL(dc_128)]           ; 8 bytes of 128

    ; rows 0..3
    movq    [rax], mm1
    movq    [rax+rdx], mm1
    movq    [rax+rdx*2], mm1
    movq    [rax+rcx], mm1

    ; rows 4..7
    lea     rax, [rax+rdx*4]
    movq    [rax], mm1
    movq    [rax+rdx], mm1
    movq    [rax+rdx*2], mm1
    movq    [rax+rcx], mm1

    ; begin epilog
    RESTORE_GOT
    UNSHADOW_ARGS
    pop     rbp
    ret
351
;------------------------------------------------------------------------------
; void vp8_intra_pred_uv_tm_sse2 / vp8_intra_pred_uv_tm_ssse3(
;     unsigned char *dst,
;     int            dst_stride,
;     unsigned char *above,
;     unsigned char *left,
;     int            left_stride
;     )
;
; TM prediction for an 8x8 block.  For every row y and column x the output is
; above[x] + left[y * left_stride] - above[-1], saturated to 0..255 by
; packuswb.  Two rows are produced per loop iteration (4 iterations).
;
; The macro argument selects the instruction set used for broadcasting a byte
; into eight words: sse2 (punpcklbw/pshuflw/punpcklqdq) or ssse3 (pshufb with
; the dc_1024 mask).
;
; rbx is used as a scratch register inside the loop; it is loaded with movzx
; so that no stale upper bits (e.g. a GOT pointer) are carried into the xmm
; registers and no partial-register merge is required.
;------------------------------------------------------------------------------
%macro vp8_intra_pred_uv_tm 1
global sym(vp8_intra_pred_uv_tm_%1) PRIVATE
sym(vp8_intra_pred_uv_tm_%1):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ; read top row and widen it to eight words in xmm1
    mov         edx,        4                   ; loop count: 4 x 2 rows
    mov         rsi,        arg(2) ;above
    movsxd      rax,        dword ptr arg(4) ;left_stride;
    pxor        xmm0,       xmm0
%ifidn %1, ssse3
    movdqa      xmm2,       [GLOBAL(dc_1024)]   ; pshufb byte->word broadcast mask
%endif
    movq        xmm1,       [rsi]
    punpcklbw   xmm1,       xmm0

    ; set up left ptr and subtract topleft (above[-1]) from every top word
    movd        xmm3,       [rsi-1]
    mov         rsi,        arg(3) ;left;
%ifidn %1, sse2
    punpcklbw   xmm3,       xmm0
    pshuflw     xmm3,       xmm3, 0x0
    punpcklqdq  xmm3,       xmm3
%else
    pshufb      xmm3,       xmm2
%endif
    psubw       xmm1,       xmm3

    ; set up dest ptrs
    mov         rdi,        arg(0) ;dst;
    movsxd      rcx,        dword ptr arg(1) ;dst_stride

.vp8_intra_pred_uv_tm_%1_loop:
    ; fetch the left pixel of two consecutive rows, zero-extended
    movzx       ebx,        byte [rsi]
    movd        xmm3,       ebx

    movzx       ebx,        byte [rsi+rax]
    movd        xmm5,       ebx
%ifidn %1, sse2
    punpcklbw   xmm3,       xmm0
    punpcklbw   xmm5,       xmm0
    pshuflw     xmm3,       xmm3, 0x0
    pshuflw     xmm5,       xmm5, 0x0
    punpcklqdq  xmm3,       xmm3
    punpcklqdq  xmm5,       xmm5
%else
    pshufb      xmm3,       xmm2
    pshufb      xmm5,       xmm2
%endif
    ; left + (above - topleft), then saturate both rows into one register
    paddw       xmm3,       xmm1
    paddw       xmm5,       xmm1
    packuswb    xmm3,       xmm5
    movq  [rdi    ],        xmm3                ; low  8 bytes -> row y
    movhps[rdi+rcx],        xmm3                ; high 8 bytes -> row y+1
    lea         rsi,        [rsi+rax*2]
    lea         rdi,        [rdi+rcx*2]
    dec         edx
    jnz .vp8_intra_pred_uv_tm_%1_loop

    ; begin epilog
    pop         rbx
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
%endmacro

vp8_intra_pred_uv_tm sse2
vp8_intra_pred_uv_tm ssse3
437
;------------------------------------------------------------------------------
; void vp8_intra_pred_uv_ve_mmx(
;     unsigned char *dst,
;     int            dst_stride,
;     unsigned char *above,
;     unsigned char *left,
;     int            left_stride
;     )
;
; Vertical prediction for an 8x8 block: the 8 bytes at 'above' are copied
; into each of 8 rows at dst.  'left' and 'left_stride' are never read.
; Uses mm1; no emms is executed in this routine.
;------------------------------------------------------------------------------
global sym(vp8_intra_pred_uv_ve_mmx) PRIVATE
sym(vp8_intra_pred_uv_ve_mmx):
    push    rbp
    mov     rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    ; end prolog

    ; read the row above
    mov     rax, arg(2)                     ; rax = above
    movq    mm1, [rax]

    ; destination addressing
    mov     rax, arg(0)                     ; rax = dst
    movsxd  rdx, dword ptr arg(1)           ; rdx = dst_stride
    lea     rcx, [rdx*3]                    ; rcx = 3 * dst_stride

    ; rows 0..3
    movq    [rax], mm1
    movq    [rax+rdx], mm1
    movq    [rax+rdx*2], mm1
    movq    [rax+rcx], mm1

    ; rows 4..7
    lea     rax, [rax+rdx*4]
    movq    [rax], mm1
    movq    [rax+rdx], mm1
    movq    [rax+rdx*2], mm1
    movq    [rax+rcx], mm1

    ; begin epilog
    UNSHADOW_ARGS
    pop     rbp
    ret
478
;------------------------------------------------------------------------------
; void vp8_intra_pred_uv_ho_mmx2 / vp8_intra_pred_uv_ho_ssse3(
;     unsigned char *dst,
;     int            dst_stride,
;     unsigned char *above,
;     unsigned char *left,
;     int            left_stride
;     )
;
; Horizontal prediction for an 8x8 block: row y of dst is filled with 8 copies
; of the byte at left[y * left_stride].  'above' is never read.
;
;   mmx2  : loop of 4 iterations, two rows each, broadcast via
;           punpcklbw + pshufw.  Uses mm0/mm1; no emms is executed here.
;   ssse3 : two unrolled groups of four rows; each pair of rows is broadcast
;           with one pshufb using the dc_00001111 mask.
;
; rbx is used as a scratch register; bytes are loaded with movzx so that no
; stale upper bits of rbx are carried along and no partial-register merge is
; required.
;------------------------------------------------------------------------------
%macro vp8_intra_pred_uv_ho 1
global sym(vp8_intra_pred_uv_ho_%1) PRIVATE
sym(vp8_intra_pred_uv_ho_%1):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    push        rbx
%ifidn %1, ssse3
    GET_GOT     rbx
%endif
    ; end prolog

    ;arg(2) not used

    ; read from left and write out
%ifidn %1, mmx2
    mov         edx,        4                   ; loop count: 4 x 2 rows
%endif
    mov         rsi,        arg(3) ;left
    movsxd      rax,        dword ptr arg(4) ;left_stride;
    mov         rdi,        arg(0) ;dst;
    movsxd      rcx,        dword ptr arg(1) ;dst_stride
%ifidn %1, ssse3
    lea         rdx,        [rcx*3]             ; rdx = 3 * dst_stride
    movdqa      xmm2,       [GLOBAL(dc_00001111)]
%endif

%ifidn %1, mmx2
.vp8_intra_pred_uv_ho_%1_loop:
    movzx       ebx,        byte [rsi]
    movd        mm0,        ebx

    movzx       ebx,        byte [rsi+rax]
    movd        mm1,        ebx

    punpcklbw   mm0,        mm0                 ; byte 0 duplicated into word 0
    punpcklbw   mm1,        mm1
    pshufw      mm0,        mm0, 0x0            ; word 0 -> all four words
    pshufw      mm1,        mm1, 0x0
    movq  [rdi    ],        mm0
    movq  [rdi+rcx],        mm1
    lea         rsi,        [rsi+rax*2]
    lea         rdi,        [rdi+rcx*2]
    dec         edx
    jnz .vp8_intra_pred_uv_ho_%1_loop
%else
    ; Two identical groups of four rows.  The pointer advance at the end of
    ; the second group is unused; rsi/rdi are restored in the epilog.
%rep 2
    movzx       ebx,        byte [rsi]
    movd        xmm0,       ebx

    movzx       ebx,        byte [rsi+rax]
    movd        xmm3,       ebx

    movzx       ebx,        byte [rsi+rax*2]
    movd        xmm1,       ebx

    lea         rbx,        [rax*3]
    movzx       ebx,        byte [rsi+rbx]
    movd        xmm4,       ebx

    ; interleave so byte 0 / byte 1 hold the two left pixels of a row pair,
    ; then the mask spreads byte 0 over the low half and byte 1 over the high
    punpcklbw   xmm0,       xmm3
    punpcklbw   xmm1,       xmm4
    pshufb      xmm0,       xmm2
    pshufb      xmm1,       xmm2
    movq   [rdi    ],       xmm0
    movhps [rdi+rcx],       xmm0
    movq [rdi+rcx*2],       xmm1
    movhps [rdi+rdx],       xmm1
    lea         rsi,        [rsi+rax*4]
    lea         rdi,        [rdi+rcx*4]
%endrep
%endif

    ; begin epilog
%ifidn %1, ssse3
    RESTORE_GOT
%endif
    pop         rbx
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
%endmacro

vp8_intra_pred_uv_ho mmx2
vp8_intra_pred_uv_ho ssse3
595
;------------------------------------------------------------------------------
; void vp8_intra_pred_y_dc_sse2(
;     unsigned char *dst,
;     int            dst_stride,
;     unsigned char *above,
;     unsigned char *left,
;     int            left_stride
;     )
;
; DC prediction for a 16x16 block.  Sums the 16 bytes at 'above' and 16 bytes
; of the 'left' column (one byte every left_stride), computes
; (sum + 16) >> 5 and fills 16 rows of 16 bytes at dst with the result.
;
; 'above' is read with movdqa and dst rows are written with movdqa, so both
; must be 16-byte aligned.
;------------------------------------------------------------------------------
global sym(vp8_intra_pred_y_dc_sse2) PRIVATE
sym(vp8_intra_pred_y_dc_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    ; end prolog

    ; from top
    mov         rdi,        arg(2) ;above
    mov         rsi,        arg(3) ;left
    movsxd      rax,        dword ptr arg(4) ;left_stride;

    ; psadbw against zero yields one partial sum per 8-byte half;
    ; fold the high half onto the low half
    pxor        xmm0,       xmm0
    movdqa      xmm1,       [rdi]
    psadbw      xmm1,       xmm0
    movq        xmm2,       xmm1
    punpckhqdq  xmm1,       xmm1
    paddw       xmm1,       xmm2

    ; from left: ecx accumulates 16 bytes, four rows per base pointer
    lea         rdi,        [rax*3]             ; rdi = 3 * left_stride

    movzx       ecx,        byte [rsi]
    movzx       edx,        byte [rsi+rax]
    add         ecx,        edx
    movzx       edx,        byte [rsi+rax*2]
    add         ecx,        edx
    movzx       edx,        byte [rsi+rdi]
    add         ecx,        edx
%rep 3
    lea         rsi,        [rsi+rax*4]         ; next group of four rows

    movzx       edx,        byte [rsi]
    add         ecx,        edx
    movzx       edx,        byte [rsi+rax]
    add         ecx,        edx
    movzx       edx,        byte [rsi+rax*2]
    add         ecx,        edx
    movzx       edx,        byte [rsi+rdi]
    add         ecx,        edx
%endrep

    ; add up: dc = (top_sum + left_sum + 16) >> 5, replicated into 16 bytes
    pextrw      edx,        xmm1, 0x0
    lea         edx,        [edx+ecx+16]
    sar         edx,        5
    movd        xmm1,       edx
    ; FIXME use pshufb for ssse3 version
    pshuflw     xmm1,       xmm1, 0x0
    punpcklqdq  xmm1,       xmm1
    packuswb    xmm1,       xmm1

    ; write out: two passes of eight rows
    mov         rsi,        2
    mov         rdi,        arg(0) ;dst;
    movsxd      rcx,        dword ptr arg(1) ;dst_stride
    lea         rax,        [rcx*3]             ; rax = 3 * dst_stride

.fill_eight_rows:
    movdqa [rdi      ],     xmm1
    movdqa [rdi+rcx  ],     xmm1
    movdqa [rdi+rcx*2],     xmm1
    movdqa [rdi+rax  ],     xmm1
    lea         rdi,        [rdi+rcx*4]
    movdqa [rdi      ],     xmm1
    movdqa [rdi+rcx  ],     xmm1
    movdqa [rdi+rcx*2],     xmm1
    movdqa [rdi+rax  ],     xmm1
    lea         rdi,        [rdi+rcx*4]
    dec         rsi
    jnz .fill_eight_rows

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
701
;------------------------------------------------------------------------------
; void vp8_intra_pred_y_dctop_sse2(
;     unsigned char *dst,
;     int            dst_stride,
;     unsigned char *above,
;     unsigned char *left,
;     int            left_stride
;     )
;
; DC prediction for a 16x16 block using only the row above: computes
; (sum of 16 bytes at 'above' + 8) >> 4 and fills 16 rows of 16 bytes at dst.
; 'left' and 'left_stride' are never read.
;
; 'above' is read with movdqa and dst rows are written with movdqa, so both
; must be 16-byte aligned.
;------------------------------------------------------------------------------
global sym(vp8_intra_pred_y_dctop_sse2) PRIVATE
sym(vp8_intra_pred_y_dctop_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    GET_GOT     rbx
    ; end prolog

    ;arg(3), arg(4) not used

    ; from top: psadbw against zero yields one partial sum per 8-byte half;
    ; fold the high half onto the low half
    mov         rcx,        arg(2) ;above;
    pxor        xmm0,       xmm0
    movdqa      xmm1,       [rcx]
    psadbw      xmm1,       xmm0
    movdqa      xmm2,       xmm1
    punpckhqdq  xmm1,       xmm1
    paddw       xmm1,       xmm2

    ; add up: dc = (sum + 8) >> 4, replicated into 16 bytes
    paddw       xmm1,       [GLOBAL(dc_8)]
    psraw       xmm1,       4
    ; FIXME use pshufb for ssse3 version
    pshuflw     xmm1,       xmm1, 0x0
    punpcklqdq  xmm1,       xmm1
    packuswb    xmm1,       xmm1

    ; write out: two passes of eight rows
    mov         rsi,        2
    mov         rdx,        arg(0) ;dst;
    movsxd      rcx,        dword ptr arg(1) ;dst_stride
    lea         rax,        [rcx*3]             ; rax = 3 * dst_stride

.fill_eight_rows:
    movdqa [rdx      ],     xmm1
    movdqa [rdx+rcx  ],     xmm1
    movdqa [rdx+rcx*2],     xmm1
    movdqa [rdx+rax  ],     xmm1
    lea         rdx,        [rdx+rcx*4]
    movdqa [rdx      ],     xmm1
    movdqa [rdx+rcx  ],     xmm1
    movdqa [rdx+rcx*2],     xmm1
    movdqa [rdx+rax  ],     xmm1
    lea         rdx,        [rdx+rcx*4]
    dec         rsi
    jnz .fill_eight_rows

    ; begin epilog
    RESTORE_GOT
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
763
;------------------------------------------------------------------------------
; void vp8_intra_pred_y_dcleft_sse2(
;     unsigned char *dst,
;     int            dst_stride,
;     unsigned char *above,
;     unsigned char *left,
;     int            left_stride
;     )
;
; DC prediction for a 16x16 block using only the left column: computes
; (sum of 16 bytes of 'left', one per left_stride, + 8) >> 4 and fills
; 16 rows of 16 bytes at dst.  'above' is never read.
;
; dst rows are written with movdqa and must be 16-byte aligned.
;------------------------------------------------------------------------------
global sym(vp8_intra_pred_y_dcleft_sse2) PRIVATE
sym(vp8_intra_pred_y_dcleft_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    ; end prolog

    ;arg(2) not used

    ; from left: ecx accumulates the bytes, four rows per base pointer
    mov         rsi,        arg(3) ;left;
    movsxd      rax,        dword ptr arg(4) ;left_stride;

    lea         rdi,        [rax*3]             ; rdi = 3 * left_stride
    movzx       ecx,        byte [rsi]
    movzx       edx,        byte [rsi+rax]
    add         ecx,        edx
    movzx       edx,        byte [rsi+rax*2]
    add         ecx,        edx
    movzx       edx,        byte [rsi+rdi]
    add         ecx,        edx
%rep 2
    lea         rsi,        [rsi+rax*4]         ; next group of four rows
    movzx       edx,        byte [rsi]
    add         ecx,        edx
    movzx       edx,        byte [rsi+rax]
    add         ecx,        edx
    movzx       edx,        byte [rsi+rax*2]
    add         ecx,        edx
    movzx       edx,        byte [rsi+rdi]
    add         ecx,        edx
%endrep
    lea         rsi,        [rsi+rax*4]         ; rows 12..15
    movzx       edx,        byte [rsi]
    add         ecx,        edx
    movzx       edx,        byte [rsi+rax]
    add         ecx,        edx
    movzx       edx,        byte [rsi+rax*2]
    add         ecx,        edx
    movzx       edx,        byte [rsi+rdi]
    ; the final add is folded together with the rounding term
    lea         edx,        [ecx+edx+8]

    ; add up: dc = (sum + 8) >> 4, replicated into 16 bytes
    shr         edx,        4
    movd        xmm1,       edx
    ; FIXME use pshufb for ssse3 version
    pshuflw     xmm1,       xmm1, 0x0
    punpcklqdq  xmm1,       xmm1
    packuswb    xmm1,       xmm1

    ; write out: two passes of eight rows
    mov         rsi,        2
    mov         rdi,        arg(0) ;dst;
    movsxd      rcx,        dword ptr arg(1) ;dst_stride
    lea         rax,        [rcx*3]             ; rax = 3 * dst_stride

.fill_eight_rows:
    movdqa [rdi      ],     xmm1
    movdqa [rdi+rcx  ],     xmm1
    movdqa [rdi+rcx*2],     xmm1
    movdqa [rdi+rax  ],     xmm1
    lea         rdi,        [rdi+rcx*4]
    movdqa [rdi      ],     xmm1
    movdqa [rdi+rcx  ],     xmm1
    movdqa [rdi+rcx*2],     xmm1
    movdqa [rdi+rax  ],     xmm1
    lea         rdi,        [rdi+rcx*4]
    dec         rsi
    jnz .fill_eight_rows

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
856
;------------------------------------------------------------------------------
; void vp8_intra_pred_y_dc128_sse2(
;     unsigned char *dst,
;     int            dst_stride,
;     unsigned char *above,
;     unsigned char *left,
;     int            left_stride
;     )
;
; Fills 16 rows of 16 bytes at dst with the constant 128 (dc_128 table).
; 'above', 'left' and 'left_stride' are never read.
;
; dst rows are written with movdqa and must be 16-byte aligned.
;------------------------------------------------------------------------------
global sym(vp8_intra_pred_y_dc128_sse2) PRIVATE
sym(vp8_intra_pred_y_dc128_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    GET_GOT     rbx
    ; end prolog

    ;arg(2), arg(3), arg(4) not used

    ; write out: two passes of eight rows
    mov         rsi,        2
    movdqa      xmm1,       [GLOBAL(dc_128)]    ; 16 bytes of 128
    mov         rax,        arg(0) ;dst;
    movsxd      rdx,        dword ptr arg(1) ;dst_stride
    lea         rcx,        [rdx*3]             ; rcx = 3 * dst_stride

.fill_eight_rows:
    movdqa [rax      ],     xmm1
    movdqa [rax+rdx  ],     xmm1
    movdqa [rax+rdx*2],     xmm1
    movdqa [rax+rcx  ],     xmm1
    lea         rax,        [rax+rdx*4]
    movdqa [rax      ],     xmm1
    movdqa [rax+rdx  ],     xmm1
    movdqa [rax+rdx*2],     xmm1
    movdqa [rax+rcx  ],     xmm1
    lea         rax,        [rax+rdx*4]
    dec         rsi
    jnz .fill_eight_rows

    ; begin epilog
    RESTORE_GOT
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
902
;------------------------------------------------------------------------------
; void vp8_intra_pred_y_tm_sse2 / vp8_intra_pred_y_tm_ssse3(
;     unsigned char *dst,
;     int            dst_stride,
;     unsigned char *above,
;     unsigned char *left,
;     int            left_stride
;     )
;
; TM prediction for a 16x16 block.  For every row y and column x the output
; is above[x] + left[y * left_stride] - above[-1], saturated to 0..255 by
; packuswb.  Two rows are produced per loop iteration (8 iterations).
;
; The macro argument selects the instruction set used for broadcasting a byte
; into eight words: sse2 (punpcklbw/pshuflw/punpcklqdq) or ssse3 (pshufb with
; the dc_1024 mask).
;
; 'above' is read with movdqa and dst rows are written with movdqa, so both
; must be 16-byte aligned.  xmm6/xmm7 are used, hence SAVE_XMM 7.
;
; rbx is used as a scratch register inside the loop; it is loaded with movzx
; so that no stale upper bits (e.g. a GOT pointer) are carried into the xmm
; registers and no partial-register merge is required.  The loop label is
; local to the function, matching the uv_tm / uv_ho macros in this file.
;------------------------------------------------------------------------------
%macro vp8_intra_pred_y_tm 1
global sym(vp8_intra_pred_y_tm_%1) PRIVATE
sym(vp8_intra_pred_y_tm_%1):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    GET_GOT     rbx
    ; end prolog

    ; read top row and widen it to 16 words in xmm1 (low 8) / xmm2 (high 8)
    mov         edx,        8                   ; loop count: 8 x 2 rows
    mov         rsi,        arg(2) ;above
    movsxd      rax,        dword ptr arg(4) ;left_stride;
    pxor        xmm0,       xmm0
%ifidn %1, ssse3
    movdqa      xmm3,       [GLOBAL(dc_1024)]   ; pshufb byte->word broadcast mask
%endif
    movdqa      xmm1,       [rsi]
    movdqa      xmm2,       xmm1
    punpcklbw   xmm1,       xmm0
    punpckhbw   xmm2,       xmm0

    ; set up left ptr and subtract topleft (above[-1]) from every top word
    movd        xmm4,       [rsi-1]
    mov         rsi,        arg(3) ;left
%ifidn %1, sse2
    punpcklbw   xmm4,       xmm0
    pshuflw     xmm4,       xmm4, 0x0
    punpcklqdq  xmm4,       xmm4
%else
    pshufb      xmm4,       xmm3
%endif
    psubw       xmm1,       xmm4
    psubw       xmm2,       xmm4

    ; set up dest ptrs
    mov         rdi,        arg(0) ;dst;
    movsxd      rcx,        dword ptr arg(1) ;dst_stride

.vp8_intra_pred_y_tm_%1_loop:
    ; fetch the left pixel of two consecutive rows, zero-extended
    movzx       ebx,        byte [rsi]
    movd        xmm4,       ebx

    movzx       ebx,        byte [rsi+rax]
    movd        xmm5,       ebx
%ifidn %1, sse2
    punpcklbw   xmm4,       xmm0
    punpcklbw   xmm5,       xmm0
    pshuflw     xmm4,       xmm4, 0x0
    pshuflw     xmm5,       xmm5, 0x0
    punpcklqdq  xmm4,       xmm4
    punpcklqdq  xmm5,       xmm5
%else
    pshufb      xmm4,       xmm3
    pshufb      xmm5,       xmm3
%endif
    ; left + (above - topleft) for both halves of both rows, then saturate
    movdqa      xmm6,       xmm4
    movdqa      xmm7,       xmm5
    paddw       xmm4,       xmm1
    paddw       xmm6,       xmm2
    paddw       xmm5,       xmm1
    paddw       xmm7,       xmm2
    packuswb    xmm4,       xmm6
    packuswb    xmm5,       xmm7
    movdqa [rdi    ],       xmm4
    movdqa [rdi+rcx],       xmm5
    lea         rsi,        [rsi+rax*2]
    lea         rdi,        [rdi+rcx*2]
    dec         edx
    jnz .vp8_intra_pred_y_tm_%1_loop

    ; begin epilog
    RESTORE_GOT
    pop         rbx
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
%endmacro

vp8_intra_pred_y_tm sse2
vp8_intra_pred_y_tm ssse3
997
;------------------------------------------------------------------------------
; void vp8_intra_pred_y_ve_sse2(
;     unsigned char *dst,
;     int            dst_stride,
;     unsigned char *above,
;     unsigned char *left,
;     int            left_stride
;     )
;
; Vertical prediction for a 16x16 block: the 16 bytes at 'above' are copied
; into each of 16 rows at dst.  'left' and 'left_stride' are never read.
;
; 'above' is read with movdqa and dst rows are written with movdqa, so both
; must be 16-byte aligned.
;------------------------------------------------------------------------------
global sym(vp8_intra_pred_y_ve_sse2) PRIVATE
sym(vp8_intra_pred_y_ve_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    ; end prolog

    ;arg(3), arg(4) not used

    mov         rax,        arg(2) ;above;
    mov         rsi,        2                   ; two passes of eight rows
    movsxd      rdx,        dword ptr arg(1) ;dst_stride

    ; read from top
    movdqa      xmm1,       [rax]

    ; write out
    mov         rax,        arg(0) ;dst;
    lea         rcx,        [rdx*3]             ; rcx = 3 * dst_stride

.fill_eight_rows:
    movdqa [rax      ],     xmm1
    movdqa [rax+rdx  ],     xmm1
    movdqa [rax+rdx*2],     xmm1
    movdqa [rax+rcx  ],     xmm1
    lea         rax,        [rax+rdx*4]
    movdqa [rax      ],     xmm1
    movdqa [rax+rdx  ],     xmm1
    movdqa [rax+rdx*2],     xmm1
    movdqa [rax+rcx  ],     xmm1
    lea         rax,        [rax+rdx*4]
    dec         rsi
    jnz .fill_eight_rows

    ; begin epilog
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
1045
;------------------------------------------------------------------------------
; void vp8_intra_pred_y_ho_sse2(
;     unsigned char *dst,
;     int            dst_stride,
;     unsigned char *above,
;     unsigned char *left,
;     int            left_stride
;     )
;
; Horizontal prediction for a 16x16 block: row y of dst is filled with 16
; copies of the byte at left[y * left_stride].  Two rows are produced per
; loop iteration (8 iterations).  'above' is never read.
;
; dst rows are written with movdqa and must be 16-byte aligned.
;
; rbx is used as a scratch register inside the loop; it is loaded with movzx
; so that no stale upper bits of rbx are carried into the xmm registers and
; no partial-register merge is required.  The loop label is local to the
; function, matching the uv_tm / uv_ho macros in this file.
;------------------------------------------------------------------------------
global sym(vp8_intra_pred_y_ho_sse2) PRIVATE
sym(vp8_intra_pred_y_ho_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ;arg(2) not used

    ; read from left and write out
    mov         edx,        8                   ; loop count: 8 x 2 rows
    mov         rsi,        arg(3) ;left;
    movsxd      rax,        dword ptr arg(4) ;left_stride;
    mov         rdi,        arg(0) ;dst;
    movsxd      rcx,        dword ptr arg(1) ;dst_stride

.next_row_pair:
    movzx       ebx,        byte [rsi]
    movd        xmm0,       ebx
    movzx       ebx,        byte [rsi+rax]
    movd        xmm1,       ebx

    ; FIXME use pshufb for ssse3 version
    punpcklbw   xmm0,       xmm0                ; byte 0 duplicated into word 0
    punpcklbw   xmm1,       xmm1
    pshuflw     xmm0,       xmm0, 0x0           ; word 0 -> low four words
    pshuflw     xmm1,       xmm1, 0x0
    punpcklqdq  xmm0,       xmm0                ; low half -> both halves
    punpcklqdq  xmm1,       xmm1
    movdqa [rdi    ],       xmm0
    movdqa [rdi+rcx],       xmm1
    lea         rsi,        [rsi+rax*2]
    lea         rdi,        [rdi+rcx*2]
    dec         edx
    jnz .next_row_pair

    ; begin epilog
    pop         rbx
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
1099
SECTION_RODATA

; 16 bytes of 128: fill value for the dc128 predictors (the MMX variant reads
; only the first 8 bytes).
align 16
dc_128:         times 16 db 128

; Four words of 4: rounding term added before the >> 3 in uv_dctop.
; Directly follows the 16-byte dc_128 table.
dc_4:           times 4 dw 4

; Eight words of 8: rounding term added before the >> 4 in y_dctop.
align 16
dc_8:           times 8 dw 8

; Eight words of 0x0400, i.e. the byte pattern 00 04 repeated.  Used as a
; pshufb mask: even result bytes take source byte 0, odd result bytes take
; source byte 4.
align 16
dc_1024:        times 8 dw 0x400

; pshufb mask: source byte 0 into the low 8 bytes, source byte 1 into the
; high 8 bytes.
align 16
dc_00001111:
                times 8 db 0
                times 8 db 1
1116