;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

;------------------------------------------------------------------------------
; PROCESS_16X2X3 %1
;
; Accumulate SADs for two 16-byte source rows against the reference block at
; byte offsets +0, +1 and +2 (unaligned reference loads via lddqu).
;   %1 = 1 : first pair of rows — psadbw results initialise the accumulators
;   %1 = 0 : later pairs of rows — results are added into the accumulators
; Registers: rsi = src_ptr, rax = src_stride, rdi = ref_ptr, rdx = ref_stride.
; Accumulators: xmm5 (+0), xmm6 (+1), xmm7 (+2); xmm0-xmm3 are scratch.
; Advances rsi and rdi past the two rows processed.
;------------------------------------------------------------------------------
%macro PROCESS_16X2X3 1
%if %1
        movdqa          xmm0,       XMMWORD PTR [rsi]       ; src row 0 (16-byte aligned)
        lddqu           xmm5,       XMMWORD PTR [rdi]       ; ref row 0, offset +0
        lddqu           xmm6,       XMMWORD PTR [rdi+1]     ; ref row 0, offset +1
        lddqu           xmm7,       XMMWORD PTR [rdi+2]     ; ref row 0, offset +2

        psadbw          xmm5,       xmm0                    ; initialise the three accumulators
        psadbw          xmm6,       xmm0
        psadbw          xmm7,       xmm0
%else
        movdqa          xmm0,       XMMWORD PTR [rsi]
        lddqu           xmm1,       XMMWORD PTR [rdi]
        lddqu           xmm2,       XMMWORD PTR [rdi+1]
        lddqu           xmm3,       XMMWORD PTR [rdi+2]

        psadbw          xmm1,       xmm0                    ; per-row SADs into scratch regs
        psadbw          xmm2,       xmm0
        psadbw          xmm3,       xmm0

        paddw           xmm5,       xmm1                    ; fold into running totals
        paddw           xmm6,       xmm2
        paddw           xmm7,       xmm3
%endif
        ; second row of the pair (src + src_stride, ref + ref_stride)
        movdqa          xmm0,       XMMWORD PTR [rsi+rax]
        lddqu           xmm1,       XMMWORD PTR [rdi+rdx]
        lddqu           xmm2,       XMMWORD PTR [rdi+rdx+1]
        lddqu           xmm3,       XMMWORD PTR [rdi+rdx+2]

        lea             rsi,        [rsi+rax*2]             ; advance both pointers two rows
        lea             rdi,        [rdi+rdx*2]

        psadbw          xmm1,       xmm0
        psadbw          xmm2,       xmm0
        psadbw          xmm3,       xmm0

        paddw           xmm5,       xmm1
        paddw           xmm6,       xmm2
        paddw           xmm7,       xmm3
%endmacro

;------------------------------------------------------------------------------
; PROCESS_16X2X3_OFFSET %1, %2
;
; Same job as PROCESS_16X2X3, but the caller has rounded rdi down to a 16-byte
; boundary, so both reference vectors are loaded aligned (movdqa) and palignr
; re-synthesises the unaligned rows at byte offsets %2, %2+1 and %2+2.
;   %1 = 1 : first pair of rows — initialise accumulators xmm5/xmm6/xmm7
;   %1 = 0 : later pairs of rows — add into the accumulators
;   %2 = misalignment of the original ref_ptr (palignr shift amount)
; Registers: rsi = src_ptr, rax = src_stride, rdi = ref_ptr (aligned),
; rdx = ref_stride. xmm0-xmm4 are scratch. Advances rsi/rdi two rows.
;------------------------------------------------------------------------------
%macro PROCESS_16X2X3_OFFSET 2
%if %1
        movdqa          xmm0,       XMMWORD PTR [rsi]       ; src row 0
        movdqa          xmm4,       XMMWORD PTR [rdi]       ; ref, low 16 aligned bytes
        movdqa          xmm7,       XMMWORD PTR [rdi+16]    ; ref, high 16 aligned bytes

        movdqa          xmm5,       xmm7
        palignr         xmm5,       xmm4,       %2          ; ref bytes at offset %2

        movdqa          xmm6,       xmm7
        palignr         xmm6,       xmm4,       (%2+1)      ; offset %2+1

        palignr         xmm7,       xmm4,       (%2+2)      ; offset %2+2 (in place)

        psadbw          xmm5,       xmm0                    ; initialise accumulators
        psadbw          xmm6,       xmm0
        psadbw          xmm7,       xmm0
%else
        movdqa          xmm0,       XMMWORD PTR [rsi]
        movdqa          xmm4,       XMMWORD PTR [rdi]
        movdqa          xmm3,       XMMWORD PTR [rdi+16]

        movdqa          xmm1,       xmm3
        palignr         xmm1,       xmm4,       %2

        movdqa          xmm2,       xmm3
        palignr         xmm2,       xmm4,       (%2+1)

        palignr         xmm3,       xmm4,       (%2+2)

        psadbw          xmm1,       xmm0
        psadbw          xmm2,       xmm0
        psadbw          xmm3,       xmm0

        paddw           xmm5,       xmm1                    ; fold into running totals
        paddw           xmm6,       xmm2
        paddw           xmm7,       xmm3
%endif
        ; second row of the pair
        movdqa          xmm0,       XMMWORD PTR [rsi+rax]
        movdqa          xmm4,       XMMWORD PTR [rdi+rdx]
        movdqa          xmm3,       XMMWORD PTR [rdi+rdx+16]

        movdqa          xmm1,       xmm3
        palignr         xmm1,       xmm4,       %2

        movdqa          xmm2,       xmm3
        palignr         xmm2,       xmm4,       (%2+1)

        palignr         xmm3,       xmm4,       (%2+2)

        lea             rsi,        [rsi+rax*2]             ; advance both pointers two rows
        lea             rdi,        [rdi+rdx*2]

        psadbw          xmm1,       xmm0
        psadbw          xmm2,       xmm0
        psadbw          xmm3,       xmm0

        paddw           xmm5,       xmm1
        paddw           xmm6,       xmm2
        paddw           xmm7,       xmm3
%endmacro

;------------------------------------------------------------------------------
; PROCESS_16X16X3_OFFSET %1, %2
;
; Emit the alignment-specialised loop body for a 16x16 block:
; defines the label %2_aligned_by_%1, aligns rdi down by %1 bytes, processes
; all 16 rows (8 pairs) with PROCESS_16X2X3_OFFSET, then jumps to the shared
; store code at %2_store_off.
;   %1 = misalignment handled by this variant (0..14)
;   %2 = function label prefix (e.g. .vpx_sad16x16x3_ssse3)
;------------------------------------------------------------------------------
%macro PROCESS_16X16X3_OFFSET 2
%2_aligned_by_%1:

        sub             rdi,        %1                      ; round ref_ptr down to 16-byte boundary

        PROCESS_16X2X3_OFFSET 1, %1                         ; rows 0-1: initialise accumulators
        PROCESS_16X2X3_OFFSET 0, %1                         ; rows 2-15: accumulate
        PROCESS_16X2X3_OFFSET 0, %1
        PROCESS_16X2X3_OFFSET 0, %1
        PROCESS_16X2X3_OFFSET 0, %1
        PROCESS_16X2X3_OFFSET 0, %1
        PROCESS_16X2X3_OFFSET 0, %1
        PROCESS_16X2X3_OFFSET 0, %1

        jmp             %2_store_off

%endmacro

;------------------------------------------------------------------------------
; PROCESS_16X8X3_OFFSET %1, %2
;
; 16x8 counterpart of PROCESS_16X16X3_OFFSET: defines %2_aligned_by_%1,
; aligns rdi down by %1, processes 8 rows (4 pairs) with
; PROCESS_16X2X3_OFFSET, then jumps to the shared store code %2_store_off.
;   %1 = misalignment handled by this variant (0..14)
;   %2 = function label prefix (e.g. .vpx_sad16x8x3_ssse3)
;------------------------------------------------------------------------------
%macro PROCESS_16X8X3_OFFSET 2
%2_aligned_by_%1:

        sub             rdi,        %1                      ; round ref_ptr down to 16-byte boundary

        PROCESS_16X2X3_OFFSET 1, %1                         ; rows 0-1: initialise accumulators
        PROCESS_16X2X3_OFFSET 0, %1                         ; rows 2-7: accumulate
        PROCESS_16X2X3_OFFSET 0, %1
        PROCESS_16X2X3_OFFSET 0, %1

        jmp             %2_store_off

%endmacro

SECTION .text

;void vpx_sad16x16x3_ssse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; Computes three 16x16 SADs between src and ref at byte offsets +0, +1, +2,
; storing them to results[0..2]. Dispatches on (ref_ptr & 15) through a
; position-independent jump table to an alignment-specialised loop that uses
; aligned loads + palignr (misalignment 0-14) or lddqu (misalignment 15).
global sym(vpx_sad16x16x3_ssse3) PRIVATE
sym(vpx_sad16x16x3_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rcx
    ; end prolog

        mov             rsi,        arg(0) ;src_ptr
        mov             rdi,        arg(2) ;ref_ptr

        mov             rdx,        0xf                     ; rdx = ref_ptr & 15 (misalignment)
        and             rdx,        rdi

        jmp .vpx_sad16x16x3_ssse3_skiptable
; jump table: one 32-bit offset per misalignment, relative to do_jump so the
; code stays position independent
.vpx_sad16x16x3_ssse3_jumptable:
        dd .vpx_sad16x16x3_ssse3_aligned_by_0  - .vpx_sad16x16x3_ssse3_do_jump
        dd .vpx_sad16x16x3_ssse3_aligned_by_1  - .vpx_sad16x16x3_ssse3_do_jump
        dd .vpx_sad16x16x3_ssse3_aligned_by_2  - .vpx_sad16x16x3_ssse3_do_jump
        dd .vpx_sad16x16x3_ssse3_aligned_by_3  - .vpx_sad16x16x3_ssse3_do_jump
        dd .vpx_sad16x16x3_ssse3_aligned_by_4  - .vpx_sad16x16x3_ssse3_do_jump
        dd .vpx_sad16x16x3_ssse3_aligned_by_5  - .vpx_sad16x16x3_ssse3_do_jump
        dd .vpx_sad16x16x3_ssse3_aligned_by_6  - .vpx_sad16x16x3_ssse3_do_jump
        dd .vpx_sad16x16x3_ssse3_aligned_by_7  - .vpx_sad16x16x3_ssse3_do_jump
        dd .vpx_sad16x16x3_ssse3_aligned_by_8  - .vpx_sad16x16x3_ssse3_do_jump
        dd .vpx_sad16x16x3_ssse3_aligned_by_9  - .vpx_sad16x16x3_ssse3_do_jump
        dd .vpx_sad16x16x3_ssse3_aligned_by_10 - .vpx_sad16x16x3_ssse3_do_jump
        dd .vpx_sad16x16x3_ssse3_aligned_by_11 - .vpx_sad16x16x3_ssse3_do_jump
        dd .vpx_sad16x16x3_ssse3_aligned_by_12 - .vpx_sad16x16x3_ssse3_do_jump
        dd .vpx_sad16x16x3_ssse3_aligned_by_13 - .vpx_sad16x16x3_ssse3_do_jump
        dd .vpx_sad16x16x3_ssse3_aligned_by_14 - .vpx_sad16x16x3_ssse3_do_jump
        dd .vpx_sad16x16x3_ssse3_aligned_by_15 - .vpx_sad16x16x3_ssse3_do_jump
.vpx_sad16x16x3_ssse3_skiptable:

        call .vpx_sad16x16x3_ssse3_do_jump
.vpx_sad16x16x3_ssse3_do_jump:
        pop             rcx                         ; get the address of do_jump
        mov             rax,  .vpx_sad16x16x3_ssse3_jumptable - .vpx_sad16x16x3_ssse3_do_jump
        add             rax,  rcx  ; get the absolute address of vpx_sad16x16x3_ssse3_jumptable

        movsxd          rax,  dword [rax + 4*rdx]   ; get the 32 bit offset from the jumptable
        add             rcx,        rax             ; rcx = absolute target address

        movsxd          rax,        dword ptr arg(1) ;src_stride
        movsxd          rdx,        dword ptr arg(3) ;ref_stride

        jmp             rcx

        ; alignment-specialised loop bodies; each ends with jmp ..._store_off
        PROCESS_16X16X3_OFFSET 0,  .vpx_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 1,  .vpx_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 2,  .vpx_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 3,  .vpx_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 4,  .vpx_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 5,  .vpx_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 6,  .vpx_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 7,  .vpx_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 8,  .vpx_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 9,  .vpx_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 10, .vpx_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 11, .vpx_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 12, .vpx_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 13, .vpx_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 14, .vpx_sad16x16x3_ssse3

; misalignment 15: palignr +2 would need bytes from a third vector, so fall
; back to unaligned lddqu loads (16 rows = 8 pairs)
.vpx_sad16x16x3_ssse3_aligned_by_15:
        PROCESS_16X2X3 1
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0

; fold each accumulator's two 64-bit psadbw lanes and store the three SADs
.vpx_sad16x16x3_ssse3_store_off:
        mov             rdi,        arg(4) ;Results

        movq            xmm0,       xmm5
        psrldq          xmm5,       8

        paddw           xmm0,       xmm5
        movd            [rdi],      xmm0                    ; results[0]: offset +0
;-
        movq            xmm0,       xmm6
        psrldq          xmm6,       8

        paddw           xmm0,       xmm6
        movd            [rdi+4],    xmm0                    ; results[1]: offset +1
;-
        movq            xmm0,       xmm7
        psrldq          xmm7,       8

        paddw           xmm0,       xmm7
        movd            [rdi+8],    xmm0                    ; results[2]: offset +2

    ; begin epilog
    pop         rcx
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vpx_sad16x8x3_ssse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; 16x8 counterpart of vpx_sad16x16x3_ssse3: three 16x8 SADs at ref offsets
; +0, +1, +2, dispatched on (ref_ptr & 15) via a position-independent
; jump table to an alignment-specialised loop.
global sym(vpx_sad16x8x3_ssse3) PRIVATE
sym(vpx_sad16x8x3_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rcx
    ; end prolog

        mov             rsi,        arg(0) ;src_ptr
        mov             rdi,        arg(2) ;ref_ptr

        mov             rdx,        0xf                     ; rdx = ref_ptr & 15 (misalignment)
        and             rdx,        rdi

        jmp .vpx_sad16x8x3_ssse3_skiptable
; jump table: one 32-bit offset per misalignment, relative to do_jump so the
; code stays position independent
.vpx_sad16x8x3_ssse3_jumptable:
        dd .vpx_sad16x8x3_ssse3_aligned_by_0  - .vpx_sad16x8x3_ssse3_do_jump
        dd .vpx_sad16x8x3_ssse3_aligned_by_1  - .vpx_sad16x8x3_ssse3_do_jump
        dd .vpx_sad16x8x3_ssse3_aligned_by_2  - .vpx_sad16x8x3_ssse3_do_jump
        dd .vpx_sad16x8x3_ssse3_aligned_by_3  - .vpx_sad16x8x3_ssse3_do_jump
        dd .vpx_sad16x8x3_ssse3_aligned_by_4  - .vpx_sad16x8x3_ssse3_do_jump
        dd .vpx_sad16x8x3_ssse3_aligned_by_5  - .vpx_sad16x8x3_ssse3_do_jump
        dd .vpx_sad16x8x3_ssse3_aligned_by_6  - .vpx_sad16x8x3_ssse3_do_jump
        dd .vpx_sad16x8x3_ssse3_aligned_by_7  - .vpx_sad16x8x3_ssse3_do_jump
        dd .vpx_sad16x8x3_ssse3_aligned_by_8  - .vpx_sad16x8x3_ssse3_do_jump
        dd .vpx_sad16x8x3_ssse3_aligned_by_9  - .vpx_sad16x8x3_ssse3_do_jump
        dd .vpx_sad16x8x3_ssse3_aligned_by_10 - .vpx_sad16x8x3_ssse3_do_jump
        dd .vpx_sad16x8x3_ssse3_aligned_by_11 - .vpx_sad16x8x3_ssse3_do_jump
        dd .vpx_sad16x8x3_ssse3_aligned_by_12 - .vpx_sad16x8x3_ssse3_do_jump
        dd .vpx_sad16x8x3_ssse3_aligned_by_13 - .vpx_sad16x8x3_ssse3_do_jump
        dd .vpx_sad16x8x3_ssse3_aligned_by_14 - .vpx_sad16x8x3_ssse3_do_jump
        dd .vpx_sad16x8x3_ssse3_aligned_by_15 - .vpx_sad16x8x3_ssse3_do_jump
.vpx_sad16x8x3_ssse3_skiptable:

        call .vpx_sad16x8x3_ssse3_do_jump
.vpx_sad16x8x3_ssse3_do_jump:
        pop             rcx                         ; get the address of do_jump
        mov             rax,  .vpx_sad16x8x3_ssse3_jumptable - .vpx_sad16x8x3_ssse3_do_jump
        add             rax,  rcx  ; get the absolute address of vpx_sad16x8x3_ssse3_jumptable

        movsxd          rax,  dword [rax + 4*rdx]   ; get the 32 bit offset from the jumptable
        add             rcx,        rax             ; rcx = absolute target address

        movsxd          rax,        dword ptr arg(1) ;src_stride
        movsxd          rdx,        dword ptr arg(3) ;ref_stride

        jmp             rcx

        ; alignment-specialised loop bodies; each ends with jmp ..._store_off
        PROCESS_16X8X3_OFFSET 0,  .vpx_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 1,  .vpx_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 2,  .vpx_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 3,  .vpx_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 4,  .vpx_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 5,  .vpx_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 6,  .vpx_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 7,  .vpx_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 8,  .vpx_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 9,  .vpx_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 10, .vpx_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 11, .vpx_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 12, .vpx_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 13, .vpx_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 14, .vpx_sad16x8x3_ssse3

; misalignment 15: palignr +2 would need bytes from a third vector, so fall
; back to unaligned lddqu loads (8 rows = 4 pairs)
.vpx_sad16x8x3_ssse3_aligned_by_15:

        PROCESS_16X2X3 1
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0

; fold each accumulator's two 64-bit psadbw lanes and store the three SADs
.vpx_sad16x8x3_ssse3_store_off:
        mov             rdi,        arg(4) ;Results

        movq            xmm0,       xmm5
        psrldq          xmm5,       8

        paddw           xmm0,       xmm5
        movd            [rdi],      xmm0                    ; results[0]: offset +0
;-
        movq            xmm0,       xmm6
        psrldq          xmm6,       8

        paddw           xmm0,       xmm6
        movd            [rdi+4],    xmm0                    ; results[1]: offset +1
;-
        movq            xmm0,       xmm7
        psrldq          xmm7,       8

        paddw           xmm0,       xmm7
        movd            [rdi+8],    xmm0                    ; results[2]: offset +2

    ; begin epilog
    pop         rcx
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret