• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1;
2;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3;
4;  Use of this source code is governed by a BSD-style license
5;  that can be found in the LICENSE file in the root of the source
6;  tree. An additional intellectual property rights grant can be found
7;  in the file PATENTS.  All contributing project authors may
8;  be found in the AUTHORS file in the root of the source tree.
9;
10
11%include "vpx_ports/x86_abi_support.asm"
12
13;void vpx_half_horiz_vert_variance16x_h_sse2(unsigned char *ref,
14;                                            int ref_stride,
15;                                            unsigned char *src,
16;                                            int src_stride,
17;                                            unsigned int height,
18;                                            int *sum,
19;                                            unsigned int *sumsquared)
;
; Builds a 16-pixel-wide prediction from ref by averaging every byte with its
; right-hand neighbour (pavgb) and then averaging that row with the same
; result from the row below (pavgb again), and compares it with src.
;   *sum        = sum over all pixels of (prediction - src)
;   *sumsquared = sum over all pixels of (prediction - src)^2
; Both results are stored as 32-bit values (movd).
;
; Memory read: 17 bytes from each of height + 1 rows of ref, and 16 bytes
; from each of height rows of src.
;
; Notes:
;  - The row counter is decremented before it is tested, so height must be
;    non-zero.
;  - The sum is accumulated in eight signed 16-bit lanes (paddw wraps) and is
;    only widened after the loop; each lane receives two differences in
;    [-255, 255] per row.
;  - sym(), arg(), PRIVATE and the prolog/epilog macros are not defined in
;    this file; presumably they come from the included
;    vpx_ports/x86_abi_support.asm.
20global sym(vpx_half_horiz_vert_variance16x_h_sse2) PRIVATE
21sym(vpx_half_horiz_vert_variance16x_h_sse2):
22    push        rbp
23    mov         rbp, rsp
24    SHADOW_ARGS_TO_STACK 7                  ; 7 = number of C arguments
25    SAVE_XMM 7                              ; presumably preserves xmm6-xmm7 where the ABI requires it - confirm in macro
26    GET_GOT     rbx                         ; no global data is referenced by this function
27    push rsi
28    push rdi
29    ; end prolog
30
31        pxor            xmm6,           xmm6                ;  sum accumulator (8 x 16-bit lanes)
32        pxor            xmm7,           xmm7                ;  sse accumulator (4 x 32-bit lanes)
33        mov             rsi,            arg(0) ;ref
34
35        mov             rdi,            arg(2) ;src
36        movsxd          rcx,            dword ptr arg(4) ;height (loop counter)
37        movsxd          rax,            dword ptr arg(1) ;ref_stride
38        movsxd          rdx,            dword ptr arg(3)    ;src_stride
39
40        pxor            xmm0,           xmm0                ;  xmm0 = 0, used to widen bytes to words; must stay zero until the reduction
41
42        movdqu          xmm5,           XMMWORD PTR [rsi]     ;  ref row 0, bytes 0..15
43        movdqu          xmm3,           XMMWORD PTR [rsi+1]   ;  ref row 0, bytes 1..16
44        pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm5,xmm3): horizontal average of row 0
45
46        lea             rsi,            [rsi + rax]         ;  ref -> row 1
47
48vpx_half_horiz_vert_variance16x_h_1:
        ; Loop invariant: on entry xmm5 holds the horizontal average of the
        ; previous ref row and rsi points at the current ref row.
49        movdqu          xmm1,           XMMWORD PTR [rsi]     ;  current ref row, bytes 0..15
50        movdqu          xmm2,           XMMWORD PTR [rsi+1]   ;  current ref row, bytes 1..16
51        pavgb           xmm1,           xmm2                ;  xmm1 = avg(xmm1,xmm2): horizontal average of the current row
52
53        pavgb           xmm5,           xmm1                ;  xmm5 = vertical average of previous and current horizontal averages
54
55        movdqa          xmm4,           xmm5                ;  keep a copy for the upper 8 pixels
56        punpcklbw       xmm5,           xmm0                ;  xmm5 = prediction pixels 0..7 as words
57        punpckhbw       xmm4,           xmm0                ;  xmm4 = prediction pixels 8..15 as words
58
59        movq            xmm3,           QWORD PTR [rdi]     ;  xmm3 = d0,d1,d2..d7
60        punpcklbw       xmm3,           xmm0                ;  xmm3 = words of above
61        psubw           xmm5,           xmm3                ;  xmm5 -= xmm3 (differences for pixels 0..7)
62
63        movq            xmm3,           QWORD PTR [rdi+8]   ;  xmm3 = d8,d9..d15
64        punpcklbw       xmm3,           xmm0                ;  xmm3 = words of above
65        psubw           xmm4,           xmm3                ;  xmm4 -= xmm3 (differences for pixels 8..15)
66
67        paddw           xmm6,           xmm5                ;  xmm6 += accumulated column differences
68        paddw           xmm6,           xmm4
69        pmaddwd         xmm5,           xmm5                ;  square each word and add adjacent pairs -> 4 dwords
70        pmaddwd         xmm4,           xmm4
71        paddd           xmm7,           xmm5                ;  xmm7 += accumulated square column differences
72        paddd           xmm7,           xmm4
73
74        movdqa          xmm5,           xmm1                ;  save xmm1 for use on the next row
75
76        lea             rsi,            [rsi + rax]         ;  next ref row
77        lea             rdi,            [rdi + rdx]         ;  next src row
78
79        sub             rcx,            1                   ;  one row done
80        jnz             vpx_half_horiz_vert_variance16x_h_1     ;  loop until the counter reaches zero
81
        ; Horizontal reduction of the two accumulators to single 32-bit values.
82        pxor        xmm1,           xmm1                    ; xmm1 = 0
83        pxor        xmm5,           xmm5                    ; xmm5 = 0 (zero source for the dword unpacks)
84
85        punpcklwd   xmm0,           xmm6                    ; xmm0 is still zero: sum words 0..3 go to the upper half of each dword
86        punpckhwd   xmm1,           xmm6                    ; same for sum words 4..7
87        psrad       xmm0,           16                      ; arithmetic shift sign-extends each word to 32 bits
88        psrad       xmm1,           16
89        paddd       xmm0,           xmm1                    ; four 32-bit partial sums
90        movdqa      xmm1,           xmm0
91
92        movdqa      xmm6,           xmm7
93        punpckldq   xmm6,           xmm5                    ; xmm6 = sse0, 0, sse1, 0
94        punpckhdq   xmm7,           xmm5                    ; xmm7 = sse2, 0, sse3, 0
95        paddd       xmm6,           xmm7                    ; xmm6 = sse0+sse2, 0, sse1+sse3, 0
96
97        punpckldq   xmm0,           xmm5                    ; same folding for the sum partials
98        punpckhdq   xmm1,           xmm5
99        paddd       xmm0,           xmm1
100
101        movdqa      xmm7,           xmm6
102        movdqa      xmm1,           xmm0
103
104        psrldq      xmm7,           8                       ; move the upper qword down
105        psrldq      xmm1,           8
106
107        paddd       xmm6,           xmm7                    ; low dword of xmm6 = total sse
108        paddd       xmm0,           xmm1                    ; low dword of xmm0 = total sum
109
110        mov         rsi,            arg(5) ;[Sum]
111        mov         rdi,            arg(6) ;[SSE]
112
113        movd        [rsi],       xmm0                       ; *sum = low 32 bits
114        movd        [rdi],       xmm6                       ; *sumsquared = low 32 bits
115
116    ; begin epilog
117    pop rdi
118    pop rsi
119    RESTORE_GOT
120    RESTORE_XMM
121    UNSHADOW_ARGS
122    pop         rbp
123    ret
124
125
126;void vpx_half_vert_variance16x_h_sse2(unsigned char *ref,
127;                                      int ref_stride,
128;                                      unsigned char *src,
129;                                      int src_stride,
130;                                      unsigned int height,
131;                                      int *sum,
132;                                      unsigned int *sumsquared)
;
; Builds a 16-pixel-wide prediction from ref by averaging every byte with the
; byte directly below it (pavgb of two consecutive rows) and compares it with
; src.
;   *sum        = sum over all pixels of (prediction - src)
;   *sumsquared = sum over all pixels of (prediction - src)^2
; Both results are stored as 32-bit values (movd).
;
; Memory read: 16 bytes from each of height + 1 rows of ref, and 16 bytes
; from each of height rows of src.
;
; Notes:
;  - The row counter is decremented before it is tested, so height must be
;    non-zero.
;  - The sum is accumulated in eight signed 16-bit lanes (paddw wraps) and is
;    only widened after the loop; each lane receives two differences in
;    [-255, 255] per row.
;  - sym(), arg(), PRIVATE and the prolog/epilog macros are not defined in
;    this file; presumably they come from the included
;    vpx_ports/x86_abi_support.asm.
133global sym(vpx_half_vert_variance16x_h_sse2) PRIVATE
134sym(vpx_half_vert_variance16x_h_sse2):
135    push        rbp
136    mov         rbp, rsp
137    SHADOW_ARGS_TO_STACK 7                  ; 7 = number of C arguments
138    SAVE_XMM 7                              ; presumably preserves xmm6-xmm7 where the ABI requires it - confirm in macro
139    GET_GOT     rbx                         ; no global data is referenced by this function
140    push rsi
141    push rdi
142    ; end prolog
143
144        pxor            xmm6,           xmm6                ;  sum accumulator (8 x 16-bit lanes)
145        pxor            xmm7,           xmm7                ;  sse accumulator (4 x 32-bit lanes)
146        mov             rsi,            arg(0)              ;ref
147
148        mov             rdi,            arg(2)              ;src
149        movsxd          rcx,            dword ptr arg(4)    ;height (loop counter)
150        movsxd          rax,            dword ptr arg(1)    ;ref_stride
151        movsxd          rdx,            dword ptr arg(3)    ;src_stride
152
153        movdqu          xmm5,           XMMWORD PTR [rsi]   ;  ref row 0
154        lea             rsi,            [rsi + rax          ] ;  ref -> row 1
155        pxor            xmm0,           xmm0                ;  xmm0 = 0, used to widen bytes to words; must stay zero until the reduction
156
157vpx_half_vert_variance16x_h_1:
        ; Loop invariant: on entry xmm5 holds the previous ref row and rsi
        ; points at the current ref row.
158        movdqu          xmm3,           XMMWORD PTR [rsi]   ;  current ref row
159
160        pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm5,xmm3): vertical average of previous and current row
161        movdqa          xmm4,           xmm5                ;  keep a copy for the upper 8 pixels
162        punpcklbw       xmm5,           xmm0                ;  xmm5 = prediction pixels 0..7 as words
163        punpckhbw       xmm4,           xmm0                ;  xmm4 = prediction pixels 8..15 as words
164
165        movq            xmm2,           QWORD PTR [rdi]     ;  xmm2 = d0,d1,d2..d7
166        punpcklbw       xmm2,           xmm0                ;  xmm2 = words of above
167        psubw           xmm5,           xmm2                ;  differences for pixels 0..7
168        movq            xmm2,           QWORD PTR [rdi+8]   ;  xmm2 = d8,d9..d15
169        punpcklbw       xmm2,           xmm0                ;  xmm2 = words of above
170        psubw           xmm4,           xmm2                ;  differences for pixels 8..15
171
172        paddw           xmm6,           xmm5                ;  xmm6 += accumulated column differences
173        paddw           xmm6,           xmm4
174        pmaddwd         xmm5,           xmm5                ;  square each word and add adjacent pairs -> 4 dwords
175        pmaddwd         xmm4,           xmm4
176        paddd           xmm7,           xmm5                ;  xmm7 += accumulated square column differences
177        paddd           xmm7,           xmm4
178
179        movdqa          xmm5,           xmm3                ;  current row becomes the previous row for the next iteration
180
181        lea             rsi,            [rsi + rax]         ;  next ref row
182        lea             rdi,            [rdi + rdx]         ;  next src row
183
184        sub             rcx,            1                   ;  one row done
185        jnz             vpx_half_vert_variance16x_h_1       ;  loop until the counter reaches zero
186
        ; Horizontal reduction of the two accumulators to single 32-bit values.
187        pxor        xmm1,           xmm1                    ; xmm1 = 0
188        pxor        xmm5,           xmm5                    ; xmm5 = 0 (zero source for the dword unpacks)
189
190        punpcklwd   xmm0,           xmm6                    ; xmm0 is still zero: sum words 0..3 go to the upper half of each dword
191        punpckhwd   xmm1,           xmm6                    ; same for sum words 4..7
192        psrad       xmm0,           16                      ; arithmetic shift sign-extends each word to 32 bits
193        psrad       xmm1,           16
194        paddd       xmm0,           xmm1                    ; four 32-bit partial sums
195        movdqa      xmm1,           xmm0
196
197        movdqa      xmm6,           xmm7
198        punpckldq   xmm6,           xmm5                    ; xmm6 = sse0, 0, sse1, 0
199        punpckhdq   xmm7,           xmm5                    ; xmm7 = sse2, 0, sse3, 0
200        paddd       xmm6,           xmm7                    ; xmm6 = sse0+sse2, 0, sse1+sse3, 0
201
202        punpckldq   xmm0,           xmm5                    ; same folding for the sum partials
203        punpckhdq   xmm1,           xmm5
204        paddd       xmm0,           xmm1
205
206        movdqa      xmm7,           xmm6
207        movdqa      xmm1,           xmm0
208
209        psrldq      xmm7,           8                       ; move the upper qword down
210        psrldq      xmm1,           8
211
212        paddd       xmm6,           xmm7                    ; low dword of xmm6 = total sse
213        paddd       xmm0,           xmm1                    ; low dword of xmm0 = total sum
214
215        mov         rsi,            arg(5) ;[Sum]
216        mov         rdi,            arg(6) ;[SSE]
217
218        movd        [rsi],       xmm0                       ; *sum = low 32 bits
219        movd        [rdi],       xmm6                       ; *sumsquared = low 32 bits
220
221    ; begin epilog
222    pop rdi
223    pop rsi
224    RESTORE_GOT
225    RESTORE_XMM
226    UNSHADOW_ARGS
227    pop         rbp
228    ret
229
230
231;void vpx_half_horiz_variance16x_h_sse2(unsigned char *ref,
232;                                       int ref_stride,
233;                                       unsigned char *src,
234;                                       int src_stride,
235;                                       unsigned int height,
236;                                       int *sum,
237;                                       unsigned int *sumsquared)
;
; Builds a 16-pixel-wide prediction from ref by averaging every byte with its
; right-hand neighbour (pavgb of the row and the row shifted by one byte) and
; compares it with src.
;   *sum        = sum over all pixels of (prediction - src)
;   *sumsquared = sum over all pixels of (prediction - src)^2
; Both results are stored as 32-bit values (movd).
;
; Memory read: 17 bytes from each of height rows of ref, and 16 bytes from
; each of height rows of src.
;
; Notes:
;  - The row counter is decremented before it is tested, so height must be
;    non-zero.
;  - The sum is accumulated in eight signed 16-bit lanes (paddw wraps) and is
;    only widened after the loop; each lane receives two differences in
;    [-255, 255] per row.
;  - sym(), arg(), PRIVATE and the prolog/epilog macros are not defined in
;    this file; presumably they come from the included
;    vpx_ports/x86_abi_support.asm.
238global sym(vpx_half_horiz_variance16x_h_sse2) PRIVATE
239sym(vpx_half_horiz_variance16x_h_sse2):
240    push        rbp
241    mov         rbp, rsp
242    SHADOW_ARGS_TO_STACK 7                  ; 7 = number of C arguments
243    SAVE_XMM 7                              ; presumably preserves xmm6-xmm7 where the ABI requires it - confirm in macro
244    GET_GOT     rbx                         ; no global data is referenced by this function
245    push rsi
246    push rdi
247    ; end prolog
248
249        pxor            xmm6,           xmm6                ;  sum accumulator (8 x 16-bit lanes)
250        pxor            xmm7,           xmm7                ;  sse accumulator (4 x 32-bit lanes)
251        mov             rsi,            arg(0) ;ref
252
253        mov             rdi,            arg(2) ;src
254        movsxd          rcx,            dword ptr arg(4) ;height (loop counter)
255        movsxd          rax,            dword ptr arg(1) ;ref_stride
256        movsxd          rdx,            dword ptr arg(3)    ;src_stride
257
258        pxor            xmm0,           xmm0                ;  xmm0 = 0, used to widen bytes to words; must stay zero until the reduction
259
260vpx_half_horiz_variance16x_h_1:
261        movdqu          xmm5,           XMMWORD PTR [rsi]     ;  xmm5 = s0,s1,s2..s15
262        movdqu          xmm3,           XMMWORD PTR [rsi+1]   ;  xmm3 = s1,s2,s3..s16
263
264        pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm5,xmm3): horizontal average of this row
265        movdqa          xmm1,           xmm5                ;  keep a copy for the upper 8 pixels
266        punpcklbw       xmm5,           xmm0                ;  xmm5 = prediction pixels 0..7 as words
267        punpckhbw       xmm1,           xmm0                ;  xmm1 = prediction pixels 8..15 as words
268
269        movq            xmm3,           QWORD PTR [rdi]     ;  xmm3 = d0,d1,d2..d7
270        punpcklbw       xmm3,           xmm0                ;  xmm3 = words of above
271        movq            xmm2,           QWORD PTR [rdi+8]   ;  xmm2 = d8,d9..d15
272        punpcklbw       xmm2,           xmm0                ;  xmm2 = words of above
273
274        psubw           xmm5,           xmm3                ;  xmm5 -= xmm3 (differences for pixels 0..7)
275        psubw           xmm1,           xmm2                ;  xmm1 -= xmm2 (differences for pixels 8..15)
276        paddw           xmm6,           xmm5                ;  xmm6 += accumulated column differences
277        paddw           xmm6,           xmm1
278        pmaddwd         xmm5,           xmm5                ;  square each word and add adjacent pairs -> 4 dwords
279        pmaddwd         xmm1,           xmm1
280        paddd           xmm7,           xmm5                ;  xmm7 += accumulated square column differences
281        paddd           xmm7,           xmm1
282
283        lea             rsi,            [rsi + rax]         ;  next ref row
284        lea             rdi,            [rdi + rdx]         ;  next src row
285
286        sub             rcx,            1                   ;  one row done
287        jnz             vpx_half_horiz_variance16x_h_1        ;  loop until the counter reaches zero
288
        ; Horizontal reduction of the two accumulators to single 32-bit values.
289        pxor        xmm1,           xmm1                    ; xmm1 = 0
290        pxor        xmm5,           xmm5                    ; xmm5 = 0 (zero source for the dword unpacks)
291
292        punpcklwd   xmm0,           xmm6                    ; xmm0 is still zero: sum words 0..3 go to the upper half of each dword
293        punpckhwd   xmm1,           xmm6                    ; same for sum words 4..7
294        psrad       xmm0,           16                      ; arithmetic shift sign-extends each word to 32 bits
295        psrad       xmm1,           16
296        paddd       xmm0,           xmm1                    ; four 32-bit partial sums
297        movdqa      xmm1,           xmm0
298
299        movdqa      xmm6,           xmm7
300        punpckldq   xmm6,           xmm5                    ; xmm6 = sse0, 0, sse1, 0
301        punpckhdq   xmm7,           xmm5                    ; xmm7 = sse2, 0, sse3, 0
302        paddd       xmm6,           xmm7                    ; xmm6 = sse0+sse2, 0, sse1+sse3, 0
303
304        punpckldq   xmm0,           xmm5                    ; same folding for the sum partials
305        punpckhdq   xmm1,           xmm5
306        paddd       xmm0,           xmm1
307
308        movdqa      xmm7,           xmm6
309        movdqa      xmm1,           xmm0
310
311        psrldq      xmm7,           8                       ; move the upper qword down
312        psrldq      xmm1,           8
313
314        paddd       xmm6,           xmm7                    ; low dword of xmm6 = total sse
315        paddd       xmm0,           xmm1                    ; low dword of xmm0 = total sum
316
317        mov         rsi,            arg(5) ;[Sum]
318        mov         rdi,            arg(6) ;[SSE]
319
320        movd        [rsi],       xmm0                       ; *sum = low 32 bits
321        movd        [rdi],       xmm6                       ; *sumsquared = low 32 bits
322
323    ; begin epilog
324    pop rdi
325    pop rsi
326    RESTORE_GOT
327    RESTORE_XMM
328    UNSHADOW_ARGS
329    pop         rbp
330    ret
331
; Constant tables. Neither label is referenced by the code visible in this
; file; presumably they serve bilinear-filter routines defined elsewhere or
; are left over from such routines - confirm before removing.
; SECTION_RODATA is not defined in this file; presumably it is a macro from
; the included vpx_ports/x86_abi_support.asm.
332SECTION_RODATA
333;    short xmm_bi_rd[8] = { 64, 64, 64, 64,64, 64, 64, 64};
; Eight 16-bit words, each 64 (half of the 128 that every filter row below
; sums to).
334align 16
335xmm_bi_rd:
336    times 8 dw 64
; Eight rows of sixteen 16-bit words. Row i holds eight copies of
; (128 - 16*i) followed by eight copies of (16*i), for i = 0..7.
337align 16
338vpx_bilinear_filters_sse2:
339    dw 128, 128, 128, 128, 128, 128, 128, 128,  0,  0,  0,  0,  0,  0,  0,  0
340    dw 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16
341    dw 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32
342    dw 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48
343    dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
344    dw 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80
345    dw 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96
346    dw 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112
347