;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "third_party/x86inc/x86inc.asm"

SECTION .text

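; convolve_fn generates the no-filter "copy" and "avg" convolution kernels.
; %1 selects the operation (copy or avg); an optional %2 of "highbd" emits
; the high-bitdepth variant, which operates on 16-bit pixels and therefore
; averages with pavgw instead of pavgb.
;
; Roughly equivalent C for the copy case (a sketch only; the parameter list
; mirrors the convolve prototype, and the filter/step arguments are unused
; by these kernels):
;
;   void convolve_copy(const uint8_t *src, ptrdiff_t src_stride,
;                      uint8_t *dst, ptrdiff_t dst_stride,
;                      const void *filter, int x0_q4, int x_step_q4,
;                      int y0_q4, int y_step_q4, int w, int h) {
;     for (int r = 0; r < h; ++r) {
;       memcpy(dst, src, w);
;       src += src_stride;
;       dst += dst_stride;
;     }
;   }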
%macro convolve_fn 1-2
%ifidn %1, avg
%define AUX_XMM_REGS 4
%else
%define AUX_XMM_REGS 0
%endif
%ifidn %2, highbd
%define pavg pavgw
cglobal %2_convolve_%1, 4, 8, 4+AUX_XMM_REGS, src, src_stride, \
                                              dst, dst_stride, \
                                              f, fxo, fxs, fyo, fys, w, h, bd
%else
%define pavg pavgb
cglobal convolve_%1, 4, 8, 4+AUX_XMM_REGS, src, src_stride, \
                                           dst, dst_stride, \
                                           f, fxo, fxs, fyo, fys, w, h
%endif
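; Dispatch on block width. High-bitdepth pixels are 16 bits wide, so the
; width and both strides are doubled up front to convert them to byte
; counts; the 4-wide highbd case then takes the 8-byte .w8 path, and so on.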
  mov r4d, dword wm
%ifidn %2, highbd
  shl r4d, 1
  shl src_strideq, 1
  shl dst_strideq, 1
%else
  cmp r4d, 4
  je .w4
%endif
  cmp r4d, 8
  je .w8
  cmp r4d, 16
  je .w16
  cmp r4d, 32
  je .w32
%ifidn %2, highbd
  cmp r4d, 64
  je .w64

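; Fall-through: highbd w == 64, i.e. 128 bytes per row, handled as two
; 64-byte halves per iteration.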
  mov                    r4d, dword hm
.loop128:
  movu                    m0, [srcq]
  movu                    m1, [srcq+16]
  movu                    m2, [srcq+32]
  movu                    m3, [srcq+48]
%ifidn %1, avg
  pavg                    m0, [dstq]
  pavg                    m1, [dstq+16]
  pavg                    m2, [dstq+32]
  pavg                    m3, [dstq+48]
%endif
  mova             [dstq   ], m0
  mova             [dstq+16], m1
  mova             [dstq+32], m2
  mova             [dstq+48], m3
  movu                    m0, [srcq+64]
  movu                    m1, [srcq+80]
  movu                    m2, [srcq+96]
  movu                    m3, [srcq+112]
  add                   srcq, src_strideq
%ifidn %1, avg
  pavg                    m0, [dstq+64]
  pavg                    m1, [dstq+80]
  pavg                    m2, [dstq+96]
  pavg                    m3, [dstq+112]
%endif
  mova             [dstq+64], m0
  mova             [dstq+80], m1
  mova             [dstq+96], m2
  mova            [dstq+112], m3
  add                   dstq, dst_strideq
  dec                    r4d
  jnz .loop128
  RET
%endif

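; 64 bytes per row (w == 64 low-bitdepth, w == 32 highbd): four unaligned
; 16-byte loads, an optional average against dst, four aligned stores.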
.w64:
  mov                    r4d, dword hm
.loop64:
  movu                    m0, [srcq]
  movu                    m1, [srcq+16]
  movu                    m2, [srcq+32]
  movu                    m3, [srcq+48]
  add                   srcq, src_strideq
%ifidn %1, avg
  pavg                    m0, [dstq]
  pavg                    m1, [dstq+16]
  pavg                    m2, [dstq+32]
  pavg                    m3, [dstq+48]
%endif
  mova             [dstq   ], m0
  mova             [dstq+16], m1
  mova             [dstq+32], m2
  mova             [dstq+48], m3
  add                   dstq, dst_strideq
  dec                    r4d
  jnz .loop64
  RET

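; 32 bytes per row: process two rows per iteration (block heights at this
; width are even, so r4d reaches exactly zero).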
.w32:
  mov                    r4d, dword hm
.loop32:
  movu                    m0, [srcq]
  movu                    m1, [srcq+16]
  movu                    m2, [srcq+src_strideq]
  movu                    m3, [srcq+src_strideq+16]
  lea                   srcq, [srcq+src_strideq*2]
%ifidn %1, avg
  pavg                    m0, [dstq]
  pavg                    m1, [dstq            +16]
  pavg                    m2, [dstq+dst_strideq]
  pavg                    m3, [dstq+dst_strideq+16]
%endif
  mova [dstq               ], m0
  mova [dstq            +16], m1
  mova [dstq+dst_strideq   ], m2
  mova [dstq+dst_strideq+16], m3
  lea                   dstq, [dstq+dst_strideq*2]
  sub                    r4d, 2
  jnz .loop32
  RET

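; 16 bytes per row: process four rows per iteration, keeping 3*stride in
; r5/r6 so all four rows are addressed without extra pointer updates.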
.w16:
  mov                    r4d, dword hm
  lea                    r5q, [src_strideq*3]
  lea                    r6q, [dst_strideq*3]
.loop16:
  movu                    m0, [srcq]
  movu                    m1, [srcq+src_strideq]
  movu                    m2, [srcq+src_strideq*2]
  movu                    m3, [srcq+r5q]
  lea                   srcq, [srcq+src_strideq*4]
%ifidn %1, avg
  pavg                    m0, [dstq]
  pavg                    m1, [dstq+dst_strideq]
  pavg                    m2, [dstq+dst_strideq*2]
  pavg                    m3, [dstq+r6q]
%endif
  mova  [dstq              ], m0
  mova  [dstq+dst_strideq  ], m1
  mova  [dstq+dst_strideq*2], m2
  mova  [dstq+r6q          ], m3
  lea                   dstq, [dstq+dst_strideq*4]
  sub                    r4d, 4
  jnz .loop16
  RET

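; 8 bytes per row: movh moves the low quadword. For avg, dst rows are first
; loaded into m4-m7 (hence AUX_XMM_REGS) because pavg's memory-operand form
; would access a full 16 bytes.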
.w8:
  mov                    r4d, dword hm
  lea                    r5q, [src_strideq*3]
  lea                    r6q, [dst_strideq*3]
.loop8:
  movh                    m0, [srcq]
  movh                    m1, [srcq+src_strideq]
  movh                    m2, [srcq+src_strideq*2]
  movh                    m3, [srcq+r5q]
  lea                   srcq, [srcq+src_strideq*4]
%ifidn %1, avg
  movh                    m4, [dstq]
  movh                    m5, [dstq+dst_strideq]
  movh                    m6, [dstq+dst_strideq*2]
  movh                    m7, [dstq+r6q]
  pavg                    m0, m4
  pavg                    m1, m5
  pavg                    m2, m6
  pavg                    m3, m7
%endif
  movh  [dstq              ], m0
  movh  [dstq+dst_strideq  ], m1
  movh  [dstq+dst_strideq*2], m2
  movh  [dstq+r6q          ], m3
  lea                   dstq, [dstq+dst_strideq*4]
  sub                    r4d, 4
  jnz .loop8
  RET

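; 4 bytes per row, low-bitdepth only (highbd w == 4 is 8 bytes wide and
; takes the .w8 path above); movd moves a single dword.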
%ifnidn %2, highbd
.w4:
  mov                    r4d, dword hm
  lea                    r5q, [src_strideq*3]
  lea                    r6q, [dst_strideq*3]
.loop4:
  movd                    m0, [srcq]
  movd                    m1, [srcq+src_strideq]
  movd                    m2, [srcq+src_strideq*2]
  movd                    m3, [srcq+r5q]
  lea                   srcq, [srcq+src_strideq*4]
%ifidn %1, avg
  movd                    m4, [dstq]
  movd                    m5, [dstq+dst_strideq]
  movd                    m6, [dstq+dst_strideq*2]
  movd                    m7, [dstq+r6q]
  pavg                    m0, m4
  pavg                    m1, m5
  pavg                    m2, m6
  pavg                    m3, m7
%endif
  movd  [dstq              ], m0
  movd  [dstq+dst_strideq  ], m1
  movd  [dstq+dst_strideq*2], m2
  movd  [dstq+r6q          ], m3
  lea                   dstq, [dstq+dst_strideq*4]
  sub                    r4d, 4
  jnz .loop4
  RET
%endif
%endmacro

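; Instantiate the SSE2 copy and avg kernels; the high-bitdepth variants are
; emitted only when the build enables CONFIG_VP9_HIGHBITDEPTH.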
INIT_XMM sse2
convolve_fn copy
convolve_fn avg
%if CONFIG_VP9_HIGHBITDEPTH
convolve_fn copy, highbd
convolve_fn avg, highbd
%endif