;
; Copyright (c) 2016, Alliance for Open Media. All rights reserved
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;

;

%include "third_party/x86inc/x86inc.asm"

SECTION .text

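; convolve_fn: generates an unrolled SSE2 block copy kernel. %1 selects
; "copy" or "avg" (averages the source into the existing destination with
; pavg); the optional second argument "highbd" emits the high-bitdepth
; variant, which operates on 16-bit pixels and therefore scales widths and
; strides to byte counts before dispatching.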
%macro convolve_fn 1-2
%ifidn %1, avg
%define AUX_XMM_REGS 4
%else
%define AUX_XMM_REGS 0
%endif
%ifidn %2, highbd
%define pavg pavgw
cglobal %2_convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \
                                              dst, dst_stride, \
                                              fx, fxs, fy, fys, w, h, bd
%else
%define pavg pavgb
cglobal convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \
                                           dst, dst_stride, \
                                           fx, fxs, fy, fys, w, h
%endif
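; Dispatch on the block width in r4d. Each width gets its own unrolled loop;
; for high bitdepth the width, pointers, and strides are first scaled from
; 16-bit elements to bytes.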
  mov r4d, dword wm
%ifidn %2, highbd
  shl r4d, 1
  shl srcq, 1
  shl src_strideq, 1
  shl dstq, 1
  shl dst_strideq, 1
%else
  cmp r4d, 4
  je .w4
%endif
  cmp r4d, 8
  je .w8
  cmp r4d, 16
  je .w16
  cmp r4d, 32
  je .w32

  cmp r4d, 64
  je .w64
%ifidn %2, highbd
  cmp r4d, 128
  je .w128

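; 256 bytes per row: reached only by 128-wide high-bitdepth blocks (the width
; was doubled above), copied as sixteen 16-byte moves per row.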
.w256:
  mov                    r4d, dword hm
.loop256:
  movu                    m0, [srcq]
  movu                    m1, [srcq+16]
  movu                    m2, [srcq+32]
  movu                    m3, [srcq+48]
%ifidn %1, avg
  pavg                    m0, [dstq]
  pavg                    m1, [dstq+16]
  pavg                    m2, [dstq+32]
  pavg                    m3, [dstq+48]
%endif
  mova             [dstq   ], m0
  mova             [dstq+16], m1
  mova             [dstq+32], m2
  mova             [dstq+48], m3
  movu                    m0, [srcq+64]
  movu                    m1, [srcq+80]
  movu                    m2, [srcq+96]
  movu                    m3, [srcq+112]
%ifidn %1, avg
  pavg                    m0, [dstq+64]
  pavg                    m1, [dstq+80]
  pavg                    m2, [dstq+96]
  pavg                    m3, [dstq+112]
%endif
  mova             [dstq+64], m0
  mova             [dstq+80], m1
  mova             [dstq+96], m2
  mova            [dstq+112], m3
  movu                    m0, [srcq+128]
  movu                    m1, [srcq+128+16]
  movu                    m2, [srcq+128+32]
  movu                    m3, [srcq+128+48]
%ifidn %1, avg
  pavg                    m0, [dstq+128]
  pavg                    m1, [dstq+128+16]
  pavg                    m2, [dstq+128+32]
  pavg                    m3, [dstq+128+48]
%endif
  mova         [dstq+128   ], m0
  mova         [dstq+128+16], m1
  mova         [dstq+128+32], m2
  mova         [dstq+128+48], m3
  movu                    m0, [srcq+128+64]
  movu                    m1, [srcq+128+80]
  movu                    m2, [srcq+128+96]
  movu                    m3, [srcq+128+112]
  add                   srcq, src_strideq
%ifidn %1, avg
  pavg                    m0, [dstq+128+64]
  pavg                    m1, [dstq+128+80]
  pavg                    m2, [dstq+128+96]
  pavg                    m3, [dstq+128+112]
%endif
  mova         [dstq+128+64], m0
  mova         [dstq+128+80], m1
  mova         [dstq+128+96], m2
  mova        [dstq+128+112], m3
  add                   dstq, dst_strideq
  sub                    r4d, 1
  jnz .loop256
  RET
%endif

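; 128 bytes per row: 128-wide 8-bit blocks or 64-wide high-bitdepth blocks.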
.w128:
  mov                    r4d, dword hm
.loop128:
  movu                    m0, [srcq]
  movu                    m1, [srcq+16]
  movu                    m2, [srcq+32]
  movu                    m3, [srcq+48]
%ifidn %1, avg
  pavg                    m0, [dstq]
  pavg                    m1, [dstq+16]
  pavg                    m2, [dstq+32]
  pavg                    m3, [dstq+48]
%endif
  mova             [dstq   ], m0
  mova             [dstq+16], m1
  mova             [dstq+32], m2
  mova             [dstq+48], m3
  movu                    m0, [srcq+64]
  movu                    m1, [srcq+80]
  movu                    m2, [srcq+96]
  movu                    m3, [srcq+112]
  add                   srcq, src_strideq
%ifidn %1, avg
  pavg                    m0, [dstq+64]
  pavg                    m1, [dstq+80]
  pavg                    m2, [dstq+96]
  pavg                    m3, [dstq+112]
%endif
  mova             [dstq+64], m0
  mova             [dstq+80], m1
  mova             [dstq+96], m2
  mova            [dstq+112], m3
  add                   dstq, dst_strideq
  sub                    r4d, 1
  jnz .loop128
  RET

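; 64 bytes per row, one row per iteration.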
.w64:
  mov                    r4d, dword hm
.loop64:
  movu                    m0, [srcq]
  movu                    m1, [srcq+16]
  movu                    m2, [srcq+32]
  movu                    m3, [srcq+48]
  add                   srcq, src_strideq
%ifidn %1, avg
  pavg                    m0, [dstq]
  pavg                    m1, [dstq+16]
  pavg                    m2, [dstq+32]
  pavg                    m3, [dstq+48]
%endif
  mova             [dstq   ], m0
  mova             [dstq+16], m1
  mova             [dstq+32], m2
  mova             [dstq+48], m3
  add                   dstq, dst_strideq
  sub                    r4d, 1
  jnz .loop64
  RET

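; 32 bytes per row, two rows per iteration.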
.w32:
  mov                    r4d, dword hm
.loop32:
  movu                    m0, [srcq]
  movu                    m1, [srcq+16]
  movu                    m2, [srcq+src_strideq]
  movu                    m3, [srcq+src_strideq+16]
  lea                   srcq, [srcq+src_strideq*2]
%ifidn %1, avg
  pavg                    m0, [dstq]
  pavg                    m1, [dstq            +16]
  pavg                    m2, [dstq+dst_strideq]
  pavg                    m3, [dstq+dst_strideq+16]
%endif
  mova [dstq               ], m0
  mova [dstq            +16], m1
  mova [dstq+dst_strideq   ], m2
  mova [dstq+dst_strideq+16], m3
  lea                   dstq, [dstq+dst_strideq*2]
  sub                    r4d, 2
  jnz .loop32
  RET

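; 16 bytes per row, four rows per iteration; r5q/r6q hold 3*stride so four
; rows can be addressed without updating the base pointers in between.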
.w16:
  mov                    r4d, dword hm
  lea                    r5q, [src_strideq*3]
  lea                    r6q, [dst_strideq*3]
.loop16:
  movu                    m0, [srcq]
  movu                    m1, [srcq+src_strideq]
  movu                    m2, [srcq+src_strideq*2]
  movu                    m3, [srcq+r5q]
  lea                   srcq, [srcq+src_strideq*4]
%ifidn %1, avg
  pavg                    m0, [dstq]
  pavg                    m1, [dstq+dst_strideq]
  pavg                    m2, [dstq+dst_strideq*2]
  pavg                    m3, [dstq+r6q]
%endif
  mova  [dstq              ], m0
  mova  [dstq+dst_strideq  ], m1
  mova  [dstq+dst_strideq*2], m2
  mova  [dstq+r6q          ], m3
  lea                   dstq, [dstq+dst_strideq*4]
  sub                    r4d, 4
  jnz .loop16
  RET

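; 8 bytes per row, four rows per iteration.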
.w8:
  mov                    r4d, dword hm
  lea                    r5q, [src_strideq*3]
  lea                    r6q, [dst_strideq*3]
.loop8:
  movh                    m0, [srcq]
  movh                    m1, [srcq+src_strideq]
  movh                    m2, [srcq+src_strideq*2]
  movh                    m3, [srcq+r5q]
  lea                   srcq, [srcq+src_strideq*4]
%ifidn %1, avg
  movh                    m4, [dstq]
  movh                    m5, [dstq+dst_strideq]
  movh                    m6, [dstq+dst_strideq*2]
  movh                    m7, [dstq+r6q]
  pavg                    m0, m4
  pavg                    m1, m5
  pavg                    m2, m6
  pavg                    m3, m7
%endif
  movh  [dstq              ], m0
  movh  [dstq+dst_strideq  ], m1
  movh  [dstq+dst_strideq*2], m2
  movh  [dstq+r6q          ], m3
  lea                   dstq, [dstq+dst_strideq*4]
  sub                    r4d, 4
  jnz .loop8
  RET

%ifnidn %2, highbd
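; 4 bytes per row, 8-bit only: a 4-pixel high-bitdepth row is 8 bytes and is
; handled by .w8 above.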
.w4:
  mov                    r4d, dword hm
  lea                    r5q, [src_strideq*3]
  lea                    r6q, [dst_strideq*3]
.loop4:
  movd                    m0, [srcq]
  movd                    m1, [srcq+src_strideq]
  movd                    m2, [srcq+src_strideq*2]
  movd                    m3, [srcq+r5q]
  lea                   srcq, [srcq+src_strideq*4]
%ifidn %1, avg
  movd                    m4, [dstq]
  movd                    m5, [dstq+dst_strideq]
  movd                    m6, [dstq+dst_strideq*2]
  movd                    m7, [dstq+r6q]
  pavg                    m0, m4
  pavg                    m1, m5
  pavg                    m2, m6
  pavg                    m3, m7
%endif
  movd  [dstq              ], m0
  movd  [dstq+dst_strideq  ], m1
  movd  [dstq+dst_strideq*2], m2
  movd  [dstq+r6q          ], m3
  lea                   dstq, [dstq+dst_strideq*4]
  sub                    r4d, 4
  jnz .loop4
  RET
%endif
%endmacro

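; Instantiate the SSE2 kernels: 8-bit copy, 8-bit avg, and high-bitdepth copy.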
INIT_XMM sse2
convolve_fn copy
convolve_fn avg
convolve_fn copy, highbd