;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "third_party/x86inc/x86inc.asm"

SECTION .text

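; convolve_fn: emits a width-dispatched block copy kernel.
;   %1 = copy|avg  copy overwrites dst; avg rounds to (src + dst + 1) >> 1
;   %2 = highbd    optional; pixels are 16-bit, so byte widths double
;
; A rough C sketch of what the generated functions compute (illustrative
; only, not the actual vpx_dsp prototypes):
;
;   for (y = 0; y < h; ++y) {
;     for (x = 0; x < w; ++x)
;       dst[x] = avg ? (src[x] + dst[x] + 1) >> 1 : src[x];
;     src += src_stride;
;     dst += dst_stride;
;   }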
%macro convolve_fn 1-2
INIT_XMM sse2
%ifidn %2, highbd
%define pavg pavgw
cglobal %2_convolve_%1, 4, 7, 4, src, src_stride, dst, dst_stride, \
                                 fx, fxs, fy, fys, w, h, bd
%else
%define pavg pavgb
cglobal convolve_%1, 4, 7, 4, src, src_stride, dst, dst_stride, \
                              fx, fxs, fy, fys, w, h
%endif
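  ; Dispatch on block width. For highbd, the width, pointers, and strides
  ; are doubled to convert pixel units to byte units, so the same
  ; byte-oriented loops below serve both bit depths.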
  mov r4d, dword wm
%ifidn %2, highbd
  shl r4d, 1
  shl srcq, 1
  shl src_strideq, 1
  shl dstq, 1
  shl dst_strideq, 1
%else
  cmp r4d, 4
  je .w4
%endif
  cmp r4d, 8
  je .w8
  cmp r4d, 16
  je .w16
  cmp r4d, 32
  je .w32
%ifidn %2, highbd
  cmp r4d, 64
  je .w64

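  ; 128 bytes per row: the highbd 64-pixel case, handled one row per
  ; iteration in eight 16-byte chunks.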
  mov                    r4d, dword hm
.loop128:
  movu                    m0, [srcq]
  movu                    m1, [srcq+16]
  movu                    m2, [srcq+32]
  movu                    m3, [srcq+48]
%ifidn %1, avg
  pavg                    m0, [dstq]
  pavg                    m1, [dstq+16]
  pavg                    m2, [dstq+32]
  pavg                    m3, [dstq+48]
%endif
  mova             [dstq   ], m0
  mova             [dstq+16], m1
  mova             [dstq+32], m2
  mova             [dstq+48], m3
  movu                    m0, [srcq+64]
  movu                    m1, [srcq+80]
  movu                    m2, [srcq+96]
  movu                    m3, [srcq+112]
  add                   srcq, src_strideq
%ifidn %1, avg
  pavg                    m0, [dstq+64]
  pavg                    m1, [dstq+80]
  pavg                    m2, [dstq+96]
  pavg                    m3, [dstq+112]
%endif
  mova             [dstq+64], m0
  mova             [dstq+80], m1
  mova             [dstq+96], m2
  mova            [dstq+112], m3
  add                   dstq, dst_strideq
  dec                    r4d
  jnz .loop128
  RET
%endif

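; 64 bytes per row (64-pixel 8-bit or 32-pixel highbd blocks), one row
; per iteration.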
.w64:
  mov                    r4d, dword hm
.loop64:
  movu                    m0, [srcq]
  movu                    m1, [srcq+16]
  movu                    m2, [srcq+32]
  movu                    m3, [srcq+48]
  add                   srcq, src_strideq
%ifidn %1, avg
  pavg                    m0, [dstq]
  pavg                    m1, [dstq+16]
  pavg                    m2, [dstq+32]
  pavg                    m3, [dstq+48]
%endif
  mova             [dstq   ], m0
  mova             [dstq+16], m1
  mova             [dstq+32], m2
  mova             [dstq+48], m3
  add                   dstq, dst_strideq
  dec                    r4d
  jnz .loop64
  RET

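; 32 bytes per row: two rows per iteration to keep all four registers busy.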
.w32:
  mov                    r4d, dword hm
.loop32:
  movu                    m0, [srcq]
  movu                    m1, [srcq+16]
  movu                    m2, [srcq+src_strideq]
  movu                    m3, [srcq+src_strideq+16]
  lea                   srcq, [srcq+src_strideq*2]
%ifidn %1, avg
  pavg                    m0, [dstq]
  pavg                    m1, [dstq            +16]
  pavg                    m2, [dstq+dst_strideq]
  pavg                    m3, [dstq+dst_strideq+16]
%endif
  mova [dstq               ], m0
  mova [dstq            +16], m1
  mova [dstq+dst_strideq   ], m2
  mova [dstq+dst_strideq+16], m3
  lea                   dstq, [dstq+dst_strideq*2]
  sub                    r4d, 2
  jnz .loop32
  RET

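; 16 bytes per row: four rows per iteration; r5q/r6q cache 3*stride so a
; single lea then advances src/dst by four rows.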
.w16:
  mov                    r4d, dword hm
  lea                    r5q, [src_strideq*3]
  lea                    r6q, [dst_strideq*3]
.loop16:
  movu                    m0, [srcq]
  movu                    m1, [srcq+src_strideq]
  movu                    m2, [srcq+src_strideq*2]
  movu                    m3, [srcq+r5q]
  lea                   srcq, [srcq+src_strideq*4]
%ifidn %1, avg
  pavg                    m0, [dstq]
  pavg                    m1, [dstq+dst_strideq]
  pavg                    m2, [dstq+dst_strideq*2]
  pavg                    m3, [dstq+r6q]
%endif
  mova  [dstq              ], m0
  mova  [dstq+dst_strideq  ], m1
  mova  [dstq+dst_strideq*2], m2
  mova  [dstq+r6q          ], m3
  lea                   dstq, [dstq+dst_strideq*4]
  sub                    r4d, 4
  jnz .loop16
  RET

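; 8 bytes per row: switch to MMX registers so one move covers a full row.
; For highbd builds this path also serves 4-pixel blocks (8 bytes wide).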
INIT_MMX sse
.w8:
  mov                    r4d, dword hm
  lea                    r5q, [src_strideq*3]
  lea                    r6q, [dst_strideq*3]
.loop8:
  movu                    m0, [srcq]
  movu                    m1, [srcq+src_strideq]
  movu                    m2, [srcq+src_strideq*2]
  movu                    m3, [srcq+r5q]
  lea                   srcq, [srcq+src_strideq*4]
%ifidn %1, avg
  pavg                    m0, [dstq]
  pavg                    m1, [dstq+dst_strideq]
  pavg                    m2, [dstq+dst_strideq*2]
  pavg                    m3, [dstq+r6q]
%endif
  mova  [dstq              ], m0
  mova  [dstq+dst_strideq  ], m1
  mova  [dstq+dst_strideq*2], m2
  mova  [dstq+r6q          ], m3
  lea                   dstq, [dstq+dst_strideq*4]
  sub                    r4d, 4
  jnz .loop8
  RET

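; 4 bytes per row (8-bit only): pavg with a memory operand would read
; 8 bytes, overrunning the 4-byte row, so dst rows are first loaded into
; m4-m7 with 4-byte movh loads before averaging.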
%ifnidn %2, highbd
.w4:
  mov                    r4d, dword hm
  lea                    r5q, [src_strideq*3]
  lea                    r6q, [dst_strideq*3]
.loop4:
  movh                    m0, [srcq]
  movh                    m1, [srcq+src_strideq]
  movh                    m2, [srcq+src_strideq*2]
  movh                    m3, [srcq+r5q]
  lea                   srcq, [srcq+src_strideq*4]
%ifidn %1, avg
  movh                    m4, [dstq]
  movh                    m5, [dstq+dst_strideq]
  movh                    m6, [dstq+dst_strideq*2]
  movh                    m7, [dstq+r6q]
  pavg                    m0, m4
  pavg                    m1, m5
  pavg                    m2, m6
  pavg                    m3, m7
%endif
  movh  [dstq              ], m0
  movh  [dstq+dst_strideq  ], m1
  movh  [dstq+dst_strideq*2], m2
  movh  [dstq+r6q          ], m3
  lea                   dstq, [dstq+dst_strideq*4]
  sub                    r4d, 4
  jnz .loop4
  RET
%endif
%endmacro

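; Instantiate the four variants: 8-bit copy/avg unconditionally, plus
; 16-bit (high bit depth) copy/avg when CONFIG_VP9_HIGHBITDEPTH is set.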
convolve_fn copy
convolve_fn avg
%if CONFIG_VP9_HIGHBITDEPTH
convolve_fn copy, highbd
convolve_fn avg, highbd
%endif