1 /*
2 * Copyright © 2018-2021, VideoLAN and dav1d authors
3 * Copyright © 2018-2021, Two Orioles, LLC
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions are met:
8 *
9 * 1. Redistributions of source code must retain the above copyright notice, this
10 * list of conditions and the following disclaimer.
11 *
12 * 2. Redistributions in binary form must reproduce the above copyright notice,
13 * this list of conditions and the following disclaimer in the documentation
14 * and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
20 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28 #include "src/cpu.h"
29 #include "src/mc.h"
30
/*
 * Declares the sse2, ssse3, avx2 and avx512icl variants of `name` by passing
 * BF(name, <isa>) to decl_<type>_fn() once per ISA suffix.
 *
 * The expansion intentionally has no trailing semicolon: the caller writes
 * `decl_fn(type, name);` and that semicolon terminates the last declaration.
 * A semicolon inside the macro would leave a stray empty declaration at file
 * scope after every use.
 */
#define decl_fn(type, name) \
    decl_##type##_fn(BF(name, sse2)); \
    decl_##type##_fn(BF(name, ssse3)); \
    decl_##type##_fn(BF(name, avx2)); \
    decl_##type##_fn(BF(name, avx512icl))

/*
 * Table-slot initialisers. Each one expects a pointer named `c` in scope and
 * stores BF(dav1d_put_<name>, suffix) or BF(dav1d_prep_<name>, suffix) into
 * entry `type` of the corresponding table.
 */

/* c->mc[type] <- dav1d_put_<name> for the given ISA suffix. */
#define init_mc_fn(type, name, suffix) \
    c->mc[type] = BF(dav1d_put_##name, suffix)

/* c->mct[type] <- dav1d_prep_<name> for the given ISA suffix. */
#define init_mct_fn(type, name, suffix) \
    c->mct[type] = BF(dav1d_prep_##name, suffix)

/* c->mc_scaled[type] <- dav1d_put_<name> for the given ISA suffix. */
#define init_mc_scaled_fn(type, name, suffix) \
    c->mc_scaled[type] = BF(dav1d_put_##name, suffix)

/* c->mct_scaled[type] <- dav1d_prep_<name> for the given ISA suffix. */
#define init_mct_scaled_fn(type, name, suffix) \
    c->mct_scaled[type] = BF(dav1d_prep_##name, suffix)
44
45 decl_fn(mc, dav1d_put_8tap_regular);
46 decl_fn(mc, dav1d_put_8tap_regular_smooth);
47 decl_fn(mc, dav1d_put_8tap_regular_sharp);
48 decl_fn(mc, dav1d_put_8tap_smooth);
49 decl_fn(mc, dav1d_put_8tap_smooth_regular);
50 decl_fn(mc, dav1d_put_8tap_smooth_sharp);
51 decl_fn(mc, dav1d_put_8tap_sharp);
52 decl_fn(mc, dav1d_put_8tap_sharp_regular);
53 decl_fn(mc, dav1d_put_8tap_sharp_smooth);
54 decl_fn(mc, dav1d_put_bilin);
55
56 decl_fn(mct, dav1d_prep_8tap_regular);
57 decl_fn(mct, dav1d_prep_8tap_regular_smooth);
58 decl_fn(mct, dav1d_prep_8tap_regular_sharp);
59 decl_fn(mct, dav1d_prep_8tap_smooth);
60 decl_fn(mct, dav1d_prep_8tap_smooth_regular);
61 decl_fn(mct, dav1d_prep_8tap_smooth_sharp);
62 decl_fn(mct, dav1d_prep_8tap_sharp);
63 decl_fn(mct, dav1d_prep_8tap_sharp_regular);
64 decl_fn(mct, dav1d_prep_8tap_sharp_smooth);
65 decl_fn(mct, dav1d_prep_bilin);
66
67 decl_fn(mc_scaled, dav1d_put_8tap_scaled_regular);
68 decl_fn(mc_scaled, dav1d_put_8tap_scaled_regular_smooth);
69 decl_fn(mc_scaled, dav1d_put_8tap_scaled_regular_sharp);
70 decl_fn(mc_scaled, dav1d_put_8tap_scaled_smooth);
71 decl_fn(mc_scaled, dav1d_put_8tap_scaled_smooth_regular);
72 decl_fn(mc_scaled, dav1d_put_8tap_scaled_smooth_sharp);
73 decl_fn(mc_scaled, dav1d_put_8tap_scaled_sharp);
74 decl_fn(mc_scaled, dav1d_put_8tap_scaled_sharp_regular);
75 decl_fn(mc_scaled, dav1d_put_8tap_scaled_sharp_smooth);
76 decl_fn(mc_scaled, dav1d_put_bilin_scaled);
77
78 decl_fn(mct_scaled, dav1d_prep_8tap_scaled_regular);
79 decl_fn(mct_scaled, dav1d_prep_8tap_scaled_regular_smooth);
80 decl_fn(mct_scaled, dav1d_prep_8tap_scaled_regular_sharp);
81 decl_fn(mct_scaled, dav1d_prep_8tap_scaled_smooth);
82 decl_fn(mct_scaled, dav1d_prep_8tap_scaled_smooth_regular);
83 decl_fn(mct_scaled, dav1d_prep_8tap_scaled_smooth_sharp);
84 decl_fn(mct_scaled, dav1d_prep_8tap_scaled_sharp);
85 decl_fn(mct_scaled, dav1d_prep_8tap_scaled_sharp_regular);
86 decl_fn(mct_scaled, dav1d_prep_8tap_scaled_sharp_smooth);
87 decl_fn(mct_scaled, dav1d_prep_bilin_scaled);
88
89 decl_fn(avg, dav1d_avg);
90 decl_fn(w_avg, dav1d_w_avg);
91 decl_fn(mask, dav1d_mask);
92 decl_fn(w_mask, dav1d_w_mask_420);
93 decl_fn(w_mask, dav1d_w_mask_422);
94 decl_fn(w_mask, dav1d_w_mask_444);
95 decl_fn(blend, dav1d_blend);
96 decl_fn(blend_dir, dav1d_blend_v);
97 decl_fn(blend_dir, dav1d_blend_h);
98
99 decl_fn(warp8x8, dav1d_warp_affine_8x8);
100 decl_warp8x8_fn(BF(dav1d_warp_affine_8x8, sse4));
101 decl_fn(warp8x8t, dav1d_warp_affine_8x8t);
102 decl_warp8x8t_fn(BF(dav1d_warp_affine_8x8t, sse4));
103
104 decl_fn(emu_edge, dav1d_emu_edge);
105
106 decl_fn(resize, dav1d_resize);
107
mc_dsp_init_x86(Dav1dMCDSPContext * const c)108 static ALWAYS_INLINE void mc_dsp_init_x86(Dav1dMCDSPContext *const c) {
109 const unsigned flags = dav1d_get_cpu_flags();
110
111 if(!(flags & DAV1D_X86_CPU_FLAG_SSE2))
112 return;
113
114 #if BITDEPTH == 8
115 init_mct_fn(FILTER_2D_BILINEAR, bilin, sse2);
116 init_mct_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, sse2);
117 init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, sse2);
118 init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, sse2);
119 init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, sse2);
120 init_mct_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, sse2);
121 init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, sse2);
122 init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, sse2);
123 init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, sse2);
124 init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, sse2);
125
126 c->warp8x8 = BF(dav1d_warp_affine_8x8, sse2);
127 c->warp8x8t = BF(dav1d_warp_affine_8x8t, sse2);
128 #endif
129
130 if(!(flags & DAV1D_X86_CPU_FLAG_SSSE3))
131 return;
132
133 init_mc_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, ssse3);
134 init_mc_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, ssse3);
135 init_mc_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, ssse3);
136 init_mc_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, ssse3);
137 init_mc_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, ssse3);
138 init_mc_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, ssse3);
139 init_mc_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, ssse3);
140 init_mc_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, ssse3);
141 init_mc_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, ssse3);
142 init_mc_fn(FILTER_2D_BILINEAR, bilin, ssse3);
143
144 init_mct_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, ssse3);
145 init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, ssse3);
146 init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, ssse3);
147 init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, ssse3);
148 init_mct_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, ssse3);
149 init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, ssse3);
150 init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, ssse3);
151 init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, ssse3);
152 init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, ssse3);
153 init_mct_fn(FILTER_2D_BILINEAR, bilin, ssse3);
154
155 init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR, 8tap_scaled_regular, ssse3);
156 init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_scaled_regular_smooth, ssse3);
157 init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_scaled_regular_sharp, ssse3);
158 init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_scaled_smooth_regular, ssse3);
159 init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH, 8tap_scaled_smooth, ssse3);
160 init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_scaled_smooth_sharp, ssse3);
161 init_mc_scaled_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_scaled_sharp_regular, ssse3);
162 init_mc_scaled_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_scaled_sharp_smooth, ssse3);
163 init_mc_scaled_fn(FILTER_2D_8TAP_SHARP, 8tap_scaled_sharp, ssse3);
164 init_mc_scaled_fn(FILTER_2D_BILINEAR, bilin_scaled, ssse3);
165
166 init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR, 8tap_scaled_regular, ssse3);
167 init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_scaled_regular_smooth, ssse3);
168 init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_scaled_regular_sharp, ssse3);
169 init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_scaled_smooth_regular, ssse3);
170 init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH, 8tap_scaled_smooth, ssse3);
171 init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_scaled_smooth_sharp, ssse3);
172 init_mct_scaled_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_scaled_sharp_regular, ssse3);
173 init_mct_scaled_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_scaled_sharp_smooth, ssse3);
174 init_mct_scaled_fn(FILTER_2D_8TAP_SHARP, 8tap_scaled_sharp, ssse3);
175 init_mct_scaled_fn(FILTER_2D_BILINEAR, bilin_scaled, ssse3);
176
177 c->avg = BF(dav1d_avg, ssse3);
178 c->w_avg = BF(dav1d_w_avg, ssse3);
179 c->mask = BF(dav1d_mask, ssse3);
180 c->w_mask[0] = BF(dav1d_w_mask_444, ssse3);
181 c->w_mask[1] = BF(dav1d_w_mask_422, ssse3);
182 c->w_mask[2] = BF(dav1d_w_mask_420, ssse3);
183 c->blend = BF(dav1d_blend, ssse3);
184 c->blend_v = BF(dav1d_blend_v, ssse3);
185 c->blend_h = BF(dav1d_blend_h, ssse3);
186 c->warp8x8 = BF(dav1d_warp_affine_8x8, ssse3);
187 c->warp8x8t = BF(dav1d_warp_affine_8x8t, ssse3);
188 c->emu_edge = BF(dav1d_emu_edge, ssse3);
189 c->resize = BF(dav1d_resize, ssse3);
190
191 if(!(flags & DAV1D_X86_CPU_FLAG_SSE41))
192 return;
193
194 #if BITDEPTH == 8
195 c->warp8x8 = BF(dav1d_warp_affine_8x8, sse4);
196 c->warp8x8t = BF(dav1d_warp_affine_8x8t, sse4);
197 #endif
198
199 #if ARCH_X86_64
200 if (!(flags & DAV1D_X86_CPU_FLAG_AVX2))
201 return;
202
203 init_mc_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, avx2);
204 init_mc_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, avx2);
205 init_mc_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, avx2);
206 init_mc_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, avx2);
207 init_mc_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, avx2);
208 init_mc_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, avx2);
209 init_mc_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, avx2);
210 init_mc_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, avx2);
211 init_mc_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, avx2);
212 init_mc_fn(FILTER_2D_BILINEAR, bilin, avx2);
213
214 init_mct_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, avx2);
215 init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, avx2);
216 init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, avx2);
217 init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, avx2);
218 init_mct_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, avx2);
219 init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, avx2);
220 init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, avx2);
221 init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, avx2);
222 init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, avx2);
223 init_mct_fn(FILTER_2D_BILINEAR, bilin, avx2);
224
225 init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR, 8tap_scaled_regular, avx2);
226 init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_scaled_regular_smooth, avx2);
227 init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_scaled_regular_sharp, avx2);
228 init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_scaled_smooth_regular, avx2);
229 init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH, 8tap_scaled_smooth, avx2);
230 init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_scaled_smooth_sharp, avx2);
231 init_mc_scaled_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_scaled_sharp_regular, avx2);
232 init_mc_scaled_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_scaled_sharp_smooth, avx2);
233 init_mc_scaled_fn(FILTER_2D_8TAP_SHARP, 8tap_scaled_sharp, avx2);
234 init_mc_scaled_fn(FILTER_2D_BILINEAR, bilin_scaled, avx2);
235
236 init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR, 8tap_scaled_regular, avx2);
237 init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_scaled_regular_smooth, avx2);
238 init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_scaled_regular_sharp, avx2);
239 init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_scaled_smooth_regular, avx2);
240 init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH, 8tap_scaled_smooth, avx2);
241 init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_scaled_smooth_sharp, avx2);
242 init_mct_scaled_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_scaled_sharp_regular, avx2);
243 init_mct_scaled_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_scaled_sharp_smooth, avx2);
244 init_mct_scaled_fn(FILTER_2D_8TAP_SHARP, 8tap_scaled_sharp, avx2);
245 init_mct_scaled_fn(FILTER_2D_BILINEAR, bilin_scaled, avx2);
246
247 c->avg = BF(dav1d_avg, avx2);
248 c->w_avg = BF(dav1d_w_avg, avx2);
249 c->mask = BF(dav1d_mask, avx2);
250 c->w_mask[0] = BF(dav1d_w_mask_444, avx2);
251 c->w_mask[1] = BF(dav1d_w_mask_422, avx2);
252 c->w_mask[2] = BF(dav1d_w_mask_420, avx2);
253 c->blend = BF(dav1d_blend, avx2);
254 c->blend_v = BF(dav1d_blend_v, avx2);
255 c->blend_h = BF(dav1d_blend_h, avx2);
256 c->warp8x8 = BF(dav1d_warp_affine_8x8, avx2);
257 c->warp8x8t = BF(dav1d_warp_affine_8x8t, avx2);
258 c->emu_edge = BF(dav1d_emu_edge, avx2);
259 c->resize = BF(dav1d_resize, avx2);
260
261 if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL))
262 return;
263
264 init_mc_fn (FILTER_2D_8TAP_REGULAR, 8tap_regular, avx512icl);
265 init_mc_fn (FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, avx512icl);
266 init_mc_fn (FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, avx512icl);
267 init_mc_fn (FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, avx512icl);
268 init_mc_fn (FILTER_2D_8TAP_SMOOTH, 8tap_smooth, avx512icl);
269 init_mc_fn (FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, avx512icl);
270 init_mc_fn (FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, avx512icl);
271 init_mc_fn (FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, avx512icl);
272 init_mc_fn (FILTER_2D_8TAP_SHARP, 8tap_sharp, avx512icl);
273 init_mc_fn (FILTER_2D_BILINEAR, bilin, avx512icl);
274
275 init_mct_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, avx512icl);
276 init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, avx512icl);
277 init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, avx512icl);
278 init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, avx512icl);
279 init_mct_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, avx512icl);
280 init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, avx512icl);
281 init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, avx512icl);
282 init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, avx512icl);
283 init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, avx512icl);
284 init_mct_fn(FILTER_2D_BILINEAR, bilin, avx512icl);
285
286 c->avg = BF(dav1d_avg, avx512icl);
287 c->w_avg = BF(dav1d_w_avg, avx512icl);
288 c->mask = BF(dav1d_mask, avx512icl);
289 c->w_mask[0] = BF(dav1d_w_mask_444, avx512icl);
290 c->w_mask[1] = BF(dav1d_w_mask_422, avx512icl);
291 c->w_mask[2] = BF(dav1d_w_mask_420, avx512icl);
292 c->blend = BF(dav1d_blend, avx512icl);
293 c->blend_v = BF(dav1d_blend_v, avx512icl);
294 c->blend_h = BF(dav1d_blend_h, avx512icl);
295
296 if (!(flags & DAV1D_X86_CPU_FLAG_SLOW_GATHER)) {
297 c->resize = BF(dav1d_resize, avx512icl);
298 c->warp8x8 = BF(dav1d_warp_affine_8x8, avx512icl);
299 c->warp8x8t = BF(dav1d_warp_affine_8x8t, avx512icl);
300 }
301 #endif
302 }
303