1 /* GStreamer
2 * Copyright (C) <2016> Wim Taymans <wim.taymans@gmail.com>
3 *
4 * This library is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Library General Public
6 * License as published by the Free Software Foundation; either
7 * version 2 of the License, or (at your option) any later version.
8 *
9 * This library is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Library General Public License for more details.
13 *
14 * You should have received a copy of the GNU Library General Public
15 * License along with this library; if not, write to the
16 * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
17 * Boston, MA 02110-1301, USA.
18 */
19
20 #ifdef HAVE_CONFIG_H
21 # include "config.h"
22 #endif
23
24 #include "audio-resampler-x86-sse2.h"
25
26 #if defined (HAVE_EMMINTRIN_H) && defined(__SSE2__)
27 #include <emmintrin.h>
28
29 static inline void
inner_product_gint16_full_1_sse2(gint16 * o,const gint16 * a,const gint16 * b,gint len,const gint16 * icoeff,gint bstride)30 inner_product_gint16_full_1_sse2 (gint16 * o, const gint16 * a,
31 const gint16 * b, gint len, const gint16 * icoeff, gint bstride)
32 {
33 gint i;
34 __m128i sum, t;
35
36 sum = _mm_setzero_si128 ();
37
38 for (i = 0; i < len; i += 16) {
39 t = _mm_loadu_si128 ((__m128i *) (a + i));
40 sum =
41 _mm_add_epi32 (sum, _mm_madd_epi16 (t,
42 _mm_load_si128 ((__m128i *) (b + i + 0))));
43
44 t = _mm_loadu_si128 ((__m128i *) (a + i + 8));
45 sum =
46 _mm_add_epi32 (sum, _mm_madd_epi16 (t,
47 _mm_load_si128 ((__m128i *) (b + i + 8))));
48 }
49 sum = _mm_add_epi32 (sum, _mm_shuffle_epi32 (sum, _MM_SHUFFLE (2, 3, 2, 3)));
50 sum = _mm_add_epi32 (sum, _mm_shuffle_epi32 (sum, _MM_SHUFFLE (1, 1, 1, 1)));
51
52 sum = _mm_add_epi32 (sum, _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
53 sum = _mm_srai_epi32 (sum, PRECISION_S16);
54 sum = _mm_packs_epi32 (sum, sum);
55 *o = _mm_extract_epi16 (sum, 0);
56 }
57
58 static inline void
inner_product_gint16_linear_1_sse2(gint16 * o,const gint16 * a,const gint16 * b,gint len,const gint16 * icoeff,gint bstride)59 inner_product_gint16_linear_1_sse2 (gint16 * o, const gint16 * a,
60 const gint16 * b, gint len, const gint16 * icoeff, gint bstride)
61 {
62 gint i = 0;
63 __m128i sum[2], t;
64 __m128i f = _mm_set_epi64x (0, *((gint64 *) icoeff));
65 const gint16 *c[2] = { (gint16 *) ((gint8 *) b + 0 * bstride),
66 (gint16 *) ((gint8 *) b + 1 * bstride)
67 };
68
69 sum[0] = sum[1] = _mm_setzero_si128 ();
70 f = _mm_unpacklo_epi16 (f, sum[0]);
71
72 for (; i < len; i += 16) {
73 t = _mm_loadu_si128 ((__m128i *) (a + i + 0));
74 sum[0] =
75 _mm_add_epi32 (sum[0], _mm_madd_epi16 (t,
76 _mm_load_si128 ((__m128i *) (c[0] + i + 0))));
77 sum[1] =
78 _mm_add_epi32 (sum[1], _mm_madd_epi16 (t,
79 _mm_load_si128 ((__m128i *) (c[1] + i + 0))));
80
81 t = _mm_loadu_si128 ((__m128i *) (a + i + 8));
82 sum[0] =
83 _mm_add_epi32 (sum[0], _mm_madd_epi16 (t,
84 _mm_load_si128 ((__m128i *) (c[0] + i + 8))));
85 sum[1] =
86 _mm_add_epi32 (sum[1], _mm_madd_epi16 (t,
87 _mm_load_si128 ((__m128i *) (c[1] + i + 8))));
88 }
89 sum[0] = _mm_srai_epi32 (sum[0], PRECISION_S16);
90 sum[1] = _mm_srai_epi32 (sum[1], PRECISION_S16);
91
92 sum[0] =
93 _mm_madd_epi16 (sum[0], _mm_shuffle_epi32 (f, _MM_SHUFFLE (0, 0, 0, 0)));
94 sum[1] =
95 _mm_madd_epi16 (sum[1], _mm_shuffle_epi32 (f, _MM_SHUFFLE (1, 1, 1, 1)));
96 sum[0] = _mm_add_epi32 (sum[0], sum[1]);
97
98 sum[0] =
99 _mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (2, 3, 2,
100 3)));
101 sum[0] =
102 _mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (1, 1, 1,
103 1)));
104
105 sum[0] = _mm_add_epi32 (sum[0], _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
106 sum[0] = _mm_srai_epi32 (sum[0], PRECISION_S16);
107 sum[0] = _mm_packs_epi32 (sum[0], sum[0]);
108 *o = _mm_extract_epi16 (sum[0], 0);
109 }
110
111 static inline void
inner_product_gint16_cubic_1_sse2(gint16 * o,const gint16 * a,const gint16 * b,gint len,const gint16 * icoeff,gint bstride)112 inner_product_gint16_cubic_1_sse2 (gint16 * o, const gint16 * a,
113 const gint16 * b, gint len, const gint16 * icoeff, gint bstride)
114 {
115 gint i = 0;
116 __m128i sum[4], t[4];
117 __m128i f = _mm_set_epi64x (0, *((long long *) icoeff));
118 const gint16 *c[4] = { (gint16 *) ((gint8 *) b + 0 * bstride),
119 (gint16 *) ((gint8 *) b + 1 * bstride),
120 (gint16 *) ((gint8 *) b + 2 * bstride),
121 (gint16 *) ((gint8 *) b + 3 * bstride)
122 };
123
124 sum[0] = sum[1] = sum[2] = sum[3] = _mm_setzero_si128 ();
125 f = _mm_unpacklo_epi16 (f, sum[0]);
126
127 for (; i < len; i += 8) {
128 t[0] = _mm_loadu_si128 ((__m128i *) (a + i));
129 sum[0] =
130 _mm_add_epi32 (sum[0], _mm_madd_epi16 (t[0],
131 _mm_load_si128 ((__m128i *) (c[0] + i))));
132 sum[1] =
133 _mm_add_epi32 (sum[1], _mm_madd_epi16 (t[0],
134 _mm_load_si128 ((__m128i *) (c[1] + i))));
135 sum[2] =
136 _mm_add_epi32 (sum[2], _mm_madd_epi16 (t[0],
137 _mm_load_si128 ((__m128i *) (c[2] + i))));
138 sum[3] =
139 _mm_add_epi32 (sum[3], _mm_madd_epi16 (t[0],
140 _mm_load_si128 ((__m128i *) (c[3] + i))));
141 }
142 t[0] = _mm_unpacklo_epi32 (sum[0], sum[1]);
143 t[1] = _mm_unpacklo_epi32 (sum[2], sum[3]);
144 t[2] = _mm_unpackhi_epi32 (sum[0], sum[1]);
145 t[3] = _mm_unpackhi_epi32 (sum[2], sum[3]);
146
147 sum[0] =
148 _mm_add_epi32 (_mm_unpacklo_epi64 (t[0], t[1]), _mm_unpackhi_epi64 (t[0],
149 t[1]));
150 sum[2] =
151 _mm_add_epi32 (_mm_unpacklo_epi64 (t[2], t[3]), _mm_unpackhi_epi64 (t[2],
152 t[3]));
153 sum[0] = _mm_add_epi32 (sum[0], sum[2]);
154
155 sum[0] = _mm_srai_epi32 (sum[0], PRECISION_S16);
156 sum[0] = _mm_madd_epi16 (sum[0], f);
157
158 sum[0] =
159 _mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (2, 3, 2,
160 3)));
161 sum[0] =
162 _mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (1, 1, 1,
163 1)));
164
165 sum[0] = _mm_add_epi32 (sum[0], _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
166 sum[0] = _mm_srai_epi32 (sum[0], PRECISION_S16);
167 sum[0] = _mm_packs_epi32 (sum[0], sum[0]);
168 *o = _mm_extract_epi16 (sum[0], 0);
169 }
170
171 static inline void
inner_product_gdouble_full_1_sse2(gdouble * o,const gdouble * a,const gdouble * b,gint len,const gdouble * icoeff,gint bstride)172 inner_product_gdouble_full_1_sse2 (gdouble * o, const gdouble * a,
173 const gdouble * b, gint len, const gdouble * icoeff, gint bstride)
174 {
175 gint i = 0;
176 __m128d sum = _mm_setzero_pd ();
177
178 for (; i < len; i += 8) {
179 sum =
180 _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 0),
181 _mm_load_pd (b + i + 0)));
182 sum =
183 _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 2),
184 _mm_load_pd (b + i + 2)));
185 sum =
186 _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 4),
187 _mm_load_pd (b + i + 4)));
188 sum =
189 _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 6),
190 _mm_load_pd (b + i + 6)));
191 }
192 sum = _mm_add_sd (sum, _mm_unpackhi_pd (sum, sum));
193 _mm_store_sd (o, sum);
194 }
195
196 static inline void
inner_product_gdouble_linear_1_sse2(gdouble * o,const gdouble * a,const gdouble * b,gint len,const gdouble * icoeff,gint bstride)197 inner_product_gdouble_linear_1_sse2 (gdouble * o, const gdouble * a,
198 const gdouble * b, gint len, const gdouble * icoeff, gint bstride)
199 {
200 gint i = 0;
201 __m128d sum[2], t;
202 const gdouble *c[2] = { (gdouble *) ((gint8 *) b + 0 * bstride),
203 (gdouble *) ((gint8 *) b + 1 * bstride)
204 };
205
206 sum[0] = sum[1] = _mm_setzero_pd ();
207
208 for (; i < len; i += 4) {
209 t = _mm_loadu_pd (a + i + 0);
210 sum[0] = _mm_add_pd (sum[0], _mm_mul_pd (t, _mm_load_pd (c[0] + i + 0)));
211 sum[1] = _mm_add_pd (sum[1], _mm_mul_pd (t, _mm_load_pd (c[1] + i + 0)));
212 t = _mm_loadu_pd (a + i + 2);
213 sum[0] = _mm_add_pd (sum[0], _mm_mul_pd (t, _mm_load_pd (c[0] + i + 2)));
214 sum[1] = _mm_add_pd (sum[1], _mm_mul_pd (t, _mm_load_pd (c[1] + i + 2)));
215 }
216 sum[0] = _mm_mul_pd (_mm_sub_pd (sum[0], sum[1]), _mm_load1_pd (icoeff));
217 sum[0] = _mm_add_pd (sum[0], sum[1]);
218 sum[0] = _mm_add_sd (sum[0], _mm_unpackhi_pd (sum[0], sum[0]));
219 _mm_store_sd (o, sum[0]);
220 }
221
222 static inline void
inner_product_gdouble_cubic_1_sse2(gdouble * o,const gdouble * a,const gdouble * b,gint len,const gdouble * icoeff,gint bstride)223 inner_product_gdouble_cubic_1_sse2 (gdouble * o, const gdouble * a,
224 const gdouble * b, gint len, const gdouble * icoeff, gint bstride)
225 {
226 gint i;
227 __m128d f[2], sum[4], t;
228 const gdouble *c[4] = { (gdouble *) ((gint8 *) b + 0 * bstride),
229 (gdouble *) ((gint8 *) b + 1 * bstride),
230 (gdouble *) ((gint8 *) b + 2 * bstride),
231 (gdouble *) ((gint8 *) b + 3 * bstride)
232 };
233
234 f[0] = _mm_loadu_pd (icoeff + 0);
235 f[1] = _mm_loadu_pd (icoeff + 2);
236 sum[0] = sum[1] = sum[2] = sum[3] = _mm_setzero_pd ();
237
238 for (i = 0; i < len; i += 2) {
239 t = _mm_loadu_pd (a + i + 0);
240 sum[0] = _mm_add_pd (sum[0], _mm_mul_pd (t, _mm_load_pd (c[0] + i)));
241 sum[1] = _mm_add_pd (sum[1], _mm_mul_pd (t, _mm_load_pd (c[1] + i)));
242 sum[2] = _mm_add_pd (sum[2], _mm_mul_pd (t, _mm_load_pd (c[2] + i)));
243 sum[3] = _mm_add_pd (sum[3], _mm_mul_pd (t, _mm_load_pd (c[3] + i)));
244 }
245 sum[0] =
246 _mm_mul_pd (sum[0], _mm_shuffle_pd (f[0], f[0], _MM_SHUFFLE2 (0, 0)));
247 sum[1] =
248 _mm_mul_pd (sum[1], _mm_shuffle_pd (f[0], f[0], _MM_SHUFFLE2 (1, 1)));
249 sum[2] =
250 _mm_mul_pd (sum[2], _mm_shuffle_pd (f[1], f[1], _MM_SHUFFLE2 (0, 0)));
251 sum[3] =
252 _mm_mul_pd (sum[3], _mm_shuffle_pd (f[1], f[1], _MM_SHUFFLE2 (1, 1)));
253 sum[0] = _mm_add_pd (sum[0], sum[1]);
254 sum[2] = _mm_add_pd (sum[2], sum[3]);
255 sum[0] = _mm_add_pd (sum[0], sum[2]);
256 sum[0] = _mm_add_sd (sum[0], _mm_unpackhi_pd (sum[0], sum[0]));
257 _mm_store_sd (o, sum[0]);
258 }
259
/* Expand the exported resample entry points for each sample type and
 * interpolation method from the inline inner-product kernels above.
 * The MAKE_RESAMPLE_FUNC macro is presumably defined in
 * audio-resampler-x86-sse2.h — verify against that header. */
MAKE_RESAMPLE_FUNC (gint16, full, 1, sse2);
MAKE_RESAMPLE_FUNC (gint16, linear, 1, sse2);
MAKE_RESAMPLE_FUNC (gint16, cubic, 1, sse2);

MAKE_RESAMPLE_FUNC (gdouble, full, 1, sse2);
MAKE_RESAMPLE_FUNC (gdouble, linear, 1, sse2);
MAKE_RESAMPLE_FUNC (gdouble, cubic, 1, sse2);
267
268 void
interpolate_gint16_linear_sse2(gpointer op,const gpointer ap,gint len,const gpointer icp,gint astride)269 interpolate_gint16_linear_sse2 (gpointer op, const gpointer ap,
270 gint len, const gpointer icp, gint astride)
271 {
272 gint i = 0;
273 gint16 *o = op, *a = ap, *ic = icp;
274 __m128i ta, tb, t1, t2;
275 __m128i f = _mm_set_epi64x (0, *((gint64 *) ic));
276 const gint16 *c[2] = { (gint16 *) ((gint8 *) a + 0 * astride),
277 (gint16 *) ((gint8 *) a + 1 * astride)
278 };
279
280 f = _mm_unpacklo_epi32 (f, f);
281 f = _mm_unpacklo_epi64 (f, f);
282
283 for (; i < len; i += 8) {
284 ta = _mm_load_si128 ((__m128i *) (c[0] + i));
285 tb = _mm_load_si128 ((__m128i *) (c[1] + i));
286
287 t1 = _mm_madd_epi16 (_mm_unpacklo_epi16 (ta, tb), f);
288 t2 = _mm_madd_epi16 (_mm_unpackhi_epi16 (ta, tb), f);
289
290 t1 = _mm_add_epi32 (t1, _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
291 t2 = _mm_add_epi32 (t2, _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
292
293 t1 = _mm_srai_epi32 (t1, PRECISION_S16);
294 t2 = _mm_srai_epi32 (t2, PRECISION_S16);
295
296 t1 = _mm_packs_epi32 (t1, t2);
297 _mm_store_si128 ((__m128i *) (o + i), t1);
298 }
299 }
300
301 void
interpolate_gint16_cubic_sse2(gpointer op,const gpointer ap,gint len,const gpointer icp,gint astride)302 interpolate_gint16_cubic_sse2 (gpointer op, const gpointer ap,
303 gint len, const gpointer icp, gint astride)
304 {
305 gint i = 0;
306 gint16 *o = op, *a = ap, *ic = icp;
307 __m128i ta, tb, tl1, tl2, th1, th2;
308 __m128i f[2];
309 const gint16 *c[4] = { (gint16 *) ((gint8 *) a + 0 * astride),
310 (gint16 *) ((gint8 *) a + 1 * astride),
311 (gint16 *) ((gint8 *) a + 2 * astride),
312 (gint16 *) ((gint8 *) a + 3 * astride)
313 };
314
315 f[0] = _mm_set_epi16 (ic[1], ic[0], ic[1], ic[0], ic[1], ic[0], ic[1], ic[0]);
316 f[1] = _mm_set_epi16 (ic[3], ic[2], ic[3], ic[2], ic[3], ic[2], ic[3], ic[2]);
317
318 for (; i < len; i += 8) {
319 ta = _mm_load_si128 ((__m128i *) (c[0] + i));
320 tb = _mm_load_si128 ((__m128i *) (c[1] + i));
321
322 tl1 = _mm_madd_epi16 (_mm_unpacklo_epi16 (ta, tb), f[0]);
323 th1 = _mm_madd_epi16 (_mm_unpackhi_epi16 (ta, tb), f[0]);
324
325 ta = _mm_load_si128 ((__m128i *) (c[2] + i));
326 tb = _mm_load_si128 ((__m128i *) (c[3] + i));
327
328 tl2 = _mm_madd_epi16 (_mm_unpacklo_epi16 (ta, tb), f[1]);
329 th2 = _mm_madd_epi16 (_mm_unpackhi_epi16 (ta, tb), f[1]);
330
331 tl1 = _mm_add_epi32 (tl1, tl2);
332 th1 = _mm_add_epi32 (th1, th2);
333
334 tl1 = _mm_add_epi32 (tl1, _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
335 th1 = _mm_add_epi32 (th1, _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
336
337 tl1 = _mm_srai_epi32 (tl1, PRECISION_S16);
338 th1 = _mm_srai_epi32 (th1, PRECISION_S16);
339
340 tl1 = _mm_packs_epi32 (tl1, th1);
341 _mm_store_si128 ((__m128i *) (o + i), tl1);
342 }
343 }
344
345 void
interpolate_gdouble_linear_sse2(gpointer op,const gpointer ap,gint len,const gpointer icp,gint astride)346 interpolate_gdouble_linear_sse2 (gpointer op, const gpointer ap,
347 gint len, const gpointer icp, gint astride)
348 {
349 gint i;
350 gdouble *o = op, *a = ap, *ic = icp;
351 __m128d f[2], t1, t2;
352 const gdouble *c[2] = { (gdouble *) ((gint8 *) a + 0 * astride),
353 (gdouble *) ((gint8 *) a + 1 * astride)
354 };
355
356 f[0] = _mm_load1_pd (ic + 0);
357 f[1] = _mm_load1_pd (ic + 1);
358
359 for (i = 0; i < len; i += 4) {
360 t1 = _mm_mul_pd (_mm_load_pd (c[0] + i + 0), f[0]);
361 t2 = _mm_mul_pd (_mm_load_pd (c[1] + i + 0), f[1]);
362 _mm_store_pd (o + i + 0, _mm_add_pd (t1, t2));
363
364 t1 = _mm_mul_pd (_mm_load_pd (c[0] + i + 2), f[0]);
365 t2 = _mm_mul_pd (_mm_load_pd (c[1] + i + 2), f[1]);
366 _mm_store_pd (o + i + 2, _mm_add_pd (t1, t2));
367 }
368 }
369
370 void
interpolate_gdouble_cubic_sse2(gpointer op,const gpointer ap,gint len,const gpointer icp,gint astride)371 interpolate_gdouble_cubic_sse2 (gpointer op, const gpointer ap,
372 gint len, const gpointer icp, gint astride)
373 {
374 gint i;
375 gdouble *o = op, *a = ap, *ic = icp;
376 __m128d f[4], t[4];
377 const gdouble *c[4] = { (gdouble *) ((gint8 *) a + 0 * astride),
378 (gdouble *) ((gint8 *) a + 1 * astride),
379 (gdouble *) ((gint8 *) a + 2 * astride),
380 (gdouble *) ((gint8 *) a + 3 * astride)
381 };
382
383 f[0] = _mm_load1_pd (ic + 0);
384 f[1] = _mm_load1_pd (ic + 1);
385 f[2] = _mm_load1_pd (ic + 2);
386 f[3] = _mm_load1_pd (ic + 3);
387
388 for (i = 0; i < len; i += 2) {
389 t[0] = _mm_mul_pd (_mm_load_pd (c[0] + i + 0), f[0]);
390 t[1] = _mm_mul_pd (_mm_load_pd (c[1] + i + 0), f[1]);
391 t[2] = _mm_mul_pd (_mm_load_pd (c[2] + i + 0), f[2]);
392 t[3] = _mm_mul_pd (_mm_load_pd (c[3] + i + 0), f[3]);
393 t[0] = _mm_add_pd (t[0], t[1]);
394 t[2] = _mm_add_pd (t[2], t[3]);
395 _mm_store_pd (o + i + 0, _mm_add_pd (t[0], t[2]));
396 }
397 }
398
399 #endif
400