/* libFLAC - Free Lossless Audio Codec library
 * Copyright (C) 2000-2009 Josh Coalson
 * Copyright (C) 2011-2016 Xiph.Org Foundation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * - Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 *
 * - Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution.
 *
 * - Neither the name of the Xiph.org Foundation nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifdef HAVE_CONFIG_H
# include <config.h>
#endif

#include "private/cpu.h"

#ifndef FLAC__INTEGER_ONLY_LIBRARY
#ifndef FLAC__NO_ASM
#if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN
#include "private/lpc.h"
#ifdef FLAC__SSE_SUPPORTED
#include "FLAC/assert.h"
#include "FLAC/format.h"

#include <xmmintrin.h> /* SSE */

/* new routines: more unaligned loads, fewer shuffles
 * old routines: fewer unaligned loads, more shuffles
 * these *_old routines are equivalent to the ASM routines in ia32/lpc_asm.nasm
 */

/* new routines: faster on current Intel (starting from Core i aka Nehalem) and all AMD CPUs */
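
/* All of these routines compute the same result as the plain scalar
 * autocorrelation; roughly (a sketch, not the exact code of the generic
 * FLAC__lpc_compute_autocorrelation() in lpc.c):
 *
 *     for(l = 0; l < lag; l++) {
 *         FLAC__real sum = 0.0f;
 *         for(i = 0; i + l < data_len; i++)
 *             sum += data[i] * data[i+l];
 *         autoc[l] = sum;
 *     }
 *
 * The SSE versions keep four lags per __m128 accumulator and always store a
 * full multiple of 4 results, so autoc[] must have room for 4, 8, 12, or 16
 * floats even when lag is smaller.
 */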

FLAC__SSE_TARGET("sse")
void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_4_new(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
{
	int i;
	int limit = data_len - 4;
	__m128 sum0;

	(void) lag;
	FLAC__ASSERT(lag <= 4);
	FLAC__ASSERT(lag <= data_len);

	sum0 = _mm_setzero_ps();

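	/* Main loop: broadcast data[i] and multiply it by the four samples
	 * data[i..i+3], so lane j of sum0 accumulates the lag-j product
	 * data[i]*data[i+j]. Runs while a full 4-float unaligned load fits. */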
	for(i = 0; i <= limit; i++) {
		__m128 d, d0;
		d0 = _mm_loadu_ps(data+i);
		d = d0; d = _mm_shuffle_ps(d, d, 0);
		sum0 = _mm_add_ps(sum0, _mm_mul_ps(d0, d));
	}

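	/* Tail: the last few samples, where loading data[i..i+3] would read past
	 * the end of the buffer. Walk backwards with d0 as a 4-float shift
	 * register of the already-processed higher-index samples. */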
	{
		__m128 d0 = _mm_setzero_ps();
		limit++; if(limit < 0) limit = 0;

		for(i = data_len-1; i >= limit; i--) {
			__m128 d;
			d = _mm_load_ss(data+i); d = _mm_shuffle_ps(d, d, 0);
			d0 = _mm_shuffle_ps(d0, d0, _MM_SHUFFLE(2,1,0,3));
			d0 = _mm_move_ss(d0, d);
			sum0 = _mm_add_ps(sum0, _mm_mul_ps(d, d0));
		}
	}

	_mm_storeu_ps(autoc, sum0);
}

FLAC__SSE_TARGET("sse")
void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_8_new(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
{
	int i;
	int limit = data_len - 8;
	__m128 sum0, sum1;

	(void) lag;
	FLAC__ASSERT(lag <= 8);
	FLAC__ASSERT(lag <= data_len);

	sum0 = _mm_setzero_ps();
	sum1 = _mm_setzero_ps();

	for(i = 0; i <= limit; i++) {
		__m128 d, d0, d1;
		d0 = _mm_loadu_ps(data+i);
		d1 = _mm_loadu_ps(data+i+4);
		d = d0; d = _mm_shuffle_ps(d, d, 0);
		sum0 = _mm_add_ps(sum0, _mm_mul_ps(d0, d));
		sum1 = _mm_add_ps(sum1, _mm_mul_ps(d1, d));
	}

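	/* Same tail scheme as the lag-4 routine, with d1:d0 forming an 8-float
	 * shift register: each step the sample shifted out of d0 moves into d1. */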
	{
		__m128 d0 = _mm_setzero_ps();
		__m128 d1 = _mm_setzero_ps();
		limit++; if(limit < 0) limit = 0;

		for(i = data_len-1; i >= limit; i--) {
			__m128 d;
			d = _mm_load_ss(data+i); d = _mm_shuffle_ps(d, d, 0);
			d1 = _mm_shuffle_ps(d1, d1, _MM_SHUFFLE(2,1,0,3));
			d0 = _mm_shuffle_ps(d0, d0, _MM_SHUFFLE(2,1,0,3));
			d1 = _mm_move_ss(d1, d0);
			d0 = _mm_move_ss(d0, d);
			sum1 = _mm_add_ps(sum1, _mm_mul_ps(d, d1));
			sum0 = _mm_add_ps(sum0, _mm_mul_ps(d, d0));
		}
	}

	_mm_storeu_ps(autoc, sum0);
	_mm_storeu_ps(autoc+4, sum1);
}

FLAC__SSE_TARGET("sse")
void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_12_new(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
{
	int i;
	int limit = data_len - 12;
	__m128 sum0, sum1, sum2;

	(void) lag;
	FLAC__ASSERT(lag <= 12);
	FLAC__ASSERT(lag <= data_len);

	sum0 = _mm_setzero_ps();
	sum1 = _mm_setzero_ps();
	sum2 = _mm_setzero_ps();

	for(i = 0; i <= limit; i++) {
		__m128 d, d0, d1, d2;
		d0 = _mm_loadu_ps(data+i);
		d1 = _mm_loadu_ps(data+i+4);
		d2 = _mm_loadu_ps(data+i+8);
		d = d0; d = _mm_shuffle_ps(d, d, 0);
		sum0 = _mm_add_ps(sum0, _mm_mul_ps(d0, d));
		sum1 = _mm_add_ps(sum1, _mm_mul_ps(d1, d));
		sum2 = _mm_add_ps(sum2, _mm_mul_ps(d2, d));
	}

	{
		__m128 d0 = _mm_setzero_ps();
		__m128 d1 = _mm_setzero_ps();
		__m128 d2 = _mm_setzero_ps();
		limit++; if(limit < 0) limit = 0;

		for(i = data_len-1; i >= limit; i--) {
			__m128 d;
			d = _mm_load_ss(data+i); d = _mm_shuffle_ps(d, d, 0);
			d2 = _mm_shuffle_ps(d2, d2, _MM_SHUFFLE(2,1,0,3));
			d1 = _mm_shuffle_ps(d1, d1, _MM_SHUFFLE(2,1,0,3));
			d0 = _mm_shuffle_ps(d0, d0, _MM_SHUFFLE(2,1,0,3));
			d2 = _mm_move_ss(d2, d1);
			d1 = _mm_move_ss(d1, d0);
			d0 = _mm_move_ss(d0, d);
			sum2 = _mm_add_ps(sum2, _mm_mul_ps(d, d2));
			sum1 = _mm_add_ps(sum1, _mm_mul_ps(d, d1));
			sum0 = _mm_add_ps(sum0, _mm_mul_ps(d, d0));
		}
	}

	_mm_storeu_ps(autoc, sum0);
	_mm_storeu_ps(autoc+4, sum1);
	_mm_storeu_ps(autoc+8, sum2);
}

FLAC__SSE_TARGET("sse")
void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_16_new(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
{
	int i;
	int limit = data_len - 16;
	__m128 sum0, sum1, sum2, sum3;

	(void) lag;
	FLAC__ASSERT(lag <= 16);
	FLAC__ASSERT(lag <= data_len);

	sum0 = _mm_setzero_ps();
	sum1 = _mm_setzero_ps();
	sum2 = _mm_setzero_ps();
	sum3 = _mm_setzero_ps();

	for(i = 0; i <= limit; i++) {
		__m128 d, d0, d1, d2, d3;
		d0 = _mm_loadu_ps(data+i);
		d1 = _mm_loadu_ps(data+i+4);
		d2 = _mm_loadu_ps(data+i+8);
		d3 = _mm_loadu_ps(data+i+12);
		d = d0; d = _mm_shuffle_ps(d, d, 0);
		sum0 = _mm_add_ps(sum0, _mm_mul_ps(d0, d));
		sum1 = _mm_add_ps(sum1, _mm_mul_ps(d1, d));
		sum2 = _mm_add_ps(sum2, _mm_mul_ps(d2, d));
		sum3 = _mm_add_ps(sum3, _mm_mul_ps(d3, d));
	}

	{
		__m128 d0 = _mm_setzero_ps();
		__m128 d1 = _mm_setzero_ps();
		__m128 d2 = _mm_setzero_ps();
		__m128 d3 = _mm_setzero_ps();
		limit++; if(limit < 0) limit = 0;

		for(i = data_len-1; i >= limit; i--) {
			__m128 d;
			d = _mm_load_ss(data+i); d = _mm_shuffle_ps(d, d, 0);
			d3 = _mm_shuffle_ps(d3, d3, _MM_SHUFFLE(2,1,0,3));
			d2 = _mm_shuffle_ps(d2, d2, _MM_SHUFFLE(2,1,0,3));
			d1 = _mm_shuffle_ps(d1, d1, _MM_SHUFFLE(2,1,0,3));
			d0 = _mm_shuffle_ps(d0, d0, _MM_SHUFFLE(2,1,0,3));
			d3 = _mm_move_ss(d3, d2);
			d2 = _mm_move_ss(d2, d1);
			d1 = _mm_move_ss(d1, d0);
			d0 = _mm_move_ss(d0, d);
			sum3 = _mm_add_ps(sum3, _mm_mul_ps(d, d3));
			sum2 = _mm_add_ps(sum2, _mm_mul_ps(d, d2));
			sum1 = _mm_add_ps(sum1, _mm_mul_ps(d, d1));
			sum0 = _mm_add_ps(sum0, _mm_mul_ps(d, d0));
		}
	}

	_mm_storeu_ps(autoc, sum0);
	_mm_storeu_ps(autoc+4, sum1);
	_mm_storeu_ps(autoc+8, sum2);
	_mm_storeu_ps(autoc+12, sum3);
}

/* old routines: faster on older Intel CPUs (up to Core 2) */

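/* The *_old routines load one sample per iteration (no wide unaligned loads)
 * and keep the previous `lag` samples in a register-resident shift queue
 * (xmm2, or xmm2..xmm5 on the lag-16 path). Each iteration is, in scalar
 * terms (a sketch, not actual code):
 *
 *     x = data[i];                          broadcast via _mm_load1_ps
 *     queue becomes { x, data[i-1], ..., data[i-lag+1] }
 *     for(l = 0; l < lag; l++)
 *         autoc[l] += x * data[i-l];
 *
 * which accumulates the same lag-l products as the *_new routines, just
 * indexed from the other end of each pair.
 */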
FLAC__SSE_TARGET("sse")
void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_4_old(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
{
	__m128 xmm0, xmm2, xmm5;

	(void) lag;
	FLAC__ASSERT(lag > 0);
	FLAC__ASSERT(lag <= 4);
	FLAC__ASSERT(lag <= data_len);
	FLAC__ASSERT(data_len > 0);

	xmm5 = _mm_setzero_ps();

	xmm0 = _mm_load_ss(data++);
	xmm2 = xmm0;
	xmm0 = _mm_shuffle_ps(xmm0, xmm0, 0);

	xmm0 = _mm_mul_ps(xmm0, xmm2);
	xmm5 = _mm_add_ps(xmm5, xmm0);

	data_len--;

	while(data_len)
	{
		xmm0 = _mm_load1_ps(data++);

		xmm2 = _mm_shuffle_ps(xmm2, xmm2, _MM_SHUFFLE(2,1,0,3));
		xmm2 = _mm_move_ss(xmm2, xmm0);
		xmm0 = _mm_mul_ps(xmm0, xmm2);
		xmm5 = _mm_add_ps(xmm5, xmm0);

		data_len--;
	}

	_mm_storeu_ps(autoc, xmm5);
}

FLAC__SSE_TARGET("sse")
void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_8_old(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
{
	__m128 xmm0, xmm1, xmm2, xmm3, xmm5, xmm6;

	(void) lag;
	FLAC__ASSERT(lag > 0);
	FLAC__ASSERT(lag <= 8);
	FLAC__ASSERT(lag <= data_len);
	FLAC__ASSERT(data_len > 0);

	xmm5 = _mm_setzero_ps();
	xmm6 = _mm_setzero_ps();

	xmm0 = _mm_load_ss(data++);
	xmm2 = xmm0;
	xmm0 = _mm_shuffle_ps(xmm0, xmm0, 0);
	xmm3 = _mm_setzero_ps();

	xmm0 = _mm_mul_ps(xmm0, xmm2);
	xmm5 = _mm_add_ps(xmm5, xmm0);

	data_len--;

	while(data_len)
	{
		xmm0 = _mm_load1_ps(data++);

		xmm2 = _mm_shuffle_ps(xmm2, xmm2, _MM_SHUFFLE(2,1,0,3));
		xmm3 = _mm_shuffle_ps(xmm3, xmm3, _MM_SHUFFLE(2,1,0,3));
		xmm3 = _mm_move_ss(xmm3, xmm2);
		xmm2 = _mm_move_ss(xmm2, xmm0);

		xmm1 = xmm0;
		xmm1 = _mm_mul_ps(xmm1, xmm3);
		xmm0 = _mm_mul_ps(xmm0, xmm2);
		xmm6 = _mm_add_ps(xmm6, xmm1);
		xmm5 = _mm_add_ps(xmm5, xmm0);

		data_len--;
	}

	_mm_storeu_ps(autoc, xmm5);
	_mm_storeu_ps(autoc+4, xmm6);
}

FLAC__SSE_TARGET("sse")
void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_12_old(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
{
	__m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;

	(void) lag;
	FLAC__ASSERT(lag > 0);
	FLAC__ASSERT(lag <= 12);
	FLAC__ASSERT(lag <= data_len);
	FLAC__ASSERT(data_len > 0);

	xmm5 = _mm_setzero_ps();
	xmm6 = _mm_setzero_ps();
	xmm7 = _mm_setzero_ps();

	xmm0 = _mm_load_ss(data++);
	xmm2 = xmm0;
	xmm0 = _mm_shuffle_ps(xmm0, xmm0, 0);
	xmm3 = _mm_setzero_ps();
	xmm4 = _mm_setzero_ps();

	xmm0 = _mm_mul_ps(xmm0, xmm2);
	xmm5 = _mm_add_ps(xmm5, xmm0);

	data_len--;

	while(data_len)
	{
		xmm0 = _mm_load1_ps(data++);

		xmm2 = _mm_shuffle_ps(xmm2, xmm2, _MM_SHUFFLE(2,1,0,3));
		xmm3 = _mm_shuffle_ps(xmm3, xmm3, _MM_SHUFFLE(2,1,0,3));
		xmm4 = _mm_shuffle_ps(xmm4, xmm4, _MM_SHUFFLE(2,1,0,3));
		xmm4 = _mm_move_ss(xmm4, xmm3);
		xmm3 = _mm_move_ss(xmm3, xmm2);
		xmm2 = _mm_move_ss(xmm2, xmm0);

		xmm1 = xmm0;
		xmm1 = _mm_mul_ps(xmm1, xmm2);
		xmm5 = _mm_add_ps(xmm5, xmm1);
		xmm1 = xmm0;
		xmm1 = _mm_mul_ps(xmm1, xmm3);
		xmm6 = _mm_add_ps(xmm6, xmm1);
		xmm0 = _mm_mul_ps(xmm0, xmm4);
		xmm7 = _mm_add_ps(xmm7, xmm0);

		data_len--;
	}

	_mm_storeu_ps(autoc, xmm5);
	_mm_storeu_ps(autoc+4, xmm6);
	_mm_storeu_ps(autoc+8, xmm7);
}

FLAC__SSE_TARGET("sse")
void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_16_old(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
{
	__m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9;

	(void) lag;
	FLAC__ASSERT(lag > 0);
	FLAC__ASSERT(lag <= 16);
	FLAC__ASSERT(lag <= data_len);
	FLAC__ASSERT(data_len > 0);

	xmm6 = _mm_setzero_ps();
	xmm7 = _mm_setzero_ps();
	xmm8 = _mm_setzero_ps();
	xmm9 = _mm_setzero_ps();

	xmm0 = _mm_load_ss(data++);
	xmm2 = xmm0;
	xmm0 = _mm_shuffle_ps(xmm0, xmm0, 0);
	xmm3 = _mm_setzero_ps();
	xmm4 = _mm_setzero_ps();
	xmm5 = _mm_setzero_ps();

	xmm0 = _mm_mul_ps(xmm0, xmm2);
	xmm6 = _mm_add_ps(xmm6, xmm0);

	data_len--;

	while(data_len)
	{
		xmm0 = _mm_load1_ps(data++);

		/* shift xmm5:xmm4:xmm3:xmm2 left by one float */
		xmm5 = _mm_shuffle_ps(xmm5, xmm5, _MM_SHUFFLE(2,1,0,3));
		xmm4 = _mm_shuffle_ps(xmm4, xmm4, _MM_SHUFFLE(2,1,0,3));
		xmm3 = _mm_shuffle_ps(xmm3, xmm3, _MM_SHUFFLE(2,1,0,3));
		xmm2 = _mm_shuffle_ps(xmm2, xmm2, _MM_SHUFFLE(2,1,0,3));
		xmm5 = _mm_move_ss(xmm5, xmm4);
		xmm4 = _mm_move_ss(xmm4, xmm3);
		xmm3 = _mm_move_ss(xmm3, xmm2);
		xmm2 = _mm_move_ss(xmm2, xmm0);

		/* xmm9|xmm8|xmm7|xmm6 += xmm0|xmm0|xmm0|xmm0 * xmm5|xmm4|xmm3|xmm2 */
		xmm1 = xmm0;
		xmm1 = _mm_mul_ps(xmm1, xmm5);
		xmm9 = _mm_add_ps(xmm9, xmm1);
		xmm1 = xmm0;
		xmm1 = _mm_mul_ps(xmm1, xmm4);
		xmm8 = _mm_add_ps(xmm8, xmm1);
		xmm1 = xmm0;
		xmm1 = _mm_mul_ps(xmm1, xmm3);
		xmm7 = _mm_add_ps(xmm7, xmm1);
		xmm0 = _mm_mul_ps(xmm0, xmm2);
		xmm6 = _mm_add_ps(xmm6, xmm0);

		data_len--;
	}

	_mm_storeu_ps(autoc, xmm6);
	_mm_storeu_ps(autoc+4, xmm7);
	_mm_storeu_ps(autoc+8, xmm8);
	_mm_storeu_ps(autoc+12, xmm9);
}
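
/* Client code does not call these routines directly; the encoder wires one
 * of them into its local autocorrelation function pointer at init time,
 * based on run-time CPU detection and the maximum LPC order (lag = order+1
 * must fit in the routine's lane count). A rough sketch of that selection,
 * with simplified names; see stream_encoder.c for the actual logic:
 *
 *     if(cpu has SSE) {
 *         if(max_lpc_order < 4)
 *             local_autoc = FLAC__lpc_compute_autocorrelation_intrin_sse_lag_4_new;
 *         else if(max_lpc_order < 8)
 *             local_autoc = FLAC__lpc_compute_autocorrelation_intrin_sse_lag_8_new;
 *         else if(max_lpc_order < 12)
 *             local_autoc = FLAC__lpc_compute_autocorrelation_intrin_sse_lag_12_new;
 *         else if(max_lpc_order < 16)
 *             local_autoc = FLAC__lpc_compute_autocorrelation_intrin_sse_lag_16_new;
 *     }
 */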

#endif /* FLAC__SSE_SUPPORTED */
#endif /* (FLAC__CPU_IA32 || FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN */
#endif /* FLAC__NO_ASM */
#endif /* FLAC__INTEGER_ONLY_LIBRARY */