• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* libFLAC - Free Lossless Audio Codec library
2  * Copyright (C) 2000-2009  Josh Coalson
3  * Copyright (C) 2011-2016  Xiph.Org Foundation
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  *
9  * - Redistributions of source code must retain the above copyright
10  * notice, this list of conditions and the following disclaimer.
11  *
12  * - Redistributions in binary form must reproduce the above copyright
13  * notice, this list of conditions and the following disclaimer in the
14  * documentation and/or other materials provided with the distribution.
15  *
16  * - Neither the name of the Xiph.org Foundation nor the names of its
17  * contributors may be used to endorse or promote products derived from
18  * this software without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23  * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
24  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
25  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
26  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
27  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
28  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
29  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
30  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31  */
32 
33 #ifdef HAVE_CONFIG_H
34 #  include <config.h>
35 #endif
36 
37 #include "private/cpu.h"
38 
39 #ifndef FLAC__INTEGER_ONLY_LIBRARY
40 #ifndef FLAC__NO_ASM
41 #if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN
42 #include "private/lpc.h"
43 #ifdef FLAC__SSE_SUPPORTED
44 #include "FLAC/assert.h"
45 #include "FLAC/format.h"
46 
47 #include <xmmintrin.h> /* SSE */
48 
/*   new routines: more unaligned loads, fewer shuffles
 *   old routines: fewer unaligned loads, more shuffles
 *   the *_old routines are equivalent to the ASM routines in ia32/lpc_asm.nasm
 */
53 
54 /* new routines: faster on current Intel (starting from Core i aka Nehalem) and all AMD CPUs */
55 
56 FLAC__SSE_TARGET("sse")
FLAC__lpc_compute_autocorrelation_intrin_sse_lag_4_new(const FLAC__real data[],unsigned data_len,unsigned lag,FLAC__real autoc[])57 void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_4_new(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
58 {
59 	int i;
60 	int limit = data_len - 4;
61 	__m128 sum0;
62 
63 	(void) lag;
64 	FLAC__ASSERT(lag <= 4);
65 	FLAC__ASSERT(lag <= data_len);
66 
67 	sum0 = _mm_setzero_ps();
68 
69 	for(i = 0; i <= limit; i++) {
70 		__m128 d, d0;
71 		d0 = _mm_loadu_ps(data+i);
72 		d = d0; d = _mm_shuffle_ps(d, d, 0);
73 		sum0 = _mm_add_ps(sum0, _mm_mul_ps(d0, d));
74 	}
75 
76 	{
77 		__m128 d0 = _mm_setzero_ps();
78 		limit++; if(limit < 0) limit = 0;
79 
80 		for(i = data_len-1; i >= limit; i--) {
81 			__m128 d;
82 			d = _mm_load_ss(data+i); d = _mm_shuffle_ps(d, d, 0);
83 			d0 = _mm_shuffle_ps(d0, d0, _MM_SHUFFLE(2,1,0,3));
84 			d0 = _mm_move_ss(d0, d);
85 			sum0 = _mm_add_ps(sum0, _mm_mul_ps(d, d0));
86 		}
87 	}
88 
89 	_mm_storeu_ps(autoc,   sum0);
90 }
91 
92 FLAC__SSE_TARGET("sse")
FLAC__lpc_compute_autocorrelation_intrin_sse_lag_8_new(const FLAC__real data[],unsigned data_len,unsigned lag,FLAC__real autoc[])93 void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_8_new(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
94 {
95 	int i;
96 	int limit = data_len - 8;
97 	__m128 sum0, sum1;
98 
99 	(void) lag;
100 	FLAC__ASSERT(lag <= 8);
101 	FLAC__ASSERT(lag <= data_len);
102 
103 	sum0 = _mm_setzero_ps();
104 	sum1 = _mm_setzero_ps();
105 
106 	for(i = 0; i <= limit; i++) {
107 		__m128 d, d0, d1;
108 		d0 = _mm_loadu_ps(data+i);
109 		d1 = _mm_loadu_ps(data+i+4);
110 		d = d0; d = _mm_shuffle_ps(d, d, 0);
111 		sum0 = _mm_add_ps(sum0, _mm_mul_ps(d0, d));
112 		sum1 = _mm_add_ps(sum1, _mm_mul_ps(d1, d));
113 	}
114 
115 	{
116 		__m128 d0 = _mm_setzero_ps();
117 		__m128 d1 = _mm_setzero_ps();
118 		limit++; if(limit < 0) limit = 0;
119 
120 		for(i = data_len-1; i >= limit; i--) {
121 			__m128 d;
122 			d = _mm_load_ss(data+i); d = _mm_shuffle_ps(d, d, 0);
123 			d1 = _mm_shuffle_ps(d1, d1, _MM_SHUFFLE(2,1,0,3));
124 			d0 = _mm_shuffle_ps(d0, d0, _MM_SHUFFLE(2,1,0,3));
125 			d1 = _mm_move_ss(d1, d0);
126 			d0 = _mm_move_ss(d0, d);
127 			sum1 = _mm_add_ps(sum1, _mm_mul_ps(d, d1));
128 			sum0 = _mm_add_ps(sum0, _mm_mul_ps(d, d0));
129 		}
130 	}
131 
132 	_mm_storeu_ps(autoc,   sum0);
133 	_mm_storeu_ps(autoc+4, sum1);
134 }
135 
136 FLAC__SSE_TARGET("sse")
FLAC__lpc_compute_autocorrelation_intrin_sse_lag_12_new(const FLAC__real data[],unsigned data_len,unsigned lag,FLAC__real autoc[])137 void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_12_new(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
138 {
139 	int i;
140 	int limit = data_len - 12;
141 	__m128 sum0, sum1, sum2;
142 
143 	(void) lag;
144 	FLAC__ASSERT(lag <= 12);
145 	FLAC__ASSERT(lag <= data_len);
146 
147 	sum0 = _mm_setzero_ps();
148 	sum1 = _mm_setzero_ps();
149 	sum2 = _mm_setzero_ps();
150 
151 	for(i = 0; i <= limit; i++) {
152 		__m128 d, d0, d1, d2;
153 		d0 = _mm_loadu_ps(data+i);
154 		d1 = _mm_loadu_ps(data+i+4);
155 		d2 = _mm_loadu_ps(data+i+8);
156 		d = d0; d = _mm_shuffle_ps(d, d, 0);
157 		sum0 = _mm_add_ps(sum0, _mm_mul_ps(d0, d));
158 		sum1 = _mm_add_ps(sum1, _mm_mul_ps(d1, d));
159 		sum2 = _mm_add_ps(sum2, _mm_mul_ps(d2, d));
160 	}
161 
162 	{
163 		__m128 d0 = _mm_setzero_ps();
164 		__m128 d1 = _mm_setzero_ps();
165 		__m128 d2 = _mm_setzero_ps();
166 		limit++; if(limit < 0) limit = 0;
167 
168 		for(i = data_len-1; i >= limit; i--) {
169 			__m128 d;
170 			d = _mm_load_ss(data+i); d = _mm_shuffle_ps(d, d, 0);
171 			d2 = _mm_shuffle_ps(d2, d2, _MM_SHUFFLE(2,1,0,3));
172 			d1 = _mm_shuffle_ps(d1, d1, _MM_SHUFFLE(2,1,0,3));
173 			d0 = _mm_shuffle_ps(d0, d0, _MM_SHUFFLE(2,1,0,3));
174 			d2 = _mm_move_ss(d2, d1);
175 			d1 = _mm_move_ss(d1, d0);
176 			d0 = _mm_move_ss(d0, d);
177 			sum2 = _mm_add_ps(sum2, _mm_mul_ps(d, d2));
178 			sum1 = _mm_add_ps(sum1, _mm_mul_ps(d, d1));
179 			sum0 = _mm_add_ps(sum0, _mm_mul_ps(d, d0));
180 		}
181 	}
182 
183 	_mm_storeu_ps(autoc,   sum0);
184 	_mm_storeu_ps(autoc+4, sum1);
185 	_mm_storeu_ps(autoc+8, sum2);
186 }
187 
188 FLAC__SSE_TARGET("sse")
FLAC__lpc_compute_autocorrelation_intrin_sse_lag_16_new(const FLAC__real data[],unsigned data_len,unsigned lag,FLAC__real autoc[])189 void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_16_new(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
190 {
191 	int i;
192 	int limit = data_len - 16;
193 	__m128 sum0, sum1, sum2, sum3;
194 
195 	(void) lag;
196 	FLAC__ASSERT(lag <= 16);
197 	FLAC__ASSERT(lag <= data_len);
198 
199 	sum0 = _mm_setzero_ps();
200 	sum1 = _mm_setzero_ps();
201 	sum2 = _mm_setzero_ps();
202 	sum3 = _mm_setzero_ps();
203 
204 	for(i = 0; i <= limit; i++) {
205 		__m128 d, d0, d1, d2, d3;
206 		d0 = _mm_loadu_ps(data+i);
207 		d1 = _mm_loadu_ps(data+i+4);
208 		d2 = _mm_loadu_ps(data+i+8);
209 		d3 = _mm_loadu_ps(data+i+12);
210 		d = d0; d = _mm_shuffle_ps(d, d, 0);
211 		sum0 = _mm_add_ps(sum0, _mm_mul_ps(d0, d));
212 		sum1 = _mm_add_ps(sum1, _mm_mul_ps(d1, d));
213 		sum2 = _mm_add_ps(sum2, _mm_mul_ps(d2, d));
214 		sum3 = _mm_add_ps(sum3, _mm_mul_ps(d3, d));
215 	}
216 
217 	{
218 		__m128 d0 = _mm_setzero_ps();
219 		__m128 d1 = _mm_setzero_ps();
220 		__m128 d2 = _mm_setzero_ps();
221 		__m128 d3 = _mm_setzero_ps();
222 		limit++; if(limit < 0) limit = 0;
223 
224 		for(i = data_len-1; i >= limit; i--) {
225 			__m128 d;
226 			d = _mm_load_ss(data+i); d = _mm_shuffle_ps(d, d, 0);
227 			d3 = _mm_shuffle_ps(d3, d3, _MM_SHUFFLE(2,1,0,3));
228 			d2 = _mm_shuffle_ps(d2, d2, _MM_SHUFFLE(2,1,0,3));
229 			d1 = _mm_shuffle_ps(d1, d1, _MM_SHUFFLE(2,1,0,3));
230 			d0 = _mm_shuffle_ps(d0, d0, _MM_SHUFFLE(2,1,0,3));
231 			d3 = _mm_move_ss(d3, d2);
232 			d2 = _mm_move_ss(d2, d1);
233 			d1 = _mm_move_ss(d1, d0);
234 			d0 = _mm_move_ss(d0, d);
235 			sum3 = _mm_add_ps(sum3, _mm_mul_ps(d, d3));
236 			sum2 = _mm_add_ps(sum2, _mm_mul_ps(d, d2));
237 			sum1 = _mm_add_ps(sum1, _mm_mul_ps(d, d1));
238 			sum0 = _mm_add_ps(sum0, _mm_mul_ps(d, d0));
239 		}
240 	}
241 
242 	_mm_storeu_ps(autoc,   sum0);
243 	_mm_storeu_ps(autoc+4, sum1);
244 	_mm_storeu_ps(autoc+8, sum2);
245 	_mm_storeu_ps(autoc+12,sum3);
246 }
247 
248 /* old routines: faster on older Intel CPUs (up to Core 2) */
249 
250 FLAC__SSE_TARGET("sse")
FLAC__lpc_compute_autocorrelation_intrin_sse_lag_4_old(const FLAC__real data[],unsigned data_len,unsigned lag,FLAC__real autoc[])251 void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_4_old(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
252 {
253 	__m128 xmm0, xmm2, xmm5;
254 
255 	(void) lag;
256 	FLAC__ASSERT(lag > 0);
257 	FLAC__ASSERT(lag <= 4);
258 	FLAC__ASSERT(lag <= data_len);
259 	FLAC__ASSERT(data_len > 0);
260 
261 	xmm5 = _mm_setzero_ps();
262 
263 	xmm0 = _mm_load_ss(data++);
264 	xmm2 = xmm0;
265 	xmm0 = _mm_shuffle_ps(xmm0, xmm0, 0);
266 
267 	xmm0 = _mm_mul_ps(xmm0, xmm2);
268 	xmm5 = _mm_add_ps(xmm5, xmm0);
269 
270 	data_len--;
271 
272 	while(data_len)
273 	{
274 		xmm0 = _mm_load1_ps(data++);
275 
276 		xmm2 = _mm_shuffle_ps(xmm2, xmm2, _MM_SHUFFLE(2,1,0,3));
277 		xmm2 = _mm_move_ss(xmm2, xmm0);
278 		xmm0 = _mm_mul_ps(xmm0, xmm2);
279 		xmm5 = _mm_add_ps(xmm5, xmm0);
280 
281 		data_len--;
282 	}
283 
284 	_mm_storeu_ps(autoc, xmm5);
285 }
286 
287 FLAC__SSE_TARGET("sse")
FLAC__lpc_compute_autocorrelation_intrin_sse_lag_8_old(const FLAC__real data[],unsigned data_len,unsigned lag,FLAC__real autoc[])288 void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_8_old(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
289 {
290 	__m128 xmm0, xmm1, xmm2, xmm3, xmm5, xmm6;
291 
292 	(void) lag;
293 	FLAC__ASSERT(lag > 0);
294 	FLAC__ASSERT(lag <= 8);
295 	FLAC__ASSERT(lag <= data_len);
296 	FLAC__ASSERT(data_len > 0);
297 
298 	xmm5 = _mm_setzero_ps();
299 	xmm6 = _mm_setzero_ps();
300 
301 	xmm0 = _mm_load_ss(data++);
302 	xmm2 = xmm0;
303 	xmm0 = _mm_shuffle_ps(xmm0, xmm0, 0);
304 	xmm3 = _mm_setzero_ps();
305 
306 	xmm0 = _mm_mul_ps(xmm0, xmm2);
307 	xmm5 = _mm_add_ps(xmm5, xmm0);
308 
309 	data_len--;
310 
311 	while(data_len)
312 	{
313 		xmm0 = _mm_load1_ps(data++);
314 
315 		xmm2 = _mm_shuffle_ps(xmm2, xmm2, _MM_SHUFFLE(2,1,0,3));
316 		xmm3 = _mm_shuffle_ps(xmm3, xmm3, _MM_SHUFFLE(2,1,0,3));
317 		xmm3 = _mm_move_ss(xmm3, xmm2);
318 		xmm2 = _mm_move_ss(xmm2, xmm0);
319 
320 		xmm1 = xmm0;
321 		xmm1 = _mm_mul_ps(xmm1, xmm3);
322 		xmm0 = _mm_mul_ps(xmm0, xmm2);
323 		xmm6 = _mm_add_ps(xmm6, xmm1);
324 		xmm5 = _mm_add_ps(xmm5, xmm0);
325 
326 		data_len--;
327 	}
328 
329 	_mm_storeu_ps(autoc,   xmm5);
330 	_mm_storeu_ps(autoc+4, xmm6);
331 }
332 
333 FLAC__SSE_TARGET("sse")
FLAC__lpc_compute_autocorrelation_intrin_sse_lag_12_old(const FLAC__real data[],unsigned data_len,unsigned lag,FLAC__real autoc[])334 void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_12_old(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
335 {
336 	__m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
337 
338 	(void) lag;
339 	FLAC__ASSERT(lag > 0);
340 	FLAC__ASSERT(lag <= 12);
341 	FLAC__ASSERT(lag <= data_len);
342 	FLAC__ASSERT(data_len > 0);
343 
344 	xmm5 = _mm_setzero_ps();
345 	xmm6 = _mm_setzero_ps();
346 	xmm7 = _mm_setzero_ps();
347 
348 	xmm0 = _mm_load_ss(data++);
349 	xmm2 = xmm0;
350 	xmm0 = _mm_shuffle_ps(xmm0, xmm0, 0);
351 	xmm3 = _mm_setzero_ps();
352 	xmm4 = _mm_setzero_ps();
353 
354 	xmm0 = _mm_mul_ps(xmm0, xmm2);
355 	xmm5 = _mm_add_ps(xmm5, xmm0);
356 
357 	data_len--;
358 
359 	while(data_len)
360 	{
361 		xmm0 = _mm_load1_ps(data++);
362 
363 		xmm2 = _mm_shuffle_ps(xmm2, xmm2, _MM_SHUFFLE(2,1,0,3));
364 		xmm3 = _mm_shuffle_ps(xmm3, xmm3, _MM_SHUFFLE(2,1,0,3));
365 		xmm4 = _mm_shuffle_ps(xmm4, xmm4, _MM_SHUFFLE(2,1,0,3));
366 		xmm4 = _mm_move_ss(xmm4, xmm3);
367 		xmm3 = _mm_move_ss(xmm3, xmm2);
368 		xmm2 = _mm_move_ss(xmm2, xmm0);
369 
370 		xmm1 = xmm0;
371 		xmm1 = _mm_mul_ps(xmm1, xmm2);
372 		xmm5 = _mm_add_ps(xmm5, xmm1);
373 		xmm1 = xmm0;
374 		xmm1 = _mm_mul_ps(xmm1, xmm3);
375 		xmm6 = _mm_add_ps(xmm6, xmm1);
376 		xmm0 = _mm_mul_ps(xmm0, xmm4);
377 		xmm7 = _mm_add_ps(xmm7, xmm0);
378 
379 		data_len--;
380 	}
381 
382 	_mm_storeu_ps(autoc,   xmm5);
383 	_mm_storeu_ps(autoc+4, xmm6);
384 	_mm_storeu_ps(autoc+8, xmm7);
385 }
386 
387 FLAC__SSE_TARGET("sse")
FLAC__lpc_compute_autocorrelation_intrin_sse_lag_16_old(const FLAC__real data[],unsigned data_len,unsigned lag,FLAC__real autoc[])388 void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_16_old(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
389 {
390 	__m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9;
391 
392 	(void) lag;
393 	FLAC__ASSERT(lag > 0);
394 	FLAC__ASSERT(lag <= 16);
395 	FLAC__ASSERT(lag <= data_len);
396 	FLAC__ASSERT(data_len > 0);
397 
398 	xmm6 = _mm_setzero_ps();
399 	xmm7 = _mm_setzero_ps();
400 	xmm8 = _mm_setzero_ps();
401 	xmm9 = _mm_setzero_ps();
402 
403 	xmm0 = _mm_load_ss(data++);
404 	xmm2 = xmm0;
405 	xmm0 = _mm_shuffle_ps(xmm0, xmm0, 0);
406 	xmm3 = _mm_setzero_ps();
407 	xmm4 = _mm_setzero_ps();
408 	xmm5 = _mm_setzero_ps();
409 
410 	xmm0 = _mm_mul_ps(xmm0, xmm2);
411 	xmm6 = _mm_add_ps(xmm6, xmm0);
412 
413 	data_len--;
414 
415 	while(data_len)
416 	{
417 		xmm0 = _mm_load1_ps(data++);
418 
419 		/* shift xmm5:xmm4:xmm3:xmm2 left by one float */
420 		xmm5 = _mm_shuffle_ps(xmm5, xmm5, _MM_SHUFFLE(2,1,0,3));
421 		xmm4 = _mm_shuffle_ps(xmm4, xmm4, _MM_SHUFFLE(2,1,0,3));
422 		xmm3 = _mm_shuffle_ps(xmm3, xmm3, _MM_SHUFFLE(2,1,0,3));
423 		xmm2 = _mm_shuffle_ps(xmm2, xmm2, _MM_SHUFFLE(2,1,0,3));
424 		xmm5 = _mm_move_ss(xmm5, xmm4);
425 		xmm4 = _mm_move_ss(xmm4, xmm3);
426 		xmm3 = _mm_move_ss(xmm3, xmm2);
427 		xmm2 = _mm_move_ss(xmm2, xmm0);
428 
429 		/* xmm9|xmm8|xmm7|xmm6 += xmm0|xmm0|xmm0|xmm0 * xmm5|xmm4|xmm3|xmm2 */
430 		xmm1 = xmm0;
431 		xmm1 = _mm_mul_ps(xmm1, xmm5);
432 		xmm9 = _mm_add_ps(xmm9, xmm1);
433 		xmm1 = xmm0;
434 		xmm1 = _mm_mul_ps(xmm1, xmm4);
435 		xmm8 = _mm_add_ps(xmm8, xmm1);
436 		xmm1 = xmm0;
437 		xmm1 = _mm_mul_ps(xmm1, xmm3);
438 		xmm7 = _mm_add_ps(xmm7, xmm1);
439 		xmm0 = _mm_mul_ps(xmm0, xmm2);
440 		xmm6 = _mm_add_ps(xmm6, xmm0);
441 
442 		data_len--;
443 	}
444 
445 	_mm_storeu_ps(autoc,   xmm6);
446 	_mm_storeu_ps(autoc+4, xmm7);
447 	_mm_storeu_ps(autoc+8, xmm8);
448 	_mm_storeu_ps(autoc+12,xmm9);
449 }
450 
451 #endif /* FLAC__SSE_SUPPORTED */
452 #endif /* (FLAC__CPU_IA32 || FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN */
453 #endif /* FLAC__NO_ASM */
454 #endif /* FLAC__INTEGER_ONLY_LIBRARY */
455