/* libFLAC - Free Lossless Audio Codec library
 * Copyright (C) 2000-2009 Josh Coalson
 * Copyright (C) 2011-2016 Xiph.Org Foundation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * - Redistributions of source code must retain the above copyright
 *   notice, this list of conditions and the following disclaimer.
 *
 * - Redistributions in binary form must reproduce the above copyright
 *   notice, this list of conditions and the following disclaimer in the
 *   documentation and/or other materials provided with the distribution.
 *
 * - Neither the name of the Xiph.org Foundation nor the names of its
 *   contributors may be used to endorse or promote products derived from
 *   this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifdef HAVE_CONFIG_H
# include <config.h>
#endif

#include "private/cpu.h"

#ifndef FLAC__INTEGER_ONLY_LIBRARY
#ifndef FLAC__NO_ASM
#if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN
#include "private/lpc.h"
#ifdef FLAC__SSE2_SUPPORTED

#include "FLAC/assert.h"
#include "FLAC/format.h"

#include <emmintrin.h> /* SSE2 */

#define RESIDUAL32_RESULT(xmmN) residual[i] = data[i] - (_mm_cvtsi128_si32(xmmN) >> lp_quantization);
#define DATA32_RESULT(xmmN) data[i] = residual[i] + (_mm_cvtsi128_si32(xmmN) >> lp_quantization);

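/* The two macros above extract the lane-0 sum from xmmN with _mm_cvtsi128_si32,
 * apply the arithmetic right shift by lp_quantization, and then either form the
 * residual (prediction error, residual = data - prediction) or reconstruct the
 * sample from a residual.
 *
 * Note (descriptive, not from the original comments): data[] points just past
 * `order` warm-up samples, so data[-1] .. data[-order] are read as history.
 * The "_16" variant below packs operands as 16-bit pairs for _mm_madd_epi16
 * (hence the `0xffff &` masking of the coefficients), which is only valid when
 * both samples and quantized coefficients fit in 16 bits; the caller is
 * expected to guarantee that before selecting this routine. */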
FLAC__SSE_TARGET("sse2")
void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC__int32 *data, uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 residual[])
{
	int i;
	FLAC__int32 sum;
	const __m128i cnt = _mm_cvtsi32_si128(lp_quantization);

	FLAC__ASSERT(order > 0);
	FLAC__ASSERT(order <= 32);

	if(order <= 12) {
		if(order > 8) {
			if(order > 10) {
				if(order == 12) {
					__m128i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
					q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
					q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
					q7 = _mm_cvtsi32_si128(0xffff & qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
					q8 = _mm_cvtsi32_si128(0xffff & qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0));
					q9 = _mm_cvtsi32_si128(0xffff & qlp_coeff[9]); q9 = _mm_shuffle_epi32(q9, _MM_SHUFFLE(0,0,0,0));
					q10 = _mm_cvtsi32_si128(0xffff & qlp_coeff[10]); q10 = _mm_shuffle_epi32(q10, _MM_SHUFFLE(0,0,0,0));
					q11 = _mm_cvtsi32_si128(0xffff & qlp_coeff[11]); q11 = _mm_shuffle_epi32(q11, _MM_SHUFFLE(0,0,0,0));

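					/* Descriptive note: each qN register holds coefficient N replicated
					 * across all four 32-bit lanes, so _mm_madd_epi16 multiplies it
					 * against four consecutive history samples at once and the loop
					 * below produces four residuals per iteration.  The per-lane sums
					 * are shifted by lp_quantization (cnt) and subtracted from the
					 * corresponding data[] values.  The same pattern repeats for every
					 * order below, just with fewer coefficient registers. */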
					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q11, _mm_loadu_si128((const __m128i*)(data+i-12)));
						mull = _mm_madd_epi16(q10, _mm_loadu_si128((const __m128i*)(data+i-11))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q9, _mm_loadu_si128((const __m128i*)(data+i-10))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q8, _mm_loadu_si128((const __m128i*)(data+i-9))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
				else { /* order == 11 */
					__m128i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
					q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
					q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
					q7 = _mm_cvtsi32_si128(0xffff & qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
					q8 = _mm_cvtsi32_si128(0xffff & qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0));
					q9 = _mm_cvtsi32_si128(0xffff & qlp_coeff[9]); q9 = _mm_shuffle_epi32(q9, _MM_SHUFFLE(0,0,0,0));
					q10 = _mm_cvtsi32_si128(0xffff & qlp_coeff[10]); q10 = _mm_shuffle_epi32(q10, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q10, _mm_loadu_si128((const __m128i*)(data+i-11)));
						mull = _mm_madd_epi16(q9, _mm_loadu_si128((const __m128i*)(data+i-10))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q8, _mm_loadu_si128((const __m128i*)(data+i-9))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
			}
			else {
				if(order == 10) {
					__m128i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
					q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
					q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
					q7 = _mm_cvtsi32_si128(0xffff & qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
					q8 = _mm_cvtsi32_si128(0xffff & qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0));
					q9 = _mm_cvtsi32_si128(0xffff & qlp_coeff[9]); q9 = _mm_shuffle_epi32(q9, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q9, _mm_loadu_si128((const __m128i*)(data+i-10)));
						mull = _mm_madd_epi16(q8, _mm_loadu_si128((const __m128i*)(data+i-9))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
				else { /* order == 9 */
					__m128i q0, q1, q2, q3, q4, q5, q6, q7, q8;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
					q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
					q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
					q7 = _mm_cvtsi32_si128(0xffff & qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
					q8 = _mm_cvtsi32_si128(0xffff & qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q8, _mm_loadu_si128((const __m128i*)(data+i-9)));
						mull = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
			}
		}
		else if(order > 4) {
			if(order > 6) {
				if(order == 8) {
					__m128i q0, q1, q2, q3, q4, q5, q6, q7;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
					q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
					q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
					q7 = _mm_cvtsi32_si128(0xffff & qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(data+i-8)));
						mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
				else { /* order == 7 */
					__m128i q0, q1, q2, q3, q4, q5, q6;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
					q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
					q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7)));
						mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
			}
			else {
				if(order == 6) {
					__m128i q0, q1, q2, q3, q4, q5;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
					q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6)));
						mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
				else { /* order == 5 */
					__m128i q0, q1, q2, q3, q4;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5)));
						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
			}
		}
		else {
			if(order > 2) {
				if(order == 4) {
					__m128i q0, q1, q2, q3;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4)));
						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
				else { /* order == 3 */
					__m128i q0, q1, q2;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3)));
						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
			}
			else {
				if(order == 2) {
					__m128i q0, q1;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2)));
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
				else { /* order == 1 */
					__m128i q0;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ;
						summ = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1)));
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
			}
		}
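		/* Descriptive note: the vector loops above handle data_len in groups
		 * of four; any remaining 1..3 samples are finished here with the plain
		 * scalar fall-through switch, like the C version of this routine. */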
		for(; i < (int)data_len; i++) {
			sum = 0;
			switch(order) {
				case 12: sum += qlp_coeff[11] * data[i-12]; /* Falls through. */
				case 11: sum += qlp_coeff[10] * data[i-11]; /* Falls through. */
				case 10: sum += qlp_coeff[ 9] * data[i-10]; /* Falls through. */
				case 9:  sum += qlp_coeff[ 8] * data[i- 9]; /* Falls through. */
				case 8:  sum += qlp_coeff[ 7] * data[i- 8]; /* Falls through. */
				case 7:  sum += qlp_coeff[ 6] * data[i- 7]; /* Falls through. */
				case 6:  sum += qlp_coeff[ 5] * data[i- 6]; /* Falls through. */
				case 5:  sum += qlp_coeff[ 4] * data[i- 5]; /* Falls through. */
				case 4:  sum += qlp_coeff[ 3] * data[i- 4]; /* Falls through. */
				case 3:  sum += qlp_coeff[ 2] * data[i- 3]; /* Falls through. */
				case 2:  sum += qlp_coeff[ 1] * data[i- 2]; /* Falls through. */
				case 1:  sum += qlp_coeff[ 0] * data[i- 1];
			}
			residual[i] = data[i] - (sum >> lp_quantization);
		}
	}
	else { /* order > 12 */
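		/* Orders 13..32 are not vectorized here; the fall-through switch below
		 * accumulates the terms above 12, then the final twelve terms are added
		 * unconditionally. */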
		for(i = 0; i < (int)data_len; i++) {
			sum = 0;
			switch(order) {
				case 32: sum += qlp_coeff[31] * data[i-32]; /* Falls through. */
				case 31: sum += qlp_coeff[30] * data[i-31]; /* Falls through. */
				case 30: sum += qlp_coeff[29] * data[i-30]; /* Falls through. */
				case 29: sum += qlp_coeff[28] * data[i-29]; /* Falls through. */
				case 28: sum += qlp_coeff[27] * data[i-28]; /* Falls through. */
				case 27: sum += qlp_coeff[26] * data[i-27]; /* Falls through. */
				case 26: sum += qlp_coeff[25] * data[i-26]; /* Falls through. */
				case 25: sum += qlp_coeff[24] * data[i-25]; /* Falls through. */
				case 24: sum += qlp_coeff[23] * data[i-24]; /* Falls through. */
				case 23: sum += qlp_coeff[22] * data[i-23]; /* Falls through. */
				case 22: sum += qlp_coeff[21] * data[i-22]; /* Falls through. */
				case 21: sum += qlp_coeff[20] * data[i-21]; /* Falls through. */
				case 20: sum += qlp_coeff[19] * data[i-20]; /* Falls through. */
				case 19: sum += qlp_coeff[18] * data[i-19]; /* Falls through. */
				case 18: sum += qlp_coeff[17] * data[i-18]; /* Falls through. */
				case 17: sum += qlp_coeff[16] * data[i-17]; /* Falls through. */
				case 16: sum += qlp_coeff[15] * data[i-16]; /* Falls through. */
				case 15: sum += qlp_coeff[14] * data[i-15]; /* Falls through. */
				case 14: sum += qlp_coeff[13] * data[i-14]; /* Falls through. */
				case 13: sum += qlp_coeff[12] * data[i-13];
					sum += qlp_coeff[11] * data[i-12];
					sum += qlp_coeff[10] * data[i-11];
					sum += qlp_coeff[ 9] * data[i-10];
					sum += qlp_coeff[ 8] * data[i- 9];
					sum += qlp_coeff[ 7] * data[i- 8];
					sum += qlp_coeff[ 6] * data[i- 7];
					sum += qlp_coeff[ 5] * data[i- 6];
					sum += qlp_coeff[ 4] * data[i- 5];
					sum += qlp_coeff[ 3] * data[i- 4];
					sum += qlp_coeff[ 2] * data[i- 3];
					sum += qlp_coeff[ 1] * data[i- 2];
					sum += qlp_coeff[ 0] * data[i- 1];
			}
			residual[i] = data[i] - (sum >> lp_quantization);
		}
	}
}

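/* General version for full 32-bit coefficients/samples: it computes one
 * residual per iteration, multiplying coefficient/sample pairs with
 * _mm_mul_epu32.  Only the low 32 bits of each 64-bit product are kept, and
 * those are identical for signed and unsigned operands, which is why the
 * unsigned intrinsic is safe here (see the in-loop comment below). */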
FLAC__SSE_TARGET("sse2")
void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse2(const FLAC__int32 *data, uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 residual[])
{
	int i;

	FLAC__ASSERT(order > 0);
	FLAC__ASSERT(order <= 32);

	if(order <= 12) {
		if(order > 8) { /* order == 9, 10, 11, 12 */
			if(order > 10) { /* order == 11, 12 */
				if(order == 12) {
					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));  // 0  0  q[1]  q[0]
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));  // 0  0  q[3]  q[2]
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));  // 0  0  q[5]  q[4]
					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));  // 0  0  q[7]  q[6]
					xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));  // 0  0  q[9]  q[8]
					xmm5 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+10)); // 0  0  q[11] q[10]

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0)); // 0  q[1]  0  q[0]
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0)); // 0  q[3]  0  q[2]
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0)); // 0  q[5]  0  q[4]
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0)); // 0  q[7]  0  q[6]
					xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0)); // 0  q[9]  0  q[8]
					xmm5 = _mm_shuffle_epi32(xmm5, _MM_SHUFFLE(3,1,2,0)); // 0  q[11] 0  q[10]

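					/* Descriptive note: _MM_SHUFFLE(3,1,2,0) above moves each coefficient
					 * pair into the even 32-bit lanes, and _MM_SHUFFLE(2,0,3,1) below
					 * mirrors the data pair so that q[k] lines up with data[i-k-1];
					 * _mm_mul_epu32 reads exactly those even lanes.  The two partial sums
					 * are combined with the _mm_srli_si128/_mm_add_epi32 step at the end
					 * of the loop before RESIDUAL32_RESULT extracts and shifts the total. */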
					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[11] * data[i-12];
						//sum += qlp_coeff[10] * data[i-11];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-12));  // 0  0  d[i-11]  d[i-12]
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1)); // 0  d[i-12]  0  d[i-11]
						xmm7 = _mm_mul_epu32(xmm7, xmm5); /* we use _unsigned_ multiplication and discard high dword of the result values */

						//sum += qlp_coeff[9] * data[i-10];
						//sum += qlp_coeff[8] * data[i-9];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-10));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm4);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[7] * data[i-8];
						//sum += qlp_coeff[6] * data[i-7];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm3);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[5] * data[i-6];
						//sum += qlp_coeff[4] * data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm2);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm1);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
				else { /* order == 11 */
					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
					xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));
					xmm5 = _mm_cvtsi32_si128(qlp_coeff[10]);

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
					xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum  = qlp_coeff[10] * data[i-11];
						xmm7 = _mm_cvtsi32_si128(data[i-11]);
						xmm7 = _mm_mul_epu32(xmm7, xmm5);

						//sum += qlp_coeff[9] * data[i-10];
						//sum += qlp_coeff[8] * data[i-9];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-10));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm4);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[7] * data[i-8];
						//sum += qlp_coeff[6] * data[i-7];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm3);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[5] * data[i-6];
						//sum += qlp_coeff[4] * data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm2);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm1);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
			}
			else { /* order == 9, 10 */
				if(order == 10) {
					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
					xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
					xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[9] * data[i-10];
						//sum += qlp_coeff[8] * data[i-9];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-10));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epu32(xmm7, xmm4);

						//sum += qlp_coeff[7] * data[i-8];
						//sum += qlp_coeff[6] * data[i-7];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm3);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[5] * data[i-6];
						//sum += qlp_coeff[4] * data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm2);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm1);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
				else { /* order == 9 */
					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
					xmm4 = _mm_cvtsi32_si128(qlp_coeff[8]);

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum  = qlp_coeff[8] * data[i-9];
						xmm7 = _mm_cvtsi32_si128(data[i-9]);
						xmm7 = _mm_mul_epu32(xmm7, xmm4);

						//sum += qlp_coeff[7] * data[i-8];
						//sum += qlp_coeff[6] * data[i-7];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm3);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[5] * data[i-6];
						//sum += qlp_coeff[4] * data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm2);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm1);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
			}
		}
		else if(order > 4) { /* order == 5, 6, 7, 8 */
			if(order > 6) { /* order == 7, 8 */
				if(order == 8) {
					__m128i xmm0, xmm1, xmm2, xmm3, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[7] * data[i-8];
						//sum += qlp_coeff[6] * data[i-7];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-8));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epu32(xmm7, xmm3);

						//sum += qlp_coeff[5] * data[i-6];
						//sum += qlp_coeff[4] * data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm2);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm1);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
				else { /* order == 7 */
					__m128i xmm0, xmm1, xmm2, xmm3, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
					xmm3 = _mm_cvtsi32_si128(qlp_coeff[6]);

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum  = qlp_coeff[6] * data[i-7];
						xmm7 = _mm_cvtsi32_si128(data[i-7]);
						xmm7 = _mm_mul_epu32(xmm7, xmm3);

						//sum += qlp_coeff[5] * data[i-6];
						//sum += qlp_coeff[4] * data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm2);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm1);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
			}
			else { /* order == 5, 6 */
				if(order == 6) {
					__m128i xmm0, xmm1, xmm2, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[5] * data[i-6];
						//sum += qlp_coeff[4] * data[i-5];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epu32(xmm7, xmm2);

						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm1);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
				else { /* order == 5 */
					__m128i xmm0, xmm1, xmm2, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_cvtsi32_si128(qlp_coeff[4]);

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum  = qlp_coeff[4] * data[i-5];
						xmm7 = _mm_cvtsi32_si128(data[i-5]);
						xmm7 = _mm_mul_epu32(xmm7, xmm2);

						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm1);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
			}
		}
		else { /* order == 1, 2, 3, 4 */
			if(order > 2) { /* order == 3, 4 */
				if(order == 4) {
					__m128i xmm0, xmm1, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epu32(xmm7, xmm1);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
				else { /* order == 3 */
					__m128i xmm0, xmm1, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_cvtsi32_si128(qlp_coeff[2]);

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum  = qlp_coeff[2] * data[i-3];
						xmm7 = _mm_cvtsi32_si128(data[i-3]);
						xmm7 = _mm_mul_epu32(xmm7, xmm1);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
			}
			else { /* order == 1, 2 */
				if(order == 2) {
					__m128i xmm0, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epu32(xmm7, xmm0);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
				else { /* order == 1 */
					for(i = 0; i < (int)data_len; i++)
						residual[i] = data[i] - ((qlp_coeff[0] * data[i-1]) >> lp_quantization);
				}
			}
		}
	}
	else { /* order > 12 */
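		/* Same scalar fall-through as in the 16-bit variant: the switch
		 * accumulates terms 13..order, the final twelve terms follow
		 * unconditionally, and the residual is data[i] minus the shifted
		 * prediction. */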
		FLAC__int32 sum;
		for(i = 0; i < (int)data_len; i++) {
			sum = 0;
			switch(order) {
				case 32: sum += qlp_coeff[31] * data[i-32]; /* Falls through. */
				case 31: sum += qlp_coeff[30] * data[i-31]; /* Falls through. */
				case 30: sum += qlp_coeff[29] * data[i-30]; /* Falls through. */
				case 29: sum += qlp_coeff[28] * data[i-29]; /* Falls through. */
				case 28: sum += qlp_coeff[27] * data[i-28]; /* Falls through. */
				case 27: sum += qlp_coeff[26] * data[i-27]; /* Falls through. */
				case 26: sum += qlp_coeff[25] * data[i-26]; /* Falls through. */
				case 25: sum += qlp_coeff[24] * data[i-25]; /* Falls through. */
				case 24: sum += qlp_coeff[23] * data[i-24]; /* Falls through. */
				case 23: sum += qlp_coeff[22] * data[i-23]; /* Falls through. */
				case 22: sum += qlp_coeff[21] * data[i-22]; /* Falls through. */
				case 21: sum += qlp_coeff[20] * data[i-21]; /* Falls through. */
				case 20: sum += qlp_coeff[19] * data[i-20]; /* Falls through. */
				case 19: sum += qlp_coeff[18] * data[i-19]; /* Falls through. */
				case 18: sum += qlp_coeff[17] * data[i-18]; /* Falls through. */
				case 17: sum += qlp_coeff[16] * data[i-17]; /* Falls through. */
				case 16: sum += qlp_coeff[15] * data[i-16]; /* Falls through. */
				case 15: sum += qlp_coeff[14] * data[i-15]; /* Falls through. */
				case 14: sum += qlp_coeff[13] * data[i-14]; /* Falls through. */
				case 13: sum += qlp_coeff[12] * data[i-13];
					sum += qlp_coeff[11] * data[i-12];
					sum += qlp_coeff[10] * data[i-11];
					sum += qlp_coeff[ 9] * data[i-10];
					sum += qlp_coeff[ 8] * data[i- 9];
					sum += qlp_coeff[ 7] * data[i- 8];
					sum += qlp_coeff[ 6] * data[i- 7];
					sum += qlp_coeff[ 5] * data[i- 6];
					sum += qlp_coeff[ 4] * data[i- 5];
					sum += qlp_coeff[ 3] * data[i- 4];
					sum += qlp_coeff[ 2] * data[i- 3];
					sum += qlp_coeff[ 1] * data[i- 2];
					sum += qlp_coeff[ 0] * data[i- 1];
			}
			residual[i] = data[i] - (sum >> lp_quantization);
		}
	}
}

#endif /* FLAC__SSE2_SUPPORTED */
#endif /* (FLAC__CPU_IA32 || FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN */
#endif /* FLAC__NO_ASM */
#endif /* FLAC__INTEGER_ONLY_LIBRARY */