1 /*
2 * MP3 quantization, intrinsics functions
3 *
4 * Copyright (c) 2005-2006 Gabriel Bouvigne
5 *
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Library General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Library General Public License for more details.
15 *
16 * You should have received a copy of the GNU Library General Public
17 * License along with this library; if not, write to the
18 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19 * Boston, MA 02111-1307, USA.
20 */
21
22
23 #ifdef HAVE_CONFIG_H
24 # include <config.h>
25 #endif
26
27 #include "lame.h"
28 #include "machine.h"
29 #include "encoder.h"
30 #include "util.h"
31 #include "lame_intrin.h"
32
33
34
35 #ifdef HAVE_XMMINTRIN_H
36
37 #include <xmmintrin.h>
38
/*
 * Three views of the same 16 bytes: four 32-bit integers, four floats, or
 * one SSE register.  Code in this file uses it to build lane masks from raw
 * bit patterns and to read or write individual lanes of an __m128.
 */
typedef union {
    int32_t _i_32[4];   /* must stay first: a brace initializer for a union sets its first member,
                           so constants are written as raw 32-bit patterns */
    float   _float[4];  /* per-lane float access */
    __m128  _m128;      /* whole-register access */
} vecfloat_union;
44
/* Number of (cos, sin) pairs stored in costab. */
#define TRI_SIZE (5-1)  /* 1024 = 4**5 */
/*
 * (cos, sin) pairs for the angles pi/8, pi/32, pi/128 and pi/512.
 * fht_SSE2 reads one pair per pass of its outer loop (tri[0] is the cosine,
 * tri[1] the sine) and then advances to the next pair.
 */
static const FLOAT costab[TRI_SIZE * 2] = {
    9.238795325112867e-01, 3.826834323650898e-01,
    9.951847266721969e-01, 9.801714032956060e-02,
    9.996988186962042e-01, 2.454122852291229e-02,
    9.999811752826011e-01, 6.135884649154475e-03
};
52
53
/*
 * Make sure functions with SSE instructions maintain their own properly
 * aligned stack.  With GCC 4.2 or newer, SSE_FUNCTION expands to the
 * force_align_arg_pointer attribute; with any other compiler it expands
 * to nothing.
 */
#if defined (__GNUC__) && ((__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ >= 2)))
#define SSE_FUNCTION __attribute__((force_align_arg_pointer))
#else
#define SSE_FUNCTION
#endif
60
61
62 SSE_FUNCTION void
init_xrpow_core_sse(gr_info * const cod_info,FLOAT xrpow[576],int max_nz,FLOAT * sum)63 init_xrpow_core_sse(gr_info * const cod_info, FLOAT xrpow[576], int max_nz, FLOAT * sum)
64 {
65 int i;
66 float tmp_max = 0;
67 float tmp_sum = 0;
68 int upper = max_nz + 1;
69 int upper4 = (upper / 4) * 4;
70 int rest = upper-upper4;
71
72 const vecfloat_union fabs_mask = {{ 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF }};
73 const __m128 vec_fabs_mask = _mm_loadu_ps(&fabs_mask._float[0]);
74 vecfloat_union vec_xrpow_max;
75 vecfloat_union vec_sum;
76 vecfloat_union vec_tmp;
77
78 _mm_prefetch((char const *) cod_info->xr, _MM_HINT_T0);
79 _mm_prefetch((char const *) xrpow, _MM_HINT_T0);
80
81 vec_xrpow_max._m128 = _mm_set_ps1(0);
82 vec_sum._m128 = _mm_set_ps1(0);
83
84 for (i = 0; i < upper4; i += 4) {
85 vec_tmp._m128 = _mm_loadu_ps(&(cod_info->xr[i])); /* load */
86 vec_tmp._m128 = _mm_and_ps(vec_tmp._m128, vec_fabs_mask); /* fabs */
87 vec_sum._m128 = _mm_add_ps(vec_sum._m128, vec_tmp._m128);
88 vec_tmp._m128 = _mm_sqrt_ps(_mm_mul_ps(vec_tmp._m128, _mm_sqrt_ps(vec_tmp._m128)));
89 vec_xrpow_max._m128 = _mm_max_ps(vec_xrpow_max._m128, vec_tmp._m128); /* retrieve max */
90 _mm_storeu_ps(&(xrpow[i]), vec_tmp._m128); /* store into xrpow[] */
91 }
92 vec_tmp._m128 = _mm_set_ps1(0);
93 switch (rest) {
94 case 3: vec_tmp._float[2] = cod_info->xr[upper4+2];
95 case 2: vec_tmp._float[1] = cod_info->xr[upper4+1];
96 case 1: vec_tmp._float[0] = cod_info->xr[upper4+0];
97 vec_tmp._m128 = _mm_and_ps(vec_tmp._m128, vec_fabs_mask); /* fabs */
98 vec_sum._m128 = _mm_add_ps(vec_sum._m128, vec_tmp._m128);
99 vec_tmp._m128 = _mm_sqrt_ps(_mm_mul_ps(vec_tmp._m128, _mm_sqrt_ps(vec_tmp._m128)));
100 vec_xrpow_max._m128 = _mm_max_ps(vec_xrpow_max._m128, vec_tmp._m128); /* retrieve max */
101 switch (rest) {
102 case 3: xrpow[upper4+2] = vec_tmp._float[2];
103 case 2: xrpow[upper4+1] = vec_tmp._float[1];
104 case 1: xrpow[upper4+0] = vec_tmp._float[0];
105 default:
106 break;
107 }
108 default:
109 break;
110 }
111 tmp_sum = vec_sum._float[0] + vec_sum._float[1] + vec_sum._float[2] + vec_sum._float[3];
112 {
113 float ma = vec_xrpow_max._float[0] > vec_xrpow_max._float[1]
114 ? vec_xrpow_max._float[0] : vec_xrpow_max._float[1];
115 float mb = vec_xrpow_max._float[2] > vec_xrpow_max._float[3]
116 ? vec_xrpow_max._float[2] : vec_xrpow_max._float[3];
117 tmp_max = ma > mb ? ma : mb;
118 }
119 cod_info->xrpow_max = tmp_max;
120 *sum = tmp_sum;
121 }
122
123
124 SSE_FUNCTION static void
store4(__m128 v,float * f0,float * f1,float * f2,float * f3)125 store4(__m128 v, float* f0, float* f1, float* f2, float* f3)
126 {
127 vecfloat_union r;
128 r._m128 = v;
129 *f0 = r._float[0];
130 *f1 = r._float[1];
131 *f2 = r._float[2];
132 *f3 = r._float[3];
133 }
134
135
/*
 * fht_SSE2 - in-place transform of the buffer fz.  (The "fht" prefix
 * presumably stands for fast Hartley transform; this file does not say so
 * explicitly - confirm against the callers.)
 *
 * fz : buffer that is read and overwritten in place
 * n  : doubled on entry (see the comment at the shift below); fz + the
 *      doubled value marks the end of the processed region
 *
 * Each pass of the outer loop multiplies the group stride k4 by four and
 * uses one (cos, sin) pair from costab.  costab holds TRI_SIZE pairs, so at
 * most TRI_SIZE passes are supported: the doubled n must not exceed 1024,
 * otherwise a further pass would read past the end of costab.  Because the
 * outer loop is a do/while, the first pass always runs and touches
 * fz[0..15].
 */
SSE_FUNCTION void
fht_SSE2(FLOAT * fz, int n)
{
    const FLOAT *tri = costab;  /* (cos, sin) pair used by the current pass */
    int     k4;
    FLOAT  *fi, *gi;
    FLOAT const *fn;

    n <<= 1;            /* to get BLKSIZE, because of 3DNow! ASM routine */
    fn = fz + n;        /* end of the processed region */
    k4 = 4;
    do {
        FLOAT   s1, c1;
        int     i, k1, k2, k3, kx;
        /* With K the value of k4 on entry: kx = K/2, k1 = K, k2 = 2K, k3 = 3K, and k4 becomes 4K. */
        kx = k4 >> 1;
        k1 = k4;
        k2 = k4 << 1;
        k3 = k2 + k1;
        k4 = k2 << 1;
        fi = fz;
        gi = fi + kx;
        /*
         * Scalar butterflies, one per group of k4 elements: on offsets
         * 0, k1, k2, k3 from the group start (fi) and the same offsets
         * from kx into the group (gi).  All four inputs are read before
         * any of them is overwritten.
         */
        do {
            FLOAT   f0, f1, f2, f3;
            f1 = fi[0] - fi[k1];
            f0 = fi[0] + fi[k1];
            f3 = fi[k2] - fi[k3];
            f2 = fi[k2] + fi[k3];
            fi[k2] = f0 - f2;
            fi[0] = f0 + f2;
            fi[k3] = f1 - f3;
            fi[k1] = f1 + f3;
            f1 = gi[0] - gi[k1];
            f0 = gi[0] + gi[k1];
            f3 = SQRT2 * gi[k3];
            f2 = SQRT2 * gi[k2];
            gi[k2] = f0 - f2;
            gi[0] = f0 + f2;
            gi[k3] = f1 - f3;
            gi[k1] = f1 + f3;
            gi += k4;
            fi += k4;
        } while (fi < fn);
        /* Start the rotation (c1, s1) at this pass's base angle. */
        c1 = tri[0];
        s1 = tri[1];
        /* Remaining offsets 1..kx-1, pairing offset i (fi) with offset k1 - i (gi). */
        for (i = 1; i < kx; i++) {
            __m128 v_s2;
            __m128 v_c2;
            __m128 v_c1;
            __m128 v_s1;
            /* Double-angle values: c2 = 1 - 2*s1*s1, s2 = 2*s1*c1. */
            FLOAT   c2, s2, s1_2 = s1+s1;
            c2 = 1 - s1_2 * s1;
            s2 = s1_2 * c1;
            fi = fz + i;
            gi = fz + k1 - i;
            /* Broadcast each factor to all four lanes, then flip the sign of selected lanes with XOR. */
            v_c1 = _mm_set_ps1(c1);
            v_s1 = _mm_set_ps1(s1);
            v_c2 = _mm_set_ps1(c2);
            v_s2 = _mm_set_ps1(s2);
            {
                static const vecfloat_union sign_mask = {{0x80000000,0,0,0}};
                v_c1 = _mm_xor_ps(sign_mask._m128, v_c1); /* v_c1 := {-c1, +c1, +c1, +c1} */
            }
            {
                static const vecfloat_union sign_mask = {{0,0x80000000,0,0}};
                v_s1 = _mm_xor_ps(sign_mask._m128, v_s1); /* v_s1 := {+s1, -s1, +s1, +s1} */
            }
            {
                static const vecfloat_union sign_mask = {{0,0,0x80000000,0x80000000}};
                v_c2 = _mm_xor_ps(sign_mask._m128, v_c2); /* v_c2 := {+c2, +c2, -c2, -c2} */
            }
            /* One vectorized butterfly per group of k4 elements; all eight inputs are read before the stores. */
            do {
                __m128 p, q, r;

                q = _mm_setr_ps(fi[k1], fi[k3], gi[k1], gi[k3]); /* Q := {fi_k1,fi_k3,gi_k1,gi_k3}*/
                p = _mm_mul_ps(v_s2, q);                         /* P := s2 * Q */
                q = _mm_mul_ps(v_c2, q);                         /* Q := c2 * Q */
                q = _mm_shuffle_ps(q, q, _MM_SHUFFLE(1,0,3,2));  /* Q := {-c2*gi_k1,-c2*gi_k3,c2*fi_k1,c2*fi_k3} */
                p = _mm_add_ps(p, q);

                r = _mm_setr_ps(gi[0], gi[k2], fi[0], fi[k2]);   /* R := {gi_0,gi_k2,fi_0,fi_k2} */
                q = _mm_sub_ps(r, p);                            /* Q := {gi_0-p0,gi_k2-p1,fi_0-p2,fi_k2-p3} */
                r = _mm_add_ps(r, p);                            /* R := {gi_0+p0,gi_k2+p1,fi_0+p2,fi_k2+p3} */
                p = _mm_shuffle_ps(q, r, _MM_SHUFFLE(2,0,2,0));  /* P := {q0,q2,r0,r2} */
                p = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3,1,2,0));  /* P := {q0,r0,q2,r2} */
                q = _mm_shuffle_ps(q, r, _MM_SHUFFLE(3,1,3,1));  /* Q := {q1,q3,r1,r3} */
                r = _mm_mul_ps(v_c1, q);
                q = _mm_mul_ps(v_s1, q);
                q = _mm_shuffle_ps(q, q, _MM_SHUFFLE(0,1,2,3));  /* Q := {q3,q2,q1,q0} */
                q = _mm_add_ps(q, r);

                /* Scatter the four differences and the four sums back into the buffer. */
                store4(_mm_sub_ps(p, q), &gi[k3], &gi[k2], &fi[k3], &fi[k2]);
                store4(_mm_add_ps(p, q), &gi[k1], &gi[ 0], &fi[k1], &fi[ 0]);

                gi += k4;
                fi += k4;
            } while (fi < fn);
            /* Advance (c1, s1) by the base angle using the angle-addition formulas; c2 is reused as a temporary for the old c1. */
            c2 = c1;
            c1 = c2 * tri[0] - s1 * tri[1];
            s1 = c2 * tri[1] + s1 * tri[0];
        }
        tri += 2;       /* next (cos, sin) pair for the next pass */
    } while (k4 < n);
}
239
240 #endif /* HAVE_XMMINTRIN_H */
241
242