1 /***********************************************************************
2 Copyright (c) 2017 Google Inc.
3 Redistribution and use in source and binary forms, with or without
4 modification, are permitted provided that the following conditions
5 are met:
6 - Redistributions of source code must retain the above copyright notice,
7 this list of conditions and the following disclaimer.
8 - Redistributions in binary form must reproduce the above copyright
9 notice, this list of conditions and the following disclaimer in the
10 documentation and/or other materials provided with the distribution.
11 - Neither the name of Internet Society, IETF or IETF Trust, nor the
12 names of specific contributors, may be used to endorse or promote
13 products derived from this software without specific prior written
14 permission.
15 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
16 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
19 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
20 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
21 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
22 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
23 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
24 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
25 POSSIBILITY OF SUCH DAMAGE.
26 ***********************************************************************/
27
28 #ifdef HAVE_CONFIG_H
29 #include "config.h"
30 #endif
31
32 #include <arm_neon.h>
33 #include "pitch.h"
34
35 #ifdef FIXED_POINT
36
/* NEON fixed-point inner product: returns the sum of x[i]*y[i] over N
 * samples. Bit-exact with celt_inner_prod_c(): the 64-bit vector total is
 * truncated to its low 32 bits, matching the wrap-around of the scalar
 * MAC16_16() accumulation. */
opus_val32 celt_inner_prod_neon(const opus_val16 *x, const opus_val16 *y, int N)
{
    int32x4_t acc32 = vdupq_n_s32(0);
    int64x2_t acc64;
    int64x1_t total;
    opus_val32 xy;
    int i = 0;

    /* Main loop: 8 samples per iteration via widening multiply-accumulate. */
    while (i <= N - 8) {
        const int16x8_t vx = vld1q_s16(x + i);
        const int16x8_t vy = vld1q_s16(y + i);
        acc32 = vmlal_s16(acc32, vget_low_s16 (vx), vget_low_s16 (vy));
        acc32 = vmlal_s16(acc32, vget_high_s16(vx), vget_high_s16(vy));
        i += 8;
    }

    /* One extra group of 4 when at least 4 samples remain. */
    if (N - i >= 4) {
        acc32 = vmlal_s16(acc32, vld1_s16(x + i), vld1_s16(y + i));
        i += 4;
    }

    /* Horizontal reduction; reinterpret keeps only the low 32 bits so the
     * overflow behaviour matches the scalar reference. */
    acc64 = vpaddlq_s32(acc32);
    total = vadd_s64(vget_low_s64(acc64), vget_high_s64(acc64));
    xy = vget_lane_s32(vreinterpret_s32_s64(total), 0);

    /* Scalar tail for the final 0..3 samples. */
    for (; i < N; i++) {
        xy = MAC16_16(xy, x[i], y[i]);
    }

#ifdef OPUS_CHECK_ASM
    celt_assert(celt_inner_prod_c(x, y, N) == xy);
#endif

    return xy;
}
74
/* NEON fixed-point dual inner product: computes *xy1 = sum(x[i]*y01[i]) and
 * *xy2 = sum(x[i]*y02[i]) over N samples in a single pass, sharing the loads
 * of x[]. Bit-exact with dual_inner_prod_c() (low 32 bits of each 64-bit
 * vector total, matching scalar MAC16_16 wrap-around). */
void dual_inner_prod_neon(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
        int N, opus_val32 *xy1, opus_val32 *xy2)
{
    int32x4_t acc1 = vdupq_n_s32(0);
    int32x4_t acc2 = vdupq_n_s32(0);
    int64x2_t wide1, wide2;
    int64x1_t total1, total2;
    opus_val32 xy01, xy02;
    int i = 0;

    /* Main loop: 8 samples per iteration, one shared x load feeding both
     * accumulators. */
    while (i <= N - 8) {
        const int16x8_t vx  = vld1q_s16(x + i);
        const int16x8_t vy1 = vld1q_s16(y01 + i);
        const int16x8_t vy2 = vld1q_s16(y02 + i);
        acc1 = vmlal_s16(acc1, vget_low_s16 (vx), vget_low_s16 (vy1));
        acc2 = vmlal_s16(acc2, vget_low_s16 (vx), vget_low_s16 (vy2));
        acc1 = vmlal_s16(acc1, vget_high_s16(vx), vget_high_s16(vy1));
        acc2 = vmlal_s16(acc2, vget_high_s16(vx), vget_high_s16(vy2));
        i += 8;
    }

    /* One extra group of 4 when at least 4 samples remain. */
    if (N - i >= 4) {
        const int16x4_t vx = vld1_s16(x + i);
        acc1 = vmlal_s16(acc1, vx, vld1_s16(y01 + i));
        acc2 = vmlal_s16(acc2, vx, vld1_s16(y02 + i));
        i += 4;
    }

    /* Horizontal reduction of both accumulators; keep low 32 bits only. */
    wide1 = vpaddlq_s32(acc1);
    wide2 = vpaddlq_s32(acc2);
    total1 = vadd_s64(vget_low_s64(wide1), vget_high_s64(wide1));
    total2 = vadd_s64(vget_low_s64(wide2), vget_high_s64(wide2));
    xy01 = vget_lane_s32(vreinterpret_s32_s64(total1), 0);
    xy02 = vget_lane_s32(vreinterpret_s32_s64(total2), 0);

    /* Scalar tail for the final 0..3 samples. */
    for (; i < N; i++) {
        xy01 = MAC16_16(xy01, x[i], y01[i]);
        xy02 = MAC16_16(xy02, x[i], y02[i]);
    }
    *xy1 = xy01;
    *xy2 = xy02;

#ifdef OPUS_CHECK_ASM
    {
        opus_val32 xy1_c, xy2_c;
        dual_inner_prod_c(x, y01, y02, N, &xy1_c, &xy2_c);
        celt_assert(xy1_c == *xy1);
        celt_assert(xy2_c == *xy2);
    }
#endif
}
128
129 #else /* !FIXED_POINT */
130
131 /* ========================================================================== */
132
133 #ifdef OPUS_CHECK_ASM
134
135 /* This part of code simulates floating-point NEON operations. */
136
137 /* celt_inner_prod_neon_float_c_simulation() simulates the floating-point */
138 /* operations of celt_inner_prod_neon(), and both functions should have bit */
139 /* exact output. */
celt_inner_prod_neon_float_c_simulation(const opus_val16 * x,const opus_val16 * y,float * err,int N)140 static opus_val32 celt_inner_prod_neon_float_c_simulation(const opus_val16 *x, const opus_val16 *y, float *err, int N)
141 {
142 int i;
143 *err = 0;
144 opus_val32 xy, xy0 = 0, xy1 = 0, xy2 = 0, xy3 = 0;
145 for (i = 0; i < N - 3; i += 4) {
146 xy0 = MAC16_16(xy0, x[i + 0], y[i + 0]);
147 xy1 = MAC16_16(xy1, x[i + 1], y[i + 1]);
148 xy2 = MAC16_16(xy2, x[i + 2], y[i + 2]);
149 xy3 = MAC16_16(xy3, x[i + 3], y[i + 3]);
150 *err += ABS32(xy0)+ABS32(xy1)+ABS32(xy2)+ABS32(xy3);
151 }
152 xy0 += xy2;
153 xy1 += xy3;
154 xy = xy0 + xy1;
155 *err += ABS32(xy1)+ABS32(xy0)+ABS32(xy);
156 for (; i < N; i++) {
157 xy = MAC16_16(xy, x[i], y[i]);
158 *err += ABS32(xy);
159 }
160 *err = *err*2e-7 + N*1e-37;
161 return xy;
162 }
163
164 /* dual_inner_prod_neon_float_c_simulation() simulates the floating-point */
165 /* operations of dual_inner_prod_neon(), and both functions should have bit */
166 /* exact output. */
/* Scalar simulation of the float dual_inner_prod_neon(): the two
 * correlations are fully independent, so each is simulated separately, with
 * its result written to the matching output and its error bound to the
 * matching err[] slot. */
static void dual_inner_prod_neon_float_c_simulation(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
        int N, opus_val32 *xy1, opus_val32 *xy2, float *err)
{
    *xy2 = celt_inner_prod_neon_float_c_simulation(x, y02, &err[1], N);
    *xy1 = celt_inner_prod_neon_float_c_simulation(x, y01, &err[0], N);
}
173
174 #endif /* OPUS_CHECK_ASM */
175
176 /* ========================================================================== */
177
/* NEON float inner product: returns the sum of x[i]*y[i] over N samples.
 * Accumulates in four float lanes, then reduces pairwise; the result is
 * checked against a scalar simulation (with a rounding-error tolerance)
 * under OPUS_CHECK_ASM. The multiply-accumulate order must stay fixed for
 * that check to hold. */
opus_val32 celt_inner_prod_neon(const opus_val16 *x, const opus_val16 *y, int N)
{
    float32x4_t acc = vdupq_n_f32(0);
    float32x2_t half;
    opus_val32 xy;
    int i = 0;

    /* Main loop: two fused groups of 4 per iteration (8 samples). */
    while (i <= N - 8) {
        acc = vmlaq_f32(acc, vld1q_f32(x + i), vld1q_f32(y + i));
        acc = vmlaq_f32(acc, vld1q_f32(x + i + 4), vld1q_f32(y + i + 4));
        i += 8;
    }

    /* One extra group of 4 when at least 4 samples remain. */
    if (N - i >= 4) {
        acc = vmlaq_f32(acc, vld1q_f32(x + i), vld1q_f32(y + i));
        i += 4;
    }

    /* Horizontal reduction: high half onto low half, then pairwise add. */
    half = vadd_f32(vget_low_f32(acc), vget_high_f32(acc));
    half = vpadd_f32(half, half);
    xy = vget_lane_f32(half, 0);

    /* Scalar tail for the final 0..3 samples. */
    for (; i < N; i++) {
        xy = MAC16_16(xy, x[i], y[i]);
    }

#ifdef OPUS_CHECK_ASM
    {
        float err, res;
        res = celt_inner_prod_neon_float_c_simulation(x, y, &err, N);
        /*if (ABS32(res - xy) > err) fprintf(stderr, "%g %g %g\n", res, xy, err);*/
        celt_assert(ABS32(res - xy) <= err);
    }
#endif

    return xy;
}
221
/* NEON float dual inner product: computes *xy1 = sum(x[i]*y01[i]) and
 * *xy2 = sum(x[i]*y02[i]) over N samples in one pass, sharing the loads of
 * x[]. Each accumulator's multiply-accumulate order matches the scalar
 * simulation used by the OPUS_CHECK_ASM verification, so it must not be
 * reordered. */
void dual_inner_prod_neon(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
        int N, opus_val32 *xy1, opus_val32 *xy2)
{
    float32x4_t acc1 = vdupq_n_f32(0);
    float32x4_t acc2 = vdupq_n_f32(0);
    float32x2_t half1, half2;
    opus_val32 xy01, xy02;
    int i = 0;

    /* Main loop: two fused groups of 4 per iteration (8 samples), one
     * shared x load feeding both accumulators. */
    while (i <= N - 8) {
        float32x4_t vx = vld1q_f32(x + i);
        acc1 = vmlaq_f32(acc1, vx, vld1q_f32(y01 + i));
        acc2 = vmlaq_f32(acc2, vx, vld1q_f32(y02 + i));
        vx = vld1q_f32(x + i + 4);
        acc1 = vmlaq_f32(acc1, vx, vld1q_f32(y01 + i + 4));
        acc2 = vmlaq_f32(acc2, vx, vld1q_f32(y02 + i + 4));
        i += 8;
    }

    /* One extra group of 4 when at least 4 samples remain. */
    if (N - i >= 4) {
        const float32x4_t vx = vld1q_f32(x + i);
        acc1 = vmlaq_f32(acc1, vx, vld1q_f32(y01 + i));
        acc2 = vmlaq_f32(acc2, vx, vld1q_f32(y02 + i));
        i += 4;
    }

    /* Horizontal reduction of both accumulators. */
    half1 = vadd_f32(vget_low_f32(acc1), vget_high_f32(acc1));
    half2 = vadd_f32(vget_low_f32(acc2), vget_high_f32(acc2));
    half1 = vpadd_f32(half1, half1);
    half2 = vpadd_f32(half2, half2);
    xy01 = vget_lane_f32(half1, 0);
    xy02 = vget_lane_f32(half2, 0);

    /* Scalar tail for the final 0..3 samples. */
    for (; i < N; i++) {
        xy01 = MAC16_16(xy01, x[i], y01[i]);
        xy02 = MAC16_16(xy02, x[i], y02[i]);
    }
    *xy1 = xy01;
    *xy2 = xy02;

#ifdef OPUS_CHECK_ASM
    {
        opus_val32 xy1_c, xy2_c;
        float err[2];
        dual_inner_prod_neon_float_c_simulation(x, y01, y02, N, &xy1_c, &xy2_c, err);
        /*if (ABS32(xy1_c - *xy1) > err[0]) fprintf(stderr, "dual1 fail: %g %g %g\n", xy1_c, *xy1, err[0]);
        if (ABS32(xy2_c - *xy2) > err[1]) fprintf(stderr, "dual2 fail: %g %g %g\n", xy2_c, *xy2, err[1]);*/
        celt_assert(ABS32(xy1_c - *xy1) <= err[0]);
        celt_assert(ABS32(xy2_c - *xy2) <= err[1]);
    }
#endif
}
280
281 #endif /* FIXED_POINT */
282