/***********************************************************************
Copyright (c) 2017 Google Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
- Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of Internet Society, IETF or IETF Trust, nor the
names of specific contributors, may be used to endorse or promote
products derived from this software without specific prior written
permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
***********************************************************************/

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include <arm_neon.h>
#include "pitch.h"

#ifdef FIXED_POINT

/* Fixed-point NEON inner product: returns sum_{i=0}^{N-1} x[i]*y[i].
 * Processes 8 (then 4) 16-bit samples per step, widening each product into
 * one of four 32-bit accumulator lanes; the lanes are reduced through 64-bit
 * pairwise adds and the low 32 bits are kept, with the last 0-3 samples
 * handled by a scalar MAC16_16 tail loop.  An OPUS_CHECK_ASM build asserts
 * the result is identical to the scalar celt_inner_prod_c(). */
opus_val32 celt_inner_prod_neon(const opus_val16 *x, const opus_val16 *y, int N)
{
    int i;
    opus_val32 xy;
    int16x8_t x_s16x8, y_s16x8;
    int32x4_t xy_s32x4 = vdupq_n_s32(0);
    int64x2_t xy_s64x2;
    int64x1_t xy_s64x1;

    /* Main loop: 8 multiply-accumulates per iteration. */
    for (i = 0; i < N - 7; i += 8) {
        x_s16x8  = vld1q_s16(&x[i]);
        y_s16x8  = vld1q_s16(&y[i]);
        xy_s32x4 = vmlal_s16(xy_s32x4, vget_low_s16 (x_s16x8), vget_low_s16 (y_s16x8));
        xy_s32x4 = vmlal_s16(xy_s32x4, vget_high_s16(x_s16x8), vget_high_s16(y_s16x8));
    }

    /* One 4-sample vector step if at least 4 samples remain. */
    if (N - i >= 4) {
        const int16x4_t x_s16x4 = vld1_s16(&x[i]);
        const int16x4_t y_s16x4 = vld1_s16(&y[i]);
        xy_s32x4 = vmlal_s16(xy_s32x4, x_s16x4, y_s16x4);
        i += 4;
    }

    /* Horizontal reduction: pairwise-widen the four s32 lanes to two s64,
     * add halves, then take 32 bits of the 64-bit sum (lane 0 of the
     * reinterpreted vector). */
    xy_s64x2 = vpaddlq_s32(xy_s32x4);
    xy_s64x1 = vadd_s64(vget_low_s64(xy_s64x2), vget_high_s64(xy_s64x2));
    xy       = vget_lane_s32(vreinterpret_s32_s64(xy_s64x1), 0);

    /* Scalar tail for the last 0-3 samples. */
    for (; i < N; i++) {
        xy = MAC16_16(xy, x[i], y[i]);
    }

#ifdef OPUS_CHECK_ASM
    celt_assert(celt_inner_prod_c(x, y, N) == xy);
#endif

    return xy;
}

/* Fixed-point NEON dual inner product: computes, in a single pass over x,
 *   *xy1 = sum x[i]*y01[i]   and   *xy2 = sum x[i]*y02[i]   for i in [0, N).
 * Same vectorization/reduction scheme as celt_inner_prod_neon(), run on two
 * independent accumulator sets so x is loaded only once per iteration.  An
 * OPUS_CHECK_ASM build asserts both results match dual_inner_prod_c(). */
void dual_inner_prod_neon(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
        int N, opus_val32 *xy1, opus_val32 *xy2)
{
    int i;
    opus_val32 xy01, xy02;
    int16x8_t x_s16x8, y01_s16x8, y02_s16x8;
    int32x4_t xy01_s32x4 = vdupq_n_s32(0);
    int32x4_t xy02_s32x4 = vdupq_n_s32(0);
    int64x2_t xy01_s64x2, xy02_s64x2;
    int64x1_t xy01_s64x1, xy02_s64x1;

    /* Main loop: 8 samples of x against both y vectors per iteration. */
    for (i = 0; i < N - 7; i += 8) {
        x_s16x8    = vld1q_s16(&x[i]);
        y01_s16x8  = vld1q_s16(&y01[i]);
        y02_s16x8  = vld1q_s16(&y02[i]);
        xy01_s32x4 = vmlal_s16(xy01_s32x4, vget_low_s16 (x_s16x8), vget_low_s16 (y01_s16x8));
        xy02_s32x4 = vmlal_s16(xy02_s32x4, vget_low_s16 (x_s16x8), vget_low_s16 (y02_s16x8));
        xy01_s32x4 = vmlal_s16(xy01_s32x4, vget_high_s16(x_s16x8), vget_high_s16(y01_s16x8));
        xy02_s32x4 = vmlal_s16(xy02_s32x4, vget_high_s16(x_s16x8), vget_high_s16(y02_s16x8));
    }

    /* One 4-sample vector step if at least 4 samples remain. */
    if (N - i >= 4) {
        const int16x4_t x_s16x4   = vld1_s16(&x[i]);
        const int16x4_t y01_s16x4 = vld1_s16(&y01[i]);
        const int16x4_t y02_s16x4 = vld1_s16(&y02[i]);
        xy01_s32x4 = vmlal_s16(xy01_s32x4, x_s16x4, y01_s16x4);
        xy02_s32x4 = vmlal_s16(xy02_s32x4, x_s16x4, y02_s16x4);
        i += 4;
    }

    /* Horizontal reduction of each accumulator (see celt_inner_prod_neon). */
    xy01_s64x2 = vpaddlq_s32(xy01_s32x4);
    xy02_s64x2 = vpaddlq_s32(xy02_s32x4);
    xy01_s64x1 = vadd_s64(vget_low_s64(xy01_s64x2), vget_high_s64(xy01_s64x2));
    xy02_s64x1 = vadd_s64(vget_low_s64(xy02_s64x2), vget_high_s64(xy02_s64x2));
    xy01       = vget_lane_s32(vreinterpret_s32_s64(xy01_s64x1), 0);
    xy02       = vget_lane_s32(vreinterpret_s32_s64(xy02_s64x1), 0);

    /* Scalar tail for the last 0-3 samples. */
    for (; i < N; i++) {
        xy01 = MAC16_16(xy01, x[i], y01[i]);
        xy02 = MAC16_16(xy02, x[i], y02[i]);
    }
    *xy1 = xy01;
    *xy2 = xy02;

#ifdef OPUS_CHECK_ASM
    {
        opus_val32 xy1_c, xy2_c;
        dual_inner_prod_c(x, y01, y02, N, &xy1_c, &xy2_c);
        celt_assert(xy1_c == *xy1);
        celt_assert(xy2_c == *xy2);
    }
#endif
}

#else /* !FIXED_POINT */

/* ========================================================================== */

#ifdef OPUS_CHECK_ASM

/* This part of code simulates floating-point NEON operations. */

137 /* celt_inner_prod_neon_float_c_simulation() simulates the floating-point   */
138 /* operations of celt_inner_prod_neon(), and both functions should have bit */
139 /* exact output.                                                            */
celt_inner_prod_neon_float_c_simulation(const opus_val16 * x,const opus_val16 * y,float * err,int N)140 static opus_val32 celt_inner_prod_neon_float_c_simulation(const opus_val16 *x, const opus_val16 *y, float *err, int N)
141 {
142    int i;
143    *err = 0;
144    opus_val32 xy, xy0 = 0, xy1 = 0, xy2 = 0, xy3 = 0;
145    for (i = 0; i < N - 3; i += 4) {
146       xy0 = MAC16_16(xy0, x[i + 0], y[i + 0]);
147       xy1 = MAC16_16(xy1, x[i + 1], y[i + 1]);
148       xy2 = MAC16_16(xy2, x[i + 2], y[i + 2]);
149       xy3 = MAC16_16(xy3, x[i + 3], y[i + 3]);
150       *err += ABS32(xy0)+ABS32(xy1)+ABS32(xy2)+ABS32(xy3);
151    }
152    xy0 += xy2;
153    xy1 += xy3;
154    xy = xy0 + xy1;
155    *err += ABS32(xy1)+ABS32(xy0)+ABS32(xy);
156    for (; i < N; i++) {
157       xy = MAC16_16(xy, x[i], y[i]);
158       *err += ABS32(xy);
159    }
160    *err = *err*2e-7 + N*1e-37;
161    return xy;
162 }
163 
164 /* dual_inner_prod_neon_float_c_simulation() simulates the floating-point   */
165 /* operations of dual_inner_prod_neon(), and both functions should have bit */
166 /* exact output.                                                            */
dual_inner_prod_neon_float_c_simulation(const opus_val16 * x,const opus_val16 * y01,const opus_val16 * y02,int N,opus_val32 * xy1,opus_val32 * xy2,float * err)167 static void dual_inner_prod_neon_float_c_simulation(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
168       int N, opus_val32 *xy1, opus_val32 *xy2, float *err)
169 {
170    *xy1 = celt_inner_prod_neon_float_c_simulation(x, y01, &err[0], N);
171    *xy2 = celt_inner_prod_neon_float_c_simulation(x, y02, &err[1], N);
172 }

#endif /* OPUS_CHECK_ASM */

/* ========================================================================== */

/* Floating-point NEON inner product: returns sum_{i=0}^{N-1} x[i]*y[i].
 * Accumulates 4 float lanes at a time (two fused steps of 4 in the main
 * loop), reduces the lanes with vector/pairwise adds, and finishes the last
 * 0-3 samples with a scalar MAC16_16 tail.  Because float addition order
 * differs from the scalar C version, an OPUS_CHECK_ASM build compares the
 * result against a C simulation of these operations within an error bound
 * rather than exactly. */
opus_val32 celt_inner_prod_neon(const opus_val16 *x, const opus_val16 *y, int N)
{
    int i;
    opus_val32 xy;
    float32x4_t xy_f32x4 = vdupq_n_f32(0);
    float32x2_t xy_f32x2;

    /* Main loop: two 4-float multiply-accumulates per iteration. */
    for (i = 0; i < N - 7; i += 8) {
        float32x4_t x_f32x4, y_f32x4;
        x_f32x4  = vld1q_f32(&x[i]);
        y_f32x4  = vld1q_f32(&y[i]);
        xy_f32x4 = vmlaq_f32(xy_f32x4, x_f32x4, y_f32x4);
        x_f32x4  = vld1q_f32(&x[i + 4]);
        y_f32x4  = vld1q_f32(&y[i + 4]);
        xy_f32x4 = vmlaq_f32(xy_f32x4, x_f32x4, y_f32x4);
    }

    /* One 4-sample vector step if at least 4 samples remain. */
    if (N - i >= 4) {
        const float32x4_t x_f32x4 = vld1q_f32(&x[i]);
        const float32x4_t y_f32x4 = vld1q_f32(&y[i]);
        xy_f32x4 = vmlaq_f32(xy_f32x4, x_f32x4, y_f32x4);
        i += 4;
    }

    /* Horizontal reduction: add high half onto low half, then pairwise add. */
    xy_f32x2 = vadd_f32(vget_low_f32(xy_f32x4), vget_high_f32(xy_f32x4));
    xy_f32x2 = vpadd_f32(xy_f32x2, xy_f32x2);
    xy       = vget_lane_f32(xy_f32x2, 0);

    /* Scalar tail for the last 0-3 samples. */
    for (; i < N; i++) {
        xy = MAC16_16(xy, x[i], y[i]);
    }

#ifdef OPUS_CHECK_ASM
    {
        float err, res;
        res = celt_inner_prod_neon_float_c_simulation(x, y, &err, N);
        /*if (ABS32(res - xy) > err) fprintf(stderr, "%g %g %g\n", res, xy, err);*/
        celt_assert(ABS32(res - xy) <= err);
    }
#endif

    return xy;
}

/* Floating-point NEON dual inner product: computes, in a single pass over x,
 *   *xy1 = sum x[i]*y01[i]   and   *xy2 = sum x[i]*y02[i]   for i in [0, N).
 * Same vectorization/reduction scheme as the float celt_inner_prod_neon(),
 * run on two independent accumulator sets so x is loaded only once per
 * iteration.  An OPUS_CHECK_ASM build compares both results against the C
 * simulation within its per-result error bounds. */
void dual_inner_prod_neon(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
        int N, opus_val32 *xy1, opus_val32 *xy2)
{
    int i;
    opus_val32 xy01, xy02;
    float32x4_t xy01_f32x4 = vdupq_n_f32(0);
    float32x4_t xy02_f32x4 = vdupq_n_f32(0);
    float32x2_t xy01_f32x2, xy02_f32x2;

    /* Main loop: two 4-float steps of x against both y vectors. */
    for (i = 0; i < N - 7; i += 8) {
        float32x4_t x_f32x4, y01_f32x4, y02_f32x4;
        x_f32x4    = vld1q_f32(&x[i]);
        y01_f32x4  = vld1q_f32(&y01[i]);
        y02_f32x4  = vld1q_f32(&y02[i]);
        xy01_f32x4 = vmlaq_f32(xy01_f32x4, x_f32x4, y01_f32x4);
        xy02_f32x4 = vmlaq_f32(xy02_f32x4, x_f32x4, y02_f32x4);
        x_f32x4    = vld1q_f32(&x[i + 4]);
        y01_f32x4  = vld1q_f32(&y01[i + 4]);
        y02_f32x4  = vld1q_f32(&y02[i + 4]);
        xy01_f32x4 = vmlaq_f32(xy01_f32x4, x_f32x4, y01_f32x4);
        xy02_f32x4 = vmlaq_f32(xy02_f32x4, x_f32x4, y02_f32x4);
    }

    /* One 4-sample vector step if at least 4 samples remain. */
    if (N - i >= 4) {
        const float32x4_t x_f32x4   = vld1q_f32(&x[i]);
        const float32x4_t y01_f32x4 = vld1q_f32(&y01[i]);
        const float32x4_t y02_f32x4 = vld1q_f32(&y02[i]);
        xy01_f32x4 = vmlaq_f32(xy01_f32x4, x_f32x4, y01_f32x4);
        xy02_f32x4 = vmlaq_f32(xy02_f32x4, x_f32x4, y02_f32x4);
        i += 4;
    }

    /* Horizontal reduction of each accumulator. */
    xy01_f32x2 = vadd_f32(vget_low_f32(xy01_f32x4), vget_high_f32(xy01_f32x4));
    xy02_f32x2 = vadd_f32(vget_low_f32(xy02_f32x4), vget_high_f32(xy02_f32x4));
    xy01_f32x2 = vpadd_f32(xy01_f32x2, xy01_f32x2);
    xy02_f32x2 = vpadd_f32(xy02_f32x2, xy02_f32x2);
    xy01       = vget_lane_f32(xy01_f32x2, 0);
    xy02       = vget_lane_f32(xy02_f32x2, 0);

    /* Scalar tail for the last 0-3 samples. */
    for (; i < N; i++) {
        xy01 = MAC16_16(xy01, x[i], y01[i]);
        xy02 = MAC16_16(xy02, x[i], y02[i]);
    }
    *xy1 = xy01;
    *xy2 = xy02;

#ifdef OPUS_CHECK_ASM
    {
        opus_val32 xy1_c, xy2_c;
        float err[2];
        dual_inner_prod_neon_float_c_simulation(x, y01, y02, N, &xy1_c, &xy2_c, err);
        /*if (ABS32(xy1_c - *xy1) > err[0]) fprintf(stderr, "dual1 fail: %g %g %g\n", xy1_c, *xy1, err[0]);
        if (ABS32(xy2_c - *xy2) > err[1]) fprintf(stderr, "dual2 fail: %g %g %g\n", xy2_c, *xy2, err[1]);*/
        celt_assert(ABS32(xy1_c - *xy1) <= err[0]);
        celt_assert(ABS32(xy2_c - *xy2) <= err[1]);
    }
#endif
}

#endif /* FIXED_POINT */