1 /*
2 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include <arm_neon.h>
12
13 #include "webrtc/modules/audio_coding/codecs/isac/fix/source/codec.h"
14 #include "webrtc/modules/audio_coding/codecs/isac/fix/source/settings.h"
15
16 // Contains a function for the core loop in the normalized lattice MA
17 // filter routine for iSAC codec, optimized for ARM Neon platform.
18 // It does:
19 // for 0 <= n < HALF_SUBFRAMELEN - 1:
20 // *ptr2 = input2 * ((*ptr2) + input0 * (*ptr0));
21 // *ptr1 = input1 * (*ptr0) + input0 * (*ptr2);
22 // Output is not bit-exact with the reference C code, due to the replacement
23 // of WEBRTC_SPL_MUL_16_32_RSFT15 and LATTICE_MUL_32_32_RSFT16 with Neon
24 // instructions. The difference should not be bigger than 1.
WebRtcIsacfix_FilterMaLoopNeon(int16_t input0,int16_t input1,int32_t input2,int32_t * ptr0,int32_t * ptr1,int32_t * ptr2)25 void WebRtcIsacfix_FilterMaLoopNeon(int16_t input0, // Filter coefficient
26 int16_t input1, // Filter coefficient
27 int32_t input2, // Inverse coefficient
28 int32_t* ptr0, // Sample buffer
29 int32_t* ptr1, // Sample buffer
30 int32_t* ptr2) // Sample buffer
31 {
32 int n = 0;
33 int loop = (HALF_SUBFRAMELEN - 1) >> 3;
34 int loop_tail = (HALF_SUBFRAMELEN - 1) & 0x7;
35
36 int32x4_t input0_v = vdupq_n_s32((int32_t)input0 << 16);
37 int32x4_t input1_v = vdupq_n_s32((int32_t)input1 << 16);
38 int32x4_t input2_v = vdupq_n_s32(input2);
39 int32x4_t tmp0a, tmp1a, tmp2a, tmp3a;
40 int32x4_t tmp0b, tmp1b, tmp2b, tmp3b;
41 int32x4_t ptr0va, ptr1va, ptr2va;
42 int32x4_t ptr0vb, ptr1vb, ptr2vb;
43
44 int64x2_t tmp2al_low, tmp2al_high, tmp2bl_low, tmp2bl_high;
45 // Unroll to process 8 samples at once.
46 for (n = 0; n < loop; n++) {
47 ptr0va = vld1q_s32(ptr0);
48 ptr0vb = vld1q_s32(ptr0 + 4);
49 ptr0 += 8;
50
51 ptr2va = vld1q_s32(ptr2);
52 ptr2vb = vld1q_s32(ptr2 + 4);
53
54 // Calculate tmp0 = (*ptr0) * input0.
55 tmp0a = vqrdmulhq_s32(ptr0va, input0_v);
56 tmp0b = vqrdmulhq_s32(ptr0vb, input0_v);
57
58 // Calculate tmp1 = (*ptr0) * input1.
59 tmp1a = vqrdmulhq_s32(ptr0va, input1_v);
60 tmp1b = vqrdmulhq_s32(ptr0vb, input1_v);
61
62 // Calculate tmp2 = tmp0 + *(ptr2).
63 tmp2a = vaddq_s32(tmp0a, ptr2va);
64 tmp2b = vaddq_s32(tmp0b, ptr2vb);
65
66 // Calculate *ptr2 = input2 * tmp2.
67 tmp2al_low = vmull_s32(vget_low_s32(tmp2a), vget_low_s32(input2_v));
68 #if defined(WEBRTC_ARCH_ARM64)
69 tmp2al_high = vmull_high_s32(tmp2a, input2_v);
70 #else
71 tmp2al_high = vmull_s32(vget_high_s32(tmp2a), vget_high_s32(input2_v));
72 #endif
73 ptr2va = vcombine_s32(vrshrn_n_s64(tmp2al_low, 16),
74 vrshrn_n_s64(tmp2al_high, 16));
75
76 tmp2bl_low = vmull_s32(vget_low_s32(tmp2b), vget_low_s32(input2_v));
77 #if defined(WEBRTC_ARCH_ARM64)
78 tmp2bl_high = vmull_high_s32(tmp2b, input2_v);
79 #else
80 tmp2bl_high = vmull_s32(vget_high_s32(tmp2b), vget_high_s32(input2_v));
81 #endif
82 ptr2vb = vcombine_s32(vrshrn_n_s64(tmp2bl_low, 16),
83 vrshrn_n_s64(tmp2bl_high, 16));
84
85 vst1q_s32(ptr2, ptr2va);
86 vst1q_s32(ptr2 + 4, ptr2vb);
87 ptr2 += 8;
88
89 // Calculate tmp3 = ptr2v * input0.
90 tmp3a = vqrdmulhq_s32(ptr2va, input0_v);
91 tmp3b = vqrdmulhq_s32(ptr2vb, input0_v);
92
93 // Calculate *ptr1 = tmp1 + tmp3.
94 ptr1va = vaddq_s32(tmp1a, tmp3a);
95 ptr1vb = vaddq_s32(tmp1b, tmp3b);
96
97 vst1q_s32(ptr1, ptr1va);
98 vst1q_s32(ptr1 + 4, ptr1vb);
99 ptr1 += 8;
100 }
101
102 // Process four more samples.
103 if (loop_tail & 0x4) {
104 ptr0va = vld1q_s32(ptr0);
105 ptr2va = vld1q_s32(ptr2);
106 ptr0 += 4;
107
108 // Calculate tmp0 = (*ptr0) * input0.
109 tmp0a = vqrdmulhq_s32(ptr0va, input0_v);
110
111 // Calculate tmp1 = (*ptr0) * input1.
112 tmp1a = vqrdmulhq_s32(ptr0va, input1_v);
113
114 // Calculate tmp2 = tmp0 + *(ptr2).
115 tmp2a = vaddq_s32(tmp0a, ptr2va);
116
117 // Calculate *ptr2 = input2 * tmp2.
118 tmp2al_low = vmull_s32(vget_low_s32(tmp2a), vget_low_s32(input2_v));
119
120 #if defined(WEBRTC_ARCH_ARM64)
121 tmp2al_high = vmull_high_s32(tmp2a, input2_v);
122 #else
123 tmp2al_high = vmull_s32(vget_high_s32(tmp2a), vget_high_s32(input2_v));
124 #endif
125 ptr2va = vcombine_s32(vrshrn_n_s64(tmp2al_low, 16),
126 vrshrn_n_s64(tmp2al_high, 16));
127
128 vst1q_s32(ptr2, ptr2va);
129 ptr2 += 4;
130
131 // Calculate tmp3 = *(ptr2) * input0.
132 tmp3a = vqrdmulhq_s32(ptr2va, input0_v);
133
134 // Calculate *ptr1 = tmp1 + tmp3.
135 ptr1va = vaddq_s32(tmp1a, tmp3a);
136
137 vst1q_s32(ptr1, ptr1va);
138 ptr1 += 4;
139 }
140
141 // Process two more samples.
142 if (loop_tail & 0x2) {
143 int32x2_t ptr0v_tail, ptr2v_tail, ptr1v_tail;
144 int32x2_t tmp0_tail, tmp1_tail, tmp2_tail, tmp3_tail;
145 int64x2_t tmp2l_tail;
146 ptr0v_tail = vld1_s32(ptr0);
147 ptr2v_tail = vld1_s32(ptr2);
148 ptr0 += 2;
149
150 // Calculate tmp0 = (*ptr0) * input0.
151 tmp0_tail = vqrdmulh_s32(ptr0v_tail, vget_low_s32(input0_v));
152
153 // Calculate tmp1 = (*ptr0) * input1.
154 tmp1_tail = vqrdmulh_s32(ptr0v_tail, vget_low_s32(input1_v));
155
156 // Calculate tmp2 = tmp0 + *(ptr2).
157 tmp2_tail = vadd_s32(tmp0_tail, ptr2v_tail);
158
159 // Calculate *ptr2 = input2 * tmp2.
160 tmp2l_tail = vmull_s32(tmp2_tail, vget_low_s32(input2_v));
161 ptr2v_tail = vrshrn_n_s64(tmp2l_tail, 16);
162
163 vst1_s32(ptr2, ptr2v_tail);
164 ptr2 += 2;
165
166 // Calculate tmp3 = *(ptr2) * input0.
167 tmp3_tail = vqrdmulh_s32(ptr2v_tail, vget_low_s32(input0_v));
168
169 // Calculate *ptr1 = tmp1 + tmp3.
170 ptr1v_tail = vadd_s32(tmp1_tail, tmp3_tail);
171
172 vst1_s32(ptr1, ptr1v_tail);
173 ptr1 += 2;
174 }
175
176 // Process one more sample.
177 if (loop_tail & 0x1) {
178 int16_t t16a = (int16_t)(input2 >> 16);
179 int16_t t16b = (int16_t)input2;
180 if (t16b < 0) t16a++;
181 int32_t tmp32a;
182 int32_t tmp32b;
183
184 // Calculate *ptr2 = input2 * (*ptr2 + input0 * (*ptr0)).
185 tmp32a = WEBRTC_SPL_MUL_16_32_RSFT15(input0, *ptr0);
186 tmp32b = *ptr2 + tmp32a;
187 *ptr2 = (int32_t)(WEBRTC_SPL_MUL(t16a, tmp32b) +
188 (WEBRTC_SPL_MUL_16_32_RSFT16(t16b, tmp32b)));
189
190 // Calculate *ptr1 = input1 * (*ptr0) + input0 * (*ptr2).
191 tmp32a = WEBRTC_SPL_MUL_16_32_RSFT15(input1, *ptr0);
192 tmp32b = WEBRTC_SPL_MUL_16_32_RSFT15(input0, *ptr2);
193 *ptr1 = tmp32a + tmp32b;
194 }
195 }
196