1 /*
2 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include <stddef.h>
12
13 #include "webrtc/modules/audio_coding/codecs/isac/fix/source/settings.h"
14 #include "webrtc/typedefs.h"
15
16 // Filter ar_g_Q0[] and ar_f_Q0[] through an AR filter with coefficients
17 // cth_Q15[] and sth_Q15[].
WebRtcIsacfix_FilterArLoop(int16_t * ar_g_Q0,int16_t * ar_f_Q0,int16_t * cth_Q15,int16_t * sth_Q15,size_t order_coef)18 void WebRtcIsacfix_FilterArLoop(int16_t* ar_g_Q0, // Input samples
19 int16_t* ar_f_Q0, // Input samples
20 int16_t* cth_Q15, // Filter coefficients
21 int16_t* sth_Q15, // Filter coefficients
22 size_t order_coef) { // order of the filter
23 int n = 0;
24
25 for (n = 0; n < HALF_SUBFRAMELEN - 1; n++) {
26 int count = (int)(order_coef - 1);
27 int offset;
28 #if !defined(MIPS_DSP_R1_LE)
29 int16_t* tmp_cth;
30 int16_t* tmp_sth;
31 int16_t* tmp_arg;
32 int32_t max_q16 = 0x7fff;
33 int32_t min_q16 = 0xffff8000;
34 #endif
35 // Declare variables used as temporary registers.
36 int32_t r0, r1, r2, t0, t1, t2, t_ar;
37
38 __asm __volatile (
39 ".set push \n\t"
40 ".set noreorder \n\t"
41 "bltz %[count], 2f \n\t"
42 " lh %[t_ar], 0(%[tmp]) \n\t"
43 // Inner loop
44 "1: \n\t"
45 "sll %[offset], %[count], 1 \n\t"
46 #if defined(MIPS_DSP_R1_LE)
47 "lhx %[r0], %[offset](%[cth_Q15]) \n\t"
48 "lhx %[r1], %[offset](%[sth_Q15]) \n\t"
49 "lhx %[r2], %[offset](%[ar_g_Q0]) \n\t"
50 #else
51 "addu %[tmp_cth], %[cth_Q15], %[offset] \n\t"
52 "addu %[tmp_sth], %[sth_Q15], %[offset] \n\t"
53 "addu %[tmp_arg], %[ar_g_Q0], %[offset] \n\t"
54 "lh %[r0], 0(%[tmp_cth]) \n\t"
55 "lh %[r1], 0(%[tmp_sth]) \n\t"
56 "lh %[r2], 0(%[tmp_arg]) \n\t"
57 #endif
58 "mul %[t0], %[r0], %[t_ar] \n\t"
59 "mul %[t1], %[r1], %[t_ar] \n\t"
60 "mul %[t2], %[r1], %[r2] \n\t"
61 "mul %[r0], %[r0], %[r2] \n\t"
62 "subu %[t0], %[t0], %[t2] \n\t"
63 "addu %[t1], %[t1], %[r0] \n\t"
64 #if defined(MIPS_DSP_R1_LE)
65 "shra_r.w %[t1], %[t1], 15 \n\t"
66 "shra_r.w %[t0], %[t0], 15 \n\t"
67 #else
68 "addiu %[t1], %[t1], 0x4000 \n\t"
69 "sra %[t1], %[t1], 15 \n\t"
70 "addiu %[t0], %[t0], 0x4000 \n\t"
71 "sra %[t0], %[t0], 15 \n\t"
72 #endif
73 "addiu %[offset], %[offset], 2 \n\t"
74 #if defined(MIPS_DSP_R1_LE)
75 "shll_s.w %[t1], %[t1], 16 \n\t"
76 "shll_s.w %[t_ar], %[t0], 16 \n\t"
77 #else
78 "slt %[r0], %[t1], %[max_q16] \n\t"
79 "slt %[r1], %[t0], %[max_q16] \n\t"
80 "movz %[t1], %[max_q16], %[r0] \n\t"
81 "movz %[t0], %[max_q16], %[r1] \n\t"
82 #endif
83 "addu %[offset], %[offset], %[ar_g_Q0] \n\t"
84 #if defined(MIPS_DSP_R1_LE)
85 "sra %[t1], %[t1], 16 \n\t"
86 "sra %[t_ar], %[t_ar], 16 \n\t"
87 #else
88 "slt %[r0], %[t1], %[min_q16] \n\t"
89 "slt %[r1], %[t0], %[min_q16] \n\t"
90 "movn %[t1], %[min_q16], %[r0] \n\t"
91 "movn %[t0], %[min_q16], %[r1] \n\t"
92 "addu %[t_ar], $zero, %[t0] \n\t"
93 #endif
94 "sh %[t1], 0(%[offset]) \n\t"
95 "bgtz %[count], 1b \n\t"
96 " addiu %[count], %[count], -1 \n\t"
97 "2: \n\t"
98 "sh %[t_ar], 0(%[tmp]) \n\t"
99 "sh %[t_ar], 0(%[ar_g_Q0]) \n\t"
100 ".set pop \n\t"
101 : [t_ar] "=&r" (t_ar), [count] "+r" (count), [offset] "=&r" (offset),
102 [r0] "=&r" (r0), [r1] "=&r" (r1), [r2] "=&r" (r2), [t0] "=&r" (t0),
103 #if !defined(MIPS_DSP_R1_LE)
104 [tmp_cth] "=&r" (tmp_cth), [tmp_sth] "=&r" (tmp_sth),
105 [tmp_arg] "=&r" (tmp_arg),
106 #endif
107 [t1] "=&r" (t1), [t2] "=&r" (t2)
108 : [tmp] "r" (&ar_f_Q0[n+1]), [cth_Q15] "r" (cth_Q15),
109 #if !defined(MIPS_DSP_R1_LE)
110 [max_q16] "r" (max_q16), [min_q16] "r" (min_q16),
111 #endif
112 [sth_Q15] "r" (sth_Q15), [ar_g_Q0] "r" (ar_g_Q0)
113 : "memory", "hi", "lo"
114 );
115 }
116 }
117
118 // MIPS optimization of the inner loop used for function
119 // WebRtcIsacfix_NormLatticeFilterMa(). It does:
120 //
121 // for 0 <= n < HALF_SUBFRAMELEN - 1:
122 // *ptr2 = input2 * (*ptr2) + input0 * (*ptr0));
123 // *ptr1 = input1 * (*ptr0) + input0 * (*ptr2);
124 //
125 // Note, function WebRtcIsacfix_FilterMaLoopMIPS and WebRtcIsacfix_FilterMaLoopC
126 // are not bit-exact. The accuracy of the MIPS function is same or better.
WebRtcIsacfix_FilterMaLoopMIPS(int16_t input0,int16_t input1,int32_t input2,int32_t * ptr0,int32_t * ptr1,int32_t * ptr2)127 void WebRtcIsacfix_FilterMaLoopMIPS(int16_t input0, // Filter coefficient
128 int16_t input1, // Filter coefficient
129 int32_t input2, // Inverse coeff (1/input1)
130 int32_t* ptr0, // Sample buffer
131 int32_t* ptr1, // Sample buffer
132 int32_t* ptr2) { // Sample buffer
133 #if defined(MIPS_DSP_R2_LE)
134 // MIPS DSPR2 version. 4 available accumulators allows loop unrolling 4 times.
135 // This variant is not bit-exact with WebRtcIsacfix_FilterMaLoopC, since we
136 // are exploiting 64-bit accumulators. The accuracy of the MIPS DSPR2 function
137 // is same or better.
138 int n = (HALF_SUBFRAMELEN - 1) >> 2;
139 int m = (HALF_SUBFRAMELEN - 1) & 3;
140
141 int r0, r1, r2, r3;
142 int t0, t1, t2, t3;
143 int s0, s1, s2, s3;
144
145 __asm __volatile (
146 ".set push \n\t"
147 ".set noreorder \n\t"
148 "1: \n\t"
149 "lw %[r0], 0(%[ptr0]) \n\t"
150 "lw %[r1], 4(%[ptr0]) \n\t"
151 "lw %[r2], 8(%[ptr0]) \n\t"
152 "lw %[r3], 12(%[ptr0]) \n\t"
153 "mult $ac0, %[r0], %[input0] \n\t"
154 "mult $ac1, %[r1], %[input0] \n\t"
155 "mult $ac2, %[r2], %[input0] \n\t"
156 "mult $ac3, %[r3], %[input0] \n\t"
157 "lw %[t0], 0(%[ptr2]) \n\t"
158 "extr_rs.w %[s0], $ac0, 15 \n\t"
159 "extr_rs.w %[s1], $ac1, 15 \n\t"
160 "extr_rs.w %[s2], $ac2, 15 \n\t"
161 "extr_rs.w %[s3], $ac3, 15 \n\t"
162 "lw %[t1], 4(%[ptr2]) \n\t"
163 "lw %[t2], 8(%[ptr2]) \n\t"
164 "lw %[t3], 12(%[ptr2]) \n\t"
165 "addu %[t0], %[t0], %[s0] \n\t"
166 "addu %[t1], %[t1], %[s1] \n\t"
167 "addu %[t2], %[t2], %[s2] \n\t"
168 "addu %[t3], %[t3], %[s3] \n\t"
169 "mult $ac0, %[t0], %[input2] \n\t"
170 "mult $ac1, %[t1], %[input2] \n\t"
171 "mult $ac2, %[t2], %[input2] \n\t"
172 "mult $ac3, %[t3], %[input2] \n\t"
173 "addiu %[ptr0], %[ptr0], 16 \n\t"
174 "extr_rs.w %[t0], $ac0, 16 \n\t"
175 "extr_rs.w %[t1], $ac1, 16 \n\t"
176 "extr_rs.w %[t2], $ac2, 16 \n\t"
177 "extr_rs.w %[t3], $ac3, 16 \n\t"
178 "addiu %[n], %[n], -1 \n\t"
179 "mult $ac0, %[r0], %[input1] \n\t"
180 "mult $ac1, %[r1], %[input1] \n\t"
181 "mult $ac2, %[r2], %[input1] \n\t"
182 "mult $ac3, %[r3], %[input1] \n\t"
183 "sw %[t0], 0(%[ptr2]) \n\t"
184 "extr_rs.w %[s0], $ac0, 15 \n\t"
185 "extr_rs.w %[s1], $ac1, 15 \n\t"
186 "extr_rs.w %[s2], $ac2, 15 \n\t"
187 "extr_rs.w %[s3], $ac3, 15 \n\t"
188 "sw %[t1], 4(%[ptr2]) \n\t"
189 "sw %[t2], 8(%[ptr2]) \n\t"
190 "sw %[t3], 12(%[ptr2]) \n\t"
191 "mult $ac0, %[t0], %[input0] \n\t"
192 "mult $ac1, %[t1], %[input0] \n\t"
193 "mult $ac2, %[t2], %[input0] \n\t"
194 "mult $ac3, %[t3], %[input0] \n\t"
195 "addiu %[ptr2], %[ptr2], 16 \n\t"
196 "extr_rs.w %[t0], $ac0, 15 \n\t"
197 "extr_rs.w %[t1], $ac1, 15 \n\t"
198 "extr_rs.w %[t2], $ac2, 15 \n\t"
199 "extr_rs.w %[t3], $ac3, 15 \n\t"
200 "addu %[t0], %[t0], %[s0] \n\t"
201 "addu %[t1], %[t1], %[s1] \n\t"
202 "addu %[t2], %[t2], %[s2] \n\t"
203 "addu %[t3], %[t3], %[s3] \n\t"
204 "sw %[t0], 0(%[ptr1]) \n\t"
205 "sw %[t1], 4(%[ptr1]) \n\t"
206 "sw %[t2], 8(%[ptr1]) \n\t"
207 "sw %[t3], 12(%[ptr1]) \n\t"
208 "bgtz %[n], 1b \n\t"
209 " addiu %[ptr1], %[ptr1], 16 \n\t"
210 "beq %[m], %0, 3f \n\t"
211 " nop \n\t"
212 "2: \n\t"
213 "lw %[r0], 0(%[ptr0]) \n\t"
214 "lw %[t0], 0(%[ptr2]) \n\t"
215 "addiu %[ptr0], %[ptr0], 4 \n\t"
216 "mult $ac0, %[r0], %[input0] \n\t"
217 "mult $ac1, %[r0], %[input1] \n\t"
218 "extr_rs.w %[r1], $ac0, 15 \n\t"
219 "extr_rs.w %[t1], $ac1, 15 \n\t"
220 "addu %[t0], %[t0], %[r1] \n\t"
221 "mult $ac0, %[t0], %[input2] \n\t"
222 "extr_rs.w %[t0], $ac0, 16 \n\t"
223 "sw %[t0], 0(%[ptr2]) \n\t"
224 "mult $ac0, %[t0], %[input0] \n\t"
225 "addiu %[ptr2], %[ptr2], 4 \n\t"
226 "addiu %[m], %[m], -1 \n\t"
227 "extr_rs.w %[t0], $ac0, 15 \n\t"
228 "addu %[t0], %[t0], %[t1] \n\t"
229 "sw %[t0], 0(%[ptr1]) \n\t"
230 "bgtz %[m], 2b \n\t"
231 " addiu %[ptr1], %[ptr1], 4 \n\t"
232 "3: \n\t"
233 ".set pop \n\t"
234 : [r0] "=&r" (r0), [r1] "=&r" (r1), [r2] "=&r" (r2),
235 [r3] "=&r" (r3), [t0] "=&r" (t0), [t1] "=&r" (t1),
236 [t2] "=&r" (t2), [t3] "=&r" (t3), [s0] "=&r" (s0),
237 [s1] "=&r" (s1), [s2] "=&r" (s2), [s3] "=&r" (s3),
238 [ptr0] "+r" (ptr0), [ptr1] "+r" (ptr1), [m] "+r" (m),
239 [ptr2] "+r" (ptr2), [n] "+r" (n)
240 : [input0] "r" (input0), [input1] "r" (input1),
241 [input2] "r" (input2)
242 : "memory", "hi", "lo", "$ac1hi", "$ac1lo", "$ac2hi",
243 "$ac2lo", "$ac3hi", "$ac3lo"
244 );
245 #else
246 // Non-DSPR2 version of the function. Avoiding the accumulator usage due to
247 // large latencies. This variant is bit-exact with C code.
248 int n = HALF_SUBFRAMELEN - 1;
249 int32_t t16a, t16b;
250 int32_t r0, r1, r2, r3, r4;
251
252 __asm __volatile (
253 ".set push \n\t"
254 ".set noreorder \n\t"
255 "sra %[t16a], %[input2], 16 \n\t"
256 "andi %[t16b], %[input2], 0xFFFF \n\t"
257 #if defined(MIPS32R2_LE)
258 "seh %[t16b], %[t16b] \n\t"
259 "seh %[input0], %[input0] \n\t"
260 "seh %[input1], %[input1] \n\t"
261 #else
262 "sll %[t16b], %[t16b], 16 \n\t"
263 "sra %[t16b], %[t16b], 16 \n\t"
264 "sll %[input0], %[input0], 16 \n\t"
265 "sra %[input0], %[input0], 16 \n\t"
266 "sll %[input1], %[input1], 16 \n\t"
267 "sra %[input1], %[input1], 16 \n\t"
268 #endif
269 "addiu %[r0], %[t16a], 1 \n\t"
270 "slt %[r1], %[t16b], $zero \n\t"
271 "movn %[t16a], %[r0], %[r1] \n\t"
272 "1: \n\t"
273 "lw %[r0], 0(%[ptr0]) \n\t"
274 "lw %[r1], 0(%[ptr2]) \n\t"
275 "addiu %[ptr0], %[ptr0], 4 \n\t"
276 "sra %[r2], %[r0], 16 \n\t"
277 "andi %[r0], %[r0], 0xFFFF \n\t"
278 "mul %[r3], %[r2], %[input0] \n\t"
279 "mul %[r4], %[r0], %[input0] \n\t"
280 "mul %[r2], %[r2], %[input1] \n\t"
281 "mul %[r0], %[r0], %[input1] \n\t"
282 "addiu %[ptr2], %[ptr2], 4 \n\t"
283 "sll %[r3], %[r3], 1 \n\t"
284 "sra %[r4], %[r4], 1 \n\t"
285 "addiu %[r4], %[r4], 0x2000 \n\t"
286 "sra %[r4], %[r4], 14 \n\t"
287 "addu %[r3], %[r3], %[r4] \n\t"
288 "addu %[r1], %[r1], %[r3] \n\t"
289 "sra %[r3], %[r1], 16 \n\t"
290 "andi %[r4], %[r1], 0xFFFF \n\t"
291 "sra %[r4], %[r4], 1 \n\t"
292 "mul %[r1], %[r1], %[t16a] \n\t"
293 "mul %[r3], %[r3], %[t16b] \n\t"
294 "mul %[r4], %[r4], %[t16b] \n\t"
295 "sll %[r2], %[r2], 1 \n\t"
296 "sra %[r0], %[r0], 1 \n\t"
297 "addiu %[r0], %[r0], 0x2000 \n\t"
298 "sra %[r0], %[r0], 14 \n\t"
299 "addu %[r0], %[r0], %[r2] \n\t"
300 "addiu %[n], %[n], -1 \n\t"
301 "addu %[r1], %[r1], %[r3] \n\t"
302 "addiu %[r4], %[r4], 0x4000 \n\t"
303 "sra %[r4], %[r4], 15 \n\t"
304 "addu %[r1], %[r1], %[r4] \n\t"
305 "sra %[r2], %[r1], 16 \n\t"
306 "andi %[r3], %[r1], 0xFFFF \n\t"
307 "mul %[r3], %[r3], %[input0] \n\t"
308 "mul %[r2], %[r2], %[input0] \n\t"
309 "sw %[r1], -4(%[ptr2]) \n\t"
310 "sra %[r3], %[r3], 1 \n\t"
311 "addiu %[r3], %[r3], 0x2000 \n\t"
312 "sra %[r3], %[r3], 14 \n\t"
313 "addu %[r0], %[r0], %[r3] \n\t"
314 "sll %[r2], %[r2], 1 \n\t"
315 "addu %[r0], %[r0], %[r2] \n\t"
316 "sw %[r0], 0(%[ptr1]) \n\t"
317 "bgtz %[n], 1b \n\t"
318 " addiu %[ptr1], %[ptr1], 4 \n\t"
319 ".set pop \n\t"
320 : [t16a] "=&r" (t16a), [t16b] "=&r" (t16b), [r0] "=&r" (r0),
321 [r1] "=&r" (r1), [r2] "=&r" (r2), [r3] "=&r" (r3),
322 [r4] "=&r" (r4), [ptr0] "+r" (ptr0), [ptr1] "+r" (ptr1),
323 [ptr2] "+r" (ptr2), [n] "+r" (n)
324 : [input0] "r" (input0), [input1] "r" (input1),
325 [input2] "r" (input2)
326 : "hi", "lo", "memory"
327 );
328 #endif
329 }
330