1 /*
2 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include <arm_neon.h>
12 #include <stdlib.h>
13
14 #include "rtc_base/checks.h"
15 #include "common_audio/signal_processing/include/signal_processing_library.h"
16
// Maximum absolute value of word16 vector. NEON intrinsics version for
// ARM 32-bit/64-bit platforms.
//
// Scans `length` elements of `vector` (eight at a time with NEON, then one at
// a time for the leftover elements) and returns the largest magnitude found,
// clamped to WEBRTC_SPL_WORD16_MAX. `length` is expected to be non-zero.
int16_t WebRtcSpl_MaxAbsValueW16Neon(const int16_t* vector, size_t length) {
  RTC_DCHECK_GT(length, 0);

  const int16_t* cursor = vector;

  // Per-lane running maxima of the magnitudes, kept as unsigned 16-bit values.
  uint16x8_t lane_max = vdupq_n_u16(0);

  // Vector part: one iteration per complete group of eight elements.
  for (size_t groups = length >> 3; groups > 0; --groups) {
    // vabs leaves -32768 unchanged, so the lanes are reinterpreted as u16
    // before comparing; that way the value is not lost.
    const int16x8_t magnitudes = vabsq_s16(vld1q_s16(cursor));
    lane_max = vmaxq_u16(lane_max, vreinterpretq_u16_s16(magnitudes));
    cursor += 8;
  }

  // Horizontal reduction of the eight lanes down to a single scalar.
  int largest;
#ifdef WEBRTC_ARCH_ARM64
  largest = (int)vmaxvq_u16(lane_max);
#else
  uint16x4_t half_max =
      vmax_u16(vget_low_u16(lane_max), vget_high_u16(lane_max));
  half_max = vpmax_u16(half_max, half_max);
  half_max = vpmax_u16(half_max, half_max);

  largest = (int)vget_lane_u16(half_max, 0);
#endif

  // Scalar part: the (length & 7) elements that did not fill a whole group.
  for (size_t left = length & 7; left > 0; --left) {
    const int magnitude = abs((int)(*cursor));
    if (magnitude > largest) {
      largest = magnitude;
    }
    ++cursor;
  }

  // abs(-32768) does not fit in int16_t, so clamp before narrowing.
  return (int16_t)((largest > WEBRTC_SPL_WORD16_MAX) ? WEBRTC_SPL_WORD16_MAX
                                                     : largest);
}
68
// Maximum absolute value of word32 vector. NEON intrinsics version for
// ARM 32-bit/64-bit platforms.
//
// Scans `length` elements of `vector` (eight at a time with NEON, then one at
// a time for the leftover elements) and returns the largest magnitude found,
// clamped to WEBRTC_SPL_WORD32_MAX. `length` is expected to be non-zero.
int32_t WebRtcSpl_MaxAbsValueW32Neon(const int32_t* vector, size_t length) {
  // Use uint32_t for the local variables, to accommodate the magnitude of
  // 0x80000000, which is 0x80000000 and does not fit in int32_t.
  uint32_t absolute = 0, maximum = 0;
  size_t i = 0;
  size_t residual = length & 0x7;

  RTC_DCHECK_GT(length, 0);

  const int32_t* p_start = vector;
  uint32x4_t max32x4_0 = vdupq_n_u32(0);
  uint32x4_t max32x4_1 = vdupq_n_u32(0);

  // First part, unroll the loop 8 times.
  for (i = 0; i < length - residual; i += 8) {
    int32x4_t in32x4_0 = vld1q_s32(p_start);
    p_start += 4;
    int32x4_t in32x4_1 = vld1q_s32(p_start);
    p_start += 4;
    in32x4_0 = vabsq_s32(in32x4_0);
    in32x4_1 = vabsq_s32(in32x4_1);
    // vabs doesn't change the value of 0x80000000.
    // Use u32 so we don't lose the value 0x80000000.
    max32x4_0 = vmaxq_u32(max32x4_0, vreinterpretq_u32_s32(in32x4_0));
    max32x4_1 = vmaxq_u32(max32x4_1, vreinterpretq_u32_s32(in32x4_1));
  }

  // Merge the two accumulators, then reduce the four lanes to one scalar.
  uint32x4_t max32x4 = vmaxq_u32(max32x4_0, max32x4_1);
#if defined(WEBRTC_ARCH_ARM64)
  maximum = vmaxvq_u32(max32x4);
#else
  uint32x2_t max32x2 = vmax_u32(vget_low_u32(max32x4), vget_high_u32(max32x4));
  max32x2 = vpmax_u32(max32x2, max32x2);

  maximum = vget_lane_u32(max32x2, 0);
#endif

  // Second part, do the remaining iterations (if any).
  for (i = residual; i > 0; i--) {
    // Compute the magnitude in unsigned arithmetic. Calling abs() on the most
    // negative value is undefined behavior, whereas unsigned negation wraps
    // and yields 0x80000000, matching what the vector path produces.
    const int32_t value = *p_start;
    absolute = (value < 0) ? ((uint32_t)0 - (uint32_t)value) : (uint32_t)value;
    if (absolute > maximum) {
      maximum = absolute;
    }
    p_start++;
  }

  // Guard against the case for 0x80000000.
  maximum = WEBRTC_SPL_MIN(maximum, WEBRTC_SPL_WORD32_MAX);

  return (int32_t)maximum;
}
123
// Maximum value of word16 vector. NEON intrinsics version for
// ARM 32-bit/64-bit platforms.
//
// Returns the largest signed element among the `length` elements of `vector`.
// `length` is expected to be non-zero.
int16_t WebRtcSpl_MaxValueW16Neon(const int16_t* vector, size_t length) {
  RTC_DCHECK_GT(length, 0);

  const int16_t* cursor = vector;

  // Per-lane running maxima, seeded with the smallest representable value.
  int16x8_t lane_max = vdupq_n_s16(WEBRTC_SPL_WORD16_MIN);

  // Vector part: one iteration per complete group of eight elements.
  for (size_t groups = length >> 3; groups > 0; --groups) {
    lane_max = vmaxq_s16(lane_max, vld1q_s16(cursor));
    cursor += 8;
  }

  // Horizontal reduction of the eight lanes down to a single scalar.
  int16_t largest;
#if defined(WEBRTC_ARCH_ARM64)
  largest = vmaxvq_s16(lane_max);
#else
  int16x4_t half_max =
      vmax_s16(vget_low_s16(lane_max), vget_high_s16(lane_max));
  half_max = vpmax_s16(half_max, half_max);
  half_max = vpmax_s16(half_max, half_max);

  largest = vget_lane_s16(half_max, 0);
#endif

  // Scalar part: the (length & 7) elements that did not fill a whole group.
  for (size_t left = length & 0x7; left > 0; --left) {
    if (*cursor > largest) {
      largest = *cursor;
    }
    ++cursor;
  }
  return largest;
}
161
// Maximum value of word32 vector. NEON intrinsics version for
// ARM 32-bit/64-bit platforms.
//
// Returns the largest signed element among the `length` elements of `vector`.
// `length` is expected to be non-zero.
int32_t WebRtcSpl_MaxValueW32Neon(const int32_t* vector, size_t length) {
  RTC_DCHECK_GT(length, 0);

  const int32_t* cursor = vector;

  // Two independent accumulators, one for each half of an 8-element group,
  // both seeded with the smallest representable value.
  int32x4_t lane_max_lo = vdupq_n_s32(WEBRTC_SPL_WORD32_MIN);
  int32x4_t lane_max_hi = vdupq_n_s32(WEBRTC_SPL_WORD32_MIN);

  // Vector part: one iteration per complete group of eight elements.
  for (size_t groups = length >> 3; groups > 0; --groups) {
    const int32x4_t first_half = vld1q_s32(cursor);
    const int32x4_t second_half = vld1q_s32(cursor + 4);
    lane_max_lo = vmaxq_s32(lane_max_lo, first_half);
    lane_max_hi = vmaxq_s32(lane_max_hi, second_half);
    cursor += 8;
  }

  // Merge the two accumulators, then reduce the four lanes to one scalar.
  const int32x4_t lane_max = vmaxq_s32(lane_max_lo, lane_max_hi);
  int32_t largest;
#if defined(WEBRTC_ARCH_ARM64)
  largest = vmaxvq_s32(lane_max);
#else
  int32x2_t pair_max =
      vmax_s32(vget_low_s32(lane_max), vget_high_s32(lane_max));
  pair_max = vpmax_s32(pair_max, pair_max);

  largest = vget_lane_s32(pair_max, 0);
#endif

  // Scalar part: the (length & 7) elements that did not fill a whole group.
  for (size_t left = length & 0x7; left > 0; --left) {
    if (*cursor > largest) {
      largest = *cursor;
    }
    ++cursor;
  }
  return largest;
}
203
// Minimum value of word16 vector. NEON intrinsics version for
// ARM 32-bit/64-bit platforms.
//
// Returns the smallest signed element among the `length` elements of
// `vector`. `length` is expected to be non-zero.
int16_t WebRtcSpl_MinValueW16Neon(const int16_t* vector, size_t length) {
  RTC_DCHECK_GT(length, 0);

  const int16_t* cursor = vector;

  // Per-lane running minima, seeded with the largest representable value.
  int16x8_t lane_min = vdupq_n_s16(WEBRTC_SPL_WORD16_MAX);

  // Vector part: one iteration per complete group of eight elements.
  for (size_t groups = length >> 3; groups > 0; --groups) {
    lane_min = vminq_s16(lane_min, vld1q_s16(cursor));
    cursor += 8;
  }

  // Horizontal reduction of the eight lanes down to a single scalar.
  int16_t smallest;
#if defined(WEBRTC_ARCH_ARM64)
  smallest = vminvq_s16(lane_min);
#else
  int16x4_t half_min =
      vmin_s16(vget_low_s16(lane_min), vget_high_s16(lane_min));
  half_min = vpmin_s16(half_min, half_min);
  half_min = vpmin_s16(half_min, half_min);

  smallest = vget_lane_s16(half_min, 0);
#endif

  // Scalar part: the (length & 7) elements that did not fill a whole group.
  for (size_t left = length & 0x7; left > 0; --left) {
    if (*cursor < smallest) {
      smallest = *cursor;
    }
    ++cursor;
  }
  return smallest;
}
241
// Minimum value of word32 vector. NEON intrinsics version for
// ARM 32-bit/64-bit platforms.
//
// Returns the smallest signed element among the `length` elements of
// `vector`. `length` is expected to be non-zero.
int32_t WebRtcSpl_MinValueW32Neon(const int32_t* vector, size_t length) {
  RTC_DCHECK_GT(length, 0);

  const int32_t* cursor = vector;

  // Two independent accumulators, one for each half of an 8-element group,
  // both seeded with the largest representable value.
  int32x4_t lane_min_lo = vdupq_n_s32(WEBRTC_SPL_WORD32_MAX);
  int32x4_t lane_min_hi = vdupq_n_s32(WEBRTC_SPL_WORD32_MAX);

  // Vector part: one iteration per complete group of eight elements.
  for (size_t groups = length >> 3; groups > 0; --groups) {
    const int32x4_t first_half = vld1q_s32(cursor);
    const int32x4_t second_half = vld1q_s32(cursor + 4);
    lane_min_lo = vminq_s32(lane_min_lo, first_half);
    lane_min_hi = vminq_s32(lane_min_hi, second_half);
    cursor += 8;
  }

  // Merge the two accumulators, then reduce the four lanes to one scalar.
  const int32x4_t lane_min = vminq_s32(lane_min_lo, lane_min_hi);
  int32_t smallest;
#if defined(WEBRTC_ARCH_ARM64)
  smallest = vminvq_s32(lane_min);
#else
  int32x2_t pair_min =
      vmin_s32(vget_low_s32(lane_min), vget_high_s32(lane_min));
  pair_min = vpmin_s32(pair_min, pair_min);

  smallest = vget_lane_s32(pair_min, 0);
#endif

  // Scalar part: the (length & 7) elements that did not fill a whole group.
  for (size_t left = length & 0x7; left > 0; --left) {
    if (*cursor < smallest) {
      smallest = *cursor;
    }
    ++cursor;
  }
  return smallest;
}
283
// Finds both the minimum and maximum elements in an array of 16-bit integers.
// NEON intrinsics version.
//
// Scans the `length` elements of `vector` once, then stores the smallest
// element through `min_val` and the largest through `max_val`. `length` is
// expected to be non-zero.
void WebRtcSpl_MinMaxW16Neon(const int16_t* vector, size_t length,
                             int16_t* min_val, int16_t* max_val) {
  RTC_DCHECK_GT(length, 0);

  const int16_t* cursor = vector;

  // Per-lane running extrema, each seeded with the opposite end of the range.
  int16x8_t lane_min = vdupq_n_s16(WEBRTC_SPL_WORD16_MAX);
  int16x8_t lane_max = vdupq_n_s16(WEBRTC_SPL_WORD16_MIN);

  // Vector part: one iteration per complete group of eight elements; each
  // load feeds both the minimum and the maximum accumulator.
  for (size_t groups = length >> 3; groups > 0; --groups) {
    const int16x8_t samples = vld1q_s16(cursor);
    lane_min = vminq_s16(lane_min, samples);
    lane_max = vmaxq_s16(lane_max, samples);
    cursor += 8;
  }

  // Horizontal reduction of each accumulator down to a single scalar.
  int16_t smallest;
  int16_t largest;
#if defined(WEBRTC_ARCH_ARM64)
  smallest = vminvq_s16(lane_min);
  largest = vmaxvq_s16(lane_max);
#else
  int16x4_t half_min =
      vmin_s16(vget_low_s16(lane_min), vget_high_s16(lane_min));
  half_min = vpmin_s16(half_min, half_min);
  half_min = vpmin_s16(half_min, half_min);

  smallest = vget_lane_s16(half_min, 0);

  int16x4_t half_max =
      vmax_s16(vget_low_s16(lane_max), vget_high_s16(lane_max));
  half_max = vpmax_s16(half_max, half_max);
  half_max = vpmax_s16(half_max, half_max);

  largest = vget_lane_s16(half_max, 0);
#endif

  // Scalar part: the (length & 7) elements that did not fill a whole group.
  for (size_t left = length & 0x7; left > 0; --left) {
    if (*cursor < smallest) {
      smallest = *cursor;
    }
    if (*cursor > largest) {
      largest = *cursor;
    }
    ++cursor;
  }

  *min_val = smallest;
  *max_val = largest;
}
334