1 /*
2 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include "common_audio/signal_processing/include/signal_processing_library.h"
12 #include "rtc_base/system/arch.h"
13
14 #include <arm_neon.h>
15
DotProductWithScaleNeon(int32_t * cross_correlation,const int16_t * vector1,const int16_t * vector2,size_t length,int scaling)16 static inline void DotProductWithScaleNeon(int32_t* cross_correlation,
17 const int16_t* vector1,
18 const int16_t* vector2,
19 size_t length,
20 int scaling) {
21 size_t i = 0;
22 size_t len1 = length >> 3;
23 size_t len2 = length & 7;
24 int64x2_t sum0 = vdupq_n_s64(0);
25 int64x2_t sum1 = vdupq_n_s64(0);
26
27 for (i = len1; i > 0; i -= 1) {
28 int16x8_t seq1_16x8 = vld1q_s16(vector1);
29 int16x8_t seq2_16x8 = vld1q_s16(vector2);
30 #if defined(WEBRTC_ARCH_ARM64)
31 int32x4_t tmp0 = vmull_s16(vget_low_s16(seq1_16x8),
32 vget_low_s16(seq2_16x8));
33 int32x4_t tmp1 = vmull_high_s16(seq1_16x8, seq2_16x8);
34 #else
35 int32x4_t tmp0 = vmull_s16(vget_low_s16(seq1_16x8),
36 vget_low_s16(seq2_16x8));
37 int32x4_t tmp1 = vmull_s16(vget_high_s16(seq1_16x8),
38 vget_high_s16(seq2_16x8));
39 #endif
40 sum0 = vpadalq_s32(sum0, tmp0);
41 sum1 = vpadalq_s32(sum1, tmp1);
42 vector1 += 8;
43 vector2 += 8;
44 }
45
46 // Calculate the rest of the samples.
47 int64_t sum_res = 0;
48 for (i = len2; i > 0; i -= 1) {
49 sum_res += WEBRTC_SPL_MUL_16_16(*vector1, *vector2);
50 vector1++;
51 vector2++;
52 }
53
54 sum0 = vaddq_s64(sum0, sum1);
55 #if defined(WEBRTC_ARCH_ARM64)
56 int64_t sum2 = vaddvq_s64(sum0);
57 *cross_correlation = (int32_t)((sum2 + sum_res) >> scaling);
58 #else
59 int64x1_t shift = vdup_n_s64(-scaling);
60 int64x1_t sum2 = vadd_s64(vget_low_s64(sum0), vget_high_s64(sum0));
61 sum2 = vadd_s64(sum2, vdup_n_s64(sum_res));
62 sum2 = vshl_s64(sum2, shift);
63 vst1_lane_s32(cross_correlation, vreinterpret_s32_s64(sum2), 0);
64 #endif
65 }
66
67 /* NEON version of WebRtcSpl_CrossCorrelation() for ARM32/64 platforms. */
/* Fills cross_correlation[0 .. dim_cross_correlation - 1].
 *
 * Entry i is produced by DotProductWithScaleNeon() from the dim_seq samples
 * starting at seq1 and the dim_seq samples starting at
 * seq2 + step_seq2 * i, using right_shifts as the scaling argument.
 *
 * Parameters:
 *   cross_correlation      - output array, dim_cross_correlation entries.
 *   seq1                   - first input sequence, dim_seq samples read.
 *   seq2                   - base pointer of the second input sequence.
 *   dim_seq                - number of samples in each dot product.
 *   dim_cross_correlation  - number of output values to compute.
 *   right_shifts           - right shift applied to each 64-bit sum.
 *   step_seq2              - signed stride, in samples, between successive
 *                            starting positions in seq2.
 */
void WebRtcSpl_CrossCorrelationNeon(int32_t* cross_correlation,
                                    const int16_t* seq1,
                                    const int16_t* seq2,
                                    size_t dim_seq,
                                    size_t dim_cross_correlation,
                                    int right_shifts,
                                    int step_seq2) {
  size_t i;

  for (i = 0; i < dim_cross_correlation; i++) {
    // Compute the offset in a signed, pointer-sized type. Multiplying the
    // int stride directly by the size_t index would convert a negative
    // stride to a huge unsigned value and rely on pointer wraparound.
    const intptr_t offset = (intptr_t)step_seq2 * (intptr_t)i;

    DotProductWithScaleNeon(&cross_correlation[i],
                            seq1,
                            seq2 + offset,
                            dim_seq,
                            right_shifts);
  }
}
89