1 /*
2 * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 /* This file contains WebRtcIsacfix_MatrixProduct1Neon() and
12 * WebRtcIsacfix_MatrixProduct2Neon() for ARM Neon platform. API's are in
13 * entropy_coding.c. Results are bit exact with the c code for
14 * generic platforms.
15 */
16
17 #include <arm_neon.h>
18 #include <stddef.h>
19
20 #include "modules/audio_coding/codecs/isac/fix/source/entropy_coding.h"
21 #include "common_audio/signal_processing/include/signal_processing_library.h"
22 #include "rtc_base/checks.h"
23
/* General matrix product for the iSAC fixed-point entropy coder (NEON
 * version; API declared in entropy_coding.h).
 *
 * For each of the SUBFRAMES output rows, computes mid_loop_count inner
 * products of length inner_loop_count between 16-bit entries of matrix0
 * and 32-bit entries of matrix1.  The scalar fallback branch at the bottom
 * defines the reference arithmetic:
 *
 *   sum += WEBRTC_SPL_MUL_16_32_RSFT16(m0, m1 << shift)
 *        == (m0 * (m1 << shift)) >> 16
 *
 * The NEON branches reproduce this via vqdmulh(m0 << 15, m1 << shift),
 * since vqdmulh(a, b) == (a * b * 2) >> 32; the vq* intrinsics saturate
 * on overflow.
 *
 * Which of the loop counters j/k drives the matrix0 vs. matrix1 base index,
 * and the per-element strides, are selected by matrix1_index_init_case,
 * matrix*_index_factor1 and matrix*_index_step.  Three specialized SIMD
 * branches cover the index patterns used by the entropy coder; a scalar
 * loop handles every other combination.
 */
void WebRtcIsacfix_MatrixProduct1Neon(const int16_t matrix0[],
                                      const int32_t matrix1[],
                                      int32_t matrix_product[],
                                      const int matrix1_index_factor1,
                                      const int matrix0_index_factor1,
                                      const int matrix1_index_init_case,
                                      const int matrix1_index_step,
                                      const int matrix0_index_step,
                                      const int inner_loop_count,
                                      const int mid_loop_count,
                                      const int shift) {
  int j = 0, k = 0, n = 0;
  int matrix1_index = 0, matrix0_index = 0, matrix_prod_index = 0;
  /* In the scalar fallback, these aliases decide whether j or k drives
   * each matrix's base index, depending on matrix1_index_init_case. */
  int* matrix1_index_factor2 = &j;
  int* matrix0_index_factor2 = &k;
  if (matrix1_index_init_case != 0) {
    matrix1_index_factor2 = &k;
    matrix0_index_factor2 = &j;
  }
  int32x4_t shift32x4 = vdupq_n_s32(shift);
  int32x2_t shift32x2 = vdup_n_s32(shift);
  int32x4_t sum_32x4 = vdupq_n_s32(0);
  int32x2_t sum_32x2 = vdup_n_s32(0);

  /* Even counts guarantee the 2-wide remainder paths below never leave a
   * single stray element behind. */
  RTC_DCHECK_EQ(0, inner_loop_count % 2);
  RTC_DCHECK_EQ(0, mid_loop_count % 2);

  /* Branch 1: vectorize across four adjacent output columns.  matrix0 is
   * broadcast into all lanes; matrix1 supplies four consecutive entries
   * (one per output column). */
  if (matrix1_index_init_case != 0 && matrix1_index_factor1 == 1) {
    for (j = 0; j < SUBFRAMES; j++) {
      matrix_prod_index = mid_loop_count * j;
      /* (x >> 2) << 2 rounds the column count down to a multiple of 4. */
      for (k = 0; k < (mid_loop_count >> 2) << 2; k += 4) {
        sum_32x4 = veorq_s32(sum_32x4, sum_32x4);  // Initialize to zeros.
        matrix1_index = k;
        matrix0_index = matrix0_index_factor1 * j;
        for (n = 0; n < inner_loop_count; n++) {
          int32x4_t matrix0_32x4 =
              vdupq_n_s32((int32_t)(matrix0[matrix0_index]) << 15);
          int32x4_t matrix1_32x4 =
              vshlq_s32(vld1q_s32(&matrix1[matrix1_index]), shift32x4);
          int32x4_t multi_32x4 = vqdmulhq_s32(matrix0_32x4, matrix1_32x4);
          sum_32x4 = vqaddq_s32(sum_32x4, multi_32x4);
          matrix1_index += matrix1_index_step;
          matrix0_index += matrix0_index_step;
        }
        vst1q_s32(&matrix_product[matrix_prod_index], sum_32x4);
        matrix_prod_index += 4;
      }
      /* Remainder of 2 columns (counts are even, so mod 4 is 0 or 2). */
      if (mid_loop_count % 4 > 1) {
        sum_32x2 = veor_s32(sum_32x2, sum_32x2);  // Initialize to zeros.
        matrix1_index = k;
        k += 2;
        matrix0_index = matrix0_index_factor1 * j;
        for (n = 0; n < inner_loop_count; n++) {
          int32x2_t matrix0_32x2 =
              vdup_n_s32((int32_t)(matrix0[matrix0_index]) << 15);
          int32x2_t matrix1_32x2 =
              vshl_s32(vld1_s32(&matrix1[matrix1_index]), shift32x2);
          int32x2_t multi_32x2 = vqdmulh_s32(matrix0_32x2, matrix1_32x2);
          sum_32x2 = vqadd_s32(sum_32x2, multi_32x2);
          matrix1_index += matrix1_index_step;
          matrix0_index += matrix0_index_step;
        }
        vst1_s32(&matrix_product[matrix_prod_index], sum_32x2);
        matrix_prod_index += 2;
      }
    }
  }
  /* Branch 2: again four output columns per iteration, but with the roles
   * swapped — matrix1 is broadcast, matrix0 supplies four consecutive
   * 16-bit entries widened to 32 bits with the << 15 folded into vshll. */
  else if (matrix1_index_init_case == 0 && matrix0_index_factor1 == 1) {
    int32x2_t multi_32x2 = vdup_n_s32(0);
    int32x2_t matrix0_32x2 = vdup_n_s32(0);
    for (j = 0; j < SUBFRAMES; j++) {
      matrix_prod_index = mid_loop_count * j;
      for (k = 0; k < (mid_loop_count >> 2) << 2; k += 4) {
        sum_32x4 = veorq_s32(sum_32x4, sum_32x4);  // Initialize to zeros.
        matrix1_index = matrix1_index_factor1 * j;
        matrix0_index = k;
        for (n = 0; n < inner_loop_count; n++) {
          int32x4_t matrix1_32x4 = vdupq_n_s32(matrix1[matrix1_index] << shift);
          int32x4_t matrix0_32x4 =
              vshll_n_s16(vld1_s16(&matrix0[matrix0_index]), 15);
          int32x4_t multi_32x4 = vqdmulhq_s32(matrix0_32x4, matrix1_32x4);
          sum_32x4 = vqaddq_s32(sum_32x4, multi_32x4);
          matrix1_index += matrix1_index_step;
          matrix0_index += matrix0_index_step;
        }
        vst1q_s32(&matrix_product[matrix_prod_index], sum_32x4);
        matrix_prod_index += 4;
      }
      /* Remainder of 2 columns, built lane by lane (no 2-lane vshll). */
      if (mid_loop_count % 4 > 1) {
        sum_32x2 = veor_s32(sum_32x2, sum_32x2);  // Initialize to zeros.
        matrix1_index = matrix1_index_factor1 * j;
        matrix0_index = k;
        for (n = 0; n < inner_loop_count; n++) {
          int32x2_t matrix1_32x2 = vdup_n_s32(matrix1[matrix1_index] << shift);
          matrix0_32x2 =
              vset_lane_s32((int32_t)matrix0[matrix0_index], matrix0_32x2, 0);
          matrix0_32x2 = vset_lane_s32((int32_t)matrix0[matrix0_index + 1],
                                       matrix0_32x2, 1);
          matrix0_32x2 = vshl_n_s32(matrix0_32x2, 15);
          multi_32x2 = vqdmulh_s32(matrix1_32x2, matrix0_32x2);
          sum_32x2 = vqadd_s32(sum_32x2, multi_32x2);
          matrix1_index += matrix1_index_step;
          matrix0_index += matrix0_index_step;
        }
        vst1_s32(&matrix_product[matrix_prod_index], sum_32x2);
        matrix_prod_index += 2;
      }
    }
  }
  /* Branch 3: unit strides — vectorize along the inner (dot-product)
   * dimension instead, four products per iteration, then reduce the four
   * lanes horizontally to one scalar output. */
  else if (matrix1_index_init_case == 0 &&
           matrix1_index_step == 1 &&
           matrix0_index_step == 1) {
    int32x2_t multi_32x2 = vdup_n_s32(0);
    int32x2_t matrix0_32x2 = vdup_n_s32(0);
    for (j = 0; j < SUBFRAMES; j++) {
      matrix_prod_index = mid_loop_count * j;
      for (k = 0; k < mid_loop_count; k++) {
        sum_32x4 = veorq_s32(sum_32x4, sum_32x4);  // Initialize to zeros.
        matrix1_index = matrix1_index_factor1 * j;
        matrix0_index = matrix0_index_factor1 * k;
        for (n = 0; n < (inner_loop_count >> 2) << 2; n += 4) {
          int32x4_t matrix1_32x4 =
              vshlq_s32(vld1q_s32(&matrix1[matrix1_index]), shift32x4);
          int32x4_t matrix0_32x4 =
              vshll_n_s16(vld1_s16(&matrix0[matrix0_index]), 15);
          int32x4_t multi_32x4 = vqdmulhq_s32(matrix0_32x4, matrix1_32x4);
          sum_32x4 = vqaddq_s32(sum_32x4, multi_32x4);
          matrix1_index += 4;
          matrix0_index += 4;
        }
        /* Fold 4 lanes down to 2 before handling the tail. */
        sum_32x2 = vqadd_s32(vget_low_s32(sum_32x4), vget_high_s32(sum_32x4));
        /* Tail of 2 inner-loop elements (inner count is even). */
        if (inner_loop_count % 4 > 1) {
          int32x2_t matrix1_32x2 =
              vshl_s32(vld1_s32(&matrix1[matrix1_index]), shift32x2);
          matrix0_32x2 =
              vset_lane_s32((int32_t)matrix0[matrix0_index], matrix0_32x2, 0);
          matrix0_32x2 = vset_lane_s32((int32_t)matrix0[matrix0_index + 1],
                                       matrix0_32x2, 1);
          matrix0_32x2 = vshl_n_s32(matrix0_32x2, 15);
          multi_32x2 = vqdmulh_s32(matrix1_32x2, matrix0_32x2);
          sum_32x2 = vqadd_s32(sum_32x2, multi_32x2);
        }
        /* Final pairwise add collapses 2 lanes to the scalar result. */
        sum_32x2 = vpadd_s32(sum_32x2, sum_32x2);
        vst1_lane_s32(&matrix_product[matrix_prod_index], sum_32x2, 0);
        matrix_prod_index++;
      }
    }
  }
  /* Scalar fallback for all other index configurations; this is the
   * reference arithmetic the SIMD branches above are bit exact with
   * (note: no saturation here, unlike the vq* paths). */
  else {
    for (j = 0; j < SUBFRAMES; j++) {
      matrix_prod_index = mid_loop_count * j;
      for (k=0; k < mid_loop_count; k++) {
        int32_t sum32 = 0;
        matrix1_index = matrix1_index_factor1 * (*matrix1_index_factor2);
        matrix0_index = matrix0_index_factor1 * (*matrix0_index_factor2);
        for (n = 0; n < inner_loop_count; n++) {
          sum32 += (WEBRTC_SPL_MUL_16_32_RSFT16(matrix0[matrix0_index],
                                                matrix1[matrix1_index] << shift));
          matrix1_index += matrix1_index_step;
          matrix0_index += matrix0_index_step;
        }
        matrix_product[matrix_prod_index] = sum32;
        matrix_prod_index++;
      }
    }
  }
}
191
/* Second matrix product used by the entropy coder (NEON version; API in
 * entropy_coding.h).  For each of the SUBFRAMES rows it accumulates, two
 * output lanes at a time, saturating high-half products of one 16-bit
 * coefficient from matrix0 (broadcast, pre-shifted left by 15) with pairs
 * of consecutive 32-bit entries from matrix1, then arithmetic-shifts the
 * accumulated pair right by 3 before storing two outputs per row. */
void WebRtcIsacfix_MatrixProduct2Neon(const int16_t matrix0[],
                                      const int32_t matrix1[],
                                      int32_t matrix_product[],
                                      const int matrix0_index_factor,
                                      const int matrix0_index_step) {
  int row;
  int out_index = 0;
  for (row = 0; row < SUBFRAMES; row++) {
    int i;
    int index0 = matrix0_index_factor * row;
    int index1 = 0;
    int32x2_t acc_32x2 = vdup_n_s32(0);  // Zero both accumulator lanes.
    for (i = 0; i < SUBFRAMES; i++) {
      // Broadcast one 16-bit coefficient into both lanes; the << 15 aligns
      // it so that vqdmulh ((a * b * 2) >> 32) yields (m0 * m1) >> 16.
      int32x2_t coeff_32x2 =
          vdup_n_s32((int32_t)(matrix0[index0]) << 15);
      int32x2_t pair_32x2 = vld1_s32(&matrix1[index1]);
      // Saturating doubling multiply-high, saturating accumulate.
      acc_32x2 = vqadd_s32(acc_32x2, vqdmulh_s32(coeff_32x2, pair_32x2));
      index1 += 2;
      index0 += matrix0_index_step;
    }
    // Scale the pair down by 2^3 and emit two adjacent outputs.
    vst1_s32(&matrix_product[out_index], vshr_n_s32(acc_32x2, 3));
    out_index += 2;
  }
}
218