/*
 * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

/* This file contains WebRtcIsacfix_MatrixProduct1Neon() and
 * WebRtcIsacfix_MatrixProduct2Neon() for the ARM Neon platform. The APIs are
 * in entropy_coding.c. Results are bit exact with the C code for generic
 * platforms.
 */

#include "entropy_coding.h"

#include <arm_neon.h>
#include <assert.h>
#include <stddef.h>

#include "signal_processing_library.h"

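/* Note on the fixed-point arithmetic used throughout: for a 16-bit value a
 * and a 32-bit value b, vqdmulh(a << 15, b) computes
 * (2 * (a << 15) * b) >> 32 == (a * b) >> 16 (with saturation), which mirrors
 * the scalar WEBRTC_SPL_MUL_16_32_RSFT16(a, b) used in the generic fallback
 * below.
 */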
void WebRtcIsacfix_MatrixProduct1Neon(const int16_t matrix0[],
                                      const int32_t matrix1[],
                                      int32_t matrix_product[],
                                      const int matrix1_index_factor1,
                                      const int matrix0_index_factor1,
                                      const int matrix1_index_init_case,
                                      const int matrix1_index_step,
                                      const int matrix0_index_step,
                                      const int inner_loop_count,
                                      const int mid_loop_count,
                                      const int shift) {
  int j = 0, k = 0, n = 0;
  int matrix1_index = 0, matrix0_index = 0, matrix_prod_index = 0;
  int* matrix1_index_factor2 = &j;
  int* matrix0_index_factor2 = &k;
  if (matrix1_index_init_case != 0) {
    matrix1_index_factor2 = &k;
    matrix0_index_factor2 = &j;
  }
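  // The two pointers above are only used in the generic fallback at the
  // bottom: they select whether the middle-loop counter (k) or the outer-loop
  // counter (j) seeds each matrix index, depending on matrix1_index_init_case.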
  int32x4_t shift32x4 = vdupq_n_s32(shift);
  int32x2_t shift32x2 = vdup_n_s32(shift);
  int32x4_t sum_32x4 = vdupq_n_s32(0);
  int32x2_t sum_32x2 = vdup_n_s32(0);

  assert(inner_loop_count % 2 == 0);
  assert(mid_loop_count % 2 == 0);

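  // Case 1: matrix1 is indexed directly by the middle-loop counter k, so four
  // consecutive outputs are computed at once: vld1q loads four adjacent
  // matrix1 words while the matrix0 coefficient is broadcast across the
  // vector.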
  if (matrix1_index_init_case != 0 && matrix1_index_factor1 == 1) {
    for (j = 0; j < SUBFRAMES; j++) {
      matrix_prod_index = mid_loop_count * j;
      for (k = 0; k < (mid_loop_count >> 2) << 2; k += 4) {
        sum_32x4 = veorq_s32(sum_32x4, sum_32x4); // Initialize to zeros.
        matrix1_index = k;
        matrix0_index = matrix0_index_factor1 * j;
        for (n = 0; n < inner_loop_count; n++) {
          int32x4_t matrix0_32x4 =
              vdupq_n_s32((int32_t)(matrix0[matrix0_index]) << 15);
          int32x4_t matrix1_32x4 =
              vshlq_s32(vld1q_s32(&matrix1[matrix1_index]), shift32x4);
          int32x4_t multi_32x4 = vqdmulhq_s32(matrix0_32x4, matrix1_32x4);
          sum_32x4 = vqaddq_s32(sum_32x4, multi_32x4);
          matrix1_index += matrix1_index_step;
          matrix0_index += matrix0_index_step;
        }
        vst1q_s32(&matrix_product[matrix_prod_index], sum_32x4);
        matrix_prod_index += 4;
      }
      if (mid_loop_count % 4 > 1) {
        sum_32x2 = veor_s32(sum_32x2, sum_32x2); // Initialize to zeros.
        matrix1_index = k;
        k += 2;
        matrix0_index = matrix0_index_factor1 * j;
        for (n = 0; n < inner_loop_count; n++) {
          int32x2_t matrix0_32x2 =
              vdup_n_s32((int32_t)(matrix0[matrix0_index]) << 15);
          int32x2_t matrix1_32x2 =
              vshl_s32(vld1_s32(&matrix1[matrix1_index]), shift32x2);
          int32x2_t multi_32x2 = vqdmulh_s32(matrix0_32x2, matrix1_32x2);
          sum_32x2 = vqadd_s32(sum_32x2, multi_32x2);
          matrix1_index += matrix1_index_step;
          matrix0_index += matrix0_index_step;
        }
        vst1_s32(&matrix_product[matrix_prod_index], sum_32x2);
        matrix_prod_index += 2;
      }
    }
  }
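  // Case 2: matrix0 is indexed directly by the middle-loop counter k, so four
  // adjacent int16 coefficients are widened and shifted left by 15 with
  // vshll_n_s16, then multiplied against a broadcast matrix1 word, again
  // yielding four outputs per pass.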
  else if (matrix1_index_init_case == 0 && matrix0_index_factor1 == 1) {
    int32x2_t multi_32x2 = vdup_n_s32(0);
    int32x2_t matrix0_32x2 = vdup_n_s32(0);
    for (j = 0; j < SUBFRAMES; j++) {
      matrix_prod_index = mid_loop_count * j;
      for (k = 0; k < (mid_loop_count >> 2) << 2; k += 4) {
        sum_32x4 = veorq_s32(sum_32x4, sum_32x4); // Initialize to zeros.
        matrix1_index = matrix1_index_factor1 * j;
        matrix0_index = k;
        for (n = 0; n < inner_loop_count; n++) {
          int32x4_t matrix1_32x4 = vdupq_n_s32(matrix1[matrix1_index] << shift);
          int32x4_t matrix0_32x4 =
              vshll_n_s16(vld1_s16(&matrix0[matrix0_index]), 15);
          int32x4_t multi_32x4 = vqdmulhq_s32(matrix0_32x4, matrix1_32x4);
          sum_32x4 = vqaddq_s32(sum_32x4, multi_32x4);
          matrix1_index += matrix1_index_step;
          matrix0_index += matrix0_index_step;
        }
        vst1q_s32(&matrix_product[matrix_prod_index], sum_32x4);
        matrix_prod_index += 4;
      }
      if (mid_loop_count % 4 > 1) {
        sum_32x2 = veor_s32(sum_32x2, sum_32x2); // Initialize to zeros.
        matrix1_index = matrix1_index_factor1 * j;
        matrix0_index = k;
        for (n = 0; n < inner_loop_count; n++) {
          int32x2_t matrix1_32x2 = vdup_n_s32(matrix1[matrix1_index] << shift);
          matrix0_32x2 =
              vset_lane_s32((int32_t)matrix0[matrix0_index], matrix0_32x2, 0);
          matrix0_32x2 = vset_lane_s32((int32_t)matrix0[matrix0_index + 1],
                                       matrix0_32x2, 1);
          matrix0_32x2 = vshl_n_s32(matrix0_32x2, 15);
          multi_32x2 = vqdmulh_s32(matrix1_32x2, matrix0_32x2);
          sum_32x2 = vqadd_s32(sum_32x2, multi_32x2);
          matrix1_index += matrix1_index_step;
          matrix0_index += matrix0_index_step;
        }
        vst1_s32(&matrix_product[matrix_prod_index], sum_32x2);
        matrix_prod_index += 2;
      }
    }
  }
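  // Case 3: both index steps are 1, so the inner dot-product loop itself is
  // vectorized four elements at a time; the four partial sums are then folded
  // into one output with vqadd and a pairwise vpadd.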
  else if (matrix1_index_init_case == 0 &&
           matrix1_index_step == 1 &&
           matrix0_index_step == 1) {
    int32x2_t multi_32x2 = vdup_n_s32(0);
    int32x2_t matrix0_32x2 = vdup_n_s32(0);
    for (j = 0; j < SUBFRAMES; j++) {
      matrix_prod_index = mid_loop_count * j;
      for (k = 0; k < mid_loop_count; k++) {
        sum_32x4 = veorq_s32(sum_32x4, sum_32x4); // Initialize to zeros.
        matrix1_index = matrix1_index_factor1 * j;
        matrix0_index = matrix0_index_factor1 * k;
        for (n = 0; n < (inner_loop_count >> 2) << 2; n += 4) {
          int32x4_t matrix1_32x4 =
              vshlq_s32(vld1q_s32(&matrix1[matrix1_index]), shift32x4);
          int32x4_t matrix0_32x4 =
              vshll_n_s16(vld1_s16(&matrix0[matrix0_index]), 15);
          int32x4_t multi_32x4 = vqdmulhq_s32(matrix0_32x4, matrix1_32x4);
          sum_32x4 = vqaddq_s32(sum_32x4, multi_32x4);
          matrix1_index += 4;
          matrix0_index += 4;
        }
        sum_32x2 = vqadd_s32(vget_low_s32(sum_32x4), vget_high_s32(sum_32x4));
        if (inner_loop_count % 4 > 1) {
          int32x2_t matrix1_32x2 =
              vshl_s32(vld1_s32(&matrix1[matrix1_index]), shift32x2);
          matrix0_32x2 =
              vset_lane_s32((int32_t)matrix0[matrix0_index], matrix0_32x2, 0);
          matrix0_32x2 = vset_lane_s32((int32_t)matrix0[matrix0_index + 1],
                                       matrix0_32x2, 1);
          matrix0_32x2 = vshl_n_s32(matrix0_32x2, 15);
          multi_32x2 = vqdmulh_s32(matrix1_32x2, matrix0_32x2);
          sum_32x2 = vqadd_s32(sum_32x2, multi_32x2);
        }
        sum_32x2 = vpadd_s32(sum_32x2, sum_32x2);
        vst1_lane_s32(&matrix_product[matrix_prod_index], sum_32x2, 0);
        matrix_prod_index++;
      }
    }
  }
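  // Generic fallback: plain scalar multiply-accumulate for index patterns the
  // specialized Neon paths above do not cover.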
  else {
    for (j = 0; j < SUBFRAMES; j++) {
      matrix_prod_index = mid_loop_count * j;
      for (k = 0; k < mid_loop_count; k++) {
        int32_t sum32 = 0;
        matrix1_index = matrix1_index_factor1 * (*matrix1_index_factor2);
        matrix0_index = matrix0_index_factor1 * (*matrix0_index_factor2);
        for (n = 0; n < inner_loop_count; n++) {
          sum32 += (WEBRTC_SPL_MUL_16_32_RSFT16(matrix0[matrix0_index],
                                                matrix1[matrix1_index] << shift));
          matrix1_index += matrix1_index_step;
          matrix0_index += matrix0_index_step;
        }
        matrix_product[matrix_prod_index] = sum32;
        matrix_prod_index++;
      }
    }
  }
}

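/* Computes two outputs per subframe: each int16 matrix0 coefficient is
 * broadcast and multiplied against a pair of matrix1 words, and the
 * accumulated pair is scaled down by an arithmetic right shift of 3 before
 * being stored.
 */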
void WebRtcIsacfix_MatrixProduct2Neon(const int16_t matrix0[],
                                      const int32_t matrix1[],
                                      int32_t matrix_product[],
                                      const int matrix0_index_factor,
                                      const int matrix0_index_step) {
  int j = 0, n = 0;
  int matrix1_index = 0, matrix0_index = 0, matrix_prod_index = 0;
  int32x2_t sum_32x2 = vdup_n_s32(0);
  for (j = 0; j < SUBFRAMES; j++) {
    sum_32x2 = veor_s32(sum_32x2, sum_32x2); // Initialize to zeros.
    matrix1_index = 0;
    matrix0_index = matrix0_index_factor * j;
    for (n = SUBFRAMES; n > 0; n--) {
      int32x2_t matrix0_32x2 =
          vdup_n_s32((int32_t)(matrix0[matrix0_index]) << 15);
      int32x2_t matrix1_32x2 = vld1_s32(&matrix1[matrix1_index]);
      int32x2_t multi_32x2 = vqdmulh_s32(matrix0_32x2, matrix1_32x2);
      sum_32x2 = vqadd_s32(sum_32x2, multi_32x2);
      matrix1_index += 2;
      matrix0_index += matrix0_index_step;
    }
    sum_32x2 = vshr_n_s32(sum_32x2, 3);
    vst1_s32(&matrix_product[matrix_prod_index], sum_32x2);
    matrix_prod_index += 2;
  }
}