• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  *  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 /* This file contains WebRtcIsacfix_MatrixProduct1Neon() and
12  * WebRtcIsacfix_MatrixProduct2Neon() for ARM Neon platform. API's are in
13  * entropy_coding.c. Results are bit exact with the c code for
14  * generic platforms.
15  */
16 
17 #include <arm_neon.h>
18 #include <stddef.h>
19 
20 #include "modules/audio_coding/codecs/isac/fix/source/entropy_coding.h"
21 #include "common_audio/signal_processing/include/signal_processing_library.h"
22 #include "rtc_base/checks.h"
23 
// Neon-accelerated matrix product for the iSAC entropy coder. The generic C
// reference lives in entropy_coding.c; per the file header the results here
// are bit exact with it.
//
// For each subframe j (0..SUBFRAMES-1) and each output element k
// (0..mid_loop_count-1) this accumulates inner_loop_count saturated terms of
//   (matrix0[i0] * (matrix1[i1] << shift)) >> 16
// (the scalar fallback spells this out with WEBRTC_SPL_MUL_16_32_RSFT16).
// The starting read positions i0/i1 and their per-iteration steps are driven
// by the matrix*_index_* parameters; three common index configurations get
// dedicated SIMD paths, everything else falls through to scalar code.
//
// Args:
//   matrix0            - int16_t input matrix (Q-domain per caller).
//   matrix1            - int32_t input matrix, pre-shifted left by `shift`.
//   matrix_product     - output, SUBFRAMES * mid_loop_count int32_t values.
//   matrix1_index_factor1 / matrix0_index_factor1 - scale applied to the
//       loop counter that seeds each matrix's start index.
//   matrix1_index_init_case - selects whether j or k seeds matrix1's start
//       index (and the other seeds matrix0's).
//   matrix1_index_step / matrix0_index_step - per-inner-iteration strides.
//   inner_loop_count   - terms per output element (asserted even below).
//   mid_loop_count     - output elements per subframe (asserted even below).
//   shift              - left shift applied to matrix1 values before the
//                        16x32 multiply.
void WebRtcIsacfix_MatrixProduct1Neon(const int16_t matrix0[],
                                      const int32_t matrix1[],
                                      int32_t matrix_product[],
                                      const int matrix1_index_factor1,
                                      const int matrix0_index_factor1,
                                      const int matrix1_index_init_case,
                                      const int matrix1_index_step,
                                      const int matrix0_index_step,
                                      const int inner_loop_count,
                                      const int mid_loop_count,
                                      const int shift) {
  int j = 0, k = 0, n = 0;
  int matrix1_index = 0, matrix0_index = 0, matrix_prod_index = 0;
  // Aliasing trick used by the scalar fallback: depending on
  // matrix1_index_init_case, matrix1's start index is seeded by j and
  // matrix0's by k, or vice versa. The pointers select which counter
  // each matrix follows without duplicating the loop body.
  int* matrix1_index_factor2 = &j;
  int* matrix0_index_factor2 = &k;
  if (matrix1_index_init_case != 0) {
    matrix1_index_factor2 = &k;
    matrix0_index_factor2 = &j;
  }
  int32x4_t shift32x4 = vdupq_n_s32(shift);
  int32x2_t shift32x2 = vdup_n_s32(shift);
  int32x4_t sum_32x4 =  vdupq_n_s32(0);
  int32x2_t sum_32x2 =  vdup_n_s32(0);

  // The SIMD paths below process elements two or four at a time, so both
  // trip counts must be even.
  RTC_DCHECK_EQ(0, inner_loop_count % 2);
  RTC_DCHECK_EQ(0, mid_loop_count % 2);

  // Case 1: matrix1 is indexed by k with unit factor, so four consecutive
  // output elements read four consecutive matrix1 values -> vector-load
  // matrix1, splat the shared matrix0 scalar across lanes.
  if (matrix1_index_init_case != 0 && matrix1_index_factor1 == 1) {
    for (j = 0; j < SUBFRAMES; j++) {
      matrix_prod_index = mid_loop_count * j;
      // Main loop: four output elements per iteration.
      for (k = 0; k < (mid_loop_count >> 2) << 2; k += 4) {
        sum_32x4 = veorq_s32(sum_32x4, sum_32x4);  // Initialize to zeros.
        matrix1_index = k;
        matrix0_index = matrix0_index_factor1 * j;
        for (n = 0; n < inner_loop_count; n++) {
          // m0 << 15 so that vqdmulh ((a*b*2)>>32, saturated) yields
          // (m0 * m1) >> 16 -- same as WEBRTC_SPL_MUL_16_32_RSFT16.
          int32x4_t matrix0_32x4 =
              vdupq_n_s32((int32_t)(matrix0[matrix0_index]) << 15);
          int32x4_t matrix1_32x4 =
              vshlq_s32(vld1q_s32(&matrix1[matrix1_index]), shift32x4);
          int32x4_t multi_32x4 = vqdmulhq_s32(matrix0_32x4, matrix1_32x4);
          sum_32x4 = vqaddq_s32(sum_32x4, multi_32x4);  // Saturated add.
          matrix1_index += matrix1_index_step;
          matrix0_index += matrix0_index_step;
        }
        vst1q_s32(&matrix_product[matrix_prod_index], sum_32x4);
        matrix_prod_index += 4;
      }
      // Tail: mid_loop_count is even, so the remainder is either 0 or 2
      // elements; handle a leftover pair with 64-bit vectors.
      if (mid_loop_count % 4 > 1) {
        sum_32x2 = veor_s32(sum_32x2, sum_32x2);  // Initialize to zeros.
        matrix1_index = k;
        k += 2;  // Account for the pair consumed by this tail.
        matrix0_index = matrix0_index_factor1 * j;
        for (n = 0; n < inner_loop_count; n++) {
          int32x2_t matrix0_32x2 =
              vdup_n_s32((int32_t)(matrix0[matrix0_index]) << 15);
          int32x2_t matrix1_32x2 =
              vshl_s32(vld1_s32(&matrix1[matrix1_index]), shift32x2);
          int32x2_t multi_32x2 = vqdmulh_s32(matrix0_32x2, matrix1_32x2);
          sum_32x2 = vqadd_s32(sum_32x2, multi_32x2);
          matrix1_index += matrix1_index_step;
          matrix0_index += matrix0_index_step;
        }
        vst1_s32(&matrix_product[matrix_prod_index], sum_32x2);
        matrix_prod_index += 2;
      }
    }
  }
  // Case 2: matrix0 is indexed by k with unit factor -> vector-load four
  // consecutive int16 matrix0 values (widened to Q-domain int32 via
  // vshll_n_s16(..., 15)), splat the shared matrix1 scalar.
  else if (matrix1_index_init_case == 0 && matrix0_index_factor1 == 1) {
    int32x2_t multi_32x2 = vdup_n_s32(0);
    int32x2_t matrix0_32x2 = vdup_n_s32(0);
    for (j = 0; j < SUBFRAMES; j++) {
      matrix_prod_index = mid_loop_count * j;
      for (k = 0; k < (mid_loop_count >> 2) << 2; k += 4) {
        sum_32x4 = veorq_s32(sum_32x4, sum_32x4);  // Initialize to zeros.
        matrix1_index = matrix1_index_factor1 * j;
        matrix0_index = k;
        for (n = 0; n < inner_loop_count; n++) {
          int32x4_t matrix1_32x4 = vdupq_n_s32(matrix1[matrix1_index] << shift);
          int32x4_t matrix0_32x4 =
              vshll_n_s16(vld1_s16(&matrix0[matrix0_index]), 15);
          int32x4_t multi_32x4 = vqdmulhq_s32(matrix0_32x4, matrix1_32x4);
          sum_32x4 = vqaddq_s32(sum_32x4, multi_32x4);
          matrix1_index += matrix1_index_step;
          matrix0_index += matrix0_index_step;
        }
        vst1q_s32(&matrix_product[matrix_prod_index], sum_32x4);
        matrix_prod_index += 4;
      }
      // Tail pair (mid_loop_count is even, remainder is 0 or 2).
      if (mid_loop_count % 4 > 1) {
        sum_32x2 = veor_s32(sum_32x2, sum_32x2);  // Initialize to zeros.
        matrix1_index = matrix1_index_factor1 * j;
        matrix0_index = k;
        for (n = 0; n < inner_loop_count; n++) {
          int32x2_t matrix1_32x2 = vdup_n_s32(matrix1[matrix1_index] << shift);
          // Load two int16 values lane by lane, then shift into Q-domain.
          matrix0_32x2 =
              vset_lane_s32((int32_t)matrix0[matrix0_index], matrix0_32x2, 0);
          matrix0_32x2 = vset_lane_s32((int32_t)matrix0[matrix0_index + 1],
                                     matrix0_32x2, 1);
          matrix0_32x2 = vshl_n_s32(matrix0_32x2, 15);
          multi_32x2 = vqdmulh_s32(matrix1_32x2, matrix0_32x2);
          sum_32x2 = vqadd_s32(sum_32x2, multi_32x2);
          matrix1_index += matrix1_index_step;
          matrix0_index += matrix0_index_step;
        }
        vst1_s32(&matrix_product[matrix_prod_index], sum_32x2);
        matrix_prod_index += 2;
      }
    }
  }
  // Case 3: both matrices advance by 1 inside the inner loop -> vectorize
  // across the inner (n) dimension instead, four terms at a time, then
  // horizontally reduce the four partial sums to one output value.
  else if (matrix1_index_init_case == 0 &&
           matrix1_index_step == 1 &&
           matrix0_index_step == 1) {
    int32x2_t multi_32x2 = vdup_n_s32(0);
    int32x2_t matrix0_32x2 = vdup_n_s32(0);
    for (j = 0; j < SUBFRAMES; j++) {
      matrix_prod_index = mid_loop_count * j;
      for (k = 0; k < mid_loop_count; k++) {
        sum_32x4 = veorq_s32(sum_32x4, sum_32x4);  // Initialize to zeros.
        matrix1_index = matrix1_index_factor1 * j;
        matrix0_index = matrix0_index_factor1 * k;
        for (n = 0; n < (inner_loop_count >> 2) << 2; n += 4) {
          int32x4_t matrix1_32x4 =
              vshlq_s32(vld1q_s32(&matrix1[matrix1_index]), shift32x4);
          int32x4_t matrix0_32x4 =
              vshll_n_s16(vld1_s16(&matrix0[matrix0_index]), 15);
          int32x4_t multi_32x4 = vqdmulhq_s32(matrix0_32x4, matrix1_32x4);
          sum_32x4 = vqaddq_s32(sum_32x4, multi_32x4);
          matrix1_index += 4;
          matrix0_index += 4;
        }
        // Fold 4 partial sums into 2 (saturated).
        sum_32x2 = vqadd_s32(vget_low_s32(sum_32x4), vget_high_s32(sum_32x4));
        // Tail: inner_loop_count is even, remainder is 0 or 2 terms.
        if (inner_loop_count % 4 > 1) {
          int32x2_t matrix1_32x2 =
              vshl_s32(vld1_s32(&matrix1[matrix1_index]), shift32x2);
          matrix0_32x2 =
              vset_lane_s32((int32_t)matrix0[matrix0_index], matrix0_32x2, 0);
          matrix0_32x2 = vset_lane_s32((int32_t)matrix0[matrix0_index + 1],
                                     matrix0_32x2, 1);
          matrix0_32x2 = vshl_n_s32(matrix0_32x2, 15);
          multi_32x2 = vqdmulh_s32(matrix1_32x2, matrix0_32x2);
          sum_32x2 = vqadd_s32(sum_32x2, multi_32x2);
        }
        // Pairwise add folds the last 2 partial sums into one scalar
        // (NOTE: vpadd is a non-saturating add, matching the reference).
        sum_32x2 = vpadd_s32(sum_32x2, sum_32x2);
        vst1_lane_s32(&matrix_product[matrix_prod_index], sum_32x2, 0);
        matrix_prod_index++;
      }
    }
  }
  // Fallback: fully general scalar path, identical to the generic C code.
  else {
    for (j = 0; j < SUBFRAMES; j++) {
      matrix_prod_index = mid_loop_count * j;
      for (k=0; k < mid_loop_count; k++) {
        int32_t sum32 = 0;
        // *matrix1_index_factor2 / *matrix0_index_factor2 alias j or k
        // depending on matrix1_index_init_case (set up at the top).
        matrix1_index = matrix1_index_factor1 * (*matrix1_index_factor2);
        matrix0_index = matrix0_index_factor1 * (*matrix0_index_factor2);
        for (n = 0; n < inner_loop_count; n++) {
          sum32 += (WEBRTC_SPL_MUL_16_32_RSFT16(matrix0[matrix0_index],
              matrix1[matrix1_index] << shift));
          matrix1_index += matrix1_index_step;
          matrix0_index += matrix0_index_step;
        }
        matrix_product[matrix_prod_index] = sum32;
        matrix_prod_index++;
      }
    }
  }
}
191 
// Neon counterpart of the generic C routine in entropy_coding.c (results
// are bit exact with it). For each subframe j, accumulates SUBFRAMES
// saturated products of one matrix0 scalar with a pair of consecutive
// matrix1 values:
//   acc += (matrix0[i0] * matrix1[i1..i1+1]) >> 16   (saturated)
// then arithmetic-shifts the resulting pair right by 3 and stores it.
//
// Args:
//   matrix0             - int16_t input matrix.
//   matrix1             - int32_t input matrix, read two values at a time.
//   matrix_product      - output, 2 * SUBFRAMES int32_t values.
//   matrix0_index_factor - scale seeding matrix0's start index per subframe.
//   matrix0_index_step  - matrix0 stride per inner iteration.
void WebRtcIsacfix_MatrixProduct2Neon(const int16_t matrix0[],
                                      const int32_t matrix1[],
                                      int32_t matrix_product[],
                                      const int matrix0_index_factor,
                                      const int matrix0_index_step) {
  int out_index = 0;
  for (int j = 0; j < SUBFRAMES; j++) {
    int32x2_t acc = vdup_n_s32(0);  // Pair of saturated running sums.
    int idx1 = 0;
    int idx0 = matrix0_index_factor * j;
    for (int n = 0; n < SUBFRAMES; n++) {
      // Splat m0 << 15 so vqdmulh ((a*b*2)>>32, saturated) gives
      // (m0 * m1) >> 16 in both lanes.
      int32x2_t lhs = vdup_n_s32((int32_t)(matrix0[idx0]) << 15);
      int32x2_t rhs = vld1_s32(&matrix1[idx1]);
      acc = vqadd_s32(acc, vqdmulh_s32(lhs, rhs));  // Saturated accumulate.
      idx1 += 2;
      idx0 += matrix0_index_step;
    }
    acc = vshr_n_s32(acc, 3);  // Scale both lanes down by 8.
    vst1_s32(&matrix_product[out_index], acc);
    out_index += 2;
  }
}
218