/***********************************************************************
Copyright (c) 2017 Google Inc., Jean-Marc Valin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
- Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of Internet Society, IETF or IETF Trust, nor the
names of specific contributors, may be used to endorse or promote
products derived from this software without specific prior written
permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
***********************************************************************/

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include <arm_neon.h>
#ifdef OPUS_CHECK_ASM
# include <string.h>
#endif
#include "stack_alloc.h"
#include "main_FIX.h"

static OPUS_INLINE void calc_corr( const opus_int32 *const input_QS, opus_int64 *const corr_QC, const opus_int offset, const int32x4_t state_QS_s32x4 )
{
    int64x2_t corr_QC_s64x2[ 2 ], t_s64x2[ 2 ];
    const int32x4_t input_QS_s32x4 = vld1q_s32( input_QS + offset );
    corr_QC_s64x2[ 0 ] = vld1q_s64( corr_QC + offset + 0 );
    corr_QC_s64x2[ 1 ] = vld1q_s64( corr_QC + offset + 2 );
    t_s64x2[ 0 ] = vmull_s32( vget_low_s32( state_QS_s32x4 ), vget_low_s32( input_QS_s32x4 ) );
    t_s64x2[ 1 ] = vmull_s32( vget_high_s32( state_QS_s32x4 ), vget_high_s32( input_QS_s32x4 ) );
    corr_QC_s64x2[ 0 ] = vsraq_n_s64( corr_QC_s64x2[ 0 ], t_s64x2[ 0 ], 2 * QS - QC );
    corr_QC_s64x2[ 1 ] = vsraq_n_s64( corr_QC_s64x2[ 1 ], t_s64x2[ 1 ], 2 * QS - QC );
    vst1q_s64( corr_QC + offset + 0, corr_QC_s64x2[ 0 ] );
    vst1q_s64( corr_QC + offset + 2, corr_QC_s64x2[ 1 ] );
}
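
/* For reference only ( not compiled ): a rough scalar sketch of calc_corr() above, written in the
   style of the commented-out C reference further down. state_QS[ k ] is an illustrative name for
   the k-th lane of state_QS_s32x4.
   for( k = 0; k < 4; k++ ) {
       corr_QC[ offset + k ] += silk_RSHIFT64( silk_SMULL( state_QS[ k ], input_QS[ offset + k ] ), 2 * QS - QC );
   }
*/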

static OPUS_INLINE int32x4_t calc_state( const int32x4_t state_QS0_s32x4, const int32x4_t state_QS0_1_s32x4, const int32x4_t state_QS1_1_s32x4, const int32x4_t warping_Q16_s32x4 )
{
    int32x4_t t_s32x4 = vsubq_s32( state_QS0_s32x4, state_QS0_1_s32x4 );
    t_s32x4 = vqdmulhq_s32( t_s32x4, warping_Q16_s32x4 );
    return vaddq_s32( state_QS1_1_s32x4, t_s32x4 );
}
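
/* A scalar sketch of calc_state() above, for reference only ( not compiled ): each lane i computes
       state_QS1[ i ] = silk_SMLAWB( state_QS1_1[ i ], state_QS0[ i ] - state_QS0_1[ i ], warping_Q16 );
   The caller pre-shifts the Q16 warping coefficient left by 15 so that vqdmulhq_s32(), which returns
   the high half of the doubled product ( 2 * a * b ) >> 32, reproduces the >> 16 of silk_SMLAWB. */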

void silk_warped_autocorrelation_FIX_neon(
          opus_int32                *corr,                                  /* O    Result [order + 1]                                                          */
          opus_int                  *scale,                                 /* O    Scaling of the correlation vector                                           */
    const opus_int16                *input,                                 /* I    Input data to correlate                                                     */
    const opus_int                  warping_Q16,                            /* I    Warping coefficient                                                         */
    const opus_int                  length,                                 /* I    Length of input                                                             */
    const opus_int                  order                                   /* I    Correlation order (even)                                                    */
)
{
    if( ( MAX_SHAPE_LPC_ORDER > 24 ) || ( order < 6 ) ) {
        silk_warped_autocorrelation_FIX_c( corr, scale, input, warping_Q16, length, order );
    } else {
        opus_int       n, i, lsh;
        opus_int64     corr_QC[ MAX_SHAPE_LPC_ORDER + 1 ] = { 0 }; /* In reverse order */
        opus_int64     corr_QC_orderT;
        int64x2_t      lsh_s64x2;
        const opus_int orderT = ( order + 3 ) & ~3;
        opus_int64     *corr_QCT;
        opus_int32     *input_QS;
        VARDECL( opus_int32, input_QST );
        VARDECL( opus_int32, state );
        SAVE_STACK;

        /* Order must be even */
        silk_assert( ( order & 1 ) == 0 );
        silk_assert( 2 * QS - QC >= 0 );

        /* The additional +4 ensures that a later vld1q_s32 call does not read past the end of the buffer. */
        /* Strictly, only +3 is needed, but +4 simplifies the zero initialization with 4x32-bit NEON stores. */
        ALLOC( input_QST, length + 2 * MAX_SHAPE_LPC_ORDER + 4, opus_int32 );

        input_QS = input_QST;
        /* input_QS has zero paddings in the beginning and end. */
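        /* The six stores below write 24 zeros of leading padding; the seven stores after the sample */
        /* loop write 28 zeros of trailing padding, i.e. MAX_SHAPE_LPC_ORDER entries on each side    */
        /* plus the extra 4 allocated above ( assuming MAX_SHAPE_LPC_ORDER == 24 on this path ).     */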
        vst1q_s32( input_QS, vdupq_n_s32( 0 ) );
        input_QS += 4;
        vst1q_s32( input_QS, vdupq_n_s32( 0 ) );
        input_QS += 4;
        vst1q_s32( input_QS, vdupq_n_s32( 0 ) );
        input_QS += 4;
        vst1q_s32( input_QS, vdupq_n_s32( 0 ) );
        input_QS += 4;
        vst1q_s32( input_QS, vdupq_n_s32( 0 ) );
        input_QS += 4;
        vst1q_s32( input_QS, vdupq_n_s32( 0 ) );
        input_QS += 4;

        /* Loop over samples */
        for( n = 0; n < length - 7; n += 8, input_QS += 8 ) {
            const int16x8_t t0_s16x4 = vld1q_s16( input + n );
            vst1q_s32( input_QS + 0, vshll_n_s16( vget_low_s16( t0_s16x4 ), QS ) );
            vst1q_s32( input_QS + 4, vshll_n_s16( vget_high_s16( t0_s16x4 ), QS ) );
        }
        for( ; n < length; n++, input_QS++ ) {
            input_QS[ 0 ] = silk_LSHIFT32( (opus_int32)input[ n ], QS );
        }
        vst1q_s32( input_QS, vdupq_n_s32( 0 ) );
        input_QS += 4;
        vst1q_s32( input_QS, vdupq_n_s32( 0 ) );
        input_QS += 4;
        vst1q_s32( input_QS, vdupq_n_s32( 0 ) );
        input_QS += 4;
        vst1q_s32( input_QS, vdupq_n_s32( 0 ) );
        input_QS += 4;
        vst1q_s32( input_QS, vdupq_n_s32( 0 ) );
        input_QS += 4;
        vst1q_s32( input_QS, vdupq_n_s32( 0 ) );
        input_QS += 4;
        vst1q_s32( input_QS, vdupq_n_s32( 0 ) );
        input_QS = input_QST + MAX_SHAPE_LPC_ORDER - orderT;

        /* The following loop runs ( length + order ) times, with ( order ) extra epilogues.            */
        /* The zero paddings in input_QS guarantee corr_QC's correctness even with the extra epilogues. */
        /* The extra epilogues pollute the values of state_QS, but those values are only temporary.     */

        /* Keep the C code here to help understand the intrinsics optimization. */
        /*
        {
            opus_int32 state_QS[ 2 ][ MAX_SHAPE_LPC_ORDER + 1 ] = { 0 };
            opus_int32 *state_QST[ 3 ];
            state_QST[ 0 ] = state_QS[ 0 ];
            state_QST[ 1 ] = state_QS[ 1 ];
            for( n = 0; n < length + order; n++, input_QS++ ) {
                state_QST[ 0 ][ orderT ] = input_QS[ orderT ];
                for( i = 0; i < orderT; i++ ) {
                    corr_QC[ i ] += silk_RSHIFT64( silk_SMULL( state_QST[ 0 ][ i ], input_QS[ i ] ), 2 * QS - QC );
                    state_QST[ 1 ][ i ] = silk_SMLAWB( state_QST[ 1 ][ i + 1 ], state_QST[ 0 ][ i ] - state_QST[ 0 ][ i + 1 ], warping_Q16 );
                }
                state_QST[ 2 ] = state_QST[ 0 ];
                state_QST[ 0 ] = state_QST[ 1 ];
                state_QST[ 1 ] = state_QST[ 2 ];
            }
        }
        */

        {
            const int32x4_t warping_Q16_s32x4 = vdupq_n_s32( warping_Q16 << 15 );
            const opus_int32 *in = input_QS + orderT;
            opus_int o = orderT;
            int32x4_t state_QS_s32x4[ 3 ][ 2 ];

            /* The additional +4 ensures that a later vld1q_s32 call does not read past the end of the buffer. */
            ALLOC( state, length + order + 4, opus_int32 );
            state_QS_s32x4[ 2 ][ 1 ] = vdupq_n_s32( 0 );

            /* Calculate 8 taps of all inputs in each loop. */
            do {
                state_QS_s32x4[ 0 ][ 0 ] = state_QS_s32x4[ 0 ][ 1 ] =
                state_QS_s32x4[ 1 ][ 0 ] = state_QS_s32x4[ 1 ][ 1 ] = vdupq_n_s32( 0 );
                n = 0;
                do {
                    calc_corr( input_QS + n, corr_QC, o - 8, state_QS_s32x4[ 0 ][ 0 ] );
                    calc_corr( input_QS + n, corr_QC, o - 4, state_QS_s32x4[ 0 ][ 1 ] );
                    state_QS_s32x4[ 2 ][ 1 ] = vld1q_s32( in + n );
                    vst1q_lane_s32( state + n, state_QS_s32x4[ 0 ][ 0 ], 0 );
                    state_QS_s32x4[ 2 ][ 0 ] = vextq_s32( state_QS_s32x4[ 0 ][ 0 ], state_QS_s32x4[ 0 ][ 1 ], 1 );
                    state_QS_s32x4[ 2 ][ 1 ] = vextq_s32( state_QS_s32x4[ 0 ][ 1 ], state_QS_s32x4[ 2 ][ 1 ], 1 );
                    state_QS_s32x4[ 0 ][ 0 ] = calc_state( state_QS_s32x4[ 0 ][ 0 ], state_QS_s32x4[ 2 ][ 0 ], state_QS_s32x4[ 1 ][ 0 ], warping_Q16_s32x4 );
                    state_QS_s32x4[ 0 ][ 1 ] = calc_state( state_QS_s32x4[ 0 ][ 1 ], state_QS_s32x4[ 2 ][ 1 ], state_QS_s32x4[ 1 ][ 1 ], warping_Q16_s32x4 );
                    state_QS_s32x4[ 1 ][ 0 ] = state_QS_s32x4[ 2 ][ 0 ];
                    state_QS_s32x4[ 1 ][ 1 ] = state_QS_s32x4[ 2 ][ 1 ];
                } while( ++n < ( length + order ) );
                in = state;
                o -= 8;
            } while( o > 4 );

            if( o ) {
                /* Calculate the last 4 taps of all inputs. */
                opus_int32 *stateT = state;
                silk_assert( o == 4 );
                state_QS_s32x4[ 0 ][ 0 ] = state_QS_s32x4[ 1 ][ 0 ] = vdupq_n_s32( 0 );
                n = length + order;
                do {
                    calc_corr( input_QS, corr_QC, 0, state_QS_s32x4[ 0 ][ 0 ] );
                    state_QS_s32x4[ 2 ][ 0 ] = vld1q_s32( stateT );
                    vst1q_lane_s32( stateT, state_QS_s32x4[ 0 ][ 0 ], 0 );
                    state_QS_s32x4[ 2 ][ 0 ] = vextq_s32( state_QS_s32x4[ 0 ][ 0 ], state_QS_s32x4[ 2 ][ 0 ], 1 );
                    state_QS_s32x4[ 0 ][ 0 ] = calc_state( state_QS_s32x4[ 0 ][ 0 ], state_QS_s32x4[ 2 ][ 0 ], state_QS_s32x4[ 1 ][ 0 ], warping_Q16_s32x4 );
                    state_QS_s32x4[ 1 ][ 0 ] = state_QS_s32x4[ 2 ][ 0 ];
                    input_QS++;
                    stateT++;
                } while( --n );
            }
        }

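        /* The block below computes the zero-lag ( energy ) term, 8 samples at a time, using                 */
        /* vmull_s16()/vmlal_s16(); a rough scalar sketch, for reference only ( not compiled ):              */
        /*     corr_QC[ orderT ] = silk_LSHIFT64( sum over n of silk_SMULL( input[ n ], input[ n ] ), QC );  */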
        {
            const opus_int16 *inputT = input;
            int32x4_t t_s32x4;
            int64x1_t t_s64x1;
            int64x2_t t_s64x2 = vdupq_n_s64( 0 );
            for( n = 0; n <= length - 8; n += 8 ) {
                int16x8_t input_s16x8 = vld1q_s16( inputT );
                t_s32x4 = vmull_s16( vget_low_s16( input_s16x8 ), vget_low_s16( input_s16x8 ) );
                t_s32x4 = vmlal_s16( t_s32x4, vget_high_s16( input_s16x8 ), vget_high_s16( input_s16x8 ) );
                t_s64x2 = vaddw_s32( t_s64x2, vget_low_s32( t_s32x4 ) );
                t_s64x2 = vaddw_s32( t_s64x2, vget_high_s32( t_s32x4 ) );
                inputT += 8;
            }
            t_s64x1 = vadd_s64( vget_low_s64( t_s64x2 ), vget_high_s64( t_s64x2 ) );
            corr_QC_orderT = vget_lane_s64( t_s64x1, 0 );
            for( ; n < length; n++ ) {
                corr_QC_orderT += silk_SMULL( input[ n ], input[ n ] );
            }
            corr_QC_orderT = silk_LSHIFT64( corr_QC_orderT, QC );
            corr_QC[ orderT ] = corr_QC_orderT;
        }

        corr_QCT = corr_QC + orderT - order;
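        /* Normalize: shift so that the largest term ( the zero-lag energy ) keeps roughly 29 significant */
        /* bits; corr[] is then returned in Q( QC + lsh ), and *scale reports -( QC + lsh ).              */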
        lsh = silk_CLZ64( corr_QC_orderT ) - 35;
        lsh = silk_LIMIT( lsh, -12 - QC, 30 - QC );
        *scale = -( QC + lsh );
        silk_assert( *scale >= -30 && *scale <= 12 );
        lsh_s64x2 = vdupq_n_s64( lsh );
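        /* corr_QC[] holds the lags in reverse order ( see its declaration above ), so the loop below */
        /* shifts and narrows 4 lags at a time, reverses them with vcombine_s32()/vrev64q_s32(), and  */
        /* stores them to corr[] in forward order.                                                    */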
        for( i = 0; i <= order - 3; i += 4 ) {
            int32x4_t corr_s32x4;
            int64x2_t corr_QC0_s64x2, corr_QC1_s64x2;
            corr_QC0_s64x2 = vld1q_s64( corr_QCT + i );
            corr_QC1_s64x2 = vld1q_s64( corr_QCT + i + 2 );
            corr_QC0_s64x2 = vshlq_s64( corr_QC0_s64x2, lsh_s64x2 );
            corr_QC1_s64x2 = vshlq_s64( corr_QC1_s64x2, lsh_s64x2 );
            corr_s32x4     = vcombine_s32( vmovn_s64( corr_QC1_s64x2 ), vmovn_s64( corr_QC0_s64x2 ) );
            corr_s32x4     = vrev64q_s32( corr_s32x4 );
            vst1q_s32( corr + order - i - 3, corr_s32x4 );
        }
        if( lsh >= 0 ) {
            for( ; i < order + 1; i++ ) {
                corr[ order - i ] = (opus_int32)silk_CHECK_FIT32( silk_LSHIFT64( corr_QCT[ i ], lsh ) );
            }
        } else {
            for( ; i < order + 1; i++ ) {
                corr[ order - i ] = (opus_int32)silk_CHECK_FIT32( silk_RSHIFT64( corr_QCT[ i ], -lsh ) );
            }
        }
        silk_assert( corr_QCT[ order ] >= 0 ); /* If breaking, decrease QC */
        RESTORE_STACK;
    }

#ifdef OPUS_CHECK_ASM
    {
        opus_int32 corr_c[ MAX_SHAPE_LPC_ORDER + 1 ];
        opus_int   scale_c;
        silk_warped_autocorrelation_FIX_c( corr_c, &scale_c, input, warping_Q16, length, order );
        silk_assert( !memcmp( corr_c, corr, sizeof( corr_c[ 0 ] ) * ( order + 1 ) ) );
        silk_assert( scale_c == *scale );
    }
#endif
}