/*
 *
 *  Bluetooth low-complexity, subband codec (SBC) library
 *
 *  Copyright (C) 2004-2009  Marcel Holtmann <marcel@holtmann.org>
 *  Copyright (C) 2004-2005  Henryk Ploetz <henryk@ploetzli.ch>
 *  Copyright (C) 2005-2006  Brad Midgley <bmidgley@xmission.com>
 *
 *
 *  This library is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Lesser General Public
 *  License as published by the Free Software Foundation; either
 *  version 2.1 of the License, or (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public
 *  License along with this library; if not, write to the Free Software
 *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 *
 */

#include <stdint.h>
#include <limits.h>
#include "sbc.h"
#include "sbc_math.h"
#include "sbc_tables.h"

#include "sbc_primitives_neon.h"

/*
 * ARM NEON optimizations
 */

#ifdef SBC_BUILD_WITH_NEON_SUPPORT

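/*
 * Core of the 4-subband analysis step. The first part of the asm block
 * multiply-accumulates 40 input samples against 40 prototype filter
 * coefficients from 'consts' (the windowing stage), narrows the partial
 * sums back to 16 bits with a rounding shift by SBC_PROTO_FIXED4_SCALE,
 * and the second part multiplies them by the 4x4 block of modulation
 * constants that follows in 'consts', storing four 32-bit results to 'out'.
 */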
static inline void _sbc_analyze_four_neon(const int16_t *in, int32_t *out,
							const FIXED_T *consts)
{
	/* TODO: merge even and odd cases (or even merge all four calls to this
	 * function) in order to have only aligned reads from 'in' array
	 * and reduce number of load instructions */
	asm volatile (
		"vld1.16    {d4, d5}, [%0, :64]!\n"
		"vld1.16    {d8, d9}, [%1, :128]!\n"

		"vmull.s16  q0, d4, d8\n"
		"vld1.16    {d6,  d7}, [%0, :64]!\n"
		"vmull.s16  q1, d5, d9\n"
		"vld1.16    {d10, d11}, [%1, :128]!\n"

		"vmlal.s16  q0, d6, d10\n"
		"vld1.16    {d4, d5}, [%0, :64]!\n"
		"vmlal.s16  q1, d7, d11\n"
		"vld1.16    {d8, d9}, [%1, :128]!\n"

		"vmlal.s16  q0, d4, d8\n"
		"vld1.16    {d6,  d7}, [%0, :64]!\n"
		"vmlal.s16  q1, d5, d9\n"
		"vld1.16    {d10, d11}, [%1, :128]!\n"

		"vmlal.s16  q0, d6, d10\n"
		"vld1.16    {d4, d5}, [%0, :64]!\n"
		"vmlal.s16  q1, d7, d11\n"
		"vld1.16    {d8, d9}, [%1, :128]!\n"

		"vmlal.s16  q0, d4, d8\n"
		"vmlal.s16  q1, d5, d9\n"

		"vpadd.s32  d0, d0, d1\n"
		"vpadd.s32  d1, d2, d3\n"

		"vrshrn.s32 d0, q0, %3\n"

		"vld1.16    {d2, d3, d4, d5}, [%1, :128]!\n"

		"vdup.i32   d1, d0[1]\n"  /* TODO: can be eliminated */
		"vdup.i32   d0, d0[0]\n"  /* TODO: can be eliminated */

		"vmull.s16  q3, d2, d0\n"
		"vmull.s16  q4, d3, d0\n"
		"vmlal.s16  q3, d4, d1\n"
		"vmlal.s16  q4, d5, d1\n"

		"vpadd.s32  d0, d6, d7\n" /* TODO: can be eliminated */
		"vpadd.s32  d1, d8, d9\n" /* TODO: can be eliminated */

		"vst1.32    {d0, d1}, [%2, :128]\n"
		: "+r" (in), "+r" (consts)
		: "r" (out),
			"i" (SBC_PROTO_FIXED4_SCALE)
		: "memory",
			"d0", "d1", "d2", "d3", "d4", "d5",
			"d6", "d7", "d8", "d9", "d10", "d11");
}

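/*
 * Same structure as _sbc_analyze_four_neon, scaled up to 8 subbands:
 * 80 input samples are multiply-accumulated against 80 prototype filter
 * coefficients, the partial sums are narrowed with a rounding shift by
 * SBC_PROTO_FIXED8_SCALE, and an 8x8 block of modulation constants turns
 * them into eight 32-bit outputs.
 */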
static inline void _sbc_analyze_eight_neon(const int16_t *in, int32_t *out,
							const FIXED_T *consts)
{
	/* TODO: merge even and odd cases (or even merge all four calls to this
	 * function) in order to have only aligned reads from 'in' array
	 * and reduce number of load instructions */
	asm volatile (
		"vld1.16    {d4, d5}, [%0, :64]!\n"
		"vld1.16    {d8, d9}, [%1, :128]!\n"

		"vmull.s16  q6, d4, d8\n"
		"vld1.16    {d6,  d7}, [%0, :64]!\n"
		"vmull.s16  q7, d5, d9\n"
		"vld1.16    {d10, d11}, [%1, :128]!\n"
		"vmull.s16  q8, d6, d10\n"
		"vld1.16    {d4, d5}, [%0, :64]!\n"
		"vmull.s16  q9, d7, d11\n"
		"vld1.16    {d8, d9}, [%1, :128]!\n"

		"vmlal.s16  q6, d4, d8\n"
		"vld1.16    {d6,  d7}, [%0, :64]!\n"
		"vmlal.s16  q7, d5, d9\n"
		"vld1.16    {d10, d11}, [%1, :128]!\n"
		"vmlal.s16  q8, d6, d10\n"
		"vld1.16    {d4, d5}, [%0, :64]!\n"
		"vmlal.s16  q9, d7, d11\n"
		"vld1.16    {d8, d9}, [%1, :128]!\n"

		"vmlal.s16  q6, d4, d8\n"
		"vld1.16    {d6,  d7}, [%0, :64]!\n"
		"vmlal.s16  q7, d5, d9\n"
		"vld1.16    {d10, d11}, [%1, :128]!\n"
		"vmlal.s16  q8, d6, d10\n"
		"vld1.16    {d4, d5}, [%0, :64]!\n"
		"vmlal.s16  q9, d7, d11\n"
		"vld1.16    {d8, d9}, [%1, :128]!\n"

		"vmlal.s16  q6, d4, d8\n"
		"vld1.16    {d6,  d7}, [%0, :64]!\n"
		"vmlal.s16  q7, d5, d9\n"
		"vld1.16    {d10, d11}, [%1, :128]!\n"
		"vmlal.s16  q8, d6, d10\n"
		"vld1.16    {d4, d5}, [%0, :64]!\n"
		"vmlal.s16  q9, d7, d11\n"
		"vld1.16    {d8, d9}, [%1, :128]!\n"

		"vmlal.s16  q6, d4, d8\n"
		"vld1.16    {d6,  d7}, [%0, :64]!\n"
		"vmlal.s16  q7, d5, d9\n"
		"vld1.16    {d10, d11}, [%1, :128]!\n"

		"vmlal.s16  q8, d6, d10\n"
		"vmlal.s16  q9, d7, d11\n"

		"vpadd.s32  d0, d12, d13\n"
		"vpadd.s32  d1, d14, d15\n"
		"vpadd.s32  d2, d16, d17\n"
		"vpadd.s32  d3, d18, d19\n"

		"vrshr.s32 q0, q0, %3\n"
		"vrshr.s32 q1, q1, %3\n"
		"vmovn.s32 d0, q0\n"
		"vmovn.s32 d1, q1\n"

		"vdup.i32   d3, d1[1]\n"  /* TODO: can be eliminated */
		"vdup.i32   d2, d1[0]\n"  /* TODO: can be eliminated */
		"vdup.i32   d1, d0[1]\n"  /* TODO: can be eliminated */
		"vdup.i32   d0, d0[0]\n"  /* TODO: can be eliminated */

		"vld1.16    {d4, d5}, [%1, :128]!\n"
		"vmull.s16  q6, d4, d0\n"
		"vld1.16    {d6, d7}, [%1, :128]!\n"
		"vmull.s16  q7, d5, d0\n"
		"vmull.s16  q8, d6, d0\n"
		"vmull.s16  q9, d7, d0\n"

		"vld1.16    {d4, d5}, [%1, :128]!\n"
		"vmlal.s16  q6, d4, d1\n"
		"vld1.16    {d6, d7}, [%1, :128]!\n"
		"vmlal.s16  q7, d5, d1\n"
		"vmlal.s16  q8, d6, d1\n"
		"vmlal.s16  q9, d7, d1\n"

		"vld1.16    {d4, d5}, [%1, :128]!\n"
		"vmlal.s16  q6, d4, d2\n"
		"vld1.16    {d6, d7}, [%1, :128]!\n"
		"vmlal.s16  q7, d5, d2\n"
		"vmlal.s16  q8, d6, d2\n"
		"vmlal.s16  q9, d7, d2\n"

		"vld1.16    {d4, d5}, [%1, :128]!\n"
		"vmlal.s16  q6, d4, d3\n"
		"vld1.16    {d6, d7}, [%1, :128]!\n"
		"vmlal.s16  q7, d5, d3\n"
		"vmlal.s16  q8, d6, d3\n"
		"vmlal.s16  q9, d7, d3\n"

		"vpadd.s32  d0, d12, d13\n" /* TODO: can be eliminated */
		"vpadd.s32  d1, d14, d15\n" /* TODO: can be eliminated */
		"vpadd.s32  d2, d16, d17\n" /* TODO: can be eliminated */
		"vpadd.s32  d3, d18, d19\n" /* TODO: can be eliminated */

		"vst1.32    {d0, d1, d2, d3}, [%2, :128]\n"
		: "+r" (in), "+r" (consts)
		: "r" (out),
			"i" (SBC_PROTO_FIXED8_SCALE)
		: "memory",
			"d0", "d1", "d2", "d3", "d4", "d5",
			"d6", "d7", "d8", "d9", "d10", "d11",
			"d12", "d13", "d14", "d15", "d16", "d17",
			"d18", "d19");
}

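/*
 * Analyze four consecutive blocks of 4 samples each: the core helper is
 * called once per block, stepping 'out' by out_stride between blocks and
 * alternating between the *_odd and *_even constant tables for odd and
 * even block positions in 'x'.
 */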
static inline void sbc_analyze_4b_4s_neon(int16_t *x,
						int32_t *out, int out_stride)
{
	/* Analyze blocks */
	_sbc_analyze_four_neon(x + 12, out, analysis_consts_fixed4_simd_odd);
	out += out_stride;
	_sbc_analyze_four_neon(x + 8, out, analysis_consts_fixed4_simd_even);
	out += out_stride;
	_sbc_analyze_four_neon(x + 4, out, analysis_consts_fixed4_simd_odd);
	out += out_stride;
	_sbc_analyze_four_neon(x + 0, out, analysis_consts_fixed4_simd_even);
}

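/*
 * Same as sbc_analyze_4b_4s_neon, but for 8 subbands: four blocks of
 * 8 samples each, again alternating between the odd and even constant
 * tables.
 */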
static inline void sbc_analyze_4b_8s_neon(int16_t *x,
						int32_t *out, int out_stride)
{
	/* Analyze blocks */
	_sbc_analyze_eight_neon(x + 24, out, analysis_consts_fixed8_simd_odd);
	out += out_stride;
	_sbc_analyze_eight_neon(x + 16, out, analysis_consts_fixed8_simd_even);
	out += out_stride;
	_sbc_analyze_eight_neon(x + 8, out, analysis_consts_fixed8_simd_odd);
	out += out_stride;
	_sbc_analyze_eight_neon(x + 0, out, analysis_consts_fixed8_simd_even);
}

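/*
 * Install the NEON analysis routines into the encoder's primitives table
 * and record the implementation name. A rough sketch of the caller side,
 * assuming the generic dispatcher in sbc_primitives.c:
 *
 *	void sbc_init_primitives(struct sbc_encoder_state *state)
 *	{
 *		// ... generic and other SIMD defaults ...
 *	#ifdef SBC_BUILD_WITH_NEON_SUPPORT
 *		sbc_init_primitives_neon(state);
 *	#endif
 *	}
 */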
void sbc_init_primitives_neon(struct sbc_encoder_state *state)
{
	state->sbc_analyze_4b_4s = sbc_analyze_4b_4s_neon;
	state->sbc_analyze_4b_8s = sbc_analyze_4b_8s_neon;
	state->implementation_info = "NEON";
}

#endif