/*
 * Bluetooth low-complexity, subband codec (SBC)
 *
 * Copyright (C) 2017  Aurelien Jacobs <aurel@gnuage.org>
 * Copyright (C) 2008-2010  Nokia Corporation
 * Copyright (C) 2004-2010  Marcel Holtmann <marcel@holtmann.org>
 * Copyright (C) 2004-2005  Henryk Ploetz <henryk@ploetzli.ch>
 * Copyright (C) 2005-2006  Brad Midgley <bmidgley@xmission.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/**
 * @file
 * SBC ARM NEON optimizations
 */

#include "libavutil/arm/asm.S"
#include "neon.S"

#define SBC_PROTO_FIXED_SCALE 16

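/*
 * The two analyze functions below compute one polyphase analysis step:
 * r0 points at the input samples, r2 at the int16 coefficient tables and
 * r1 at the int32 output.  Stage one multiply-accumulates the windowed
 * input against the prototype filter, the accumulators are pairwise added
 * and rounding-shifted down by SBC_PROTO_FIXED_SCALE, and stage two
 * multiplies the narrowed values by the cosine matrixing part of the same
 * table.  A rough C sketch of stage one for the 4-subband case (names are
 * illustrative, this is not the FFmpeg C reference):
 *
 *     int32_t acc[8];
 *     for (n = 0; n < 8; n++) {
 *         acc[n] = 0;
 *         for (j = 0; j < 5; j++)
 *             acc[n] += (int32_t)in[j * 8 + n] * consts[j * 8 + n];
 *     }
 *     // then: t[k] = rshift_rounded(acc[2k] + acc[2k+1], 16), k = 0..3
 */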
function ff_sbc_analyze_4_neon, export=1
        /* TODO: merge even and odd cases (or even merge all four calls to this
         * function) in order to have only aligned reads from 'in' array
         * and reduce number of load instructions */
        vld1.16         {d4, d5}, [r0, :64]!
        vld1.16         {d8, d9}, [r2, :128]!

        vmull.s16       q0, d4, d8
        vld1.16         {d6,  d7}, [r0, :64]!
        vmull.s16       q1, d5, d9
        vld1.16         {d10, d11}, [r2, :128]!

        vmlal.s16       q0, d6, d10
        vld1.16         {d4, d5}, [r0, :64]!
        vmlal.s16       q1, d7, d11
        vld1.16         {d8, d9}, [r2, :128]!

        vmlal.s16       q0, d4, d8
        vld1.16         {d6,  d7}, [r0, :64]!
        vmlal.s16       q1, d5, d9
        vld1.16         {d10, d11}, [r2, :128]!

        vmlal.s16       q0, d6, d10
        vld1.16         {d4, d5}, [r0, :64]!
        vmlal.s16       q1, d7, d11
        vld1.16         {d8, d9}, [r2, :128]!

        vmlal.s16       q0, d4, d8
        vmlal.s16       q1, d5, d9

        vpadd.s32       d0, d0, d1
        vpadd.s32       d1, d2, d3

        vrshrn.s32      d0, q0, SBC_PROTO_FIXED_SCALE

        vld1.16         {d2, d3, d4, d5}, [r2, :128]!

        vdup.i32        d1, d0[1]  /* TODO: can be eliminated */
        vdup.i32        d0, d0[0]  /* TODO: can be eliminated */

        vmull.s16       q3, d2, d0
        vmull.s16       q4, d3, d0
        vmlal.s16       q3, d4, d1
        vmlal.s16       q4, d5, d1

        vpadd.s32       d0, d6, d7 /* TODO: can be eliminated */
        vpadd.s32       d1, d8, d9 /* TODO: can be eliminated */

        vst1.32         {d0, d1}, [r1, :128]

        bx              lr
endfunc

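/*
 * Same scheme as ff_sbc_analyze_4_neon, scaled up to 8 subbands: four
 * q accumulators (q6-q9) hold 16 products per coefficient block, and the
 * second stage multiplies the 8 narrowed values by an 8x8 matrixing table.
 */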
function ff_sbc_analyze_8_neon, export=1
        /* TODO: merge even and odd cases (or even merge all four calls to this
         * function) in order to have only aligned reads from 'in' array
         * and reduce number of load instructions */
        vld1.16         {d4, d5}, [r0, :64]!
        vld1.16         {d8, d9}, [r2, :128]!

        vmull.s16       q6, d4, d8
        vld1.16         {d6,  d7}, [r0, :64]!
        vmull.s16       q7, d5, d9
        vld1.16         {d10, d11}, [r2, :128]!
        vmull.s16       q8, d6, d10
        vld1.16         {d4, d5}, [r0, :64]!
        vmull.s16       q9, d7, d11
        vld1.16         {d8, d9}, [r2, :128]!

        vmlal.s16       q6, d4, d8
        vld1.16         {d6,  d7}, [r0, :64]!
        vmlal.s16       q7, d5, d9
        vld1.16         {d10, d11}, [r2, :128]!
        vmlal.s16       q8, d6, d10
        vld1.16         {d4, d5}, [r0, :64]!
        vmlal.s16       q9, d7, d11
        vld1.16         {d8, d9}, [r2, :128]!

        vmlal.s16       q6, d4, d8
        vld1.16         {d6,  d7}, [r0, :64]!
        vmlal.s16       q7, d5, d9
        vld1.16         {d10, d11}, [r2, :128]!
        vmlal.s16       q8, d6, d10
        vld1.16         {d4, d5}, [r0, :64]!
        vmlal.s16       q9, d7, d11
        vld1.16         {d8, d9}, [r2, :128]!

        vmlal.s16       q6, d4, d8
        vld1.16         {d6,  d7}, [r0, :64]!
        vmlal.s16       q7, d5, d9
        vld1.16         {d10, d11}, [r2, :128]!
        vmlal.s16       q8, d6, d10
        vld1.16         {d4, d5}, [r0, :64]!
        vmlal.s16       q9, d7, d11
        vld1.16         {d8, d9}, [r2, :128]!

        vmlal.s16       q6, d4, d8
        vld1.16         {d6,  d7}, [r0, :64]!
        vmlal.s16       q7, d5, d9
        vld1.16         {d10, d11}, [r2, :128]!

        vmlal.s16       q8, d6, d10
        vmlal.s16       q9, d7, d11

        vpadd.s32       d0, d12, d13
        vpadd.s32       d1, d14, d15
        vpadd.s32       d2, d16, d17
        vpadd.s32       d3, d18, d19

        vrshr.s32       q0, q0, SBC_PROTO_FIXED_SCALE
        vrshr.s32       q1, q1, SBC_PROTO_FIXED_SCALE
        vmovn.s32       d0, q0
        vmovn.s32       d1, q1

        vdup.i32        d3, d1[1]  /* TODO: can be eliminated */
        vdup.i32        d2, d1[0]  /* TODO: can be eliminated */
        vdup.i32        d1, d0[1]  /* TODO: can be eliminated */
        vdup.i32        d0, d0[0]  /* TODO: can be eliminated */

        vld1.16         {d4, d5}, [r2, :128]!
        vmull.s16       q6, d4, d0
        vld1.16         {d6, d7}, [r2, :128]!
        vmull.s16       q7, d5, d0
        vmull.s16       q8, d6, d0
        vmull.s16       q9, d7, d0

        vld1.16         {d4, d5}, [r2, :128]!
        vmlal.s16       q6, d4, d1
        vld1.16         {d6, d7}, [r2, :128]!
        vmlal.s16       q7, d5, d1
        vmlal.s16       q8, d6, d1
        vmlal.s16       q9, d7, d1

        vld1.16         {d4, d5}, [r2, :128]!
        vmlal.s16       q6, d4, d2
        vld1.16         {d6, d7}, [r2, :128]!
        vmlal.s16       q7, d5, d2
        vmlal.s16       q8, d6, d2
        vmlal.s16       q9, d7, d2

        vld1.16         {d4, d5}, [r2, :128]!
        vmlal.s16       q6, d4, d3
        vld1.16         {d6, d7}, [r2, :128]!
        vmlal.s16       q7, d5, d3
        vmlal.s16       q8, d6, d3
        vmlal.s16       q9, d7, d3

        vpadd.s32       d0, d12, d13 /* TODO: can be eliminated */
        vpadd.s32       d1, d14, d15 /* TODO: can be eliminated */
        vpadd.s32       d2, d16, d17 /* TODO: can be eliminated */
        vpadd.s32       d3, d18, d19 /* TODO: can be eliminated */

        vst1.32         {d0, d1, d2, d3}, [r1, :128]

        bx              lr
endfunc

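/*
 * Scale factor calculation: for every (channel, subband) slot, take the
 * maximum absolute subband sample over all blocks and derive how many
 * bits it needs above SCALE_OUT_BITS.  Scalar C equivalent of what the
 * vector code below computes for one slot (a sketch, using FFmpeg's
 * FFABS/FFMAX and ff_clz helpers):
 *
 *     uint32_t m = (1 << SCALE_OUT_BITS) + 1;    // SCALE_OUT_BITS == 15
 *     for (blk = 0; blk < blocks; blk++)
 *         m = FFMAX(m, FFABS(sb_sample_f[blk][ch][sb]));
 *     scale_factor[ch][sb] = (31 - SCALE_OUT_BITS) - ff_clz(m - 1);
 */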
function ff_sbc_calc_scalefactors_neon, export=1
        @ parameters
        @ r0 = sb_sample_f
        @ r1 = scale_factor
        @ r2 = blocks
        @ r3 = channels
        @ r4 = subbands
        @ local variables
        @ r5 = in_loop_1
        @ r6 = in
        @ r7 = out_loop_1
        @ r8 = out
        @ r9 = ch
        @ r10 = sb
        @ r11 = inc
        @ r12 = blk

        push            {r1-r2, r4-r12}
        ldr             r4,  [sp, #44]
        mov             r11, #64

        mov             r9,  #0
1:
        add             r5,  r0,  r9, lsl#5
        add             r7,  r1,  r9, lsl#5

        mov             r10,  #0
2:
        add             r6,  r5,  r10, lsl#2
        add             r8,  r7,  r10, lsl#2
        mov             r12, r2

        vmov.s32        q0,  #0
        vmov.s32        q1,  #0x8000            @ 1 << SCALE_OUT_BITS
        vmov.s32        q14, #1
        vmov.s32        q15, #16                @ 31 - SCALE_OUT_BITS
        vadd.s32        q1,  q1,  q14
3:
        vld1.32         {d16, d17}, [r6, :128], r11
        vabs.s32        q8,  q8
        vld1.32         {d18, d19}, [r6, :128], r11
        vabs.s32        q9,  q9
        vld1.32         {d20, d21}, [r6, :128], r11
        vabs.s32        q10, q10
        vld1.32         {d22, d23}, [r6, :128], r11
        vabs.s32        q11, q11
        vmax.s32        q0,  q0,  q8
        vmax.s32        q1,  q1,  q9
        vmax.s32        q0,  q0,  q10
        vmax.s32        q1,  q1,  q11
        subs            r12, r12, #4
        bgt             3b
        vmax.s32        q0,  q0,  q1
        vsub.s32        q0,  q0,  q14
        vclz.s32        q0,  q0
        vsub.s32        q0,  q15, q0
        vst1.32         {d0, d1}, [r8, :128]

        add             r10, r10, #4
        cmp             r10, r4
        blt             2b

        add             r9,  r9,  #1
        cmp             r9,  r3
        blt             1b

        pop             {r1-r2, r4-r12}
        bx              lr
endfunc

/*
 * constants: q13 = (31 - SCALE_OUT_BITS)
 *            q14 = 1
 * input:     q0  - ((1 << SCALE_OUT_BITS) + 1)
 *            r5  - samples for channel 0
 *            r6  - samples for channel 1
 * output:    q0, q1 - scale factors without joint stereo
 *            q2, q3 - scale factors with joint stereo
 *            q15    - joint stereo selection mask
 */
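/*
 * Note: the joint-stereo candidates are built with vhadd/vhsub as
 * m = (l + r) / 2 and s = (l - r) / 2, with the lowest bit of the right
 * sample cleared first (vbic with q14 == 1) so the halved sum and
 * difference stay consistent when later recombined as m + s and m - s.
 */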
.macro calc_scalefactors
        vmov.s32        q1,  q0
        vmov.s32        q2,  q0
        vmov.s32        q3,  q0
        mov             r3,  r2
1:
        vld1.32         {d18, d19}, [r6, :128], r11
        vbic.s32        q11, q9,  q14
        vld1.32         {d16, d17}, [r5, :128], r11
        vhadd.s32       q10, q8,  q11
        vhsub.s32       q11, q8,  q11
        vabs.s32        q8,  q8
        vabs.s32        q9,  q9
        vabs.s32        q10, q10
        vabs.s32        q11, q11
        vmax.s32        q0,  q0,  q8
        vmax.s32        q1,  q1,  q9
        vmax.s32        q2,  q2,  q10
        vmax.s32        q3,  q3,  q11
        subs            r3,  r3,  #1
        bgt             1b
        vsub.s32        q0,  q0,  q14
        vsub.s32        q1,  q1,  q14
        vsub.s32        q2,  q2,  q14
        vsub.s32        q3,  q3,  q14
        vclz.s32        q0,  q0
        vclz.s32        q1,  q1
        vclz.s32        q2,  q2
        vclz.s32        q3,  q3
        vsub.s32        q0,  q13, q0
        vsub.s32        q1,  q13, q1
        vsub.s32        q2,  q13, q2
        vsub.s32        q3,  q13, q3
.endm

/*
 * constants: q14 = 1
 * input: q15 - joint stereo selection mask
 *        r5  - value set by calc_scalefactors macro
 *        r6  - value set by calc_scalefactors macro
 */
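/*
 * The loop below rewrites the sample buffers in place: vbif keeps the
 * original left/right samples in lanes where the selection mask in q15
 * is zero and substitutes the vhadd/vhsub mid/side pair where it is set.
 * Loads, stores and arithmetic for consecutive blocks are interleaved
 * (software pipelined) to hide memory latencies.
 */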
.macro update_joint_stereo_samples
        sub             r8,  r6,  r11
        sub             r7,  r5,  r11
        sub             r6,  r6,  r11, asl #1
        sub             r5,  r5,  r11, asl #1
        vld1.32         {d18, d19}, [r6, :128]
        vbic.s32        q11, q9,  q14
        vld1.32         {d16, d17}, [r5, :128]
        vld1.32         {d2, d3}, [r8, :128]
        vbic.s32        q3,  q1,  q14
        vld1.32         {d0, d1}, [r7, :128]
        vhsub.s32       q10, q8,  q11
        vhadd.s32       q11, q8,  q11
        vhsub.s32       q2,  q0,  q3
        vhadd.s32       q3,  q0,  q3
        vbif.s32        q10, q9,  q15
        vbif.s32        d22, d16, d30
        sub             r11, r10, r11, asl #1
        sub             r3,  r2,  #2
2:
        vbif.s32        d23, d17, d31
        vst1.32         {d20, d21}, [r6, :128], r11
        vbif.s32        d4,  d2,  d30
        vld1.32         {d18, d19}, [r6, :128]
        vbif.s32        d5,  d3,  d31
        vst1.32         {d22, d23}, [r5, :128], r11
        vbif.s32        d6,  d0,  d30
        vld1.32         {d16, d17}, [r5, :128]
        vbif.s32        d7,  d1,  d31
        vst1.32         {d4, d5}, [r8, :128], r11
        vbic.s32        q11, q9,  q14
        vld1.32         {d2, d3}, [r8, :128]
        vst1.32         {d6, d7}, [r7, :128], r11
        vbic.s32        q3,  q1,  q14
        vld1.32         {d0, d1}, [r7, :128]
        vhsub.s32       q10, q8,  q11
        vhadd.s32       q11, q8,  q11
        vhsub.s32       q2,  q0,  q3
        vhadd.s32       q3,  q0,  q3
        vbif.s32        q10, q9,  q15
        vbif.s32        d22, d16, d30
        subs            r3,  r3,  #2
        bgt             2b
        sub             r11, r10, r11, asr #1
        vbif.s32        d23, d17, d31
        vst1.32         {d20, d21}, [r6, :128]
        vbif.s32        q2,  q1,  q15
        vst1.32         {d22, d23}, [r5, :128]
        vbif.s32        q3,  q0,  q15
        vst1.32         {d4, d5}, [r8, :128]
        vst1.32         {d6, d7}, [r7, :128]
.endm

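/*
 * In the joint-stereo variant below, the per-subband decision mask in q15
 * is folded with ff_sbcdsp_joint_bits_mask (vand, then pairwise adds) into
 * a single integer whose set bits mark the subbands coded as mid/side;
 * that integer is the function's return value in r0.
 */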
function ff_sbc_calc_scalefactors_j_neon, export=1
        @ parameters
        @ r0 = in = sb_sample_f
        @ r1 = out = scale_factor
        @ r2 = blocks
        @ r3 = subbands
        @ local variables
        @ r4 = consts = ff_sbcdsp_joint_bits_mask
        @ r5 = in0
        @ r6 = in1
        @ r7 = out0
        @ r8 = out1
        @ r10 = zero
        @ r11 = inc
        @ return r0 = joint

        push            {r3-r11}
        movrelx         r4,  X(ff_sbcdsp_joint_bits_mask)
        mov             r10, #0
        mov             r11, #64

        vmov.s32        q14, #1
        vmov.s32        q13, #16    @ 31 - SCALE_OUT_BITS

        cmp             r3, #4
        bne             8f

4:      @ 4 subbands
        add             r5,  r0,  #0
        add             r6,  r0,  #32
        add             r7,  r1,  #0
        add             r8,  r1,  #32
        vmov.s32        q0,  #0x8000    @ 1 << SCALE_OUT_BITS
        vadd.s32        q0,  q0,  q14

        calc_scalefactors

        @ check whether to use joint stereo for subbands 0, 1, 2
        vadd.s32        q15, q0,  q1
        vadd.s32        q9,  q2,  q3
        vmov.s32        d31[1], r10    @ last subband -> no joint
        vld1.32         {d16, d17}, [r4, :128]!
        vcgt.s32        q15, q15, q9

        @ calculate and save to memory 'joint' variable
        @ update and save scale factors to memory
        vand.s32        q8,  q8,  q15
        vbit.s32        q0,  q2,  q15
        vpadd.s32       d16, d16, d17
        vbit.s32        q1,  q3,  q15
        vpadd.s32       d16, d16, d16
        vst1.32         {d0, d1}, [r7, :128]
        vst1.32         {d2, d3}, [r8, :128]
        vmov.32         r0, d16[0]

        update_joint_stereo_samples
        b               9f

8:      @ 8 subbands
        add             r5,  r0,  #16
        add             r6,  r0,  #48
        add             r7,  r1,  #16
        add             r8,  r1,  #48
        vmov.s32        q0,  #0x8000    @ 1 << SCALE_OUT_BITS
        vadd.s32        q0,  q0,  q14

        calc_scalefactors

        @ check whether to use joint stereo for subbands 4, 5, 6
        vadd.s32        q15, q0,  q1
        vadd.s32        q9,  q2,  q3
        vmov.s32        d31[1], r10    @ last subband -> no joint
        vld1.32         {d16, d17}, [r4, :128]!
        vcgt.s32        q15, q15, q9

        @ calculate part of 'joint' variable and save it to d24
        @ update and save scale factors to memory
        vand.s32        q8,  q8,  q15
        vbit.s32        q0,  q2,  q15
        vpadd.s32       d16, d16, d17
        vbit.s32        q1,  q3,  q15
        vst1.32         {d0, d1}, [r7, :128]
        vst1.32         {d2, d3}, [r8, :128]
        vpadd.s32       d24, d16, d16

        update_joint_stereo_samples

        add             r5,  r0,  #0
        add             r6,  r0,  #32
        add             r7,  r1,  #0
        add             r8,  r1,  #32
        vmov.s32        q0,  #0x8000    @ 1 << SCALE_OUT_BITS
        vadd.s32        q0,  q0,  q14

        calc_scalefactors

        @ check whether to use joint stereo for subbands 0, 1, 2, 3
        vadd.s32        q15, q0,  q1
        vadd.s32        q9,  q2,  q3
        vld1.32         {d16, d17}, [r4, :128]!
        vcgt.s32        q15, q15, q9

        @ combine last part of 'joint' with d24 and save to memory
        @ update and save scale factors to memory
        vand.s32        q8,  q8,  q15
        vbit.s32        q0,  q2,  q15
        vpadd.s32       d16, d16, d17
        vbit.s32        q1,  q3,  q15
        vpadd.s32       d16, d16, d16
        vst1.32         {d0, d1}, [r7, :128]
        vadd.s32        d16, d16, d24
        vst1.32         {d2, d3}, [r8, :128]
        vmov.32         r0,  d16[0]

        update_joint_stereo_samples
9:
        pop             {r3-r11}
        bx              lr
endfunc

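/*
 * Input reordering: PCM samples are written into the X ring buffer
 * (SBC_X_BUFFER_SIZE == 328 int16 samples per channel) back to front,
 * permuted via vtbl with the ff_sbc_input_perm_4/8 tables so the analyze
 * functions can fetch them with aligned sequential loads.  'position'
 * counts down as samples are written; when fewer than 'nsamples' slots
 * remain (position < nsamples), the analysis filter history is first
 * copied up to the end of the buffer and 'position' is reset to
 * SBC_X_BUFFER_SIZE - 40 (4 subbands) or SBC_X_BUFFER_SIZE - 72 (8).
 */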
function ff_sbc_enc_process_input_4s_neon, export=1
        @ parameters
        @ r0 = position
        @ r1 = pcm
        @ r2 = X
        @ r3 = nsamples
        @ r4 = nchannels
        @ local variables
        @ r5 = ff_sbc_input_perm_4
        @ r6 = src / x
        @ r7 = dst / y

        push            {r1, r3-r7}
        ldr             r4,  [sp, #24]
        movrelx         r5,  X(ff_sbc_input_perm_4)

        @ handle X buffer wraparound
        cmp             r0,  r3
        bge             1f                     @ if (position < nsamples)
        add             r7,  r2,  #576         @ &X[0][SBC_X_BUFFER_SIZE - 40]
        add             r6,  r2,  r0, lsl#1    @ &X[0][position]
        vld1.16         {d0, d1, d2, d3}, [r6, :128]!
        vst1.16         {d0, d1, d2, d3}, [r7, :128]!
        vld1.16         {d0, d1, d2, d3}, [r6, :128]!
        vst1.16         {d0, d1, d2, d3}, [r7, :128]!
        vld1.16         {d0}, [r6, :64]!
        vst1.16         {d0}, [r7, :64]!
        cmp             r4,  #1
        ble             2f                     @ if (nchannels > 1)
        add             r7,  r2,  #1232        @ &X[1][SBC_X_BUFFER_SIZE - 40]
        add             r6,  r2,  #656
        add             r6,  r6,  r0, lsl#1    @ &X[1][position]
        vld1.16         {d0, d1, d2, d3}, [r6, :128]!
        vst1.16         {d0, d1, d2, d3}, [r7, :128]!
        vld1.16         {d0, d1, d2, d3}, [r6, :128]!
        vst1.16         {d0, d1, d2, d3}, [r7, :128]!
        vld1.16         {d0}, [r6, :64]!
        vst1.16         {d0}, [r7, :64]!
2:
        mov             r0,  #288              @ SBC_X_BUFFER_SIZE - 40
1:

        add             r6,  r2,  r0, lsl#1    @ &X[0][position]
        add             r7,  r6,  #656         @ &X[1][position]

        cmp             r4,  #1
        ble             8f                     @ if (nchannels > 1)
        tst             r1,  #1
        beq             7f                     @ if (pcm & 1)
        @ poor 'pcm' alignment
        vld1.8          {d0, d1}, [r5, :128]
1:
        sub             r6,  r6,  #16
        sub             r7,  r7,  #16
        sub             r0,  r0,  #8
        vld1.8          {d4, d5}, [r1]!
        vuzp.16         d4,  d5
        vld1.8          {d20, d21}, [r1]!
        vuzp.16         d20, d21
        vswp            d5,  d20
        vtbl.8          d16, {d4, d5}, d0
        vtbl.8          d17, {d4, d5}, d1
        vtbl.8          d18, {d20, d21}, d0
        vtbl.8          d19, {d20, d21}, d1
        vst1.16         {d16, d17}, [r6, :128]
        vst1.16         {d18, d19}, [r7, :128]
        subs            r3,  r3,  #8
        bgt             1b
        b               9f
7:
        @ proper 'pcm' alignment
        vld1.8          {d0, d1}, [r5, :128]
1:
        sub             r6,  r6,  #16
        sub             r7,  r7,  #16
        sub             r0,  r0,  #8
        vld2.16         {d4, d5}, [r1]!
        vld2.16         {d20, d21}, [r1]!
        vswp            d5,  d20
        vtbl.8          d16, {d4, d5}, d0
        vtbl.8          d17, {d4, d5}, d1
        vtbl.8          d18, {d20, d21}, d0
        vtbl.8          d19, {d20, d21}, d1
        vst1.16         {d16, d17}, [r6, :128]
        vst1.16         {d18, d19}, [r7, :128]
        subs            r3,  r3,  #8
        bgt             1b
        b               9f
8:
        @ mono
        vld1.8          {d0, d1}, [r5, :128]
1:
        sub             r6,  r6,  #16
        sub             r0,  r0,  #8
        vld1.8          {d4, d5}, [r1]!
        vtbl.8          d16, {d4, d5}, d0
        vtbl.8          d17, {d4, d5}, d1
        vst1.16         {d16, d17}, [r6, :128]
        subs            r3,  r3,  #8
        bgt             1b
9:
        pop             {r1, r3-r7}
        bx              lr
endfunc

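/*
 * 8-subband variant of the input reordering above: 72 samples of history,
 * 16 samples consumed per loop iteration and 32-byte strides through X.
 */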
function ff_sbc_enc_process_input_8s_neon, export=1
        @ parameters
        @ r0 = position
        @ r1 = pcm
        @ r2 = X
        @ r3 = nsamples
        @ r4 = nchannels
        @ local variables
        @ r5 = ff_sbc_input_perm_8
        @ r6 = src
        @ r7 = dst

        push            {r1, r3-r7}
        ldr             r4,  [sp, #24]
        movrelx         r5,  X(ff_sbc_input_perm_8)

        @ handle X buffer wraparound
        cmp             r0,  r3
        bge             1f                     @ if (position < nsamples)
        add             r7,  r2,  #512         @ &X[0][SBC_X_BUFFER_SIZE - 72]
        add             r6,  r2,  r0, lsl#1    @ &X[0][position]
        vld1.16         {d0, d1, d2, d3}, [r6, :128]!
        vst1.16         {d0, d1, d2, d3}, [r7, :128]!
        vld1.16         {d0, d1, d2, d3}, [r6, :128]!
        vst1.16         {d0, d1, d2, d3}, [r7, :128]!
        vld1.16         {d0, d1, d2, d3}, [r6, :128]!
        vst1.16         {d0, d1, d2, d3}, [r7, :128]!
        vld1.16         {d0, d1, d2, d3}, [r6, :128]!
        vst1.16         {d0, d1, d2, d3}, [r7, :128]!
        vld1.16         {d0, d1}, [r6, :128]!
        vst1.16         {d0, d1}, [r7, :128]!
        cmp             r4,  #1
        ble             2f                     @ if (nchannels > 1)
        add             r7,  r2,  #1168        @ &X[1][SBC_X_BUFFER_SIZE - 72]
        add             r6,  r2,  #656
        add             r6,  r6,  r0, lsl#1    @ &X[1][position]
        vld1.16         {d0, d1, d2, d3}, [r6, :128]!
        vst1.16         {d0, d1, d2, d3}, [r7, :128]!
        vld1.16         {d0, d1, d2, d3}, [r6, :128]!
        vst1.16         {d0, d1, d2, d3}, [r7, :128]!
        vld1.16         {d0, d1, d2, d3}, [r6, :128]!
        vst1.16         {d0, d1, d2, d3}, [r7, :128]!
        vld1.16         {d0, d1, d2, d3}, [r6, :128]!
        vst1.16         {d0, d1, d2, d3}, [r7, :128]!
        vld1.16         {d0, d1}, [r6, :128]!
        vst1.16         {d0, d1}, [r7, :128]!
2:
        mov             r0,  #256              @ SBC_X_BUFFER_SIZE - 72
1:

        add             r6,  r2,  r0, lsl#1    @ &X[0][position]
        add             r7,  r6,  #656         @ &X[1][position]

        cmp             r4,  #1
        ble             8f                     @ if (nchannels > 1)
        tst             r1,  #1
        beq             7f                     @ if (pcm & 1)
        @ poor 'pcm' alignment
        vld1.8          {d0, d1, d2, d3}, [r5, :128]
1:
        sub             r6,  r6,  #32
        sub             r7,  r7,  #32
        sub             r0,  r0,  #16
        vld1.8          {d4, d5, d6, d7}, [r1]!
        vuzp.16         q2,  q3
        vld1.8          {d20, d21, d22, d23}, [r1]!
        vuzp.16         q10, q11
        vswp            q3,  q10
        vtbl.8          d16, {d4, d5, d6, d7}, d0
        vtbl.8          d17, {d4, d5, d6, d7}, d1
        vtbl.8          d18, {d4, d5, d6, d7}, d2
        vtbl.8          d19, {d4, d5, d6, d7}, d3
        vst1.16         {d16, d17, d18, d19}, [r6, :128]
        vtbl.8          d16, {d20, d21, d22, d23}, d0
        vtbl.8          d17, {d20, d21, d22, d23}, d1
        vtbl.8          d18, {d20, d21, d22, d23}, d2
        vtbl.8          d19, {d20, d21, d22, d23}, d3
        vst1.16         {d16, d17, d18, d19}, [r7, :128]
        subs            r3,  r3,  #16
        bgt             1b
        b               9f
7:
        @ proper 'pcm' alignment
        vld1.8          {d0, d1, d2, d3}, [r5, :128]
1:
        sub             r6,  r6,  #32
        sub             r7,  r7,  #32
        sub             r0,  r0,  #16
        vld2.16         {d4, d5, d6, d7}, [r1]!
        vld2.16         {d20, d21, d22, d23}, [r1]!
        vswp            q3,  q10
        vtbl.8          d16, {d4, d5, d6, d7}, d0
        vtbl.8          d17, {d4, d5, d6, d7}, d1
        vtbl.8          d18, {d4, d5, d6, d7}, d2
        vtbl.8          d19, {d4, d5, d6, d7}, d3
        vst1.16         {d16, d17, d18, d19}, [r6, :128]
        vtbl.8          d16, {d20, d21, d22, d23}, d0
        vtbl.8          d17, {d20, d21, d22, d23}, d1
        vtbl.8          d18, {d20, d21, d22, d23}, d2
        vtbl.8          d19, {d20, d21, d22, d23}, d3
        vst1.16         {d16, d17, d18, d19}, [r7, :128]
        subs            r3,  r3,  #16
        bgt             1b
        b               9f
8:
        @ mono
        vld1.8          {d0, d1, d2, d3}, [r5, :128]
1:
        sub             r6,  r6,  #32
        sub             r0,  r0,  #16
        vld1.8          {d4, d5, d6, d7}, [r1]!
        vtbl.8          d16, {d4, d5, d6, d7}, d0
        vtbl.8          d17, {d4, d5, d6, d7}, d1
        vtbl.8          d18, {d4, d5, d6, d7}, d2
        vtbl.8          d19, {d4, d5, d6, d7}, d3
        vst1.16         {d16, d17, d18, d19}, [r6, :128]
        subs            r3,  r3,  #16
        bgt             1b
9:
        pop             {r1, r3-r7}
        bx              lr
endfunc