/*
 * ARMv7 NEON optimizations for libjpeg-turbo
 *
 * Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies).
 * All rights reserved.
 * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com>
 * Copyright (C) 2014 Siarhei Siamashka.  All Rights Reserved.
 * Copyright (C) 2014 Linaro Limited.  All Rights Reserved.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits /* mark stack as non-executable */
#endif

.text
.fpu neon
.arch armv7a
.object_arch armv4
.arm


#define RESPECT_STRICT_ALIGNMENT 1


/*****************************************************************************/

/* Supplementary macro for setting function attributes */
.macro asm_function fname
#ifdef __APPLE__
    .globl _\fname
_\fname:
#else
    .global \fname
#ifdef __ELF__
    .hidden \fname
    .type \fname, %function
#endif
\fname:
#endif
.endm

/* Transpose a block of 4x4 coefficients in four 64-bit registers */
.macro transpose_4x4 x0, x1, x2, x3
    vtrn.16 \x0, \x1
    vtrn.16 \x2, \x3
    vtrn.32 \x0, \x2
    vtrn.32 \x1, \x3
.endm
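/*
 * Sketch of the transpose (a descriptive note, not part of the original
 * comments): with the four registers holding rows [a0 a1 a2 a3], [b0 b1 b2 b3],
 * [c0 c1 c2 c3] and [d0 d1 d2 d3], the two VTRN.16 steps transpose each 2x2
 * sub-block and the two VTRN.32 steps swap the sub-blocks, leaving the columns
 * [a0 b0 c0 d0], [a1 b1 c1 d1], [a2 b2 c2 d2] and [a3 b3 c3 d3].
 */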


#define CENTERJSAMPLE 128

/*****************************************************************************/

/*
 * Perform dequantization and inverse DCT on one block of coefficients.
 *
 * GLOBAL(void)
 * jsimd_idct_islow_neon (void * dct_table, JCOEFPTR coef_block,
 *                        JSAMPARRAY output_buf, JDIMENSION output_col)
 */

#define FIX_0_298631336  (2446)
#define FIX_0_390180644  (3196)
#define FIX_0_541196100  (4433)
#define FIX_0_765366865  (6270)
#define FIX_0_899976223  (7373)
#define FIX_1_175875602  (9633)
#define FIX_1_501321110  (12299)
#define FIX_1_847759065  (15137)
#define FIX_1_961570560  (16069)
#define FIX_2_053119869  (16819)
#define FIX_2_562915447  (20995)
#define FIX_3_072711026  (25172)
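/*
 * The constants above use the usual jidctint.c fixed-point representation,
 * i.e. FIX(x) = round(x * 2^13), matching the '<< 13' shifts below.  For
 * example, round(0.541196100 * 8192) = 4433 and round(0.298631336 * 8192)
 * = 2446.
 */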

#define FIX_1_175875602_MINUS_1_961570560 (FIX_1_175875602 - FIX_1_961570560)
#define FIX_1_175875602_MINUS_0_390180644 (FIX_1_175875602 - FIX_0_390180644)
#define FIX_0_541196100_MINUS_1_847759065 (FIX_0_541196100 - FIX_1_847759065)
#define FIX_3_072711026_MINUS_2_562915447 (FIX_3_072711026 - FIX_2_562915447)
#define FIX_0_298631336_MINUS_0_899976223 (FIX_0_298631336 - FIX_0_899976223)
#define FIX_1_501321110_MINUS_0_899976223 (FIX_1_501321110 - FIX_0_899976223)
#define FIX_2_053119869_MINUS_2_562915447 (FIX_2_053119869 - FIX_2_562915447)
#define FIX_0_541196100_PLUS_0_765366865  (FIX_0_541196100 + FIX_0_765366865)

/*
 * Reference SIMD-friendly 1-D ISLOW iDCT C implementation.
 * Uses some ideas from the comments in 'simd/jiss2int-64.asm'
 */
#define REF_1D_IDCT(xrow0, xrow1, xrow2, xrow3, xrow4, xrow5, xrow6, xrow7)   \
{                                                                             \
    DCTELEM row0, row1, row2, row3, row4, row5, row6, row7;                   \
    INT32   q1, q2, q3, q4, q5, q6, q7;                                       \
    INT32   tmp11_plus_tmp2, tmp11_minus_tmp2;                                \
                                                                              \
    /* 1-D iDCT input data */                                                 \
    row0 = xrow0;                                                             \
    row1 = xrow1;                                                             \
    row2 = xrow2;                                                             \
    row3 = xrow3;                                                             \
    row4 = xrow4;                                                             \
    row5 = xrow5;                                                             \
    row6 = xrow6;                                                             \
    row7 = xrow7;                                                             \
                                                                              \
    q5 = row7 + row3;                                                         \
    q4 = row5 + row1;                                                         \
    q6 = MULTIPLY(q5, FIX_1_175875602_MINUS_1_961570560) +                    \
         MULTIPLY(q4, FIX_1_175875602);                                       \
    q7 = MULTIPLY(q5, FIX_1_175875602) +                                      \
         MULTIPLY(q4, FIX_1_175875602_MINUS_0_390180644);                     \
    q2 = MULTIPLY(row2, FIX_0_541196100) +                                    \
         MULTIPLY(row6, FIX_0_541196100_MINUS_1_847759065);                   \
    q4 = q6;                                                                  \
    q3 = ((INT32) row0 - (INT32) row4) << 13;                                 \
    q6 += MULTIPLY(row5, -FIX_2_562915447) +                                  \
          MULTIPLY(row3, FIX_3_072711026_MINUS_2_562915447);                  \
    /* now we can use q1 (reloadable constants have been used up) */          \
    q1 = q3 + q2;                                                             \
    q4 += MULTIPLY(row7, FIX_0_298631336_MINUS_0_899976223) +                 \
          MULTIPLY(row1, -FIX_0_899976223);                                   \
    q5 = q7;                                                                  \
    q1 = q1 + q6;                                                             \
    q7 += MULTIPLY(row7, -FIX_0_899976223) +                                  \
          MULTIPLY(row1, FIX_1_501321110_MINUS_0_899976223);                  \
                                                                              \
    /* (tmp11 + tmp2) has been calculated (out_row1 before descale) */        \
    tmp11_plus_tmp2 = q1;                                                     \
    row1 = 0;                                                                 \
                                                                              \
    q1 = q1 - q6;                                                             \
    q5 += MULTIPLY(row5, FIX_2_053119869_MINUS_2_562915447) +                 \
          MULTIPLY(row3, -FIX_2_562915447);                                   \
    q1 = q1 - q6;                                                             \
    q6 = MULTIPLY(row2, FIX_0_541196100_PLUS_0_765366865) +                   \
         MULTIPLY(row6, FIX_0_541196100);                                     \
    q3 = q3 - q2;                                                             \
                                                                              \
    /* (tmp11 - tmp2) has been calculated (out_row6 before descale) */        \
    tmp11_minus_tmp2 = q1;                                                    \
                                                                              \
    q1 = ((INT32) row0 + (INT32) row4) << 13;                                 \
    q2 = q1 + q6;                                                             \
    q1 = q1 - q6;                                                             \
                                                                              \
    /* pick up the results */                                                 \
    tmp0  = q4;                                                               \
    tmp1  = q5;                                                               \
    tmp2  = (tmp11_plus_tmp2 - tmp11_minus_tmp2) / 2;                         \
    tmp3  = q7;                                                               \
    tmp10 = q2;                                                               \
    tmp11 = (tmp11_plus_tmp2 + tmp11_minus_tmp2) / 2;                         \
    tmp12 = q3;                                                               \
    tmp13 = q1;                                                               \
}
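/*
 * Note (mirroring jidctint.c; documentation only, not used by the code below):
 * the eight output rows are then formed from the tmp values by the final
 * butterfly
 *
 *   out0 = tmp10 + tmp3;   out7 = tmp10 - tmp3;
 *   out1 = tmp11 + tmp2;   out6 = tmp11 - tmp2;
 *   out2 = tmp12 + tmp1;   out5 = tmp12 - tmp1;
 *   out3 = tmp13 + tmp0;   out4 = tmp13 - tmp0;
 *
 * descaled by CONST_BITS - PASS1_BITS = 11 bits in pass 1 (the 'vrshrn #11'
 * instructions) and by CONST_BITS + PASS1_BITS + 3 = 18 bits in pass 2
 * (implemented as 'vshrn #16' followed by the final 'vqrshrn #2').
 */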

#define XFIX_0_899976223                    d0[0]
#define XFIX_0_541196100                    d0[1]
#define XFIX_2_562915447                    d0[2]
#define XFIX_0_298631336_MINUS_0_899976223  d0[3]
#define XFIX_1_501321110_MINUS_0_899976223  d1[0]
#define XFIX_2_053119869_MINUS_2_562915447  d1[1]
#define XFIX_0_541196100_PLUS_0_765366865   d1[2]
#define XFIX_1_175875602                    d1[3]
#define XFIX_1_175875602_MINUS_0_390180644  d2[0]
#define XFIX_0_541196100_MINUS_1_847759065  d2[1]
#define XFIX_3_072711026_MINUS_2_562915447  d2[2]
#define XFIX_1_175875602_MINUS_1_961570560  d2[3]

.balign 16
jsimd_idct_islow_neon_consts:
    .short FIX_0_899976223                    /* d0[0] */
    .short FIX_0_541196100                    /* d0[1] */
    .short FIX_2_562915447                    /* d0[2] */
    .short FIX_0_298631336_MINUS_0_899976223  /* d0[3] */
    .short FIX_1_501321110_MINUS_0_899976223  /* d1[0] */
    .short FIX_2_053119869_MINUS_2_562915447  /* d1[1] */
    .short FIX_0_541196100_PLUS_0_765366865   /* d1[2] */
    .short FIX_1_175875602                    /* d1[3] */
    /* reloadable constants */
    .short FIX_1_175875602_MINUS_0_390180644  /* d2[0] */
    .short FIX_0_541196100_MINUS_1_847759065  /* d2[1] */
    .short FIX_3_072711026_MINUS_2_562915447  /* d2[2] */
    .short FIX_1_175875602_MINUS_1_961570560  /* d2[3] */

asm_function jsimd_idct_islow_neon

    DCT_TABLE       .req r0
    COEF_BLOCK      .req r1
    OUTPUT_BUF      .req r2
    OUTPUT_COL      .req r3
    TMP1            .req r0
    TMP2            .req r1
    TMP3            .req r2
    TMP4            .req ip

    ROW0L           .req d16
    ROW0R           .req d17
    ROW1L           .req d18
    ROW1R           .req d19
    ROW2L           .req d20
    ROW2R           .req d21
    ROW3L           .req d22
    ROW3R           .req d23
    ROW4L           .req d24
    ROW4R           .req d25
    ROW5L           .req d26
    ROW5R           .req d27
    ROW6L           .req d28
    ROW6R           .req d29
    ROW7L           .req d30
    ROW7R           .req d31

    /* Load and dequantize coefficients into NEON registers
     * with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0 | d16     | d17     ( q8  )
     *   1 | d18     | d19     ( q9  )
     *   2 | d20     | d21     ( q10 )
     *   3 | d22     | d23     ( q11 )
     *   4 | d24     | d25     ( q12 )
     *   5 | d26     | d27     ( q13 )
     *   6 | d28     | d29     ( q14 )
     *   7 | d30     | d31     ( q15 )
     */
    adr             ip, jsimd_idct_islow_neon_consts
    vld1.16         {d16, d17, d18, d19}, [COEF_BLOCK, :128]!
    vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
    vld1.16         {d20, d21, d22, d23}, [COEF_BLOCK, :128]!
    vmul.s16        q8, q8, q0
    vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
    vmul.s16        q9, q9, q1
    vld1.16         {d24, d25, d26, d27}, [COEF_BLOCK, :128]!
    vmul.s16        q10, q10, q2
    vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
    vmul.s16        q11, q11, q3
    vld1.16         {d28, d29, d30, d31}, [COEF_BLOCK, :128]
    vmul.s16        q12, q12, q0
    vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
    vmul.s16        q14, q14, q2
    vmul.s16        q13, q13, q1
    vld1.16         {d0, d1, d2, d3}, [ip, :128] /* load constants */
    add             ip, ip, #16
    vmul.s16        q15, q15, q3
    vpush           {d8-d15} /* save NEON registers */
    /* 1-D IDCT, pass 1, left 4x8 half */
    vadd.s16        d4,    ROW7L, ROW3L
    vadd.s16        d5,    ROW5L, ROW1L
    vmull.s16       q6,    d4,    XFIX_1_175875602_MINUS_1_961570560
    vmlal.s16       q6,    d5,    XFIX_1_175875602
    vmull.s16       q7,    d4,    XFIX_1_175875602
      /* Check for the zero coefficients in the right 4x8 half */
      push            {r4, r5}
    vmlal.s16       q7,    d5,    XFIX_1_175875602_MINUS_0_390180644
    vsubl.s16       q3,    ROW0L, ROW4L
      ldrd            r4,    [COEF_BLOCK, #(-96 + 2 * (4 + 1 * 8))]
    vmull.s16       q2,    ROW2L, XFIX_0_541196100
    vmlal.s16       q2,    ROW6L, XFIX_0_541196100_MINUS_1_847759065
      orr             r0,    r4,    r5
    vmov            q4,    q6
    vmlsl.s16       q6,    ROW5L, XFIX_2_562915447
      ldrd            r4,    [COEF_BLOCK, #(-96 + 2 * (4 + 2 * 8))]
    vmlal.s16       q6,    ROW3L, XFIX_3_072711026_MINUS_2_562915447
    vshl.s32        q3,    q3,    #13
      orr             r0,    r0,    r4
    vmlsl.s16       q4,    ROW1L, XFIX_0_899976223
      orr             r0,    r0,    r5
    vadd.s32        q1,    q3,    q2
      ldrd            r4,    [COEF_BLOCK, #(-96 + 2 * (4 + 3 * 8))]
    vmov            q5,    q7
    vadd.s32        q1,    q1,    q6
      orr             r0,    r0,    r4
    vmlsl.s16       q7,    ROW7L, XFIX_0_899976223
      orr             r0,    r0,    r5
    vmlal.s16       q7,    ROW1L, XFIX_1_501321110_MINUS_0_899976223
    vrshrn.s32      ROW1L, q1,    #11
      ldrd            r4,    [COEF_BLOCK, #(-96 + 2 * (4 + 4 * 8))]
    vsub.s32        q1,    q1,    q6
    vmlal.s16       q5,    ROW5L, XFIX_2_053119869_MINUS_2_562915447
      orr             r0,    r0,    r4
    vmlsl.s16       q5,    ROW3L, XFIX_2_562915447
      orr             r0,    r0,    r5
    vsub.s32        q1,    q1,    q6
    vmull.s16       q6,    ROW2L, XFIX_0_541196100_PLUS_0_765366865
      ldrd            r4,    [COEF_BLOCK, #(-96 + 2 * (4 + 5 * 8))]
    vmlal.s16       q6,    ROW6L, XFIX_0_541196100
    vsub.s32        q3,    q3,    q2
      orr             r0,    r0,    r4
    vrshrn.s32      ROW6L, q1,    #11
      orr             r0,    r0,    r5
    vadd.s32        q1,    q3,    q5
      ldrd            r4,    [COEF_BLOCK, #(-96 + 2 * (4 + 6 * 8))]
    vsub.s32        q3,    q3,    q5
    vaddl.s16       q5,    ROW0L, ROW4L
      orr             r0,    r0,    r4
    vrshrn.s32      ROW2L, q1,    #11
      orr             r0,    r0,    r5
    vrshrn.s32      ROW5L, q3,    #11
      ldrd            r4,    [COEF_BLOCK, #(-96 + 2 * (4 + 7 * 8))]
    vshl.s32        q5,    q5,    #13
    vmlal.s16       q4,    ROW7L, XFIX_0_298631336_MINUS_0_899976223
      orr             r0,    r0,    r4
    vadd.s32        q2,    q5,    q6
      orrs            r0,    r0,    r5
    vsub.s32        q1,    q5,    q6
    vadd.s32        q6,    q2,    q7
      ldrd            r4,    [COEF_BLOCK, #(-96 + 2 * (4 + 0 * 8))]
    vsub.s32        q2,    q2,    q7
    vadd.s32        q5,    q1,    q4
      orr             r0,    r4,    r5
    vsub.s32        q3,    q1,    q4
      pop             {r4, r5}
    vrshrn.s32      ROW7L, q2,    #11
    vrshrn.s32      ROW3L, q5,    #11
    vrshrn.s32      ROW0L, q6,    #11
    vrshrn.s32      ROW4L, q3,    #11

      beq             3f /* Go to do some special handling for the sparse right 4x8 half */

    /* 1-D IDCT, pass 1, right 4x8 half */
    vld1.s16        {d2},  [ip, :64]    /* reload constants */
    vadd.s16        d10,   ROW7R, ROW3R
    vadd.s16        d8,    ROW5R, ROW1R
      /* Transpose left 4x8 half */
      vtrn.16         ROW6L, ROW7L
    vmull.s16       q6,    d10,   XFIX_1_175875602_MINUS_1_961570560
    vmlal.s16       q6,    d8,    XFIX_1_175875602
      vtrn.16         ROW2L, ROW3L
    vmull.s16       q7,    d10,   XFIX_1_175875602
    vmlal.s16       q7,    d8,    XFIX_1_175875602_MINUS_0_390180644
      vtrn.16         ROW0L, ROW1L
    vsubl.s16       q3,    ROW0R, ROW4R
    vmull.s16       q2,    ROW2R, XFIX_0_541196100
    vmlal.s16       q2,    ROW6R, XFIX_0_541196100_MINUS_1_847759065
      vtrn.16         ROW4L, ROW5L
    vmov            q4,    q6
    vmlsl.s16       q6,    ROW5R, XFIX_2_562915447
    vmlal.s16       q6,    ROW3R, XFIX_3_072711026_MINUS_2_562915447
      vtrn.32         ROW1L, ROW3L
    vshl.s32        q3,    q3,    #13
    vmlsl.s16       q4,    ROW1R, XFIX_0_899976223
      vtrn.32         ROW4L, ROW6L
    vadd.s32        q1,    q3,    q2
    vmov            q5,    q7
    vadd.s32        q1,    q1,    q6
      vtrn.32         ROW0L, ROW2L
    vmlsl.s16       q7,    ROW7R, XFIX_0_899976223
    vmlal.s16       q7,    ROW1R, XFIX_1_501321110_MINUS_0_899976223
    vrshrn.s32      ROW1R, q1,    #11
      vtrn.32         ROW5L, ROW7L
    vsub.s32        q1,    q1,    q6
    vmlal.s16       q5,    ROW5R, XFIX_2_053119869_MINUS_2_562915447
    vmlsl.s16       q5,    ROW3R, XFIX_2_562915447
    vsub.s32        q1,    q1,    q6
    vmull.s16       q6,    ROW2R, XFIX_0_541196100_PLUS_0_765366865
    vmlal.s16       q6,    ROW6R, XFIX_0_541196100
    vsub.s32        q3,    q3,    q2
    vrshrn.s32      ROW6R, q1,    #11
    vadd.s32        q1,    q3,    q5
    vsub.s32        q3,    q3,    q5
    vaddl.s16       q5,    ROW0R, ROW4R
    vrshrn.s32      ROW2R, q1,    #11
    vrshrn.s32      ROW5R, q3,    #11
    vshl.s32        q5,    q5,    #13
    vmlal.s16       q4,    ROW7R, XFIX_0_298631336_MINUS_0_899976223
    vadd.s32        q2,    q5,    q6
    vsub.s32        q1,    q5,    q6
    vadd.s32        q6,    q2,    q7
    vsub.s32        q2,    q2,    q7
    vadd.s32        q5,    q1,    q4
    vsub.s32        q3,    q1,    q4
    vrshrn.s32      ROW7R, q2,    #11
    vrshrn.s32      ROW3R, q5,    #11
    vrshrn.s32      ROW0R, q6,    #11
    vrshrn.s32      ROW4R, q3,    #11
    /* Transpose right 4x8 half */
    vtrn.16         ROW6R, ROW7R
    vtrn.16         ROW2R, ROW3R
    vtrn.16         ROW0R, ROW1R
    vtrn.16         ROW4R, ROW5R
    vtrn.32         ROW1R, ROW3R
    vtrn.32         ROW4R, ROW6R
    vtrn.32         ROW0R, ROW2R
    vtrn.32         ROW5R, ROW7R

1:  /* 1-D IDCT, pass 2 (normal variant), left 4x8 half */
    vld1.s16        {d2},  [ip, :64]    /* reload constants */
    vmull.s16       q6,    ROW1R, XFIX_1_175875602 /* ROW5L <-> ROW1R */
    vmlal.s16       q6,    ROW1L, XFIX_1_175875602
    vmlal.s16       q6,    ROW3R, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L <-> ROW3R */
    vmlal.s16       q6,    ROW3L, XFIX_1_175875602_MINUS_1_961570560
    vmull.s16       q7,    ROW3R, XFIX_1_175875602 /* ROW7L <-> ROW3R */
    vmlal.s16       q7,    ROW3L, XFIX_1_175875602
    vmlal.s16       q7,    ROW1R, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L <-> ROW1R */
    vmlal.s16       q7,    ROW1L, XFIX_1_175875602_MINUS_0_390180644
    vsubl.s16       q3,    ROW0L, ROW0R /* ROW4L <-> ROW0R */
    vmull.s16       q2,    ROW2L, XFIX_0_541196100
    vmlal.s16       q2,    ROW2R, XFIX_0_541196100_MINUS_1_847759065 /* ROW6L <-> ROW2R */
    vmov            q4,    q6
    vmlsl.s16       q6,    ROW1R, XFIX_2_562915447 /* ROW5L <-> ROW1R */
    vmlal.s16       q6,    ROW3L, XFIX_3_072711026_MINUS_2_562915447
    vshl.s32        q3,    q3,    #13
    vmlsl.s16       q4,    ROW1L, XFIX_0_899976223
    vadd.s32        q1,    q3,    q2
    vmov            q5,    q7
    vadd.s32        q1,    q1,    q6
    vmlsl.s16       q7,    ROW3R, XFIX_0_899976223 /* ROW7L <-> ROW3R */
    vmlal.s16       q7,    ROW1L, XFIX_1_501321110_MINUS_0_899976223
    vshrn.s32       ROW1L, q1,    #16
    vsub.s32        q1,    q1,    q6
    vmlal.s16       q5,    ROW1R, XFIX_2_053119869_MINUS_2_562915447 /* ROW5L <-> ROW1R */
    vmlsl.s16       q5,    ROW3L, XFIX_2_562915447
    vsub.s32        q1,    q1,    q6
    vmull.s16       q6,    ROW2L, XFIX_0_541196100_PLUS_0_765366865
    vmlal.s16       q6,    ROW2R, XFIX_0_541196100 /* ROW6L <-> ROW2R */
    vsub.s32        q3,    q3,    q2
    vshrn.s32       ROW2R, q1,    #16 /* ROW6L <-> ROW2R */
    vadd.s32        q1,    q3,    q5
    vsub.s32        q3,    q3,    q5
    vaddl.s16       q5,    ROW0L, ROW0R /* ROW4L <-> ROW0R */
    vshrn.s32       ROW2L, q1,    #16
    vshrn.s32       ROW1R, q3,    #16 /* ROW5L <-> ROW1R */
    vshl.s32        q5,    q5,    #13
    vmlal.s16       q4,    ROW3R, XFIX_0_298631336_MINUS_0_899976223 /* ROW7L <-> ROW3R */
    vadd.s32        q2,    q5,    q6
    vsub.s32        q1,    q5,    q6
    vadd.s32        q6,    q2,    q7
    vsub.s32        q2,    q2,    q7
    vadd.s32        q5,    q1,    q4
    vsub.s32        q3,    q1,    q4
    vshrn.s32       ROW3R, q2,    #16 /* ROW7L <-> ROW3R */
    vshrn.s32       ROW3L, q5,    #16
    vshrn.s32       ROW0L, q6,    #16
    vshrn.s32       ROW0R, q3,    #16 /* ROW4L <-> ROW0R */
    /* 1-D IDCT, pass 2, right 4x8 half */
    vld1.s16        {d2},  [ip, :64]    /* reload constants */
    vmull.s16       q6,    ROW5R, XFIX_1_175875602
    vmlal.s16       q6,    ROW5L, XFIX_1_175875602 /* ROW5L <-> ROW1R */
    vmlal.s16       q6,    ROW7R, XFIX_1_175875602_MINUS_1_961570560
    vmlal.s16       q6,    ROW7L, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L <-> ROW3R */
    vmull.s16       q7,    ROW7R, XFIX_1_175875602
    vmlal.s16       q7,    ROW7L, XFIX_1_175875602 /* ROW7L <-> ROW3R */
    vmlal.s16       q7,    ROW5R, XFIX_1_175875602_MINUS_0_390180644
    vmlal.s16       q7,    ROW5L, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L <-> ROW1R */
    vsubl.s16       q3,    ROW4L, ROW4R /* ROW4L <-> ROW0R */
    vmull.s16       q2,    ROW6L, XFIX_0_541196100 /* ROW6L <-> ROW2R */
    vmlal.s16       q2,    ROW6R, XFIX_0_541196100_MINUS_1_847759065
    vmov            q4,    q6
    vmlsl.s16       q6,    ROW5R, XFIX_2_562915447
    vmlal.s16       q6,    ROW7L, XFIX_3_072711026_MINUS_2_562915447 /* ROW7L <-> ROW3R */
    vshl.s32        q3,    q3,    #13
    vmlsl.s16       q4,    ROW5L, XFIX_0_899976223 /* ROW5L <-> ROW1R */
    vadd.s32        q1,    q3,    q2
    vmov            q5,    q7
    vadd.s32        q1,    q1,    q6
    vmlsl.s16       q7,    ROW7R, XFIX_0_899976223
    vmlal.s16       q7,    ROW5L, XFIX_1_501321110_MINUS_0_899976223 /* ROW5L <-> ROW1R */
    vshrn.s32       ROW5L, q1,    #16 /* ROW5L <-> ROW1R */
    vsub.s32        q1,    q1,    q6
    vmlal.s16       q5,    ROW5R, XFIX_2_053119869_MINUS_2_562915447
    vmlsl.s16       q5,    ROW7L, XFIX_2_562915447 /* ROW7L <-> ROW3R */
    vsub.s32        q1,    q1,    q6
    vmull.s16       q6,    ROW6L, XFIX_0_541196100_PLUS_0_765366865 /* ROW6L <-> ROW2R */
    vmlal.s16       q6,    ROW6R, XFIX_0_541196100
    vsub.s32        q3,    q3,    q2
    vshrn.s32       ROW6R, q1,    #16
    vadd.s32        q1,    q3,    q5
    vsub.s32        q3,    q3,    q5
    vaddl.s16       q5,    ROW4L, ROW4R /* ROW4L <-> ROW0R */
    vshrn.s32       ROW6L, q1,    #16 /* ROW6L <-> ROW2R */
    vshrn.s32       ROW5R, q3,    #16
    vshl.s32        q5,    q5,    #13
    vmlal.s16       q4,    ROW7R, XFIX_0_298631336_MINUS_0_899976223
    vadd.s32        q2,    q5,    q6
    vsub.s32        q1,    q5,    q6
    vadd.s32        q6,    q2,    q7
    vsub.s32        q2,    q2,    q7
    vadd.s32        q5,    q1,    q4
    vsub.s32        q3,    q1,    q4
    vshrn.s32       ROW7R, q2,    #16
    vshrn.s32       ROW7L, q5,    #16 /* ROW7L <-> ROW3R */
    vshrn.s32       ROW4L, q6,    #16 /* ROW4L <-> ROW0R */
    vshrn.s32       ROW4R, q3,    #16

2:  /* Descale to 8-bit and range limit */
    vqrshrn.s16     d16,   q8,    #2
    vqrshrn.s16     d17,   q9,    #2
    vqrshrn.s16     d18,   q10,   #2
    vqrshrn.s16     d19,   q11,   #2
    vpop            {d8-d15} /* restore NEON registers */
    vqrshrn.s16     d20,   q12,   #2
      /* Transpose the final 8-bit samples and do signed->unsigned conversion */
      vtrn.16         q8,    q9
    vqrshrn.s16     d21,   q13,   #2
    vqrshrn.s16     d22,   q14,   #2
      vmov.u8         q0,    #(CENTERJSAMPLE)
    vqrshrn.s16     d23,   q15,   #2
      vtrn.8          d16,   d17
      vtrn.8          d18,   d19
      vadd.u8         q8,    q8,    q0
      vadd.u8         q9,    q9,    q0
      vtrn.16         q10,   q11
        /* Store results to the output buffer */
        ldmia           OUTPUT_BUF!, {TMP1, TMP2}
        add             TMP1, TMP1, OUTPUT_COL
        add             TMP2, TMP2, OUTPUT_COL
        vst1.8          {d16}, [TMP1]
      vtrn.8          d20, d21
        vst1.8          {d17}, [TMP2]
        ldmia           OUTPUT_BUF!, {TMP1, TMP2}
        add             TMP1, TMP1, OUTPUT_COL
        add             TMP2, TMP2, OUTPUT_COL
        vst1.8          {d18}, [TMP1]
      vadd.u8         q10,   q10,   q0
        vst1.8          {d19}, [TMP2]
        ldmia           OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
        add             TMP1, TMP1, OUTPUT_COL
        add             TMP2, TMP2, OUTPUT_COL
        add             TMP3, TMP3, OUTPUT_COL
        add             TMP4, TMP4, OUTPUT_COL
      vtrn.8          d22, d23
        vst1.8          {d20}, [TMP1]
      vadd.u8         q11,   q11,   q0
        vst1.8          {d21}, [TMP2]
        vst1.8          {d22}, [TMP3]
        vst1.8          {d23}, [TMP4]
    bx              lr

3:  /* Left 4x8 half is done, right 4x8 half contains mostly zeros */

    /* Transpose left 4x8 half */
    vtrn.16         ROW6L, ROW7L
    vtrn.16         ROW2L, ROW3L
    vtrn.16         ROW0L, ROW1L
    vtrn.16         ROW4L, ROW5L
    vshl.s16        ROW0R, ROW0R, #2 /* PASS1_BITS */
    vtrn.32         ROW1L, ROW3L
    vtrn.32         ROW4L, ROW6L
    vtrn.32         ROW0L, ROW2L
    vtrn.32         ROW5L, ROW7L

    cmp             r0, #0
    beq             4f /* Right 4x8 half has all zeros, go to 'sparse' second pass */

    /* Only row 0 is non-zero for the right 4x8 half */
    vdup.s16        ROW1R, ROW0R[1]
    vdup.s16        ROW2R, ROW0R[2]
    vdup.s16        ROW3R, ROW0R[3]
    vdup.s16        ROW4R, ROW0R[0]
    vdup.s16        ROW5R, ROW0R[1]
    vdup.s16        ROW6R, ROW0R[2]
    vdup.s16        ROW7R, ROW0R[3]
    vdup.s16        ROW0R, ROW0R[0]
    b               1b /* Go to 'normal' second pass */

4:  /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), left 4x8 half */
    vld1.s16        {d2},  [ip, :64]    /* reload constants */
    vmull.s16       q6,    ROW1L, XFIX_1_175875602
    vmlal.s16       q6,    ROW3L, XFIX_1_175875602_MINUS_1_961570560
    vmull.s16       q7,    ROW3L, XFIX_1_175875602
    vmlal.s16       q7,    ROW1L, XFIX_1_175875602_MINUS_0_390180644
    vmull.s16       q2,    ROW2L, XFIX_0_541196100
    vshll.s16       q3,    ROW0L, #13
    vmov            q4,    q6
    vmlal.s16       q6,    ROW3L, XFIX_3_072711026_MINUS_2_562915447
    vmlsl.s16       q4,    ROW1L, XFIX_0_899976223
    vadd.s32        q1,    q3,    q2
    vmov            q5,    q7
    vmlal.s16       q7,    ROW1L, XFIX_1_501321110_MINUS_0_899976223
    vadd.s32        q1,    q1,    q6
    vadd.s32        q6,    q6,    q6
    vmlsl.s16       q5,    ROW3L, XFIX_2_562915447
    vshrn.s32       ROW1L, q1,    #16
    vsub.s32        q1,    q1,    q6
    vmull.s16       q6,    ROW2L, XFIX_0_541196100_PLUS_0_765366865
    vsub.s32        q3,    q3,    q2
    vshrn.s32       ROW2R, q1,    #16 /* ROW6L <-> ROW2R */
    vadd.s32        q1,    q3,    q5
    vsub.s32        q3,    q3,    q5
    vshll.s16       q5,    ROW0L, #13
    vshrn.s32       ROW2L, q1,    #16
    vshrn.s32       ROW1R, q3,    #16 /* ROW5L <-> ROW1R */
    vadd.s32        q2,    q5,    q6
    vsub.s32        q1,    q5,    q6
    vadd.s32        q6,    q2,    q7
    vsub.s32        q2,    q2,    q7
    vadd.s32        q5,    q1,    q4
    vsub.s32        q3,    q1,    q4
    vshrn.s32       ROW3R, q2,    #16 /* ROW7L <-> ROW3R */
    vshrn.s32       ROW3L, q5,    #16
    vshrn.s32       ROW0L, q6,    #16
    vshrn.s32       ROW0R, q3,    #16 /* ROW4L <-> ROW0R */
    /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), right 4x8 half */
    vld1.s16        {d2},  [ip, :64]    /* reload constants */
    vmull.s16       q6,    ROW5L, XFIX_1_175875602
    vmlal.s16       q6,    ROW7L, XFIX_1_175875602_MINUS_1_961570560
    vmull.s16       q7,    ROW7L, XFIX_1_175875602
    vmlal.s16       q7,    ROW5L, XFIX_1_175875602_MINUS_0_390180644
    vmull.s16       q2,    ROW6L, XFIX_0_541196100
    vshll.s16       q3,    ROW4L, #13
    vmov            q4,    q6
    vmlal.s16       q6,    ROW7L, XFIX_3_072711026_MINUS_2_562915447
    vmlsl.s16       q4,    ROW5L, XFIX_0_899976223
    vadd.s32        q1,    q3,    q2
    vmov            q5,    q7
    vmlal.s16       q7,    ROW5L, XFIX_1_501321110_MINUS_0_899976223
    vadd.s32        q1,    q1,    q6
    vadd.s32        q6,    q6,    q6
    vmlsl.s16       q5,    ROW7L, XFIX_2_562915447
    vshrn.s32       ROW5L, q1,    #16 /* ROW5L <-> ROW1R */
    vsub.s32        q1,    q1,    q6
    vmull.s16       q6,    ROW6L, XFIX_0_541196100_PLUS_0_765366865
    vsub.s32        q3,    q3,    q2
    vshrn.s32       ROW6R, q1,    #16
    vadd.s32        q1,    q3,    q5
    vsub.s32        q3,    q3,    q5
    vshll.s16       q5,    ROW4L, #13
    vshrn.s32       ROW6L, q1,    #16 /* ROW6L <-> ROW2R */
    vshrn.s32       ROW5R, q3,    #16
    vadd.s32        q2,    q5,    q6
    vsub.s32        q1,    q5,    q6
    vadd.s32        q6,    q2,    q7
    vsub.s32        q2,    q2,    q7
    vadd.s32        q5,    q1,    q4
    vsub.s32        q3,    q1,    q4
    vshrn.s32       ROW7R, q2,    #16
    vshrn.s32       ROW7L, q5,    #16 /* ROW7L <-> ROW3R */
    vshrn.s32       ROW4L, q6,    #16 /* ROW4L <-> ROW0R */
    vshrn.s32       ROW4R, q3,    #16
    b               2b /* Go to epilogue */

    .unreq          DCT_TABLE
    .unreq          COEF_BLOCK
    .unreq          OUTPUT_BUF
    .unreq          OUTPUT_COL
    .unreq          TMP1
    .unreq          TMP2
    .unreq          TMP3
    .unreq          TMP4

    .unreq          ROW0L
    .unreq          ROW0R
    .unreq          ROW1L
    .unreq          ROW1R
    .unreq          ROW2L
    .unreq          ROW2R
    .unreq          ROW3L
    .unreq          ROW3R
    .unreq          ROW4L
    .unreq          ROW4R
    .unreq          ROW5L
    .unreq          ROW5R
    .unreq          ROW6L
    .unreq          ROW6R
    .unreq          ROW7L
    .unreq          ROW7R


/*****************************************************************************/

/*
 * jsimd_idct_ifast_neon
 *
 * This function contains a fast, but less accurate, integer implementation of
 * the inverse DCT (Discrete Cosine Transform). It uses the same calculations
 * and produces exactly the same output as IJG's original 'jpeg_idct_ifast'
 * function from jidctfst.c.
 *
 * Normally a 1-D AAN DCT needs 5 multiplications and 29 additions.
 * However, in the ARM NEON case some extra additions are required, because
 * the VQDMULH instruction can't handle constants larger than 1. So expressions
 * like "x * 1.082392200" have to be converted to "x * 0.082392200 + x",
 * which introduces an extra addition. Overall, there are 6 extra additions
 * per 1-D IDCT pass, for a total of 5 VQDMULH and 35 VADD/VSUB instructions.
 */
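/*
 * A reference C sketch of the decomposition described above (this sketch is
 * not part of the original sources; VQDMULH.S16 behaves roughly like
 * (a * b) >> 15 with saturation, so it can only apply multipliers below 1.0):
 *
 *   static inline int16_t mul_1_414213562 (int16_t x)
 *   {
 *     int16_t frac = (int16_t) (((int32_t) x * 13568) >> 15);  // x * ~0.414
 *     return (int16_t) (x + frac);                // x * ~1.414 == x + frac
 *   }
 *
 *   static inline int16_t mul_2_613125930 (int16_t x)
 *   {
 *     int16_t frac = (int16_t) (((int32_t) x * 20096) >> 15);  // x * ~0.613
 *     return (int16_t) (x + x + frac);            // x * ~2.613 == 2*x + frac
 *   }
 *
 * 13568 and 20096 are exactly the XFIX_1_414213562 and XFIX_2_613125930 table
 * entries below, i.e. (362 - 256) * 128 and (669 - 512) * 128.
 */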

#define XFIX_1_082392200 d0[0]
#define XFIX_1_414213562 d0[1]
#define XFIX_1_847759065 d0[2]
#define XFIX_2_613125930 d0[3]

.balign 16
jsimd_idct_ifast_neon_consts:
    .short (277 * 128 - 256 * 128) /* XFIX_1_082392200 */
    .short (362 * 128 - 256 * 128) /* XFIX_1_414213562 */
    .short (473 * 128 - 256 * 128) /* XFIX_1_847759065 */
    .short (669 * 128 - 512 * 128) /* XFIX_2_613125930 */

asm_function jsimd_idct_ifast_neon

    DCT_TABLE       .req r0
    COEF_BLOCK      .req r1
    OUTPUT_BUF      .req r2
    OUTPUT_COL      .req r3
    TMP1            .req r0
    TMP2            .req r1
    TMP3            .req r2
    TMP4            .req ip

    /* Load and dequantize coefficients into NEON registers
     * with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0 | d16     | d17     ( q8  )
     *   1 | d18     | d19     ( q9  )
     *   2 | d20     | d21     ( q10 )
     *   3 | d22     | d23     ( q11 )
     *   4 | d24     | d25     ( q12 )
     *   5 | d26     | d27     ( q13 )
     *   6 | d28     | d29     ( q14 )
     *   7 | d30     | d31     ( q15 )
     */
    adr             ip, jsimd_idct_ifast_neon_consts
    vld1.16         {d16, d17, d18, d19}, [COEF_BLOCK, :128]!
    vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
    vld1.16         {d20, d21, d22, d23}, [COEF_BLOCK, :128]!
    vmul.s16        q8,  q8,  q0
    vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
    vmul.s16        q9,  q9,  q1
    vld1.16         {d24, d25, d26, d27}, [COEF_BLOCK, :128]!
    vmul.s16        q10, q10, q2
    vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
    vmul.s16        q11, q11, q3
    vld1.16         {d28, d29, d30, d31}, [COEF_BLOCK, :128]
    vmul.s16        q12, q12, q0
    vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
    vmul.s16        q14, q14, q2
    vmul.s16        q13, q13, q1
    vld1.16         {d0}, [ip, :64] /* load constants */
    vmul.s16        q15, q15, q3
    vpush           {d8-d13}        /* save NEON registers */
    /* 1-D IDCT, pass 1 */
    vsub.s16        q2,  q10, q14
    vadd.s16        q14, q10, q14
    vsub.s16        q1,  q11, q13
    vadd.s16        q13, q11, q13
    vsub.s16        q5,  q9,  q15
    vadd.s16        q15, q9,  q15
    vqdmulh.s16     q4,  q2,  XFIX_1_414213562
    vqdmulh.s16     q6,  q1,  XFIX_2_613125930
    vadd.s16        q3,  q1,  q1
    vsub.s16        q1,  q5,  q1
    vadd.s16        q10, q2,  q4
    vqdmulh.s16     q4,  q1,  XFIX_1_847759065
    vsub.s16        q2,  q15, q13
    vadd.s16        q3,  q3,  q6
    vqdmulh.s16     q6,  q2,  XFIX_1_414213562
    vadd.s16        q1,  q1,  q4
    vqdmulh.s16     q4,  q5,  XFIX_1_082392200
    vsub.s16        q10, q10, q14
    vadd.s16        q2,  q2,  q6
    vsub.s16        q6,  q8,  q12
    vadd.s16        q12, q8,  q12
    vadd.s16        q9,  q5,  q4
    vadd.s16        q5,  q6,  q10
    vsub.s16        q10, q6,  q10
    vadd.s16        q6,  q15, q13
    vadd.s16        q8,  q12, q14
    vsub.s16        q3,  q6,  q3
    vsub.s16        q12, q12, q14
    vsub.s16        q3,  q3,  q1
    vsub.s16        q1,  q9,  q1
    vadd.s16        q2,  q3,  q2
    vsub.s16        q15, q8,  q6
    vadd.s16        q1,  q1,  q2
    vadd.s16        q8,  q8,  q6
    vadd.s16        q14, q5,  q3
    vsub.s16        q9,  q5,  q3
    vsub.s16        q13, q10, q2
    vadd.s16        q10, q10, q2
      /* Transpose */
      vtrn.16         q8,  q9
    vsub.s16        q11, q12, q1
      vtrn.16         q14, q15
    vadd.s16        q12, q12, q1
      vtrn.16         q10, q11
      vtrn.16         q12, q13
      vtrn.32         q9,  q11
      vtrn.32         q12, q14
      vtrn.32         q8,  q10
      vtrn.32         q13, q15
      vswp            d28, d21
      vswp            d26, d19
    /* 1-D IDCT, pass 2 */
    vsub.s16        q2,  q10, q14
      vswp            d30, d23
    vadd.s16        q14, q10, q14
      vswp            d24, d17
    vsub.s16        q1,  q11, q13
    vadd.s16        q13, q11, q13
    vsub.s16        q5,  q9,  q15
    vadd.s16        q15, q9,  q15
    vqdmulh.s16     q4,  q2,  XFIX_1_414213562
    vqdmulh.s16     q6,  q1,  XFIX_2_613125930
    vadd.s16        q3,  q1,  q1
    vsub.s16        q1,  q5,  q1
    vadd.s16        q10, q2,  q4
    vqdmulh.s16     q4,  q1,  XFIX_1_847759065
    vsub.s16        q2,  q15, q13
    vadd.s16        q3,  q3,  q6
    vqdmulh.s16     q6,  q2,  XFIX_1_414213562
    vadd.s16        q1,  q1,  q4
    vqdmulh.s16     q4,  q5,  XFIX_1_082392200
    vsub.s16        q10, q10, q14
    vadd.s16        q2,  q2,  q6
    vsub.s16        q6,  q8,  q12
    vadd.s16        q12, q8,  q12
    vadd.s16        q9,  q5,  q4
    vadd.s16        q5,  q6,  q10
    vsub.s16        q10, q6,  q10
    vadd.s16        q6,  q15, q13
    vadd.s16        q8,  q12, q14
    vsub.s16        q3,  q6,  q3
    vsub.s16        q12, q12, q14
    vsub.s16        q3,  q3,  q1
    vsub.s16        q1,  q9,  q1
    vadd.s16        q2,  q3,  q2
    vsub.s16        q15, q8,  q6
    vadd.s16        q1,  q1,  q2
    vadd.s16        q8,  q8,  q6
    vadd.s16        q14, q5,  q3
    vsub.s16        q9,  q5,  q3
    vsub.s16        q13, q10, q2
    vpop            {d8-d13}        /* restore NEON registers */
    vadd.s16        q10, q10, q2
    vsub.s16        q11, q12, q1
    vadd.s16        q12, q12, q1
    /* Descale to 8-bit and range limit */
    vmov.u8         q0,  #0x80
    vqshrn.s16      d16, q8,  #5
    vqshrn.s16      d17, q9,  #5
    vqshrn.s16      d18, q10, #5
    vqshrn.s16      d19, q11, #5
    vqshrn.s16      d20, q12, #5
    vqshrn.s16      d21, q13, #5
    vqshrn.s16      d22, q14, #5
    vqshrn.s16      d23, q15, #5
    vadd.u8         q8,  q8,  q0
    vadd.u8         q9,  q9,  q0
    vadd.u8         q10, q10, q0
    vadd.u8         q11, q11, q0
    /* Transpose the final 8-bit samples */
    vtrn.16         q8,  q9
    vtrn.16         q10, q11
    vtrn.32         q8,  q10
    vtrn.32         q9,  q11
    vtrn.8          d16, d17
    vtrn.8          d18, d19
      /* Store results to the output buffer */
      ldmia           OUTPUT_BUF!, {TMP1, TMP2}
      add             TMP1, TMP1, OUTPUT_COL
      add             TMP2, TMP2, OUTPUT_COL
      vst1.8          {d16}, [TMP1]
      vst1.8          {d17}, [TMP2]
      ldmia           OUTPUT_BUF!, {TMP1, TMP2}
      add             TMP1, TMP1, OUTPUT_COL
      add             TMP2, TMP2, OUTPUT_COL
      vst1.8          {d18}, [TMP1]
    vtrn.8          d20, d21
      vst1.8          {d19}, [TMP2]
      ldmia           OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
      add             TMP1, TMP1, OUTPUT_COL
      add             TMP2, TMP2, OUTPUT_COL
      add             TMP3, TMP3, OUTPUT_COL
      add             TMP4, TMP4, OUTPUT_COL
      vst1.8          {d20}, [TMP1]
    vtrn.8          d22, d23
      vst1.8          {d21}, [TMP2]
      vst1.8          {d22}, [TMP3]
      vst1.8          {d23}, [TMP4]
    bx              lr

    .unreq          DCT_TABLE
    .unreq          COEF_BLOCK
    .unreq          OUTPUT_BUF
    .unreq          OUTPUT_COL
    .unreq          TMP1
    .unreq          TMP2
    .unreq          TMP3
    .unreq          TMP4


/*****************************************************************************/

/*
 * jsimd_idct_4x4_neon
 *
 * This function contains inverse-DCT code for getting a reduced-size
 * 4x4 pixel output from an 8x8 DCT block. It uses the same calculations
 * and produces exactly the same output as IJG's original 'jpeg_idct_4x4'
 * function from jpeg-6b (jidctred.c).
 *
 * NOTE: jpeg-8 has an improved implementation of the 4x4 inverse DCT, which
 *       requires far fewer arithmetic operations and hence should be faster.
 *       The primary purpose of this particular NEON-optimized function is
 *       bit-exact compatibility with jpeg-6b.
 *
 * TODO: Slightly better instruction scheduling could be achieved by expanding
 *       the idct_helper/transpose_4x4 macros and reordering instructions,
 *       but readability would suffer somewhat.
 */
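/*
 * The expected C prototype (an assumption based on the register usage below
 * and on the jsimd_idct_islow_neon prototype above):
 *
 * GLOBAL(void)
 * jsimd_idct_4x4_neon (void * dct_table, JCOEFPTR coef_block,
 *                      JSAMPARRAY output_buf, JDIMENSION output_col)
 *
 * Only a 4x4 block of samples is written to the output buffer.
 */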

#define CONST_BITS  13

#define FIX_0_211164243  (1730)  /* FIX(0.211164243) */
#define FIX_0_509795579  (4176)  /* FIX(0.509795579) */
#define FIX_0_601344887  (4926)  /* FIX(0.601344887) */
#define FIX_0_720959822  (5906)  /* FIX(0.720959822) */
#define FIX_0_765366865  (6270)  /* FIX(0.765366865) */
#define FIX_0_850430095  (6967)  /* FIX(0.850430095) */
#define FIX_0_899976223  (7373)  /* FIX(0.899976223) */
#define FIX_1_061594337  (8697)  /* FIX(1.061594337) */
#define FIX_1_272758580  (10426) /* FIX(1.272758580) */
#define FIX_1_451774981  (11893) /* FIX(1.451774981) */
#define FIX_1_847759065  (15137) /* FIX(1.847759065) */
#define FIX_2_172734803  (17799) /* FIX(2.172734803) */
#define FIX_2_562915447  (20995) /* FIX(2.562915447) */
#define FIX_3_624509785  (29692) /* FIX(3.624509785) */

.balign 16
jsimd_idct_4x4_neon_consts:
    .short     FIX_1_847759065     /* d0[0] */
    .short     -FIX_0_765366865    /* d0[1] */
    .short     -FIX_0_211164243    /* d0[2] */
    .short     FIX_1_451774981     /* d0[3] */
    .short     -FIX_2_172734803    /* d1[0] */
    .short     FIX_1_061594337     /* d1[1] */
    .short     -FIX_0_509795579    /* d1[2] */
    .short     -FIX_0_601344887    /* d1[3] */
    .short     FIX_0_899976223     /* d2[0] */
    .short     FIX_2_562915447     /* d2[1] */
    .short     1 << (CONST_BITS+1) /* d2[2] */
    .short     0                   /* d2[3] */

.macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29
    vmull.s16       q14, \x4,  d2[2]
    vmlal.s16       q14, \x8,  d0[0]
    vmlal.s16       q14, \x14, d0[1]

    vmull.s16       q13, \x16, d1[2]
    vmlal.s16       q13, \x12, d1[3]
    vmlal.s16       q13, \x10, d2[0]
    vmlal.s16       q13, \x6,  d2[1]

    vmull.s16       q15, \x4,  d2[2]
    vmlsl.s16       q15, \x8,  d0[0]
    vmlsl.s16       q15, \x14, d0[1]

    vmull.s16       q12, \x16, d0[2]
    vmlal.s16       q12, \x12, d0[3]
    vmlal.s16       q12, \x10, d1[0]
    vmlal.s16       q12, \x6,  d1[1]

    vadd.s32        q10, q14, q13
    vsub.s32        q14, q14, q13

.if \shift > 16
    vrshr.s32       q10,  q10, #\shift
    vrshr.s32       q14,  q14, #\shift
    vmovn.s32       \y26, q10
    vmovn.s32       \y29, q14
.else
    vrshrn.s32      \y26, q10, #\shift
    vrshrn.s32      \y29, q14, #\shift
.endif

    vadd.s32        q10, q15, q12
    vsub.s32        q15, q15, q12

.if \shift > 16
    vrshr.s32       q10,  q10, #\shift
    vrshr.s32       q15,  q15, #\shift
    vmovn.s32       \y27, q10
    vmovn.s32       \y28, q15
.else
    vrshrn.s32      \y27, q10, #\shift
    vrshrn.s32      \y28, q15, #\shift
.endif

.endm
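/*
 * Note on the '.if \shift > 16' branches above (a descriptive comment, not in
 * the original): VRSHRN can shift a 32-bit lane right by at most 16 bits while
 * narrowing to 16 bits, so the larger pass 2 descale has to be split into a
 * full-width VRSHR followed by a separate VMOVN.
 */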

asm_function jsimd_idct_4x4_neon

    DCT_TABLE       .req r0
    COEF_BLOCK      .req r1
    OUTPUT_BUF      .req r2
    OUTPUT_COL      .req r3
    TMP1            .req r0
    TMP2            .req r1
    TMP3            .req r2
    TMP4            .req ip

    vpush           {d8-d15}

    /* Load constants (d3 is just used for padding) */
    adr             TMP4, jsimd_idct_4x4_neon_consts
    vld1.16         {d0, d1, d2, d3}, [TMP4, :128]

    /* Load all COEF_BLOCK into NEON registers with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0 | d4      | d5
     *   1 | d6      | d7
     *   2 | d8      | d9
     *   3 | d10     | d11
     *   4 | -       | -
     *   5 | d12     | d13
     *   6 | d14     | d15
     *   7 | d16     | d17
     */
    vld1.16         {d4, d5, d6, d7}, [COEF_BLOCK, :128]!
    vld1.16         {d8, d9, d10, d11}, [COEF_BLOCK, :128]!
    add COEF_BLOCK, COEF_BLOCK, #16
    vld1.16         {d12, d13, d14, d15}, [COEF_BLOCK, :128]!
    vld1.16         {d16, d17}, [COEF_BLOCK, :128]!
    /* dequantize */
    vld1.16         {d18, d19, d20, d21}, [DCT_TABLE, :128]!
    vmul.s16        q2, q2, q9
    vld1.16         {d22, d23, d24, d25}, [DCT_TABLE, :128]!
    vmul.s16        q3, q3, q10
    vmul.s16        q4, q4, q11
    add             DCT_TABLE, DCT_TABLE, #16
    vld1.16         {d26, d27, d28, d29}, [DCT_TABLE, :128]!
    vmul.s16        q5, q5, q12
    vmul.s16        q6, q6, q13
    vld1.16         {d30, d31}, [DCT_TABLE, :128]!
    vmul.s16        q7, q7, q14
    vmul.s16        q8, q8, q15

    /* Pass 1 */
    idct_helper     d4, d6, d8, d10, d12, d14, d16, 12, d4, d6, d8, d10
    transpose_4x4   d4, d6, d8, d10
    idct_helper     d5, d7, d9, d11, d13, d15, d17, 12, d5, d7, d9, d11
    transpose_4x4   d5, d7, d9, d11

    /* Pass 2 */
    idct_helper     d4, d6, d8, d10, d7, d9, d11, 19, d26, d27, d28, d29
    transpose_4x4   d26, d27, d28, d29

    /* Range limit */
    vmov.u16        q15, #0x80
    vadd.s16        q13, q13, q15
    vadd.s16        q14, q14, q15
    vqmovun.s16     d26, q13
    vqmovun.s16     d27, q14

    /* Store results to the output buffer */
    ldmia           OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
    add             TMP1, TMP1, OUTPUT_COL
    add             TMP2, TMP2, OUTPUT_COL
    add             TMP3, TMP3, OUTPUT_COL
    add             TMP4, TMP4, OUTPUT_COL

#if defined(__ARMEL__) && !RESPECT_STRICT_ALIGNMENT
    /* We can use far fewer instructions on little-endian systems if the
     * OS kernel is not configured to trap unaligned memory accesses.
     */
    vst1.32         {d26[0]}, [TMP1]!
    vst1.32         {d27[0]}, [TMP3]!
    vst1.32         {d26[1]}, [TMP2]!
    vst1.32         {d27[1]}, [TMP4]!
#else
    vst1.8          {d26[0]}, [TMP1]!
    vst1.8          {d27[0]}, [TMP3]!
    vst1.8          {d26[1]}, [TMP1]!
    vst1.8          {d27[1]}, [TMP3]!
    vst1.8          {d26[2]}, [TMP1]!
    vst1.8          {d27[2]}, [TMP3]!
    vst1.8          {d26[3]}, [TMP1]!
    vst1.8          {d27[3]}, [TMP3]!

    vst1.8          {d26[4]}, [TMP2]!
    vst1.8          {d27[4]}, [TMP4]!
    vst1.8          {d26[5]}, [TMP2]!
    vst1.8          {d27[5]}, [TMP4]!
    vst1.8          {d26[6]}, [TMP2]!
    vst1.8          {d27[6]}, [TMP4]!
    vst1.8          {d26[7]}, [TMP2]!
    vst1.8          {d27[7]}, [TMP4]!
#endif

    vpop            {d8-d15}
    bx              lr

    .unreq          DCT_TABLE
    .unreq          COEF_BLOCK
    .unreq          OUTPUT_BUF
    .unreq          OUTPUT_COL
    .unreq          TMP1
    .unreq          TMP2
    .unreq          TMP3
    .unreq          TMP4

.purgem idct_helper


/*****************************************************************************/

/*
 * jsimd_idct_2x2_neon
 *
 * This function contains inverse-DCT code for getting a reduced-size
 * 2x2 pixel output from an 8x8 DCT block. It uses the same calculations
 * and produces exactly the same output as IJG's original 'jpeg_idct_2x2'
 * function from jpeg-6b (jidctred.c).
 *
 * NOTE: jpeg-8 has an improved implementation of the 2x2 inverse DCT, which
 *       requires far fewer arithmetic operations and hence should be faster.
 *       The primary purpose of this particular NEON-optimized function is
 *       bit-exact compatibility with jpeg-6b.
 */
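/*
 * The expected C prototype (an assumption based on the register usage below
 * and on the jsimd_idct_islow_neon prototype above):
 *
 * GLOBAL(void)
 * jsimd_idct_2x2_neon (void * dct_table, JCOEFPTR coef_block,
 *                      JSAMPARRAY output_buf, JDIMENSION output_col)
 *
 * Only a 2x2 block of samples is written to the output buffer.
 */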

.balign 8
jsimd_idct_2x2_neon_consts:
    .short     -FIX_0_720959822    /* d0[0] */
    .short     FIX_0_850430095     /* d0[1] */
    .short     -FIX_1_272758580    /* d0[2] */
    .short     FIX_3_624509785     /* d0[3] */

.macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27
    vshll.s16  q14,  \x4,  #15
    vmull.s16  q13,  \x6,  d0[3]
    vmlal.s16  q13,  \x10, d0[2]
    vmlal.s16  q13,  \x12, d0[1]
    vmlal.s16  q13,  \x16, d0[0]

    vadd.s32   q10,  q14,  q13
    vsub.s32   q14,  q14,  q13

.if \shift > 16
    vrshr.s32  q10,  q10,  #\shift
    vrshr.s32  q14,  q14,  #\shift
    vmovn.s32  \y26, q10
    vmovn.s32  \y27, q14
.else
    vrshrn.s32 \y26, q10,  #\shift
    vrshrn.s32 \y27, q14,  #\shift
.endif

.endm

asm_function jsimd_idct_2x2_neon

    DCT_TABLE       .req r0
    COEF_BLOCK      .req r1
    OUTPUT_BUF      .req r2
    OUTPUT_COL      .req r3
    TMP1            .req r0
    TMP2            .req ip

    vpush           {d8-d15}

    /* Load constants */
    adr             TMP2, jsimd_idct_2x2_neon_consts
    vld1.16         {d0}, [TMP2, :64]

    /* Load all COEF_BLOCK into NEON registers with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0 | d4      | d5
     *   1 | d6      | d7
     *   2 | -       | -
     *   3 | d10     | d11
     *   4 | -       | -
     *   5 | d12     | d13
     *   6 | -       | -
     *   7 | d16     | d17
     */
    vld1.16         {d4, d5, d6, d7}, [COEF_BLOCK, :128]!
    add             COEF_BLOCK, COEF_BLOCK, #16
    vld1.16         {d10, d11}, [COEF_BLOCK, :128]!
    add             COEF_BLOCK, COEF_BLOCK, #16
    vld1.16         {d12, d13}, [COEF_BLOCK, :128]!
    add             COEF_BLOCK, COEF_BLOCK, #16
    vld1.16         {d16, d17}, [COEF_BLOCK, :128]!
    /* Dequantize */
    vld1.16         {d18, d19, d20, d21}, [DCT_TABLE, :128]!
    vmul.s16        q2, q2, q9
    vmul.s16        q3, q3, q10
    add             DCT_TABLE, DCT_TABLE, #16
    vld1.16         {d24, d25}, [DCT_TABLE, :128]!
    vmul.s16        q5, q5, q12
    add             DCT_TABLE, DCT_TABLE, #16
    vld1.16         {d26, d27}, [DCT_TABLE, :128]!
    vmul.s16        q6, q6, q13
    add             DCT_TABLE, DCT_TABLE, #16
    vld1.16         {d30, d31}, [DCT_TABLE, :128]!
    vmul.s16        q8, q8, q15

    /* Pass 1 */
#if 0
    idct_helper     d4, d6, d10, d12, d16, 13, d4, d6
    transpose_4x4   d4, d6, d8,  d10
    idct_helper     d5, d7, d11, d13, d17, 13, d5, d7
    transpose_4x4   d5, d7, d9,  d11
#else
    vmull.s16       q13, d6,  d0[3]
    vmlal.s16       q13, d10, d0[2]
    vmlal.s16       q13, d12, d0[1]
    vmlal.s16       q13, d16, d0[0]
    vmull.s16       q12, d7,  d0[3]
    vmlal.s16       q12, d11, d0[2]
    vmlal.s16       q12, d13, d0[1]
    vmlal.s16       q12, d17, d0[0]
    vshll.s16       q14, d4,  #15
    vshll.s16       q15, d5,  #15
    vadd.s32        q10, q14, q13
    vsub.s32        q14, q14, q13
    vrshrn.s32      d4,  q10, #13
    vrshrn.s32      d6,  q14, #13
    vadd.s32        q10, q15, q12
    vsub.s32        q14, q15, q12
    vrshrn.s32      d5,  q10, #13
    vrshrn.s32      d7,  q14, #13
    vtrn.16         q2,  q3
    vtrn.32         q3,  q5
#endif

    /* Pass 2 */
    idct_helper     d4, d6, d10, d7, d11, 20, d26, d27

    /* Range limit */
    vmov.u16        q15, #0x80
    vadd.s16        q13, q13, q15
    vqmovun.s16     d26, q13
    vqmovun.s16     d27, q13

    /* Store results to the output buffer */
    ldmia           OUTPUT_BUF, {TMP1, TMP2}
    add             TMP1, TMP1, OUTPUT_COL
    add             TMP2, TMP2, OUTPUT_COL

    vst1.8          {d26[0]}, [TMP1]!
    vst1.8          {d27[4]}, [TMP1]!
    vst1.8          {d26[1]}, [TMP2]!
    vst1.8          {d27[5]}, [TMP2]!

    vpop            {d8-d15}
    bx              lr

    .unreq          DCT_TABLE
    .unreq          COEF_BLOCK
    .unreq          OUTPUT_BUF
    .unreq          OUTPUT_COL
    .unreq          TMP1
    .unreq          TMP2

.purgem idct_helper


/*****************************************************************************/

/*
 * jsimd_ycc_extrgb_convert_neon
 * jsimd_ycc_extbgr_convert_neon
 * jsimd_ycc_extrgbx_convert_neon
 * jsimd_ycc_extbgrx_convert_neon
 * jsimd_ycc_extxbgr_convert_neon
 * jsimd_ycc_extxrgb_convert_neon
 *
 * Colorspace conversion YCbCr -> RGB
 */
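/*
 * For reference, a C sketch of the per-pixel conversion implemented by the
 * stage1/stage2 macros below (this sketch is not part of the original file;
 * the constants come from the "multiply by ..." comments in stage 1 and match
 * the usual JFIF definition R = Y + 1.402*(Cr-128),
 * G = Y - 0.34414*(Cb-128) - 0.71414*(Cr-128), B = Y + 1.772*(Cb-128)):
 *
 *   static unsigned char clamp (int x)
 *   {
 *     return (unsigned char) (x < 0 ? 0 : (x > 255 ? 255 : x));
 *   }
 *
 *   static void ycc_to_rgb (int y, int cb, int cr, unsigned char *r,
 *                           unsigned char *g, unsigned char *b)
 *   {
 *     int u = cb - 128, v = cr - 128;
 *     *r = clamp(y + ((22971 * v + 8192) >> 14));                // ~1.402
 *     *g = clamp(y + ((-11277 * u - 23401 * v + 16384) >> 15));  // ~-0.344/-0.714
 *     *b = clamp(y + ((29033 * u + 8192) >> 14));                // ~1.772
 *   }
 */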


.macro do_load size
    .if \size == 8
        vld1.8  {d4}, [U, :64]!
        vld1.8  {d5}, [V, :64]!
        vld1.8  {d0}, [Y, :64]!
        pld     [U, #64]
        pld     [V, #64]
        pld     [Y, #64]
    .elseif \size == 4
        vld1.8  {d4[0]}, [U]!
        vld1.8  {d4[1]}, [U]!
        vld1.8  {d4[2]}, [U]!
        vld1.8  {d4[3]}, [U]!
        vld1.8  {d5[0]}, [V]!
        vld1.8  {d5[1]}, [V]!
        vld1.8  {d5[2]}, [V]!
        vld1.8  {d5[3]}, [V]!
        vld1.8  {d0[0]}, [Y]!
        vld1.8  {d0[1]}, [Y]!
        vld1.8  {d0[2]}, [Y]!
        vld1.8  {d0[3]}, [Y]!
    .elseif \size == 2
        vld1.8  {d4[4]}, [U]!
        vld1.8  {d4[5]}, [U]!
        vld1.8  {d5[4]}, [V]!
        vld1.8  {d5[5]}, [V]!
        vld1.8  {d0[4]}, [Y]!
        vld1.8  {d0[5]}, [Y]!
    .elseif \size == 1
        vld1.8  {d4[6]}, [U]!
        vld1.8  {d5[6]}, [V]!
        vld1.8  {d0[6]}, [Y]!
    .else
        .error unsupported macroblock size
    .endif
.endm

.macro do_store bpp, size
    .if \bpp == 24
        .if \size == 8
            vst3.8  {d10, d11, d12}, [RGB]!
        .elseif \size == 4
            vst3.8  {d10[0], d11[0], d12[0]}, [RGB]!
            vst3.8  {d10[1], d11[1], d12[1]}, [RGB]!
            vst3.8  {d10[2], d11[2], d12[2]}, [RGB]!
            vst3.8  {d10[3], d11[3], d12[3]}, [RGB]!
        .elseif \size == 2
            vst3.8  {d10[4], d11[4], d12[4]}, [RGB]!
            vst3.8  {d10[5], d11[5], d12[5]}, [RGB]!
        .elseif \size == 1
            vst3.8  {d10[6], d11[6], d12[6]}, [RGB]!
        .else
            .error unsupported macroblock size
        .endif
    .elseif \bpp == 32
        .if \size == 8
            vst4.8  {d10, d11, d12, d13}, [RGB]!
        .elseif \size == 4
            vst4.8  {d10[0], d11[0], d12[0], d13[0]}, [RGB]!
            vst4.8  {d10[1], d11[1], d12[1], d13[1]}, [RGB]!
            vst4.8  {d10[2], d11[2], d12[2], d13[2]}, [RGB]!
            vst4.8  {d10[3], d11[3], d12[3], d13[3]}, [RGB]!
        .elseif \size == 2
            vst4.8  {d10[4], d11[4], d12[4], d13[4]}, [RGB]!
            vst4.8  {d10[5], d11[5], d12[5], d13[5]}, [RGB]!
        .elseif \size == 1
            vst4.8  {d10[6], d11[6], d12[6], d13[6]}, [RGB]!
        .else
            .error unsupported macroblock size
        .endif
    .elseif \bpp == 16
        .if \size == 8
            vst1.16  {q15}, [RGB]!
        .elseif \size == 4
            vst1.16  {d30}, [RGB]!
        .elseif \size == 2
            vst1.16  {d31[0]}, [RGB]!
            vst1.16  {d31[1]}, [RGB]!
        .elseif \size == 1
            vst1.16  {d31[2]}, [RGB]!
        .else
            .error unsupported macroblock size
        .endif
    .else
        .error unsupported bpp
    .endif
.endm

.macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, g_offs, b_offs

/*
 * 2 stage pipelined YCbCr->RGB conversion
 */

.macro do_yuv_to_rgb_stage1
    vaddw.u8        q3, q1, d4     /* q3 = u - 128 */
1377    vaddw.u8        q4, q1, d5     /* q4 = v - 128 */
1378    vmull.s16       q10, d6, d1[1] /* multiply by -11277 */
1379    vmlal.s16       q10, d8, d1[2] /* multiply by -23401 */
1380    vmull.s16       q11, d7, d1[1] /* multiply by -11277 */
1381    vmlal.s16       q11, d9, d1[2] /* multiply by -23401 */
1382    vmull.s16       q12, d8, d1[0] /* multiply by 22971 */
1383    vmull.s16       q13, d9, d1[0] /* multiply by 22971 */
1384    vmull.s16       q14, d6, d1[3] /* multiply by 29033 */
1385    vmull.s16       q15, d7, d1[3] /* multiply by 29033 */
1386.endm
1387
1388.macro do_yuv_to_rgb_stage2
1389    vrshrn.s32      d20, q10, #15
1390    vrshrn.s32      d21, q11, #15
1391    vrshrn.s32      d24, q12, #14
1392    vrshrn.s32      d25, q13, #14
1393    vrshrn.s32      d28, q14, #14
1394    vrshrn.s32      d29, q15, #14
1395    vaddw.u8        q11, q10, d0
1396    vaddw.u8        q12, q12, d0
1397    vaddw.u8        q14, q14, d0
1398.if \bpp != 16
1399    vqmovun.s16     d1\g_offs, q11
1400    vqmovun.s16     d1\r_offs, q12
1401    vqmovun.s16     d1\b_offs, q14
1402.else /* rgb565 */
1403    vqshlu.s16      q13, q11, #8
1404    vqshlu.s16      q15, q12, #8
1405    vqshlu.s16      q14, q14, #8
1406    vsri.u16        q15, q13, #5
1407    vsri.u16        q15, q14, #11
1408.endif
1409.endm
1410
1411.macro do_yuv_to_rgb_stage2_store_load_stage1
1412                                       /* "do_yuv_to_rgb_stage2" and "store" */
1413                                       vrshrn.s32      d20, q10, #15
1414    /* "load" and "do_yuv_to_rgb_stage1" */
1415    pld             [U, #64]
1416                                       vrshrn.s32      d21, q11, #15
1417    pld             [V, #64]
1418                                       vrshrn.s32      d24, q12, #14
1419                                       vrshrn.s32      d25, q13, #14
1420    vld1.8          {d4}, [U, :64]!
1421                                       vrshrn.s32      d28, q14, #14
1422    vld1.8          {d5}, [V, :64]!
1423                                       vrshrn.s32      d29, q15, #14
1424    vaddw.u8        q3, q1, d4     /* q3 = u - 128 */
1425    vaddw.u8        q4, q1, d5     /* q4 = v - 128 */
1426                                       vaddw.u8        q11, q10, d0
1427    vmull.s16       q10, d6, d1[1] /* multiply by -11277 */
1428    vmlal.s16       q10, d8, d1[2] /* multiply by -23401 */
1429                                       vaddw.u8        q12, q12, d0
1430                                       vaddw.u8        q14, q14, d0
1431.if \bpp != 16 /**************** rgb24/rgb32 *********************************/
1432                                       vqmovun.s16     d1\g_offs, q11
1433    pld             [Y, #64]
1434                                       vqmovun.s16     d1\r_offs, q12
1435    vld1.8          {d0}, [Y, :64]!
1436                                       vqmovun.s16     d1\b_offs, q14
1437    vmull.s16       q11, d7, d1[1] /* multiply by -11277 */
1438    vmlal.s16       q11, d9, d1[2] /* multiply by -23401 */
1439                                       do_store        \bpp, 8
1440    vmull.s16       q12, d8, d1[0] /* multiply by 22971 */
1441    vmull.s16       q13, d9, d1[0] /* multiply by 22971 */
1442    vmull.s16       q14, d6, d1[3] /* multiply by 29033 */
1443    vmull.s16       q15, d7, d1[3] /* multiply by 29033 */
1444.else /**************************** rgb565 ***********************************/
1445                                       vqshlu.s16      q13, q11, #8
1446    pld             [Y, #64]
1447                                       vqshlu.s16      q15, q12, #8
1448                                       vqshlu.s16      q14, q14, #8
1449    vld1.8          {d0}, [Y, :64]!
1450    vmull.s16       q11, d7, d1[1]
1451    vmlal.s16       q11, d9, d1[2]
1452                                       vsri.u16        q15, q13, #5
1453    vmull.s16       q12, d8, d1[0]
1454                                       vsri.u16        q15, q14, #11
1455    vmull.s16       q13, d9, d1[0]
1456    vmull.s16       q14, d6, d1[3]
1457                                       do_store        \bpp, 8
1458    vmull.s16       q15, d7, d1[3]
1459.endif
1460.endm
1461
1462.macro do_yuv_to_rgb
1463    do_yuv_to_rgb_stage1
1464    do_yuv_to_rgb_stage2
1465.endm
1466
1467/* Apple's gas crashes on adrl; work around that by using adr instead.
1468 * This, however, requires a copy of these constants for each function.
1469 */
1470
1471.balign 16
1472jsimd_ycc_\colorid\()_neon_consts:
1473    .short          0,      0,     0,      0
1474    .short          22971, -11277, -23401, 29033
1475    .short          -128,  -128,   -128,   -128
1476    .short          -128,  -128,   -128,   -128
1477
1478asm_function jsimd_ycc_\colorid\()_convert_neon
1479    OUTPUT_WIDTH    .req r0
1480    INPUT_BUF       .req r1
1481    INPUT_ROW       .req r2
1482    OUTPUT_BUF      .req r3
1483    NUM_ROWS        .req r4
1484
1485    INPUT_BUF0      .req r5
1486    INPUT_BUF1      .req r6
1487    INPUT_BUF2      .req INPUT_BUF
1488
1489    RGB             .req r7
1490    Y               .req r8
1491    U               .req r9
1492    V               .req r10
1493    N               .req ip
1494
1495    /* Load constants to d1, d2, d3 (d0 is just used for padding) */
1496    adr             ip, jsimd_ycc_\colorid\()_neon_consts
1497    vld1.16         {d0, d1, d2, d3}, [ip, :128]
1498
1499    /* Save ARM registers and handle input arguments */
1500    push            {r4, r5, r6, r7, r8, r9, r10, lr}
1501    ldr             NUM_ROWS, [sp, #(4 * 8)]
1502    ldr             INPUT_BUF0, [INPUT_BUF]
1503    ldr             INPUT_BUF1, [INPUT_BUF, #4]
1504    ldr             INPUT_BUF2, [INPUT_BUF, #8]
1505    .unreq          INPUT_BUF
1506
1507    /* Save NEON registers */
1508    vpush           {d8-d15}
1509
1510    /* Initially set d10, d11, d12, d13 to 0xFF */
1511    vmov.u8         q5, #255
1512    vmov.u8         q6, #255
1513
1514    /* Outer loop over scanlines */
1515    cmp             NUM_ROWS, #1
1516    blt             9f
15170:
1518    ldr             Y, [INPUT_BUF0, INPUT_ROW, lsl #2]
1519    ldr             U, [INPUT_BUF1, INPUT_ROW, lsl #2]
1520    mov             N, OUTPUT_WIDTH
1521    ldr             V, [INPUT_BUF2, INPUT_ROW, lsl #2]
1522    add             INPUT_ROW, INPUT_ROW, #1
1523    ldr             RGB, [OUTPUT_BUF], #4
1524
1525    /* Inner loop over pixels */
1526    subs            N, N, #8
1527    blt             3f
1528    do_load         8
1529    do_yuv_to_rgb_stage1
1530    subs            N, N, #8
1531    blt             2f
15321:
1533    do_yuv_to_rgb_stage2_store_load_stage1
1534    subs            N, N, #8
1535    bge             1b
15362:
1537    do_yuv_to_rgb_stage2
1538    do_store        \bpp, 8
1539    tst             N, #7
1540    beq             8f
15413:
1542    tst             N, #4
1543    beq             3f
1544    do_load         4
15453:
1546    tst             N, #2
1547    beq             4f
1548    do_load         2
15494:
1550    tst             N, #1
1551    beq             5f
1552    do_load         1
15535:
1554    do_yuv_to_rgb
1555    tst             N, #4
1556    beq             6f
1557    do_store        \bpp, 4
15586:
1559    tst             N, #2
1560    beq             7f
1561    do_store        \bpp, 2
15627:
1563    tst             N, #1
1564    beq             8f
1565    do_store        \bpp, 1
15668:
1567    subs            NUM_ROWS, NUM_ROWS, #1
1568    bgt             0b
15699:
1570    /* Restore all registers and return */
1571    vpop            {d8-d15}
1572    pop             {r4, r5, r6, r7, r8, r9, r10, pc}
1573
1574    .unreq          OUTPUT_WIDTH
1575    .unreq          INPUT_ROW
1576    .unreq          OUTPUT_BUF
1577    .unreq          NUM_ROWS
1578    .unreq          INPUT_BUF0
1579    .unreq          INPUT_BUF1
1580    .unreq          INPUT_BUF2
1581    .unreq          RGB
1582    .unreq          Y
1583    .unreq          U
1584    .unreq          V
1585    .unreq          N
1586
1587.purgem do_yuv_to_rgb
1588.purgem do_yuv_to_rgb_stage1
1589.purgem do_yuv_to_rgb_stage2
1590.purgem do_yuv_to_rgb_stage2_store_load_stage1
1591
1592.endm
1593
1594/*--------------------------------- id ----- bpp R  G  B */
1595generate_jsimd_ycc_rgb_convert_neon extrgb,  24, 0, 1, 2
1596generate_jsimd_ycc_rgb_convert_neon extbgr,  24, 2, 1, 0
1597generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, 1, 2
1598generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, 1, 0
1599generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, 2, 1
1600generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, 2, 3
1601generate_jsimd_ycc_rgb_convert_neon rgb565,  16, 0, 0, 0
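
/*
 * The R/G/B columns above are the byte offsets of each color component
 * within one output pixel; e.g. for extbgr the blue sample is stored at
 * offset 0 and the red sample at offset 2.
 */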
1602
1603.purgem do_load
1604.purgem do_store
1605
1606
1607/*****************************************************************************/
1608
1609/*
1610 * jsimd_extrgb_ycc_convert_neon
1611 * jsimd_extbgr_ycc_convert_neon
1612 * jsimd_extrgbx_ycc_convert_neon
1613 * jsimd_extbgrx_ycc_convert_neon
1614 * jsimd_extxbgr_ycc_convert_neon
1615 * jsimd_extxrgb_ycc_convert_neon
1616 *
1617 * Colorspace conversion RGB -> YCbCr
1618 */
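
/*
 * For reference, a scalar C sketch of the fixed-point arithmetic these
 * functions implement (an illustration only; the constants and rounding
 * biases match the coefficient table used below, with <stdint.h> types):
 *
 *   static void rgb_to_ycc_pixel(uint8_t r, uint8_t g, uint8_t b,
 *                                uint8_t *y, uint8_t *cb, uint8_t *cr)
 *   {
 *     *y  = (uint8_t)((19595 * r + 38470 * g +  7471 * b + 32768) >> 16);
 *     *cb = (uint8_t)((32767 + (128 << 16) - 11059 * r - 21709 * g + 32768 * b) >> 16);
 *     *cr = (uint8_t)((32767 + (128 << 16) + 32768 * r - 27439 * g -  5329 * b) >> 16);
 *   }
 */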
1619
1620.macro do_store size
1621    .if \size == 8
1622        vst1.8  {d20}, [Y]!
1623        vst1.8  {d21}, [U]!
1624        vst1.8  {d22}, [V]!
1625    .elseif \size == 4
1626        vst1.8  {d20[0]}, [Y]!
1627        vst1.8  {d20[1]}, [Y]!
1628        vst1.8  {d20[2]}, [Y]!
1629        vst1.8  {d20[3]}, [Y]!
1630        vst1.8  {d21[0]}, [U]!
1631        vst1.8  {d21[1]}, [U]!
1632        vst1.8  {d21[2]}, [U]!
1633        vst1.8  {d21[3]}, [U]!
1634        vst1.8  {d22[0]}, [V]!
1635        vst1.8  {d22[1]}, [V]!
1636        vst1.8  {d22[2]}, [V]!
1637        vst1.8  {d22[3]}, [V]!
1638    .elseif \size == 2
1639        vst1.8  {d20[4]}, [Y]!
1640        vst1.8  {d20[5]}, [Y]!
1641        vst1.8  {d21[4]}, [U]!
1642        vst1.8  {d21[5]}, [U]!
1643        vst1.8  {d22[4]}, [V]!
1644        vst1.8  {d22[5]}, [V]!
1645    .elseif \size == 1
1646        vst1.8  {d20[6]}, [Y]!
1647        vst1.8  {d21[6]}, [U]!
1648        vst1.8  {d22[6]}, [V]!
1649    .else
1650        .error unsupported macroblock size
1651    .endif
1652.endm
1653
1654.macro do_load bpp, size
1655    .if \bpp == 24
1656        .if \size == 8
1657            vld3.8  {d10, d11, d12}, [RGB]!
1658            pld     [RGB, #128]
1659        .elseif \size == 4
1660            vld3.8  {d10[0], d11[0], d12[0]}, [RGB]!
1661            vld3.8  {d10[1], d11[1], d12[1]}, [RGB]!
1662            vld3.8  {d10[2], d11[2], d12[2]}, [RGB]!
1663            vld3.8  {d10[3], d11[3], d12[3]}, [RGB]!
1664        .elseif \size == 2
1665            vld3.8  {d10[4], d11[4], d12[4]}, [RGB]!
1666            vld3.8  {d10[5], d11[5], d12[5]}, [RGB]!
1667        .elseif \size == 1
1668            vld3.8  {d10[6], d11[6], d12[6]}, [RGB]!
1669        .else
1670            .error unsupported macroblock size
1671        .endif
1672    .elseif \bpp == 32
1673        .if \size == 8
1674            vld4.8  {d10, d11, d12, d13}, [RGB]!
1675            pld     [RGB, #128]
1676        .elseif \size == 4
1677            vld4.8  {d10[0], d11[0], d12[0], d13[0]}, [RGB]!
1678            vld4.8  {d10[1], d11[1], d12[1], d13[1]}, [RGB]!
1679            vld4.8  {d10[2], d11[2], d12[2], d13[2]}, [RGB]!
1680            vld4.8  {d10[3], d11[3], d12[3], d13[3]}, [RGB]!
1681        .elseif \size == 2
1682            vld4.8  {d10[4], d11[4], d12[4], d13[4]}, [RGB]!
1683            vld4.8  {d10[5], d11[5], d12[5], d13[5]}, [RGB]!
1684        .elseif \size == 1
1685            vld4.8  {d10[6], d11[6], d12[6], d13[6]}, [RGB]!
1686        .else
1687            .error unsupported macroblock size
1688        .endif
1689    .else
1690        .error unsupported bpp
1691    .endif
1692.endm
1693
1694.macro generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, b_offs
1695
1696/*
1697 * 2 stage pipelined RGB->YCbCr conversion
1698 */
1699
1700.macro do_rgb_to_yuv_stage1
1701    vmovl.u8    q2, d1\r_offs /* r = { d4, d5 } */
1702    vmovl.u8    q3, d1\g_offs /* g = { d6, d7 } */
1703    vmovl.u8    q4, d1\b_offs /* b = { d8, d9 } */
1704    vmull.u16   q7, d4, d0[0]
1705    vmlal.u16   q7, d6, d0[1]
1706    vmlal.u16   q7, d8, d0[2]
1707    vmull.u16   q8, d5, d0[0]
1708    vmlal.u16   q8, d7, d0[1]
1709    vmlal.u16   q8, d9, d0[2]
1710    vrev64.32   q9,  q1
1711    vrev64.32   q13, q1
1712    vmlsl.u16   q9,  d4, d0[3]
1713    vmlsl.u16   q9,  d6, d1[0]
1714    vmlal.u16   q9,  d8, d1[1]
1715    vmlsl.u16   q13, d5, d0[3]
1716    vmlsl.u16   q13, d7, d1[0]
1717    vmlal.u16   q13, d9, d1[1]
1718    vrev64.32   q14, q1
1719    vrev64.32   q15, q1
1720    vmlal.u16   q14, d4, d1[1]
1721    vmlsl.u16   q14, d6, d1[2]
1722    vmlsl.u16   q14, d8, d1[3]
1723    vmlal.u16   q15, d5, d1[1]
1724    vmlsl.u16   q15, d7, d1[2]
1725    vmlsl.u16   q15, d9, d1[3]
1726.endm
1727
1728.macro do_rgb_to_yuv_stage2
1729    vrshrn.u32  d20, q7,  #16
1730    vrshrn.u32  d21, q8,  #16
1731    vshrn.u32   d22, q9,  #16
1732    vshrn.u32   d23, q13, #16
1733    vshrn.u32   d24, q14, #16
1734    vshrn.u32   d25, q15, #16
1735    vmovn.u16   d20, q10      /* d20 = y */
1736    vmovn.u16   d21, q11      /* d21 = u */
1737    vmovn.u16   d22, q12      /* d22 = v */
1738.endm
1739
1740.macro do_rgb_to_yuv
1741    do_rgb_to_yuv_stage1
1742    do_rgb_to_yuv_stage2
1743.endm
1744
1745.macro do_rgb_to_yuv_stage2_store_load_stage1
1746      vrshrn.u32  d20, q7,  #16
1747      vrshrn.u32  d21, q8,  #16
1748      vshrn.u32   d22, q9,  #16
1749    vrev64.32   q9,  q1
1750      vshrn.u32   d23, q13, #16
1751    vrev64.32   q13, q1
1752      vshrn.u32   d24, q14, #16
1753      vshrn.u32   d25, q15, #16
1754    do_load     \bpp, 8
1755      vmovn.u16   d20, q10      /* d20 = y */
1756    vmovl.u8    q2, d1\r_offs   /* r = { d4, d5 } */
1757      vmovn.u16   d21, q11      /* d21 = u */
1758    vmovl.u8    q3, d1\g_offs   /* g = { d6, d7 } */
1759      vmovn.u16   d22, q12      /* d22 = v */
1760    vmovl.u8    q4, d1\b_offs   /* b = { d8, d9 } */
1761    vmull.u16   q7, d4, d0[0]
1762    vmlal.u16   q7, d6, d0[1]
1763    vmlal.u16   q7, d8, d0[2]
1764      vst1.8      {d20}, [Y]!
1765    vmull.u16   q8, d5, d0[0]
1766    vmlal.u16   q8, d7, d0[1]
1767    vmlal.u16   q8, d9, d0[2]
1768    vmlsl.u16   q9,  d4, d0[3]
1769    vmlsl.u16   q9,  d6, d1[0]
1770    vmlal.u16   q9,  d8, d1[1]
1771      vst1.8      {d21}, [U]!
1772    vmlsl.u16   q13, d5, d0[3]
1773    vmlsl.u16   q13, d7, d1[0]
1774    vmlal.u16   q13, d9, d1[1]
1775    vrev64.32   q14, q1
1776    vrev64.32   q15, q1
1777    vmlal.u16   q14, d4, d1[1]
1778    vmlsl.u16   q14, d6, d1[2]
1779    vmlsl.u16   q14, d8, d1[3]
1780      vst1.8      {d22}, [V]!
1781    vmlal.u16   q15, d5, d1[1]
1782    vmlsl.u16   q15, d7, d1[2]
1783    vmlsl.u16   q15, d9, d1[3]
1784.endm
1785
1786.balign 16
1787jsimd_\colorid\()_ycc_neon_consts:
1788    .short          19595, 38470, 7471,  11059
1789    .short          21709, 32768, 27439, 5329
1790    .short          32767, 128,   32767, 128
1791    .short          32767, 128,   32767, 128
1792
1793asm_function jsimd_\colorid\()_ycc_convert_neon
1794    OUTPUT_WIDTH    .req r0
1795    INPUT_BUF       .req r1
1796    OUTPUT_BUF      .req r2
1797    OUTPUT_ROW      .req r3
1798    NUM_ROWS        .req r4
1799
1800    OUTPUT_BUF0     .req r5
1801    OUTPUT_BUF1     .req r6
1802    OUTPUT_BUF2     .req OUTPUT_BUF
1803
1804    RGB             .req r7
1805    Y               .req r8
1806    U               .req r9
1807    V               .req r10
1808    N               .req ip
1809
1810    /* Load constants to d0, d1, d2, d3 */
1811    adr             ip, jsimd_\colorid\()_ycc_neon_consts
1812    vld1.16         {d0, d1, d2, d3}, [ip, :128]
1813
1814    /* Save ARM registers and handle input arguments */
1815    push            {r4, r5, r6, r7, r8, r9, r10, lr}
1816    ldr             NUM_ROWS, [sp, #(4 * 8)]
1817    ldr             OUTPUT_BUF0, [OUTPUT_BUF]
1818    ldr             OUTPUT_BUF1, [OUTPUT_BUF, #4]
1819    ldr             OUTPUT_BUF2, [OUTPUT_BUF, #8]
1820    .unreq          OUTPUT_BUF
1821
1822    /* Save NEON registers */
1823    vpush           {d8-d15}
1824
1825    /* Outer loop over scanlines */
1826    cmp             NUM_ROWS, #1
1827    blt             9f
18280:
1829    ldr             Y, [OUTPUT_BUF0, OUTPUT_ROW, lsl #2]
1830    ldr             U, [OUTPUT_BUF1, OUTPUT_ROW, lsl #2]
1831    mov             N, OUTPUT_WIDTH
1832    ldr             V, [OUTPUT_BUF2, OUTPUT_ROW, lsl #2]
1833    add             OUTPUT_ROW, OUTPUT_ROW, #1
1834    ldr             RGB, [INPUT_BUF], #4
1835
1836    /* Inner loop over pixels */
1837    subs            N, N, #8
1838    blt             3f
1839    do_load         \bpp, 8
1840    do_rgb_to_yuv_stage1
1841    subs            N, N, #8
1842    blt             2f
18431:
1844    do_rgb_to_yuv_stage2_store_load_stage1
1845    subs            N, N, #8
1846    bge             1b
18472:
1848    do_rgb_to_yuv_stage2
1849    do_store        8
1850    tst             N, #7
1851    beq             8f
18523:
1853    tst             N, #4
1854    beq             3f
1855    do_load         \bpp, 4
18563:
1857    tst             N, #2
1858    beq             4f
1859    do_load         \bpp, 2
18604:
1861    tst             N, #1
1862    beq             5f
1863    do_load         \bpp, 1
18645:
1865    do_rgb_to_yuv
1866    tst             N, #4
1867    beq             6f
1868    do_store        4
18696:
1870    tst             N, #2
1871    beq             7f
1872    do_store        2
18737:
1874    tst             N, #1
1875    beq             8f
1876    do_store        1
18778:
1878    subs            NUM_ROWS, NUM_ROWS, #1
1879    bgt             0b
18809:
1881    /* Restore all registers and return */
1882    vpop            {d8-d15}
1883    pop             {r4, r5, r6, r7, r8, r9, r10, pc}
1884
1885    .unreq          OUTPUT_WIDTH
1886    .unreq          OUTPUT_ROW
1887    .unreq          INPUT_BUF
1888    .unreq          NUM_ROWS
1889    .unreq          OUTPUT_BUF0
1890    .unreq          OUTPUT_BUF1
1891    .unreq          OUTPUT_BUF2
1892    .unreq          RGB
1893    .unreq          Y
1894    .unreq          U
1895    .unreq          V
1896    .unreq          N
1897
1898.purgem do_rgb_to_yuv
1899.purgem do_rgb_to_yuv_stage1
1900.purgem do_rgb_to_yuv_stage2
1901.purgem do_rgb_to_yuv_stage2_store_load_stage1
1902
1903.endm
1904
1905/*--------------------------------- id ----- bpp R  G  B */
1906generate_jsimd_rgb_ycc_convert_neon extrgb,  24, 0, 1, 2
1907generate_jsimd_rgb_ycc_convert_neon extbgr,  24, 2, 1, 0
1908generate_jsimd_rgb_ycc_convert_neon extrgbx, 32, 0, 1, 2
1909generate_jsimd_rgb_ycc_convert_neon extbgrx, 32, 2, 1, 0
1910generate_jsimd_rgb_ycc_convert_neon extxbgr, 32, 3, 2, 1
1911generate_jsimd_rgb_ycc_convert_neon extxrgb, 32, 1, 2, 3
1912
1913.purgem do_load
1914.purgem do_store
1915
1916
1917/*****************************************************************************/
1918
1919/*
1920 * Load data into workspace, applying unsigned->signed conversion
1921 *
1922 * TODO: can be combined with 'jsimd_fdct_ifast_neon' to get
1923 *       rid of VST1.16 instructions
1924 */
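
/*
 * In scalar terms this function does the following (a sketch; plain C types
 * are used instead of the libjpeg typedefs, and the names mirror the
 * register aliases below):
 *
 *   for (row = 0; row < 8; row++) {
 *     const uint8_t *p = sample_data[row] + start_col;
 *     for (col = 0; col < 8; col++)
 *       workspace[row * 8 + col] = (int16_t)p[col] - 128;
 *   }
 */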
1925
1926asm_function jsimd_convsamp_neon
1927    SAMPLE_DATA     .req r0
1928    START_COL       .req r1
1929    WORKSPACE       .req r2
1930    TMP1            .req r3
1931    TMP2            .req r4
1932    TMP3            .req r5
1933    TMP4            .req ip
1934
1935    push            {r4, r5}
1936    vmov.u8         d0, #128
1937
1938    ldmia           SAMPLE_DATA!, {TMP1, TMP2, TMP3, TMP4}
1939    add             TMP1, TMP1, START_COL
1940    add             TMP2, TMP2, START_COL
1941    add             TMP3, TMP3, START_COL
1942    add             TMP4, TMP4, START_COL
1943    vld1.8          {d16}, [TMP1]
1944    vsubl.u8        q8, d16, d0
1945    vld1.8          {d18}, [TMP2]
1946    vsubl.u8        q9, d18, d0
1947    vld1.8          {d20}, [TMP3]
1948    vsubl.u8        q10, d20, d0
1949    vld1.8          {d22}, [TMP4]
1950    ldmia           SAMPLE_DATA!, {TMP1, TMP2, TMP3, TMP4}
1951    vsubl.u8        q11, d22, d0
1952    vst1.16         {d16, d17, d18, d19}, [WORKSPACE, :128]!
1953    add             TMP1, TMP1, START_COL
1954    add             TMP2, TMP2, START_COL
1955    vst1.16         {d20, d21, d22, d23}, [WORKSPACE, :128]!
1956    add             TMP3, TMP3, START_COL
1957    add             TMP4, TMP4, START_COL
1958    vld1.8          {d24}, [TMP1]
1959    vsubl.u8        q12, d24, d0
1960    vld1.8          {d26}, [TMP2]
1961    vsubl.u8        q13, d26, d0
1962    vld1.8          {d28}, [TMP3]
1963    vsubl.u8        q14, d28, d0
1964    vld1.8          {d30}, [TMP4]
1965    vsubl.u8        q15, d30, d0
1966    vst1.16         {d24, d25, d26, d27}, [WORKSPACE, :128]!
1967    vst1.16         {d28, d29, d30, d31}, [WORKSPACE, :128]!
1968    pop             {r4, r5}
1969    bx              lr
1970
1971    .unreq          SAMPLE_DATA
1972    .unreq          START_COL
1973    .unreq          WORKSPACE
1974    .unreq          TMP1
1975    .unreq          TMP2
1976    .unreq          TMP3
1977    .unreq          TMP4
1978
1979
1980/*****************************************************************************/
1981
1982/*
1983 * jsimd_fdct_ifast_neon
1984 *
1985 * This function contains a fast, but less accurate, integer implementation of
1986 * the forward DCT (Discrete Cosine Transform). It uses the same calculations
1987 * and produces exactly the same output as IJG's original 'jpeg_fdct_ifast'
1988 * function from jfdctfst.c.
1989 *
1990 * TODO: can be combined with 'jsimd_convsamp_neon' to get
1991 *       rid of a bunch of VLD1.16 instructions
1992 */
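
/*
 * The XFIX_* constants below are consumed by VQDMULH.S16, which computes
 * (x * c * 2) >> 16, i.e. a multiplication by c / 32768.  A scalar sketch of
 * that primitive (ignoring the saturating corner case) and of the split used
 * for the 1.306562965 factor, which does not itself fit into the Q15 range:
 *
 *   static int16_t mul_q15(int16_t x, int16_t c)      // ~ vqdmulh.s16
 *   {
 *     return (int16_t)(((int32_t)x * c) >> 15);
 *   }
 *
 *   // x * 1.306562965  ~=  x + mul_q15(x, (334 - 256) * 128)
 */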
1993
1994#define XFIX_0_382683433 d0[0]
1995#define XFIX_0_541196100 d0[1]
1996#define XFIX_0_707106781 d0[2]
1997#define XFIX_1_306562965 d0[3]
1998
1999.balign 16
2000jsimd_fdct_ifast_neon_consts:
2001    .short (98 * 128)              /* XFIX_0_382683433 */
2002    .short (139 * 128)             /* XFIX_0_541196100 */
2003    .short (181 * 128)             /* XFIX_0_707106781 */
2004    .short (334 * 128 - 256 * 128) /* XFIX_1_306562965 */
2005
2006asm_function jsimd_fdct_ifast_neon
2007
2008    DATA            .req r0
2009    TMP             .req ip
2010
2011    vpush           {d8-d15}
2012
2013    /* Load constants */
2014    adr             TMP, jsimd_fdct_ifast_neon_consts
2015    vld1.16         {d0}, [TMP, :64]
2016
2017    /* Load all DATA into NEON registers with the following allocation:
2018     *       0 1 2 3 | 4 5 6 7
2019     *      ---------+--------
2020     *   0 | d16     | d17    | q8
2021     *   1 | d18     | d19    | q9
2022     *   2 | d20     | d21    | q10
2023     *   3 | d22     | d23    | q11
2024     *   4 | d24     | d25    | q12
2025     *   5 | d26     | d27    | q13
2026     *   6 | d28     | d29    | q14
2027     *   7 | d30     | d31    | q15
2028     */
2029
2030    vld1.16         {d16, d17, d18, d19}, [DATA, :128]!
2031    vld1.16         {d20, d21, d22, d23}, [DATA, :128]!
2032    vld1.16         {d24, d25, d26, d27}, [DATA, :128]!
2033    vld1.16         {d28, d29, d30, d31}, [DATA, :128]
2034    sub             DATA, DATA, #(128 - 32)
2035
2036    mov             TMP, #2
20371:
2038    /* Transpose */
2039    vtrn.16         q12, q13
2040    vtrn.16         q10, q11
2041    vtrn.16         q8,  q9
2042    vtrn.16         q14, q15
2043    vtrn.32         q9,  q11
2044    vtrn.32         q13, q15
2045    vtrn.32         q8,  q10
2046    vtrn.32         q12, q14
2047    vswp            d30, d23
2048    vswp            d24, d17
2049    vswp            d26, d19
2050      /* 1-D FDCT */
2051      vadd.s16        q2,  q11, q12
2052    vswp            d28, d21
2053      vsub.s16        q12, q11, q12
2054      vsub.s16        q6,  q10, q13
2055      vadd.s16        q10, q10, q13
2056      vsub.s16        q7,  q9,  q14
2057      vadd.s16        q9,  q9,  q14
2058      vsub.s16        q1,  q8,  q15
2059      vadd.s16        q8,  q8,  q15
2060      vsub.s16        q4,  q9,  q10
2061      vsub.s16        q5,  q8,  q2
2062      vadd.s16        q3,  q9,  q10
2063      vadd.s16        q4,  q4,  q5
2064      vadd.s16        q2,  q8,  q2
2065      vqdmulh.s16     q4,  q4,  XFIX_0_707106781
2066      vadd.s16        q11, q12, q6
2067      vadd.s16        q8,  q2,  q3
2068      vsub.s16        q12, q2,  q3
2069      vadd.s16        q3,  q6,  q7
2070      vadd.s16        q7,  q7,  q1
2071      vqdmulh.s16     q3,  q3,  XFIX_0_707106781
2072      vsub.s16        q6,  q11, q7
2073      vadd.s16        q10, q5,  q4
2074      vqdmulh.s16     q6,  q6,  XFIX_0_382683433
2075      vsub.s16        q14, q5,  q4
2076      vqdmulh.s16     q11, q11, XFIX_0_541196100
2077      vqdmulh.s16     q5,  q7,  XFIX_1_306562965
2078      vadd.s16        q4,  q1,  q3
2079      vsub.s16        q3,  q1,  q3
2080      vadd.s16        q7,  q7,  q6
2081      vadd.s16        q11, q11, q6
2082      vadd.s16        q7,  q7,  q5
2083      vadd.s16        q13, q3,  q11
2084      vsub.s16        q11, q3,  q11
2085      vadd.s16        q9,  q4,  q7
2086      vsub.s16        q15, q4,  q7
2087    subs            TMP, TMP, #1
2088    bne             1b
2089
2090    /* store results */
2091    vst1.16         {d16, d17, d18, d19}, [DATA, :128]!
2092    vst1.16         {d20, d21, d22, d23}, [DATA, :128]!
2093    vst1.16         {d24, d25, d26, d27}, [DATA, :128]!
2094    vst1.16         {d28, d29, d30, d31}, [DATA, :128]
2095
2096    vpop            {d8-d15}
2097    bx              lr
2098
2099    .unreq          DATA
2100    .unreq          TMP
2101
2102
2103/*****************************************************************************/
2104
2105/*
2106 * GLOBAL(void)
2107 * jsimd_quantize_neon (JCOEFPTR coef_block, DCTELEM * divisors,
2108 *                      DCTELEM * workspace);
2109 *
2110 * Note: the code uses 2-stage pipelining in order to improve instruction
2111 *       scheduling and eliminate stalls (this provides ~15% better
2112 *       performance for this function on both ARM Cortex-A8 and
2113 *       ARM Cortex-A9 when compared to the non-pipelined variant).
2114 *       The instructions which belong to the second stage use different
2115 *       indentation for better readability.
2116 */
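
/*
 * Per-coefficient scalar sketch of the quantization performed below (the
 * 'reciprocal', 'correction' and 'shift' arrays stand for the sub-tables of
 * the divisors argument, at the offsets used by this function):
 *
 *   for (i = 0; i < 64; i++) {
 *     int16_t  x    = workspace[i];
 *     uint16_t sign = (uint16_t)(x < 0 ? 0xFFFF : 0);             // vshr.s16 #15
 *     uint16_t t    = (uint16_t)(x < 0 ? -x : x);                 // vabs.s16
 *     t = (uint16_t)(((uint32_t)(t + correction[i]) * reciprocal[i]) >> 16);
 *     t >>= shift[i];
 *     coef_block[i] = (int16_t)((uint16_t)(t ^ sign) - sign);     // restore the sign
 *   }
 */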
2117asm_function jsimd_quantize_neon
2118
2119    COEF_BLOCK      .req r0
2120    DIVISORS        .req r1
2121    WORKSPACE       .req r2
2122
2123    RECIPROCAL      .req DIVISORS
2124    CORRECTION      .req r3
2125    SHIFT           .req ip
2126    LOOP_COUNT      .req r4
2127
2128    vld1.16         {d0, d1, d2, d3}, [WORKSPACE, :128]!
2129    vabs.s16        q12, q0
2130    add             CORRECTION, DIVISORS, #(64 * 2)
2131    add             SHIFT, DIVISORS, #(64 * 6)
2132    vld1.16         {d20, d21, d22, d23}, [CORRECTION, :128]!
2133    vabs.s16        q13, q1
2134    vld1.16         {d16, d17, d18, d19}, [RECIPROCAL, :128]!
2135    vadd.u16        q12, q12, q10 /* add correction */
2136    vadd.u16        q13, q13, q11
2137    vmull.u16       q10, d24, d16 /* multiply by reciprocal */
2138    vmull.u16       q11, d25, d17
2139    vmull.u16       q8,  d26, d18
2140    vmull.u16       q9,  d27, d19
2141    vld1.16         {d24, d25, d26, d27}, [SHIFT, :128]!
2142    vshrn.u32       d20, q10, #16
2143    vshrn.u32       d21, q11, #16
2144    vshrn.u32       d22, q8,  #16
2145    vshrn.u32       d23, q9,  #16
2146    vneg.s16        q12, q12
2147    vneg.s16        q13, q13
2148    vshr.s16        q2,  q0,  #15 /* extract sign */
2149    vshr.s16        q3,  q1,  #15
2150    vshl.u16        q14, q10, q12 /* shift */
2151    vshl.u16        q15, q11, q13
2152
2153    push            {r4, r5}
2154    mov             LOOP_COUNT, #3
21551:
2156    vld1.16         {d0, d1, d2, d3}, [WORKSPACE, :128]!
2157      veor.u16        q14, q14, q2  /* restore sign */
2158    vabs.s16        q12, q0
2159    vld1.16         {d20, d21, d22, d23}, [CORRECTION, :128]!
2160    vabs.s16        q13, q1
2161      veor.u16        q15, q15, q3
2162    vld1.16         {d16, d17, d18, d19}, [RECIPROCAL, :128]!
2163    vadd.u16        q12, q12, q10 /* add correction */
2164    vadd.u16        q13, q13, q11
2165    vmull.u16       q10, d24, d16 /* multiply by reciprocal */
2166    vmull.u16       q11, d25, d17
2167    vmull.u16       q8,  d26, d18
2168    vmull.u16       q9,  d27, d19
2169      vsub.u16        q14, q14, q2
2170    vld1.16         {d24, d25, d26, d27}, [SHIFT, :128]!
2171      vsub.u16        q15, q15, q3
2172    vshrn.u32       d20, q10, #16
2173    vshrn.u32       d21, q11, #16
2174      vst1.16         {d28, d29, d30, d31}, [COEF_BLOCK, :128]!
2175    vshrn.u32       d22, q8,  #16
2176    vshrn.u32       d23, q9,  #16
2177    vneg.s16        q12, q12
2178    vneg.s16        q13, q13
2179    vshr.s16        q2,  q0,  #15 /* extract sign */
2180    vshr.s16        q3,  q1,  #15
2181    vshl.u16        q14, q10, q12 /* shift */
2182    vshl.u16        q15, q11, q13
2183    subs            LOOP_COUNT, LOOP_COUNT, #1
2184    bne             1b
2185    pop             {r4, r5}
2186
2187      veor.u16        q14, q14, q2  /* restore sign */
2188      veor.u16        q15, q15, q3
2189      vsub.u16        q14, q14, q2
2190      vsub.u16        q15, q15, q3
2191      vst1.16         {d28, d29, d30, d31}, [COEF_BLOCK, :128]!
2192
2193    bx              lr /* return */
2194
2195    .unreq          COEF_BLOCK
2196    .unreq          DIVISORS
2197    .unreq          WORKSPACE
2198    .unreq          RECIPROCAL
2199    .unreq          CORRECTION
2200    .unreq          SHIFT
2201    .unreq          LOOP_COUNT
2202
2203
2204/*****************************************************************************/
2205
2206/*
2207 * GLOBAL(void)
2208 * jsimd_h2v1_fancy_upsample_neon (int          max_v_samp_factor,
2209 *                                 JDIMENSION   downsampled_width,
2210 *                                 JSAMPARRAY   input_data,
2211 *                                 JSAMPARRAY * output_data_ptr);
2212 *
2213 * Note: the use of unaligned writes is the main remaining bottleneck in
2214 *       this code; addressing it could potentially yield up to tens of
2215 *       percent better performance on Cortex-A8/Cortex-A9.
2216 */
2217
2218/*
2219 * Upsample 16 source pixels to 32 destination pixels. The new 16 source
2220 * pixels are loaded to q0. The previous 16 source pixels are in q1. The
2221 * shifted-by-one source pixels are constructed in q2 by using q0 and q1.
2222 * Register d28 is used for multiplication by 3. Register q15 is used
2223 * for adding +1 bias.
2224 */
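
/*
 * In scalar terms, each source pixel s[i] in the interior of a row yields
 * two output pixels (a sketch; the first and last pixels of the row are
 * special-cased in 'upsample_row' below):
 *
 *   out[2 * i]     = (3 * s[i] + s[i - 1] + 1) >> 2;
 *   out[2 * i + 1] = (3 * s[i] + s[i + 1] + 2) >> 2;
 */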
2225.macro upsample16   OUTPTR, INPTR
2226    vld1.8          {q0}, [\INPTR]!
2227    vmovl.u8        q8,  d0
2228    vext.8          q2,  q1,  q0, #15
2229    vmovl.u8        q9,  d1
2230    vaddw.u8        q10, q15, d4
2231    vaddw.u8        q11, q15, d5
2232    vmlal.u8        q8,  d4,  d28
2233    vmlal.u8        q9,  d5,  d28
2234    vmlal.u8        q10, d0,  d28
2235    vmlal.u8        q11, d1,  d28
2236    vmov            q1,  q0       /* backup source pixels to q1 */
2237    vrshrn.u16      d6,  q8,  #2
2238    vrshrn.u16      d7,  q9,  #2
2239    vshrn.u16       d8,  q10, #2
2240    vshrn.u16       d9,  q11, #2
2241    vst2.8          {d6, d7, d8, d9}, [\OUTPTR]!
2242.endm
2243
2244/*
2245 * Upsample 32 source pixels to 64 destination pixels. Compared to the
2246 * 'upsample16' macro, the roles of the q0 and q1 registers are swapped for
2247 * the even and odd groups of 16 pixels, which is why the "vmov q1, q0"
2248 * instruction is not needed. This unrolling also allows loads and stores to
2249 * be reordered to hide the multiplication latency and reduce stalls.
2250 */
2251.macro upsample32   OUTPTR, INPTR
2252    /* even 16 pixels group */
2253    vld1.8          {q0}, [\INPTR]!
2254    vmovl.u8        q8,  d0
2255    vext.8          q2,  q1,  q0, #15
2256    vmovl.u8        q9,  d1
2257    vaddw.u8        q10, q15, d4
2258    vaddw.u8        q11, q15, d5
2259    vmlal.u8        q8,  d4,  d28
2260    vmlal.u8        q9,  d5,  d28
2261    vmlal.u8        q10, d0,  d28
2262    vmlal.u8        q11, d1,  d28
2263        /* odd 16 pixels group */
2264        vld1.8          {q1}, [\INPTR]!
2265    vrshrn.u16      d6,  q8,  #2
2266    vrshrn.u16      d7,  q9,  #2
2267    vshrn.u16       d8,  q10, #2
2268    vshrn.u16       d9,  q11, #2
2269        vmovl.u8        q8,  d2
2270        vext.8          q2,  q0,  q1, #15
2271        vmovl.u8        q9,  d3
2272        vaddw.u8        q10, q15, d4
2273        vaddw.u8        q11, q15, d5
2274        vmlal.u8        q8,  d4,  d28
2275        vmlal.u8        q9,  d5,  d28
2276        vmlal.u8        q10, d2,  d28
2277        vmlal.u8        q11, d3,  d28
2278    vst2.8          {d6, d7, d8, d9}, [\OUTPTR]!
2279        vrshrn.u16      d6,  q8,  #2
2280        vrshrn.u16      d7,  q9,  #2
2281        vshrn.u16       d8,  q10, #2
2282        vshrn.u16       d9,  q11, #2
2283        vst2.8          {d6, d7, d8, d9}, [\OUTPTR]!
2284.endm
2285
2286/*
2287 * Upsample a row of WIDTH pixels from INPTR to OUTPTR.
2288 */
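
/*
 * The two edge output pixels are plain copies of the edge source pixels
 * (scalar view, with w = downsampled width):
 *
 *   out[0]         = s[0];
 *   out[2 * w - 1] = s[w - 1];
 */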
2289.macro upsample_row OUTPTR, INPTR, WIDTH, TMP1
2290    /* special case for the first and last pixels */
2291    sub             \WIDTH, \WIDTH, #1
2292    add             \OUTPTR, \OUTPTR, #1
2293    ldrb            \TMP1, [\INPTR, \WIDTH]
2294    strb            \TMP1, [\OUTPTR, \WIDTH, asl #1]
2295    ldrb            \TMP1, [\INPTR], #1
2296    strb            \TMP1, [\OUTPTR, #-1]
2297    vmov.8          d3[7], \TMP1
2298
2299    subs            \WIDTH, \WIDTH, #32
2300    blt             5f
23010:  /* process 32 pixels per iteration */
2302    upsample32      \OUTPTR, \INPTR
2303    subs            \WIDTH, \WIDTH, #32
2304    bge             0b
23055:
2306    adds            \WIDTH, \WIDTH, #16
2307    blt             1f
23080:  /* process 16 pixels if needed */
2309    upsample16      \OUTPTR, \INPTR
2310    subs            \WIDTH, \WIDTH, #16
23111:
2312    adds            \WIDTH, \WIDTH, #16
2313    beq             9f
2314
2315    /* load the remaining 1-15 pixels */
2316    add             \INPTR, \INPTR, \WIDTH
2317    tst             \WIDTH, #1
2318    beq             2f
2319    sub             \INPTR, \INPTR, #1
2320    vld1.8          {d0[0]}, [\INPTR]
23212:
2322    tst             \WIDTH, #2
2323    beq             2f
2324    vext.8          d0, d0, d0, #6
2325    sub             \INPTR, \INPTR, #1
2326    vld1.8          {d0[1]}, [\INPTR]
2327    sub             \INPTR, \INPTR, #1
2328    vld1.8          {d0[0]}, [\INPTR]
23292:
2330    tst             \WIDTH, #4
2331    beq             2f
2332    vrev64.32       d0, d0
2333    sub             \INPTR, \INPTR, #1
2334    vld1.8          {d0[3]}, [\INPTR]
2335    sub             \INPTR, \INPTR, #1
2336    vld1.8          {d0[2]}, [\INPTR]
2337    sub             \INPTR, \INPTR, #1
2338    vld1.8          {d0[1]}, [\INPTR]
2339    sub             \INPTR, \INPTR, #1
2340    vld1.8          {d0[0]}, [\INPTR]
23412:
2342    tst             \WIDTH, #8
2343    beq             2f
2344    vmov            d1,  d0
2345    sub             \INPTR, \INPTR, #8
2346    vld1.8          {d0}, [\INPTR]
23472:  /* upsample the remaining pixels */
2348    vmovl.u8        q8,  d0
2349    vext.8          q2,  q1,  q0, #15
2350    vmovl.u8        q9,  d1
2351    vaddw.u8        q10, q15, d4
2352    vaddw.u8        q11, q15, d5
2353    vmlal.u8        q8,  d4,  d28
2354    vmlal.u8        q9,  d5,  d28
2355    vmlal.u8        q10, d0,  d28
2356    vmlal.u8        q11, d1,  d28
2357    vrshrn.u16      d10, q8,  #2
2358    vrshrn.u16      d12, q9,  #2
2359    vshrn.u16       d11, q10, #2
2360    vshrn.u16       d13, q11, #2
2361    vzip.8          d10, d11
2362    vzip.8          d12, d13
2363    /* store the remaining pixels */
2364    tst             \WIDTH, #8
2365    beq             2f
2366    vst1.8          {d10, d11}, [\OUTPTR]!
2367    vmov            q5,  q6
23682:
2369    tst             \WIDTH, #4
2370    beq             2f
2371    vst1.8          {d10}, [\OUTPTR]!
2372    vmov            d10,  d11
23732:
2374    tst             \WIDTH, #2
2375    beq             2f
2376    vst1.8          {d10[0]}, [\OUTPTR]!
2377    vst1.8          {d10[1]}, [\OUTPTR]!
2378    vst1.8          {d10[2]}, [\OUTPTR]!
2379    vst1.8          {d10[3]}, [\OUTPTR]!
2380    vext.8          d10, d10, d10, #4
23812:
2382    tst             \WIDTH, #1
2383    beq             2f
2384    vst1.8          {d10[0]}, [\OUTPTR]!
2385    vst1.8          {d10[1]}, [\OUTPTR]!
23862:
23879:
2388.endm
2389
2390asm_function jsimd_h2v1_fancy_upsample_neon
2391
2392    MAX_V_SAMP_FACTOR .req r0
2393    DOWNSAMPLED_WIDTH .req r1
2394    INPUT_DATA        .req r2
2395    OUTPUT_DATA_PTR   .req r3
2396    OUTPUT_DATA       .req OUTPUT_DATA_PTR
2397
2398    OUTPTR            .req r4
2399    INPTR             .req r5
2400    WIDTH             .req ip
2401    TMP               .req lr
2402
2403    push            {r4, r5, r6, lr}
2404    vpush           {d8-d15}
2405
2406    ldr             OUTPUT_DATA, [OUTPUT_DATA_PTR]
2407    cmp             MAX_V_SAMP_FACTOR, #0
2408    ble             99f
2409
2410    /* initialize constants */
2411    vmov.u8         d28, #3
2412    vmov.u16        q15, #1
241311:
2414    ldr             INPTR, [INPUT_DATA], #4
2415    ldr             OUTPTR, [OUTPUT_DATA], #4
2416    mov             WIDTH, DOWNSAMPLED_WIDTH
2417    upsample_row    OUTPTR, INPTR, WIDTH, TMP
2418    subs            MAX_V_SAMP_FACTOR, MAX_V_SAMP_FACTOR, #1
2419    bgt             11b
2420
242199:
2422    vpop            {d8-d15}
2423    pop             {r4, r5, r6, pc}
2424
2425    .unreq          MAX_V_SAMP_FACTOR
2426    .unreq          DOWNSAMPLED_WIDTH
2427    .unreq          INPUT_DATA
2428    .unreq          OUTPUT_DATA_PTR
2429    .unreq          OUTPUT_DATA
2430
2431    .unreq          OUTPTR
2432    .unreq          INPTR
2433    .unreq          WIDTH
2434    .unreq          TMP
2435
2436
2437.purgem upsample16
2438.purgem upsample32
2439.purgem upsample_row
2440