/*
 * Armv7 Neon optimizations for libjpeg-turbo
 *
 * Copyright (C) 2009-2011, Nokia Corporation and/or its subsidiary(-ies).
 *                          All Rights Reserved.
 * Author:  Siarhei Siamashka <siarhei.siamashka@nokia.com>
 * Copyright (C) 2014, Siarhei Siamashka.  All Rights Reserved.
 * Copyright (C) 2014, Linaro Limited.  All Rights Reserved.
 * Copyright (C) 2015, D. R. Commander.  All Rights Reserved.
 * Copyright (C) 2015-2016, 2018, Matthieu Darbois.  All Rights Reserved.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack, "", %progbits  /* mark stack as non-executable */
#endif

.text
.fpu neon
.arch armv7a
.object_arch armv4
.arm
.syntax unified


/*****************************************************************************/

/* Supplementary macro for setting function attributes */
.macro asm_function fname
#ifdef __APPLE__
    .private_extern _\fname
    .globl _\fname
_\fname:
#else
    .global \fname
#ifdef __ELF__
    .hidden \fname
    .type \fname, %function
#endif
\fname:
#endif
.endm
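
/*
 * For example, on ELF targets 'asm_function jsimd_idct_islow_neon' expands
 * to:
 *
 *     .global jsimd_idct_islow_neon
 *     .hidden jsimd_idct_islow_neon
 *     .type jsimd_idct_islow_neon, %function
 * jsimd_idct_islow_neon:
 *
 * while on Apple platforms the symbol gets the usual leading underscore.
 */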


#define CENTERJSAMPLE  128

/*****************************************************************************/

/*
 * Perform dequantization and inverse DCT on one block of coefficients.
 *
 * GLOBAL(void)
 * jsimd_idct_islow_neon(void *dct_table, JCOEFPTR coef_block,
 *                       JSAMPARRAY output_buf, JDIMENSION output_col)
 */
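
/*
 * A rough C sketch of a call site (the variable names are illustrative,
 * not taken from this file):
 *
 *     JCOEF coef_block[DCTSIZE2];   // one 8x8 block of coefficients
 *     void *dct_table = ...;        // per-component dequantization table
 *     jsimd_idct_islow_neon(dct_table, coef_block, output_buf, output_col);
 */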

#define FIX_0_298631336  (2446)
#define FIX_0_390180644  (3196)
#define FIX_0_541196100  (4433)
#define FIX_0_765366865  (6270)
#define FIX_0_899976223  (7373)
#define FIX_1_175875602  (9633)
#define FIX_1_501321110  (12299)
#define FIX_1_847759065  (15137)
#define FIX_1_961570560  (16069)
#define FIX_2_053119869  (16819)
#define FIX_2_562915447  (20995)
#define FIX_3_072711026  (25172)

#define FIX_1_175875602_MINUS_1_961570560  (FIX_1_175875602 - FIX_1_961570560)
#define FIX_1_175875602_MINUS_0_390180644  (FIX_1_175875602 - FIX_0_390180644)
#define FIX_0_541196100_MINUS_1_847759065  (FIX_0_541196100 - FIX_1_847759065)
#define FIX_3_072711026_MINUS_2_562915447  (FIX_3_072711026 - FIX_2_562915447)
#define FIX_0_298631336_MINUS_0_899976223  (FIX_0_298631336 - FIX_0_899976223)
#define FIX_1_501321110_MINUS_0_899976223  (FIX_1_501321110 - FIX_0_899976223)
#define FIX_2_053119869_MINUS_2_562915447  (FIX_2_053119869 - FIX_2_562915447)
#define FIX_0_541196100_PLUS_0_765366865   (FIX_0_541196100 + FIX_0_765366865)
/*
 * Reference SIMD-friendly 1-D ISLOW iDCT C implementation.
 * Uses some ideas from the comments in 'simd/jiss2int-64.asm'
 */
#define REF_1D_IDCT(xrow0, xrow1, xrow2, xrow3, xrow4, xrow5, xrow6, xrow7) { \
  DCTELEM row0, row1, row2, row3, row4, row5, row6, row7; \
  JLONG   q1, q2, q3, q4, q5, q6, q7; \
  JLONG   tmp11_plus_tmp2, tmp11_minus_tmp2; \
  \
  /* 1-D iDCT input data */ \
  row0 = xrow0; \
  row1 = xrow1; \
  row2 = xrow2; \
  row3 = xrow3; \
  row4 = xrow4; \
  row5 = xrow5; \
  row6 = xrow6; \
  row7 = xrow7; \
  \
  q5 = row7 + row3; \
  q4 = row5 + row1; \
  q6 = MULTIPLY(q5, FIX_1_175875602_MINUS_1_961570560) + \
       MULTIPLY(q4, FIX_1_175875602); \
  q7 = MULTIPLY(q5, FIX_1_175875602) + \
       MULTIPLY(q4, FIX_1_175875602_MINUS_0_390180644); \
  q2 = MULTIPLY(row2, FIX_0_541196100) + \
       MULTIPLY(row6, FIX_0_541196100_MINUS_1_847759065); \
  q4 = q6; \
  q3 = ((JLONG)row0 - (JLONG)row4) << 13; \
  q6 += MULTIPLY(row5, -FIX_2_562915447) + \
        MULTIPLY(row3, FIX_3_072711026_MINUS_2_562915447); \
  /* now we can use q1 (reloadable constants have been used up) */ \
  q1 = q3 + q2; \
  q4 += MULTIPLY(row7, FIX_0_298631336_MINUS_0_899976223) + \
        MULTIPLY(row1, -FIX_0_899976223); \
  q5 = q7; \
  q1 = q1 + q6; \
  q7 += MULTIPLY(row7, -FIX_0_899976223) + \
        MULTIPLY(row1, FIX_1_501321110_MINUS_0_899976223); \
  \
  /* (tmp11 + tmp2) has been calculated (out_row1 before descale) */ \
  tmp11_plus_tmp2 = q1; \
  row1 = 0; \
  \
  q1 = q1 - q6; \
  q5 += MULTIPLY(row5, FIX_2_053119869_MINUS_2_562915447) + \
        MULTIPLY(row3, -FIX_2_562915447); \
  q1 = q1 - q6; \
  q6 = MULTIPLY(row2, FIX_0_541196100_PLUS_0_765366865) + \
       MULTIPLY(row6, FIX_0_541196100); \
  q3 = q3 - q2; \
  \
  /* (tmp11 - tmp2) has been calculated (out_row6 before descale) */ \
  tmp11_minus_tmp2 = q1; \
  \
  q1 = ((JLONG)row0 + (JLONG)row4) << 13; \
  q2 = q1 + q6; \
  q1 = q1 - q6; \
  \
  /* pick up the results */ \
  tmp0  = q4; \
  tmp1  = q5; \
  tmp2  = (tmp11_plus_tmp2 - tmp11_minus_tmp2) / 2; \
  tmp3  = q7; \
  tmp10 = q2; \
  tmp11 = (tmp11_plus_tmp2 + tmp11_minus_tmp2) / 2; \
  tmp12 = q3; \
  tmp13 = q1; \
}

#define XFIX_0_899976223                    d0[0]
#define XFIX_0_541196100                    d0[1]
#define XFIX_2_562915447                    d0[2]
#define XFIX_0_298631336_MINUS_0_899976223  d0[3]
#define XFIX_1_501321110_MINUS_0_899976223  d1[0]
#define XFIX_2_053119869_MINUS_2_562915447  d1[1]
#define XFIX_0_541196100_PLUS_0_765366865   d1[2]
#define XFIX_1_175875602                    d1[3]
#define XFIX_1_175875602_MINUS_0_390180644  d2[0]
#define XFIX_0_541196100_MINUS_1_847759065  d2[1]
#define XFIX_3_072711026_MINUS_2_562915447  d2[2]
#define XFIX_1_175875602_MINUS_1_961570560  d2[3]

.balign 16
jsimd_idct_islow_neon_consts:
  .short FIX_0_899976223                    /* d0[0] */
  .short FIX_0_541196100                    /* d0[1] */
  .short FIX_2_562915447                    /* d0[2] */
  .short FIX_0_298631336_MINUS_0_899976223  /* d0[3] */
  .short FIX_1_501321110_MINUS_0_899976223  /* d1[0] */
  .short FIX_2_053119869_MINUS_2_562915447  /* d1[1] */
  .short FIX_0_541196100_PLUS_0_765366865   /* d1[2] */
  .short FIX_1_175875602                    /* d1[3] */
  /* reloadable constants */
  .short FIX_1_175875602_MINUS_0_390180644  /* d2[0] */
  .short FIX_0_541196100_MINUS_1_847759065  /* d2[1] */
  .short FIX_3_072711026_MINUS_2_562915447  /* d2[2] */
  .short FIX_1_175875602_MINUS_1_961570560  /* d2[3] */

asm_function jsimd_idct_islow_neon

    DCT_TABLE       .req r0
    COEF_BLOCK      .req r1
    OUTPUT_BUF      .req r2
    OUTPUT_COL      .req r3
    TMP1            .req r0
    TMP2            .req r1
    TMP3            .req r2
    TMP4            .req ip

    ROW0L           .req d16
    ROW0R           .req d17
    ROW1L           .req d18
    ROW1R           .req d19
    ROW2L           .req d20
    ROW2R           .req d21
    ROW3L           .req d22
    ROW3R           .req d23
    ROW4L           .req d24
    ROW4R           .req d25
    ROW5L           .req d26
    ROW5R           .req d27
    ROW6L           .req d28
    ROW6R           .req d29
    ROW7L           .req d30
    ROW7R           .req d31

    /* Load and dequantize coefficients into Neon registers
     * with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0 | d16     | d17     ( q8  )
     *   1 | d18     | d19     ( q9  )
     *   2 | d20     | d21     ( q10 )
     *   3 | d22     | d23     ( q11 )
     *   4 | d24     | d25     ( q12 )
     *   5 | d26     | d27     ( q13 )
     *   6 | d28     | d29     ( q14 )
     *   7 | d30     | d31     ( q15 )
     */
    adr             ip, jsimd_idct_islow_neon_consts
    vld1.16         {d16, d17, d18, d19}, [COEF_BLOCK, :128]!
    vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
    vld1.16         {d20, d21, d22, d23}, [COEF_BLOCK, :128]!
    vmul.s16        q8, q8, q0
    vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
    vmul.s16        q9, q9, q1
    vld1.16         {d24, d25, d26, d27}, [COEF_BLOCK, :128]!
    vmul.s16        q10, q10, q2
    vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
    vmul.s16        q11, q11, q3
    vld1.16         {d28, d29, d30, d31}, [COEF_BLOCK, :128]
    vmul.s16        q12, q12, q0
    vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
    vmul.s16        q14, q14, q2
    vmul.s16        q13, q13, q1
    vld1.16         {d0, d1, d2, d3}, [ip, :128]  /* load constants */
    add             ip, ip, #16
    vmul.s16        q15, q15, q3
    vpush           {d8 - d15}                    /* save Neon registers */
    /* 1-D IDCT, pass 1, left 4x8 half */
    vadd.s16        d4, ROW7L, ROW3L
    vadd.s16        d5, ROW5L, ROW1L
    vmull.s16       q6, d4, XFIX_1_175875602_MINUS_1_961570560
    vmlal.s16       q6, d5, XFIX_1_175875602
    vmull.s16       q7, d4, XFIX_1_175875602
      /* Check for zero coefficients in the right 4x8 half */
      push            {r4, r5}
    vmlal.s16       q7, d5, XFIX_1_175875602_MINUS_0_390180644
    vsubl.s16       q3, ROW0L, ROW4L
      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 1 * 8))]
    vmull.s16       q2, ROW2L, XFIX_0_541196100
    vmlal.s16       q2, ROW6L, XFIX_0_541196100_MINUS_1_847759065
      orr             r0, r4, r5
    vmov            q4, q6
    vmlsl.s16       q6, ROW5L, XFIX_2_562915447
      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 2 * 8))]
    vmlal.s16       q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
    vshl.s32        q3, q3, #13
      orr             r0, r0, r4
    vmlsl.s16       q4, ROW1L, XFIX_0_899976223
      orr             r0, r0, r5
    vadd.s32        q1, q3, q2
      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 3 * 8))]
    vmov            q5, q7
    vadd.s32        q1, q1, q6
      orr             r0, r0, r4
    vmlsl.s16       q7, ROW7L, XFIX_0_899976223
      orr             r0, r0, r5
    vmlal.s16       q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
    vrshrn.s32      ROW1L, q1, #11
      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 4 * 8))]
    vsub.s32        q1, q1, q6
    vmlal.s16       q5, ROW5L, XFIX_2_053119869_MINUS_2_562915447
      orr             r0, r0, r4
    vmlsl.s16       q5, ROW3L, XFIX_2_562915447
      orr             r0, r0, r5
    vsub.s32        q1, q1, q6
    vmull.s16       q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 5 * 8))]
    vmlal.s16       q6, ROW6L, XFIX_0_541196100
    vsub.s32        q3, q3, q2
      orr             r0, r0, r4
    vrshrn.s32      ROW6L, q1, #11
      orr             r0, r0, r5
    vadd.s32        q1, q3, q5
      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 6 * 8))]
    vsub.s32        q3, q3, q5
    vaddl.s16       q5, ROW0L, ROW4L
      orr             r0, r0, r4
    vrshrn.s32      ROW2L, q1, #11
      orr             r0, r0, r5
    vrshrn.s32      ROW5L, q3, #11
      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 7 * 8))]
    vshl.s32        q5, q5, #13
    vmlal.s16       q4, ROW7L, XFIX_0_298631336_MINUS_0_899976223
      orr             r0, r0, r4
    vadd.s32        q2, q5, q6
      orrs            r0, r0, r5
    vsub.s32        q1, q5, q6
    vadd.s32        q6, q2, q7
      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 0 * 8))]
    vsub.s32        q2, q2, q7
    vadd.s32        q5, q1, q4
      orr             r0, r4, r5
    vsub.s32        q3, q1, q4
      pop             {r4, r5}
    vrshrn.s32      ROW7L, q2, #11
    vrshrn.s32      ROW3L, q5, #11
    vrshrn.s32      ROW0L, q6, #11
    vrshrn.s32      ROW4L, q3, #11

      beq             3f  /* Go to do some special handling for the sparse
                             right 4x8 half */
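
    /* The interleaved ldrd/orr instructions above amount to the following
     * C sketch (illustrative names; coefficients are 16-bit, so each ldrd
     * covers four columns):
     *
     *     uint64_t nonzero = 0;
     *     for (int row = 1; row < 8; row++)   // rows 1-7, right 4x8 half
     *       nonzero |= *(uint64_t *)&coef_block[row * 8 + 4];
     *     if (nonzero == 0)
     *       goto sparse_right_half;           // label 3 below
     */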

    /* 1-D IDCT, pass 1, right 4x8 half */
    vld1.s16        {d2}, [ip, :64]  /* reload constants */
    vadd.s16        d10, ROW7R, ROW3R
    vadd.s16        d8, ROW5R, ROW1R
      /* Transpose left 4x8 half */
      vtrn.16         ROW6L, ROW7L
    vmull.s16       q6, d10, XFIX_1_175875602_MINUS_1_961570560
    vmlal.s16       q6, d8, XFIX_1_175875602
      vtrn.16         ROW2L, ROW3L
    vmull.s16       q7, d10, XFIX_1_175875602
    vmlal.s16       q7, d8, XFIX_1_175875602_MINUS_0_390180644
      vtrn.16         ROW0L, ROW1L
    vsubl.s16       q3, ROW0R, ROW4R
    vmull.s16       q2, ROW2R, XFIX_0_541196100
    vmlal.s16       q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065
      vtrn.16         ROW4L, ROW5L
    vmov            q4, q6
    vmlsl.s16       q6, ROW5R, XFIX_2_562915447
    vmlal.s16       q6, ROW3R, XFIX_3_072711026_MINUS_2_562915447
      vtrn.32         ROW1L, ROW3L
    vshl.s32        q3, q3, #13
    vmlsl.s16       q4, ROW1R, XFIX_0_899976223
      vtrn.32         ROW4L, ROW6L
    vadd.s32        q1, q3, q2
    vmov            q5, q7
    vadd.s32        q1, q1, q6
      vtrn.32         ROW0L, ROW2L
    vmlsl.s16       q7, ROW7R, XFIX_0_899976223
    vmlal.s16       q7, ROW1R, XFIX_1_501321110_MINUS_0_899976223
    vrshrn.s32      ROW1R, q1, #11
      vtrn.32         ROW5L, ROW7L
    vsub.s32        q1, q1, q6
    vmlal.s16       q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447
    vmlsl.s16       q5, ROW3R, XFIX_2_562915447
    vsub.s32        q1, q1, q6
    vmull.s16       q6, ROW2R, XFIX_0_541196100_PLUS_0_765366865
    vmlal.s16       q6, ROW6R, XFIX_0_541196100
    vsub.s32        q3, q3, q2
    vrshrn.s32      ROW6R, q1, #11
    vadd.s32        q1, q3, q5
    vsub.s32        q3, q3, q5
    vaddl.s16       q5, ROW0R, ROW4R
    vrshrn.s32      ROW2R, q1, #11
    vrshrn.s32      ROW5R, q3, #11
    vshl.s32        q5, q5, #13
    vmlal.s16       q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223
    vadd.s32        q2, q5, q6
    vsub.s32        q1, q5, q6
    vadd.s32        q6, q2, q7
    vsub.s32        q2, q2, q7
    vadd.s32        q5, q1, q4
    vsub.s32        q3, q1, q4
    vrshrn.s32      ROW7R, q2, #11
    vrshrn.s32      ROW3R, q5, #11
    vrshrn.s32      ROW0R, q6, #11
    vrshrn.s32      ROW4R, q3, #11
    /* Transpose right 4x8 half */
    vtrn.16         ROW6R, ROW7R
    vtrn.16         ROW2R, ROW3R
    vtrn.16         ROW0R, ROW1R
    vtrn.16         ROW4R, ROW5R
    vtrn.32         ROW1R, ROW3R
    vtrn.32         ROW4R, ROW6R
    vtrn.32         ROW0R, ROW2R
    vtrn.32         ROW5R, ROW7R

1:  /* 1-D IDCT, pass 2 (normal variant), left 4x8 half */
    vld1.s16        {d2}, [ip, :64]               /* reload constants */
    vmull.s16       q6, ROW1R, XFIX_1_175875602   /* ROW5L <-> ROW1R */
    vmlal.s16       q6, ROW1L, XFIX_1_175875602
    vmlal.s16       q6, ROW3R, XFIX_1_175875602_MINUS_1_961570560  /* ROW7L <-> ROW3R */
    vmlal.s16       q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560
    vmull.s16       q7, ROW3R, XFIX_1_175875602   /* ROW7L <-> ROW3R */
    vmlal.s16       q7, ROW3L, XFIX_1_175875602
    vmlal.s16       q7, ROW1R, XFIX_1_175875602_MINUS_0_390180644  /* ROW5L <-> ROW1R */
    vmlal.s16       q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644
    vsubl.s16       q3, ROW0L, ROW0R              /* ROW4L <-> ROW0R */
    vmull.s16       q2, ROW2L, XFIX_0_541196100
    vmlal.s16       q2, ROW2R, XFIX_0_541196100_MINUS_1_847759065  /* ROW6L <-> ROW2R */
    vmov            q4, q6
    vmlsl.s16       q6, ROW1R, XFIX_2_562915447   /* ROW5L <-> ROW1R */
    vmlal.s16       q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
    vshl.s32        q3, q3, #13
    vmlsl.s16       q4, ROW1L, XFIX_0_899976223
    vadd.s32        q1, q3, q2
    vmov            q5, q7
    vadd.s32        q1, q1, q6
    vmlsl.s16       q7, ROW3R, XFIX_0_899976223   /* ROW7L <-> ROW3R */
    vmlal.s16       q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
    vshrn.s32       ROW1L, q1, #16
    vsub.s32        q1, q1, q6
    vmlal.s16       q5, ROW1R, XFIX_2_053119869_MINUS_2_562915447  /* ROW5L <-> ROW1R */
    vmlsl.s16       q5, ROW3L, XFIX_2_562915447
    vsub.s32        q1, q1, q6
    vmull.s16       q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
    vmlal.s16       q6, ROW2R, XFIX_0_541196100   /* ROW6L <-> ROW2R */
    vsub.s32        q3, q3, q2
    vshrn.s32       ROW2R, q1, #16                /* ROW6L <-> ROW2R */
    vadd.s32        q1, q3, q5
    vsub.s32        q3, q3, q5
    vaddl.s16       q5, ROW0L, ROW0R              /* ROW4L <-> ROW0R */
    vshrn.s32       ROW2L, q1, #16
    vshrn.s32       ROW1R, q3, #16                /* ROW5L <-> ROW1R */
    vshl.s32        q5, q5, #13
    vmlal.s16       q4, ROW3R, XFIX_0_298631336_MINUS_0_899976223  /* ROW7L <-> ROW3R */
    vadd.s32        q2, q5, q6
    vsub.s32        q1, q5, q6
    vadd.s32        q6, q2, q7
    vsub.s32        q2, q2, q7
    vadd.s32        q5, q1, q4
    vsub.s32        q3, q1, q4
    vshrn.s32       ROW3R, q2, #16                /* ROW7L <-> ROW3R */
    vshrn.s32       ROW3L, q5, #16
    vshrn.s32       ROW0L, q6, #16
    vshrn.s32       ROW0R, q3, #16                /* ROW4L <-> ROW0R */
    /* 1-D IDCT, pass 2, right 4x8 half */
    vld1.s16        {d2}, [ip, :64]               /* reload constants */
    vmull.s16       q6, ROW5R, XFIX_1_175875602
    vmlal.s16       q6, ROW5L, XFIX_1_175875602   /* ROW5L <-> ROW1R */
    vmlal.s16       q6, ROW7R, XFIX_1_175875602_MINUS_1_961570560
    vmlal.s16       q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560  /* ROW7L <-> ROW3R */
    vmull.s16       q7, ROW7R, XFIX_1_175875602
    vmlal.s16       q7, ROW7L, XFIX_1_175875602   /* ROW7L <-> ROW3R */
    vmlal.s16       q7, ROW5R, XFIX_1_175875602_MINUS_0_390180644
    vmlal.s16       q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644  /* ROW5L <-> ROW1R */
    vsubl.s16       q3, ROW4L, ROW4R              /* ROW4L <-> ROW0R */
    vmull.s16       q2, ROW6L, XFIX_0_541196100   /* ROW6L <-> ROW2R */
    vmlal.s16       q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065
    vmov            q4, q6
    vmlsl.s16       q6, ROW5R, XFIX_2_562915447
    vmlal.s16       q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447  /* ROW7L <-> ROW3R */
    vshl.s32        q3, q3, #13
    vmlsl.s16       q4, ROW5L, XFIX_0_899976223   /* ROW5L <-> ROW1R */
    vadd.s32        q1, q3, q2
    vmov            q5, q7
    vadd.s32        q1, q1, q6
    vmlsl.s16       q7, ROW7R, XFIX_0_899976223
    vmlal.s16       q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223  /* ROW5L <-> ROW1R */
    vshrn.s32       ROW5L, q1, #16                /* ROW5L <-> ROW1R */
    vsub.s32        q1, q1, q6
    vmlal.s16       q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447
    vmlsl.s16       q5, ROW7L, XFIX_2_562915447   /* ROW7L <-> ROW3R */
    vsub.s32        q1, q1, q6
    vmull.s16       q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865  /* ROW6L <-> ROW2R */
    vmlal.s16       q6, ROW6R, XFIX_0_541196100
    vsub.s32        q3, q3, q2
    vshrn.s32       ROW6R, q1, #16
    vadd.s32        q1, q3, q5
    vsub.s32        q3, q3, q5
    vaddl.s16       q5, ROW4L, ROW4R              /* ROW4L <-> ROW0R */
    vshrn.s32       ROW6L, q1, #16                /* ROW6L <-> ROW2R */
    vshrn.s32       ROW5R, q3, #16
    vshl.s32        q5, q5, #13
    vmlal.s16       q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223
    vadd.s32        q2, q5, q6
    vsub.s32        q1, q5, q6
    vadd.s32        q6, q2, q7
    vsub.s32        q2, q2, q7
    vadd.s32        q5, q1, q4
    vsub.s32        q3, q1, q4
    vshrn.s32       ROW7R, q2, #16
    vshrn.s32       ROW7L, q5, #16                /* ROW7L <-> ROW3R */
    vshrn.s32       ROW4L, q6, #16                /* ROW4L <-> ROW0R */
    vshrn.s32       ROW4R, q3, #16

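    /* Note: the vshrn #16 of pass 2 combined with the vqrshrn #2 below
     * matches the final DESCALE(..., CONST_BITS + PASS1_BITS + 3) = 18-bit
     * shift of jidctint.c; vqrshrn also saturates to the signed 8-bit range,
     * and adding CENTERJSAMPLE afterwards converts to unsigned samples.
     */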
2:  /* Descale to 8-bit and range limit */
    vqrshrn.s16     d16, q8, #2
    vqrshrn.s16     d17, q9, #2
    vqrshrn.s16     d18, q10, #2
    vqrshrn.s16     d19, q11, #2
    vpop            {d8 - d15}                    /* restore Neon registers */
    vqrshrn.s16     d20, q12, #2
      /* Transpose the final 8-bit samples and do signed->unsigned conversion */
      vtrn.16         q8, q9
    vqrshrn.s16     d21, q13, #2
    vqrshrn.s16     d22, q14, #2
      vmov.u8         q0, #(CENTERJSAMPLE)
    vqrshrn.s16     d23, q15, #2
      vtrn.8          d16, d17
      vtrn.8          d18, d19
      vadd.u8         q8, q8, q0
      vadd.u8         q9, q9, q0
      vtrn.16         q10, q11
        /* Store results to the output buffer */
        ldmia           OUTPUT_BUF!, {TMP1, TMP2}
        add             TMP1, TMP1, OUTPUT_COL
        add             TMP2, TMP2, OUTPUT_COL
        vst1.8          {d16}, [TMP1]
      vtrn.8          d20, d21
        vst1.8          {d17}, [TMP2]
        ldmia           OUTPUT_BUF!, {TMP1, TMP2}
        add             TMP1, TMP1, OUTPUT_COL
        add             TMP2, TMP2, OUTPUT_COL
        vst1.8          {d18}, [TMP1]
      vadd.u8         q10, q10, q0
        vst1.8          {d19}, [TMP2]
        ldmia           OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
        add             TMP1, TMP1, OUTPUT_COL
        add             TMP2, TMP2, OUTPUT_COL
        add             TMP3, TMP3, OUTPUT_COL
        add             TMP4, TMP4, OUTPUT_COL
      vtrn.8          d22, d23
        vst1.8          {d20}, [TMP1]
      vadd.u8         q11, q11, q0
        vst1.8          {d21}, [TMP2]
        vst1.8          {d22}, [TMP3]
        vst1.8          {d23}, [TMP4]
    bx              lr

3:  /* Left 4x8 half is done, right 4x8 half contains mostly zeros */

    /* Transpose left 4x8 half */
    vtrn.16         ROW6L, ROW7L
    vtrn.16         ROW2L, ROW3L
    vtrn.16         ROW0L, ROW1L
    vtrn.16         ROW4L, ROW5L
    vshl.s16        ROW0R, ROW0R, #2  /* PASS1_BITS */
    vtrn.32         ROW1L, ROW3L
    vtrn.32         ROW4L, ROW6L
    vtrn.32         ROW0L, ROW2L
    vtrn.32         ROW5L, ROW7L

    cmp             r0, #0
    beq             4f  /* Right 4x8 half has all zeros, go to 'sparse' second
                           pass */

    /* Only row 0 is non-zero for the right 4x8 half */
    vdup.s16        ROW1R, ROW0R[1]
    vdup.s16        ROW2R, ROW0R[2]
    vdup.s16        ROW3R, ROW0R[3]
    vdup.s16        ROW4R, ROW0R[0]
    vdup.s16        ROW5R, ROW0R[1]
    vdup.s16        ROW6R, ROW0R[2]
    vdup.s16        ROW7R, ROW0R[3]
    vdup.s16        ROW0R, ROW0R[0]
    b               1b  /* Go to 'normal' second pass */

4:  /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), left 4x8 half */
    vld1.s16        {d2}, [ip, :64]               /* reload constants */
    vmull.s16       q6, ROW1L, XFIX_1_175875602
    vmlal.s16       q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560
    vmull.s16       q7, ROW3L, XFIX_1_175875602
    vmlal.s16       q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644
    vmull.s16       q2, ROW2L, XFIX_0_541196100
    vshll.s16       q3, ROW0L, #13
    vmov            q4, q6
    vmlal.s16       q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
    vmlsl.s16       q4, ROW1L, XFIX_0_899976223
    vadd.s32        q1, q3, q2
    vmov            q5, q7
    vmlal.s16       q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
    vadd.s32        q1, q1, q6
    vadd.s32        q6, q6, q6
    vmlsl.s16       q5, ROW3L, XFIX_2_562915447
    vshrn.s32       ROW1L, q1, #16
    vsub.s32        q1, q1, q6
    vmull.s16       q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
    vsub.s32        q3, q3, q2
    vshrn.s32       ROW2R, q1, #16                /* ROW6L <-> ROW2R */
    vadd.s32        q1, q3, q5
    vsub.s32        q3, q3, q5
    vshll.s16       q5, ROW0L, #13
    vshrn.s32       ROW2L, q1, #16
    vshrn.s32       ROW1R, q3, #16                /* ROW5L <-> ROW1R */
    vadd.s32        q2, q5, q6
    vsub.s32        q1, q5, q6
    vadd.s32        q6, q2, q7
    vsub.s32        q2, q2, q7
    vadd.s32        q5, q1, q4
    vsub.s32        q3, q1, q4
    vshrn.s32       ROW3R, q2, #16                /* ROW7L <-> ROW3R */
    vshrn.s32       ROW3L, q5, #16
    vshrn.s32       ROW0L, q6, #16
    vshrn.s32       ROW0R, q3, #16                /* ROW4L <-> ROW0R */
    /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), right 4x8 half */
    vld1.s16        {d2}, [ip, :64]               /* reload constants */
    vmull.s16       q6, ROW5L, XFIX_1_175875602
    vmlal.s16       q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560
    vmull.s16       q7, ROW7L, XFIX_1_175875602
    vmlal.s16       q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644
    vmull.s16       q2, ROW6L, XFIX_0_541196100
    vshll.s16       q3, ROW4L, #13
    vmov            q4, q6
    vmlal.s16       q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447
    vmlsl.s16       q4, ROW5L, XFIX_0_899976223
    vadd.s32        q1, q3, q2
    vmov            q5, q7
    vmlal.s16       q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223
    vadd.s32        q1, q1, q6
    vadd.s32        q6, q6, q6
    vmlsl.s16       q5, ROW7L, XFIX_2_562915447
    vshrn.s32       ROW5L, q1, #16                /* ROW5L <-> ROW1R */
    vsub.s32        q1, q1, q6
    vmull.s16       q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865
    vsub.s32        q3, q3, q2
    vshrn.s32       ROW6R, q1, #16
    vadd.s32        q1, q3, q5
    vsub.s32        q3, q3, q5
    vshll.s16       q5, ROW4L, #13
    vshrn.s32       ROW6L, q1, #16                /* ROW6L <-> ROW2R */
    vshrn.s32       ROW5R, q3, #16
    vadd.s32        q2, q5, q6
    vsub.s32        q1, q5, q6
    vadd.s32        q6, q2, q7
    vsub.s32        q2, q2, q7
    vadd.s32        q5, q1, q4
    vsub.s32        q3, q1, q4
    vshrn.s32       ROW7R, q2, #16
    vshrn.s32       ROW7L, q5, #16                /* ROW7L <-> ROW3R */
    vshrn.s32       ROW4L, q6, #16                /* ROW4L <-> ROW0R */
    vshrn.s32       ROW4R, q3, #16
    b               2b                            /* Go to epilogue */

    .unreq          DCT_TABLE
    .unreq          COEF_BLOCK
    .unreq          OUTPUT_BUF
    .unreq          OUTPUT_COL
    .unreq          TMP1
    .unreq          TMP2
    .unreq          TMP3
    .unreq          TMP4

    .unreq          ROW0L
    .unreq          ROW0R
    .unreq          ROW1L
    .unreq          ROW1R
    .unreq          ROW2L
    .unreq          ROW2R
    .unreq          ROW3L
    .unreq          ROW3R
    .unreq          ROW4L
    .unreq          ROW4R
    .unreq          ROW5L
    .unreq          ROW5R
    .unreq          ROW6L
    .unreq          ROW6R
    .unreq          ROW7L
    .unreq          ROW7R


/*****************************************************************************/

/*
 * jsimd_idct_ifast_neon
 *
 * This function contains a fast, but less accurate, integer implementation
 * of the inverse DCT (Discrete Cosine Transform).  It uses the same
 * calculations and produces exactly the same output as IJG's original
 * 'jpeg_idct_ifast' function from jidctfst.c.
 *
 * Normally, the 1-D AAN inverse DCT needs 5 multiplications and 29
 * additions.  But in the Arm Neon case, some extra additions are required
 * because the VQDMULH instruction can't handle constants larger than 1.
 * So expressions like "x * 1.082392200" have to be converted to
 * "x * 0.082392200 + x", which introduces an extra addition.  Overall,
 * there are 6 extra additions per 1-D IDCT pass, for a total of 5 VQDMULH
 * and 35 VADD/VSUB instructions.
 */
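
/*
 * VQDMULH.S16 computes roughly (a * b * 2) >> 16, i.e. it multiplies by a
 * Q15 fraction in (-1, 1), which is why only the fractional parts of the
 * constants are stored below, at the same 8-bit precision as jidctfst.c.
 * For example, for 1.082392200 jidctfst.c uses 277/256, and here
 *
 *     277 * 128 - 256 * 128 = 2688 = (277/256 - 1) * 32768
 *
 * so "x * 1.082392200" becomes vqdmulh(x, 2688) + x.
 */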

#define XFIX_1_082392200  d0[0]
#define XFIX_1_414213562  d0[1]
#define XFIX_1_847759065  d0[2]
#define XFIX_2_613125930  d0[3]

.balign 16
jsimd_idct_ifast_neon_consts:
  .short (277 * 128 - 256 * 128)  /* XFIX_1_082392200 */
  .short (362 * 128 - 256 * 128)  /* XFIX_1_414213562 */
  .short (473 * 128 - 256 * 128)  /* XFIX_1_847759065 */
  .short (669 * 128 - 512 * 128)  /* XFIX_2_613125930 */

asm_function jsimd_idct_ifast_neon

    DCT_TABLE       .req r0
    COEF_BLOCK      .req r1
    OUTPUT_BUF      .req r2
    OUTPUT_COL      .req r3
    TMP1            .req r0
    TMP2            .req r1
    TMP3            .req r2
    TMP4            .req ip

    /* Load and dequantize coefficients into Neon registers
     * with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0 | d16     | d17     ( q8  )
     *   1 | d18     | d19     ( q9  )
     *   2 | d20     | d21     ( q10 )
     *   3 | d22     | d23     ( q11 )
     *   4 | d24     | d25     ( q12 )
     *   5 | d26     | d27     ( q13 )
     *   6 | d28     | d29     ( q14 )
     *   7 | d30     | d31     ( q15 )
     */
    adr             ip, jsimd_idct_ifast_neon_consts
    vld1.16         {d16, d17, d18, d19}, [COEF_BLOCK, :128]!
    vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
    vld1.16         {d20, d21, d22, d23}, [COEF_BLOCK, :128]!
    vmul.s16        q8, q8, q0
    vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
    vmul.s16        q9, q9, q1
    vld1.16         {d24, d25, d26, d27}, [COEF_BLOCK, :128]!
    vmul.s16        q10, q10, q2
    vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
    vmul.s16        q11, q11, q3
    vld1.16         {d28, d29, d30, d31}, [COEF_BLOCK, :128]
    vmul.s16        q12, q12, q0
    vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
    vmul.s16        q14, q14, q2
    vmul.s16        q13, q13, q1
    vld1.16         {d0}, [ip, :64]  /* load constants */
    vmul.s16        q15, q15, q3
    vpush           {d8 - d13}       /* save Neon registers */
    /* 1-D IDCT, pass 1 */
    vsub.s16        q2, q10, q14
    vadd.s16        q14, q10, q14
    vsub.s16        q1, q11, q13
    vadd.s16        q13, q11, q13
    vsub.s16        q5, q9, q15
    vadd.s16        q15, q9, q15
    vqdmulh.s16     q4, q2, XFIX_1_414213562
    vqdmulh.s16     q6, q1, XFIX_2_613125930
    vadd.s16        q3, q1, q1
    vsub.s16        q1, q5, q1
    vadd.s16        q10, q2, q4
    vqdmulh.s16     q4, q1, XFIX_1_847759065
    vsub.s16        q2, q15, q13
    vadd.s16        q3, q3, q6
    vqdmulh.s16     q6, q2, XFIX_1_414213562
    vadd.s16        q1, q1, q4
    vqdmulh.s16     q4, q5, XFIX_1_082392200
    vsub.s16        q10, q10, q14
    vadd.s16        q2, q2, q6
    vsub.s16        q6, q8, q12
    vadd.s16        q12, q8, q12
    vadd.s16        q9, q5, q4
    vadd.s16        q5, q6, q10
    vsub.s16        q10, q6, q10
    vadd.s16        q6, q15, q13
    vadd.s16        q8, q12, q14
    vsub.s16        q3, q6, q3
    vsub.s16        q12, q12, q14
    vsub.s16        q3, q3, q1
    vsub.s16        q1, q9, q1
    vadd.s16        q2, q3, q2
    vsub.s16        q15, q8, q6
    vadd.s16        q1, q1, q2
    vadd.s16        q8, q8, q6
    vadd.s16        q14, q5, q3
    vsub.s16        q9, q5, q3
    vsub.s16        q13, q10, q2
    vadd.s16        q10, q10, q2
      /* Transpose */
      vtrn.16         q8, q9
    vsub.s16        q11, q12, q1
      vtrn.16         q14, q15
    vadd.s16        q12, q12, q1
      vtrn.16         q10, q11
      vtrn.16         q12, q13
      vtrn.32         q9, q11
      vtrn.32         q12, q14
      vtrn.32         q8, q10
      vtrn.32         q13, q15
      vswp            d28, d21
      vswp            d26, d19
    /* 1-D IDCT, pass 2 */
    vsub.s16        q2, q10, q14
      vswp            d30, d23
    vadd.s16        q14, q10, q14
      vswp            d24, d17
    vsub.s16        q1, q11, q13
    vadd.s16        q13, q11, q13
    vsub.s16        q5, q9, q15
    vadd.s16        q15, q9, q15
    vqdmulh.s16     q4, q2, XFIX_1_414213562
    vqdmulh.s16     q6, q1, XFIX_2_613125930
    vadd.s16        q3, q1, q1
    vsub.s16        q1, q5, q1
    vadd.s16        q10, q2, q4
    vqdmulh.s16     q4, q1, XFIX_1_847759065
    vsub.s16        q2, q15, q13
    vadd.s16        q3, q3, q6
    vqdmulh.s16     q6, q2, XFIX_1_414213562
    vadd.s16        q1, q1, q4
    vqdmulh.s16     q4, q5, XFIX_1_082392200
    vsub.s16        q10, q10, q14
    vadd.s16        q2, q2, q6
    vsub.s16        q6, q8, q12
    vadd.s16        q12, q8, q12
    vadd.s16        q9, q5, q4
    vadd.s16        q5, q6, q10
    vsub.s16        q10, q6, q10
    vadd.s16        q6, q15, q13
    vadd.s16        q8, q12, q14
    vsub.s16        q3, q6, q3
    vsub.s16        q12, q12, q14
    vsub.s16        q3, q3, q1
    vsub.s16        q1, q9, q1
    vadd.s16        q2, q3, q2
    vsub.s16        q15, q8, q6
    vadd.s16        q1, q1, q2
    vadd.s16        q8, q8, q6
    vadd.s16        q14, q5, q3
    vsub.s16        q9, q5, q3
    vsub.s16        q13, q10, q2
    vpop            {d8 - d13}    /* restore Neon registers */
    vadd.s16        q10, q10, q2
    vsub.s16        q11, q12, q1
    vadd.s16        q12, q12, q1
    /* Descale to 8-bit and range limit */
    vmov.u8         q0, #0x80
    vqshrn.s16      d16, q8, #5
    vqshrn.s16      d17, q9, #5
    vqshrn.s16      d18, q10, #5
    vqshrn.s16      d19, q11, #5
    vqshrn.s16      d20, q12, #5
    vqshrn.s16      d21, q13, #5
    vqshrn.s16      d22, q14, #5
    vqshrn.s16      d23, q15, #5
    vadd.u8         q8, q8, q0
    vadd.u8         q9, q9, q0
    vadd.u8         q10, q10, q0
    vadd.u8         q11, q11, q0
    /* Transpose the final 8-bit samples */
    vtrn.16         q8, q9
    vtrn.16         q10, q11
    vtrn.32         q8, q10
    vtrn.32         q9, q11
    vtrn.8          d16, d17
    vtrn.8          d18, d19
      /* Store results to the output buffer */
      ldmia           OUTPUT_BUF!, {TMP1, TMP2}
      add             TMP1, TMP1, OUTPUT_COL
      add             TMP2, TMP2, OUTPUT_COL
      vst1.8          {d16}, [TMP1]
      vst1.8          {d17}, [TMP2]
      ldmia           OUTPUT_BUF!, {TMP1, TMP2}
      add             TMP1, TMP1, OUTPUT_COL
      add             TMP2, TMP2, OUTPUT_COL
      vst1.8          {d18}, [TMP1]
    vtrn.8          d20, d21
      vst1.8          {d19}, [TMP2]
      ldmia           OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
      add             TMP1, TMP1, OUTPUT_COL
      add             TMP2, TMP2, OUTPUT_COL
      add             TMP3, TMP3, OUTPUT_COL
      add             TMP4, TMP4, OUTPUT_COL
      vst1.8          {d20}, [TMP1]
    vtrn.8          d22, d23
      vst1.8          {d21}, [TMP2]
      vst1.8          {d22}, [TMP3]
      vst1.8          {d23}, [TMP4]
    bx              lr

    .unreq          DCT_TABLE
    .unreq          COEF_BLOCK
    .unreq          OUTPUT_BUF
    .unreq          OUTPUT_COL
    .unreq          TMP1
    .unreq          TMP2
    .unreq          TMP3
    .unreq          TMP4


/*****************************************************************************/

/*
 * jsimd_extrgb_ycc_convert_neon
 * jsimd_extbgr_ycc_convert_neon
 * jsimd_extrgbx_ycc_convert_neon
 * jsimd_extbgrx_ycc_convert_neon
 * jsimd_extxbgr_ycc_convert_neon
 * jsimd_extxrgb_ycc_convert_neon
 *
 * Colorspace conversion RGB -> YCbCr
 */
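
/*
 * The per-colorspace constant tables generated below encode the standard
 * JPEG RGB->YCbCr equations (see jccolor.c) in 16-bit fixed point scaled
 * by 2^16:
 *
 *     Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
 *     Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + 128
 *     Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + 128
 *
 * e.g. 19595 = round(0.29900 * 65536) and 32768 = 0.50000 * 65536.
 */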

.macro do_store size
  .if \size == 8
    vst1.8          {d20}, [Y]!
    vst1.8          {d21}, [U]!
    vst1.8          {d22}, [V]!
  .elseif \size == 4
    vst1.8          {d20[0]}, [Y]!
    vst1.8          {d20[1]}, [Y]!
    vst1.8          {d20[2]}, [Y]!
    vst1.8          {d20[3]}, [Y]!
    vst1.8          {d21[0]}, [U]!
    vst1.8          {d21[1]}, [U]!
    vst1.8          {d21[2]}, [U]!
    vst1.8          {d21[3]}, [U]!
    vst1.8          {d22[0]}, [V]!
    vst1.8          {d22[1]}, [V]!
    vst1.8          {d22[2]}, [V]!
    vst1.8          {d22[3]}, [V]!
  .elseif \size == 2
    vst1.8          {d20[4]}, [Y]!
    vst1.8          {d20[5]}, [Y]!
    vst1.8          {d21[4]}, [U]!
    vst1.8          {d21[5]}, [U]!
    vst1.8          {d22[4]}, [V]!
    vst1.8          {d22[5]}, [V]!
  .elseif \size == 1
    vst1.8          {d20[6]}, [Y]!
    vst1.8          {d21[6]}, [U]!
    vst1.8          {d22[6]}, [V]!
  .else
    .error "unsupported macroblock size"
  .endif
.endm
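
/*
 * The 4/2/1 variants above store successive lanes so that a tail of 1-7
 * pixels can be written by combining them: e.g. a 7-pixel tail issues
 * do_store 4 (lanes 0-3), do_store 2 (lanes 4-5), then do_store 1 (lane 6),
 * mirroring the tst N, #4 / #2 / #1 sequence in the conversion loop below.
 */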

.macro do_load bpp, size
  .if \bpp == 24
    .if \size == 8
      vld3.8        {d10, d11, d12}, [RGB]!
      pld           [RGB, #128]
    .elseif \size == 4
      vld3.8        {d10[0], d11[0], d12[0]}, [RGB]!
      vld3.8        {d10[1], d11[1], d12[1]}, [RGB]!
      vld3.8        {d10[2], d11[2], d12[2]}, [RGB]!
      vld3.8        {d10[3], d11[3], d12[3]}, [RGB]!
    .elseif \size == 2
      vld3.8        {d10[4], d11[4], d12[4]}, [RGB]!
      vld3.8        {d10[5], d11[5], d12[5]}, [RGB]!
    .elseif \size == 1
      vld3.8        {d10[6], d11[6], d12[6]}, [RGB]!
    .else
      .error "unsupported macroblock size"
    .endif
  .elseif \bpp == 32
    .if \size == 8
      vld4.8        {d10, d11, d12, d13}, [RGB]!
      pld           [RGB, #128]
    .elseif \size == 4
      vld4.8        {d10[0], d11[0], d12[0], d13[0]}, [RGB]!
      vld4.8        {d10[1], d11[1], d12[1], d13[1]}, [RGB]!
      vld4.8        {d10[2], d11[2], d12[2], d13[2]}, [RGB]!
      vld4.8        {d10[3], d11[3], d12[3], d13[3]}, [RGB]!
    .elseif \size == 2
      vld4.8        {d10[4], d11[4], d12[4], d13[4]}, [RGB]!
      vld4.8        {d10[5], d11[5], d12[5], d13[5]}, [RGB]!
    .elseif \size == 1
      vld4.8        {d10[6], d11[6], d12[6], d13[6]}, [RGB]!
    .else
      .error "unsupported macroblock size"
    .endif
  .else
    .error "unsupported bpp"
  .endif
.endm

.macro generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, b_offs

/*
 * 2-stage pipelined RGB->YCbCr conversion
 */

.macro do_rgb_to_yuv_stage1
    vmovl.u8        q2, d1\r_offs  /* r = { d4, d5 } */
    vmovl.u8        q3, d1\g_offs  /* g = { d6, d7 } */
    vmovl.u8        q4, d1\b_offs  /* b = { d8, d9 } */
    vmull.u16       q7, d4, d0[0]
    vmlal.u16       q7, d6, d0[1]
    vmlal.u16       q7, d8, d0[2]
    vmull.u16       q8, d5, d0[0]
    vmlal.u16       q8, d7, d0[1]
    vmlal.u16       q8, d9, d0[2]
    vrev64.32       q9, q1
    vrev64.32       q13, q1
    vmlsl.u16       q9, d4, d0[3]
    vmlsl.u16       q9, d6, d1[0]
    vmlal.u16       q9, d8, d1[1]
    vmlsl.u16       q13, d5, d0[3]
    vmlsl.u16       q13, d7, d1[0]
    vmlal.u16       q13, d9, d1[1]
    vrev64.32       q14, q1
    vrev64.32       q15, q1
    vmlal.u16       q14, d4, d1[1]
    vmlsl.u16       q14, d6, d1[2]
    vmlsl.u16       q14, d8, d1[3]
    vmlal.u16       q15, d5, d1[1]
    vmlsl.u16       q15, d7, d1[2]
    vmlsl.u16       q15, d9, d1[3]
.endm

.macro do_rgb_to_yuv_stage2
    vrshrn.u32      d20, q7, #16
    vrshrn.u32      d21, q8, #16
    vshrn.u32       d22, q9, #16
    vshrn.u32       d23, q13, #16
    vshrn.u32       d24, q14, #16
    vshrn.u32       d25, q15, #16
    vmovn.u16       d20, q10       /* d20 = y */
    vmovn.u16       d21, q11       /* d21 = u */
    vmovn.u16       d22, q12       /* d22 = v */
.endm

.macro do_rgb_to_yuv
    do_rgb_to_yuv_stage1
    do_rgb_to_yuv_stage2
.endm

.macro do_rgb_to_yuv_stage2_store_load_stage1
      vrshrn.u32      d20, q7, #16
      vrshrn.u32      d21, q8, #16
      vshrn.u32       d22, q9, #16
    vrev64.32       q9, q1
      vshrn.u32       d23, q13, #16
    vrev64.32       q13, q1
      vshrn.u32       d24, q14, #16
      vshrn.u32       d25, q15, #16
    do_load         \bpp, 8
      vmovn.u16       d20, q10     /* d20 = y */
    vmovl.u8        q2, d1\r_offs  /* r = { d4, d5 } */
      vmovn.u16       d21, q11     /* d21 = u */
    vmovl.u8        q3, d1\g_offs  /* g = { d6, d7 } */
      vmovn.u16       d22, q12     /* d22 = v */
    vmovl.u8        q4, d1\b_offs  /* b = { d8, d9 } */
    vmull.u16       q7, d4, d0[0]
    vmlal.u16       q7, d6, d0[1]
    vmlal.u16       q7, d8, d0[2]
      vst1.8          {d20}, [Y]!
    vmull.u16       q8, d5, d0[0]
    vmlal.u16       q8, d7, d0[1]
    vmlal.u16       q8, d9, d0[2]
    vmlsl.u16       q9, d4, d0[3]
    vmlsl.u16       q9, d6, d1[0]
    vmlal.u16       q9, d8, d1[1]
      vst1.8          {d21}, [U]!
    vmlsl.u16       q13, d5, d0[3]
    vmlsl.u16       q13, d7, d1[0]
    vmlal.u16       q13, d9, d1[1]
    vrev64.32       q14, q1
    vrev64.32       q15, q1
    vmlal.u16       q14, d4, d1[1]
    vmlsl.u16       q14, d6, d1[2]
    vmlsl.u16       q14, d8, d1[3]
      vst1.8          {d22}, [V]!
    vmlal.u16       q15, d5, d1[1]
    vmlsl.u16       q15, d7, d1[2]
    vmlsl.u16       q15, d9, d1[3]
.endm

.balign 16
jsimd_\colorid\()_ycc_neon_consts:
  .short 19595, 38470, 7471,  11059
  .short 21709, 32768, 27439, 5329
  .short 32767, 128,   32767, 128
  .short 32767, 128,   32767, 128

asm_function jsimd_\colorid\()_ycc_convert_neon
    OUTPUT_WIDTH    .req r0
    INPUT_BUF       .req r1
    OUTPUT_BUF      .req r2
    OUTPUT_ROW      .req r3
    NUM_ROWS        .req r4

    OUTPUT_BUF0     .req r5
    OUTPUT_BUF1     .req r6
    OUTPUT_BUF2     .req OUTPUT_BUF

    RGB             .req r7
    Y               .req r8
    U               .req r9
    V               .req r10
    N               .req ip

    /* Load constants to d0, d1, d2, d3 */
    adr             ip, jsimd_\colorid\()_ycc_neon_consts
    vld1.16         {d0, d1, d2, d3}, [ip, :128]

    /* Save Arm registers and handle input arguments */
    push            {r4, r5, r6, r7, r8, r9, r10, lr}
    ldr             NUM_ROWS, [sp, #(4 * 8)]
    ldr             OUTPUT_BUF0, [OUTPUT_BUF]
    ldr             OUTPUT_BUF1, [OUTPUT_BUF, #4]
    ldr             OUTPUT_BUF2, [OUTPUT_BUF, #8]
    .unreq          OUTPUT_BUF

    /* Save Neon registers */
    vpush           {d8 - d15}

    /* Outer loop over scanlines */
    cmp             NUM_ROWS, #1
    blt             9f
0:
    ldr             Y, [OUTPUT_BUF0, OUTPUT_ROW, lsl #2]
    ldr             U, [OUTPUT_BUF1, OUTPUT_ROW, lsl #2]
    mov             N, OUTPUT_WIDTH
    ldr             V, [OUTPUT_BUF2, OUTPUT_ROW, lsl #2]
    add             OUTPUT_ROW, OUTPUT_ROW, #1
    ldr             RGB, [INPUT_BUF], #4

    /* Inner loop over pixels */
    subs            N, N, #8
    blt             3f
    do_load         \bpp, 8
    do_rgb_to_yuv_stage1
    subs            N, N, #8
    blt             2f
1:
    do_rgb_to_yuv_stage2_store_load_stage1
    subs            N, N, #8
    bge             1b
2:
    do_rgb_to_yuv_stage2
    do_store        8
    tst             N, #7
    beq             8f
3:
    tst             N, #4
    beq             3f
    do_load         \bpp, 4
3:
    tst             N, #2
    beq             4f
    do_load         \bpp, 2
4:
    tst             N, #1
    beq             5f
    do_load         \bpp, 1
5:
    do_rgb_to_yuv
    tst             N, #4
    beq             6f
    do_store        4
6:
    tst             N, #2
    beq             7f
    do_store        2
7:
    tst             N, #1
    beq             8f
    do_store        1
8:
    subs            NUM_ROWS, NUM_ROWS, #1
    bgt             0b
9:
    /* Restore all registers and return */
    vpop            {d8 - d15}
    pop             {r4, r5, r6, r7, r8, r9, r10, pc}

    .unreq          OUTPUT_WIDTH
    .unreq          OUTPUT_ROW
    .unreq          INPUT_BUF
    .unreq          NUM_ROWS
    .unreq          OUTPUT_BUF0
    .unreq          OUTPUT_BUF1
    .unreq          OUTPUT_BUF2
    .unreq          RGB
    .unreq          Y
    .unreq          U
    .unreq          V
    .unreq          N

.purgem do_rgb_to_yuv
.purgem do_rgb_to_yuv_stage1
.purgem do_rgb_to_yuv_stage2
.purgem do_rgb_to_yuv_stage2_store_load_stage1

.endm

/*--------------------------------- id ----- bpp R  G  B */
generate_jsimd_rgb_ycc_convert_neon extrgb,  24, 0, 1, 2
generate_jsimd_rgb_ycc_convert_neon extbgr,  24, 2, 1, 0
generate_jsimd_rgb_ycc_convert_neon extrgbx, 32, 0, 1, 2
generate_jsimd_rgb_ycc_convert_neon extbgrx, 32, 2, 1, 0
generate_jsimd_rgb_ycc_convert_neon extxbgr, 32, 3, 2, 1
generate_jsimd_rgb_ycc_convert_neon extxrgb, 32, 1, 2, 3
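
/*
 * The r_offs/g_offs/b_offs arguments give the position of each channel
 * among the deinterleaved d10-d13 registers, i.e. its byte offset within a
 * pixel: for extbgr the memory order is B, G, R, so R is at offset 2 and B
 * at offset 0.
 */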

.purgem do_load
.purgem do_store