/*
 * ARMv8 NEON optimizations for libjpeg-turbo
 *
 * Copyright (C) 2009-2011, Nokia Corporation and/or its subsidiary(-ies).
 *                          All Rights Reserved.
 * Author:  Siarhei Siamashka <siarhei.siamashka@nokia.com>
 * Copyright (C) 2013-2014, Linaro Limited.  All Rights Reserved.
 * Author:  Ragesh Radhakrishnan <ragesh.r@linaro.org>
 * Copyright (C) 2014-2016, D. R. Commander.  All Rights Reserved.
 * Copyright (C) 2015-2016, 2018, Matthieu Darbois.  All Rights Reserved.
 * Copyright (C) 2016, Siarhei Siamashka.  All Rights Reserved.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack, "", %progbits  /* mark stack as non-executable */
#endif

#if defined(__APPLE__)
.section __DATA, __const
#else
.section .rodata, "a", %progbits
#endif

/* Constants for jsimd_idct_islow_neon() */

#define F_0_298   2446  /* FIX(0.298631336) */
#define F_0_390   3196  /* FIX(0.390180644) */
#define F_0_541   4433  /* FIX(0.541196100) */
#define F_0_765   6270  /* FIX(0.765366865) */
#define F_0_899   7373  /* FIX(0.899976223) */
#define F_1_175   9633  /* FIX(1.175875602) */
#define F_1_501  12299  /* FIX(1.501321110) */
#define F_1_847  15137  /* FIX(1.847759065) */
#define F_1_961  16069  /* FIX(1.961570560) */
#define F_2_053  16819  /* FIX(2.053119869) */
#define F_2_562  20995  /* FIX(2.562915447) */
#define F_3_072  25172  /* FIX(3.072711026) */
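/* These values follow the usual libjpeg CONST_BITS = 13 fixed-point
 * convention, FIX(x) = round(x * (1 << 13)).  For example:
 *   FIX(0.298631336) = round(0.298631336 * 8192) = 2446
 *   FIX(3.072711026) = round(3.072711026 * 8192) = 25172
 */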

.balign 16
Ljsimd_idct_islow_neon_consts:
  .short F_0_298
  .short -F_0_390
  .short F_0_541
  .short F_0_765
  .short -F_0_899
  .short F_1_175
  .short F_1_501
  .short -F_1_847
  .short -F_1_961
  .short F_2_053
  .short -F_2_562
  .short F_3_072
  .short 0          /* padding */
  .short 0
  .short 0
  .short 0

#undef F_0_298
#undef F_0_390
#undef F_0_541
#undef F_0_765
#undef F_0_899
#undef F_1_175
#undef F_1_501
#undef F_1_847
#undef F_1_961
#undef F_2_053
#undef F_2_562
#undef F_3_072

/* Constants for jsimd_idct_ifast_neon() */

.balign 16
Ljsimd_idct_ifast_neon_consts:
  .short (277 * 128 - 256 * 128)  /* XFIX_1_082392200 */
  .short (362 * 128 - 256 * 128)  /* XFIX_1_414213562 */
  .short (473 * 128 - 256 * 128)  /* XFIX_1_847759065 */
  .short (669 * 128 - 512 * 128)  /* XFIX_2_613125930 */
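/* Each value is round(x * 256) * 128 minus floor(x) * 256 * 128, i.e. the
 * fractional part of x in Q15, quantized to 1/256.  SQDMULH computes
 * (a * b * 2) >> 16, in effect a multiply by b / 2^15, so e.g.
 * x * 1.082392200 is evaluated as x + sqdmulh(x, XFIX_1_082392200),
 * where 2688 / 32768 ~= 0.082392200.
 */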

/* Constants for jsimd_idct_4x4_neon() and jsimd_idct_2x2_neon() */

#define CONST_BITS  13

#define FIX_0_211164243  (1730)   /* FIX(0.211164243) */
#define FIX_0_509795579  (4176)   /* FIX(0.509795579) */
#define FIX_0_601344887  (4926)   /* FIX(0.601344887) */
#define FIX_0_720959822  (5906)   /* FIX(0.720959822) */
#define FIX_0_765366865  (6270)   /* FIX(0.765366865) */
#define FIX_0_850430095  (6967)   /* FIX(0.850430095) */
#define FIX_0_899976223  (7373)   /* FIX(0.899976223) */
#define FIX_1_061594337  (8697)   /* FIX(1.061594337) */
#define FIX_1_272758580  (10426)  /* FIX(1.272758580) */
#define FIX_1_451774981  (11893)  /* FIX(1.451774981) */
#define FIX_1_847759065  (15137)  /* FIX(1.847759065) */
#define FIX_2_172734803  (17799)  /* FIX(2.172734803) */
#define FIX_2_562915447  (20995)  /* FIX(2.562915447) */
#define FIX_3_624509785  (29692)  /* FIX(3.624509785) */
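/* Same FIX() convention as above (CONST_BITS = 13), e.g.
 * FIX(0.211164243) = round(0.211164243 * 8192) = 1730.
 */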

.balign 16
Ljsimd_idct_4x4_neon_consts:
  .short FIX_1_847759065        /* v0.h[0] */
  .short -FIX_0_765366865       /* v0.h[1] */
  .short -FIX_0_211164243       /* v0.h[2] */
  .short FIX_1_451774981        /* v0.h[3] */
  .short -FIX_2_172734803       /* v1.h[0] */
  .short FIX_1_061594337        /* v1.h[1] */
  .short -FIX_0_509795579       /* v1.h[2] */
  .short -FIX_0_601344887       /* v1.h[3] */
  .short FIX_0_899976223        /* v2.h[0] */
  .short FIX_2_562915447        /* v2.h[1] */
  .short 1 << (CONST_BITS + 1)  /* v2.h[2] */
  .short 0                      /* v2.h[3] */

.balign 8
Ljsimd_idct_2x2_neon_consts:
  .short -FIX_0_720959822  /* v14.h[0] */
  .short FIX_0_850430095   /* v14.h[1] */
  .short -FIX_1_272758580  /* v14.h[2] */
  .short FIX_3_624509785   /* v14.h[3] */

/* Constants for jsimd_ycc_*_neon() */

.balign 16
Ljsimd_ycc_rgb_neon_consts:
  .short 0,      0,     0,      0
  .short 22971, -11277, -23401, 29033
  .short -128,  -128,   -128,   -128
  .short -128,  -128,   -128,   -128
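/* These appear to be the standard JFIF YCbCr -> RGB coefficients in signed
 * Q14/Q15 fixed point, i.e. roughly:
 *   22971 ~= 1.40200 * 2^14     R = Y + 1.40200 * (Cr - 128)
 *   11277 ~= 0.34414 * 2^15     G = Y - 0.34414 * (Cb - 128)
 *   23401 ~= 0.71414 * 2^15             - 0.71414 * (Cr - 128)
 *   29033 ~= 1.77200 * 2^14     B = Y + 1.77200 * (Cb - 128)
 * The -128 rows center the Cb/Cr samples before the multiplies.
 */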

/* Constants for jsimd_*_ycc_neon() */

.balign 16
Ljsimd_rgb_ycc_neon_consts:
  .short 19595, 38470, 7471, 11059
  .short 21709, 32768, 27439, 5329
  .short 32767, 128, 32767, 128
  .short 32767, 128, 32767, 128
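/* These appear to be the standard JFIF RGB -> YCbCr coefficients in Q16
 * fixed point (e.g. 19595 ~= 0.29900 * 65536):
 *   Y  =  0.29900 R + 0.58700 G + 0.11400 B
 *   Cb = -0.16874 R - 0.33126 G + 0.50000 B + 128
 *   Cr =  0.50000 R - 0.41869 G - 0.08131 B + 128
 * The 32767/128 rows are presumably rounding and level-shift (+128) terms.
 */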

/* Constants for jsimd_fdct_islow_neon() */

#define F_0_298   2446  /* FIX(0.298631336) */
#define F_0_390   3196  /* FIX(0.390180644) */
#define F_0_541   4433  /* FIX(0.541196100) */
#define F_0_765   6270  /* FIX(0.765366865) */
#define F_0_899   7373  /* FIX(0.899976223) */
#define F_1_175   9633  /* FIX(1.175875602) */
#define F_1_501  12299  /* FIX(1.501321110) */
#define F_1_847  15137  /* FIX(1.847759065) */
#define F_1_961  16069  /* FIX(1.961570560) */
#define F_2_053  16819  /* FIX(2.053119869) */
#define F_2_562  20995  /* FIX(2.562915447) */
#define F_3_072  25172  /* FIX(3.072711026) */

.balign 16
Ljsimd_fdct_islow_neon_consts:
  .short F_0_298
  .short -F_0_390
  .short F_0_541
  .short F_0_765
  .short -F_0_899
  .short F_1_175
  .short F_1_501
  .short -F_1_847
  .short -F_1_961
  .short F_2_053
  .short -F_2_562
  .short F_3_072
  .short 0          /* padding */
  .short 0
  .short 0
  .short 0

#undef F_0_298
#undef F_0_390
#undef F_0_541
#undef F_0_765
#undef F_0_899
#undef F_1_175
#undef F_1_501
#undef F_1_847
#undef F_1_961
#undef F_2_053
#undef F_2_562
#undef F_3_072

/* Constants for jsimd_fdct_ifast_neon() */

.balign 16
Ljsimd_fdct_ifast_neon_consts:
  .short (98 * 128)               /* XFIX_0_382683433 */
  .short (139 * 128)              /* XFIX_0_541196100 */
  .short (181 * 128)              /* XFIX_0_707106781 */
  .short (334 * 128 - 256 * 128)  /* XFIX_1_306562965 */
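/* Same round(x * 256) * 128 encoding as the jsimd_idct_ifast_neon()
 * constants above, again for use with SQDMULH: e.g. 98 * 128 = 12544 and
 * 12544 / 32768 ~= 0.382683433.
 */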

/* Constants for jsimd_h2*_downsample_neon() */

.balign 16
Ljsimd_h2_downsample_neon_consts:
  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
        0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F  /* diff 0 */
  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
        0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0E  /* diff 1 */
  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
        0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0D, 0x0D  /* diff 2 */
  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
        0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0C, 0x0C, 0x0C  /* diff 3 */
  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
        0x08, 0x09, 0x0A, 0x0B, 0x0B, 0x0B, 0x0B, 0x0B  /* diff 4 */
  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
        0x08, 0x09, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A  /* diff 5 */
  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
        0x08, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09  /* diff 6 */
  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
        0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08  /* diff 7 */
  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
        0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07  /* diff 8 */
  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x06, \
        0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06  /* diff 9 */
  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x05, 0x05, \
        0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05  /* diff 10 */
  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x04, 0x04, 0x04, \
        0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04  /* diff 11 */
  .byte 0x00, 0x01, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, \
        0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03  /* diff 12 */
  .byte 0x00, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, \
        0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02  /* diff 13 */
  .byte 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, \
        0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01  /* diff 14 */
  .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00  /* diff 15 */
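/* Each 16-byte row is a TBL index vector: row "diff N" replicates index
 * 15 - N into the last N lanes (e.g. for diff 3, lanes 13..15 all read
 * lane 12).  This lets the downsampling code pad a partial block at the
 * right edge of the image by replicating its last valid column.
 */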

/* Constants for jsimd_huff_encode_one_block_neon() */

.balign 16
Ljsimd_huff_encode_one_block_neon_consts:
    .byte 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, \
          0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80
    .byte    0,   1,   2,   3,  16,  17,  32,  33, \
            18,  19,   4,   5,   6,   7,  20,  21  /* L0 => L3 : 4 lines OK */
    .byte   34,  35,  48,  49, 255, 255,  50,  51, \
            36,  37,  22,  23,   8,   9,  10,  11  /* L0 => L3 : 4 lines OK */
    .byte    8,   9,  22,  23,  36,  37,  50,  51, \
           255, 255, 255, 255, 255, 255,  52,  53  /* L1 => L4 : 4 lines OK */
    .byte   54,  55,  40,  41,  26,  27,  12,  13, \
            14,  15,  28,  29,  42,  43,  56,  57  /* L0 => L3 : 4 lines OK */
    .byte    6,   7,  20,  21,  34,  35,  48,  49, \
            50,  51,  36,  37,  22,  23,   8,   9  /* L4 => L7 : 4 lines OK */
    .byte   42,  43,  28,  29,  14,  15,  30,  31, \
            44,  45,  58,  59, 255, 255, 255, 255  /* L1 => L4 : 4 lines OK */
    .byte  255, 255, 255, 255,  56,  57,  42,  43, \
            28,  29,  14,  15,  30,  31,  44,  45  /* L3 => L6 : 4 lines OK */
    .byte   26,  27,  40,  41,  42,  43,  28,  29, \
            14,  15,  30,  31,  44,  45,  46,  47  /* L5 => L7 : 3 lines OK */
    .byte  255, 255, 255, 255,   0,   1, 255, 255, \
           255, 255, 255, 255, 255, 255, 255, 255  /* L4 : 1 line OK */
    .byte  255, 255, 255, 255, 255, 255, 255, 255, \
             0,   1,  16,  17,   2,   3, 255, 255  /* L5 => L6 : 2 lines OK */
    .byte  255, 255, 255, 255, 255, 255, 255, 255, \
           255, 255, 255, 255,   8,   9,  22,  23  /* L5 => L6 : 2 lines OK */
    .byte    4,   5,   6,   7, 255, 255, 255, 255, \
           255, 255, 255, 255, 255, 255, 255, 255  /* L7 : 1 line OK */
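/* The first 16 bytes are single-bit masks (1 << n for n = 0..7, repeated
 * for both halves), presumably used to build the bitmap of nonzero
 * coefficients.  The remaining rows are TBL index vectors that gather byte
 * pairs of 16-bit coefficients into zigzag order; 255 is an out-of-range
 * index, which TBL turns into a zero byte in that lane.
 */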

.text


#define RESPECT_STRICT_ALIGNMENT  1


/*****************************************************************************/

/* Supplementary macro for setting function attributes */
.macro asm_function fname
#ifdef __APPLE__
    .private_extern _\fname
    .globl _\fname
_\fname:
#else
    .global \fname
#ifdef __ELF__
    .hidden \fname
    .type \fname, %function
#endif
\fname:
#endif
.endm
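/* For example, "asm_function jsimd_idct_islow_neon" (used below) expands to
 * a .private_extern/.globl declaration of _jsimd_idct_islow_neon plus the
 * label itself on Mach-O, and to a .global (plus, on ELF, .hidden and
 * %function type) declaration of jsimd_idct_islow_neon elsewhere.
 */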

/* Get symbol location */
.macro get_symbol_loc reg, symbol
#ifdef __APPLE__
    adrp            \reg, \symbol@PAGE
    add             \reg, \reg, \symbol@PAGEOFF
#else
    adrp            \reg, \symbol
    add             \reg, \reg, :lo12:\symbol
#endif
.endm
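/* ADRP materializes the 4 KB page address of the symbol PC-relatively, and
 * the ADD applies the low 12 bits of the offset (@PAGEOFF on Mach-O, :lo12:
 * on ELF), reaching +/-4 GB without a literal-pool load.
 */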

/* Transpose elements of single 128-bit registers */
.macro transpose_single x0, x1, xi, xilen, literal
    ins             \xi\xilen[0], \x0\xilen[0]
    ins             \x1\xilen[0], \x0\xilen[1]
    trn1            \x0\literal, \x0\literal, \x1\literal
    trn2            \x1\literal, \xi\literal, \x1\literal
.endm

/* Transpose elements of 2 different registers */
.macro transpose x0, x1, xi, xilen, literal
    mov             \xi\xilen, \x0\xilen
    trn1            \x0\literal, \x0\literal, \x1\literal
    trn2            \x1\literal, \xi\literal, \x1\literal
.endm

/* Transpose a block of 4x4 coefficients in four 64-bit registers */
.macro transpose_4x4_32 x0, x0len, x1, x1len, x2, x2len, x3, x3len, xi, xilen
    mov             \xi\xilen, \x0\xilen
    trn1            \x0\x0len, \x0\x0len, \x2\x2len
    trn2            \x2\x2len, \xi\x0len, \x2\x2len
    mov             \xi\xilen, \x1\xilen
    trn1            \x1\x1len, \x1\x1len, \x3\x3len
    trn2            \x3\x3len, \xi\x1len, \x3\x3len
.endm

.macro transpose_4x4_16 x0, x0len, x1, x1len, x2, x2len, x3, x3len, xi, xilen
    mov             \xi\xilen, \x0\xilen
    trn1            \x0\x0len, \x0\x0len, \x1\x1len
    trn2            \x1\x2len, \xi\x0len, \x1\x2len
    mov             \xi\xilen, \x2\xilen
    trn1            \x2\x2len, \x2\x2len, \x3\x3len
    trn2            \x3\x2len, \xi\x1len, \x3\x3len
.endm

.macro transpose_4x4 x0, x1, x2, x3, x5
    transpose_4x4_16 \x0, .4h, \x1, .4h, \x2, .4h, \x3, .4h, \x5, .16b
    transpose_4x4_32 \x0, .2s, \x1, .2s, \x2, .2s, \x3, .2s, \x5, .16b
.endm

.macro transpose_8x8 l0, l1, l2, l3, l4, l5, l6, l7, t0, t1, t2, t3
    trn1            \t0\().8h, \l0\().8h, \l1\().8h
    trn1            \t1\().8h, \l2\().8h, \l3\().8h
    trn1            \t2\().8h, \l4\().8h, \l5\().8h
    trn1            \t3\().8h, \l6\().8h, \l7\().8h
    trn2            \l1\().8h, \l0\().8h, \l1\().8h
    trn2            \l3\().8h, \l2\().8h, \l3\().8h
    trn2            \l5\().8h, \l4\().8h, \l5\().8h
    trn2            \l7\().8h, \l6\().8h, \l7\().8h

    trn1            \l4\().4s, \t2\().4s, \t3\().4s
    trn2            \t3\().4s, \t2\().4s, \t3\().4s
    trn1            \t2\().4s, \t0\().4s, \t1\().4s
    trn2            \l2\().4s, \t0\().4s, \t1\().4s
    trn1            \t0\().4s, \l1\().4s, \l3\().4s
    trn2            \l3\().4s, \l1\().4s, \l3\().4s
    trn2            \t1\().4s, \l5\().4s, \l7\().4s
    trn1            \l5\().4s, \l5\().4s, \l7\().4s

    trn2            \l6\().2d, \l2\().2d, \t3\().2d
    trn1            \l0\().2d, \t2\().2d, \l4\().2d
    trn1            \l1\().2d, \t0\().2d, \l5\().2d
    trn2            \l7\().2d, \l3\().2d, \t1\().2d
    trn1            \l2\().2d, \l2\().2d, \t3\().2d
    trn2            \l4\().2d, \t2\().2d, \l4\().2d
    trn1            \l3\().2d, \l3\().2d, \t1\().2d
    trn2            \l5\().2d, \t0\().2d, \l5\().2d
.endm
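/* This is the usual three-stage 8x8 transpose: the .8h TRN1/TRN2 pairs swap
 * 16-bit elements within 32-bit pairs, the .4s stage swaps 32-bit pairs
 * within 64-bit lanes, and the .2d stage swaps 64-bit halves, so row i of
 * the input ends up as column i of l0..l7.
 */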


#define CENTERJSAMPLE  128

/*****************************************************************************/

/*
 * Perform dequantization and inverse DCT on one block of coefficients.
 *
 * GLOBAL(void)
 * jsimd_idct_islow_neon(void *dct_table, JCOEFPTR coef_block,
 *                       JSAMPARRAY output_buf, JDIMENSION output_col)
 */
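/* The comments below quote the corresponding steps of jpeg_idct_islow() in
 * jidctint.c, where MULTIPLY(v, c) is a widening 16 x 16 -> 32-bit multiply
 * and DESCALE(x, n) = (x + (1 << (n - 1))) >> n, a rounding right shift,
 * realized here with the narrowing right-shift instructions.
 */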

#define CONST_BITS  13
#define PASS1_BITS  2

#define XFIX_P_0_298  v0.h[0]
#define XFIX_N_0_390  v0.h[1]
#define XFIX_P_0_541  v0.h[2]
#define XFIX_P_0_765  v0.h[3]
#define XFIX_N_0_899  v0.h[4]
#define XFIX_P_1_175  v0.h[5]
#define XFIX_P_1_501  v0.h[6]
#define XFIX_N_1_847  v0.h[7]
#define XFIX_N_1_961  v1.h[0]
#define XFIX_P_2_053  v1.h[1]
#define XFIX_N_2_562  v1.h[2]
#define XFIX_P_3_072  v1.h[3]

asm_function jsimd_idct_islow_neon
    DCT_TABLE       .req x0
    COEF_BLOCK      .req x1
    OUTPUT_BUF      .req x2
    OUTPUT_COL      .req x3
    TMP1            .req x0
    TMP2            .req x1
    TMP3            .req x9
    TMP4            .req x10
    TMP5            .req x11
    TMP6            .req x12
    TMP7            .req x13
    TMP8            .req x14

    /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
       guarantee that the upper (unused) 32 bits of x3 are valid.  This
       instruction ensures that those bits are set to zero. */
    uxtw x3, w3

    sub             sp, sp, #64
    get_symbol_loc  x15, Ljsimd_idct_islow_neon_consts
    mov             x10, sp
    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x10], #32
    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x10], #32
    ld1             {v0.8h, v1.8h}, [x15]
    ld1             {v2.8h, v3.8h, v4.8h, v5.8h}, [COEF_BLOCK], #64
    ld1             {v18.8h, v19.8h, v20.8h, v21.8h}, [DCT_TABLE], #64
    ld1             {v6.8h, v7.8h, v8.8h, v9.8h}, [COEF_BLOCK], #64
    ld1             {v22.8h, v23.8h, v24.8h, v25.8h}, [DCT_TABLE], #64

    cmeq            v16.8h, v3.8h, #0
    cmeq            v26.8h, v4.8h, #0
    cmeq            v27.8h, v5.8h, #0
    cmeq            v28.8h, v6.8h, #0
    cmeq            v29.8h, v7.8h, #0
    cmeq            v30.8h, v8.8h, #0
    cmeq            v31.8h, v9.8h, #0

    and             v10.16b, v16.16b, v26.16b
    and             v11.16b, v27.16b, v28.16b
    and             v12.16b, v29.16b, v30.16b
    and             v13.16b, v31.16b, v10.16b
    and             v14.16b, v11.16b, v12.16b
    mul             v2.8h, v2.8h, v18.8h
    and             v15.16b, v13.16b, v14.16b
    shl             v10.8h, v2.8h, #(PASS1_BITS)
    sqxtn           v16.8b, v15.8h
    mov             TMP1, v16.d[0]
    mvn             TMP2, TMP1

    cbnz            TMP2, 2f
    /* All AC coefficients are zero: each column's IDCT is then just its
       scaled DC value, so broadcast it to all eight rows. */
    dup             v2.2d, v10.d[0]
    dup             v6.2d, v10.d[1]
    mov             v3.16b, v2.16b
    mov             v7.16b, v6.16b
    mov             v4.16b, v2.16b
    mov             v8.16b, v6.16b
    mov             v5.16b, v2.16b
    mov             v9.16b, v6.16b
1:
    /* for this transpose, we should organise data like this:
     * 00, 01, 02, 03, 40, 41, 42, 43
     * 10, 11, 12, 13, 50, 51, 52, 53
     * 20, 21, 22, 23, 60, 61, 62, 63
     * 30, 31, 32, 33, 70, 71, 72, 73
     * 04, 05, 06, 07, 44, 45, 46, 47
     * 14, 15, 16, 17, 54, 55, 56, 57
     * 24, 25, 26, 27, 64, 65, 66, 67
     * 34, 35, 36, 37, 74, 75, 76, 77
     */
    trn1            v28.8h, v2.8h, v3.8h
    trn1            v29.8h, v4.8h, v5.8h
    trn1            v30.8h, v6.8h, v7.8h
    trn1            v31.8h, v8.8h, v9.8h
    trn2            v16.8h, v2.8h, v3.8h
    trn2            v17.8h, v4.8h, v5.8h
    trn2            v18.8h, v6.8h, v7.8h
    trn2            v19.8h, v8.8h, v9.8h
    trn1            v2.4s, v28.4s, v29.4s
    trn1            v6.4s, v30.4s, v31.4s
    trn1            v3.4s, v16.4s, v17.4s
    trn1            v7.4s, v18.4s, v19.4s
    trn2            v4.4s, v28.4s, v29.4s
    trn2            v8.4s, v30.4s, v31.4s
    trn2            v5.4s, v16.4s, v17.4s
    trn2            v9.4s, v18.4s, v19.4s
    /* Even part: reverse the even part of the forward DCT. */
    add             v18.8h, v4.8h, v8.8h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
    add             v22.8h, v2.8h, v6.8h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
    smull2          v19.4s, v18.8h, XFIX_P_0_541   /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
    sub             v26.8h, v2.8h, v6.8h           /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
    smull           v18.4s, v18.4h, XFIX_P_0_541   /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
    sshll2          v23.4s, v22.8h, #(CONST_BITS)  /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
    mov             v21.16b, v19.16b               /* tmp3 = z1 */
    mov             v20.16b, v18.16b               /* tmp3 = z1 */
    smlal2          v19.4s, v8.8h, XFIX_N_1_847    /* tmp2h tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
    smlal           v18.4s, v8.4h, XFIX_N_1_847    /* tmp2l tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
    sshll2          v27.4s, v26.8h, #(CONST_BITS)  /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
    smlal2          v21.4s, v4.8h, XFIX_P_0_765    /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
    smlal           v20.4s, v4.4h, XFIX_P_0_765    /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
    sshll           v22.4s, v22.4h, #(CONST_BITS)  /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
    sshll           v26.4s, v26.4h, #(CONST_BITS)  /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
    add             v2.4s, v22.4s, v20.4s          /* tmp10l tmp10 = tmp0 + tmp3; */
    sub             v6.4s, v22.4s, v20.4s          /* tmp13l tmp13 = tmp0 - tmp3; */
    add             v8.4s, v26.4s, v18.4s          /* tmp11l tmp11 = tmp1 + tmp2; */
    sub             v4.4s, v26.4s, v18.4s          /* tmp12l tmp12 = tmp1 - tmp2; */
    add             v28.4s, v23.4s, v21.4s         /* tmp10h tmp10 = tmp0 + tmp3; */
    sub             v31.4s, v23.4s, v21.4s         /* tmp13h tmp13 = tmp0 - tmp3; */
    add             v29.4s, v27.4s, v19.4s         /* tmp11h tmp11 = tmp1 + tmp2; */
    sub             v30.4s, v27.4s, v19.4s         /* tmp12h tmp12 = tmp1 - tmp2; */

    /* Odd part per figure 8; the matrix is unitary and hence its
     * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
     */

    add             v22.8h, v9.8h, v5.8h    /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
    add             v24.8h, v7.8h, v3.8h    /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
    add             v18.8h, v9.8h, v3.8h    /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
    add             v20.8h, v7.8h, v5.8h    /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
    add             v26.8h, v22.8h, v24.8h  /* z5 = z3 + z4 */

    smull2          v11.4s, v9.8h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
    smull2          v13.4s, v7.8h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
    smull2          v15.4s, v5.8h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
    smull2          v17.4s, v3.8h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
    smull2          v27.4s, v26.8h, XFIX_P_1_175  /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
    smull2          v23.4s, v22.8h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
    smull2          v25.4s, v24.8h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
    smull2          v19.4s, v18.8h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
    smull2          v21.4s, v20.8h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, -FIX_2_562915447) */

    smull           v10.4s, v9.4h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
    smull           v12.4s, v7.4h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
    smull           v14.4s, v5.4h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
    smull           v16.4s, v3.4h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
    smull           v26.4s, v26.4h, XFIX_P_1_175  /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
    smull           v22.4s, v22.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
    smull           v24.4s, v24.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
    smull           v18.4s, v18.4h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
    smull           v20.4s, v20.4h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, -FIX_2_562915447) */

    add             v23.4s, v23.4s, v27.4s  /* z3 += z5 */
    add             v22.4s, v22.4s, v26.4s  /* z3 += z5 */
    add             v25.4s, v25.4s, v27.4s  /* z4 += z5 */
    add             v24.4s, v24.4s, v26.4s  /* z4 += z5 */

    add             v11.4s, v11.4s, v19.4s  /* tmp0 += z1 */
    add             v10.4s, v10.4s, v18.4s  /* tmp0 += z1 */
    add             v13.4s, v13.4s, v21.4s  /* tmp1 += z2 */
    add             v12.4s, v12.4s, v20.4s  /* tmp1 += z2 */
    add             v15.4s, v15.4s, v21.4s  /* tmp2 += z2 */
    add             v14.4s, v14.4s, v20.4s  /* tmp2 += z2 */
    add             v17.4s, v17.4s, v19.4s  /* tmp3 += z1 */
    add             v16.4s, v16.4s, v18.4s  /* tmp3 += z1 */

    add             v11.4s, v11.4s, v23.4s  /* tmp0 += z3 */
    add             v10.4s, v10.4s, v22.4s  /* tmp0 += z3 */
    add             v13.4s, v13.4s, v25.4s  /* tmp1 += z4 */
    add             v12.4s, v12.4s, v24.4s  /* tmp1 += z4 */
    add             v17.4s, v17.4s, v25.4s  /* tmp3 += z4 */
    add             v16.4s, v16.4s, v24.4s  /* tmp3 += z4 */
    add             v15.4s, v15.4s, v23.4s  /* tmp2 += z3 */
    add             v14.4s, v14.4s, v22.4s  /* tmp2 += z3 */

    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */

    add             v18.4s, v2.4s, v16.4s   /* tmp10 + tmp3 */
    add             v19.4s, v28.4s, v17.4s  /* tmp10 + tmp3 */
    sub             v20.4s, v2.4s, v16.4s   /* tmp10 - tmp3 */
    sub             v21.4s, v28.4s, v17.4s  /* tmp10 - tmp3 */
    add             v22.4s, v8.4s, v14.4s   /* tmp11 + tmp2 */
    add             v23.4s, v29.4s, v15.4s  /* tmp11 + tmp2 */
    sub             v24.4s, v8.4s, v14.4s   /* tmp11 - tmp2 */
    sub             v25.4s, v29.4s, v15.4s  /* tmp11 - tmp2 */
    add             v26.4s, v4.4s, v12.4s   /* tmp12 + tmp1 */
    add             v27.4s, v30.4s, v13.4s  /* tmp12 + tmp1 */
    sub             v28.4s, v4.4s, v12.4s   /* tmp12 - tmp1 */
    sub             v29.4s, v30.4s, v13.4s  /* tmp12 - tmp1 */
    add             v14.4s, v6.4s, v10.4s   /* tmp13 + tmp0 */
    add             v15.4s, v31.4s, v11.4s  /* tmp13 + tmp0 */
    sub             v16.4s, v6.4s, v10.4s   /* tmp13 - tmp0 */
    sub             v17.4s, v31.4s, v11.4s  /* tmp13 - tmp0 */

    shrn            v2.4h, v18.4s, #16  /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) */
    shrn            v9.4h, v20.4s, #16  /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) */
    shrn            v3.4h, v22.4s, #16  /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) */
    shrn            v8.4h, v24.4s, #16  /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) */
    shrn            v4.4h, v26.4s, #16  /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) */
    shrn            v7.4h, v28.4s, #16  /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) */
    shrn            v5.4h, v14.4s, #16  /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) */
    shrn            v6.4h, v16.4s, #16  /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) */
    shrn2           v2.8h, v19.4s, #16  /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) */
    shrn2           v9.8h, v21.4s, #16  /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) */
    shrn2           v3.8h, v23.4s, #16  /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) */
    shrn2           v8.8h, v25.4s, #16  /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) */
    shrn2           v4.8h, v27.4s, #16  /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) */
    shrn2           v7.8h, v29.4s, #16  /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) */
    shrn2           v5.8h, v15.4s, #16  /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) */
    shrn2           v6.8h, v17.4s, #16  /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) */
    movi            v0.16b, #(CENTERJSAMPLE)
    /* Prepare pointers (dual-issue with NEON instructions) */
      ldp             TMP1, TMP2, [OUTPUT_BUF], 16
    sqrshrn         v28.8b, v2.8h, #(CONST_BITS+PASS1_BITS+3-16)
      ldp             TMP3, TMP4, [OUTPUT_BUF], 16
    sqrshrn         v29.8b, v3.8h, #(CONST_BITS+PASS1_BITS+3-16)
      add             TMP1, TMP1, OUTPUT_COL
    sqrshrn         v30.8b, v4.8h, #(CONST_BITS+PASS1_BITS+3-16)
      add             TMP2, TMP2, OUTPUT_COL
    sqrshrn         v31.8b, v5.8h, #(CONST_BITS+PASS1_BITS+3-16)
      add             TMP3, TMP3, OUTPUT_COL
    sqrshrn2        v28.16b, v6.8h, #(CONST_BITS+PASS1_BITS+3-16)
      add             TMP4, TMP4, OUTPUT_COL
    sqrshrn2        v29.16b, v7.8h, #(CONST_BITS+PASS1_BITS+3-16)
      ldp             TMP5, TMP6, [OUTPUT_BUF], 16
    sqrshrn2        v30.16b, v8.8h, #(CONST_BITS+PASS1_BITS+3-16)
      ldp             TMP7, TMP8, [OUTPUT_BUF], 16
    sqrshrn2        v31.16b, v9.8h, #(CONST_BITS+PASS1_BITS+3-16)
      add             TMP5, TMP5, OUTPUT_COL
    add             v16.16b, v28.16b, v0.16b
      add             TMP6, TMP6, OUTPUT_COL
    add             v18.16b, v29.16b, v0.16b
      add             TMP7, TMP7, OUTPUT_COL
    add             v20.16b, v30.16b, v0.16b
      add             TMP8, TMP8, OUTPUT_COL
    add             v22.16b, v31.16b, v0.16b

    /* Transpose the final 8-bit samples */
    trn1            v28.16b, v16.16b, v18.16b
    trn1            v30.16b, v20.16b, v22.16b
    trn2            v29.16b, v16.16b, v18.16b
    trn2            v31.16b, v20.16b, v22.16b

    trn1            v16.8h, v28.8h, v30.8h
    trn2            v18.8h, v28.8h, v30.8h
    trn1            v20.8h, v29.8h, v31.8h
    trn2            v22.8h, v29.8h, v31.8h

    uzp1            v28.4s, v16.4s, v18.4s
    uzp2            v30.4s, v16.4s, v18.4s
    uzp1            v29.4s, v20.4s, v22.4s
    uzp2            v31.4s, v20.4s, v22.4s

    /* Store results to the output buffer */
    st1             {v28.d}[0], [TMP1]
    st1             {v29.d}[0], [TMP2]
    st1             {v28.d}[1], [TMP3]
    st1             {v29.d}[1], [TMP4]
    st1             {v30.d}[0], [TMP5]
    st1             {v31.d}[0], [TMP6]
    st1             {v30.d}[1], [TMP7]
    st1             {v31.d}[1], [TMP8]
    ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], #32
    ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], #32
    blr             x30

.balign 16
2:
    mul             v3.8h, v3.8h, v19.8h
    mul             v4.8h, v4.8h, v20.8h
    mul             v5.8h, v5.8h, v21.8h
    add             TMP4, xzr, TMP2, LSL #32
    mul             v6.8h, v6.8h, v22.8h
    mul             v7.8h, v7.8h, v23.8h
    adds            TMP3, xzr, TMP2, LSR #32
    mul             v8.8h, v8.8h, v24.8h
    mul             v9.8h, v9.8h, v25.8h
    b.ne            3f
    /* The right half of the AC coefficients is all zero */
    dup             v15.2d, v10.d[1]
    /* Even part: reverse the even part of the forward DCT. */
    add             v18.4h, v4.4h, v8.4h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
    add             v22.4h, v2.4h, v6.4h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
    sub             v26.4h, v2.4h, v6.4h           /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
    smull           v18.4s, v18.4h, XFIX_P_0_541   /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
    sshll           v22.4s, v22.4h, #(CONST_BITS)  /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
    mov             v20.16b, v18.16b               /* tmp3 = z1 */
    sshll           v26.4s, v26.4h, #(CONST_BITS)  /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
    smlal           v18.4s, v8.4h, XFIX_N_1_847    /* tmp2l tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
    smlal           v20.4s, v4.4h, XFIX_P_0_765    /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
    add             v2.4s, v22.4s, v20.4s          /* tmp10l tmp10 = tmp0 + tmp3; */
    sub             v6.4s, v22.4s, v20.4s          /* tmp13l tmp13 = tmp0 - tmp3; */
    add             v8.4s, v26.4s, v18.4s          /* tmp11l tmp11 = tmp1 + tmp2; */
    sub             v4.4s, v26.4s, v18.4s          /* tmp12l tmp12 = tmp1 - tmp2; */

    /* Odd part per figure 8; the matrix is unitary and hence its
     * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
     */

    add             v22.4h, v9.4h, v5.4h    /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
    add             v24.4h, v7.4h, v3.4h    /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
    add             v18.4h, v9.4h, v3.4h    /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
    add             v20.4h, v7.4h, v5.4h    /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
    add             v26.4h, v22.4h, v24.4h  /* z5 = z3 + z4 */

    smull           v10.4s, v9.4h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
    smull           v12.4s, v7.4h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
    smull           v14.4s, v5.4h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
    smull           v16.4s, v3.4h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
    smull           v26.4s, v26.4h, XFIX_P_1_175  /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
    smull           v22.4s, v22.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
    smull           v24.4s, v24.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
    smull           v18.4s, v18.4h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
    smull           v20.4s, v20.4h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, -FIX_2_562915447) */

    add             v22.4s, v22.4s, v26.4s  /* z3 += z5 */
    add             v24.4s, v24.4s, v26.4s  /* z4 += z5 */

    add             v10.4s, v10.4s, v18.4s  /* tmp0 += z1 */
    add             v12.4s, v12.4s, v20.4s  /* tmp1 += z2 */
    add             v14.4s, v14.4s, v20.4s  /* tmp2 += z2 */
    add             v16.4s, v16.4s, v18.4s  /* tmp3 += z1 */

    add             v10.4s, v10.4s, v22.4s  /* tmp0 += z3 */
    add             v12.4s, v12.4s, v24.4s  /* tmp1 += z4 */
    add             v16.4s, v16.4s, v24.4s  /* tmp3 += z4 */
    add             v14.4s, v14.4s, v22.4s  /* tmp2 += z3 */

    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */

    add             v18.4s, v2.4s, v16.4s  /* tmp10 + tmp3 */
    sub             v20.4s, v2.4s, v16.4s  /* tmp10 - tmp3 */
    add             v22.4s, v8.4s, v14.4s  /* tmp11 + tmp2 */
    sub             v24.4s, v8.4s, v14.4s  /* tmp11 - tmp2 */
    add             v26.4s, v4.4s, v12.4s  /* tmp12 + tmp1 */
    sub             v28.4s, v4.4s, v12.4s  /* tmp12 - tmp1 */
    add             v14.4s, v6.4s, v10.4s  /* tmp13 + tmp0 */
    sub             v16.4s, v6.4s, v10.4s  /* tmp13 - tmp0 */

    rshrn           v2.4h, v18.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
    rshrn           v3.4h, v22.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
    rshrn           v4.4h, v26.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
    rshrn           v5.4h, v14.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
    rshrn2          v2.8h, v16.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
    rshrn2          v3.8h, v28.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
    rshrn2          v4.8h, v24.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
    rshrn2          v5.8h, v20.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
    mov             v6.16b, v15.16b
    mov             v7.16b, v15.16b
    mov             v8.16b, v15.16b
    mov             v9.16b, v15.16b
    b               1b

.balign 16
3:
    cbnz            TMP4, 4f
    /* The left half of the AC coefficients is all zero */
    dup             v14.2d, v10.d[0]
    /* Even part: reverse the even part of the forward DCT. */
    add             v18.8h, v4.8h, v8.8h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
    add             v22.8h, v2.8h, v6.8h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
    smull2          v19.4s, v18.8h, XFIX_P_0_541   /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
    sub             v26.8h, v2.8h, v6.8h           /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
    sshll2          v23.4s, v22.8h, #(CONST_BITS)  /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
    mov             v21.16b, v19.16b               /* tmp3 = z1 */
    smlal2          v19.4s, v8.8h, XFIX_N_1_847    /* tmp2h tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
    sshll2          v27.4s, v26.8h, #(CONST_BITS)  /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
    smlal2          v21.4s, v4.8h, XFIX_P_0_765    /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
    add             v28.4s, v23.4s, v21.4s         /* tmp10h tmp10 = tmp0 + tmp3; */
    sub             v31.4s, v23.4s, v21.4s         /* tmp13h tmp13 = tmp0 - tmp3; */
    add             v29.4s, v27.4s, v19.4s         /* tmp11h tmp11 = tmp1 + tmp2; */
    sub             v30.4s, v27.4s, v19.4s         /* tmp12h tmp12 = tmp1 - tmp2; */

    /* Odd part per figure 8; the matrix is unitary and hence its
     * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
     */

    add             v22.8h, v9.8h, v5.8h    /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
    add             v24.8h, v7.8h, v3.8h    /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
    add             v18.8h, v9.8h, v3.8h    /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
    add             v20.8h, v7.8h, v5.8h    /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
    add             v26.8h, v22.8h, v24.8h  /* z5 = z3 + z4 */

    smull2          v11.4s, v9.8h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
    smull2          v13.4s, v7.8h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
    smull2          v15.4s, v5.8h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
    smull2          v17.4s, v3.8h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
    smull2          v27.4s, v26.8h, XFIX_P_1_175  /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
    smull2          v23.4s, v22.8h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
    smull2          v25.4s, v24.8h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
    smull2          v19.4s, v18.8h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
    smull2          v21.4s, v20.8h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, -FIX_2_562915447) */

    add             v23.4s, v23.4s, v27.4s  /* z3 += z5 */
    add             v22.4s, v22.4s, v26.4s  /* z3 += z5 */
    add             v25.4s, v25.4s, v27.4s  /* z4 += z5 */
    add             v24.4s, v24.4s, v26.4s  /* z4 += z5 */

    add             v11.4s, v11.4s, v19.4s  /* tmp0 += z1 */
    add             v13.4s, v13.4s, v21.4s  /* tmp1 += z2 */
    add             v15.4s, v15.4s, v21.4s  /* tmp2 += z2 */
    add             v17.4s, v17.4s, v19.4s  /* tmp3 += z1 */

    add             v11.4s, v11.4s, v23.4s  /* tmp0 += z3 */
    add             v13.4s, v13.4s, v25.4s  /* tmp1 += z4 */
    add             v17.4s, v17.4s, v25.4s  /* tmp3 += z4 */
    add             v15.4s, v15.4s, v23.4s  /* tmp2 += z3 */

    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */

    add             v19.4s, v28.4s, v17.4s  /* tmp10 + tmp3 */
    sub             v21.4s, v28.4s, v17.4s  /* tmp10 - tmp3 */
    add             v23.4s, v29.4s, v15.4s  /* tmp11 + tmp2 */
    sub             v25.4s, v29.4s, v15.4s  /* tmp11 - tmp2 */
    add             v27.4s, v30.4s, v13.4s  /* tmp12 + tmp1 */
    sub             v29.4s, v30.4s, v13.4s  /* tmp12 - tmp1 */
    add             v15.4s, v31.4s, v11.4s  /* tmp13 + tmp0 */
    sub             v17.4s, v31.4s, v11.4s  /* tmp13 - tmp0 */

    mov             v2.16b, v14.16b
    mov             v3.16b, v14.16b
    mov             v4.16b, v14.16b
    mov             v5.16b, v14.16b
    rshrn           v6.4h, v19.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
    rshrn           v7.4h, v23.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
    rshrn           v8.4h, v27.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
    rshrn           v9.4h, v15.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
    rshrn2          v6.8h, v17.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
    rshrn2          v7.8h, v29.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
    rshrn2          v8.8h, v25.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
    rshrn2          v9.8h, v21.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
    b               1b

.balign 16
4:
836    /* "No" AC coef is zero */
    /* Even part: reverse the even part of the forward DCT. */
    add             v18.8h, v4.8h, v8.8h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
    add             v22.8h, v2.8h, v6.8h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
    smull2          v19.4s, v18.8h, XFIX_P_0_541   /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
    sub             v26.8h, v2.8h, v6.8h           /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
    smull           v18.4s, v18.4h, XFIX_P_0_541   /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
    sshll2          v23.4s, v22.8h, #(CONST_BITS)  /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
    mov             v21.16b, v19.16b               /* tmp3 = z1 */
    mov             v20.16b, v18.16b               /* tmp3 = z1 */
    smlal2          v19.4s, v8.8h, XFIX_N_1_847    /* tmp2h tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
    smlal           v18.4s, v8.4h, XFIX_N_1_847    /* tmp2l tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
    sshll2          v27.4s, v26.8h, #(CONST_BITS)  /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
    smlal2          v21.4s, v4.8h, XFIX_P_0_765    /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
    smlal           v20.4s, v4.4h, XFIX_P_0_765    /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
    sshll           v22.4s, v22.4h, #(CONST_BITS)  /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
    sshll           v26.4s, v26.4h, #(CONST_BITS)  /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
    add             v2.4s, v22.4s, v20.4s          /* tmp10l tmp10 = tmp0 + tmp3; */
    sub             v6.4s, v22.4s, v20.4s          /* tmp13l tmp13 = tmp0 - tmp3; */
    add             v8.4s, v26.4s, v18.4s          /* tmp11l tmp11 = tmp1 + tmp2; */
    sub             v4.4s, v26.4s, v18.4s          /* tmp12l tmp12 = tmp1 - tmp2; */
    add             v28.4s, v23.4s, v21.4s         /* tmp10h tmp10 = tmp0 + tmp3; */
    sub             v31.4s, v23.4s, v21.4s         /* tmp13h tmp13 = tmp0 - tmp3; */
    add             v29.4s, v27.4s, v19.4s         /* tmp11h tmp11 = tmp1 + tmp2; */
    sub             v30.4s, v27.4s, v19.4s         /* tmp12h tmp12 = tmp1 - tmp2; */

    /* Odd part per figure 8; the matrix is unitary and hence its
     * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
     */

    add             v22.8h, v9.8h, v5.8h    /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
    add             v24.8h, v7.8h, v3.8h    /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
    add             v18.8h, v9.8h, v3.8h    /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
    add             v20.8h, v7.8h, v5.8h    /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
    add             v26.8h, v22.8h, v24.8h  /* z5 = z3 + z4 */

    smull2          v11.4s, v9.8h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
    smull2          v13.4s, v7.8h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
    smull2          v15.4s, v5.8h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
    smull2          v17.4s, v3.8h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
    smull2          v27.4s, v26.8h, XFIX_P_1_175  /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
    smull2          v23.4s, v22.8h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
    smull2          v25.4s, v24.8h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
    smull2          v19.4s, v18.8h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
    smull2          v21.4s, v20.8h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, -FIX_2_562915447) */

    smull           v10.4s, v9.4h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
    smull           v12.4s, v7.4h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
    smull           v14.4s, v5.4h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
    smull           v16.4s, v3.4h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
    smull           v26.4s, v26.4h, XFIX_P_1_175  /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
    smull           v22.4s, v22.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
    smull           v24.4s, v24.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
    smull           v18.4s, v18.4h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
    smull           v20.4s, v20.4h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, -FIX_2_562915447) */

    add             v23.4s, v23.4s, v27.4s  /* z3 += z5 */
    add             v22.4s, v22.4s, v26.4s  /* z3 += z5 */
    add             v25.4s, v25.4s, v27.4s  /* z4 += z5 */
    add             v24.4s, v24.4s, v26.4s  /* z4 += z5 */

    add             v11.4s, v11.4s, v19.4s  /* tmp0 += z1 */
    add             v10.4s, v10.4s, v18.4s  /* tmp0 += z1 */
    add             v13.4s, v13.4s, v21.4s  /* tmp1 += z2 */
    add             v12.4s, v12.4s, v20.4s  /* tmp1 += z2 */
    add             v15.4s, v15.4s, v21.4s  /* tmp2 += z2 */
    add             v14.4s, v14.4s, v20.4s  /* tmp2 += z2 */
    add             v17.4s, v17.4s, v19.4s  /* tmp3 += z1 */
    add             v16.4s, v16.4s, v18.4s  /* tmp3 += z1 */

    add             v11.4s, v11.4s, v23.4s  /* tmp0 += z3 */
    add             v10.4s, v10.4s, v22.4s  /* tmp0 += z3 */
    add             v13.4s, v13.4s, v25.4s  /* tmp1 += z4 */
    add             v12.4s, v12.4s, v24.4s  /* tmp1 += z4 */
    add             v17.4s, v17.4s, v25.4s  /* tmp3 += z4 */
    add             v16.4s, v16.4s, v24.4s  /* tmp3 += z4 */
    add             v15.4s, v15.4s, v23.4s  /* tmp2 += z3 */
    add             v14.4s, v14.4s, v22.4s  /* tmp2 += z3 */

    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */

    add             v18.4s, v2.4s, v16.4s   /* tmp10 + tmp3 */
    add             v19.4s, v28.4s, v17.4s  /* tmp10 + tmp3 */
    sub             v20.4s, v2.4s, v16.4s   /* tmp10 - tmp3 */
    sub             v21.4s, v28.4s, v17.4s  /* tmp10 - tmp3 */
    add             v22.4s, v8.4s, v14.4s   /* tmp11 + tmp2 */
    add             v23.4s, v29.4s, v15.4s  /* tmp11 + tmp2 */
    sub             v24.4s, v8.4s, v14.4s   /* tmp11 - tmp2 */
    sub             v25.4s, v29.4s, v15.4s  /* tmp11 - tmp2 */
    add             v26.4s, v4.4s, v12.4s   /* tmp12 + tmp1 */
    add             v27.4s, v30.4s, v13.4s  /* tmp12 + tmp1 */
    sub             v28.4s, v4.4s, v12.4s   /* tmp12 - tmp1 */
    sub             v29.4s, v30.4s, v13.4s  /* tmp12 - tmp1 */
    add             v14.4s, v6.4s, v10.4s   /* tmp13 + tmp0 */
    add             v15.4s, v31.4s, v11.4s  /* tmp13 + tmp0 */
    sub             v16.4s, v6.4s, v10.4s   /* tmp13 - tmp0 */
    sub             v17.4s, v31.4s, v11.4s  /* tmp13 - tmp0 */

    rshrn           v2.4h, v18.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
    rshrn           v3.4h, v22.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
    rshrn           v4.4h, v26.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
    rshrn           v5.4h, v14.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
    rshrn           v6.4h, v19.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
    rshrn           v7.4h, v23.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
    rshrn           v8.4h, v27.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
    rshrn           v9.4h, v15.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
    rshrn2          v2.8h, v16.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
    rshrn2          v3.8h, v28.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
    rshrn2          v4.8h, v24.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
    rshrn2          v5.8h, v20.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
    rshrn2          v6.8h, v17.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
    rshrn2          v7.8h, v29.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
    rshrn2          v8.8h, v25.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
    rshrn2          v9.8h, v21.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
    b               1b

    .unreq          DCT_TABLE
    .unreq          COEF_BLOCK
    .unreq          OUTPUT_BUF
    .unreq          OUTPUT_COL
    .unreq          TMP1
    .unreq          TMP2
    .unreq          TMP3
    .unreq          TMP4
    .unreq          TMP5
    .unreq          TMP6
    .unreq          TMP7
    .unreq          TMP8

#undef CENTERJSAMPLE
#undef CONST_BITS
#undef PASS1_BITS
#undef XFIX_P_0_298
#undef XFIX_N_0_390
#undef XFIX_P_0_541
#undef XFIX_P_0_765
#undef XFIX_N_0_899
#undef XFIX_P_1_175
#undef XFIX_P_1_501
#undef XFIX_N_1_847
#undef XFIX_N_1_961
#undef XFIX_P_2_053
#undef XFIX_N_2_562
#undef XFIX_P_3_072


/*****************************************************************************/

/*
 * jsimd_idct_ifast_neon
 *
 * This function contains a fast, less accurate integer implementation of
 * the inverse DCT (Discrete Cosine Transform).  It uses the same
 * calculations and produces exactly the same output as IJG's original
 * 'jpeg_idct_ifast' function from jidctfst.c.
 *
 * Normally a 1-D AAN DCT needs 5 multiplications and 29 additions.
 * But in the ARM NEON case, some extra additions are required because the
 * VQDMULH instruction can't handle constants larger than 1.  So expressions
 * like "x * 1.082392200" have to be converted to "x * 0.082392200 + x",
 * which introduces an extra addition.  Overall, there are 6 extra additions
 * per 1-D IDCT pass, for a total of 5 VQDMULH and 35 VADD/VSUB instructions.
 */
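/* For example, with SQDMULH (the AArch64 form of VQDMULH) computing
 * (a * b * 2) >> 16, i.e. a * b / 2^15:
 *   x * 1.082392200  ->  x + sqdmulh(x, XFIX_1_082392200)
 * since XFIX_1_082392200 / 32768 ~= 0.082392200, and
 *   x * 2.613125930  ->  x + x + sqdmulh(x, XFIX_2_613125930)
 * since XFIX_2_613125930 / 32768 ~= 0.613125930.
 */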

#define XFIX_1_082392200  v0.h[0]
#define XFIX_1_414213562  v0.h[1]
#define XFIX_1_847759065  v0.h[2]
#define XFIX_2_613125930  v0.h[3]

asm_function jsimd_idct_ifast_neon

    DCT_TABLE       .req x0
    COEF_BLOCK      .req x1
    OUTPUT_BUF      .req x2
    OUTPUT_COL      .req x3
    TMP1            .req x0
    TMP2            .req x1
    TMP3            .req x9
    TMP4            .req x10
    TMP5            .req x11
    TMP6            .req x12
    TMP7            .req x13
    TMP8            .req x14

    /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
       guarantee that the upper (unused) 32 bits of x3 are valid.  This
       instruction ensures that those bits are set to zero. */
    uxtw x3, w3

    /* Load and dequantize coefficients into NEON registers
     * with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0 | d16     | d17     ( v16.8h )
     *   1 | d18     | d19     ( v17.8h )
     *   2 | d20     | d21     ( v18.8h )
     *   3 | d22     | d23     ( v19.8h )
     *   4 | d24     | d25     ( v20.8h )
     *   5 | d26     | d27     ( v21.8h )
     *   6 | d28     | d29     ( v22.8h )
     *   7 | d30     | d31     ( v23.8h )
     */
    get_symbol_loc  TMP5, Ljsimd_idct_ifast_neon_consts
    ld1             {v16.8h, v17.8h}, [COEF_BLOCK], 32
    ld1             {v0.8h, v1.8h}, [DCT_TABLE], 32
    ld1             {v18.8h, v19.8h}, [COEF_BLOCK], 32
    mul             v16.8h, v16.8h, v0.8h
    ld1             {v2.8h, v3.8h}, [DCT_TABLE], 32
    mul             v17.8h, v17.8h, v1.8h
    ld1             {v20.8h, v21.8h}, [COEF_BLOCK], 32
    mul             v18.8h, v18.8h, v2.8h
    ld1             {v0.8h, v1.8h}, [DCT_TABLE], 32
    mul             v19.8h, v19.8h, v3.8h
    ld1             {v22.8h, v23.8h}, [COEF_BLOCK], 32
    mul             v20.8h, v20.8h, v0.8h
    ld1             {v2.8h, v3.8h}, [DCT_TABLE], 32
    mul             v22.8h, v22.8h, v2.8h
    mul             v21.8h, v21.8h, v1.8h
    ld1             {v0.4h}, [TMP5]        /* load constants */
    mul             v23.8h, v23.8h, v3.8h

    /* 1-D IDCT, pass 1 */
    sub             v2.8h, v18.8h, v22.8h
    add             v22.8h, v18.8h, v22.8h
    sub             v1.8h, v19.8h, v21.8h
    add             v21.8h, v19.8h, v21.8h
    sub             v5.8h, v17.8h, v23.8h
    add             v23.8h, v17.8h, v23.8h
    sqdmulh         v4.8h, v2.8h, XFIX_1_414213562
    sqdmulh         v6.8h, v1.8h, XFIX_2_613125930
    add             v3.8h, v1.8h, v1.8h
    sub             v1.8h, v5.8h, v1.8h
    add             v18.8h, v2.8h, v4.8h
    sqdmulh         v4.8h, v1.8h, XFIX_1_847759065
    sub             v2.8h, v23.8h, v21.8h
    add             v3.8h, v3.8h, v6.8h
    sqdmulh         v6.8h, v2.8h, XFIX_1_414213562
    add             v1.8h, v1.8h, v4.8h
    sqdmulh         v4.8h, v5.8h, XFIX_1_082392200
    sub             v18.8h, v18.8h, v22.8h
    add             v2.8h, v2.8h, v6.8h
    sub             v6.8h, v16.8h, v20.8h
    add             v20.8h, v16.8h, v20.8h
    add             v17.8h, v5.8h, v4.8h
    add             v5.8h, v6.8h, v18.8h
    sub             v18.8h, v6.8h, v18.8h
    add             v6.8h, v23.8h, v21.8h
    add             v16.8h, v20.8h, v22.8h
    sub             v3.8h, v6.8h, v3.8h
    sub             v20.8h, v20.8h, v22.8h
    sub             v3.8h, v3.8h, v1.8h
    sub             v1.8h, v17.8h, v1.8h
    add             v2.8h, v3.8h, v2.8h
    sub             v23.8h, v16.8h, v6.8h
    add             v1.8h, v1.8h, v2.8h
    add             v16.8h, v16.8h, v6.8h
    add             v22.8h, v5.8h, v3.8h
    sub             v17.8h, v5.8h, v3.8h
    sub             v21.8h, v18.8h, v2.8h
    add             v18.8h, v18.8h, v2.8h
    sub             v19.8h, v20.8h, v1.8h
    add             v20.8h, v20.8h, v1.8h
    transpose_8x8   v16, v17, v18, v19, v20, v21, v22, v23, v28, v29, v30, v31
    /* 1-D IDCT, pass 2 */
    sub             v2.8h, v18.8h, v22.8h
    add             v22.8h, v18.8h, v22.8h
    sub             v1.8h, v19.8h, v21.8h
    add             v21.8h, v19.8h, v21.8h
    sub             v5.8h, v17.8h, v23.8h
    add             v23.8h, v17.8h, v23.8h
    sqdmulh         v4.8h, v2.8h, XFIX_1_414213562
    sqdmulh         v6.8h, v1.8h, XFIX_2_613125930
    add             v3.8h, v1.8h, v1.8h
    sub             v1.8h, v5.8h, v1.8h
    add             v18.8h, v2.8h, v4.8h
    sqdmulh         v4.8h, v1.8h, XFIX_1_847759065
    sub             v2.8h, v23.8h, v21.8h
    add             v3.8h, v3.8h, v6.8h
    sqdmulh         v6.8h, v2.8h, XFIX_1_414213562
    add             v1.8h, v1.8h, v4.8h
    sqdmulh         v4.8h, v5.8h, XFIX_1_082392200
    sub             v18.8h, v18.8h, v22.8h
    add             v2.8h, v2.8h, v6.8h
    sub             v6.8h, v16.8h, v20.8h
    add             v20.8h, v16.8h, v20.8h
    add             v17.8h, v5.8h, v4.8h
    add             v5.8h, v6.8h, v18.8h
    sub             v18.8h, v6.8h, v18.8h
    add             v6.8h, v23.8h, v21.8h
    add             v16.8h, v20.8h, v22.8h
    sub             v3.8h, v6.8h, v3.8h
    sub             v20.8h, v20.8h, v22.8h
    sub             v3.8h, v3.8h, v1.8h
    sub             v1.8h, v17.8h, v1.8h
    add             v2.8h, v3.8h, v2.8h
    sub             v23.8h, v16.8h, v6.8h
    add             v1.8h, v1.8h, v2.8h
    add             v16.8h, v16.8h, v6.8h
    add             v22.8h, v5.8h, v3.8h
    sub             v17.8h, v5.8h, v3.8h
    sub             v21.8h, v18.8h, v2.8h
    add             v18.8h, v18.8h, v2.8h
    sub             v19.8h, v20.8h, v1.8h
    add             v20.8h, v20.8h, v1.8h
    /* Descale to 8-bit and range limit */
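    /* Each result is scaled by 2^5 here (the PASS1_BITS + 3 shift from
     * jidctfst.c), so a narrowing right shift by #5 yields the 8-bit
     * samples.  SQSHRN saturates to [-128, 127], and adding 0x80 afterwards
     * undoes the level shift, mapping the output into [0, 255]. */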
    movi            v0.16b, #0x80
      /* Prepare pointers (dual-issue with NEON instructions) */
      ldp             TMP1, TMP2, [OUTPUT_BUF], 16
    sqshrn          v28.8b, v16.8h, #5
      ldp             TMP3, TMP4, [OUTPUT_BUF], 16
    sqshrn          v29.8b, v17.8h, #5
      add             TMP1, TMP1, OUTPUT_COL
    sqshrn          v30.8b, v18.8h, #5
      add             TMP2, TMP2, OUTPUT_COL
    sqshrn          v31.8b, v19.8h, #5
      add             TMP3, TMP3, OUTPUT_COL
    sqshrn2         v28.16b, v20.8h, #5
      add             TMP4, TMP4, OUTPUT_COL
    sqshrn2         v29.16b, v21.8h, #5
      ldp             TMP5, TMP6, [OUTPUT_BUF], 16
    sqshrn2         v30.16b, v22.8h, #5
      ldp             TMP7, TMP8, [OUTPUT_BUF], 16
    sqshrn2         v31.16b, v23.8h, #5
      add             TMP5, TMP5, OUTPUT_COL
    add             v16.16b, v28.16b, v0.16b
      add             TMP6, TMP6, OUTPUT_COL
    add             v18.16b, v29.16b, v0.16b
      add             TMP7, TMP7, OUTPUT_COL
    add             v20.16b, v30.16b, v0.16b
      add             TMP8, TMP8, OUTPUT_COL
    add             v22.16b, v31.16b, v0.16b

    /* Transpose the final 8-bit samples */
    trn1            v28.16b, v16.16b, v18.16b
    trn1            v30.16b, v20.16b, v22.16b
    trn2            v29.16b, v16.16b, v18.16b
    trn2            v31.16b, v20.16b, v22.16b

    trn1            v16.8h, v28.8h, v30.8h
    trn2            v18.8h, v28.8h, v30.8h
    trn1            v20.8h, v29.8h, v31.8h
    trn2            v22.8h, v29.8h, v31.8h

    uzp1            v28.4s, v16.4s, v18.4s
    uzp2            v30.4s, v16.4s, v18.4s
    uzp1            v29.4s, v20.4s, v22.4s
    uzp2            v31.4s, v20.4s, v22.4s

    /* Store results to the output buffer */
    st1             {v28.d}[0], [TMP1]
    st1             {v29.d}[0], [TMP2]
    st1             {v28.d}[1], [TMP3]
    st1             {v29.d}[1], [TMP4]
    st1             {v30.d}[0], [TMP5]
    st1             {v31.d}[0], [TMP6]
    st1             {v30.d}[1], [TMP7]
    st1             {v31.d}[1], [TMP8]
    br              x30

    .unreq          DCT_TABLE
    .unreq          COEF_BLOCK
    .unreq          OUTPUT_BUF
    .unreq          OUTPUT_COL
    .unreq          TMP1
    .unreq          TMP2
    .unreq          TMP3
    .unreq          TMP4
    .unreq          TMP5
    .unreq          TMP6
    .unreq          TMP7
    .unreq          TMP8


/*****************************************************************************/

/*
 * jsimd_idct_4x4_neon
 *
 * This function contains inverse-DCT code for getting reduced-size
 * 4x4 pixel output from an 8x8 DCT block.  It uses the same calculations
 * and produces exactly the same output as IJG's original 'jpeg_idct_4x4'
 * function from jpeg-6b (jidctred.c).
 *
 * NOTE: jpeg-8 has an improved implementation of the 4x4 inverse DCT, which
 *       requires many fewer arithmetic operations and hence should be
 *       faster.  The primary purpose of this particular NEON-optimized
 *       function is bit-exact compatibility with jpeg-6b.
 *
 * TODO: slightly better instruction scheduling could be achieved by
 *       expanding the idct_helper/transpose_4x4 macros and reordering
 *       instructions, but readability would suffer somewhat.
 */

.macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29
    smull           v28.4s, \x4, v2.h[2]
    smlal           v28.4s, \x8, v0.h[0]
    smlal           v28.4s, \x14, v0.h[1]

    smull           v26.4s, \x16, v1.h[2]
    smlal           v26.4s, \x12, v1.h[3]
    smlal           v26.4s, \x10, v2.h[0]
    smlal           v26.4s, \x6, v2.h[1]

    smull           v30.4s, \x4, v2.h[2]
    smlsl           v30.4s, \x8, v0.h[0]
    smlsl           v30.4s, \x14, v0.h[1]

    smull           v24.4s, \x16, v0.h[2]
    smlal           v24.4s, \x12, v0.h[3]
    smlal           v24.4s, \x10, v1.h[0]
    smlal           v24.4s, \x6, v1.h[1]

    add             v20.4s, v28.4s, v26.4s
    sub             v28.4s, v28.4s, v26.4s

  .if \shift > 16
    srshr           v20.4s, v20.4s, #\shift
    srshr           v28.4s, v28.4s, #\shift
    xtn             \y26, v20.4s
    xtn             \y29, v28.4s
  .else
    rshrn           \y26, v20.4s, #\shift
    rshrn           \y29, v28.4s, #\shift
  .endif

    add             v20.4s, v30.4s, v24.4s
    sub             v30.4s, v30.4s, v24.4s

  .if \shift > 16
    srshr           v20.4s, v20.4s, #\shift
    srshr           v30.4s, v30.4s, #\shift
    xtn             \y27, v20.4s
    xtn             \y28, v30.4s
  .else
    rshrn           \y27, v20.4s, #\shift
    rshrn           \y28, v30.4s, #\shift
  .endif
.endm
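
/* Note: RSHRN can encode a 32-bit -> 16-bit narrowing shift of at most #16,
 * so the \shift > 16 case (pass 2 uses #19) is split into a rounding shift
 * that stays in 32 bits (SRSHR) followed by a plain narrow (XTN).  Either
 * path computes, roughly in C:
 *
 *   int16_t out = (int16_t)((x + (1 << (shift - 1))) >> shift);
 */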

asm_function jsimd_idct_4x4_neon

    DCT_TABLE       .req x0
    COEF_BLOCK      .req x1
    OUTPUT_BUF      .req x2
    OUTPUT_COL      .req x3
    TMP1            .req x0
    TMP2            .req x1
    TMP3            .req x2
    TMP4            .req x15

    /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
       guarantee that the upper (unused) 32 bits of x3 are valid.  This
       instruction ensures that those bits are set to zero. */
    uxtw x3, w3

    /* Save all used NEON registers */
    sub             sp, sp, 64
    mov             x9, sp
    /* Load constants (v3.4h is just used for padding) */
    get_symbol_loc  TMP4, Ljsimd_idct_4x4_neon_consts
    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
    ld1             {v0.4h, v1.4h, v2.4h, v3.4h}, [TMP4]

    /* Load all COEF_BLOCK into NEON registers with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0 | v4.4h   | v5.4h
     *   1 | v6.4h   | v7.4h
     *   2 | v8.4h   | v9.4h
     *   3 | v10.4h  | v11.4h
     *   4 | -       | -
     *   5 | v12.4h  | v13.4h
     *   6 | v14.4h  | v15.4h
     *   7 | v16.4h  | v17.4h
     */
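    /* Row 4 is never loaded: jpeg_idct_4x4 drops that row entirely, since
     * its basis function evaluates to zero at all four reduced output
     * positions. */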
    ld1             {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32
    ld1             {v8.4h, v9.4h, v10.4h, v11.4h}, [COEF_BLOCK], 32
    add             COEF_BLOCK, COEF_BLOCK, #16
    ld1             {v12.4h, v13.4h, v14.4h, v15.4h}, [COEF_BLOCK], 32
    ld1             {v16.4h, v17.4h}, [COEF_BLOCK], 16
    /* dequantize */
    ld1             {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32
    mul             v4.4h, v4.4h, v18.4h
    mul             v5.4h, v5.4h, v19.4h
    ins             v4.d[1], v5.d[0]              /* 128 bit q4 */
    ld1             {v22.4h, v23.4h, v24.4h, v25.4h}, [DCT_TABLE], 32
    mul             v6.4h, v6.4h, v20.4h
    mul             v7.4h, v7.4h, v21.4h
    ins             v6.d[1], v7.d[0]              /* 128 bit q6 */
    mul             v8.4h, v8.4h, v22.4h
    mul             v9.4h, v9.4h, v23.4h
    ins             v8.d[1], v9.d[0]              /* 128 bit q8 */
    add             DCT_TABLE, DCT_TABLE, #16
    ld1             {v26.4h, v27.4h, v28.4h, v29.4h}, [DCT_TABLE], 32
    mul             v10.4h, v10.4h, v24.4h
    mul             v11.4h, v11.4h, v25.4h
    ins             v10.d[1], v11.d[0]            /* 128 bit q10 */
    mul             v12.4h, v12.4h, v26.4h
    mul             v13.4h, v13.4h, v27.4h
    ins             v12.d[1], v13.d[0]            /* 128 bit q12 */
    ld1             {v30.4h, v31.4h}, [DCT_TABLE], 16
    mul             v14.4h, v14.4h, v28.4h
    mul             v15.4h, v15.4h, v29.4h
    ins             v14.d[1], v15.d[0]            /* 128 bit q14 */
    mul             v16.4h, v16.4h, v30.4h
    mul             v17.4h, v17.4h, v31.4h
    ins             v16.d[1], v17.d[0]            /* 128 bit q16 */

    /* Pass 1 */
    idct_helper     v4.4h, v6.4h, v8.4h, v10.4h, v12.4h, v14.4h, v16.4h, 12, \
                    v4.4h, v6.4h, v8.4h, v10.4h
    transpose_4x4   v4, v6, v8, v10, v3
    ins             v10.d[1], v11.d[0]
    idct_helper     v5.4h, v7.4h, v9.4h, v11.4h, v13.4h, v15.4h, v17.4h, 12, \
                    v5.4h, v7.4h, v9.4h, v11.4h
    transpose_4x4   v5, v7, v9, v11, v3
    ins             v10.d[1], v11.d[0]

    /* Pass 2 */
    idct_helper     v4.4h, v6.4h, v8.4h, v10.4h, v7.4h, v9.4h, v11.4h, 19, \
                    v26.4h, v27.4h, v28.4h, v29.4h
    transpose_4x4   v26, v27, v28, v29, v3

    /* Range limit */
    movi            v30.8h, #0x80
    ins             v26.d[1], v27.d[0]
    ins             v28.d[1], v29.d[0]
    add             v26.8h, v26.8h, v30.8h
    add             v28.8h, v28.8h, v30.8h
    sqxtun          v26.8b, v26.8h
    sqxtun          v27.8b, v28.8h

    /* Store results to the output buffer */
    ldp             TMP1, TMP2, [OUTPUT_BUF], 16
    ldp             TMP3, TMP4, [OUTPUT_BUF]
    add             TMP1, TMP1, OUTPUT_COL
    add             TMP2, TMP2, OUTPUT_COL
    add             TMP3, TMP3, OUTPUT_COL
    add             TMP4, TMP4, OUTPUT_COL

#if defined(__ARMEL__) && !RESPECT_STRICT_ALIGNMENT
    /* We can use far fewer instructions on little-endian systems if the
     * OS kernel is not configured to trap unaligned memory accesses
     */
    st1             {v26.s}[0], [TMP1], 4
    st1             {v27.s}[0], [TMP3], 4
    st1             {v26.s}[1], [TMP2], 4
    st1             {v27.s}[1], [TMP4], 4
#else
    st1             {v26.b}[0], [TMP1], 1
    st1             {v27.b}[0], [TMP3], 1
    st1             {v26.b}[1], [TMP1], 1
    st1             {v27.b}[1], [TMP3], 1
    st1             {v26.b}[2], [TMP1], 1
    st1             {v27.b}[2], [TMP3], 1
    st1             {v26.b}[3], [TMP1], 1
    st1             {v27.b}[3], [TMP3], 1

    st1             {v26.b}[4], [TMP2], 1
    st1             {v27.b}[4], [TMP4], 1
    st1             {v26.b}[5], [TMP2], 1
    st1             {v27.b}[5], [TMP4], 1
    st1             {v26.b}[6], [TMP2], 1
    st1             {v27.b}[6], [TMP4], 1
    st1             {v26.b}[7], [TMP2], 1
    st1             {v27.b}[7], [TMP4], 1
#endif

    /* vpop            {v8.4h - v15.4h}    ; not available */
    ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
    ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
    br              x30

    .unreq          DCT_TABLE
    .unreq          COEF_BLOCK
    .unreq          OUTPUT_BUF
    .unreq          OUTPUT_COL
    .unreq          TMP1
    .unreq          TMP2
    .unreq          TMP3
    .unreq          TMP4

.purgem idct_helper


/*****************************************************************************/

/*
 * jsimd_idct_2x2_neon
 *
 * This function contains inverse-DCT code for getting reduced-size
 * 2x2 pixel output from an 8x8 DCT block.  It uses the same calculations
 * and produces exactly the same output as IJG's original 'jpeg_idct_2x2'
 * function from jpeg-6b (jidctred.c).
 *
 * NOTE: jpeg-8 has an improved implementation of the 2x2 inverse DCT, which
 *       requires many fewer arithmetic operations and hence should be
 *       faster.  The primary purpose of this particular NEON-optimized
 *       function is bit-exact compatibility with jpeg-6b.
 */

.macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27
    sshll           v15.4s, \x4, #15
    smull           v26.4s, \x6, v14.h[3]
    smlal           v26.4s, \x10, v14.h[2]
    smlal           v26.4s, \x12, v14.h[1]
    smlal           v26.4s, \x16, v14.h[0]

    add             v20.4s, v15.4s, v26.4s
    sub             v15.4s, v15.4s, v26.4s

  .if \shift > 16
    srshr           v20.4s, v20.4s, #\shift
    srshr           v15.4s, v15.4s, #\shift
    xtn             \y26, v20.4s
    xtn             \y27, v15.4s
  .else
    rshrn           \y26, v20.4s, #\shift
    rshrn           \y27, v15.4s, #\shift
  .endif
.endm

asm_function jsimd_idct_2x2_neon

    DCT_TABLE       .req x0
    COEF_BLOCK      .req x1
    OUTPUT_BUF      .req x2
    OUTPUT_COL      .req x3
    TMP1            .req x0
    TMP2            .req x15

    /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
       guarantee that the upper (unused) 32 bits of x3 are valid.  This
       instruction ensures that those bits are set to zero. */
    uxtw x3, w3

    /* vpush           {v8.4h - v15.4h}            ; not available */
    sub             sp, sp, 64
    mov             x9, sp

    /* Load constants */
    get_symbol_loc  TMP2, Ljsimd_idct_2x2_neon_consts
    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
    ld1             {v14.4h}, [TMP2]

    /* Load all COEF_BLOCK into NEON registers with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0 | v4.4h   | v5.4h
     *   1 | v6.4h   | v7.4h
     *   2 | -       | -
     *   3 | v10.4h  | v11.4h
     *   4 | -       | -
     *   5 | v12.4h  | v13.4h
     *   6 | -       | -
     *   7 | v16.4h  | v17.4h
     */
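    /* Rows 2, 4, and 6 are never loaded: in jpeg_idct_2x2 their basis
     * functions evaluate to zero at both reduced output positions, so those
     * coefficients cannot affect the 2x2 result. */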
    ld1             {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32
    add             COEF_BLOCK, COEF_BLOCK, #16
    ld1             {v10.4h, v11.4h}, [COEF_BLOCK], 16
    add             COEF_BLOCK, COEF_BLOCK, #16
    ld1             {v12.4h, v13.4h}, [COEF_BLOCK], 16
    add             COEF_BLOCK, COEF_BLOCK, #16
    ld1             {v16.4h, v17.4h}, [COEF_BLOCK], 16
    /* Dequantize */
    ld1             {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32
    mul             v4.4h, v4.4h, v18.4h
    mul             v5.4h, v5.4h, v19.4h
    ins             v4.d[1], v5.d[0]
    mul             v6.4h, v6.4h, v20.4h
    mul             v7.4h, v7.4h, v21.4h
    ins             v6.d[1], v7.d[0]
    add             DCT_TABLE, DCT_TABLE, #16
    ld1             {v24.4h, v25.4h}, [DCT_TABLE], 16
    mul             v10.4h, v10.4h, v24.4h
    mul             v11.4h, v11.4h, v25.4h
    ins             v10.d[1], v11.d[0]
    add             DCT_TABLE, DCT_TABLE, #16
    ld1             {v26.4h, v27.4h}, [DCT_TABLE], 16
    mul             v12.4h, v12.4h, v26.4h
    mul             v13.4h, v13.4h, v27.4h
    ins             v12.d[1], v13.d[0]
    add             DCT_TABLE, DCT_TABLE, #16
    ld1             {v30.4h, v31.4h}, [DCT_TABLE], 16
    mul             v16.4h, v16.4h, v30.4h
    mul             v17.4h, v17.4h, v31.4h
    ins             v16.d[1], v17.d[0]

    /* Pass 1 */
#if 0
    idct_helper     v4.4h, v6.4h, v10.4h, v12.4h, v16.4h, 13, v4.4h, v6.4h
    transpose_4x4   v4.4h, v6.4h, v8.4h, v10.4h
    idct_helper     v5.4h, v7.4h, v11.4h, v13.4h, v17.4h, 13, v5.4h, v7.4h
    transpose_4x4   v5.4h, v7.4h, v9.4h, v11.4h
#else
    smull           v26.4s, v6.4h, v14.h[3]
    smlal           v26.4s, v10.4h, v14.h[2]
    smlal           v26.4s, v12.4h, v14.h[1]
    smlal           v26.4s, v16.4h, v14.h[0]
    smull           v24.4s, v7.4h, v14.h[3]
    smlal           v24.4s, v11.4h, v14.h[2]
    smlal           v24.4s, v13.4h, v14.h[1]
    smlal           v24.4s, v17.4h, v14.h[0]
    sshll           v15.4s, v4.4h, #15
    sshll           v30.4s, v5.4h, #15
    add             v20.4s, v15.4s, v26.4s
    sub             v15.4s, v15.4s, v26.4s
    rshrn           v4.4h, v20.4s, #13
    rshrn           v6.4h, v15.4s, #13
    add             v20.4s, v30.4s, v24.4s
    sub             v15.4s, v30.4s, v24.4s
    rshrn           v5.4h, v20.4s, #13
    rshrn           v7.4h, v15.4s, #13
    ins             v4.d[1], v5.d[0]
    ins             v6.d[1], v7.d[0]
    transpose       v4, v6, v3, .16b, .8h
    transpose       v6, v10, v3, .16b, .4s
    ins             v11.d[0], v10.d[1]
    ins             v7.d[0], v6.d[1]
#endif

    /* Pass 2 */
    idct_helper     v4.4h, v6.4h, v10.4h, v7.4h, v11.4h, 20, v26.4h, v27.4h

    /* Range limit */
    movi            v30.8h, #0x80
    ins             v26.d[1], v27.d[0]
    add             v26.8h, v26.8h, v30.8h
    sqxtun          v30.8b, v26.8h
    ins             v26.d[0], v30.d[0]
    sqxtun          v27.8b, v26.8h

    /* Store results to the output buffer */
    ldp             TMP1, TMP2, [OUTPUT_BUF]
    add             TMP1, TMP1, OUTPUT_COL
    add             TMP2, TMP2, OUTPUT_COL

    st1             {v26.b}[0], [TMP1], 1
    st1             {v27.b}[4], [TMP1], 1
    st1             {v26.b}[1], [TMP2], 1
    st1             {v27.b}[5], [TMP2], 1

    ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
    ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
    br              x30

    .unreq          DCT_TABLE
    .unreq          COEF_BLOCK
    .unreq          OUTPUT_BUF
    .unreq          OUTPUT_COL
    .unreq          TMP1
    .unreq          TMP2

.purgem idct_helper


/*****************************************************************************/

/*
 * jsimd_ycc_extrgb_convert_neon
 * jsimd_ycc_extbgr_convert_neon
 * jsimd_ycc_extrgbx_convert_neon
 * jsimd_ycc_extbgrx_convert_neon
 * jsimd_ycc_extxbgr_convert_neon
 * jsimd_ycc_extxrgb_convert_neon
 *
 * Colorspace conversion YCbCr -> RGB
 */
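
/* The fixed-point math below corresponds to the usual JPEG YCbCr->RGB
 * equations; as a reference model in C for one pixel (the NEON code
 * vectorizes this and rounds via RSHRN):
 *
 *   int cb = u - 128, cr = v - 128;
 *   int r  = y + ((22971 * cr + 8192) >> 14);                 // 1.40200 * cr
 *   int g  = y + ((-11277 * cb - 23401 * cr + 16384) >> 15);  // -0.34414 * cb
 *                                                             //   - 0.71414 * cr
 *   int b  = y + ((29033 * cb + 8192) >> 14);                 // 1.77200 * cb
 *
 * followed by clamping r/g/b to [0, 255] (SQXTUN). */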

.macro do_load size
  .if \size == 8
    ld1             {v4.8b}, [U], 8
    ld1             {v5.8b}, [V], 8
    ld1             {v0.8b}, [Y], 8
    prfm            pldl1keep, [U, #64]
    prfm            pldl1keep, [V, #64]
    prfm            pldl1keep, [Y, #64]
  .elseif \size == 4
    ld1             {v4.b}[0], [U], 1
    ld1             {v4.b}[1], [U], 1
    ld1             {v4.b}[2], [U], 1
    ld1             {v4.b}[3], [U], 1
    ld1             {v5.b}[0], [V], 1
    ld1             {v5.b}[1], [V], 1
    ld1             {v5.b}[2], [V], 1
    ld1             {v5.b}[3], [V], 1
    ld1             {v0.b}[0], [Y], 1
    ld1             {v0.b}[1], [Y], 1
    ld1             {v0.b}[2], [Y], 1
    ld1             {v0.b}[3], [Y], 1
  .elseif \size == 2
    ld1             {v4.b}[4], [U], 1
    ld1             {v4.b}[5], [U], 1
    ld1             {v5.b}[4], [V], 1
    ld1             {v5.b}[5], [V], 1
    ld1             {v0.b}[4], [Y], 1
    ld1             {v0.b}[5], [Y], 1
  .elseif \size == 1
    ld1             {v4.b}[6], [U], 1
    ld1             {v5.b}[6], [V], 1
    ld1             {v0.b}[6], [Y], 1
  .else
    .error unsupported macroblock size
  .endif
.endm

.macro do_store bpp, size, fast_st3
  .if \bpp == 24
    .if \size == 8
      .if \fast_st3 == 1
        st3         {v10.8b, v11.8b, v12.8b}, [RGB], 24
      .else
        st1         {v10.b}[0], [RGB], #1
        st1         {v11.b}[0], [RGB], #1
        st1         {v12.b}[0], [RGB], #1

        st1         {v10.b}[1], [RGB], #1
        st1         {v11.b}[1], [RGB], #1
        st1         {v12.b}[1], [RGB], #1

        st1         {v10.b}[2], [RGB], #1
        st1         {v11.b}[2], [RGB], #1
        st1         {v12.b}[2], [RGB], #1

        st1         {v10.b}[3], [RGB], #1
        st1         {v11.b}[3], [RGB], #1
        st1         {v12.b}[3], [RGB], #1

        st1         {v10.b}[4], [RGB], #1
        st1         {v11.b}[4], [RGB], #1
        st1         {v12.b}[4], [RGB], #1

        st1         {v10.b}[5], [RGB], #1
        st1         {v11.b}[5], [RGB], #1
        st1         {v12.b}[5], [RGB], #1

        st1         {v10.b}[6], [RGB], #1
        st1         {v11.b}[6], [RGB], #1
        st1         {v12.b}[6], [RGB], #1

        st1         {v10.b}[7], [RGB], #1
        st1         {v11.b}[7], [RGB], #1
        st1         {v12.b}[7], [RGB], #1
      .endif
    .elseif \size == 4
      st3           {v10.b, v11.b, v12.b}[0], [RGB], 3
      st3           {v10.b, v11.b, v12.b}[1], [RGB], 3
      st3           {v10.b, v11.b, v12.b}[2], [RGB], 3
      st3           {v10.b, v11.b, v12.b}[3], [RGB], 3
    .elseif \size == 2
      st3           {v10.b, v11.b, v12.b}[4], [RGB], 3
      st3           {v10.b, v11.b, v12.b}[5], [RGB], 3
    .elseif \size == 1
      st3           {v10.b, v11.b, v12.b}[6], [RGB], 3
    .else
      .error unsupported macroblock size
    .endif
  .elseif \bpp == 32
    .if \size == 8
      st4           {v10.8b, v11.8b, v12.8b, v13.8b}, [RGB], 32
    .elseif \size == 4
      st4           {v10.b, v11.b, v12.b, v13.b}[0], [RGB], 4
      st4           {v10.b, v11.b, v12.b, v13.b}[1], [RGB], 4
      st4           {v10.b, v11.b, v12.b, v13.b}[2], [RGB], 4
      st4           {v10.b, v11.b, v12.b, v13.b}[3], [RGB], 4
    .elseif \size == 2
      st4           {v10.b, v11.b, v12.b, v13.b}[4], [RGB], 4
      st4           {v10.b, v11.b, v12.b, v13.b}[5], [RGB], 4
    .elseif \size == 1
      st4           {v10.b, v11.b, v12.b, v13.b}[6], [RGB], 4
    .else
      .error unsupported macroblock size
    .endif
  .elseif \bpp == 16
    .if \size == 8
      st1           {v25.8h}, [RGB], 16
    .elseif \size == 4
      st1           {v25.4h}, [RGB], 8
    .elseif \size == 2
      st1           {v25.h}[4], [RGB], 2
      st1           {v25.h}[5], [RGB], 2
    .elseif \size == 1
      st1           {v25.h}[6], [RGB], 2
    .else
      .error unsupported macroblock size
    .endif
  .else
    .error unsupported bpp
  .endif
.endm
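
/* When \fast_st3 == 0, the 24-bit/size-8 path above stores the interleaved
 * RGB bytes one lane at a time instead of using ST3.  This variant is
 * emitted as the *_slowst3 entry points below, intended for cores on which
 * ST3 is disproportionately slow. */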

.macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, rsize, \
                                           g_offs, gsize, b_offs, bsize, \
                                           defsize, fast_st3

/*
 * 2-stage pipelined YCbCr->RGB conversion
 */

.macro do_yuv_to_rgb_stage1
    uaddw           v6.8h, v2.8h, v4.8b     /* v6.8h = u - 128 */
    uaddw           v8.8h, v2.8h, v5.8b     /* v8.8h = v - 128 */
    smull           v20.4s, v6.4h, v1.h[1]  /* multiply by -11277 */
    smlal           v20.4s, v8.4h, v1.h[2]  /* multiply by -23401 */
    smull2          v22.4s, v6.8h, v1.h[1]  /* multiply by -11277 */
    smlal2          v22.4s, v8.8h, v1.h[2]  /* multiply by -23401 */
    smull           v24.4s, v8.4h, v1.h[0]  /* multiply by 22971 */
    smull2          v26.4s, v8.8h, v1.h[0]  /* multiply by 22971 */
    smull           v28.4s, v6.4h, v1.h[3]  /* multiply by 29033 */
    smull2          v30.4s, v6.8h, v1.h[3]  /* multiply by 29033 */
.endm

.macro do_yuv_to_rgb_stage2
    rshrn           v20.4h, v20.4s, #15
    rshrn2          v20.8h, v22.4s, #15
    rshrn           v24.4h, v24.4s, #14
    rshrn2          v24.8h, v26.4s, #14
    rshrn           v28.4h, v28.4s, #14
    rshrn2          v28.8h, v30.4s, #14
    uaddw           v20.8h, v20.8h, v0.8b
    uaddw           v24.8h, v24.8h, v0.8b
    uaddw           v28.8h, v28.8h, v0.8b
  .if \bpp != 16
    sqxtun          v1\g_offs\defsize, v20.8h
    sqxtun          v1\r_offs\defsize, v24.8h
    sqxtun          v1\b_offs\defsize, v28.8h
  .else
    sqshlu          v21.8h, v20.8h, #8
    sqshlu          v25.8h, v24.8h, #8
    sqshlu          v29.8h, v28.8h, #8
    sri             v25.8h, v21.8h, #5
    sri             v25.8h, v29.8h, #11
  .endif
.endm
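
/* For \bpp == 16, the SQSHLU/SRI sequence above packs one RGB565 pixel per
 * 16-bit lane.  A C model of the packing (SQSHLU additionally saturates the
 * shifted values):
 *
 *   uint16_t px = ((r & 0xF8) << 8) | ((g & 0xFC) << 3) | (b >> 3);
 */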

.macro do_yuv_to_rgb_stage2_store_load_stage1 fast_st3
    rshrn           v20.4h, v20.4s, #15
    rshrn           v24.4h, v24.4s, #14
    rshrn           v28.4h, v28.4s, #14
    ld1             {v4.8b}, [U], 8
    rshrn2          v20.8h, v22.4s, #15
    rshrn2          v24.8h, v26.4s, #14
    rshrn2          v28.8h, v30.4s, #14
    ld1             {v5.8b}, [V], 8
    uaddw           v20.8h, v20.8h, v0.8b
    uaddw           v24.8h, v24.8h, v0.8b
    uaddw           v28.8h, v28.8h, v0.8b
  .if \bpp != 16  /**************** rgb24/rgb32 ******************************/
    sqxtun          v1\g_offs\defsize, v20.8h
    ld1             {v0.8b}, [Y], 8
    sqxtun          v1\r_offs\defsize, v24.8h
    prfm            pldl1keep, [U, #64]
    prfm            pldl1keep, [V, #64]
    prfm            pldl1keep, [Y, #64]
    sqxtun          v1\b_offs\defsize, v28.8h
    uaddw           v6.8h, v2.8h, v4.8b     /* v6.8h = u - 128 */
    uaddw           v8.8h, v2.8h, v5.8b     /* v8.8h = v - 128 */
    smull           v20.4s, v6.4h, v1.h[1]  /* multiply by -11277 */
    smlal           v20.4s, v8.4h, v1.h[2]  /* multiply by -23401 */
    smull2          v22.4s, v6.8h, v1.h[1]  /* multiply by -11277 */
    smlal2          v22.4s, v8.8h, v1.h[2]  /* multiply by -23401 */
    smull           v24.4s, v8.4h, v1.h[0]  /* multiply by 22971 */
    smull2          v26.4s, v8.8h, v1.h[0]  /* multiply by 22971 */
  .else  /**************************** rgb565 ********************************/
    sqshlu          v21.8h, v20.8h, #8
    sqshlu          v25.8h, v24.8h, #8
    sqshlu          v29.8h, v28.8h, #8
    uaddw           v6.8h, v2.8h, v4.8b     /* v6.8h = u - 128 */
    uaddw           v8.8h, v2.8h, v5.8b     /* v8.8h = v - 128 */
    ld1             {v0.8b}, [Y], 8
    smull           v20.4s, v6.4h, v1.h[1]  /* multiply by -11277 */
    smlal           v20.4s, v8.4h, v1.h[2]  /* multiply by -23401 */
    smull2          v22.4s, v6.8h, v1.h[1]  /* multiply by -11277 */
    smlal2          v22.4s, v8.8h, v1.h[2]  /* multiply by -23401 */
    sri             v25.8h, v21.8h, #5
    smull           v24.4s, v8.4h, v1.h[0]  /* multiply by 22971 */
    smull2          v26.4s, v8.8h, v1.h[0]  /* multiply by 22971 */
    prfm            pldl1keep, [U, #64]
    prfm            pldl1keep, [V, #64]
    prfm            pldl1keep, [Y, #64]
    sri             v25.8h, v29.8h, #11
  .endif
    do_store        \bpp, 8, \fast_st3
    smull           v28.4s, v6.4h, v1.h[3]  /* multiply by 29033 */
    smull2          v30.4s, v6.8h, v1.h[3]  /* multiply by 29033 */
.endm

.macro do_yuv_to_rgb
    do_yuv_to_rgb_stage1
    do_yuv_to_rgb_stage2
.endm

.if \fast_st3 == 1
asm_function jsimd_ycc_\colorid\()_convert_neon
.else
asm_function jsimd_ycc_\colorid\()_convert_neon_slowst3
.endif
    OUTPUT_WIDTH    .req w0
    INPUT_BUF       .req x1
    INPUT_ROW       .req w2
    OUTPUT_BUF      .req x3
    NUM_ROWS        .req w4

    INPUT_BUF0      .req x5
    INPUT_BUF1      .req x6
    INPUT_BUF2      .req x1

    RGB             .req x7
    Y               .req x9
    U               .req x10
    V               .req x11
    N               .req w15

    sub             sp, sp, 64
    mov             x9, sp

    /* Load constants into v1.4h and v2.8h (v0.4h is just used for padding) */
    get_symbol_loc  x15, Ljsimd_ycc_rgb_neon_consts

    /* Save NEON registers */
    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
    ld1             {v0.4h, v1.4h}, [x15], 16
    ld1             {v2.8h}, [x15]

    ldr             INPUT_BUF0, [INPUT_BUF]
    ldr             INPUT_BUF1, [INPUT_BUF, #8]
    ldr             INPUT_BUF2, [INPUT_BUF, #16]
    .unreq          INPUT_BUF

    /* Initially set v10 and v13 to 0xFF */
    movi            v10.16b, #255
    movi            v13.16b, #255

    /* Outer loop over scanlines */
    cmp             NUM_ROWS, #1
    b.lt            9f
0:
    ldr             Y, [INPUT_BUF0, INPUT_ROW, uxtw #3]
    ldr             U, [INPUT_BUF1, INPUT_ROW, uxtw #3]
    mov             N, OUTPUT_WIDTH
    ldr             V, [INPUT_BUF2, INPUT_ROW, uxtw #3]
    add             INPUT_ROW, INPUT_ROW, #1
    ldr             RGB, [OUTPUT_BUF], #8

    /* Inner loop over pixels */
    subs            N, N, #8
    b.lt            3f
    do_load         8
    do_yuv_to_rgb_stage1
    subs            N, N, #8
    b.lt            2f
1:
    do_yuv_to_rgb_stage2_store_load_stage1 \fast_st3
    subs            N, N, #8
    b.ge            1b
2:
    do_yuv_to_rgb_stage2
    do_store        \bpp, 8, \fast_st3
    tst             N, #7
    b.eq            8f
3:
    tst             N, #4
    b.eq            3f
    do_load         4
3:
    tst             N, #2
    b.eq            4f
    do_load         2
4:
    tst             N, #1
    b.eq            5f
    do_load         1
5:
    do_yuv_to_rgb
    tst             N, #4
    b.eq            6f
    do_store        \bpp, 4, \fast_st3
6:
    tst             N, #2
    b.eq            7f
    do_store        \bpp, 2, \fast_st3
7:
    tst             N, #1
    b.eq            8f
    do_store        \bpp, 1, \fast_st3
8:
    subs            NUM_ROWS, NUM_ROWS, #1
    b.gt            0b
9:
    /* Restore all registers and return */
    ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
    ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
    br              x30
    .unreq          OUTPUT_WIDTH
    .unreq          INPUT_ROW
    .unreq          OUTPUT_BUF
    .unreq          NUM_ROWS
    .unreq          INPUT_BUF0
    .unreq          INPUT_BUF1
    .unreq          INPUT_BUF2
    .unreq          RGB
    .unreq          Y
    .unreq          U
    .unreq          V
    .unreq          N

.purgem do_yuv_to_rgb
.purgem do_yuv_to_rgb_stage1
.purgem do_yuv_to_rgb_stage2
.purgem do_yuv_to_rgb_stage2_store_load_stage1

.endm

/*--------------------------------- id ----- bpp R  rsize G  gsize B  bsize defsize fast_st3*/
generate_jsimd_ycc_rgb_convert_neon extrgb,  24, 0, .4h,  1, .4h,  2, .4h,  .8b,    1
generate_jsimd_ycc_rgb_convert_neon extbgr,  24, 2, .4h,  1, .4h,  0, .4h,  .8b,    1
generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, .4h,  1, .4h,  2, .4h,  .8b,    1
generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, .4h,  1, .4h,  0, .4h,  .8b,    1
generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, .4h,  2, .4h,  1, .4h,  .8b,    1
generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, .4h,  2, .4h,  3, .4h,  .8b,    1
generate_jsimd_ycc_rgb_convert_neon rgb565,  16, 0, .4h,  0, .4h,  0, .4h,  .8b,    1

generate_jsimd_ycc_rgb_convert_neon extrgb,  24, 0, .4h,  1, .4h,  2, .4h,  .8b,    0
generate_jsimd_ycc_rgb_convert_neon extbgr,  24, 2, .4h,  1, .4h,  0, .4h,  .8b,    0

.purgem do_load
.purgem do_store


/*****************************************************************************/

/*
 * jsimd_extrgb_ycc_convert_neon
 * jsimd_extbgr_ycc_convert_neon
 * jsimd_extrgbx_ycc_convert_neon
 * jsimd_extbgrx_ycc_convert_neon
 * jsimd_extxbgr_ycc_convert_neon
 * jsimd_extxrgb_ycc_convert_neon
 *
 * Colorspace conversion RGB -> YCbCr
 */
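
/* The fixed-point math below follows the standard IJG RGB->YCbCr equations
 * (a reference model only; the exact integer constants live in
 * Ljsimd_rgb_ycc_neon_consts):
 *
 *   Y  =  0.29900 * r + 0.58700 * g + 0.11400 * b
 *   Cb = -0.16874 * r - 0.33126 * g + 0.50000 * b + 128
 *   Cr =  0.50000 * r - 0.41869 * g - 0.08131 * b + 128
 */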

.macro do_store size
  .if \size == 8
    st1             {v20.8b}, [Y], #8
    st1             {v21.8b}, [U], #8
    st1             {v22.8b}, [V], #8
  .elseif \size == 4
    st1             {v20.b}[0], [Y], #1
    st1             {v20.b}[1], [Y], #1
    st1             {v20.b}[2], [Y], #1
    st1             {v20.b}[3], [Y], #1
    st1             {v21.b}[0], [U], #1
    st1             {v21.b}[1], [U], #1
    st1             {v21.b}[2], [U], #1
    st1             {v21.b}[3], [U], #1
    st1             {v22.b}[0], [V], #1
    st1             {v22.b}[1], [V], #1
    st1             {v22.b}[2], [V], #1
    st1             {v22.b}[3], [V], #1
  .elseif \size == 2
    st1             {v20.b}[4], [Y], #1
    st1             {v20.b}[5], [Y], #1
    st1             {v21.b}[4], [U], #1
    st1             {v21.b}[5], [U], #1
    st1             {v22.b}[4], [V], #1
    st1             {v22.b}[5], [V], #1
  .elseif \size == 1
    st1             {v20.b}[6], [Y], #1
    st1             {v21.b}[6], [U], #1
    st1             {v22.b}[6], [V], #1
  .else
    .error unsupported macroblock size
  .endif
.endm

.macro do_load bpp, size, fast_ld3
  .if \bpp == 24
    .if \size == 8
      .if \fast_ld3 == 1
        ld3         {v10.8b, v11.8b, v12.8b}, [RGB], #24
      .else
        ld1         {v10.b}[0], [RGB], #1
        ld1         {v11.b}[0], [RGB], #1
        ld1         {v12.b}[0], [RGB], #1

        ld1         {v10.b}[1], [RGB], #1
        ld1         {v11.b}[1], [RGB], #1
        ld1         {v12.b}[1], [RGB], #1

        ld1         {v10.b}[2], [RGB], #1
        ld1         {v11.b}[2], [RGB], #1
        ld1         {v12.b}[2], [RGB], #1

        ld1         {v10.b}[3], [RGB], #1
        ld1         {v11.b}[3], [RGB], #1
        ld1         {v12.b}[3], [RGB], #1

        ld1         {v10.b}[4], [RGB], #1
        ld1         {v11.b}[4], [RGB], #1
        ld1         {v12.b}[4], [RGB], #1

        ld1         {v10.b}[5], [RGB], #1
        ld1         {v11.b}[5], [RGB], #1
        ld1         {v12.b}[5], [RGB], #1

        ld1         {v10.b}[6], [RGB], #1
        ld1         {v11.b}[6], [RGB], #1
        ld1         {v12.b}[6], [RGB], #1

        ld1         {v10.b}[7], [RGB], #1
        ld1         {v11.b}[7], [RGB], #1
        ld1         {v12.b}[7], [RGB], #1
      .endif
      prfm          pldl1keep, [RGB, #128]
    .elseif \size == 4
      ld3           {v10.b, v11.b, v12.b}[0], [RGB], #3
      ld3           {v10.b, v11.b, v12.b}[1], [RGB], #3
      ld3           {v10.b, v11.b, v12.b}[2], [RGB], #3
      ld3           {v10.b, v11.b, v12.b}[3], [RGB], #3
    .elseif \size == 2
      ld3           {v10.b, v11.b, v12.b}[4], [RGB], #3
      ld3           {v10.b, v11.b, v12.b}[5], [RGB], #3
    .elseif \size == 1
      ld3           {v10.b, v11.b, v12.b}[6], [RGB], #3
    .else
      .error unsupported macroblock size
    .endif
  .elseif \bpp == 32
    .if \size == 8
      ld4           {v10.8b, v11.8b, v12.8b, v13.8b}, [RGB], #32
      prfm          pldl1keep, [RGB, #128]
    .elseif \size == 4
      ld4           {v10.b, v11.b, v12.b, v13.b}[0], [RGB], #4
      ld4           {v10.b, v11.b, v12.b, v13.b}[1], [RGB], #4
      ld4           {v10.b, v11.b, v12.b, v13.b}[2], [RGB], #4
      ld4           {v10.b, v11.b, v12.b, v13.b}[3], [RGB], #4
    .elseif \size == 2
      ld4           {v10.b, v11.b, v12.b, v13.b}[4], [RGB], #4
      ld4           {v10.b, v11.b, v12.b, v13.b}[5], [RGB], #4
    .elseif \size == 1
      ld4           {v10.b, v11.b, v12.b, v13.b}[6], [RGB], #4
    .else
      .error unsupported macroblock size
    .endif
  .else
    .error unsupported bpp
  .endif
.endm
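
/* As with do_store in the YCbCr->RGB section above, the \fast_ld3 == 0 path
 * avoids LD3 by loading the interleaved bytes one lane at a time; it is
 * emitted as the *_slowld3 entry points below. */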

.macro generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, \
                                           b_offs, fast_ld3

/*
 * 2-stage pipelined RGB->YCbCr conversion
 */

.macro do_rgb_to_yuv_stage1
    ushll           v4.8h, v1\r_offs\().8b, #0  /* r = v4 */
    ushll           v6.8h, v1\g_offs\().8b, #0  /* g = v6 */
    ushll           v8.8h, v1\b_offs\().8b, #0  /* b = v8 */
    rev64           v18.4s, v1.4s
    rev64           v26.4s, v1.4s
    rev64           v28.4s, v1.4s
    rev64           v30.4s, v1.4s
    umull           v14.4s, v4.4h, v0.h[0]
    umull2          v16.4s, v4.8h, v0.h[0]
    umlsl           v18.4s, v4.4h, v0.h[3]
    umlsl2          v26.4s, v4.8h, v0.h[3]
    umlal           v28.4s, v4.4h, v0.h[5]
    umlal2          v30.4s, v4.8h, v0.h[5]
    umlal           v14.4s, v6.4h, v0.h[1]
    umlal2          v16.4s, v6.8h, v0.h[1]
    umlsl           v18.4s, v6.4h, v0.h[4]
    umlsl2          v26.4s, v6.8h, v0.h[4]
    umlsl           v28.4s, v6.4h, v0.h[6]
    umlsl2          v30.4s, v6.8h, v0.h[6]
    umlal           v14.4s, v8.4h, v0.h[2]
    umlal2          v16.4s, v8.8h, v0.h[2]
    umlal           v18.4s, v8.4h, v0.h[5]
    umlal2          v26.4s, v8.8h, v0.h[5]
    umlsl           v28.4s, v8.4h, v0.h[7]
    umlsl2          v30.4s, v8.8h, v0.h[7]
.endm

.macro do_rgb_to_yuv_stage2
    rshrn           v20.4h, v14.4s, #16
    shrn            v22.4h, v18.4s, #16
    shrn            v24.4h, v28.4s, #16
    rshrn2          v20.8h, v16.4s, #16
    shrn2           v22.8h, v26.4s, #16
    shrn2           v24.8h, v30.4s, #16
    xtn             v20.8b, v20.8h       /* v20 = y */
    xtn             v21.8b, v22.8h       /* v21 = u */
    xtn             v22.8b, v24.8h       /* v22 = v */
.endm
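
/* Y uses rounding narrows (RSHRN) here, while Cb/Cr use plain SHRN: their
 * accumulators were pre-seeded in stage 1 (the REV64 copies of v1) with a
 * bias that presumably already folds in the rounding term and the +128
 * chroma offset.  The exact bias values live in Ljsimd_rgb_ycc_neon_consts. */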

.macro do_rgb_to_yuv
    do_rgb_to_yuv_stage1
    do_rgb_to_yuv_stage2
.endm

/* TODO: expand macros and interleave instructions if some in-order
 *       ARM64 processor can actually dual-issue LOAD/STORE with ALU */
.macro do_rgb_to_yuv_stage2_store_load_stage1 fast_ld3
    do_rgb_to_yuv_stage2
    do_load         \bpp, 8, \fast_ld3
    st1             {v20.8b}, [Y], #8
    st1             {v21.8b}, [U], #8
    st1             {v22.8b}, [V], #8
    do_rgb_to_yuv_stage1
.endm

.if \fast_ld3 == 1
asm_function jsimd_\colorid\()_ycc_convert_neon
.else
asm_function jsimd_\colorid\()_ycc_convert_neon_slowld3
.endif
    OUTPUT_WIDTH    .req w0
    INPUT_BUF       .req x1
    OUTPUT_BUF      .req x2
    OUTPUT_ROW      .req w3
    NUM_ROWS        .req w4

    OUTPUT_BUF0     .req x5
    OUTPUT_BUF1     .req x6
    OUTPUT_BUF2     .req x2  /* OUTPUT_BUF */

    RGB             .req x7
    Y               .req x9
    U               .req x10
    V               .req x11
    N               .req w12

    /* Load constants into v0.8h and v1.8h */
    get_symbol_loc  x13, Ljsimd_rgb_ycc_neon_consts
    ld1             {v0.8h, v1.8h}, [x13]

    ldr             OUTPUT_BUF0, [OUTPUT_BUF]
    ldr             OUTPUT_BUF1, [OUTPUT_BUF, #8]
    ldr             OUTPUT_BUF2, [OUTPUT_BUF, #16]
    .unreq          OUTPUT_BUF

    /* Save NEON registers */
    sub             sp, sp, #64
    mov             x9, sp
    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32

    /* Outer loop over scanlines */
    cmp             NUM_ROWS, #1
    b.lt            9f
0:
    ldr             Y, [OUTPUT_BUF0, OUTPUT_ROW, uxtw #3]
    ldr             U, [OUTPUT_BUF1, OUTPUT_ROW, uxtw #3]
    mov             N, OUTPUT_WIDTH
    ldr             V, [OUTPUT_BUF2, OUTPUT_ROW, uxtw #3]
    add             OUTPUT_ROW, OUTPUT_ROW, #1
    ldr             RGB, [INPUT_BUF], #8

    /* Inner loop over pixels */
    subs            N, N, #8
    b.lt            3f
    do_load         \bpp, 8, \fast_ld3
    do_rgb_to_yuv_stage1
    subs            N, N, #8
    b.lt            2f
1:
    do_rgb_to_yuv_stage2_store_load_stage1 \fast_ld3
    subs            N, N, #8
    b.ge            1b
2:
    do_rgb_to_yuv_stage2
    do_store        8
    tst             N, #7
    b.eq            8f
3:
    tbz             N, #2, 3f
    do_load         \bpp, 4, \fast_ld3
3:
    tbz             N, #1, 4f
    do_load         \bpp, 2, \fast_ld3
4:
    tbz             N, #0, 5f
    do_load         \bpp, 1, \fast_ld3
5:
    do_rgb_to_yuv
    tbz             N, #2, 6f
    do_store        4
6:
    tbz             N, #1, 7f
    do_store        2
7:
    tbz             N, #0, 8f
    do_store        1
8:
    subs            NUM_ROWS, NUM_ROWS, #1
    b.gt            0b
9:
    /* Restore all registers and return */
    ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
    ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
    br              x30

    .unreq          OUTPUT_WIDTH
    .unreq          OUTPUT_ROW
    .unreq          INPUT_BUF
    .unreq          NUM_ROWS
    .unreq          OUTPUT_BUF0
    .unreq          OUTPUT_BUF1
    .unreq          OUTPUT_BUF2
    .unreq          RGB
    .unreq          Y
    .unreq          U
    .unreq          V
    .unreq          N

.purgem do_rgb_to_yuv
.purgem do_rgb_to_yuv_stage1
.purgem do_rgb_to_yuv_stage2
.purgem do_rgb_to_yuv_stage2_store_load_stage1

.endm

/*--------------------------------- id ----- bpp R  G  B  Fast LD3 */
generate_jsimd_rgb_ycc_convert_neon extrgb,  24, 0, 1, 2, 1
generate_jsimd_rgb_ycc_convert_neon extbgr,  24, 2, 1, 0, 1
generate_jsimd_rgb_ycc_convert_neon extrgbx, 32, 0, 1, 2, 1
generate_jsimd_rgb_ycc_convert_neon extbgrx, 32, 2, 1, 0, 1
generate_jsimd_rgb_ycc_convert_neon extxbgr, 32, 3, 2, 1, 1
generate_jsimd_rgb_ycc_convert_neon extxrgb, 32, 1, 2, 3, 1

generate_jsimd_rgb_ycc_convert_neon extrgb,  24, 0, 1, 2, 0
generate_jsimd_rgb_ycc_convert_neon extbgr,  24, 2, 1, 0, 0

.purgem do_load
.purgem do_store


/*****************************************************************************/

/*
 * Load data into workspace, applying unsigned->signed conversion
 *
 * TODO: this can be combined with 'jsimd_fdct_ifast_neon' to get
 *       rid of the ST1 instructions
 */
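
/* As a reference model in C, this routine computes:
 *
 *   for (row = 0; row < 8; row++)
 *     for (col = 0; col < 8; col++)
 *       workspace[row * 8 + col] =
 *         (DCTELEM)sample_data[row][start_col + col] - 128;
 *
 * i.e. it recenters the unsigned 8-bit samples around zero
 * (USUBL against a vector of 128s, a.k.a. CENTERJSAMPLE). */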

asm_function jsimd_convsamp_neon
    SAMPLE_DATA     .req x0
    START_COL       .req x1
    WORKSPACE       .req x2
    TMP1            .req x9
    TMP2            .req x10
    TMP3            .req x11
    TMP4            .req x12
    TMP5            .req x13
    TMP6            .req x14
    TMP7            .req x15
    TMP8            .req x4
    TMPDUP          .req w3

    /* START_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
       guarantee that the upper (unused) 32 bits of x1 are valid.  This
       instruction ensures that those bits are set to zero. */
    uxtw x1, w1

    mov             TMPDUP, #128
    ldp             TMP1, TMP2, [SAMPLE_DATA], 16
    ldp             TMP3, TMP4, [SAMPLE_DATA], 16
    dup             v0.8b, TMPDUP
    add             TMP1, TMP1, START_COL
    add             TMP2, TMP2, START_COL
    ldp             TMP5, TMP6, [SAMPLE_DATA], 16
    add             TMP3, TMP3, START_COL
    add             TMP4, TMP4, START_COL
    ldp             TMP7, TMP8, [SAMPLE_DATA], 16
    add             TMP5, TMP5, START_COL
    add             TMP6, TMP6, START_COL
    ld1             {v16.8b}, [TMP1]
    add             TMP7, TMP7, START_COL
    add             TMP8, TMP8, START_COL
    ld1             {v17.8b}, [TMP2]
    usubl           v16.8h, v16.8b, v0.8b
    ld1             {v18.8b}, [TMP3]
    usubl           v17.8h, v17.8b, v0.8b
    ld1             {v19.8b}, [TMP4]
    usubl           v18.8h, v18.8b, v0.8b
    ld1             {v20.8b}, [TMP5]
    usubl           v19.8h, v19.8b, v0.8b
    ld1             {v21.8b}, [TMP6]
    st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [WORKSPACE], 64
    usubl           v20.8h, v20.8b, v0.8b
    ld1             {v22.8b}, [TMP7]
    usubl           v21.8h, v21.8b, v0.8b
    ld1             {v23.8b}, [TMP8]
    usubl           v22.8h, v22.8b, v0.8b
    usubl           v23.8h, v23.8b, v0.8b
    st1             {v20.8h, v21.8h, v22.8h, v23.8h}, [WORKSPACE], 64

    br              x30

    .unreq          SAMPLE_DATA
    .unreq          START_COL
    .unreq          WORKSPACE
    .unreq          TMP1
    .unreq          TMP2
    .unreq          TMP3
    .unreq          TMP4
    .unreq          TMP5
    .unreq          TMP6
    .unreq          TMP7
    .unreq          TMP8
    .unreq          TMPDUP

/*****************************************************************************/

/*
 * jsimd_fdct_islow_neon
 *
 * This function contains a slow-but-accurate integer implementation of the
 * forward DCT (Discrete Cosine Transform).  The following code is based
 * directly on IJG's original jfdctint.c; see jfdctint.c for more details.
 *
 * TODO: this can be combined with 'jsimd_convsamp_neon' to get
 *       rid of a bunch of LD1 instructions
 */

#define CONST_BITS  13
#define PASS1_BITS  2

#define DESCALE_P1  (CONST_BITS - PASS1_BITS)
#define DESCALE_P2  (CONST_BITS + PASS1_BITS)
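
/* The RSHRN #DESCALE_P1 / #DESCALE_P2 narrows below correspond to jdct.h's
 * rounding descale, i.e. in C:
 *
 *   #define DESCALE(x, n)  (((x) + ((JLONG)1 << ((n) - 1))) >> (n))
 */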
2374
2375#define XFIX_P_0_298  v0.h[0]
2376#define XFIX_N_0_390  v0.h[1]
2377#define XFIX_P_0_541  v0.h[2]
2378#define XFIX_P_0_765  v0.h[3]
2379#define XFIX_N_0_899  v0.h[4]
2380#define XFIX_P_1_175  v0.h[5]
2381#define XFIX_P_1_501  v0.h[6]
2382#define XFIX_N_1_847  v0.h[7]
2383#define XFIX_N_1_961  v1.h[0]
2384#define XFIX_P_2_053  v1.h[1]
2385#define XFIX_N_2_562  v1.h[2]
2386#define XFIX_P_3_072  v1.h[3]
2387
2388asm_function jsimd_fdct_islow_neon
2389
2390    DATA            .req x0
2391    TMP             .req x9
2392
2393    /* Load constants */
2394    get_symbol_loc  TMP, Ljsimd_fdct_islow_neon_consts
2395    ld1             {v0.8h, v1.8h}, [TMP]
2396
2397    /* Save NEON registers */
2398    sub             sp, sp, #64
2399    mov             x10, sp
2400    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x10], 32
2401    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x10], 32
2402
2403    /* Load all DATA into NEON registers with the following allocation:
2404     *       0 1 2 3 | 4 5 6 7
2405     *      ---------+--------
2406     *   0 | d16     | d17    | v16.8h
2407     *   1 | d18     | d19    | v17.8h
2408     *   2 | d20     | d21    | v18.8h
2409     *   3 | d22     | d23    | v19.8h
2410     *   4 | d24     | d25    | v20.8h
2411     *   5 | d26     | d27    | v21.8h
2412     *   6 | d28     | d29    | v22.8h
2413     *   7 | d30     | d31    | v23.8h
2414     */
2415
2416    ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
2417    ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
2418    sub             DATA, DATA, #64
2419
2420    /* Transpose */
2421    transpose_8x8   v16, v17, v18, v19, v20, v21, v22, v23, v31, v2, v3, v4
2422    /* 1-D FDCT */
2423    add             v24.8h, v16.8h, v23.8h  /* tmp0 = dataptr[0] + dataptr[7]; */
2424    sub             v31.8h, v16.8h, v23.8h  /* tmp7 = dataptr[0] - dataptr[7]; */
2425    add             v25.8h, v17.8h, v22.8h  /* tmp1 = dataptr[1] + dataptr[6]; */
2426    sub             v30.8h, v17.8h, v22.8h  /* tmp6 = dataptr[1] - dataptr[6]; */
2427    add             v26.8h, v18.8h, v21.8h  /* tmp2 = dataptr[2] + dataptr[5]; */
2428    sub             v29.8h, v18.8h, v21.8h  /* tmp5 = dataptr[2] - dataptr[5]; */
2429    add             v27.8h, v19.8h, v20.8h  /* tmp3 = dataptr[3] + dataptr[4]; */
2430    sub             v28.8h, v19.8h, v20.8h  /* tmp4 = dataptr[3] - dataptr[4]; */
2431
2432    /* even part */
2433
2434    add             v8.8h, v24.8h, v27.8h   /* tmp10 = tmp0 + tmp3; */
2435    sub             v9.8h, v24.8h, v27.8h   /* tmp13 = tmp0 - tmp3; */
2436    add             v10.8h, v25.8h, v26.8h  /* tmp11 = tmp1 + tmp2; */
2437    sub             v11.8h, v25.8h, v26.8h  /* tmp12 = tmp1 - tmp2; */
2438
2439    add             v16.8h, v8.8h, v10.8h  /* tmp10 + tmp11 */
2440    sub             v20.8h, v8.8h, v10.8h  /* tmp10 - tmp11 */
2441
2442    add             v18.8h, v11.8h, v9.8h  /* tmp12 + tmp13 */
2443
2444    shl             v16.8h, v16.8h, #PASS1_BITS  /* dataptr[0] = (DCTELEM)LEFT_SHIFT(tmp10 + tmp11, PASS1_BITS); */
2445    shl             v20.8h, v20.8h, #PASS1_BITS  /* dataptr[4] = (DCTELEM)LEFT_SHIFT(tmp10 - tmp11, PASS1_BITS); */
2446
2447    smull2          v24.4s, v18.8h, XFIX_P_0_541  /* z1 hi = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
2448    smull           v18.4s, v18.4h, XFIX_P_0_541  /* z1 lo = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
2449    mov             v22.16b, v18.16b
2450    mov             v25.16b, v24.16b
2451
2452    smlal           v18.4s, v9.4h, XFIX_P_0_765   /* lo z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
2453    smlal2          v24.4s, v9.8h, XFIX_P_0_765   /* hi z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
2454    smlal           v22.4s, v11.4h, XFIX_N_1_847  /* lo z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
2455    smlal2          v25.4s, v11.8h, XFIX_N_1_847  /* hi z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
2456
2457    rshrn           v18.4h, v18.4s, #DESCALE_P1
2458    rshrn           v22.4h, v22.4s, #DESCALE_P1
2459    rshrn2          v18.8h, v24.4s, #DESCALE_P1  /* dataptr[2] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp13, XFIX_P_0_765), CONST_BITS-PASS1_BITS); */
2460    rshrn2          v22.8h, v25.4s, #DESCALE_P1  /* dataptr[6] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp12, XFIX_N_1_847), CONST_BITS-PASS1_BITS); */

    /* Odd part */

    add             v8.8h, v28.8h, v31.8h        /* z1 = tmp4 + tmp7; */
    add             v9.8h, v29.8h, v30.8h        /* z2 = tmp5 + tmp6; */
    add             v10.8h, v28.8h, v30.8h       /* z3 = tmp4 + tmp6; */
    add             v11.8h, v29.8h, v31.8h       /* z4 = tmp5 + tmp7; */
    smull           v4.4s, v10.4h, XFIX_P_1_175  /* z5 lo = z3 lo * XFIX_P_1_175 */
    smull2          v5.4s, v10.8h, XFIX_P_1_175
    smlal           v4.4s, v11.4h, XFIX_P_1_175  /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602); */
    smlal2          v5.4s, v11.8h, XFIX_P_1_175

    smull2          v24.4s, v28.8h, XFIX_P_0_298
    smull2          v25.4s, v29.8h, XFIX_P_2_053
    smull2          v26.4s, v30.8h, XFIX_P_3_072
    smull2          v27.4s, v31.8h, XFIX_P_1_501
    smull           v28.4s, v28.4h, XFIX_P_0_298  /* tmp4 = MULTIPLY(tmp4, FIX_0_298631336); */
    smull           v29.4s, v29.4h, XFIX_P_2_053  /* tmp5 = MULTIPLY(tmp5, FIX_2_053119869); */
    smull           v30.4s, v30.4h, XFIX_P_3_072  /* tmp6 = MULTIPLY(tmp6, FIX_3_072711026); */
    smull           v31.4s, v31.4h, XFIX_P_1_501  /* tmp7 = MULTIPLY(tmp7, FIX_1_501321110); */

    smull2          v12.4s, v8.8h, XFIX_N_0_899
    smull2          v13.4s, v9.8h, XFIX_N_2_562
    smull2          v14.4s, v10.8h, XFIX_N_1_961
    smull2          v15.4s, v11.8h, XFIX_N_0_390
    smull           v8.4s, v8.4h, XFIX_N_0_899    /* z1 = MULTIPLY(z1, -FIX_0_899976223); */
    smull           v9.4s, v9.4h, XFIX_N_2_562    /* z2 = MULTIPLY(z2, -FIX_2_562915447); */
    smull           v10.4s, v10.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560); */
    smull           v11.4s, v11.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644); */

    add             v10.4s, v10.4s, v4.4s  /* z3 += z5 */
    add             v14.4s, v14.4s, v5.4s
    add             v11.4s, v11.4s, v4.4s  /* z4 += z5 */
    add             v15.4s, v15.4s, v5.4s

    add             v28.4s, v28.4s, v8.4s   /* tmp4 += z1 */
    add             v24.4s, v24.4s, v12.4s
    add             v29.4s, v29.4s, v9.4s   /* tmp5 += z2 */
    add             v25.4s, v25.4s, v13.4s
    add             v30.4s, v30.4s, v10.4s  /* tmp6 += z3 */
    add             v26.4s, v26.4s, v14.4s
    add             v31.4s, v31.4s, v11.4s  /* tmp7 += z4 */
    add             v27.4s, v27.4s, v15.4s

    add             v28.4s, v28.4s, v10.4s  /* tmp4 += z3 */
    add             v24.4s, v24.4s, v14.4s
    add             v29.4s, v29.4s, v11.4s  /* tmp5 += z4 */
    add             v25.4s, v25.4s, v15.4s
    add             v30.4s, v30.4s, v9.4s   /* tmp6 += z2 */
    add             v26.4s, v26.4s, v13.4s
    add             v31.4s, v31.4s, v8.4s   /* tmp7 += z1 */
    add             v27.4s, v27.4s, v12.4s

    rshrn           v23.4h, v28.4s, #DESCALE_P1
    rshrn           v21.4h, v29.4s, #DESCALE_P1
    rshrn           v19.4h, v30.4s, #DESCALE_P1
    rshrn           v17.4h, v31.4s, #DESCALE_P1
    rshrn2          v23.8h, v24.4s, #DESCALE_P1  /* dataptr[7] = (DCTELEM)DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS); */
    rshrn2          v21.8h, v25.4s, #DESCALE_P1  /* dataptr[5] = (DCTELEM)DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS); */
    rshrn2          v19.8h, v26.4s, #DESCALE_P1  /* dataptr[3] = (DCTELEM)DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS); */
    rshrn2          v17.8h, v27.4s, #DESCALE_P1  /* dataptr[1] = (DCTELEM)DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS); */
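
    /* For reference, the corresponding scalar odd part in jfdctint.c, which
     * the interleaved lo/hi code above follows step for step:
     *
     *   z5 = MULTIPLY(z3 + z4, FIX_1_175875602);
     *   tmp4 = MULTIPLY(tmp4, FIX_0_298631336);
     *   tmp5 = MULTIPLY(tmp5, FIX_2_053119869);
     *   tmp6 = MULTIPLY(tmp6, FIX_3_072711026);
     *   tmp7 = MULTIPLY(tmp7, FIX_1_501321110);
     *   z1 = MULTIPLY(z1, -FIX_0_899976223);
     *   z2 = MULTIPLY(z2, -FIX_2_562915447);
     *   z3 = MULTIPLY(z3, -FIX_1_961570560);
     *   z4 = MULTIPLY(z4, -FIX_0_390180644);
     *   z3 += z5;  z4 += z5;
     *   dataptr[7] = (DCTELEM)DESCALE(tmp4 + z1 + z3, CONST_BITS - PASS1_BITS);
     *   dataptr[5] = (DCTELEM)DESCALE(tmp5 + z2 + z4, CONST_BITS - PASS1_BITS);
     *   dataptr[3] = (DCTELEM)DESCALE(tmp6 + z2 + z3, CONST_BITS - PASS1_BITS);
     *   dataptr[1] = (DCTELEM)DESCALE(tmp7 + z1 + z4, CONST_BITS - PASS1_BITS);
     */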

    /* Transpose */
    transpose_8x8   v16, v17, v18, v19, v20, v21, v22, v23, v31, v2, v3, v4

    /* 1-D FDCT */
    add             v24.8h, v16.8h, v23.8h  /* tmp0 = dataptr[0] + dataptr[7]; */
    sub             v31.8h, v16.8h, v23.8h  /* tmp7 = dataptr[0] - dataptr[7]; */
    add             v25.8h, v17.8h, v22.8h  /* tmp1 = dataptr[1] + dataptr[6]; */
    sub             v30.8h, v17.8h, v22.8h  /* tmp6 = dataptr[1] - dataptr[6]; */
    add             v26.8h, v18.8h, v21.8h  /* tmp2 = dataptr[2] + dataptr[5]; */
    sub             v29.8h, v18.8h, v21.8h  /* tmp5 = dataptr[2] - dataptr[5]; */
    add             v27.8h, v19.8h, v20.8h  /* tmp3 = dataptr[3] + dataptr[4]; */
    sub             v28.8h, v19.8h, v20.8h  /* tmp4 = dataptr[3] - dataptr[4]; */

    /* even part */
    add             v8.8h, v24.8h, v27.8h   /* tmp10 = tmp0 + tmp3; */
    sub             v9.8h, v24.8h, v27.8h   /* tmp13 = tmp0 - tmp3; */
    add             v10.8h, v25.8h, v26.8h  /* tmp11 = tmp1 + tmp2; */
    sub             v11.8h, v25.8h, v26.8h  /* tmp12 = tmp1 - tmp2; */

    add             v16.8h, v8.8h, v10.8h  /* tmp10 + tmp11 */
    sub             v20.8h, v8.8h, v10.8h  /* tmp10 - tmp11 */

    add             v18.8h, v11.8h, v9.8h  /* tmp12 + tmp13 */

    srshr           v16.8h, v16.8h, #PASS1_BITS  /* dataptr[0] = (DCTELEM)DESCALE(tmp10 + tmp11, PASS1_BITS); */
    srshr           v20.8h, v20.8h, #PASS1_BITS  /* dataptr[4] = (DCTELEM)DESCALE(tmp10 - tmp11, PASS1_BITS); */

    smull2          v24.4s, v18.8h, XFIX_P_0_541  /* z1 hi = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
    smull           v18.4s, v18.4h, XFIX_P_0_541  /* z1 lo = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
    mov             v22.16b, v18.16b
    mov             v25.16b, v24.16b

    smlal           v18.4s, v9.4h, XFIX_P_0_765   /* lo z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
    smlal2          v24.4s, v9.8h, XFIX_P_0_765   /* hi z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
    smlal           v22.4s, v11.4h, XFIX_N_1_847  /* lo z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
    smlal2          v25.4s, v11.8h, XFIX_N_1_847  /* hi z1 + MULTIPLY(tmp12, XFIX_N_1_847) */

    rshrn           v18.4h, v18.4s, #DESCALE_P2
    rshrn           v22.4h, v22.4s, #DESCALE_P2
    rshrn2          v18.8h, v24.4s, #DESCALE_P2  /* dataptr[2] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp13, XFIX_P_0_765), CONST_BITS+PASS1_BITS); */
    rshrn2          v22.8h, v25.4s, #DESCALE_P2  /* dataptr[6] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp12, XFIX_N_1_847), CONST_BITS+PASS1_BITS); */

    /* Odd part */
    add             v8.8h, v28.8h, v31.8h   /* z1 = tmp4 + tmp7; */
    add             v9.8h, v29.8h, v30.8h   /* z2 = tmp5 + tmp6; */
    add             v10.8h, v28.8h, v30.8h  /* z3 = tmp4 + tmp6; */
    add             v11.8h, v29.8h, v31.8h  /* z4 = tmp5 + tmp7; */

    smull           v4.4s, v10.4h, XFIX_P_1_175  /* z5 lo = z3 lo * XFIX_P_1_175 */
    smull2          v5.4s, v10.8h, XFIX_P_1_175
    smlal           v4.4s, v11.4h, XFIX_P_1_175  /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602); */
    smlal2          v5.4s, v11.8h, XFIX_P_1_175

    smull2          v24.4s, v28.8h, XFIX_P_0_298
    smull2          v25.4s, v29.8h, XFIX_P_2_053
    smull2          v26.4s, v30.8h, XFIX_P_3_072
    smull2          v27.4s, v31.8h, XFIX_P_1_501
    smull           v28.4s, v28.4h, XFIX_P_0_298  /* tmp4 = MULTIPLY(tmp4, FIX_0_298631336); */
    smull           v29.4s, v29.4h, XFIX_P_2_053  /* tmp5 = MULTIPLY(tmp5, FIX_2_053119869); */
    smull           v30.4s, v30.4h, XFIX_P_3_072  /* tmp6 = MULTIPLY(tmp6, FIX_3_072711026); */
    smull           v31.4s, v31.4h, XFIX_P_1_501  /* tmp7 = MULTIPLY(tmp7, FIX_1_501321110); */

    smull2          v12.4s, v8.8h, XFIX_N_0_899
    smull2          v13.4s, v9.8h, XFIX_N_2_562
    smull2          v14.4s, v10.8h, XFIX_N_1_961
    smull2          v15.4s, v11.8h, XFIX_N_0_390
    smull           v8.4s, v8.4h, XFIX_N_0_899    /* z1 = MULTIPLY(z1, -FIX_0_899976223); */
    smull           v9.4s, v9.4h, XFIX_N_2_562    /* z2 = MULTIPLY(z2, -FIX_2_562915447); */
    smull           v10.4s, v10.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560); */
    smull           v11.4s, v11.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644); */

    add             v10.4s, v10.4s, v4.4s
    add             v14.4s, v14.4s, v5.4s
    add             v11.4s, v11.4s, v4.4s
    add             v15.4s, v15.4s, v5.4s

    add             v28.4s, v28.4s, v8.4s   /* tmp4 += z1 */
    add             v24.4s, v24.4s, v12.4s
    add             v29.4s, v29.4s, v9.4s   /* tmp5 += z2 */
    add             v25.4s, v25.4s, v13.4s
    add             v30.4s, v30.4s, v10.4s  /* tmp6 += z3 */
    add             v26.4s, v26.4s, v14.4s
    add             v31.4s, v31.4s, v11.4s  /* tmp7 += z4 */
    add             v27.4s, v27.4s, v15.4s

    add             v28.4s, v28.4s, v10.4s  /* tmp4 += z3 */
    add             v24.4s, v24.4s, v14.4s
    add             v29.4s, v29.4s, v11.4s  /* tmp5 += z4 */
    add             v25.4s, v25.4s, v15.4s
    add             v30.4s, v30.4s, v9.4s   /* tmp6 += z2 */
    add             v26.4s, v26.4s, v13.4s
    add             v31.4s, v31.4s, v8.4s   /* tmp7 += z1 */
    add             v27.4s, v27.4s, v12.4s

    rshrn           v23.4h, v28.4s, #DESCALE_P2
    rshrn           v21.4h, v29.4s, #DESCALE_P2
    rshrn           v19.4h, v30.4s, #DESCALE_P2
    rshrn           v17.4h, v31.4s, #DESCALE_P2
    rshrn2          v23.8h, v24.4s, #DESCALE_P2  /* dataptr[7] = (DCTELEM)DESCALE(tmp4 + z1 + z3, CONST_BITS+PASS1_BITS); */
    rshrn2          v21.8h, v25.4s, #DESCALE_P2  /* dataptr[5] = (DCTELEM)DESCALE(tmp5 + z2 + z4, CONST_BITS+PASS1_BITS); */
    rshrn2          v19.8h, v26.4s, #DESCALE_P2  /* dataptr[3] = (DCTELEM)DESCALE(tmp6 + z2 + z3, CONST_BITS+PASS1_BITS); */
    rshrn2          v17.8h, v27.4s, #DESCALE_P2  /* dataptr[1] = (DCTELEM)DESCALE(tmp7 + z1 + z4, CONST_BITS+PASS1_BITS); */

    /* store results */
    st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
    st1             {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]

    /* Restore NEON registers */
    ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
    ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32

    br              x30

    .unreq          DATA
    .unreq          TMP

#undef XFIX_P_0_298
#undef XFIX_N_0_390
#undef XFIX_P_0_541
#undef XFIX_P_0_765
#undef XFIX_N_0_899
#undef XFIX_P_1_175
#undef XFIX_P_1_501
#undef XFIX_N_1_847
#undef XFIX_N_1_961
#undef XFIX_P_2_053
#undef XFIX_N_2_562
#undef XFIX_P_3_072


/*****************************************************************************/

/*
 * jsimd_fdct_ifast_neon
 *
 * This function contains a fast, less accurate integer implementation of the
 * forward DCT (Discrete Cosine Transform).  It uses the same calculations and
 * produces exactly the same output as IJG's original 'jpeg_fdct_ifast'
 * function from jfdctfst.c.
 *
 * TODO: can be combined with 'jsimd_convsamp_neon' to get rid of a bunch of
 *       LD1 instructions
 */
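
/* For reference, a scalar sketch of the even part of one 1-D pass, as in
 * jfdctfst.c (MULTIPLY there is a 16-bit fixed-point multiply, which the NEON
 * code below realizes with sqdmulh and the pre-scaled XFIX_* constants):
 *
 *   tmp10 = tmp0 + tmp3;
 *   tmp13 = tmp0 - tmp3;
 *   tmp11 = tmp1 + tmp2;
 *   tmp12 = tmp1 - tmp2;
 *
 *   dataptr[0] = tmp10 + tmp11;
 *   dataptr[4] = tmp10 - tmp11;
 *
 *   z1 = MULTIPLY(tmp12 + tmp13, FIX_0_707106781);
 *   dataptr[2] = tmp13 + z1;
 *   dataptr[6] = tmp13 - z1;
 */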

#undef XFIX_0_541196100
#define XFIX_0_382683433  v0.h[0]
#define XFIX_0_541196100  v0.h[1]
#define XFIX_0_707106781  v0.h[2]
#define XFIX_1_306562965  v0.h[3]

asm_function jsimd_fdct_ifast_neon

    DATA            .req x0
    TMP             .req x9

    /* Load constants */
    get_symbol_loc  TMP, Ljsimd_fdct_ifast_neon_consts
    ld1             {v0.4h}, [TMP]

    /* Load all DATA into NEON registers with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0 | d16     | d17    | v16.8h
     *   1 | d18     | d19    | v17.8h
     *   2 | d20     | d21    | v18.8h
     *   3 | d22     | d23    | v19.8h
     *   4 | d24     | d25    | v20.8h
     *   5 | d26     | d27    | v21.8h
     *   6 | d28     | d29    | v22.8h
     *   7 | d30     | d31    | v23.8h
     */

    ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
    ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
    mov             TMP, #2
    sub             DATA, DATA, #64
1:
    /* Transpose */
    transpose_8x8   v16, v17, v18, v19, v20, v21, v22, v23, v1, v2, v3, v4
    subs            TMP, TMP, #1
    /* 1-D FDCT */
    add             v4.8h, v19.8h, v20.8h
    sub             v20.8h, v19.8h, v20.8h
    sub             v28.8h, v18.8h, v21.8h
    add             v18.8h, v18.8h, v21.8h
    sub             v29.8h, v17.8h, v22.8h
    add             v17.8h, v17.8h, v22.8h
    sub             v21.8h, v16.8h, v23.8h
    add             v16.8h, v16.8h, v23.8h
    sub             v6.8h, v17.8h, v18.8h
    sub             v7.8h, v16.8h, v4.8h
    add             v5.8h, v17.8h, v18.8h
    add             v6.8h, v6.8h, v7.8h
    add             v4.8h, v16.8h, v4.8h
    sqdmulh         v6.8h, v6.8h, XFIX_0_707106781
    add             v19.8h, v20.8h, v28.8h
    add             v16.8h, v4.8h, v5.8h
    sub             v20.8h, v4.8h, v5.8h
    add             v5.8h, v28.8h, v29.8h
    add             v29.8h, v29.8h, v21.8h
    sqdmulh         v5.8h, v5.8h, XFIX_0_707106781
    sub             v28.8h, v19.8h, v29.8h
    add             v18.8h, v7.8h, v6.8h
    sqdmulh         v28.8h, v28.8h, XFIX_0_382683433
    sub             v22.8h, v7.8h, v6.8h
    sqdmulh         v19.8h, v19.8h, XFIX_0_541196100
    sqdmulh         v7.8h, v29.8h, XFIX_1_306562965
    add             v6.8h, v21.8h, v5.8h
    sub             v5.8h, v21.8h, v5.8h
    add             v29.8h, v29.8h, v28.8h
    add             v19.8h, v19.8h, v28.8h
    add             v29.8h, v29.8h, v7.8h
    add             v21.8h, v5.8h, v19.8h
    sub             v19.8h, v5.8h, v19.8h
    add             v17.8h, v6.8h, v29.8h
    sub             v23.8h, v6.8h, v29.8h

    b.ne            1b

    /* store results */
    st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
    st1             {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]

    br              x30

    .unreq          DATA
    .unreq          TMP
#undef XFIX_0_382683433
#undef XFIX_0_541196100
#undef XFIX_0_707106781
#undef XFIX_1_306562965


/*****************************************************************************/

/*
 * GLOBAL(void)
 * jsimd_quantize_neon(JCOEFPTR coef_block, DCTELEM *divisors,
 *                     DCTELEM *workspace);
 *
 */
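
/* For reference, a scalar sketch of the reciprocal-based quantization this
 * routine vectorizes, assuming libjpeg-turbo's usual divisor layout
 * (reciprocals in divisors[0..63], corrections in divisors[64..127], shift
 * counts in divisors[192..255]); the names recip/corr/shift are illustrative:
 *
 *   for (i = 0; i < DCTSIZE2; i++) {
 *     DCTELEM x = workspace[i];
 *     UDCTELEM t = (x < 0) ? -x : x;                       // magnitude
 *     t += corr[i];                                        // rounding correction
 *     t = (UDCTELEM)(((UDCTELEM2)t * recip[i]) >> 16) >> shift[i];
 *     coef_block[i] = (x < 0) ? -(DCTELEM)t : (DCTELEM)t;  // restore sign
 *   }
 */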
asm_function jsimd_quantize_neon

    COEF_BLOCK      .req x0
    DIVISORS        .req x1
    WORKSPACE       .req x2

    RECIPROCAL      .req DIVISORS
    CORRECTION      .req x9
    SHIFT           .req x10
    LOOP_COUNT      .req x11

    mov             LOOP_COUNT, #2
    add             CORRECTION, DIVISORS, #(64 * 2)  /* corrections: divisors + 64 DCTELEMs */
    add             SHIFT, DIVISORS, #(64 * 6)       /* shift counts: divisors + 192 DCTELEMs */
1:
    subs            LOOP_COUNT, LOOP_COUNT, #1
    ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [WORKSPACE], 64
    ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [CORRECTION], 64
    abs             v20.8h, v0.8h
    abs             v21.8h, v1.8h
    abs             v22.8h, v2.8h
    abs             v23.8h, v3.8h
    ld1             {v28.8h, v29.8h, v30.8h, v31.8h}, [RECIPROCAL], 64
    add             v20.8h, v20.8h, v4.8h  /* add correction */
    add             v21.8h, v21.8h, v5.8h
    add             v22.8h, v22.8h, v6.8h
    add             v23.8h, v23.8h, v7.8h
    umull           v4.4s, v20.4h, v28.4h  /* multiply by reciprocal */
    umull2          v16.4s, v20.8h, v28.8h
    umull           v5.4s, v21.4h, v29.4h
    umull2          v17.4s, v21.8h, v29.8h
    umull           v6.4s, v22.4h, v30.4h  /* multiply by reciprocal */
    umull2          v18.4s, v22.8h, v30.8h
    umull           v7.4s, v23.4h, v31.4h
    umull2          v19.4s, v23.8h, v31.8h
    ld1             {v24.8h, v25.8h, v26.8h, v27.8h}, [SHIFT], 64
    shrn            v4.4h, v4.4s, #16
    shrn            v5.4h, v5.4s, #16
    shrn            v6.4h, v6.4s, #16
    shrn            v7.4h, v7.4s, #16
    shrn2           v4.8h, v16.4s, #16
    shrn2           v5.8h, v17.4s, #16
    shrn2           v6.8h, v18.4s, #16
    shrn2           v7.8h, v19.4s, #16
    neg             v24.8h, v24.8h     /* negate shift counts so that ushl shifts right */
    neg             v25.8h, v25.8h
    neg             v26.8h, v26.8h
    neg             v27.8h, v27.8h
    sshr            v0.8h, v0.8h, #15  /* extract sign */
    sshr            v1.8h, v1.8h, #15
    sshr            v2.8h, v2.8h, #15
    sshr            v3.8h, v3.8h, #15
    ushl            v4.8h, v4.8h, v24.8h  /* shift right by the per-coefficient count */
    ushl            v5.8h, v5.8h, v25.8h
    ushl            v6.8h, v6.8h, v26.8h
    ushl            v7.8h, v7.8h, v27.8h

    eor             v4.16b, v4.16b, v0.16b  /* restore sign */
    eor             v5.16b, v5.16b, v1.16b
    eor             v6.16b, v6.16b, v2.16b
    eor             v7.16b, v7.16b, v3.16b
    sub             v4.8h, v4.8h, v0.8h
    sub             v5.8h, v5.8h, v1.8h
    sub             v6.8h, v6.8h, v2.8h
    sub             v7.8h, v7.8h, v3.8h
    st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [COEF_BLOCK], 64

    b.ne            1b

    br              x30  /* return */

    .unreq          COEF_BLOCK
    .unreq          DIVISORS
    .unreq          WORKSPACE
    .unreq          RECIPROCAL
    .unreq          CORRECTION
    .unreq          SHIFT
    .unreq          LOOP_COUNT


/*****************************************************************************/

/*
 * Downsample pixel values of a single component.
 * This version handles the common case of 2:1 horizontal and 1:1 vertical,
 * without smoothing.
 *
 * GLOBAL(void)
 * jsimd_h2v1_downsample_neon(JDIMENSION image_width, int max_v_samp_factor,
 *                            JDIMENSION v_samp_factor,
 *                            JDIMENSION width_in_blocks,
 *                            JSAMPARRAY input_data, JSAMPARRAY output_data);
 */
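
/* For reference, a scalar sketch of the kernel (as in jcsample.c): each output
 * sample is the average of a horizontal pair of input samples, with an
 * alternating 0,1 bias so that rounding errors do not accumulate:
 *
 *   for (j = 0, bias = 0; j < output_cols; j++, bias ^= 1)
 *     outptr[j] = (inptr[2 * j] + inptr[2 * j + 1] + bias) >> 1;
 *
 * The bias pattern lives in v16 below, and the tbl mask in v18 replicates the
 * rightmost valid sample so that a partial final block averages cleanly.
 */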

asm_function jsimd_h2v1_downsample_neon
    IMAGE_WIDTH     .req x0
    MAX_V_SAMP      .req x1
    V_SAMP          .req x2
    BLOCK_WIDTH     .req x3
    INPUT_DATA      .req x4
    OUTPUT_DATA     .req x5
    OUTPTR          .req x9
    INPTR           .req x10
    TMP1            .req x11
    TMP2            .req x12
    TMP3            .req x13
    TMPDUP          .req w15

    mov             TMPDUP, #0x10000          /* 16-bit lanes: bias = 0, 1, 0, 1, ... */
    lsl             TMP2, BLOCK_WIDTH, #4
    sub             TMP2, TMP2, IMAGE_WIDTH   /* TMP2 = number of padding columns */
    get_symbol_loc  TMP3, Ljsimd_h2_downsample_neon_consts
    add             TMP3, TMP3, TMP2, lsl #4  /* select the tbl mask matching the padding */
    dup             v16.4s, TMPDUP
    ld1             {v18.16b}, [TMP3]

1:  /* row loop */
    ldr             INPTR, [INPUT_DATA], #8
    ldr             OUTPTR, [OUTPUT_DATA], #8
    subs            TMP1, BLOCK_WIDTH, #1
    b.eq            3f
2:  /* columns */
    ld1             {v0.16b}, [INPTR], #16
    mov             v4.16b, v16.16b
    subs            TMP1, TMP1, #1
    uadalp          v4.8h, v0.16b
    shrn            v6.8b, v4.8h, #1
    st1             {v6.8b}, [OUTPTR], #8
    b.ne            2b
3:  /* last columns */
    ld1             {v0.16b}, [INPTR]
    mov             v4.16b, v16.16b
    subs            V_SAMP, V_SAMP, #1
    /* expand right */
    tbl             v2.16b, {v0.16b}, v18.16b
    uadalp          v4.8h, v2.16b
    shrn            v6.8b, v4.8h, #1
    st1             {v6.8b}, [OUTPTR], #8
    b.ne            1b

    br              x30

    .unreq          IMAGE_WIDTH
    .unreq          MAX_V_SAMP
    .unreq          V_SAMP
    .unreq          BLOCK_WIDTH
    .unreq          INPUT_DATA
    .unreq          OUTPUT_DATA
    .unreq          OUTPTR
    .unreq          INPTR
    .unreq          TMP1
    .unreq          TMP2
    .unreq          TMP3
    .unreq          TMPDUP


/*****************************************************************************/

/*
 * Downsample pixel values of a single component.
 * This version handles the common case of 2:1 horizontal and 2:1 vertical,
 * without smoothing.
 *
 * GLOBAL(void)
 * jsimd_h2v2_downsample_neon(JDIMENSION image_width, int max_v_samp_factor,
 *                            JDIMENSION v_samp_factor,
 *                            JDIMENSION width_in_blocks,
 *                            JSAMPARRAY input_data, JSAMPARRAY output_data);
 */
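
/* For reference, a scalar sketch of the kernel (as in jcsample.c): each output
 * sample is the average of a 2x2 block of input samples, with an alternating
 * 1,2 bias so that rounding errors do not accumulate:
 *
 *   for (j = 0, bias = 1; j < output_cols; j++, bias ^= 3)
 *     outptr[j] = (inptr0[2 * j] + inptr0[2 * j + 1] +
 *                  inptr1[2 * j] + inptr1[2 * j + 1] + bias) >> 2;
 */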

.balign 16
asm_function jsimd_h2v2_downsample_neon
    IMAGE_WIDTH     .req x0
    MAX_V_SAMP      .req x1
    V_SAMP          .req x2
    BLOCK_WIDTH     .req x3
    INPUT_DATA      .req x4
    OUTPUT_DATA     .req x5
    OUTPTR          .req x9
    INPTR0          .req x10
    INPTR1          .req x14
    TMP1            .req x11
    TMP2            .req x12
    TMP3            .req x13
    TMPDUP          .req w15

    mov             TMPDUP, #1
    lsl             TMP2, BLOCK_WIDTH, #4
    lsl             TMPDUP, TMPDUP, #17
    sub             TMP2, TMP2, IMAGE_WIDTH   /* TMP2 = number of padding columns */
    get_symbol_loc  TMP3, Ljsimd_h2_downsample_neon_consts
    orr             TMPDUP, TMPDUP, #1        /* 16-bit lanes: bias = 1, 2, 1, 2, ... */
    add             TMP3, TMP3, TMP2, lsl #4  /* select the tbl mask matching the padding */
    dup             v16.4s, TMPDUP
    ld1             {v18.16b}, [TMP3]

1:  /* row loop */
    ldr             INPTR0, [INPUT_DATA], #8
    ldr             OUTPTR, [OUTPUT_DATA], #8
    ldr             INPTR1, [INPUT_DATA], #8
    subs            TMP1, BLOCK_WIDTH, #1
    b.eq            3f
2:  /* columns */
    ld1             {v0.16b}, [INPTR0], #16
    ld1             {v1.16b}, [INPTR1], #16
    mov             v4.16b, v16.16b
    subs            TMP1, TMP1, #1
    uadalp          v4.8h, v0.16b
    uadalp          v4.8h, v1.16b
    shrn            v6.8b, v4.8h, #2
    st1             {v6.8b}, [OUTPTR], #8
    b.ne            2b
3:  /* last columns */
    ld1             {v0.16b}, [INPTR0], #16
    ld1             {v1.16b}, [INPTR1], #16
    mov             v4.16b, v16.16b
    subs            V_SAMP, V_SAMP, #1
    /* expand right */
    tbl             v2.16b, {v0.16b}, v18.16b
    tbl             v3.16b, {v1.16b}, v18.16b
    uadalp          v4.8h, v2.16b
    uadalp          v4.8h, v3.16b
    shrn            v6.8b, v4.8h, #2
    st1             {v6.8b}, [OUTPTR], #8
    b.ne            1b

    br              x30

    .unreq          IMAGE_WIDTH
    .unreq          MAX_V_SAMP
    .unreq          V_SAMP
    .unreq          BLOCK_WIDTH
    .unreq          INPUT_DATA
    .unreq          OUTPUT_DATA
    .unreq          OUTPTR
    .unreq          INPTR0
    .unreq          INPTR1
    .unreq          TMP1
    .unreq          TMP2
    .unreq          TMP3
    .unreq          TMPDUP


/*****************************************************************************/

/*
 * GLOBAL(JOCTET *)
 * jsimd_huff_encode_one_block(working_state *state, JOCTET *buffer,
 *                             JCOEFPTR block, int last_dc_val,
 *                             c_derived_tbl *dctbl, c_derived_tbl *actbl)
 *
 */
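
/* For reference, the per-block flow both variants below implement (cf.
 * jchuff.c): zigzag-reorder the 64 coefficients, Huffman-encode the DC
 * difference against last_dc_val, then walk the AC coefficients emitting a
 * (run, size) code followed by the value bits for each nonzero coefficient,
 * a ZRL code for each full run of 16 zeros, and an EOB code if the block
 * ends in zeros.  The NEON code additionally builds a 64-bit zero/nonzero
 * bitmap (index0) so that zero runs can be skipped with clz instead of being
 * tested one coefficient at a time.
 */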

    BUFFER          .req x1
    PUT_BUFFER      .req x6
    PUT_BITS        .req x7
    PUT_BITSw       .req w7

.macro emit_byte
    sub             PUT_BITS, PUT_BITS, #0x8
    lsr             x19, PUT_BUFFER, PUT_BITS
    uxtb            w19, w19
    strb            w19, [BUFFER, #1]!
    cmp             w19, #0xff
    b.ne            14f
    strb            wzr, [BUFFER, #1]!
14:
.endm
.macro put_bits CODE, SIZE
    lsl             PUT_BUFFER, PUT_BUFFER, \SIZE
    add             PUT_BITS, PUT_BITS, \SIZE
    orr             PUT_BUFFER, PUT_BUFFER, \CODE
.endm
.macro checkbuf31
    cmp             PUT_BITS, #0x20
    b.lt            31f
    emit_byte
    emit_byte
    emit_byte
    emit_byte
31:
.endm
.macro checkbuf47
    cmp             PUT_BITS, #0x30
    b.lt            47f
    emit_byte
    emit_byte
    emit_byte
    emit_byte
    emit_byte
    emit_byte
47:
.endm
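
/* For reference, a scalar model of the bit-buffer protocol the macros above
 * implement (PUT_BUFFER/PUT_BITS mirror the put_buffer/put_bits fields of
 * working_state):
 *
 *   put_buffer = (put_buffer << size) | code;  // put_bits: append at LSB end
 *   put_bits += size;
 *   ...
 *   put_bits -= 8;                             // emit_byte: peel the top byte
 *   c = (put_buffer >> put_bits) & 0xFF;
 *   *(++buffer) = c;
 *   if (c == 0xFF)                             // JPEG byte stuffing
 *     *(++buffer) = 0;
 *
 * checkbuf31/checkbuf47 flush four or six bytes once 32 or 48 bits have
 * accumulated, keeping the 64-bit buffer from overflowing.
 */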

.macro generate_jsimd_huff_encode_one_block fast_tbl

.if \fast_tbl == 1
asm_function jsimd_huff_encode_one_block_neon
.else
asm_function jsimd_huff_encode_one_block_neon_slowtbl
.endif
    sub             sp, sp, 272             /* 16 bytes for x19/x20 + 256-byte workspace */
    sub             BUFFER, BUFFER, #0x1    /* BUFFER = buffer - 1 (emit_byte pre-increments) */
    /* Save ARM registers */
    stp             x19, x20, [sp]
    get_symbol_loc  x15, Ljsimd_huff_encode_one_block_neon_consts
    ldr             PUT_BUFFER, [x0, #0x10]
    ldr             PUT_BITSw, [x0, #0x18]
    ldrsh           w12, [x2]               /* load DC coeff in w12 */
    /* prepare data */
.if \fast_tbl == 1
    ld1             {v23.16b}, [x15], #16
    ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x15], #64
    ld1             {v4.16b, v5.16b, v6.16b, v7.16b}, [x15], #64
    ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x15], #64
    ld1             {v24.16b, v25.16b, v26.16b, v27.16b}, [x2], #64
    ld1             {v28.16b, v29.16b, v30.16b, v31.16b}, [x2], #64
    sub             w12, w12, w3      /* w12 = DC - last_dc_val (w3 is dead afterwards) */
    /* ZigZag 8x8 */
    tbl             v0.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v0.16b
    tbl             v1.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v1.16b
    tbl             v2.16b, {v25.16b, v26.16b, v27.16b, v28.16b}, v2.16b
    tbl             v3.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v3.16b
    tbl             v4.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v4.16b
    tbl             v5.16b, {v25.16b, v26.16b, v27.16b, v28.16b}, v5.16b
    tbl             v6.16b, {v27.16b, v28.16b, v29.16b, v30.16b}, v6.16b
    tbl             v7.16b, {v29.16b, v30.16b, v31.16b}, v7.16b
    ins             v0.h[0], w12
    tbx             v1.16b, {v28.16b}, v16.16b
    tbx             v2.16b, {v29.16b, v30.16b}, v17.16b
    tbx             v5.16b, {v29.16b, v30.16b}, v18.16b
    tbx             v6.16b, {v31.16b}, v19.16b
.else
      add             x13, x2, #0x22
      sub             w12, w12, w3    /* w12 = DC - last_dc_val (w3 is dead afterwards) */
    ld1             {v23.16b}, [x15]
      add             x14, x2, #0x18
      add             x3, x2, #0x36
    ins             v0.h[0], w12
      add             x9, x2, #0x2
    ld1             {v1.h}[0], [x13]
      add             x15, x2, #0x30
    ld1             {v2.h}[0], [x14]
      add             x19, x2, #0x26
    ld1             {v3.h}[0], [x3]
      add             x20, x2, #0x28
    ld1             {v0.h}[1], [x9]
      add             x12, x2, #0x10
    ld1             {v1.h}[1], [x15]
      add             x13, x2, #0x40
    ld1             {v2.h}[1], [x19]
      add             x14, x2, #0x34
    ld1             {v3.h}[1], [x20]
      add             x3, x2, #0x1a
    ld1             {v0.h}[2], [x12]
      add             x9, x2, #0x20
    ld1             {v1.h}[2], [x13]
      add             x15, x2, #0x32
    ld1             {v2.h}[2], [x14]
      add             x19, x2, #0x42
    ld1             {v3.h}[2], [x3]
      add             x20, x2, #0xc
    ld1             {v0.h}[3], [x9]
      add             x12, x2, #0x12
    ld1             {v1.h}[3], [x15]
      add             x13, x2, #0x24
    ld1             {v2.h}[3], [x19]
      add             x14, x2, #0x50
    ld1             {v3.h}[3], [x20]
      add             x3, x2, #0xe
    ld1             {v0.h}[4], [x12]
      add             x9, x2, #0x4
    ld1             {v1.h}[4], [x13]
      add             x15, x2, #0x16
    ld1             {v2.h}[4], [x14]
      add             x19, x2, #0x60
    ld1             {v3.h}[4], [x3]
      add             x20, x2, #0x1c
    ld1             {v0.h}[5], [x9]
      add             x12, x2, #0x6
    ld1             {v1.h}[5], [x15]
      add             x13, x2, #0x8
    ld1             {v2.h}[5], [x19]
      add             x14, x2, #0x52
    ld1             {v3.h}[5], [x20]
      add             x3, x2, #0x2a
    ld1             {v0.h}[6], [x12]
      add             x9, x2, #0x14
    ld1             {v1.h}[6], [x13]
      add             x15, x2, #0xa
    ld1             {v2.h}[6], [x14]
      add             x19, x2, #0x44
    ld1             {v3.h}[6], [x3]
      add             x20, x2, #0x38
    ld1             {v0.h}[7], [x9]
      add             x12, x2, #0x46
    ld1             {v1.h}[7], [x15]
      add             x13, x2, #0x3a
    ld1             {v2.h}[7], [x19]
      add             x14, x2, #0x74
    ld1             {v3.h}[7], [x20]
      add             x3, x2, #0x6a
    ld1             {v4.h}[0], [x12]
      add             x9, x2, #0x54
    ld1             {v5.h}[0], [x13]
      add             x15, x2, #0x2c
    ld1             {v6.h}[0], [x14]
      add             x19, x2, #0x76
    ld1             {v7.h}[0], [x3]
      add             x20, x2, #0x78
    ld1             {v4.h}[1], [x9]
      add             x12, x2, #0x62
    ld1             {v5.h}[1], [x15]
      add             x13, x2, #0x1e
    ld1             {v6.h}[1], [x19]
      add             x14, x2, #0x68
    ld1             {v7.h}[1], [x20]
      add             x3, x2, #0x7a
    ld1             {v4.h}[2], [x12]
      add             x9, x2, #0x70
    ld1             {v5.h}[2], [x13]
      add             x15, x2, #0x2e
    ld1             {v6.h}[2], [x14]
      add             x19, x2, #0x5a
    ld1             {v7.h}[2], [x3]
      add             x20, x2, #0x6c
    ld1             {v4.h}[3], [x9]
      add             x12, x2, #0x72
    ld1             {v5.h}[3], [x15]
      add             x13, x2, #0x3c
    ld1             {v6.h}[3], [x19]
      add             x14, x2, #0x4c
    ld1             {v7.h}[3], [x20]
      add             x3, x2, #0x5e
    ld1             {v4.h}[4], [x12]
      add             x9, x2, #0x64
    ld1             {v5.h}[4], [x13]
      add             x15, x2, #0x4a
    ld1             {v6.h}[4], [x14]
      add             x19, x2, #0x3e
    ld1             {v7.h}[4], [x3]
      add             x20, x2, #0x6e
    ld1             {v4.h}[5], [x9]
      add             x12, x2, #0x56
    ld1             {v5.h}[5], [x15]
      add             x13, x2, #0x58
    ld1             {v6.h}[5], [x19]
      add             x14, x2, #0x4e
    ld1             {v7.h}[5], [x20]
      add             x3, x2, #0x7c
    ld1             {v4.h}[6], [x12]
      add             x9, x2, #0x48
    ld1             {v5.h}[6], [x13]
      add             x15, x2, #0x66
    ld1             {v6.h}[6], [x14]
      add             x19, x2, #0x5c
    ld1             {v7.h}[6], [x3]
      add             x20, x2, #0x7e
    ld1             {v4.h}[7], [x9]
    ld1             {v5.h}[7], [x15]
    ld1             {v6.h}[7], [x19]
    ld1             {v7.h}[7], [x20]
.endif
    cmlt            v24.8h, v0.8h, #0   /* sign masks: all ones where coeff < 0 */
    cmlt            v25.8h, v1.8h, #0
    cmlt            v26.8h, v2.8h, #0
    cmlt            v27.8h, v3.8h, #0
    cmlt            v28.8h, v4.8h, #0
    cmlt            v29.8h, v5.8h, #0
    cmlt            v30.8h, v6.8h, #0
    cmlt            v31.8h, v7.8h, #0
    abs             v0.8h, v0.8h        /* magnitudes */
    abs             v1.8h, v1.8h
    abs             v2.8h, v2.8h
    abs             v3.8h, v3.8h
    abs             v4.8h, v4.8h
    abs             v5.8h, v5.8h
    abs             v6.8h, v6.8h
    abs             v7.8h, v7.8h
    eor             v24.16b, v24.16b, v0.16b  /* for x < 0, ~abs(x) == x - 1: the
                                                 complemented bits JPEG emits for
                                                 negative coefficients */
    eor             v25.16b, v25.16b, v1.16b
    eor             v26.16b, v26.16b, v2.16b
    eor             v27.16b, v27.16b, v3.16b
    eor             v28.16b, v28.16b, v4.16b
    eor             v29.16b, v29.16b, v5.16b
    eor             v30.16b, v30.16b, v6.16b
    eor             v31.16b, v31.16b, v7.16b
    cmeq            v16.8h, v0.8h, #0
    cmeq            v17.8h, v1.8h, #0
    cmeq            v18.8h, v2.8h, #0
    cmeq            v19.8h, v3.8h, #0
    cmeq            v20.8h, v4.8h, #0
    cmeq            v21.8h, v5.8h, #0
    cmeq            v22.8h, v6.8h, #0
    xtn             v16.8b, v16.8h
    xtn             v18.8b, v18.8h
    xtn             v20.8b, v20.8h
    xtn             v22.8b, v22.8h
      umov            w14, v0.h[0]
    xtn2            v16.16b, v17.8h
      umov            w13, v24.h[0]
    xtn2            v18.16b, v19.8h
      clz             w14, w14
    xtn2            v20.16b, v21.8h
      lsl             w13, w13, w14
    cmeq            v17.8h, v7.8h, #0
      sub             w12, w14, #32
    xtn2            v22.16b, v17.8h
      lsr             w13, w13, w14
    and             v16.16b, v16.16b, v23.16b
      neg             w12, w12
    and             v18.16b, v18.16b, v23.16b
      add             x3, x4, #0x400           /* x3 = dctbl->ehufsi */
    and             v20.16b, v20.16b, v23.16b
      add             x15, sp, #0x90           /* x15 = t2 */
    and             v22.16b, v22.16b, v23.16b
      ldr             w10, [x4, x12, lsl #2]
    addp            v16.16b, v16.16b, v18.16b
      ldrb            w11, [x3, x12]
    addp            v20.16b, v20.16b, v22.16b
      checkbuf47
    addp            v16.16b, v16.16b, v20.16b
      put_bits        x10, x11
    addp            v16.16b, v16.16b, v18.16b
      checkbuf47
    umov            x9, v16.D[0]
      put_bits        x13, x12
    cnt             v17.8b, v16.8b
      mvn             x9, x9
    addv            B18, v17.8b
      add             x4, x5, #0x400   /* x4 = actbl->ehufsi */
    umov            w12, v18.b[0]
      lsr             x9, x9, #0x1     /* drop the DC bit */
    ldr             w13, [x5, #0x3c0]  /* w13 = actbl->ehufco[0xf0] */
    rbit            x9, x9             /* x9 = index0 */
    ldrb            w14, [x4, #0xf0]   /* w14 = actbl->ehufsi[0xf0] */
    cmp             w12, #(64-8)
    add             x11, sp, #16
    b.lt            4f
    cbz             x9, 6f
    st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x11], #64
    st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x11], #64
    st1             {v24.8h, v25.8h, v26.8h, v27.8h}, [x11], #64
    st1             {v28.8h, v29.8h, v30.8h, v31.8h}, [x11], #64
1:  /* skip the next run of zero coefficients with clz */
    clz             x2, x9
    add             x15, x15, x2, lsl #1
    lsl             x9, x9, x2
    ldrh            w20, [x15, #-126]
2:  /* emit a ZRL code for each full run of 16 zeros */
    cmp             x2, #0x10
    b.lt            3f
    sub             x2, x2, #0x10
    checkbuf47
    put_bits        x13, x14
    b               2b
3:  /* compute nbits with clz, then emit the (run, size) code and value bits */
    clz             w20, w20
    ldrh            w3, [x15, #2]!
    sub             w11, w20, #32
    lsl             w3, w3, w20
    neg             w11, w11
    lsr             w3, w3, w20
    add             x2, x11, x2, lsl #4
    lsl             x9, x9, #0x1
    ldr             w12, [x5, x2, lsl #2]
    ldrb            w10, [x4, x2]
    checkbuf31
    put_bits        x12, x10
    put_bits        x3, x11
    cbnz            x9, 1b
    b               6f
4:  /* fewer than 56 zero coefficients: precompute nbits and the value bits
     * for all 64 coefficients in NEON */
    movi            v21.8h, #0x0010
    clz             v0.8h, v0.8h    /* v0..v7 = clz(magnitude) */
    clz             v1.8h, v1.8h
    clz             v2.8h, v2.8h
    clz             v3.8h, v3.8h
    clz             v4.8h, v4.8h
    clz             v5.8h, v5.8h
    clz             v6.8h, v6.8h
    clz             v7.8h, v7.8h
    ushl            v24.8h, v24.8h, v0.8h  /* drop the bits above the leading one */
    ushl            v25.8h, v25.8h, v1.8h
    ushl            v26.8h, v26.8h, v2.8h
    ushl            v27.8h, v27.8h, v3.8h
    ushl            v28.8h, v28.8h, v4.8h
    ushl            v29.8h, v29.8h, v5.8h
    ushl            v30.8h, v30.8h, v6.8h
    ushl            v31.8h, v31.8h, v7.8h
    neg             v0.8h, v0.8h
    neg             v1.8h, v1.8h
    neg             v2.8h, v2.8h
    neg             v3.8h, v3.8h
    neg             v4.8h, v4.8h
    neg             v5.8h, v5.8h
    neg             v6.8h, v6.8h
    neg             v7.8h, v7.8h
    ushl            v24.8h, v24.8h, v0.8h  /* shift the value bits back into place */
    ushl            v25.8h, v25.8h, v1.8h
    ushl            v26.8h, v26.8h, v2.8h
    ushl            v27.8h, v27.8h, v3.8h
    ushl            v28.8h, v28.8h, v4.8h
    ushl            v29.8h, v29.8h, v5.8h
    ushl            v30.8h, v30.8h, v6.8h
    ushl            v31.8h, v31.8h, v7.8h
    add             v0.8h, v21.8h, v0.8h   /* nbits = 16 - clz */
    add             v1.8h, v21.8h, v1.8h
    add             v2.8h, v21.8h, v2.8h
    add             v3.8h, v21.8h, v3.8h
    add             v4.8h, v21.8h, v4.8h
    add             v5.8h, v21.8h, v5.8h
    add             v6.8h, v21.8h, v6.8h
    add             v7.8h, v21.8h, v7.8h
    st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x11], #64
    st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x11], #64
    st1             {v24.8h, v25.8h, v26.8h, v27.8h}, [x11], #64
    st1             {v28.8h, v29.8h, v30.8h, v31.8h}, [x11], #64
1:  /* skip the next run of zero coefficients with clz */
    clz             x2, x9
    add             x15, x15, x2, lsl #1
    lsl             x9, x9, x2
    ldrh            w11, [x15, #-126]
2:  /* emit a ZRL code for each full run of 16 zeros */
    cmp             x2, #0x10
    b.lt            3f
    sub             x2, x2, #0x10
    checkbuf47
    put_bits        x13, x14
    b               2b
3:  /* emit the (run, size) code, then the value bits */
    ldrh            w3, [x15, #2]!
    add             x2, x11, x2, lsl #4
    lsl             x9, x9, #0x1
    ldr             w12, [x5, x2, lsl #2]
    ldrb            w10, [x4, x2]
    checkbuf31
    put_bits        x12, x10
    put_bits        x3, x11
    cbnz            x9, 1b
6:  /* append an EOB code unless the block ended on a nonzero coefficient */
    add             x13, sp, #0x10e
    cmp             x15, x13
    b.hs            1f
    ldr             w12, [x5]    /* actbl->ehufco[0x00] (EOB) */
    ldrb            w14, [x4]    /* actbl->ehufsi[0x00] */
    checkbuf47
    put_bits        x12, x14
1:
    str             PUT_BUFFER, [x0, #0x10]
    str             PUT_BITSw, [x0, #0x18]
    ldp             x19, x20, [sp], 16
    add             x0, BUFFER, #0x1
    add             sp, sp, 256
    br              x30

.endm

generate_jsimd_huff_encode_one_block 1
generate_jsimd_huff_encode_one_block 0

    .unreq          BUFFER
    .unreq          PUT_BUFFER
    .unreq          PUT_BITS
    .unreq          PUT_BITSw

.purgem emit_byte
.purgem put_bits
.purgem checkbuf31
.purgem checkbuf47
