/*
 * jidctfst-neon.c - fast integer IDCT (Arm Neon)
 *
 * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

#define JPEG_INTERNALS
#include "../../jinclude.h"
#include "../../jpeglib.h"
#include "../../jsimd.h"
#include "../../jdct.h"
#include "../../jsimddct.h"
#include "../jsimd.h"
#include "align.h"

#include <arm_neon.h>


/* jsimd_idct_ifast_neon() performs dequantization and a fast, not so accurate
 * inverse DCT (Discrete Cosine Transform) on one block of coefficients.  It
 * uses the same calculations and produces exactly the same output as IJG's
 * original jpeg_idct_ifast() function, which can be found in jidctfst.c.
 *
 * Scaled integer constants are used to avoid floating-point arithmetic:
 *    0.082392200 =  2688 * 2^-15
 *    0.414213562 = 13568 * 2^-15
 *    0.847759065 = 27776 * 2^-15
 *    0.613125930 = 20096 * 2^-15
 *
 * See jidctfst.c for further details of the IDCT algorithm.  Where possible,
 * the variable names and comments here in jsimd_idct_ifast_neon() match up
 * with those in jpeg_idct_ifast().
 */
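
/* Usage sketch (illustrative only; every name below other than this function
 * is hypothetical or borrowed from the libjpeg-turbo decoder): in practice
 * this function is reached through the SIMD dispatch layer (jsimd_idct_ifast()
 * in the architecture-specific jsimd.c), roughly as follows:
 *
 *   JSAMPLE rows[DCTSIZE][DCTSIZE];
 *   JSAMPROW output_buf[DCTSIZE];          (one pointer per output row)
 *   for (int i = 0; i < DCTSIZE; i++)
 *     output_buf[i] = rows[i];
 *   jsimd_idct_ifast_neon(compptr->dct_table, coef_block, output_buf, 0);
 *
 * where compptr->dct_table holds the per-component IFAST_MULT_TYPE
 * dequantization multipliers and coef_block holds 64 quantized coefficients.
 */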

#define PASS1_BITS  2

#define F_0_082  2688
#define F_0_414  13568
#define F_0_847  27776
#define F_0_613  20096


ALIGN(16) static const int16_t jsimd_idct_ifast_neon_consts[] = {
  F_0_082, F_0_414, F_0_847, F_0_613
};
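
/* vqdmulh_lane_s16(a, b, lane) computes (2 * a * b[lane]) >> 16, i.e.
 * (a * b[lane]) >> 15 for 16-bit lanes, so each constant above represents
 * its fraction scaled by 2^15.  Multiplication by a constant c in [1, 2) is
 * then performed as, for example:
 *   x * 1.414213562 = vqdmulh_lane_s16(x, consts, 1) + x
 * and multiplication by 2.613125930 adds 2 * x instead of x.
 */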

void jsimd_idct_ifast_neon(void *dct_table, JCOEFPTR coef_block,
                           JSAMPARRAY output_buf, JDIMENSION output_col)
{
  IFAST_MULT_TYPE *quantptr = dct_table;

  /* Load DCT coefficients. */
  int16x8_t row0 = vld1q_s16(coef_block + 0 * DCTSIZE);
  int16x8_t row1 = vld1q_s16(coef_block + 1 * DCTSIZE);
  int16x8_t row2 = vld1q_s16(coef_block + 2 * DCTSIZE);
  int16x8_t row3 = vld1q_s16(coef_block + 3 * DCTSIZE);
  int16x8_t row4 = vld1q_s16(coef_block + 4 * DCTSIZE);
  int16x8_t row5 = vld1q_s16(coef_block + 5 * DCTSIZE);
  int16x8_t row6 = vld1q_s16(coef_block + 6 * DCTSIZE);
  int16x8_t row7 = vld1q_s16(coef_block + 7 * DCTSIZE);

  /* Load quantization table values for DC coefficients. */
  int16x8_t quant_row0 = vld1q_s16(quantptr + 0 * DCTSIZE);
  /* Dequantize DC coefficients. */
  row0 = vmulq_s16(row0, quant_row0);

  /* Construct bitmap to test if all AC coefficients are 0. */
  int16x8_t bitmap = vorrq_s16(row1, row2);
  bitmap = vorrq_s16(bitmap, row3);
  bitmap = vorrq_s16(bitmap, row4);
  bitmap = vorrq_s16(bitmap, row5);
  bitmap = vorrq_s16(bitmap, row6);
  bitmap = vorrq_s16(bitmap, row7);

  int64_t left_ac_bitmap = vgetq_lane_s64(vreinterpretq_s64_s16(bitmap), 0);
  int64_t right_ac_bitmap = vgetq_lane_s64(vreinterpretq_s64_s16(bitmap), 1);
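  /* The low half of the bitmap covers columns 0-3 and the high half covers
   * columns 4-7; a zero half means that every AC coefficient in those four
   * columns is zero, so the corresponding 1-D IDCTs can be skipped.
   */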

  /* Load IDCT conversion constants. */
  const int16x4_t consts = vld1_s16(jsimd_idct_ifast_neon_consts);

  if (left_ac_bitmap == 0 && right_ac_bitmap == 0) {
    /* All AC coefficients are zero.
     * Compute DC values and duplicate into vectors.
     */
    int16x8_t dcval = row0;
    row1 = dcval;
    row2 = dcval;
    row3 = dcval;
    row4 = dcval;
    row5 = dcval;
    row6 = dcval;
    row7 = dcval;
  } else if (left_ac_bitmap == 0) {
    /* AC coefficients are zero for columns 0, 1, 2, and 3.
     * Use DC values for these columns.
     */
    int16x4_t dcval = vget_low_s16(row0);

    /* Commence regular fast IDCT computation for columns 4, 5, 6, and 7. */

    /* Load quantization table. */
    int16x4_t quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE + 4);
    int16x4_t quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE + 4);
    int16x4_t quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE + 4);
    int16x4_t quant_row4 = vld1_s16(quantptr + 4 * DCTSIZE + 4);
    int16x4_t quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE + 4);
    int16x4_t quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE + 4);
    int16x4_t quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE + 4);

    /* Even part: dequantize DCT coefficients. */
    int16x4_t tmp0 = vget_high_s16(row0);
    int16x4_t tmp1 = vmul_s16(vget_high_s16(row2), quant_row2);
    int16x4_t tmp2 = vmul_s16(vget_high_s16(row4), quant_row4);
    int16x4_t tmp3 = vmul_s16(vget_high_s16(row6), quant_row6);

    int16x4_t tmp10 = vadd_s16(tmp0, tmp2);   /* phase 3 */
    int16x4_t tmp11 = vsub_s16(tmp0, tmp2);

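    /* tmp12 = 1.414213562 * (tmp1 - tmp3) - tmp13, evaluated via vqdmulh as
     * described above the constants table.  (The same computation recurs in
     * the branches below and in pass 2.)
     */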
    int16x4_t tmp13 = vadd_s16(tmp1, tmp3);   /* phases 5-3 */
    int16x4_t tmp1_sub_tmp3 = vsub_s16(tmp1, tmp3);
    int16x4_t tmp12 = vqdmulh_lane_s16(tmp1_sub_tmp3, consts, 1);
    tmp12 = vadd_s16(tmp12, tmp1_sub_tmp3);
    tmp12 = vsub_s16(tmp12, tmp13);

    tmp0 = vadd_s16(tmp10, tmp13);   /* phase 2 */
    tmp3 = vsub_s16(tmp10, tmp13);
    tmp1 = vadd_s16(tmp11, tmp12);
    tmp2 = vsub_s16(tmp11, tmp12);

    /* Odd part: dequantize DCT coefficients. */
    int16x4_t tmp4 = vmul_s16(vget_high_s16(row1), quant_row1);
    int16x4_t tmp5 = vmul_s16(vget_high_s16(row3), quant_row3);
    int16x4_t tmp6 = vmul_s16(vget_high_s16(row5), quant_row5);
    int16x4_t tmp7 = vmul_s16(vget_high_s16(row7), quant_row7);

    int16x4_t z13 = vadd_s16(tmp6, tmp5);     /* phase 6 */
    int16x4_t neg_z10 = vsub_s16(tmp5, tmp6);
    int16x4_t z11 = vadd_s16(tmp4, tmp7);
    int16x4_t z12 = vsub_s16(tmp4, tmp7);

    tmp7 = vadd_s16(z11, z13);                /* phase 5 */
    int16x4_t z11_sub_z13 = vsub_s16(z11, z13);
    tmp11 = vqdmulh_lane_s16(z11_sub_z13, consts, 1);
    tmp11 = vadd_s16(tmp11, z11_sub_z13);

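    /* tmp10 = 1.082392200 * z12 - z5 and tmp12 = z5 - 2.613125930 * z10 from
     * jpeg_idct_ifast(), expressed via neg_z10 = -z10 and the vqdmulh scheme
     * described above: 0.082 * z12 + z12 gives 1.082 * z12, and
     * 0.613 * neg_z10 + 2 * neg_z10 gives 2.613 * neg_z10.
     */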
    int16x4_t z10_add_z12 = vsub_s16(z12, neg_z10);
    int16x4_t z5 = vqdmulh_lane_s16(z10_add_z12, consts, 2);
    z5 = vadd_s16(z5, z10_add_z12);
    tmp10 = vqdmulh_lane_s16(z12, consts, 0);
    tmp10 = vadd_s16(tmp10, z12);
    tmp10 = vsub_s16(tmp10, z5);
    tmp12 = vqdmulh_lane_s16(neg_z10, consts, 3);
    tmp12 = vadd_s16(tmp12, vadd_s16(neg_z10, neg_z10));
    tmp12 = vadd_s16(tmp12, z5);

    tmp6 = vsub_s16(tmp12, tmp7);    /* phase 2 */
    tmp5 = vsub_s16(tmp11, tmp6);
    tmp4 = vadd_s16(tmp10, tmp5);

    row0 = vcombine_s16(dcval, vadd_s16(tmp0, tmp7));
    row7 = vcombine_s16(dcval, vsub_s16(tmp0, tmp7));
    row1 = vcombine_s16(dcval, vadd_s16(tmp1, tmp6));
    row6 = vcombine_s16(dcval, vsub_s16(tmp1, tmp6));
    row2 = vcombine_s16(dcval, vadd_s16(tmp2, tmp5));
    row5 = vcombine_s16(dcval, vsub_s16(tmp2, tmp5));
    row4 = vcombine_s16(dcval, vadd_s16(tmp3, tmp4));
    row3 = vcombine_s16(dcval, vsub_s16(tmp3, tmp4));
  } else if (right_ac_bitmap == 0) {
    /* AC coefficients are zero for columns 4, 5, 6, and 7.
     * Use DC values for these columns.
     */
    int16x4_t dcval = vget_high_s16(row0);

    /* Commence regular fast IDCT computation for columns 0, 1, 2, and 3. */

    /* Load quantization table. */
    int16x4_t quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE);
    int16x4_t quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE);
    int16x4_t quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE);
    int16x4_t quant_row4 = vld1_s16(quantptr + 4 * DCTSIZE);
    int16x4_t quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE);
    int16x4_t quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE);
    int16x4_t quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE);

    /* Even part: dequantize DCT coefficients. */
    int16x4_t tmp0 = vget_low_s16(row0);
    int16x4_t tmp1 = vmul_s16(vget_low_s16(row2), quant_row2);
    int16x4_t tmp2 = vmul_s16(vget_low_s16(row4), quant_row4);
    int16x4_t tmp3 = vmul_s16(vget_low_s16(row6), quant_row6);

    int16x4_t tmp10 = vadd_s16(tmp0, tmp2);   /* phase 3 */
    int16x4_t tmp11 = vsub_s16(tmp0, tmp2);

    int16x4_t tmp13 = vadd_s16(tmp1, tmp3);   /* phases 5-3 */
    int16x4_t tmp1_sub_tmp3 = vsub_s16(tmp1, tmp3);
    int16x4_t tmp12 = vqdmulh_lane_s16(tmp1_sub_tmp3, consts, 1);
    tmp12 = vadd_s16(tmp12, tmp1_sub_tmp3);
    tmp12 = vsub_s16(tmp12, tmp13);

    tmp0 = vadd_s16(tmp10, tmp13);   /* phase 2 */
    tmp3 = vsub_s16(tmp10, tmp13);
    tmp1 = vadd_s16(tmp11, tmp12);
    tmp2 = vsub_s16(tmp11, tmp12);

    /* Odd part: dequantize DCT coefficients. */
    int16x4_t tmp4 = vmul_s16(vget_low_s16(row1), quant_row1);
    int16x4_t tmp5 = vmul_s16(vget_low_s16(row3), quant_row3);
    int16x4_t tmp6 = vmul_s16(vget_low_s16(row5), quant_row5);
    int16x4_t tmp7 = vmul_s16(vget_low_s16(row7), quant_row7);

    int16x4_t z13 = vadd_s16(tmp6, tmp5);     /* phase 6 */
    int16x4_t neg_z10 = vsub_s16(tmp5, tmp6);
    int16x4_t z11 = vadd_s16(tmp4, tmp7);
    int16x4_t z12 = vsub_s16(tmp4, tmp7);

    tmp7 = vadd_s16(z11, z13);                /* phase 5 */
    int16x4_t z11_sub_z13 = vsub_s16(z11, z13);
    tmp11 = vqdmulh_lane_s16(z11_sub_z13, consts, 1);
    tmp11 = vadd_s16(tmp11, z11_sub_z13);

    int16x4_t z10_add_z12 = vsub_s16(z12, neg_z10);
    int16x4_t z5 = vqdmulh_lane_s16(z10_add_z12, consts, 2);
    z5 = vadd_s16(z5, z10_add_z12);
    tmp10 = vqdmulh_lane_s16(z12, consts, 0);
    tmp10 = vadd_s16(tmp10, z12);
    tmp10 = vsub_s16(tmp10, z5);
    tmp12 = vqdmulh_lane_s16(neg_z10, consts, 3);
    tmp12 = vadd_s16(tmp12, vadd_s16(neg_z10, neg_z10));
    tmp12 = vadd_s16(tmp12, z5);

    tmp6 = vsub_s16(tmp12, tmp7);    /* phase 2 */
    tmp5 = vsub_s16(tmp11, tmp6);
    tmp4 = vadd_s16(tmp10, tmp5);

    row0 = vcombine_s16(vadd_s16(tmp0, tmp7), dcval);
    row7 = vcombine_s16(vsub_s16(tmp0, tmp7), dcval);
    row1 = vcombine_s16(vadd_s16(tmp1, tmp6), dcval);
    row6 = vcombine_s16(vsub_s16(tmp1, tmp6), dcval);
    row2 = vcombine_s16(vadd_s16(tmp2, tmp5), dcval);
    row5 = vcombine_s16(vsub_s16(tmp2, tmp5), dcval);
    row4 = vcombine_s16(vadd_s16(tmp3, tmp4), dcval);
    row3 = vcombine_s16(vsub_s16(tmp3, tmp4), dcval);
  } else {
    /* Some AC coefficients are non-zero; full IDCT calculation required. */

    /* Load quantization table. */
    int16x8_t quant_row1 = vld1q_s16(quantptr + 1 * DCTSIZE);
    int16x8_t quant_row2 = vld1q_s16(quantptr + 2 * DCTSIZE);
    int16x8_t quant_row3 = vld1q_s16(quantptr + 3 * DCTSIZE);
    int16x8_t quant_row4 = vld1q_s16(quantptr + 4 * DCTSIZE);
    int16x8_t quant_row5 = vld1q_s16(quantptr + 5 * DCTSIZE);
    int16x8_t quant_row6 = vld1q_s16(quantptr + 6 * DCTSIZE);
    int16x8_t quant_row7 = vld1q_s16(quantptr + 7 * DCTSIZE);

    /* Even part: dequantize DCT coefficients. */
    int16x8_t tmp0 = row0;
    int16x8_t tmp1 = vmulq_s16(row2, quant_row2);
    int16x8_t tmp2 = vmulq_s16(row4, quant_row4);
    int16x8_t tmp3 = vmulq_s16(row6, quant_row6);

    int16x8_t tmp10 = vaddq_s16(tmp0, tmp2);   /* phase 3 */
    int16x8_t tmp11 = vsubq_s16(tmp0, tmp2);

    int16x8_t tmp13 = vaddq_s16(tmp1, tmp3);   /* phases 5-3 */
    int16x8_t tmp1_sub_tmp3 = vsubq_s16(tmp1, tmp3);
    int16x8_t tmp12 = vqdmulhq_lane_s16(tmp1_sub_tmp3, consts, 1);
    tmp12 = vaddq_s16(tmp12, tmp1_sub_tmp3);
    tmp12 = vsubq_s16(tmp12, tmp13);

    tmp0 = vaddq_s16(tmp10, tmp13);   /* phase 2 */
    tmp3 = vsubq_s16(tmp10, tmp13);
    tmp1 = vaddq_s16(tmp11, tmp12);
    tmp2 = vsubq_s16(tmp11, tmp12);

    /* Odd part: dequantize DCT coefficients. */
    int16x8_t tmp4 = vmulq_s16(row1, quant_row1);
    int16x8_t tmp5 = vmulq_s16(row3, quant_row3);
    int16x8_t tmp6 = vmulq_s16(row5, quant_row5);
    int16x8_t tmp7 = vmulq_s16(row7, quant_row7);

    int16x8_t z13 = vaddq_s16(tmp6, tmp5);     /* phase 6 */
    int16x8_t neg_z10 = vsubq_s16(tmp5, tmp6);
    int16x8_t z11 = vaddq_s16(tmp4, tmp7);
    int16x8_t z12 = vsubq_s16(tmp4, tmp7);

    tmp7 = vaddq_s16(z11, z13);                /* phase 5 */
    int16x8_t z11_sub_z13 = vsubq_s16(z11, z13);
    tmp11 = vqdmulhq_lane_s16(z11_sub_z13, consts, 1);
    tmp11 = vaddq_s16(tmp11, z11_sub_z13);

    int16x8_t z10_add_z12 = vsubq_s16(z12, neg_z10);
    int16x8_t z5 = vqdmulhq_lane_s16(z10_add_z12, consts, 2);
    z5 = vaddq_s16(z5, z10_add_z12);
    tmp10 = vqdmulhq_lane_s16(z12, consts, 0);
    tmp10 = vaddq_s16(tmp10, z12);
    tmp10 = vsubq_s16(tmp10, z5);
    tmp12 = vqdmulhq_lane_s16(neg_z10, consts, 3);
    tmp12 = vaddq_s16(tmp12, vaddq_s16(neg_z10, neg_z10));
    tmp12 = vaddq_s16(tmp12, z5);

    tmp6 = vsubq_s16(tmp12, tmp7);    /* phase 2 */
    tmp5 = vsubq_s16(tmp11, tmp6);
    tmp4 = vaddq_s16(tmp10, tmp5);

    row0 = vaddq_s16(tmp0, tmp7);
    row7 = vsubq_s16(tmp0, tmp7);
    row1 = vaddq_s16(tmp1, tmp6);
    row6 = vsubq_s16(tmp1, tmp6);
    row2 = vaddq_s16(tmp2, tmp5);
    row5 = vsubq_s16(tmp2, tmp5);
    row4 = vaddq_s16(tmp3, tmp4);
    row3 = vsubq_s16(tmp3, tmp4);
  }

  /* Transpose rows to work on columns in pass 2. */
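  /* The 8x8 16-bit transpose below is built from three interleave stages:
   * vtrnq_s16 transposes 2x2 blocks of 16-bit elements within row pairs, then
   * vtrnq_s32 and vzipq_s32 rearrange 32-bit pairs across the row groups so
   * that each colN vector ends up holding column N of the pass 1 result.
   */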
  int16x8x2_t rows_01 = vtrnq_s16(row0, row1);
  int16x8x2_t rows_23 = vtrnq_s16(row2, row3);
  int16x8x2_t rows_45 = vtrnq_s16(row4, row5);
  int16x8x2_t rows_67 = vtrnq_s16(row6, row7);

  int32x4x2_t rows_0145_l = vtrnq_s32(vreinterpretq_s32_s16(rows_01.val[0]),
                                      vreinterpretq_s32_s16(rows_45.val[0]));
  int32x4x2_t rows_0145_h = vtrnq_s32(vreinterpretq_s32_s16(rows_01.val[1]),
                                      vreinterpretq_s32_s16(rows_45.val[1]));
  int32x4x2_t rows_2367_l = vtrnq_s32(vreinterpretq_s32_s16(rows_23.val[0]),
                                      vreinterpretq_s32_s16(rows_67.val[0]));
  int32x4x2_t rows_2367_h = vtrnq_s32(vreinterpretq_s32_s16(rows_23.val[1]),
                                      vreinterpretq_s32_s16(rows_67.val[1]));

  int32x4x2_t cols_04 = vzipq_s32(rows_0145_l.val[0], rows_2367_l.val[0]);
  int32x4x2_t cols_15 = vzipq_s32(rows_0145_h.val[0], rows_2367_h.val[0]);
  int32x4x2_t cols_26 = vzipq_s32(rows_0145_l.val[1], rows_2367_l.val[1]);
  int32x4x2_t cols_37 = vzipq_s32(rows_0145_h.val[1], rows_2367_h.val[1]);

  int16x8_t col0 = vreinterpretq_s16_s32(cols_04.val[0]);
  int16x8_t col1 = vreinterpretq_s16_s32(cols_15.val[0]);
  int16x8_t col2 = vreinterpretq_s16_s32(cols_26.val[0]);
  int16x8_t col3 = vreinterpretq_s16_s32(cols_37.val[0]);
  int16x8_t col4 = vreinterpretq_s16_s32(cols_04.val[1]);
  int16x8_t col5 = vreinterpretq_s16_s32(cols_15.val[1]);
  int16x8_t col6 = vreinterpretq_s16_s32(cols_26.val[1]);
  int16x8_t col7 = vreinterpretq_s16_s32(cols_37.val[1]);

  /* 1-D IDCT, pass 2 */

  /* Even part */
  int16x8_t tmp10 = vaddq_s16(col0, col4);
  int16x8_t tmp11 = vsubq_s16(col0, col4);

  int16x8_t tmp13 = vaddq_s16(col2, col6);
  int16x8_t col2_sub_col6 = vsubq_s16(col2, col6);
  int16x8_t tmp12 = vqdmulhq_lane_s16(col2_sub_col6, consts, 1);
  tmp12 = vaddq_s16(tmp12, col2_sub_col6);
  tmp12 = vsubq_s16(tmp12, tmp13);

  int16x8_t tmp0 = vaddq_s16(tmp10, tmp13);
  int16x8_t tmp3 = vsubq_s16(tmp10, tmp13);
  int16x8_t tmp1 = vaddq_s16(tmp11, tmp12);
  int16x8_t tmp2 = vsubq_s16(tmp11, tmp12);

  /* Odd part */
  int16x8_t z13 = vaddq_s16(col5, col3);
  int16x8_t neg_z10 = vsubq_s16(col3, col5);
  int16x8_t z11 = vaddq_s16(col1, col7);
  int16x8_t z12 = vsubq_s16(col1, col7);

  int16x8_t tmp7 = vaddq_s16(z11, z13);      /* phase 5 */
  int16x8_t z11_sub_z13 = vsubq_s16(z11, z13);
  tmp11 = vqdmulhq_lane_s16(z11_sub_z13, consts, 1);
  tmp11 = vaddq_s16(tmp11, z11_sub_z13);

  int16x8_t z10_add_z12 = vsubq_s16(z12, neg_z10);
  int16x8_t z5 = vqdmulhq_lane_s16(z10_add_z12, consts, 2);
  z5 = vaddq_s16(z5, z10_add_z12);
  tmp10 = vqdmulhq_lane_s16(z12, consts, 0);
  tmp10 = vaddq_s16(tmp10, z12);
  tmp10 = vsubq_s16(tmp10, z5);
  tmp12 = vqdmulhq_lane_s16(neg_z10, consts, 3);
  tmp12 = vaddq_s16(tmp12, vaddq_s16(neg_z10, neg_z10));
  tmp12 = vaddq_s16(tmp12, z5);

  int16x8_t tmp6 = vsubq_s16(tmp12, tmp7);   /* phase 2 */
  int16x8_t tmp5 = vsubq_s16(tmp11, tmp6);
  int16x8_t tmp4 = vaddq_s16(tmp10, tmp5);

  col0 = vaddq_s16(tmp0, tmp7);
  col7 = vsubq_s16(tmp0, tmp7);
  col1 = vaddq_s16(tmp1, tmp6);
  col6 = vsubq_s16(tmp1, tmp6);
  col2 = vaddq_s16(tmp2, tmp5);
  col5 = vsubq_s16(tmp2, tmp5);
  col4 = vaddq_s16(tmp3, tmp4);
  col3 = vsubq_s16(tmp3, tmp4);

  /* Scale down by a factor of 8, narrowing to 8-bit. */
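  /* The shift of PASS1_BITS + 3 = 5 removes both the 2 fractional bits
   * carried by the IFAST dequantization multipliers (IFAST_SCALE_BITS ==
   * PASS1_BITS; see jidctfst.c) and the factor of 8 inherent in the 2-D
   * IDCT.
   */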
  int8x16_t cols_01_s8 = vcombine_s8(vqshrn_n_s16(col0, PASS1_BITS + 3),
                                     vqshrn_n_s16(col1, PASS1_BITS + 3));
  int8x16_t cols_45_s8 = vcombine_s8(vqshrn_n_s16(col4, PASS1_BITS + 3),
                                     vqshrn_n_s16(col5, PASS1_BITS + 3));
  int8x16_t cols_23_s8 = vcombine_s8(vqshrn_n_s16(col2, PASS1_BITS + 3),
                                     vqshrn_n_s16(col3, PASS1_BITS + 3));
  int8x16_t cols_67_s8 = vcombine_s8(vqshrn_n_s16(col6, PASS1_BITS + 3),
                                     vqshrn_n_s16(col7, PASS1_BITS + 3));
  /* Clamp to range [0-255]. */
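  /* vqshrn_n_s16() above already saturated each value to [-128, 127], so a
   * wrapping add of CENTERJSAMPLE (128) in the signed domain, reinterpreted
   * as unsigned, maps the results onto [0, 255] exactly.
   */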
  uint8x16_t cols_01 =
    vreinterpretq_u8_s8
      (vaddq_s8(cols_01_s8, vreinterpretq_s8_u8(vdupq_n_u8(CENTERJSAMPLE))));
  uint8x16_t cols_45 =
    vreinterpretq_u8_s8
      (vaddq_s8(cols_45_s8, vreinterpretq_s8_u8(vdupq_n_u8(CENTERJSAMPLE))));
  uint8x16_t cols_23 =
    vreinterpretq_u8_s8
      (vaddq_s8(cols_23_s8, vreinterpretq_s8_u8(vdupq_n_u8(CENTERJSAMPLE))));
  uint8x16_t cols_67 =
    vreinterpretq_u8_s8
      (vaddq_s8(cols_67_s8, vreinterpretq_s8_u8(vdupq_n_u8(CENTERJSAMPLE))));

  /* Transpose block to prepare for store. */
  uint32x4x2_t cols_0415 = vzipq_u32(vreinterpretq_u32_u8(cols_01),
                                     vreinterpretq_u32_u8(cols_45));
  uint32x4x2_t cols_2637 = vzipq_u32(vreinterpretq_u32_u8(cols_23),
                                     vreinterpretq_u32_u8(cols_67));

  uint8x16x2_t cols_0145 = vtrnq_u8(vreinterpretq_u8_u32(cols_0415.val[0]),
                                    vreinterpretq_u8_u32(cols_0415.val[1]));
  uint8x16x2_t cols_2367 = vtrnq_u8(vreinterpretq_u8_u32(cols_2637.val[0]),
                                    vreinterpretq_u8_u32(cols_2637.val[1]));
  uint16x8x2_t rows_0426 = vtrnq_u16(vreinterpretq_u16_u8(cols_0145.val[0]),
                                     vreinterpretq_u16_u8(cols_2367.val[0]));
  uint16x8x2_t rows_1537 = vtrnq_u16(vreinterpretq_u16_u8(cols_0145.val[1]),
                                     vreinterpretq_u16_u8(cols_2367.val[1]));

  uint8x16_t rows_04 = vreinterpretq_u8_u16(rows_0426.val[0]);
  uint8x16_t rows_15 = vreinterpretq_u8_u16(rows_1537.val[0]);
  uint8x16_t rows_26 = vreinterpretq_u8_u16(rows_0426.val[1]);
  uint8x16_t rows_37 = vreinterpretq_u8_u16(rows_1537.val[1]);

  JSAMPROW outptr0 = output_buf[0] + output_col;
  JSAMPROW outptr1 = output_buf[1] + output_col;
  JSAMPROW outptr2 = output_buf[2] + output_col;
  JSAMPROW outptr3 = output_buf[3] + output_col;
  JSAMPROW outptr4 = output_buf[4] + output_col;
  JSAMPROW outptr5 = output_buf[5] + output_col;
  JSAMPROW outptr6 = output_buf[6] + output_col;
  JSAMPROW outptr7 = output_buf[7] + output_col;

  /* Store the reconstructed sample block to memory. */
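  /* Each rows_NM vector holds row N in its low 8 bytes and row M = N + 4 in
   * its high 8 bytes, so lanes 0 and 1 of its 64-bit view store two output
   * rows each.
   */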
  vst1q_lane_u64((uint64_t *)outptr0, vreinterpretq_u64_u8(rows_04), 0);
  vst1q_lane_u64((uint64_t *)outptr1, vreinterpretq_u64_u8(rows_15), 0);
  vst1q_lane_u64((uint64_t *)outptr2, vreinterpretq_u64_u8(rows_26), 0);
  vst1q_lane_u64((uint64_t *)outptr3, vreinterpretq_u64_u8(rows_37), 0);
  vst1q_lane_u64((uint64_t *)outptr4, vreinterpretq_u64_u8(rows_04), 1);
  vst1q_lane_u64((uint64_t *)outptr5, vreinterpretq_u64_u8(rows_15), 1);
  vst1q_lane_u64((uint64_t *)outptr6, vreinterpretq_u64_u8(rows_26), 1);
  vst1q_lane_u64((uint64_t *)outptr7, vreinterpretq_u64_u8(rows_37), 1);
}