/*
 * jcphuff-neon.c - prepare data for progressive Huffman encoding (Arm Neon)
 *
 * Copyright (C) 2020-2021, Arm Limited. All Rights Reserved.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty. In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

#define JPEG_INTERNALS
#include "jconfigint.h"
#include "../../jinclude.h"
#include "../../jpeglib.h"
#include "../../jsimd.h"
#include "../../jdct.h"
#include "../../jsimddct.h"
#include "../jsimd.h"
#include "neon-compat.h"

#include <arm_neon.h>


/* Data preparation for encode_mcu_AC_first().
 *
 * The equivalent scalar C function (encode_mcu_AC_first_prepare()) can be
 * found in jcphuff.c.
 */

void jsimd_encode_mcu_AC_first_prepare_neon
  (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
   JCOEF *values, size_t *zerobits)
{
  JCOEF *values_ptr = values;
  JCOEF *diff_values_ptr = values + DCTSIZE2;
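  /* Layout of the output buffer: values[0..DCTSIZE2-1] receives the
   * point-transformed coefficient magnitudes, and values[DCTSIZE2..
   * 2*DCTSIZE2-1] receives the corresponding diff values.
   */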

  /* Rows of coefficients to zero (since they haven't been processed) */
  int i, rows_to_zero = 8;

  for (i = 0; i < Sl / 16; i++) {
    int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
    int16x8_t coefs2 = vld1q_dup_s16(block + jpeg_natural_order_start[8]);
    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[15], coefs2, 7);

    /* Isolate sign of coefficients. */
    int16x8_t sign_coefs1 = vshrq_n_s16(coefs1, 15);
    int16x8_t sign_coefs2 = vshrq_n_s16(coefs2, 15);
    /* Compute absolute value of coefficients and apply point transform Al. */
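    /* (vshlq_s16() with a negative per-element shift count shifts right, so
     * shifting by -Al divides each magnitude by 2^Al.)
     */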
    int16x8_t abs_coefs1 = vabsq_s16(coefs1);
    int16x8_t abs_coefs2 = vabsq_s16(coefs2);
    coefs1 = vshlq_s16(abs_coefs1, vdupq_n_s16(-Al));
    coefs2 = vshlq_s16(abs_coefs2, vdupq_n_s16(-Al));

    /* Compute diff values. */
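    /* (XORing with the sign mask leaves positive values unchanged and yields
     * the bitwise complement of the magnitude for negative coefficients,
     * which is the form in which their bits are emitted.)
     */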
    int16x8_t diff1 = veorq_s16(coefs1, sign_coefs1);
    int16x8_t diff2 = veorq_s16(coefs2, sign_coefs2);

    /* Store transformed coefficients and diff values. */
    vst1q_s16(values_ptr, coefs1);
    vst1q_s16(values_ptr + DCTSIZE, coefs2);
    vst1q_s16(diff_values_ptr, diff1);
    vst1q_s16(diff_values_ptr + DCTSIZE, diff2);
    values_ptr += 16;
    diff_values_ptr += 16;
    jpeg_natural_order_start += 16;
    rows_to_zero -= 2;
  }

  /* Same operation but for remaining partial vector */
  int remaining_coefs = Sl % 16;
  if (remaining_coefs > 8) {
    int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
    int16x8_t coefs2 = vdupq_n_s16(0);
    switch (remaining_coefs) {
    case 15:
      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
      FALLTHROUGH /*FALLTHROUGH*/
    case 14:
      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
      FALLTHROUGH /*FALLTHROUGH*/
    case 13:
      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
      FALLTHROUGH /*FALLTHROUGH*/
    case 12:
      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
      FALLTHROUGH /*FALLTHROUGH*/
    case 11:
      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
      FALLTHROUGH /*FALLTHROUGH*/
    case 10:
      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
      FALLTHROUGH /*FALLTHROUGH*/
    case 9:
      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[8], coefs2, 0);
      FALLTHROUGH /*FALLTHROUGH*/
    default:
      break;
    }

    /* Isolate sign of coefficients. */
    int16x8_t sign_coefs1 = vshrq_n_s16(coefs1, 15);
    int16x8_t sign_coefs2 = vshrq_n_s16(coefs2, 15);
    /* Compute absolute value of coefficients and apply point transform Al. */
    int16x8_t abs_coefs1 = vabsq_s16(coefs1);
    int16x8_t abs_coefs2 = vabsq_s16(coefs2);
    coefs1 = vshlq_s16(abs_coefs1, vdupq_n_s16(-Al));
    coefs2 = vshlq_s16(abs_coefs2, vdupq_n_s16(-Al));

    /* Compute diff values. */
    int16x8_t diff1 = veorq_s16(coefs1, sign_coefs1);
    int16x8_t diff2 = veorq_s16(coefs2, sign_coefs2);

    /* Store transformed coefficients and diff values. */
    vst1q_s16(values_ptr, coefs1);
    vst1q_s16(values_ptr + DCTSIZE, coefs2);
    vst1q_s16(diff_values_ptr, diff1);
    vst1q_s16(diff_values_ptr + DCTSIZE, diff2);
    values_ptr += 16;
    diff_values_ptr += 16;
    rows_to_zero -= 2;

  } else if (remaining_coefs > 0) {
    int16x8_t coefs = vdupq_n_s16(0);

    switch (remaining_coefs) {
    case 8:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs, 7);
      FALLTHROUGH /*FALLTHROUGH*/
    case 7:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs, 6);
      FALLTHROUGH /*FALLTHROUGH*/
    case 6:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs, 5);
      FALLTHROUGH /*FALLTHROUGH*/
    case 5:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs, 4);
      FALLTHROUGH /*FALLTHROUGH*/
    case 4:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs, 3);
      FALLTHROUGH /*FALLTHROUGH*/
    case 3:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs, 2);
      FALLTHROUGH /*FALLTHROUGH*/
    case 2:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs, 1);
      FALLTHROUGH /*FALLTHROUGH*/
    case 1:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[0], coefs, 0);
      FALLTHROUGH /*FALLTHROUGH*/
    default:
      break;
    }

    /* Isolate sign of coefficients. */
    int16x8_t sign_coefs = vshrq_n_s16(coefs, 15);
    /* Compute absolute value of coefficients and apply point transform Al. */
    int16x8_t abs_coefs = vabsq_s16(coefs);
    coefs = vshlq_s16(abs_coefs, vdupq_n_s16(-Al));

    /* Compute diff values. */
    int16x8_t diff = veorq_s16(coefs, sign_coefs);

    /* Store transformed coefficients and diff values. */
    vst1q_s16(values_ptr, coefs);
    vst1q_s16(diff_values_ptr, diff);
    values_ptr += 8;
    diff_values_ptr += 8;
    rows_to_zero--;
  }

  /* Zero remaining memory in the values and diff_values blocks. */
  for (i = 0; i < rows_to_zero; i++) {
    vst1q_s16(values_ptr, vdupq_n_s16(0));
    vst1q_s16(diff_values_ptr, vdupq_n_s16(0));
    values_ptr += 8;
    diff_values_ptr += 8;
  }

  /* Construct zerobits bitmap. A set bit means that the corresponding
   * coefficient != 0.
   */
  int16x8_t row0 = vld1q_s16(values + 0 * DCTSIZE);
  int16x8_t row1 = vld1q_s16(values + 1 * DCTSIZE);
  int16x8_t row2 = vld1q_s16(values + 2 * DCTSIZE);
  int16x8_t row3 = vld1q_s16(values + 3 * DCTSIZE);
  int16x8_t row4 = vld1q_s16(values + 4 * DCTSIZE);
  int16x8_t row5 = vld1q_s16(values + 5 * DCTSIZE);
  int16x8_t row6 = vld1q_s16(values + 6 * DCTSIZE);
  int16x8_t row7 = vld1q_s16(values + 7 * DCTSIZE);

  uint8x8_t row0_eq0 = vmovn_u16(vceqq_s16(row0, vdupq_n_s16(0)));
  uint8x8_t row1_eq0 = vmovn_u16(vceqq_s16(row1, vdupq_n_s16(0)));
  uint8x8_t row2_eq0 = vmovn_u16(vceqq_s16(row2, vdupq_n_s16(0)));
  uint8x8_t row3_eq0 = vmovn_u16(vceqq_s16(row3, vdupq_n_s16(0)));
  uint8x8_t row4_eq0 = vmovn_u16(vceqq_s16(row4, vdupq_n_s16(0)));
  uint8x8_t row5_eq0 = vmovn_u16(vceqq_s16(row5, vdupq_n_s16(0)));
  uint8x8_t row6_eq0 = vmovn_u16(vceqq_s16(row6, vdupq_n_s16(0)));
  uint8x8_t row7_eq0 = vmovn_u16(vceqq_s16(row7, vdupq_n_s16(0)));

  /* { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 } */
  const uint8x8_t bitmap_mask =
    vreinterpret_u8_u64(vmov_n_u64(0x8040201008040201));

  row0_eq0 = vand_u8(row0_eq0, bitmap_mask);
  row1_eq0 = vand_u8(row1_eq0, bitmap_mask);
  row2_eq0 = vand_u8(row2_eq0, bitmap_mask);
  row3_eq0 = vand_u8(row3_eq0, bitmap_mask);
  row4_eq0 = vand_u8(row4_eq0, bitmap_mask);
  row5_eq0 = vand_u8(row5_eq0, bitmap_mask);
  row6_eq0 = vand_u8(row6_eq0, bitmap_mask);
  row7_eq0 = vand_u8(row7_eq0, bitmap_mask);

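  /* Each byte now holds at most one set bit, so three rounds of pairwise
   * addition merge the per-row flags into a single 64-bit bitmap with one bit
   * per coefficient (coefficient 0 in the least significant bit).
   */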
  uint8x8_t bitmap_rows_01 = vpadd_u8(row0_eq0, row1_eq0);
  uint8x8_t bitmap_rows_23 = vpadd_u8(row2_eq0, row3_eq0);
  uint8x8_t bitmap_rows_45 = vpadd_u8(row4_eq0, row5_eq0);
  uint8x8_t bitmap_rows_67 = vpadd_u8(row6_eq0, row7_eq0);
  uint8x8_t bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
  uint8x8_t bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
  uint8x8_t bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);

#if defined(__aarch64__) || defined(_M_ARM64)
  /* Move bitmap to a 64-bit scalar register. */
  uint64_t bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
  /* Store zerobits bitmap. */
  *zerobits = ~bitmap;
#else
  /* Move bitmap to two 32-bit scalar registers. */
  uint32_t bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
  uint32_t bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);
  /* Store zerobits bitmap. */
  zerobits[0] = ~bitmap0;
  zerobits[1] = ~bitmap1;
#endif
}


/* Data preparation for encode_mcu_AC_refine().
 *
 * The equivalent scalar C function (encode_mcu_AC_refine_prepare()) can be
 * found in jcphuff.c.
 */

int jsimd_encode_mcu_AC_refine_prepare_neon
  (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
   JCOEF *absvalues, size_t *bits)
{
  /* Temporary storage buffers for data used to compute the signbits bitmap
   * and the end-of-block (EOB) position
   */
  uint8_t coef_sign_bits[64];
  uint8_t coef_eq1_bits[64];

  JCOEF *absvalues_ptr = absvalues;
  uint8_t *coef_sign_bits_ptr = coef_sign_bits;
  uint8_t *eq1_bits_ptr = coef_eq1_bits;

  /* Rows of coefficients to zero (since they haven't been processed) */
  int i, rows_to_zero = 8;

  for (i = 0; i < Sl / 16; i++) {
    int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
    int16x8_t coefs2 = vld1q_dup_s16(block + jpeg_natural_order_start[8]);
    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[15], coefs2, 7);

    /* Compute and store data for signbits bitmap. */
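    /* (An arithmetic right shift by 15 replicates the sign bit, so after
     * narrowing each byte is 0xFF for a negative coefficient and 0x00
     * otherwise.)
     */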
    uint8x8_t sign_coefs1 =
      vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs1, 15)));
    uint8x8_t sign_coefs2 =
      vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs2, 15)));
    vst1_u8(coef_sign_bits_ptr, sign_coefs1);
    vst1_u8(coef_sign_bits_ptr + DCTSIZE, sign_coefs2);

    /* Compute absolute value of coefficients and apply point transform Al. */
    int16x8_t abs_coefs1 = vabsq_s16(coefs1);
    int16x8_t abs_coefs2 = vabsq_s16(coefs2);
    coefs1 = vshlq_s16(abs_coefs1, vdupq_n_s16(-Al));
    coefs2 = vshlq_s16(abs_coefs2, vdupq_n_s16(-Al));
    vst1q_s16(absvalues_ptr, coefs1);
    vst1q_s16(absvalues_ptr + DCTSIZE, coefs2);

    /* Test whether transformed coefficient values == 1 (used to find EOB
     * position.)
     */
    uint8x8_t coefs_eq11 = vmovn_u16(vceqq_s16(coefs1, vdupq_n_s16(1)));
    uint8x8_t coefs_eq12 = vmovn_u16(vceqq_s16(coefs2, vdupq_n_s16(1)));
    vst1_u8(eq1_bits_ptr, coefs_eq11);
    vst1_u8(eq1_bits_ptr + DCTSIZE, coefs_eq12);

    absvalues_ptr += 16;
    coef_sign_bits_ptr += 16;
    eq1_bits_ptr += 16;
    jpeg_natural_order_start += 16;
    rows_to_zero -= 2;
  }

  /* Same operation but for remaining partial vector */
  int remaining_coefs = Sl % 16;
  if (remaining_coefs > 8) {
    int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
    int16x8_t coefs2 = vdupq_n_s16(0);
    switch (remaining_coefs) {
    case 15:
      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
      FALLTHROUGH /*FALLTHROUGH*/
    case 14:
      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
      FALLTHROUGH /*FALLTHROUGH*/
    case 13:
      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
      FALLTHROUGH /*FALLTHROUGH*/
    case 12:
      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
      FALLTHROUGH /*FALLTHROUGH*/
    case 11:
      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
      FALLTHROUGH /*FALLTHROUGH*/
    case 10:
      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
      FALLTHROUGH /*FALLTHROUGH*/
    case 9:
      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[8], coefs2, 0);
      FALLTHROUGH /*FALLTHROUGH*/
    default:
      break;
    }

    /* Compute and store data for signbits bitmap. */
    uint8x8_t sign_coefs1 =
      vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs1, 15)));
    uint8x8_t sign_coefs2 =
      vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs2, 15)));
    vst1_u8(coef_sign_bits_ptr, sign_coefs1);
    vst1_u8(coef_sign_bits_ptr + DCTSIZE, sign_coefs2);

    /* Compute absolute value of coefficients and apply point transform Al. */
    int16x8_t abs_coefs1 = vabsq_s16(coefs1);
    int16x8_t abs_coefs2 = vabsq_s16(coefs2);
    coefs1 = vshlq_s16(abs_coefs1, vdupq_n_s16(-Al));
    coefs2 = vshlq_s16(abs_coefs2, vdupq_n_s16(-Al));
    vst1q_s16(absvalues_ptr, coefs1);
    vst1q_s16(absvalues_ptr + DCTSIZE, coefs2);

    /* Test whether transformed coefficient values == 1 (used to find EOB
     * position.)
     */
    uint8x8_t coefs_eq11 = vmovn_u16(vceqq_s16(coefs1, vdupq_n_s16(1)));
    uint8x8_t coefs_eq12 = vmovn_u16(vceqq_s16(coefs2, vdupq_n_s16(1)));
    vst1_u8(eq1_bits_ptr, coefs_eq11);
    vst1_u8(eq1_bits_ptr + DCTSIZE, coefs_eq12);

    absvalues_ptr += 16;
    coef_sign_bits_ptr += 16;
    eq1_bits_ptr += 16;
    jpeg_natural_order_start += 16;
    rows_to_zero -= 2;

  } else if (remaining_coefs > 0) {
    int16x8_t coefs = vdupq_n_s16(0);

    switch (remaining_coefs) {
    case 8:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs, 7);
      FALLTHROUGH /*FALLTHROUGH*/
    case 7:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs, 6);
      FALLTHROUGH /*FALLTHROUGH*/
    case 6:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs, 5);
      FALLTHROUGH /*FALLTHROUGH*/
    case 5:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs, 4);
      FALLTHROUGH /*FALLTHROUGH*/
    case 4:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs, 3);
      FALLTHROUGH /*FALLTHROUGH*/
    case 3:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs, 2);
      FALLTHROUGH /*FALLTHROUGH*/
    case 2:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs, 1);
      FALLTHROUGH /*FALLTHROUGH*/
    case 1:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[0], coefs, 0);
      FALLTHROUGH /*FALLTHROUGH*/
    default:
      break;
    }

    /* Compute and store data for signbits bitmap. */
    uint8x8_t sign_coefs =
      vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs, 15)));
    vst1_u8(coef_sign_bits_ptr, sign_coefs);

    /* Compute absolute value of coefficients and apply point transform Al. */
    int16x8_t abs_coefs = vabsq_s16(coefs);
    coefs = vshlq_s16(abs_coefs, vdupq_n_s16(-Al));
    vst1q_s16(absvalues_ptr, coefs);

    /* Test whether transformed coefficient values == 1 (used to find EOB
     * position.)
     */
    uint8x8_t coefs_eq1 = vmovn_u16(vceqq_s16(coefs, vdupq_n_s16(1)));
    vst1_u8(eq1_bits_ptr, coefs_eq1);

    absvalues_ptr += 8;
    coef_sign_bits_ptr += 8;
    eq1_bits_ptr += 8;
    rows_to_zero--;
  }

  /* Zero remaining memory in blocks. */
  for (i = 0; i < rows_to_zero; i++) {
    vst1q_s16(absvalues_ptr, vdupq_n_s16(0));
    vst1_u8(coef_sign_bits_ptr, vdup_n_u8(0));
    vst1_u8(eq1_bits_ptr, vdup_n_u8(0));
    absvalues_ptr += 8;
    coef_sign_bits_ptr += 8;
    eq1_bits_ptr += 8;
  }

  /* Construct zerobits bitmap. */
  int16x8_t abs_row0 = vld1q_s16(absvalues + 0 * DCTSIZE);
  int16x8_t abs_row1 = vld1q_s16(absvalues + 1 * DCTSIZE);
  int16x8_t abs_row2 = vld1q_s16(absvalues + 2 * DCTSIZE);
  int16x8_t abs_row3 = vld1q_s16(absvalues + 3 * DCTSIZE);
  int16x8_t abs_row4 = vld1q_s16(absvalues + 4 * DCTSIZE);
  int16x8_t abs_row5 = vld1q_s16(absvalues + 5 * DCTSIZE);
  int16x8_t abs_row6 = vld1q_s16(absvalues + 6 * DCTSIZE);
  int16x8_t abs_row7 = vld1q_s16(absvalues + 7 * DCTSIZE);

  uint8x8_t abs_row0_eq0 = vmovn_u16(vceqq_s16(abs_row0, vdupq_n_s16(0)));
  uint8x8_t abs_row1_eq0 = vmovn_u16(vceqq_s16(abs_row1, vdupq_n_s16(0)));
  uint8x8_t abs_row2_eq0 = vmovn_u16(vceqq_s16(abs_row2, vdupq_n_s16(0)));
  uint8x8_t abs_row3_eq0 = vmovn_u16(vceqq_s16(abs_row3, vdupq_n_s16(0)));
  uint8x8_t abs_row4_eq0 = vmovn_u16(vceqq_s16(abs_row4, vdupq_n_s16(0)));
  uint8x8_t abs_row5_eq0 = vmovn_u16(vceqq_s16(abs_row5, vdupq_n_s16(0)));
  uint8x8_t abs_row6_eq0 = vmovn_u16(vceqq_s16(abs_row6, vdupq_n_s16(0)));
  uint8x8_t abs_row7_eq0 = vmovn_u16(vceqq_s16(abs_row7, vdupq_n_s16(0)));

  /* { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 } */
  const uint8x8_t bitmap_mask =
    vreinterpret_u8_u64(vmov_n_u64(0x8040201008040201));

  abs_row0_eq0 = vand_u8(abs_row0_eq0, bitmap_mask);
  abs_row1_eq0 = vand_u8(abs_row1_eq0, bitmap_mask);
  abs_row2_eq0 = vand_u8(abs_row2_eq0, bitmap_mask);
  abs_row3_eq0 = vand_u8(abs_row3_eq0, bitmap_mask);
  abs_row4_eq0 = vand_u8(abs_row4_eq0, bitmap_mask);
  abs_row5_eq0 = vand_u8(abs_row5_eq0, bitmap_mask);
  abs_row6_eq0 = vand_u8(abs_row6_eq0, bitmap_mask);
  abs_row7_eq0 = vand_u8(abs_row7_eq0, bitmap_mask);

  uint8x8_t bitmap_rows_01 = vpadd_u8(abs_row0_eq0, abs_row1_eq0);
  uint8x8_t bitmap_rows_23 = vpadd_u8(abs_row2_eq0, abs_row3_eq0);
  uint8x8_t bitmap_rows_45 = vpadd_u8(abs_row4_eq0, abs_row5_eq0);
  uint8x8_t bitmap_rows_67 = vpadd_u8(abs_row6_eq0, abs_row7_eq0);
  uint8x8_t bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
  uint8x8_t bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
  uint8x8_t bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);

#if defined(__aarch64__) || defined(_M_ARM64)
  /* Move bitmap to a 64-bit scalar register. */
  uint64_t bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
  /* Store zerobits bitmap. */
  bits[0] = ~bitmap;
#else
  /* Move bitmap to two 32-bit scalar registers. */
  uint32_t bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
  uint32_t bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);
  /* Store zerobits bitmap. */
  bits[0] = ~bitmap0;
  bits[1] = ~bitmap1;
#endif

  /* Construct signbits bitmap. */
  uint8x8_t signbits_row0 = vld1_u8(coef_sign_bits + 0 * DCTSIZE);
  uint8x8_t signbits_row1 = vld1_u8(coef_sign_bits + 1 * DCTSIZE);
  uint8x8_t signbits_row2 = vld1_u8(coef_sign_bits + 2 * DCTSIZE);
  uint8x8_t signbits_row3 = vld1_u8(coef_sign_bits + 3 * DCTSIZE);
  uint8x8_t signbits_row4 = vld1_u8(coef_sign_bits + 4 * DCTSIZE);
  uint8x8_t signbits_row5 = vld1_u8(coef_sign_bits + 5 * DCTSIZE);
  uint8x8_t signbits_row6 = vld1_u8(coef_sign_bits + 6 * DCTSIZE);
  uint8x8_t signbits_row7 = vld1_u8(coef_sign_bits + 7 * DCTSIZE);

  signbits_row0 = vand_u8(signbits_row0, bitmap_mask);
  signbits_row1 = vand_u8(signbits_row1, bitmap_mask);
  signbits_row2 = vand_u8(signbits_row2, bitmap_mask);
  signbits_row3 = vand_u8(signbits_row3, bitmap_mask);
  signbits_row4 = vand_u8(signbits_row4, bitmap_mask);
  signbits_row5 = vand_u8(signbits_row5, bitmap_mask);
  signbits_row6 = vand_u8(signbits_row6, bitmap_mask);
  signbits_row7 = vand_u8(signbits_row7, bitmap_mask);

  bitmap_rows_01 = vpadd_u8(signbits_row0, signbits_row1);
  bitmap_rows_23 = vpadd_u8(signbits_row2, signbits_row3);
  bitmap_rows_45 = vpadd_u8(signbits_row4, signbits_row5);
  bitmap_rows_67 = vpadd_u8(signbits_row6, signbits_row7);
  bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
  bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
  bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);

#if defined(__aarch64__) || defined(_M_ARM64)
  /* Move bitmap to a 64-bit scalar register. */
  bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
  /* Store signbits bitmap. */
  bits[1] = ~bitmap;
#else
  /* Move bitmap to two 32-bit scalar registers. */
  bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
  bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);
  /* Store signbits bitmap. */
  bits[2] = ~bitmap0;
  bits[3] = ~bitmap1;
#endif

  /* Construct bitmap to find EOB position (the index of the last coefficient
   * equal to 1.)
   */
  uint8x8_t row0_eq1 = vld1_u8(coef_eq1_bits + 0 * DCTSIZE);
  uint8x8_t row1_eq1 = vld1_u8(coef_eq1_bits + 1 * DCTSIZE);
  uint8x8_t row2_eq1 = vld1_u8(coef_eq1_bits + 2 * DCTSIZE);
  uint8x8_t row3_eq1 = vld1_u8(coef_eq1_bits + 3 * DCTSIZE);
  uint8x8_t row4_eq1 = vld1_u8(coef_eq1_bits + 4 * DCTSIZE);
  uint8x8_t row5_eq1 = vld1_u8(coef_eq1_bits + 5 * DCTSIZE);
  uint8x8_t row6_eq1 = vld1_u8(coef_eq1_bits + 6 * DCTSIZE);
  uint8x8_t row7_eq1 = vld1_u8(coef_eq1_bits + 7 * DCTSIZE);

  row0_eq1 = vand_u8(row0_eq1, bitmap_mask);
  row1_eq1 = vand_u8(row1_eq1, bitmap_mask);
  row2_eq1 = vand_u8(row2_eq1, bitmap_mask);
  row3_eq1 = vand_u8(row3_eq1, bitmap_mask);
  row4_eq1 = vand_u8(row4_eq1, bitmap_mask);
  row5_eq1 = vand_u8(row5_eq1, bitmap_mask);
  row6_eq1 = vand_u8(row6_eq1, bitmap_mask);
  row7_eq1 = vand_u8(row7_eq1, bitmap_mask);

  bitmap_rows_01 = vpadd_u8(row0_eq1, row1_eq1);
  bitmap_rows_23 = vpadd_u8(row2_eq1, row3_eq1);
  bitmap_rows_45 = vpadd_u8(row4_eq1, row5_eq1);
  bitmap_rows_67 = vpadd_u8(row6_eq1, row7_eq1);
  bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
  bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
  bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);

#if defined(__aarch64__) || defined(_M_ARM64)
  /* Move bitmap to a 64-bit scalar register. */
  bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);

  /* Return EOB position. */
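  /* (63 - CLZ is the index of the most significant set bit, i.e. the position
   * of the last coefficient whose magnitude is 1.)
   */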
  if (bitmap == 0) {
    /* EOB position is defined to be 0 if all coefficients != 1. */
    return 0;
  } else {
    return 63 - BUILTIN_CLZLL(bitmap);
  }
#else
  /* Move bitmap to two 32-bit scalar registers. */
  bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
  bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);

  /* Return EOB position. */
  if (bitmap0 == 0 && bitmap1 == 0) {
    return 0;
  } else if (bitmap1 != 0) {
    return 63 - BUILTIN_CLZ(bitmap1);
  } else {
    return 31 - BUILTIN_CLZ(bitmap0);
  }
#endif
}