/*
 * jcphuff-neon.c - prepare data for progressive Huffman encoding (Arm Neon)
 *
 * Copyright (C) 2020-2021, Arm Limited.  All Rights Reserved.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software.  If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

#define JPEG_INTERNALS
#include "../../jinclude.h"
#include "../../jpeglib.h"
#include "../../jsimd.h"
#include "../../jdct.h"
#include "../../jsimddct.h"
#include "../jsimd.h"
#include "neon-compat.h"

#include <arm_neon.h>

/* Data preparation for encode_mcu_AC_first().
 *
 * The equivalent scalar C function (encode_mcu_AC_first_prepare()) can be
 * found in jcphuff.c.
 */

void jsimd_encode_mcu_AC_first_prepare_neon
  (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
   JCOEF *values, size_t *zerobits)
{
  JCOEF *values_ptr = values;
  JCOEF *diff_values_ptr = values + DCTSIZE2;

  /* Rows of coefficients to zero (since they haven't been processed) */
  int i, rows_to_zero = 8;

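  /* Load, transform, and store 16 coefficients per iteration.  The
   * coefficients are gathered lane by lane because jpeg_natural_order_start[]
   * indexes the block in natural (zigzag) order, so the loads are not
   * contiguous in memory.
   */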
  for (i = 0; i < Sl / 16; i++) {
    int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
    int16x8_t coefs2 = vld1q_dup_s16(block + jpeg_natural_order_start[8]);
    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[15], coefs2, 7);

    /* Isolate sign of coefficients. */
    int16x8_t sign_coefs1 = vshrq_n_s16(coefs1, 15);
    int16x8_t sign_coefs2 = vshrq_n_s16(coefs2, 15);
    /* Compute absolute value of coefficients and apply point transform Al. */
    int16x8_t abs_coefs1 = vabsq_s16(coefs1);
    int16x8_t abs_coefs2 = vabsq_s16(coefs2);
    coefs1 = vshlq_s16(abs_coefs1, vdupq_n_s16(-Al));
    coefs2 = vshlq_s16(abs_coefs2, vdupq_n_s16(-Al));
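    /* (vshlq_s16() with a negative shift count performs a right shift, so the
     * shift by -Al above divides the coefficient magnitudes by 2^Al.)
     */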

    /* Compute diff values. */
    int16x8_t diff1 = veorq_s16(coefs1, sign_coefs1);
    int16x8_t diff2 = veorq_s16(coefs2, sign_coefs2);

    /* Store transformed coefficients and diff values. */
    vst1q_s16(values_ptr, coefs1);
    vst1q_s16(values_ptr + DCTSIZE, coefs2);
    vst1q_s16(diff_values_ptr, diff1);
    vst1q_s16(diff_values_ptr + DCTSIZE, diff2);
    values_ptr += 16;
    diff_values_ptr += 16;
    jpeg_natural_order_start += 16;
    rows_to_zero -= 2;
  }

  /* Same operation but for remaining partial vector */
  int remaining_coefs = Sl % 16;
  if (remaining_coefs > 8) {
    int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
    int16x8_t coefs2 = vdupq_n_s16(0);
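    /* The switch cases below fall through intentionally: entering at
     * remaining_coefs loads lanes 0..(remaining_coefs - 9) of coefs2
     * (coefficients 8..remaining_coefs - 1) and leaves the upper lanes zero.
     */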
    switch (remaining_coefs) {
    case 15:
      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
    case 14:
      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
    case 13:
      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
    case 12:
      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
    case 11:
      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
    case 10:
      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
    case 9:
      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[8], coefs2, 0);
    default:
      break;
    }

    /* Isolate sign of coefficients. */
    int16x8_t sign_coefs1 = vshrq_n_s16(coefs1, 15);
    int16x8_t sign_coefs2 = vshrq_n_s16(coefs2, 15);
    /* Compute absolute value of coefficients and apply point transform Al. */
    int16x8_t abs_coefs1 = vabsq_s16(coefs1);
    int16x8_t abs_coefs2 = vabsq_s16(coefs2);
    coefs1 = vshlq_s16(abs_coefs1, vdupq_n_s16(-Al));
    coefs2 = vshlq_s16(abs_coefs2, vdupq_n_s16(-Al));

    /* Compute diff values. */
    int16x8_t diff1 = veorq_s16(coefs1, sign_coefs1);
    int16x8_t diff2 = veorq_s16(coefs2, sign_coefs2);

    /* Store transformed coefficients and diff values. */
    vst1q_s16(values_ptr, coefs1);
    vst1q_s16(values_ptr + DCTSIZE, coefs2);
    vst1q_s16(diff_values_ptr, diff1);
    vst1q_s16(diff_values_ptr + DCTSIZE, diff2);
    values_ptr += 16;
    diff_values_ptr += 16;
    rows_to_zero -= 2;

  } else if (remaining_coefs > 0) {
    int16x8_t coefs = vdupq_n_s16(0);

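    /* Intentional fall-through: entering the switch at remaining_coefs loads
     * lanes 0..(remaining_coefs - 1) of coefs and leaves the rest zero.
     */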
    switch (remaining_coefs) {
    case 8:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs, 7);
    case 7:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs, 6);
    case 6:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs, 5);
    case 5:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs, 4);
    case 4:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs, 3);
    case 3:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs, 2);
    case 2:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs, 1);
    case 1:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[0], coefs, 0);
    default:
      break;
    }

    /* Isolate sign of coefficients. */
    int16x8_t sign_coefs = vshrq_n_s16(coefs, 15);
    /* Compute absolute value of coefficients and apply point transform Al. */
    int16x8_t abs_coefs = vabsq_s16(coefs);
    coefs = vshlq_s16(abs_coefs, vdupq_n_s16(-Al));

    /* Compute diff values. */
    int16x8_t diff = veorq_s16(coefs, sign_coefs);

    /* Store transformed coefficients and diff values. */
    vst1q_s16(values_ptr, coefs);
    vst1q_s16(diff_values_ptr, diff);
    values_ptr += 8;
    diff_values_ptr += 8;
    rows_to_zero--;
  }

  /* Zero remaining memory in the values and diff_values blocks. */
  for (i = 0; i < rows_to_zero; i++) {
    vst1q_s16(values_ptr, vdupq_n_s16(0));
    vst1q_s16(diff_values_ptr, vdupq_n_s16(0));
    values_ptr += 8;
    diff_values_ptr += 8;
  }

  /* Construct zerobits bitmap.  A set bit means that the corresponding
   * coefficient != 0.
   */
  int16x8_t row0 = vld1q_s16(values + 0 * DCTSIZE);
  int16x8_t row1 = vld1q_s16(values + 1 * DCTSIZE);
  int16x8_t row2 = vld1q_s16(values + 2 * DCTSIZE);
  int16x8_t row3 = vld1q_s16(values + 3 * DCTSIZE);
  int16x8_t row4 = vld1q_s16(values + 4 * DCTSIZE);
  int16x8_t row5 = vld1q_s16(values + 5 * DCTSIZE);
  int16x8_t row6 = vld1q_s16(values + 6 * DCTSIZE);
  int16x8_t row7 = vld1q_s16(values + 7 * DCTSIZE);

  uint8x8_t row0_eq0 = vmovn_u16(vceqq_s16(row0, vdupq_n_s16(0)));
  uint8x8_t row1_eq0 = vmovn_u16(vceqq_s16(row1, vdupq_n_s16(0)));
  uint8x8_t row2_eq0 = vmovn_u16(vceqq_s16(row2, vdupq_n_s16(0)));
  uint8x8_t row3_eq0 = vmovn_u16(vceqq_s16(row3, vdupq_n_s16(0)));
  uint8x8_t row4_eq0 = vmovn_u16(vceqq_s16(row4, vdupq_n_s16(0)));
  uint8x8_t row5_eq0 = vmovn_u16(vceqq_s16(row5, vdupq_n_s16(0)));
  uint8x8_t row6_eq0 = vmovn_u16(vceqq_s16(row6, vdupq_n_s16(0)));
  uint8x8_t row7_eq0 = vmovn_u16(vceqq_s16(row7, vdupq_n_s16(0)));

  /* { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 } */
  const uint8x8_t bitmap_mask =
    vreinterpret_u8_u64(vmov_n_u64(0x8040201008040201));

  row0_eq0 = vand_u8(row0_eq0, bitmap_mask);
  row1_eq0 = vand_u8(row1_eq0, bitmap_mask);
  row2_eq0 = vand_u8(row2_eq0, bitmap_mask);
  row3_eq0 = vand_u8(row3_eq0, bitmap_mask);
  row4_eq0 = vand_u8(row4_eq0, bitmap_mask);
  row5_eq0 = vand_u8(row5_eq0, bitmap_mask);
  row6_eq0 = vand_u8(row6_eq0, bitmap_mask);
  row7_eq0 = vand_u8(row7_eq0, bitmap_mask);

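  /* Collapse the 64 mask bytes into a single 64-bit bitmap with a tree of
   * pairwise adds.  Each byte holds at most one set bit (selected by
   * bitmap_mask), so the additions never carry and act as bitwise ORs.
   */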
  uint8x8_t bitmap_rows_01 = vpadd_u8(row0_eq0, row1_eq0);
  uint8x8_t bitmap_rows_23 = vpadd_u8(row2_eq0, row3_eq0);
  uint8x8_t bitmap_rows_45 = vpadd_u8(row4_eq0, row5_eq0);
  uint8x8_t bitmap_rows_67 = vpadd_u8(row6_eq0, row7_eq0);
  uint8x8_t bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
  uint8x8_t bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
  uint8x8_t bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);

#if defined(__aarch64__) || defined(_M_ARM64)
  /* Move bitmap to a 64-bit scalar register. */
  uint64_t bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
  /* Store zerobits bitmap. */
  *zerobits = ~bitmap;
#else
  /* Move bitmap to two 32-bit scalar registers. */
  uint32_t bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
  uint32_t bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);
  /* Store zerobits bitmap. */
  zerobits[0] = ~bitmap0;
  zerobits[1] = ~bitmap1;
#endif
}


/* Data preparation for encode_mcu_AC_refine().
 *
 * The equivalent scalar C function (encode_mcu_AC_refine_prepare()) can be
 * found in jcphuff.c.
 */

int jsimd_encode_mcu_AC_refine_prepare_neon
  (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
   JCOEF *absvalues, size_t *bits)
{
  /* Temporary storage buffers for data used to compute the signbits bitmap
   * and the end-of-block (EOB) position
   */
  uint8_t coef_sign_bits[64];
  uint8_t coef_eq1_bits[64];

  JCOEF *absvalues_ptr = absvalues;
  uint8_t *coef_sign_bits_ptr = coef_sign_bits;
  uint8_t *eq1_bits_ptr = coef_eq1_bits;

  /* Rows of coefficients to zero (since they haven't been processed) */
  int i, rows_to_zero = 8;

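  /* Process 16 coefficients per iteration, gathered lane by lane in natural
   * (zigzag) order.  For each coefficient, record its sign bit, its absolute
   * value after the point transform, and whether that value equals 1.
   */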
  for (i = 0; i < Sl / 16; i++) {
    int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
    int16x8_t coefs2 = vld1q_dup_s16(block + jpeg_natural_order_start[8]);
    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[15], coefs2, 7);

    /* Compute and store data for signbits bitmap. */
    uint8x8_t sign_coefs1 =
      vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs1, 15)));
    uint8x8_t sign_coefs2 =
      vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs2, 15)));
    vst1_u8(coef_sign_bits_ptr, sign_coefs1);
    vst1_u8(coef_sign_bits_ptr + DCTSIZE, sign_coefs2);

    /* Compute absolute value of coefficients and apply point transform Al. */
    int16x8_t abs_coefs1 = vabsq_s16(coefs1);
    int16x8_t abs_coefs2 = vabsq_s16(coefs2);
    coefs1 = vshlq_s16(abs_coefs1, vdupq_n_s16(-Al));
    coefs2 = vshlq_s16(abs_coefs2, vdupq_n_s16(-Al));
    vst1q_s16(absvalues_ptr, coefs1);
    vst1q_s16(absvalues_ptr + DCTSIZE, coefs2);

    /* Test whether transformed coefficient values == 1 (used to find EOB
     * position.)
     */
    uint8x8_t coefs_eq11 = vmovn_u16(vceqq_s16(coefs1, vdupq_n_s16(1)));
    uint8x8_t coefs_eq12 = vmovn_u16(vceqq_s16(coefs2, vdupq_n_s16(1)));
    vst1_u8(eq1_bits_ptr, coefs_eq11);
    vst1_u8(eq1_bits_ptr + DCTSIZE, coefs_eq12);

    absvalues_ptr += 16;
    coef_sign_bits_ptr += 16;
    eq1_bits_ptr += 16;
    jpeg_natural_order_start += 16;
    rows_to_zero -= 2;
  }

  /* Same operation but for remaining partial vector */
  int remaining_coefs = Sl % 16;
  if (remaining_coefs > 8) {
    int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
    int16x8_t coefs2 = vdupq_n_s16(0);
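    /* Intentional fall-through (as above): entering at remaining_coefs loads
     * lanes 0..(remaining_coefs - 9) of coefs2 and leaves the upper lanes
     * zero.
     */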
    switch (remaining_coefs) {
    case 15:
      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
    case 14:
      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
    case 13:
      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
    case 12:
      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
    case 11:
      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
    case 10:
      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
    case 9:
      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[8], coefs2, 0);
    default:
      break;
    }

    /* Compute and store data for signbits bitmap. */
    uint8x8_t sign_coefs1 =
      vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs1, 15)));
    uint8x8_t sign_coefs2 =
      vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs2, 15)));
    vst1_u8(coef_sign_bits_ptr, sign_coefs1);
    vst1_u8(coef_sign_bits_ptr + DCTSIZE, sign_coefs2);

    /* Compute absolute value of coefficients and apply point transform Al. */
    int16x8_t abs_coefs1 = vabsq_s16(coefs1);
    int16x8_t abs_coefs2 = vabsq_s16(coefs2);
    coefs1 = vshlq_s16(abs_coefs1, vdupq_n_s16(-Al));
    coefs2 = vshlq_s16(abs_coefs2, vdupq_n_s16(-Al));
    vst1q_s16(absvalues_ptr, coefs1);
    vst1q_s16(absvalues_ptr + DCTSIZE, coefs2);

    /* Test whether transformed coefficient values == 1 (used to find EOB
     * position.)
     */
    uint8x8_t coefs_eq11 = vmovn_u16(vceqq_s16(coefs1, vdupq_n_s16(1)));
    uint8x8_t coefs_eq12 = vmovn_u16(vceqq_s16(coefs2, vdupq_n_s16(1)));
    vst1_u8(eq1_bits_ptr, coefs_eq11);
    vst1_u8(eq1_bits_ptr + DCTSIZE, coefs_eq12);

    absvalues_ptr += 16;
    coef_sign_bits_ptr += 16;
    eq1_bits_ptr += 16;
    jpeg_natural_order_start += 16;
    rows_to_zero -= 2;

  } else if (remaining_coefs > 0) {
    int16x8_t coefs = vdupq_n_s16(0);

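    /* Intentional fall-through: entering at remaining_coefs loads lanes
     * 0..(remaining_coefs - 1) of coefs and leaves the rest zero.
     */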
    switch (remaining_coefs) {
    case 8:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs, 7);
    case 7:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs, 6);
    case 6:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs, 5);
    case 5:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs, 4);
    case 4:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs, 3);
    case 3:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs, 2);
    case 2:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs, 1);
    case 1:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[0], coefs, 0);
    default:
      break;
    }

    /* Compute and store data for signbits bitmap. */
    uint8x8_t sign_coefs =
      vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs, 15)));
    vst1_u8(coef_sign_bits_ptr, sign_coefs);

    /* Compute absolute value of coefficients and apply point transform Al. */
    int16x8_t abs_coefs = vabsq_s16(coefs);
    coefs = vshlq_s16(abs_coefs, vdupq_n_s16(-Al));
    vst1q_s16(absvalues_ptr, coefs);

    /* Test whether transformed coefficient values == 1 (used to find EOB
     * position.)
     */
    uint8x8_t coefs_eq1 = vmovn_u16(vceqq_s16(coefs, vdupq_n_s16(1)));
    vst1_u8(eq1_bits_ptr, coefs_eq1);

    absvalues_ptr += 8;
    coef_sign_bits_ptr += 8;
    eq1_bits_ptr += 8;
    rows_to_zero--;
  }

  /* Zero remaining memory in blocks. */
  for (i = 0; i < rows_to_zero; i++) {
    vst1q_s16(absvalues_ptr, vdupq_n_s16(0));
    vst1_u8(coef_sign_bits_ptr, vdup_n_u8(0));
    vst1_u8(eq1_bits_ptr, vdup_n_u8(0));
    absvalues_ptr += 8;
    coef_sign_bits_ptr += 8;
    eq1_bits_ptr += 8;
  }

  /* Construct zerobits bitmap. */
  int16x8_t abs_row0 = vld1q_s16(absvalues + 0 * DCTSIZE);
  int16x8_t abs_row1 = vld1q_s16(absvalues + 1 * DCTSIZE);
  int16x8_t abs_row2 = vld1q_s16(absvalues + 2 * DCTSIZE);
  int16x8_t abs_row3 = vld1q_s16(absvalues + 3 * DCTSIZE);
  int16x8_t abs_row4 = vld1q_s16(absvalues + 4 * DCTSIZE);
  int16x8_t abs_row5 = vld1q_s16(absvalues + 5 * DCTSIZE);
  int16x8_t abs_row6 = vld1q_s16(absvalues + 6 * DCTSIZE);
  int16x8_t abs_row7 = vld1q_s16(absvalues + 7 * DCTSIZE);

  uint8x8_t abs_row0_eq0 = vmovn_u16(vceqq_s16(abs_row0, vdupq_n_s16(0)));
  uint8x8_t abs_row1_eq0 = vmovn_u16(vceqq_s16(abs_row1, vdupq_n_s16(0)));
  uint8x8_t abs_row2_eq0 = vmovn_u16(vceqq_s16(abs_row2, vdupq_n_s16(0)));
  uint8x8_t abs_row3_eq0 = vmovn_u16(vceqq_s16(abs_row3, vdupq_n_s16(0)));
  uint8x8_t abs_row4_eq0 = vmovn_u16(vceqq_s16(abs_row4, vdupq_n_s16(0)));
  uint8x8_t abs_row5_eq0 = vmovn_u16(vceqq_s16(abs_row5, vdupq_n_s16(0)));
  uint8x8_t abs_row6_eq0 = vmovn_u16(vceqq_s16(abs_row6, vdupq_n_s16(0)));
  uint8x8_t abs_row7_eq0 = vmovn_u16(vceqq_s16(abs_row7, vdupq_n_s16(0)));

  /* { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 } */
  const uint8x8_t bitmap_mask =
    vreinterpret_u8_u64(vmov_n_u64(0x8040201008040201));

  abs_row0_eq0 = vand_u8(abs_row0_eq0, bitmap_mask);
  abs_row1_eq0 = vand_u8(abs_row1_eq0, bitmap_mask);
  abs_row2_eq0 = vand_u8(abs_row2_eq0, bitmap_mask);
  abs_row3_eq0 = vand_u8(abs_row3_eq0, bitmap_mask);
  abs_row4_eq0 = vand_u8(abs_row4_eq0, bitmap_mask);
  abs_row5_eq0 = vand_u8(abs_row5_eq0, bitmap_mask);
  abs_row6_eq0 = vand_u8(abs_row6_eq0, bitmap_mask);
  abs_row7_eq0 = vand_u8(abs_row7_eq0, bitmap_mask);

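  /* Collapse the 64 mask bytes into a 64-bit bitmap with a pairwise-add tree,
   * as in jsimd_encode_mcu_AC_first_prepare_neon() above.
   */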
  uint8x8_t bitmap_rows_01 = vpadd_u8(abs_row0_eq0, abs_row1_eq0);
  uint8x8_t bitmap_rows_23 = vpadd_u8(abs_row2_eq0, abs_row3_eq0);
  uint8x8_t bitmap_rows_45 = vpadd_u8(abs_row4_eq0, abs_row5_eq0);
  uint8x8_t bitmap_rows_67 = vpadd_u8(abs_row6_eq0, abs_row7_eq0);
  uint8x8_t bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
  uint8x8_t bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
  uint8x8_t bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);

#if defined(__aarch64__) || defined(_M_ARM64)
  /* Move bitmap to a 64-bit scalar register. */
  uint64_t bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
  /* Store zerobits bitmap. */
  bits[0] = ~bitmap;
#else
  /* Move bitmap to two 32-bit scalar registers. */
  uint32_t bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
  uint32_t bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);
  /* Store zerobits bitmap. */
  bits[0] = ~bitmap0;
  bits[1] = ~bitmap1;
#endif

  /* Construct signbits bitmap. */
  uint8x8_t signbits_row0 = vld1_u8(coef_sign_bits + 0 * DCTSIZE);
  uint8x8_t signbits_row1 = vld1_u8(coef_sign_bits + 1 * DCTSIZE);
  uint8x8_t signbits_row2 = vld1_u8(coef_sign_bits + 2 * DCTSIZE);
  uint8x8_t signbits_row3 = vld1_u8(coef_sign_bits + 3 * DCTSIZE);
  uint8x8_t signbits_row4 = vld1_u8(coef_sign_bits + 4 * DCTSIZE);
  uint8x8_t signbits_row5 = vld1_u8(coef_sign_bits + 5 * DCTSIZE);
  uint8x8_t signbits_row6 = vld1_u8(coef_sign_bits + 6 * DCTSIZE);
  uint8x8_t signbits_row7 = vld1_u8(coef_sign_bits + 7 * DCTSIZE);

  signbits_row0 = vand_u8(signbits_row0, bitmap_mask);
  signbits_row1 = vand_u8(signbits_row1, bitmap_mask);
  signbits_row2 = vand_u8(signbits_row2, bitmap_mask);
  signbits_row3 = vand_u8(signbits_row3, bitmap_mask);
  signbits_row4 = vand_u8(signbits_row4, bitmap_mask);
  signbits_row5 = vand_u8(signbits_row5, bitmap_mask);
  signbits_row6 = vand_u8(signbits_row6, bitmap_mask);
  signbits_row7 = vand_u8(signbits_row7, bitmap_mask);

  bitmap_rows_01 = vpadd_u8(signbits_row0, signbits_row1);
  bitmap_rows_23 = vpadd_u8(signbits_row2, signbits_row3);
  bitmap_rows_45 = vpadd_u8(signbits_row4, signbits_row5);
  bitmap_rows_67 = vpadd_u8(signbits_row6, signbits_row7);
  bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
  bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
  bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);

#if defined(__aarch64__) || defined(_M_ARM64)
  /* Move bitmap to a 64-bit scalar register. */
  bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
  /* Store signbits bitmap. */
  bits[1] = ~bitmap;
#else
  /* Move bitmap to two 32-bit scalar registers. */
  bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
  bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);
  /* Store signbits bitmap. */
  bits[2] = ~bitmap0;
  bits[3] = ~bitmap1;
#endif

  /* Construct bitmap to find EOB position (the index of the last coefficient
   * equal to 1.)
   */
  uint8x8_t row0_eq1 = vld1_u8(coef_eq1_bits + 0 * DCTSIZE);
  uint8x8_t row1_eq1 = vld1_u8(coef_eq1_bits + 1 * DCTSIZE);
  uint8x8_t row2_eq1 = vld1_u8(coef_eq1_bits + 2 * DCTSIZE);
  uint8x8_t row3_eq1 = vld1_u8(coef_eq1_bits + 3 * DCTSIZE);
  uint8x8_t row4_eq1 = vld1_u8(coef_eq1_bits + 4 * DCTSIZE);
  uint8x8_t row5_eq1 = vld1_u8(coef_eq1_bits + 5 * DCTSIZE);
  uint8x8_t row6_eq1 = vld1_u8(coef_eq1_bits + 6 * DCTSIZE);
  uint8x8_t row7_eq1 = vld1_u8(coef_eq1_bits + 7 * DCTSIZE);

  row0_eq1 = vand_u8(row0_eq1, bitmap_mask);
  row1_eq1 = vand_u8(row1_eq1, bitmap_mask);
  row2_eq1 = vand_u8(row2_eq1, bitmap_mask);
  row3_eq1 = vand_u8(row3_eq1, bitmap_mask);
  row4_eq1 = vand_u8(row4_eq1, bitmap_mask);
  row5_eq1 = vand_u8(row5_eq1, bitmap_mask);
  row6_eq1 = vand_u8(row6_eq1, bitmap_mask);
  row7_eq1 = vand_u8(row7_eq1, bitmap_mask);

  bitmap_rows_01 = vpadd_u8(row0_eq1, row1_eq1);
  bitmap_rows_23 = vpadd_u8(row2_eq1, row3_eq1);
  bitmap_rows_45 = vpadd_u8(row4_eq1, row5_eq1);
  bitmap_rows_67 = vpadd_u8(row6_eq1, row7_eq1);
  bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
  bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
  bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);

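  /* The EOB position is the index of the highest set bit in this bitmap,
   * i.e. 63 minus the number of leading zeros.
   */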
#if defined(__aarch64__) || defined(_M_ARM64)
  /* Move bitmap to a 64-bit scalar register. */
  bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);

  /* Return EOB position. */
  if (bitmap == 0) {
    /* EOB position is defined to be 0 if all coefficients != 1. */
    return 0;
  } else {
    return 63 - BUILTIN_CLZLL(bitmap);
  }
#else
  /* Move bitmap to two 32-bit scalar registers. */
  bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
  bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);

  /* Return EOB position. */
  if (bitmap0 == 0 && bitmap1 == 0) {
    return 0;
  } else if (bitmap1 != 0) {
    return 63 - BUILTIN_CLZ(bitmap1);
  } else {
    return 31 - BUILTIN_CLZ(bitmap0);
  }
#endif
}