/*
 * jcphuff-neon.c - prepare data for progressive Huffman encoding (Arm Neon)
 *
 * Copyright (C) 2020-2021, Arm Limited.  All Rights Reserved.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

#define JPEG_INTERNALS
#include "../../jinclude.h"
#include "../../jpeglib.h"
#include "../../jsimd.h"
#include "../../jdct.h"
#include "../../jsimddct.h"
#include "../jsimd.h"
#include "neon-compat.h"

#include <arm_neon.h>


/* Data preparation for encode_mcu_AC_first().
 *
 * The equivalent scalar C function (encode_mcu_AC_first_prepare()) can be
 * found in jcphuff.c.
 */

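/* This function gathers the Sl AC coefficients addressed by
 * jpeg_natural_order_start[0..Sl-1], applies the point transform (a right
 * shift by Al) to their absolute values, and writes the results to
 * values[0..DCTSIZE2-1].  The corresponding "diff" values -- the bitwise
 * complement of the shifted absolute value for negative coefficients, the
 * value itself otherwise -- are written to values[DCTSIZE2..2*DCTSIZE2-1].
 * Finally, a zerobits bitmap is stored in which bit k is set if and only if
 * the point-transformed coefficient at position k is non-zero.
 */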
void jsimd_encode_mcu_AC_first_prepare_neon
  (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
   JCOEF *values, size_t *zerobits)
{
  JCOEF *values_ptr = values;
  JCOEF *diff_values_ptr = values + DCTSIZE2;

  /* Rows of coefficients to zero (since they haven't been processed) */
  int i, rows_to_zero = 8;

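  /* Process 16 coefficients per iteration.  The lane loads below gather the
   * coefficients from the block in zigzag scan order, using the
   * jpeg_natural_order_start lookup table.
   */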
  for (i = 0; i < Sl / 16; i++) {
    int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
    int16x8_t coefs2 = vld1q_dup_s16(block + jpeg_natural_order_start[8]);
    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[15], coefs2, 7);

    /* Isolate sign of coefficients. */
    int16x8_t sign_coefs1 = vshrq_n_s16(coefs1, 15);
    int16x8_t sign_coefs2 = vshrq_n_s16(coefs2, 15);
    /* Compute absolute value of coefficients and apply point transform Al. */
    int16x8_t abs_coefs1 = vabsq_s16(coefs1);
    int16x8_t abs_coefs2 = vabsq_s16(coefs2);
    coefs1 = vshlq_s16(abs_coefs1, vdupq_n_s16(-Al));
    coefs2 = vshlq_s16(abs_coefs2, vdupq_n_s16(-Al));

    /* Compute diff values: XORing with the all-ones sign mask yields the
     * bitwise complement of the shifted absolute value for negative
     * coefficients and leaves non-negative coefficients unchanged.
     */
    int16x8_t diff1 = veorq_s16(coefs1, sign_coefs1);
    int16x8_t diff2 = veorq_s16(coefs2, sign_coefs2);

    /* Store transformed coefficients and diff values. */
    vst1q_s16(values_ptr, coefs1);
    vst1q_s16(values_ptr + DCTSIZE, coefs2);
    vst1q_s16(diff_values_ptr, diff1);
    vst1q_s16(diff_values_ptr + DCTSIZE, diff2);
    values_ptr += 16;
    diff_values_ptr += 16;
    jpeg_natural_order_start += 16;
    rows_to_zero -= 2;
  }

  /* Same operation but for remaining partial vector */
  int remaining_coefs = Sl % 16;
  if (remaining_coefs > 8) {
    int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
    int16x8_t coefs2 = vdupq_n_s16(0);
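    /* The cases in this switch statement intentionally fall through, so that
     * all remaining coefficients (lanes 8 through remaining_coefs - 1) are
     * loaded.
     */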
    switch (remaining_coefs) {
    case 15:
      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
    case 14:
      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
    case 13:
      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
    case 12:
      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
    case 11:
      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
    case 10:
      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
    case 9:
      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[8], coefs2, 0);
    default:
      break;
    }

    /* Isolate sign of coefficients. */
    int16x8_t sign_coefs1 = vshrq_n_s16(coefs1, 15);
    int16x8_t sign_coefs2 = vshrq_n_s16(coefs2, 15);
    /* Compute absolute value of coefficients and apply point transform Al. */
    int16x8_t abs_coefs1 = vabsq_s16(coefs1);
    int16x8_t abs_coefs2 = vabsq_s16(coefs2);
    coefs1 = vshlq_s16(abs_coefs1, vdupq_n_s16(-Al));
    coefs2 = vshlq_s16(abs_coefs2, vdupq_n_s16(-Al));

    /* Compute diff values. */
    int16x8_t diff1 = veorq_s16(coefs1, sign_coefs1);
    int16x8_t diff2 = veorq_s16(coefs2, sign_coefs2);

    /* Store transformed coefficients and diff values. */
    vst1q_s16(values_ptr, coefs1);
    vst1q_s16(values_ptr + DCTSIZE, coefs2);
    vst1q_s16(diff_values_ptr, diff1);
    vst1q_s16(diff_values_ptr + DCTSIZE, diff2);
    values_ptr += 16;
    diff_values_ptr += 16;
    rows_to_zero -= 2;

  } else if (remaining_coefs > 0) {
    int16x8_t coefs = vdupq_n_s16(0);

    switch (remaining_coefs) {
    case 8:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs, 7);
    case 7:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs, 6);
    case 6:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs, 5);
    case 5:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs, 4);
    case 4:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs, 3);
    case 3:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs, 2);
    case 2:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs, 1);
    case 1:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[0], coefs, 0);
    default:
      break;
    }

    /* Isolate sign of coefficients. */
    int16x8_t sign_coefs = vshrq_n_s16(coefs, 15);
    /* Compute absolute value of coefficients and apply point transform Al. */
    int16x8_t abs_coefs = vabsq_s16(coefs);
    coefs = vshlq_s16(abs_coefs, vdupq_n_s16(-Al));

    /* Compute diff values. */
    int16x8_t diff = veorq_s16(coefs, sign_coefs);

    /* Store transformed coefficients and diff values. */
    vst1q_s16(values_ptr, coefs);
    vst1q_s16(diff_values_ptr, diff);
    values_ptr += 8;
    diff_values_ptr += 8;
    rows_to_zero--;
  }

  /* Zero remaining memory in the values and diff_values blocks. */
  for (i = 0; i < rows_to_zero; i++) {
    vst1q_s16(values_ptr, vdupq_n_s16(0));
    vst1q_s16(diff_values_ptr, vdupq_n_s16(0));
    values_ptr += 8;
    diff_values_ptr += 8;
  }

  /* Construct zerobits bitmap.  A set bit means that the corresponding
   * coefficient != 0.
   */
  int16x8_t row0 = vld1q_s16(values + 0 * DCTSIZE);
  int16x8_t row1 = vld1q_s16(values + 1 * DCTSIZE);
  int16x8_t row2 = vld1q_s16(values + 2 * DCTSIZE);
  int16x8_t row3 = vld1q_s16(values + 3 * DCTSIZE);
  int16x8_t row4 = vld1q_s16(values + 4 * DCTSIZE);
  int16x8_t row5 = vld1q_s16(values + 5 * DCTSIZE);
  int16x8_t row6 = vld1q_s16(values + 6 * DCTSIZE);
  int16x8_t row7 = vld1q_s16(values + 7 * DCTSIZE);

  uint8x8_t row0_eq0 = vmovn_u16(vceqq_s16(row0, vdupq_n_s16(0)));
  uint8x8_t row1_eq0 = vmovn_u16(vceqq_s16(row1, vdupq_n_s16(0)));
  uint8x8_t row2_eq0 = vmovn_u16(vceqq_s16(row2, vdupq_n_s16(0)));
  uint8x8_t row3_eq0 = vmovn_u16(vceqq_s16(row3, vdupq_n_s16(0)));
  uint8x8_t row4_eq0 = vmovn_u16(vceqq_s16(row4, vdupq_n_s16(0)));
  uint8x8_t row5_eq0 = vmovn_u16(vceqq_s16(row5, vdupq_n_s16(0)));
  uint8x8_t row6_eq0 = vmovn_u16(vceqq_s16(row6, vdupq_n_s16(0)));
  uint8x8_t row7_eq0 = vmovn_u16(vceqq_s16(row7, vdupq_n_s16(0)));

  /* { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 } */
  const uint8x8_t bitmap_mask =
    vreinterpret_u8_u64(vmov_n_u64(0x8040201008040201));

  row0_eq0 = vand_u8(row0_eq0, bitmap_mask);
  row1_eq0 = vand_u8(row1_eq0, bitmap_mask);
  row2_eq0 = vand_u8(row2_eq0, bitmap_mask);
  row3_eq0 = vand_u8(row3_eq0, bitmap_mask);
  row4_eq0 = vand_u8(row4_eq0, bitmap_mask);
  row5_eq0 = vand_u8(row5_eq0, bitmap_mask);
  row6_eq0 = vand_u8(row6_eq0, bitmap_mask);
  row7_eq0 = vand_u8(row7_eq0, bitmap_mask);

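  /* Collapse the eight per-row mask vectors into a single 64-bit bitmap.
   * Within each masked row, the eight lanes hold disjoint single-bit values,
   * so three rounds of pairwise addition sum each row's lanes into one byte.
   * Byte k of bitmap_all then holds the bits for row k, with bit i
   * corresponding to column i.
   */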
  uint8x8_t bitmap_rows_01 = vpadd_u8(row0_eq0, row1_eq0);
  uint8x8_t bitmap_rows_23 = vpadd_u8(row2_eq0, row3_eq0);
  uint8x8_t bitmap_rows_45 = vpadd_u8(row4_eq0, row5_eq0);
  uint8x8_t bitmap_rows_67 = vpadd_u8(row6_eq0, row7_eq0);
  uint8x8_t bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
  uint8x8_t bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
  uint8x8_t bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);

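  /* The comparisons above tested for equality with zero, so the bitmap is
   * inverted before it is stored: a set bit in zerobits then marks a non-zero
   * coefficient.  On AArch64 the 64 bits fit in one size_t; on 32-bit Arm
   * they are stored as two 32-bit words.
   */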
#if defined(__aarch64__) || defined(_M_ARM64)
  /* Move bitmap to a 64-bit scalar register. */
  uint64_t bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
  /* Store zerobits bitmap. */
  *zerobits = ~bitmap;
#else
  /* Move bitmap to two 32-bit scalar registers. */
  uint32_t bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
  uint32_t bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);
  /* Store zerobits bitmap. */
  zerobits[0] = ~bitmap0;
  zerobits[1] = ~bitmap1;
#endif
}


/* Data preparation for encode_mcu_AC_refine().
 *
 * The equivalent scalar C function (encode_mcu_AC_refine_prepare()) can be
 * found in jcphuff.c.
 */

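/* In addition to the point-transformed absolute values and the zerobits
 * bitmap, this function produces a signbits bitmap (a set bit means the
 * corresponding coefficient is non-negative) and returns the end-of-block
 * (EOB) position: the position, within the Sl coefficients processed, of the
 * last coefficient whose point-transformed absolute value is 1, or 0 if
 * there is no such coefficient.  On AArch64, bits[0] receives zerobits and
 * bits[1] receives signbits; on 32-bit Arm, each bitmap occupies two 32-bit
 * words (bits[0..1] and bits[2..3]).
 */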
int jsimd_encode_mcu_AC_refine_prepare_neon
  (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
   JCOEF *absvalues, size_t *bits)
{
  /* Temporary storage buffers for data used to compute the signbits bitmap and
   * the end-of-block (EOB) position
   */
  uint8_t coef_sign_bits[64];
  uint8_t coef_eq1_bits[64];

  JCOEF *absvalues_ptr = absvalues;
  uint8_t *coef_sign_bits_ptr = coef_sign_bits;
  uint8_t *eq1_bits_ptr = coef_eq1_bits;

  /* Rows of coefficients to zero (since they haven't been processed) */
  int i, rows_to_zero = 8;

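  /* As in jsimd_encode_mcu_AC_first_prepare_neon() above, 16 coefficients are
   * gathered per iteration.  Alongside the point-transformed absolute values,
   * each iteration stores one byte per coefficient to coef_sign_bits[] (0xFF
   * if the coefficient is negative) and to coef_eq1_bits[] (0xFF if its
   * point-transformed absolute value is 1); these bytes are folded into
   * bitmaps after the loop.
   */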
  for (i = 0; i < Sl / 16; i++) {
    int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
    int16x8_t coefs2 = vld1q_dup_s16(block + jpeg_natural_order_start[8]);
    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[15], coefs2, 7);

    /* Compute and store data for signbits bitmap. */
    uint8x8_t sign_coefs1 =
      vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs1, 15)));
    uint8x8_t sign_coefs2 =
      vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs2, 15)));
    vst1_u8(coef_sign_bits_ptr, sign_coefs1);
    vst1_u8(coef_sign_bits_ptr + DCTSIZE, sign_coefs2);

    /* Compute absolute value of coefficients and apply point transform Al. */
    int16x8_t abs_coefs1 = vabsq_s16(coefs1);
    int16x8_t abs_coefs2 = vabsq_s16(coefs2);
    coefs1 = vshlq_s16(abs_coefs1, vdupq_n_s16(-Al));
    coefs2 = vshlq_s16(abs_coefs2, vdupq_n_s16(-Al));
    vst1q_s16(absvalues_ptr, coefs1);
    vst1q_s16(absvalues_ptr + DCTSIZE, coefs2);

    /* Test whether transformed coefficient values == 1 (used to find EOB
     * position.)
     */
    uint8x8_t coefs_eq11 = vmovn_u16(vceqq_s16(coefs1, vdupq_n_s16(1)));
    uint8x8_t coefs_eq12 = vmovn_u16(vceqq_s16(coefs2, vdupq_n_s16(1)));
    vst1_u8(eq1_bits_ptr, coefs_eq11);
    vst1_u8(eq1_bits_ptr + DCTSIZE, coefs_eq12);

    absvalues_ptr += 16;
    coef_sign_bits_ptr += 16;
    eq1_bits_ptr += 16;
    jpeg_natural_order_start += 16;
    rows_to_zero -= 2;
  }

  /* Same operation but for remaining partial vector */
  int remaining_coefs = Sl % 16;
  if (remaining_coefs > 8) {
    int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
    int16x8_t coefs2 = vdupq_n_s16(0);
    switch (remaining_coefs) {
    case 15:
      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
    case 14:
      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
    case 13:
      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
    case 12:
      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
    case 11:
      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
    case 10:
      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
    case 9:
      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[8], coefs2, 0);
    default:
      break;
    }

    /* Compute and store data for signbits bitmap. */
    uint8x8_t sign_coefs1 =
      vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs1, 15)));
    uint8x8_t sign_coefs2 =
      vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs2, 15)));
    vst1_u8(coef_sign_bits_ptr, sign_coefs1);
    vst1_u8(coef_sign_bits_ptr + DCTSIZE, sign_coefs2);

    /* Compute absolute value of coefficients and apply point transform Al. */
    int16x8_t abs_coefs1 = vabsq_s16(coefs1);
    int16x8_t abs_coefs2 = vabsq_s16(coefs2);
    coefs1 = vshlq_s16(abs_coefs1, vdupq_n_s16(-Al));
    coefs2 = vshlq_s16(abs_coefs2, vdupq_n_s16(-Al));
    vst1q_s16(absvalues_ptr, coefs1);
    vst1q_s16(absvalues_ptr + DCTSIZE, coefs2);

    /* Test whether transformed coefficient values == 1 (used to find EOB
     * position.)
     */
    uint8x8_t coefs_eq11 = vmovn_u16(vceqq_s16(coefs1, vdupq_n_s16(1)));
    uint8x8_t coefs_eq12 = vmovn_u16(vceqq_s16(coefs2, vdupq_n_s16(1)));
    vst1_u8(eq1_bits_ptr, coefs_eq11);
    vst1_u8(eq1_bits_ptr + DCTSIZE, coefs_eq12);

    absvalues_ptr += 16;
    coef_sign_bits_ptr += 16;
    eq1_bits_ptr += 16;
    jpeg_natural_order_start += 16;
    rows_to_zero -= 2;

  } else if (remaining_coefs > 0) {
    int16x8_t coefs = vdupq_n_s16(0);

    switch (remaining_coefs) {
    case 8:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs, 7);
    case 7:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs, 6);
    case 6:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs, 5);
    case 5:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs, 4);
    case 4:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs, 3);
    case 3:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs, 2);
    case 2:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs, 1);
    case 1:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[0], coefs, 0);
    default:
      break;
    }

    /* Compute and store data for signbits bitmap. */
    uint8x8_t sign_coefs =
      vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs, 15)));
    vst1_u8(coef_sign_bits_ptr, sign_coefs);

    /* Compute absolute value of coefficients and apply point transform Al. */
    int16x8_t abs_coefs = vabsq_s16(coefs);
    coefs = vshlq_s16(abs_coefs, vdupq_n_s16(-Al));
    vst1q_s16(absvalues_ptr, coefs);

    /* Test whether transformed coefficient values == 1 (used to find EOB
     * position.)
     */
    uint8x8_t coefs_eq1 = vmovn_u16(vceqq_s16(coefs, vdupq_n_s16(1)));
    vst1_u8(eq1_bits_ptr, coefs_eq1);

    absvalues_ptr += 8;
    coef_sign_bits_ptr += 8;
    eq1_bits_ptr += 8;
    rows_to_zero--;
  }

  /* Zero remaining memory in blocks. */
  for (i = 0; i < rows_to_zero; i++) {
    vst1q_s16(absvalues_ptr, vdupq_n_s16(0));
    vst1_u8(coef_sign_bits_ptr, vdup_n_u8(0));
    vst1_u8(eq1_bits_ptr, vdup_n_u8(0));
    absvalues_ptr += 8;
    coef_sign_bits_ptr += 8;
    eq1_bits_ptr += 8;
  }

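  /* The zerobits, signbits, and EOB bitmaps are each assembled with the same
   * mask-and-pairwise-add reduction used in
   * jsimd_encode_mcu_AC_first_prepare_neon() above.
   */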
  /* Construct zerobits bitmap. */
  int16x8_t abs_row0 = vld1q_s16(absvalues + 0 * DCTSIZE);
  int16x8_t abs_row1 = vld1q_s16(absvalues + 1 * DCTSIZE);
  int16x8_t abs_row2 = vld1q_s16(absvalues + 2 * DCTSIZE);
  int16x8_t abs_row3 = vld1q_s16(absvalues + 3 * DCTSIZE);
  int16x8_t abs_row4 = vld1q_s16(absvalues + 4 * DCTSIZE);
  int16x8_t abs_row5 = vld1q_s16(absvalues + 5 * DCTSIZE);
  int16x8_t abs_row6 = vld1q_s16(absvalues + 6 * DCTSIZE);
  int16x8_t abs_row7 = vld1q_s16(absvalues + 7 * DCTSIZE);

  uint8x8_t abs_row0_eq0 = vmovn_u16(vceqq_s16(abs_row0, vdupq_n_s16(0)));
  uint8x8_t abs_row1_eq0 = vmovn_u16(vceqq_s16(abs_row1, vdupq_n_s16(0)));
  uint8x8_t abs_row2_eq0 = vmovn_u16(vceqq_s16(abs_row2, vdupq_n_s16(0)));
  uint8x8_t abs_row3_eq0 = vmovn_u16(vceqq_s16(abs_row3, vdupq_n_s16(0)));
  uint8x8_t abs_row4_eq0 = vmovn_u16(vceqq_s16(abs_row4, vdupq_n_s16(0)));
  uint8x8_t abs_row5_eq0 = vmovn_u16(vceqq_s16(abs_row5, vdupq_n_s16(0)));
  uint8x8_t abs_row6_eq0 = vmovn_u16(vceqq_s16(abs_row6, vdupq_n_s16(0)));
  uint8x8_t abs_row7_eq0 = vmovn_u16(vceqq_s16(abs_row7, vdupq_n_s16(0)));

  /* { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 } */
  const uint8x8_t bitmap_mask =
    vreinterpret_u8_u64(vmov_n_u64(0x8040201008040201));

  abs_row0_eq0 = vand_u8(abs_row0_eq0, bitmap_mask);
  abs_row1_eq0 = vand_u8(abs_row1_eq0, bitmap_mask);
  abs_row2_eq0 = vand_u8(abs_row2_eq0, bitmap_mask);
  abs_row3_eq0 = vand_u8(abs_row3_eq0, bitmap_mask);
  abs_row4_eq0 = vand_u8(abs_row4_eq0, bitmap_mask);
  abs_row5_eq0 = vand_u8(abs_row5_eq0, bitmap_mask);
  abs_row6_eq0 = vand_u8(abs_row6_eq0, bitmap_mask);
  abs_row7_eq0 = vand_u8(abs_row7_eq0, bitmap_mask);

  uint8x8_t bitmap_rows_01 = vpadd_u8(abs_row0_eq0, abs_row1_eq0);
  uint8x8_t bitmap_rows_23 = vpadd_u8(abs_row2_eq0, abs_row3_eq0);
  uint8x8_t bitmap_rows_45 = vpadd_u8(abs_row4_eq0, abs_row5_eq0);
  uint8x8_t bitmap_rows_67 = vpadd_u8(abs_row6_eq0, abs_row7_eq0);
  uint8x8_t bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
  uint8x8_t bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
  uint8x8_t bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);

#if defined(__aarch64__) || defined(_M_ARM64)
  /* Move bitmap to a 64-bit scalar register. */
  uint64_t bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
  /* Store zerobits bitmap. */
  bits[0] = ~bitmap;
#else
  /* Move bitmap to two 32-bit scalar registers. */
  uint32_t bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
  uint32_t bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);
  /* Store zerobits bitmap. */
  bits[0] = ~bitmap0;
  bits[1] = ~bitmap1;
#endif

  /* Construct signbits bitmap. */
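  /* coef_sign_bits[] holds 0xFF for each negative coefficient and 0x00
   * otherwise, so after inversion a set bit in the signbits bitmap marks a
   * non-negative coefficient.
   */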
  uint8x8_t signbits_row0 = vld1_u8(coef_sign_bits + 0 * DCTSIZE);
  uint8x8_t signbits_row1 = vld1_u8(coef_sign_bits + 1 * DCTSIZE);
  uint8x8_t signbits_row2 = vld1_u8(coef_sign_bits + 2 * DCTSIZE);
  uint8x8_t signbits_row3 = vld1_u8(coef_sign_bits + 3 * DCTSIZE);
  uint8x8_t signbits_row4 = vld1_u8(coef_sign_bits + 4 * DCTSIZE);
  uint8x8_t signbits_row5 = vld1_u8(coef_sign_bits + 5 * DCTSIZE);
  uint8x8_t signbits_row6 = vld1_u8(coef_sign_bits + 6 * DCTSIZE);
  uint8x8_t signbits_row7 = vld1_u8(coef_sign_bits + 7 * DCTSIZE);

  signbits_row0 = vand_u8(signbits_row0, bitmap_mask);
  signbits_row1 = vand_u8(signbits_row1, bitmap_mask);
  signbits_row2 = vand_u8(signbits_row2, bitmap_mask);
  signbits_row3 = vand_u8(signbits_row3, bitmap_mask);
  signbits_row4 = vand_u8(signbits_row4, bitmap_mask);
  signbits_row5 = vand_u8(signbits_row5, bitmap_mask);
  signbits_row6 = vand_u8(signbits_row6, bitmap_mask);
  signbits_row7 = vand_u8(signbits_row7, bitmap_mask);

  bitmap_rows_01 = vpadd_u8(signbits_row0, signbits_row1);
  bitmap_rows_23 = vpadd_u8(signbits_row2, signbits_row3);
  bitmap_rows_45 = vpadd_u8(signbits_row4, signbits_row5);
  bitmap_rows_67 = vpadd_u8(signbits_row6, signbits_row7);
  bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
  bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
  bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);

#if defined(__aarch64__) || defined(_M_ARM64)
  /* Move bitmap to a 64-bit scalar register. */
  bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
  /* Store signbits bitmap. */
  bits[1] = ~bitmap;
#else
  /* Move bitmap to two 32-bit scalar registers. */
  bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
  bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);
  /* Store signbits bitmap. */
  bits[2] = ~bitmap0;
  bits[3] = ~bitmap1;
#endif

  /* Construct bitmap to find EOB position (the index of the last coefficient
   * equal to 1.)
   */
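  /* coef_eq1_bits[] holds 0xFF for each coefficient whose point-transformed
   * absolute value is 1, so the highest set bit in this bitmap gives the EOB
   * position.
   */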
  uint8x8_t row0_eq1 = vld1_u8(coef_eq1_bits + 0 * DCTSIZE);
  uint8x8_t row1_eq1 = vld1_u8(coef_eq1_bits + 1 * DCTSIZE);
  uint8x8_t row2_eq1 = vld1_u8(coef_eq1_bits + 2 * DCTSIZE);
  uint8x8_t row3_eq1 = vld1_u8(coef_eq1_bits + 3 * DCTSIZE);
  uint8x8_t row4_eq1 = vld1_u8(coef_eq1_bits + 4 * DCTSIZE);
  uint8x8_t row5_eq1 = vld1_u8(coef_eq1_bits + 5 * DCTSIZE);
  uint8x8_t row6_eq1 = vld1_u8(coef_eq1_bits + 6 * DCTSIZE);
  uint8x8_t row7_eq1 = vld1_u8(coef_eq1_bits + 7 * DCTSIZE);

  row0_eq1 = vand_u8(row0_eq1, bitmap_mask);
  row1_eq1 = vand_u8(row1_eq1, bitmap_mask);
  row2_eq1 = vand_u8(row2_eq1, bitmap_mask);
  row3_eq1 = vand_u8(row3_eq1, bitmap_mask);
  row4_eq1 = vand_u8(row4_eq1, bitmap_mask);
  row5_eq1 = vand_u8(row5_eq1, bitmap_mask);
  row6_eq1 = vand_u8(row6_eq1, bitmap_mask);
  row7_eq1 = vand_u8(row7_eq1, bitmap_mask);

  bitmap_rows_01 = vpadd_u8(row0_eq1, row1_eq1);
  bitmap_rows_23 = vpadd_u8(row2_eq1, row3_eq1);
  bitmap_rows_45 = vpadd_u8(row4_eq1, row5_eq1);
  bitmap_rows_67 = vpadd_u8(row6_eq1, row7_eq1);
  bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
  bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
  bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);

#if defined(__aarch64__) || defined(_M_ARM64)
  /* Move bitmap to a 64-bit scalar register. */
  bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);

  /* Return EOB position. */
  if (bitmap == 0) {
    /* EOB position is defined to be 0 if all coefficients != 1. */
    return 0;
  } else {
    return 63 - BUILTIN_CLZLL(bitmap);
  }
#else
  /* Move bitmap to two 32-bit scalar registers. */
  bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
  bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);

  /* Return EOB position. */
  if (bitmap0 == 0 && bitmap1 == 0) {
    return 0;
  } else if (bitmap1 != 0) {
    return 63 - BUILTIN_CLZ(bitmap1);
  } else {
    return 31 - BUILTIN_CLZ(bitmap0);
  }
#endif
}