1 /*
2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 */
11
12 #include <arm_neon.h>
13 #include <assert.h>
14 #include <math.h>
15
16 #include "aom_dsp/arm/mem_neon.h"
17 #include "av1/common/txb_common.h"
18 #include "av1/encoder/encodetxb.h"
19
// Fills the padded `levels` buffer with the absolute values of `coeff`,
// saturated to 8 bits. `levels` points at the first in-bounds row of a
// buffer laid out with TX_PAD_TOP padding rows above it, TX_PAD_BOTTOM rows
// plus TX_PAD_END bytes below it, and TX_PAD_HOR padding columns appended to
// each row; all of that padding is zeroed here.
void av1_txb_init_levels_neon(const tran_low_t *const coeff, const int width,
                              const int height, uint8_t *const levels) {
  const int stride = width + TX_PAD_HOR;
  // Zero the padding rows above and below the block (the second memset also
  // clears the TX_PAD_END tail after the last padded row).
  memset(levels - TX_PAD_TOP * stride, 0,
         sizeof(*levels) * TX_PAD_TOP * stride);
  memset(levels + stride * height, 0,
         sizeof(*levels) * (TX_PAD_BOTTOM * stride + TX_PAD_END));

  const int32x4_t zeros = vdupq_n_s32(0);
  int i = 0;
  uint8_t *ls = levels;
  const tran_low_t *cf = coeff;
  if (width == 4) {
    // Two 4-coefficient rows per iteration. Each output row is 4 levels
    // followed by 4 zero padding bytes (TX_PAD_HOR), so a single 16-byte
    // store covers both padded rows.
    do {
      const int32x4_t coeffA = vld1q_s32(cf);
      const int32x4_t coeffB = vld1q_s32(cf + width);
      // Saturating narrow (32->16), saturating abs, then saturating narrow
      // (16->8): the resulting levels are clamped to [0, 127].
      const int16x8_t coeffAB =
          vcombine_s16(vqmovn_s32(coeffA), vqmovn_s32(coeffB));
      const int16x8_t absAB = vqabsq_s16(coeffAB);
      const int8x8_t absABs = vqmovn_s16(absAB);
#if defined(__aarch64__)
      const int8x16_t absAB8 =
          vcombine_s8(absABs, vreinterpret_s8_s32(vget_low_s32(zeros)));
      // Interleave 32-bit lanes with zeros: {A, B, 0, 0} -> {A, 0, B, 0},
      // i.e. two rows each followed by 4 bytes of padding.
      const uint8x16_t lsAB =
          vreinterpretq_u8_s32(vzip1q_s32(vreinterpretq_s32_s8(absAB8), zeros));
#else
      // Same interleave via the paired vzip on 64-bit vectors.
      const int32x2x2_t absAB8 =
          vzip_s32(vreinterpret_s32_s8(absABs), vget_low_s32(zeros));
      const uint8x16_t lsAB =
          vreinterpretq_u8_s32(vcombine_s32(absAB8.val[0], absAB8.val[1]));
#endif
      vst1q_u8(ls, lsAB);
      ls += (stride << 1);
      cf += (width << 1);
      i += 2;
    } while (i < height);
  } else if (width == 8) {
    // One 8-coefficient row per iteration: 8 levels + 8 zero padding bytes
    // fill exactly one 16-byte store.
    do {
      const int32x4_t coeffA = vld1q_s32(cf);
      const int32x4_t coeffB = vld1q_s32(cf + 4);
      const int16x8_t coeffAB =
          vcombine_s16(vqmovn_s32(coeffA), vqmovn_s32(coeffB));
      const int16x8_t absAB = vqabsq_s16(coeffAB);
      const uint8x16_t absAB8 = vreinterpretq_u8_s8(vcombine_s8(
          vqmovn_s16(absAB), vreinterpret_s8_s32(vget_low_s32(zeros))));
      vst1q_u8(ls, absAB8);
      ls += stride;
      cf += width;
      i += 1;
    } while (i < height);
  } else {
    // width is a multiple of 16: process 16 coefficients per inner step,
    // then zero the 4-byte horizontal padding explicitly.
    do {
      int j = 0;
      do {
        const int32x4_t coeffA = vld1q_s32(cf);
        const int32x4_t coeffB = vld1q_s32(cf + 4);
        const int32x4_t coeffC = vld1q_s32(cf + 8);
        const int32x4_t coeffD = vld1q_s32(cf + 12);
        const int16x8_t coeffAB =
            vcombine_s16(vqmovn_s32(coeffA), vqmovn_s32(coeffB));
        const int16x8_t coeffCD =
            vcombine_s16(vqmovn_s32(coeffC), vqmovn_s32(coeffD));
        const int16x8_t absAB = vqabsq_s16(coeffAB);
        const int16x8_t absCD = vqabsq_s16(coeffCD);
        const uint8x16_t absABCD = vreinterpretq_u8_s8(
            vcombine_s8(vqmovn_s16(absAB), vqmovn_s16(absCD)));
        vst1q_u8((ls + j), absABCD);
        j += 16;
        cf += 16;
      } while (j < width);
      // Clear the TX_PAD_HOR (4) padding bytes of this row in one store.
      // NOTE(review): assumes this int32 store need not be aligned on the
      // target (levels rows are 4-byte padded) — confirm TX_PAD_HOR == 4.
      *(int32_t *)(ls + width) = 0;
      ls += stride;
      i += 1;
    } while (i < height);
  }
}
96
// get_4_nz_map_contexts_2d coefficients:
// Per-position context offsets for the first 4 rows of a 4-wide 2D-class
// block, one byte per position in raster order.
// Row [0] is used when height == 4, row [1] otherwise.
static const DECLARE_ALIGNED(16, uint8_t, c_4_po_2d[2][16]) = {
  { 0, 1, 6, 6, 1, 6, 6, 21, 6, 6, 21, 21, 6, 21, 21, 21 },
  { 0, 11, 11, 11, 11, 11, 11, 11, 6, 6, 21, 21, 6, 21, 21, 21 }
};
102
// get_4_nz_map_contexts_hor coefficients:
// The four per-column offsets (base + 0, + 5, + 10, + 10) packed into a
// single 32-bit constant, one byte per column, so it can be broadcast to a
// whole vector with vdupq_n_u32.
/* clang-format off */
#define SIG_COEF_CONTEXTS_2D_X4_051010 \
  (SIG_COEF_CONTEXTS_2D + ((SIG_COEF_CONTEXTS_2D + 5) << 8) + \
  ((SIG_COEF_CONTEXTS_2D + 10) << 16) + ((SIG_COEF_CONTEXTS_2D + 10) << 24))
/* clang-format on */
109
// get_4_nz_map_contexts_ver coefficients:
// Per-row offsets for the first 4 rows of a 4-wide vertical-class block
// (4 identical bytes per row); later rows use base + 10.
static const DECLARE_ALIGNED(16, uint8_t, c_4_po_ver[16]) = {
  SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0,
  SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0,
  SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5,
  SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5,
  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10
};
121
// get_8_coeff_contexts_2d coefficients:
// Per-position offsets for the first 4 rows (2 rows per 16-byte table row)
// of an 8-wide 2D-class block; rows beyond these use the constant 21.
// if (height == 8)
static const DECLARE_ALIGNED(16, uint8_t, c_8_po_2d_8[2][16]) = {
  { 0, 1, 6, 6, 21, 21, 21, 21, 1, 6, 6, 21, 21, 21, 21, 21 },
  { 6, 6, 21, 21, 21, 21, 21, 21, 6, 21, 21, 21, 21, 21, 21, 21 }
};
// if (height < 8)
static const DECLARE_ALIGNED(16, uint8_t, c_8_po_2d_l[2][16]) = {
  { 0, 16, 6, 6, 21, 21, 21, 21, 16, 16, 6, 21, 21, 21, 21, 21 },
  { 16, 16, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21 }
};
133
// get_8_coeff_contexts_2d coefficients (continued), 2 rows per table row:
// if (height > 8)
static const DECLARE_ALIGNED(16, uint8_t, c_8_po_2d_g[2][16]) = {
  { 0, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11 },
  { 6, 6, 21, 21, 21, 21, 21, 21, 6, 21, 21, 21, 21, 21, 21, 21 }
};
139
// get_8_coeff_contexts_hor coefficients:
// Per-column offsets for an 8-wide horizontal-class block, repeated for the
// two rows handled per iteration (the offset depends only on the column).
static const DECLARE_ALIGNED(16, uint8_t, c_8_po_hor[16]) = {
  SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5,
  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
  SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5,
  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10
};
151
// get_16n_coeff_contexts_2d coefficients:
// Offsets for the first 16 columns of the first few rows of a 16n-wide
// 2D-class block, selected by the block's true aspect ratio.
// real_width == real_height
static const DECLARE_ALIGNED(16, uint8_t, c_16_po_2d_e[4][16]) = {
  { 0, 1, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 },
  { 1, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 },
  { 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 },
  { 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 }
};

// real_width > real_height
static const DECLARE_ALIGNED(16, uint8_t, c_16_po_2d_g[3][16]) = {
  { 0, 16, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 },
  { 16, 16, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 },
  { 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 }
};

// real_width < real_height
static const DECLARE_ALIGNED(16, uint8_t, c_16_po_2d_l[3][16]) = {
  { 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11 },
  { 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 },
  { 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 }
};
174
// get_16n_coeff_contexts_hor coefficients:
// Per-column offsets for the first 16 columns of each row of a 16n-wide
// horizontal-class block; columns beyond these use base + 10.
static const DECLARE_ALIGNED(16, uint8_t, c_16_po_hor[16]) = {
  SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5,
  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10
};

// end of coefficients declaration area
188
// Gathers four 4-byte rows, `byte_stride` apart, into one 128-bit register
// (row n lands in 32-bit lane n).
static INLINE uint8x16_t load_8bit_4x4_to_1_reg(const uint8_t *const src,
                                                const int byte_stride) {
#ifdef __aarch64__
  // Only lane 0 of this full 16-byte load is kept; lanes 1..3 are replaced
  // below. NOTE(review): this reads 12 bytes past the first row — assumed to
  // stay inside the padded `levels` buffer; confirm for all callers.
  uint32x4_t v_data = vld1q_u32((uint32_t *)src);
  v_data = vld1q_lane_u32((uint32_t *)(src + 1 * byte_stride), v_data, 1);
  v_data = vld1q_lane_u32((uint32_t *)(src + 2 * byte_stride), v_data, 2);
  v_data = vld1q_lane_u32((uint32_t *)(src + 3 * byte_stride), v_data, 3);

  return vreinterpretq_u8_u32(v_data);
#else
  // 32-bit Arm: fall back to the generic unaligned 4x4 gather helper.
  return load_unaligned_u8q(src, byte_stride);
#endif
}
202
// Gathers two 8-byte rows, `byte_stride` apart, into one 128-bit register
// (first row in the low half, second row in the high half).
static INLINE uint8x16_t load_8bit_8x2_to_1_reg(const uint8_t *const src,
                                                const int byte_stride) {
#ifdef __aarch64__
  // Only lane 0 of the full 16-byte load is kept; lane 1 is replaced by the
  // second row.
  uint64x2_t rows = vld1q_u64((uint64_t *)src);
  rows = vld1q_lane_u64((uint64_t *)(src + byte_stride), rows, 1);
  return vreinterpretq_u8_u64(rows);
#else
  return vcombine_u8(vld1_u8(src), vld1_u8(src + byte_stride));
#endif
}
217
// Loads one 16-byte row. The stride parameter is unused; it exists only so
// all three row loaders share the same signature.
static INLINE uint8x16_t load_8bit_16x1_to_1_reg(const uint8_t *const src,
                                                 const int byte_stride) {
  (void)byte_stride;  // single row: stride is irrelevant
  return vld1q_u8(src);
}
223
// Loads the five neighbor vectors needed by the context kernel for a
// 4-wide block (4 rows at a time): the right neighbor, the neighbor one
// row down, and the three positions selected by the caller via offsets[].
static INLINE void load_levels_4x4x5(const uint8_t *const src, const int stride,
                                     const ptrdiff_t *const offsets,
                                     uint8x16_t *const level) {
  level[0] = load_8bit_4x4_to_1_reg(src + 1, stride);
  level[1] = load_8bit_4x4_to_1_reg(src + stride, stride);
  for (int k = 0; k < 3; ++k) {
    level[2 + k] = load_8bit_4x4_to_1_reg(src + offsets[k], stride);
  }
}
233
// Loads the five neighbor vectors needed by the context kernel for an
// 8-wide block (2 rows at a time): right neighbor, below neighbor, and the
// three caller-selected positions from offsets[].
static INLINE void load_levels_8x2x5(const uint8_t *const src, const int stride,
                                     const ptrdiff_t *const offsets,
                                     uint8x16_t *const level) {
  level[0] = load_8bit_8x2_to_1_reg(src + 1, stride);
  level[1] = load_8bit_8x2_to_1_reg(src + stride, stride);
  for (int k = 0; k < 3; ++k) {
    level[2 + k] = load_8bit_8x2_to_1_reg(src + offsets[k], stride);
  }
}
243
// Loads the five neighbor vectors needed by the context kernel for a
// 16n-wide block (1 row, 16 columns at a time): right neighbor, below
// neighbor, and the three caller-selected positions from offsets[].
static INLINE void load_levels_16x1x5(const uint8_t *const src,
                                      const int stride,
                                      const ptrdiff_t *const offsets,
                                      uint8x16_t *const level) {
  level[0] = load_8bit_16x1_to_1_reg(src + 1, stride);
  level[1] = load_8bit_16x1_to_1_reg(src + stride, stride);
  for (int k = 0; k < 3; ++k) {
    level[2 + k] = load_8bit_16x1_to_1_reg(src + offsets[k], stride);
  }
}
254
// Combines the five neighbor-level vectors into a per-position count:
// each level is clamped to 3, the five clamped values are summed, the sum
// is halved with rounding, and the result is capped at 4. Note level[1..4]
// are clamped in place.
static INLINE uint8x16_t get_coeff_contexts_kernel(uint8x16_t *const level) {
  const uint8x16_t clamp_max = vdupq_n_u8(3);
  const uint8x16_t count_cap = vdupq_n_u8(4);

  uint8x16_t sum = vminq_u8(level[0], clamp_max);
  for (int k = 1; k < 5; ++k) {
    level[k] = vminq_u8(level[k], clamp_max);
    sum = vaddq_u8(sum, level[k]);
  }

  // (sum + 1) >> 1, then cap at 4.
  sum = vrshrq_n_u8(sum, 1);
  return vminq_u8(sum, count_cap);
}
274
// Computes coefficient contexts for a 4-wide 2D-class block, 4 rows
// (16 positions) per iteration. The first 16 positions take their offsets
// from c_4_po_2d; every later row uses the constant 21.
static INLINE void get_4_nz_map_contexts_2d(const uint8_t *levels,
                                            const int height,
                                            const ptrdiff_t *const offsets,
                                            uint8_t *const coeff_contexts) {
  const int stride = 4 + TX_PAD_HOR;
  const uint8x16_t pos_to_offset_large = vdupq_n_u8(21);

  // Table row 0 is for height == 4 blocks, row 1 for all other heights.
  uint8x16_t pos_to_offset =
      vld1q_u8((height == 4) ? c_4_po_2d[0] : c_4_po_2d[1]);

  uint8x16_t count;
  uint8x16_t level[5];
  uint8_t *cc = coeff_contexts;

  assert(!(height % 4));

  int row = height;
  do {
    load_levels_4x4x5(levels, stride, offsets, level);
    count = get_coeff_contexts_kernel(level);
    count = vaddq_u8(count, pos_to_offset);
    vst1q_u8(cc, count);
    // After the first 4 rows, every position uses the large offset.
    pos_to_offset = pos_to_offset_large;
    levels += 4 * stride;
    cc += 16;
    row -= 4;
  } while (row);

  // Position 0 is always assigned context 0, overwriting the computed value.
  coeff_contexts[0] = 0;
}
305
// Computes coefficient contexts for a 4-wide horizontal-class block,
// 4 rows per iteration. The offset depends only on the column, so one
// broadcast constant serves every row.
static INLINE void get_4_nz_map_contexts_hor(const uint8_t *levels,
                                             const int height,
                                             const ptrdiff_t *const offsets,
                                             uint8_t *coeff_contexts) {
  const int stride = 4 + TX_PAD_HOR;
  // One byte per column, replicated across all four rows of the vector.
  const uint8x16_t pos_to_offset =
      vreinterpretq_u8_u32(vdupq_n_u32(SIG_COEF_CONTEXTS_2D_X4_051010));
  uint8x16_t level[5];

  assert(!(height % 4));

  int remaining = height;
  do {
    load_levels_4x4x5(levels, stride, offsets, level);
    const uint8x16_t ctx =
        vaddq_u8(get_coeff_contexts_kernel(level), pos_to_offset);
    vst1q_u8(coeff_contexts, ctx);
    levels += 4 * stride;
    coeff_contexts += 16;
    remaining -= 4;
  } while (remaining);
}
331
// Computes coefficient contexts for a 4-wide vertical-class block, 4 rows
// per iteration. The first 4 rows take row-dependent offsets from the
// table; every later row uses base + 10.
static INLINE void get_4_nz_map_contexts_ver(const uint8_t *levels,
                                             const int height,
                                             const ptrdiff_t *const offsets,
                                             uint8_t *coeff_contexts) {
  const int stride = 4 + TX_PAD_HOR;
  uint8x16_t pos_to_offset = vld1q_u8(c_4_po_ver);
  const uint8x16_t pos_to_offset_large = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 10);
  uint8x16_t level[5];

  assert(!(height % 4));

  int remaining = height;
  do {
    load_levels_4x4x5(levels, stride, offsets, level);
    const uint8x16_t ctx =
        vaddq_u8(get_coeff_contexts_kernel(level), pos_to_offset);
    vst1q_u8(coeff_contexts, ctx);
    // All rows after the first four share the large offset.
    pos_to_offset = pos_to_offset_large;
    levels += 4 * stride;
    coeff_contexts += 16;
    remaining -= 4;
  } while (remaining);
}
358
// Computes coefficient contexts for an 8-wide 2D-class block, 2 rows
// (16 positions) per iteration. The first 4 rows take their offsets from
// the aspect-ratio-dependent tables; later rows use the constant 21.
static INLINE void get_8_coeff_contexts_2d(const uint8_t *levels,
                                           const int height,
                                           const ptrdiff_t *const offsets,
                                           uint8_t *coeff_contexts) {
  const int stride = 8 + TX_PAD_HOR;
  uint8_t *cc = coeff_contexts;
  uint8x16_t count;
  uint8x16_t level[5];
  uint8x16_t pos_to_offset[3];

  assert(!(height % 2));

  // Select the offset tables for rows 0-1 and 2-3 by block shape.
  if (height == 8) {
    pos_to_offset[0] = vld1q_u8(c_8_po_2d_8[0]);
    pos_to_offset[1] = vld1q_u8(c_8_po_2d_8[1]);
  } else if (height < 8) {
    pos_to_offset[0] = vld1q_u8(c_8_po_2d_l[0]);
    pos_to_offset[1] = vld1q_u8(c_8_po_2d_l[1]);
  } else {
    pos_to_offset[0] = vld1q_u8(c_8_po_2d_g[0]);
    pos_to_offset[1] = vld1q_u8(c_8_po_2d_g[1]);
  }
  pos_to_offset[2] = vdupq_n_u8(21);

  int row = height;
  do {
    load_levels_8x2x5(levels, stride, offsets, level);
    count = get_coeff_contexts_kernel(level);
    count = vaddq_u8(count, pos_to_offset[0]);
    vst1q_u8(cc, count);
    // Rotate the offset pipeline: after two iterations only the constant
    // 21 vector remains.
    pos_to_offset[0] = pos_to_offset[1];
    pos_to_offset[1] = pos_to_offset[2];
    levels += 2 * stride;
    cc += 16;
    row -= 2;
  } while (row);

  // Position 0 is always assigned context 0, overwriting the computed value.
  coeff_contexts[0] = 0;
}
398
// Computes coefficient contexts for an 8-wide horizontal-class block,
// 2 rows per iteration. The offset depends only on the column, so the same
// table vector serves every iteration.
static INLINE void get_8_coeff_contexts_hor(const uint8_t *levels,
                                            const int height,
                                            const ptrdiff_t *const offsets,
                                            uint8_t *coeff_contexts) {
  const int stride = 8 + TX_PAD_HOR;
  const uint8x16_t pos_to_offset = vld1q_u8(c_8_po_hor);
  uint8x16_t level[5];

  assert(!(height % 2));

  int remaining = height;
  do {
    load_levels_8x2x5(levels, stride, offsets, level);
    const uint8x16_t ctx =
        vaddq_u8(get_coeff_contexts_kernel(level), pos_to_offset);
    vst1q_u8(coeff_contexts, ctx);
    levels += 2 * stride;
    coeff_contexts += 16;
    remaining -= 2;
  } while (remaining);
}
423
// Computes coefficient contexts for an 8-wide vertical-class block, 2 rows
// per iteration. Row 0 uses base + 0, row 1 uses base + 5, and every later
// row uses base + 10.
static INLINE void get_8_coeff_contexts_ver(const uint8_t *levels,
                                            const int height,
                                            const ptrdiff_t *const offsets,
                                            uint8_t *coeff_contexts) {
  const int stride = 8 + TX_PAD_HOR;
  // First iteration: low half covers row 0, high half covers row 1.
  uint8x16_t pos_to_offset = vcombine_u8(vdup_n_u8(SIG_COEF_CONTEXTS_2D + 0),
                                         vdup_n_u8(SIG_COEF_CONTEXTS_2D + 5));
  const uint8x16_t pos_to_offset_large = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 10);
  uint8x16_t level[5];

  assert(!(height % 2));

  int remaining = height;
  do {
    load_levels_8x2x5(levels, stride, offsets, level);
    const uint8x16_t ctx =
        vaddq_u8(get_coeff_contexts_kernel(level), pos_to_offset);
    vst1q_u8(coeff_contexts, ctx);
    pos_to_offset = pos_to_offset_large;
    levels += 2 * stride;
    coeff_contexts += 16;
    remaining -= 2;
  } while (remaining);
}
451
// Computes coefficient contexts for a 16n-wide 2D-class block, one row of
// 16-position chunks per iteration. pos_to_offset[] supplies the offsets
// for the first 16 columns of each of the first 5 rows;
// pos_to_offset_large[] supplies the offsets for the remaining columns of
// the first 3 rows. Both pipelines are rotated down one slot per row, so
// rows beyond the tables use their last (constant) entry.
static INLINE void get_16n_coeff_contexts_2d(const uint8_t *levels,
                                             const int real_width,
                                             const int real_height,
                                             const int width, const int height,
                                             const ptrdiff_t *const offsets,
                                             uint8_t *coeff_contexts) {
  const int stride = width + TX_PAD_HOR;
  uint8_t *cc = coeff_contexts;
  int row = height;
  uint8x16_t pos_to_offset[5];
  uint8x16_t pos_to_offset_large[3];
  uint8x16_t count;
  uint8x16_t level[5];

  assert(!(width % 16));

  // Table selection depends on the block's true (pre-clamp) aspect ratio.
  pos_to_offset_large[2] = vdupq_n_u8(21);
  if (real_width == real_height) {
    pos_to_offset[0] = vld1q_u8(c_16_po_2d_e[0]);
    pos_to_offset[1] = vld1q_u8(c_16_po_2d_e[1]);
    pos_to_offset[2] = vld1q_u8(c_16_po_2d_e[2]);
    pos_to_offset[3] = vld1q_u8(c_16_po_2d_e[3]);
    pos_to_offset[4] = pos_to_offset_large[0] = pos_to_offset_large[1] =
        pos_to_offset_large[2];
  } else if (real_width > real_height) {
    pos_to_offset[0] = vld1q_u8(c_16_po_2d_g[0]);
    pos_to_offset[1] = vld1q_u8(c_16_po_2d_g[1]);
    pos_to_offset[2] = pos_to_offset[3] = pos_to_offset[4] =
        vld1q_u8(c_16_po_2d_g[2]);
    pos_to_offset_large[0] = pos_to_offset_large[1] = pos_to_offset_large[2];
  } else {  // real_width < real_height
    pos_to_offset[0] = pos_to_offset[1] = vld1q_u8(c_16_po_2d_l[0]);
    pos_to_offset[2] = vld1q_u8(c_16_po_2d_l[1]);
    pos_to_offset[3] = vld1q_u8(c_16_po_2d_l[2]);
    pos_to_offset[4] = pos_to_offset_large[2];
    // Tall blocks: the rest of the first two rows uses offset 11.
    pos_to_offset_large[0] = pos_to_offset_large[1] = vdupq_n_u8(11);
  }

  do {
    int w = width;

    do {
      load_levels_16x1x5(levels, stride, offsets, level);
      count = get_coeff_contexts_kernel(level);
      count = vaddq_u8(count, pos_to_offset[0]);
      vst1q_u8(cc, count);
      levels += 16;
      cc += 16;
      w -= 16;
      // Columns 16+ of this row use the row's "large" offset.
      pos_to_offset[0] = pos_to_offset_large[0];
    } while (w);

    // Shift both offset pipelines down one row.
    pos_to_offset[0] = pos_to_offset[1];
    pos_to_offset[1] = pos_to_offset[2];
    pos_to_offset[2] = pos_to_offset[3];
    pos_to_offset[3] = pos_to_offset[4];
    pos_to_offset_large[0] = pos_to_offset_large[1];
    pos_to_offset_large[1] = pos_to_offset_large[2];
    levels += TX_PAD_HOR;
  } while (--row);

  // Position 0 is always assigned context 0, overwriting the computed value.
  coeff_contexts[0] = 0;
}
515
// Computes coefficient contexts for a 16n-wide horizontal-class block, one
// row of 16-position chunks per iteration. The first 16 columns of each row
// take their offsets from the table; the remaining columns use base + 10.
static INLINE void get_16n_coeff_contexts_hor(const uint8_t *levels,
                                              const int width, const int height,
                                              const ptrdiff_t *const offsets,
                                              uint8_t *coeff_contexts) {
  const int stride = width + TX_PAD_HOR;
  const uint8x16_t offset_large = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 10);
  uint8x16_t level[5];

  assert(!(width % 16));

  int rows_left = height;
  do {
    // Reset to the per-column table at the start of every row.
    uint8x16_t offset = vld1q_u8(c_16_po_hor);

    int cols_left = width;
    do {
      load_levels_16x1x5(levels, stride, offsets, level);
      const uint8x16_t ctx = vaddq_u8(get_coeff_contexts_kernel(level), offset);
      vst1q_u8(coeff_contexts, ctx);
      offset = offset_large;
      levels += 16;
      coeff_contexts += 16;
      cols_left -= 16;
    } while (cols_left);

    levels += TX_PAD_HOR;  // skip the row's horizontal padding
  } while (--rows_left);
}
548
// Computes coefficient contexts for a 16n-wide vertical-class block, one
// row of 16-position chunks per iteration. Row 0 uses base + 0, row 1 uses
// base + 5, and every later row uses base + 10.
static INLINE void get_16n_coeff_contexts_ver(const uint8_t *levels,
                                              const int width, const int height,
                                              const ptrdiff_t *const offsets,
                                              uint8_t *coeff_contexts) {
  const int stride = width + TX_PAD_HOR;
  uint8x16_t row_offset[3];
  uint8x16_t level[5];

  assert(!(width % 16));

  row_offset[0] = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 0);
  row_offset[1] = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 5);
  row_offset[2] = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 10);

  int rows_left = height;
  do {
    int cols_left = width;
    do {
      load_levels_16x1x5(levels, stride, offsets, level);
      const uint8x16_t ctx =
          vaddq_u8(get_coeff_contexts_kernel(level), row_offset[0]);
      vst1q_u8(coeff_contexts, ctx);
      levels += 16;
      coeff_contexts += 16;
      cols_left -= 16;
    } while (cols_left);

    // Shift the per-row offset pipeline; slot 2 persists for all later rows.
    row_offset[0] = row_offset[1];
    row_offset[1] = row_offset[2];
    levels += TX_PAD_HOR;
  } while (--rows_left);
}
583
// Note: levels[] must be in the range [0, 127], inclusive.
// Computes the non-zero-map coefficient context for every position of the
// transform block: dispatches on transform class (2D/horizontal/vertical)
// and padded block width (4 / 8 / multiple of 16), then patches the context
// of the last significant coefficient based on its scan position.
void av1_get_nz_map_contexts_neon(const uint8_t *const levels,
                                  const int16_t *const scan, const uint16_t eob,
                                  const TX_SIZE tx_size,
                                  const TX_CLASS tx_class,
                                  int8_t *const coeff_contexts) {
  const int last_idx = eob - 1;
  // Single-coefficient case: only position 0, which always gets context 0.
  // NOTE(review): assumes eob >= 1; eob == 0 would make last_idx -1 and fall
  // through to the general path — confirm callers guarantee a non-zero eob.
  if (!last_idx) {
    coeff_contexts[0] = 0;
    return;
  }

  // The SIMD helpers store unsigned bytes; contexts fit in [0, 127], so
  // writing through a uint8_t view of the int8_t buffer preserves values.
  uint8_t *const coefficients = (uint8_t *const)coeff_contexts;

  const int real_width = tx_size_wide[tx_size];
  const int real_height = tx_size_high[tx_size];
  const int width = get_txb_wide(tx_size);
  const int height = get_txb_high(tx_size);
  const int stride = width + TX_PAD_HOR;
  ptrdiff_t offsets[3];

  /* coeff_contexts must be 16 byte aligned. */
  assert(!((intptr_t)coeff_contexts & 0xf));

  if (tx_class == TX_CLASS_2D) {
    // Byte offsets (within the padded levels buffer) of the three extra
    // neighbors sampled by the kernel for the 2D class.
    offsets[0] = 0 * stride + 2;
    offsets[1] = 1 * stride + 1;
    offsets[2] = 2 * stride + 0;

    if (width == 4) {
      get_4_nz_map_contexts_2d(levels, height, offsets, coefficients);
    } else if (width == 8) {
      get_8_coeff_contexts_2d(levels, height, offsets, coefficients);
    } else {
      get_16n_coeff_contexts_2d(levels, real_width, real_height, width, height,
                                offsets, coefficients);
    }
  } else if (tx_class == TX_CLASS_HORIZ) {
    // Horizontal class samples further along the same row.
    offsets[0] = 2;
    offsets[1] = 3;
    offsets[2] = 4;
    if (width == 4) {
      get_4_nz_map_contexts_hor(levels, height, offsets, coefficients);
    } else if (width == 8) {
      get_8_coeff_contexts_hor(levels, height, offsets, coefficients);
    } else {
      get_16n_coeff_contexts_hor(levels, width, height, offsets, coefficients);
    }
  } else {  // TX_CLASS_VERT
    // Vertical class samples further down the same column.
    offsets[0] = 2 * stride;
    offsets[1] = 3 * stride;
    offsets[2] = 4 * stride;
    if (width == 4) {
      get_4_nz_map_contexts_ver(levels, height, offsets, coefficients);
    } else if (width == 8) {
      get_8_coeff_contexts_ver(levels, height, offsets, coefficients);
    } else {
      get_16n_coeff_contexts_ver(levels, width, height, offsets, coefficients);
    }
  }

  // Overwrite the last significant coefficient's context (1, 2, or 3)
  // according to how far its scan index is through the block.
  const int bwl = get_txb_bwl(tx_size);
  const int pos = scan[last_idx];
  if (last_idx <= (height << bwl) / 8)
    coeff_contexts[pos] = 1;
  else if (last_idx <= (height << bwl) / 4)
    coeff_contexts[pos] = 2;
  else
    coeff_contexts[pos] = 3;
}
654